author	Jiri Kosina <jkosina@suse.cz>	2012-02-03 17:12:42 -0500
committer	Jiri Kosina <jkosina@suse.cz>	2012-02-03 17:13:05 -0500
commit	972c5ae961d6e5103e2b33d935cfa4145fd47140 (patch)
tree	350b2a76b979ba8766c09838617df67ff330eca0 /fs
parent	5196d20305d5e30d871111d3a876cf067dd94255 (diff)
parent	7c7ed8ec337bf5f62cc5287a6eb6b2f1b7504c2f (diff)
Merge branch 'master' into for-next
Sync with Linus' tree to be able to apply a patch to newer code (namely drivers/gpu/drm/gma500/psb_intel_lvds.c)
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/cache.c	64
-rw-r--r--	fs/9p/fid.c	8
-rw-r--r--	fs/9p/v9fs.c	59
-rw-r--r--	fs/9p/vfs_addr.c	13
-rw-r--r--	fs/9p/vfs_dentry.c	12
-rw-r--r--	fs/9p/vfs_dir.c	13
-rw-r--r--	fs/9p/vfs_file.c	34
-rw-r--r--	fs/9p/vfs_inode.c	165
-rw-r--r--	fs/9p/vfs_inode_dotl.c	127
-rw-r--r--	fs/9p/vfs_super.c	12
-rw-r--r--	fs/9p/xattr.c	16
-rw-r--r--	fs/Kconfig	2
-rw-r--r--	fs/Kconfig.binfmt	3
-rw-r--r--	fs/aio.c	11
-rw-r--r--	fs/autofs4/autofs_i.h	1
-rw-r--r--	fs/autofs4/inode.c	1
-rw-r--r--	fs/autofs4/waitq.c	40
-rw-r--r--	fs/binfmt_elf.c	2
-rw-r--r--	fs/block_dev.c	13
-rw-r--r--	fs/btrfs/Kconfig	19
-rw-r--r--	fs/btrfs/Makefile	3
-rw-r--r--	fs/btrfs/backref.c	1131
-rw-r--r--	fs/btrfs/backref.h	5
-rw-r--r--	fs/btrfs/btrfs_inode.h	3
-rw-r--r--	fs/btrfs/check-integrity.c	3069
-rw-r--r--	fs/btrfs/check-integrity.h	36
-rw-r--r--	fs/btrfs/ctree.c	42
-rw-r--r--	fs/btrfs/ctree.h	239
-rw-r--r--	fs/btrfs/delayed-inode.c	45
-rw-r--r--	fs/btrfs/delayed-ref.c	153
-rw-r--r--	fs/btrfs/delayed-ref.h	104
-rw-r--r--	fs/btrfs/disk-io.c	131
-rw-r--r--	fs/btrfs/disk-io.h	6
-rw-r--r--	fs/btrfs/export.c	2
-rw-r--r--	fs/btrfs/extent-tree.c	514
-rw-r--r--	fs/btrfs/extent_io.c	8
-rw-r--r--	fs/btrfs/extent_io.h	2
-rw-r--r--	fs/btrfs/file.c	16
-rw-r--r--	fs/btrfs/free-space-cache.c	422
-rw-r--r--	fs/btrfs/inode-map.c	4
-rw-r--r--	fs/btrfs/inode.c	76
-rw-r--r--	fs/btrfs/ioctl.c	270
-rw-r--r--	fs/btrfs/ioctl.h	54
-rw-r--r--	fs/btrfs/locking.c	53
-rw-r--r--	fs/btrfs/relocation.c	20
-rw-r--r--	fs/btrfs/scrub.c	12
-rw-r--r--	fs/btrfs/super.c	190
-rw-r--r--	fs/btrfs/transaction.c	20
-rw-r--r--	fs/btrfs/tree-log.c	8
-rw-r--r--	fs/btrfs/ulist.c	220
-rw-r--r--	fs/btrfs/ulist.h	68
-rw-r--r--	fs/btrfs/volumes.c	993
-rw-r--r--	fs/btrfs/volumes.h	54
-rw-r--r--	fs/btrfs/xattr.c	2
-rw-r--r--	fs/ceph/caps.c	4
-rw-r--r--	fs/ceph/dir.c	80
-rw-r--r--	fs/ceph/export.c	6
-rw-r--r--	fs/ceph/inode.c	3
-rw-r--r--	fs/ceph/mds_client.c	14
-rw-r--r--	fs/ceph/mds_client.h	7
-rw-r--r--	fs/ceph/super.c	31
-rw-r--r--	fs/ceph/super.h	1
-rw-r--r--	fs/ceph/xattr.c	26
-rw-r--r--	fs/char_dev.c	6
-rw-r--r--	fs/cifs/Kconfig	3
-rw-r--r--	fs/cifs/cifs_debug.c	11
-rw-r--r--	fs/cifs/cifs_spnego.c	10
-rw-r--r--	fs/cifs/cifs_unicode.c	41
-rw-r--r--	fs/cifs/cifs_unicode.h	20
-rw-r--r--	fs/cifs/cifsacl.c	2
-rw-r--r--	fs/cifs/cifsencrypt.c	21
-rw-r--r--	fs/cifs/cifsglob.h	2
-rw-r--r--	fs/cifs/cifssmb.c	162
-rw-r--r--	fs/cifs/connect.c	305
-rw-r--r--	fs/cifs/readdir.c	9
-rw-r--r--	fs/cifs/sess.c	34
-rw-r--r--	fs/cifs/smbencrypt.c	2
-rw-r--r--	fs/coda/cnode.c	38
-rw-r--r--	fs/coda/coda_fs_i.h	4
-rw-r--r--	fs/coda/dir.c	29
-rw-r--r--	fs/coda/inode.c	10
-rw-r--r--	fs/compat_ioctl.c	1
-rw-r--r--	fs/dcache.c	106
-rw-r--r--	fs/debugfs/file.c	2
-rw-r--r--	fs/devpts/inode.c	4
-rw-r--r--	fs/direct-io.c	57
-rw-r--r--	fs/dlm/config.c	130
-rw-r--r--	fs/dlm/config.h	17
-rw-r--r--	fs/dlm/debug_fs.c	28
-rw-r--r--	fs/dlm/dir.c	1
-rw-r--r--	fs/dlm/dlm_internal.h	60
-rw-r--r--	fs/dlm/lock.c	87
-rw-r--r--	fs/dlm/lockspace.c	71
-rw-r--r--	fs/dlm/member.c	486
-rw-r--r--	fs/dlm/member.h	10
-rw-r--r--	fs/dlm/rcom.c	99
-rw-r--r--	fs/dlm/rcom.h	2
-rw-r--r--	fs/dlm/recover.c	87
-rw-r--r--	fs/dlm/recoverd.c	53
-rw-r--r--	fs/dlm/user.c	5
-rw-r--r--	fs/ecryptfs/crypto.c	54
-rw-r--r--	fs/ecryptfs/ecryptfs_kernel.h	5
-rw-r--r--	fs/ecryptfs/inode.c	48
-rw-r--r--	fs/ecryptfs/keystore.c	5
-rw-r--r--	fs/ecryptfs/miscdev.c	140
-rw-r--r--	fs/ecryptfs/mmap.c	8
-rw-r--r--	fs/ecryptfs/read_write.c	96
-rw-r--r--	fs/eventpoll.c	234
-rw-r--r--	fs/exec.c	4
-rw-r--r--	fs/exofs/Kconfig	11
-rw-r--r--	fs/exofs/Kconfig.ore	12
-rw-r--r--	fs/exofs/ore.c	8
-rw-r--r--	fs/exofs/ore_raid.c	78
-rw-r--r--	fs/exofs/super.c	2
-rw-r--r--	fs/ext2/ialloc.c	7
-rw-r--r--	fs/ext2/inode.c	5
-rw-r--r--	fs/ext2/ioctl.c	22
-rw-r--r--	fs/ext2/super.c	3
-rw-r--r--	fs/ext2/xattr.c	1
-rw-r--r--	fs/ext2/xattr_security.c	1
-rw-r--r--	fs/ext2/xattr_trusted.c	1
-rw-r--r--	fs/ext2/xattr_user.c	1
-rw-r--r--	fs/ext3/ialloc.c	8
-rw-r--r--	fs/ext3/inode.c	43
-rw-r--r--	fs/ext3/ioctl.c	6
-rw-r--r--	fs/ext3/namei.c	11
-rw-r--r--	fs/ext3/super.c	15
-rw-r--r--	fs/ext3/xattr_security.c	1
-rw-r--r--	fs/ext3/xattr_trusted.c	1
-rw-r--r--	fs/ext3/xattr_user.c	1
-rw-r--r--	fs/ext4/balloc.c	4
-rw-r--r--	fs/ext4/block_validity.c	1
-rw-r--r--	fs/ext4/ext4.h	29
-rw-r--r--	fs/ext4/extents.c	11
-rw-r--r--	fs/ext4/ialloc.c	18
-rw-r--r--	fs/ext4/indirect.c	1
-rw-r--r--	fs/ext4/inode.c	144
-rw-r--r--	fs/ext4/ioctl.c	92
-rw-r--r--	fs/ext4/mballoc.c	2
-rw-r--r--	fs/ext4/migrate.c	1
-rw-r--r--	fs/ext4/namei.c	2
-rw-r--r--	fs/ext4/page-io.c	1
-rw-r--r--	fs/ext4/resize.c	1175
-rw-r--r--	fs/ext4/super.c	24
-rw-r--r--	fs/ext4/xattr_security.c	6
-rw-r--r--	fs/ext4/xattr_trusted.c	1
-rw-r--r--	fs/ext4/xattr_user.c	1
-rw-r--r--	fs/fat/namei_vfat.c	3
-rw-r--r--	fs/fs-writeback.c	16
-rw-r--r--	fs/fuse/dev.c	57
-rw-r--r--	fs/fuse/dir.c	58
-rw-r--r--	fs/fuse/file.c	58
-rw-r--r--	fs/fuse/fuse_i.h	10
-rw-r--r--	fs/gfs2/glock.c	2
-rw-r--r--	fs/gfs2/glock.h	7
-rw-r--r--	fs/gfs2/incore.h	60
-rw-r--r--	fs/gfs2/inode.c	4
-rw-r--r--	fs/gfs2/lock_dlm.c	993
-rw-r--r--	fs/gfs2/main.c	10
-rw-r--r--	fs/gfs2/ops_fstype.c	31
-rw-r--r--	fs/gfs2/recovery.c	11
-rw-r--r--	fs/gfs2/rgrp.c	2
-rw-r--r--	fs/gfs2/sys.c	33
-rw-r--r--	fs/gfs2/sys.h	2
-rw-r--r--	fs/hfsplus/super.c	11
-rw-r--r--	fs/hugetlbfs/inode.c	3
-rw-r--r--	fs/inode.c	5
-rw-r--r--	fs/ioprio.c	24
-rw-r--r--	fs/isofs/inode.c	7
-rw-r--r--	fs/jbd/checkpoint.c	27
-rw-r--r--	fs/jbd/commit.c	6
-rw-r--r--	fs/jbd/journal.c	1
-rw-r--r--	fs/jbd/recovery.c	4
-rw-r--r--	fs/jbd/revoke.c	34
-rw-r--r--	fs/jbd/transaction.c	38
-rw-r--r--	fs/jbd2/commit.c	6
-rw-r--r--	fs/jbd2/revoke.c	34
-rw-r--r--	fs/jbd2/transaction.c	5
-rw-r--r--	fs/jffs2/erase.c	17
-rw-r--r--	fs/jffs2/fs.c	1
-rw-r--r--	fs/jffs2/readinode.c	22
-rw-r--r--	fs/jffs2/scan.c	12
-rw-r--r--	fs/jffs2/super.c	4
-rw-r--r--	fs/jffs2/wbuf.c	38
-rw-r--r--	fs/jffs2/writev.c	32
-rw-r--r--	fs/lockd/mon.c	2
-rw-r--r--	fs/logfs/dev_mtd.c	80
-rw-r--r--	fs/logfs/dir.c	2
-rw-r--r--	fs/logfs/file.c	2
-rw-r--r--	fs/logfs/gc.c	2
-rw-r--r--	fs/logfs/inode.c	4
-rw-r--r--	fs/logfs/journal.c	1
-rw-r--r--	fs/logfs/logfs.h	5
-rw-r--r--	fs/logfs/readwrite.c	51
-rw-r--r--	fs/logfs/segment.c	51
-rw-r--r--	fs/logfs/super.c	3
-rw-r--r--	fs/mpage.c	4
-rw-r--r--	fs/namei.c	28
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	202
-rw-r--r--	fs/nfs/blocklayout/blocklayout.h	12
-rw-r--r--	fs/nfs/blocklayout/extents.c	176
-rw-r--r--	fs/nfs/callback.h	2
-rw-r--r--	fs/nfs/callback_proc.c	2
-rw-r--r--	fs/nfs/callback_xdr.c	4
-rw-r--r--	fs/nfs/client.c	12
-rw-r--r--	fs/nfs/file.c	4
-rw-r--r--	fs/nfs/idmap.c	83
-rw-r--r--	fs/nfs/inode.c	4
-rw-r--r--	fs/nfs/internal.h	4
-rw-r--r--	fs/nfs/nfs4_fs.h	3
-rw-r--r--	fs/nfs/nfs4filelayout.c	9
-rw-r--r--	fs/nfs/nfs4filelayoutdev.c	2
-rw-r--r--	fs/nfs/nfs4proc.c	177
-rw-r--r--	fs/nfs/nfs4state.c	104
-rw-r--r--	fs/nfs/nfs4xdr.c	137
-rw-r--r--	fs/nfs/objlayout/objio_osd.c	3
-rw-r--r--	fs/nfs/objlayout/objlayout.c	4
-rw-r--r--	fs/nfs/pnfs.c	42
-rw-r--r--	fs/nfs/pnfs.h	1
-rw-r--r--	fs/nfs/super.c	43
-rw-r--r--	fs/nfs/write.c	31
-rw-r--r--	fs/nfsd/Kconfig	10
-rw-r--r--	fs/nfsd/Makefile	1
-rw-r--r--	fs/nfsd/export.c	12
-rw-r--r--	fs/nfsd/fault_inject.c	91
-rw-r--r--	fs/nfsd/fault_inject.h	28
-rw-r--r--	fs/nfsd/nfs4callback.c	2
-rw-r--r--	fs/nfsd/nfs4idmap.c	11
-rw-r--r--	fs/nfsd/nfs4proc.c	7
-rw-r--r--	fs/nfsd/nfs4recover.c	22
-rw-r--r--	fs/nfsd/nfs4state.c	328
-rw-r--r--	fs/nfsd/nfs4xdr.c	3
-rw-r--r--	fs/nfsd/nfsctl.c	10
-rw-r--r--	fs/nfsd/nfsd.h	20
-rw-r--r--	fs/nfsd/state.h	3
-rw-r--r--	fs/nfsd/vfs.c	17
-rw-r--r--	fs/nls/nls_base.c	73
-rw-r--r--	fs/notify/mark.c	8
-rw-r--r--	fs/ntfs/super.c	2
-rw-r--r--	fs/ocfs2/stack_user.c	4
-rw-r--r--	fs/pipe.c	2
-rw-r--r--	fs/proc/array.c	9
-rw-r--r--	fs/proc/base.c	683
-rw-r--r--	fs/proc/inode.c	18
-rw-r--r--	fs/proc/internal.h	1
-rw-r--r--	fs/proc/root.c	70
-rw-r--r--	fs/proc/stat.c	2
-rw-r--r--	fs/proc/task_mmu.c	3
-rw-r--r--	fs/qnx4/inode.c	62
-rw-r--r--	fs/quota/dquot.c	8
-rw-r--r--	fs/reiserfs/bitmap.c	3
-rw-r--r--	fs/reiserfs/journal.c	64
-rw-r--r--	fs/reiserfs/super.c	81
-rw-r--r--	fs/romfs/mmap-nommu.c	28
-rw-r--r--	fs/squashfs/cache.c	30
-rw-r--r--	fs/squashfs/inode.c	4
-rw-r--r--	fs/squashfs/squashfs_fs_sb.h	1
-rw-r--r--	fs/squashfs/super.c	2
-rw-r--r--	fs/super.c	2
-rw-r--r--	fs/sysfs/file.c	6
-rw-r--r--	fs/sysfs/inode.c	5
-rw-r--r--	fs/ubifs/debug.c	90
-rw-r--r--	fs/ubifs/debug.h	75
-rw-r--r--	fs/ubifs/journal.c	7
-rw-r--r--	fs/ubifs/lpt.c	6
-rw-r--r--	fs/ubifs/replay.c	8
-rw-r--r--	fs/ubifs/tnc.c	58
-rw-r--r--	fs/ubifs/tnc_misc.c	10
-rw-r--r--	fs/ubifs/xattr.c	6
-rw-r--r--	fs/udf/file.c	6
-rw-r--r--	fs/udf/inode.c	57
-rw-r--r--	fs/udf/super.c	6
-rw-r--r--	fs/udf/symlink.c	14
-rw-r--r--	fs/xfs/xfs_aops.c	29
-rw-r--r--	fs/xfs/xfs_attr.c	4
-rw-r--r--	fs/xfs/xfs_attr_leaf.c	9
-rw-r--r--	fs/xfs/xfs_bmap.c	116
-rw-r--r--	fs/xfs/xfs_dfrag.c	43
-rw-r--r--	fs/xfs/xfs_discard.c	4
-rw-r--r--	fs/xfs/xfs_file.c	184
-rw-r--r--	fs/xfs/xfs_fs_subr.c	2
-rw-r--r--	fs/xfs/xfs_iget.c	24
-rw-r--r--	fs/xfs/xfs_inode.c	193
-rw-r--r--	fs/xfs/xfs_inode.h	114
-rw-r--r--	fs/xfs/xfs_inode_item.c	8
-rw-r--r--	fs/xfs/xfs_iomap.c	46
-rw-r--r--	fs/xfs/xfs_iops.c	46
-rw-r--r--	fs/xfs/xfs_qm_syscalls.c	8
-rw-r--r--	fs/xfs/xfs_super.c	8
-rw-r--r--	fs/xfs/xfs_sync.c	9
-rw-r--r--	fs/xfs/xfs_trace.h	29
-rw-r--r--	fs/xfs/xfs_vnodeops.c	47
292 files changed, 14423 insertions, 4685 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 945aa5f02f9b..a9ea73d6dcf3 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -62,8 +62,8 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
 	uint16_t klen = 0;
 
 	v9ses = (struct v9fs_session_info *)cookie_netfs_data;
-	P9_DPRINTK(P9_DEBUG_FSC, "session %p buf %p size %u", v9ses,
-		   buffer, bufmax);
+	p9_debug(P9_DEBUG_FSC, "session %p buf %p size %u\n",
+		 v9ses, buffer, bufmax);
 
 	if (v9ses->cachetag)
 		klen = strlen(v9ses->cachetag);
@@ -72,7 +72,7 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
 		return 0;
 
 	memcpy(buffer, v9ses->cachetag, klen);
-	P9_DPRINTK(P9_DEBUG_FSC, "cache session tag %s", v9ses->cachetag);
+	p9_debug(P9_DEBUG_FSC, "cache session tag %s\n", v9ses->cachetag);
 	return klen;
 }
 
@@ -91,14 +91,14 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
 	v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
 						&v9fs_cache_session_index_def,
 						v9ses);
-	P9_DPRINTK(P9_DEBUG_FSC, "session %p get cookie %p", v9ses,
-		   v9ses->fscache);
+	p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
+		 v9ses, v9ses->fscache);
 }
 
 void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
 {
-	P9_DPRINTK(P9_DEBUG_FSC, "session %p put cookie %p", v9ses,
-		   v9ses->fscache);
+	p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n",
+		 v9ses, v9ses->fscache);
 	fscache_relinquish_cookie(v9ses->fscache, 0);
 	v9ses->fscache = NULL;
 }
@@ -109,8 +109,8 @@ static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
 {
 	const struct v9fs_inode *v9inode = cookie_netfs_data;
 	memcpy(buffer, &v9inode->qid.path, sizeof(v9inode->qid.path));
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
-		   v9inode->qid.path);
+	p9_debug(P9_DEBUG_FSC, "inode %p get key %llu\n",
+		 &v9inode->vfs_inode, v9inode->qid.path);
 	return sizeof(v9inode->qid.path);
 }
 
@@ -120,8 +120,8 @@ static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
 	const struct v9fs_inode *v9inode = cookie_netfs_data;
 	*size = i_size_read(&v9inode->vfs_inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
-		   *size);
+	p9_debug(P9_DEBUG_FSC, "inode %p get attr %llu\n",
+		 &v9inode->vfs_inode, *size);
 }
 
 static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
@@ -129,8 +129,8 @@ static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
 {
 	const struct v9fs_inode *v9inode = cookie_netfs_data;
 	memcpy(buffer, &v9inode->qid.version, sizeof(v9inode->qid.version));
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
-		   v9inode->qid.version);
+	p9_debug(P9_DEBUG_FSC, "inode %p get aux %u\n",
+		 &v9inode->vfs_inode, v9inode->qid.version);
 	return sizeof(v9inode->qid.version);
 }
 
@@ -206,8 +206,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
 						  &v9fs_cache_inode_index_def,
 						  v9inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
-		   v9inode->fscache);
+	p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
+		 inode, v9inode->fscache);
 }
 
 void v9fs_cache_inode_put_cookie(struct inode *inode)
@@ -216,8 +216,8 @@ void v9fs_cache_inode_put_cookie(struct inode *inode)
 
 	if (!v9inode->fscache)
 		return;
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
-		   v9inode->fscache);
+	p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n",
+		 inode, v9inode->fscache);
 
 	fscache_relinquish_cookie(v9inode->fscache, 0);
 	v9inode->fscache = NULL;
@@ -229,8 +229,8 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
 
 	if (!v9inode->fscache)
 		return;
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
-		   v9inode->fscache);
+	p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n",
+		 inode, v9inode->fscache);
 
 	fscache_relinquish_cookie(v9inode->fscache, 1);
 	v9inode->fscache = NULL;
@@ -272,8 +272,8 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
 	v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
 						  &v9fs_cache_inode_index_def,
 						  v9inode);
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
-		   inode, old, v9inode->fscache);
+	p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
+		 inode, old, v9inode->fscache);
 
 	spin_unlock(&v9inode->fscache_lock);
 }
@@ -323,7 +323,7 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
 	int ret;
 	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
 	if (!v9inode->fscache)
 		return -ENOBUFS;
 
@@ -335,13 +335,13 @@ int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
 	switch (ret) {
 	case -ENOBUFS:
 	case -ENODATA:
-		P9_DPRINTK(P9_DEBUG_FSC, "page/inode not in cache %d", ret);
+		p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret);
 		return 1;
 	case 0:
-		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
 		return ret;
 	default:
-		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
 		return ret;
 	}
 }
@@ -361,7 +361,7 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
 	int ret;
 	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
+	p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages);
 	if (!v9inode->fscache)
 		return -ENOBUFS;
 
@@ -373,15 +373,15 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
 	switch (ret) {
 	case -ENOBUFS:
 	case -ENODATA:
-		P9_DPRINTK(P9_DEBUG_FSC, "pages/inodes not in cache %d", ret);
+		p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret);
 		return 1;
 	case 0:
 		BUG_ON(!list_empty(pages));
 		BUG_ON(*nr_pages != 0);
-		P9_DPRINTK(P9_DEBUG_FSC, "BIO submitted");
+		p9_debug(P9_DEBUG_FSC, "BIO submitted\n");
 		return ret;
 	default:
-		P9_DPRINTK(P9_DEBUG_FSC, "ret %d", ret);
+		p9_debug(P9_DEBUG_FSC, "ret %d\n", ret);
 		return ret;
 	}
 }
@@ -396,9 +396,9 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
 	int ret;
 	const struct v9fs_inode *v9inode = V9FS_I(inode);
 
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
 	ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
-	P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
+	p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret);
 	if (ret != 0)
 		v9fs_uncache_page(inode, page);
 }
@@ -409,7 +409,7 @@ void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
 void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
 {
 	const struct v9fs_inode *v9inode = V9FS_I(inode);
-	P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
+	p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page);
 	if (PageFsCache(page))
 		fscache_wait_on_page_write(v9inode->fscache, page);
 }
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 85b67ffa2a43..da8eefbe830d 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -45,8 +45,8 @@ int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid)
 {
 	struct v9fs_dentry *dent;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "fid %d dentry %s\n",
-		   fid->fid, dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "fid %d dentry %s\n",
+		 fid->fid, dentry->d_name.name);
 
 	dent = dentry->d_fsdata;
 	if (!dent) {
@@ -79,8 +79,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, u32 uid, int any)
 	struct v9fs_dentry *dent;
 	struct p9_fid *fid, *ret;
 
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
-		   dentry->d_name.name, dentry, uid, any);
+	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
+		 dentry->d_name.name, dentry, uid, any);
 	dent = (struct v9fs_dentry *) dentry->d_fsdata;
 	ret = NULL;
 	if (dent) {
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2b78014a124a..1964f98e74be 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -23,6 +23,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -85,15 +87,15 @@ static int get_cache_mode(char *s)
 
 	if (!strcmp(s, "loose")) {
 		version = CACHE_LOOSE;
-		P9_DPRINTK(P9_DEBUG_9P, "Cache mode: loose\n");
+		p9_debug(P9_DEBUG_9P, "Cache mode: loose\n");
 	} else if (!strcmp(s, "fscache")) {
 		version = CACHE_FSCACHE;
-		P9_DPRINTK(P9_DEBUG_9P, "Cache mode: fscache\n");
+		p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
 	} else if (!strcmp(s, "none")) {
 		version = CACHE_NONE;
-		P9_DPRINTK(P9_DEBUG_9P, "Cache mode: none\n");
+		p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
 	} else
-		printk(KERN_INFO "9p: Unknown Cache mode %s.\n", s);
+		pr_info("Unknown Cache mode %s\n", s);
 	return version;
 }
 
@@ -140,8 +142,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_debug:
 			r = match_int(&args[0], &option);
 			if (r < 0) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "integer field, but no integer?\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
@@ -154,8 +156,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_dfltuid:
 			r = match_int(&args[0], &option);
 			if (r < 0) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "integer field, but no integer?\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
@@ -164,8 +166,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_dfltgid:
 			r = match_int(&args[0], &option);
 			if (r < 0) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "integer field, but no integer?\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
@@ -174,8 +176,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 		case Opt_afid:
 			r = match_int(&args[0], &option);
 			if (r < 0) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "integer field, but no integer?\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "integer field, but no integer?\n");
 				ret = r;
 				continue;
 			}
@@ -205,8 +207,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			s = match_strdup(&args[0]);
 			if (!s) {
 				ret = -ENOMEM;
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "problem allocating copy of cache arg\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "problem allocating copy of cache arg\n");
 				goto free_and_return;
 			}
 			ret = get_cache_mode(s);
@@ -223,8 +225,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 			s = match_strdup(&args[0]);
 			if (!s) {
 				ret = -ENOMEM;
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "problem allocating copy of access arg\n");
+				p9_debug(P9_DEBUG_ERROR,
+					 "problem allocating copy of access arg\n");
 				goto free_and_return;
 			}
 
@@ -240,8 +242,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 				v9ses->uid = simple_strtoul(s, &e, 10);
 				if (*e != '\0') {
 					ret = -EINVAL;
-					printk(KERN_INFO "9p: Unknown access "
-					       "argument %s.\n", s);
+					pr_info("Unknown access argument %s\n",
+						s);
 					kfree(s);
 					goto free_and_return;
 				}
@@ -254,9 +256,8 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 #ifdef CONFIG_9P_FS_POSIX_ACL
 			v9ses->flags |= V9FS_POSIX_ACL;
 #else
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "Not defined CONFIG_9P_FS_POSIX_ACL. "
-				   "Ignoring posixacl option\n");
+			p9_debug(P9_DEBUG_ERROR,
+				 "Not defined CONFIG_9P_FS_POSIX_ACL. Ignoring posixacl option\n");
 #endif
 			break;
 
@@ -318,7 +319,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
 		v9ses->clnt = NULL;
-		P9_DPRINTK(P9_DEBUG_ERROR, "problem initializing 9p client\n");
+		p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
 		goto error;
 	}
 
@@ -371,7 +372,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	if (IS_ERR(fid)) {
 		retval = PTR_ERR(fid);
 		fid = NULL;
-		P9_DPRINTK(P9_DEBUG_ERROR, "cannot attach\n");
+		p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
 		goto error;
 	}
 
@@ -429,7 +430,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
  */
 
 void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
-	P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
+	p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
 	p9_client_disconnect(v9ses->clnt);
 }
 
@@ -442,7 +443,7 @@ void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
 
 void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses)
 {
-	P9_DPRINTK(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
+	p9_debug(P9_DEBUG_ERROR, "begin cancel session %p\n", v9ses);
 	p9_client_begin_disconnect(v9ses->clnt);
 }
 
@@ -591,23 +592,23 @@ static void v9fs_cache_unregister(void)
 static int __init init_v9fs(void)
 {
 	int err;
-	printk(KERN_INFO "Installing v9fs 9p2000 file system support\n");
+	pr_info("Installing v9fs 9p2000 file system support\n");
 	/* TODO: Setup list of registered trasnport modules */
 	err = register_filesystem(&v9fs_fs_type);
 	if (err < 0) {
-		printk(KERN_ERR "Failed to register filesystem\n");
+		pr_err("Failed to register filesystem\n");
 		return err;
 	}
 
 	err = v9fs_cache_register();
 	if (err < 0) {
-		printk(KERN_ERR "Failed to register v9fs for caching\n");
+		pr_err("Failed to register v9fs for caching\n");
 		goto out_fs_unreg;
 	}
 
 	err = v9fs_sysfs_init();
 	if (err < 0) {
-		printk(KERN_ERR "Failed to register with sysfs\n");
+		pr_err("Failed to register with sysfs\n");
 		goto out_sysfs_cleanup;
 	}
 
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 2524e4cbb8ea..0ad61c6a65a5 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -56,7 +56,7 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
 	struct inode *inode;
 
 	inode = page->mapping->host;
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	p9_debug(P9_DEBUG_VFS, "\n");
 
 	BUG_ON(!PageLocked(page));
 
@@ -116,14 +116,14 @@ static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping,
 	struct inode *inode;
 
 	inode = mapping->host;
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
+	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp);
 
 	ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages);
 	if (ret == 0)
 		return ret;
 
 	ret = read_cache_pages(mapping, pages, (void *)v9fs_vfs_readpage, filp);
-	P9_DPRINTK(P9_DEBUG_VFS, " = %d\n", ret);
+	p9_debug(P9_DEBUG_VFS, " = %d\n", ret);
 	return ret;
 }
 
@@ -263,10 +263,9 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	 * Now that we do caching with cache mode enabled, We need
 	 * to support direct IO
 	 */
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
-		   "off/no(%lld/%lu) EINVAL\n",
-		   iocb->ki_filp->f_path.dentry->d_name.name,
-		   (long long) pos, nr_segs);
+	p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
+		 iocb->ki_filp->f_path.dentry->d_name.name,
+		 (long long)pos, nr_segs);
 
 	return -EINVAL;
 }
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index e022890c6f40..d529437ff442 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -53,8 +53,8 @@
 
 static int v9fs_dentry_delete(const struct dentry *dentry)
 {
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
-		   dentry);
+	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+		 dentry->d_name.name, dentry);
 
 	return 1;
 }
@@ -66,8 +66,8 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
  */
 static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
-		   dentry->d_name.name, dentry);
+	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+		 dentry->d_name.name, dentry);
 
 	/* Don't cache negative dentries */
 	if (!dentry->d_inode)
@@ -86,8 +86,8 @@ static void v9fs_dentry_release(struct dentry *dentry)
 	struct v9fs_dentry *dent;
 	struct p9_fid *temp, *current_fid;
 
-	P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name,
-		   dentry);
+	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
+		 dentry->d_name.name, dentry);
 	dent = dentry->d_fsdata;
 	if (dent) {
 		list_for_each_entry_safe(current_fid, temp, &dent->fidlist,
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 598fff1a54e5..ff911e779651 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -140,7 +140,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	int reclen = 0;
 	struct p9_rdir *rdir;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
 
 	buflen = fid->clnt->msize - P9_IOHDRSZ;
@@ -168,7 +168,7 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 		err = p9stat_read(fid->clnt, rdir->buf + rdir->head,
 				  rdir->tail - rdir->head, &st);
 		if (err) {
-			P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+			p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
 			err = -EIO;
 			p9stat_free(&st);
 			goto unlock_and_exit;
@@ -213,7 +213,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 	struct p9_dirent curdirent;
 	u64 oldoffset = 0;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %s\n", filp->f_path.dentry->d_name.name);
 	fid = filp->private_data;
 
 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
@@ -244,7 +244,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 					  rdir->tail - rdir->head,
 					  &curdirent);
 		if (err < 0) {
-			P9_DPRINTK(P9_DEBUG_VFS, "returned %d\n", err);
+			p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
 			err = -EIO;
 			goto unlock_and_exit;
 		}
@@ -290,9 +290,8 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
 	struct p9_fid *fid;
 
 	fid = filp->private_data;
-	P9_DPRINTK(P9_DEBUG_VFS,
-			"v9fs_dir_release: inode: %p filp: %p fid: %d\n",
-			inode, filp, fid ? fid->fid : -1);
+	p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n",
+		 inode, filp, fid ? fid->fid : -1);
 	if (fid)
 		p9_client_clunk(fid);
 	return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 62857a810a79..fc06fd27065e 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -61,7 +61,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 	struct p9_fid *fid;
 	int omode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
+	p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
 	v9inode = V9FS_I(inode);
 	v9ses = v9fs_inode2v9ses(inode);
 	if (v9fs_proto_dotl(v9ses))
@@ -135,7 +135,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	int res = 0;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
+	p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -204,7 +204,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 			break;
 		if (status == P9_LOCK_BLOCKED && !IS_SETLKW(cmd))
 			break;
-		schedule_timeout_interruptible(P9_LOCK_TIMEOUT);
+		if (schedule_timeout_interruptible(P9_LOCK_TIMEOUT) != 0)
+			break;
 	}
 
 	/* map 9p status to VFS status */
@@ -304,8 +305,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = -ENOLCK;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
-		   cmd, fl, filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+		 filp, cmd, fl, filp->f_path.dentry->d_name.name);
 
 	/* No mandatory locks */
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -340,8 +341,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int ret = -ENOLCK;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", filp,
-		   cmd, fl, filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
+		 filp, cmd, fl, filp->f_path.dentry->d_name.name);
 
 	/* No mandatory locks */
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -384,8 +385,8 @@ v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
 {
 	int n, total, size;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
-		   (long long unsigned) offset, count);
+	p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n",
+		 fid->fid, (long long unsigned)offset, count);
 	n = 0;
 	total = 0;
 	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -443,7 +444,7 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
 	struct p9_fid *fid;
 	size_t size;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
+	p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset);
 	fid = filp->private_data;
 
 	size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -470,8 +471,8 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
 	loff_t origin = *offset;
 	unsigned long pg_start, pg_end;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
-		   (int)count, (int)*offset);
+	p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
+		 data, (int)count, (int)*offset);
 
 	clnt = fid->clnt;
 	do {
@@ -552,7 +553,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
 		return retval;
 
 	mutex_lock(&inode->i_mutex);
-	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
+	p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
@@ -575,8 +576,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 		return retval;
 
 	mutex_lock(&inode->i_mutex);
-	P9_DPRINTK(P9_DEBUG_VFS, "v9fs_file_fsync_dotl: filp %p datasync %x\n",
-		   filp, datasync);
+	p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 
@@ -607,8 +607,8 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
 
-	P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
-		   page, (unsigned long)filp->private_data);
+	p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n",
+		 page, (unsigned long)filp->private_data);
 
 	v9inode = V9FS_I(inode);
 	/* make sure the cache has finished storing the page */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e0f20de6aa2b..014c8dd62962 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -23,6 +23,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/fs.h>
@@ -88,6 +90,32 @@ static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode)
 }
 
 /**
+ * p9mode2perm- convert plan9 mode bits to unix permission bits
+ * @v9ses: v9fs session information
+ * @stat: p9_wstat from which mode need to be derived
+ *
+ */
+static int p9mode2perm(struct v9fs_session_info *v9ses,
+		       struct p9_wstat *stat)
+{
+	int res;
+	int mode = stat->mode;
+
+	res = mode & S_IALLUGO;
+	if (v9fs_proto_dotu(v9ses)) {
+		if ((mode & P9_DMSETUID) == P9_DMSETUID)
+			res |= S_ISUID;
+
+		if ((mode & P9_DMSETGID) == P9_DMSETGID)
+			res |= S_ISGID;
+
+		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
+			res |= S_ISVTX;
+	}
+	return res;
+}
+
+/**
  * p9mode2unixmode- convert plan9 mode bits to unix mode bits
  * @v9ses: v9fs session information
  * @stat: p9_wstat from which mode need to be derived
@@ -100,8 +128,8 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
 	int res;
 	u32 mode = stat->mode;
 
-	res = mode & S_IALLUGO;
 	*rdev = 0;
+	res = p9mode2perm(v9ses, stat);
 
 	if ((mode & P9_DMDIR) == P9_DMDIR)
 		res |= S_IFDIR;
@@ -128,24 +156,13 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
 			res |= S_IFBLK;
 			break;
 		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "Unknown special type %c %s\n", type,
-				   stat->extension);
+			p9_debug(P9_DEBUG_ERROR, "Unknown special type %c %s\n",
+				 type, stat->extension);
 		};
 		*rdev = MKDEV(major, minor);
 	} else
 		res |= S_IFREG;
 
-	if (v9fs_proto_dotu(v9ses)) {
-		if ((mode & P9_DMSETUID) == P9_DMSETUID)
-			res |= S_ISUID;
-
-		if ((mode & P9_DMSETGID) == P9_DMSETGID)
-			res |= S_ISGID;
-
-		if ((mode & P9_DMSETVTX) == P9_DMSETVTX)
-			res |= S_ISVTX;
-	}
 	return res;
 }
 
@@ -275,8 +292,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		} else if (v9fs_proto_dotu(v9ses)) {
 			inode->i_op = &v9fs_file_inode_operations;
 		} else {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "special files without extended mode\n");
+			p9_debug(P9_DEBUG_ERROR,
+				 "special files without extended mode\n");
 			err = -EINVAL;
 			goto error;
 		}
@@ -301,8 +318,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 		break;
 	case S_IFLNK:
 		if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
-			P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
-				   "legacy protocol.\n");
+			p9_debug(P9_DEBUG_ERROR,
+				 "extended modes used with legacy protocol\n");
 			err = -EINVAL;
 			goto error;
 		}
@@ -329,8 +346,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
 
 		break;
 	default:
-		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
-			   mode, mode & S_IFMT);
+		p9_debug(P9_DEBUG_ERROR, "BAD mode 0x%hx S_IFMT 0x%x\n",
+			 mode, mode & S_IFMT);
 		err = -EINVAL;
 		goto error;
 	}
@@ -352,11 +369,12 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
 	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
+	p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
 
 	inode = new_inode(sb);
 	if (!inode) {
-		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
+		pr_warn("%s (%d): Problem allocating inode\n",
+			__func__, task_pid_nr(current));
 		return ERR_PTR(-ENOMEM);
 	}
 	err = v9fs_init_inode(v9ses, inode, mode, rdev);
@@ -573,15 +591,15 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
 	struct p9_fid *v9fid, *dfid;
 	struct v9fs_session_info *v9ses;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
-		   dir, dentry, flags);
+	p9_debug(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %x\n",
+		 dir, dentry, flags);
 
 	v9ses = v9fs_inode2v9ses(dir);
 	inode = dentry->d_inode;
 	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		retval = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
 		return retval;
 	}
 	if (v9fs_proto_dotl(v9ses))
@@ -630,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	struct p9_fid *dfid, *ofid, *fid;
 	struct inode *inode;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
 
 	err = 0;
 	ofid = NULL;
@@ -639,7 +657,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	dfid = v9fs_fid_lookup(dentry->d_parent);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
-		P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
+		p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
 		return ERR_PTR(err);
 	}
 
@@ -647,36 +665,41 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	ofid = p9_client_walk(dfid, 0, NULL, 1);
 	if (IS_ERR(ofid)) {
 		err = PTR_ERR(ofid);
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
+		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
		return ERR_PTR(err);
 	}
 
 	err = p9_client_fcreate(ofid, name, perm, mode, extension);
 	if (err < 0) {
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
-		goto error;
-	}
-
-	/* now walk from the parent so we can get unopened fid */
-	fid = p9_client_walk(dfid, 1, &name, 1);
-	if (IS_ERR(fid)) {
-		err = PTR_ERR(fid);
-		P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-		fid = NULL;
+		p9_debug(P9_DEBUG_VFS, "p9_client_fcreate failed %d\n", err);
 		goto error;
 	}
 
-	/* instantiate inode and assign the unopened fid to the dentry */
-	inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
-		goto error;
+	if (!(perm & P9_DMLINK)) {
+		/* now walk from the parent so we can get unopened fid */
+		fid = p9_client_walk(dfid, 1, &name, 1);
+		if (IS_ERR(fid)) {
+			err = PTR_ERR(fid);
+			p9_debug(P9_DEBUG_VFS,
+				 "p9_client_walk failed %d\n", err);
+			fid = NULL;
+			goto error;
+		}
+		/*
+		 * instantiate inode and assign the unopened fid to the dentry
+		 */
+		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+		if (IS_ERR(inode)) {
+			err = PTR_ERR(inode);
+			p9_debug(P9_DEBUG_VFS,
+				 "inode creation failed %d\n", err);
+			goto error;
+		}
+		err = v9fs_fid_add(dentry, fid);
+		if (err < 0)
+			goto error;
+		d_instantiate(dentry, inode);
 	}
-	err = v9fs_fid_add(dentry, fid);
-	if (err < 0)
-		goto error;
-	d_instantiate(dentry, inode);
 	return ofid;
 error:
 	if (ofid)
@@ -788,7 +811,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 	struct p9_fid *fid;
 	struct v9fs_session_info *v9ses;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
 	err = 0;
 	v9ses = v9fs_inode2v9ses(dir);
 	perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
@@ -826,8 +849,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	char *name;
 	int result = 0;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
-		   dir, dentry->d_name.name, dentry, nameidata);
+	p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n",
+		 dir, dentry->d_name.name, dentry, nameidata);
 
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -933,7 +956,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct p9_fid *newdirfid;
 	struct p9_wstat wstat;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	p9_debug(P9_DEBUG_VFS, "\n");
 	retval = 0;
 	old_inode = old_dentry->d_inode;
 	new_inode = new_dentry->d_inode;
@@ -969,8 +992,7 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 * 9P .u can only handle file rename in the same directory
 		 */
 
-		P9_DPRINTK(P9_DEBUG_ERROR,
-				"old dir and new dir are different\n");
+		p9_debug(P9_DEBUG_ERROR, "old dir and new dir are different\n");
 		retval = -EXDEV;
 		goto clunk_newdir;
 	}
@@ -1026,7 +1048,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	struct p9_fid *fid;
 	struct p9_wstat *st;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
+	p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
 	err = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
@@ -1063,7 +1085,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	struct p9_fid *fid;
 	struct p9_wstat wstat;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "\n");
+	p9_debug(P9_DEBUG_VFS, "\n");
 	retval = inode_change_ok(dentry->d_inode, iattr);
 	if (retval)
 		return retval;
@@ -1162,7 +1184,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 			set_nlink(inode, i_nlink);
 		}
 	}
-	mode = stat->mode & S_IALLUGO;
+	mode = p9mode2perm(v9ses, stat);
 	mode |= inode->i_mode & ~S_IALLUGO;
 	inode->i_mode = mode;
 	i_size_write(inode, stat->length);
@@ -1208,7 +1230,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	struct p9_fid *fid;
 	struct p9_wstat *st;
 
-	P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
 	retval = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_lookup(dentry);
@@ -1230,8 +1252,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	/* copy extension buffer into buffer */
 	strncpy(buffer, st->extension, buflen);
 
-	P9_DPRINTK(P9_DEBUG_VFS,
-		"%s -> %s (%s)\n", dentry->d_name.name, st->extension, buffer);
+	p9_debug(P9_DEBUG_VFS, "%s -> %s (%s)\n",
+		 dentry->d_name.name, st->extension, buffer);
 
 	retval = strnlen(buffer, buflen);
 done:
@@ -1252,7 +1274,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	int len = 0;
 	char *link = __getname();
 
-	P9_DPRINTK(P9_DEBUG_VFS, "%s n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
 
 	if (!link)
 		link = ERR_PTR(-ENOMEM);
@@ -1283,8 +1305,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
 	char *s = nd_get_link(nd);
 
-	P9_DPRINTK(P9_DEBUG_VFS, " %s %s\n", dentry->d_name.name,
-		   IS_ERR(s) ? "<error>" : s);
+	p9_debug(P9_DEBUG_VFS, " %s %s\n",
+		 dentry->d_name.name, IS_ERR(s) ? "<error>" : s);
 	if (!IS_ERR(s))
 		__putname(s);
 }
@@ -1306,7 +1328,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 
 	v9ses = v9fs_inode2v9ses(dir);
 	if (!v9fs_proto_dotu(v9ses)) {
-		P9_DPRINTK(P9_DEBUG_ERROR, "not extended\n");
+		p9_debug(P9_DEBUG_ERROR, "not extended\n");
 		return -EPERM;
 	}
 
@@ -1333,8 +1355,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 static int
 v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
-	P9_DPRINTK(P9_DEBUG_VFS, " %lu,%s,%s\n", dir->i_ino,
-		   dentry->d_name.name, symname);
+	p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+		 dir->i_ino, dentry->d_name.name, symname);
 
 	return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
 }
@@ -1355,9 +1377,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	char *name;
 	struct p9_fid *oldfid;
 
-	P9_DPRINTK(P9_DEBUG_VFS,
-		" %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
-		old_dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
+		 dir->i_ino, dentry->d_name.name, old_dentry->d_name.name);
 
 	oldfid = v9fs_fid_clone(old_dentry);
 	if (IS_ERR(oldfid))
@@ -1398,9 +1419,9 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
 	char *name;
 	u32 perm;
 
-	P9_DPRINTK(P9_DEBUG_VFS,
-		" %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino,
-		dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+	p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
+		 dir->i_ino, dentry->d_name.name, mode,
+		 MAJOR(rdev), MINOR(rdev));
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 8ef152ac6a16..a1e6c990cd41 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -283,13 +283,13 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
283 } 283 }
284 284
285 name = (char *) dentry->d_name.name; 285 name = (char *) dentry->d_name.name;
286 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_create_dotl: name:%s flags:0x%x " 286 p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
287 "mode:0x%hx\n", name, flags, omode); 287 name, flags, omode);
288 288
289 dfid = v9fs_fid_lookup(dentry->d_parent); 289 dfid = v9fs_fid_lookup(dentry->d_parent);
290 if (IS_ERR(dfid)) { 290 if (IS_ERR(dfid)) {
291 err = PTR_ERR(dfid); 291 err = PTR_ERR(dfid);
292 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); 292 p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
293 return err; 293 return err;
294 } 294 }
295 295
@@ -297,7 +297,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
297 ofid = p9_client_walk(dfid, 0, NULL, 1); 297 ofid = p9_client_walk(dfid, 0, NULL, 1);
298 if (IS_ERR(ofid)) { 298 if (IS_ERR(ofid)) {
299 err = PTR_ERR(ofid); 299 err = PTR_ERR(ofid);
300 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 300 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
301 return err; 301 return err;
302 } 302 }
303 303
@@ -307,16 +307,15 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
307 /* Update mode based on ACL value */ 307 /* Update mode based on ACL value */
308 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); 308 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
309 if (err) { 309 if (err) {
310 P9_DPRINTK(P9_DEBUG_VFS, 310 p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n",
311 "Failed to get acl values in creat %d\n", err); 311 err);
312 goto error; 312 goto error;
313 } 313 }
314 err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), 314 err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags),
315 mode, gid, &qid); 315 mode, gid, &qid);
316 if (err < 0) { 316 if (err < 0) {
317 P9_DPRINTK(P9_DEBUG_VFS, 317 p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n",
318 "p9_client_open_dotl failed in creat %d\n", 318 err);
319 err);
320 goto error; 319 goto error;
321 } 320 }
322 v9fs_invalidate_inode_attr(dir); 321 v9fs_invalidate_inode_attr(dir);
@@ -325,14 +324,14 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
325 fid = p9_client_walk(dfid, 1, &name, 1); 324 fid = p9_client_walk(dfid, 1, &name, 1);
326 if (IS_ERR(fid)) { 325 if (IS_ERR(fid)) {
327 err = PTR_ERR(fid); 326 err = PTR_ERR(fid);
328 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 327 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
329 fid = NULL; 328 fid = NULL;
330 goto error; 329 goto error;
331 } 330 }
332 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); 331 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
333 if (IS_ERR(inode)) { 332 if (IS_ERR(inode)) {
334 err = PTR_ERR(inode); 333 err = PTR_ERR(inode);
335 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 334 p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
336 goto error; 335 goto error;
337 } 336 }
338 err = v9fs_fid_add(dentry, fid); 337 err = v9fs_fid_add(dentry, fid);
@@ -408,7 +407,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
408 struct dentry *dir_dentry; 407 struct dentry *dir_dentry;
409 struct posix_acl *dacl = NULL, *pacl = NULL; 408 struct posix_acl *dacl = NULL, *pacl = NULL;
410 409
411 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); 410 p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
412 err = 0; 411 err = 0;
413 v9ses = v9fs_inode2v9ses(dir); 412 v9ses = v9fs_inode2v9ses(dir);
414 413
@@ -420,7 +419,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
420 dfid = v9fs_fid_lookup(dir_dentry); 419 dfid = v9fs_fid_lookup(dir_dentry);
421 if (IS_ERR(dfid)) { 420 if (IS_ERR(dfid)) {
422 err = PTR_ERR(dfid); 421 err = PTR_ERR(dfid);
423 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); 422 p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
424 dfid = NULL; 423 dfid = NULL;
425 goto error; 424 goto error;
426 } 425 }
@@ -430,8 +429,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
430 /* Update mode based on ACL value */ 429 /* Update mode based on ACL value */
431 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); 430 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
432 if (err) { 431 if (err) {
433 P9_DPRINTK(P9_DEBUG_VFS, 432 p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mkdir %d\n",
434 "Failed to get acl values in mkdir %d\n", err); 433 err);
435 goto error; 434 goto error;
436 } 435 }
437 name = (char *) dentry->d_name.name; 436 name = (char *) dentry->d_name.name;
@@ -444,8 +443,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
444 fid = p9_client_walk(dfid, 1, &name, 1); 443 fid = p9_client_walk(dfid, 1, &name, 1);
445 if (IS_ERR(fid)) { 444 if (IS_ERR(fid)) {
446 err = PTR_ERR(fid); 445 err = PTR_ERR(fid);
447 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", 446 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
448 err); 447 err);
449 fid = NULL; 448 fid = NULL;
450 goto error; 449 goto error;
451 } 450 }
@@ -453,8 +452,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
453 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); 452 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
454 if (IS_ERR(inode)) { 453 if (IS_ERR(inode)) {
455 err = PTR_ERR(inode); 454 err = PTR_ERR(inode);
456 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 455 p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
457 err); 456 err);
458 goto error; 457 goto error;
459 } 458 }
460 err = v9fs_fid_add(dentry, fid); 459 err = v9fs_fid_add(dentry, fid);
@@ -495,7 +494,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
495 struct p9_fid *fid; 494 struct p9_fid *fid;
496 struct p9_stat_dotl *st; 495 struct p9_stat_dotl *st;
497 496
498 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 497 p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
499 err = -EPERM; 498 err = -EPERM;
500 v9ses = v9fs_dentry2v9ses(dentry); 499 v9ses = v9fs_dentry2v9ses(dentry);
501 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 500 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
@@ -523,6 +522,46 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
523 return 0; 522 return 0;
524} 523}
525 524
525/*
526 * Attribute flags.
527 */
528#define P9_ATTR_MODE (1 << 0)
529#define P9_ATTR_UID (1 << 1)
530#define P9_ATTR_GID (1 << 2)
531#define P9_ATTR_SIZE (1 << 3)
532#define P9_ATTR_ATIME (1 << 4)
533#define P9_ATTR_MTIME (1 << 5)
534#define P9_ATTR_CTIME (1 << 6)
535#define P9_ATTR_ATIME_SET (1 << 7)
536#define P9_ATTR_MTIME_SET (1 << 8)
537
538struct dotl_iattr_map {
539 int iattr_valid;
540 int p9_iattr_valid;
541};
542
543static int v9fs_mapped_iattr_valid(int iattr_valid)
544{
545 int i;
546 int p9_iattr_valid = 0;
547 struct dotl_iattr_map dotl_iattr_map[] = {
548 { ATTR_MODE, P9_ATTR_MODE },
549 { ATTR_UID, P9_ATTR_UID },
550 { ATTR_GID, P9_ATTR_GID },
551 { ATTR_SIZE, P9_ATTR_SIZE },
552 { ATTR_ATIME, P9_ATTR_ATIME },
553 { ATTR_MTIME, P9_ATTR_MTIME },
554 { ATTR_CTIME, P9_ATTR_CTIME },
555 { ATTR_ATIME_SET, P9_ATTR_ATIME_SET },
556 { ATTR_MTIME_SET, P9_ATTR_MTIME_SET },
557 };
558 for (i = 0; i < ARRAY_SIZE(dotl_iattr_map); i++) {
559 if (iattr_valid & dotl_iattr_map[i].iattr_valid)
560 p9_iattr_valid |= dotl_iattr_map[i].p9_iattr_valid;
561 }
562 return p9_iattr_valid;
563}
564
526/** 565/**
527 * v9fs_vfs_setattr_dotl - set file metadata 566 * v9fs_vfs_setattr_dotl - set file metadata
528 * @dentry: file whose metadata to set 567 * @dentry: file whose metadata to set
@@ -537,13 +576,13 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
537 struct p9_fid *fid; 576 struct p9_fid *fid;
538 struct p9_iattr_dotl p9attr; 577 struct p9_iattr_dotl p9attr;
539 578
540 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 579 p9_debug(P9_DEBUG_VFS, "\n");
541 580
542 retval = inode_change_ok(dentry->d_inode, iattr); 581 retval = inode_change_ok(dentry->d_inode, iattr);
543 if (retval) 582 if (retval)
544 return retval; 583 return retval;
545 584
546 p9attr.valid = iattr->ia_valid; 585 p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid);
547 p9attr.mode = iattr->ia_mode; 586 p9attr.mode = iattr->ia_mode;
548 p9attr.uid = iattr->ia_uid; 587 p9attr.uid = iattr->ia_uid;
549 p9attr.gid = iattr->ia_gid; 588 p9attr.gid = iattr->ia_gid;
@@ -670,14 +709,13 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
670 struct v9fs_session_info *v9ses; 709 struct v9fs_session_info *v9ses;
671 710
672 name = (char *) dentry->d_name.name; 711 name = (char *) dentry->d_name.name;
673 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", 712 p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
674 dir->i_ino, name, symname);
675 v9ses = v9fs_inode2v9ses(dir); 713 v9ses = v9fs_inode2v9ses(dir);
676 714
677 dfid = v9fs_fid_lookup(dentry->d_parent); 715 dfid = v9fs_fid_lookup(dentry->d_parent);
678 if (IS_ERR(dfid)) { 716 if (IS_ERR(dfid)) {
679 err = PTR_ERR(dfid); 717 err = PTR_ERR(dfid);
680 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); 718 p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
681 return err; 719 return err;
682 } 720 }
683 721
@@ -687,7 +725,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
687 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid); 725 err = p9_client_symlink(dfid, name, (char *)symname, gid, &qid);
688 726
689 if (err < 0) { 727 if (err < 0) {
690 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err); 728 p9_debug(P9_DEBUG_VFS, "p9_client_symlink failed %d\n", err);
691 goto error; 729 goto error;
692 } 730 }
693 731
@@ -697,8 +735,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
697 fid = p9_client_walk(dfid, 1, &name, 1); 735 fid = p9_client_walk(dfid, 1, &name, 1);
698 if (IS_ERR(fid)) { 736 if (IS_ERR(fid)) {
699 err = PTR_ERR(fid); 737 err = PTR_ERR(fid);
700 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", 738 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
701 err); 739 err);
702 fid = NULL; 740 fid = NULL;
703 goto error; 741 goto error;
704 } 742 }
@@ -707,8 +745,8 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
707 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); 745 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
708 if (IS_ERR(inode)) { 746 if (IS_ERR(inode)) {
709 err = PTR_ERR(inode); 747 err = PTR_ERR(inode);
710 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 748 p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
711 err); 749 err);
712 goto error; 750 goto error;
713 } 751 }
714 err = v9fs_fid_add(dentry, fid); 752 err = v9fs_fid_add(dentry, fid);
@@ -751,9 +789,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
751 struct p9_fid *dfid, *oldfid; 789 struct p9_fid *dfid, *oldfid;
752 struct v9fs_session_info *v9ses; 790 struct v9fs_session_info *v9ses;
753 791
754 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", 792 p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
755 dir->i_ino, old_dentry->d_name.name, 793 dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
756 dentry->d_name.name);
757 794
758 v9ses = v9fs_inode2v9ses(dir); 795 v9ses = v9fs_inode2v9ses(dir);
759 dir_dentry = v9fs_dentry_from_dir_inode(dir); 796 dir_dentry = v9fs_dentry_from_dir_inode(dir);
@@ -770,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
770 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name); 807 err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
771 808
772 if (err < 0) { 809 if (err < 0) {
773 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_link failed %d\n", err); 810 p9_debug(P9_DEBUG_VFS, "p9_client_link failed %d\n", err);
774 return err; 811 return err;
775 } 812 }
776 813
@@ -813,9 +850,9 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
813 struct dentry *dir_dentry; 850 struct dentry *dir_dentry;
814 struct posix_acl *dacl = NULL, *pacl = NULL; 851 struct posix_acl *dacl = NULL, *pacl = NULL;
815 852
816 P9_DPRINTK(P9_DEBUG_VFS, 853 p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
817 " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", dir->i_ino, 854 dir->i_ino, dentry->d_name.name, omode,
818 dentry->d_name.name, omode, MAJOR(rdev), MINOR(rdev)); 855 MAJOR(rdev), MINOR(rdev));
819 856
820 if (!new_valid_dev(rdev)) 857 if (!new_valid_dev(rdev))
821 return -EINVAL; 858 return -EINVAL;
@@ -825,7 +862,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
825 dfid = v9fs_fid_lookup(dir_dentry); 862 dfid = v9fs_fid_lookup(dir_dentry);
826 if (IS_ERR(dfid)) { 863 if (IS_ERR(dfid)) {
827 err = PTR_ERR(dfid); 864 err = PTR_ERR(dfid);
828 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err); 865 p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
829 dfid = NULL; 866 dfid = NULL;
830 goto error; 867 goto error;
831 } 868 }
@@ -835,8 +872,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
835 /* Update mode based on ACL value */ 872 /* Update mode based on ACL value */
836 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl); 873 err = v9fs_acl_mode(dir, &mode, &dacl, &pacl);
837 if (err) { 874 if (err) {
838 P9_DPRINTK(P9_DEBUG_VFS, 875 p9_debug(P9_DEBUG_VFS, "Failed to get acl values in mknod %d\n",
839 "Failed to get acl values in mknod %d\n", err); 876 err);
840 goto error; 877 goto error;
841 } 878 }
842 name = (char *) dentry->d_name.name; 879 name = (char *) dentry->d_name.name;
@@ -851,8 +888,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
851 fid = p9_client_walk(dfid, 1, &name, 1); 888 fid = p9_client_walk(dfid, 1, &name, 1);
852 if (IS_ERR(fid)) { 889 if (IS_ERR(fid)) {
853 err = PTR_ERR(fid); 890 err = PTR_ERR(fid);
854 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", 891 p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
855 err); 892 err);
856 fid = NULL; 893 fid = NULL;
857 goto error; 894 goto error;
858 } 895 }
@@ -860,8 +897,8 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
860 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb); 897 inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
861 if (IS_ERR(inode)) { 898 if (IS_ERR(inode)) {
862 err = PTR_ERR(inode); 899 err = PTR_ERR(inode);
863 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 900 p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
864 err); 901 err);
865 goto error; 902 goto error;
866 } 903 }
867 err = v9fs_fid_add(dentry, fid); 904 err = v9fs_fid_add(dentry, fid);
@@ -905,7 +942,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
905 char *link = __getname(); 942 char *link = __getname();
906 char *target; 943 char *target;
907 944
908 P9_DPRINTK(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); 945 p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
909 946
910 if (!link) { 947 if (!link) {
911 link = ERR_PTR(-ENOMEM); 948 link = ERR_PTR(-ENOMEM);
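
The setattr hunk above stops copying iattr->ia_valid straight into p9attr.valid and routes it through the new v9fs_mapped_iattr_valid() instead, so only bits the 9P2000.L protocol actually defines go out on the wire. A minimal userspace sketch of the same table-driven translation; the ATTR_* values below are made-up stand-ins for the kernel flags, chosen to differ from the P9_ATTR_* bits so the effect of the mapping is visible:

    #include <stdio.h>

    /* wire-format bits, as defined by the patch for 9P2000.L setattr */
    #define P9_ATTR_MODE (1 << 0)
    #define P9_ATTR_UID  (1 << 1)
    #define P9_ATTR_GID  (1 << 2)
    #define P9_ATTR_SIZE (1 << 3)

    /* stand-ins for the kernel's ATTR_* flags; values are illustrative
     * and deliberately different from the P9_ATTR_* ones */
    #define ATTR_MODE (1 << 2)
    #define ATTR_UID  (1 << 4)
    #define ATTR_GID  (1 << 5)
    #define ATTR_SIZE (1 << 7)

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct iattr_map {
            int iattr_valid;        /* VFS-side flag */
            int p9_iattr_valid;     /* bit to put on the wire */
    };

    static int map_iattr_valid(int iattr_valid)
    {
            static const struct iattr_map tbl[] = {
                    { ATTR_MODE, P9_ATTR_MODE },
                    { ATTR_UID,  P9_ATTR_UID  },
                    { ATTR_GID,  P9_ATTR_GID  },
                    { ATTR_SIZE, P9_ATTR_SIZE },
            };
            int out = 0;
            size_t i;

            for (i = 0; i < ARRAY_SIZE(tbl); i++)
                    if (iattr_valid & tbl[i].iattr_valid)
                            out |= tbl[i].p9_iattr_valid;
            return out;     /* unknown VFS bits are dropped, not leaked */
    }

    int main(void)
    {
            /* chmod + truncate: ATTR_MODE|ATTR_SIZE maps to 0x9 here */
            printf("0x%x\n", map_iattr_valid(ATTR_MODE | ATTR_SIZE));
            return 0;
    }
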
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f68ff65a32a5..7b0cd87b07c2 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -121,7 +121,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
121 struct p9_fid *fid; 121 struct p9_fid *fid;
122 int retval = 0; 122 int retval = 0;
123 123
124 P9_DPRINTK(P9_DEBUG_VFS, " \n"); 124 p9_debug(P9_DEBUG_VFS, "\n");
125 125
126 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); 126 v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
127 if (!v9ses) 127 if (!v9ses)
@@ -191,7 +191,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
191 goto release_sb; 191 goto release_sb;
192 v9fs_fid_add(root, fid); 192 v9fs_fid_add(root, fid);
193 193
194 P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); 194 p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n");
195 return dget(sb->s_root); 195 return dget(sb->s_root);
196 196
197clunk_fid: 197clunk_fid:
@@ -223,7 +223,7 @@ static void v9fs_kill_super(struct super_block *s)
223{ 223{
224 struct v9fs_session_info *v9ses = s->s_fs_info; 224 struct v9fs_session_info *v9ses = s->s_fs_info;
225 225
226 P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); 226 p9_debug(P9_DEBUG_VFS, " %p\n", s);
227 227
228 kill_anon_super(s); 228 kill_anon_super(s);
229 229
@@ -231,7 +231,7 @@ static void v9fs_kill_super(struct super_block *s)
231 v9fs_session_close(v9ses); 231 v9fs_session_close(v9ses);
232 kfree(v9ses); 232 kfree(v9ses);
233 s->s_fs_info = NULL; 233 s->s_fs_info = NULL;
234 P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); 234 p9_debug(P9_DEBUG_VFS, "exiting kill_super\n");
235} 235}
236 236
237static void 237static void
@@ -303,7 +303,7 @@ static int v9fs_write_inode(struct inode *inode,
303 * send an fsync request to server irrespective of 303 * send an fsync request to server irrespective of
304 * wbc->sync_mode. 304 * wbc->sync_mode.
305 */ 305 */
306 P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); 306 p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
307 v9inode = V9FS_I(inode); 307 v9inode = V9FS_I(inode);
308 if (!v9inode->writeback_fid) 308 if (!v9inode->writeback_fid)
309 return 0; 309 return 0;
@@ -326,7 +326,7 @@ static int v9fs_write_inode_dotl(struct inode *inode,
326 * send an fsync request to server irrespective of 326 * send an fsync request to server irrespective of
327 * wbc->sync_mode. 327 * wbc->sync_mode.
328 */ 328 */
329 P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); 329 p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
330 v9inode = V9FS_I(inode); 330 v9inode = V9FS_I(inode);
331 if (!v9inode->writeback_fid) 331 if (!v9inode->writeback_fid)
332 return 0; 332 return 0;
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index d288773871b3..29653b70a9c3 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -32,8 +32,8 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name,
32 attr_fid = p9_client_xattrwalk(fid, name, &attr_size); 32 attr_fid = p9_client_xattrwalk(fid, name, &attr_size);
33 if (IS_ERR(attr_fid)) { 33 if (IS_ERR(attr_fid)) {
34 retval = PTR_ERR(attr_fid); 34 retval = PTR_ERR(attr_fid);
35 P9_DPRINTK(P9_DEBUG_VFS, 35 p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n",
36 "p9_client_attrwalk failed %zd\n", retval); 36 retval);
37 attr_fid = NULL; 37 attr_fid = NULL;
38 goto error; 38 goto error;
39 } 39 }
@@ -87,8 +87,8 @@ ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name,
87{ 87{
88 struct p9_fid *fid; 88 struct p9_fid *fid;
89 89
90 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu\n", 90 p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
91 __func__, name, buffer_size); 91 name, buffer_size);
92 fid = v9fs_fid_lookup(dentry); 92 fid = v9fs_fid_lookup(dentry);
93 if (IS_ERR(fid)) 93 if (IS_ERR(fid))
94 return PTR_ERR(fid); 94 return PTR_ERR(fid);
@@ -115,8 +115,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
115 int retval, msize, write_count; 115 int retval, msize, write_count;
116 struct p9_fid *fid = NULL; 116 struct p9_fid *fid = NULL;
117 117
118 P9_DPRINTK(P9_DEBUG_VFS, "%s: name = %s value_len = %zu flags = %d\n", 118 p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n",
119 __func__, name, value_len, flags); 119 name, value_len, flags);
120 120
121 fid = v9fs_fid_clone(dentry); 121 fid = v9fs_fid_clone(dentry);
122 if (IS_ERR(fid)) { 122 if (IS_ERR(fid)) {
@@ -129,8 +129,8 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name,
129 */ 129 */
130 retval = p9_client_xattrcreate(fid, name, value_len, flags); 130 retval = p9_client_xattrcreate(fid, name, value_len, flags);
131 if (retval < 0) { 131 if (retval < 0) {
132 P9_DPRINTK(P9_DEBUG_VFS, 132 p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
133 "p9_client_xattrcreate failed %d\n", retval); 133 retval);
134 goto error; 134 goto error;
135 } 135 }
136 msize = fid->clnt->msize; 136 msize = fid->clnt->msize;
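
Note how the xattr.c hunks above drop the explicit "%s"/__func__ pair from the format string: the conversion only makes sense if p9_debug() identifies the calling function itself (the vfs_super.c hunks that still pass __func__ are merely redundant). A userspace sketch of such a macro, assuming it wraps a worker that receives __func__ and checks a runtime debug mask — the real helper lives in the 9p core, so treat this body as illustrative:

    #include <stdio.h>
    #include <stdarg.h>

    /* debug classes, mirroring the P9_DEBUG_* bits used in the diff */
    enum { P9_DEBUG_ERROR = 1 << 0, P9_DEBUG_VFS = 1 << 2 };

    /* runtime mask; the kernel uses a module parameter for this */
    static unsigned int p9_debug_level = P9_DEBUG_VFS;

    static void _p9_debug(unsigned int level, const char *func,
                          const char *fmt, ...)
    {
            va_list ap;

            if (!(p9_debug_level & level))
                    return;
            fprintf(stderr, "%s: ", func);  /* caller name prepended once, here */
            va_start(ap, fmt);
            vfprintf(stderr, fmt, ap);
            va_end(ap);
    }

    /* ##__VA_ARGS__ (a GNU extension used throughout the kernel) swallows
     * the comma when no arguments follow fmt */
    #define p9_debug(level, fmt, ...) \
            _p9_debug(level, __func__, fmt, ##__VA_ARGS__)

    int main(void)
    {
            p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu\n",
                     "user.test", (size_t)16);
            p9_debug(P9_DEBUG_ERROR, "suppressed by the mask\n");
            return 0;
    }
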
diff --git a/fs/Kconfig b/fs/Kconfig
index 30145d886bc2..d621f02a3f9e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -218,6 +218,8 @@ source "fs/exofs/Kconfig"
218 218
219endif # MISC_FILESYSTEMS 219endif # MISC_FILESYSTEMS
220 220
221source "fs/exofs/Kconfig.ore"
222
221menuconfig NETWORK_FILESYSTEMS 223menuconfig NETWORK_FILESYSTEMS
222 bool "Network File Systems" 224 bool "Network File Systems"
223 default y 225 default y
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7973b7..e95d1b64082c 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF
27 bool 27 bool
28 depends on COMPAT && BINFMT_ELF 28 depends on COMPAT && BINFMT_ELF
29 29
30config ARCH_BINFMT_ELF_RANDOMIZE_PIE
31 bool
32
30config BINFMT_ELF_FDPIC 33config BINFMT_ELF_FDPIC
31 bool "Kernel support for FDPIC ELF binaries" 34 bool "Kernel support for FDPIC ELF binaries"
32 default y 35 default y
diff --git a/fs/aio.c b/fs/aio.c
index 78c514cfd212..969beb0e2231 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -476,14 +476,21 @@ static void kiocb_batch_init(struct kiocb_batch *batch, long total)
476 batch->count = total; 476 batch->count = total;
477} 477}
478 478
479static void kiocb_batch_free(struct kiocb_batch *batch) 479static void kiocb_batch_free(struct kioctx *ctx, struct kiocb_batch *batch)
480{ 480{
481 struct kiocb *req, *n; 481 struct kiocb *req, *n;
482 482
483 if (list_empty(&batch->head))
484 return;
485
486 spin_lock_irq(&ctx->ctx_lock);
483 list_for_each_entry_safe(req, n, &batch->head, ki_batch) { 487 list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
484 list_del(&req->ki_batch); 488 list_del(&req->ki_batch);
489 list_del(&req->ki_list);
485 kmem_cache_free(kiocb_cachep, req); 490 kmem_cache_free(kiocb_cachep, req);
491 ctx->reqs_active--;
486 } 492 }
493 spin_unlock_irq(&ctx->ctx_lock);
487} 494}
488 495
489/* 496/*
@@ -1742,7 +1749,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1742 } 1749 }
1743 blk_finish_plug(&plug); 1750 blk_finish_plug(&plug);
1744 1751
1745 kiocb_batch_free(&batch); 1752 kiocb_batch_free(ctx, &batch);
1746 put_ioctx(ctx); 1753 put_ioctx(ctx);
1747 return i ? i : ret; 1754 return i ? i : ret;
1748} 1755}
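
The fix above hands the ioctx to kiocb_batch_free() so leftover batch entries — already linked into the context when they were allocated — can be unhooked from ki_list and subtracted from reqs_active under ctx_lock; before, they were freed but their accounting leaked. A reduced sketch of the pattern, with a pthread mutex standing in for spin_lock_irq() and plain pointers for the kernel list macros (all names local to the sketch):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* a request sits on the submitter's private batch list and, once
     * allocated from the context, is also accounted there */
    struct kiocb {
            struct kiocb *batch_next;       /* the batch list (ki_batch) */
            int on_ctx_list;                /* membership on ctx lists (ki_list) */
    };

    struct kioctx {
            pthread_mutex_t ctx_lock;       /* spin_lock_irq(&ctx->ctx_lock) */
            int reqs_active;
    };

    /* free leftover batch entries; like the fixed kernel version, this
     * takes the context so per-context accounting drops under the lock */
    static void kiocb_batch_free(struct kioctx *ctx, struct kiocb *batch)
    {
            struct kiocb *req, *n;

            if (!batch)                     /* mirrors the list_empty() fast path */
                    return;

            pthread_mutex_lock(&ctx->ctx_lock);
            for (req = batch; req; req = n) {
                    n = req->batch_next;
                    if (req->on_ctx_list)   /* also unhook from the context */
                            ctx->reqs_active--;
                    free(req);
            }
            pthread_mutex_unlock(&ctx->ctx_lock);
    }

    int main(void)
    {
            struct kioctx ctx = { PTHREAD_MUTEX_INITIALIZER, 0 };
            struct kiocb *a = calloc(1, sizeof(*a));
            struct kiocb *b = calloc(1, sizeof(*b));

            a->on_ctx_list = b->on_ctx_list = 1;
            ctx.reqs_active = 2;
            a->batch_next = b;

            kiocb_batch_free(&ctx, a);
            printf("reqs_active = %d\n", ctx.reqs_active); /* 0, not leaked */
            return 0;
    }
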
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 5869d4e974a9..d8d8e7ba6a1e 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -116,6 +116,7 @@ struct autofs_sb_info {
116 int needs_reghost; 116 int needs_reghost;
117 struct super_block *sb; 117 struct super_block *sb;
118 struct mutex wq_mutex; 118 struct mutex wq_mutex;
119 struct mutex pipe_mutex;
119 spinlock_t fs_lock; 120 spinlock_t fs_lock;
120 struct autofs_wait_queue *queues; /* Wait queue pointer */ 121 struct autofs_wait_queue *queues; /* Wait queue pointer */
121 spinlock_t lookup_lock; 122 spinlock_t lookup_lock;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 2ba44c79d548..e16980b00b8d 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -225,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
225 sbi->min_proto = 0; 225 sbi->min_proto = 0;
226 sbi->max_proto = 0; 226 sbi->max_proto = 0;
227 mutex_init(&sbi->wq_mutex); 227 mutex_init(&sbi->wq_mutex);
228 mutex_init(&sbi->pipe_mutex);
228 spin_lock_init(&sbi->fs_lock); 229 spin_lock_init(&sbi->fs_lock);
229 sbi->queues = NULL; 230 sbi->queues = NULL;
230 spin_lock_init(&sbi->lookup_lock); 231 spin_lock_init(&sbi->lookup_lock);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e1fbdeef85db..da8876d38a7b 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
56 mutex_unlock(&sbi->wq_mutex); 56 mutex_unlock(&sbi->wq_mutex);
57} 57}
58 58
59static int autofs4_write(struct file *file, const void *addr, int bytes) 59static int autofs4_write(struct autofs_sb_info *sbi,
60 struct file *file, const void *addr, int bytes)
60{ 61{
61 unsigned long sigpipe, flags; 62 unsigned long sigpipe, flags;
62 mm_segment_t fs; 63 mm_segment_t fs;
63 const char *data = (const char *)addr; 64 const char *data = (const char *)addr;
64 ssize_t wr = 0; 65 ssize_t wr = 0;
65 66
66 /** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
67
68 sigpipe = sigismember(&current->pending.signal, SIGPIPE); 67 sigpipe = sigismember(&current->pending.signal, SIGPIPE);
69 68
70 /* Save pointer to user space and point back to kernel space */ 69 /* Save pointer to user space and point back to kernel space */
71 fs = get_fs(); 70 fs = get_fs();
72 set_fs(KERNEL_DS); 71 set_fs(KERNEL_DS);
73 72
73 mutex_lock(&sbi->pipe_mutex);
74 while (bytes && 74 while (bytes &&
75 (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) { 75 (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
76 data += wr; 76 data += wr;
77 bytes -= wr; 77 bytes -= wr;
78 } 78 }
79 mutex_unlock(&sbi->pipe_mutex);
79 80
80 set_fs(fs); 81 set_fs(fs);
81 82
@@ -110,6 +111,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
110 111
111 pkt.hdr.proto_version = sbi->version; 112 pkt.hdr.proto_version = sbi->version;
112 pkt.hdr.type = type; 113 pkt.hdr.type = type;
114 mutex_lock(&sbi->wq_mutex);
115
116 /* Check if we have become catatonic */
117 if (sbi->catatonic) {
118 mutex_unlock(&sbi->wq_mutex);
119 return;
120 }
113 switch (type) { 121 switch (type) {
114 /* Kernel protocol v4 missing and expire packets */ 122 /* Kernel protocol v4 missing and expire packets */
115 case autofs_ptype_missing: 123 case autofs_ptype_missing:
@@ -163,22 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
163 } 171 }
164 default: 172 default:
165 printk("autofs4_notify_daemon: bad type %d!\n", type); 173 printk("autofs4_notify_daemon: bad type %d!\n", type);
174 mutex_unlock(&sbi->wq_mutex);
166 return; 175 return;
167 } 176 }
168 177
169 /* Check if we have become catatonic */ 178 pipe = sbi->pipe;
170 mutex_lock(&sbi->wq_mutex); 179 get_file(pipe);
171 if (!sbi->catatonic) { 180
172 pipe = sbi->pipe;
173 get_file(pipe);
174 }
175 mutex_unlock(&sbi->wq_mutex); 181 mutex_unlock(&sbi->wq_mutex);
176 182
177 if (pipe) { 183 if (autofs4_write(sbi, pipe, &pkt, pktsz))
178 if (autofs4_write(pipe, &pkt, pktsz)) 184 autofs4_catatonic_mode(sbi);
179 autofs4_catatonic_mode(sbi); 185 fput(pipe);
180 fput(pipe);
181 }
182} 186}
183 187
184static int autofs4_getpath(struct autofs_sb_info *sbi, 188static int autofs4_getpath(struct autofs_sb_info *sbi,
@@ -257,6 +261,9 @@ static int validate_request(struct autofs_wait_queue **wait,
257 struct autofs_wait_queue *wq; 261 struct autofs_wait_queue *wq;
258 struct autofs_info *ino; 262 struct autofs_info *ino;
259 263
264 if (sbi->catatonic)
265 return -ENOENT;
266
260 /* Wait in progress, continue; */ 267 /* Wait in progress, continue; */
261 wq = autofs4_find_wait(sbi, qstr); 268 wq = autofs4_find_wait(sbi, qstr);
262 if (wq) { 269 if (wq) {
@@ -289,6 +296,9 @@ static int validate_request(struct autofs_wait_queue **wait,
289 if (mutex_lock_interruptible(&sbi->wq_mutex)) 296 if (mutex_lock_interruptible(&sbi->wq_mutex))
290 return -EINTR; 297 return -EINTR;
291 298
299 if (sbi->catatonic)
300 return -ENOENT;
301
292 wq = autofs4_find_wait(sbi, qstr); 302 wq = autofs4_find_wait(sbi, qstr);
293 if (wq) { 303 if (wq) {
294 *wait = wq; 304 *wait = wq;
@@ -389,7 +399,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
389 399
390 ret = validate_request(&wq, sbi, &qstr, dentry, notify); 400 ret = validate_request(&wq, sbi, &qstr, dentry, notify);
391 if (ret <= 0) { 401 if (ret <= 0) {
392 if (ret == 0) 402 if (ret != -EINTR)
393 mutex_unlock(&sbi->wq_mutex); 403 mutex_unlock(&sbi->wq_mutex);
394 kfree(qstr.name); 404 kfree(qstr.name);
395 return ret; 405 return ret;
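
The waitq.c changes retire the old "not safe for writing more than PIPE_BUF bytes" warning by serializing all packet writes with the new per-superblock pipe_mutex: pipe writes beyond PIPE_BUF are not atomic, so two concurrent daemon notifications could otherwise interleave bytes mid-packet. A userspace sketch of the same write loop, with a POSIX pipe and a single global mutex in place of the per-sb one:

    #include <pthread.h>
    #include <unistd.h>

    static pthread_mutex_t pipe_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* write a whole packet; serialized so packets never interleave,
     * even when they exceed PIPE_BUF */
    static int pipe_write_packet(int fd, const void *addr, size_t bytes)
    {
            const char *data = addr;
            ssize_t wr;

            pthread_mutex_lock(&pipe_mutex);
            while (bytes &&
                   (wr = write(fd, data, bytes)) > 0) {
                    data += wr;
                    bytes -= wr;
            }
            pthread_mutex_unlock(&pipe_mutex);

            return bytes == 0 ? 0 : -1;     /* caller goes catatonic on failure */
    }

    int main(void)
    {
            int fds[2];
            char buf[8] = { 0 };

            if (pipe(fds))
                    return 1;
            pipe_write_packet(fds[1], "packet", 7);
            if (read(fds[0], buf, sizeof(buf)) < 0)
                    return 1;
            return buf[0] != 'p';
    }
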
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 21ac5ee4b43f..bcb884e2d613 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
794 * default mmap base, as well as whatever program they 794 * default mmap base, as well as whatever program they
795 * might try to exec. This is because the brk will 795 * might try to exec. This is because the brk will
796 * follow the loader, and is not movable. */ 796 * follow the loader, and is not movable. */
797#if defined(CONFIG_X86) || defined(CONFIG_ARM) 797#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
798 /* Memory randomization might have been switched off 798 /* Memory randomization might have been switched off
799 * in runtime via sysctl. 799 * in runtime via sysctl.
800 * If that is the case, retain the original non-zero 800 * If that is the case, retain the original non-zero
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 69a5b6fbee2b..0e575d1304b4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,7 +25,6 @@
25#include <linux/uio.h> 25#include <linux/uio.h>
26#include <linux/namei.h> 26#include <linux/namei.h>
27#include <linux/log2.h> 27#include <linux/log2.h>
28#include <linux/kmemleak.h>
29#include <linux/cleancache.h> 28#include <linux/cleancache.h>
30#include <asm/uaccess.h> 29#include <asm/uaccess.h>
31#include "internal.h" 30#include "internal.h"
@@ -521,7 +520,7 @@ static struct super_block *blockdev_superblock __read_mostly;
521void __init bdev_cache_init(void) 520void __init bdev_cache_init(void)
522{ 521{
523 int err; 522 int err;
524 struct vfsmount *bd_mnt; 523 static struct vfsmount *bd_mnt;
525 524
526 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 525 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
527 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 526 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -533,12 +532,7 @@ void __init bdev_cache_init(void)
533 bd_mnt = kern_mount(&bd_type); 532 bd_mnt = kern_mount(&bd_type);
534 if (IS_ERR(bd_mnt)) 533 if (IS_ERR(bd_mnt))
535 panic("Cannot create bdev pseudo-fs"); 534 panic("Cannot create bdev pseudo-fs");
536 /* 535 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
537 * This vfsmount structure is only used to obtain the
538 * blockdev_superblock, so tell kmemleak not to report it.
539 */
540 kmemleak_not_leak(bd_mnt);
541 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
542} 536}
543 537
544/* 538/*
@@ -1145,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1145 mutex_lock_nested(&bdev->bd_mutex, for_part); 1139 mutex_lock_nested(&bdev->bd_mutex, for_part);
1146 if (!bdev->bd_openers) { 1140 if (!bdev->bd_openers) {
1147 bdev->bd_disk = disk; 1141 bdev->bd_disk = disk;
1142 bdev->bd_queue = disk->queue;
1148 bdev->bd_contains = bdev; 1143 bdev->bd_contains = bdev;
1149 if (!partno) { 1144 if (!partno) {
1150 struct backing_dev_info *bdi; 1145 struct backing_dev_info *bdi;
@@ -1165,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1165 disk_put_part(bdev->bd_part); 1160 disk_put_part(bdev->bd_part);
1166 bdev->bd_part = NULL; 1161 bdev->bd_part = NULL;
1167 bdev->bd_disk = NULL; 1162 bdev->bd_disk = NULL;
1163 bdev->bd_queue = NULL;
1168 mutex_unlock(&bdev->bd_mutex); 1164 mutex_unlock(&bdev->bd_mutex);
1169 disk_unblock_events(disk); 1165 disk_unblock_events(disk);
1170 put_disk(disk); 1166 put_disk(disk);
@@ -1238,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1238 disk_put_part(bdev->bd_part); 1234 disk_put_part(bdev->bd_part);
1239 bdev->bd_disk = NULL; 1235 bdev->bd_disk = NULL;
1240 bdev->bd_part = NULL; 1236 bdev->bd_part = NULL;
1237 bdev->bd_queue = NULL;
1241 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); 1238 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1242 if (bdev != bdev->bd_contains) 1239 if (bdev != bdev->bd_contains)
1243 __blkdev_put(bdev->bd_contains, mode, 1); 1240 __blkdev_put(bdev->bd_contains, mode, 1);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ecb9fd3be143..d33f01c08b60 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -31,3 +31,22 @@ config BTRFS_FS_POSIX_ACL
31 Linux website <http://acl.bestbits.at/>. 31 Linux website <http://acl.bestbits.at/>.
32 32
33 If you don't know what Access Control Lists are, say N 33 If you don't know what Access Control Lists are, say N
34
35config BTRFS_FS_CHECK_INTEGRITY
36 bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
37 depends on BTRFS_FS
38 help
39 Adds code that examines all block write requests (including
 40	  writes of the super block). The goal is to verify that the
 41	  on-disk state of the filesystem is consistent at every moment,
 42	  i.e. that it would still be consistent if a power loss or
 43	  kernel panic occurred at any point.
44
 45	  If the integrity check tool is compiled in and activated via
 46	  the mount options, it consumes a significant amount of kernel
 47	  memory and additional CPU cycles. Enabling this functionality
 48	  is not intended for normal use.
49
50 In most cases, unless you are a btrfs developer who needs
51 to verify the integrity of (super)-block write requests
 52	  while running a regression test, say N.
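
For orientation: the checker is compiled in but stays dormant until armed at mount time. In the check-integrity code the relevant mount options are check_int, check_int_data and check_int_print_mask=<bits>; see fs/btrfs/check-integrity.c in the merged tree for the authoritative list and semantics.
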
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index c0ddfd29c5e5..0c4fa2befae7 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o 11 reada.o backref.o ulist.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
14btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
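
The Makefile now also builds ulist.o; backref.c below leans on it as a small set that remembers insertion order, refuses duplicates, and tolerates ulist_add() being called while a ulist_next() walk is in progress. A toy userspace equivalent under those assumptions — the real ulist also carries an aux value per node and a small inline array, both omitted here:

    #include <stdio.h>
    #include <stdlib.h>

    typedef unsigned long long u64;

    struct ulist {
            u64 *vals;
            size_t nnodes, cap;
    };

    /* returns 1 if added, 0 if already present, -1 on allocation failure */
    static int ulist_add(struct ulist *ul, u64 val)
    {
            size_t i;

            for (i = 0; i < ul->nnodes; i++)
                    if (ul->vals[i] == val)
                            return 0;
            if (ul->nnodes == ul->cap) {
                    size_t ncap = ul->cap ? 2 * ul->cap : 16;
                    u64 *nv = realloc(ul->vals, ncap * sizeof(*nv));

                    if (!nv)
                            return -1;
                    ul->vals = nv;
                    ul->cap = ncap;
            }
            ul->vals[ul->nnodes++] = val;
            return 1;
    }

    /* iteration by index stays valid while ulist_add() appends */
    static int ulist_next(struct ulist *ul, size_t *iter, u64 *val)
    {
            if (*iter >= ul->nnodes)
                    return 0;
            *val = ul->vals[(*iter)++];
            return 1;
    }

    int main(void)
    {
            struct ulist ul = { 0 };
            size_t iter = 0;
            u64 v;

            ulist_add(&ul, 100);
            while (ulist_next(&ul, &iter, &v))
                    if (v < 300)
                            ulist_add(&ul, v + 100); /* append while iterating */
            printf("%zu entries\n", ul.nnodes);      /* 3: 100, 200, 300 */
            free(ul.vals);
            return 0;
    }
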
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 22c64fff1bd5..633c701a287d 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,18 +19,789 @@
19#include "ctree.h" 19#include "ctree.h"
20#include "disk-io.h" 20#include "disk-io.h"
21#include "backref.h" 21#include "backref.h"
22#include "ulist.h"
23#include "transaction.h"
24#include "delayed-ref.h"
22 25
23struct __data_ref { 26/*
27 * this structure records all encountered refs on the way up to the root
28 */
29struct __prelim_ref {
24 struct list_head list; 30 struct list_head list;
25 u64 inum; 31 u64 root_id;
26 u64 root; 32 struct btrfs_key key;
27 u64 extent_data_item_offset; 33 int level;
34 int count;
35 u64 parent;
36 u64 wanted_disk_byte;
28}; 37};
29 38
30struct __shared_ref { 39static int __add_prelim_ref(struct list_head *head, u64 root_id,
31 struct list_head list; 40 struct btrfs_key *key, int level, u64 parent,
41 u64 wanted_disk_byte, int count)
42{
43 struct __prelim_ref *ref;
44
45 /* in case we're adding delayed refs, we're holding the refs spinlock */
46 ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
47 if (!ref)
48 return -ENOMEM;
49
50 ref->root_id = root_id;
51 if (key)
52 ref->key = *key;
53 else
54 memset(&ref->key, 0, sizeof(ref->key));
55
56 ref->level = level;
57 ref->count = count;
58 ref->parent = parent;
59 ref->wanted_disk_byte = wanted_disk_byte;
60 list_add_tail(&ref->list, head);
61
62 return 0;
63}
64
65static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
66 struct ulist *parents,
67 struct extent_buffer *eb, int level,
68 u64 wanted_objectid, u64 wanted_disk_byte)
69{
70 int ret;
71 int slot;
72 struct btrfs_file_extent_item *fi;
73 struct btrfs_key key;
32 u64 disk_byte; 74 u64 disk_byte;
33}; 75
76add_parent:
77 ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
78 if (ret < 0)
79 return ret;
80
81 if (level != 0)
82 return 0;
83
84 /*
 85	 * if the current leaf is full of EXTENT_DATA items, we must check
 86	 * whether the next one holds a reference as well.
 87	 * ref->count cannot be used to skip this check.
 88	 * repeat this until no additional EXTENT_DATA items are found.
89 */
90 while (1) {
91 ret = btrfs_next_leaf(root, path);
92 if (ret < 0)
93 return ret;
94 if (ret)
95 return 0;
96
97 eb = path->nodes[0];
98 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
99 btrfs_item_key_to_cpu(eb, &key, slot);
100 if (key.objectid != wanted_objectid ||
101 key.type != BTRFS_EXTENT_DATA_KEY)
102 return 0;
103 fi = btrfs_item_ptr(eb, slot,
104 struct btrfs_file_extent_item);
105 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
106 if (disk_byte == wanted_disk_byte)
107 goto add_parent;
108 }
109 }
110
111 return 0;
112}
113
114/*
115 * resolve an indirect backref in the form (root_id, key, level)
116 * to a logical address
117 */
118static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
119 struct __prelim_ref *ref,
120 struct ulist *parents)
121{
122 struct btrfs_path *path;
123 struct btrfs_root *root;
124 struct btrfs_key root_key;
125 struct btrfs_key key = {0};
126 struct extent_buffer *eb;
127 int ret = 0;
128 int root_level;
129 int level = ref->level;
130
131 path = btrfs_alloc_path();
132 if (!path)
133 return -ENOMEM;
134
135 root_key.objectid = ref->root_id;
136 root_key.type = BTRFS_ROOT_ITEM_KEY;
137 root_key.offset = (u64)-1;
138 root = btrfs_read_fs_root_no_name(fs_info, &root_key);
139 if (IS_ERR(root)) {
140 ret = PTR_ERR(root);
141 goto out;
142 }
143
144 rcu_read_lock();
145 root_level = btrfs_header_level(root->node);
146 rcu_read_unlock();
147
148 if (root_level + 1 == level)
149 goto out;
150
151 path->lowest_level = level;
152 ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
153 pr_debug("search slot in root %llu (level %d, ref count %d) returned "
154 "%d for key (%llu %u %llu)\n",
155 (unsigned long long)ref->root_id, level, ref->count, ret,
156 (unsigned long long)ref->key.objectid, ref->key.type,
157 (unsigned long long)ref->key.offset);
158 if (ret < 0)
159 goto out;
160
161 eb = path->nodes[level];
162 if (!eb) {
163 WARN_ON(1);
164 ret = 1;
165 goto out;
166 }
167
168 if (level == 0) {
169 if (ret == 1 && path->slots[0] >= btrfs_header_nritems(eb)) {
170 ret = btrfs_next_leaf(root, path);
171 if (ret)
172 goto out;
173 eb = path->nodes[0];
174 }
175
176 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
177 }
178
179 /* the last two parameters will only be used for level == 0 */
180 ret = add_all_parents(root, path, parents, eb, level, key.objectid,
181 ref->wanted_disk_byte);
182out:
183 btrfs_free_path(path);
184 return ret;
185}
186
187/*
188 * resolve all indirect backrefs from the list
189 */
190static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
191 struct list_head *head)
192{
193 int err;
194 int ret = 0;
195 struct __prelim_ref *ref;
196 struct __prelim_ref *ref_safe;
197 struct __prelim_ref *new_ref;
198 struct ulist *parents;
199 struct ulist_node *node;
200
201 parents = ulist_alloc(GFP_NOFS);
202 if (!parents)
203 return -ENOMEM;
204
205 /*
206 * _safe allows us to insert directly after the current item without
207 * iterating over the newly inserted items.
208 * we're also allowed to re-assign ref during iteration.
209 */
210 list_for_each_entry_safe(ref, ref_safe, head, list) {
211 if (ref->parent) /* already direct */
212 continue;
213 if (ref->count == 0)
214 continue;
215 err = __resolve_indirect_ref(fs_info, ref, parents);
216 if (err) {
217 if (ret == 0)
218 ret = err;
219 continue;
220 }
221
222 /* we put the first parent into the ref at hand */
223 node = ulist_next(parents, NULL);
224 ref->parent = node ? node->val : 0;
225
226 /* additional parents require new refs being added here */
227 while ((node = ulist_next(parents, node))) {
228 new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
229 if (!new_ref) {
230 ret = -ENOMEM;
231 break;
232 }
233 memcpy(new_ref, ref, sizeof(*ref));
234 new_ref->parent = node->val;
235 list_add(&new_ref->list, &ref->list);
236 }
237 ulist_reinit(parents);
238 }
239
240 ulist_free(parents);
241 return ret;
242}
243
244/*
245 * merge two lists of backrefs and adjust counts accordingly
246 *
247 * mode = 1: merge identical keys, if key is set
248 * mode = 2: merge identical parents
249 */
250static int __merge_refs(struct list_head *head, int mode)
251{
252 struct list_head *pos1;
253
254 list_for_each(pos1, head) {
255 struct list_head *n2;
256 struct list_head *pos2;
257 struct __prelim_ref *ref1;
258
259 ref1 = list_entry(pos1, struct __prelim_ref, list);
260
261 if (mode == 1 && ref1->key.type == 0)
262 continue;
263 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
264 pos2 = n2, n2 = pos2->next) {
265 struct __prelim_ref *ref2;
266
267 ref2 = list_entry(pos2, struct __prelim_ref, list);
268
269 if (mode == 1) {
270 if (memcmp(&ref1->key, &ref2->key,
271 sizeof(ref1->key)) ||
272 ref1->level != ref2->level ||
273 ref1->root_id != ref2->root_id)
274 continue;
275 ref1->count += ref2->count;
276 } else {
277 if (ref1->parent != ref2->parent)
278 continue;
279 ref1->count += ref2->count;
280 }
281 list_del(&ref2->list);
282 kfree(ref2);
283 }
284
285 }
286 return 0;
287}
288
289/*
 290 * add all currently queued delayed refs from this head whose seq nr is
 291 * smaller than or equal to seq to the list
292 */
293static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
294 struct btrfs_key *info_key,
295 struct list_head *prefs)
296{
297 struct btrfs_delayed_extent_op *extent_op = head->extent_op;
298 struct rb_node *n = &head->node.rb_node;
299 int sgn;
300 int ret = 0;
301
302 if (extent_op && extent_op->update_key)
303 btrfs_disk_key_to_cpu(info_key, &extent_op->key);
304
305 while ((n = rb_prev(n))) {
306 struct btrfs_delayed_ref_node *node;
307 node = rb_entry(n, struct btrfs_delayed_ref_node,
308 rb_node);
309 if (node->bytenr != head->node.bytenr)
310 break;
311 WARN_ON(node->is_head);
312
313 if (node->seq > seq)
314 continue;
315
316 switch (node->action) {
317 case BTRFS_ADD_DELAYED_EXTENT:
318 case BTRFS_UPDATE_DELAYED_HEAD:
319 WARN_ON(1);
320 continue;
321 case BTRFS_ADD_DELAYED_REF:
322 sgn = 1;
323 break;
324 case BTRFS_DROP_DELAYED_REF:
325 sgn = -1;
326 break;
327 default:
328 BUG_ON(1);
329 }
330 switch (node->type) {
331 case BTRFS_TREE_BLOCK_REF_KEY: {
332 struct btrfs_delayed_tree_ref *ref;
333
334 ref = btrfs_delayed_node_to_tree_ref(node);
335 ret = __add_prelim_ref(prefs, ref->root, info_key,
336 ref->level + 1, 0, node->bytenr,
337 node->ref_mod * sgn);
338 break;
339 }
340 case BTRFS_SHARED_BLOCK_REF_KEY: {
341 struct btrfs_delayed_tree_ref *ref;
342
343 ref = btrfs_delayed_node_to_tree_ref(node);
344 ret = __add_prelim_ref(prefs, ref->root, info_key,
345 ref->level + 1, ref->parent,
346 node->bytenr,
347 node->ref_mod * sgn);
348 break;
349 }
350 case BTRFS_EXTENT_DATA_REF_KEY: {
351 struct btrfs_delayed_data_ref *ref;
352 struct btrfs_key key;
353
354 ref = btrfs_delayed_node_to_data_ref(node);
355
356 key.objectid = ref->objectid;
357 key.type = BTRFS_EXTENT_DATA_KEY;
358 key.offset = ref->offset;
359 ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
360 node->bytenr,
361 node->ref_mod * sgn);
362 break;
363 }
364 case BTRFS_SHARED_DATA_REF_KEY: {
365 struct btrfs_delayed_data_ref *ref;
366 struct btrfs_key key;
367
368 ref = btrfs_delayed_node_to_data_ref(node);
369
370 key.objectid = ref->objectid;
371 key.type = BTRFS_EXTENT_DATA_KEY;
372 key.offset = ref->offset;
373 ret = __add_prelim_ref(prefs, ref->root, &key, 0,
374 ref->parent, node->bytenr,
375 node->ref_mod * sgn);
376 break;
377 }
378 default:
379 WARN_ON(1);
380 }
381 BUG_ON(ret);
382 }
383
384 return 0;
385}
386
387/*
388 * add all inline backrefs for bytenr to the list
389 */
390static int __add_inline_refs(struct btrfs_fs_info *fs_info,
391 struct btrfs_path *path, u64 bytenr,
392 struct btrfs_key *info_key, int *info_level,
393 struct list_head *prefs)
394{
395 int ret = 0;
396 int slot;
397 struct extent_buffer *leaf;
398 struct btrfs_key key;
399 unsigned long ptr;
400 unsigned long end;
401 struct btrfs_extent_item *ei;
402 u64 flags;
403 u64 item_size;
404
405 /*
406 * enumerate all inline refs
407 */
408 leaf = path->nodes[0];
409 slot = path->slots[0] - 1;
410
411 item_size = btrfs_item_size_nr(leaf, slot);
412 BUG_ON(item_size < sizeof(*ei));
413
414 ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
415 flags = btrfs_extent_flags(leaf, ei);
416
417 ptr = (unsigned long)(ei + 1);
418 end = (unsigned long)ei + item_size;
419
420 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
421 struct btrfs_tree_block_info *info;
422 struct btrfs_disk_key disk_key;
423
424 info = (struct btrfs_tree_block_info *)ptr;
425 *info_level = btrfs_tree_block_level(leaf, info);
426 btrfs_tree_block_key(leaf, info, &disk_key);
427 btrfs_disk_key_to_cpu(info_key, &disk_key);
428 ptr += sizeof(struct btrfs_tree_block_info);
429 BUG_ON(ptr > end);
430 } else {
431 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
432 }
433
434 while (ptr < end) {
435 struct btrfs_extent_inline_ref *iref;
436 u64 offset;
437 int type;
438
439 iref = (struct btrfs_extent_inline_ref *)ptr;
440 type = btrfs_extent_inline_ref_type(leaf, iref);
441 offset = btrfs_extent_inline_ref_offset(leaf, iref);
442
443 switch (type) {
444 case BTRFS_SHARED_BLOCK_REF_KEY:
445 ret = __add_prelim_ref(prefs, 0, info_key,
446 *info_level + 1, offset,
447 bytenr, 1);
448 break;
449 case BTRFS_SHARED_DATA_REF_KEY: {
450 struct btrfs_shared_data_ref *sdref;
451 int count;
452
453 sdref = (struct btrfs_shared_data_ref *)(iref + 1);
454 count = btrfs_shared_data_ref_count(leaf, sdref);
455 ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
456 bytenr, count);
457 break;
458 }
459 case BTRFS_TREE_BLOCK_REF_KEY:
460 ret = __add_prelim_ref(prefs, offset, info_key,
461 *info_level + 1, 0, bytenr, 1);
462 break;
463 case BTRFS_EXTENT_DATA_REF_KEY: {
464 struct btrfs_extent_data_ref *dref;
465 int count;
466 u64 root;
467
468 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
469 count = btrfs_extent_data_ref_count(leaf, dref);
470 key.objectid = btrfs_extent_data_ref_objectid(leaf,
471 dref);
472 key.type = BTRFS_EXTENT_DATA_KEY;
473 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
474 root = btrfs_extent_data_ref_root(leaf, dref);
475 ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
476 count);
477 break;
478 }
479 default:
480 WARN_ON(1);
481 }
482 BUG_ON(ret);
483 ptr += btrfs_extent_inline_ref_size(type);
484 }
485
486 return 0;
487}
488
489/*
490 * add all non-inline backrefs for bytenr to the list
491 */
492static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
493 struct btrfs_path *path, u64 bytenr,
494 struct btrfs_key *info_key, int info_level,
495 struct list_head *prefs)
496{
497 struct btrfs_root *extent_root = fs_info->extent_root;
498 int ret;
499 int slot;
500 struct extent_buffer *leaf;
501 struct btrfs_key key;
502
503 while (1) {
504 ret = btrfs_next_item(extent_root, path);
505 if (ret < 0)
506 break;
507 if (ret) {
508 ret = 0;
509 break;
510 }
511
512 slot = path->slots[0];
513 leaf = path->nodes[0];
514 btrfs_item_key_to_cpu(leaf, &key, slot);
515
516 if (key.objectid != bytenr)
517 break;
518 if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
519 continue;
520 if (key.type > BTRFS_SHARED_DATA_REF_KEY)
521 break;
522
523 switch (key.type) {
524 case BTRFS_SHARED_BLOCK_REF_KEY:
525 ret = __add_prelim_ref(prefs, 0, info_key,
526 info_level + 1, key.offset,
527 bytenr, 1);
528 break;
529 case BTRFS_SHARED_DATA_REF_KEY: {
530 struct btrfs_shared_data_ref *sdref;
531 int count;
532
533 sdref = btrfs_item_ptr(leaf, slot,
534 struct btrfs_shared_data_ref);
535 count = btrfs_shared_data_ref_count(leaf, sdref);
536 ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
537 bytenr, count);
538 break;
539 }
540 case BTRFS_TREE_BLOCK_REF_KEY:
541 ret = __add_prelim_ref(prefs, key.offset, info_key,
542 info_level + 1, 0, bytenr, 1);
543 break;
544 case BTRFS_EXTENT_DATA_REF_KEY: {
545 struct btrfs_extent_data_ref *dref;
546 int count;
547 u64 root;
548
549 dref = btrfs_item_ptr(leaf, slot,
550 struct btrfs_extent_data_ref);
551 count = btrfs_extent_data_ref_count(leaf, dref);
552 key.objectid = btrfs_extent_data_ref_objectid(leaf,
553 dref);
554 key.type = BTRFS_EXTENT_DATA_KEY;
555 key.offset = btrfs_extent_data_ref_offset(leaf, dref);
556 root = btrfs_extent_data_ref_root(leaf, dref);
557 ret = __add_prelim_ref(prefs, root, &key, 0, 0,
558 bytenr, count);
559 break;
560 }
561 default:
562 WARN_ON(1);
563 }
564 BUG_ON(ret);
565 }
566
567 return ret;
568}
569
570/*
571 * this adds all existing backrefs (inline backrefs, backrefs and delayed
572 * refs) for the given bytenr to the refs list, merges duplicates and resolves
573 * indirect refs to their parent bytenr.
574 * When roots are found, they're added to the roots list
575 *
576 * FIXME some caching might speed things up
577 */
578static int find_parent_nodes(struct btrfs_trans_handle *trans,
579 struct btrfs_fs_info *fs_info, u64 bytenr,
580 u64 seq, struct ulist *refs, struct ulist *roots)
581{
582 struct btrfs_key key;
583 struct btrfs_path *path;
584 struct btrfs_key info_key = { 0 };
585 struct btrfs_delayed_ref_root *delayed_refs = NULL;
586 struct btrfs_delayed_ref_head *head = NULL;
587 int info_level = 0;
588 int ret;
589 struct list_head prefs_delayed;
590 struct list_head prefs;
591 struct __prelim_ref *ref;
592
593 INIT_LIST_HEAD(&prefs);
594 INIT_LIST_HEAD(&prefs_delayed);
595
596 key.objectid = bytenr;
597 key.type = BTRFS_EXTENT_ITEM_KEY;
598 key.offset = (u64)-1;
599
600 path = btrfs_alloc_path();
601 if (!path)
602 return -ENOMEM;
603
604 /*
605 * grab both a lock on the path and a lock on the delayed ref head.
606 * We need both to get a consistent picture of how the refs look
607 * at a specified point in time
608 */
609again:
610 ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
611 if (ret < 0)
612 goto out;
613 BUG_ON(ret == 0);
614
615 /*
616 * look if there are updates for this ref queued and lock the head
617 */
618 delayed_refs = &trans->transaction->delayed_refs;
619 spin_lock(&delayed_refs->lock);
620 head = btrfs_find_delayed_ref_head(trans, bytenr);
621 if (head) {
622 if (!mutex_trylock(&head->mutex)) {
623 atomic_inc(&head->node.refs);
624 spin_unlock(&delayed_refs->lock);
625
626 btrfs_release_path(path);
627
628 /*
629 * Mutex was contended, block until it's
630 * released and try again
631 */
632 mutex_lock(&head->mutex);
633 mutex_unlock(&head->mutex);
634 btrfs_put_delayed_ref(&head->node);
635 goto again;
636 }
637 ret = __add_delayed_refs(head, seq, &info_key, &prefs_delayed);
638 if (ret)
639 goto out;
640 }
641 spin_unlock(&delayed_refs->lock);
642
643 if (path->slots[0]) {
644 struct extent_buffer *leaf;
645 int slot;
646
647 leaf = path->nodes[0];
648 slot = path->slots[0] - 1;
649 btrfs_item_key_to_cpu(leaf, &key, slot);
650 if (key.objectid == bytenr &&
651 key.type == BTRFS_EXTENT_ITEM_KEY) {
652 ret = __add_inline_refs(fs_info, path, bytenr,
653 &info_key, &info_level, &prefs);
654 if (ret)
655 goto out;
656 ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
657 info_level, &prefs);
658 if (ret)
659 goto out;
660 }
661 }
662 btrfs_release_path(path);
663
664 /*
665 * when adding the delayed refs above, the info_key might not have
666 * been known yet. Go over the list and replace the missing keys
667 */
668 list_for_each_entry(ref, &prefs_delayed, list) {
669 if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
670 memcpy(&ref->key, &info_key, sizeof(ref->key));
671 }
672 list_splice_init(&prefs_delayed, &prefs);
673
674 ret = __merge_refs(&prefs, 1);
675 if (ret)
676 goto out;
677
678 ret = __resolve_indirect_refs(fs_info, &prefs);
679 if (ret)
680 goto out;
681
682 ret = __merge_refs(&prefs, 2);
683 if (ret)
684 goto out;
685
686 while (!list_empty(&prefs)) {
687 ref = list_first_entry(&prefs, struct __prelim_ref, list);
688 list_del(&ref->list);
689 if (ref->count < 0)
690 WARN_ON(1);
691 if (ref->count && ref->root_id && ref->parent == 0) {
692 /* no parent == root of tree */
693 ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
694 BUG_ON(ret < 0);
695 }
696 if (ref->count && ref->parent) {
697 ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
698 BUG_ON(ret < 0);
699 }
700 kfree(ref);
701 }
702
703out:
704 if (head)
705 mutex_unlock(&head->mutex);
706 btrfs_free_path(path);
707 while (!list_empty(&prefs)) {
708 ref = list_first_entry(&prefs, struct __prelim_ref, list);
709 list_del(&ref->list);
710 kfree(ref);
711 }
712 while (!list_empty(&prefs_delayed)) {
713 ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
714 list);
715 list_del(&ref->list);
716 kfree(ref);
717 }
718
719 return ret;
720}
721
722/*
 723 * Finds all leaves with a reference to the specified combination of bytenr
 724 * and offset. The leaf bytenrs are collected in the leafs ulist, which must
 725 * be freed with ulist_free once the caller is done with it; no separate
 726 * list of keys is returned.
727 *
728 * returns 0 on success, <0 on error
729 */
730static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
731 struct btrfs_fs_info *fs_info, u64 bytenr,
732 u64 num_bytes, u64 seq, struct ulist **leafs)
733{
734 struct ulist *tmp;
735 int ret;
736
737 tmp = ulist_alloc(GFP_NOFS);
738 if (!tmp)
739 return -ENOMEM;
740 *leafs = ulist_alloc(GFP_NOFS);
741 if (!*leafs) {
742 ulist_free(tmp);
743 return -ENOMEM;
744 }
745
746 ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
747 ulist_free(tmp);
748
749 if (ret < 0 && ret != -ENOENT) {
750 ulist_free(*leafs);
751 return ret;
752 }
753
754 return 0;
755}
756
757/*
758 * walk all backrefs for a given extent to find all roots that reference this
 759 * extent. Walking a backref means finding all extents that reference this
 760 * extent and, in turn, walking the backrefs of those, too. Naturally this
 761 * is a recursive process, but here it is implemented iteratively: we find
 762 * all referencing extents for the extent in question and put them on a
 763 * list. In turn, we find all referencing extents for those, appending
 764 * further entries to the list. New elements may be added after the current
 765 * one while we walk. The process stops when we reach the end of the
766 * list. Found roots are added to the roots list.
767 *
768 * returns 0 on success, < 0 on error.
769 */
770int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
771 struct btrfs_fs_info *fs_info, u64 bytenr,
772 u64 num_bytes, u64 seq, struct ulist **roots)
773{
774 struct ulist *tmp;
775 struct ulist_node *node = NULL;
776 int ret;
777
778 tmp = ulist_alloc(GFP_NOFS);
779 if (!tmp)
780 return -ENOMEM;
781 *roots = ulist_alloc(GFP_NOFS);
782 if (!*roots) {
783 ulist_free(tmp);
784 return -ENOMEM;
785 }
786
787 while (1) {
788 ret = find_parent_nodes(trans, fs_info, bytenr, seq,
789 tmp, *roots);
790 if (ret < 0 && ret != -ENOENT) {
791 ulist_free(tmp);
792 ulist_free(*roots);
793 return ret;
794 }
795 node = ulist_next(tmp, node);
796 if (!node)
797 break;
798 bytenr = node->val;
799 }
800
801 ulist_free(tmp);
802 return 0;
803}
804
34 805
35static int __inode_info(u64 inum, u64 ioff, u8 key_type, 806static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path, 807 struct btrfs_root *fs_root, struct btrfs_path *path,
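
btrfs_find_all_roots() above turns a naturally recursive walk into a worklist loop: resolve the refs of the current bytenr, stash its direct parents in a scratch ulist, pick the next unvisited entry and repeat, accumulating roots as a side effect. A reduced sketch of that control flow, reusing the toy ulist from the note before this file's diff and abstracting find_parent_nodes into a callback, since the real one needs a transaction and fs_info (paste the two sketches together to compile):

    /* reuses struct ulist, ulist_add() and ulist_next() from the
     * toy ulist sketch; the callback reports the direct parents of one
     * bytenr into *refs and any roots straight into *roots */
    typedef int (*parent_nodes_fn)(u64 bytenr, struct ulist *refs,
                                   struct ulist *roots);

    static int find_all_roots(u64 bytenr, parent_nodes_fn find_parent_nodes,
                              struct ulist *roots)
    {
            struct ulist tmp = { 0 };
            size_t iter = 0;
            u64 next;
            int ret = 0;

            for (;;) {
                    ret = find_parent_nodes(bytenr, &tmp, roots);
                    if (ret < 0)
                            break;
                    /* dedup in ulist_add() guarantees termination: tmp only
                     * grows by bytenrs that have not been queued before */
                    if (!ulist_next(&tmp, &iter, &next)) {
                            ret = 0;
                            break;
                    }
                    bytenr = next;
            }
            free(tmp.vals);
            return ret;
    }
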
@@ -181,8 +952,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); 952 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY || 953 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical || 954 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical) 955 found_key->objectid + found_key->offset <= logical) {
956 pr_debug("logical %llu is not within any extent\n",
957 (unsigned long long)logical);
185 return -ENOENT; 958 return -ENOENT;
959 }
186 960
187 eb = path->nodes[0]; 961 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]); 962 item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,6 +965,13 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 965 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei); 966 flags = btrfs_extent_flags(eb, ei);
193 967
968 pr_debug("logical %llu is at position %llu within the extent (%llu "
969 "EXTENT_ITEM %llu) flags %#llx size %u\n",
970 (unsigned long long)logical,
971 (unsigned long long)(logical - found_key->objectid),
972 (unsigned long long)found_key->objectid,
973 (unsigned long long)found_key->offset,
974 (unsigned long long)flags, item_size);
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) 975 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK; 976 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA) 977 if (flags & BTRFS_EXTENT_FLAG_DATA)
@@ -287,128 +1068,11 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
287 return 0; 1068 return 0;
288} 1069}
289 1070
290static int __data_list_add(struct list_head *head, u64 inum, 1071static int iterate_leaf_refs(struct btrfs_fs_info *fs_info,
291 u64 extent_data_item_offset, u64 root) 1072 struct btrfs_path *path, u64 logical,
292{ 1073 u64 orig_extent_item_objectid,
293 struct __data_ref *ref; 1074 u64 extent_item_pos, u64 root,
294 1075 iterate_extent_inodes_t *iterate, void *ctx)
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{ 1076{
413 u64 disk_byte; 1077 u64 disk_byte;
414 struct btrfs_key key; 1078 struct btrfs_key key;
@@ -416,8 +1080,10 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
416 struct extent_buffer *eb; 1080 struct extent_buffer *eb;
417 int slot; 1081 int slot;
418 int nritems; 1082 int nritems;
419 int ret; 1083 int ret = 0;
420 int found = 0; 1084 int extent_type;
1085 u64 data_offset;
1086 u64 data_len;
421 1087
422 eb = read_tree_block(fs_info->tree_root, logical, 1088 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0); 1089 fs_info->tree_root->leafsize, 0);
@@ -435,149 +1101,99 @@ static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
435 if (key.type != BTRFS_EXTENT_DATA_KEY) 1101 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue; 1102 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); 1103 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) { 1104 extent_type = btrfs_file_extent_type(eb, fi);
439 free_extent_buffer(eb); 1105 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
440 return -EIO; 1106 continue;
441 } 1107 /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); 1108 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) { 1109 if (disk_byte != orig_extent_item_objectid)
444 if (found) 1110 continue;
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459 1111
460 if (!found) { 1112 data_offset = btrfs_file_extent_offset(eb, fi);
461 printk(KERN_ERR "btrfs: failed to follow shared data backref " 1113 data_len = btrfs_file_extent_num_bytes(eb, fi);
462 "to parent %llu\n", logical); 1114
463 WARN_ON(1); 1115 if (extent_item_pos < data_offset ||
464 ret = -EIO; 1116 extent_item_pos >= data_offset + data_len)
1117 continue;
1118
 1119 		pr_debug("ref for %llu resolved, key (%llu EXTENT_DATA %llu), "
1120 "root %llu\n", orig_extent_item_objectid,
1121 key.objectid, key.offset, root);
1122 ret = iterate(key.objectid,
1123 key.offset + (extent_item_pos - data_offset),
1124 root, ctx);
1125 if (ret) {
1126 pr_debug("stopping iteration because ret=%d\n", ret);
1127 break;
1128 }
465 } 1129 }
466 1130
467 free_extent_buffer(eb); 1131 free_extent_buffer(eb);
1132
468 return ret; 1133 return ret;
469} 1134}
470 1135
471/* 1136/*
472 * calls iterate() for every inode that references the extent identified by 1137 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it 1138 * the given parameters.
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops. 1139 * when the iterator function returns a non-zero value, iteration stops.
1140 * path is guaranteed to be in released state when iterate() is called.
476 */ 1141 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info, 1142int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path, 1143 struct btrfs_path *path,
479 u64 extent_item_objectid, 1144 u64 extent_item_objectid, u64 extent_item_pos,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx) 1145 iterate_extent_inodes_t *iterate, void *ctx)
482{ 1146{
483 unsigned long ptr = 0;
484 int last;
485 int ret; 1147 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs); 1148 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs); 1149 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d; 1150 struct btrfs_trans_handle *trans;
497 struct __shared_ref *ref_s; 1151 struct ulist *refs;
1152 struct ulist *roots;
1153 struct ulist_node *ref_node = NULL;
1154 struct ulist_node *root_node = NULL;
1155 struct seq_list seq_elem;
1156 struct btrfs_delayed_ref_root *delayed_refs;
1157
1158 trans = btrfs_join_transaction(fs_info->extent_root);
1159 if (IS_ERR(trans))
1160 return PTR_ERR(trans);
1161
1162 pr_debug("resolving all inodes for extent %llu\n",
1163 extent_item_objectid);
1164
1165 delayed_refs = &trans->transaction->delayed_refs;
1166 spin_lock(&delayed_refs->lock);
1167 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
1168 spin_unlock(&delayed_refs->lock);
1169
1170 ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
1171 extent_item_pos, seq_elem.seq,
1172 &refs);
498 1173
499 eb = path->nodes[0]; 1174 if (ret)
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 1175 goto out;
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524 1176
525 /* ... then we proceed to in-tree references and ... */ 1177 while (!ret && (ref_node = ulist_next(refs, ref_node))) {
526 while (!ret) { 1178 ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
527 ++path->slots[0]; 1179 seq_elem.seq, &roots);
528 if (path->slots[0] > btrfs_header_nritems(eb)) { 1180 if (ret)
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break; 1181 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) { 1182 while (!ret && (root_node = ulist_next(roots, root_node))) {
541 dref = btrfs_item_ptr(eb, path->slots[0], 1183 pr_debug("root %llu references leaf %llu\n",
542 struct btrfs_extent_data_ref); 1184 root_node->val, ref_node->val);
543 ret = __data_list_add_eb(&data_refs, eb, dref); 1185 ret = iterate_leaf_refs(fs_info, path, ref_node->val,
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) { 1186 extent_item_objectid,
545 ret = __shared_list_add(&shared_refs, key.offset); 1187 extent_item_pos, root_node->val,
1188 iterate, ctx);
546 } 1189 }
547 } 1190 }
548 1191
549 btrfs_release_path(path); 1192 ulist_free(refs);
550 1193 ulist_free(roots);
551 /* 1194out:
552 * ... only at the very end we can process the refs we found. this is 1195 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
553 * because the iterator function we call is allowed to make tree lookups 1196 btrfs_end_transaction(trans, fs_info->extent_root);
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret; 1197 return ret;
582} 1198}
583 1199
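[Annotation: for reference, the iterate_extent_inodes_t callback as invoked above receives the inode number, the byte offset within that inode's file data, and the owning root; returning non-zero stops the iteration. A hypothetical callback sketch, with an invented name:]

	/* Invented example callback; returning non-zero stops iteration. */
	static int example_print_ref(u64 inum, u64 offset, u64 root, void *ctx)
	{
		pr_debug("extent referenced by inode %llu at offset %llu "
			 "in root %llu\n",
			 (unsigned long long)inum,
			 (unsigned long long)offset,
			 (unsigned long long)root);
		return 0;
	}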
@@ -586,19 +1202,20 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
586 iterate_extent_inodes_t *iterate, void *ctx) 1202 iterate_extent_inodes_t *iterate, void *ctx)
587{ 1203{
588 int ret; 1204 int ret;
589 u64 offset; 1205 u64 extent_item_pos;
590 struct btrfs_key found_key; 1206 struct btrfs_key found_key;
591 1207
592 ret = extent_from_logical(fs_info, logical, path, 1208 ret = extent_from_logical(fs_info, logical, path,
593 &found_key); 1209 &found_key);
1210 btrfs_release_path(path);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 1211 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL; 1212 ret = -EINVAL;
596 if (ret < 0) 1213 if (ret < 0)
597 return ret; 1214 return ret;
598 1215
599 offset = logical - found_key.objectid; 1216 extent_item_pos = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid, 1217 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx); 1218 extent_item_pos, iterate, ctx);
602 1219
603 return ret; 1220 return ret;
604} 1221}
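[Annotation, numbers invented: if extent_from_logical() returns an extent item with objectid 4096 and offset (length) 8192, a query for logical address 6144 lies within it and extent_item_pos becomes 6144 - 4096 = 2048; iterate_leaf_refs() then reports only file extents whose [data_offset, data_offset + data_len) window contains 2048.]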
@@ -643,6 +1260,10 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) { 1260 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref); 1261 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */ 1262 /* path must be released before calling iterate()! */
1263 pr_debug("following ref at offset %u for inode %llu in "
1264 "tree %llu\n", cur,
1265 (unsigned long long)found_key.objectid,
1266 (unsigned long long)fs_root->objectid);
646 ret = iterate(parent, iref, eb, ctx); 1267 ret = iterate(parent, iref, eb, ctx);
647 if (ret) { 1268 if (ret) {
648 free_extent_buffer(eb); 1269 free_extent_buffer(eb);
@@ -683,10 +1304,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
683 return PTR_ERR(fspath); 1304 return PTR_ERR(fspath);
684 1305
685 if (fspath > fspath_min) { 1306 if (fspath > fspath_min) {
1307 pr_debug("path resolved: %s\n", fspath);
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath; 1308 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt; 1309 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min; 1310 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else { 1311 } else {
1312 pr_debug("missed path, not enough space. missing bytes: %lu, "
1313 "constructed so far: %s\n",
1314 (unsigned long)(fspath_min - fspath), fspath_min);
690 ++ipath->fspath->elem_missed; 1315 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath; 1316 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0; 1317 ipath->fspath->bytes_left = 0;
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 92618837cb8f..d00dfa9ca934 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -20,6 +20,7 @@
20#define __BTRFS_BACKREF__ 20#define __BTRFS_BACKREF__
21 21
22#include "ioctl.h" 22#include "ioctl.h"
23#include "ulist.h"
23 24
24struct inode_fs_paths { 25struct inode_fs_paths {
25 struct btrfs_path *btrfs_path; 26 struct btrfs_path *btrfs_path;
@@ -54,6 +55,10 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
54 55
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); 56int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56 57
58int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
59 struct btrfs_fs_info *fs_info, u64 bytenr,
60 u64 num_bytes, u64 seq, struct ulist **roots);
61
57struct btrfs_data_container *init_data_container(u32 total_bytes); 62struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, 63struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path); 64 struct btrfs_path *path);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 634608d2a6d0..9b9b15fd5204 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -51,6 +51,9 @@ struct btrfs_inode {
51 /* held while logging the inode in tree-log.c */ 51 /* held while logging the inode in tree-log.c */
52 struct mutex log_mutex; 52 struct mutex log_mutex;
53 53
54 /* held while doing delalloc reservations */
55 struct mutex delalloc_mutex;
56
54 /* used to order data wrt metadata */ 57 /* used to order data wrt metadata */
55 struct btrfs_ordered_inode_tree ordered_tree; 58 struct btrfs_ordered_inode_tree ordered_tree;
56 59
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
new file mode 100644
index 000000000000..b669a7d8e499
--- /dev/null
+++ b/fs/btrfs/check-integrity.c
@@ -0,0 +1,3069 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19/*
20 * This module can be used to catch cases when the btrfs kernel
21 * code executes write requests to the disk that bring the file
22 * system in an inconsistent state. In such a state, a power-loss
23 * or kernel panic event would cause that the data on disk is
24 * lost or at least damaged.
25 *
26 * Code is added that examines all block write requests during
27 * runtime (including writes of the super block). Three rules
28 * are verified and an error is printed on violation of the
29 * rules:
30 * 1. It is not allowed to write a disk block which is
31 * currently referenced by the super block (either directly
32 * or indirectly).
33 * 2. When a super block is written, it is verified that all
34 * referenced (directly or indirectly) blocks fulfill the
35 * following requirements:
36 * 2a. All referenced blocks have either been present when
37 * the file system was mounted, (i.e., they have been
38 * referenced by the super block) or they have been
39 * written since then and the write completion callback
40 * was called and a FLUSH request to the device where
41 * these blocks are located was received and completed.
42 * 2b. All referenced blocks need to have a generation
43 * number which is equal to the parent's number.
44 *
45 * One issue that was found using this module was that the log
46 * tree on disk became temporarily corrupted because disk blocks
47 * that had been in use for the log tree had been freed and
48 * reused too early, while being referenced by the written super
49 * block.
50 *
51 * The search term in the kernel log that can be used to filter
52 * on the existence of detected integrity issues is
53 * "btrfs: attempt".
54 *
55 * The integrity check is enabled via mount options. These
56 * mount options are only supported if the integrity check
57 * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY.
58 *
59 * Example #1, apply integrity checks to all metadata:
60 * mount /dev/sdb1 /mnt -o check_int
61 *
62 * Example #2, apply integrity checks to all metadata and
63 * to data extents:
64 * mount /dev/sdb1 /mnt -o check_int_data
65 *
66 * Example #3, apply integrity checks to all metadata and dump
67 * the tree that the super block references to kernel messages
68 * each time after a super block was written:
69 * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263
70 *
71 * If the integrity check tool is included and activated in
72 * the mount options, a significant amount of kernel memory is
73 * used and many additional CPU cycles are spent. Enabling this
74 * functionality is not intended for normal use. In most
75 * cases, unless you are a btrfs developer who needs to verify
76 * the integrity of (super)-block write requests, do not
77 * enable the config option BTRFS_FS_CHECK_INTEGRITY to
78 * include and compile the integrity check tool.
79 */
80
81#include <linux/sched.h>
82#include <linux/slab.h>
83#include <linux/buffer_head.h>
84#include <linux/mutex.h>
85#include <linux/crc32c.h>
86#include <linux/genhd.h>
87#include <linux/blkdev.h>
88#include "ctree.h"
89#include "disk-io.h"
90#include "transaction.h"
91#include "extent_io.h"
92#include "disk-io.h"
93#include "volumes.h"
94#include "print-tree.h"
95#include "locking.h"
96#include "check-integrity.h"
97
98#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
99#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
100#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100
101#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051
102#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807
103#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530
104#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
105#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters,
106 * excluding " [...]" */
107#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
108
109#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
110
111/*
112 * The definition of the bitmask fields for the print_mask.
113 * They are specified with the mount option check_integrity_print_mask.
114 */
115#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001
116#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002
117#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004
118#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008
119#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010
120#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020
121#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040
122#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080
123#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100
124#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200
125#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
126#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
127#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
128
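[Annotation: decoding Example #3 from the header comment above, check_int_print_mask=263 is 0x107, the OR of four of the bits just defined. Sketch only; the macro name is invented.]

	/* 263 == 0x107: the mask used in Example #3 above. */
	#define EXAMPLE_CHECK_INT_PRINT_MASK				\
		(BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE |			\
		 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION |	\
		 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE |		\
		 BTRFSIC_PRINT_MASK_INITIAL_TREE)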
129struct btrfsic_dev_state;
130struct btrfsic_state;
131
132struct btrfsic_block {
133 u32 magic_num; /* only used for debug purposes */
134 unsigned int is_metadata:1; /* if it is meta-data, not data-data */
135 unsigned int is_superblock:1; /* if it is one of the superblocks */
136 unsigned int is_iodone:1; /* if is done by lower subsystem */
137 unsigned int iodone_w_error:1; /* error was indicated to endio */
138 unsigned int never_written:1; /* block was added because it was
139 * referenced, not because it was
140 * written */
141 unsigned int mirror_num:2; /* large enough to hold
142 * BTRFS_SUPER_MIRROR_MAX */
143 struct btrfsic_dev_state *dev_state;
144 u64 dev_bytenr; /* key, physical byte num on disk */
145 u64 logical_bytenr; /* logical byte num on disk */
146 u64 generation;
147 struct btrfs_disk_key disk_key; /* extra info to print in case of
148 * issues, will not always be correct */
149 struct list_head collision_resolving_node; /* list node */
150 struct list_head all_blocks_node; /* list node */
151
152 /* the following two lists contain block_link items */
153 struct list_head ref_to_list; /* list */
154 struct list_head ref_from_list; /* list */
155 struct btrfsic_block *next_in_same_bio;
156 void *orig_bio_bh_private;
157 union {
158 bio_end_io_t *bio;
159 bh_end_io_t *bh;
160 } orig_bio_bh_end_io;
161 int submit_bio_bh_rw;
162 u64 flush_gen; /* only valid if !never_written */
163};
164
165/*
166 * Elements of this type are allocated dynamically and required because
167 * each block object can refer to and can be referenced from multiple blocks.
168 * The key to look them up in the hashtable is the dev_bytenr of
169 * the block referred to plus the one of the block referring to it.
170 * The fact that they are searchable via a hashtable and that a
171 * ref_cnt is maintained is not required for the btrfs integrity
172 * check algorithm itself; it is only used to make the output more
173 * readable when an error is detected (an error is defined
174 * as a write operation to a block while that block is still referenced).
175 */
176struct btrfsic_block_link {
177 u32 magic_num; /* only used for debug purposes */
178 u32 ref_cnt;
179 struct list_head node_ref_to; /* list node */
180 struct list_head node_ref_from; /* list node */
181 struct list_head collision_resolving_node; /* list node */
182 struct btrfsic_block *block_ref_to;
183 struct btrfsic_block *block_ref_from;
184 u64 parent_generation;
185};
186
187struct btrfsic_dev_state {
188 u32 magic_num; /* only used for debug purposes */
189 struct block_device *bdev;
190 struct btrfsic_state *state;
191 struct list_head collision_resolving_node; /* list node */
192 struct btrfsic_block dummy_block_for_bio_bh_flush;
193 u64 last_flush_gen;
194 char name[BDEVNAME_SIZE];
195};
196
197struct btrfsic_block_hashtable {
198 struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE];
199};
200
201struct btrfsic_block_link_hashtable {
202 struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE];
203};
204
205struct btrfsic_dev_state_hashtable {
206 struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE];
207};
208
209struct btrfsic_block_data_ctx {
210 u64 start; /* virtual bytenr */
211 u64 dev_bytenr; /* physical bytenr on device */
212 u32 len;
213 struct btrfsic_dev_state *dev;
214 char *data;
215 struct buffer_head *bh; /* do not use if set to NULL */
216};
217
218/* This structure is used to implement recursion without occupying
219 * any stack space; see btrfsic_process_metablock() */
220struct btrfsic_stack_frame {
221 u32 magic;
222 u32 nr;
223 int error;
224 int i;
225 int limit_nesting;
226 int num_copies;
227 int mirror_num;
228 struct btrfsic_block *block;
229 struct btrfsic_block_data_ctx *block_ctx;
230 struct btrfsic_block *next_block;
231 struct btrfsic_block_data_ctx next_block_ctx;
232 struct btrfs_header *hdr;
233 struct btrfsic_stack_frame *prev;
234};
235
236/* Some state per mounted filesystem */
237struct btrfsic_state {
238 u32 print_mask;
239 int include_extent_data;
240 int csum_size;
241 struct list_head all_blocks_list;
242 struct btrfsic_block_hashtable block_hashtable;
243 struct btrfsic_block_link_hashtable block_link_hashtable;
244 struct btrfs_root *root;
245 u64 max_superblock_generation;
246 struct btrfsic_block *latest_superblock;
247};
248
249static void btrfsic_block_init(struct btrfsic_block *b);
250static struct btrfsic_block *btrfsic_block_alloc(void);
251static void btrfsic_block_free(struct btrfsic_block *b);
252static void btrfsic_block_link_init(struct btrfsic_block_link *n);
253static struct btrfsic_block_link *btrfsic_block_link_alloc(void);
254static void btrfsic_block_link_free(struct btrfsic_block_link *n);
255static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds);
256static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void);
257static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds);
258static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h);
259static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
260 struct btrfsic_block_hashtable *h);
261static void btrfsic_block_hashtable_remove(struct btrfsic_block *b);
262static struct btrfsic_block *btrfsic_block_hashtable_lookup(
263 struct block_device *bdev,
264 u64 dev_bytenr,
265 struct btrfsic_block_hashtable *h);
266static void btrfsic_block_link_hashtable_init(
267 struct btrfsic_block_link_hashtable *h);
268static void btrfsic_block_link_hashtable_add(
269 struct btrfsic_block_link *l,
270 struct btrfsic_block_link_hashtable *h);
271static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l);
272static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
273 struct block_device *bdev_ref_to,
274 u64 dev_bytenr_ref_to,
275 struct block_device *bdev_ref_from,
276 u64 dev_bytenr_ref_from,
277 struct btrfsic_block_link_hashtable *h);
278static void btrfsic_dev_state_hashtable_init(
279 struct btrfsic_dev_state_hashtable *h);
280static void btrfsic_dev_state_hashtable_add(
281 struct btrfsic_dev_state *ds,
282 struct btrfsic_dev_state_hashtable *h);
283static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds);
284static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
285 struct block_device *bdev,
286 struct btrfsic_dev_state_hashtable *h);
287static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void);
288static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf);
289static int btrfsic_process_superblock(struct btrfsic_state *state,
290 struct btrfs_fs_devices *fs_devices);
291static int btrfsic_process_metablock(struct btrfsic_state *state,
292 struct btrfsic_block *block,
293 struct btrfsic_block_data_ctx *block_ctx,
294 struct btrfs_header *hdr,
295 int limit_nesting, int force_iodone_flag);
296static int btrfsic_create_link_to_next_block(
297 struct btrfsic_state *state,
298 struct btrfsic_block *block,
299 struct btrfsic_block_data_ctx
300 *block_ctx, u64 next_bytenr,
301 int limit_nesting,
302 struct btrfsic_block_data_ctx *next_block_ctx,
303 struct btrfsic_block **next_blockp,
304 int force_iodone_flag,
305 int *num_copiesp, int *mirror_nump,
306 struct btrfs_disk_key *disk_key,
307 u64 parent_generation);
308static int btrfsic_handle_extent_data(struct btrfsic_state *state,
309 struct btrfsic_block *block,
310 struct btrfsic_block_data_ctx *block_ctx,
311 u32 item_offset, int force_iodone_flag);
312static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
313 struct btrfsic_block_data_ctx *block_ctx_out,
314 int mirror_num);
315static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
316 u32 len, struct block_device *bdev,
317 struct btrfsic_block_data_ctx *block_ctx_out);
318static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
319static int btrfsic_read_block(struct btrfsic_state *state,
320 struct btrfsic_block_data_ctx *block_ctx);
321static void btrfsic_dump_database(struct btrfsic_state *state);
322static int btrfsic_test_for_metadata(struct btrfsic_state *state,
323 const u8 *data, unsigned int size);
324static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
325 u64 dev_bytenr, u8 *mapped_data,
326 unsigned int len, struct bio *bio,
327 int *bio_is_patched,
328 struct buffer_head *bh,
329 int submit_bio_bh_rw);
330static int btrfsic_process_written_superblock(
331 struct btrfsic_state *state,
332 struct btrfsic_block *const block,
333 struct btrfs_super_block *const super_hdr);
334static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status);
335static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate);
336static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state,
337 const struct btrfsic_block *block,
338 int recursion_level);
339static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
340 struct btrfsic_block *const block,
341 int recursion_level);
342static void btrfsic_print_add_link(const struct btrfsic_state *state,
343 const struct btrfsic_block_link *l);
344static void btrfsic_print_rem_link(const struct btrfsic_state *state,
345 const struct btrfsic_block_link *l);
346static char btrfsic_get_block_type(const struct btrfsic_state *state,
347 const struct btrfsic_block *block);
348static void btrfsic_dump_tree(const struct btrfsic_state *state);
349static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
350 const struct btrfsic_block *block,
351 int indent_level);
352static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
353 struct btrfsic_state *state,
354 struct btrfsic_block_data_ctx *next_block_ctx,
355 struct btrfsic_block *next_block,
356 struct btrfsic_block *from_block,
357 u64 parent_generation);
358static struct btrfsic_block *btrfsic_block_lookup_or_add(
359 struct btrfsic_state *state,
360 struct btrfsic_block_data_ctx *block_ctx,
361 const char *additional_string,
362 int is_metadata,
363 int is_iodone,
364 int never_written,
365 int mirror_num,
366 int *was_created);
367static int btrfsic_process_superblock_dev_mirror(
368 struct btrfsic_state *state,
369 struct btrfsic_dev_state *dev_state,
370 struct btrfs_device *device,
371 int superblock_mirror_num,
372 struct btrfsic_dev_state **selected_dev_state,
373 struct btrfs_super_block *selected_super);
374static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
375 struct block_device *bdev);
376static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
377 u64 bytenr,
378 struct btrfsic_dev_state *dev_state,
379 u64 dev_bytenr, char *data);
380
381static struct mutex btrfsic_mutex;
382static int btrfsic_is_initialized;
383static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable;
384
385
386static void btrfsic_block_init(struct btrfsic_block *b)
387{
388 b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER;
389 b->dev_state = NULL;
390 b->dev_bytenr = 0;
391 b->logical_bytenr = 0;
392 b->generation = BTRFSIC_GENERATION_UNKNOWN;
393 b->disk_key.objectid = 0;
394 b->disk_key.type = 0;
395 b->disk_key.offset = 0;
396 b->is_metadata = 0;
397 b->is_superblock = 0;
398 b->is_iodone = 0;
399 b->iodone_w_error = 0;
400 b->never_written = 0;
401 b->mirror_num = 0;
402 b->next_in_same_bio = NULL;
403 b->orig_bio_bh_private = NULL;
404 b->orig_bio_bh_end_io.bio = NULL;
405 INIT_LIST_HEAD(&b->collision_resolving_node);
406 INIT_LIST_HEAD(&b->all_blocks_node);
407 INIT_LIST_HEAD(&b->ref_to_list);
408 INIT_LIST_HEAD(&b->ref_from_list);
409 b->submit_bio_bh_rw = 0;
410 b->flush_gen = 0;
411}
412
413static struct btrfsic_block *btrfsic_block_alloc(void)
414{
415 struct btrfsic_block *b;
416
417 b = kzalloc(sizeof(*b), GFP_NOFS);
418 if (NULL != b)
419 btrfsic_block_init(b);
420
421 return b;
422}
423
424static void btrfsic_block_free(struct btrfsic_block *b)
425{
426 BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num));
427 kfree(b);
428}
429
430static void btrfsic_block_link_init(struct btrfsic_block_link *l)
431{
432 l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER;
433 l->ref_cnt = 1;
434 INIT_LIST_HEAD(&l->node_ref_to);
435 INIT_LIST_HEAD(&l->node_ref_from);
436 INIT_LIST_HEAD(&l->collision_resolving_node);
437 l->block_ref_to = NULL;
438 l->block_ref_from = NULL;
439}
440
441static struct btrfsic_block_link *btrfsic_block_link_alloc(void)
442{
443 struct btrfsic_block_link *l;
444
445 l = kzalloc(sizeof(*l), GFP_NOFS);
446 if (NULL != l)
447 btrfsic_block_link_init(l);
448
449 return l;
450}
451
452static void btrfsic_block_link_free(struct btrfsic_block_link *l)
453{
454 BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num));
455 kfree(l);
456}
457
458static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds)
459{
460 ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER;
461 ds->bdev = NULL;
462 ds->state = NULL;
463 ds->name[0] = '\0';
464 INIT_LIST_HEAD(&ds->collision_resolving_node);
465 ds->last_flush_gen = 0;
466 btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush);
467 ds->dummy_block_for_bio_bh_flush.is_iodone = 1;
468 ds->dummy_block_for_bio_bh_flush.dev_state = ds;
469}
470
471static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void)
472{
473 struct btrfsic_dev_state *ds;
474
475 ds = kzalloc(sizeof(*ds), GFP_NOFS);
476 if (NULL != ds)
477 btrfsic_dev_state_init(ds);
478
479 return ds;
480}
481
482static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds)
483{
484 BUG_ON(!(NULL == ds ||
485 BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num));
486 kfree(ds);
487}
488
489static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h)
490{
491 int i;
492
493 for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++)
494 INIT_LIST_HEAD(h->table + i);
495}
496
497static void btrfsic_block_hashtable_add(struct btrfsic_block *b,
498 struct btrfsic_block_hashtable *h)
499{
500 const unsigned int hashval =
501 (((unsigned int)(b->dev_bytenr >> 16)) ^
502 ((unsigned int)((uintptr_t)b->dev_state->bdev))) &
503 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
504
505 list_add(&b->collision_resolving_node, h->table + hashval);
506}
507
508static void btrfsic_block_hashtable_remove(struct btrfsic_block *b)
509{
510 list_del(&b->collision_resolving_node);
511}
512
513static struct btrfsic_block *btrfsic_block_hashtable_lookup(
514 struct block_device *bdev,
515 u64 dev_bytenr,
516 struct btrfsic_block_hashtable *h)
517{
518 const unsigned int hashval =
519 (((unsigned int)(dev_bytenr >> 16)) ^
520 ((unsigned int)((uintptr_t)bdev))) &
521 (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
522 struct list_head *elem;
523
524 list_for_each(elem, h->table + hashval) {
525 struct btrfsic_block *const b =
526 list_entry(elem, struct btrfsic_block,
527 collision_resolving_node);
528
529 if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
530 return b;
531 }
532
533 return NULL;
534}
535
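[Annotation: the add and lookup helpers above must derive the same bucket from the (dev_bytenr, bdev) pair. A minimal sketch of the intended pairing; the function name is invented and object lifetimes are simplified.]

	/* After adding a block, looking it up by the same key finds it. */
	static struct btrfsic_block *example_insert_and_find(
			struct btrfsic_block *b,
			struct btrfsic_block_hashtable *h)
	{
		btrfsic_block_hashtable_add(b, h);
		return btrfsic_block_hashtable_lookup(b->dev_state->bdev,
						      b->dev_bytenr, h);
	}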
536static void btrfsic_block_link_hashtable_init(
537 struct btrfsic_block_link_hashtable *h)
538{
539 int i;
540
541 for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++)
542 INIT_LIST_HEAD(h->table + i);
543}
544
545static void btrfsic_block_link_hashtable_add(
546 struct btrfsic_block_link *l,
547 struct btrfsic_block_link_hashtable *h)
548{
549 const unsigned int hashval =
550 (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^
551 ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^
552 ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^
553 ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev)))
554 & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
555
556 BUG_ON(NULL == l->block_ref_to);
557 BUG_ON(NULL == l->block_ref_from);
558 list_add(&l->collision_resolving_node, h->table + hashval);
559}
560
561static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l)
562{
563 list_del(&l->collision_resolving_node);
564}
565
566static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
567 struct block_device *bdev_ref_to,
568 u64 dev_bytenr_ref_to,
569 struct block_device *bdev_ref_from,
570 u64 dev_bytenr_ref_from,
571 struct btrfsic_block_link_hashtable *h)
572{
573 const unsigned int hashval =
574 (((unsigned int)(dev_bytenr_ref_to >> 16)) ^
575 ((unsigned int)(dev_bytenr_ref_from >> 16)) ^
576 ((unsigned int)((uintptr_t)bdev_ref_to)) ^
577 ((unsigned int)((uintptr_t)bdev_ref_from))) &
578 (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
579 struct list_head *elem;
580
581 list_for_each(elem, h->table + hashval) {
582 struct btrfsic_block_link *const l =
583 list_entry(elem, struct btrfsic_block_link,
584 collision_resolving_node);
585
586 BUG_ON(NULL == l->block_ref_to);
587 BUG_ON(NULL == l->block_ref_from);
588 if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
589 l->block_ref_to->dev_bytenr == dev_bytenr_ref_to &&
590 l->block_ref_from->dev_state->bdev == bdev_ref_from &&
591 l->block_ref_from->dev_bytenr == dev_bytenr_ref_from)
592 return l;
593 }
594
595 return NULL;
596}
597
598static void btrfsic_dev_state_hashtable_init(
599 struct btrfsic_dev_state_hashtable *h)
600{
601 int i;
602
603 for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++)
604 INIT_LIST_HEAD(h->table + i);
605}
606
607static void btrfsic_dev_state_hashtable_add(
608 struct btrfsic_dev_state *ds,
609 struct btrfsic_dev_state_hashtable *h)
610{
611 const unsigned int hashval =
612 (((unsigned int)((uintptr_t)ds->bdev)) &
613 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
614
615 list_add(&ds->collision_resolving_node, h->table + hashval);
616}
617
618static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds)
619{
620 list_del(&ds->collision_resolving_node);
621}
622
623static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
624 struct block_device *bdev,
625 struct btrfsic_dev_state_hashtable *h)
626{
627 const unsigned int hashval =
628 (((unsigned int)((uintptr_t)bdev)) &
629 (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
630 struct list_head *elem;
631
632 list_for_each(elem, h->table + hashval) {
633 struct btrfsic_dev_state *const ds =
634 list_entry(elem, struct btrfsic_dev_state,
635 collision_resolving_node);
636
637 if (ds->bdev == bdev)
638 return ds;
639 }
640
641 return NULL;
642}
643
644static int btrfsic_process_superblock(struct btrfsic_state *state,
645 struct btrfs_fs_devices *fs_devices)
646{
647 int ret;
648 struct btrfs_super_block *selected_super;
649 struct list_head *dev_head = &fs_devices->devices;
650 struct btrfs_device *device;
651 struct btrfsic_dev_state *selected_dev_state = NULL;
652 int pass;
653
654 BUG_ON(NULL == state);
655 selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
656 if (NULL == selected_super) {
657 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
658 return -1;
659 }
660
661 list_for_each_entry(device, dev_head, dev_list) {
662 int i;
663 struct btrfsic_dev_state *dev_state;
664
665 if (!device->bdev || !device->name)
666 continue;
667
668 dev_state = btrfsic_dev_state_lookup(device->bdev);
669 BUG_ON(NULL == dev_state);
670 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
671 ret = btrfsic_process_superblock_dev_mirror(
672 state, dev_state, device, i,
673 &selected_dev_state, selected_super);
674 if (0 != ret && 0 == i) {
675 kfree(selected_super);
676 return ret;
677 }
678 }
679 }
680
681 if (NULL == state->latest_superblock) {
682 printk(KERN_INFO "btrfsic: no superblock found!\n");
683 kfree(selected_super);
684 return -1;
685 }
686
687 state->csum_size = btrfs_super_csum_size(selected_super);
688
689 for (pass = 0; pass < 3; pass++) {
690 int num_copies;
691 int mirror_num;
692 u64 next_bytenr;
693
694 switch (pass) {
695 case 0:
696 next_bytenr = btrfs_super_root(selected_super);
697 if (state->print_mask &
698 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
699 printk(KERN_INFO "root@%llu\n",
700 (unsigned long long)next_bytenr);
701 break;
702 case 1:
703 next_bytenr = btrfs_super_chunk_root(selected_super);
704 if (state->print_mask &
705 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
706 printk(KERN_INFO "chunk@%llu\n",
707 (unsigned long long)next_bytenr);
708 break;
709 case 2:
710 next_bytenr = btrfs_super_log_root(selected_super);
711 if (0 == next_bytenr)
712 continue;
713 if (state->print_mask &
714 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
715 printk(KERN_INFO "log@%llu\n",
716 (unsigned long long)next_bytenr);
717 break;
718 }
719
720 num_copies =
721 btrfs_num_copies(&state->root->fs_info->mapping_tree,
722 next_bytenr, PAGE_SIZE);
723 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
724 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
725 (unsigned long long)next_bytenr, num_copies);
726
727 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
728 struct btrfsic_block *next_block;
729 struct btrfsic_block_data_ctx tmp_next_block_ctx;
730 struct btrfsic_block_link *l;
731 struct btrfs_header *hdr;
732
733 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
734 &tmp_next_block_ctx,
735 mirror_num);
736 if (ret) {
737 printk(KERN_INFO "btrfsic:"
738 " btrfsic_map_block(root @%llu,"
739 " mirror %d) failed!\n",
740 (unsigned long long)next_bytenr,
741 mirror_num);
742 kfree(selected_super);
743 return -1;
744 }
745
746 next_block = btrfsic_block_hashtable_lookup(
747 tmp_next_block_ctx.dev->bdev,
748 tmp_next_block_ctx.dev_bytenr,
749 &state->block_hashtable);
750 BUG_ON(NULL == next_block);
751
752 l = btrfsic_block_link_hashtable_lookup(
753 tmp_next_block_ctx.dev->bdev,
754 tmp_next_block_ctx.dev_bytenr,
755 state->latest_superblock->dev_state->
756 bdev,
757 state->latest_superblock->dev_bytenr,
758 &state->block_link_hashtable);
759 BUG_ON(NULL == l);
760
761 ret = btrfsic_read_block(state, &tmp_next_block_ctx);
762 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
763 printk(KERN_INFO
764 "btrfsic: read @logical %llu failed!\n",
765 (unsigned long long)
766 tmp_next_block_ctx.start);
767 btrfsic_release_block_ctx(&tmp_next_block_ctx);
768 kfree(selected_super);
769 return -1;
770 }
771
772 hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
773 ret = btrfsic_process_metablock(state,
774 next_block,
775 &tmp_next_block_ctx,
776 hdr,
777 BTRFS_MAX_LEVEL + 3, 1);
778 btrfsic_release_block_ctx(&tmp_next_block_ctx);
779 }
780 }
781
782 kfree(selected_super);
783 return ret;
784}
785
786static int btrfsic_process_superblock_dev_mirror(
787 struct btrfsic_state *state,
788 struct btrfsic_dev_state *dev_state,
789 struct btrfs_device *device,
790 int superblock_mirror_num,
791 struct btrfsic_dev_state **selected_dev_state,
792 struct btrfs_super_block *selected_super)
793{
794 struct btrfs_super_block *super_tmp;
795 u64 dev_bytenr;
796 struct buffer_head *bh;
797 struct btrfsic_block *superblock_tmp;
798 int pass;
799 struct block_device *const superblock_bdev = device->bdev;
800
801 /* super block bytenr is always the unmapped device bytenr */
802 dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
803 bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
804 if (NULL == bh)
805 return -1;
806 super_tmp = (struct btrfs_super_block *)
807 (bh->b_data + (dev_bytenr & 4095));
808
809 if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
810 strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
811 sizeof(super_tmp->magic)) ||
812 memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
813 brelse(bh);
814 return 0;
815 }
816
817 superblock_tmp =
818 btrfsic_block_hashtable_lookup(superblock_bdev,
819 dev_bytenr,
820 &state->block_hashtable);
821 if (NULL == superblock_tmp) {
822 superblock_tmp = btrfsic_block_alloc();
823 if (NULL == superblock_tmp) {
824 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
825 brelse(bh);
826 return -1;
827 }
828 /* for superblock, only the dev_bytenr makes sense */
829 superblock_tmp->dev_bytenr = dev_bytenr;
830 superblock_tmp->dev_state = dev_state;
831 superblock_tmp->logical_bytenr = dev_bytenr;
832 superblock_tmp->generation = btrfs_super_generation(super_tmp);
833 superblock_tmp->is_metadata = 1;
834 superblock_tmp->is_superblock = 1;
835 superblock_tmp->is_iodone = 1;
836 superblock_tmp->never_written = 0;
837 superblock_tmp->mirror_num = 1 + superblock_mirror_num;
838 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
839 printk(KERN_INFO "New initial S-block (bdev %p, %s)"
840 " @%llu (%s/%llu/%d)\n",
841 superblock_bdev, device->name,
842 (unsigned long long)dev_bytenr,
843 dev_state->name,
844 (unsigned long long)dev_bytenr,
845 superblock_mirror_num);
846 list_add(&superblock_tmp->all_blocks_node,
847 &state->all_blocks_list);
848 btrfsic_block_hashtable_add(superblock_tmp,
849 &state->block_hashtable);
850 }
851
852 /* select the one with the highest generation field */
853 if (btrfs_super_generation(super_tmp) >
854 state->max_superblock_generation ||
855 0 == state->max_superblock_generation) {
856 memcpy(selected_super, super_tmp, sizeof(*selected_super));
857 *selected_dev_state = dev_state;
858 state->max_superblock_generation =
859 btrfs_super_generation(super_tmp);
860 state->latest_superblock = superblock_tmp;
861 }
862
863 for (pass = 0; pass < 3; pass++) {
864 u64 next_bytenr;
865 int num_copies;
866 int mirror_num;
867 const char *additional_string = NULL;
868 struct btrfs_disk_key tmp_disk_key;
869
870 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
871 tmp_disk_key.offset = 0;
872 switch (pass) {
873 case 0:
874 tmp_disk_key.objectid =
875 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
876 additional_string = "initial root ";
877 next_bytenr = btrfs_super_root(super_tmp);
878 break;
879 case 1:
880 tmp_disk_key.objectid =
881 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
882 additional_string = "initial chunk ";
883 next_bytenr = btrfs_super_chunk_root(super_tmp);
884 break;
885 case 2:
886 tmp_disk_key.objectid =
887 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
888 additional_string = "initial log ";
889 next_bytenr = btrfs_super_log_root(super_tmp);
890 if (0 == next_bytenr)
891 continue;
892 break;
893 }
894
895 num_copies =
896 btrfs_num_copies(&state->root->fs_info->mapping_tree,
897 next_bytenr, PAGE_SIZE);
898 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
899 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
900 (unsigned long long)next_bytenr, num_copies);
901 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
902 struct btrfsic_block *next_block;
903 struct btrfsic_block_data_ctx tmp_next_block_ctx;
904 struct btrfsic_block_link *l;
905
906 if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
907 &tmp_next_block_ctx,
908 mirror_num)) {
909 printk(KERN_INFO "btrfsic: btrfsic_map_block("
910 "bytenr @%llu, mirror %d) failed!\n",
911 (unsigned long long)next_bytenr,
912 mirror_num);
913 brelse(bh);
914 return -1;
915 }
916
917 next_block = btrfsic_block_lookup_or_add(
918 state, &tmp_next_block_ctx,
919 additional_string, 1, 1, 0,
920 mirror_num, NULL);
921 if (NULL == next_block) {
922 btrfsic_release_block_ctx(&tmp_next_block_ctx);
923 brelse(bh);
924 return -1;
925 }
926
927 next_block->disk_key = tmp_disk_key;
928 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
929 l = btrfsic_block_link_lookup_or_add(
930 state, &tmp_next_block_ctx,
931 next_block, superblock_tmp,
932 BTRFSIC_GENERATION_UNKNOWN);
933 btrfsic_release_block_ctx(&tmp_next_block_ctx);
934 if (NULL == l) {
935 brelse(bh);
936 return -1;
937 }
938 }
939 }
940 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES)
941 btrfsic_dump_tree_sub(state, superblock_tmp, 0);
942
943 brelse(bh);
944 return 0;
945}
946
947static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void)
948{
949 struct btrfsic_stack_frame *sf;
950
951 sf = kzalloc(sizeof(*sf), GFP_NOFS);
952 if (NULL == sf)
953 printk(KERN_INFO "btrfsic: alloc memory failed!\n");
954 else
955 sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER;
956 return sf;
957}
958
959static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf)
960{
961 BUG_ON(!(NULL == sf ||
962 BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic));
963 kfree(sf);
964}
965
966static int btrfsic_process_metablock(
967 struct btrfsic_state *state,
968 struct btrfsic_block *const first_block,
969 struct btrfsic_block_data_ctx *const first_block_ctx,
970 struct btrfs_header *const first_hdr,
971 int first_limit_nesting, int force_iodone_flag)
972{
973 struct btrfsic_stack_frame initial_stack_frame = { 0 };
974 struct btrfsic_stack_frame *sf;
975 struct btrfsic_stack_frame *next_stack;
976
977 sf = &initial_stack_frame;
978 sf->error = 0;
979 sf->i = -1;
980 sf->limit_nesting = first_limit_nesting;
981 sf->block = first_block;
982 sf->block_ctx = first_block_ctx;
983 sf->next_block = NULL;
984 sf->hdr = first_hdr;
985 sf->prev = NULL;
986
987continue_with_new_stack_frame:
988 sf->block->generation = le64_to_cpu(sf->hdr->generation);
989 if (0 == sf->hdr->level) {
990 struct btrfs_leaf *const leafhdr =
991 (struct btrfs_leaf *)sf->hdr;
992
993 if (-1 == sf->i) {
994 sf->nr = le32_to_cpu(leafhdr->header.nritems);
995
996 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
997 printk(KERN_INFO
998 "leaf %llu items %d generation %llu"
999 " owner %llu\n",
1000 (unsigned long long)
1001 sf->block_ctx->start,
1002 sf->nr,
1003 (unsigned long long)
1004 le64_to_cpu(leafhdr->header.generation),
1005 (unsigned long long)
1006 le64_to_cpu(leafhdr->header.owner));
1007 }
1008
1009continue_with_current_leaf_stack_frame:
1010 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1011 sf->i++;
1012 sf->num_copies = 0;
1013 }
1014
1015 if (sf->i < sf->nr) {
1016 struct btrfs_item *disk_item = leafhdr->items + sf->i;
1017 struct btrfs_disk_key *disk_key = &disk_item->key;
1018 u8 type;
1019 const u32 item_offset = le32_to_cpu(disk_item->offset);
1020
1021 type = disk_key->type;
1022
1023 if (BTRFS_ROOT_ITEM_KEY == type) {
1024 const struct btrfs_root_item *const root_item =
1025 (struct btrfs_root_item *)
1026 (sf->block_ctx->data +
1027 offsetof(struct btrfs_leaf, items) +
1028 item_offset);
1029 const u64 next_bytenr =
1030 le64_to_cpu(root_item->bytenr);
1031
1032 sf->error =
1033 btrfsic_create_link_to_next_block(
1034 state,
1035 sf->block,
1036 sf->block_ctx,
1037 next_bytenr,
1038 sf->limit_nesting,
1039 &sf->next_block_ctx,
1040 &sf->next_block,
1041 force_iodone_flag,
1042 &sf->num_copies,
1043 &sf->mirror_num,
1044 disk_key,
1045 le64_to_cpu(root_item->
1046 generation));
1047 if (sf->error)
1048 goto one_stack_frame_backwards;
1049
1050 if (NULL != sf->next_block) {
1051 struct btrfs_header *const next_hdr =
1052 (struct btrfs_header *)
1053 sf->next_block_ctx.data;
1054
1055 next_stack =
1056 btrfsic_stack_frame_alloc();
1057 if (NULL == next_stack) {
1058 btrfsic_release_block_ctx(
1059 &sf->
1060 next_block_ctx);
1061 goto one_stack_frame_backwards;
1062 }
1063
1064 next_stack->i = -1;
1065 next_stack->block = sf->next_block;
1066 next_stack->block_ctx =
1067 &sf->next_block_ctx;
1068 next_stack->next_block = NULL;
1069 next_stack->hdr = next_hdr;
1070 next_stack->limit_nesting =
1071 sf->limit_nesting - 1;
1072 next_stack->prev = sf;
1073 sf = next_stack;
1074 goto continue_with_new_stack_frame;
1075 }
1076 } else if (BTRFS_EXTENT_DATA_KEY == type &&
1077 state->include_extent_data) {
1078 sf->error = btrfsic_handle_extent_data(
1079 state,
1080 sf->block,
1081 sf->block_ctx,
1082 item_offset,
1083 force_iodone_flag);
1084 if (sf->error)
1085 goto one_stack_frame_backwards;
1086 }
1087
1088 goto continue_with_current_leaf_stack_frame;
1089 }
1090 } else {
1091 struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
1092
1093 if (-1 == sf->i) {
1094 sf->nr = le32_to_cpu(nodehdr->header.nritems);
1095
1096 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1097 printk(KERN_INFO "node %llu level %d items %d"
1098 " generation %llu owner %llu\n",
1099 (unsigned long long)
1100 sf->block_ctx->start,
1101 nodehdr->header.level, sf->nr,
1102 (unsigned long long)
1103 le64_to_cpu(nodehdr->header.generation),
1104 (unsigned long long)
1105 le64_to_cpu(nodehdr->header.owner));
1106 }
1107
1108continue_with_current_node_stack_frame:
1109 if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) {
1110 sf->i++;
1111 sf->num_copies = 0;
1112 }
1113
1114 if (sf->i < sf->nr) {
1115 struct btrfs_key_ptr *disk_key_ptr =
1116 nodehdr->ptrs + sf->i;
1117 const u64 next_bytenr =
1118 le64_to_cpu(disk_key_ptr->blockptr);
1119
1120 sf->error = btrfsic_create_link_to_next_block(
1121 state,
1122 sf->block,
1123 sf->block_ctx,
1124 next_bytenr,
1125 sf->limit_nesting,
1126 &sf->next_block_ctx,
1127 &sf->next_block,
1128 force_iodone_flag,
1129 &sf->num_copies,
1130 &sf->mirror_num,
1131 &disk_key_ptr->key,
1132 le64_to_cpu(disk_key_ptr->generation));
1133 if (sf->error)
1134 goto one_stack_frame_backwards;
1135
1136 if (NULL != sf->next_block) {
1137 struct btrfs_header *const next_hdr =
1138 (struct btrfs_header *)
1139 sf->next_block_ctx.data;
1140
1141 next_stack = btrfsic_stack_frame_alloc();
1142 if (NULL == next_stack)
1143 goto one_stack_frame_backwards;
1144
1145 next_stack->i = -1;
1146 next_stack->block = sf->next_block;
1147 next_stack->block_ctx = &sf->next_block_ctx;
1148 next_stack->next_block = NULL;
1149 next_stack->hdr = next_hdr;
1150 next_stack->limit_nesting =
1151 sf->limit_nesting - 1;
1152 next_stack->prev = sf;
1153 sf = next_stack;
1154 goto continue_with_new_stack_frame;
1155 }
1156
1157 goto continue_with_current_node_stack_frame;
1158 }
1159 }
1160
1161one_stack_frame_backwards:
1162 if (NULL != sf->prev) {
1163 struct btrfsic_stack_frame *const prev = sf->prev;
1164
1165 /* the one for the initial block is freed in the caller */
1166 btrfsic_release_block_ctx(sf->block_ctx);
1167
1168 if (sf->error) {
1169 prev->error = sf->error;
1170 btrfsic_stack_frame_free(sf);
1171 sf = prev;
1172 goto one_stack_frame_backwards;
1173 }
1174
1175 btrfsic_stack_frame_free(sf);
1176 sf = prev;
1177 goto continue_with_new_stack_frame;
1178 } else {
1179 BUG_ON(&initial_stack_frame != sf);
1180 }
1181
1182 return sf->error;
1183}
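
/*
 * Note on the control flow above: btrfsic_process_metablock() avoids
 * deep kernel-stack recursion by keeping an explicit chain of
 * btrfsic_stack_frame structures.  The labels
 * continue_with_new_stack_frame and one_stack_frame_backwards
 * implement descend and ascend on that chain, one frame per tree
 * level, so the walk over nodes and leaves is iterative.
 */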
1184
1185static int btrfsic_create_link_to_next_block(
1186 struct btrfsic_state *state,
1187 struct btrfsic_block *block,
1188 struct btrfsic_block_data_ctx *block_ctx,
1189 u64 next_bytenr,
1190 int limit_nesting,
1191 struct btrfsic_block_data_ctx *next_block_ctx,
1192 struct btrfsic_block **next_blockp,
1193 int force_iodone_flag,
1194 int *num_copiesp, int *mirror_nump,
1195 struct btrfs_disk_key *disk_key,
1196 u64 parent_generation)
1197{
1198 struct btrfsic_block *next_block = NULL;
1199 int ret;
1200 struct btrfsic_block_link *l;
1201 int did_alloc_block_link;
1202 int block_was_created;
1203
1204 *next_blockp = NULL;
1205 if (0 == *num_copiesp) {
1206 *num_copiesp =
1207 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1208 next_bytenr, PAGE_SIZE);
1209 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1210 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1211 (unsigned long long)next_bytenr, *num_copiesp);
1212 *mirror_nump = 1;
1213 }
1214
1215 if (*mirror_nump > *num_copiesp)
1216 return 0;
1217
1218 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1219 printk(KERN_INFO
1220 "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
1221 *mirror_nump);
1222 ret = btrfsic_map_block(state, next_bytenr,
1223 BTRFSIC_BLOCK_SIZE,
1224 next_block_ctx, *mirror_nump);
1225 if (ret) {
1226 printk(KERN_INFO
1227 "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
1228 (unsigned long long)next_bytenr, *mirror_nump);
1229 btrfsic_release_block_ctx(next_block_ctx);
1230 *next_blockp = NULL;
1231 return -1;
1232 }
1233
1234 next_block = btrfsic_block_lookup_or_add(state,
1235 next_block_ctx, "referenced ",
1236 1, force_iodone_flag,
1237 !force_iodone_flag,
1238 *mirror_nump,
1239 &block_was_created);
1240 if (NULL == next_block) {
1241 btrfsic_release_block_ctx(next_block_ctx);
1242 *next_blockp = NULL;
1243 return -1;
1244 }
1245 if (block_was_created) {
1246 l = NULL;
1247 next_block->generation = BTRFSIC_GENERATION_UNKNOWN;
1248 } else {
1249 if (next_block->logical_bytenr != next_bytenr &&
1250 !(!next_block->is_metadata &&
1251 0 == next_block->logical_bytenr)) {
1252 printk(KERN_INFO
1253 "Referenced block @%llu (%s/%llu/%d)"
1254 " found in hash table, %c,"
1255 " bytenr mismatch (!= stored %llu).\n",
1256 (unsigned long long)next_bytenr,
1257 next_block_ctx->dev->name,
1258 (unsigned long long)next_block_ctx->dev_bytenr,
1259 *mirror_nump,
1260 btrfsic_get_block_type(state, next_block),
1261 (unsigned long long)next_block->logical_bytenr);
1262 } else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1263 printk(KERN_INFO
1264 "Referenced block @%llu (%s/%llu/%d)"
1265 " found in hash table, %c.\n",
1266 (unsigned long long)next_bytenr,
1267 next_block_ctx->dev->name,
1268 (unsigned long long)next_block_ctx->dev_bytenr,
1269 *mirror_nump,
1270 btrfsic_get_block_type(state, next_block));
1271 next_block->logical_bytenr = next_bytenr;
1272
1273 next_block->mirror_num = *mirror_nump;
1274 l = btrfsic_block_link_hashtable_lookup(
1275 next_block_ctx->dev->bdev,
1276 next_block_ctx->dev_bytenr,
1277 block_ctx->dev->bdev,
1278 block_ctx->dev_bytenr,
1279 &state->block_link_hashtable);
1280 }
1281
1282 next_block->disk_key = *disk_key;
1283 if (NULL == l) {
1284 l = btrfsic_block_link_alloc();
1285 if (NULL == l) {
1286 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1287 btrfsic_release_block_ctx(next_block_ctx);
1288 *next_blockp = NULL;
1289 return -1;
1290 }
1291
1292 did_alloc_block_link = 1;
1293 l->block_ref_to = next_block;
1294 l->block_ref_from = block;
1295 l->ref_cnt = 1;
1296 l->parent_generation = parent_generation;
1297
1298 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1299 btrfsic_print_add_link(state, l);
1300
1301 list_add(&l->node_ref_to, &block->ref_to_list);
1302 list_add(&l->node_ref_from, &next_block->ref_from_list);
1303
1304 btrfsic_block_link_hashtable_add(l,
1305 &state->block_link_hashtable);
1306 } else {
1307 did_alloc_block_link = 0;
1308 if (0 == limit_nesting) {
1309 l->ref_cnt++;
1310 l->parent_generation = parent_generation;
1311 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1312 btrfsic_print_add_link(state, l);
1313 }
1314 }
1315
1316 if (limit_nesting > 0 && did_alloc_block_link) {
1317 ret = btrfsic_read_block(state, next_block_ctx);
1318 if (ret < (int)BTRFSIC_BLOCK_SIZE) {
1319 printk(KERN_INFO
1320 "btrfsic: read block @logical %llu failed!\n",
1321 (unsigned long long)next_bytenr);
1322 btrfsic_release_block_ctx(next_block_ctx);
1323 *next_blockp = NULL;
1324 return -1;
1325 }
1326
1327 *next_blockp = next_block;
1328 } else {
1329 *next_blockp = NULL;
1330 }
1331 (*mirror_nump)++;
1332
1333 return 0;
1334}
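
/*
 * Calling convention sketch (derived from the callers above, not
 * authoritative): btrfsic_create_link_to_next_block() is invoked once
 * per mirror for the same key pointer.  With *num_copiesp == 0 it
 * initializes the copy count via btrfs_num_copies() and sets
 * *mirror_nump to 1; every successful call then bumps *mirror_nump,
 * and the caller keeps re-processing the same item until
 * mirror_num > num_copies:
 *
 *	sf->error = btrfsic_create_link_to_next_block(state, ...,
 *			&sf->num_copies, &sf->mirror_num, ...);
 *	if (NULL != sf->next_block)
 *		descend into the child block;
 *
 * *next_blockp is only set (and the child block actually read) when
 * limit_nesting > 0 and the block link was newly allocated; otherwise
 * only the existing link's refcount is updated.
 */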
1335
1336static int btrfsic_handle_extent_data(
1337 struct btrfsic_state *state,
1338 struct btrfsic_block *block,
1339 struct btrfsic_block_data_ctx *block_ctx,
1340 u32 item_offset, int force_iodone_flag)
1341{
1342 int ret;
1343 struct btrfs_file_extent_item *file_extent_item =
1344 (struct btrfs_file_extent_item *)(block_ctx->data +
1345 offsetof(struct btrfs_leaf,
1346 items) + item_offset);
1347 u64 next_bytenr =
1348 le64_to_cpu(file_extent_item->disk_bytenr) +
1349 le64_to_cpu(file_extent_item->offset);
1350 u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
1351 u64 generation = le64_to_cpu(file_extent_item->generation);
1352 struct btrfsic_block_link *l;
1353
1354 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1355 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
1356 " offset = %llu, num_bytes = %llu\n",
1357 file_extent_item->type,
1358 (unsigned long long)
1359 le64_to_cpu(file_extent_item->disk_bytenr),
1360 (unsigned long long)
1361 le64_to_cpu(file_extent_item->offset),
1362 (unsigned long long)
1363 le64_to_cpu(file_extent_item->num_bytes));
1364 if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
1365 ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
1366 return 0;
1367 while (num_bytes > 0) {
1368 u32 chunk_len;
1369 int num_copies;
1370 int mirror_num;
1371
1372 if (num_bytes > BTRFSIC_BLOCK_SIZE)
1373 chunk_len = BTRFSIC_BLOCK_SIZE;
1374 else
1375 chunk_len = num_bytes;
1376
1377 num_copies =
1378 btrfs_num_copies(&state->root->fs_info->mapping_tree,
1379 next_bytenr, PAGE_SIZE);
1380 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
1381 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
1382 (unsigned long long)next_bytenr, num_copies);
1383 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
1384 struct btrfsic_block_data_ctx next_block_ctx;
1385 struct btrfsic_block *next_block;
1386 int block_was_created;
1387
1388 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1389 printk(KERN_INFO "btrfsic_handle_extent_data("
1390 "mirror_num=%d)\n", mirror_num);
1391 if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
1392 printk(KERN_INFO
1393 "\tdisk_bytenr = %llu, num_bytes %u\n",
1394 (unsigned long long)next_bytenr,
1395 chunk_len);
1396 ret = btrfsic_map_block(state, next_bytenr,
1397 chunk_len, &next_block_ctx,
1398 mirror_num);
1399 if (ret) {
1400 printk(KERN_INFO
1401 "btrfsic: btrfsic_map_block(@%llu,"
1402 " mirror=%d) failed!\n",
1403 (unsigned long long)next_bytenr,
1404 mirror_num);
1405 return -1;
1406 }
1407
1408 next_block = btrfsic_block_lookup_or_add(
1409 state,
1410 &next_block_ctx,
1411 "referenced ",
1412 0,
1413 force_iodone_flag,
1414 !force_iodone_flag,
1415 mirror_num,
1416 &block_was_created);
1417 if (NULL == next_block) {
1418 printk(KERN_INFO
1419 "btrfsic: error, kmalloc failed!\n");
1420 btrfsic_release_block_ctx(&next_block_ctx);
1421 return -1;
1422 }
1423 if (!block_was_created) {
1424 if (next_block->logical_bytenr != next_bytenr &&
1425 !(!next_block->is_metadata &&
1426 0 == next_block->logical_bytenr)) {
1427 printk(KERN_INFO
1428 "Referenced block"
1429 " @%llu (%s/%llu/%d)"
1430 " found in hash table, D,"
1431 " bytenr mismatch"
1432 " (!= stored %llu).\n",
1433 (unsigned long long)next_bytenr,
1434 next_block_ctx.dev->name,
1435 (unsigned long long)
1436 next_block_ctx.dev_bytenr,
1437 mirror_num,
1438 (unsigned long long)
1439 next_block->logical_bytenr);
1440 }
1441 next_block->logical_bytenr = next_bytenr;
1442 next_block->mirror_num = mirror_num;
1443 }
1444
1445 l = btrfsic_block_link_lookup_or_add(state,
1446 &next_block_ctx,
1447 next_block, block,
1448 generation);
1449 btrfsic_release_block_ctx(&next_block_ctx);
1450 if (NULL == l)
1451 return -1;
1452 }
1453
1454 next_bytenr += chunk_len;
1455 num_bytes -= chunk_len;
1456 }
1457
1458 return 0;
1459}
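
/*
 * Worked example, assuming BTRFSIC_BLOCK_SIZE is 4096: a regular file
 * extent with disk_bytenr = X, offset = O and num_bytes = 10240 is
 * tracked as data starting at logical X + O, and the loop above splits
 * it into chunks of 4096, 4096 and 2048 bytes.  For every chunk, one
 * block link per mirror copy is looked up or created.
 */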
1460
1461static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
1462 struct btrfsic_block_data_ctx *block_ctx_out,
1463 int mirror_num)
1464{
1465 int ret;
1466 u64 length;
1467 struct btrfs_bio *multi = NULL;
1468 struct btrfs_device *device;
1469
1470 length = len;
1471	ret = btrfs_map_block(&state->root->fs_info->mapping_tree, READ,
1472			      bytenr, &length, &multi, mirror_num);
1473
	/* bail out before dereferencing multi; it is not valid when the
	 * mapping failed */
	if (ret) {
		block_ctx_out->dev = NULL;
		return ret;
	}

1474	device = multi->stripes[0].dev;
1475 block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev);
1476 block_ctx_out->dev_bytenr = multi->stripes[0].physical;
1477 block_ctx_out->start = bytenr;
1478 block_ctx_out->len = len;
1479 block_ctx_out->data = NULL;
1480 block_ctx_out->bh = NULL;
1481
1482 if (0 == ret)
1483 kfree(multi);
1484 if (NULL == block_ctx_out->dev) {
1485 ret = -ENXIO;
1486 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#1)!\n");
1487 }
1488
1489 return ret;
1490}
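
/*
 * Usage sketch: every successful btrfsic_map_block() call is paired
 * with btrfsic_release_block_ctx() once the mapping is no longer
 * needed, the pattern used throughout this file:
 *
 *	if (0 == btrfsic_map_block(state, bytenr, len, &ctx, mirror_num)) {
 *		use ctx.dev and ctx.dev_bytenr ...;
 *		btrfsic_release_block_ctx(&ctx);
 *	}
 */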
1491
1492static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
1493 u32 len, struct block_device *bdev,
1494 struct btrfsic_block_data_ctx *block_ctx_out)
1495{
1496 block_ctx_out->dev = btrfsic_dev_state_lookup(bdev);
1497 block_ctx_out->dev_bytenr = bytenr;
1498 block_ctx_out->start = bytenr;
1499 block_ctx_out->len = len;
1500 block_ctx_out->data = NULL;
1501 block_ctx_out->bh = NULL;
1502 if (NULL != block_ctx_out->dev) {
1503 return 0;
1504 } else {
1505 printk(KERN_INFO "btrfsic: error, cannot lookup dev (#2)!\n");
1506 return -ENXIO;
1507 }
1508}
1509
1510static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
1511{
1512 if (NULL != block_ctx->bh) {
1513 brelse(block_ctx->bh);
1514 block_ctx->bh = NULL;
1515 }
1516}
1517
1518static int btrfsic_read_block(struct btrfsic_state *state,
1519 struct btrfsic_block_data_ctx *block_ctx)
1520{
1521 block_ctx->bh = NULL;
1522 if (block_ctx->dev_bytenr & 4095) {
1523 printk(KERN_INFO
1524 "btrfsic: read_block() with unaligned bytenr %llu\n",
1525 (unsigned long long)block_ctx->dev_bytenr);
1526 return -1;
1527 }
1528 if (block_ctx->len > 4096) {
1529 printk(KERN_INFO
1530		       "btrfsic: read_block() with too large size %d\n",
1531 block_ctx->len);
1532 return -1;
1533 }
1534
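	/* __bread() takes the block number in units of the block size;
	 * with 4096-byte blocks that is dev_bytenr >> 12 */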
1535 block_ctx->bh = __bread(block_ctx->dev->bdev,
1536 block_ctx->dev_bytenr >> 12, 4096);
1537 if (NULL == block_ctx->bh)
1538 return -1;
1539 block_ctx->data = block_ctx->bh->b_data;
1540
1541 return block_ctx->len;
1542}
1543
1544static void btrfsic_dump_database(struct btrfsic_state *state)
1545{
1546 struct list_head *elem_all;
1547
1548 BUG_ON(NULL == state);
1549
1550 printk(KERN_INFO "all_blocks_list:\n");
1551 list_for_each(elem_all, &state->all_blocks_list) {
1552 const struct btrfsic_block *const b_all =
1553 list_entry(elem_all, struct btrfsic_block,
1554 all_blocks_node);
1555 struct list_head *elem_ref_to;
1556 struct list_head *elem_ref_from;
1557
1558 printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
1559 btrfsic_get_block_type(state, b_all),
1560 (unsigned long long)b_all->logical_bytenr,
1561 b_all->dev_state->name,
1562 (unsigned long long)b_all->dev_bytenr,
1563 b_all->mirror_num);
1564
1565 list_for_each(elem_ref_to, &b_all->ref_to_list) {
1566 const struct btrfsic_block_link *const l =
1567 list_entry(elem_ref_to,
1568 struct btrfsic_block_link,
1569 node_ref_to);
1570
1571 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1572 " refers %u* to"
1573 " %c @%llu (%s/%llu/%d)\n",
1574 btrfsic_get_block_type(state, b_all),
1575 (unsigned long long)b_all->logical_bytenr,
1576 b_all->dev_state->name,
1577 (unsigned long long)b_all->dev_bytenr,
1578 b_all->mirror_num,
1579 l->ref_cnt,
1580 btrfsic_get_block_type(state, l->block_ref_to),
1581 (unsigned long long)
1582 l->block_ref_to->logical_bytenr,
1583 l->block_ref_to->dev_state->name,
1584 (unsigned long long)l->block_ref_to->dev_bytenr,
1585 l->block_ref_to->mirror_num);
1586 }
1587
1588 list_for_each(elem_ref_from, &b_all->ref_from_list) {
1589 const struct btrfsic_block_link *const l =
1590 list_entry(elem_ref_from,
1591 struct btrfsic_block_link,
1592 node_ref_from);
1593
1594 printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
1595 " is ref %u* from"
1596 " %c @%llu (%s/%llu/%d)\n",
1597 btrfsic_get_block_type(state, b_all),
1598 (unsigned long long)b_all->logical_bytenr,
1599 b_all->dev_state->name,
1600 (unsigned long long)b_all->dev_bytenr,
1601 b_all->mirror_num,
1602 l->ref_cnt,
1603 btrfsic_get_block_type(state, l->block_ref_from),
1604 (unsigned long long)
1605 l->block_ref_from->logical_bytenr,
1606 l->block_ref_from->dev_state->name,
1607 (unsigned long long)
1608 l->block_ref_from->dev_bytenr,
1609 l->block_ref_from->mirror_num);
1610 }
1611
1612 printk(KERN_INFO "\n");
1613 }
1614}
1615
1616/*
1617 * Test whether the disk block contains a tree block (leaf or node)
1618 * (note that this test fails for the super block)
1619 */
1620static int btrfsic_test_for_metadata(struct btrfsic_state *state,
1621 const u8 *data, unsigned int size)
1622{
1623 struct btrfs_header *h;
1624 u8 csum[BTRFS_CSUM_SIZE];
1625 u32 crc = ~(u32)0;
1626 int fail = 0;
1627 int crc_fail = 0;
1628
1629 h = (struct btrfs_header *)data;
1630
1631 if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
1632 fail++;
1633
1634 crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
1635 btrfs_csum_final(crc, csum);
1636 if (memcmp(csum, h->csum, state->csum_size))
1637 crc_fail++;
1638
1639 return fail || crc_fail;
1640}
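
/*
 * Note that the return convention is inverted relative to the name:
 * 0 means the block does look like a tree block (the fsid matches and
 * the crc32c over the page behind the checksum area matches), non-zero
 * means it does not.  Callers therefore test for
 * "0 == btrfsic_test_for_metadata(...)".  The 'size' parameter is
 * currently unused; the checksum is always computed over PAGE_SIZE
 * bytes.
 */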
1641
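/*
 * Summary of the function below: every block that is about to be
 * written passes through this hook.  If the block is already known
 * (hash table hit), its outgoing references are dropped, the
 * overwrite-safety checks are run and its state is refreshed from the
 * new contents; if it is unknown, a new btrfsic_block is allocated.
 * In both cases the bio/bh completion handler is patched (see
 * btrfsic_bio_end_io() and btrfsic_bh_end_io()) so that is_iodone and
 * flush_gen can be maintained when the IO completes.
 */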
1642static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
1643 u64 dev_bytenr,
1644 u8 *mapped_data, unsigned int len,
1645 struct bio *bio,
1646 int *bio_is_patched,
1647 struct buffer_head *bh,
1648 int submit_bio_bh_rw)
1649{
1650 int is_metadata;
1651 struct btrfsic_block *block;
1652 struct btrfsic_block_data_ctx block_ctx;
1653 int ret;
1654 struct btrfsic_state *state = dev_state->state;
1655 struct block_device *bdev = dev_state->bdev;
1656
1657 WARN_ON(len > PAGE_SIZE);
1658 is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
1659 if (NULL != bio_is_patched)
1660 *bio_is_patched = 0;
1661
1662 block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
1663 &state->block_hashtable);
1664 if (NULL != block) {
1665 u64 bytenr = 0;
1666 struct list_head *elem_ref_to;
1667 struct list_head *tmp_ref_to;
1668
1669 if (block->is_superblock) {
1670 bytenr = le64_to_cpu(((struct btrfs_super_block *)
1671 mapped_data)->bytenr);
1672 is_metadata = 1;
1673 if (state->print_mask &
1674 BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
1675 printk(KERN_INFO
1676 "[before new superblock is written]:\n");
1677 btrfsic_dump_tree_sub(state, block, 0);
1678 }
1679 }
1680 if (is_metadata) {
1681 if (!block->is_superblock) {
1682 bytenr = le64_to_cpu(((struct btrfs_header *)
1683 mapped_data)->bytenr);
1684 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
1685 dev_state,
1686 dev_bytenr,
1687 mapped_data);
1688 }
1689 if (block->logical_bytenr != bytenr) {
1690 printk(KERN_INFO
1691 "Written block @%llu (%s/%llu/%d)"
1692 " found in hash table, %c,"
1693 " bytenr mismatch"
1694 " (!= stored %llu).\n",
1695 (unsigned long long)bytenr,
1696 dev_state->name,
1697 (unsigned long long)dev_bytenr,
1698 block->mirror_num,
1699 btrfsic_get_block_type(state, block),
1700 (unsigned long long)
1701 block->logical_bytenr);
1702 block->logical_bytenr = bytenr;
1703 } else if (state->print_mask &
1704 BTRFSIC_PRINT_MASK_VERBOSE)
1705 printk(KERN_INFO
1706 "Written block @%llu (%s/%llu/%d)"
1707 " found in hash table, %c.\n",
1708 (unsigned long long)bytenr,
1709 dev_state->name,
1710 (unsigned long long)dev_bytenr,
1711 block->mirror_num,
1712 btrfsic_get_block_type(state, block));
1713 } else {
1714 bytenr = block->logical_bytenr;
1715 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1716 printk(KERN_INFO
1717 "Written block @%llu (%s/%llu/%d)"
1718 " found in hash table, %c.\n",
1719 (unsigned long long)bytenr,
1720 dev_state->name,
1721 (unsigned long long)dev_bytenr,
1722 block->mirror_num,
1723 btrfsic_get_block_type(state, block));
1724 }
1725
1726 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1727 printk(KERN_INFO
1728 "ref_to_list: %cE, ref_from_list: %cE\n",
1729 list_empty(&block->ref_to_list) ? ' ' : '!',
1730 list_empty(&block->ref_from_list) ? ' ' : '!');
1731 if (btrfsic_is_block_ref_by_superblock(state, block, 0)) {
1732 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1733 " @%llu (%s/%llu/%d), old(gen=%llu,"
1734 " objectid=%llu, type=%d, offset=%llu),"
1735 " new(gen=%llu),"
1736 " which is referenced by most recent superblock"
1737 " (superblockgen=%llu)!\n",
1738 btrfsic_get_block_type(state, block),
1739 (unsigned long long)bytenr,
1740 dev_state->name,
1741 (unsigned long long)dev_bytenr,
1742 block->mirror_num,
1743 (unsigned long long)block->generation,
1744 (unsigned long long)
1745 le64_to_cpu(block->disk_key.objectid),
1746 block->disk_key.type,
1747 (unsigned long long)
1748 le64_to_cpu(block->disk_key.offset),
1749 (unsigned long long)
1750 le64_to_cpu(((struct btrfs_header *)
1751 mapped_data)->generation),
1752 (unsigned long long)
1753 state->max_superblock_generation);
1754 btrfsic_dump_tree(state);
1755 }
1756
1757 if (!block->is_iodone && !block->never_written) {
1758 printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
1759 " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
1760 " which is not yet iodone!\n",
1761 btrfsic_get_block_type(state, block),
1762 (unsigned long long)bytenr,
1763 dev_state->name,
1764 (unsigned long long)dev_bytenr,
1765 block->mirror_num,
1766 (unsigned long long)block->generation,
1767 (unsigned long long)
1768 le64_to_cpu(((struct btrfs_header *)
1769 mapped_data)->generation));
1770 /* it would not be safe to go on */
1771 btrfsic_dump_tree(state);
1772 return;
1773 }
1774
1775 /*
1776 * Clear all references of this block. Do not free
1777		 * the block itself even if it is not referenced anymore
1778		 * because it still carries valuable information
1779 * like whether it was ever written and IO completed.
1780 */
1781 list_for_each_safe(elem_ref_to, tmp_ref_to,
1782 &block->ref_to_list) {
1783 struct btrfsic_block_link *const l =
1784 list_entry(elem_ref_to,
1785 struct btrfsic_block_link,
1786 node_ref_to);
1787
1788 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1789 btrfsic_print_rem_link(state, l);
1790 l->ref_cnt--;
1791 if (0 == l->ref_cnt) {
1792 list_del(&l->node_ref_to);
1793 list_del(&l->node_ref_from);
1794 btrfsic_block_link_hashtable_remove(l);
1795 btrfsic_block_link_free(l);
1796 }
1797 }
1798
1799 if (block->is_superblock)
1800 ret = btrfsic_map_superblock(state, bytenr, len,
1801 bdev, &block_ctx);
1802 else
1803 ret = btrfsic_map_block(state, bytenr, len,
1804 &block_ctx, 0);
1805 if (ret) {
1806 printk(KERN_INFO
1807 "btrfsic: btrfsic_map_block(root @%llu)"
1808 " failed!\n", (unsigned long long)bytenr);
1809 return;
1810 }
1811 block_ctx.data = mapped_data;
1812		/* the following is required in case of writes to mirrors,
1813		 * use the same device state that was used for the lookup */
1814 block_ctx.dev = dev_state;
1815 block_ctx.dev_bytenr = dev_bytenr;
1816
1817 if (is_metadata || state->include_extent_data) {
1818 block->never_written = 0;
1819 block->iodone_w_error = 0;
1820 if (NULL != bio) {
1821 block->is_iodone = 0;
1822 BUG_ON(NULL == bio_is_patched);
1823 if (!*bio_is_patched) {
1824 block->orig_bio_bh_private =
1825 bio->bi_private;
1826 block->orig_bio_bh_end_io.bio =
1827 bio->bi_end_io;
1828 block->next_in_same_bio = NULL;
1829 bio->bi_private = block;
1830 bio->bi_end_io = btrfsic_bio_end_io;
1831 *bio_is_patched = 1;
1832 } else {
1833 struct btrfsic_block *chained_block =
1834 (struct btrfsic_block *)
1835 bio->bi_private;
1836
1837 BUG_ON(NULL == chained_block);
1838 block->orig_bio_bh_private =
1839 chained_block->orig_bio_bh_private;
1840					block->orig_bio_bh_end_io.bio =
1841						chained_block->orig_bio_bh_end_io.bio;
1843 block->next_in_same_bio = chained_block;
1844 bio->bi_private = block;
1845 }
1846 } else if (NULL != bh) {
1847 block->is_iodone = 0;
1848 block->orig_bio_bh_private = bh->b_private;
1849 block->orig_bio_bh_end_io.bh = bh->b_end_io;
1850 block->next_in_same_bio = NULL;
1851 bh->b_private = block;
1852 bh->b_end_io = btrfsic_bh_end_io;
1853 } else {
1854 block->is_iodone = 1;
1855 block->orig_bio_bh_private = NULL;
1856 block->orig_bio_bh_end_io.bio = NULL;
1857 block->next_in_same_bio = NULL;
1858 }
1859 }
1860
1861 block->flush_gen = dev_state->last_flush_gen + 1;
1862 block->submit_bio_bh_rw = submit_bio_bh_rw;
1863 if (is_metadata) {
1864 block->logical_bytenr = bytenr;
1865 block->is_metadata = 1;
1866 if (block->is_superblock) {
1867 ret = btrfsic_process_written_superblock(
1868 state,
1869 block,
1870 (struct btrfs_super_block *)
1871 mapped_data);
1872 if (state->print_mask &
1873 BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
1874 printk(KERN_INFO
1875 "[after new superblock is written]:\n");
1876 btrfsic_dump_tree_sub(state, block, 0);
1877 }
1878 } else {
1879 block->mirror_num = 0; /* unknown */
1880 ret = btrfsic_process_metablock(
1881 state,
1882 block,
1883 &block_ctx,
1884 (struct btrfs_header *)
1885 block_ctx.data,
1886 0, 0);
1887 }
1888 if (ret)
1889 printk(KERN_INFO
1890 "btrfsic: btrfsic_process_metablock"
1891 "(root @%llu) failed!\n",
1892 (unsigned long long)dev_bytenr);
1893 } else {
1894 block->is_metadata = 0;
1895 block->mirror_num = 0; /* unknown */
1896 block->generation = BTRFSIC_GENERATION_UNKNOWN;
1897 if (!state->include_extent_data
1898 && list_empty(&block->ref_from_list)) {
1899 /*
1900 * disk block is overwritten with extent
1901 * data (not meta data) and we are configured
1902 * to not include extent data: take the
1903 * chance and free the block's memory
1904 */
1905 btrfsic_block_hashtable_remove(block);
1906 list_del(&block->all_blocks_node);
1907 btrfsic_block_free(block);
1908 }
1909 }
1910 btrfsic_release_block_ctx(&block_ctx);
1911 } else {
1912 /* block has not been found in hash table */
1913 u64 bytenr;
1914
1915 if (!is_metadata) {
1916 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1917 printk(KERN_INFO "Written block (%s/%llu/?)"
1918 " !found in hash table, D.\n",
1919 dev_state->name,
1920 (unsigned long long)dev_bytenr);
1921 if (!state->include_extent_data)
1922 return; /* ignore that written D block */
1923
1924 /* this is getting ugly for the
1925 * include_extent_data case... */
1926 bytenr = 0; /* unknown */
1927 block_ctx.start = bytenr;
1928 block_ctx.len = len;
1929 block_ctx.bh = NULL;
1930 } else {
1931 bytenr = le64_to_cpu(((struct btrfs_header *)
1932 mapped_data)->bytenr);
1933 btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
1934 dev_bytenr,
1935 mapped_data);
1936 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
1937 printk(KERN_INFO
1938 "Written block @%llu (%s/%llu/?)"
1939 " !found in hash table, M.\n",
1940 (unsigned long long)bytenr,
1941 dev_state->name,
1942 (unsigned long long)dev_bytenr);
1943
1944 ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
1945 0);
1946 if (ret) {
1947 printk(KERN_INFO
1948 "btrfsic: btrfsic_map_block(root @%llu)"
1949 " failed!\n",
1950 (unsigned long long)dev_bytenr);
1951 return;
1952 }
1953 }
1954 block_ctx.data = mapped_data;
1955		/* the following is required in case of writes to mirrors,
1956		 * use the same device state that was used for the lookup */
1957 block_ctx.dev = dev_state;
1958 block_ctx.dev_bytenr = dev_bytenr;
1959
1960 block = btrfsic_block_alloc();
1961 if (NULL == block) {
1962 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
1963 btrfsic_release_block_ctx(&block_ctx);
1964 return;
1965 }
1966 block->dev_state = dev_state;
1967 block->dev_bytenr = dev_bytenr;
1968 block->logical_bytenr = bytenr;
1969 block->is_metadata = is_metadata;
1970 block->never_written = 0;
1971 block->iodone_w_error = 0;
1972 block->mirror_num = 0; /* unknown */
1973 block->flush_gen = dev_state->last_flush_gen + 1;
1974 block->submit_bio_bh_rw = submit_bio_bh_rw;
1975 if (NULL != bio) {
1976 block->is_iodone = 0;
1977 BUG_ON(NULL == bio_is_patched);
1978 if (!*bio_is_patched) {
1979 block->orig_bio_bh_private = bio->bi_private;
1980 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
1981 block->next_in_same_bio = NULL;
1982 bio->bi_private = block;
1983 bio->bi_end_io = btrfsic_bio_end_io;
1984 *bio_is_patched = 1;
1985 } else {
1986 struct btrfsic_block *chained_block =
1987 (struct btrfsic_block *)
1988 bio->bi_private;
1989
1990 BUG_ON(NULL == chained_block);
1991 block->orig_bio_bh_private =
1992 chained_block->orig_bio_bh_private;
1993 block->orig_bio_bh_end_io.bio =
1994 chained_block->orig_bio_bh_end_io.bio;
1995 block->next_in_same_bio = chained_block;
1996 bio->bi_private = block;
1997 }
1998 } else if (NULL != bh) {
1999 block->is_iodone = 0;
2000 block->orig_bio_bh_private = bh->b_private;
2001 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2002 block->next_in_same_bio = NULL;
2003 bh->b_private = block;
2004 bh->b_end_io = btrfsic_bh_end_io;
2005 } else {
2006 block->is_iodone = 1;
2007 block->orig_bio_bh_private = NULL;
2008 block->orig_bio_bh_end_io.bio = NULL;
2009 block->next_in_same_bio = NULL;
2010 }
2011 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2012 printk(KERN_INFO
2013 "New written %c-block @%llu (%s/%llu/%d)\n",
2014 is_metadata ? 'M' : 'D',
2015 (unsigned long long)block->logical_bytenr,
2016 block->dev_state->name,
2017 (unsigned long long)block->dev_bytenr,
2018 block->mirror_num);
2019 list_add(&block->all_blocks_node, &state->all_blocks_list);
2020 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2021
2022 if (is_metadata) {
2023 ret = btrfsic_process_metablock(state, block,
2024 &block_ctx,
2025 (struct btrfs_header *)
2026 block_ctx.data, 0, 0);
2027 if (ret)
2028 printk(KERN_INFO
2029 "btrfsic: process_metablock(root @%llu)"
2030 " failed!\n",
2031 (unsigned long long)dev_bytenr);
2032 }
2033 btrfsic_release_block_ctx(&block_ctx);
2034 }
2035}
2036
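/*
 * Note on the bio patching scheme: when several blocks share one bio,
 * each btrfsic_block saves the previous bi_private/bi_end_io pair and
 * the blocks are chained via ->next_in_same_bio, with bio->bi_private
 * always pointing at the most recently added block, roughly:
 *
 *	bio->bi_private -> block_n -> ... -> block_1 -> original private
 *
 * The handler below restores the original fields first and then walks
 * the whole chain, marking every block in it as iodone.
 */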
2037static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
2038{
2039 struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
2040 int iodone_w_error;
2041
2042	/* mutex is not held! This is not safe if IO is not yet completed
2043	 * on umount */
2044 iodone_w_error = 0;
2045 if (bio_error_status)
2046 iodone_w_error = 1;
2047
2048 BUG_ON(NULL == block);
2049 bp->bi_private = block->orig_bio_bh_private;
2050 bp->bi_end_io = block->orig_bio_bh_end_io.bio;
2051
2052 do {
2053 struct btrfsic_block *next_block;
2054 struct btrfsic_dev_state *const dev_state = block->dev_state;
2055
2056 if ((dev_state->state->print_mask &
2057 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2058 printk(KERN_INFO
2059 "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
2060 bio_error_status,
2061 btrfsic_get_block_type(dev_state->state, block),
2062 (unsigned long long)block->logical_bytenr,
2063 dev_state->name,
2064 (unsigned long long)block->dev_bytenr,
2065 block->mirror_num);
2066 next_block = block->next_in_same_bio;
2067 block->iodone_w_error = iodone_w_error;
2068 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2069 dev_state->last_flush_gen++;
2070 if ((dev_state->state->print_mask &
2071 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2072 printk(KERN_INFO
2073 "bio_end_io() new %s flush_gen=%llu\n",
2074 dev_state->name,
2075 (unsigned long long)
2076 dev_state->last_flush_gen);
2077 }
2078 if (block->submit_bio_bh_rw & REQ_FUA)
2079 block->flush_gen = 0; /* FUA completed means block is
2080 * on disk */
2081 block->is_iodone = 1; /* for FLUSH, this releases the block */
2082 block = next_block;
2083 } while (NULL != block);
2084
2085 bp->bi_end_io(bp, bio_error_status);
2086}
2087
2088static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
2089{
2090 struct btrfsic_block *block = (struct btrfsic_block *)bh->b_private;
2091 int iodone_w_error = !uptodate;
2092 struct btrfsic_dev_state *dev_state;
2093
2094 BUG_ON(NULL == block);
2095 dev_state = block->dev_state;
2096 if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2097 printk(KERN_INFO
2098 "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
2099 iodone_w_error,
2100 btrfsic_get_block_type(dev_state->state, block),
2101 (unsigned long long)block->logical_bytenr,
2102 block->dev_state->name,
2103 (unsigned long long)block->dev_bytenr,
2104 block->mirror_num);
2105
2106 block->iodone_w_error = iodone_w_error;
2107 if (block->submit_bio_bh_rw & REQ_FLUSH) {
2108 dev_state->last_flush_gen++;
2109 if ((dev_state->state->print_mask &
2110 BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
2111 printk(KERN_INFO
2112 "bh_end_io() new %s flush_gen=%llu\n",
2113 dev_state->name,
2114 (unsigned long long)dev_state->last_flush_gen);
2115 }
2116 if (block->submit_bio_bh_rw & REQ_FUA)
2117 block->flush_gen = 0; /* FUA completed means block is on disk */
2118
2119 bh->b_private = block->orig_bio_bh_private;
2120 bh->b_end_io = block->orig_bio_bh_end_io.bh;
2121 block->is_iodone = 1; /* for FLUSH, this releases the block */
2122 bh->b_end_io(bh, uptodate);
2123}
2124
2125static int btrfsic_process_written_superblock(
2126 struct btrfsic_state *state,
2127 struct btrfsic_block *const superblock,
2128 struct btrfs_super_block *const super_hdr)
2129{
2130 int pass;
2131
2132 superblock->generation = btrfs_super_generation(super_hdr);
2133 if (!(superblock->generation > state->max_superblock_generation ||
2134 0 == state->max_superblock_generation)) {
2135 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2136 printk(KERN_INFO
2137 "btrfsic: superblock @%llu (%s/%llu/%d)"
2138 " with old gen %llu <= %llu\n",
2139 (unsigned long long)superblock->logical_bytenr,
2140 superblock->dev_state->name,
2141 (unsigned long long)superblock->dev_bytenr,
2142 superblock->mirror_num,
2143 (unsigned long long)
2144 btrfs_super_generation(super_hdr),
2145 (unsigned long long)
2146 state->max_superblock_generation);
2147 } else {
2148 if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
2149 printk(KERN_INFO
2150 "btrfsic: got new superblock @%llu (%s/%llu/%d)"
2151 " with new gen %llu > %llu\n",
2152 (unsigned long long)superblock->logical_bytenr,
2153 superblock->dev_state->name,
2154 (unsigned long long)superblock->dev_bytenr,
2155 superblock->mirror_num,
2156 (unsigned long long)
2157 btrfs_super_generation(super_hdr),
2158 (unsigned long long)
2159 state->max_superblock_generation);
2160
2161 state->max_superblock_generation =
2162 btrfs_super_generation(super_hdr);
2163 state->latest_superblock = superblock;
2164 }
2165
2166 for (pass = 0; pass < 3; pass++) {
2167 int ret;
2168 u64 next_bytenr;
2169 struct btrfsic_block *next_block;
2170 struct btrfsic_block_data_ctx tmp_next_block_ctx;
2171 struct btrfsic_block_link *l;
2172 int num_copies;
2173 int mirror_num;
2174 const char *additional_string = NULL;
2175 struct btrfs_disk_key tmp_disk_key;
2176
2177 tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
2178 tmp_disk_key.offset = 0;
2179
2180 switch (pass) {
2181 case 0:
2182 tmp_disk_key.objectid =
2183 cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
2184 additional_string = "root ";
2185 next_bytenr = btrfs_super_root(super_hdr);
2186 if (state->print_mask &
2187 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2188 printk(KERN_INFO "root@%llu\n",
2189 (unsigned long long)next_bytenr);
2190 break;
2191 case 1:
2192 tmp_disk_key.objectid =
2193 cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
2194 additional_string = "chunk ";
2195 next_bytenr = btrfs_super_chunk_root(super_hdr);
2196 if (state->print_mask &
2197 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2198 printk(KERN_INFO "chunk@%llu\n",
2199 (unsigned long long)next_bytenr);
2200 break;
2201 case 2:
2202 tmp_disk_key.objectid =
2203 cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
2204 additional_string = "log ";
2205 next_bytenr = btrfs_super_log_root(super_hdr);
2206 if (0 == next_bytenr)
2207 continue;
2208 if (state->print_mask &
2209 BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
2210 printk(KERN_INFO "log@%llu\n",
2211 (unsigned long long)next_bytenr);
2212 break;
2213 }
2214
2215 num_copies =
2216 btrfs_num_copies(&state->root->fs_info->mapping_tree,
2217 next_bytenr, PAGE_SIZE);
2218 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
2219 printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
2220 (unsigned long long)next_bytenr, num_copies);
2221 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2222 int was_created;
2223
2224 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2225 printk(KERN_INFO
2226 "btrfsic_process_written_superblock("
2227 "mirror_num=%d)\n", mirror_num);
2228 ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
2229 &tmp_next_block_ctx,
2230 mirror_num);
2231 if (ret) {
2232 printk(KERN_INFO
2233 "btrfsic: btrfsic_map_block(@%llu,"
2234 " mirror=%d) failed!\n",
2235 (unsigned long long)next_bytenr,
2236 mirror_num);
2237 return -1;
2238 }
2239
2240 next_block = btrfsic_block_lookup_or_add(
2241 state,
2242 &tmp_next_block_ctx,
2243 additional_string,
2244 1, 0, 1,
2245 mirror_num,
2246 &was_created);
2247 if (NULL == next_block) {
2248 printk(KERN_INFO
2249 "btrfsic: error, kmalloc failed!\n");
2250 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2251 return -1;
2252 }
2253
2254 next_block->disk_key = tmp_disk_key;
2255 if (was_created)
2256 next_block->generation =
2257 BTRFSIC_GENERATION_UNKNOWN;
2258 l = btrfsic_block_link_lookup_or_add(
2259 state,
2260 &tmp_next_block_ctx,
2261 next_block,
2262 superblock,
2263 BTRFSIC_GENERATION_UNKNOWN);
2264 btrfsic_release_block_ctx(&tmp_next_block_ctx);
2265 if (NULL == l)
2266 return -1;
2267 }
2268 }
2269
2270 if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
2271 WARN_ON(1);
2272 btrfsic_dump_tree(state);
2273 }
2274
2275 return 0;
2276}
2277
2278static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
2279 struct btrfsic_block *const block,
2280 int recursion_level)
2281{
2282 struct list_head *elem_ref_to;
2283 int ret = 0;
2284
2285 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2286 /*
2287 * Note that this situation can happen and does not
2288 * indicate an error in regular cases. It happens
2289 * when disk blocks are freed and later reused.
2290 * The check-integrity module is not aware of any
2291 * block free operations, it just recognizes block
2292 * write operations. Therefore it keeps the linkage
2293 * information for a block until a block is
2294 * rewritten. This can temporarily cause incorrect
2295		 * and even circular linkage information. This
2296 * causes no harm unless such blocks are referenced
2297 * by the most recent super block.
2298 */
2299 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2300 printk(KERN_INFO
2301 "btrfsic: abort cyclic linkage (case 1).\n");
2302
2303 return ret;
2304 }
2305
2306 /*
2307 * This algorithm is recursive because the amount of used stack
2308 * space is very small and the max recursion depth is limited.
2309 */
2310 list_for_each(elem_ref_to, &block->ref_to_list) {
2311 const struct btrfsic_block_link *const l =
2312 list_entry(elem_ref_to, struct btrfsic_block_link,
2313 node_ref_to);
2314
2315 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2316 printk(KERN_INFO
2317 "rl=%d, %c @%llu (%s/%llu/%d)"
2318 " %u* refers to %c @%llu (%s/%llu/%d)\n",
2319 recursion_level,
2320 btrfsic_get_block_type(state, block),
2321 (unsigned long long)block->logical_bytenr,
2322 block->dev_state->name,
2323 (unsigned long long)block->dev_bytenr,
2324 block->mirror_num,
2325 l->ref_cnt,
2326 btrfsic_get_block_type(state, l->block_ref_to),
2327 (unsigned long long)
2328 l->block_ref_to->logical_bytenr,
2329 l->block_ref_to->dev_state->name,
2330 (unsigned long long)l->block_ref_to->dev_bytenr,
2331 l->block_ref_to->mirror_num);
2332 if (l->block_ref_to->never_written) {
2333 printk(KERN_INFO "btrfs: attempt to write superblock"
2334 " which references block %c @%llu (%s/%llu/%d)"
2335 " which is never written!\n",
2336 btrfsic_get_block_type(state, l->block_ref_to),
2337 (unsigned long long)
2338 l->block_ref_to->logical_bytenr,
2339 l->block_ref_to->dev_state->name,
2340 (unsigned long long)l->block_ref_to->dev_bytenr,
2341 l->block_ref_to->mirror_num);
2342 ret = -1;
2343 } else if (!l->block_ref_to->is_iodone) {
2344 printk(KERN_INFO "btrfs: attempt to write superblock"
2345 " which references block %c @%llu (%s/%llu/%d)"
2346 " which is not yet iodone!\n",
2347 btrfsic_get_block_type(state, l->block_ref_to),
2348 (unsigned long long)
2349 l->block_ref_to->logical_bytenr,
2350 l->block_ref_to->dev_state->name,
2351 (unsigned long long)l->block_ref_to->dev_bytenr,
2352 l->block_ref_to->mirror_num);
2353 ret = -1;
2354 } else if (l->parent_generation !=
2355 l->block_ref_to->generation &&
2356 BTRFSIC_GENERATION_UNKNOWN !=
2357 l->parent_generation &&
2358 BTRFSIC_GENERATION_UNKNOWN !=
2359 l->block_ref_to->generation) {
2360 printk(KERN_INFO "btrfs: attempt to write superblock"
2361 " which references block %c @%llu (%s/%llu/%d)"
2362 " with generation %llu !="
2363 " parent generation %llu!\n",
2364 btrfsic_get_block_type(state, l->block_ref_to),
2365 (unsigned long long)
2366 l->block_ref_to->logical_bytenr,
2367 l->block_ref_to->dev_state->name,
2368 (unsigned long long)l->block_ref_to->dev_bytenr,
2369 l->block_ref_to->mirror_num,
2370 (unsigned long long)l->block_ref_to->generation,
2371 (unsigned long long)l->parent_generation);
2372 ret = -1;
2373 } else if (l->block_ref_to->flush_gen >
2374 l->block_ref_to->dev_state->last_flush_gen) {
2375 printk(KERN_INFO "btrfs: attempt to write superblock"
2376 " which references block %c @%llu (%s/%llu/%d)"
2377 " which is not flushed out of disk's write cache"
2378 " (block flush_gen=%llu,"
2379 " dev->flush_gen=%llu)!\n",
2380 btrfsic_get_block_type(state, l->block_ref_to),
2381 (unsigned long long)
2382 l->block_ref_to->logical_bytenr,
2383 l->block_ref_to->dev_state->name,
2384 (unsigned long long)l->block_ref_to->dev_bytenr,
2385 l->block_ref_to->mirror_num,
2386			       (unsigned long long)l->block_ref_to->flush_gen,
2387 (unsigned long long)
2388 l->block_ref_to->dev_state->last_flush_gen);
2389 ret = -1;
2390 } else if (-1 == btrfsic_check_all_ref_blocks(state,
2391 l->block_ref_to,
2392 recursion_level +
2393 1)) {
2394 ret = -1;
2395 }
2396 }
2397
2398 return ret;
2399}
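
/*
 * In short: a superblock write is flagged (-1) when any block
 * reachable from it within 3 + BTRFS_MAX_LEVEL levels was never
 * written, still has IO in flight, carries a generation that
 * contradicts the parent pointer, or has not yet been flushed out of
 * the disk's write cache (its flush_gen is newer than the device's
 * last_flush_gen).
 */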
2400
2401static int btrfsic_is_block_ref_by_superblock(
2402 const struct btrfsic_state *state,
2403 const struct btrfsic_block *block,
2404 int recursion_level)
2405{
2406 struct list_head *elem_ref_from;
2407
2408 if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
2409 /* refer to comment at "abort cyclic linkage (case 1)" */
2410 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2411 printk(KERN_INFO
2412 "btrfsic: abort cyclic linkage (case 2).\n");
2413
2414 return 0;
2415 }
2416
2417 /*
2418 * This algorithm is recursive because the amount of used stack space
2419 * is very small and the max recursion depth is limited.
2420 */
2421 list_for_each(elem_ref_from, &block->ref_from_list) {
2422 const struct btrfsic_block_link *const l =
2423 list_entry(elem_ref_from, struct btrfsic_block_link,
2424 node_ref_from);
2425
2426 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2427 printk(KERN_INFO
2428 "rl=%d, %c @%llu (%s/%llu/%d)"
2429 " is ref %u* from %c @%llu (%s/%llu/%d)\n",
2430 recursion_level,
2431 btrfsic_get_block_type(state, block),
2432 (unsigned long long)block->logical_bytenr,
2433 block->dev_state->name,
2434 (unsigned long long)block->dev_bytenr,
2435 block->mirror_num,
2436 l->ref_cnt,
2437 btrfsic_get_block_type(state, l->block_ref_from),
2438 (unsigned long long)
2439 l->block_ref_from->logical_bytenr,
2440 l->block_ref_from->dev_state->name,
2441 (unsigned long long)
2442 l->block_ref_from->dev_bytenr,
2443 l->block_ref_from->mirror_num);
2444 if (l->block_ref_from->is_superblock &&
2445 state->latest_superblock->dev_bytenr ==
2446 l->block_ref_from->dev_bytenr &&
2447 state->latest_superblock->dev_state->bdev ==
2448 l->block_ref_from->dev_state->bdev)
2449 return 1;
2450 else if (btrfsic_is_block_ref_by_superblock(state,
2451 l->block_ref_from,
2452 recursion_level +
2453 1))
2454 return 1;
2455 }
2456
2457 return 0;
2458}
2459
2460static void btrfsic_print_add_link(const struct btrfsic_state *state,
2461 const struct btrfsic_block_link *l)
2462{
2463 printk(KERN_INFO
2464 "Add %u* link from %c @%llu (%s/%llu/%d)"
2465 " to %c @%llu (%s/%llu/%d).\n",
2466 l->ref_cnt,
2467 btrfsic_get_block_type(state, l->block_ref_from),
2468 (unsigned long long)l->block_ref_from->logical_bytenr,
2469 l->block_ref_from->dev_state->name,
2470 (unsigned long long)l->block_ref_from->dev_bytenr,
2471 l->block_ref_from->mirror_num,
2472 btrfsic_get_block_type(state, l->block_ref_to),
2473 (unsigned long long)l->block_ref_to->logical_bytenr,
2474 l->block_ref_to->dev_state->name,
2475 (unsigned long long)l->block_ref_to->dev_bytenr,
2476 l->block_ref_to->mirror_num);
2477}
2478
2479static void btrfsic_print_rem_link(const struct btrfsic_state *state,
2480 const struct btrfsic_block_link *l)
2481{
2482 printk(KERN_INFO
2483 "Rem %u* link from %c @%llu (%s/%llu/%d)"
2484 " to %c @%llu (%s/%llu/%d).\n",
2485 l->ref_cnt,
2486 btrfsic_get_block_type(state, l->block_ref_from),
2487 (unsigned long long)l->block_ref_from->logical_bytenr,
2488 l->block_ref_from->dev_state->name,
2489 (unsigned long long)l->block_ref_from->dev_bytenr,
2490 l->block_ref_from->mirror_num,
2491 btrfsic_get_block_type(state, l->block_ref_to),
2492 (unsigned long long)l->block_ref_to->logical_bytenr,
2493 l->block_ref_to->dev_state->name,
2494 (unsigned long long)l->block_ref_to->dev_bytenr,
2495 l->block_ref_to->mirror_num);
2496}
2497
2498static char btrfsic_get_block_type(const struct btrfsic_state *state,
2499 const struct btrfsic_block *block)
2500{
2501 if (block->is_superblock &&
2502 state->latest_superblock->dev_bytenr == block->dev_bytenr &&
2503 state->latest_superblock->dev_state->bdev == block->dev_state->bdev)
2504 return 'S';
2505 else if (block->is_superblock)
2506 return 's';
2507 else if (block->is_metadata)
2508 return 'M';
2509 else
2510 return 'D';
2511}
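
/*
 * Legend for the block type characters used in the messages all over
 * this file: 'S' most recent superblock, 's' other superblock copy,
 * 'M' metadata, 'D' data.
 */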
2512
2513static void btrfsic_dump_tree(const struct btrfsic_state *state)
2514{
2515 btrfsic_dump_tree_sub(state, state->latest_superblock, 0);
2516}
2517
2518static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
2519 const struct btrfsic_block *block,
2520 int indent_level)
2521{
2522 struct list_head *elem_ref_to;
2523 int indent_add;
2524 static char buf[80];
2525 int cursor_position;
2526
2527 /*
2528	 * It would be better to fill an on-stack buffer with a complete line
2529	 * and dump it at once when it is time to print a newline character.
2530 */
2531
2532 /*
2533 * This algorithm is recursive because the amount of used stack space
2534 * is very small and the max recursion depth is limited.
2535 */
2536	indent_add = snprintf(buf, sizeof(buf), "%c-%llu(%s/%llu/%d)",
2537 btrfsic_get_block_type(state, block),
2538 (unsigned long long)block->logical_bytenr,
2539 block->dev_state->name,
2540 (unsigned long long)block->dev_bytenr,
2541 block->mirror_num);
2542 if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2543 printk("[...]\n");
2544 return;
2545 }
2546	printk("%s", buf);
2547 indent_level += indent_add;
2548 if (list_empty(&block->ref_to_list)) {
2549 printk("\n");
2550 return;
2551 }
2552 if (block->mirror_num > 1 &&
2553 !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) {
2554 printk(" [...]\n");
2555 return;
2556 }
2557
2558 cursor_position = indent_level;
2559 list_for_each(elem_ref_to, &block->ref_to_list) {
2560 const struct btrfsic_block_link *const l =
2561 list_entry(elem_ref_to, struct btrfsic_block_link,
2562 node_ref_to);
2563
2564 while (cursor_position < indent_level) {
2565 printk(" ");
2566 cursor_position++;
2567 }
2568 if (l->ref_cnt > 1)
2569 indent_add = sprintf(buf, " %d*--> ", l->ref_cnt);
2570 else
2571 indent_add = sprintf(buf, " --> ");
2572 if (indent_level + indent_add >
2573 BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
2574 printk("[...]\n");
2575 cursor_position = 0;
2576 continue;
2577 }
2578
2579		printk("%s", buf);
2580
2581 btrfsic_dump_tree_sub(state, l->block_ref_to,
2582 indent_level + indent_add);
2583 cursor_position = 0;
2584 }
2585}
2586
2587static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add(
2588 struct btrfsic_state *state,
2589 struct btrfsic_block_data_ctx *next_block_ctx,
2590 struct btrfsic_block *next_block,
2591 struct btrfsic_block *from_block,
2592 u64 parent_generation)
2593{
2594 struct btrfsic_block_link *l;
2595
2596 l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev,
2597 next_block_ctx->dev_bytenr,
2598 from_block->dev_state->bdev,
2599 from_block->dev_bytenr,
2600 &state->block_link_hashtable);
2601 if (NULL == l) {
2602 l = btrfsic_block_link_alloc();
2603 if (NULL == l) {
2604			printk(KERN_INFO
2605			       "btrfsic: error, kmalloc failed!\n");
2606 return NULL;
2607 }
2608
2609 l->block_ref_to = next_block;
2610 l->block_ref_from = from_block;
2611 l->ref_cnt = 1;
2612 l->parent_generation = parent_generation;
2613
2614 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2615 btrfsic_print_add_link(state, l);
2616
2617 list_add(&l->node_ref_to, &from_block->ref_to_list);
2618 list_add(&l->node_ref_from, &next_block->ref_from_list);
2619
2620 btrfsic_block_link_hashtable_add(l,
2621 &state->block_link_hashtable);
2622 } else {
2623 l->ref_cnt++;
2624 l->parent_generation = parent_generation;
2625 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2626 btrfsic_print_add_link(state, l);
2627 }
2628
2629 return l;
2630}
2631
2632static struct btrfsic_block *btrfsic_block_lookup_or_add(
2633 struct btrfsic_state *state,
2634 struct btrfsic_block_data_ctx *block_ctx,
2635 const char *additional_string,
2636 int is_metadata,
2637 int is_iodone,
2638 int never_written,
2639 int mirror_num,
2640 int *was_created)
2641{
2642 struct btrfsic_block *block;
2643
2644 block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev,
2645 block_ctx->dev_bytenr,
2646 &state->block_hashtable);
2647 if (NULL == block) {
2648 struct btrfsic_dev_state *dev_state;
2649
2650 block = btrfsic_block_alloc();
2651 if (NULL == block) {
2652 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
2653 return NULL;
2654 }
2655 dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev);
2656 if (NULL == dev_state) {
2657 printk(KERN_INFO
2658 "btrfsic: error, lookup dev_state failed!\n");
2659 btrfsic_block_free(block);
2660 return NULL;
2661 }
2662 block->dev_state = dev_state;
2663 block->dev_bytenr = block_ctx->dev_bytenr;
2664 block->logical_bytenr = block_ctx->start;
2665 block->is_metadata = is_metadata;
2666 block->is_iodone = is_iodone;
2667 block->never_written = never_written;
2668 block->mirror_num = mirror_num;
2669 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
2670 printk(KERN_INFO
2671 "New %s%c-block @%llu (%s/%llu/%d)\n",
2672 additional_string,
2673 btrfsic_get_block_type(state, block),
2674 (unsigned long long)block->logical_bytenr,
2675 dev_state->name,
2676 (unsigned long long)block->dev_bytenr,
2677 mirror_num);
2678 list_add(&block->all_blocks_node, &state->all_blocks_list);
2679 btrfsic_block_hashtable_add(block, &state->block_hashtable);
2680 if (NULL != was_created)
2681 *was_created = 1;
2682 } else {
2683 if (NULL != was_created)
2684 *was_created = 0;
2685 }
2686
2687 return block;
2688}
2689
2690static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
2691 u64 bytenr,
2692 struct btrfsic_dev_state *dev_state,
2693 u64 dev_bytenr, char *data)
2694{
2695 int num_copies;
2696 int mirror_num;
2697 int ret;
2698 struct btrfsic_block_data_ctx block_ctx;
2699 int match = 0;
2700
2701 num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
2702 bytenr, PAGE_SIZE);
2703
2704 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2705 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2706 &block_ctx, mirror_num);
2707 if (ret) {
2708 printk(KERN_INFO "btrfsic:"
2709 " btrfsic_map_block(logical @%llu,"
2710 " mirror %d) failed!\n",
2711 (unsigned long long)bytenr, mirror_num);
2712 continue;
2713 }
2714
2715 if (dev_state->bdev == block_ctx.dev->bdev &&
2716 dev_bytenr == block_ctx.dev_bytenr) {
2717 match++;
2718 btrfsic_release_block_ctx(&block_ctx);
2719 break;
2720 }
2721 btrfsic_release_block_ctx(&block_ctx);
2722 }
2723
2724 if (!match) {
2725		printk(KERN_INFO "btrfs: attempt to write M-block which contains"
		       " logical bytenr that doesn't map to dev+physical bytenr"
		       " of submit_bio,"
2726 " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
2727 " phys_bytenr=%llu)!\n",
2728 (unsigned long long)bytenr, dev_state->name,
2729 (unsigned long long)dev_bytenr);
2730 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
2731 ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
2732 &block_ctx, mirror_num);
2733 if (ret)
2734 continue;
2735
2736 printk(KERN_INFO "Read logical bytenr @%llu maps to"
2737 " (%s/%llu/%d)\n",
2738 (unsigned long long)bytenr,
2739 block_ctx.dev->name,
2740 (unsigned long long)block_ctx.dev_bytenr,
2741 mirror_num);
2742 }
2743 WARN_ON(1);
2744 }
2745}
2746
2747static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
2748 struct block_device *bdev)
2749{
2750 struct btrfsic_dev_state *ds;
2751
2752 ds = btrfsic_dev_state_hashtable_lookup(bdev,
2753 &btrfsic_dev_state_hashtable);
2754 return ds;
2755}
2756
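/*
 * btrfsic_submit_bh() and btrfsic_submit_bio() below are drop-in
 * wrappers around submit_bh()/submit_bio(): they fall straight through
 * when the integrity checker is not initialized, otherwise they
 * inspect WRITE and FLUSH requests before forwarding the request to
 * the real submit function.
 */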
2757int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2758{
2759 struct btrfsic_dev_state *dev_state;
2760
2761 if (!btrfsic_is_initialized)
2762 return submit_bh(rw, bh);
2763
2764 mutex_lock(&btrfsic_mutex);
2765 /* since btrfsic_submit_bh() might also be called before
2766 * btrfsic_mount(), this might return NULL */
2767 dev_state = btrfsic_dev_state_lookup(bh->b_bdev);
2768
2769 /* Only called to write the superblock (incl. FLUSH/FUA) */
2770 if (NULL != dev_state &&
2771 (rw & WRITE) && bh->b_size > 0) {
2772 u64 dev_bytenr;
2773
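		/* the superblock is read and written through 4 KiB
		 * buffer heads, so b_blocknr is in units of 4096 bytes */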
2774 dev_bytenr = 4096 * bh->b_blocknr;
2775 if (dev_state->state->print_mask &
2776 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2777 printk(KERN_INFO
2778 "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
2779 " size=%lu, data=%p, bdev=%p)\n",
2780 rw, (unsigned long)bh->b_blocknr,
2781 (unsigned long long)dev_bytenr,
2782 (unsigned long)bh->b_size, bh->b_data,
2783 bh->b_bdev);
2784 btrfsic_process_written_block(dev_state, dev_bytenr,
2785 bh->b_data, bh->b_size, NULL,
2786 NULL, bh, rw);
2787 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2788 if (dev_state->state->print_mask &
2789 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2790 printk(KERN_INFO
2791			       "submit_bh(rw=0x%x) FLUSH, bdev=%p\n",
2792 rw, bh->b_bdev);
2793 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2794 if ((dev_state->state->print_mask &
2795 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2796 BTRFSIC_PRINT_MASK_VERBOSE)))
2797 printk(KERN_INFO
2798 "btrfsic_submit_bh(%s) with FLUSH"
2799 " but dummy block already in use"
2800 " (ignored)!\n",
2801 dev_state->name);
2802 } else {
2803 struct btrfsic_block *const block =
2804 &dev_state->dummy_block_for_bio_bh_flush;
2805
2806 block->is_iodone = 0;
2807 block->never_written = 0;
2808 block->iodone_w_error = 0;
2809 block->flush_gen = dev_state->last_flush_gen + 1;
2810 block->submit_bio_bh_rw = rw;
2811 block->orig_bio_bh_private = bh->b_private;
2812 block->orig_bio_bh_end_io.bh = bh->b_end_io;
2813 block->next_in_same_bio = NULL;
2814 bh->b_private = block;
2815 bh->b_end_io = btrfsic_bh_end_io;
2816 }
2817 }
2818 mutex_unlock(&btrfsic_mutex);
2819 return submit_bh(rw, bh);
2820}
2821
2822void btrfsic_submit_bio(int rw, struct bio *bio)
2823{
2824 struct btrfsic_dev_state *dev_state;
2825
2826 if (!btrfsic_is_initialized) {
2827 submit_bio(rw, bio);
2828 return;
2829 }
2830
2831 mutex_lock(&btrfsic_mutex);
2832 /* since btrfsic_submit_bio() is also called before
2833 * btrfsic_mount(), this might return NULL */
2834 dev_state = btrfsic_dev_state_lookup(bio->bi_bdev);
2835 if (NULL != dev_state &&
2836 (rw & WRITE) && NULL != bio->bi_io_vec) {
2837 unsigned int i;
2838 u64 dev_bytenr;
2839 int bio_is_patched;
2840
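		/* bi_sector is in units of 512 byte sectors,
		 * independent of the device block size */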
2841 dev_bytenr = 512 * bio->bi_sector;
2842 bio_is_patched = 0;
2843 if (dev_state->state->print_mask &
2844 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2845 printk(KERN_INFO
2846 "submit_bio(rw=0x%x, bi_vcnt=%u,"
2847 " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
2848 rw, bio->bi_vcnt, (unsigned long)bio->bi_sector,
2849 (unsigned long long)dev_bytenr,
2850 bio->bi_bdev);
2851
2852 for (i = 0; i < bio->bi_vcnt; i++) {
2853 u8 *mapped_data;
2854
2855 mapped_data = kmap(bio->bi_io_vec[i].bv_page);
2856 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2857 BTRFSIC_PRINT_MASK_VERBOSE) ==
2858 (dev_state->state->print_mask &
2859 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2860 BTRFSIC_PRINT_MASK_VERBOSE)))
2861 printk(KERN_INFO
2862 "#%u: page=%p, mapped=%p, len=%u,"
2863 " offset=%u\n",
2864 i, bio->bi_io_vec[i].bv_page,
2865 mapped_data,
2866 bio->bi_io_vec[i].bv_len,
2867 bio->bi_io_vec[i].bv_offset);
2868 btrfsic_process_written_block(dev_state, dev_bytenr,
2869 mapped_data,
2870 bio->bi_io_vec[i].bv_len,
2871 bio, &bio_is_patched,
2872 NULL, rw);
2873 kunmap(bio->bi_io_vec[i].bv_page);
2874 dev_bytenr += bio->bi_io_vec[i].bv_len;
2875 }
2876 } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
2877 if (dev_state->state->print_mask &
2878 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
2879 printk(KERN_INFO
2880			       "submit_bio(rw=0x%x) FLUSH, bdev=%p\n",
2881 rw, bio->bi_bdev);
2882 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
2883 if ((dev_state->state->print_mask &
2884 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
2885 BTRFSIC_PRINT_MASK_VERBOSE)))
2886 printk(KERN_INFO
2887 "btrfsic_submit_bio(%s) with FLUSH"
2888 " but dummy block already in use"
2889 " (ignored)!\n",
2890 dev_state->name);
2891 } else {
2892 struct btrfsic_block *const block =
2893 &dev_state->dummy_block_for_bio_bh_flush;
2894
2895 block->is_iodone = 0;
2896 block->never_written = 0;
2897 block->iodone_w_error = 0;
2898 block->flush_gen = dev_state->last_flush_gen + 1;
2899 block->submit_bio_bh_rw = rw;
2900 block->orig_bio_bh_private = bio->bi_private;
2901 block->orig_bio_bh_end_io.bio = bio->bi_end_io;
2902 block->next_in_same_bio = NULL;
2903 bio->bi_private = block;
2904 bio->bi_end_io = btrfsic_bio_end_io;
2905 }
2906 }
2907 mutex_unlock(&btrfsic_mutex);
2908
2909 submit_bio(rw, bio);
2910}
2911
2912int btrfsic_mount(struct btrfs_root *root,
2913 struct btrfs_fs_devices *fs_devices,
2914 int including_extent_data, u32 print_mask)
2915{
2916 int ret;
2917 struct btrfsic_state *state;
2918 struct list_head *dev_head = &fs_devices->devices;
2919 struct btrfs_device *device;
2920
2921 state = kzalloc(sizeof(*state), GFP_NOFS);
2922 if (NULL == state) {
2923 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
2924 return -1;
2925 }
2926
2927 if (!btrfsic_is_initialized) {
2928 mutex_init(&btrfsic_mutex);
2929 btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable);
2930 btrfsic_is_initialized = 1;
2931 }
2932 mutex_lock(&btrfsic_mutex);
2933 state->root = root;
2934 state->print_mask = print_mask;
2935 state->include_extent_data = including_extent_data;
2936 state->csum_size = 0;
2937 INIT_LIST_HEAD(&state->all_blocks_list);
2938 btrfsic_block_hashtable_init(&state->block_hashtable);
2939 btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
2940 state->max_superblock_generation = 0;
2941 state->latest_superblock = NULL;
2942
2943 list_for_each_entry(device, dev_head, dev_list) {
2944 struct btrfsic_dev_state *ds;
2945 char *p;
2946
2947 if (!device->bdev || !device->name)
2948 continue;
2949
2950 ds = btrfsic_dev_state_alloc();
2951 if (NULL == ds) {
2952 printk(KERN_INFO
2953 "btrfs check-integrity: kmalloc() failed!\n");
2954 mutex_unlock(&btrfsic_mutex);
2955 return -1;
2956 }
2957 ds->bdev = device->bdev;
2958 ds->state = state;
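		/* derive a short printable name: keep only the part of
		 * the bdev name after the last '/', if there is one */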
2959 bdevname(ds->bdev, ds->name);
2960 ds->name[BDEVNAME_SIZE - 1] = '\0';
2961 for (p = ds->name; *p != '\0'; p++);
2962 while (p > ds->name && *p != '/')
2963 p--;
2964 if (*p == '/')
2965 p++;
2966 strlcpy(ds->name, p, sizeof(ds->name));
2967 btrfsic_dev_state_hashtable_add(ds,
2968 &btrfsic_dev_state_hashtable);
2969 }
2970
2971 ret = btrfsic_process_superblock(state, fs_devices);
2972 if (0 != ret) {
2973 mutex_unlock(&btrfsic_mutex);
2974 btrfsic_unmount(root, fs_devices);
2975 return ret;
2976 }
2977
2978 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE)
2979 btrfsic_dump_database(state);
2980 if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE)
2981 btrfsic_dump_tree(state);
2982
2983 mutex_unlock(&btrfsic_mutex);
2984 return 0;
2985}
2986
2987void btrfsic_unmount(struct btrfs_root *root,
2988 struct btrfs_fs_devices *fs_devices)
2989{
2990 struct list_head *elem_all;
2991 struct list_head *tmp_all;
2992 struct btrfsic_state *state;
2993 struct list_head *dev_head = &fs_devices->devices;
2994 struct btrfs_device *device;
2995
2996 if (!btrfsic_is_initialized)
2997 return;
2998
2999 mutex_lock(&btrfsic_mutex);
3000
3001 state = NULL;
3002 list_for_each_entry(device, dev_head, dev_list) {
3003 struct btrfsic_dev_state *ds;
3004
3005 if (!device->bdev || !device->name)
3006 continue;
3007
3008 ds = btrfsic_dev_state_hashtable_lookup(
3009 device->bdev,
3010 &btrfsic_dev_state_hashtable);
3011 if (NULL != ds) {
3012 state = ds->state;
3013 btrfsic_dev_state_hashtable_remove(ds);
3014 btrfsic_dev_state_free(ds);
3015 }
3016 }
3017
3018 if (NULL == state) {
3019 printk(KERN_INFO
3020 "btrfsic: error, cannot find state information"
3021 " on umount!\n");
3022 mutex_unlock(&btrfsic_mutex);
3023 return;
3024 }
3025
3026 /*
3027 * Don't care about keeping the lists' state up to date,
3028 * just free all memory that was allocated dynamically.
3029 * Free the blocks and the block_links.
3030 */
3031 list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
3032 struct btrfsic_block *const b_all =
3033 list_entry(elem_all, struct btrfsic_block,
3034 all_blocks_node);
3035 struct list_head *elem_ref_to;
3036 struct list_head *tmp_ref_to;
3037
3038 list_for_each_safe(elem_ref_to, tmp_ref_to,
3039 &b_all->ref_to_list) {
3040 struct btrfsic_block_link *const l =
3041 list_entry(elem_ref_to,
3042 struct btrfsic_block_link,
3043 node_ref_to);
3044
3045 if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
3046 btrfsic_print_rem_link(state, l);
3047
3048 l->ref_cnt--;
3049 if (0 == l->ref_cnt)
3050 btrfsic_block_link_free(l);
3051 }
3052
3053 if (b_all->is_iodone)
3054 btrfsic_block_free(b_all);
3055 else
3056 printk(KERN_INFO "btrfs: attempt to free %c-block"
3057 " @%llu (%s/%llu/%d) on umount which is"
3058 " not yet iodone!\n",
3059 btrfsic_get_block_type(state, b_all),
3060 (unsigned long long)b_all->logical_bytenr,
3061 b_all->dev_state->name,
3062 (unsigned long long)b_all->dev_bytenr,
3063 b_all->mirror_num);
3064 }
3065
3066 mutex_unlock(&btrfsic_mutex);
3067
3068 kfree(state);
3069}
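For context, a hedged sketch of how the mount path is expected to pair these two entry points; the open_ctree()/close_ctree() plumbing and error handling are assumed rather than shown in this hunk, and build on the CHECK_INTEGRITY mount options plus the check_integrity_print_mask field added to btrfs_fs_info elsewhere in this series:

	/* mount side (sketch): enable the checker once devices are open */
	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
		ret = btrfsic_mount(tree_root, fs_devices,
				    btrfs_test_opt(tree_root,
					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
				    1 : 0,
				    fs_info->check_integrity_print_mask);
		if (ret)
			printk(KERN_WARNING
			       "btrfs: failed to initialize integrity check module: %d\n",
			       ret);
	}

	/* unmount side (sketch): tear the state down again */
	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY))
		btrfsic_unmount(tree_root, fs_devices);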
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
new file mode 100644
index 000000000000..8b59175cc502
--- /dev/null
+++ b/fs/btrfs/check-integrity.h
@@ -0,0 +1,36 @@
1/*
2 * Copyright (C) STRATO AG 2011. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#if !defined(__BTRFS_CHECK_INTEGRITY__)
20#define __BTRFS_CHECK_INTEGRITY__
21
22#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
23int btrfsic_submit_bh(int rw, struct buffer_head *bh);
24void btrfsic_submit_bio(int rw, struct bio *bio);
25#else
26#define btrfsic_submit_bh submit_bh
27#define btrfsic_submit_bio submit_bio
28#endif
29
30int btrfsic_mount(struct btrfs_root *root,
31 struct btrfs_fs_devices *fs_devices,
32 int including_extent_data, u32 print_mask);
33void btrfsic_unmount(struct btrfs_root *root,
34 struct btrfs_fs_devices *fs_devices);
35
36#endif
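When CONFIG_BTRFS_FS_CHECK_INTEGRITY is off, the two #defines above turn the wrappers into the stock block-layer calls, so call sites need no #ifdefs of their own. A minimal sketch of such a call site (write_dev_bh() is a hypothetical name used only for illustration):

/* sketch: goes through the integrity checker when it is compiled in,
 * and degrades to a plain submit_bh() otherwise */
static int write_dev_bh(struct buffer_head *bh)
{
	lock_buffer(bh);
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	return btrfsic_submit_bh(WRITE_FLUSH_FUA, bh);
}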
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dede441bdeee..0639a555e16e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -240,7 +240,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
240 240
241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 241 cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
242 new_root_objectid, &disk_key, level, 242 new_root_objectid, &disk_key, level,
243 buf->start, 0); 243 buf->start, 0, 1);
244 if (IS_ERR(cow)) 244 if (IS_ERR(cow))
245 return PTR_ERR(cow); 245 return PTR_ERR(cow);
246 246
@@ -261,9 +261,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
261 261
262 WARN_ON(btrfs_header_generation(buf) > trans->transid); 262 WARN_ON(btrfs_header_generation(buf) > trans->transid);
263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 263 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
264 ret = btrfs_inc_ref(trans, root, cow, 1); 264 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
265 else 265 else
266 ret = btrfs_inc_ref(trans, root, cow, 0); 266 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
267 267
268 if (ret) 268 if (ret)
269 return ret; 269 return ret;
@@ -350,14 +350,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
350 if ((owner == root->root_key.objectid || 350 if ((owner == root->root_key.objectid ||
351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 351 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 352 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
353 ret = btrfs_inc_ref(trans, root, buf, 1); 353 ret = btrfs_inc_ref(trans, root, buf, 1, 1);
354 BUG_ON(ret); 354 BUG_ON(ret);
355 355
356 if (root->root_key.objectid == 356 if (root->root_key.objectid ==
357 BTRFS_TREE_RELOC_OBJECTID) { 357 BTRFS_TREE_RELOC_OBJECTID) {
358 ret = btrfs_dec_ref(trans, root, buf, 0); 358 ret = btrfs_dec_ref(trans, root, buf, 0, 1);
359 BUG_ON(ret); 359 BUG_ON(ret);
360 ret = btrfs_inc_ref(trans, root, cow, 1); 360 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
361 BUG_ON(ret); 361 BUG_ON(ret);
362 } 362 }
363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 363 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -365,9 +365,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 365
366 if (root->root_key.objectid == 366 if (root->root_key.objectid ==
367 BTRFS_TREE_RELOC_OBJECTID) 367 BTRFS_TREE_RELOC_OBJECTID)
368 ret = btrfs_inc_ref(trans, root, cow, 1); 368 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
369 else 369 else
370 ret = btrfs_inc_ref(trans, root, cow, 0); 370 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
371 BUG_ON(ret); 371 BUG_ON(ret);
372 } 372 }
373 if (new_flags != 0) { 373 if (new_flags != 0) {
@@ -381,11 +381,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 381 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
382 if (root->root_key.objectid == 382 if (root->root_key.objectid ==
383 BTRFS_TREE_RELOC_OBJECTID) 383 BTRFS_TREE_RELOC_OBJECTID)
384 ret = btrfs_inc_ref(trans, root, cow, 1); 384 ret = btrfs_inc_ref(trans, root, cow, 1, 1);
385 else 385 else
386 ret = btrfs_inc_ref(trans, root, cow, 0); 386 ret = btrfs_inc_ref(trans, root, cow, 0, 1);
387 BUG_ON(ret); 387 BUG_ON(ret);
388 ret = btrfs_dec_ref(trans, root, buf, 1); 388 ret = btrfs_dec_ref(trans, root, buf, 1, 1);
389 BUG_ON(ret); 389 BUG_ON(ret);
390 } 390 }
391 clean_tree_block(trans, root, buf); 391 clean_tree_block(trans, root, buf);
@@ -446,7 +446,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
446 446
447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 447 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
448 root->root_key.objectid, &disk_key, 448 root->root_key.objectid, &disk_key,
449 level, search_start, empty_size); 449 level, search_start, empty_size, 1);
450 if (IS_ERR(cow)) 450 if (IS_ERR(cow))
451 return PTR_ERR(cow); 451 return PTR_ERR(cow);
452 452
@@ -484,7 +484,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
484 rcu_assign_pointer(root->node, cow); 484 rcu_assign_pointer(root->node, cow);
485 485
486 btrfs_free_tree_block(trans, root, buf, parent_start, 486 btrfs_free_tree_block(trans, root, buf, parent_start,
487 last_ref); 487 last_ref, 1);
488 free_extent_buffer(buf); 488 free_extent_buffer(buf);
489 add_root_to_dirty_list(root); 489 add_root_to_dirty_list(root);
490 } else { 490 } else {
@@ -500,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
500 trans->transid); 500 trans->transid);
501 btrfs_mark_buffer_dirty(parent); 501 btrfs_mark_buffer_dirty(parent);
502 btrfs_free_tree_block(trans, root, buf, parent_start, 502 btrfs_free_tree_block(trans, root, buf, parent_start,
503 last_ref); 503 last_ref, 1);
504 } 504 }
505 if (unlock_orig) 505 if (unlock_orig)
506 btrfs_tree_unlock(buf); 506 btrfs_tree_unlock(buf);
@@ -957,7 +957,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
957 free_extent_buffer(mid); 957 free_extent_buffer(mid);
958 958
959 root_sub_used(root, mid->len); 959 root_sub_used(root, mid->len);
960 btrfs_free_tree_block(trans, root, mid, 0, 1); 960 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
961 /* once for the root ptr */ 961 /* once for the root ptr */
962 free_extent_buffer(mid); 962 free_extent_buffer(mid);
963 return 0; 963 return 0;
@@ -1015,7 +1015,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1015 if (wret) 1015 if (wret)
1016 ret = wret; 1016 ret = wret;
1017 root_sub_used(root, right->len); 1017 root_sub_used(root, right->len);
1018 btrfs_free_tree_block(trans, root, right, 0, 1); 1018 btrfs_free_tree_block(trans, root, right, 0, 1, 0);
1019 free_extent_buffer(right); 1019 free_extent_buffer(right);
1020 right = NULL; 1020 right = NULL;
1021 } else { 1021 } else {
@@ -1055,7 +1055,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1055 if (wret) 1055 if (wret)
1056 ret = wret; 1056 ret = wret;
1057 root_sub_used(root, mid->len); 1057 root_sub_used(root, mid->len);
1058 btrfs_free_tree_block(trans, root, mid, 0, 1); 1058 btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
1059 free_extent_buffer(mid); 1059 free_extent_buffer(mid);
1060 mid = NULL; 1060 mid = NULL;
1061 } else { 1061 } else {
@@ -2089,7 +2089,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2089 2089
2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2090 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2091 root->root_key.objectid, &lower_key, 2091 root->root_key.objectid, &lower_key,
2092 level, root->node->start, 0); 2092 level, root->node->start, 0, 0);
2093 if (IS_ERR(c)) 2093 if (IS_ERR(c))
2094 return PTR_ERR(c); 2094 return PTR_ERR(c);
2095 2095
@@ -2216,7 +2216,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2216 2216
2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 2217 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
2218 root->root_key.objectid, 2218 root->root_key.objectid,
2219 &disk_key, level, c->start, 0); 2219 &disk_key, level, c->start, 0, 0);
2220 if (IS_ERR(split)) 2220 if (IS_ERR(split))
2221 return PTR_ERR(split); 2221 return PTR_ERR(split);
2222 2222
@@ -2970,7 +2970,7 @@ again:
2970 2970
2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2971 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2972 root->root_key.objectid, 2972 root->root_key.objectid,
2973 &disk_key, 0, l->start, 0); 2973 &disk_key, 0, l->start, 0, 0);
2974 if (IS_ERR(right)) 2974 if (IS_ERR(right))
2975 return PTR_ERR(right); 2975 return PTR_ERR(right);
2976 2976
@@ -3781,7 +3781,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3781 3781
3782 root_sub_used(root, leaf->len); 3782 root_sub_used(root, leaf->len);
3783 3783
3784 btrfs_free_tree_block(trans, root, leaf, 0, 1); 3784 btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
3785 return 0; 3785 return 0;
3786} 3786}
3787/* 3787/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 67385033323d..27ebe61d3ccc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -86,6 +86,9 @@ struct btrfs_ordered_sum;
86/* holds checksums of all the data extents */ 86/* holds checksums of all the data extents */
87#define BTRFS_CSUM_TREE_OBJECTID 7ULL 87#define BTRFS_CSUM_TREE_OBJECTID 7ULL
88 88
89/* for storing balance parameters in the root tree */
90#define BTRFS_BALANCE_OBJECTID -4ULL
91
 89/* orphan objectid for tracking unlinked/truncated files */ 92/* orphan objectid for tracking unlinked/truncated files */
90#define BTRFS_ORPHAN_OBJECTID -5ULL 93#define BTRFS_ORPHAN_OBJECTID -5ULL
91 94
@@ -692,6 +695,54 @@ struct btrfs_root_ref {
692 __le16 name_len; 695 __le16 name_len;
693} __attribute__ ((__packed__)); 696} __attribute__ ((__packed__));
694 697
698struct btrfs_disk_balance_args {
699 /*
700 * profiles to operate on, single is denoted by
701 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
702 */
703 __le64 profiles;
704
705 /* usage filter */
706 __le64 usage;
707
708 /* devid filter */
709 __le64 devid;
710
711 /* devid subset filter [pstart..pend) */
712 __le64 pstart;
713 __le64 pend;
714
715 /* btrfs virtual address space subset filter [vstart..vend) */
716 __le64 vstart;
717 __le64 vend;
718
719 /*
720 * profile to convert to, single is denoted by
721 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
722 */
723 __le64 target;
724
725 /* BTRFS_BALANCE_ARGS_* */
726 __le64 flags;
727
728 __le64 unused[8];
729} __attribute__ ((__packed__));
730
731/*
732 * store balance parameters to disk so that balance can be properly
733 * resumed after crash or unmount
734 */
735struct btrfs_balance_item {
736 /* BTRFS_BALANCE_* */
737 __le64 flags;
738
739 struct btrfs_disk_balance_args data;
740 struct btrfs_disk_balance_args meta;
741 struct btrfs_disk_balance_args sys;
742
743 __le64 unused[4];
744} __attribute__ ((__packed__));
745
695#define BTRFS_FILE_EXTENT_INLINE 0 746#define BTRFS_FILE_EXTENT_INLINE 0
696#define BTRFS_FILE_EXTENT_REG 1 747#define BTRFS_FILE_EXTENT_REG 1
697#define BTRFS_FILE_EXTENT_PREALLOC 2 748#define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -751,14 +802,32 @@ struct btrfs_csum_item {
751} __attribute__ ((__packed__)); 802} __attribute__ ((__packed__));
752 803
753/* different types of block groups (and chunks) */ 804/* different types of block groups (and chunks) */
754#define BTRFS_BLOCK_GROUP_DATA (1 << 0) 805#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
755#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) 806#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
756#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) 807#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
757#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) 808#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
758#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 809#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
759#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 810#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
760#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 811#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
761#define BTRFS_NR_RAID_TYPES 5 812#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
813#define BTRFS_NR_RAID_TYPES 5
814
815#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
816 BTRFS_BLOCK_GROUP_SYSTEM | \
817 BTRFS_BLOCK_GROUP_METADATA)
818
819#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
820 BTRFS_BLOCK_GROUP_RAID1 | \
821 BTRFS_BLOCK_GROUP_DUP | \
822 BTRFS_BLOCK_GROUP_RAID10)
823/*
824 * We need a bit for restriper to be able to tell when chunks of type
825 * SINGLE are available. This "extended" profile format is used in
826 * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
827 * (on-disk). The corresponding on-disk bit in chunk.type is reserved
828 * to avoid remappings between two formats in future.
829 */
830#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
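To illustrate the split, a short sketch using only the defines above: the reduced profile bits live in the low word, while the extended SINGLE marker sits at bit 48, so masking with BTRFS_BLOCK_GROUP_RESERVED cleanly separates the in-memory format from what may reach disk:

	/* in-memory (extended): RAID1 chunks present, single chunks too */
	u64 avail = BTRFS_BLOCK_GROUP_RAID1 | BTRFS_AVAIL_ALLOC_BIT_SINGLE;

	/* on-disk chunk.type must not carry the reserved SINGLE bit */
	u64 on_disk = avail & ~BTRFS_BLOCK_GROUP_RESERVED;	/* == RAID1 */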
762 831
763struct btrfs_block_group_item { 832struct btrfs_block_group_item {
764 __le64 used; 833 __le64 used;
@@ -916,6 +985,7 @@ struct btrfs_block_group_cache {
916struct reloc_control; 985struct reloc_control;
917struct btrfs_device; 986struct btrfs_device;
918struct btrfs_fs_devices; 987struct btrfs_fs_devices;
988struct btrfs_balance_control;
919struct btrfs_delayed_root; 989struct btrfs_delayed_root;
920struct btrfs_fs_info { 990struct btrfs_fs_info {
921 u8 fsid[BTRFS_FSID_SIZE]; 991 u8 fsid[BTRFS_FSID_SIZE];
@@ -971,7 +1041,7 @@ struct btrfs_fs_info {
971 * is required instead of the faster short fsync log commits 1041 * is required instead of the faster short fsync log commits
972 */ 1042 */
973 u64 last_trans_log_full_commit; 1043 u64 last_trans_log_full_commit;
974 unsigned long mount_opt:20; 1044 unsigned long mount_opt:21;
975 unsigned long compress_type:4; 1045 unsigned long compress_type:4;
976 u64 max_inline; 1046 u64 max_inline;
977 u64 alloc_start; 1047 u64 alloc_start;
@@ -1132,12 +1202,23 @@ struct btrfs_fs_info {
1132 spinlock_t ref_cache_lock; 1202 spinlock_t ref_cache_lock;
1133 u64 total_ref_cache_size; 1203 u64 total_ref_cache_size;
1134 1204
1205 /*
1206 * these three are in extended format (availability of single
1207 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
1208 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
1209 */
1135 u64 avail_data_alloc_bits; 1210 u64 avail_data_alloc_bits;
1136 u64 avail_metadata_alloc_bits; 1211 u64 avail_metadata_alloc_bits;
1137 u64 avail_system_alloc_bits; 1212 u64 avail_system_alloc_bits;
1138 u64 data_alloc_profile; 1213
1139 u64 metadata_alloc_profile; 1214 /* restriper state */
1140 u64 system_alloc_profile; 1215 spinlock_t balance_lock;
1216 struct mutex balance_mutex;
1217 atomic_t balance_running;
1218 atomic_t balance_pause_req;
1219 atomic_t balance_cancel_req;
1220 struct btrfs_balance_control *balance_ctl;
1221 wait_queue_head_t balance_wait_q;
1141 1222
1142 unsigned data_chunk_allocations; 1223 unsigned data_chunk_allocations;
1143 unsigned metadata_ratio; 1224 unsigned metadata_ratio;
@@ -1155,6 +1236,10 @@ struct btrfs_fs_info {
1155 int scrub_workers_refcnt; 1236 int scrub_workers_refcnt;
1156 struct btrfs_workers scrub_workers; 1237 struct btrfs_workers scrub_workers;
1157 1238
1239#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1240 u32 check_integrity_print_mask;
1241#endif
1242
1158 /* filesystem state */ 1243 /* filesystem state */
1159 u64 fs_state; 1244 u64 fs_state;
1160 1245
@@ -1383,6 +1468,8 @@ struct btrfs_ioctl_defrag_range_args {
1383#define BTRFS_DEV_ITEM_KEY 216 1468#define BTRFS_DEV_ITEM_KEY 216
1384#define BTRFS_CHUNK_ITEM_KEY 228 1469#define BTRFS_CHUNK_ITEM_KEY 228
1385 1470
1471#define BTRFS_BALANCE_ITEM_KEY 248
1472
1386/* 1473/*
1387 * string items are for debugging. They just store a short string of 1474 * string items are for debugging. They just store a short string of
1388 * data in the FS 1475 * data in the FS
@@ -1413,6 +1500,9 @@ struct btrfs_ioctl_defrag_range_args {
1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1500#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1501#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18) 1502#define BTRFS_MOUNT_RECOVERY (1 << 18)
1503#define BTRFS_MOUNT_SKIP_BALANCE (1 << 19)
1504#define BTRFS_MOUNT_CHECK_INTEGRITY (1 << 20)
1505#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
1416 1506
1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1507#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1508#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2077,8 +2167,86 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, 2167BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64); 2168 num_devices, 64);
2079 2169
2080/* struct btrfs_super_block */ 2170/* struct btrfs_balance_item */
2171BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
2081 2172
2173static inline void btrfs_balance_data(struct extent_buffer *eb,
2174 struct btrfs_balance_item *bi,
2175 struct btrfs_disk_balance_args *ba)
2176{
2177 read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2178}
2179
2180static inline void btrfs_set_balance_data(struct extent_buffer *eb,
2181 struct btrfs_balance_item *bi,
2182 struct btrfs_disk_balance_args *ba)
2183{
2184 write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
2185}
2186
2187static inline void btrfs_balance_meta(struct extent_buffer *eb,
2188 struct btrfs_balance_item *bi,
2189 struct btrfs_disk_balance_args *ba)
2190{
2191 read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2192}
2193
2194static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
2195 struct btrfs_balance_item *bi,
2196 struct btrfs_disk_balance_args *ba)
2197{
2198 write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
2199}
2200
2201static inline void btrfs_balance_sys(struct extent_buffer *eb,
2202 struct btrfs_balance_item *bi,
2203 struct btrfs_disk_balance_args *ba)
2204{
2205 read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2206}
2207
2208static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
2209 struct btrfs_balance_item *bi,
2210 struct btrfs_disk_balance_args *ba)
2211{
2212 write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
2213}
2214
2215static inline void
2216btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
2217 struct btrfs_disk_balance_args *disk)
2218{
2219 memset(cpu, 0, sizeof(*cpu));
2220
2221 cpu->profiles = le64_to_cpu(disk->profiles);
2222 cpu->usage = le64_to_cpu(disk->usage);
2223 cpu->devid = le64_to_cpu(disk->devid);
2224 cpu->pstart = le64_to_cpu(disk->pstart);
2225 cpu->pend = le64_to_cpu(disk->pend);
2226 cpu->vstart = le64_to_cpu(disk->vstart);
2227 cpu->vend = le64_to_cpu(disk->vend);
2228 cpu->target = le64_to_cpu(disk->target);
2229 cpu->flags = le64_to_cpu(disk->flags);
2230}
2231
2232static inline void
2233btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
2234 struct btrfs_balance_args *cpu)
2235{
2236 memset(disk, 0, sizeof(*disk));
2237
2238 disk->profiles = cpu_to_le64(cpu->profiles);
2239 disk->usage = cpu_to_le64(cpu->usage);
2240 disk->devid = cpu_to_le64(cpu->devid);
2241 disk->pstart = cpu_to_le64(cpu->pstart);
2242 disk->pend = cpu_to_le64(cpu->pend);
2243 disk->vstart = cpu_to_le64(cpu->vstart);
2244 disk->vend = cpu_to_le64(cpu->vend);
2245 disk->target = cpu_to_le64(cpu->target);
2246 disk->flags = cpu_to_le64(cpu->flags);
2247}
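btrfs_balance_args is the CPU-order counterpart declared for the ioctl interface (not shown in this hunk). A round trip through the two helpers is lossless for the named fields, and the memset() in each keeps the unused[] padding zeroed; a sketch:

	struct btrfs_balance_args cpu = { .devid = 1, .usage = 50 };
	struct btrfs_disk_balance_args disk;

	btrfs_cpu_balance_args_to_disk(&disk, &cpu);	/* cpu -> le64 */
	btrfs_disk_balance_args_to_cpu(&cpu, &disk);	/* le64 -> cpu */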
2248
2249/* struct btrfs_super_block */
2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2250BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
2083BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); 2251BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
2084BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, 2252BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2196,7 +2364,7 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
2196 return btrfs_item_size(eb, e) - offset; 2364 return btrfs_item_size(eb, e) - offset;
2197} 2365}
2198 2366
2199static inline struct btrfs_root *btrfs_sb(struct super_block *sb) 2367static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
2200{ 2368{
2201 return sb->s_fs_info; 2369 return sb->s_fs_info;
2202} 2370}
@@ -2277,11 +2445,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
2277 struct btrfs_root *root, u32 blocksize, 2445 struct btrfs_root *root, u32 blocksize,
2278 u64 parent, u64 root_objectid, 2446 u64 parent, u64 root_objectid,
2279 struct btrfs_disk_key *key, int level, 2447 struct btrfs_disk_key *key, int level,
2280 u64 hint, u64 empty_size); 2448 u64 hint, u64 empty_size, int for_cow);
2281void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2449void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
2282 struct btrfs_root *root, 2450 struct btrfs_root *root,
2283 struct extent_buffer *buf, 2451 struct extent_buffer *buf,
2284 u64 parent, int last_ref); 2452 u64 parent, int last_ref, int for_cow);
2285struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2453struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
2286 struct btrfs_root *root, 2454 struct btrfs_root *root,
2287 u64 bytenr, u32 blocksize, 2455 u64 bytenr, u32 blocksize,
@@ -2301,17 +2469,17 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
2301 u64 search_end, struct btrfs_key *ins, 2469 u64 search_end, struct btrfs_key *ins,
2302 u64 data); 2470 u64 data);
2303int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2471int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2304 struct extent_buffer *buf, int full_backref); 2472 struct extent_buffer *buf, int full_backref, int for_cow);
2305int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2473int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2306 struct extent_buffer *buf, int full_backref); 2474 struct extent_buffer *buf, int full_backref, int for_cow);
2307int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 2475int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2308 struct btrfs_root *root, 2476 struct btrfs_root *root,
2309 u64 bytenr, u64 num_bytes, u64 flags, 2477 u64 bytenr, u64 num_bytes, u64 flags,
2310 int is_data); 2478 int is_data);
2311int btrfs_free_extent(struct btrfs_trans_handle *trans, 2479int btrfs_free_extent(struct btrfs_trans_handle *trans,
2312 struct btrfs_root *root, 2480 struct btrfs_root *root,
2313 u64 bytenr, u64 num_bytes, u64 parent, 2481 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
2314 u64 root_objectid, u64 owner, u64 offset); 2482 u64 owner, u64 offset, int for_cow);
2315 2483
2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2484int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, 2485int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -2323,7 +2491,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2323int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2491int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2324 struct btrfs_root *root, 2492 struct btrfs_root *root,
2325 u64 bytenr, u64 num_bytes, u64 parent, 2493 u64 bytenr, u64 num_bytes, u64 parent,
2326 u64 root_objectid, u64 owner, u64 offset); 2494 u64 root_objectid, u64 owner, u64 offset, int for_cow);
2327 2495
2328int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2496int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2329 struct btrfs_root *root); 2497 struct btrfs_root *root);
@@ -2482,10 +2650,18 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2482} 2650}
2483 2651
2484int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2652int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2653static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
2654{
2655 ++p->slots[0];
2656 if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
2657 return btrfs_next_leaf(root, p);
2658 return 0;
2659}
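A sketch of the iteration idiom the new helper enables, advancing item by item and hopping to the next leaf transparently (path setup and error handling elided):

	while (1) {
		struct btrfs_key key;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		/* ... process the item at path->slots[0] ... */

		ret = btrfs_next_item(root, path);
		if (ret)
			break;	/* > 0: past the last leaf, < 0: error */
	}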
2485int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2660int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2486int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2661int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2487void btrfs_drop_snapshot(struct btrfs_root *root, 2662void btrfs_drop_snapshot(struct btrfs_root *root,
2488 struct btrfs_block_rsv *block_rsv, int update_ref); 2663 struct btrfs_block_rsv *block_rsv, int update_ref,
2664 int for_reloc);
2489int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2665int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2490 struct btrfs_root *root, 2666 struct btrfs_root *root,
2491 struct extent_buffer *node, 2667 struct extent_buffer *node,
@@ -2500,6 +2676,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2500} 2676}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info) 2677static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{ 2678{
2679 kfree(fs_info->balance_ctl);
2503 kfree(fs_info->delayed_root); 2680 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root); 2681 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root); 2682 kfree(fs_info->tree_root);
@@ -2510,6 +2687,24 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2510 kfree(fs_info->super_for_commit); 2687 kfree(fs_info->super_for_commit);
2511 kfree(fs_info); 2688 kfree(fs_info);
2512} 2689}
2690/**
2691 * profile_is_valid - tests whether a given profile is valid and reduced
2692 * @flags: profile to validate
2693 * @extended: if true @flags is treated as an extended profile
2694 */
2695static inline int profile_is_valid(u64 flags, int extended)
2696{
2697 u64 mask = ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
2698
2699 flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
2700 if (extended)
2701 mask &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2702
2703 if (flags & mask)
2704 return 0;
2705 /* true if zero or exactly one bit set */
2706 return (flags & (~flags + 1)) == flags;
2707}
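The closing test uses the lowest-set-bit identity: flags & (~flags + 1) is flags & -flags, which isolates the lowest set bit, so the equality holds exactly when zero or one bit survives the masking. A standalone userspace sketch of the same check:

#include <assert.h>
#include <stdint.h>

/* same zero-or-exactly-one-bit test as profile_is_valid() above */
static int at_most_one_bit(uint64_t flags)
{
	return (flags & (~flags + 1)) == flags;
}

int main(void)
{
	assert(at_most_one_bit(0));			/* no bits set  */
	assert(at_most_one_bit(1ULL << 3));		/* one bit set  */
	assert(!at_most_one_bit((1ULL << 3) | (1ULL << 5)));	/* two bits */
	return 0;
}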
2513 2708
2514/* root-item.c */ 2709/* root-item.c */
2515int btrfs_find_root_ref(struct btrfs_root *tree_root, 2710int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 9c1eccc2c503..fe4cd0f1cef1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -595,8 +595,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
598 if (!ret) 598 if (!ret) {
599 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
600 item->key.objectid,
601 num_bytes, 1);
599 item->bytes_reserved = num_bytes; 602 item->bytes_reserved = num_bytes;
603 }
600 604
601 return ret; 605 return ret;
602} 606}
@@ -610,6 +614,9 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
610 return; 614 return;
611 615
612 rsv = &root->fs_info->delayed_block_rsv; 616 rsv = &root->fs_info->delayed_block_rsv;
617 trace_btrfs_space_reservation(root->fs_info, "delayed_item",
618 item->key.objectid, item->bytes_reserved,
619 0);
613 btrfs_block_rsv_release(root, rsv, 620 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 621 item->bytes_reserved);
615} 622}
@@ -624,7 +631,7 @@ static int btrfs_delayed_inode_reserve_metadata(
624 struct btrfs_block_rsv *dst_rsv; 631 struct btrfs_block_rsv *dst_rsv;
625 u64 num_bytes; 632 u64 num_bytes;
626 int ret; 633 int ret;
627 int release = false; 634 bool release = false;
628 635
629 src_rsv = trans->block_rsv; 636 src_rsv = trans->block_rsv;
630 dst_rsv = &root->fs_info->delayed_block_rsv; 637 dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -651,8 +658,13 @@ static int btrfs_delayed_inode_reserve_metadata(
651 */ 658 */
652 if (ret == -EAGAIN) 659 if (ret == -EAGAIN)
653 ret = -ENOSPC; 660 ret = -ENOSPC;
654 if (!ret) 661 if (!ret) {
655 node->bytes_reserved = num_bytes; 662 node->bytes_reserved = num_bytes;
663 trace_btrfs_space_reservation(root->fs_info,
664 "delayed_inode",
665 btrfs_ino(inode),
666 num_bytes, 1);
667 }
656 return ret; 668 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) { 669 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock); 670 spin_lock(&BTRFS_I(inode)->lock);
@@ -707,11 +719,17 @@ out:
707 * reservation here. I think it may be time for a documentation page on 719 * reservation here. I think it may be time for a documentation page on
 708 * how block rsvs work. 720 * how block rsvs work.
709 */ 721 */
710 if (!ret) 722 if (!ret) {
723 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
724 btrfs_ino(inode), num_bytes, 1);
711 node->bytes_reserved = num_bytes; 725 node->bytes_reserved = num_bytes;
726 }
712 727
713 if (release) 728 if (release) {
729 trace_btrfs_space_reservation(root->fs_info, "delalloc",
730 btrfs_ino(inode), num_bytes, 0);
714 btrfs_block_rsv_release(root, src_rsv, num_bytes); 731 btrfs_block_rsv_release(root, src_rsv, num_bytes);
732 }
715 733
716 return ret; 734 return ret;
717} 735}
@@ -725,6 +743,8 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
725 return; 743 return;
726 744
727 rsv = &root->fs_info->delayed_block_rsv; 745 rsv = &root->fs_info->delayed_block_rsv;
746 trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
747 node->inode_id, node->bytes_reserved, 0);
728 btrfs_block_rsv_release(root, rsv, 748 btrfs_block_rsv_release(root, rsv,
729 node->bytes_reserved); 749 node->bytes_reserved);
730 node->bytes_reserved = 0; 750 node->bytes_reserved = 0;
@@ -1372,13 +1392,6 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1372 goto release_node; 1392 goto release_node;
1373 } 1393 }
1374 1394
1375 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1376 /*
1377 * we have reserved enough space when we start a new transaction,
1378 * so reserving metadata failure is impossible
1379 */
1380 BUG_ON(ret);
1381
1382 delayed_item->key.objectid = btrfs_ino(dir); 1395 delayed_item->key.objectid = btrfs_ino(dir);
1383 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1396 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
1384 delayed_item->key.offset = index; 1397 delayed_item->key.offset = index;
@@ -1391,6 +1404,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1391 dir_item->type = type; 1404 dir_item->type = type;
1392 memcpy((char *)(dir_item + 1), name, name_len); 1405 memcpy((char *)(dir_item + 1), name, name_len);
1393 1406
1407 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1408 /*
1409 * we have reserved enough space when we start a new transaction,
1410 * so reserving metadata failure is impossible
1411 */
1412 BUG_ON(ret);
1413
1414
1394 mutex_lock(&delayed_node->mutex); 1415 mutex_lock(&delayed_node->mutex);
1395 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); 1416 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1396 if (unlikely(ret)) { 1417 if (unlikely(ret)) {
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 125cf76fcd08..66e4f29505a3 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,6 +101,11 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
101 return -1; 101 return -1;
102 if (ref1->type > ref2->type) 102 if (ref1->type > ref2->type)
103 return 1; 103 return 1;
104 /* merging of sequenced refs is not allowed */
105 if (ref1->seq < ref2->seq)
106 return -1;
107 if (ref1->seq > ref2->seq)
108 return 1;
104 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || 109 if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
105 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { 110 ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
106 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), 111 return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
@@ -150,16 +155,22 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
150 155
151/* 156/*
 152 * find a head entry based on bytenr. This returns the delayed ref 157 * find a head entry based on bytenr. This returns the delayed ref
153 * head if it was able to find one, or NULL if nothing was in that spot 158 * head if it was able to find one, or NULL if nothing was in that spot.
159 * If return_bigger is given, the next bigger entry is returned if no exact
160 * match is found.
154 */ 161 */
155static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root, 162static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
156 u64 bytenr, 163 u64 bytenr,
157 struct btrfs_delayed_ref_node **last) 164 struct btrfs_delayed_ref_node **last,
165 int return_bigger)
158{ 166{
159 struct rb_node *n = root->rb_node; 167 struct rb_node *n;
160 struct btrfs_delayed_ref_node *entry; 168 struct btrfs_delayed_ref_node *entry;
161 int cmp; 169 int cmp = 0;
162 170
171again:
172 n = root->rb_node;
173 entry = NULL;
163 while (n) { 174 while (n) {
164 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node); 175 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
165 WARN_ON(!entry->in_tree); 176 WARN_ON(!entry->in_tree);
@@ -182,6 +193,19 @@ static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
182 else 193 else
183 return entry; 194 return entry;
184 } 195 }
196 if (entry && return_bigger) {
197 if (cmp > 0) {
198 n = rb_next(&entry->rb_node);
199 if (!n)
200 n = rb_first(root);
201 entry = rb_entry(n, struct btrfs_delayed_ref_node,
202 rb_node);
203 bytenr = entry->bytenr;
204 return_bigger = 0;
205 goto again;
206 }
207 return entry;
208 }
185 return NULL; 209 return NULL;
186} 210}
187 211
@@ -209,6 +233,24 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
209 return 0; 233 return 0;
210} 234}
211 235
236int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
237 u64 seq)
238{
239 struct seq_list *elem;
240
241 assert_spin_locked(&delayed_refs->lock);
242 if (list_empty(&delayed_refs->seq_head))
243 return 0;
244
245 elem = list_first_entry(&delayed_refs->seq_head, struct seq_list, list);
246 if (seq >= elem->seq) {
247 pr_debug("holding back delayed_ref %llu, lowest is %llu (%p)\n",
248 seq, elem->seq, delayed_refs);
249 return 1;
250 }
251 return 0;
252}
253
212int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 254int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
213 struct list_head *cluster, u64 start) 255 struct list_head *cluster, u64 start)
214{ 256{
@@ -223,20 +265,8 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
223 node = rb_first(&delayed_refs->root); 265 node = rb_first(&delayed_refs->root);
224 } else { 266 } else {
225 ref = NULL; 267 ref = NULL;
226 find_ref_head(&delayed_refs->root, start, &ref); 268 find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
227 if (ref) { 269 if (ref) {
228 struct btrfs_delayed_ref_node *tmp;
229
230 node = rb_prev(&ref->rb_node);
231 while (node) {
232 tmp = rb_entry(node,
233 struct btrfs_delayed_ref_node,
234 rb_node);
235 if (tmp->bytenr < start)
236 break;
237 ref = tmp;
238 node = rb_prev(&ref->rb_node);
239 }
240 node = &ref->rb_node; 270 node = &ref->rb_node;
241 } else 271 } else
242 node = rb_first(&delayed_refs->root); 272 node = rb_first(&delayed_refs->root);
@@ -390,7 +420,8 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
390 * this does all the dirty work in terms of maintaining the correct 420 * this does all the dirty work in terms of maintaining the correct
391 * overall modification count. 421 * overall modification count.
392 */ 422 */
393static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans, 423static noinline int add_delayed_ref_head(struct btrfs_fs_info *fs_info,
424 struct btrfs_trans_handle *trans,
394 struct btrfs_delayed_ref_node *ref, 425 struct btrfs_delayed_ref_node *ref,
395 u64 bytenr, u64 num_bytes, 426 u64 bytenr, u64 num_bytes,
396 int action, int is_data) 427 int action, int is_data)
@@ -437,6 +468,7 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
437 ref->action = 0; 468 ref->action = 0;
438 ref->is_head = 1; 469 ref->is_head = 1;
439 ref->in_tree = 1; 470 ref->in_tree = 1;
471 ref->seq = 0;
440 472
441 head_ref = btrfs_delayed_node_to_head(ref); 473 head_ref = btrfs_delayed_node_to_head(ref);
442 head_ref->must_insert_reserved = must_insert_reserved; 474 head_ref->must_insert_reserved = must_insert_reserved;
@@ -468,14 +500,17 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
468/* 500/*
469 * helper to insert a delayed tree ref into the rbtree. 501 * helper to insert a delayed tree ref into the rbtree.
470 */ 502 */
471static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans, 503static noinline int add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
504 struct btrfs_trans_handle *trans,
472 struct btrfs_delayed_ref_node *ref, 505 struct btrfs_delayed_ref_node *ref,
473 u64 bytenr, u64 num_bytes, u64 parent, 506 u64 bytenr, u64 num_bytes, u64 parent,
474 u64 ref_root, int level, int action) 507 u64 ref_root, int level, int action,
508 int for_cow)
475{ 509{
476 struct btrfs_delayed_ref_node *existing; 510 struct btrfs_delayed_ref_node *existing;
477 struct btrfs_delayed_tree_ref *full_ref; 511 struct btrfs_delayed_tree_ref *full_ref;
478 struct btrfs_delayed_ref_root *delayed_refs; 512 struct btrfs_delayed_ref_root *delayed_refs;
513 u64 seq = 0;
479 514
480 if (action == BTRFS_ADD_DELAYED_EXTENT) 515 if (action == BTRFS_ADD_DELAYED_EXTENT)
481 action = BTRFS_ADD_DELAYED_REF; 516 action = BTRFS_ADD_DELAYED_REF;
@@ -491,14 +526,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
491 ref->is_head = 0; 526 ref->is_head = 0;
492 ref->in_tree = 1; 527 ref->in_tree = 1;
493 528
529 if (need_ref_seq(for_cow, ref_root))
530 seq = inc_delayed_seq(delayed_refs);
531 ref->seq = seq;
532
494 full_ref = btrfs_delayed_node_to_tree_ref(ref); 533 full_ref = btrfs_delayed_node_to_tree_ref(ref);
495 if (parent) { 534 full_ref->parent = parent;
496 full_ref->parent = parent; 535 full_ref->root = ref_root;
536 if (parent)
497 ref->type = BTRFS_SHARED_BLOCK_REF_KEY; 537 ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
498 } else { 538 else
499 full_ref->root = ref_root;
500 ref->type = BTRFS_TREE_BLOCK_REF_KEY; 539 ref->type = BTRFS_TREE_BLOCK_REF_KEY;
501 }
502 full_ref->level = level; 540 full_ref->level = level;
503 541
504 trace_btrfs_delayed_tree_ref(ref, full_ref, action); 542 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
@@ -522,15 +560,17 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
522/* 560/*
523 * helper to insert a delayed data ref into the rbtree. 561 * helper to insert a delayed data ref into the rbtree.
524 */ 562 */
525static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans, 563static noinline int add_delayed_data_ref(struct btrfs_fs_info *fs_info,
564 struct btrfs_trans_handle *trans,
526 struct btrfs_delayed_ref_node *ref, 565 struct btrfs_delayed_ref_node *ref,
527 u64 bytenr, u64 num_bytes, u64 parent, 566 u64 bytenr, u64 num_bytes, u64 parent,
528 u64 ref_root, u64 owner, u64 offset, 567 u64 ref_root, u64 owner, u64 offset,
529 int action) 568 int action, int for_cow)
530{ 569{
531 struct btrfs_delayed_ref_node *existing; 570 struct btrfs_delayed_ref_node *existing;
532 struct btrfs_delayed_data_ref *full_ref; 571 struct btrfs_delayed_data_ref *full_ref;
533 struct btrfs_delayed_ref_root *delayed_refs; 572 struct btrfs_delayed_ref_root *delayed_refs;
573 u64 seq = 0;
534 574
535 if (action == BTRFS_ADD_DELAYED_EXTENT) 575 if (action == BTRFS_ADD_DELAYED_EXTENT)
536 action = BTRFS_ADD_DELAYED_REF; 576 action = BTRFS_ADD_DELAYED_REF;
@@ -546,14 +586,18 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
546 ref->is_head = 0; 586 ref->is_head = 0;
547 ref->in_tree = 1; 587 ref->in_tree = 1;
548 588
589 if (need_ref_seq(for_cow, ref_root))
590 seq = inc_delayed_seq(delayed_refs);
591 ref->seq = seq;
592
549 full_ref = btrfs_delayed_node_to_data_ref(ref); 593 full_ref = btrfs_delayed_node_to_data_ref(ref);
550 if (parent) { 594 full_ref->parent = parent;
551 full_ref->parent = parent; 595 full_ref->root = ref_root;
596 if (parent)
552 ref->type = BTRFS_SHARED_DATA_REF_KEY; 597 ref->type = BTRFS_SHARED_DATA_REF_KEY;
553 } else { 598 else
554 full_ref->root = ref_root;
555 ref->type = BTRFS_EXTENT_DATA_REF_KEY; 599 ref->type = BTRFS_EXTENT_DATA_REF_KEY;
556 } 600
557 full_ref->objectid = owner; 601 full_ref->objectid = owner;
558 full_ref->offset = offset; 602 full_ref->offset = offset;
559 603
@@ -580,10 +624,12 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
580 * to make sure the delayed ref is eventually processed before this 624 * to make sure the delayed ref is eventually processed before this
581 * transaction commits. 625 * transaction commits.
582 */ 626 */
583int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 627int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
628 struct btrfs_trans_handle *trans,
584 u64 bytenr, u64 num_bytes, u64 parent, 629 u64 bytenr, u64 num_bytes, u64 parent,
585 u64 ref_root, int level, int action, 630 u64 ref_root, int level, int action,
586 struct btrfs_delayed_extent_op *extent_op) 631 struct btrfs_delayed_extent_op *extent_op,
632 int for_cow)
587{ 633{
588 struct btrfs_delayed_tree_ref *ref; 634 struct btrfs_delayed_tree_ref *ref;
589 struct btrfs_delayed_ref_head *head_ref; 635 struct btrfs_delayed_ref_head *head_ref;
@@ -610,13 +656,17 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
610 * insert both the head node and the new ref without dropping 656 * insert both the head node and the new ref without dropping
611 * the spin lock 657 * the spin lock
612 */ 658 */
613 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 659 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
614 action, 0); 660 num_bytes, action, 0);
615 BUG_ON(ret); 661 BUG_ON(ret);
616 662
617 ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes, 663 ret = add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
618 parent, ref_root, level, action); 664 num_bytes, parent, ref_root, level, action,
665 for_cow);
619 BUG_ON(ret); 666 BUG_ON(ret);
667 if (!need_ref_seq(for_cow, ref_root) &&
668 waitqueue_active(&delayed_refs->seq_wait))
669 wake_up(&delayed_refs->seq_wait);
620 spin_unlock(&delayed_refs->lock); 670 spin_unlock(&delayed_refs->lock);
621 return 0; 671 return 0;
622} 672}
@@ -624,11 +674,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
624/* 674/*
625 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref. 675 * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
626 */ 676 */
627int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 677int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
678 struct btrfs_trans_handle *trans,
628 u64 bytenr, u64 num_bytes, 679 u64 bytenr, u64 num_bytes,
629 u64 parent, u64 ref_root, 680 u64 parent, u64 ref_root,
630 u64 owner, u64 offset, int action, 681 u64 owner, u64 offset, int action,
631 struct btrfs_delayed_extent_op *extent_op) 682 struct btrfs_delayed_extent_op *extent_op,
683 int for_cow)
632{ 684{
633 struct btrfs_delayed_data_ref *ref; 685 struct btrfs_delayed_data_ref *ref;
634 struct btrfs_delayed_ref_head *head_ref; 686 struct btrfs_delayed_ref_head *head_ref;
@@ -655,18 +707,23 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
655 * insert both the head node and the new ref without dropping 707 * insert both the head node and the new ref without dropping
656 * the spin lock 708 * the spin lock
657 */ 709 */
658 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes, 710 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
659 action, 1); 711 num_bytes, action, 1);
660 BUG_ON(ret); 712 BUG_ON(ret);
661 713
662 ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes, 714 ret = add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
663 parent, ref_root, owner, offset, action); 715 num_bytes, parent, ref_root, owner, offset,
716 action, for_cow);
664 BUG_ON(ret); 717 BUG_ON(ret);
718 if (!need_ref_seq(for_cow, ref_root) &&
719 waitqueue_active(&delayed_refs->seq_wait))
720 wake_up(&delayed_refs->seq_wait);
665 spin_unlock(&delayed_refs->lock); 721 spin_unlock(&delayed_refs->lock);
666 return 0; 722 return 0;
667} 723}
668 724
669int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 725int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
726 struct btrfs_trans_handle *trans,
670 u64 bytenr, u64 num_bytes, 727 u64 bytenr, u64 num_bytes,
671 struct btrfs_delayed_extent_op *extent_op) 728 struct btrfs_delayed_extent_op *extent_op)
672{ 729{
@@ -683,11 +740,13 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
683 delayed_refs = &trans->transaction->delayed_refs; 740 delayed_refs = &trans->transaction->delayed_refs;
684 spin_lock(&delayed_refs->lock); 741 spin_lock(&delayed_refs->lock);
685 742
686 ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, 743 ret = add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
687 num_bytes, BTRFS_UPDATE_DELAYED_HEAD, 744 num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
688 extent_op->is_data); 745 extent_op->is_data);
689 BUG_ON(ret); 746 BUG_ON(ret);
690 747
748 if (waitqueue_active(&delayed_refs->seq_wait))
749 wake_up(&delayed_refs->seq_wait);
691 spin_unlock(&delayed_refs->lock); 750 spin_unlock(&delayed_refs->lock);
692 return 0; 751 return 0;
693} 752}
@@ -704,7 +763,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
704 struct btrfs_delayed_ref_root *delayed_refs; 763 struct btrfs_delayed_ref_root *delayed_refs;
705 764
706 delayed_refs = &trans->transaction->delayed_refs; 765 delayed_refs = &trans->transaction->delayed_refs;
707 ref = find_ref_head(&delayed_refs->root, bytenr, NULL); 766 ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
708 if (ref) 767 if (ref)
709 return btrfs_delayed_node_to_head(ref); 768 return btrfs_delayed_node_to_head(ref);
710 return NULL; 769 return NULL;
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index e287e3b0eab0..d8f244d94925 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -33,6 +33,9 @@ struct btrfs_delayed_ref_node {
33 /* the size of the extent */ 33 /* the size of the extent */
34 u64 num_bytes; 34 u64 num_bytes;
35 35
36 /* seq number to keep track of insertion order */
37 u64 seq;
38
36 /* ref count on this data structure */ 39 /* ref count on this data structure */
37 atomic_t refs; 40 atomic_t refs;
38 41
@@ -98,19 +101,15 @@ struct btrfs_delayed_ref_head {
98 101
99struct btrfs_delayed_tree_ref { 102struct btrfs_delayed_tree_ref {
100 struct btrfs_delayed_ref_node node; 103 struct btrfs_delayed_ref_node node;
101 union { 104 u64 root;
102 u64 root; 105 u64 parent;
103 u64 parent;
104 };
105 int level; 106 int level;
106}; 107};
107 108
108struct btrfs_delayed_data_ref { 109struct btrfs_delayed_data_ref {
109 struct btrfs_delayed_ref_node node; 110 struct btrfs_delayed_ref_node node;
110 union { 111 u64 root;
111 u64 root; 112 u64 parent;
112 u64 parent;
113 };
114 u64 objectid; 113 u64 objectid;
115 u64 offset; 114 u64 offset;
116}; 115};
@@ -140,6 +139,26 @@ struct btrfs_delayed_ref_root {
140 int flushing; 139 int flushing;
141 140
142 u64 run_delayed_start; 141 u64 run_delayed_start;
142
143 /*
144 * seq number of delayed refs. We need to know if a backref was being
145 * added before the currently processed ref or afterwards.
146 */
147 u64 seq;
148
149 /*
 150 * seq_head holds the seq numbers pinned by all backref walks currently
 151 * in flight. While walking backrefs (btrfs_find_all_roots, qgroups),
 152 * which might take some time, no newer ref must be processed, as it
 153 * might influence the outcome of the walk.
154 */
155 struct list_head seq_head;
156
157 /*
158 * when the only refs we have in the list must not be processed, we want
159 * to wait for more refs to show up or for the end of backref walking.
160 */
161 wait_queue_head_t seq_wait;
143}; 162};
144 163
145static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) 164static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
@@ -151,16 +170,21 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
151 } 170 }
152} 171}
153 172
154int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, 173int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
174 struct btrfs_trans_handle *trans,
155 u64 bytenr, u64 num_bytes, u64 parent, 175 u64 bytenr, u64 num_bytes, u64 parent,
156 u64 ref_root, int level, int action, 176 u64 ref_root, int level, int action,
157 struct btrfs_delayed_extent_op *extent_op); 177 struct btrfs_delayed_extent_op *extent_op,
158int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, 178 int for_cow);
179int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
180 struct btrfs_trans_handle *trans,
159 u64 bytenr, u64 num_bytes, 181 u64 bytenr, u64 num_bytes,
160 u64 parent, u64 ref_root, 182 u64 parent, u64 ref_root,
161 u64 owner, u64 offset, int action, 183 u64 owner, u64 offset, int action,
162 struct btrfs_delayed_extent_op *extent_op); 184 struct btrfs_delayed_extent_op *extent_op,
163int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, 185 int for_cow);
186int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
187 struct btrfs_trans_handle *trans,
164 u64 bytenr, u64 num_bytes, 188 u64 bytenr, u64 num_bytes,
165 struct btrfs_delayed_extent_op *extent_op); 189 struct btrfs_delayed_extent_op *extent_op);
166 190
@@ -170,6 +194,60 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
170 struct btrfs_delayed_ref_head *head); 194 struct btrfs_delayed_ref_head *head);
171int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 195int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
172 struct list_head *cluster, u64 search_start); 196 struct list_head *cluster, u64 search_start);
197
198struct seq_list {
199 struct list_head list;
200 u64 seq;
201};
202
203static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
204{
205 assert_spin_locked(&delayed_refs->lock);
206 ++delayed_refs->seq;
207 return delayed_refs->seq;
208}
209
210static inline void
211btrfs_get_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
212 struct seq_list *elem)
213{
214 assert_spin_locked(&delayed_refs->lock);
215 elem->seq = delayed_refs->seq;
216 list_add_tail(&elem->list, &delayed_refs->seq_head);
217}
218
219static inline void
220btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
221 struct seq_list *elem)
222{
223 spin_lock(&delayed_refs->lock);
224 list_del(&elem->list);
225 wake_up(&delayed_refs->seq_wait);
226 spin_unlock(&delayed_refs->lock);
227}
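A sketch of the expected bracket around a backref walk (the walk itself, e.g. btrfs_find_all_roots() as named in the comment above, is elided):

	struct seq_list elem;

	spin_lock(&delayed_refs->lock);
	btrfs_get_delayed_seq(delayed_refs, &elem);	/* pin current seq */
	spin_unlock(&delayed_refs->lock);

	/* ... walk backrefs; btrfs_check_delayed_seq() meanwhile holds
	 * back any delayed ref whose seq is at or above the oldest
	 * pinned seq ... */

	btrfs_put_delayed_seq(delayed_refs, &elem);	/* wakes seq_wait */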
228
229int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
230 u64 seq);
231
232/*
233 * delayed refs with a ref_seq > 0 must be held back during backref walking.
234 * this only applies to items in one of the fs-trees. for_cow items never need
235 * to be held back, so they won't get a ref_seq number.
236 */
237static inline int need_ref_seq(int for_cow, u64 rootid)
238{
239 if (for_cow)
240 return 0;
241
242 if (rootid == BTRFS_FS_TREE_OBJECTID)
243 return 1;
244
245 if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
246 return 1;
247
248 return 0;
249}
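Concretely, only refs generated against user-visible fs trees outside of COW get a sequence number; the (s64) casts keep the special negative objectids (e.g. BTRFS_ORPHAN_OBJECTID) from matching. A sketch with objectids from ctree.h:

	need_ref_seq(0, BTRFS_FS_TREE_OBJECTID);	/* 1: default fs tree */
	need_ref_seq(0, BTRFS_FIRST_FREE_OBJECTID);	/* 1: a subvolume     */
	need_ref_seq(0, BTRFS_EXTENT_TREE_OBJECTID);	/* 0: internal tree   */
	need_ref_seq(1, BTRFS_FS_TREE_OBJECTID);	/* 0: for_cow ref     */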
250
173/* 251/*
174 * a node might live in a head or a regular ref, this lets you 252 * a node might live in a head or a regular ref, this lets you
175 * test for the proper type to use. 253 * test for the proper type to use.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f99a099a7747..811d9f918b1c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -43,6 +43,7 @@
43#include "tree-log.h" 43#include "tree-log.h"
44#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h" 45#include "inode-map.h"
46#include "check-integrity.h"
46 47
47static struct extent_io_ops btree_extent_io_ops; 48static struct extent_io_ops btree_extent_io_ops;
48static void end_workqueue_fn(struct btrfs_work *work); 49static void end_workqueue_fn(struct btrfs_work *work);
@@ -872,7 +873,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
872 873
873#ifdef CONFIG_MIGRATION 874#ifdef CONFIG_MIGRATION
874static int btree_migratepage(struct address_space *mapping, 875static int btree_migratepage(struct address_space *mapping,
875 struct page *newpage, struct page *page) 876 struct page *newpage, struct page *page,
877 enum migrate_mode mode)
876{ 878{
877 /* 879 /*
878 * we can't safely write a btree page from here, 880 * we can't safely write a btree page from here,
@@ -887,7 +889,7 @@ static int btree_migratepage(struct address_space *mapping,
887 if (page_has_private(page) && 889 if (page_has_private(page) &&
888 !try_to_release_page(page, GFP_KERNEL)) 890 !try_to_release_page(page, GFP_KERNEL))
889 return -EAGAIN; 891 return -EAGAIN;
890 return migrate_page(mapping, newpage, page); 892 return migrate_page(mapping, newpage, page, mode);
891} 893}
892#endif 894#endif
893 895
@@ -960,6 +962,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
960 tree = &BTRFS_I(page->mapping->host)->io_tree; 962 tree = &BTRFS_I(page->mapping->host)->io_tree;
961 map = &BTRFS_I(page->mapping->host)->extent_tree; 963 map = &BTRFS_I(page->mapping->host)->extent_tree;
962 964
965 /*
966 * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
967 * slab allocation from alloc_extent_state down the callchain where
968 * it'd hit a BUG_ON as those flags are not allowed.
969 */
970 gfp_flags &= ~GFP_SLAB_BUG_MASK;
971
963 ret = try_release_extent_state(map, tree, page, gfp_flags); 972 ret = try_release_extent_state(map, tree, page, gfp_flags);
964 if (!ret) 973 if (!ret)
965 return 0; 974 return 0;
@@ -1142,7 +1151,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1142 root->orphan_item_inserted = 0; 1151 root->orphan_item_inserted = 0;
1143 root->orphan_cleanup_state = 0; 1152 root->orphan_cleanup_state = 0;
1144 1153
1145 root->fs_info = fs_info;
1146 root->objectid = objectid; 1154 root->objectid = objectid;
1147 root->last_trans = 0; 1155 root->last_trans = 0;
1148 root->highest_objectid = 0; 1156 root->highest_objectid = 0;
@@ -1216,6 +1224,14 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1216 return 0; 1224 return 0;
1217} 1225}
1218 1226
1227static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
1228{
1229 struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
1230 if (root)
1231 root->fs_info = fs_info;
1232 return root;
1233}
1234
1219static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 1235static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1220 struct btrfs_fs_info *fs_info) 1236 struct btrfs_fs_info *fs_info)
1221{ 1237{
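btrfs_alloc_root() exists so that root->fs_info is valid from the instant a root is allocated, which is why the separate assignment in __setup_root() could be dropped above. A self-contained sketch of the pattern, with illustrative names rather than the kernel structures:

#include <stdio.h>
#include <stdlib.h>

/* Stamp the back-pointer at allocation time so no caller can ever
 * observe a root whose fs_info link is still NULL. */
struct fs_info { int id; };

struct root {
	struct fs_info *fs_info; /* valid from the moment of allocation */
	int objectid;            /* remaining fields start zeroed */
};

static struct root *alloc_root(struct fs_info *fs_info)
{
	struct root *root = calloc(1, sizeof(*root));
	if (root)
		root->fs_info = fs_info;
	return root;
}

int main(void)
{
	struct fs_info fi = { .id = 1 };
	/* as in open_ctree: many roots, one shared fs_info */
	struct root *tree_root = alloc_root(&fi);
	struct root *extent_root = alloc_root(&fi);

	if (!tree_root || !extent_root)
		return 1;
	printf("same fs_info: %d\n",
	       tree_root->fs_info == extent_root->fs_info);
	free(tree_root);
	free(extent_root);
	return 0;
}

Every later kzalloc of a root in this patch is converted to this helper, so no code path sees a half-initialized back-pointer.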
@@ -1223,7 +1239,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1223 struct btrfs_root *tree_root = fs_info->tree_root; 1239 struct btrfs_root *tree_root = fs_info->tree_root;
1224 struct extent_buffer *leaf; 1240 struct extent_buffer *leaf;
1225 1241
1226 root = kzalloc(sizeof(*root), GFP_NOFS); 1242 root = btrfs_alloc_root(fs_info);
1227 if (!root) 1243 if (!root)
1228 return ERR_PTR(-ENOMEM); 1244 return ERR_PTR(-ENOMEM);
1229 1245
@@ -1243,7 +1259,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1243 root->ref_cows = 0; 1259 root->ref_cows = 0;
1244 1260
1245 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 1261 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
1246 BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0); 1262 BTRFS_TREE_LOG_OBJECTID, NULL,
1263 0, 0, 0, 0);
1247 if (IS_ERR(leaf)) { 1264 if (IS_ERR(leaf)) {
1248 kfree(root); 1265 kfree(root);
1249 return ERR_CAST(leaf); 1266 return ERR_CAST(leaf);
@@ -1317,7 +1334,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1317 u32 blocksize; 1334 u32 blocksize;
1318 int ret = 0; 1335 int ret = 0;
1319 1336
1320 root = kzalloc(sizeof(*root), GFP_NOFS); 1337 root = btrfs_alloc_root(fs_info);
1321 if (!root) 1338 if (!root)
1322 return ERR_PTR(-ENOMEM); 1339 return ERR_PTR(-ENOMEM);
1323 if (location->offset == (u64)-1) { 1340 if (location->offset == (u64)-1) {
@@ -1873,9 +1890,9 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1873} 1890}
1874 1891
1875 1892
1876struct btrfs_root *open_ctree(struct super_block *sb, 1893int open_ctree(struct super_block *sb,
1877 struct btrfs_fs_devices *fs_devices, 1894 struct btrfs_fs_devices *fs_devices,
1878 char *options) 1895 char *options)
1879{ 1896{
1880 u32 sectorsize; 1897 u32 sectorsize;
1881 u32 nodesize; 1898 u32 nodesize;
@@ -1887,8 +1904,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1887 struct btrfs_key location; 1904 struct btrfs_key location;
1888 struct buffer_head *bh; 1905 struct buffer_head *bh;
1889 struct btrfs_super_block *disk_super; 1906 struct btrfs_super_block *disk_super;
1890 struct btrfs_root *tree_root = btrfs_sb(sb); 1907 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1891 struct btrfs_fs_info *fs_info = tree_root->fs_info; 1908 struct btrfs_root *tree_root;
1892 struct btrfs_root *extent_root; 1909 struct btrfs_root *extent_root;
1893 struct btrfs_root *csum_root; 1910 struct btrfs_root *csum_root;
1894 struct btrfs_root *chunk_root; 1911 struct btrfs_root *chunk_root;
@@ -1899,16 +1916,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1899 int num_backups_tried = 0; 1916 int num_backups_tried = 0;
1900 int backup_index = 0; 1917 int backup_index = 0;
1901 1918
1902 extent_root = fs_info->extent_root = 1919 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
1903 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1920 extent_root = fs_info->extent_root = btrfs_alloc_root(fs_info);
1904 csum_root = fs_info->csum_root = 1921 csum_root = fs_info->csum_root = btrfs_alloc_root(fs_info);
1905 kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 1922 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
1906 chunk_root = fs_info->chunk_root = 1923 dev_root = fs_info->dev_root = btrfs_alloc_root(fs_info);
1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 dev_root = fs_info->dev_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 1924
1911 if (!extent_root || !csum_root || !chunk_root || !dev_root) { 1925 if (!tree_root || !extent_root || !csum_root ||
1926 !chunk_root || !dev_root) {
1912 err = -ENOMEM; 1927 err = -ENOMEM;
1913 goto fail; 1928 goto fail;
1914 } 1929 }
@@ -1997,6 +2012,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1997 init_waitqueue_head(&fs_info->scrub_pause_wait); 2012 init_waitqueue_head(&fs_info->scrub_pause_wait);
1998 init_rwsem(&fs_info->scrub_super_lock); 2013 init_rwsem(&fs_info->scrub_super_lock);
1999 fs_info->scrub_workers_refcnt = 0; 2014 fs_info->scrub_workers_refcnt = 0;
2015#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2016 fs_info->check_integrity_print_mask = 0;
2017#endif
2018
2019 spin_lock_init(&fs_info->balance_lock);
2020 mutex_init(&fs_info->balance_mutex);
2021 atomic_set(&fs_info->balance_running, 0);
2022 atomic_set(&fs_info->balance_pause_req, 0);
2023 atomic_set(&fs_info->balance_cancel_req, 0);
2024 fs_info->balance_ctl = NULL;
2025 init_waitqueue_head(&fs_info->balance_wait_q);
2000 2026
2001 sb->s_blocksize = 4096; 2027 sb->s_blocksize = 4096;
2002 sb->s_blocksize_bits = blksize_bits(4096); 2028 sb->s_blocksize_bits = blksize_bits(4096);
@@ -2266,9 +2292,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2266 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), 2292 (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
2267 BTRFS_UUID_SIZE); 2293 BTRFS_UUID_SIZE);
2268 2294
2269 mutex_lock(&fs_info->chunk_mutex);
2270 ret = btrfs_read_chunk_tree(chunk_root); 2295 ret = btrfs_read_chunk_tree(chunk_root);
2271 mutex_unlock(&fs_info->chunk_mutex);
2272 if (ret) { 2296 if (ret) {
2273 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2297 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
2274 sb->s_id); 2298 sb->s_id);
@@ -2317,9 +2341,6 @@ retry_root_backup:
2317 2341
2318 fs_info->generation = generation; 2342 fs_info->generation = generation;
2319 fs_info->last_trans_committed = generation; 2343 fs_info->last_trans_committed = generation;
2320 fs_info->data_alloc_profile = (u64)-1;
2321 fs_info->metadata_alloc_profile = (u64)-1;
2322 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
2323 2344
2324 ret = btrfs_init_space_info(fs_info); 2345 ret = btrfs_init_space_info(fs_info);
2325 if (ret) { 2346 if (ret) {
@@ -2352,6 +2373,19 @@ retry_root_backup:
2352 btrfs_set_opt(fs_info->mount_opt, SSD); 2373 btrfs_set_opt(fs_info->mount_opt, SSD);
2353 } 2374 }
2354 2375
2376#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2377 if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
2378 ret = btrfsic_mount(tree_root, fs_devices,
2379 btrfs_test_opt(tree_root,
2380 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
2381 1 : 0,
2382 fs_info->check_integrity_print_mask);
2383 if (ret)
2384 printk(KERN_WARNING "btrfs: failed to initialize"
2385 " integrity check module %s\n", sb->s_id);
2386 }
2387#endif
2388
2355 /* do not make disk changes in broken FS */ 2389 /* do not make disk changes in broken FS */
2356 if (btrfs_super_log_root(disk_super) != 0 && 2390 if (btrfs_super_log_root(disk_super) != 0 &&
2357 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) { 2391 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
@@ -2367,7 +2401,7 @@ retry_root_backup:
2367 btrfs_level_size(tree_root, 2401 btrfs_level_size(tree_root,
2368 btrfs_super_log_root_level(disk_super)); 2402 btrfs_super_log_root_level(disk_super));
2369 2403
2370 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 2404 log_tree_root = btrfs_alloc_root(fs_info);
2371 if (!log_tree_root) { 2405 if (!log_tree_root) {
2372 err = -ENOMEM; 2406 err = -ENOMEM;
2373 goto fail_trans_kthread; 2407 goto fail_trans_kthread;
@@ -2422,13 +2456,17 @@ retry_root_backup:
2422 if (!err) 2456 if (!err)
2423 err = btrfs_orphan_cleanup(fs_info->tree_root); 2457 err = btrfs_orphan_cleanup(fs_info->tree_root);
2424 up_read(&fs_info->cleanup_work_sem); 2458 up_read(&fs_info->cleanup_work_sem);
2459
2460 if (!err)
2461 err = btrfs_recover_balance(fs_info->tree_root);
2462
2425 if (err) { 2463 if (err) {
2426 close_ctree(tree_root); 2464 close_ctree(tree_root);
2427 return ERR_PTR(err); 2465 return err;
2428 } 2466 }
2429 } 2467 }
2430 2468
2431 return tree_root; 2469 return 0;
2432 2470
2433fail_trans_kthread: 2471fail_trans_kthread:
2434 kthread_stop(fs_info->transaction_kthread); 2472 kthread_stop(fs_info->transaction_kthread);
@@ -2474,8 +2512,7 @@ fail_srcu:
2474 cleanup_srcu_struct(&fs_info->subvol_srcu); 2512 cleanup_srcu_struct(&fs_info->subvol_srcu);
2475fail: 2513fail:
2476 btrfs_close_devices(fs_info->fs_devices); 2514 btrfs_close_devices(fs_info->fs_devices);
2477 free_fs_info(fs_info); 2515 return err;
2478 return ERR_PTR(err);
2479 2516
2480recovery_tree_root: 2517recovery_tree_root:
2481 if (!btrfs_test_opt(tree_root, RECOVERY)) 2518 if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2630,7 +2667,7 @@ static int write_dev_supers(struct btrfs_device *device,
2630 * we fua the first super. The others we allow 2667 * we fua the first super. The others we allow
2631 * to go down lazy. 2668 * to go down lazy.
2632 */ 2669 */
2633 ret = submit_bh(WRITE_FUA, bh); 2670 ret = btrfsic_submit_bh(WRITE_FUA, bh);
2634 if (ret) 2671 if (ret)
2635 errors++; 2672 errors++;
2636 } 2673 }
@@ -2707,7 +2744,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
2707 device->flush_bio = bio; 2744 device->flush_bio = bio;
2708 2745
2709 bio_get(bio); 2746 bio_get(bio);
2710 submit_bio(WRITE_FLUSH, bio); 2747 btrfsic_submit_bio(WRITE_FLUSH, bio);
2711 2748
2712 return 0; 2749 return 0;
2713} 2750}
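With the integrity checker in the tree, superblock and flush submissions are routed through btrfsic_submit_bh()/btrfsic_submit_bio() instead of calling submit_bh()/submit_bio() directly. A userspace sketch of the wrapper idea, assuming (as is common for such hooks) that the wrapper reduces to the plain call when the checker is compiled out; names are illustrative:

#include <stdio.h>

/* All write submissions funnel through one wrapper that can validate
 * the I/O before forwarding it to the real submission path. */
struct bio { int rw; };

static int submit_bio(int rw, struct bio *bio)
{
	printf("submitting rw=%d\n", rw);
	return 0;
}

#ifdef CONFIG_CHECK_INTEGRITY
static int checked_submit_bio(int rw, struct bio *bio)
{
	/* the integrity checker would inspect the buffer here */
	printf("integrity check before submit\n");
	return submit_bio(rw, bio);
}
#else
/* checker compiled out: the hook costs nothing */
#define checked_submit_bio(rw, bio) submit_bio(rw, bio)
#endif

int main(void)
{
	struct bio b = { .rw = 1 };
	return checked_submit_bio(b.rw, &b);
}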
@@ -2971,6 +3008,9 @@ int close_ctree(struct btrfs_root *root)
2971 fs_info->closing = 1; 3008 fs_info->closing = 1;
2972 smp_mb(); 3009 smp_mb();
2973 3010
3011 /* pause restriper - we want to resume on mount */
3012 btrfs_pause_balance(root->fs_info);
3013
2974 btrfs_scrub_cancel(root); 3014 btrfs_scrub_cancel(root);
2975 3015
2976 /* wait for any defraggers to finish */ 3016 /* wait for any defraggers to finish */
@@ -2978,7 +3018,7 @@ int close_ctree(struct btrfs_root *root)
2978 (atomic_read(&fs_info->defrag_running) == 0)); 3018 (atomic_read(&fs_info->defrag_running) == 0));
2979 3019
2980 /* clear out the rbtree of defraggable inodes */ 3020 /* clear out the rbtree of defraggable inodes */
2981 btrfs_run_defrag_inodes(root->fs_info); 3021 btrfs_run_defrag_inodes(fs_info);
2982 3022
2983 /* 3023 /*
2984 * Here come 2 situations when btrfs is broken to flip readonly: 3024 * Here come 2 situations when btrfs is broken to flip readonly:
@@ -3007,8 +3047,8 @@ int close_ctree(struct btrfs_root *root)
3007 3047
3008 btrfs_put_block_group_cache(fs_info); 3048 btrfs_put_block_group_cache(fs_info);
3009 3049
3010 kthread_stop(root->fs_info->transaction_kthread); 3050 kthread_stop(fs_info->transaction_kthread);
3011 kthread_stop(root->fs_info->cleaner_kthread); 3051 kthread_stop(fs_info->cleaner_kthread);
3012 3052
3013 fs_info->closing = 2; 3053 fs_info->closing = 2;
3014 smp_mb(); 3054 smp_mb();
@@ -3026,14 +3066,14 @@ int close_ctree(struct btrfs_root *root)
3026 free_extent_buffer(fs_info->extent_root->commit_root); 3066 free_extent_buffer(fs_info->extent_root->commit_root);
3027 free_extent_buffer(fs_info->tree_root->node); 3067 free_extent_buffer(fs_info->tree_root->node);
3028 free_extent_buffer(fs_info->tree_root->commit_root); 3068 free_extent_buffer(fs_info->tree_root->commit_root);
3029 free_extent_buffer(root->fs_info->chunk_root->node); 3069 free_extent_buffer(fs_info->chunk_root->node);
3030 free_extent_buffer(root->fs_info->chunk_root->commit_root); 3070 free_extent_buffer(fs_info->chunk_root->commit_root);
3031 free_extent_buffer(root->fs_info->dev_root->node); 3071 free_extent_buffer(fs_info->dev_root->node);
3032 free_extent_buffer(root->fs_info->dev_root->commit_root); 3072 free_extent_buffer(fs_info->dev_root->commit_root);
3033 free_extent_buffer(root->fs_info->csum_root->node); 3073 free_extent_buffer(fs_info->csum_root->node);
3034 free_extent_buffer(root->fs_info->csum_root->commit_root); 3074 free_extent_buffer(fs_info->csum_root->commit_root);
3035 3075
3036 btrfs_free_block_groups(root->fs_info); 3076 btrfs_free_block_groups(fs_info);
3037 3077
3038 del_fs_roots(fs_info); 3078 del_fs_roots(fs_info);
3039 3079
@@ -3053,14 +3093,17 @@ int close_ctree(struct btrfs_root *root)
3053 btrfs_stop_workers(&fs_info->caching_workers); 3093 btrfs_stop_workers(&fs_info->caching_workers);
3054 btrfs_stop_workers(&fs_info->readahead_workers); 3094 btrfs_stop_workers(&fs_info->readahead_workers);
3055 3095
3096#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
3097 if (btrfs_test_opt(root, CHECK_INTEGRITY))
3098 btrfsic_unmount(root, fs_info->fs_devices);
3099#endif
3100
3056 btrfs_close_devices(fs_info->fs_devices); 3101 btrfs_close_devices(fs_info->fs_devices);
3057 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3102 btrfs_mapping_tree_free(&fs_info->mapping_tree);
3058 3103
3059 bdi_destroy(&fs_info->bdi); 3104 bdi_destroy(&fs_info->bdi);
3060 cleanup_srcu_struct(&fs_info->subvol_srcu); 3105 cleanup_srcu_struct(&fs_info->subvol_srcu);
3061 3106
3062 free_fs_info(fs_info);
3063
3064 return 0; 3107 return 0;
3065} 3108}
3066 3109
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c99d0a8f13fa..e4bc4741319b 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,9 +46,9 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
46 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
47int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
48 struct btrfs_root *root, struct extent_buffer *buf); 48 struct btrfs_root *root, struct extent_buffer *buf);
49struct btrfs_root *open_ctree(struct super_block *sb, 49int open_ctree(struct super_block *sb,
50 struct btrfs_fs_devices *fs_devices, 50 struct btrfs_fs_devices *fs_devices,
51 char *options); 51 char *options);
52int close_ctree(struct btrfs_root *root); 52int close_ctree(struct btrfs_root *root);
53int write_ctree_super(struct btrfs_trans_handle *trans, 53int write_ctree_super(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root, int max_mirrors); 54 struct btrfs_root *root, int max_mirrors);
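As the new prototype shows, open_ctree() now reports success or failure as an int and leaves ownership of the fs_info to its caller in the superblock code, rather than returning a root pointer with ERR_PTR-encoded errors. A userspace sketch of this calling convention, with illustrative names:

#include <errno.h>
#include <stdio.h>

/* Instead of a pointer that encodes errors, return a negative errno
 * and let results land in caller-owned state. */
struct fs_info { int mounted; };

static int open_fs(struct fs_info *fs_info, const char *options)
{
	if (!options)
		return -EINVAL; /* negative errno on failure */
	fs_info->mounted = 1;   /* caller reaches state it already owns */
	return 0;
}

int main(void)
{
	struct fs_info fi = { 0 };
	int err = open_fs(&fi, "defaults");

	if (err)
		fprintf(stderr, "mount failed: %d\n", err);
	else
		printf("mounted: %d\n", fi.mounted);
	return err;
}

This is also why the failure paths above stop calling free_fs_info(): the structure is no longer open_ctree()'s to free.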
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1b8dc33778f9..5f77166fd01c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -67,7 +67,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
67 u64 root_objectid, u32 generation, 67 u64 root_objectid, u32 generation,
68 int check_generation) 68 int check_generation)
69{ 69{
70 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 70 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
71 struct btrfs_root *root; 71 struct btrfs_root *root;
72 struct inode *inode; 72 struct inode *inode;
73 struct btrfs_key key; 73 struct btrfs_key key;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5fbe576d2ba..283af7a676a3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -34,23 +34,24 @@
34#include "locking.h" 34#include "locking.h"
35#include "free-space-cache.h" 35#include "free-space-cache.h"
36 36
37/* control flags for do_chunk_alloc's force field 37/*
38 * control flags for do_chunk_alloc's force field
38 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk 39 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
39 * if we really need one. 40 * if we really need one.
40 * 41 *
41 * CHUNK_ALLOC_FORCE means it must try to allocate one
42 *
43 * CHUNK_ALLOC_LIMITED means to only try and allocate one 42 * CHUNK_ALLOC_LIMITED means to only try and allocate one
44 * if we have very few chunks already allocated. This is 43 * if we have very few chunks already allocated. This is
45 * used as part of the clustering code to help make sure 44 * used as part of the clustering code to help make sure
46 * we have a good pool of storage to cluster in, without 45 * we have a good pool of storage to cluster in, without
47 * filling the FS with empty chunks 46 * filling the FS with empty chunks
48 * 47 *
48 * CHUNK_ALLOC_FORCE means it must try to allocate one
49 *
49 */ 50 */
50enum { 51enum {
51 CHUNK_ALLOC_NO_FORCE = 0, 52 CHUNK_ALLOC_NO_FORCE = 0,
52 CHUNK_ALLOC_FORCE = 1, 53 CHUNK_ALLOC_LIMITED = 1,
53 CHUNK_ALLOC_LIMITED = 2, 54 CHUNK_ALLOC_FORCE = 2,
54}; 55};
55 56
56/* 57/*
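The enum is reordered so the force levels increase with strength, which lets do_chunk_alloc() adopt a stronger pending request with a plain comparison (see the `force < space_info->force_alloc` change later in this patch). A compilable sketch of that design choice:

#include <stdio.h>

/* With force levels ordered by strength, promoting to a pending
 * stronger request needs no special cases. */
enum {
	CHUNK_ALLOC_NO_FORCE = 0, /* only allocate if really needed */
	CHUNK_ALLOC_LIMITED  = 1, /* only if very few chunks exist */
	CHUNK_ALLOC_FORCE    = 2, /* must try to allocate one */
};

int main(void)
{
	int force = CHUNK_ALLOC_NO_FORCE;
	int pending = CHUNK_ALLOC_LIMITED; /* space_info->force_alloc */

	/* as in do_chunk_alloc: take the stronger of the two requests */
	if (force < pending)
		force = pending;
	printf("effective force level: %d\n", force);
	return 0;
}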
@@ -618,8 +619,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618 struct list_head *head = &info->space_info; 619 struct list_head *head = &info->space_info;
619 struct btrfs_space_info *found; 620 struct btrfs_space_info *found;
620 621
621 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | 622 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
622 BTRFS_BLOCK_GROUP_METADATA;
623 623
624 rcu_read_lock(); 624 rcu_read_lock();
625 list_for_each_entry_rcu(found, head, list) { 625 list_for_each_entry_rcu(found, head, list) {
@@ -1872,20 +1872,24 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 1872int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1873 struct btrfs_root *root, 1873 struct btrfs_root *root,
1874 u64 bytenr, u64 num_bytes, u64 parent, 1874 u64 bytenr, u64 num_bytes, u64 parent,
1875 u64 root_objectid, u64 owner, u64 offset) 1875 u64 root_objectid, u64 owner, u64 offset, int for_cow)
1876{ 1876{
1877 int ret; 1877 int ret;
1878 struct btrfs_fs_info *fs_info = root->fs_info;
1879
1878 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && 1880 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1879 root_objectid == BTRFS_TREE_LOG_OBJECTID); 1881 root_objectid == BTRFS_TREE_LOG_OBJECTID);
1880 1882
1881 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1883 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1882 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 1884 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1885 num_bytes,
1883 parent, root_objectid, (int)owner, 1886 parent, root_objectid, (int)owner,
1884 BTRFS_ADD_DELAYED_REF, NULL); 1887 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1885 } else { 1888 } else {
1886 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 1889 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1890 num_bytes,
1887 parent, root_objectid, owner, offset, 1891 parent, root_objectid, owner, offset,
1888 BTRFS_ADD_DELAYED_REF, NULL); 1892 BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1889 } 1893 }
1890 return ret; 1894 return ret;
1891} 1895}
@@ -2233,6 +2237,28 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2233 } 2237 }
2234 2238
2235 /* 2239 /*
2240 * locked_ref is the head node, so we have to go one
2241 * node back for any delayed ref updates
2242 */
2243 ref = select_delayed_ref(locked_ref);
2244
2245 if (ref && ref->seq &&
2246 btrfs_check_delayed_seq(delayed_refs, ref->seq)) {
2247 /*
2248 * there are still refs with lower seq numbers in the
2249 * process of being added. Don't run this ref yet.
2250 */
2251 list_del_init(&locked_ref->cluster);
2252 mutex_unlock(&locked_ref->mutex);
2253 locked_ref = NULL;
2254 delayed_refs->num_heads_ready++;
2255 spin_unlock(&delayed_refs->lock);
2256 cond_resched();
2257 spin_lock(&delayed_refs->lock);
2258 continue;
2259 }
2260
2261 /*
2236 * record the must insert reserved flag before we 2262 * record the must insert reserved flag before we
2237 * drop the spin lock. 2263 * drop the spin lock.
2238 */ 2264 */
@@ -2242,11 +2268,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2242 extent_op = locked_ref->extent_op; 2268 extent_op = locked_ref->extent_op;
2243 locked_ref->extent_op = NULL; 2269 locked_ref->extent_op = NULL;
2244 2270
2245 /*
2246 * locked_ref is the head node, so we have to go one
2247 * node back for any delayed ref updates
2248 */
2249 ref = select_delayed_ref(locked_ref);
2250 if (!ref) { 2271 if (!ref) {
2251 /* All delayed refs have been processed, Go ahead 2272 /* All delayed refs have been processed, Go ahead
2252 * and send the head node to run_one_delayed_ref, 2273 * and send the head node to run_one_delayed_ref,
@@ -2267,9 +2288,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2267 BUG_ON(ret); 2288 BUG_ON(ret);
2268 kfree(extent_op); 2289 kfree(extent_op);
2269 2290
2270 cond_resched(); 2291 goto next;
2271 spin_lock(&delayed_refs->lock);
2272 continue;
2273 } 2292 }
2274 2293
2275 list_del_init(&locked_ref->cluster); 2294 list_del_init(&locked_ref->cluster);
@@ -2279,7 +2298,12 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2279 ref->in_tree = 0; 2298 ref->in_tree = 0;
2280 rb_erase(&ref->rb_node, &delayed_refs->root); 2299 rb_erase(&ref->rb_node, &delayed_refs->root);
2281 delayed_refs->num_entries--; 2300 delayed_refs->num_entries--;
2282 2301 /*
2302 * we modified num_entries, but as we're currently running
2303 * delayed refs, skip
2304 * wake_up(&delayed_refs->seq_wait);
2305 * here.
2306 */
2283 spin_unlock(&delayed_refs->lock); 2307 spin_unlock(&delayed_refs->lock);
2284 2308
2285 ret = run_one_delayed_ref(trans, root, ref, extent_op, 2309 ret = run_one_delayed_ref(trans, root, ref, extent_op,
@@ -2289,13 +2313,34 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2289 btrfs_put_delayed_ref(ref); 2313 btrfs_put_delayed_ref(ref);
2290 kfree(extent_op); 2314 kfree(extent_op);
2291 count++; 2315 count++;
2292 2316next:
2317 do_chunk_alloc(trans, root->fs_info->extent_root,
2318 2 * 1024 * 1024,
2319 btrfs_get_alloc_profile(root, 0),
2320 CHUNK_ALLOC_NO_FORCE);
2293 cond_resched(); 2321 cond_resched();
2294 spin_lock(&delayed_refs->lock); 2322 spin_lock(&delayed_refs->lock);
2295 } 2323 }
2296 return count; 2324 return count;
2297} 2325}
2298 2326
2327
2328static void wait_for_more_refs(struct btrfs_delayed_ref_root *delayed_refs,
2329 unsigned long num_refs)
2330{
2331 struct list_head *first_seq = delayed_refs->seq_head.next;
2332
2333 spin_unlock(&delayed_refs->lock);
2334 pr_debug("waiting for more refs (num %lu, first %p)\n",
2335 num_refs, first_seq);
2336 wait_event(delayed_refs->seq_wait,
2337 num_refs != delayed_refs->num_entries ||
2338 delayed_refs->seq_head.next != first_seq);
2339 pr_debug("done waiting for more refs (num %lu, first %p)\n",
2340 delayed_refs->num_entries, delayed_refs->seq_head.next);
2341 spin_lock(&delayed_refs->lock);
2342}
2343
2299/* 2344/*
2300 * this starts processing the delayed reference count updates and 2345 * this starts processing the delayed reference count updates and
2301 * extent insertions we have queued up so far. count can be 2346 * extent insertions we have queued up so far. count can be
@@ -2311,15 +2356,23 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2311 struct btrfs_delayed_ref_node *ref; 2356 struct btrfs_delayed_ref_node *ref;
2312 struct list_head cluster; 2357 struct list_head cluster;
2313 int ret; 2358 int ret;
2359 u64 delayed_start;
2314 int run_all = count == (unsigned long)-1; 2360 int run_all = count == (unsigned long)-1;
2315 int run_most = 0; 2361 int run_most = 0;
2362 unsigned long num_refs = 0;
2363 int consider_waiting;
2316 2364
2317 if (root == root->fs_info->extent_root) 2365 if (root == root->fs_info->extent_root)
2318 root = root->fs_info->tree_root; 2366 root = root->fs_info->tree_root;
2319 2367
2368 do_chunk_alloc(trans, root->fs_info->extent_root,
2369 2 * 1024 * 1024, btrfs_get_alloc_profile(root, 0),
2370 CHUNK_ALLOC_NO_FORCE);
2371
2320 delayed_refs = &trans->transaction->delayed_refs; 2372 delayed_refs = &trans->transaction->delayed_refs;
2321 INIT_LIST_HEAD(&cluster); 2373 INIT_LIST_HEAD(&cluster);
2322again: 2374again:
2375 consider_waiting = 0;
2323 spin_lock(&delayed_refs->lock); 2376 spin_lock(&delayed_refs->lock);
2324 if (count == 0) { 2377 if (count == 0) {
2325 count = delayed_refs->num_entries * 2; 2378 count = delayed_refs->num_entries * 2;
@@ -2336,11 +2389,35 @@ again:
2336 * of refs to process starting at the first one we are able to 2389 * of refs to process starting at the first one we are able to
2337 * lock 2390 * lock
2338 */ 2391 */
2392 delayed_start = delayed_refs->run_delayed_start;
2339 ret = btrfs_find_ref_cluster(trans, &cluster, 2393 ret = btrfs_find_ref_cluster(trans, &cluster,
2340 delayed_refs->run_delayed_start); 2394 delayed_refs->run_delayed_start);
2341 if (ret) 2395 if (ret)
2342 break; 2396 break;
2343 2397
2398 if (delayed_start >= delayed_refs->run_delayed_start) {
2399 if (consider_waiting == 0) {
2400 /*
2401 * btrfs_find_ref_cluster looped. let's do one
2402 * more cycle. if we don't run any delayed ref
2403 * during that cycle (because all of them are
2404 * blocked) and if the number of
2405 * refs doesn't change, we avoid busy waiting.
2406 */
2407 consider_waiting = 1;
2408 num_refs = delayed_refs->num_entries;
2409 } else {
2410 wait_for_more_refs(delayed_refs, num_refs);
2411 /*
2412 * after waiting, things have changed. we
2413 * dropped the lock and someone else might have
2414 * run some refs, built new clusters and so on.
2415 * therefore, we restart staleness detection.
2416 */
2417 consider_waiting = 0;
2418 }
2419 }
2420
2344 ret = run_clustered_refs(trans, root, &cluster); 2421 ret = run_clustered_refs(trans, root, &cluster);
2345 BUG_ON(ret < 0); 2422 BUG_ON(ret < 0);
2346 2423
@@ -2348,6 +2425,11 @@ again:
2348 2425
2349 if (count == 0) 2426 if (count == 0)
2350 break; 2427 break;
2428
2429 if (ret || delayed_refs->run_delayed_start == 0) {
2430 /* refs were run, let's reset staleness detection */
2431 consider_waiting = 0;
2432 }
2351 } 2433 }
2352 2434
2353 if (run_all) { 2435 if (run_all) {
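Taken together, these hunks make btrfs_run_delayed_refs() detect when btrfs_find_ref_cluster() keeps wrapping around without any ref actually running and, instead of spinning, sleep in wait_for_more_refs() until the entry count or the oldest seq registration changes. A simplified, single-threaded model of the detection logic (illustrative; the real code blocks in wait_event() rather than breaking out of the loop):

#include <stdio.h>

static unsigned long num_entries = 3; /* pretend: every ref stays blocked */

static int find_cluster_wrapped(void) { return 1; } /* search wrapped */
static int ran_some_refs(void)        { return 0; } /* nothing runnable */

int main(void)
{
	int consider_waiting = 0;
	unsigned long remembered = 0;

	for (int cycle = 0; cycle < 10; cycle++) {
		if (find_cluster_wrapped()) {
			if (!consider_waiting) {
				/* first wrap: remember the entry count */
				consider_waiting = 1;
				remembered = num_entries;
			} else if (remembered == num_entries) {
				/* wrapped again with no progress: stop
				 * spinning (the kernel sleeps in
				 * wait_for_more_refs() here) */
				printf("cycle %d: would wait\n", cycle);
				break;
			}
		}
		if (ran_some_refs())
			consider_waiting = 0; /* progress resets detection */
	}
	return 0;
}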
@@ -2405,7 +2487,8 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2405 extent_op->update_key = 0; 2487 extent_op->update_key = 0;
2406 extent_op->is_data = is_data ? 1 : 0; 2488 extent_op->is_data = is_data ? 1 : 0;
2407 2489
2408 ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op); 2490 ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2491 num_bytes, extent_op);
2409 if (ret) 2492 if (ret)
2410 kfree(extent_op); 2493 kfree(extent_op);
2411 return ret; 2494 return ret;
@@ -2590,7 +2673,7 @@ out:
2590static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2673static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2591 struct btrfs_root *root, 2674 struct btrfs_root *root,
2592 struct extent_buffer *buf, 2675 struct extent_buffer *buf,
2593 int full_backref, int inc) 2676 int full_backref, int inc, int for_cow)
2594{ 2677{
2595 u64 bytenr; 2678 u64 bytenr;
2596 u64 num_bytes; 2679 u64 num_bytes;
@@ -2603,7 +2686,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2603 int level; 2686 int level;
2604 int ret = 0; 2687 int ret = 0;
2605 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 2688 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2606 u64, u64, u64, u64, u64, u64); 2689 u64, u64, u64, u64, u64, u64, int);
2607 2690
2608 ref_root = btrfs_header_owner(buf); 2691 ref_root = btrfs_header_owner(buf);
2609 nritems = btrfs_header_nritems(buf); 2692 nritems = btrfs_header_nritems(buf);
@@ -2640,14 +2723,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2640 key.offset -= btrfs_file_extent_offset(buf, fi); 2723 key.offset -= btrfs_file_extent_offset(buf, fi);
2641 ret = process_func(trans, root, bytenr, num_bytes, 2724 ret = process_func(trans, root, bytenr, num_bytes,
2642 parent, ref_root, key.objectid, 2725 parent, ref_root, key.objectid,
2643 key.offset); 2726 key.offset, for_cow);
2644 if (ret) 2727 if (ret)
2645 goto fail; 2728 goto fail;
2646 } else { 2729 } else {
2647 bytenr = btrfs_node_blockptr(buf, i); 2730 bytenr = btrfs_node_blockptr(buf, i);
2648 num_bytes = btrfs_level_size(root, level - 1); 2731 num_bytes = btrfs_level_size(root, level - 1);
2649 ret = process_func(trans, root, bytenr, num_bytes, 2732 ret = process_func(trans, root, bytenr, num_bytes,
2650 parent, ref_root, level - 1, 0); 2733 parent, ref_root, level - 1, 0,
2734 for_cow);
2651 if (ret) 2735 if (ret)
2652 goto fail; 2736 goto fail;
2653 } 2737 }
@@ -2659,15 +2743,15 @@ fail:
2659} 2743}
2660 2744
2661int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2745int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2662 struct extent_buffer *buf, int full_backref) 2746 struct extent_buffer *buf, int full_backref, int for_cow)
2663{ 2747{
2664 return __btrfs_mod_ref(trans, root, buf, full_backref, 1); 2748 return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2665} 2749}
2666 2750
2667int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2751int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2668 struct extent_buffer *buf, int full_backref) 2752 struct extent_buffer *buf, int full_backref, int for_cow)
2669{ 2753{
2670 return __btrfs_mod_ref(trans, root, buf, full_backref, 0); 2754 return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2671} 2755}
2672 2756
2673static int write_one_cache_group(struct btrfs_trans_handle *trans, 2757static int write_one_cache_group(struct btrfs_trans_handle *trans,
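Threading the new for_cow argument through __btrfs_mod_ref() means the process_func pointer type and both functions it can point to (btrfs_inc_extent_ref and btrfs_free_extent) must change in lockstep. A small sketch of that function-pointer pattern, with illustrative names:

#include <stdio.h>

/* The pointer type carries the extra flag, so the flag reaches
 * whichever target is selected at the single call site. */
typedef int (*ref_fn)(unsigned long bytenr, int for_cow);

static int inc_ref(unsigned long bytenr, int for_cow)
{
	printf("inc %lu (for_cow=%d)\n", bytenr, for_cow);
	return 0;
}

static int dec_ref(unsigned long bytenr, int for_cow)
{
	printf("dec %lu (for_cow=%d)\n", bytenr, for_cow);
	return 0;
}

static int mod_ref(int inc, int for_cow)
{
	ref_fn process_func = inc ? inc_ref : dec_ref;
	return process_func(4096, for_cow);
}

int main(void)
{
	mod_ref(1, 0);
	mod_ref(0, 1);
	return 0;
}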
@@ -2993,9 +3077,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2993 INIT_LIST_HEAD(&found->block_groups[i]); 3077 INIT_LIST_HEAD(&found->block_groups[i]);
2994 init_rwsem(&found->groups_sem); 3078 init_rwsem(&found->groups_sem);
2995 spin_lock_init(&found->lock); 3079 spin_lock_init(&found->lock);
2996 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA | 3080 found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
2997 BTRFS_BLOCK_GROUP_SYSTEM |
2998 BTRFS_BLOCK_GROUP_METADATA);
2999 found->total_bytes = total_bytes; 3081 found->total_bytes = total_bytes;
3000 found->disk_total = total_bytes * factor; 3082 found->disk_total = total_bytes * factor;
3001 found->bytes_used = bytes_used; 3083 found->bytes_used = bytes_used;
@@ -3016,20 +3098,27 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3016 3098
3017static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) 3099static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3018{ 3100{
3019 u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | 3101 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3020 BTRFS_BLOCK_GROUP_RAID1 | 3102
3021 BTRFS_BLOCK_GROUP_RAID10 | 3103 /* chunk -> extended profile */
3022 BTRFS_BLOCK_GROUP_DUP); 3104 if (extra_flags == 0)
3023 if (extra_flags) { 3105 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3024 if (flags & BTRFS_BLOCK_GROUP_DATA) 3106
3025 fs_info->avail_data_alloc_bits |= extra_flags; 3107 if (flags & BTRFS_BLOCK_GROUP_DATA)
3026 if (flags & BTRFS_BLOCK_GROUP_METADATA) 3108 fs_info->avail_data_alloc_bits |= extra_flags;
3027 fs_info->avail_metadata_alloc_bits |= extra_flags; 3109 if (flags & BTRFS_BLOCK_GROUP_METADATA)
3028 if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3110 fs_info->avail_metadata_alloc_bits |= extra_flags;
3029 fs_info->avail_system_alloc_bits |= extra_flags; 3111 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3030 } 3112 fs_info->avail_system_alloc_bits |= extra_flags;
3031} 3113}
3032 3114
3115/*
3116 * @flags: available profiles in extended format (see ctree.h)
3117 *
3118 * Returns reduced profile in chunk format. If profile changing is in
3119 * progress (either running or paused), picks the target profile (if it's
3120 * already available), otherwise falls back to plain reducing.
3121 */
3033u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3122u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3034{ 3123{
3035 /* 3124 /*
@@ -3040,6 +3129,34 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3040 u64 num_devices = root->fs_info->fs_devices->rw_devices + 3129 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3041 root->fs_info->fs_devices->missing_devices; 3130 root->fs_info->fs_devices->missing_devices;
3042 3131
3132 /* pick restriper's target profile if it's available */
3133 spin_lock(&root->fs_info->balance_lock);
3134 if (root->fs_info->balance_ctl) {
3135 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3136 u64 tgt = 0;
3137
3138 if ((flags & BTRFS_BLOCK_GROUP_DATA) &&
3139 (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3140 (flags & bctl->data.target)) {
3141 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3142 } else if ((flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
3143 (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3144 (flags & bctl->sys.target)) {
3145 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3146 } else if ((flags & BTRFS_BLOCK_GROUP_METADATA) &&
3147 (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3148 (flags & bctl->meta.target)) {
3149 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3150 }
3151
3152 if (tgt) {
3153 spin_unlock(&root->fs_info->balance_lock);
3154 flags = tgt;
3155 goto out;
3156 }
3157 }
3158 spin_unlock(&root->fs_info->balance_lock);
3159
3043 if (num_devices == 1) 3160 if (num_devices == 1)
3044 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3161 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3045 if (num_devices < 4) 3162 if (num_devices < 4)
@@ -3059,22 +3176,25 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3059 if ((flags & BTRFS_BLOCK_GROUP_RAID0) && 3176 if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3060 ((flags & BTRFS_BLOCK_GROUP_RAID1) | 3177 ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3061 (flags & BTRFS_BLOCK_GROUP_RAID10) | 3178 (flags & BTRFS_BLOCK_GROUP_RAID10) |
3062 (flags & BTRFS_BLOCK_GROUP_DUP))) 3179 (flags & BTRFS_BLOCK_GROUP_DUP))) {
3063 flags &= ~BTRFS_BLOCK_GROUP_RAID0; 3180 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3181 }
3182
3183out:
3184 /* extended -> chunk profile */
3185 flags &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3064 return flags; 3186 return flags;
3065} 3187}
3066 3188
3067static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) 3189static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3068{ 3190{
3069 if (flags & BTRFS_BLOCK_GROUP_DATA) 3191 if (flags & BTRFS_BLOCK_GROUP_DATA)
3070 flags |= root->fs_info->avail_data_alloc_bits & 3192 flags |= root->fs_info->avail_data_alloc_bits;
3071 root->fs_info->data_alloc_profile;
3072 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 3193 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3073 flags |= root->fs_info->avail_system_alloc_bits & 3194 flags |= root->fs_info->avail_system_alloc_bits;
3074 root->fs_info->system_alloc_profile;
3075 else if (flags & BTRFS_BLOCK_GROUP_METADATA) 3195 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3076 flags |= root->fs_info->avail_metadata_alloc_bits & 3196 flags |= root->fs_info->avail_metadata_alloc_bits;
3077 root->fs_info->metadata_alloc_profile; 3197
3078 return btrfs_reduce_alloc_profile(root, flags); 3198 return btrfs_reduce_alloc_profile(root, flags);
3079} 3199}
3080 3200
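In the extended profile format a dedicated bit (BTRFS_AVAIL_ALLOC_BIT_SINGLE) represents the single profile explicitly, so "no profile bits set" stops being ambiguous in avail_*_alloc_bits; converting back to the on-disk chunk format just masks that bit off, as done at the out: label above. A sketch with made-up bit values (not the real btrfs constants):

#include <stdint.h>
#include <stdio.h>

#define BG_RAID0           (1ULL << 3)
#define BG_RAID1           (1ULL << 4)
#define BG_DUP             (1ULL << 5)
#define BG_RAID10          (1ULL << 6)
#define PROFILE_MASK       (BG_RAID0 | BG_RAID1 | BG_DUP | BG_RAID10)
#define AVAIL_ALLOC_SINGLE (1ULL << 48) /* explicit "single" bit */

/* chunk -> extended: make the implicit single profile explicit */
static uint64_t to_extended(uint64_t flags)
{
	if ((flags & PROFILE_MASK) == 0)
		flags |= AVAIL_ALLOC_SINGLE;
	return flags;
}

/* extended -> chunk: the single bit never hits disk */
static uint64_t to_chunk(uint64_t flags)
{
	return flags & ~AVAIL_ALLOC_SINGLE;
}

int main(void)
{
	uint64_t f = to_extended(0); /* a plain single-profile chunk */
	printf("extended: %#llx\n", (unsigned long long)f);
	printf("chunk:    %#llx\n", (unsigned long long)to_chunk(f));
	return 0;
}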
@@ -3191,6 +3311,8 @@ commit_trans:
3191 return -ENOSPC; 3311 return -ENOSPC;
3192 } 3312 }
3193 data_sinfo->bytes_may_use += bytes; 3313 data_sinfo->bytes_may_use += bytes;
3314 trace_btrfs_space_reservation(root->fs_info, "space_info",
3315 (u64)data_sinfo, bytes, 1);
3194 spin_unlock(&data_sinfo->lock); 3316 spin_unlock(&data_sinfo->lock);
3195 3317
3196 return 0; 3318 return 0;
@@ -3210,6 +3332,8 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3210 data_sinfo = BTRFS_I(inode)->space_info; 3332 data_sinfo = BTRFS_I(inode)->space_info;
3211 spin_lock(&data_sinfo->lock); 3333 spin_lock(&data_sinfo->lock);
3212 data_sinfo->bytes_may_use -= bytes; 3334 data_sinfo->bytes_may_use -= bytes;
3335 trace_btrfs_space_reservation(root->fs_info, "space_info",
3336 (u64)data_sinfo, bytes, 0);
3213 spin_unlock(&data_sinfo->lock); 3337 spin_unlock(&data_sinfo->lock);
3214} 3338}
3215 3339
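The trace_btrfs_space_reservation() calls added throughout this patch pair every increment of a reservation counter (reserve = 1) with a matching decrement (reserve = 0), keyed by the space_info pointer, transaction, or inode number, so a leaked reservation shows up as an unmatched event. A userspace stand-in for the idea:

#include <stdint.h>
#include <stdio.h>

/* Logging shim modeling the tracepoint: owner id, byte count, and a
 * reserve/release flag, emitted at every counter modification. */
static void trace_reservation(const char *type, uint64_t id,
			      uint64_t bytes, int reserve)
{
	printf("%s id=%#llx bytes=%llu reserve=%d\n", type,
	       (unsigned long long)id, (unsigned long long)bytes, reserve);
}

struct space_info { uint64_t bytes_may_use; };

static void reserve(struct space_info *si, uint64_t bytes)
{
	si->bytes_may_use += bytes;
	trace_reservation("space_info", (uint64_t)(uintptr_t)si, bytes, 1);
}

static void release(struct space_info *si, uint64_t bytes)
{
	si->bytes_may_use -= bytes;
	trace_reservation("space_info", (uint64_t)(uintptr_t)si, bytes, 0);
}

int main(void)
{
	struct space_info si = { 0 };
	reserve(&si, 4096);
	release(&si, 4096); /* every 1 should pair with a 0 */
	return 0;
}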
@@ -3257,27 +3381,15 @@ static int should_alloc_chunk(struct btrfs_root *root,
3257 if (num_bytes - num_allocated < thresh) 3381 if (num_bytes - num_allocated < thresh)
3258 return 1; 3382 return 1;
3259 } 3383 }
3260
3261 /*
3262 * we have two similar checks here, one based on percentage
3263 * and once based on a hard number of 256MB. The idea
3264 * is that if we have a good amount of free
3265 * room, don't allocate a chunk. A good mount is
3266 * less than 80% utilized of the chunks we have allocated,
3267 * or more than 256MB free
3268 */
3269 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3270 return 0;
3271
3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3273 return 0;
3274
3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy); 3384 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3276 3385
3277 /* 256MB or 5% of the FS */ 3386 /* 256MB or 2% of the FS */
3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3387 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 2));
3388 /* system chunks need a much smaller threshold */
3389 if (sinfo->flags & BTRFS_BLOCK_GROUP_SYSTEM)
3390 thresh = 32 * 1024 * 1024;
3279 3391
3280 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3392 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 8))
3281 return 0; 3393 return 0;
3282 return 1; 3394 return 1;
3283} 3395}
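The rewritten heuristic drops the old free-room checks entirely: a chunk is now allocated eagerly only while the space-info is small (below 256MB or 2% of the filesystem, 32MB for system chunks); past that, allocation waits until the space is roughly 80% used. A compilable model, assuming btrfs's usual div_factor helpers (div_factor(x, 8) is x*8/10 and div_factor_fine(x, 2) is x*2/100):

#include <stdint.h>
#include <stdio.h>

#define MB (1024ULL * 1024)

/* Simplified should_alloc_chunk(): ignores the allocation fast path
 * and keeps only the threshold decision above. */
static int should_alloc(uint64_t total, uint64_t used,
			uint64_t fs_bytes, int is_system)
{
	uint64_t thresh = 256 * MB;

	if (fs_bytes / 50 > thresh) /* 2% of the filesystem */
		thresh = fs_bytes / 50;
	if (is_system)              /* system chunks stay small */
		thresh = 32 * MB;

	if (total > thresh && used < total * 8 / 10)
		return 0; /* big and mostly empty: don't allocate */
	return 1;
}

int main(void)
{
	uint64_t fs = 1024ULL * 1024 * MB; /* 1 TiB filesystem */

	/* 100 GiB data space, 10% used: no new chunk */
	printf("%d\n", should_alloc(100 * 1024 * MB, 10 * 1024 * MB, fs, 0));
	/* same space at 90% used: allocate */
	printf("%d\n", should_alloc(100 * 1024 * MB, 90 * 1024 * MB, fs, 0));
	return 0;
}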
@@ -3291,7 +3403,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3291 int wait_for_alloc = 0; 3403 int wait_for_alloc = 0;
3292 int ret = 0; 3404 int ret = 0;
3293 3405
3294 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3406 BUG_ON(!profile_is_valid(flags, 0));
3295 3407
3296 space_info = __find_space_info(extent_root->fs_info, flags); 3408 space_info = __find_space_info(extent_root->fs_info, flags);
3297 if (!space_info) { 3409 if (!space_info) {
@@ -3303,7 +3415,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3303 3415
3304again: 3416again:
3305 spin_lock(&space_info->lock); 3417 spin_lock(&space_info->lock);
3306 if (space_info->force_alloc) 3418 if (force < space_info->force_alloc)
3307 force = space_info->force_alloc; 3419 force = space_info->force_alloc;
3308 if (space_info->full) { 3420 if (space_info->full) {
3309 spin_unlock(&space_info->lock); 3421 spin_unlock(&space_info->lock);
@@ -3582,6 +3694,10 @@ again:
3582 if (used <= space_info->total_bytes) { 3694 if (used <= space_info->total_bytes) {
3583 if (used + orig_bytes <= space_info->total_bytes) { 3695 if (used + orig_bytes <= space_info->total_bytes) {
3584 space_info->bytes_may_use += orig_bytes; 3696 space_info->bytes_may_use += orig_bytes;
3697 trace_btrfs_space_reservation(root->fs_info,
3698 "space_info",
3699 (u64)space_info,
3700 orig_bytes, 1);
3585 ret = 0; 3701 ret = 0;
3586 } else { 3702 } else {
3587 /* 3703 /*
@@ -3649,6 +3765,10 @@ again:
3649 3765
3650 if (used + num_bytes < space_info->total_bytes + avail) { 3766 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes; 3767 space_info->bytes_may_use += orig_bytes;
3768 trace_btrfs_space_reservation(root->fs_info,
3769 "space_info",
3770 (u64)space_info,
3771 orig_bytes, 1);
3652 ret = 0; 3772 ret = 0;
3653 } else { 3773 } else {
3654 wait_ordered = true; 3774 wait_ordered = true;
@@ -3755,7 +3875,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3755 spin_unlock(&block_rsv->lock); 3875 spin_unlock(&block_rsv->lock);
3756} 3876}
3757 3877
3758static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3878static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
3879 struct btrfs_block_rsv *block_rsv,
3759 struct btrfs_block_rsv *dest, u64 num_bytes) 3880 struct btrfs_block_rsv *dest, u64 num_bytes)
3760{ 3881{
3761 struct btrfs_space_info *space_info = block_rsv->space_info; 3882 struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,6 +3912,9 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3791 if (num_bytes) { 3912 if (num_bytes) {
3792 spin_lock(&space_info->lock); 3913 spin_lock(&space_info->lock);
3793 space_info->bytes_may_use -= num_bytes; 3914 space_info->bytes_may_use -= num_bytes;
3915 trace_btrfs_space_reservation(fs_info, "space_info",
3916 (u64)space_info,
3917 num_bytes, 0);
3794 space_info->reservation_progress++; 3918 space_info->reservation_progress++;
3795 spin_unlock(&space_info->lock); 3919 spin_unlock(&space_info->lock);
3796 } 3920 }
@@ -3947,7 +4071,8 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
3947 if (global_rsv->full || global_rsv == block_rsv || 4071 if (global_rsv->full || global_rsv == block_rsv ||
3948 block_rsv->space_info != global_rsv->space_info) 4072 block_rsv->space_info != global_rsv->space_info)
3949 global_rsv = NULL; 4073 global_rsv = NULL;
3950 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes); 4074 block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4075 num_bytes);
3951} 4076}
3952 4077
3953/* 4078/*
@@ -4006,11 +4131,15 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4006 num_bytes = sinfo->total_bytes - num_bytes; 4131 num_bytes = sinfo->total_bytes - num_bytes;
4007 block_rsv->reserved += num_bytes; 4132 block_rsv->reserved += num_bytes;
4008 sinfo->bytes_may_use += num_bytes; 4133 sinfo->bytes_may_use += num_bytes;
4134 trace_btrfs_space_reservation(fs_info, "space_info",
4135 (u64)sinfo, num_bytes, 1);
4009 } 4136 }
4010 4137
4011 if (block_rsv->reserved >= block_rsv->size) { 4138 if (block_rsv->reserved >= block_rsv->size) {
4012 num_bytes = block_rsv->reserved - block_rsv->size; 4139 num_bytes = block_rsv->reserved - block_rsv->size;
4013 sinfo->bytes_may_use -= num_bytes; 4140 sinfo->bytes_may_use -= num_bytes;
4141 trace_btrfs_space_reservation(fs_info, "space_info",
4142 (u64)sinfo, num_bytes, 0);
4014 sinfo->reservation_progress++; 4143 sinfo->reservation_progress++;
4015 block_rsv->reserved = block_rsv->size; 4144 block_rsv->reserved = block_rsv->size;
4016 block_rsv->full = 1; 4145 block_rsv->full = 1;
@@ -4045,7 +4174,8 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4045 4174
4046static void release_global_block_rsv(struct btrfs_fs_info *fs_info) 4175static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4047{ 4176{
4048 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1); 4177 block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4178 (u64)-1);
4049 WARN_ON(fs_info->delalloc_block_rsv.size > 0); 4179 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4050 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0); 4180 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4051 WARN_ON(fs_info->trans_block_rsv.size > 0); 4181 WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4062,6 +4192,8 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4062 if (!trans->bytes_reserved) 4192 if (!trans->bytes_reserved)
4063 return; 4193 return;
4064 4194
4195 trace_btrfs_space_reservation(root->fs_info, "transaction", (u64)trans,
4196 trans->bytes_reserved, 0);
4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 4197 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4066 trans->bytes_reserved = 0; 4198 trans->bytes_reserved = 0;
4067} 4199}
@@ -4079,6 +4211,8 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4079 * when we are truly done with the orphan item. 4211 * when we are truly done with the orphan item.
4080 */ 4212 */
4081 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4213 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4214 trace_btrfs_space_reservation(root->fs_info, "orphan",
4215 btrfs_ino(inode), num_bytes, 1);
4082 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4216 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4083} 4217}
4084 4218
@@ -4086,6 +4220,8 @@ void btrfs_orphan_release_metadata(struct inode *inode)
4086{ 4220{
4087 struct btrfs_root *root = BTRFS_I(inode)->root; 4221 struct btrfs_root *root = BTRFS_I(inode)->root;
4088 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 4222 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4223 trace_btrfs_space_reservation(root->fs_info, "orphan",
4224 btrfs_ino(inode), num_bytes, 0);
4089 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 4225 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4090} 4226}
4091 4227
@@ -4213,12 +4349,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4213 /* Need to be holding the i_mutex here if we aren't free space cache */ 4349 /* Need to be holding the i_mutex here if we aren't free space cache */
4214 if (btrfs_is_free_space_inode(root, inode)) 4350 if (btrfs_is_free_space_inode(root, inode))
4215 flush = 0; 4351 flush = 0;
4216 else
4217 WARN_ON(!mutex_is_locked(&inode->i_mutex));
4218 4352
4219 if (flush && btrfs_transaction_in_commit(root->fs_info)) 4353 if (flush && btrfs_transaction_in_commit(root->fs_info))
4220 schedule_timeout(1); 4354 schedule_timeout(1);
4221 4355
4356 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4222 num_bytes = ALIGN(num_bytes, root->sectorsize); 4357 num_bytes = ALIGN(num_bytes, root->sectorsize);
4223 4358
4224 spin_lock(&BTRFS_I(inode)->lock); 4359 spin_lock(&BTRFS_I(inode)->lock);
@@ -4266,8 +4401,14 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4266 if (dropped) 4401 if (dropped)
4267 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4402 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4268 4403
4269 if (to_free) 4404 if (to_free) {
4270 btrfs_block_rsv_release(root, block_rsv, to_free); 4405 btrfs_block_rsv_release(root, block_rsv, to_free);
4406 trace_btrfs_space_reservation(root->fs_info,
4407 "delalloc",
4408 btrfs_ino(inode),
4409 to_free, 0);
4410 }
4411 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4271 return ret; 4412 return ret;
4272 } 4413 }
4273 4414
@@ -4278,7 +4419,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4278 } 4419 }
4279 BTRFS_I(inode)->reserved_extents += nr_extents; 4420 BTRFS_I(inode)->reserved_extents += nr_extents;
4280 spin_unlock(&BTRFS_I(inode)->lock); 4421 spin_unlock(&BTRFS_I(inode)->lock);
4422 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4281 4423
4424 if (to_reserve)
4425 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4426 btrfs_ino(inode), to_reserve, 1);
4282 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4427 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4283 4428
4284 return 0; 4429 return 0;
@@ -4308,6 +4453,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4308 if (dropped > 0) 4453 if (dropped > 0)
4309 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4454 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4310 4455
4456 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4457 btrfs_ino(inode), to_free, 0);
4311 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4458 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4312 to_free); 4459 to_free);
4313} 4460}
@@ -4562,7 +4709,10 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4562 cache->reserved += num_bytes; 4709 cache->reserved += num_bytes;
4563 space_info->bytes_reserved += num_bytes; 4710 space_info->bytes_reserved += num_bytes;
4564 if (reserve == RESERVE_ALLOC) { 4711 if (reserve == RESERVE_ALLOC) {
4565 BUG_ON(space_info->bytes_may_use < num_bytes); 4712 trace_btrfs_space_reservation(cache->fs_info,
4713 "space_info",
4714 (u64)space_info,
4715 num_bytes, 0);
4566 space_info->bytes_may_use -= num_bytes; 4716 space_info->bytes_may_use -= num_bytes;
4567 } 4717 }
4568 } 4718 }
@@ -4928,6 +5078,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4928 rb_erase(&head->node.rb_node, &delayed_refs->root); 5078 rb_erase(&head->node.rb_node, &delayed_refs->root);
4929 5079
4930 delayed_refs->num_entries--; 5080 delayed_refs->num_entries--;
5081 if (waitqueue_active(&delayed_refs->seq_wait))
5082 wake_up(&delayed_refs->seq_wait);
4931 5083
4932 /* 5084 /*
4933 * we don't take a ref on the node because we're removing it from the 5085 * we don't take a ref on the node because we're removing it from the
@@ -4955,16 +5107,17 @@ out:
4955void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 5107void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4956 struct btrfs_root *root, 5108 struct btrfs_root *root,
4957 struct extent_buffer *buf, 5109 struct extent_buffer *buf,
4958 u64 parent, int last_ref) 5110 u64 parent, int last_ref, int for_cow)
4959{ 5111{
4960 struct btrfs_block_group_cache *cache = NULL; 5112 struct btrfs_block_group_cache *cache = NULL;
4961 int ret; 5113 int ret;
4962 5114
4963 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 5115 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4964 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len, 5116 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
4965 parent, root->root_key.objectid, 5117 buf->start, buf->len,
4966 btrfs_header_level(buf), 5118 parent, root->root_key.objectid,
4967 BTRFS_DROP_DELAYED_REF, NULL); 5119 btrfs_header_level(buf),
5120 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
4968 BUG_ON(ret); 5121 BUG_ON(ret);
4969 } 5122 }
4970 5123
@@ -4999,12 +5152,12 @@ out:
4999 btrfs_put_block_group(cache); 5152 btrfs_put_block_group(cache);
5000} 5153}
5001 5154
5002int btrfs_free_extent(struct btrfs_trans_handle *trans, 5155int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5003 struct btrfs_root *root, 5156 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5004 u64 bytenr, u64 num_bytes, u64 parent, 5157 u64 owner, u64 offset, int for_cow)
5005 u64 root_objectid, u64 owner, u64 offset)
5006{ 5158{
5007 int ret; 5159 int ret;
5160 struct btrfs_fs_info *fs_info = root->fs_info;
5008 5161
5009 /* 5162 /*
5010 * tree log blocks never actually go into the extent allocation 5163 * tree log blocks never actually go into the extent allocation
@@ -5016,14 +5169,17 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
5016 btrfs_pin_extent(root, bytenr, num_bytes, 1); 5169 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5017 ret = 0; 5170 ret = 0;
5018 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { 5171 } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5019 ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes, 5172 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5173 num_bytes,
5020 parent, root_objectid, (int)owner, 5174 parent, root_objectid, (int)owner,
5021 BTRFS_DROP_DELAYED_REF, NULL); 5175 BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5022 BUG_ON(ret); 5176 BUG_ON(ret);
5023 } else { 5177 } else {
5024 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 5178 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5025 parent, root_objectid, owner, 5179 num_bytes,
5026 offset, BTRFS_DROP_DELAYED_REF, NULL); 5180 parent, root_objectid, owner,
5181 offset, BTRFS_DROP_DELAYED_REF,
5182 NULL, for_cow);
5027 BUG_ON(ret); 5183 BUG_ON(ret);
5028 } 5184 }
5029 return ret; 5185 return ret;
@@ -5146,6 +5302,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5146 ins->objectid = 0; 5302 ins->objectid = 0;
5147 ins->offset = 0; 5303 ins->offset = 0;
5148 5304
5305 trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5306
5149 space_info = __find_space_info(root->fs_info, data); 5307 space_info = __find_space_info(root->fs_info, data);
5150 if (!space_info) { 5308 if (!space_info) {
5151 printk(KERN_ERR "No space info for %llu\n", data); 5309 printk(KERN_ERR "No space info for %llu\n", data);
@@ -5295,15 +5453,6 @@ alloc:
5295 if (unlikely(block_group->ro)) 5453 if (unlikely(block_group->ro))
5296 goto loop; 5454 goto loop;
5297 5455
5298 spin_lock(&block_group->free_space_ctl->tree_lock);
5299 if (cached &&
5300 block_group->free_space_ctl->free_space <
5301 num_bytes + empty_cluster + empty_size) {
5302 spin_unlock(&block_group->free_space_ctl->tree_lock);
5303 goto loop;
5304 }
5305 spin_unlock(&block_group->free_space_ctl->tree_lock);
5306
5307 /* 5456 /*
5308 * Ok we want to try and use the cluster allocator, so 5457 * Ok we want to try and use the cluster allocator, so
5309 * lets look there 5458 * lets look there
@@ -5331,6 +5480,8 @@ alloc:
5331 if (offset) { 5480 if (offset) {
5332 /* we have a block, we're done */ 5481 /* we have a block, we're done */
5333 spin_unlock(&last_ptr->refill_lock); 5482 spin_unlock(&last_ptr->refill_lock);
5483 trace_btrfs_reserve_extent_cluster(root,
5484 block_group, search_start, num_bytes);
5334 goto checks; 5485 goto checks;
5335 } 5486 }
5336 5487
@@ -5349,8 +5500,15 @@ refill_cluster:
5349 * plenty of times and not have found 5500 * plenty of times and not have found
5350 * anything, so we are likely way too 5501 * anything, so we are likely way too
5351 * fragmented for the clustering stuff to find 5502 * fragmented for the clustering stuff to find
5352 * anything. */ 5503 * anything.
5353 if (loop >= LOOP_NO_EMPTY_SIZE) { 5504 *
5505 * However, if the cluster is taken from the
5506 * current block group, release the cluster
5507 * first, so that we stand a better chance of
5508 * succeeding in the unclustered
5509 * allocation. */
5510 if (loop >= LOOP_NO_EMPTY_SIZE &&
5511 last_ptr->block_group != block_group) {
5354 spin_unlock(&last_ptr->refill_lock); 5512 spin_unlock(&last_ptr->refill_lock);
5355 goto unclustered_alloc; 5513 goto unclustered_alloc;
5356 } 5514 }
@@ -5361,6 +5519,11 @@ refill_cluster:
5361 */ 5519 */
5362 btrfs_return_cluster_to_free_space(NULL, last_ptr); 5520 btrfs_return_cluster_to_free_space(NULL, last_ptr);
5363 5521
5522 if (loop >= LOOP_NO_EMPTY_SIZE) {
5523 spin_unlock(&last_ptr->refill_lock);
5524 goto unclustered_alloc;
5525 }
5526
5364 /* allocate a cluster in this block group */ 5527 /* allocate a cluster in this block group */
5365 ret = btrfs_find_space_cluster(trans, root, 5528 ret = btrfs_find_space_cluster(trans, root,
5366 block_group, last_ptr, 5529 block_group, last_ptr,
@@ -5377,6 +5540,9 @@ refill_cluster:
5377 if (offset) { 5540 if (offset) {
5378 /* we found one, proceed */ 5541 /* we found one, proceed */
5379 spin_unlock(&last_ptr->refill_lock); 5542 spin_unlock(&last_ptr->refill_lock);
5543 trace_btrfs_reserve_extent_cluster(root,
5544 block_group, search_start,
5545 num_bytes);
5380 goto checks; 5546 goto checks;
5381 } 5547 }
5382 } else if (!cached && loop > LOOP_CACHING_NOWAIT 5548 } else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,6 +5567,15 @@ refill_cluster:
5401 } 5567 }
5402 5568
5403unclustered_alloc: 5569unclustered_alloc:
5570 spin_lock(&block_group->free_space_ctl->tree_lock);
5571 if (cached &&
5572 block_group->free_space_ctl->free_space <
5573 num_bytes + empty_cluster + empty_size) {
5574 spin_unlock(&block_group->free_space_ctl->tree_lock);
5575 goto loop;
5576 }
5577 spin_unlock(&block_group->free_space_ctl->tree_lock);
5578
5404 offset = btrfs_find_space_for_alloc(block_group, search_start, 5579 offset = btrfs_find_space_for_alloc(block_group, search_start,
5405 num_bytes, empty_size); 5580 num_bytes, empty_size);
5406 /* 5581 /*
@@ -5438,9 +5613,6 @@ checks:
5438 goto loop; 5613 goto loop;
5439 } 5614 }
5440 5615
5441 ins->objectid = search_start;
5442 ins->offset = num_bytes;
5443
5444 if (offset < search_start) 5616 if (offset < search_start)
5445 btrfs_add_free_space(used_block_group, offset, 5617 btrfs_add_free_space(used_block_group, offset,
5446 search_start - offset); 5618 search_start - offset);
@@ -5457,6 +5629,8 @@ checks:
5457 ins->objectid = search_start; 5629 ins->objectid = search_start;
5458 ins->offset = num_bytes; 5630 ins->offset = num_bytes;
5459 5631
5632 trace_btrfs_reserve_extent(orig_root, block_group,
5633 search_start, num_bytes);
5460 if (offset < search_start) 5634 if (offset < search_start)
5461 btrfs_add_free_space(used_block_group, offset, 5635 btrfs_add_free_space(used_block_group, offset,
5462 search_start - offset); 5636 search_start - offset);
@@ -5621,6 +5795,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
5621 u64 search_end, struct btrfs_key *ins, 5795 u64 search_end, struct btrfs_key *ins,
5622 u64 data) 5796 u64 data)
5623{ 5797{
5798 bool final_tried = false;
5624 int ret; 5799 int ret;
5625 u64 search_start = 0; 5800 u64 search_start = 0;
5626 5801
@@ -5640,22 +5815,25 @@ again:
5640 search_start, search_end, hint_byte, 5815 search_start, search_end, hint_byte,
5641 ins, data); 5816 ins, data);
5642 5817
-	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
-		num_bytes = num_bytes >> 1;
-		num_bytes = num_bytes & ~(root->sectorsize - 1);
-		num_bytes = max(num_bytes, min_alloc_size);
-		do_chunk_alloc(trans, root->fs_info->extent_root,
-			       num_bytes, data, CHUNK_ALLOC_FORCE);
-		goto again;
-	}
-	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
-		struct btrfs_space_info *sinfo;
-
-		sinfo = __find_space_info(root->fs_info, data);
-		printk(KERN_ERR "btrfs allocation failed flags %llu, "
-		       "wanted %llu\n", (unsigned long long)data,
-		       (unsigned long long)num_bytes);
-		dump_space_info(sinfo, num_bytes, 1);
+	if (ret == -ENOSPC) {
+		if (!final_tried) {
+			num_bytes = num_bytes >> 1;
+			num_bytes = num_bytes & ~(root->sectorsize - 1);
+			num_bytes = max(num_bytes, min_alloc_size);
+			do_chunk_alloc(trans, root->fs_info->extent_root,
+				       num_bytes, data, CHUNK_ALLOC_FORCE);
+			if (num_bytes == min_alloc_size)
+				final_tried = true;
+			goto again;
+		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+			struct btrfs_space_info *sinfo;
+
+			sinfo = __find_space_info(root->fs_info, data);
+			printk(KERN_ERR "btrfs allocation failed flags %llu, "
+			       "wanted %llu\n", (unsigned long long)data,
+			       (unsigned long long)num_bytes);
+			dump_space_info(sinfo, num_bytes, 1);
+		}
	}
5661 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); 5839 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
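The hunk above replaces an unbounded shrink-and-retry with an explicit terminating loop. A minimal user-space C sketch of that policy (a simplified model, not the kernel code; the constant -28 stands in for -ENOSPC and every attempt is faked to fail):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

/* Model of the reworked -ENOSPC retry: halve the request (rounded
 * down to the sector size, floored at the minimum) and remember via
 * final_tried that the minimum-sized attempt is the last one, so the
 * loop is guaranteed to terminate. */
static int reserve_extent(uint64_t num_bytes, uint64_t min_alloc_size,
                          uint64_t sectorsize)
{
    bool final_tried = false;

    for (;;) {
        int ret = -28;    /* pretend every attempt returns -ENOSPC */

        printf("trying %llu bytes\n", (unsigned long long)num_bytes);
        if (ret != -28)
            return ret;
        if (final_tried)
            return ret;               /* minimum already attempted */

        num_bytes = (num_bytes >> 1) & ~(sectorsize - 1);
        if (num_bytes < min_alloc_size)
            num_bytes = min_alloc_size;
        if (num_bytes == min_alloc_size)
            final_tried = true;
    }
}

int main(void)
{
    reserve_extent(1 << 20, 4096, 4096);    /* 1 MiB down to 4 KiB */
    return 0;
}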
@@ -5842,9 +6020,10 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5842 6020
5843 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); 6021 BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
5844 6022
-	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
-					 0, root_objectid, owner, offset,
-					 BTRFS_ADD_DELAYED_EXTENT, NULL);
+	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+					 ins->offset, 0,
+					 root_objectid, owner, offset,
+					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
5848 return ret; 6027 return ret;
5849} 6028}
5850 6029
@@ -5997,10 +6176,11 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5997 return ERR_PTR(-ENOSPC); 6176 return ERR_PTR(-ENOSPC);
5998} 6177}
5999 6178
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
 	block_rsv_add_bytes(block_rsv, blocksize, 0);
-	block_rsv_release_bytes(block_rsv, NULL, 0);
+	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
 }
6004} 6184}
6005 6185
6006/* 6186/*
@@ -6014,7 +6194,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6014 struct btrfs_root *root, u32 blocksize, 6194 struct btrfs_root *root, u32 blocksize,
6015 u64 parent, u64 root_objectid, 6195 u64 parent, u64 root_objectid,
6016 struct btrfs_disk_key *key, int level, 6196 struct btrfs_disk_key *key, int level,
-			u64 hint, u64 empty_size)
+			u64 hint, u64 empty_size, int for_cow)
6018{ 6198{
6019 struct btrfs_key ins; 6199 struct btrfs_key ins;
6020 struct btrfs_block_rsv *block_rsv; 6200 struct btrfs_block_rsv *block_rsv;
@@ -6030,7 +6210,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6030 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize, 6210 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6031 empty_size, hint, (u64)-1, &ins, 0); 6211 empty_size, hint, (u64)-1, &ins, 0);
6032 if (ret) { 6212 if (ret) {
-		unuse_block_rsv(block_rsv, blocksize);
+		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6034 return ERR_PTR(ret); 6214 return ERR_PTR(ret);
6035 } 6215 }
6036 6216
@@ -6058,10 +6238,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6058 extent_op->update_flags = 1; 6238 extent_op->update_flags = 1;
6059 extent_op->is_data = 0; 6239 extent_op->is_data = 0;
6060 6240
-		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
-						 ins.offset, parent, root_objectid,
-						 level, BTRFS_ADD_DELAYED_EXTENT,
-						 extent_op);
+		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+						 ins.objectid,
+						 ins.offset, parent, root_objectid,
+						 level, BTRFS_ADD_DELAYED_EXTENT,
+						 extent_op, for_cow);
6065 BUG_ON(ret); 6246 BUG_ON(ret);
6066 } 6247 }
6067 return buf; 6248 return buf;
@@ -6078,6 +6259,7 @@ struct walk_control {
6078 int keep_locks; 6259 int keep_locks;
6079 int reada_slot; 6260 int reada_slot;
6080 int reada_count; 6261 int reada_count;
6262 int for_reloc;
6081}; 6263};
6082 6264
6083#define DROP_REFERENCE 1 6265#define DROP_REFERENCE 1
@@ -6216,9 +6398,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6216 /* wc->stage == UPDATE_BACKREF */ 6398 /* wc->stage == UPDATE_BACKREF */
6217 if (!(wc->flags[level] & flag)) { 6399 if (!(wc->flags[level] & flag)) {
6218 BUG_ON(!path->locks[level]); 6400 BUG_ON(!path->locks[level]);
-		ret = btrfs_inc_ref(trans, root, eb, 1);
+		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
 		BUG_ON(ret);
-		ret = btrfs_dec_ref(trans, root, eb, 0);
+		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6222 BUG_ON(ret); 6404 BUG_ON(ret);
6223 ret = btrfs_set_disk_extent_flags(trans, root, eb->start, 6405 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6224 eb->len, flag, 0); 6406 eb->len, flag, 0);
@@ -6362,7 +6544,7 @@ skip:
6362 } 6544 }
6363 6545
 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-					root->root_key.objectid, level - 1, 0);
+					root->root_key.objectid, level - 1, 0, 0);
6366 BUG_ON(ret); 6548 BUG_ON(ret);
6367 } 6549 }
6368 btrfs_tree_unlock(next); 6550 btrfs_tree_unlock(next);
@@ -6436,9 +6618,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6436 if (wc->refs[level] == 1) { 6618 if (wc->refs[level] == 1) {
6437 if (level == 0) { 6619 if (level == 0) {
6438 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) 6620 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-				ret = btrfs_dec_ref(trans, root, eb, 1);
+				ret = btrfs_dec_ref(trans, root, eb, 1,
+						    wc->for_reloc);
 			else
-				ret = btrfs_dec_ref(trans, root, eb, 0);
+				ret = btrfs_dec_ref(trans, root, eb, 0,
+						    wc->for_reloc);
6442 BUG_ON(ret); 6626 BUG_ON(ret);
6443 } 6627 }
6444 /* make block locked assertion in clean_tree_block happy */ 6628 /* make block locked assertion in clean_tree_block happy */
@@ -6465,7 +6649,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6465 btrfs_header_owner(path->nodes[level + 1])); 6649 btrfs_header_owner(path->nodes[level + 1]));
6466 } 6650 }
6467 6651
-	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
+	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
6469out: 6653out:
6470 wc->refs[level] = 0; 6654 wc->refs[level] = 0;
6471 wc->flags[level] = 0; 6655 wc->flags[level] = 0;
@@ -6549,7 +6733,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6549 * blocks are properly updated. 6733 * blocks are properly updated.
6550 */ 6734 */
 void btrfs_drop_snapshot(struct btrfs_root *root,
-			 struct btrfs_block_rsv *block_rsv, int update_ref)
+			 struct btrfs_block_rsv *block_rsv, int update_ref,
+			 int for_reloc)
6553{ 6738{
6554 struct btrfs_path *path; 6739 struct btrfs_path *path;
6555 struct btrfs_trans_handle *trans; 6740 struct btrfs_trans_handle *trans;
@@ -6637,6 +6822,7 @@ void btrfs_drop_snapshot(struct btrfs_root *root,
6637 wc->stage = DROP_REFERENCE; 6822 wc->stage = DROP_REFERENCE;
6638 wc->update_ref = update_ref; 6823 wc->update_ref = update_ref;
6639 wc->keep_locks = 0; 6824 wc->keep_locks = 0;
6825 wc->for_reloc = for_reloc;
6640 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6826 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6641 6827
6642 while (1) { 6828 while (1) {
@@ -6721,6 +6907,7 @@ out:
6721 * drop subtree rooted at tree block 'node'. 6907 * drop subtree rooted at tree block 'node'.
6722 * 6908 *
6723 * NOTE: this function will unlock and release tree block 'node' 6909 * NOTE: this function will unlock and release tree block 'node'
6910 * only used by relocation code
6724 */ 6911 */
6725int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 6912int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6726 struct btrfs_root *root, 6913 struct btrfs_root *root,
@@ -6765,6 +6952,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6765 wc->stage = DROP_REFERENCE; 6952 wc->stage = DROP_REFERENCE;
6766 wc->update_ref = 0; 6953 wc->update_ref = 0;
6767 wc->keep_locks = 1; 6954 wc->keep_locks = 1;
6955 wc->for_reloc = 1;
6768 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); 6956 wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
6769 6957
6770 while (1) { 6958 while (1) {
@@ -6792,6 +6980,29 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6792 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 6980 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
6793 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 6981 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
6794 6982
6983 if (root->fs_info->balance_ctl) {
6984 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
6985 u64 tgt = 0;
6986
6987 /* pick restriper's target profile and return */
6988 if (flags & BTRFS_BLOCK_GROUP_DATA &&
6989 bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6990 tgt = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
6991 } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
6992 bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6993 tgt = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
6994 } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
6995 bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
6996 tgt = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
6997 }
6998
6999 if (tgt) {
7000 /* extended -> chunk profile */
7001 tgt &= ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
7002 return tgt;
7003 }
7004 }
7005
6795 /* 7006 /*
6796 * we add in the count of missing devices because we want 7007 * we add in the count of missing devices because we want
6797 * to make sure that any RAID levels on a degraded FS 7008 * to make sure that any RAID levels on a degraded FS
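The preceding hunk short-circuits the degraded-RAID heuristics while a balance ("restriper") conversion is running. A user-space C model of that selection (the flag values below are invented for illustration; only the selection logic mirrors the hunk):

#include <stdint.h>
#include <stdio.h>

/* Illustrative flag values; the real BTRFS_* constants differ. */
#define BG_DATA      (1ULL << 0)
#define BG_SYSTEM    (1ULL << 1)
#define BG_METADATA  (1ULL << 2)
#define ARGS_CONVERT (1ULL << 8)
#define AVAIL_SINGLE (1ULL << 48)

struct balance_args { uint64_t flags, target; };
struct balance_ctl  { struct balance_args data, sys, meta; };

/* While a conversion is requested for this chunk type, new chunks
 * must use the conversion target.  The "extended" single bit is
 * masked off to map the value back to a chunk-format profile. */
static uint64_t pick_target(struct balance_ctl *bctl, uint64_t flags)
{
    uint64_t tgt = 0;

    if ((flags & BG_DATA) && (bctl->data.flags & ARGS_CONVERT))
        tgt = BG_DATA | bctl->data.target;
    else if ((flags & BG_SYSTEM) && (bctl->sys.flags & ARGS_CONVERT))
        tgt = BG_SYSTEM | bctl->sys.target;
    else if ((flags & BG_METADATA) && (bctl->meta.flags & ARGS_CONVERT))
        tgt = BG_METADATA | bctl->meta.target;

    return tgt & ~AVAIL_SINGLE;    /* extended -> chunk profile */
}

int main(void)
{
    struct balance_ctl bctl = { .data = { ARGS_CONVERT, 1ULL << 3 } };
    printf("0x%llx\n", (unsigned long long)pick_target(&bctl, BG_DATA));
    return 0;
}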
@@ -7085,7 +7296,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7085 * space to fit our block group in. 7296 * space to fit our block group in.
7086 */ 7297 */
7087 if (device->total_bytes > device->bytes_used + min_free) { 7298 if (device->total_bytes > device->bytes_used + min_free) {
-			ret = find_free_dev_extent(NULL, device, min_free,
+			ret = find_free_dev_extent(device, min_free,
 						   &dev_offset, NULL);
7090 if (!ret) 7301 if (!ret)
7091 dev_nr++; 7302 dev_nr++;
@@ -7447,6 +7658,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7447 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, 7658 ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7448 &cache->space_info); 7659 &cache->space_info);
7449 BUG_ON(ret); 7660 BUG_ON(ret);
7661 update_global_block_rsv(root->fs_info);
7450 7662
7451 spin_lock(&cache->space_info->lock); 7663 spin_lock(&cache->space_info->lock);
7452 cache->space_info->bytes_readonly += cache->bytes_super; 7664 cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7466,6 +7678,22 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7466 return 0; 7678 return 0;
7467} 7679}
7468 7680
7681static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7682{
7683 u64 extra_flags = flags & BTRFS_BLOCK_GROUP_PROFILE_MASK;
7684
7685 /* chunk -> extended profile */
7686 if (extra_flags == 0)
7687 extra_flags = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
7688
7689 if (flags & BTRFS_BLOCK_GROUP_DATA)
7690 fs_info->avail_data_alloc_bits &= ~extra_flags;
7691 if (flags & BTRFS_BLOCK_GROUP_METADATA)
7692 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7693 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7694 fs_info->avail_system_alloc_bits &= ~extra_flags;
7695}
7696
7469int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 7697int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7470 struct btrfs_root *root, u64 group_start) 7698 struct btrfs_root *root, u64 group_start)
7471{ 7699{
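The new clear_avail_alloc_bits() above does the inverse mapping of the restriper hunk: a chunk-format profile of 0 means "single", which the extended format tracks as an explicit bit. A small C model (mask values are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the kernel's BTRFS_* masks differ. */
#define PROFILE_MASK  0x00ffULL        /* RAID0/1/10/DUP bits  */
#define AVAIL_SINGLE  (1ULL << 48)     /* "single" extended bit */

/* When the last block group of a kind disappears, drop its profile
 * bit from the per-type availability mask; profile 0 is remapped to
 * the explicit "single" bit first (chunk -> extended profile). */
static void clear_avail(uint64_t *avail, uint64_t flags)
{
    uint64_t extra = flags & PROFILE_MASK;

    if (extra == 0)
        extra = AVAIL_SINGLE;

    *avail &= ~extra;
}

int main(void)
{
    uint64_t avail_data = AVAIL_SINGLE | 0x1;

    clear_avail(&avail_data, 0);    /* removing a "single" group */
    printf("avail=0x%llx\n", (unsigned long long)avail_data);
    return 0;
}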
@@ -7476,6 +7704,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7476 struct btrfs_key key; 7704 struct btrfs_key key;
7477 struct inode *inode; 7705 struct inode *inode;
7478 int ret; 7706 int ret;
7707 int index;
7479 int factor; 7708 int factor;
7480 7709
7481 root = root->fs_info->extent_root; 7710 root = root->fs_info->extent_root;
@@ -7491,6 +7720,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7491 free_excluded_extents(root, block_group); 7720 free_excluded_extents(root, block_group);
7492 7721
7493 memcpy(&key, &block_group->key, sizeof(key)); 7722 memcpy(&key, &block_group->key, sizeof(key));
7723 index = get_block_group_index(block_group);
7494 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 7724 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7495 BTRFS_BLOCK_GROUP_RAID1 | 7725 BTRFS_BLOCK_GROUP_RAID1 |
7496 BTRFS_BLOCK_GROUP_RAID10)) 7726 BTRFS_BLOCK_GROUP_RAID10))
@@ -7565,6 +7795,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7565 * are still on the list after taking the semaphore 7795 * are still on the list after taking the semaphore
7566 */ 7796 */
7567 list_del_init(&block_group->list); 7797 list_del_init(&block_group->list);
7798 if (list_empty(&block_group->space_info->block_groups[index]))
7799 clear_avail_alloc_bits(root->fs_info, block_group->flags);
7568 up_write(&block_group->space_info->groups_sem); 7800 up_write(&block_group->space_info->groups_sem);
7569 7801
7570 if (block_group->cached == BTRFS_CACHE_STARTED) 7802 if (block_group->cached == BTRFS_CACHE_STARTED)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 49f3c9dc09f4..fcf77e1ded40 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -18,6 +18,7 @@
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h" 20#include "volumes.h"
21#include "check-integrity.h"
21 22
22static struct kmem_cache *extent_state_cache; 23static struct kmem_cache *extent_state_cache;
23static struct kmem_cache *extent_buffer_cache; 24static struct kmem_cache *extent_buffer_cache;
@@ -1895,7 +1896,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1895 } 1896 }
1896 bio->bi_bdev = dev->bdev; 1897 bio->bi_bdev = dev->bdev;
1897 bio_add_page(bio, page, length, start-page_offset(page)); 1898 bio_add_page(bio, page, length, start-page_offset(page));
-	submit_bio(WRITE_SYNC, bio);
+	btrfsic_submit_bio(WRITE_SYNC, bio);
1899 wait_for_completion(&compl); 1900 wait_for_completion(&compl);
1900 1901
1901 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 1902 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
@@ -2393,7 +2394,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
2393 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 2394 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2394 mirror_num, bio_flags, start); 2395 mirror_num, bio_flags, start);
2395 else 2396 else
-		submit_bio(rw, bio);
+		btrfsic_submit_bio(rw, bio);
2397 2398
2398 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2399 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2399 ret = -EOPNOTSUPP; 2400 ret = -EOPNOTSUPP;
@@ -3579,6 +3580,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3579 atomic_set(&eb->blocking_writers, 0); 3580 atomic_set(&eb->blocking_writers, 0);
3580 atomic_set(&eb->spinning_readers, 0); 3581 atomic_set(&eb->spinning_readers, 0);
3581 atomic_set(&eb->spinning_writers, 0); 3582 atomic_set(&eb->spinning_writers, 0);
3583 eb->lock_nested = 0;
3582 init_waitqueue_head(&eb->write_lock_wq); 3584 init_waitqueue_head(&eb->write_lock_wq);
3583 init_waitqueue_head(&eb->read_lock_wq); 3585 init_waitqueue_head(&eb->read_lock_wq);
3584 3586
@@ -3907,6 +3909,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3907 while (start <= end) { 3909 while (start <= end) {
3908 index = start >> PAGE_CACHE_SHIFT; 3910 index = start >> PAGE_CACHE_SHIFT;
3909 page = find_get_page(tree->mapping, index); 3911 page = find_get_page(tree->mapping, index);
3912 if (!page)
3913 return 1;
3910 uptodate = PageUptodate(page); 3914 uptodate = PageUptodate(page);
3911 page_cache_release(page); 3915 page_cache_release(page);
3912 if (!uptodate) { 3916 if (!uptodate) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7604c3001322..bc6a042cb6fc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -129,6 +129,7 @@ struct extent_buffer {
129 struct list_head leak_list; 129 struct list_head leak_list;
130 struct rcu_head rcu_head; 130 struct rcu_head rcu_head;
131 atomic_t refs; 131 atomic_t refs;
132 pid_t lock_owner;
132 133
133 /* count of read lock holders on the extent buffer */ 134 /* count of read lock holders on the extent buffer */
134 atomic_t write_locks; 135 atomic_t write_locks;
@@ -137,6 +138,7 @@ struct extent_buffer {
137 atomic_t blocking_readers; 138 atomic_t blocking_readers;
138 atomic_t spinning_readers; 139 atomic_t spinning_readers;
139 atomic_t spinning_writers; 140 atomic_t spinning_writers;
141 int lock_nested;
140 142
141 /* protects write locks */ 143 /* protects write locks */
142 rwlock_t lock; 144 rwlock_t lock;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97fbe939c050..859ba2dd8890 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -678,7 +678,7 @@ next_slot:
678 disk_bytenr, num_bytes, 0, 678 disk_bytenr, num_bytes, 0,
679 root->root_key.objectid, 679 root->root_key.objectid,
680 new_key.objectid, 680 new_key.objectid,
-						start - extent_offset);
+						start - extent_offset, 0);
682 BUG_ON(ret); 682 BUG_ON(ret);
683 *hint_byte = disk_bytenr; 683 *hint_byte = disk_bytenr;
684 } 684 }
@@ -753,7 +753,7 @@ next_slot:
753 disk_bytenr, num_bytes, 0, 753 disk_bytenr, num_bytes, 0,
754 root->root_key.objectid, 754 root->root_key.objectid,
755 key.objectid, key.offset - 755 key.objectid, key.offset -
-						extent_offset);
+						extent_offset, 0);
757 BUG_ON(ret); 757 BUG_ON(ret);
758 inode_sub_bytes(inode, 758 inode_sub_bytes(inode,
759 extent_end - key.offset); 759 extent_end - key.offset);
@@ -962,7 +962,7 @@ again:
962 962
963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 963 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
964 root->root_key.objectid, 964 root->root_key.objectid,
-				   ino, orig_offset);
+				   ino, orig_offset, 0);
966 BUG_ON(ret); 966 BUG_ON(ret);
967 967
968 if (split == start) { 968 if (split == start) {
@@ -989,7 +989,7 @@ again:
989 del_nr++; 989 del_nr++;
990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 990 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
991 0, root->root_key.objectid, 991 0, root->root_key.objectid,
-					0, root->root_key.objectid,
-					ino, orig_offset);
+					0, root->root_key.objectid,
+					ino, orig_offset, 0);
993 BUG_ON(ret); 993 BUG_ON(ret);
994 } 994 }
995 other_start = 0; 995 other_start = 0;
@@ -1006,7 +1006,7 @@ again:
1006 del_nr++; 1006 del_nr++;
1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1007 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1008 0, root->root_key.objectid, 1008 0, root->root_key.objectid,
-					0, root->root_key.objectid,
-					ino, orig_offset);
+					0, root->root_key.objectid,
+					ino, orig_offset, 0);
1010 BUG_ON(ret); 1010 BUG_ON(ret);
1011 } 1011 }
1012 if (del_nr == 0) { 1012 if (del_nr == 0) {
@@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1081again: 1081again:
1082 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1083 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
-					       mask);
+					       mask | __GFP_WRITE);
1085 if (!pages[i]) { 1085 if (!pages[i]) {
1086 faili = i - 1; 1086 faili = i - 1;
1087 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1136,7 +1136,8 @@ again:
1136 GFP_NOFS); 1136 GFP_NOFS);
1137 } 1137 }
1138 for (i = 0; i < num_pages; i++) { 1138 for (i = 0; i < num_pages; i++) {
-		clear_page_dirty_for_io(pages[i]);
+		if (clear_page_dirty_for_io(pages[i]))
+			account_page_redirty(pages[i]);
1140 set_page_extent_mapped(pages[i]); 1141 set_page_extent_mapped(pages[i]);
1141 WARN_ON(!PageLocked(pages[i])); 1142 WARN_ON(!PageLocked(pages[i]));
1142 } 1143 }
@@ -1273,7 +1274,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1273 dirty_pages); 1274 dirty_pages);
1274 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1275 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1275 btrfs_btree_balance_dirty(root, 1); 1276 btrfs_btree_balance_dirty(root, 1);
1276 btrfs_throttle(root);
1277 1277
1278 pos += copied; 1278 pos += copied;
1279 num_written += copied; 1279 num_written += copied;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9a897bf79538..c2f20594c9f7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -319,9 +319,11 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
319 io_ctl_unmap_page(io_ctl); 319 io_ctl_unmap_page(io_ctl);
320 320
321 for (i = 0; i < io_ctl->num_pages; i++) { 321 for (i = 0; i < io_ctl->num_pages; i++) {
-		ClearPageChecked(io_ctl->pages[i]);
-		unlock_page(io_ctl->pages[i]);
-		page_cache_release(io_ctl->pages[i]);
+		if (io_ctl->pages[i]) {
+			ClearPageChecked(io_ctl->pages[i]);
+			unlock_page(io_ctl->pages[i]);
+			page_cache_release(io_ctl->pages[i]);
+		}
325 } 327 }
326} 328}
327 329
@@ -635,7 +637,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
635 if (!num_entries) 637 if (!num_entries)
636 return 0; 638 return 0;
637 639
-	io_ctl_init(&io_ctl, inode, root);
+	ret = io_ctl_init(&io_ctl, inode, root);
+	if (ret)
+		return ret;
+
639 ret = readahead_cache(inode); 644 ret = readahead_cache(inode);
640 if (ret) 645 if (ret)
641 goto out; 646 goto out;
@@ -838,7 +843,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
838 struct io_ctl io_ctl; 843 struct io_ctl io_ctl;
839 struct list_head bitmap_list; 844 struct list_head bitmap_list;
840 struct btrfs_key key; 845 struct btrfs_key key;
-	u64 start, end, len;
+	u64 start, extent_start, extent_end, len;
842 int entries = 0; 847 int entries = 0;
843 int bitmaps = 0; 848 int bitmaps = 0;
844 int ret; 849 int ret;
@@ -849,7 +854,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
849 if (!i_size_read(inode)) 854 if (!i_size_read(inode))
850 return -1; 855 return -1;
851 856
-	io_ctl_init(&io_ctl, inode, root);
+	ret = io_ctl_init(&io_ctl, inode, root);
+	if (ret)
+		return -1;
853 860
854 /* Get the cluster for this block_group if it exists */ 861 /* Get the cluster for this block_group if it exists */
855 if (block_group && !list_empty(&block_group->cluster_list)) 862 if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,25 +864,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
857 struct btrfs_free_cluster, 864 struct btrfs_free_cluster,
858 block_group_list); 865 block_group_list);
859 866
860 /*
861 * We shouldn't have switched the pinned extents yet so this is the
862 * right one
863 */
864 unpin = root->fs_info->pinned_extents;
865
866 /* Lock all pages first so we can lock the extent safely. */ 867 /* Lock all pages first so we can lock the extent safely. */
867 io_ctl_prepare_pages(&io_ctl, inode, 0); 868 io_ctl_prepare_pages(&io_ctl, inode, 0);
868 869
869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 870 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
870 0, &cached_state, GFP_NOFS); 871 0, &cached_state, GFP_NOFS);
871 872
872 /*
873 * When searching for pinned extents, we need to start at our start
874 * offset.
875 */
876 if (block_group)
877 start = block_group->key.objectid;
878
879 node = rb_first(&ctl->free_space_offset); 873 node = rb_first(&ctl->free_space_offset);
880 if (!node && cluster) { 874 if (!node && cluster) {
881 node = rb_first(&cluster->root); 875 node = rb_first(&cluster->root);
@@ -918,9 +912,20 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
918 * We want to add any pinned extents to our free space cache 912 * We want to add any pinned extents to our free space cache
919 * so we don't leak the space 913 * so we don't leak the space
920 */ 914 */
915
916 /*
917 * We shouldn't have switched the pinned extents yet so this is the
918 * right one
919 */
920 unpin = root->fs_info->pinned_extents;
921
922 if (block_group)
923 start = block_group->key.objectid;
924
921 while (block_group && (start < block_group->key.objectid + 925 while (block_group && (start < block_group->key.objectid +
922 block_group->key.offset)) { 926 block_group->key.offset)) {
-		ret = find_first_extent_bit(unpin, start, &start, &end,
+		ret = find_first_extent_bit(unpin, start,
+					    &extent_start, &extent_end,
 					    EXTENT_DIRTY);
925 if (ret) { 930 if (ret) {
926 ret = 0; 931 ret = 0;
@@ -928,20 +933,21 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
928 } 933 }
929 934
930 /* This pinned extent is out of our range */ 935 /* This pinned extent is out of our range */
-		if (start >= block_group->key.objectid +
+		if (extent_start >= block_group->key.objectid +
 		    block_group->key.offset)
 			break;
 
-		len = block_group->key.objectid +
-		      block_group->key.offset - start;
-		len = min(len, end + 1 - start);
+		extent_start = max(extent_start, start);
+		extent_end = min(block_group->key.objectid +
+				 block_group->key.offset, extent_end + 1);
+		len = extent_end - extent_start;
 
 		entries++;
-		ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+		ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
 		if (ret)
 			goto out_nospc;
 
-		start = end + 1;
+		start = extent_end;
945 } 951 }
946 952
947 /* Write out the bitmaps */ 953 /* Write out the bitmaps */
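The hunk above fixes the clamping of pinned extents against the block group. A self-contained C sketch of just that arithmetic (a model of the fix, not the kernel code; the helper name is invented):

#include <stdint.h>
#include <stdio.h>

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* A pinned extent returned by the search may begin before the block
 * group (or before the current cursor) and may end past it, so both
 * ends are clamped before the entry is written out.  Previously the
 * unclamped start leaked foreign ranges into the cache. */
static uint64_t clamp_pinned(uint64_t bg_start, uint64_t bg_len,
                             uint64_t cursor,
                             uint64_t extent_start, uint64_t extent_end)
{
    uint64_t start = max64(extent_start, cursor);
    uint64_t end = min64(bg_start + bg_len, extent_end + 1);

    return end > start ? end - start : 0;    /* bytes to record */
}

int main(void)
{
    /* extent [90,140] against block group [100,164): 41 bytes */
    printf("%llu\n", (unsigned long long)
           clamp_pinned(100, 64, 100, 90, 140));
    return 0;
}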
@@ -2236,7 +2242,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2236 if (entry->bitmap) { 2242 if (entry->bitmap) {
2237 ret = btrfs_alloc_from_bitmap(block_group, 2243 ret = btrfs_alloc_from_bitmap(block_group,
2238 cluster, entry, bytes, 2244 cluster, entry, bytes,
-						      min_start);
+						      cluster->window_start);
2240 if (ret == 0) { 2246 if (ret == 0) {
2241 node = rb_next(&entry->offset_index); 2247 node = rb_next(&entry->offset_index);
2242 if (!node) 2248 if (!node)
@@ -2245,6 +2251,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2245 offset_index); 2251 offset_index);
2246 continue; 2252 continue;
2247 } 2253 }
2254 cluster->window_start += bytes;
2248 } else { 2255 } else {
2249 ret = entry->offset; 2256 ret = entry->offset;
2250 2257
@@ -2283,23 +2290,23 @@ out:
2283static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, 2290static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2284 struct btrfs_free_space *entry, 2291 struct btrfs_free_space *entry,
2285 struct btrfs_free_cluster *cluster, 2292 struct btrfs_free_cluster *cluster,
-			       u64 offset, u64 bytes, u64 min_bytes)
+			       u64 offset, u64 bytes,
+			       u64 cont1_bytes, u64 min_bytes)
2287{ 2295{
2288 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2296 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2289 unsigned long next_zero; 2297 unsigned long next_zero;
2290 unsigned long i; 2298 unsigned long i;
-	unsigned long search_bits;
-	unsigned long total_bits;
+	unsigned long want_bits;
+	unsigned long min_bits;
2293 unsigned long found_bits; 2301 unsigned long found_bits;
2294 unsigned long start = 0; 2302 unsigned long start = 0;
2295 unsigned long total_found = 0; 2303 unsigned long total_found = 0;
2296 int ret; 2304 int ret;
2297 bool found = false;
2298 2305
2299 i = offset_to_bit(entry->offset, block_group->sectorsize, 2306 i = offset_to_bit(entry->offset, block_group->sectorsize,
2300 max_t(u64, offset, entry->offset)); 2307 max_t(u64, offset, entry->offset));
-	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
-	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+	want_bits = bytes_to_bits(bytes, block_group->sectorsize);
+	min_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
2303 2310
2304again: 2311again:
2305 found_bits = 0; 2312 found_bits = 0;
@@ -2308,7 +2315,7 @@ again:
2308 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { 2315 i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
2309 next_zero = find_next_zero_bit(entry->bitmap, 2316 next_zero = find_next_zero_bit(entry->bitmap,
2310 BITS_PER_BITMAP, i); 2317 BITS_PER_BITMAP, i);
-		if (next_zero - i >= search_bits) {
+		if (next_zero - i >= min_bits) {
2312 found_bits = next_zero - i; 2319 found_bits = next_zero - i;
2313 break; 2320 break;
2314 } 2321 }
@@ -2318,10 +2325,9 @@ again:
2318 if (!found_bits) 2325 if (!found_bits)
2319 return -ENOSPC; 2326 return -ENOSPC;
2320 2327
-	if (!found) {
+	if (!total_found) {
2322 start = i; 2329 start = i;
2323 cluster->max_size = 0; 2330 cluster->max_size = 0;
2324 found = true;
2325 } 2331 }
2326 2332
2327 total_found += found_bits; 2333 total_found += found_bits;
@@ -2329,13 +2335,8 @@ again:
2329 if (cluster->max_size < found_bits * block_group->sectorsize) 2335 if (cluster->max_size < found_bits * block_group->sectorsize)
2330 cluster->max_size = found_bits * block_group->sectorsize; 2336 cluster->max_size = found_bits * block_group->sectorsize;
2331 2337
-	if (total_found < total_bits) {
-		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
-		if (i - start > total_bits * 2) {
-			total_found = 0;
-			cluster->max_size = 0;
-			found = false;
-		}
+	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+		i = next_zero + 1;
 		goto again;
2340 } 2341 }
2341 2342
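The reworked bitmap scan above accumulates runs of set bits until it has both enough total space and one sufficiently long run. A toy C model of that loop (a plain uint64_t stands in for the kernel bitmap and find_next_bit; all names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define NBITS 64

/* Count only runs of at least min_bits, resume from next_zero + 1,
 * and succeed once total bits reach want_bits AND the largest run
 * covers cont1_bits. */
static int scan(uint64_t map, unsigned want_bits, unsigned min_bits,
                unsigned cont1_bits)
{
    unsigned i = 0, total = 0, max_run = 0;

    while (i < NBITS) {
        unsigned run;

        while (i < NBITS && !((map >> i) & 1))
            i++;                        /* find_next_bit */
        run = 0;
        while (i + run < NBITS && ((map >> (i + run)) & 1))
            run++;                      /* run ends at next_zero */
        if (run >= min_bits) {
            total += run;
            if (run > max_run)
                max_run = run;
        }
        i += run + 1;                   /* resume at next_zero + 1 */
        if (total >= want_bits && max_run >= cont1_bits)
            return 0;                   /* cluster found */
    }
    return -1;                          /* -ENOSPC in the real code */
}

int main(void)
{
    /* two runs of 4 set bits and one run of 8 */
    uint64_t map = 0xFULL | (0xFULL << 10) | (0xFFULL << 30);
    printf("%d\n", scan(map, 12, 4, 8));
    return 0;
}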
@@ -2346,28 +2347,31 @@ again:
2346 &entry->offset_index, 1); 2347 &entry->offset_index, 1);
2347 BUG_ON(ret); 2348 BUG_ON(ret);
2348 2349
2350 trace_btrfs_setup_cluster(block_group, cluster,
2351 total_found * block_group->sectorsize, 1);
2349 return 0; 2352 return 0;
2350} 2353}
2351 2354
2352/* 2355/*
2353 * This searches the block group for just extents to fill the cluster with. 2356 * This searches the block group for just extents to fill the cluster with.
2357 * Try to find a cluster with at least bytes total bytes, at least one
2358 * extent of cont1_bytes, and other clusters of at least min_bytes.
2354 */ 2359 */
2355static noinline int 2360static noinline int
2356setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group, 2361setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2357 struct btrfs_free_cluster *cluster, 2362 struct btrfs_free_cluster *cluster,
2358 struct list_head *bitmaps, u64 offset, u64 bytes, 2363 struct list_head *bitmaps, u64 offset, u64 bytes,
-			u64 min_bytes)
+			u64 cont1_bytes, u64 min_bytes)
2360{ 2365{
2361 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2366 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2362 struct btrfs_free_space *first = NULL; 2367 struct btrfs_free_space *first = NULL;
2363 struct btrfs_free_space *entry = NULL; 2368 struct btrfs_free_space *entry = NULL;
2364 struct btrfs_free_space *prev = NULL;
2365 struct btrfs_free_space *last; 2369 struct btrfs_free_space *last;
2366 struct rb_node *node; 2370 struct rb_node *node;
2367 u64 window_start; 2371 u64 window_start;
2368 u64 window_free; 2372 u64 window_free;
2369 u64 max_extent; 2373 u64 max_extent;
-	u64 max_gap = 128 * 1024;
+	u64 total_size = 0;
2371 2375
2372 entry = tree_search_offset(ctl, offset, 0, 1); 2376 entry = tree_search_offset(ctl, offset, 0, 1);
2373 if (!entry) 2377 if (!entry)
@@ -2377,8 +2381,8 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2377 * We don't want bitmaps, so just move along until we find a normal 2381 * We don't want bitmaps, so just move along until we find a normal
2378 * extent entry. 2382 * extent entry.
2379 */ 2383 */
-	while (entry->bitmap) {
-		if (list_empty(&entry->list))
+	while (entry->bitmap || entry->bytes < min_bytes) {
+		if (entry->bitmap && list_empty(&entry->list))
2382 list_add_tail(&entry->list, bitmaps); 2386 list_add_tail(&entry->list, bitmaps);
2383 node = rb_next(&entry->offset_index); 2387 node = rb_next(&entry->offset_index);
2384 if (!node) 2388 if (!node)
@@ -2391,12 +2395,9 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2391 max_extent = entry->bytes; 2395 max_extent = entry->bytes;
2392 first = entry; 2396 first = entry;
2393 last = entry; 2397 last = entry;
2394 prev = entry;
2395 2398
-	while (window_free <= min_bytes) {
-		node = rb_next(&entry->offset_index);
-		if (!node)
-			return -ENOSPC;
+	for (node = rb_next(&entry->offset_index); node;
+	     node = rb_next(&entry->offset_index)) {
2400 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2401 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2401 2402
2402 if (entry->bitmap) { 2403 if (entry->bitmap) {
@@ -2405,26 +2406,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2405 continue; 2406 continue;
2406 } 2407 }
2407 2408
-		/*
-		 * we haven't filled the empty size and the window is
-		 * very large. reset and try again
-		 */
-		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
-		    entry->offset - window_start > (min_bytes * 2)) {
-			first = entry;
-			window_start = entry->offset;
-			window_free = entry->bytes;
-			last = entry;
-			max_extent = entry->bytes;
-		} else {
-			last = entry;
-			window_free += entry->bytes;
-			if (entry->bytes > max_extent)
-				max_extent = entry->bytes;
-		}
-		prev = entry;
+		if (entry->bytes < min_bytes)
+			continue;
+
+		last = entry;
+		window_free += entry->bytes;
+		if (entry->bytes > max_extent)
+			max_extent = entry->bytes;
 	}
 
+	if (window_free < bytes || max_extent < cont1_bytes)
+		return -ENOSPC;
+
2428 cluster->window_start = first->offset; 2421 cluster->window_start = first->offset;
2429 2422
2430 node = &first->offset_index; 2423 node = &first->offset_index;
@@ -2438,17 +2431,18 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2438 2431
2439 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2432 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2440 node = rb_next(&entry->offset_index); 2433 node = rb_next(&entry->offset_index);
-		if (entry->bitmap)
+		if (entry->bitmap || entry->bytes < min_bytes)
2442 continue; 2435 continue;
2443 2436
2444 rb_erase(&entry->offset_index, &ctl->free_space_offset); 2437 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2445 ret = tree_insert_offset(&cluster->root, entry->offset, 2438 ret = tree_insert_offset(&cluster->root, entry->offset,
2446 &entry->offset_index, 0); 2439 &entry->offset_index, 0);
2440 total_size += entry->bytes;
2447 BUG_ON(ret); 2441 BUG_ON(ret);
2448 } while (node && entry != last); 2442 } while (node && entry != last);
2449 2443
 	cluster->max_size = max_extent;
-
+	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2452 return 0; 2446 return 0;
2453} 2447}
2454 2448
@@ -2460,7 +2454,7 @@ static noinline int
2460setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, 2454setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2461 struct btrfs_free_cluster *cluster, 2455 struct btrfs_free_cluster *cluster,
2462 struct list_head *bitmaps, u64 offset, u64 bytes, 2456 struct list_head *bitmaps, u64 offset, u64 bytes,
-		     u64 min_bytes)
+		     u64 cont1_bytes, u64 min_bytes)
2464{ 2458{
2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2459 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2466 struct btrfs_free_space *entry; 2460 struct btrfs_free_space *entry;
@@ -2482,10 +2476,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2482 } 2476 }
2483 2477
2484 list_for_each_entry(entry, bitmaps, list) { 2478 list_for_each_entry(entry, bitmaps, list) {
-		if (entry->bytes < min_bytes)
+		if (entry->bytes < bytes)
2486 continue; 2480 continue;
2487 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset, 2481 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-					   bytes, min_bytes);
+					   bytes, cont1_bytes, min_bytes);
2489 if (!ret) 2483 if (!ret)
2490 return 0; 2484 return 0;
2491 } 2485 }
@@ -2499,7 +2493,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2499 2493
2500/* 2494/*
2501 * here we try to find a cluster of blocks in a block group. The goal 2495 * here we try to find a cluster of blocks in a block group. The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes+empty_size.
2503 * We might not find them all in one contiguous area. 2497 * We might not find them all in one contiguous area.
2504 * 2498 *
2505 * returns zero and sets up cluster if things worked out, otherwise 2499 * returns zero and sets up cluster if things worked out, otherwise
@@ -2515,23 +2509,24 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2515 struct btrfs_free_space *entry, *tmp; 2509 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps); 2510 LIST_HEAD(bitmaps);
2517 u64 min_bytes; 2511 u64 min_bytes;
2512 u64 cont1_bytes;
2518 int ret; 2513 int ret;
2519 2514
-	/* for metadata, allow allocates with more holes */
+	/*
+	 * Choose the minimum extent size we'll require for this
+	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
+	 * For metadata, allow allocates with smaller extents.  For
+	 * data, keep it dense.
+	 */
 	if (btrfs_test_opt(root, SSD_SPREAD)) {
-		min_bytes = bytes + empty_size;
+		cont1_bytes = min_bytes = bytes + empty_size;
 	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
-		/*
-		 * we want to do larger allocations when we are
-		 * flushing out the delayed refs, it helps prevent
-		 * making more work as we go along.
-		 */
-		if (trans->transaction->delayed_refs.flushing)
-			min_bytes = max(bytes, (bytes + empty_size) >> 1);
-		else
-			min_bytes = max(bytes, (bytes + empty_size) >> 4);
-	} else
-		min_bytes = max(bytes, (bytes + empty_size) >> 2);
+		cont1_bytes = bytes;
+		min_bytes = block_group->sectorsize;
+	} else {
+		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+		min_bytes = block_group->sectorsize;
+	}
2535 2530
2536 spin_lock(&ctl->tree_lock); 2531 spin_lock(&ctl->tree_lock);
2537 2532
@@ -2539,7 +2534,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2539 * If we know we don't have enough space to make a cluster don't even 2534 * If we know we don't have enough space to make a cluster don't even
2540 * bother doing all the work to try and find one. 2535 * bother doing all the work to try and find one.
2541 */ 2536 */
-	if (ctl->free_space < min_bytes) {
+	if (ctl->free_space < bytes) {
2543 spin_unlock(&ctl->tree_lock); 2538 spin_unlock(&ctl->tree_lock);
2544 return -ENOSPC; 2539 return -ENOSPC;
2545 } 2540 }
@@ -2552,11 +2547,17 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2552 goto out; 2547 goto out;
2553 } 2548 }
2554 2549
2550 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2551 min_bytes);
2552
2553 INIT_LIST_HEAD(&bitmaps);
2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2554 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
-				      bytes, min_bytes);
+				      bytes + empty_size,
+				      cont1_bytes, min_bytes);
2557 if (ret) 2557 if (ret)
2558 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps, 2558 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
-					   offset, bytes, min_bytes);
+					   offset, bytes + empty_size,
+					   cont1_bytes, min_bytes);
2560 2561
2561 /* Clear our temporary list */ 2562 /* Clear our temporary list */
2562 list_for_each_entry_safe(entry, tmp, &bitmaps, list) 2563 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2568,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2567 list_add_tail(&cluster->block_group_list, 2568 list_add_tail(&cluster->block_group_list,
2568 &block_group->cluster_list); 2569 &block_group->cluster_list);
2569 cluster->block_group = block_group; 2570 cluster->block_group = block_group;
2571 } else {
2572 trace_btrfs_failed_cluster_setup(block_group);
2570 } 2573 }
2571out: 2574out:
2572 spin_unlock(&cluster->lock); 2575 spin_unlock(&cluster->lock);
@@ -2588,17 +2591,57 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2588 cluster->block_group = NULL; 2591 cluster->block_group = NULL;
2589} 2592}
2590 2593
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
-			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+		       u64 *total_trimmed, u64 start, u64 bytes,
+		       u64 reserved_start, u64 reserved_bytes)
 {
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	struct btrfs_free_space *entry = NULL;
+	struct btrfs_space_info *space_info = block_group->space_info;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	u64 bytes = 0;
-	u64 actually_trimmed;
-	int ret = 0;
+	int ret;
+	int update = 0;
+	u64 trimmed = 0;
 
-	*trimmed = 0;
+	spin_lock(&space_info->lock);
+	spin_lock(&block_group->lock);
+	if (!block_group->ro) {
+		block_group->reserved += reserved_bytes;
+		space_info->bytes_reserved += reserved_bytes;
+		update = 1;
+	}
+	spin_unlock(&block_group->lock);
+	spin_unlock(&space_info->lock);
+
+	ret = btrfs_error_discard_extent(fs_info->extent_root,
+					 start, bytes, &trimmed);
+	if (!ret)
+		*total_trimmed += trimmed;
+
+	btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+	if (update) {
+		spin_lock(&space_info->lock);
+		spin_lock(&block_group->lock);
+		if (block_group->ro)
+			space_info->bytes_readonly += reserved_bytes;
+		block_group->reserved -= reserved_bytes;
+		space_info->bytes_reserved -= reserved_bytes;
+		spin_unlock(&space_info->lock);
+		spin_unlock(&block_group->lock);
+	}
+
+	return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+	int ret = 0;
+	u64 extent_start;
+	u64 extent_bytes;
+	u64 bytes;
 
2603 while (start < end) { 2646 while (start < end) {
2604 spin_lock(&ctl->tree_lock); 2647 spin_lock(&ctl->tree_lock);
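The new do_trimming() above follows a reserve / discard / return / unreserve protocol. A single-threaded C model of that sequence (locks elided; the struct and helper are invented for the sketch):

#include <stdio.h>
#include <stdint.h>

struct group { uint64_t reserved, free; int ro; };

/* Step 1: account the range as reserved so concurrent allocators
 * keep off it.  Step 2: issue the discard outside any lock.
 * Step 3: hand the space back to the free-space tree.  Step 4:
 * drop the temporary reservation. */
static int do_trim(struct group *g, uint64_t *total,
                   uint64_t start, uint64_t bytes)
{
    int update = 0;

    if (!g->ro) {                       /* step 1 */
        g->reserved += bytes;
        update = 1;
    }

    printf("discard [%llu, +%llu)\n",   /* step 2 */
           (unsigned long long)start, (unsigned long long)bytes);
    *total += bytes;

    g->free += bytes;                   /* step 3 */

    if (update)                         /* step 4 */
        g->reserved -= bytes;
    return 0;
}

int main(void)
{
    struct group g = { 0, 0, 0 };
    uint64_t trimmed = 0;

    do_trim(&g, &trimmed, 4096, 8192);
    printf("trimmed=%llu\n", (unsigned long long)trimmed);
    return 0;
}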
@@ -2609,81 +2652,118 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 		}
 
 		entry = tree_search_offset(ctl, start, 0, 1);
-		if (!entry)
-			entry = tree_search_offset(ctl,
-						   offset_to_bitmap(ctl, start),
-						   1, 1);
-
-		if (!entry || entry->offset >= end) {
+		if (!entry) {
 			spin_unlock(&ctl->tree_lock);
 			break;
 		}
 
-		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, &start, &bytes);
-			if (!ret) {
-				if (start >= end) {
-					spin_unlock(&ctl->tree_lock);
-					break;
-				}
-				bytes = min(bytes, end - start);
-				bitmap_clear_bits(ctl, entry, start, bytes);
-				if (entry->bytes == 0)
-					free_bitmap(ctl, entry);
-			} else {
-				start = entry->offset + BITS_PER_BITMAP *
-					block_group->sectorsize;
-				spin_unlock(&ctl->tree_lock);
-				ret = 0;
-				continue;
+		/* skip bitmaps */
+		while (entry->bitmap) {
+			node = rb_next(&entry->offset_index);
+			if (!node) {
+				spin_unlock(&ctl->tree_lock);
+				goto out;
 			}
-		} else {
-			start = entry->offset;
-			bytes = min(entry->bytes, end - start);
-			unlink_free_space(ctl, entry);
-			kmem_cache_free(btrfs_free_space_cachep, entry);
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
 		}
 
+		if (entry->offset >= end) {
+			spin_unlock(&ctl->tree_lock);
+			break;
+		}
+
+		extent_start = entry->offset;
+		extent_bytes = entry->bytes;
+		start = max(start, extent_start);
+		bytes = min(extent_start + extent_bytes, end) - start;
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			goto next;
+		}
+
+		unlink_free_space(ctl, entry);
+		kmem_cache_free(btrfs_free_space_cachep, entry);
+
 		spin_unlock(&ctl->tree_lock);
 
-		if (bytes >= minlen) {
-			struct btrfs_space_info *space_info;
-			int update = 0;
-
-			space_info = block_group->space_info;
-			spin_lock(&space_info->lock);
-			spin_lock(&block_group->lock);
-			if (!block_group->ro) {
-				block_group->reserved += bytes;
-				space_info->bytes_reserved += bytes;
-				update = 1;
-			}
-			spin_unlock(&block_group->lock);
-			spin_unlock(&space_info->lock);
-
-			ret = btrfs_error_discard_extent(fs_info->extent_root,
-							 start,
-							 bytes,
-							 &actually_trimmed);
-
-			btrfs_add_free_space(block_group, start, bytes);
-			if (update) {
-				spin_lock(&space_info->lock);
-				spin_lock(&block_group->lock);
-				if (block_group->ro)
-					space_info->bytes_readonly += bytes;
-				block_group->reserved -= bytes;
-				space_info->bytes_reserved -= bytes;
-				spin_unlock(&space_info->lock);
-				spin_unlock(&block_group->lock);
-			}
-
-			if (ret)
-				break;
-			*trimmed += actually_trimmed;
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  extent_start, extent_bytes);
+		if (ret)
+			break;
+next:
+		start += bytes;
+
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+	}
+out:
+	return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	int ret = 0;
+	int ret2;
+	u64 bytes;
+	u64 offset = offset_to_bitmap(ctl, start);
+
+	while (offset < end) {
+		bool next_bitmap = false;
+
+		spin_lock(&ctl->tree_lock);
+
+		if (ctl->free_space < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			break;
+		}
+
+		entry = tree_search_offset(ctl, offset, 1, 0);
+		if (!entry) {
+			spin_unlock(&ctl->tree_lock);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = minlen;
+		ret2 = search_bitmap(ctl, entry, &start, &bytes);
+		if (ret2 || start >= end) {
+			spin_unlock(&ctl->tree_lock);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = min(bytes, end - start);
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			goto next;
+		}
+
+		bitmap_clear_bits(ctl, entry, start, bytes);
+		if (entry->bytes == 0)
+			free_bitmap(ctl, entry);
+
+		spin_unlock(&ctl->tree_lock);
+
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  start, bytes);
+		if (ret)
+			break;
+next:
+		if (next_bitmap) {
+			offset += BITS_PER_BITMAP * ctl->unit;
+		} else {
+			start += bytes;
+			if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+				offset += BITS_PER_BITMAP * ctl->unit;
 		}
-		start += bytes;
-		bytes = 0;
 
 		if (fatal_signal_pending(current)) {
 			ret = -ERESTARTSYS;
@@ -2696,6 +2776,22 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2696 return ret; 2776 return ret;
2697} 2777}
2698 2778
2779int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2780 u64 *trimmed, u64 start, u64 end, u64 minlen)
2781{
2782 int ret;
2783
2784 *trimmed = 0;
2785
2786 ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
2787 if (ret)
2788 return ret;
2789
2790 ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
2791
2792 return ret;
2793}
2794
2699/* 2795/*
2700 * Find the left-most item in the cache tree, and then return the 2796 * Find the left-most item in the cache tree, and then return the
2701 * smallest inode number in the item. 2797 * smallest inode number in the item.
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index f8962a957d65..213ffa86ce1b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -438,6 +438,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
438 trans->bytes_reserved); 438 trans->bytes_reserved);
439 if (ret) 439 if (ret)
440 goto out; 440 goto out;
441 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
442 trans->bytes_reserved, 1);
441again: 443again:
442 inode = lookup_free_ino_inode(root, path); 444 inode = lookup_free_ino_inode(root, path);
443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 445 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -498,6 +500,8 @@ again:
498out_put: 500out_put:
499 iput(inode); 501 iput(inode);
500out_release: 502out_release:
503 trace_btrfs_space_reservation(root->fs_info, "ino_cache", (u64)trans,
504 trans->bytes_reserved, 0);
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); 505 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
502out: 506out:
503 trans->block_rsv = rsv; 507 trans->block_rsv = rsv;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81b235a61f8c..32214fe0f7e3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1951,12 +1951,28 @@ enum btrfs_orphan_cleanup_state {
1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 1951void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1952 struct btrfs_root *root) 1952 struct btrfs_root *root)
1953{ 1953{
1954 struct btrfs_block_rsv *block_rsv;
1954 int ret; 1955 int ret;
1955 1956
1956 if (!list_empty(&root->orphan_list) || 1957 if (!list_empty(&root->orphan_list) ||
1957 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) 1958 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
1958 return; 1959 return;
1959 1960
1961 spin_lock(&root->orphan_lock);
1962 if (!list_empty(&root->orphan_list)) {
1963 spin_unlock(&root->orphan_lock);
1964 return;
1965 }
1966
1967 if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
1968 spin_unlock(&root->orphan_lock);
1969 return;
1970 }
1971
1972 block_rsv = root->orphan_block_rsv;
1973 root->orphan_block_rsv = NULL;
1974 spin_unlock(&root->orphan_lock);
1975
1960 if (root->orphan_item_inserted && 1976 if (root->orphan_item_inserted &&
1961 btrfs_root_refs(&root->root_item) > 0) { 1977 btrfs_root_refs(&root->root_item) > 0) {
1962 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root, 1978 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
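The orphan-commit hunk above applies a classic detach-under-lock pattern: snapshot the shared pointer under the spinlock, NULL the field, then free outside the lock. A minimal C model (pthread mutex standing in for the kernel spinlock; names invented):

#include <pthread.h>
#include <stdlib.h>

struct root { pthread_mutex_t lock; void *orphan_rsv; };

/* Two committers can no longer free the same reservation twice:
 * whoever takes the pointer also publishes that it is gone. */
static void orphan_commit(struct root *r)
{
    void *rsv;

    pthread_mutex_lock(&r->lock);
    rsv = r->orphan_rsv;        /* take ownership ... */
    r->orphan_rsv = NULL;       /* ... and publish that fact */
    pthread_mutex_unlock(&r->lock);

    free(rsv);                  /* free(NULL) is a harmless no-op */
}

int main(void)
{
    struct root r = { PTHREAD_MUTEX_INITIALIZER, malloc(16) };

    orphan_commit(&r);
    orphan_commit(&r);          /* second call safely does nothing */
    return 0;
}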
@@ -1965,10 +1981,9 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
1965 root->orphan_item_inserted = 0; 1981 root->orphan_item_inserted = 0;
1966 } 1982 }
1967 1983
-	if (root->orphan_block_rsv) {
-		WARN_ON(root->orphan_block_rsv->size > 0);
-		btrfs_free_block_rsv(root, root->orphan_block_rsv);
-		root->orphan_block_rsv = NULL;
+	if (block_rsv) {
+		WARN_ON(block_rsv->size > 0);
+		btrfs_free_block_rsv(root, block_rsv);
 	}
1973} 1988}
1974 1989
@@ -2224,14 +2239,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2224 continue; 2239 continue;
2225 } 2240 }
2226 nr_truncate++; 2241 nr_truncate++;
2227 /*
2228 * Need to hold the imutex for reservation purposes, not
2229 * a huge deal here but I have a WARN_ON in
2230 * btrfs_delalloc_reserve_space to catch offenders.
2231 */
2232 mutex_lock(&inode->i_mutex);
2233 ret = btrfs_truncate(inode); 2242 ret = btrfs_truncate(inode);
2234 mutex_unlock(&inode->i_mutex);
2235 } else { 2243 } else {
2236 nr_unlink++; 2244 nr_unlink++;
2237 } 2245 }
@@ -2845,7 +2853,7 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2845 BUG_ON(!root->fs_info->enospc_unlink); 2853 BUG_ON(!root->fs_info->enospc_unlink);
2846 root->fs_info->enospc_unlink = 0; 2854 root->fs_info->enospc_unlink = 0;
2847 } 2855 }
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
2849} 2857}
2850 2858
2851static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2859static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -3009,7 +3017,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3009 int pending_del_nr = 0; 3017 int pending_del_nr = 0;
3010 int pending_del_slot = 0; 3018 int pending_del_slot = 0;
3011 int extent_type = -1; 3019 int extent_type = -1;
3012 int encoding;
3013 int ret; 3020 int ret;
3014 int err = 0; 3021 int err = 0;
3015 u64 ino = btrfs_ino(inode); 3022 u64 ino = btrfs_ino(inode);
@@ -3059,7 +3066,6 @@ search_again:
3059 leaf = path->nodes[0]; 3066 leaf = path->nodes[0];
3060 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 3067 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3061 found_type = btrfs_key_type(&found_key); 3068 found_type = btrfs_key_type(&found_key);
3062 encoding = 0;
3063 3069
3064 if (found_key.objectid != ino) 3070 if (found_key.objectid != ino)
3065 break; 3071 break;
@@ -3072,10 +3078,6 @@ search_again:
3072 fi = btrfs_item_ptr(leaf, path->slots[0], 3078 fi = btrfs_item_ptr(leaf, path->slots[0],
3073 struct btrfs_file_extent_item); 3079 struct btrfs_file_extent_item);
3074 extent_type = btrfs_file_extent_type(leaf, fi); 3080 extent_type = btrfs_file_extent_type(leaf, fi);
3075 encoding = btrfs_file_extent_compression(leaf, fi);
3076 encoding |= btrfs_file_extent_encryption(leaf, fi);
3077 encoding |= btrfs_file_extent_other_encoding(leaf, fi);
3078
3079 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3081 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3080 item_end += 3082 item_end +=
3081 btrfs_file_extent_num_bytes(leaf, fi); 3083 btrfs_file_extent_num_bytes(leaf, fi);
@@ -3103,7 +3105,7 @@ search_again:
3103 if (extent_type != BTRFS_FILE_EXTENT_INLINE) { 3105 if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3104 u64 num_dec; 3106 u64 num_dec;
3105 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); 3107 extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3106 if (!del_item && !encoding) { 3108 if (!del_item) {
3107 u64 orig_num_bytes = 3109 u64 orig_num_bytes =
3108 btrfs_file_extent_num_bytes(leaf, fi); 3110 btrfs_file_extent_num_bytes(leaf, fi);
3109 extent_num_bytes = new_size - 3111 extent_num_bytes = new_size -
@@ -3179,7 +3181,7 @@ delete:
3179 ret = btrfs_free_extent(trans, root, extent_start, 3181 ret = btrfs_free_extent(trans, root, extent_start,
3180 extent_num_bytes, 0, 3182 extent_num_bytes, 0,
3181 btrfs_header_owner(leaf), 3183 btrfs_header_owner(leaf),
3182 ino, extent_offset); 3184 ino, extent_offset, 0);
3183 BUG_ON(ret); 3185 BUG_ON(ret);
3184 } 3186 }
3185 3187
@@ -3434,7 +3436,7 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
3434 i_size_write(inode, newsize); 3436 i_size_write(inode, newsize);
3435 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); 3437 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3436 ret = btrfs_update_inode(trans, root, inode); 3438 ret = btrfs_update_inode(trans, root, inode);
3437 btrfs_end_transaction_throttle(trans, root); 3439 btrfs_end_transaction(trans, root);
3438 } else { 3440 } else {
3439 3441
3440 /* 3442 /*
@@ -4655,7 +4657,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4655 } 4657 }
4656out_unlock: 4658out_unlock:
4657 nr = trans->blocks_used; 4659 nr = trans->blocks_used;
4658 btrfs_end_transaction_throttle(trans, root); 4660 btrfs_end_transaction(trans, root);
4659 btrfs_btree_balance_dirty(root, nr); 4661 btrfs_btree_balance_dirty(root, nr);
4660 if (drop_inode) { 4662 if (drop_inode) {
4661 inode_dec_link_count(inode); 4663 inode_dec_link_count(inode);
@@ -4723,7 +4725,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4723 } 4725 }
4724out_unlock: 4726out_unlock:
4725 nr = trans->blocks_used; 4727 nr = trans->blocks_used;
4726 btrfs_end_transaction_throttle(trans, root); 4728 btrfs_end_transaction(trans, root);
4727 if (drop_inode) { 4729 if (drop_inode) {
4728 inode_dec_link_count(inode); 4730 inode_dec_link_count(inode);
4729 iput(inode); 4731 iput(inode);
@@ -4782,7 +4784,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4782 } 4784 }
4783 4785
4784 nr = trans->blocks_used; 4786 nr = trans->blocks_used;
4785 btrfs_end_transaction_throttle(trans, root); 4787 btrfs_end_transaction(trans, root);
4786fail: 4788fail:
4787 if (drop_inode) { 4789 if (drop_inode) {
4788 inode_dec_link_count(inode); 4790 inode_dec_link_count(inode);
@@ -4848,7 +4850,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
4848 4850
4849out_fail: 4851out_fail:
4850 nr = trans->blocks_used; 4852 nr = trans->blocks_used;
4851 btrfs_end_transaction_throttle(trans, root); 4853 btrfs_end_transaction(trans, root);
4852 if (drop_on_err) 4854 if (drop_on_err)
4853 iput(inode); 4855 iput(inode);
4854 btrfs_btree_balance_dirty(root, nr); 4856 btrfs_btree_balance_dirty(root, nr);
@@ -5121,7 +5123,7 @@ again:
5121 } 5123 }
5122 flush_dcache_page(page); 5124 flush_dcache_page(page);
5123 } else if (create && PageUptodate(page)) { 5125 } else if (create && PageUptodate(page)) {
5124 WARN_ON(1); 5126 BUG();
5125 if (!trans) { 5127 if (!trans) {
5126 kunmap(page); 5128 kunmap(page);
5127 free_extent_map(em); 5129 free_extent_map(em);
@@ -6399,21 +6401,23 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
6399 unsigned long zero_start; 6401 unsigned long zero_start;
6400 loff_t size; 6402 loff_t size;
6401 int ret; 6403 int ret;
6404 int reserved = 0;
6402 u64 page_start; 6405 u64 page_start;
6403 u64 page_end; 6406 u64 page_end;
6404 6407
6405 /* Need this to keep space reservations serialized */
6406 mutex_lock(&inode->i_mutex);
6407 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 6408 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
6408 mutex_unlock(&inode->i_mutex); 6409 if (!ret) {
6409 if (!ret)
6410 ret = btrfs_update_time(vma->vm_file); 6410 ret = btrfs_update_time(vma->vm_file);
6411 reserved = 1;
6412 }
6411 if (ret) { 6413 if (ret) {
6412 if (ret == -ENOMEM) 6414 if (ret == -ENOMEM)
6413 ret = VM_FAULT_OOM; 6415 ret = VM_FAULT_OOM;
6414 else /* -ENOSPC, -EIO, etc */ 6416 else /* -ENOSPC, -EIO, etc */
6415 ret = VM_FAULT_SIGBUS; 6417 ret = VM_FAULT_SIGBUS;
6416 goto out; 6418 if (reserved)
6419 goto out;
6420 goto out_noreserve;
6417 } 6421 }
6418 6422
6419 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6423 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
@@ -6494,8 +6498,9 @@ out_unlock:
6494 if (!ret) 6498 if (!ret)
6495 return VM_FAULT_LOCKED; 6499 return VM_FAULT_LOCKED;
6496 unlock_page(page); 6500 unlock_page(page);
6497 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6498out: 6501out:
6502 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
6503out_noreserve:
6499 return ret; 6504 return ret;
6500} 6505}
6501 6506
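The rework above releases the delalloc reservation only when it was actually taken: a failure before btrfs_delalloc_reserve_space() succeeds jumps past the release via out_noreserve, while later failures fall through out. A compilable sketch of the two-label error path; reserve(), release() and do_fault() are hypothetical stand-ins:

#include <errno.h>
#include <stdio.h>

static int reserve(void)  { return 0; }  /* stand-in for btrfs_delalloc_reserve_space */
static void release(void) { }            /* stand-in for btrfs_delalloc_release_space */
static int do_fault(void) { return -EIO; }

static int fault_handler(void)
{
        int ret;

        ret = reserve();
        if (ret)
                goto out_noreserve;     /* nothing reserved, nothing to undo */

        ret = do_fault();
        if (ret)
                goto out;               /* must drop the reservation */
        return 0;                       /* success consumes the reservation */
out:
        release();
out_noreserve:
        return ret;
}

int main(void)
{
        printf("fault_handler() = %d\n", fault_handler());
        return 0;
}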
@@ -6668,7 +6673,7 @@ end_trans:
6668 err = ret; 6673 err = ret;
6669 6674
6670 nr = trans->blocks_used; 6675 nr = trans->blocks_used;
6671 ret = btrfs_end_transaction_throttle(trans, root); 6676 ret = btrfs_end_transaction(trans, root);
6672 btrfs_btree_balance_dirty(root, nr); 6677 btrfs_btree_balance_dirty(root, nr);
6673 } 6678 }
6674 6679
@@ -6749,6 +6754,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6749 extent_io_tree_init(&ei->io_tree, &inode->i_data); 6754 extent_io_tree_init(&ei->io_tree, &inode->i_data);
6750 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data); 6755 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6751 mutex_init(&ei->log_mutex); 6756 mutex_init(&ei->log_mutex);
6757 mutex_init(&ei->delalloc_mutex);
6752 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6758 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6753 INIT_LIST_HEAD(&ei->i_orphan); 6759 INIT_LIST_HEAD(&ei->i_orphan);
6754 INIT_LIST_HEAD(&ei->delalloc_inodes); 6760 INIT_LIST_HEAD(&ei->delalloc_inodes);
@@ -7074,7 +7080,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7074 btrfs_end_log_trans(root); 7080 btrfs_end_log_trans(root);
7075 } 7081 }
7076out_fail: 7082out_fail:
7077 btrfs_end_transaction_throttle(trans, root); 7083 btrfs_end_transaction(trans, root);
7078out_notrans: 7084out_notrans:
7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 7085 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7080 up_read(&root->fs_info->subvol_sem); 7086 up_read(&root->fs_info->subvol_sem);
@@ -7246,7 +7252,7 @@ out_unlock:
7246 if (!err) 7252 if (!err)
7247 d_instantiate(dentry, inode); 7253 d_instantiate(dentry, inode);
7248 nr = trans->blocks_used; 7254 nr = trans->blocks_used;
7249 btrfs_end_transaction_throttle(trans, root); 7255 btrfs_end_transaction(trans, root);
7250 if (drop_inode) { 7256 if (drop_inode) {
7251 inode_dec_link_count(inode); 7257 inode_dec_link_count(inode);
7252 iput(inode); 7258 iput(inode);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5441ff1480fd..03bb62a9ee24 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -176,6 +176,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
176 struct btrfs_trans_handle *trans; 176 struct btrfs_trans_handle *trans;
177 unsigned int flags, oldflags; 177 unsigned int flags, oldflags;
178 int ret; 178 int ret;
179 u64 ip_oldflags;
180 unsigned int i_oldflags;
179 181
180 if (btrfs_root_readonly(root)) 182 if (btrfs_root_readonly(root))
181 return -EROFS; 183 return -EROFS;
@@ -192,6 +194,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
192 194
193 mutex_lock(&inode->i_mutex); 195 mutex_lock(&inode->i_mutex);
194 196
197 ip_oldflags = ip->flags;
198 i_oldflags = inode->i_flags;
199
195 flags = btrfs_mask_flags(inode->i_mode, flags); 200 flags = btrfs_mask_flags(inode->i_mode, flags);
196 oldflags = btrfs_flags_to_ioctl(ip->flags); 201 oldflags = btrfs_flags_to_ioctl(ip->flags);
197 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 202 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -249,19 +254,24 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
249 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 254 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
250 } 255 }
251 256
252 trans = btrfs_join_transaction(root); 257 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(IS_ERR(trans)); 258 if (IS_ERR(trans)) {
259 ret = PTR_ERR(trans);
260 goto out_drop;
261 }
254 262
255 btrfs_update_iflags(inode); 263 btrfs_update_iflags(inode);
256 inode->i_ctime = CURRENT_TIME; 264 inode->i_ctime = CURRENT_TIME;
257 ret = btrfs_update_inode(trans, root, inode); 265 ret = btrfs_update_inode(trans, root, inode);
258 BUG_ON(ret);
259 266
260 btrfs_end_transaction(trans, root); 267 btrfs_end_transaction(trans, root);
268 out_drop:
269 if (ret) {
270 ip->flags = ip_oldflags;
271 inode->i_flags = i_oldflags;
272 }
261 273
262 mnt_drop_write_file(file); 274 mnt_drop_write_file(file);
263
264 ret = 0;
265 out_unlock: 275 out_unlock:
266 mutex_unlock(&inode->i_mutex); 276 mutex_unlock(&inode->i_mutex);
267 return ret; 277 return ret;
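Rather than BUG() on a failed btrfs_update_inode(), the new code snapshots ip->flags and inode->i_flags up front and restores both when the transaction path fails. The snapshot-and-rollback idiom in isolation, with invented names:

#include <stdio.h>

struct obj { unsigned long flags; };

static int failing_commit(struct obj *o) { (void)o; return -1; }

static int apply_flags(struct obj *o, unsigned long new_flags,
                       int (*commit)(struct obj *))
{
        unsigned long old_flags = o->flags;     /* snapshot before mutating */
        int ret;

        o->flags = new_flags;                   /* speculative update */
        ret = commit(o);
        if (ret)
                o->flags = old_flags;           /* roll back, as the ioctl now does */
        return ret;
}

int main(void)
{
        struct obj o = { 0x1 };

        apply_flags(&o, 0xff, failing_commit);
        printf("flags after failed commit: 0x%lx\n", o.flags);  /* 0x1 */
        return 0;
}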
@@ -276,14 +286,13 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
276 286
277static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) 287static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
278{ 288{
279 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info; 289 struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
280 struct btrfs_fs_info *fs_info = root->fs_info;
281 struct btrfs_device *device; 290 struct btrfs_device *device;
282 struct request_queue *q; 291 struct request_queue *q;
283 struct fstrim_range range; 292 struct fstrim_range range;
284 u64 minlen = ULLONG_MAX; 293 u64 minlen = ULLONG_MAX;
285 u64 num_devices = 0; 294 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 295 u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
287 int ret; 296 int ret;
288 297
289 if (!capable(CAP_SYS_ADMIN)) 298 if (!capable(CAP_SYS_ADMIN))
@@ -312,7 +321,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
312 321
313 range.len = min(range.len, total_bytes - range.start); 322 range.len = min(range.len, total_bytes - range.start);
314 range.minlen = max(range.minlen, minlen); 323 range.minlen = max(range.minlen, minlen);
315 ret = btrfs_trim_fs(root, &range); 324 ret = btrfs_trim_fs(fs_info->tree_root, &range);
316 if (ret < 0) 325 if (ret < 0)
317 return ret; 326 return ret;
318 327
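The fstrim_range being clamped here is the same structure userspace hands to the generic FITRIM ioctl; the kernel writes the number of bytes it trimmed back into range.len. A minimal caller, assuming a btrfs filesystem mounted at /mnt:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>   /* FITRIM, struct fstrim_range */

int main(void)
{
        struct fstrim_range range;
        int fd = open("/mnt", O_RDONLY);        /* assumed btrfs mount point */

        if (fd < 0)
                return 1;
        memset(&range, 0, sizeof(range));
        range.len = (__u64)-1;  /* whole filesystem; the kernel clamps it as above */
        if (ioctl(fd, FITRIM, &range) < 0)
                perror("FITRIM");
        else
                printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}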
@@ -358,7 +367,7 @@ static noinline int create_subvol(struct btrfs_root *root,
358 return PTR_ERR(trans); 367 return PTR_ERR(trans);
359 368
360 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 369 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
361 0, objectid, NULL, 0, 0, 0); 370 0, objectid, NULL, 0, 0, 0, 0);
362 if (IS_ERR(leaf)) { 371 if (IS_ERR(leaf)) {
363 ret = PTR_ERR(leaf); 372 ret = PTR_ERR(leaf);
364 goto fail; 373 goto fail;
@@ -858,10 +867,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
858 return 0; 867 return 0;
859 file_end = (isize - 1) >> PAGE_CACHE_SHIFT; 868 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
860 869
861 mutex_lock(&inode->i_mutex);
862 ret = btrfs_delalloc_reserve_space(inode, 870 ret = btrfs_delalloc_reserve_space(inode,
863 num_pages << PAGE_CACHE_SHIFT); 871 num_pages << PAGE_CACHE_SHIFT);
864 mutex_unlock(&inode->i_mutex);
865 if (ret) 872 if (ret)
866 return ret; 873 return ret;
867again: 874again:
@@ -1058,7 +1065,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1058 i = range->start >> PAGE_CACHE_SHIFT; 1065 i = range->start >> PAGE_CACHE_SHIFT;
1059 } 1066 }
1060 if (!max_to_defrag) 1067 if (!max_to_defrag)
1061 max_to_defrag = last_index; 1068 max_to_defrag = last_index + 1;
1062 1069
1063 /* 1070 /*
1064 * make writeback start from i, so the defrag range can be 1071 * make writeback start from i, so the defrag range can be
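The last_index + 1 fix follows from page indices being zero-based: last_index is the index of the file's final page, so the number of pages eligible for defrag is one greater. In miniature, assuming 4 KiB pages:

#include <assert.h>

int main(void)
{
        /* a 10 KiB file spans pages 0..2, i.e. 3 pages */
        unsigned long isize = 10 * 1024;
        unsigned long last_index = (isize - 1) >> 12;   /* index of last page: 2 */
        unsigned long max_to_defrag = last_index + 1;   /* page count: 3 */

        assert(max_to_defrag == 3);
        return 0;
}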
@@ -1203,13 +1210,21 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1203 if (!capable(CAP_SYS_ADMIN)) 1210 if (!capable(CAP_SYS_ADMIN))
1204 return -EPERM; 1211 return -EPERM;
1205 1212
1213 mutex_lock(&root->fs_info->volume_mutex);
1214 if (root->fs_info->balance_ctl) {
1215 printk(KERN_INFO "btrfs: balance in progress\n");
1216 ret = -EINVAL;
1217 goto out;
1218 }
1219
1206 vol_args = memdup_user(arg, sizeof(*vol_args)); 1220 vol_args = memdup_user(arg, sizeof(*vol_args));
1207 if (IS_ERR(vol_args)) 1221 if (IS_ERR(vol_args)) {
1208 return PTR_ERR(vol_args); 1222 ret = PTR_ERR(vol_args);
1223 goto out;
1224 }
1209 1225
1210 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1226 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1211 1227
1212 mutex_lock(&root->fs_info->volume_mutex);
1213 sizestr = vol_args->name; 1228 sizestr = vol_args->name;
1214 devstr = strchr(sizestr, ':'); 1229 devstr = strchr(sizestr, ':');
1215 if (devstr) { 1230 if (devstr) {
@@ -1226,7 +1241,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1226 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", 1241 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1227 (unsigned long long)devid); 1242 (unsigned long long)devid);
1228 ret = -EINVAL; 1243 ret = -EINVAL;
1229 goto out_unlock; 1244 goto out_free;
1230 } 1245 }
1231 if (!strcmp(sizestr, "max")) 1246 if (!strcmp(sizestr, "max"))
1232 new_size = device->bdev->bd_inode->i_size; 1247 new_size = device->bdev->bd_inode->i_size;
@@ -1241,7 +1256,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1241 new_size = memparse(sizestr, NULL); 1256 new_size = memparse(sizestr, NULL);
1242 if (new_size == 0) { 1257 if (new_size == 0) {
1243 ret = -EINVAL; 1258 ret = -EINVAL;
1244 goto out_unlock; 1259 goto out_free;
1245 } 1260 }
1246 } 1261 }
1247 1262
@@ -1250,7 +1265,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1250 if (mod < 0) { 1265 if (mod < 0) {
1251 if (new_size > old_size) { 1266 if (new_size > old_size) {
1252 ret = -EINVAL; 1267 ret = -EINVAL;
1253 goto out_unlock; 1268 goto out_free;
1254 } 1269 }
1255 new_size = old_size - new_size; 1270 new_size = old_size - new_size;
1256 } else if (mod > 0) { 1271 } else if (mod > 0) {
@@ -1259,11 +1274,11 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1259 1274
1260 if (new_size < 256 * 1024 * 1024) { 1275 if (new_size < 256 * 1024 * 1024) {
1261 ret = -EINVAL; 1276 ret = -EINVAL;
1262 goto out_unlock; 1277 goto out_free;
1263 } 1278 }
1264 if (new_size > device->bdev->bd_inode->i_size) { 1279 if (new_size > device->bdev->bd_inode->i_size) {
1265 ret = -EFBIG; 1280 ret = -EFBIG;
1266 goto out_unlock; 1281 goto out_free;
1267 } 1282 }
1268 1283
1269 do_div(new_size, root->sectorsize); 1284 do_div(new_size, root->sectorsize);
@@ -1276,7 +1291,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1276 trans = btrfs_start_transaction(root, 0); 1291 trans = btrfs_start_transaction(root, 0);
1277 if (IS_ERR(trans)) { 1292 if (IS_ERR(trans)) {
1278 ret = PTR_ERR(trans); 1293 ret = PTR_ERR(trans);
1279 goto out_unlock; 1294 goto out_free;
1280 } 1295 }
1281 ret = btrfs_grow_device(trans, device, new_size); 1296 ret = btrfs_grow_device(trans, device, new_size);
1282 btrfs_commit_transaction(trans, root); 1297 btrfs_commit_transaction(trans, root);
@@ -1284,9 +1299,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1284 ret = btrfs_shrink_device(device, new_size); 1299 ret = btrfs_shrink_device(device, new_size);
1285 } 1300 }
1286 1301
1287out_unlock: 1302out_free:
1288 mutex_unlock(&root->fs_info->volume_mutex);
1289 kfree(vol_args); 1303 kfree(vol_args);
1304out:
1305 mutex_unlock(&root->fs_info->volume_mutex);
1290 return ret; 1306 return ret;
1291} 1307}
1292 1308
@@ -2052,14 +2068,25 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2052 if (!capable(CAP_SYS_ADMIN)) 2068 if (!capable(CAP_SYS_ADMIN))
2053 return -EPERM; 2069 return -EPERM;
2054 2070
2071 mutex_lock(&root->fs_info->volume_mutex);
2072 if (root->fs_info->balance_ctl) {
2073 printk(KERN_INFO "btrfs: balance in progress\n");
2074 ret = -EINVAL;
2075 goto out;
2076 }
2077
2055 vol_args = memdup_user(arg, sizeof(*vol_args)); 2078 vol_args = memdup_user(arg, sizeof(*vol_args));
2056 if (IS_ERR(vol_args)) 2079 if (IS_ERR(vol_args)) {
2057 return PTR_ERR(vol_args); 2080 ret = PTR_ERR(vol_args);
2081 goto out;
2082 }
2058 2083
2059 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2084 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2060 ret = btrfs_init_new_device(root, vol_args->name); 2085 ret = btrfs_init_new_device(root, vol_args->name);
2061 2086
2062 kfree(vol_args); 2087 kfree(vol_args);
2088out:
2089 mutex_unlock(&root->fs_info->volume_mutex);
2063 return ret; 2090 return ret;
2064} 2091}
2065 2092
@@ -2074,14 +2101,25 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
2074 if (root->fs_info->sb->s_flags & MS_RDONLY) 2101 if (root->fs_info->sb->s_flags & MS_RDONLY)
2075 return -EROFS; 2102 return -EROFS;
2076 2103
2104 mutex_lock(&root->fs_info->volume_mutex);
2105 if (root->fs_info->balance_ctl) {
2106 printk(KERN_INFO "btrfs: balance in progress\n");
2107 ret = -EINVAL;
2108 goto out;
2109 }
2110
2077 vol_args = memdup_user(arg, sizeof(*vol_args)); 2111 vol_args = memdup_user(arg, sizeof(*vol_args));
2078 if (IS_ERR(vol_args)) 2112 if (IS_ERR(vol_args)) {
2079 return PTR_ERR(vol_args); 2113 ret = PTR_ERR(vol_args);
2114 goto out;
2115 }
2080 2116
2081 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2117 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2082 ret = btrfs_rm_device(root, vol_args->name); 2118 ret = btrfs_rm_device(root, vol_args->name);
2083 2119
2084 kfree(vol_args); 2120 kfree(vol_args);
2121out:
2122 mutex_unlock(&root->fs_info->volume_mutex);
2085 return ret; 2123 return ret;
2086} 2124}
2087 2125
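btrfs_ioctl_resize(), _add_dev() and _rm_dev() now all take volume_mutex before copying their arguments, refuse to run while fs_info->balance_ctl is set, and funnel every exit through a single unlock label. The shape of that guard, with hypothetical names:

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct fs {
        pthread_mutex_t volume_mutex;
        void *balance_ctl;      /* non-NULL while a balance is running */
};

static int volume_op(struct fs *fs, int (*op)(struct fs *))
{
        int ret;

        pthread_mutex_lock(&fs->volume_mutex);
        if (fs->balance_ctl) {          /* refuse to race a balance */
                ret = -EINVAL;
                goto out;
        }
        ret = op(fs);
out:    /* one unlock for every exit path */
        pthread_mutex_unlock(&fs->volume_mutex);
        return ret;
}

static int resize(struct fs *fs) { (void)fs; return 0; }

int main(void)
{
        struct fs fs = { PTHREAD_MUTEX_INITIALIZER, NULL };

        printf("resize: %d\n", volume_op(&fs, resize)); /* 0 */
        fs.balance_ctl = &fs;                           /* pretend a balance runs */
        printf("resize: %d\n", volume_op(&fs, resize)); /* -22 */
        return 0;
}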
@@ -2427,7 +2465,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2427 disko, diskl, 0, 2465 disko, diskl, 0,
2428 root->root_key.objectid, 2466 root->root_key.objectid,
2429 btrfs_ino(inode), 2467 btrfs_ino(inode),
2430 new_key.offset - datao); 2468 new_key.offset - datao,
2469 0);
2431 BUG_ON(ret); 2470 BUG_ON(ret);
2432 } 2471 }
2433 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 2472 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -2977,7 +3016,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2977{ 3016{
2978 int ret = 0; 3017 int ret = 0;
2979 int size; 3018 int size;
2980 u64 extent_offset; 3019 u64 extent_item_pos;
2981 struct btrfs_ioctl_logical_ino_args *loi; 3020 struct btrfs_ioctl_logical_ino_args *loi;
2982 struct btrfs_data_container *inodes = NULL; 3021 struct btrfs_data_container *inodes = NULL;
2983 struct btrfs_path *path = NULL; 3022 struct btrfs_path *path = NULL;
@@ -3008,15 +3047,17 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
3008 } 3047 }
3009 3048
3010 ret = extent_from_logical(root->fs_info, loi->logical, path, &key); 3049 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3050 btrfs_release_path(path);
3011 3051
3012 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) 3052 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3013 ret = -ENOENT; 3053 ret = -ENOENT;
3014 if (ret < 0) 3054 if (ret < 0)
3015 goto out; 3055 goto out;
3016 3056
3017 extent_offset = loi->logical - key.objectid; 3057 extent_item_pos = loi->logical - key.objectid;
3018 ret = iterate_extent_inodes(root->fs_info, path, key.objectid, 3058 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3019 extent_offset, build_ino_list, inodes); 3059 extent_item_pos, build_ino_list,
3060 inodes);
3020 3061
3021 if (ret < 0) 3062 if (ret < 0)
3022 goto out; 3063 goto out;
@@ -3034,6 +3075,163 @@ out:
3034 return ret; 3075 return ret;
3035} 3076}
3036 3077
3078void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
3079 struct btrfs_ioctl_balance_args *bargs)
3080{
3081 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3082
3083 bargs->flags = bctl->flags;
3084
3085 if (atomic_read(&fs_info->balance_running))
3086 bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
3087 if (atomic_read(&fs_info->balance_pause_req))
3088 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
3089 if (atomic_read(&fs_info->balance_cancel_req))
3090 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
3091
3092 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
3093 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
3094 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
3095
3096 if (lock) {
3097 spin_lock(&fs_info->balance_lock);
3098 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3099 spin_unlock(&fs_info->balance_lock);
3100 } else {
3101 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3102 }
3103}
3104
3105static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
3106{
3107 struct btrfs_fs_info *fs_info = root->fs_info;
3108 struct btrfs_ioctl_balance_args *bargs;
3109 struct btrfs_balance_control *bctl;
3110 int ret;
3111
3112 if (!capable(CAP_SYS_ADMIN))
3113 return -EPERM;
3114
3115 if (fs_info->sb->s_flags & MS_RDONLY)
3116 return -EROFS;
3117
3118 mutex_lock(&fs_info->volume_mutex);
3119 mutex_lock(&fs_info->balance_mutex);
3120
3121 if (arg) {
3122 bargs = memdup_user(arg, sizeof(*bargs));
3123 if (IS_ERR(bargs)) {
3124 ret = PTR_ERR(bargs);
3125 goto out;
3126 }
3127
3128 if (bargs->flags & BTRFS_BALANCE_RESUME) {
3129 if (!fs_info->balance_ctl) {
3130 ret = -ENOTCONN;
3131 goto out_bargs;
3132 }
3133
3134 bctl = fs_info->balance_ctl;
3135 spin_lock(&fs_info->balance_lock);
3136 bctl->flags |= BTRFS_BALANCE_RESUME;
3137 spin_unlock(&fs_info->balance_lock);
3138
3139 goto do_balance;
3140 }
3141 } else {
3142 bargs = NULL;
3143 }
3144
3145 if (fs_info->balance_ctl) {
3146 ret = -EINPROGRESS;
3147 goto out_bargs;
3148 }
3149
3150 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3151 if (!bctl) {
3152 ret = -ENOMEM;
3153 goto out_bargs;
3154 }
3155
3156 bctl->fs_info = fs_info;
3157 if (arg) {
3158 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
3159 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
3160 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
3161
3162 bctl->flags = bargs->flags;
3163 } else {
3164 /* balance everything - no filters */
3165 bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
3166 }
3167
3168do_balance:
3169 ret = btrfs_balance(bctl, bargs);
3170 /*
3171 * bctl is freed in __cancel_balance or in free_fs_info if
3172 * restriper was paused all the way until unmount
3173 */
3174 if (arg) {
3175 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3176 ret = -EFAULT;
3177 }
3178
3179out_bargs:
3180 kfree(bargs);
3181out:
3182 mutex_unlock(&fs_info->balance_mutex);
3183 mutex_unlock(&fs_info->volume_mutex);
3184 return ret;
3185}
3186
3187static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
3188{
3189 if (!capable(CAP_SYS_ADMIN))
3190 return -EPERM;
3191
3192 switch (cmd) {
3193 case BTRFS_BALANCE_CTL_PAUSE:
3194 return btrfs_pause_balance(root->fs_info);
3195 case BTRFS_BALANCE_CTL_CANCEL:
3196 return btrfs_cancel_balance(root->fs_info);
3197 }
3198
3199 return -EINVAL;
3200}
3201
3202static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
3203 void __user *arg)
3204{
3205 struct btrfs_fs_info *fs_info = root->fs_info;
3206 struct btrfs_ioctl_balance_args *bargs;
3207 int ret = 0;
3208
3209 if (!capable(CAP_SYS_ADMIN))
3210 return -EPERM;
3211
3212 mutex_lock(&fs_info->balance_mutex);
3213 if (!fs_info->balance_ctl) {
3214 ret = -ENOTCONN;
3215 goto out;
3216 }
3217
3218 bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
3219 if (!bargs) {
3220 ret = -ENOMEM;
3221 goto out;
3222 }
3223
3224 update_ioctl_balance_args(fs_info, 1, bargs);
3225
3226 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3227 ret = -EFAULT;
3228
3229 kfree(bargs);
3230out:
3231 mutex_unlock(&fs_info->balance_mutex);
3232 return ret;
3233}
3234
3037long btrfs_ioctl(struct file *file, unsigned int 3235long btrfs_ioctl(struct file *file, unsigned int
3038 cmd, unsigned long arg) 3236 cmd, unsigned long arg)
3039{ 3237{
@@ -3078,7 +3276,7 @@ long btrfs_ioctl(struct file *file, unsigned int
3078 case BTRFS_IOC_DEV_INFO: 3276 case BTRFS_IOC_DEV_INFO:
3079 return btrfs_ioctl_dev_info(root, argp); 3277 return btrfs_ioctl_dev_info(root, argp);
3080 case BTRFS_IOC_BALANCE: 3278 case BTRFS_IOC_BALANCE:
3081 return btrfs_balance(root->fs_info->dev_root); 3279 return btrfs_ioctl_balance(root, NULL);
3082 case BTRFS_IOC_CLONE: 3280 case BTRFS_IOC_CLONE:
3083 return btrfs_ioctl_clone(file, arg, 0, 0, 0); 3281 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
3084 case BTRFS_IOC_CLONE_RANGE: 3282 case BTRFS_IOC_CLONE_RANGE:
@@ -3110,6 +3308,12 @@ long btrfs_ioctl(struct file *file, unsigned int
3110 return btrfs_ioctl_scrub_cancel(root, argp); 3308 return btrfs_ioctl_scrub_cancel(root, argp);
3111 case BTRFS_IOC_SCRUB_PROGRESS: 3309 case BTRFS_IOC_SCRUB_PROGRESS:
3112 return btrfs_ioctl_scrub_progress(root, argp); 3310 return btrfs_ioctl_scrub_progress(root, argp);
3311 case BTRFS_IOC_BALANCE_V2:
3312 return btrfs_ioctl_balance(root, argp);
3313 case BTRFS_IOC_BALANCE_CTL:
3314 return btrfs_ioctl_balance_ctl(root, arg);
3315 case BTRFS_IOC_BALANCE_PROGRESS:
3316 return btrfs_ioctl_balance_progress(root, argp);
3113 } 3317 }
3114 3318
3115 return -ENOTTY; 3319 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 252ae9915de8..4f69028a68c4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -109,6 +109,55 @@ struct btrfs_ioctl_fs_info_args {
109 __u64 reserved[124]; /* pad to 1k */ 109 __u64 reserved[124]; /* pad to 1k */
110}; 110};
111 111
112/* balance control ioctl modes */
113#define BTRFS_BALANCE_CTL_PAUSE 1
114#define BTRFS_BALANCE_CTL_CANCEL 2
115
116/*
117 * this is packed, because it should be exactly the same as its disk
118 * byte order counterpart (struct btrfs_disk_balance_args)
119 */
120struct btrfs_balance_args {
121 __u64 profiles;
122 __u64 usage;
123 __u64 devid;
124 __u64 pstart;
125 __u64 pend;
126 __u64 vstart;
127 __u64 vend;
128
129 __u64 target;
130
131 __u64 flags;
132
133 __u64 unused[8];
134} __attribute__ ((__packed__));
135
136/* report balance progress to userspace */
137struct btrfs_balance_progress {
138 __u64 expected; /* estimated # of chunks that will be
139 * relocated to fulfill the request */
140 __u64 considered; /* # of chunks we have considered so far */
141 __u64 completed; /* # of chunks relocated so far */
142};
143
144#define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0)
145#define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1)
146#define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2)
147
148struct btrfs_ioctl_balance_args {
149 __u64 flags; /* in/out */
150 __u64 state; /* out */
151
152 struct btrfs_balance_args data; /* in/out */
153 struct btrfs_balance_args meta; /* in/out */
154 struct btrfs_balance_args sys; /* in/out */
155
156 struct btrfs_balance_progress stat; /* out */
157
158 __u64 unused[72]; /* pad to 1k */
159};
160
112#define BTRFS_INO_LOOKUP_PATH_MAX 4080 161#define BTRFS_INO_LOOKUP_PATH_MAX 4080
113struct btrfs_ioctl_ino_lookup_args { 162struct btrfs_ioctl_ino_lookup_args {
114 __u64 treeid; 163 __u64 treeid;
@@ -272,6 +321,11 @@ struct btrfs_ioctl_logical_ino_args {
272 struct btrfs_ioctl_dev_info_args) 321 struct btrfs_ioctl_dev_info_args)
273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 322#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
274 struct btrfs_ioctl_fs_info_args) 323 struct btrfs_ioctl_fs_info_args)
324#define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \
325 struct btrfs_ioctl_balance_args)
326#define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int)
327#define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \
328 struct btrfs_ioctl_balance_args)
275#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ 329#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
276 struct btrfs_ioctl_ino_path_args) 330 struct btrfs_ioctl_ino_path_args)
277#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ 331#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
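Given the definitions above, userspace can drive the new interface directly. A sketch that pauses a running balance and reads back the counters update_ioctl_balance_args() fills in; it assumes this patched ioctl.h is on the include path and that /mnt is a btrfs mount:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include "ioctl.h"              /* the header patched above */

int main(void)
{
        struct btrfs_ioctl_balance_args args = { 0 };
        int fd = open("/mnt", O_RDONLY);

        if (fd < 0)
                return 1;

        /* ask the kernel to pause the balance ... */
        if (ioctl(fd, BTRFS_IOC_BALANCE_CTL, BTRFS_BALANCE_CTL_PAUSE) < 0)
                perror("pause");

        /* ... then query the live counters */
        if (ioctl(fd, BTRFS_IOC_BALANCE_PROGRESS, &args) == 0)
                printf("considered %llu, completed %llu of ~%llu chunks\n",
                       (unsigned long long)args.stat.considered,
                       (unsigned long long)args.stat.completed,
                       (unsigned long long)args.stat.expected);
        return 0;
}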
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d77b67c4b275..5e178d8f7167 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,6 +33,14 @@ void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
33 */ 33 */
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
35{ 35{
36 if (eb->lock_nested) {
37 read_lock(&eb->lock);
38 if (eb->lock_nested && current->pid == eb->lock_owner) {
39 read_unlock(&eb->lock);
40 return;
41 }
42 read_unlock(&eb->lock);
43 }
36 if (rw == BTRFS_WRITE_LOCK) { 44 if (rw == BTRFS_WRITE_LOCK) {
37 if (atomic_read(&eb->blocking_writers) == 0) { 45 if (atomic_read(&eb->blocking_writers) == 0) {
38 WARN_ON(atomic_read(&eb->spinning_writers) != 1); 46 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,6 +65,14 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
57 */ 65 */
58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) 66void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
59{ 67{
68 if (eb->lock_nested) {
69 read_lock(&eb->lock);
70	if (eb->lock_nested && current->pid == eb->lock_owner) {
71 read_unlock(&eb->lock);
72 return;
73 }
74 read_unlock(&eb->lock);
75 }
60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) { 76 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
61 BUG_ON(atomic_read(&eb->blocking_writers) != 1); 77 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
62 write_lock(&eb->lock); 78 write_lock(&eb->lock);
@@ -81,12 +97,25 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
81void btrfs_tree_read_lock(struct extent_buffer *eb) 97void btrfs_tree_read_lock(struct extent_buffer *eb)
82{ 98{
83again: 99again:
100 read_lock(&eb->lock);
101 if (atomic_read(&eb->blocking_writers) &&
102 current->pid == eb->lock_owner) {
103 /*
104 * This extent is already write-locked by our thread. We allow
105 * an additional read lock to be added because it's for the same
106 * thread. btrfs_find_all_roots() depends on this as it may be
107 * called on a partly (write-)locked tree.
108 */
109 BUG_ON(eb->lock_nested);
110 eb->lock_nested = 1;
111 read_unlock(&eb->lock);
112 return;
113 }
114 read_unlock(&eb->lock);
84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); 115 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
85 read_lock(&eb->lock); 116 read_lock(&eb->lock);
86 if (atomic_read(&eb->blocking_writers)) { 117 if (atomic_read(&eb->blocking_writers)) {
87 read_unlock(&eb->lock); 118 read_unlock(&eb->lock);
88 wait_event(eb->write_lock_wq,
89 atomic_read(&eb->blocking_writers) == 0);
90 goto again; 119 goto again;
91 } 120 }
92 atomic_inc(&eb->read_locks); 121 atomic_inc(&eb->read_locks);
@@ -129,6 +158,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
129 } 158 }
130 atomic_inc(&eb->write_locks); 159 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers); 160 atomic_inc(&eb->spinning_writers);
161 eb->lock_owner = current->pid;
132 return 1; 162 return 1;
133} 163}
134 164
@@ -137,6 +167,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
137 */ 167 */
138void btrfs_tree_read_unlock(struct extent_buffer *eb) 168void btrfs_tree_read_unlock(struct extent_buffer *eb)
139{ 169{
170 if (eb->lock_nested) {
171 read_lock(&eb->lock);
172 if (eb->lock_nested && current->pid == eb->lock_owner) {
173 eb->lock_nested = 0;
174 read_unlock(&eb->lock);
175 return;
176 }
177 read_unlock(&eb->lock);
178 }
140 btrfs_assert_tree_read_locked(eb); 179 btrfs_assert_tree_read_locked(eb);
141 WARN_ON(atomic_read(&eb->spinning_readers) == 0); 180 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
142 atomic_dec(&eb->spinning_readers); 181 atomic_dec(&eb->spinning_readers);
@@ -149,6 +188,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
149 */ 188 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) 189void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{ 190{
191 if (eb->lock_nested) {
192 read_lock(&eb->lock);
193 if (eb->lock_nested && current->pid == eb->lock_owner) {
194 eb->lock_nested = 0;
195 read_unlock(&eb->lock);
196 return;
197 }
198 read_unlock(&eb->lock);
199 }
152 btrfs_assert_tree_read_locked(eb); 200 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 201 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers)) 202 if (atomic_dec_and_test(&eb->blocking_readers))
@@ -181,6 +229,7 @@ again:
181 WARN_ON(atomic_read(&eb->spinning_writers)); 229 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers); 230 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks); 231 atomic_inc(&eb->write_locks);
232 eb->lock_owner = current->pid;
184 return 0; 233 return 0;
185} 234}
186 235
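Every hunk in this file adds the same guard: when lock_nested may be set, peek under the spinning read lock to see whether the current task already owns the write lock, and if so treat the read lock or unlock as a recursion no-op. A much-simplified userspace model of the owner check, with pthreads standing in for the kernel rwlock, gettid for current->pid, and none of the memory-ordering care the kernel version needs:

#define _GNU_SOURCE
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>

struct ebuf {
        pthread_rwlock_t lock;
        pid_t lock_owner;       /* writer's tid, like eb->lock_owner */
        int lock_nested;        /* one recursive reader allowed */
};

static pid_t gettid_(void) { return (pid_t)syscall(SYS_gettid); }

static void tree_read_lock(struct ebuf *eb)
{
        if (eb->lock_owner == gettid_()) {
                /* we already hold the write lock: record the recursion
                 * instead of deadlocking against ourselves */
                eb->lock_nested = 1;
                return;
        }
        pthread_rwlock_rdlock(&eb->lock);
}

static void tree_read_unlock(struct ebuf *eb)
{
        if (eb->lock_nested && eb->lock_owner == gettid_()) {
                eb->lock_nested = 0;    /* drop the recursion marker */
                return;
        }
        pthread_rwlock_unlock(&eb->lock);
}

int main(void)
{
        struct ebuf eb = { PTHREAD_RWLOCK_INITIALIZER, 0, 0 };

        pthread_rwlock_wrlock(&eb.lock);
        eb.lock_owner = gettid_();

        tree_read_lock(&eb);    /* nested: returns without blocking */
        tree_read_unlock(&eb);

        eb.lock_owner = 0;
        pthread_rwlock_unlock(&eb.lock);
        puts("ok");
        return 0;
}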
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfb55434a469..8c1aae2c845d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1604,12 +1604,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1604 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1605 num_bytes, parent, 1605 num_bytes, parent,
1606 btrfs_header_owner(leaf), 1606 btrfs_header_owner(leaf),
1607 key.objectid, key.offset); 1607 key.objectid, key.offset, 1);
1608 BUG_ON(ret); 1608 BUG_ON(ret);
1609 1609
1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1610 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1611 parent, btrfs_header_owner(leaf), 1611 parent, btrfs_header_owner(leaf),
1612 key.objectid, key.offset); 1612 key.objectid, key.offset, 1);
1613 BUG_ON(ret); 1613 BUG_ON(ret);
1614 } 1614 }
1615 if (dirty) 1615 if (dirty)
@@ -1778,21 +1778,23 @@ again:
1778 1778
1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, 1779 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1780 path->nodes[level]->start, 1780 path->nodes[level]->start,
1781 src->root_key.objectid, level - 1, 0); 1781 src->root_key.objectid, level - 1, 0,
1782 1);
1782 BUG_ON(ret); 1783 BUG_ON(ret);
1783 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 1784 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1784 0, dest->root_key.objectid, level - 1, 1785 0, dest->root_key.objectid, level - 1,
1785 0); 1786 0, 1);
1786 BUG_ON(ret); 1787 BUG_ON(ret);
1787 1788
1788 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1789 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1789 path->nodes[level]->start, 1790 path->nodes[level]->start,
1790 src->root_key.objectid, level - 1, 0); 1791 src->root_key.objectid, level - 1, 0,
1792 1);
1791 BUG_ON(ret); 1793 BUG_ON(ret);
1792 1794
1793 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1795 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1794 0, dest->root_key.objectid, level - 1, 1796 0, dest->root_key.objectid, level - 1,
1795 0); 1797 0, 1);
1796 BUG_ON(ret); 1798 BUG_ON(ret);
1797 1799
1798 btrfs_unlock_up_safe(path, 0); 1800 btrfs_unlock_up_safe(path, 0);
@@ -2244,7 +2246,7 @@ again:
2244 } else { 2246 } else {
2245 list_del_init(&reloc_root->root_list); 2247 list_del_init(&reloc_root->root_list);
2246 } 2248 }
2247 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0); 2249 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
2248 } 2250 }
2249 2251
2250 if (found) { 2252 if (found) {
@@ -2558,7 +2560,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2558 node->eb->start, blocksize, 2560 node->eb->start, blocksize,
2559 upper->eb->start, 2561 upper->eb->start,
2560 btrfs_header_owner(upper->eb), 2562 btrfs_header_owner(upper->eb),
2561 node->level, 0); 2563 node->level, 0, 1);
2562 BUG_ON(ret); 2564 BUG_ON(ret);
2563 2565
2564 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2566 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2947,9 +2949,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2947 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2949 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2948 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2950 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2949 while (index <= last_index) { 2951 while (index <= last_index) {
2950 mutex_lock(&inode->i_mutex);
2951 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE); 2952 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2952 mutex_unlock(&inode->i_mutex);
2953 if (ret) 2953 if (ret)
2954 goto out; 2954 goto out;
2955 2955
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ddf2c90d3fc0..9770cc5bfb76 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -25,6 +25,7 @@
25#include "transaction.h" 25#include "transaction.h"
26#include "backref.h" 26#include "backref.h"
27#include "extent_io.h" 27#include "extent_io.h"
28#include "check-integrity.h"
28 29
29/* 30/*
30 * This is only the first step towards a full-features scrub. It reads all 31 * This is only the first step towards a full-features scrub. It reads all
@@ -309,7 +310,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
309 u8 ref_level; 310 u8 ref_level;
310 unsigned long ptr = 0; 311 unsigned long ptr = 0;
311 const int bufsize = 4096; 312 const int bufsize = 4096;
312 u64 extent_offset; 313 u64 extent_item_pos;
313 314
314 path = btrfs_alloc_path(); 315 path = btrfs_alloc_path();
315 316
@@ -329,12 +330,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
329 if (ret < 0) 330 if (ret < 0)
330 goto out; 331 goto out;
331 332
332 extent_offset = swarn.logical - found_key.objectid; 333 extent_item_pos = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset; 334 swarn.extent_item_size = found_key.offset;
334 335
335 eb = path->nodes[0]; 336 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); 337 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]); 338 item_size = btrfs_item_size_nr(eb, path->slots[0]);
339 btrfs_release_path(path);
338 340
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 341 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do { 342 do {
@@ -351,7 +353,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
351 } else { 353 } else {
352 swarn.path = path; 354 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid, 355 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset, 356 extent_item_pos,
355 scrub_print_warning_inode, &swarn); 357 scrub_print_warning_inode, &swarn);
356 } 358 }
357 359
@@ -732,7 +734,7 @@ static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
732 bio_add_page(bio, page, PAGE_SIZE, 0); 734 bio_add_page(bio, page, PAGE_SIZE, 0);
733 bio->bi_end_io = scrub_fixup_end_io; 735 bio->bi_end_io = scrub_fixup_end_io;
734 bio->bi_private = &complete; 736 bio->bi_private = &complete;
735 submit_bio(rw, bio); 737 btrfsic_submit_bio(rw, bio);
736 738
737 /* this will also unplug the queue */ 739 /* this will also unplug the queue */
738 wait_for_completion(&complete); 740 wait_for_completion(&complete);
@@ -958,7 +960,7 @@ static int scrub_submit(struct scrub_dev *sdev)
958 sdev->curr = -1; 960 sdev->curr = -1;
959 atomic_inc(&sdev->in_flight); 961 atomic_inc(&sdev->in_flight);
960 962
961 submit_bio(READ, sbio->bio); 963 btrfsic_submit_bio(READ, sbio->bio);
962 964
963 return 0; 965 return 0;
964} 966}
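btrfsic_submit_bio() substitutes for submit_bio() at each submission site: with the integrity checker compiled out it should collapse to a plain call, otherwise it can inspect the bio before queueing it. A toy model of that conditional pass-through; the verify() hook and the CHECK_INTEGRITY macro are illustrative, not the real check-integrity.c internals:

#include <stdio.h>

struct bio { int dummy; };

static void submit_bio(int rw, struct bio *bio)
{
        (void)bio;
        printf("submit rw=%d\n", rw);
}

#ifdef CHECK_INTEGRITY  /* stands in for CONFIG_BTRFS_FS_CHECK_INTEGRITY */
static void verify(int rw, struct bio *bio)
{
        (void)rw; (void)bio;    /* inspect the bio before it is queued */
}

static void btrfsic_submit_bio(int rw, struct bio *bio)
{
        verify(rw, bio);
        submit_bio(rw, bio);
}
#else
#define btrfsic_submit_bio submit_bio   /* zero cost when disabled */
#endif

int main(void)
{
        struct bio b = { 0 };

        btrfsic_submit_bio(0, &b);
        return 0;
}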
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ae488aa1966a..3ce97b217cbe 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -147,13 +147,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
147 147
148static void btrfs_put_super(struct super_block *sb) 148static void btrfs_put_super(struct super_block *sb)
149{ 149{
150 struct btrfs_root *root = btrfs_sb(sb); 150 (void)close_ctree(btrfs_sb(sb)->tree_root);
151 int ret; 151 /* FIXME: need to fix VFS to return error? */
152 152 /* AV: return it _where_? ->put_super() can be triggered by any number
153 ret = close_ctree(root); 153 * of async events, up to and including delivery of SIGKILL to the
154 sb->s_fs_info = NULL; 154 * last process that kept it busy. Or segfault in the aforementioned
155 155 * process... Whom would you report that to?
156 (void)ret; /* FIXME: need to fix VFS to return error? */ 156 */
157} 157}
158 158
159enum { 159enum {
@@ -163,8 +163,11 @@ enum {
163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 163 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err, 167 Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
168 Opt_check_integrity, Opt_check_integrity_including_extent_data,
169 Opt_check_integrity_print_mask,
170 Opt_err,
168}; 171};
169 172
170static match_table_t tokens = { 173static match_table_t tokens = {
@@ -199,6 +202,10 @@ static match_table_t tokens = {
199 {Opt_inode_cache, "inode_cache"}, 202 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"}, 203 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"}, 204 {Opt_recovery, "recovery"},
205 {Opt_skip_balance, "skip_balance"},
206 {Opt_check_integrity, "check_int"},
207 {Opt_check_integrity_including_extent_data, "check_int_data"},
208 {Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
202 {Opt_err, NULL}, 209 {Opt_err, NULL},
203}; 210};
204 211
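The new tokens are ordinary mount options, so from userspace they travel in the data string of mount(2). For example (device, mount point and mask value are illustrative):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* skip resuming an interrupted balance, enable the integrity
         * checker and give it a verbose print mask */
        if (mount("/dev/sdb1", "/mnt", "btrfs", 0,
                  "skip_balance,check_int,check_int_print_mask=7") < 0)
                perror("mount");
        return 0;
}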
@@ -397,6 +404,40 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
397 printk(KERN_INFO "btrfs: enabling auto recovery"); 404 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY); 405 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break; 406 break;
407 case Opt_skip_balance:
408 btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
409 break;
410#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
411 case Opt_check_integrity_including_extent_data:
412 printk(KERN_INFO "btrfs: enabling check integrity"
413 " including extent data\n");
414 btrfs_set_opt(info->mount_opt,
415 CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
416 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
417 break;
418 case Opt_check_integrity:
419 printk(KERN_INFO "btrfs: enabling check integrity\n");
420 btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
421 break;
422 case Opt_check_integrity_print_mask:
423 intarg = 0;
424 match_int(&args[0], &intarg);
425 if (intarg) {
426 info->check_integrity_print_mask = intarg;
427 printk(KERN_INFO "btrfs:"
428 " check_integrity_print_mask 0x%x\n",
429 info->check_integrity_print_mask);
430 }
431 break;
432#else
433 case Opt_check_integrity_including_extent_data:
434 case Opt_check_integrity:
435 case Opt_check_integrity_print_mask:
436 printk(KERN_ERR "btrfs: support for check_integrity*"
437 " not compiled in!\n");
438 ret = -EINVAL;
439 goto out;
440#endif
400 case Opt_err: 441 case Opt_err:
401 printk(KERN_INFO "btrfs: unrecognized mount option " 442 printk(KERN_INFO "btrfs: unrecognized mount option "
402 "'%s'\n", p); 443 "'%s'\n", p);
@@ -500,7 +541,8 @@ out:
500static struct dentry *get_default_root(struct super_block *sb, 541static struct dentry *get_default_root(struct super_block *sb,
501 u64 subvol_objectid) 542 u64 subvol_objectid)
502{ 543{
503 struct btrfs_root *root = sb->s_fs_info; 544 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
545 struct btrfs_root *root = fs_info->tree_root;
504 struct btrfs_root *new_root; 546 struct btrfs_root *new_root;
505 struct btrfs_dir_item *di; 547 struct btrfs_dir_item *di;
506 struct btrfs_path *path; 548 struct btrfs_path *path;
@@ -530,7 +572,7 @@ static struct dentry *get_default_root(struct super_block *sb,
530 * will mount by default if we haven't been given a specific subvolume 572 * will mount by default if we haven't been given a specific subvolume
531 * to mount. 573 * to mount.
532 */ 574 */
533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy); 575 dir_id = btrfs_super_root_dir(fs_info->super_copy);
534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 576 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
535 if (IS_ERR(di)) { 577 if (IS_ERR(di)) {
536 btrfs_free_path(path); 578 btrfs_free_path(path);
@@ -544,7 +586,7 @@ static struct dentry *get_default_root(struct super_block *sb,
544 */ 586 */
545 btrfs_free_path(path); 587 btrfs_free_path(path);
546 dir_id = BTRFS_FIRST_FREE_OBJECTID; 588 dir_id = BTRFS_FIRST_FREE_OBJECTID;
547 new_root = root->fs_info->fs_root; 589 new_root = fs_info->fs_root;
548 goto setup_root; 590 goto setup_root;
549 } 591 }
550 592
@@ -552,7 +594,7 @@ static struct dentry *get_default_root(struct super_block *sb,
552 btrfs_free_path(path); 594 btrfs_free_path(path);
553 595
554find_root: 596find_root:
555 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 597 new_root = btrfs_read_fs_root_no_name(fs_info, &location);
556 if (IS_ERR(new_root)) 598 if (IS_ERR(new_root))
557 return ERR_CAST(new_root); 599 return ERR_CAST(new_root);
558 600
@@ -588,7 +630,7 @@ static int btrfs_fill_super(struct super_block *sb,
588{ 630{
589 struct inode *inode; 631 struct inode *inode;
590 struct dentry *root_dentry; 632 struct dentry *root_dentry;
591 struct btrfs_root *tree_root; 633 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
592 struct btrfs_key key; 634 struct btrfs_key key;
593 int err; 635 int err;
594 636
@@ -603,18 +645,16 @@ static int btrfs_fill_super(struct super_block *sb,
603 sb->s_flags |= MS_POSIXACL; 645 sb->s_flags |= MS_POSIXACL;
604#endif 646#endif
605 647
606 tree_root = open_ctree(sb, fs_devices, (char *)data); 648 err = open_ctree(sb, fs_devices, (char *)data);
607 649 if (err) {
608 if (IS_ERR(tree_root)) {
609 printk("btrfs: open_ctree failed\n"); 650 printk("btrfs: open_ctree failed\n");
610 return PTR_ERR(tree_root); 651 return err;
611 } 652 }
612 sb->s_fs_info = tree_root;
613 653
614 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 654 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
615 key.type = BTRFS_INODE_ITEM_KEY; 655 key.type = BTRFS_INODE_ITEM_KEY;
616 key.offset = 0; 656 key.offset = 0;
617 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL); 657 inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
618 if (IS_ERR(inode)) { 658 if (IS_ERR(inode)) {
619 err = PTR_ERR(inode); 659 err = PTR_ERR(inode);
620 goto fail_close; 660 goto fail_close;
@@ -631,23 +671,25 @@ static int btrfs_fill_super(struct super_block *sb,
631 671
632 save_mount_options(sb, data); 672 save_mount_options(sb, data);
633 cleancache_init_fs(sb); 673 cleancache_init_fs(sb);
674 sb->s_flags |= MS_ACTIVE;
634 return 0; 675 return 0;
635 676
636fail_close: 677fail_close:
637 close_ctree(tree_root); 678 close_ctree(fs_info->tree_root);
638 return err; 679 return err;
639} 680}
640 681
641int btrfs_sync_fs(struct super_block *sb, int wait) 682int btrfs_sync_fs(struct super_block *sb, int wait)
642{ 683{
643 struct btrfs_trans_handle *trans; 684 struct btrfs_trans_handle *trans;
644 struct btrfs_root *root = btrfs_sb(sb); 685 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
686 struct btrfs_root *root = fs_info->tree_root;
645 int ret; 687 int ret;
646 688
647 trace_btrfs_sync_fs(wait); 689 trace_btrfs_sync_fs(wait);
648 690
649 if (!wait) { 691 if (!wait) {
650 filemap_flush(root->fs_info->btree_inode->i_mapping); 692 filemap_flush(fs_info->btree_inode->i_mapping);
651 return 0; 693 return 0;
652 } 694 }
653 695
@@ -663,8 +705,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
663 705
664static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) 706static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
665{ 707{
666 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 708 struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
667 struct btrfs_fs_info *info = root->fs_info; 709 struct btrfs_root *root = info->tree_root;
668 char *compress_type; 710 char *compress_type;
669 711
670 if (btrfs_test_opt(root, DEGRADED)) 712 if (btrfs_test_opt(root, DEGRADED))
@@ -722,28 +764,25 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
722 seq_puts(seq, ",autodefrag"); 764 seq_puts(seq, ",autodefrag");
723 if (btrfs_test_opt(root, INODE_MAP_CACHE)) 765 if (btrfs_test_opt(root, INODE_MAP_CACHE))
724 seq_puts(seq, ",inode_cache"); 766 seq_puts(seq, ",inode_cache");
767 if (btrfs_test_opt(root, SKIP_BALANCE))
768 seq_puts(seq, ",skip_balance");
725 return 0; 769 return 0;
726} 770}
727 771
728static int btrfs_test_super(struct super_block *s, void *data) 772static int btrfs_test_super(struct super_block *s, void *data)
729{ 773{
730 struct btrfs_root *test_root = data; 774 struct btrfs_fs_info *p = data;
731 struct btrfs_root *root = btrfs_sb(s); 775 struct btrfs_fs_info *fs_info = btrfs_sb(s);
732 776
733 /* 777 return fs_info->fs_devices == p->fs_devices;
734 * If this super block is going away, return false as it
735 * can't match as an existing super block.
736 */
737 if (!atomic_read(&s->s_active))
738 return 0;
739 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
740} 778}
741 779
742static int btrfs_set_super(struct super_block *s, void *data) 780static int btrfs_set_super(struct super_block *s, void *data)
743{ 781{
744 s->s_fs_info = data; 782 int err = set_anon_super(s, data);
745 783 if (!err)
746 return set_anon_super(s, data); 784 s->s_fs_info = data;
785 return err;
747} 786}
748 787
749/* 788/*
@@ -903,12 +942,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
903 if (!fs_info) 942 if (!fs_info)
904 return ERR_PTR(-ENOMEM); 943 return ERR_PTR(-ENOMEM);
905 944
906 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
907 if (!fs_info->tree_root) {
908 error = -ENOMEM;
909 goto error_fs_info;
910 }
911 fs_info->tree_root->fs_info = fs_info;
912 fs_info->fs_devices = fs_devices; 945 fs_info->fs_devices = fs_devices;
913 946
914 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 947 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -928,43 +961,30 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
928 } 961 }
929 962
930 bdev = fs_devices->latest_bdev; 963 bdev = fs_devices->latest_bdev;
931 s = sget(fs_type, btrfs_test_super, btrfs_set_super, 964 s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
932 fs_info->tree_root);
933 if (IS_ERR(s)) { 965 if (IS_ERR(s)) {
934 error = PTR_ERR(s); 966 error = PTR_ERR(s);
935 goto error_close_devices; 967 goto error_close_devices;
936 } 968 }
937 969
938 if (s->s_root) { 970 if (s->s_root) {
939 if ((flags ^ s->s_flags) & MS_RDONLY) {
940 deactivate_locked_super(s);
941 error = -EBUSY;
942 goto error_close_devices;
943 }
944
945 btrfs_close_devices(fs_devices); 971 btrfs_close_devices(fs_devices);
946 free_fs_info(fs_info); 972 free_fs_info(fs_info);
973 if ((flags ^ s->s_flags) & MS_RDONLY)
974 error = -EBUSY;
947 } else { 975 } else {
948 char b[BDEVNAME_SIZE]; 976 char b[BDEVNAME_SIZE];
949 977
950 s->s_flags = flags | MS_NOSEC; 978 s->s_flags = flags | MS_NOSEC;
951 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 979 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
952 btrfs_sb(s)->fs_info->bdev_holder = fs_type; 980 btrfs_sb(s)->bdev_holder = fs_type;
953 error = btrfs_fill_super(s, fs_devices, data, 981 error = btrfs_fill_super(s, fs_devices, data,
954 flags & MS_SILENT ? 1 : 0); 982 flags & MS_SILENT ? 1 : 0);
955 if (error) {
956 deactivate_locked_super(s);
957 return ERR_PTR(error);
958 }
959
960 s->s_flags |= MS_ACTIVE;
961 } 983 }
962 984
963 root = get_default_root(s, subvol_objectid); 985 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
964 if (IS_ERR(root)) { 986 if (IS_ERR(root))
965 deactivate_locked_super(s); 987 deactivate_locked_super(s);
966 return root;
967 }
968 988
969 return root; 989 return root;
970 990
@@ -977,7 +997,8 @@ error_fs_info:
977 997
978static int btrfs_remount(struct super_block *sb, int *flags, char *data) 998static int btrfs_remount(struct super_block *sb, int *flags, char *data)
979{ 999{
980 struct btrfs_root *root = btrfs_sb(sb); 1000 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1001 struct btrfs_root *root = fs_info->tree_root;
981 int ret; 1002 int ret;
982 1003
983 ret = btrfs_parse_options(root, data); 1004 ret = btrfs_parse_options(root, data);
@@ -993,13 +1014,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
993 ret = btrfs_commit_super(root); 1014 ret = btrfs_commit_super(root);
994 WARN_ON(ret); 1015 WARN_ON(ret);
995 } else { 1016 } else {
996 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (fs_info->fs_devices->rw_devices == 0)
997 return -EACCES; 1018 return -EACCES;
998 1019
999 if (btrfs_super_log_root(root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(fs_info->super_copy) != 0)
1000 return -EINVAL; 1021 return -EINVAL;
1001 1022
1002 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(fs_info);
1003 WARN_ON(ret); 1024 WARN_ON(ret);
1004 1025
1005 /* recover relocation */ 1026 /* recover relocation */
@@ -1168,18 +1189,18 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1168 1189
1169static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1190static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1170{ 1191{
1171 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1192 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
1172 struct btrfs_super_block *disk_super = root->fs_info->super_copy; 1193 struct btrfs_super_block *disk_super = fs_info->super_copy;
1173 struct list_head *head = &root->fs_info->space_info; 1194 struct list_head *head = &fs_info->space_info;
1174 struct btrfs_space_info *found; 1195 struct btrfs_space_info *found;
1175 u64 total_used = 0; 1196 u64 total_used = 0;
1176 u64 total_free_data = 0; 1197 u64 total_free_data = 0;
1177 int bits = dentry->d_sb->s_blocksize_bits; 1198 int bits = dentry->d_sb->s_blocksize_bits;
1178 __be32 *fsid = (__be32 *)root->fs_info->fsid; 1199 __be32 *fsid = (__be32 *)fs_info->fsid;
1179 int ret; 1200 int ret;
1180 1201
1181 /* holding chunk_mutex to avoid allocating new chunks */ 1202 /* holding chunk_mutex to avoid allocating new chunks */
1182 mutex_lock(&root->fs_info->chunk_mutex); 1203 mutex_lock(&fs_info->chunk_mutex);
1183 rcu_read_lock(); 1204 rcu_read_lock();
1184 list_for_each_entry_rcu(found, head, list) { 1205 list_for_each_entry_rcu(found, head, list) {
1185 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1206 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1198,14 +1219,14 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1198 buf->f_bsize = dentry->d_sb->s_blocksize; 1219 buf->f_bsize = dentry->d_sb->s_blocksize;
1199 buf->f_type = BTRFS_SUPER_MAGIC; 1220 buf->f_type = BTRFS_SUPER_MAGIC;
1200 buf->f_bavail = total_free_data; 1221 buf->f_bavail = total_free_data;
1201 ret = btrfs_calc_avail_data_space(root, &total_free_data); 1222 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1202 if (ret) { 1223 if (ret) {
1203 mutex_unlock(&root->fs_info->chunk_mutex); 1224 mutex_unlock(&fs_info->chunk_mutex);
1204 return ret; 1225 return ret;
1205 } 1226 }
1206 buf->f_bavail += total_free_data; 1227 buf->f_bavail += total_free_data;
1207 buf->f_bavail = buf->f_bavail >> bits; 1228 buf->f_bavail = buf->f_bavail >> bits;
1208 mutex_unlock(&root->fs_info->chunk_mutex); 1229 mutex_unlock(&fs_info->chunk_mutex);
1209 1230
1210 /* We treat it as constant endianness (it doesn't matter _which_) 1231 /* We treat it as constant endianness (it doesn't matter _which_)
1211 because we want the fsid to come out the same whether mounted 1232 because we want the fsid to come out the same whether mounted
@@ -1219,11 +1240,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1219 return 0; 1240 return 0;
1220} 1241}
1221 1242
1243static void btrfs_kill_super(struct super_block *sb)
1244{
1245 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1246 kill_anon_super(sb);
1247 free_fs_info(fs_info);
1248}
1249
1222static struct file_system_type btrfs_fs_type = { 1250static struct file_system_type btrfs_fs_type = {
1223 .owner = THIS_MODULE, 1251 .owner = THIS_MODULE,
1224 .name = "btrfs", 1252 .name = "btrfs",
1225 .mount = btrfs_mount, 1253 .mount = btrfs_mount,
1226 .kill_sb = kill_anon_super, 1254 .kill_sb = btrfs_kill_super,
1227 .fs_flags = FS_REQUIRES_DEV, 1255 .fs_flags = FS_REQUIRES_DEV,
1228}; 1256};
1229 1257
@@ -1257,17 +1285,17 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
1257 1285
1258static int btrfs_freeze(struct super_block *sb) 1286static int btrfs_freeze(struct super_block *sb)
1259{ 1287{
1260 struct btrfs_root *root = btrfs_sb(sb); 1288 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1261 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1289 mutex_lock(&fs_info->transaction_kthread_mutex);
1262 mutex_lock(&root->fs_info->cleaner_mutex); 1290 mutex_lock(&fs_info->cleaner_mutex);
1263 return 0; 1291 return 0;
1264} 1292}
1265 1293
1266static int btrfs_unfreeze(struct super_block *sb) 1294static int btrfs_unfreeze(struct super_block *sb)
1267{ 1295{
1268 struct btrfs_root *root = btrfs_sb(sb); 1296 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
1269 mutex_unlock(&root->fs_info->cleaner_mutex); 1297 mutex_unlock(&fs_info->cleaner_mutex);
1270 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1298 mutex_unlock(&fs_info->transaction_kthread_mutex);
1271 return 0; 1299 return 0;
1272} 1300}
1273 1301
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 81376d94cd3c..287a6728b1ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -36,6 +36,8 @@ static noinline void put_transaction(struct btrfs_transaction *transaction)
36 WARN_ON(atomic_read(&transaction->use_count) == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
37 if (atomic_dec_and_test(&transaction->use_count)) { 37 if (atomic_dec_and_test(&transaction->use_count)) {
38 BUG_ON(!list_empty(&transaction->list)); 38 BUG_ON(!list_empty(&transaction->list));
39 WARN_ON(transaction->delayed_refs.root.rb_node);
40 WARN_ON(!list_empty(&transaction->delayed_refs.seq_head));
39 memset(transaction, 0, sizeof(*transaction)); 41 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 42 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 43 }
@@ -108,8 +110,11 @@ loop:
108 cur_trans->delayed_refs.num_heads = 0; 110 cur_trans->delayed_refs.num_heads = 0;
109 cur_trans->delayed_refs.flushing = 0; 111 cur_trans->delayed_refs.flushing = 0;
110 cur_trans->delayed_refs.run_delayed_start = 0; 112 cur_trans->delayed_refs.run_delayed_start = 0;
113 cur_trans->delayed_refs.seq = 1;
114 init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
111 spin_lock_init(&cur_trans->commit_lock); 115 spin_lock_init(&cur_trans->commit_lock);
112 spin_lock_init(&cur_trans->delayed_refs.lock); 116 spin_lock_init(&cur_trans->delayed_refs.lock);
117 INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
113 118
114 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 119 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
115 list_add_tail(&cur_trans->list, &root->fs_info->trans_list); 120 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
@@ -321,6 +326,8 @@ again:
321 } 326 }
322 327
323 if (num_bytes) { 328 if (num_bytes) {
329 trace_btrfs_space_reservation(root->fs_info, "transaction",
330 (u64)h, num_bytes, 1);
324 h->block_rsv = &root->fs_info->trans_block_rsv; 331 h->block_rsv = &root->fs_info->trans_block_rsv;
325 h->bytes_reserved = num_bytes; 332 h->bytes_reserved = num_bytes;
326 } 333 }
@@ -467,19 +474,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
467 474
468 btrfs_trans_release_metadata(trans, root); 475 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL; 476 trans->block_rsv = NULL;
470 while (count < 4) { 477 while (count < 2) {
471 unsigned long cur = trans->delayed_ref_updates; 478 unsigned long cur = trans->delayed_ref_updates;
472 trans->delayed_ref_updates = 0; 479 trans->delayed_ref_updates = 0;
473 if (cur && 480 if (cur &&
474 trans->transaction->delayed_refs.num_heads_ready > 64) { 481 trans->transaction->delayed_refs.num_heads_ready > 64) {
475 trans->delayed_ref_updates = 0; 482 trans->delayed_ref_updates = 0;
476
477 /*
478 * do a full flush if the transaction is trying
479 * to close
480 */
481 if (trans->transaction->delayed_refs.flushing)
482 cur = 0;
483 btrfs_run_delayed_refs(trans, root, cur); 483 btrfs_run_delayed_refs(trans, root, cur);
484 } else { 484 } else {
485 break; 485 break;
@@ -1393,9 +1393,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1393 1393
1394 if (btrfs_header_backref_rev(root->node) < 1394 if (btrfs_header_backref_rev(root->node) <
1395 BTRFS_MIXED_BACKREF_REV) 1395 BTRFS_MIXED_BACKREF_REV)
1396 btrfs_drop_snapshot(root, NULL, 0); 1396 btrfs_drop_snapshot(root, NULL, 0, 0);
1397 else 1397 else
1398 btrfs_drop_snapshot(root, NULL, 1); 1398 btrfs_drop_snapshot(root, NULL, 1, 0);
1399 } 1399 }
1400 return 0; 1400 return 0;
1401} 1401}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3568374d419d..966cc74f5d6c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -589,7 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
589 ret = btrfs_inc_extent_ref(trans, root, 589 ret = btrfs_inc_extent_ref(trans, root,
590 ins.objectid, ins.offset, 590 ins.objectid, ins.offset,
591 0, root->root_key.objectid, 591 0, root->root_key.objectid,
592 key->objectid, offset); 592 key->objectid, offset, 0);
593 BUG_ON(ret); 593 BUG_ON(ret);
594 } else { 594 } else {
595 /* 595 /*
@@ -1957,7 +1957,8 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
1957 1957
1958 finish_wait(&root->log_commit_wait[index], &wait); 1958 finish_wait(&root->log_commit_wait[index], &wait);
1959 mutex_lock(&root->log_mutex); 1959 mutex_lock(&root->log_mutex);
1960 } while (root->log_transid < transid + 2 && 1960 } while (root->fs_info->last_trans_log_full_commit !=
1961 trans->transid && root->log_transid < transid + 2 &&
1961 atomic_read(&root->log_commit[index])); 1962 atomic_read(&root->log_commit[index]));
1962 return 0; 1963 return 0;
1963} 1964}
@@ -1966,7 +1967,8 @@ static int wait_for_writer(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root) 1967 struct btrfs_root *root)
1967{ 1968{
1968 DEFINE_WAIT(wait); 1969 DEFINE_WAIT(wait);
1969 while (atomic_read(&root->log_writers)) { 1970 while (root->fs_info->last_trans_log_full_commit !=
1971 trans->transid && atomic_read(&root->log_writers)) {
1970 prepare_to_wait(&root->log_writer_wait, 1972 prepare_to_wait(&root->log_writer_wait,
1971 &wait, TASK_UNINTERRUPTIBLE); 1973 &wait, TASK_UNINTERRUPTIBLE);
1972 mutex_unlock(&root->log_mutex); 1974 mutex_unlock(&root->log_mutex);
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
new file mode 100644
index 000000000000..12f5147bd2b1
--- /dev/null
+++ b/fs/btrfs/ulist.c
@@ -0,0 +1,220 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 */
6
7#include <linux/slab.h>
8#include <linux/module.h>
9#include "ulist.h"
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 *
21 * A sample usage for ulists is the enumeration of directed graphs without
22 * visiting a node twice. The pseudo-code could look like this:
23 *
24 * ulist = ulist_alloc();
25 * ulist_add(ulist, root);
26 * elem = NULL;
27 *
 28 * while ((elem = ulist_next(ulist, elem))) {
29 * for (all child nodes n in elem)
30 * ulist_add(ulist, n);
31 * do something useful with the node;
32 * }
33 * ulist_free(ulist);
34 *
 35 * This assumes the graph nodes are addressable by u64. This stems from the
36 * usage for tree enumeration in btrfs, where the logical addresses are
37 * 64 bit.
38 *
 39 * It is also useful for tree enumeration, which could be done elegantly
 40 * with recursion were it not for kernel stack limitations. The
41 * loop would be similar to the above.
42 */
43
44/**
45 * ulist_init - freshly initialize a ulist
46 * @ulist: the ulist to initialize
47 *
48 * Note: don't use this function to init an already used ulist, use
49 * ulist_reinit instead.
50 */
51void ulist_init(struct ulist *ulist)
52{
53 ulist->nnodes = 0;
54 ulist->nodes = ulist->int_nodes;
55 ulist->nodes_alloced = ULIST_SIZE;
56}
57EXPORT_SYMBOL(ulist_init);
58
59/**
60 * ulist_fini - free up additionally allocated memory for the ulist
61 * @ulist: the ulist from which to free the additional memory
62 *
63 * This is useful in cases where the base 'struct ulist' has been statically
64 * allocated.
65 */
66void ulist_fini(struct ulist *ulist)
67{
68 /*
69 * The first ULIST_SIZE elements are stored inline in struct ulist.
 70 * Only if more elements were allocated do they need to be freed.
71 */
72 if (ulist->nodes_alloced > ULIST_SIZE)
73 kfree(ulist->nodes);
74 ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
75}
76EXPORT_SYMBOL(ulist_fini);
77
78/**
79 * ulist_reinit - prepare a ulist for reuse
80 * @ulist: ulist to be reused
81 *
82 * Free up all additional memory allocated for the list elements and reinit
83 * the ulist.
84 */
85void ulist_reinit(struct ulist *ulist)
86{
87 ulist_fini(ulist);
88 ulist_init(ulist);
89}
90EXPORT_SYMBOL(ulist_reinit);
91
92/**
93 * ulist_alloc - dynamically allocate a ulist
 94 * @gfp_mask: allocation flags to use for the base allocation
95 *
96 * The allocated ulist will be returned in an initialized state.
97 */
98struct ulist *ulist_alloc(unsigned long gfp_mask)
99{
100 struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
101
102 if (!ulist)
103 return NULL;
104
105 ulist_init(ulist);
106
107 return ulist;
108}
109EXPORT_SYMBOL(ulist_alloc);
110
111/**
112 * ulist_free - free dynamically allocated ulist
113 * @ulist: ulist to free
114 *
115 * It is not necessary to call ulist_fini before.
116 */
117void ulist_free(struct ulist *ulist)
118{
119 if (!ulist)
120 return;
121 ulist_fini(ulist);
122 kfree(ulist);
123}
124EXPORT_SYMBOL(ulist_free);
125
126/**
127 * ulist_add - add an element to the ulist
128 * @ulist: ulist to add the element to
129 * @val: value to add to ulist
130 * @aux: auxiliary value to store along with val
131 * @gfp_mask: flags to use for allocation
132 *
133 * Note: locking must be provided by the caller. In case of rwlocks write
 134 * locking is needed.
135 *
136 * Add an element to a ulist. The @val will only be added if it doesn't
137 * already exist. If it is added, the auxiliary value @aux is stored along with
138 * it. In case @val already exists in the ulist, @aux is ignored, even if
139 * it differs from the already stored value.
140 *
141 * ulist_add returns 0 if @val already exists in ulist and 1 if @val has been
142 * inserted.
143 * In case of allocation failure -ENOMEM is returned and the ulist stays
144 * unaltered.
145 */
146int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
147 unsigned long gfp_mask)
148{
149 int i;
150
151 for (i = 0; i < ulist->nnodes; ++i) {
152 if (ulist->nodes[i].val == val)
153 return 0;
154 }
155
156 if (ulist->nnodes >= ulist->nodes_alloced) {
157 u64 new_alloced = ulist->nodes_alloced + 128;
158 struct ulist_node *new_nodes;
159 void *old = NULL;
160
161 /*
162 * if nodes_alloced == ULIST_SIZE no memory has been allocated
163 * yet, so pass NULL to krealloc
164 */
165 if (ulist->nodes_alloced > ULIST_SIZE)
166 old = ulist->nodes;
167
168 new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
169 gfp_mask);
170 if (!new_nodes)
171 return -ENOMEM;
172
173 if (!old)
174 memcpy(new_nodes, ulist->int_nodes,
175 sizeof(ulist->int_nodes));
176
177 ulist->nodes = new_nodes;
178 ulist->nodes_alloced = new_alloced;
179 }
180 ulist->nodes[ulist->nnodes].val = val;
181 ulist->nodes[ulist->nnodes].aux = aux;
182 ++ulist->nnodes;
183
184 return 1;
185}
186EXPORT_SYMBOL(ulist_add);
187
188/**
189 * ulist_next - iterate ulist
190 * @ulist: ulist to iterate
191 * @prev: previously returned element or %NULL to start iteration
192 *
193 * Note: locking must be provided by the caller. In case of rwlocks only read
 194 * locking is needed.
195 *
 196 * This function is used to iterate a ulist. The iteration is started with
197 * @prev = %NULL. It returns the next element from the ulist or %NULL when the
198 * end is reached. No guarantee is made with respect to the order in which
199 * the elements are returned. They might neither be returned in order of
200 * addition nor in ascending order.
201 * It is allowed to call ulist_add during an enumeration. Newly added items
202 * are guaranteed to show up in the running enumeration.
203 */
204struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
205{
206 int next;
207
208 if (ulist->nnodes == 0)
209 return NULL;
210
211 if (!prev)
212 return &ulist->nodes[0];
213
214 next = (prev - ulist->nodes) + 1;
215 if (next < 0 || next >= ulist->nnodes)
216 return NULL;
217
218 return &ulist->nodes[next];
219}
220EXPORT_SYMBOL(ulist_next);
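
A minimal, compilable sketch of the enumeration pattern that the file-header
comment above gives in pseudo-code. Only the ulist_* calls come from the new
API; struct demo_node and the demo_lookup() helper are illustrative
assumptions, not part of btrfs.

/*
 * Sketch only: walk a graph of u64-addressable nodes without visiting
 * any node twice. struct demo_node and demo_lookup() are hypothetical;
 * the ulist_* calls are the API added in this file.
 */
#include <linux/slab.h>
#include "ulist.h"

struct demo_node {
	u64 addr;		/* unique u64 address of this node */
	int nchildren;
	u64 child[8];		/* addresses of the child nodes */
};

struct demo_node *demo_lookup(u64 addr);	/* hypothetical helper */

static int demo_walk(u64 root_addr)
{
	struct ulist *seen;
	struct ulist_node *elem = NULL;
	int i, ret;

	seen = ulist_alloc(GFP_NOFS);
	if (!seen)
		return -ENOMEM;

	ret = ulist_add(seen, root_addr, 0, GFP_NOFS);
	if (ret < 0)
		goto out;

	/* ulist_next() also returns nodes added during the enumeration */
	while ((elem = ulist_next(seen, elem))) {
		struct demo_node *node = demo_lookup(elem->val);

		if (!node)
			continue;
		for (i = 0; i < node->nchildren; i++) {
			/* 0: already present, 1: inserted, <0: -ENOMEM */
			ret = ulist_add(seen, node->child[i], 0, GFP_NOFS);
			if (ret < 0)
				goto out;
		}
		/* do something useful with the node here */
	}
	ret = 0;
out:
	ulist_free(seen);
	return ret;
}
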
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
new file mode 100644
index 000000000000..2e25dec58ec0
--- /dev/null
+++ b/fs/btrfs/ulist.h
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 2011 STRATO AG
3 * written by Arne Jansen <sensille@gmx.net>
4 * Distributed under the GNU GPL license version 2.
5 *
6 */
7
8#ifndef __ULIST__
9#define __ULIST__
10
11/*
12 * ulist is a generic data structure to hold a collection of unique u64
 13 * values. The only operations it supports are adding to the list and
14 * enumerating it.
15 * It is possible to store an auxiliary value along with the key.
16 *
17 * The implementation is preliminary and can probably be sped up
18 * significantly. A first step would be to store the values in an rbtree
19 * as soon as ULIST_SIZE is exceeded.
20 */
21
22/*
23 * number of elements statically allocated inside struct ulist
24 */
25#define ULIST_SIZE 16
26
27/*
28 * element of the list
29 */
30struct ulist_node {
31 u64 val; /* value to store */
32 unsigned long aux; /* auxiliary value saved along with the val */
33};
34
35struct ulist {
36 /*
37 * number of elements stored in list
38 */
39 unsigned long nnodes;
40
41 /*
42 * number of nodes we already have room for
43 */
44 unsigned long nodes_alloced;
45
46 /*
47 * pointer to the array storing the elements. The first ULIST_SIZE
 48 * elements are stored inline. In this case it points to int_nodes.
49 * After exceeding ULIST_SIZE, dynamic memory is allocated.
50 */
51 struct ulist_node *nodes;
52
53 /*
54 * inline storage space for the first ULIST_SIZE entries
55 */
56 struct ulist_node int_nodes[ULIST_SIZE];
57};
58
59void ulist_init(struct ulist *ulist);
60void ulist_fini(struct ulist *ulist);
61void ulist_reinit(struct ulist *ulist);
62struct ulist *ulist_alloc(unsigned long gfp_mask);
63void ulist_free(struct ulist *ulist);
64int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
65 unsigned long gfp_mask);
66struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
67
68#endif
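
The struct layout above implies two lifetime rules worth making explicit: the
first ULIST_SIZE (16) unique values are stored inline in int_nodes[] and never
allocate, and past that point ulist_add() grows the node array via krealloc in
128-entry steps, returning -ENOMEM (with the ulist unaltered) on failure. A
short sketch under those assumptions for a statically embedded ulist; all
demo_* names are illustrative.

/*
 * Sketch only: lifetime rules for an embedded ulist. ulist_fini() only
 * frees memory if ulist_add() had to spill past the inline storage.
 */
#include <linux/slab.h>
#include "ulist.h"

struct demo_ctx {
	struct ulist seen;	/* embedded: no ulist_alloc()/ulist_free() */
};

static int demo_collect(struct demo_ctx *ctx, const u64 *vals, int n)
{
	int i, ret;

	ulist_init(&ctx->seen);	/* use ulist_reinit() when reusing it */

	for (i = 0; i < n; i++) {
		/*
		 * The first ULIST_SIZE unique values never allocate;
		 * afterwards the node array grows 128 entries at a time
		 * and -ENOMEM leaves the ulist unaltered.
		 */
		ret = ulist_add(&ctx->seen, vals[i], 0, GFP_NOFS);
		if (ret < 0) {
			ulist_fini(&ctx->seen);
			return ret;
		}
	}
	return 0;	/* caller runs ulist_fini(&ctx->seen) when done */
}
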
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f4b839fd3c9d..0b4e2af7954d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h> 25#include <linux/capability.h>
26#include <linux/kthread.h>
26#include <asm/div64.h> 27#include <asm/div64.h>
27#include "compat.h" 28#include "compat.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -32,6 +33,7 @@
32#include "print-tree.h" 33#include "print-tree.h"
33#include "volumes.h" 34#include "volumes.h"
34#include "async-thread.h" 35#include "async-thread.h"
36#include "check-integrity.h"
35 37
36static int init_first_rw_device(struct btrfs_trans_handle *trans, 38static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 39 struct btrfs_root *root,
@@ -246,7 +248,7 @@ loop_lock:
246 sync_pending = 0; 248 sync_pending = 0;
247 } 249 }
248 250
249 submit_bio(cur->bi_rw, cur); 251 btrfsic_submit_bio(cur->bi_rw, cur);
250 num_run++; 252 num_run++;
251 batch_run++; 253 batch_run++;
252 if (need_resched()) 254 if (need_resched())
@@ -706,8 +708,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
706 u64 devid; 708 u64 devid;
707 u64 transid; 709 u64 transid;
708 710
709 mutex_lock(&uuid_mutex);
710
711 flags |= FMODE_EXCL; 711 flags |= FMODE_EXCL;
712 bdev = blkdev_get_by_path(path, flags, holder); 712 bdev = blkdev_get_by_path(path, flags, holder);
713 713
@@ -716,6 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
716 goto error; 716 goto error;
717 } 717 }
718 718
719 mutex_lock(&uuid_mutex);
719 ret = set_blocksize(bdev, 4096); 720 ret = set_blocksize(bdev, 4096);
720 if (ret) 721 if (ret)
721 goto error_close; 722 goto error_close;
@@ -737,9 +738,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
737 738
738 brelse(bh); 739 brelse(bh);
739error_close: 740error_close:
741 mutex_unlock(&uuid_mutex);
740 blkdev_put(bdev, flags); 742 blkdev_put(bdev, flags);
741error: 743error:
742 mutex_unlock(&uuid_mutex);
743 return ret; 744 return ret;
744} 745}
745 746
@@ -829,7 +830,6 @@ out:
829 830
830/* 831/*
831 * find_free_dev_extent - find free space in the specified device 832 * find_free_dev_extent - find free space in the specified device
832 * @trans: transaction handler
833 * @device: the device which we search the free space in 833 * @device: the device which we search the free space in
834 * @num_bytes: the size of the free space that we need 834 * @num_bytes: the size of the free space that we need
835 * @start: store the start of the free space. 835 * @start: store the start of the free space.
@@ -848,8 +848,7 @@ out:
848 * But if we don't find suitable free space, it is used to store the size of 848 * But if we don't find suitable free space, it is used to store the size of
849 * the max free space. 849 * the max free space.
850 */ 850 */
851int find_free_dev_extent(struct btrfs_trans_handle *trans, 851int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
852 struct btrfs_device *device, u64 num_bytes,
853 u64 *start, u64 *len) 852 u64 *start, u64 *len)
854{ 853{
855 struct btrfs_key key; 854 struct btrfs_key key;
@@ -893,7 +892,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
893 key.offset = search_start; 892 key.offset = search_start;
894 key.type = BTRFS_DEV_EXTENT_KEY; 893 key.type = BTRFS_DEV_EXTENT_KEY;
895 894
896 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 895 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
897 if (ret < 0) 896 if (ret < 0)
898 goto out; 897 goto out;
899 if (ret > 0) { 898 if (ret > 0) {
@@ -1282,7 +1281,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1282 bool clear_super = false; 1281 bool clear_super = false;
1283 1282
1284 mutex_lock(&uuid_mutex); 1283 mutex_lock(&uuid_mutex);
1285 mutex_lock(&root->fs_info->volume_mutex);
1286 1284
1287 all_avail = root->fs_info->avail_data_alloc_bits | 1285 all_avail = root->fs_info->avail_data_alloc_bits |
1288 root->fs_info->avail_system_alloc_bits | 1286 root->fs_info->avail_system_alloc_bits |
@@ -1452,7 +1450,6 @@ error_close:
1452 if (bdev) 1450 if (bdev)
1453 blkdev_put(bdev, FMODE_READ | FMODE_EXCL); 1451 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1454out: 1452out:
1455 mutex_unlock(&root->fs_info->volume_mutex);
1456 mutex_unlock(&uuid_mutex); 1453 mutex_unlock(&uuid_mutex);
1457 return ret; 1454 return ret;
1458error_undo: 1455error_undo:
@@ -1469,8 +1466,7 @@ error_undo:
1469/* 1466/*
1470 * does all the dirty work required for changing file system's UUID. 1467 * does all the dirty work required for changing file system's UUID.
1471 */ 1468 */
1472static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, 1469static int btrfs_prepare_sprout(struct btrfs_root *root)
1473 struct btrfs_root *root)
1474{ 1470{
1475 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1471 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1476 struct btrfs_fs_devices *old_devices; 1472 struct btrfs_fs_devices *old_devices;
@@ -1629,7 +1625,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1629 } 1625 }
1630 1626
1631 filemap_write_and_wait(bdev->bd_inode->i_mapping); 1627 filemap_write_and_wait(bdev->bd_inode->i_mapping);
1632 mutex_lock(&root->fs_info->volume_mutex);
1633 1628
1634 devices = &root->fs_info->fs_devices->devices; 1629 devices = &root->fs_info->fs_devices->devices;
1635 /* 1630 /*
@@ -1695,7 +1690,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1695 1690
1696 if (seeding_dev) { 1691 if (seeding_dev) {
1697 sb->s_flags &= ~MS_RDONLY; 1692 sb->s_flags &= ~MS_RDONLY;
1698 ret = btrfs_prepare_sprout(trans, root); 1693 ret = btrfs_prepare_sprout(root);
1699 BUG_ON(ret); 1694 BUG_ON(ret);
1700 } 1695 }
1701 1696
@@ -1757,8 +1752,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1757 ret = btrfs_relocate_sys_chunks(root); 1752 ret = btrfs_relocate_sys_chunks(root);
1758 BUG_ON(ret); 1753 BUG_ON(ret);
1759 } 1754 }
1760out: 1755
1761 mutex_unlock(&root->fs_info->volume_mutex);
1762 return ret; 1756 return ret;
1763error: 1757error:
1764 blkdev_put(bdev, FMODE_EXCL); 1758 blkdev_put(bdev, FMODE_EXCL);
@@ -1766,7 +1760,7 @@ error:
1766 mutex_unlock(&uuid_mutex); 1760 mutex_unlock(&uuid_mutex);
1767 up_write(&sb->s_umount); 1761 up_write(&sb->s_umount);
1768 } 1762 }
1769 goto out; 1763 return ret;
1770} 1764}
1771 1765
1772static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 1766static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -2077,6 +2071,362 @@ error:
2077 return ret; 2071 return ret;
2078} 2072}
2079 2073
2074static int insert_balance_item(struct btrfs_root *root,
2075 struct btrfs_balance_control *bctl)
2076{
2077 struct btrfs_trans_handle *trans;
2078 struct btrfs_balance_item *item;
2079 struct btrfs_disk_balance_args disk_bargs;
2080 struct btrfs_path *path;
2081 struct extent_buffer *leaf;
2082 struct btrfs_key key;
2083 int ret, err;
2084
2085 path = btrfs_alloc_path();
2086 if (!path)
2087 return -ENOMEM;
2088
2089 trans = btrfs_start_transaction(root, 0);
2090 if (IS_ERR(trans)) {
2091 btrfs_free_path(path);
2092 return PTR_ERR(trans);
2093 }
2094
2095 key.objectid = BTRFS_BALANCE_OBJECTID;
2096 key.type = BTRFS_BALANCE_ITEM_KEY;
2097 key.offset = 0;
2098
2099 ret = btrfs_insert_empty_item(trans, root, path, &key,
2100 sizeof(*item));
2101 if (ret)
2102 goto out;
2103
2104 leaf = path->nodes[0];
2105 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2106
2107 memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2108
2109 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2110 btrfs_set_balance_data(leaf, item, &disk_bargs);
2111 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2112 btrfs_set_balance_meta(leaf, item, &disk_bargs);
2113 btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2114 btrfs_set_balance_sys(leaf, item, &disk_bargs);
2115
2116 btrfs_set_balance_flags(leaf, item, bctl->flags);
2117
2118 btrfs_mark_buffer_dirty(leaf);
2119out:
2120 btrfs_free_path(path);
2121 err = btrfs_commit_transaction(trans, root);
2122 if (err && !ret)
2123 ret = err;
2124 return ret;
2125}
2126
2127static int del_balance_item(struct btrfs_root *root)
2128{
2129 struct btrfs_trans_handle *trans;
2130 struct btrfs_path *path;
2131 struct btrfs_key key;
2132 int ret, err;
2133
2134 path = btrfs_alloc_path();
2135 if (!path)
2136 return -ENOMEM;
2137
2138 trans = btrfs_start_transaction(root, 0);
2139 if (IS_ERR(trans)) {
2140 btrfs_free_path(path);
2141 return PTR_ERR(trans);
2142 }
2143
2144 key.objectid = BTRFS_BALANCE_OBJECTID;
2145 key.type = BTRFS_BALANCE_ITEM_KEY;
2146 key.offset = 0;
2147
2148 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2149 if (ret < 0)
2150 goto out;
2151 if (ret > 0) {
2152 ret = -ENOENT;
2153 goto out;
2154 }
2155
2156 ret = btrfs_del_item(trans, root, path);
2157out:
2158 btrfs_free_path(path);
2159 err = btrfs_commit_transaction(trans, root);
2160 if (err && !ret)
2161 ret = err;
2162 return ret;
2163}
2164
2165/*
2166 * This is a heuristic used to reduce the number of chunks balanced on
2167 * resume after balance was interrupted.
2168 */
2169static void update_balance_args(struct btrfs_balance_control *bctl)
2170{
2171 /*
2172 * Turn on soft mode for chunk types that were being converted.
2173 */
2174 if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2175 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2176 if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2177 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2178 if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2179 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2180
2181 /*
2182 * Turn on usage filter if is not already used. The idea is
2183 * that chunks that we have already balanced should be
2184 * reasonably full. Don't do it for chunks that are being
2185 * converted - that will keep us from relocating unconverted
2186 * (albeit full) chunks.
2187 */
2188 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2189 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2190 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2191 bctl->data.usage = 90;
2192 }
2193 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2194 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2195 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2196 bctl->sys.usage = 90;
2197 }
2198 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2199 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2200 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2201 bctl->meta.usage = 90;
2202 }
2203}
2204
2205/*
2206 * Should be called with both balance and volume mutexes held to
2207 * serialize other volume operations (add_dev/rm_dev/resize) with
2208 * restriper. Same goes for unset_balance_control.
2209 */
2210static void set_balance_control(struct btrfs_balance_control *bctl)
2211{
2212 struct btrfs_fs_info *fs_info = bctl->fs_info;
2213
2214 BUG_ON(fs_info->balance_ctl);
2215
2216 spin_lock(&fs_info->balance_lock);
2217 fs_info->balance_ctl = bctl;
2218 spin_unlock(&fs_info->balance_lock);
2219}
2220
2221static void unset_balance_control(struct btrfs_fs_info *fs_info)
2222{
2223 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2224
2225 BUG_ON(!fs_info->balance_ctl);
2226
2227 spin_lock(&fs_info->balance_lock);
2228 fs_info->balance_ctl = NULL;
2229 spin_unlock(&fs_info->balance_lock);
2230
2231 kfree(bctl);
2232}
2233
2234/*
2235 * Balance filters. Return 1 if chunk should be filtered out
2236 * (should not be balanced).
2237 */
2238static int chunk_profiles_filter(u64 chunk_profile,
2239 struct btrfs_balance_args *bargs)
2240{
2241 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2242
2243 if (chunk_profile == 0)
2244 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2245
2246 if (bargs->profiles & chunk_profile)
2247 return 0;
2248
2249 return 1;
2250}
2251
2252static u64 div_factor_fine(u64 num, int factor)
2253{
2254 if (factor <= 0)
2255 return 0;
2256 if (factor >= 100)
2257 return num;
2258
2259 num *= factor;
2260 do_div(num, 100);
2261 return num;
2262}
2263
2264static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2265 struct btrfs_balance_args *bargs)
2266{
2267 struct btrfs_block_group_cache *cache;
2268 u64 chunk_used, user_thresh;
2269 int ret = 1;
2270
2271 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2272 chunk_used = btrfs_block_group_used(&cache->item);
2273
2274 user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2275 if (chunk_used < user_thresh)
2276 ret = 0;
2277
2278 btrfs_put_block_group(cache);
2279 return ret;
2280}
2281
2282static int chunk_devid_filter(struct extent_buffer *leaf,
2283 struct btrfs_chunk *chunk,
2284 struct btrfs_balance_args *bargs)
2285{
2286 struct btrfs_stripe *stripe;
2287 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2288 int i;
2289
2290 for (i = 0; i < num_stripes; i++) {
2291 stripe = btrfs_stripe_nr(chunk, i);
2292 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2293 return 0;
2294 }
2295
2296 return 1;
2297}
2298
2299/* [pstart, pend) */
2300static int chunk_drange_filter(struct extent_buffer *leaf,
2301 struct btrfs_chunk *chunk,
2302 u64 chunk_offset,
2303 struct btrfs_balance_args *bargs)
2304{
2305 struct btrfs_stripe *stripe;
2306 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2307 u64 stripe_offset;
2308 u64 stripe_length;
2309 int factor;
2310 int i;
2311
2312 if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2313 return 0;
2314
2315 if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2316 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2317 factor = 2;
2318 else
2319 factor = 1;
2320 factor = num_stripes / factor;
2321
2322 for (i = 0; i < num_stripes; i++) {
2323 stripe = btrfs_stripe_nr(chunk, i);
2324 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2325 continue;
2326
2327 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2328 stripe_length = btrfs_chunk_length(leaf, chunk);
2329 do_div(stripe_length, factor);
2330
2331 if (stripe_offset < bargs->pend &&
2332 stripe_offset + stripe_length > bargs->pstart)
2333 return 0;
2334 }
2335
2336 return 1;
2337}
2338
2339/* [vstart, vend) */
2340static int chunk_vrange_filter(struct extent_buffer *leaf,
2341 struct btrfs_chunk *chunk,
2342 u64 chunk_offset,
2343 struct btrfs_balance_args *bargs)
2344{
2345 if (chunk_offset < bargs->vend &&
2346 chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2347 /* at least part of the chunk is inside this vrange */
2348 return 0;
2349
2350 return 1;
2351}
2352
2353static int chunk_soft_convert_filter(u64 chunk_profile,
2354 struct btrfs_balance_args *bargs)
2355{
2356 if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2357 return 0;
2358
2359 chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2360
2361 if (chunk_profile == 0)
2362 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2363
2364 if (bargs->target & chunk_profile)
2365 return 1;
2366
2367 return 0;
2368}
2369
2370static int should_balance_chunk(struct btrfs_root *root,
2371 struct extent_buffer *leaf,
2372 struct btrfs_chunk *chunk, u64 chunk_offset)
2373{
2374 struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2375 struct btrfs_balance_args *bargs = NULL;
2376 u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2377
2378 /* type filter */
2379 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2380 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2381 return 0;
2382 }
2383
2384 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2385 bargs = &bctl->data;
2386 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2387 bargs = &bctl->sys;
2388 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2389 bargs = &bctl->meta;
2390
2391 /* profiles filter */
2392 if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2393 chunk_profiles_filter(chunk_type, bargs)) {
2394 return 0;
2395 }
2396
2397 /* usage filter */
2398 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2399 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2400 return 0;
2401 }
2402
2403 /* devid filter */
2404 if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2405 chunk_devid_filter(leaf, chunk, bargs)) {
2406 return 0;
2407 }
2408
2409 /* drange filter, makes sense only with devid filter */
2410 if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2411 chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2412 return 0;
2413 }
2414
2415 /* vrange filter */
2416 if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2417 chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2418 return 0;
2419 }
2420
2421 /* soft profile changing mode */
2422 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2423 chunk_soft_convert_filter(chunk_type, bargs)) {
2424 return 0;
2425 }
2426
2427 return 1;
2428}
2429
2080static u64 div_factor(u64 num, int factor) 2430static u64 div_factor(u64 num, int factor)
2081{ 2431{
2082 if (factor == 10) 2432 if (factor == 10)
@@ -2086,29 +2436,28 @@ static u64 div_factor(u64 num, int factor)
2086 return num; 2436 return num;
2087} 2437}
2088 2438
2089int btrfs_balance(struct btrfs_root *dev_root) 2439static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2090{ 2440{
2091 int ret; 2441 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2092 struct list_head *devices = &dev_root->fs_info->fs_devices->devices; 2442 struct btrfs_root *chunk_root = fs_info->chunk_root;
2443 struct btrfs_root *dev_root = fs_info->dev_root;
2444 struct list_head *devices;
2093 struct btrfs_device *device; 2445 struct btrfs_device *device;
2094 u64 old_size; 2446 u64 old_size;
2095 u64 size_to_free; 2447 u64 size_to_free;
2448 struct btrfs_chunk *chunk;
2096 struct btrfs_path *path; 2449 struct btrfs_path *path;
2097 struct btrfs_key key; 2450 struct btrfs_key key;
2098 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2099 struct btrfs_trans_handle *trans;
2100 struct btrfs_key found_key; 2451 struct btrfs_key found_key;
2101 2452 struct btrfs_trans_handle *trans;
2102 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2453 struct extent_buffer *leaf;
2103 return -EROFS; 2454 int slot;
2104 2455 int ret;
2105 if (!capable(CAP_SYS_ADMIN)) 2456 int enospc_errors = 0;
2106 return -EPERM; 2457 bool counting = true;
2107
2108 mutex_lock(&dev_root->fs_info->volume_mutex);
2109 dev_root = dev_root->fs_info->dev_root;
2110 2458
2111 /* step one make some room on all the devices */ 2459 /* step one make some room on all the devices */
2460 devices = &fs_info->fs_devices->devices;
2112 list_for_each_entry(device, devices, dev_list) { 2461 list_for_each_entry(device, devices, dev_list) {
2113 old_size = device->total_bytes; 2462 old_size = device->total_bytes;
2114 size_to_free = div_factor(old_size, 1); 2463 size_to_free = div_factor(old_size, 1);
@@ -2137,11 +2486,23 @@ int btrfs_balance(struct btrfs_root *dev_root)
2137 ret = -ENOMEM; 2486 ret = -ENOMEM;
2138 goto error; 2487 goto error;
2139 } 2488 }
2489
2490 /* zero out stat counters */
2491 spin_lock(&fs_info->balance_lock);
2492 memset(&bctl->stat, 0, sizeof(bctl->stat));
2493 spin_unlock(&fs_info->balance_lock);
2494again:
2140 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2495 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2141 key.offset = (u64)-1; 2496 key.offset = (u64)-1;
2142 key.type = BTRFS_CHUNK_ITEM_KEY; 2497 key.type = BTRFS_CHUNK_ITEM_KEY;
2143 2498
2144 while (1) { 2499 while (1) {
2500 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2501 atomic_read(&fs_info->balance_cancel_req)) {
2502 ret = -ECANCELED;
2503 goto error;
2504 }
2505
2145 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); 2506 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2146 if (ret < 0) 2507 if (ret < 0)
2147 goto error; 2508 goto error;
@@ -2151,15 +2512,19 @@ int btrfs_balance(struct btrfs_root *dev_root)
2151 * failed 2512 * failed
2152 */ 2513 */
2153 if (ret == 0) 2514 if (ret == 0)
2154 break; 2515 BUG(); /* FIXME break ? */
2155 2516
2156 ret = btrfs_previous_item(chunk_root, path, 0, 2517 ret = btrfs_previous_item(chunk_root, path, 0,
2157 BTRFS_CHUNK_ITEM_KEY); 2518 BTRFS_CHUNK_ITEM_KEY);
2158 if (ret) 2519 if (ret) {
2520 ret = 0;
2159 break; 2521 break;
2522 }
2523
2524 leaf = path->nodes[0];
2525 slot = path->slots[0];
2526 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2160 2527
2161 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2162 path->slots[0]);
2163 if (found_key.objectid != key.objectid) 2528 if (found_key.objectid != key.objectid)
2164 break; 2529 break;
2165 2530
@@ -2167,22 +2532,375 @@ int btrfs_balance(struct btrfs_root *dev_root)
2167 if (found_key.offset == 0) 2532 if (found_key.offset == 0)
2168 break; 2533 break;
2169 2534
2535 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2536
2537 if (!counting) {
2538 spin_lock(&fs_info->balance_lock);
2539 bctl->stat.considered++;
2540 spin_unlock(&fs_info->balance_lock);
2541 }
2542
2543 ret = should_balance_chunk(chunk_root, leaf, chunk,
2544 found_key.offset);
2170 btrfs_release_path(path); 2545 btrfs_release_path(path);
2546 if (!ret)
2547 goto loop;
2548
2549 if (counting) {
2550 spin_lock(&fs_info->balance_lock);
2551 bctl->stat.expected++;
2552 spin_unlock(&fs_info->balance_lock);
2553 goto loop;
2554 }
2555
2171 ret = btrfs_relocate_chunk(chunk_root, 2556 ret = btrfs_relocate_chunk(chunk_root,
2172 chunk_root->root_key.objectid, 2557 chunk_root->root_key.objectid,
2173 found_key.objectid, 2558 found_key.objectid,
2174 found_key.offset); 2559 found_key.offset);
2175 if (ret && ret != -ENOSPC) 2560 if (ret && ret != -ENOSPC)
2176 goto error; 2561 goto error;
2562 if (ret == -ENOSPC) {
2563 enospc_errors++;
2564 } else {
2565 spin_lock(&fs_info->balance_lock);
2566 bctl->stat.completed++;
2567 spin_unlock(&fs_info->balance_lock);
2568 }
2569loop:
2177 key.offset = found_key.offset - 1; 2570 key.offset = found_key.offset - 1;
2178 } 2571 }
2179 ret = 0; 2572
2573 if (counting) {
2574 btrfs_release_path(path);
2575 counting = false;
2576 goto again;
2577 }
2180error: 2578error:
2181 btrfs_free_path(path); 2579 btrfs_free_path(path);
2182 mutex_unlock(&dev_root->fs_info->volume_mutex); 2580 if (enospc_errors) {
2581 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2582 enospc_errors);
2583 if (!ret)
2584 ret = -ENOSPC;
2585 }
2586
2183 return ret; 2587 return ret;
2184} 2588}
2185 2589
2590static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2591{
2592 /* cancel requested || normal exit path */
2593 return atomic_read(&fs_info->balance_cancel_req) ||
2594 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2595 atomic_read(&fs_info->balance_cancel_req) == 0);
2596}
2597
2598static void __cancel_balance(struct btrfs_fs_info *fs_info)
2599{
2600 int ret;
2601
2602 unset_balance_control(fs_info);
2603 ret = del_balance_item(fs_info->tree_root);
2604 BUG_ON(ret);
2605}
2606
2607void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2608 struct btrfs_ioctl_balance_args *bargs);
2609
2610/*
2611 * Should be called with both balance and volume mutexes held
2612 */
2613int btrfs_balance(struct btrfs_balance_control *bctl,
2614 struct btrfs_ioctl_balance_args *bargs)
2615{
2616 struct btrfs_fs_info *fs_info = bctl->fs_info;
2617 u64 allowed;
2618 int ret;
2619
2620 if (btrfs_fs_closing(fs_info) ||
2621 atomic_read(&fs_info->balance_pause_req) ||
2622 atomic_read(&fs_info->balance_cancel_req)) {
2623 ret = -EINVAL;
2624 goto out;
2625 }
2626
2627 /*
2628 * In case of mixed groups both data and meta should be picked,
2629 * and identical options should be given for both of them.
2630 */
2631 allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2632 if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2633 (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2634 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2635 !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2636 memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2637 printk(KERN_ERR "btrfs: with mixed groups data and "
2638 "metadata balance options must be the same\n");
2639 ret = -EINVAL;
2640 goto out;
2641 }
2642 }
2643
2644 /*
2645 * Profile changing sanity checks. Skip them if a simple
2646 * balance is requested.
2647 */
2648 if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2649 BTRFS_BALANCE_ARGS_CONVERT))
2650 goto do_balance;
2651
2652 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2653 if (fs_info->fs_devices->num_devices == 1)
2654 allowed |= BTRFS_BLOCK_GROUP_DUP;
2655 else if (fs_info->fs_devices->num_devices < 4)
2656 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2657 else
2658 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2659 BTRFS_BLOCK_GROUP_RAID10);
2660
2661 if (!profile_is_valid(bctl->data.target, 1) ||
2662 bctl->data.target & ~allowed) {
2663 printk(KERN_ERR "btrfs: unable to start balance with target "
2664 "data profile %llu\n",
2665 (unsigned long long)bctl->data.target);
2666 ret = -EINVAL;
2667 goto out;
2668 }
2669 if (!profile_is_valid(bctl->meta.target, 1) ||
2670 bctl->meta.target & ~allowed) {
2671 printk(KERN_ERR "btrfs: unable to start balance with target "
2672 "metadata profile %llu\n",
2673 (unsigned long long)bctl->meta.target);
2674 ret = -EINVAL;
2675 goto out;
2676 }
2677 if (!profile_is_valid(bctl->sys.target, 1) ||
2678 bctl->sys.target & ~allowed) {
2679 printk(KERN_ERR "btrfs: unable to start balance with target "
2680 "system profile %llu\n",
2681 (unsigned long long)bctl->sys.target);
2682 ret = -EINVAL;
2683 goto out;
2684 }
2685
2686 if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2687 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2688 ret = -EINVAL;
2689 goto out;
2690 }
2691
2692 /* allow to reduce meta or sys integrity only if force set */
2693 allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2694 BTRFS_BLOCK_GROUP_RAID10;
2695 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2696 (fs_info->avail_system_alloc_bits & allowed) &&
2697 !(bctl->sys.target & allowed)) ||
2698 ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2699 (fs_info->avail_metadata_alloc_bits & allowed) &&
2700 !(bctl->meta.target & allowed))) {
2701 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2702 printk(KERN_INFO "btrfs: force reducing metadata "
2703 "integrity\n");
2704 } else {
2705 printk(KERN_ERR "btrfs: balance will reduce metadata "
2706 "integrity, use force if you want this\n");
2707 ret = -EINVAL;
2708 goto out;
2709 }
2710 }
2711
2712do_balance:
2713 ret = insert_balance_item(fs_info->tree_root, bctl);
2714 if (ret && ret != -EEXIST)
2715 goto out;
2716
2717 if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2718 BUG_ON(ret == -EEXIST);
2719 set_balance_control(bctl);
2720 } else {
2721 BUG_ON(ret != -EEXIST);
2722 spin_lock(&fs_info->balance_lock);
2723 update_balance_args(bctl);
2724 spin_unlock(&fs_info->balance_lock);
2725 }
2726
2727 atomic_inc(&fs_info->balance_running);
2728 mutex_unlock(&fs_info->balance_mutex);
2729
2730 ret = __btrfs_balance(fs_info);
2731
2732 mutex_lock(&fs_info->balance_mutex);
2733 atomic_dec(&fs_info->balance_running);
2734
2735 if (bargs) {
2736 memset(bargs, 0, sizeof(*bargs));
2737 update_ioctl_balance_args(fs_info, 0, bargs);
2738 }
2739
2740 if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2741 balance_need_close(fs_info)) {
2742 __cancel_balance(fs_info);
2743 }
2744
2745 wake_up(&fs_info->balance_wait_q);
2746
2747 return ret;
2748out:
2749 if (bctl->flags & BTRFS_BALANCE_RESUME)
2750 __cancel_balance(fs_info);
2751 else
2752 kfree(bctl);
2753 return ret;
2754}
2755
2756static int balance_kthread(void *data)
2757{
2758 struct btrfs_balance_control *bctl =
2759 (struct btrfs_balance_control *)data;
2760 struct btrfs_fs_info *fs_info = bctl->fs_info;
2761 int ret = 0;
2762
2763 mutex_lock(&fs_info->volume_mutex);
2764 mutex_lock(&fs_info->balance_mutex);
2765
2766 set_balance_control(bctl);
2767
2768 if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2769 printk(KERN_INFO "btrfs: force skipping balance\n");
2770 } else {
2771 printk(KERN_INFO "btrfs: continuing balance\n");
2772 ret = btrfs_balance(bctl, NULL);
2773 }
2774
2775 mutex_unlock(&fs_info->balance_mutex);
2776 mutex_unlock(&fs_info->volume_mutex);
2777 return ret;
2778}
2779
2780int btrfs_recover_balance(struct btrfs_root *tree_root)
2781{
2782 struct task_struct *tsk;
2783 struct btrfs_balance_control *bctl;
2784 struct btrfs_balance_item *item;
2785 struct btrfs_disk_balance_args disk_bargs;
2786 struct btrfs_path *path;
2787 struct extent_buffer *leaf;
2788 struct btrfs_key key;
2789 int ret;
2790
2791 path = btrfs_alloc_path();
2792 if (!path)
2793 return -ENOMEM;
2794
2795 bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2796 if (!bctl) {
2797 ret = -ENOMEM;
2798 goto out;
2799 }
2800
2801 key.objectid = BTRFS_BALANCE_OBJECTID;
2802 key.type = BTRFS_BALANCE_ITEM_KEY;
2803 key.offset = 0;
2804
2805 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2806 if (ret < 0)
2807 goto out_bctl;
2808 if (ret > 0) { /* ret = -ENOENT; */
2809 ret = 0;
2810 goto out_bctl;
2811 }
2812
2813 leaf = path->nodes[0];
2814 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2815
2816 bctl->fs_info = tree_root->fs_info;
2817 bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2818
2819 btrfs_balance_data(leaf, item, &disk_bargs);
2820 btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2821 btrfs_balance_meta(leaf, item, &disk_bargs);
2822 btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2823 btrfs_balance_sys(leaf, item, &disk_bargs);
2824 btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2825
2826 tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2827 if (IS_ERR(tsk))
2828 ret = PTR_ERR(tsk);
2829 else
2830 goto out;
2831
2832out_bctl:
2833 kfree(bctl);
2834out:
2835 btrfs_free_path(path);
2836 return ret;
2837}
2838
2839int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2840{
2841 int ret = 0;
2842
2843 mutex_lock(&fs_info->balance_mutex);
2844 if (!fs_info->balance_ctl) {
2845 mutex_unlock(&fs_info->balance_mutex);
2846 return -ENOTCONN;
2847 }
2848
2849 if (atomic_read(&fs_info->balance_running)) {
2850 atomic_inc(&fs_info->balance_pause_req);
2851 mutex_unlock(&fs_info->balance_mutex);
2852
2853 wait_event(fs_info->balance_wait_q,
2854 atomic_read(&fs_info->balance_running) == 0);
2855
2856 mutex_lock(&fs_info->balance_mutex);
2857 /* we are good with balance_ctl ripped off from under us */
2858 BUG_ON(atomic_read(&fs_info->balance_running));
2859 atomic_dec(&fs_info->balance_pause_req);
2860 } else {
2861 ret = -ENOTCONN;
2862 }
2863
2864 mutex_unlock(&fs_info->balance_mutex);
2865 return ret;
2866}
2867
2868int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2869{
2870 mutex_lock(&fs_info->balance_mutex);
2871 if (!fs_info->balance_ctl) {
2872 mutex_unlock(&fs_info->balance_mutex);
2873 return -ENOTCONN;
2874 }
2875
2876 atomic_inc(&fs_info->balance_cancel_req);
2877 /*
2878 * if we are running just wait and return, balance item is
2879 * deleted in btrfs_balance in this case
2880 */
2881 if (atomic_read(&fs_info->balance_running)) {
2882 mutex_unlock(&fs_info->balance_mutex);
2883 wait_event(fs_info->balance_wait_q,
2884 atomic_read(&fs_info->balance_running) == 0);
2885 mutex_lock(&fs_info->balance_mutex);
2886 } else {
2887 /* __cancel_balance needs volume_mutex */
2888 mutex_unlock(&fs_info->balance_mutex);
2889 mutex_lock(&fs_info->volume_mutex);
2890 mutex_lock(&fs_info->balance_mutex);
2891
2892 if (fs_info->balance_ctl)
2893 __cancel_balance(fs_info);
2894
2895 mutex_unlock(&fs_info->volume_mutex);
2896 }
2897
2898 BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2899 atomic_dec(&fs_info->balance_cancel_req);
2900 mutex_unlock(&fs_info->balance_mutex);
2901 return 0;
2902}
2903
2186/* 2904/*
2187 * shrinking a device means finding all of the device extents past 2905 * shrinking a device means finding all of the device extents past
2188 * the new size, and then following the back refs to the chunks. 2906 * the new size, and then following the back refs to the chunks.
@@ -2323,8 +3041,7 @@ done:
2323 return ret; 3041 return ret;
2324} 3042}
2325 3043
2326static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, 3044static int btrfs_add_system_chunk(struct btrfs_root *root,
2327 struct btrfs_root *root,
2328 struct btrfs_key *key, 3045 struct btrfs_key *key,
2329 struct btrfs_chunk *chunk, int item_size) 3046 struct btrfs_chunk *chunk, int item_size)
2330{ 3047{
@@ -2441,10 +3158,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2441 max_stripe_size = 1024 * 1024 * 1024; 3158 max_stripe_size = 1024 * 1024 * 1024;
2442 max_chunk_size = 10 * max_stripe_size; 3159 max_chunk_size = 10 * max_stripe_size;
2443 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 3160 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2444 max_stripe_size = 256 * 1024 * 1024; 3161 /* for larger filesystems, use larger metadata chunks */
3162 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3163 max_stripe_size = 1024 * 1024 * 1024;
3164 else
3165 max_stripe_size = 256 * 1024 * 1024;
2445 max_chunk_size = max_stripe_size; 3166 max_chunk_size = max_stripe_size;
2446 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 3167 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2447 max_stripe_size = 8 * 1024 * 1024; 3168 max_stripe_size = 32 * 1024 * 1024;
2448 max_chunk_size = 2 * max_stripe_size; 3169 max_chunk_size = 2 * max_stripe_size;
2449 } else { 3170 } else {
2450 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n", 3171 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2496,7 +3217,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2496 if (total_avail == 0) 3217 if (total_avail == 0)
2497 continue; 3218 continue;
2498 3219
2499 ret = find_free_dev_extent(trans, device, 3220 ret = find_free_dev_extent(device,
2500 max_stripe_size * dev_stripes, 3221 max_stripe_size * dev_stripes,
2501 &dev_offset, &max_avail); 3222 &dev_offset, &max_avail);
2502 if (ret && ret != -ENOSPC) 3223 if (ret && ret != -ENOSPC)
@@ -2687,7 +3408,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2687 BUG_ON(ret); 3408 BUG_ON(ret);
2688 3409
2689 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 3410 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2690 ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, 3411 ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
2691 item_size); 3412 item_size);
2692 BUG_ON(ret); 3413 BUG_ON(ret);
2693 } 3414 }
@@ -2752,8 +3473,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2752 return ret; 3473 return ret;
2753 3474
2754 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 3475 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2755 (fs_info->metadata_alloc_profile & 3476 fs_info->avail_metadata_alloc_bits;
2756 fs_info->avail_metadata_alloc_bits);
2757 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3477 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2758 3478
2759 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, 3479 ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
@@ -2763,8 +3483,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2763 sys_chunk_offset = chunk_offset + chunk_size; 3483 sys_chunk_offset = chunk_offset + chunk_size;
2764 3484
2765 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | 3485 alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2766 (fs_info->system_alloc_profile & 3486 fs_info->avail_system_alloc_bits;
2767 fs_info->avail_system_alloc_bits);
2768 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); 3487 alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2769 3488
2770 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, 3489 ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
@@ -2901,26 +3620,13 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2901 u64 stripe_nr; 3620 u64 stripe_nr;
2902 u64 stripe_nr_orig; 3621 u64 stripe_nr_orig;
2903 u64 stripe_nr_end; 3622 u64 stripe_nr_end;
2904 int stripes_allocated = 8;
2905 int stripes_required = 1;
2906 int stripe_index; 3623 int stripe_index;
2907 int i; 3624 int i;
3625 int ret = 0;
2908 int num_stripes; 3626 int num_stripes;
2909 int max_errors = 0; 3627 int max_errors = 0;
2910 struct btrfs_bio *bbio = NULL; 3628 struct btrfs_bio *bbio = NULL;
2911 3629
2912 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2913 stripes_allocated = 1;
2914again:
2915 if (bbio_ret) {
2916 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2917 GFP_NOFS);
2918 if (!bbio)
2919 return -ENOMEM;
2920
2921 atomic_set(&bbio->error, 0);
2922 }
2923
2924 read_lock(&em_tree->lock); 3630 read_lock(&em_tree->lock);
2925 em = lookup_extent_mapping(em_tree, logical, *length); 3631 em = lookup_extent_mapping(em_tree, logical, *length);
2926 read_unlock(&em_tree->lock); 3632 read_unlock(&em_tree->lock);
@@ -2939,32 +3645,6 @@ again:
2939 if (mirror_num > map->num_stripes) 3645 if (mirror_num > map->num_stripes)
2940 mirror_num = 0; 3646 mirror_num = 0;
2941 3647
2942 /* if our btrfs_bio struct is too small, back off and try again */
2943 if (rw & REQ_WRITE) {
2944 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2945 BTRFS_BLOCK_GROUP_DUP)) {
2946 stripes_required = map->num_stripes;
2947 max_errors = 1;
2948 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2949 stripes_required = map->sub_stripes;
2950 max_errors = 1;
2951 }
2952 }
2953 if (rw & REQ_DISCARD) {
2954 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2955 BTRFS_BLOCK_GROUP_RAID1 |
2956 BTRFS_BLOCK_GROUP_DUP |
2957 BTRFS_BLOCK_GROUP_RAID10)) {
2958 stripes_required = map->num_stripes;
2959 }
2960 }
2961 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2962 stripes_allocated < stripes_required) {
2963 stripes_allocated = map->num_stripes;
2964 free_extent_map(em);
2965 kfree(bbio);
2966 goto again;
2967 }
2968 stripe_nr = offset; 3648 stripe_nr = offset;
2969 /* 3649 /*
2970 * stripe_nr counts the total number of stripes we have to stride 3650 * stripe_nr counts the total number of stripes we have to stride
@@ -2980,10 +3660,7 @@ again:
2980 3660
2981 if (rw & REQ_DISCARD) 3661 if (rw & REQ_DISCARD)
2982 *length = min_t(u64, em->len - offset, *length); 3662 *length = min_t(u64, em->len - offset, *length);
2983 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | 3663 else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
2984 BTRFS_BLOCK_GROUP_RAID1 |
2985 BTRFS_BLOCK_GROUP_RAID10 |
2986 BTRFS_BLOCK_GROUP_DUP)) {
2987 /* we limit the length of each bio to what fits in a stripe */ 3664 /* we limit the length of each bio to what fits in a stripe */
2988 *length = min_t(u64, em->len - offset, 3665 *length = min_t(u64, em->len - offset,
2989 map->stripe_len - stripe_offset); 3666 map->stripe_len - stripe_offset);
@@ -3059,81 +3736,55 @@ again:
     }
     BUG_ON(stripe_index >= map->num_stripes);
 
+    bbio = kzalloc(btrfs_bio_size(num_stripes), GFP_NOFS);
+    if (!bbio) {
+        ret = -ENOMEM;
+        goto out;
+    }
+    atomic_set(&bbio->error, 0);
+
     if (rw & REQ_DISCARD) {
+        int factor = 0;
+        int sub_stripes = 0;
+        u64 stripes_per_dev = 0;
+        u32 remaining_stripes = 0;
+
+        if (map->type &
+            (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+            if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+                sub_stripes = 1;
+            else
+                sub_stripes = map->sub_stripes;
+
+            factor = map->num_stripes / sub_stripes;
+            stripes_per_dev = div_u64_rem(stripe_nr_end -
+                                          stripe_nr_orig,
+                                          factor,
+                                          &remaining_stripes);
+        }
+
         for (i = 0; i < num_stripes; i++) {
             bbio->stripes[i].physical =
                 map->stripes[stripe_index].physical +
                 stripe_offset + stripe_nr * map->stripe_len;
             bbio->stripes[i].dev = map->stripes[stripe_index].dev;
 
-            if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-                u64 stripes;
-                u32 last_stripe = 0;
-                int j;
-
-                div_u64_rem(stripe_nr_end - 1,
-                            map->num_stripes,
-                            &last_stripe);
-
-                for (j = 0; j < map->num_stripes; j++) {
-                    u32 test;
-
-                    div_u64_rem(stripe_nr_end - 1 - j,
-                                map->num_stripes, &test);
-                    if (test == stripe_index)
-                        break;
-                }
-                stripes = stripe_nr_end - 1 - j;
-                do_div(stripes, map->num_stripes);
-                bbio->stripes[i].length = map->stripe_len *
-                    (stripes - stripe_nr + 1);
-
-                if (i == 0) {
-                    bbio->stripes[i].length -=
-                        stripe_offset;
-                    stripe_offset = 0;
-                }
-                if (stripe_index == last_stripe)
-                    bbio->stripes[i].length -=
-                        stripe_end_offset;
-            } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-                u64 stripes;
-                int j;
-                int factor = map->num_stripes /
-                             map->sub_stripes;
-                u32 last_stripe = 0;
-
-                div_u64_rem(stripe_nr_end - 1,
-                            factor, &last_stripe);
-                last_stripe *= map->sub_stripes;
-
-                for (j = 0; j < factor; j++) {
-                    u32 test;
-
-                    div_u64_rem(stripe_nr_end - 1 - j,
-                                factor, &test);
-
-                    if (test ==
-                        stripe_index / map->sub_stripes)
-                        break;
-                }
-                stripes = stripe_nr_end - 1 - j;
-                do_div(stripes, factor);
-                bbio->stripes[i].length = map->stripe_len *
-                    (stripes - stripe_nr + 1);
-
-                if (i < map->sub_stripes) {
-                    bbio->stripes[i].length -=
-                        stripe_offset;
-                    if (i == map->sub_stripes - 1)
-                        stripe_offset = 0;
-                }
-                if (stripe_index >= last_stripe &&
-                    stripe_index <= (last_stripe +
-                                     map->sub_stripes - 1)) {
-                    bbio->stripes[i].length -=
-                        stripe_end_offset;
-                }
+            if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+                             BTRFS_BLOCK_GROUP_RAID10)) {
+                bbio->stripes[i].length = stripes_per_dev *
+                                          map->stripe_len;
+                if (i / sub_stripes < remaining_stripes)
+                    bbio->stripes[i].length +=
+                        map->stripe_len;
+                if (i < sub_stripes)
+                    bbio->stripes[i].length -=
+                        stripe_offset;
+                if ((i / sub_stripes + 1) %
+                    sub_stripes == remaining_stripes)
+                    bbio->stripes[i].length -=
+                        stripe_end_offset;
+                if (i == sub_stripes - 1)
+                    stripe_offset = 0;
             } else
                 bbio->stripes[i].length = *length;
 
@@ -3155,15 +3806,22 @@ again:
             stripe_index++;
         }
     }
-    if (bbio_ret) {
-        *bbio_ret = bbio;
-        bbio->num_stripes = num_stripes;
-        bbio->max_errors = max_errors;
-        bbio->mirror_num = mirror_num;
+
+    if (rw & REQ_WRITE) {
+        if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+                         BTRFS_BLOCK_GROUP_RAID10 |
+                         BTRFS_BLOCK_GROUP_DUP)) {
+            max_errors = 1;
+        }
     }
+
+    *bbio_ret = bbio;
+    bbio->num_stripes = num_stripes;
+    bbio->max_errors = max_errors;
+    bbio->mirror_num = mirror_num;
 out:
     free_extent_map(em);
-    return 0;
+    return ret;
 }
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
@@ -3304,7 +3962,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
     /* don't bother with additional async steps for reads, right now */
     if (!(rw & REQ_WRITE)) {
         bio_get(bio);
-        submit_bio(rw, bio);
+        btrfsic_submit_bio(rw, bio);
         bio_put(bio);
         return 0;
     }
@@ -3399,7 +4057,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
         if (async_submit)
             schedule_bio(root, dev, rw, bio);
         else
-            submit_bio(rw, bio);
+            btrfsic_submit_bio(rw, bio);
     } else {
         bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
         bio->bi_sector = logical >> 9;
@@ -3568,7 +4226,7 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
     struct btrfs_fs_devices *fs_devices;
     int ret;
 
-    mutex_lock(&uuid_mutex);
+    BUG_ON(!mutex_is_locked(&uuid_mutex));
 
     fs_devices = root->fs_info->fs_devices->seed;
     while (fs_devices) {
@@ -3606,7 +4264,6 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
     fs_devices->seed = root->fs_info->fs_devices->seed;
     root->fs_info->fs_devices->seed = fs_devices;
 out:
-    mutex_unlock(&uuid_mutex);
     return ret;
 }
 
@@ -3749,6 +4406,9 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
     if (!path)
         return -ENOMEM;
 
+    mutex_lock(&uuid_mutex);
+    lock_chunks(root);
+
     /* first we search for all of the device items, and then we
      * read in all of the chunk items. This way we can create chunk
      * mappings that reference all of the devices that are found
@@ -3799,6 +4459,9 @@ again:
     }
     ret = 0;
 error:
+    unlock_chunks(root);
+    mutex_unlock(&uuid_mutex);
+
     btrfs_free_path(path);
     return ret;
 }
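
A note on the rewritten REQ_DISCARD mapping above: instead of locating the last stripe with two per-RAID-level search loops, the new code computes each device's share of the discarded stripe range in closed form via div_u64_rem. A minimal userspace sketch of that arithmetic follows; the function and variable names are illustrative, plain / and % stand in for div_u64_rem, and the stripe_offset/stripe_end_offset trimming at the range edges is omitted.

    #include <stdint.h>
    #include <stdio.h>

    /* Share of a discard covering stripes [orig, end) that lands on device i.
     * num_stripes devices, sub_stripes mirrored columns (1 for RAID0,
     * map->sub_stripes for RAID10), stripe_len bytes per stripe. */
    static uint64_t discard_len_on_dev(int i, uint64_t orig, uint64_t end,
                                       uint64_t stripe_len,
                                       int num_stripes, int sub_stripes)
    {
        int factor = num_stripes / sub_stripes;     /* distinct columns */
        uint64_t per_dev = (end - orig) / factor;   /* full rounds */
        uint64_t remaining = (end - orig) % factor; /* leftover stripes */
        uint64_t len = per_dev * stripe_len;

        /* the first 'remaining' columns receive one extra stripe */
        if ((uint64_t)(i / sub_stripes) < remaining)
            len += stripe_len;
        return len;
    }

    int main(void)
    {
        /* 5 stripes over 4 devices (RAID0): device 0 gets 2, the rest 1 */
        for (int i = 0; i < 4; i++)
            printf("dev %d: %llu\n", i, (unsigned long long)
                   discard_len_on_dev(i, 0, 5, 65536, 4, 1));
        return 0;
    }
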
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 78f2d4d4f37f..19ac95048b88 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -186,6 +186,51 @@ struct map_lookup {
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
                             (sizeof(struct btrfs_bio_stripe) * (n)))
 
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA      (1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM    (1ULL << 1)
+#define BTRFS_BALANCE_METADATA  (1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA |    \
+                                 BTRFS_BALANCE_SYSTEM |  \
+                                 BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE     (1ULL << 3)
+#define BTRFS_BALANCE_RESUME    (1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE    (1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID    (1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE   (1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE   (1ULL << 4)
+
+/*
+ * Profile changing flags. When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT  (1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT     (1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+    struct btrfs_fs_info *fs_info;
+
+    struct btrfs_balance_args data;
+    struct btrfs_balance_args meta;
+    struct btrfs_balance_args sys;
+
+    u64 flags;
+
+    struct btrfs_balance_progress stat;
+};
+
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                    u64 end, u64 *length);
 
@@ -228,9 +273,12 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
                                        u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+                  struct btrfs_ioctl_balance_args *bargs);
+int btrfs_recover_balance(struct btrfs_root *tree_root);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
-int find_free_dev_extent(struct btrfs_trans_handle *trans,
-                         struct btrfs_device *device, u64 num_bytes,
+int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                          u64 *start, u64 *max_avail);
 #endif
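
The new balance flags above are independent bits, so a caller can validate an ioctl's selection with pure mask arithmetic. A hedged userspace illustration; only the bit layout comes from the header, the check itself is hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    #define BALANCE_DATA      (1ULL << 0)
    #define BALANCE_SYSTEM    (1ULL << 1)
    #define BALANCE_METADATA  (1ULL << 2)
    #define BALANCE_TYPE_MASK (BALANCE_DATA | BALANCE_SYSTEM | BALANCE_METADATA)
    #define BALANCE_FORCE     (1ULL << 3)
    #define BALANCE_RESUME    (1ULL << 4)

    /* hypothetical sanity check: at least one chunk type selected,
     * and no bits outside the known set */
    static int balance_flags_ok(uint64_t flags)
    {
        uint64_t known = BALANCE_TYPE_MASK | BALANCE_FORCE | BALANCE_RESUME;

        if (flags & ~known)
            return 0;
        return (flags & BALANCE_TYPE_MASK) != 0;
    }

    int main(void)
    {
        printf("%d\n", balance_flags_ok(BALANCE_DATA | BALANCE_FORCE)); /* 1 */
        printf("%d\n", balance_flags_ok(BALANCE_FORCE));                /* 0 */
        return 0;
    }
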
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3848b04e310e..e7a5659087e6 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -200,7 +200,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
     ret = btrfs_update_inode(trans, root, inode);
     BUG_ON(ret);
 out:
-    btrfs_end_transaction_throttle(trans, root);
+    btrfs_end_transaction(trans, root);
     return ret;
 }
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b60fc8bfb3e9..620daad201db 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -641,10 +641,10 @@ static int __cap_is_valid(struct ceph_cap *cap)
     unsigned long ttl;
     u32 gen;
 
-    spin_lock(&cap->session->s_cap_lock);
+    spin_lock(&cap->session->s_gen_ttl_lock);
     gen = cap->session->s_cap_gen;
     ttl = cap->session->s_cap_ttl;
-    spin_unlock(&cap->session->s_cap_lock);
+    spin_unlock(&cap->session->s_gen_ttl_lock);
 
     if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
         dout("__cap_is_valid %p cap %p issued %s "
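
The pattern in __cap_is_valid is worth naming: both the generation and the TTL are copied out under the new, dedicated spinlock and the comparison runs on that snapshot afterwards, so the two fields are always read consistently without the lock being held across the time check. A stand-alone sketch with POSIX threads, all names hypothetical:

    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct session {
        pthread_mutex_t gen_ttl_lock;  /* guards cap_gen and cap_ttl only */
        uint32_t cap_gen;              /* bumped when the server says "stale" */
        time_t cap_ttl;                /* expiry of the current caps */
    };

    static int cap_is_valid(struct session *s, uint32_t issued_gen)
    {
        uint32_t gen;
        time_t ttl;

        pthread_mutex_lock(&s->gen_ttl_lock);
        gen = s->cap_gen;              /* snapshot both fields together */
        ttl = s->cap_ttl;
        pthread_mutex_unlock(&s->gen_ttl_lock);

        return issued_gen >= gen && time(NULL) < ttl;
    }

    int main(void)
    {
        struct session s = { PTHREAD_MUTEX_INITIALIZER, 1, time(NULL) + 60 };
        printf("%d\n", cap_is_valid(&s, 1));   /* 1: current gen, not expired */
        printf("%d\n", cap_is_valid(&s, 0));   /* 0: issued in an old gen */
        return 0;
    }
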
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 74fd74719dc2..3e8094be4604 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -973,12 +973,12 @@ static int dentry_lease_is_valid(struct dentry *dentry)
 
     spin_lock(&dentry->d_lock);
     di = ceph_dentry(dentry);
-    if (di && di->lease_session) {
+    if (di->lease_session) {
         s = di->lease_session;
-        spin_lock(&s->s_cap_lock);
+        spin_lock(&s->s_gen_ttl_lock);
         gen = s->s_cap_gen;
         ttl = s->s_cap_ttl;
-        spin_unlock(&s->s_cap_lock);
+        spin_unlock(&s->s_gen_ttl_lock);
 
         if (di->lease_gen == gen &&
             time_before(jiffies, dentry->d_time) &&
@@ -1072,13 +1072,11 @@ static void ceph_d_release(struct dentry *dentry)
     struct ceph_dentry_info *di = ceph_dentry(dentry);
 
     dout("d_release %p\n", dentry);
-    if (di) {
-        ceph_dentry_lru_del(dentry);
-        if (di->lease_session)
-            ceph_put_mds_session(di->lease_session);
-        kmem_cache_free(ceph_dentry_cachep, di);
-        dentry->d_fsdata = NULL;
-    }
+    ceph_dentry_lru_del(dentry);
+    if (di->lease_session)
+        ceph_put_mds_session(di->lease_session);
+    kmem_cache_free(ceph_dentry_cachep, di);
+    dentry->d_fsdata = NULL;
 }
 
 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
@@ -1096,17 +1094,36 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry,
  */
 void ceph_dir_set_complete(struct inode *inode)
 {
-    /* not yet implemented */
+    struct dentry *dentry = d_find_any_alias(inode);
+
+    if (dentry && ceph_dentry(dentry) &&
+        ceph_test_mount_opt(ceph_sb_to_client(dentry->d_sb), DCACHE)) {
+        dout(" marking %p (%p) complete\n", inode, dentry);
+        set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+    }
+    dput(dentry);
 }
 
 void ceph_dir_clear_complete(struct inode *inode)
 {
-    /* not yet implemented */
+    struct dentry *dentry = d_find_any_alias(inode);
+
+    if (dentry && ceph_dentry(dentry)) {
+        dout(" marking %p (%p) NOT complete\n", inode, dentry);
+        clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags);
+    }
+    dput(dentry);
 }
 
 bool ceph_dir_test_complete(struct inode *inode)
 {
-    /* not yet implemented */
+    struct dentry *dentry = d_find_any_alias(inode);
+
+    if (dentry && ceph_dentry(dentry)) {
+        bool ret = test_bit(CEPH_D_COMPLETE,
+                            &ceph_dentry(dentry)->flags);
+        dput(dentry);
+        return ret;
+    }
+    dput(dentry);
     return false;
 }
 
@@ -1220,6 +1237,7 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
     do {
         ceph_mdsc_get_request(req);
         spin_unlock(&ci->i_unsafe_lock);
+
         dout("dir_fsync %p wait on tid %llu (until %llu)\n",
              inode, req->r_tid, last_tid);
         if (req->r_timeout) {
@@ -1232,9 +1250,9 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
         } else {
             wait_for_completion(&req->r_safe_completion);
         }
-        spin_lock(&ci->i_unsafe_lock);
         ceph_mdsc_put_request(req);
 
+        spin_lock(&ci->i_unsafe_lock);
         if (ret || list_empty(head))
             break;
         req = list_entry(head->next,
@@ -1259,13 +1277,11 @@ void ceph_dentry_lru_add(struct dentry *dn)
 
     dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
          dn->d_name.len, dn->d_name.name);
-    if (di) {
-        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-        spin_lock(&mdsc->dentry_lru_lock);
-        list_add_tail(&di->lru, &mdsc->dentry_lru);
-        mdsc->num_dentry++;
-        spin_unlock(&mdsc->dentry_lru_lock);
-    }
+    mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+    spin_lock(&mdsc->dentry_lru_lock);
+    list_add_tail(&di->lru, &mdsc->dentry_lru);
+    mdsc->num_dentry++;
+    spin_unlock(&mdsc->dentry_lru_lock);
 }
 
 void ceph_dentry_lru_touch(struct dentry *dn)
@@ -1275,12 +1291,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
 
     dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
          dn->d_name.len, dn->d_name.name, di->offset);
-    if (di) {
-        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-        spin_lock(&mdsc->dentry_lru_lock);
-        list_move_tail(&di->lru, &mdsc->dentry_lru);
-        spin_unlock(&mdsc->dentry_lru_lock);
-    }
+    mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+    spin_lock(&mdsc->dentry_lru_lock);
+    list_move_tail(&di->lru, &mdsc->dentry_lru);
+    spin_unlock(&mdsc->dentry_lru_lock);
 }
 
 void ceph_dentry_lru_del(struct dentry *dn)
@@ -1290,13 +1304,11 @@ void ceph_dentry_lru_del(struct dentry *dn)
 
     dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
          dn->d_name.len, dn->d_name.name);
-    if (di) {
-        mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
-        spin_lock(&mdsc->dentry_lru_lock);
-        list_del_init(&di->lru);
-        mdsc->num_dentry--;
-        spin_unlock(&mdsc->dentry_lru_lock);
-    }
+    mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
+    spin_lock(&mdsc->dentry_lru_lock);
+    list_del_init(&di->lru);
+    mdsc->num_dentry--;
+    spin_unlock(&mdsc->dentry_lru_lock);
 }
 
 /*
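
The three LRU helpers above now assume di is always valid, but their shape is unchanged: add appends to the tail, touch moves an entry back to the tail, del unlinks and drops the count, all under one lock. The list mechanics in isolation, as a minimal intrusive list (not the kernel's list.h, names invented):

    #include <stdio.h>

    struct node { struct node *prev, *next; };

    static void list_init(struct node *h) { h->prev = h->next = h; }
    static void list_unlink(struct node *e)
    {
        e->prev->next = e->next;
        e->next->prev = e->prev;
    }
    static void list_add_tail(struct node *e, struct node *h)
    {
        e->prev = h->prev;
        e->next = h;
        h->prev->next = e;
        h->prev = e;
    }
    static void list_move_tail(struct node *e, struct node *h)
    {
        list_unlink(e);          /* "touch": most recently used goes last */
        list_add_tail(e, h);
    }

    int main(void)
    {
        struct node lru, a, b;
        list_init(&lru);
        list_add_tail(&a, &lru);     /* lru: a */
        list_add_tail(&b, &lru);     /* lru: a b */
        list_move_tail(&a, &lru);    /* lru: b a */
        printf("tail is a: %d\n", lru.prev == &a);
        list_unlink(&b);             /* "del" */
        return 0;
    }
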
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9fbcdecaaccd..fbb2a643ef10 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -56,9 +56,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
         return -EINVAL;
 
     spin_lock(&dentry->d_lock);
-    parent = dget(dentry->d_parent);
-    spin_unlock(&dentry->d_lock);
-
+    parent = dentry->d_parent;
     if (*max_len >= connected_handle_length) {
         dout("encode_fh %p connectable\n", dentry);
         cfh->ino = ceph_ino(dentry->d_inode);
@@ -81,7 +79,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
         *max_len = handle_length;
         type = 255;
     }
-    dput(parent);
+    spin_unlock(&dentry->d_lock);
     return type;
 }
 
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 25283e7a37f8..2c489378b4cd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -850,11 +850,12 @@ static void ceph_set_dentry_offset(struct dentry *dn)
 {
     struct dentry *dir = dn->d_parent;
     struct inode *inode = dir->d_inode;
-    struct ceph_inode_info *ci = ceph_inode(inode);
+    struct ceph_inode_info *ci;
     struct ceph_dentry_info *di;
 
     BUG_ON(!inode);
 
+    ci = ceph_inode(inode);
     di = ceph_dentry(dn);
 
     spin_lock(&ci->i_ceph_lock);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6203d805eb45..866e8d7ca37d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -262,6 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
     /* trace */
     ceph_decode_32_safe(&p, end, len, bad);
     if (len > 0) {
+        ceph_decode_need(&p, end, len, bad);
         err = parse_reply_info_trace(&p, p+len, info, features);
         if (err < 0)
             goto out_bad;
@@ -270,6 +271,7 @@ static int parse_reply_info(struct ceph_msg *msg,
     /* extra */
     ceph_decode_32_safe(&p, end, len, bad);
     if (len > 0) {
+        ceph_decode_need(&p, end, len, bad);
         err = parse_reply_info_extra(&p, p+len, info, features);
         if (err < 0)
             goto out_bad;
@@ -398,9 +400,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
     s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
     s->s_con.peer_name.num = cpu_to_le64(mds);
 
-    spin_lock_init(&s->s_cap_lock);
+    spin_lock_init(&s->s_gen_ttl_lock);
     s->s_cap_gen = 0;
     s->s_cap_ttl = 0;
+
+    spin_lock_init(&s->s_cap_lock);
     s->s_renew_requested = 0;
     s->s_renew_seq = 0;
     INIT_LIST_HEAD(&s->s_caps);
@@ -2326,10 +2330,10 @@ static void handle_session(struct ceph_mds_session *session,
     case CEPH_SESSION_STALE:
         pr_info("mds%d caps went stale, renewing\n",
             session->s_mds);
-        spin_lock(&session->s_cap_lock);
+        spin_lock(&session->s_gen_ttl_lock);
         session->s_cap_gen++;
         session->s_cap_ttl = 0;
-        spin_unlock(&session->s_cap_lock);
+        spin_unlock(&session->s_gen_ttl_lock);
         send_renew_caps(mdsc, session);
         break;
 
@@ -2772,7 +2776,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
     di = ceph_dentry(dentry);
     switch (h->action) {
     case CEPH_MDS_LEASE_REVOKE:
-        if (di && di->lease_session == session) {
+        if (di->lease_session == session) {
             if (ceph_seq_cmp(di->lease_seq, seq) > 0)
                 h->seq = cpu_to_le32(di->lease_seq);
             __ceph_mdsc_drop_dentry_lease(dentry);
@@ -2781,7 +2785,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
         break;
 
     case CEPH_MDS_LEASE_RENEW:
-        if (di && di->lease_session == session &&
+        if (di->lease_session == session &&
             di->lease_gen == session->s_cap_gen &&
             di->lease_renew_from &&
             di->lease_renew_after == 0) {
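
Both parse_reply_info hunks add the same guard: after reading a length word off the wire, verify that many bytes actually remain before handing p+len to the section parser. A sketch of the idiom; the helper names are made up, and the real ceph_decode_need jumps to a bad label instead of returning an error code:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* fail if fewer than need bytes remain between *p and end */
    static int decode_need(const uint8_t **p, const uint8_t *end, size_t need)
    {
        return (size_t)(end - *p) >= need ? 0 : -1;
    }

    static int decode_u32(const uint8_t **p, const uint8_t *end, uint32_t *v)
    {
        if (decode_need(p, end, sizeof(*v)))
            return -1;
        memcpy(v, *p, sizeof(*v));   /* assumes a little-endian host */
        *p += sizeof(*v);
        return 0;
    }

    int main(void)
    {
        uint8_t buf[4] = { 5, 0, 0, 0 };   /* length word claims 5 bytes */
        const uint8_t *p = buf;
        uint32_t len;

        if (decode_u32(&p, buf + sizeof(buf), &len) == 0 &&
            decode_need(&p, buf + sizeof(buf), len) != 0)
            printf("short buffer: section claims %u bytes\n", len);
        return 0;
    }
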
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index a50ca0e39475..8c7c04ebb595 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -117,10 +117,13 @@ struct ceph_mds_session {
     void             *s_authorizer_buf, *s_authorizer_reply_buf;
     size_t            s_authorizer_buf_len, s_authorizer_reply_buf_len;
 
-    /* protected by s_cap_lock */
-    spinlock_t        s_cap_lock;
+    /* protected by s_gen_ttl_lock */
+    spinlock_t        s_gen_ttl_lock;
     u32               s_cap_gen;  /* inc each time we get mds stale msg */
     unsigned long     s_cap_ttl;  /* when session caps expire */
+
+    /* protected by s_cap_lock */
+    spinlock_t        s_cap_lock;
     struct list_head  s_caps;     /* all caps issued by this session */
     int               s_nr_caps, s_trim_caps;
     int               s_num_cap_releases;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 11bd0fc4853f..00de2c9568cd 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,8 @@ enum {
     Opt_rbytes,
     Opt_norbytes,
     Opt_noasyncreaddir,
+    Opt_dcache,
+    Opt_nodcache,
     Opt_ino32,
 };
 
@@ -152,6 +154,8 @@ static match_table_t fsopt_tokens = {
     {Opt_rbytes, "rbytes"},
     {Opt_norbytes, "norbytes"},
     {Opt_noasyncreaddir, "noasyncreaddir"},
+    {Opt_dcache, "dcache"},
+    {Opt_nodcache, "nodcache"},
     {Opt_ino32, "ino32"},
     {-1, NULL}
 };
@@ -231,6 +235,12 @@ static int parse_fsopt_token(char *c, void *private)
     case Opt_noasyncreaddir:
         fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
         break;
+    case Opt_dcache:
+        fsopt->flags |= CEPH_MOUNT_OPT_DCACHE;
+        break;
+    case Opt_nodcache:
+        fsopt->flags &= ~CEPH_MOUNT_OPT_DCACHE;
+        break;
     case Opt_ino32:
         fsopt->flags |= CEPH_MOUNT_OPT_INO32;
         break;
@@ -377,6 +387,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
         seq_puts(m, ",norbytes");
     if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
         seq_puts(m, ",noasyncreaddir");
+    if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE)
+        seq_puts(m, ",dcache");
+    else
+        seq_puts(m, ",nodcache");
 
     if (fsopt->wsize)
         seq_printf(m, ",wsize=%d", fsopt->wsize);
@@ -636,19 +650,26 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
     req->r_num_caps = 2;
     err = ceph_mdsc_do_request(mdsc, NULL, req);
     if (err == 0) {
+        struct inode *inode = req->r_target_inode;
+        req->r_target_inode = NULL;
         dout("open_root_inode success\n");
-        if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
+        if (ceph_ino(inode) == CEPH_INO_ROOT &&
             fsc->sb->s_root == NULL) {
-            root = d_alloc_root(req->r_target_inode);
-            ceph_init_dentry(root);
+            root = d_alloc_root(inode);
+            if (!root) {
+                iput(inode);
+                root = ERR_PTR(-ENOMEM);
+                goto out;
+            }
         } else {
-            root = d_obtain_alias(req->r_target_inode);
+            root = d_obtain_alias(inode);
         }
-        req->r_target_inode = NULL;
+        ceph_init_dentry(root);
         dout("open_root_inode success, root dentry is %p\n", root);
     } else {
         root = ERR_PTR(err);
     }
+out:
     ceph_mdsc_put_request(req);
     return root;
 }
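
dcache/nodcache follow the standard three-touch recipe for a new mount option: an enum token, a match_table_t entry, and a case in the parser that sets or clears one flag bit, plus a line in show_options so the choice round-trips through /proc/mounts. The set/clear idiom reduced to plain C; the flag value is borrowed from the patch, the strcmp parser is a stand-in for the kernel's token matcher:

    #include <stdio.h>
    #include <string.h>

    #define MOUNT_OPT_DCACHE (1 << 9)

    static int parse_opt(const char *tok, unsigned *flags)
    {
        if (strcmp(tok, "dcache") == 0)
            *flags |= MOUNT_OPT_DCACHE;      /* opt in */
        else if (strcmp(tok, "nodcache") == 0)
            *flags &= ~MOUNT_OPT_DCACHE;     /* explicit opt out */
        else
            return -1;
        return 0;
    }

    int main(void)
    {
        unsigned flags = 0;
        parse_opt("dcache", &flags);
        parse_opt("nodcache", &flags);       /* later token wins */
        printf("dcache=%d\n", !!(flags & MOUNT_OPT_DCACHE));
        return 0;
    }
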
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index cb3652b37271..1421f3d875a2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -28,6 +28,7 @@
 #define CEPH_MOUNT_OPT_RBYTES          (1<<5) /* dir st_bytes = rbytes */
 #define CEPH_MOUNT_OPT_NOASYNCREADDIR  (1<<7) /* no dcache readdir */
 #define CEPH_MOUNT_OPT_INO32           (1<<8) /* 32 bit inos */
+#define CEPH_MOUNT_OPT_DCACHE          (1<<9) /* use dcache for readdir etc */
 
 #define CEPH_MOUNT_OPT_DEFAULT    (CEPH_MOUNT_OPT_RBYTES)
 
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index a5e36e4488a7..a76f697303d9 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -111,8 +111,10 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
+    { true, "ceph.file.layout", ceph_vxattrcb_layout},
+    /* The following extended attribute name is deprecated */
     { true, "ceph.layout", ceph_vxattrcb_layout},
-    { NULL, NULL }
+    { true, NULL, NULL }
 };
 
 static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
@@ -818,6 +820,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
     struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
     int issued;
     int err;
+    int required_blob_size;
     int dirty;
 
     if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -833,14 +836,34 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
         return -EOPNOTSUPP;
     }
 
+    err = -ENOMEM;
     spin_lock(&ci->i_ceph_lock);
     __build_xattrs(inode);
+retry:
     issued = __ceph_caps_issued(ci, NULL);
     dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
 
     if (!(issued & CEPH_CAP_XATTR_EXCL))
         goto do_sync;
 
+    required_blob_size = __get_required_blob_size(ci, 0, 0);
+
+    if (!ci->i_xattrs.prealloc_blob ||
+        required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+        struct ceph_buffer *blob;
+
+        spin_unlock(&ci->i_ceph_lock);
+        dout(" preallocating new blob size=%d\n", required_blob_size);
+        blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
+        if (!blob)
+            goto out;
+        spin_lock(&ci->i_ceph_lock);
+        if (ci->i_xattrs.prealloc_blob)
+            ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+        ci->i_xattrs.prealloc_blob = blob;
+        goto retry;
+    }
+
     err = __remove_xattr_by_name(ceph_inode(inode), name);
     dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
     ci->i_xattrs.dirty = true;
@@ -853,6 +876,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
 do_sync:
     spin_unlock(&ci->i_ceph_lock);
     err = ceph_send_removexattr(dentry, name);
+out:
     return err;
 }
 
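
The removexattr change copies setxattr's preallocation dance: the blob may need to grow, the allocation can sleep, so the lock is dropped around it and the size check is redone after relocking. The control flow in miniature, with a mutex standing in for the spinlock and every name invented:

    #include <pthread.h>
    #include <stdlib.h>

    struct xattrs {
        pthread_mutex_t lock;
        char *blob;
        size_t alloc_len;
        size_t needed;       /* recomputed under the lock */
    };

    static int reserve_blob(struct xattrs *x)
    {
        pthread_mutex_lock(&x->lock);
    retry:
        if (!x->blob || x->needed > x->alloc_len) {
            size_t want = x->needed;
            char *blob;

            pthread_mutex_unlock(&x->lock);   /* allocation may sleep */
            blob = malloc(want);
            if (!blob)
                return -1;
            pthread_mutex_lock(&x->lock);
            free(x->blob);                    /* replace the old blob */
            x->blob = blob;
            x->alloc_len = want;
            goto retry;                       /* size may have grown */
        }
        /* ... modify the xattrs while still holding the lock ... */
        pthread_mutex_unlock(&x->lock);
        return 0;
    }

    int main(void)
    {
        struct xattrs x = { PTHREAD_MUTEX_INITIALIZER, NULL, 0, 128 };
        return reserve_blob(&x);
    }
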
diff --git a/fs/char_dev.c b/fs/char_dev.c
index dca9e5e0f73b..3f152b92a94a 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -272,7 +272,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
     cd = __register_chrdev_region(major, baseminor, count, name);
     if (IS_ERR(cd))
         return PTR_ERR(cd);
-    
+
     cdev = cdev_alloc();
     if (!cdev)
         goto out2;
@@ -280,7 +280,7 @@ int __register_chrdev(unsigned int major, unsigned int baseminor,
     cdev->owner = fops->owner;
     cdev->ops = fops;
     kobject_set_name(&cdev->kobj, "%s", name);
-    
+
     err = cdev_add(cdev, MKDEV(cd->major, baseminor), count);
     if (err)
         goto out;
@@ -405,7 +405,7 @@ static int chrdev_open(struct inode *inode, struct file *filp)
         goto out_cdev_put;
 
     if (filp->f_op->open) {
-        ret = filp->f_op->open(inode,filp);
+        ret = filp->f_op->open(inode, filp);
         if (ret)
             goto out_cdev_put;
     }
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index f66cc1625150..0554b00a7b33 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -140,7 +140,6 @@ config CIFS_DFS_UPCALL
 
 config CIFS_FSCACHE
       bool "Provide CIFS client caching support (EXPERIMENTAL)"
-      depends on EXPERIMENTAL
       depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y
       help
         Makes CIFS FS-Cache capable. Say Y here if you want your CIFS data
@@ -149,7 +148,7 @@ config CIFS_FSCACHE
 
 config CIFS_ACL
       bool "Provide CIFS ACL support (EXPERIMENTAL)"
-      depends on EXPERIMENTAL && CIFS_XATTR && KEYS
+      depends on CIFS_XATTR && KEYS
       help
         Allows to fetch CIFS/NTFS ACL from the server. The DACL blob
         is handed over to the application/caller.
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 84e8c0724704..24b3dfc05282 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -676,14 +676,23 @@ static ssize_t cifs_multiuser_mount_proc_write(struct file *file,
 {
     char c;
     int rc;
+    static bool warned;
 
     rc = get_user(c, buffer);
     if (rc)
         return rc;
     if (c == '0' || c == 'n' || c == 'N')
         multiuser_mount = 0;
-    else if (c == '1' || c == 'y' || c == 'Y')
+    else if (c == '1' || c == 'y' || c == 'Y') {
         multiuser_mount = 1;
+        if (!warned) {
+            warned = true;
+            printk(KERN_WARNING "CIFS VFS: The legacy multiuser "
+                "mount code is scheduled to be deprecated in "
+                "3.5. Please switch to using the multiuser "
+                "mount option.");
+        }
+    }
 
     return count;
 }
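
The one-time deprecation warning relies on nothing more than a function-local static flag, the open-coded equivalent of printk_once. Reduced:

    #include <stdio.h>

    static void enable_legacy_mode(void)
    {
        static int warned;   /* persists across calls, zero-initialized */

        if (!warned) {
            warned = 1;
            fprintf(stderr, "legacy multiuser mount is deprecated\n");
        }
        /* ... enable the feature ... */
    }

    int main(void)
    {
        enable_legacy_mode();
        enable_legacy_mode();   /* second call stays quiet */
        return 0;
    }
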
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 2272fd5fe5b7..e622863b292f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,9 +113,11 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
            MAX_MECH_STR_LEN +
            UID_KEY_LEN + (sizeof(uid_t) * 2) +
            CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
-           USER_KEY_LEN + strlen(sesInfo->user_name) +
            PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
 
+    if (sesInfo->user_name)
+        desc_len += USER_KEY_LEN + strlen(sesInfo->user_name);
+
     spnego_key = ERR_PTR(-ENOMEM);
     description = kzalloc(desc_len, GFP_KERNEL);
     if (description == NULL)
@@ -152,8 +154,10 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
     dp = description + strlen(description);
     sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
 
-    dp = description + strlen(description);
-    sprintf(dp, ";user=%s", sesInfo->user_name);
+    if (sesInfo->user_name) {
+        dp = description + strlen(description);
+        sprintf(dp, ";user=%s", sesInfo->user_name);
+    }
 
     dp = description + strlen(description);
     sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 1b2e180b018d..fbb9da951843 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -27,17 +27,17 @@
 #include "cifs_debug.h"
 
 /*
- * cifs_ucs2_bytes - how long will a string be after conversion?
- * @ucs - pointer to input string
+ * cifs_utf16_bytes - how long will a string be after conversion?
+ * @utf16 - pointer to input string
  * @maxbytes - don't go past this many bytes of input string
  * @codepage - destination codepage
  *
- * Walk a ucs2le string and return the number of bytes that the string will
+ * Walk a utf16le string and return the number of bytes that the string will
  * be after being converted to the given charset, not including any null
  * termination required. Don't walk past maxbytes in the source buffer.
  */
 int
-cifs_ucs2_bytes(const __le16 *from, int maxbytes,
+cifs_utf16_bytes(const __le16 *from, int maxbytes,
         const struct nls_table *codepage)
 {
     int i;
@@ -122,7 +122,7 @@ cp_convert:
 }
 
 /*
- * cifs_from_ucs2 - convert utf16le string to local charset
+ * cifs_from_utf16 - convert utf16le string to local charset
  * @to - destination buffer
  * @from - source buffer
  * @tolen - destination buffer size (in bytes)
@@ -130,7 +130,7 @@ cp_convert:
  * @codepage - codepage to which characters should be converted
  * @mapchar - should characters be remapped according to the mapchars option?
  *
- * Convert a little-endian ucs2le string (as sent by the server) to a string
+ * Convert a little-endian utf16le string (as sent by the server) to a string
  * in the provided codepage. The tolen and fromlen parameters are to ensure
  * that the code doesn't walk off of the end of the buffer (which is always
  * a danger if the alignment of the source buffer is off). The destination
@@ -139,12 +139,12 @@ cp_convert:
  * null terminator).
  *
  * Note that some windows versions actually send multiword UTF-16 characters
- * instead of straight UCS-2. The linux nls routines however aren't able to
+ * instead of straight UTF-16. The linux nls routines however aren't able to
  * deal with those characters properly. In the event that we get some of
  * those characters, they won't be translated properly.
  */
 int
-cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
+cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
            const struct nls_table *codepage, bool mapchar)
 {
     int i, charlen, safelen;
@@ -190,13 +190,13 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
 }
 
 /*
- * NAME:    cifs_strtoUCS()
+ * NAME:    cifs_strtoUTF16()
  *
  * FUNCTION:    Convert character string to unicode string
  *
  */
 int
-cifs_strtoUCS(__le16 *to, const char *from, int len,
+cifs_strtoUTF16(__le16 *to, const char *from, int len,
           const struct nls_table *codepage)
 {
     int charlen;
@@ -206,7 +206,7 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
     for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
         charlen = codepage->char2uni(from, len, &wchar_to);
         if (charlen < 1) {
-            cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
+            cERROR(1, "strtoUTF16: char2uni of 0x%x returned %d",
                 *from, charlen);
             /* A question mark */
             wchar_to = 0x003f;
@@ -220,7 +220,8 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
 }
 
 /*
- * cifs_strndup_from_ucs - copy a string from wire format to the local codepage
+ * cifs_strndup_from_utf16 - copy a string from wire format to the local
+ * codepage
  * @src - source string
  * @maxlen - don't walk past this many bytes in the source string
 * @is_unicode - is this a unicode string?
@@ -231,19 +232,19 @@ cifs_strtoUTF16(__le16 *to, const char *from, int len,
  * error.
  */
 char *
-cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
-              const struct nls_table *codepage)
+cifs_strndup_from_utf16(const char *src, const int maxlen,
+            const bool is_unicode, const struct nls_table *codepage)
 {
     int len;
     char *dst;
 
     if (is_unicode) {
-        len = cifs_ucs2_bytes((__le16 *) src, maxlen, codepage);
+        len = cifs_utf16_bytes((__le16 *) src, maxlen, codepage);
         len += nls_nullsize(codepage);
         dst = kmalloc(len, GFP_KERNEL);
         if (!dst)
             return NULL;
-        cifs_from_ucs2(dst, (__le16 *) src, len, maxlen, codepage,
+        cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
                false);
     } else {
         len = strnlen(src, maxlen);
@@ -264,7 +265,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
  * names are little endian 16 bit Unicode on the wire
  */
 int
-cifsConvertToUCS(__le16 *target, const char *source, int srclen,
+cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
          const struct nls_table *cp, int mapChars)
 {
     int i, j, charlen;
@@ -273,7 +274,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
     wchar_t tmp;
 
     if (!mapChars)
-        return cifs_strtoUCS(target, source, PATH_MAX, cp);
+        return cifs_strtoUTF16(target, source, PATH_MAX, cp);
 
     for (i = 0, j = 0; i < srclen; j++) {
         src_char = source[i];
@@ -281,7 +282,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
         switch (src_char) {
         case 0:
             put_unaligned(0, &target[j]);
-            goto ctoUCS_out;
+            goto ctoUTF16_out;
         case ':':
             dst_char = cpu_to_le16(UNI_COLON);
             break;
@@ -326,7 +327,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
         put_unaligned(dst_char, &target[j]);
     }
 
-ctoUCS_out:
+ctoUTF16_out:
     return i;
 }
 
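
Behind all the renames, cifs_utf16_bytes stays a pure measuring pass: walk the UTF-16LE input up to maxbytes, ask the target codepage what each character costs, and sum, so callers can size the output buffer before converting. An ASCII-only model of that loop, with the nls lookup replaced by a constant cost of 1:

    #include <stdint.h>
    #include <stdio.h>

    /* Measuring pass: output bytes needed to convert a NUL-terminated
     * UTF-16LE string, assuming every code unit costs one byte. The
     * real code asks the nls codepage for the per-character length. */
    static int utf16_bytes(const uint16_t *from, int maxbytes)
    {
        int i, n = 0;

        for (i = 0; i < maxbytes / 2 && from[i]; i++)
            n += 1;              /* += charlen for multibyte codepages */
        return n;
    }

    int main(void)
    {
        uint16_t s[3] = { 'h', 'i', 0 };   /* UTF-16LE on a LE host */
        printf("%d\n", utf16_bytes(s, (int)sizeof(s)));   /* prints 2 */
        return 0;
    }
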
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 6d02fd560566..a513a546700b 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -74,16 +74,16 @@ extern const struct UniCaseRange CifsUniLowerRange[];
 #endif              /* UNIUPR_NOLOWER */
 
 #ifdef __KERNEL__
-int cifs_from_ucs2(char *to, const __le16 *from, int tolen, int fromlen,
-           const struct nls_table *codepage, bool mapchar);
-int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
-            const struct nls_table *codepage);
-int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
-char *cifs_strndup_from_ucs(const char *src, const int maxlen,
-                const bool is_unicode,
-                const struct nls_table *codepage);
-extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
-                const struct nls_table *cp, int mapChars);
+int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
+            const struct nls_table *codepage, bool mapchar);
+int cifs_utf16_bytes(const __le16 *from, int maxbytes,
+             const struct nls_table *codepage);
+int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
+char *cifs_strndup_from_utf16(const char *src, const int maxlen,
+                  const bool is_unicode,
+                  const struct nls_table *codepage);
+extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
+                  const struct nls_table *cp, int mapChars);
 
 #endif
 
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 72ddf23ef6f7..c1b254487388 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -909,6 +909,8 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
         umode_t group_mask = S_IRWXG;
         umode_t other_mask = S_IRWXU | S_IRWXG | S_IRWXO;
 
+        if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *))
+            return;
         ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
                 GFP_KERNEL);
         if (!ppace) {
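
The parse_dacl fix is the canonical multiplication-overflow guard: num_aces is attacker-controlled, and a large count times sizeof(pointer) can wrap into a small allocation, so the count must be bounded by ULONG_MAX divided by the element size before it reaches the allocator. The guard by itself:

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* num_aces arrives off the wire; reject counts whose byte size
     * would wrap around before they reach the allocator */
    static void **alloc_ace_ptrs(unsigned long num_aces)
    {
        if (num_aces > ULONG_MAX / sizeof(void *))
            return NULL;
        return malloc(num_aces * sizeof(void *));
    }

    int main(void)
    {
        void **ok = alloc_ace_ptrs(4);
        void **bad = alloc_ace_ptrs(ULONG_MAX);   /* rejected: would wrap */

        printf("ok=%s bad=%s\n", ok ? "alloc" : "null",
               bad ? "alloc" : "null");
        free(ok);
        return 0;
    }
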
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 5d9b9acc5fce..63c460e503b6 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -327,7 +327,7 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
     attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
     attrptr->length = cpu_to_le16(2 * dlen);
     blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
-    cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
+    cifs_strtoUTF16((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
 
     return 0;
 }
@@ -376,7 +376,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
                 kmalloc(attrsize + 1, GFP_KERNEL);
             if (!ses->domainName)
                 return -ENOMEM;
-            cifs_from_ucs2(ses->domainName,
+            cifs_from_utf16(ses->domainName,
                 (__le16 *)blobptr, attrsize, attrsize,
                 nls_cp, false);
             break;
@@ -420,15 +420,20 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
     }
 
     /* convert ses->user_name to unicode and uppercase */
-    len = strlen(ses->user_name);
+    len = ses->user_name ? strlen(ses->user_name) : 0;
     user = kmalloc(2 + (len * 2), GFP_KERNEL);
     if (user == NULL) {
         cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
         rc = -ENOMEM;
         return rc;
     }
-    len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
-    UniStrupr(user);
+
+    if (len) {
+        len = cifs_strtoUTF16((__le16 *)user, ses->user_name, len, nls_cp);
+        UniStrupr(user);
+    } else {
+        memset(user, '\0', 2);
+    }
 
     rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
                 (char *)user, 2 * len);
@@ -448,8 +453,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
             rc = -ENOMEM;
             return rc;
         }
-        len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
-                    nls_cp);
+        len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len,
+                      nls_cp);
         rc =
         crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
                     (char *)domain, 2 * len);
@@ -468,7 +473,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
             rc = -ENOMEM;
             return rc;
         }
-        len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+        len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len,
                     nls_cp);
         rc =
         crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ba53c1c6c6cc..76e7d8b6da17 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -879,6 +879,8 @@ require use of the stronger protocol */
 #define   CIFSSEC_MASK          0xB70B7 /* current flags supported if weak */
 #endif /* UPCALL */
 #else /* do not allow weak pw hash */
+#define   CIFSSEC_MUST_LANMAN   0
+#define   CIFSSEC_MUST_PLNTXT   0
 #ifdef CONFIG_CIFS_UPCALL
 #define   CIFSSEC_MASK          0x8F08F /* flags supported if no weak allowed */
 #else
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6600aa2d2ef3..8b7794c31591 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -821,8 +821,8 @@ PsxDelete:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->FileName, fileName,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else { /* BB add path length overrun check */
@@ -893,8 +893,8 @@ DelFileRetry:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->fileName, fileName,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->fileName, fileName,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {        /* BB improve check for buffer overruns BB */
@@ -938,8 +938,8 @@ RmDirRetry:
         return rc;
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-        name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, dirName,
-                                    PATH_MAX, nls_codepage, remap);
+        name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, dirName,
+                                      PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {        /* BB improve check for buffer overruns BB */
@@ -981,8 +981,8 @@ MkDirRetry:
         return rc;
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-        name_len = cifsConvertToUCS((__le16 *) pSMB->DirName, name,
-                                    PATH_MAX, nls_codepage, remap);
+        name_len = cifsConvertToUTF16((__le16 *) pSMB->DirName, name,
+                                      PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {        /* BB improve check for buffer overruns BB */
@@ -1030,8 +1030,8 @@ PsxCreat:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->FileName, name,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->FileName, name,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {    /* BB improve the check for buffer overruns BB */
@@ -1197,8 +1197,8 @@ OldOpenRetry:
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         count = 1;    /* account for one byte pad to word boundary */
         name_len =
-            cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
-                             fileName, PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+                               fileName, PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
     } else {        /* BB improve check for buffer overruns BB */
@@ -1304,8 +1304,8 @@ openRetry:
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         count = 1;    /* account for one byte pad to word boundary */
         name_len =
-            cifsConvertToUCS((__le16 *) (pSMB->fileName + 1),
-                             fileName, PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
+                               fileName, PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
         pSMB->NameLength = cpu_to_le16(name_len);
@@ -2649,16 +2649,16 @@ renameRetry:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
         pSMB->OldFileName[name_len] = 0x04;    /* pad */
     /* protocol requires ASCII signature byte on Unicode string */
         pSMB->OldFileName[name_len + 1] = 0x00;
         name_len2 =
-            cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
-                             toName, PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+                               toName, PATH_MAX, nls_codepage, remap);
         name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
         name_len2 *= 2;    /* convert to bytes */
     } else {    /* BB improve the check for buffer overruns BB */
@@ -2738,10 +2738,12 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifs_tcon *pTcon,
     /* unicode only call */
     if (target_name == NULL) {
         sprintf(dummy_string, "cifs%x", pSMB->hdr.Mid);
-        len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+        len_of_str =
+            cifsConvertToUTF16((__le16 *)rename_info->target_name,
                     dummy_string, 24, nls_codepage, remap);
     } else {
-        len_of_str = cifsConvertToUCS((__le16 *)rename_info->target_name,
+        len_of_str =
+            cifsConvertToUTF16((__le16 *)rename_info->target_name,
                     target_name, PATH_MAX, nls_codepage,
                     remap);
     }
@@ -2795,17 +2797,17 @@ copyRetry:
     pSMB->Flags = cpu_to_le16(flags & COPY_TREE);
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-        name_len = cifsConvertToUCS((__le16 *) pSMB->OldFileName,
-                                    fromName, PATH_MAX, nls_codepage,
-                                    remap);
+        name_len = cifsConvertToUTF16((__le16 *) pSMB->OldFileName,
+                                      fromName, PATH_MAX, nls_codepage,
+                                      remap);
         name_len++;     /* trailing null */
         name_len *= 2;
         pSMB->OldFileName[name_len] = 0x04;     /* pad */
         /* protocol requires ASCII signature byte on Unicode string */
         pSMB->OldFileName[name_len + 1] = 0x00;
         name_len2 =
-            cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
-                             toName, PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
+                               toName, PATH_MAX, nls_codepage, remap);
         name_len2 += 1 /* trailing null */  + 1 /* Signature word */ ;
         name_len2 *= 2; /* convert to bytes */
     } else {    /* BB improve the check for buffer overruns BB */
@@ -2861,9 +2863,9 @@ createSymLinkRetry:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifs_strtoUCS((__le16 *) pSMB->FileName, fromName, PATH_MAX
-                          /* find define for this maxpathcomponent */
-                          , nls_codepage);
+            cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName,
+                            /* find define for this maxpathcomponent */
+                            PATH_MAX, nls_codepage);
         name_len++;    /* trailing null */
         name_len *= 2;
 
@@ -2885,9 +2887,9 @@ createSymLinkRetry:
     data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len_target =
-            cifs_strtoUCS((__le16 *) data_offset, toName, PATH_MAX
-                          /* find define for this maxpathcomponent */
-                          , nls_codepage);
+            cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX
+                            /* find define for this maxpathcomponent */
+                            , nls_codepage);
         name_len_target++;    /* trailing null */
         name_len_target *= 2;
     } else {    /* BB improve the check for buffer overruns BB */
@@ -2949,8 +2951,8 @@ createHardLinkRetry:
         return rc;
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
-        name_len = cifsConvertToUCS((__le16 *) pSMB->FileName, toName,
-                                    PATH_MAX, nls_codepage, remap);
+        name_len = cifsConvertToUTF16((__le16 *) pSMB->FileName, toName,
+                                      PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
 
@@ -2972,8 +2974,8 @@ createHardLinkRetry:
     data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len_target =
-            cifsConvertToUCS((__le16 *) data_offset, fromName, PATH_MAX,
-                             nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) data_offset, fromName,
+                               PATH_MAX, nls_codepage, remap);
         name_len_target++;    /* trailing null */
         name_len_target *= 2;
     } else {    /* BB improve the check for buffer overruns BB */
@@ -3042,8 +3044,8 @@ winCreateHardLinkRetry:
 
     if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
         name_len =
-            cifsConvertToUCS((__le16 *) pSMB->OldFileName, fromName,
-                             PATH_MAX, nls_codepage, remap);
+            cifsConvertToUTF16((__le16 *) pSMB->OldFileName, fromName,
+                               PATH_MAX, nls_codepage, remap);
         name_len++;    /* trailing null */
         name_len *= 2;
 
@@ -3051,8 +3053,8 @@ winCreateHardLinkRetry:
         pSMB->OldFileName[name_len] = 0x04;
         pSMB->OldFileName[name_len + 1] = 0x00;    /* pad */
3053 name_len2 = 3055 name_len2 =
3054 cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2], 3056 cifsConvertToUTF16((__le16 *)&pSMB->OldFileName[name_len+2],
3055 toName, PATH_MAX, nls_codepage, remap); 3057 toName, PATH_MAX, nls_codepage, remap);
3056 name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ; 3058 name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
3057 name_len2 *= 2; /* convert to bytes */ 3059 name_len2 *= 2; /* convert to bytes */
3058 } else { /* BB improve the check for buffer overruns BB */ 3060 } else { /* BB improve the check for buffer overruns BB */
@@ -3108,8 +3110,8 @@ querySymLinkRetry:
3108 3110
3109 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 3111 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
3110 name_len = 3112 name_len =
3111 cifs_strtoUCS((__le16 *) pSMB->FileName, searchName, 3113 cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName,
3112 PATH_MAX, nls_codepage); 3114 PATH_MAX, nls_codepage);
3113 name_len++; /* trailing null */ 3115 name_len++; /* trailing null */
3114 name_len *= 2; 3116 name_len *= 2;
3115 } else { /* BB improve the check for buffer overruns BB */ 3117 } else { /* BB improve the check for buffer overruns BB */
@@ -3166,8 +3168,8 @@ querySymLinkRetry:
3166 is_unicode = false; 3168 is_unicode = false;
3167 3169
3168 /* BB FIXME investigate remapping reserved chars here */ 3170 /* BB FIXME investigate remapping reserved chars here */
3169 *symlinkinfo = cifs_strndup_from_ucs(data_start, count, 3171 *symlinkinfo = cifs_strndup_from_utf16(data_start,
3170 is_unicode, nls_codepage); 3172 count, is_unicode, nls_codepage);
3171 if (!*symlinkinfo) 3173 if (!*symlinkinfo)
3172 rc = -ENOMEM; 3174 rc = -ENOMEM;
3173 } 3175 }
@@ -3450,8 +3452,9 @@ queryAclRetry:
3450 3452
3451 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 3453 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
3452 name_len = 3454 name_len =
3453 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 3455 cifsConvertToUTF16((__le16 *) pSMB->FileName,
3454 PATH_MAX, nls_codepage, remap); 3456 searchName, PATH_MAX, nls_codepage,
3457 remap);
3455 name_len++; /* trailing null */ 3458 name_len++; /* trailing null */
3456 name_len *= 2; 3459 name_len *= 2;
3457 pSMB->FileName[name_len] = 0; 3460 pSMB->FileName[name_len] = 0;
@@ -3537,8 +3540,8 @@ setAclRetry:
3537 return rc; 3540 return rc;
3538 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 3541 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
3539 name_len = 3542 name_len =
3540 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 3543 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
3541 PATH_MAX, nls_codepage, remap); 3544 PATH_MAX, nls_codepage, remap);
3542 name_len++; /* trailing null */ 3545 name_len++; /* trailing null */
3543 name_len *= 2; 3546 name_len *= 2;
3544 } else { /* BB improve the check for buffer overruns BB */ 3547 } else { /* BB improve the check for buffer overruns BB */
@@ -3948,8 +3951,9 @@ QInfRetry:
3948 3951
3949 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 3952 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
3950 name_len = 3953 name_len =
3951 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 3954 cifsConvertToUTF16((__le16 *) pSMB->FileName,
3952 PATH_MAX, nls_codepage, remap); 3955 searchName, PATH_MAX, nls_codepage,
3956 remap);
3953 name_len++; /* trailing null */ 3957 name_len++; /* trailing null */
3954 name_len *= 2; 3958 name_len *= 2;
3955 } else { 3959 } else {
@@ -4086,8 +4090,8 @@ QPathInfoRetry:
4086 4090
4087 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4091 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4088 name_len = 4092 name_len =
4089 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 4093 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
4090 PATH_MAX, nls_codepage, remap); 4094 PATH_MAX, nls_codepage, remap);
4091 name_len++; /* trailing null */ 4095 name_len++; /* trailing null */
4092 name_len *= 2; 4096 name_len *= 2;
4093 } else { /* BB improve the check for buffer overruns BB */ 4097 } else { /* BB improve the check for buffer overruns BB */
@@ -4255,8 +4259,8 @@ UnixQPathInfoRetry:
4255 4259
4256 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4260 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4257 name_len = 4261 name_len =
4258 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 4262 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
4259 PATH_MAX, nls_codepage, remap); 4263 PATH_MAX, nls_codepage, remap);
4260 name_len++; /* trailing null */ 4264 name_len++; /* trailing null */
4261 name_len *= 2; 4265 name_len *= 2;
4262 } else { /* BB improve the check for buffer overruns BB */ 4266 } else { /* BB improve the check for buffer overruns BB */
@@ -4344,8 +4348,8 @@ findFirstRetry:
4344 4348
4345 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4349 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4346 name_len = 4350 name_len =
4347 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 4351 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
4348 PATH_MAX, nls_codepage, remap); 4352 PATH_MAX, nls_codepage, remap);
4349 /* We can not add the asterik earlier in case 4353 /* We can not add the asterik earlier in case
4350 it got remapped to 0xF03A as if it were part of the 4354 it got remapped to 0xF03A as if it were part of the
4351 directory name instead of a wildcard */ 4355 directory name instead of a wildcard */
@@ -4656,8 +4660,9 @@ GetInodeNumberRetry:
4656 4660
4657 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4661 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4658 name_len = 4662 name_len =
4659 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 4663 cifsConvertToUTF16((__le16 *) pSMB->FileName,
4660 PATH_MAX, nls_codepage, remap); 4664 searchName, PATH_MAX, nls_codepage,
4665 remap);
4661 name_len++; /* trailing null */ 4666 name_len++; /* trailing null */
4662 name_len *= 2; 4667 name_len *= 2;
4663 } else { /* BB improve the check for buffer overruns BB */ 4668 } else { /* BB improve the check for buffer overruns BB */
@@ -4794,9 +4799,9 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4794 rc = -ENOMEM; 4799 rc = -ENOMEM;
4795 goto parse_DFS_referrals_exit; 4800 goto parse_DFS_referrals_exit;
4796 } 4801 }
4797 cifsConvertToUCS((__le16 *) tmp, searchName, 4802 cifsConvertToUTF16((__le16 *) tmp, searchName,
4798 PATH_MAX, nls_codepage, remap); 4803 PATH_MAX, nls_codepage, remap);
4799 node->path_consumed = cifs_ucs2_bytes(tmp, 4804 node->path_consumed = cifs_utf16_bytes(tmp,
4800 le16_to_cpu(pSMBr->PathConsumed), 4805 le16_to_cpu(pSMBr->PathConsumed),
4801 nls_codepage); 4806 nls_codepage);
4802 kfree(tmp); 4807 kfree(tmp);
@@ -4809,8 +4814,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4809 /* copy DfsPath */ 4814 /* copy DfsPath */
4810 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset); 4815 temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
4811 max_len = data_end - temp; 4816 max_len = data_end - temp;
4812 node->path_name = cifs_strndup_from_ucs(temp, max_len, 4817 node->path_name = cifs_strndup_from_utf16(temp, max_len,
4813 is_unicode, nls_codepage); 4818 is_unicode, nls_codepage);
4814 if (!node->path_name) { 4819 if (!node->path_name) {
4815 rc = -ENOMEM; 4820 rc = -ENOMEM;
4816 goto parse_DFS_referrals_exit; 4821 goto parse_DFS_referrals_exit;
@@ -4819,8 +4824,8 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
4819 /* copy link target UNC */ 4824 /* copy link target UNC */
4820 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset); 4825 temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
4821 max_len = data_end - temp; 4826 max_len = data_end - temp;
4822 node->node_name = cifs_strndup_from_ucs(temp, max_len, 4827 node->node_name = cifs_strndup_from_utf16(temp, max_len,
4823 is_unicode, nls_codepage); 4828 is_unicode, nls_codepage);
4824 if (!node->node_name) 4829 if (!node->node_name)
4825 rc = -ENOMEM; 4830 rc = -ENOMEM;
4826 } 4831 }
@@ -4873,8 +4878,9 @@ getDFSRetry:
4873 if (ses->capabilities & CAP_UNICODE) { 4878 if (ses->capabilities & CAP_UNICODE) {
4874 pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; 4879 pSMB->hdr.Flags2 |= SMBFLG2_UNICODE;
4875 name_len = 4880 name_len =
4876 cifsConvertToUCS((__le16 *) pSMB->RequestFileName, 4881 cifsConvertToUTF16((__le16 *) pSMB->RequestFileName,
4877 searchName, PATH_MAX, nls_codepage, remap); 4882 searchName, PATH_MAX, nls_codepage,
4883 remap);
4878 name_len++; /* trailing null */ 4884 name_len++; /* trailing null */
4879 name_len *= 2; 4885 name_len *= 2;
4880 } else { /* BB improve the check for buffer overruns BB */ 4886 } else { /* BB improve the check for buffer overruns BB */
@@ -5506,8 +5512,8 @@ SetEOFRetry:
5506 5512
5507 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5513 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5508 name_len = 5514 name_len =
5509 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 5515 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
5510 PATH_MAX, nls_codepage, remap); 5516 PATH_MAX, nls_codepage, remap);
5511 name_len++; /* trailing null */ 5517 name_len++; /* trailing null */
5512 name_len *= 2; 5518 name_len *= 2;
5513 } else { /* BB improve the check for buffer overruns BB */ 5519 } else { /* BB improve the check for buffer overruns BB */
@@ -5796,8 +5802,8 @@ SetTimesRetry:
5796 5802
5797 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5803 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5798 name_len = 5804 name_len =
5799 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 5805 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
5800 PATH_MAX, nls_codepage, remap); 5806 PATH_MAX, nls_codepage, remap);
5801 name_len++; /* trailing null */ 5807 name_len++; /* trailing null */
5802 name_len *= 2; 5808 name_len *= 2;
5803 } else { /* BB improve the check for buffer overruns BB */ 5809 } else { /* BB improve the check for buffer overruns BB */
@@ -5877,8 +5883,8 @@ SetAttrLgcyRetry:
5877 5883
5878 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 5884 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
5879 name_len = 5885 name_len =
5880 ConvertToUCS((__le16 *) pSMB->fileName, fileName, 5886 ConvertToUTF16((__le16 *) pSMB->fileName, fileName,
5881 PATH_MAX, nls_codepage); 5887 PATH_MAX, nls_codepage);
5882 name_len++; /* trailing null */ 5888 name_len++; /* trailing null */
5883 name_len *= 2; 5889 name_len *= 2;
5884 } else { /* BB improve the check for buffer overruns BB */ 5890 } else { /* BB improve the check for buffer overruns BB */
@@ -6030,8 +6036,8 @@ setPermsRetry:
6030 6036
6031 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 6037 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
6032 name_len = 6038 name_len =
6033 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 6039 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
6034 PATH_MAX, nls_codepage, remap); 6040 PATH_MAX, nls_codepage, remap);
6035 name_len++; /* trailing null */ 6041 name_len++; /* trailing null */
6036 name_len *= 2; 6042 name_len *= 2;
6037 } else { /* BB improve the check for buffer overruns BB */ 6043 } else { /* BB improve the check for buffer overruns BB */
@@ -6123,8 +6129,8 @@ QAllEAsRetry:
6123 6129
6124 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 6130 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
6125 list_len = 6131 list_len =
6126 cifsConvertToUCS((__le16 *) pSMB->FileName, searchName, 6132 cifsConvertToUTF16((__le16 *) pSMB->FileName, searchName,
6127 PATH_MAX, nls_codepage, remap); 6133 PATH_MAX, nls_codepage, remap);
6128 list_len++; /* trailing null */ 6134 list_len++; /* trailing null */
6129 list_len *= 2; 6135 list_len *= 2;
6130 } else { /* BB improve the check for buffer overruns BB */ 6136 } else { /* BB improve the check for buffer overruns BB */
@@ -6301,8 +6307,8 @@ SetEARetry:
6301 6307
6302 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 6308 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
6303 name_len = 6309 name_len =
6304 cifsConvertToUCS((__le16 *) pSMB->FileName, fileName, 6310 cifsConvertToUTF16((__le16 *) pSMB->FileName, fileName,
6305 PATH_MAX, nls_codepage, remap); 6311 PATH_MAX, nls_codepage, remap);
6306 name_len++; /* trailing null */ 6312 name_len++; /* trailing null */
6307 name_len *= 2; 6313 name_len *= 2;
6308 } else { /* BB improve the check for buffer overruns BB */ 6314 } else { /* BB improve the check for buffer overruns BB */
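Every call site above follows the same length bookkeeping: the conversion helper returns a count of 16-bit code units, the caller adds one for the trailing null, then doubles the count to get the byte length placed in the SMB. A minimal sketch of that pattern (encode_name() is a hypothetical stand-in for cifsConvertToUTF16(), not a function from this patch):

	/* assumes: dst has room for PATH_MAX __le16 units plus a null */
	static int smb_put_unicode_path(__le16 *dst, const char *src,
					const struct nls_table *cp, int remap)
	{
		int name_len;

		name_len = encode_name(dst, src, PATH_MAX, cp, remap);
		name_len++;		/* count the trailing null too */
		name_len *= 2;		/* 16-bit units -> bytes on the wire */
		return name_len;	/* byte length stored in the SMB */
	}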
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4666780f315d..986709a8d903 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -38,6 +38,7 @@
 #include <asm/processor.h>
 #include <linux/inet.h>
 #include <linux/module.h>
+#include <keys/user-type.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -225,74 +226,90 @@ static int check2ndT2(struct smb_hdr *pSMB)
 
 static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
 {
-	struct smb_t2_rsp *pSMB2 = (struct smb_t2_rsp *)psecond;
+	struct smb_t2_rsp *pSMBs = (struct smb_t2_rsp *)psecond;
 	struct smb_t2_rsp *pSMBt = (struct smb_t2_rsp *)pTargetSMB;
-	char *data_area_of_target;
-	char *data_area_of_buf2;
+	char *data_area_of_tgt;
+	char *data_area_of_src;
 	int remaining;
-	unsigned int byte_count, total_in_buf;
-	__u16 total_data_size, total_in_buf2;
+	unsigned int byte_count, total_in_tgt;
+	__u16 tgt_total_cnt, src_total_cnt, total_in_src;
 
-	total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
+	src_total_cnt = get_unaligned_le16(&pSMBs->t2_rsp.TotalDataCount);
+	tgt_total_cnt = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
 
-	if (total_data_size !=
-	    get_unaligned_le16(&pSMB2->t2_rsp.TotalDataCount))
-		cFYI(1, "total data size of primary and secondary t2 differ");
+	if (tgt_total_cnt != src_total_cnt)
+		cFYI(1, "total data count of primary and secondary t2 differ "
+			"source=%hu target=%hu", src_total_cnt, tgt_total_cnt);
 
-	total_in_buf = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
+	total_in_tgt = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
 
-	remaining = total_data_size - total_in_buf;
+	remaining = tgt_total_cnt - total_in_tgt;
 
-	if (remaining < 0)
+	if (remaining < 0) {
+		cFYI(1, "Server sent too much data. tgt_total_cnt=%hu "
+			"total_in_tgt=%hu", tgt_total_cnt, total_in_tgt);
 		return -EPROTO;
+	}
 
-	if (remaining == 0) /* nothing to do, ignore */
+	if (remaining == 0) {
+		/* nothing to do, ignore */
+		cFYI(1, "no more data remains");
 		return 0;
+	}
 
-	total_in_buf2 = get_unaligned_le16(&pSMB2->t2_rsp.DataCount);
-	if (remaining < total_in_buf2) {
+	total_in_src = get_unaligned_le16(&pSMBs->t2_rsp.DataCount);
+	if (remaining < total_in_src)
 		cFYI(1, "transact2 2nd response contains too much data");
-	}
 
 	/* find end of first SMB data area */
-	data_area_of_target = (char *)&pSMBt->hdr.Protocol +
+	data_area_of_tgt = (char *)&pSMBt->hdr.Protocol +
 		get_unaligned_le16(&pSMBt->t2_rsp.DataOffset);
-	/* validate target area */
 
-	data_area_of_buf2 = (char *)&pSMB2->hdr.Protocol +
-		get_unaligned_le16(&pSMB2->t2_rsp.DataOffset);
+	/* validate target area */
+	data_area_of_src = (char *)&pSMBs->hdr.Protocol +
+		get_unaligned_le16(&pSMBs->t2_rsp.DataOffset);
 
-	data_area_of_target += total_in_buf;
+	data_area_of_tgt += total_in_tgt;
 
-	/* copy second buffer into end of first buffer */
-	total_in_buf += total_in_buf2;
+	total_in_tgt += total_in_src;
 	/* is the result too big for the field? */
-	if (total_in_buf > USHRT_MAX)
+	if (total_in_tgt > USHRT_MAX) {
+		cFYI(1, "coalesced DataCount too large (%u)", total_in_tgt);
 		return -EPROTO;
-	put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount);
+	}
+	put_unaligned_le16(total_in_tgt, &pSMBt->t2_rsp.DataCount);
 
 	/* fix up the BCC */
 	byte_count = get_bcc(pTargetSMB);
-	byte_count += total_in_buf2;
+	byte_count += total_in_src;
 	/* is the result too big for the field? */
-	if (byte_count > USHRT_MAX)
+	if (byte_count > USHRT_MAX) {
+		cFYI(1, "coalesced BCC too large (%u)", byte_count);
 		return -EPROTO;
+	}
 	put_bcc(byte_count, pTargetSMB);
 
 	byte_count = be32_to_cpu(pTargetSMB->smb_buf_length);
-	byte_count += total_in_buf2;
+	byte_count += total_in_src;
 	/* don't allow buffer to overflow */
-	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)
+	if (byte_count > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
+		cFYI(1, "coalesced BCC exceeds buffer size (%u)", byte_count);
 		return -ENOBUFS;
+	}
 	pTargetSMB->smb_buf_length = cpu_to_be32(byte_count);
 
-	memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2);
+	/* copy second buffer into end of first buffer */
+	memcpy(data_area_of_tgt, data_area_of_src, total_in_src);
 
-	if (remaining == total_in_buf2) {
-		cFYI(1, "found the last secondary response");
-		return 0; /* we are done */
-	} else /* more responses to go */
+	if (remaining != total_in_src) {
+		/* more responses to go */
+		cFYI(1, "waiting for more secondary responses");
 		return 1;
+	}
+
+	/* we are done */
+	cFYI(1, "found the last secondary response");
+	return 0;
 }
 
 static void
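The rewritten coalesce_t2() applies the same guard before every field update: compute the would-be value in a wider type, range-check it, and only then store it back into the 16-bit wire field. A reduced sketch of that pattern (the helper and its names are illustrative, not from the patch):

	#include <limits.h>

	/* assumes: cur and add are counts read from __le16 wire fields */
	static int grow_u16_field(unsigned int cur, unsigned int add,
				  unsigned short *out)
	{
		unsigned int total = cur + add;	/* widened, cannot wrap */

		if (total > USHRT_MAX)	/* would not fit back into __le16 */
			return -EPROTO;
		*out = (unsigned short)total;
		return 0;
	}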
@@ -1578,11 +1595,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 		}
 	}
 
-	if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) {
-		cERROR(1, "Multiuser mounts currently require krb5 "
-			  "authentication!");
+#ifndef CONFIG_KEYS
+	/* Multiuser mounts require CONFIG_KEYS support */
+	if (vol->multiuser) {
+		cERROR(1, "Multiuser mounts require kernels with "
+			  "CONFIG_KEYS enabled.");
 		goto cifs_parse_mount_err;
 	}
+#endif
 
 	if (vol->UNCip == NULL)
 		vol->UNCip = &vol->UNC[2];
@@ -1981,10 +2001,16 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
 			return 0;
 		break;
 	default:
+		/* NULL username means anonymous session */
+		if (ses->user_name == NULL) {
+			if (!vol->nullauth)
+				return 0;
+			break;
+		}
+
 		/* anything else takes username/password */
-		if (ses->user_name == NULL)
-			return 0;
-		if (strncmp(ses->user_name, vol->username,
+		if (strncmp(ses->user_name,
+			    vol->username ? vol->username : "",
 			    MAX_USERNAME_SIZE))
 			return 0;
 		if (strlen(vol->username) != 0 &&
@@ -2039,6 +2065,132 @@ cifs_put_smb_ses(struct cifs_ses *ses)
 	cifs_put_tcp_session(server);
 }
 
+#ifdef CONFIG_KEYS
+
+/* strlen("cifs:a:") + INET6_ADDRSTRLEN + 1 */
+#define CIFSCREDS_DESC_SIZE (7 + INET6_ADDRSTRLEN + 1)
+
+/* Populate username and pw fields from keyring if possible */
+static int
+cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
+{
+	int rc = 0;
+	char *desc, *delim, *payload;
+	ssize_t len;
+	struct key *key;
+	struct TCP_Server_Info *server = ses->server;
+	struct sockaddr_in *sa;
+	struct sockaddr_in6 *sa6;
+	struct user_key_payload *upayload;
+
+	desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
+	if (!desc)
+		return -ENOMEM;
+
+	/* try to find an address key first */
+	switch (server->dstaddr.ss_family) {
+	case AF_INET:
+		sa = (struct sockaddr_in *)&server->dstaddr;
+		sprintf(desc, "cifs:a:%pI4", &sa->sin_addr.s_addr);
+		break;
+	case AF_INET6:
+		sa6 = (struct sockaddr_in6 *)&server->dstaddr;
+		sprintf(desc, "cifs:a:%pI6c", &sa6->sin6_addr.s6_addr);
+		break;
+	default:
+		cFYI(1, "Bad ss_family (%hu)", server->dstaddr.ss_family);
+		rc = -EINVAL;
+		goto out_err;
+	}
+
+	cFYI(1, "%s: desc=%s", __func__, desc);
+	key = request_key(&key_type_logon, desc, "");
+	if (IS_ERR(key)) {
+		if (!ses->domainName) {
+			cFYI(1, "domainName is NULL");
+			rc = PTR_ERR(key);
+			goto out_err;
+		}
+
+		/* didn't work, try to find a domain key */
+		sprintf(desc, "cifs:d:%s", ses->domainName);
+		cFYI(1, "%s: desc=%s", __func__, desc);
+		key = request_key(&key_type_logon, desc, "");
+		if (IS_ERR(key)) {
+			rc = PTR_ERR(key);
+			goto out_err;
+		}
+	}
+
+	down_read(&key->sem);
+	upayload = key->payload.data;
+	if (IS_ERR_OR_NULL(upayload)) {
+		rc = PTR_ERR(key);
+		goto out_key_put;
+	}
+
+	/* find first : in payload */
+	payload = (char *)upayload->data;
+	delim = strnchr(payload, upayload->datalen, ':');
+	cFYI(1, "payload=%s", payload);
+	if (!delim) {
+		cFYI(1, "Unable to find ':' in payload (datalen=%d)",
+				upayload->datalen);
+		rc = -EINVAL;
+		goto out_key_put;
+	}
+
+	len = delim - payload;
+	if (len > MAX_USERNAME_SIZE || len <= 0) {
+		cFYI(1, "Bad value from username search (len=%ld)", len);
+		rc = -EINVAL;
+		goto out_key_put;
+	}
+
+	vol->username = kstrndup(payload, len, GFP_KERNEL);
+	if (!vol->username) {
+		cFYI(1, "Unable to allocate %ld bytes for username", len);
+		rc = -ENOMEM;
+		goto out_key_put;
+	}
+	cFYI(1, "%s: username=%s", __func__, vol->username);
+
+	len = key->datalen - (len + 1);
+	if (len > MAX_PASSWORD_SIZE || len <= 0) {
+		cFYI(1, "Bad len for password search (len=%ld)", len);
+		rc = -EINVAL;
+		kfree(vol->username);
+		vol->username = NULL;
+		goto out_key_put;
+	}
+
+	++delim;
+	vol->password = kstrndup(delim, len, GFP_KERNEL);
+	if (!vol->password) {
+		cFYI(1, "Unable to allocate %ld bytes for password", len);
+		rc = -ENOMEM;
+		kfree(vol->username);
+		vol->username = NULL;
+		goto out_key_put;
+	}
+
+out_key_put:
+	up_read(&key->sem);
+	key_put(key);
+out_err:
+	kfree(desc);
+	cFYI(1, "%s: returning %d", __func__, rc);
+	return rc;
+}
+#else /* ! CONFIG_KEYS */
+static inline int
+cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
+		   struct cifs_ses *ses __attribute__((unused)))
+{
+	return -ENOSYS;
+}
+#endif /* CONFIG_KEYS */
+
 static bool warned_on_ntlm; /* globals init to false automatically */
 
 static struct cifs_ses *
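cifs_set_cifscreds() expects the logon key payload to be a single "username:password" string and splits it at the first ':'. A user-space sketch of the same split, for illustration only (plain libc, not kernel code):

	#include <stdlib.h>
	#include <string.h>

	/* assumes: payload holds "user:pass" and datalen is its length */
	static int split_creds(const char *payload, size_t datalen,
			       char **user, char **pass)
	{
		const char *delim = memchr(payload, ':', datalen);

		if (!delim || delim == payload)
			return -1;	/* no ':' found, or empty username */
		*user = strndup(payload, delim - payload);
		*pass = strndup(delim + 1, datalen - (delim - payload) - 1);
		if (!*user || !*pass) {
			free(*user);
			free(*pass);
			return -1;
		}
		return 0;
	}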
@@ -2914,18 +3066,33 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 #define CIFS_DEFAULT_IOSIZE (1024 * 1024)
 
 /*
- * Windows only supports a max of 60k reads. Default to that when posix
- * extensions aren't in force.
+ * Windows only supports a max of 60kb reads and 65535 byte writes. Default to
+ * those values when posix extensions aren't in force. In actuality here, we
+ * use 65536 to allow for a write that is a multiple of 4k. Most servers seem
+ * to be ok with the extra byte even though Windows doesn't send writes that
+ * are that large.
+ *
+ * Citation:
+ *
+ * http://blogs.msdn.com/b/openspecification/archive/2009/04/10/smb-maximum-transmit-buffer-size-and-performance-tuning.aspx
 */
 #define CIFS_DEFAULT_NON_POSIX_RSIZE (60 * 1024)
+#define CIFS_DEFAULT_NON_POSIX_WSIZE (65536)
 
 static unsigned int
 cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *pvolume_info)
 {
 	__u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
 	struct TCP_Server_Info *server = tcon->ses->server;
-	unsigned int wsize = pvolume_info->wsize ? pvolume_info->wsize :
-				CIFS_DEFAULT_IOSIZE;
+	unsigned int wsize;
+
+	/* start with specified wsize, or default */
+	if (pvolume_info->wsize)
+		wsize = pvolume_info->wsize;
+	else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
+		wsize = CIFS_DEFAULT_IOSIZE;
+	else
+		wsize = CIFS_DEFAULT_NON_POSIX_WSIZE;
 
 	/* can server support 24-bit write sizes? (via UNIX extensions) */
 	if (!tcon->unix_ext || !(unix_cap & CIFS_UNIX_LARGE_WRITE_CAP))
@@ -3136,10 +3303,9 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
 		return -EINVAL;
 
 	if (volume_info->nullauth) {
-		cFYI(1, "null user");
-		volume_info->username = kzalloc(1, GFP_KERNEL);
-		if (volume_info->username == NULL)
-			return -ENOMEM;
+		cFYI(1, "Anonymous login");
+		kfree(volume_info->username);
+		volume_info->username = NULL;
 	} else if (volume_info->username) {
 		/* BB fixme parse for domain name here */
 		cFYI(1, "Username: %s", volume_info->username);
@@ -3478,7 +3644,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
 	if (ses->capabilities & CAP_UNICODE) {
 		smb_buffer->Flags2 |= SMBFLG2_UNICODE;
 		length =
-		    cifs_strtoUCS((__le16 *) bcc_ptr, tree,
+		    cifs_strtoUTF16((__le16 *) bcc_ptr, tree,
 			6 /* max utf8 char length in bytes */ *
 			(/* server len*/ + 256 /* share len */), nls_codepage);
 		bcc_ptr += 2 * length;	/* convert num 16 bit words to bytes */
@@ -3533,7 +3699,7 @@ CIFSTCon(unsigned int xid, struct cifs_ses *ses,
 
 		/* mostly informational -- no need to fail on error here */
 		kfree(tcon->nativeFileSystem);
-		tcon->nativeFileSystem = cifs_strndup_from_ucs(bcc_ptr,
+		tcon->nativeFileSystem = cifs_strndup_from_utf16(bcc_ptr,
 						      bytes_left, is_unicode,
 						      nls_codepage);
 
@@ -3657,16 +3823,38 @@ int cifs_setup_session(unsigned int xid, struct cifs_ses *ses,
 	return rc;
 }
 
+static int
+cifs_set_vol_auth(struct smb_vol *vol, struct cifs_ses *ses)
+{
+	switch (ses->server->secType) {
+	case Kerberos:
+		vol->secFlg = CIFSSEC_MUST_KRB5;
+		return 0;
+	case NTLMv2:
+		vol->secFlg = CIFSSEC_MUST_NTLMV2;
+		break;
+	case NTLM:
+		vol->secFlg = CIFSSEC_MUST_NTLM;
+		break;
+	case RawNTLMSSP:
+		vol->secFlg = CIFSSEC_MUST_NTLMSSP;
+		break;
+	case LANMAN:
+		vol->secFlg = CIFSSEC_MUST_LANMAN;
+		break;
+	}
+
+	return cifs_set_cifscreds(vol, ses);
+}
+
 static struct cifs_tcon *
 cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 {
+	int rc;
 	struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
 	struct cifs_ses *ses;
 	struct cifs_tcon *tcon = NULL;
 	struct smb_vol *vol_info;
-	char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */
-			   /* We used to have this as MAX_USERNAME which is */
-			   /* way too big now (256 instead of 32) */
 
 	vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL);
 	if (vol_info == NULL) {
@@ -3674,8 +3862,6 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 		goto out;
 	}
 
-	snprintf(username, sizeof(username), "krb50x%x", fsuid);
-	vol_info->username = username;
 	vol_info->local_nls = cifs_sb->local_nls;
 	vol_info->linux_uid = fsuid;
 	vol_info->cred_uid = fsuid;
@@ -3685,8 +3871,11 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	vol_info->local_lease = master_tcon->local_lease;
 	vol_info->no_linux_ext = !master_tcon->unix_ext;
 
-	/* FIXME: allow for other secFlg settings */
-	vol_info->secFlg = CIFSSEC_MUST_KRB5;
+	rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
+	if (rc) {
+		tcon = ERR_PTR(rc);
+		goto out;
+	}
 
 	/* get a reference for the same TCP session */
 	spin_lock(&cifs_tcp_ses_lock);
@@ -3709,6 +3898,8 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid)
 	if (ses->capabilities & CAP_UNIX)
 		reset_cifs_unix_caps(0, tcon, NULL, vol_info);
 out:
+	kfree(vol_info->username);
+	kfree(vol_info->password);
 	kfree(vol_info);
 
 	return tcon;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index a090bbe6ee29..e2bbc683e018 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -647,10 +647,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
 
 		name.name = scratch_buf;
 		name.len =
-			cifs_from_ucs2((char *)name.name, (__le16 *)de.name,
+			cifs_from_utf16((char *)name.name, (__le16 *)de.name,
 				       UNICODE_NAME_MAX,
-				       min(de.namelen, (size_t)max_len), nlt,
-				       cifs_sb->mnt_cifs_flags &
+				       min_t(size_t, de.namelen,
					     (size_t)max_len), nlt,
+				       cifs_sb->mnt_cifs_flags &
 				       CIFS_MOUNT_MAP_SPECIAL_CHR);
 		name.len -= nls_nullsize(nlt);
 	} else {
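The switch from min() to min_t() matters because the kernel's min() macro rejects mismatched argument types at compile time, and de.namelen and max_len do not share one; min_t() casts both sides to the named type first. A standalone illustration of the idea (simplified macro, not the kernel's exact definition, which also adds the type-mismatch warning):

	#include <stddef.h>

	/* simplified: the real kernel macro also enforces type matching */
	#define min_t(type, x, y) \
		((type)(x) < (type)(y) ? (type)(x) : (type)(y))

	size_t clamp_name_len(size_t namelen, int max_len)
	{
		/* compare both values as size_t, as the patched call does */
		return min_t(size_t, namelen, (size_t)max_len);
	}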
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4ec3ee9d72cc..d85efad5765f 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -167,16 +167,16 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp)
 	int bytes_ret = 0;
 
 	/* Copy OS version */
-	bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32,
+	bytes_ret = cifs_strtoUTF16((__le16 *)bcc_ptr, "Linux version ", 32,
 				    nls_cp);
 	bcc_ptr += 2 * bytes_ret;
-	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release,
+	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, init_utsname()->release,
 				    32, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* trailing null */
 
-	bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
+	bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS,
 				    32, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* trailing null */
 
@@ -197,8 +197,8 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses,
 		*(bcc_ptr+1) = 0;
 		bytes_ret = 0;
 	} else
-		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName,
+		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->domainName,
 					    256, nls_cp);
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2;  /* account for null terminator */
 
@@ -226,8 +226,8 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
 		*bcc_ptr = 0;
 		*(bcc_ptr+1) = 0;
 	} else {
-		bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
+		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
 					    MAX_USERNAME_SIZE, nls_cp);
 	}
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* account for null termination */
@@ -287,7 +287,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 	cFYI(1, "bleft %d", bleft);
 
 	kfree(ses->serverOS);
-	ses->serverOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverOS=%s", ses->serverOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
@@ -296,7 +296,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 		return;
 
 	kfree(ses->serverNOS);
-	ses->serverNOS = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverNOS = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverNOS=%s", ses->serverNOS);
 	len = (UniStrnlen((wchar_t *) data, bleft / 2) * 2) + 2;
 	data += len;
@@ -305,7 +305,7 @@ decode_unicode_ssetup(char **pbcc_area, int bleft, struct cifs_ses *ses,
 		return;
 
 	kfree(ses->serverDomain);
-	ses->serverDomain = cifs_strndup_from_ucs(data, bleft, true, nls_cp);
+	ses->serverDomain = cifs_strndup_from_utf16(data, bleft, true, nls_cp);
 	cFYI(1, "serverDomain=%s", ses->serverDomain);
 
 	return;
@@ -502,8 +502,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		tmp += 2;
 	} else {
 		int len;
-		len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
+		len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
 				      MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -518,8 +518,8 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		tmp += 2;
 	} else {
 		int len;
-		len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
+		len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
 				      MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
 		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->UserName.Length = cpu_to_le16(len);
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 80d850881938..d5cd9aa7eacc 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -213,7 +213,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
 
 	/* Password cannot be longer than 128 characters */
 	if (passwd) /* Password must be converted to NT unicode */
-		len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
+		len = cifs_strtoUTF16(wpwd, passwd, 128, codepage);
 	else {
 		len = 0;
 		*wpwd = 0; /* Ensure string is null terminated */
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 6475877b0763..911cf30d057d 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
    - link the two up if this is needed
    - fill in the attributes
 */
-int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb)
+struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb)
 {
 	struct coda_vattr attr;
+	struct inode *inode;
 	int error;
 
 	/* We get inode numbers from Venus -- see venus source */
 	error = venus_getattr(sb, fid, &attr);
-	if ( error ) {
-		*inode = NULL;
-		return error;
-	}
+	if (error)
+		return ERR_PTR(error);
 
-	*inode = coda_iget(sb, fid, &attr);
-	if ( IS_ERR(*inode) ) {
+	inode = coda_iget(sb, fid, &attr);
+	if (IS_ERR(inode))
 		printk("coda_cnode_make: coda_iget failed\n");
-		return PTR_ERR(*inode);
-	}
-	return 0;
+	return inode;
 }
 
 
@@ -156,19 +153,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
 }
 
 /* the CONTROL inode is made without asking attributes from Venus */
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb)
+struct inode *coda_cnode_makectl(struct super_block *sb)
 {
-	int error = -ENOMEM;
-
-	*inode = new_inode(sb);
-	if (*inode) {
-		(*inode)->i_ino = CTL_INO;
-		(*inode)->i_op = &coda_ioctl_inode_operations;
-		(*inode)->i_fop = &coda_ioctl_operations;
-		(*inode)->i_mode = 0444;
-		error = 0;
+	struct inode *inode = new_inode(sb);
+	if (inode) {
+		inode->i_ino = CTL_INO;
+		inode->i_op = &coda_ioctl_inode_operations;
+		inode->i_fop = &coda_ioctl_operations;
+		inode->i_mode = 0444;
+		return inode;
 	}
-
-	return error;
+	return ERR_PTR(-ENOMEM);
 }
 
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index e35071b1de0e..b24fdfd8a3f0 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -49,9 +49,9 @@ struct coda_file_info {
 #define C_DYING	    0x4 /* from venus (which died) */
 #define C_PURGE	    0x8
 
-int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
+struct inode *coda_cnode_make(struct CodaFid *, struct super_block *);
 struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
+struct inode *coda_cnode_makectl(struct super_block *sb);
 struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
 void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 83d2fd8ec24b..177515829062 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -96,12 +96,11 @@ const struct file_operations coda_dir_operations = {
 /* access routines: lookup, readlink, permission */
 static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd)
 {
-	struct inode *inode = NULL;
-	struct CodaFid resfid = { { 0, } };
-	int type = 0;
-	int error = 0;
+	struct super_block *sb = dir->i_sb;
 	const char *name = entry->d_name.name;
 	size_t length = entry->d_name.len;
+	struct inode *inode;
+	int type = 0;
 
 	if (length > CODA_MAXNAMLEN) {
 		printk(KERN_ERR "name too long: lookup, %s (%*s)\n",
@@ -111,23 +110,21 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
 
 	/* control object, create inode on the fly */
 	if (coda_isroot(dir) && coda_iscontrol(name, length)) {
-		error = coda_cnode_makectl(&inode, dir->i_sb);
+		inode = coda_cnode_makectl(sb);
 		type = CODA_NOCACHE;
-		goto exit;
+	} else {
+		struct CodaFid fid = { { 0, } };
+		int error = venus_lookup(sb, coda_i2f(dir), name, length,
+				     &type, &fid);
+		inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error);
 	}
 
-	error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
-			     &type, &resfid);
-	if (!error)
-		error = coda_cnode_make(&inode, &resfid, dir->i_sb);
-
-	if (error && error != -ENOENT)
-		return ERR_PTR(error);
-
-exit:
-	if (inode && (type & CODA_NOCACHE))
+	if (!IS_ERR(inode) && (type & CODA_NOCACHE))
 		coda_flag_inode(inode, C_VATTR | C_PURGE);
 
+	if (inode == ERR_PTR(-ENOENT))
+		inode = NULL;
+
 	return d_splice_alias(inode, entry);
 }
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 1c08a8cd673a..5e2e1b3f068d 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -204,10 +204,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid));
 
 	/* make root inode */
-	error = coda_cnode_make(&root, &fid, sb);
-	if ( error || !root ) {
-		printk("Failure of coda_cnode_make for root: error %d\n", error);
-		goto error;
+	root = coda_cnode_make(&fid, sb);
+	if (IS_ERR(root)) {
+		error = PTR_ERR(root);
+		printk("Failure of coda_cnode_make for root: error %d\n", error);
+		root = NULL;
+		goto error;
 	}
 
 	printk("coda_read_super: rootinode is %ld dev %s\n",
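The coda changes above all follow one conversion: instead of returning an int and filling a struct inode ** out-parameter, the allocation functions now return the inode directly and encode failures in the pointer with ERR_PTR(). A caller then looks like this sketch (error handling trimmed relative to the real coda_fill_super()):

	struct inode *root;
	int error;

	root = coda_cnode_make(&fid, sb);	/* inode or ERR_PTR(-errno) */
	if (IS_ERR(root)) {
		error = PTR_ERR(root);		/* recover the errno */
		root = NULL;			/* never iput() an error pointer */
		goto error;
	}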
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a10e428b32b4..a26bea10e81b 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -105,6 +105,7 @@
 
 #include <linux/hiddev.h>
 
+#define __DVB_CORE__
 #include <linux/dvb/audio.h>
 #include <linux/dvb/dmx.h>
 #include <linux/dvb/frontend.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index 9791b1e7eee4..16a53cc2cc02 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -243,6 +243,7 @@ static void dentry_lru_add(struct dentry *dentry)
 static void __dentry_lru_del(struct dentry *dentry)
 {
 	list_del_init(&dentry->d_lru);
+	dentry->d_flags &= ~DCACHE_SHRINK_LIST;
 	dentry->d_sb->s_nr_dentry_unused--;
 	dentry_stat.nr_unused--;
 }
@@ -276,15 +277,15 @@ static void dentry_lru_prune(struct dentry *dentry)
 	}
 }
 
-static void dentry_lru_move_tail(struct dentry *dentry)
+static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
 {
 	spin_lock(&dcache_lru_lock);
 	if (list_empty(&dentry->d_lru)) {
-		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+		list_add_tail(&dentry->d_lru, list);
 		dentry->d_sb->s_nr_dentry_unused++;
 		dentry_stat.nr_unused++;
 	} else {
-		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
+		list_move_tail(&dentry->d_lru, list);
 	}
 	spin_unlock(&dcache_lru_lock);
 }
@@ -770,14 +771,18 @@ static void shrink_dentry_list(struct list_head *list)
 }
 
 /**
- * __shrink_dcache_sb - shrink the dentry LRU on a given superblock
- * @sb: superblock to shrink dentry LRU.
- * @count: number of entries to prune
- * @flags: flags to control the dentry processing
+ * prune_dcache_sb - shrink the dcache
+ * @sb: superblock
+ * @count: number of entries to try to free
+ *
+ * Attempt to shrink the superblock dcache LRU by @count entries. This is
+ * done when we need more memory and called from the superblock shrinker
+ * function.
 *
- * If flags contains DCACHE_REFERENCED reference dentries will not be pruned.
+ * This function may fail to free any resources if all the dentries are in
+ * use.
 */
-static void __shrink_dcache_sb(struct super_block *sb, int count, int flags)
+void prune_dcache_sb(struct super_block *sb, int count)
 {
 	struct dentry *dentry;
 	LIST_HEAD(referenced);
@@ -796,18 +801,13 @@ relock:
 			goto relock;
 		}
 
-		/*
-		 * If we are honouring the DCACHE_REFERENCED flag and the
-		 * dentry has this flag set, don't free it. Clear the flag
-		 * and put it back on the LRU.
-		 */
-		if (flags & DCACHE_REFERENCED &&
-		    dentry->d_flags & DCACHE_REFERENCED) {
+		if (dentry->d_flags & DCACHE_REFERENCED) {
 			dentry->d_flags &= ~DCACHE_REFERENCED;
 			list_move(&dentry->d_lru, &referenced);
 			spin_unlock(&dentry->d_lock);
 		} else {
 			list_move_tail(&dentry->d_lru, &tmp);
+			dentry->d_flags |= DCACHE_SHRINK_LIST;
 			spin_unlock(&dentry->d_lock);
 			if (!--count)
 				break;
@@ -822,23 +822,6 @@ relock:
 }
 
 /**
- * prune_dcache_sb - shrink the dcache
- * @sb: superblock
- * @nr_to_scan: number of entries to try to free
- *
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
- * function.
- *
- * This function may fail to free any resources if all the dentries are in
- * use.
- */
-void prune_dcache_sb(struct super_block *sb, int nr_to_scan)
-{
-	__shrink_dcache_sb(sb, nr_to_scan, DCACHE_REFERENCED);
-}
-
-/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
@@ -1092,7 +1075,7 @@ EXPORT_SYMBOL(have_submounts);
 * drop the lock and return early due to latency
 * constraints.
 */
-static int select_parent(struct dentry * parent)
+static int select_parent(struct dentry *parent, struct list_head *dispose)
 {
 	struct dentry *this_parent;
 	struct list_head *next;
@@ -1114,17 +1097,21 @@ resume:
 
 		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 
 		/*
-		 * move only zero ref count dentries to the end
-		 * of the unused list for prune_dcache
+		 * move only zero ref count dentries to the dispose list.
+		 *
+		 * Those which are presently on the shrink list, being processed
+		 * by shrink_dentry_list(), shouldn't be moved. Otherwise the
+		 * loop in shrink_dcache_parent() might not make any progress
+		 * and loop forever.
 		 */
-		if (!dentry->d_count) {
-			dentry_lru_move_tail(dentry);
-			found++;
-		} else {
+		if (dentry->d_count) {
 			dentry_lru_del(dentry);
+		} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+			dentry_lru_move_list(dentry, dispose);
+			dentry->d_flags |= DCACHE_SHRINK_LIST;
+			found++;
 		}
-
 		/*
 		 * We can return to the caller if we have found some (this
 		 * ensures forward progress). We'll be coming back to find
@@ -1181,14 +1168,13 @@ rename_retry:
 *
 * Prune the dcache to remove unused children of the parent dentry.
 */
-
 void shrink_dcache_parent(struct dentry * parent)
 {
-	struct super_block *sb = parent->d_sb;
+	LIST_HEAD(dispose);
 	int found;
 
-	while ((found = select_parent(parent)) != 0)
-		__shrink_dcache_sb(sb, found, 0);
+	while ((found = select_parent(parent, &dispose)) != 0)
+		shrink_dentry_list(&dispose);
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
@@ -1461,6 +1447,23 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 }
 EXPORT_SYMBOL(d_alloc_root);
 
+struct dentry *d_make_root(struct inode *root_inode)
+{
+	struct dentry *res = NULL;
+
+	if (root_inode) {
+		static const struct qstr name = { .name = "/", .len = 1 };
+
+		res = __d_alloc(root_inode->i_sb, &name);
+		if (res)
+			d_instantiate(res, root_inode);
+		else
+			iput(root_inode);
+	}
+	return res;
+}
+EXPORT_SYMBOL(d_make_root);
+
 static struct dentry * __d_find_any_alias(struct inode *inode)
 {
 	struct dentry *alias;
@@ -1472,7 +1475,14 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
 		return alias;
 }
 
-static struct dentry * d_find_any_alias(struct inode *inode)
+/**
+ * d_find_any_alias - find any alias for a given inode
+ * @inode: inode to find an alias for
+ *
+ * If any aliases exist for the given inode, take and return a
+ * reference for one of them. If no aliases exist, return %NULL.
+ */
+struct dentry *d_find_any_alias(struct inode *inode)
 {
 	struct dentry *de;
 
@@ -1481,7 +1491,7 @@ static struct dentry * d_find_any_alias(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 	return de;
 }
-
+EXPORT_SYMBOL(d_find_any_alias);
 
 /**
 * d_obtain_alias - find or allocate a dentry for a given inode
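Note what the new d_make_root() does for its callers: it consumes the inode reference even when dentry allocation fails (the iput() in the else branch), so a filesystem's fill_super no longer needs a separate error path for the root inode. A sketch of the intended calling pattern (foo_fill_super and foo_make_root_inode are hypothetical, not from this patch):

	static int foo_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct inode *root = foo_make_root_inode(sb);	/* hypothetical */

		sb->s_root = d_make_root(root);	/* NULL-safe; consumes root */
		if (!sb->s_root)
			return -ENOMEM;		/* no iput() needed here */
		return 0;
	}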
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index f65d4455c5e5..ef023eef0464 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -540,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_blob);
540 * debugfs_print_regs32 - use seq_print to describe a set of registers 540 * debugfs_print_regs32 - use seq_print to describe a set of registers
541 * @s: the seq_file structure being used to generate output 541 * @s: the seq_file structure being used to generate output
542 * @regs: an array of struct debugfs_reg32 structures 542 * @regs: an array of struct debugfs_reg32 structures
543 * @mregs: the length of the above array 543 * @nregs: the length of the above array
544 * @base: the base address to be used in reading the registers 544 * @base: the base address to be used in reading the registers
545 * @prefix: a string to be prefixed to every output line 545 * @prefix: a string to be prefixed to every output line
546 * 546 *
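
A hedged usage sketch for debugfs_print_regs32(), built only from the parameter list documented above; the device, register names, and offsets are hypothetical:

        #include <linux/debugfs.h>
        #include <linux/kernel.h>
        #include <linux/seq_file.h>

        static struct debugfs_reg32 mydev_regs[] = {
                { .name = "ctrl",   .offset = 0x00 },
                { .name = "status", .offset = 0x04 },
        };

        static int mydev_regs_show(struct seq_file *s, void *unused)
        {
                void __iomem *base = s->private;        /* stashed at file creation */

                debugfs_print_regs32(s, mydev_regs, ARRAY_SIZE(mydev_regs),
                                     base, "mydev: ");
                return 0;
        }
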
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 79673eb71151..c4e2a58a2e82 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -301,7 +301,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
301 301
302 inode = new_inode(s); 302 inode = new_inode(s);
303 if (!inode) 303 if (!inode)
304 goto free_fsi; 304 goto fail;
305 inode->i_ino = 1; 305 inode->i_ino = 1;
306 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 306 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
307 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; 307 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
@@ -316,8 +316,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
316 printk(KERN_ERR "devpts: get root dentry failed\n"); 316 printk(KERN_ERR "devpts: get root dentry failed\n");
317 iput(inode); 317 iput(inode);
318 318
319free_fsi:
320 kfree(s->s_fs_info);
321fail: 319fail:
322 return -ENOMEM; 320 return -ENOMEM;
323} 321}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d740ab67ff6e..4a588dbd11bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -36,6 +36,7 @@
36#include <linux/rwsem.h> 36#include <linux/rwsem.h>
37#include <linux/uio.h> 37#include <linux/uio.h>
38#include <linux/atomic.h> 38#include <linux/atomic.h>
39#include <linux/prefetch.h>
39 40
40/* 41/*
41 * How many user pages to map in one call to get_user_pages(). This determines 42 * How many user pages to map in one call to get_user_pages(). This determines
@@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
580{ 581{
581 int ret; 582 int ret;
582 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ 583 sector_t fs_startblk; /* Into file, in filesystem-sized blocks */
584 sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
583 unsigned long fs_count; /* Number of filesystem-sized blocks */ 585 unsigned long fs_count; /* Number of filesystem-sized blocks */
584 unsigned long dio_count;/* Number of dio_block-sized blocks */
585 unsigned long blkmask;
586 int create; 586 int create;
587 587
588 /* 588 /*
@@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
593 if (ret == 0) { 593 if (ret == 0) {
594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); 594 BUG_ON(sdio->block_in_file >= sdio->final_block_in_request);
595 fs_startblk = sdio->block_in_file >> sdio->blkfactor; 595 fs_startblk = sdio->block_in_file >> sdio->blkfactor;
596 dio_count = sdio->final_block_in_request - sdio->block_in_file; 596 fs_endblk = (sdio->final_block_in_request - 1) >>
597 fs_count = dio_count >> sdio->blkfactor; 597 sdio->blkfactor;
598 blkmask = (1 << sdio->blkfactor) - 1; 598 fs_count = fs_endblk - fs_startblk + 1;
599 if (dio_count & blkmask)
600 fs_count++;
601 599
602 map_bh->b_state = 0; 600 map_bh->b_state = 0;
603 map_bh->b_size = fs_count << dio->inode->i_blkbits; 601 map_bh->b_size = fs_count << dio->inode->i_blkbits;
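
The rewritten fs_count computation above derives the count from the first and last filesystem block indices instead of rounding up a length; when the request starts partway into an fs block, the old length-based round-up can come up one block short. A standalone arithmetic check with illustrative values:

        #include <stdio.h>

        int main(void)
        {
                unsigned blkfactor = 3;                 /* 8 dio blocks per fs block */
                unsigned long block_in_file = 5;        /* starts inside fs block 0 */
                unsigned long final_block_in_request = 17;      /* dio blocks 5..16 */

                /* new form: fs blocks actually spanned (0, 1 and 2) */
                unsigned long fs_startblk = block_in_file >> blkfactor;
                unsigned long fs_endblk = (final_block_in_request - 1) >> blkfactor;
                unsigned long fs_count = fs_endblk - fs_startblk + 1;

                /* old form: length-based round-up, only 2 here */
                unsigned long dio_count = final_block_in_request - block_in_file;
                unsigned long old_count = (dio_count >> blkfactor) +
                                          !!(dio_count & ((1 << blkfactor) - 1));

                printf("new %lu old %lu\n", fs_count, old_count); /* new 3 old 2 */
                return 0;
        }
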
@@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio)
1090 * individual fields and will generate much worse code. This is important 1088 * individual fields and will generate much worse code. This is important
1091 * for the whole file. 1089 * for the whole file.
1092 */ 1090 */
1093ssize_t 1091static inline ssize_t
1094__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1092do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1095 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1093 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1096 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1094 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1097 dio_submit_t submit_io, int flags) 1095 dio_submit_t submit_io, int flags)
@@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1100 size_t size; 1098 size_t size;
1101 unsigned long addr; 1099 unsigned long addr;
1102 unsigned blkbits = inode->i_blkbits; 1100 unsigned blkbits = inode->i_blkbits;
1103 unsigned bdev_blkbits = 0;
1104 unsigned blocksize_mask = (1 << blkbits) - 1; 1101 unsigned blocksize_mask = (1 << blkbits) - 1;
1105 ssize_t retval = -EINVAL; 1102 ssize_t retval = -EINVAL;
1106 loff_t end = offset; 1103 loff_t end = offset;
@@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1113 if (rw & WRITE) 1110 if (rw & WRITE)
1114 rw = WRITE_ODIRECT; 1111 rw = WRITE_ODIRECT;
1115 1112
1116 if (bdev) 1113 /*
1117 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1114 * Avoid references to bdev if not absolutely needed to give
1115 * the early prefetch in the caller enough time.
1116 */
1118 1117
1119 if (offset & blocksize_mask) { 1118 if (offset & blocksize_mask) {
1120 if (bdev) 1119 if (bdev)
1121 blkbits = bdev_blkbits; 1120 blkbits = blksize_bits(bdev_logical_block_size(bdev));
1122 blocksize_mask = (1 << blkbits) - 1; 1121 blocksize_mask = (1 << blkbits) - 1;
1123 if (offset & blocksize_mask) 1122 if (offset & blocksize_mask)
1124 goto out; 1123 goto out;
@@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1129 addr = (unsigned long)iov[seg].iov_base; 1128 addr = (unsigned long)iov[seg].iov_base;
1130 size = iov[seg].iov_len; 1129 size = iov[seg].iov_len;
1131 end += size; 1130 end += size;
1132 if ((addr & blocksize_mask) || (size & blocksize_mask)) { 1131 if (unlikely((addr & blocksize_mask) ||
1132 (size & blocksize_mask))) {
1133 if (bdev) 1133 if (bdev)
1134 blkbits = bdev_blkbits; 1134 blkbits = blksize_bits(
1135 bdev_logical_block_size(bdev));
1135 blocksize_mask = (1 << blkbits) - 1; 1136 blocksize_mask = (1 << blkbits) - 1;
1136 if ((addr & blocksize_mask) || (size & blocksize_mask)) 1137 if ((addr & blocksize_mask) || (size & blocksize_mask))
1137 goto out; 1138 goto out;
1138 } 1139 }
1139 } 1140 }
@@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1316out: 1317out:
1317 return retval; 1318 return retval;
1318} 1319}
1320
1321ssize_t
1322__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1323 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1324 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1325 dio_submit_t submit_io, int flags)
1326{
1327 /*
1328 * The block device state is needed in the end to finally
1329 * submit everything. Since it's likely to be cache cold
1330 * prefetch it here as first thing to hide some of the
1331 * latency.
1332 *
1333 * Attempt to prefetch the pieces we likely need later.
1334 */
1335 prefetch(&bdev->bd_disk->part_tbl);
1336 prefetch(bdev->bd_queue);
1337 prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
1338
1339 return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
1340 nr_segs, get_block, end_io,
1341 submit_io, flags);
1342}
1343
1319EXPORT_SYMBOL(__blockdev_direct_IO); 1344EXPORT_SYMBOL(__blockdev_direct_IO);
1320 1345
1321static __init int dio_init(void) 1346static __init int dio_init(void)
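
The split above keeps the large worker static inline (do_blockdev_direct_IO) and exports a thin wrapper whose only added job is to prefetch cache-cold block-device state before the long setup path needs it. A generic sketch of the pattern, with hypothetical names:

        #include <linux/cache.h>
        #include <linux/prefetch.h>

        struct device_state {
                char payload[4 * SMP_CACHE_BYTES];
        };

        static inline int do_heavy_io(struct device_state *st)
        {
                /* ... long body that the compiler inlines into the wrapper ... */
                return 0;
        }

        int heavy_io(struct device_state *st)
        {
                /* issue the loads early; they resolve while setup code runs */
                prefetch(st);
                prefetch((char *)st + SMP_CACHE_BYTES);
                return do_heavy_io(st);
        }
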
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 6cf72fcc0d0c..e7e327d43fa5 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -17,6 +17,7 @@
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/in.h> 18#include <linux/in.h>
19#include <linux/in6.h> 19#include <linux/in6.h>
20#include <linux/dlmconstants.h>
20#include <net/ipv6.h> 21#include <net/ipv6.h>
21#include <net/sock.h> 22#include <net/sock.h>
22 23
@@ -36,6 +37,7 @@
36static struct config_group *space_list; 37static struct config_group *space_list;
37static struct config_group *comm_list; 38static struct config_group *comm_list;
38static struct dlm_comm *local_comm; 39static struct dlm_comm *local_comm;
40static uint32_t dlm_comm_count;
39 41
40struct dlm_clusters; 42struct dlm_clusters;
41struct dlm_cluster; 43struct dlm_cluster;
@@ -103,6 +105,8 @@ struct dlm_cluster {
103 unsigned int cl_timewarn_cs; 105 unsigned int cl_timewarn_cs;
104 unsigned int cl_waitwarn_us; 106 unsigned int cl_waitwarn_us;
105 unsigned int cl_new_rsb_count; 107 unsigned int cl_new_rsb_count;
108 unsigned int cl_recover_callbacks;
109 char cl_cluster_name[DLM_LOCKSPACE_LEN];
106}; 110};
107 111
108enum { 112enum {
@@ -118,6 +122,8 @@ enum {
118 CLUSTER_ATTR_TIMEWARN_CS, 122 CLUSTER_ATTR_TIMEWARN_CS,
119 CLUSTER_ATTR_WAITWARN_US, 123 CLUSTER_ATTR_WAITWARN_US,
120 CLUSTER_ATTR_NEW_RSB_COUNT, 124 CLUSTER_ATTR_NEW_RSB_COUNT,
125 CLUSTER_ATTR_RECOVER_CALLBACKS,
126 CLUSTER_ATTR_CLUSTER_NAME,
121}; 127};
122 128
123struct cluster_attribute { 129struct cluster_attribute {
@@ -126,6 +132,27 @@ struct cluster_attribute {
126 ssize_t (*store)(struct dlm_cluster *, const char *, size_t); 132 ssize_t (*store)(struct dlm_cluster *, const char *, size_t);
127}; 133};
128 134
135static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf)
136{
137 return sprintf(buf, "%s\n", cl->cl_cluster_name);
138}
139
140static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl,
141 const char *buf, size_t len)
142{
143 strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN);
144 strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN);
145 return len;
146}
147
148static struct cluster_attribute cluster_attr_cluster_name = {
149 .attr = { .ca_owner = THIS_MODULE,
150 .ca_name = "cluster_name",
151 .ca_mode = S_IRUGO | S_IWUSR },
152 .show = cluster_cluster_name_read,
153 .store = cluster_cluster_name_write,
154};
155
129static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, 156static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
130 int *info_field, int check_zero, 157 int *info_field, int check_zero,
131 const char *buf, size_t len) 158 const char *buf, size_t len)
@@ -171,6 +198,7 @@ CLUSTER_ATTR(protocol, 0);
171CLUSTER_ATTR(timewarn_cs, 1); 198CLUSTER_ATTR(timewarn_cs, 1);
172CLUSTER_ATTR(waitwarn_us, 0); 199CLUSTER_ATTR(waitwarn_us, 0);
173CLUSTER_ATTR(new_rsb_count, 0); 200CLUSTER_ATTR(new_rsb_count, 0);
201CLUSTER_ATTR(recover_callbacks, 0);
174 202
175static struct configfs_attribute *cluster_attrs[] = { 203static struct configfs_attribute *cluster_attrs[] = {
176 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, 204 [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
@@ -185,6 +213,8 @@ static struct configfs_attribute *cluster_attrs[] = {
185 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, 213 [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr,
186 [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, 214 [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr,
187 [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr, 215 [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count.attr,
216 [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks.attr,
217 [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name.attr,
188 NULL, 218 NULL,
189}; 219};
190 220
@@ -293,6 +323,7 @@ struct dlm_comms {
293 323
294struct dlm_comm { 324struct dlm_comm {
295 struct config_item item; 325 struct config_item item;
326 int seq;
296 int nodeid; 327 int nodeid;
297 int local; 328 int local;
298 int addr_count; 329 int addr_count;
@@ -309,6 +340,7 @@ struct dlm_node {
309 int nodeid; 340 int nodeid;
310 int weight; 341 int weight;
311 int new; 342 int new;
343 int comm_seq; /* copy of cm->seq when nd->nodeid is set */
312}; 344};
313 345
314static struct configfs_group_operations clusters_ops = { 346static struct configfs_group_operations clusters_ops = {
@@ -455,6 +487,9 @@ static struct config_group *make_cluster(struct config_group *g,
455 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; 487 cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
456 cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; 488 cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us;
457 cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; 489 cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count;
490 cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks;
491 memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name,
492 DLM_LOCKSPACE_LEN);
458 493
459 space_list = &sps->ss_group; 494 space_list = &sps->ss_group;
460 comm_list = &cms->cs_group; 495 comm_list = &cms->cs_group;
@@ -558,6 +593,11 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
558 return ERR_PTR(-ENOMEM); 593 return ERR_PTR(-ENOMEM);
559 594
560 config_item_init_type_name(&cm->item, name, &comm_type); 595 config_item_init_type_name(&cm->item, name, &comm_type);
596
597 cm->seq = dlm_comm_count++;
598 if (!cm->seq)
599 cm->seq = dlm_comm_count++;
600
561 cm->nodeid = -1; 601 cm->nodeid = -1;
562 cm->local = 0; 602 cm->local = 0;
563 cm->addr_count = 0; 603 cm->addr_count = 0;
@@ -801,7 +841,10 @@ static ssize_t node_nodeid_read(struct dlm_node *nd, char *buf)
801static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf, 841static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
802 size_t len) 842 size_t len)
803{ 843{
844 uint32_t seq = 0;
804 nd->nodeid = simple_strtol(buf, NULL, 0); 845 nd->nodeid = simple_strtol(buf, NULL, 0);
846 dlm_comm_seq(nd->nodeid, &seq);
847 nd->comm_seq = seq;
805 return len; 848 return len;
806} 849}
807 850
@@ -908,13 +951,13 @@ static void put_comm(struct dlm_comm *cm)
908} 951}
909 952
910/* caller must free mem */ 953/* caller must free mem */
911int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, 954int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
912 int **new_out, int *new_count_out) 955 int *count_out)
913{ 956{
914 struct dlm_space *sp; 957 struct dlm_space *sp;
915 struct dlm_node *nd; 958 struct dlm_node *nd;
916 int i = 0, rv = 0, ids_count = 0, new_count = 0; 959 struct dlm_config_node *nodes, *node;
917 int *ids, *new; 960 int rv, count;
918 961
919 sp = get_space(lsname); 962 sp = get_space(lsname);
920 if (!sp) 963 if (!sp)
@@ -927,73 +970,42 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
927 goto out; 970 goto out;
928 } 971 }
929 972
930 ids_count = sp->members_count; 973 count = sp->members_count;
931 974
932 ids = kcalloc(ids_count, sizeof(int), GFP_NOFS); 975 nodes = kcalloc(count, sizeof(struct dlm_config_node), GFP_NOFS);
933 if (!ids) { 976 if (!nodes) {
934 rv = -ENOMEM; 977 rv = -ENOMEM;
935 goto out; 978 goto out;
936 } 979 }
937 980
981 node = nodes;
938 list_for_each_entry(nd, &sp->members, list) { 982 list_for_each_entry(nd, &sp->members, list) {
939 ids[i++] = nd->nodeid; 983 node->nodeid = nd->nodeid;
940 if (nd->new) 984 node->weight = nd->weight;
941 new_count++; 985 node->new = nd->new;
942 } 986 node->comm_seq = nd->comm_seq;
943 987 node++;
944 if (ids_count != i)
945 printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i);
946
947 if (!new_count)
948 goto out_ids;
949 988
950 new = kcalloc(new_count, sizeof(int), GFP_NOFS); 989 nd->new = 0;
951 if (!new) {
952 kfree(ids);
953 rv = -ENOMEM;
954 goto out;
955 } 990 }
956 991
957 i = 0; 992 *count_out = count;
958 list_for_each_entry(nd, &sp->members, list) { 993 *nodes_out = nodes;
959 if (nd->new) { 994 rv = 0;
960 new[i++] = nd->nodeid;
961 nd->new = 0;
962 }
963 }
964 *new_count_out = new_count;
965 *new_out = new;
966
967 out_ids:
968 *ids_count_out = ids_count;
969 *ids_out = ids;
970 out: 995 out:
971 mutex_unlock(&sp->members_lock); 996 mutex_unlock(&sp->members_lock);
972 put_space(sp); 997 put_space(sp);
973 return rv; 998 return rv;
974} 999}
975 1000
976int dlm_node_weight(char *lsname, int nodeid) 1001int dlm_comm_seq(int nodeid, uint32_t *seq)
977{ 1002{
978 struct dlm_space *sp; 1003 struct dlm_comm *cm = get_comm(nodeid, NULL);
979 struct dlm_node *nd; 1004 if (!cm)
980 int w = -EEXIST; 1005 return -EEXIST;
981 1006 *seq = cm->seq;
982 sp = get_space(lsname); 1007 put_comm(cm);
983 if (!sp) 1008 return 0;
984 goto out;
985
986 mutex_lock(&sp->members_lock);
987 list_for_each_entry(nd, &sp->members, list) {
988 if (nd->nodeid != nodeid)
989 continue;
990 w = nd->weight;
991 break;
992 }
993 mutex_unlock(&sp->members_lock);
994 put_space(sp);
995 out:
996 return w;
997} 1009}
998 1010
999int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) 1011int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr)
@@ -1047,6 +1059,8 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
1047#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ 1059#define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */
1048#define DEFAULT_WAITWARN_US 0 1060#define DEFAULT_WAITWARN_US 0
1049#define DEFAULT_NEW_RSB_COUNT 128 1061#define DEFAULT_NEW_RSB_COUNT 128
1062#define DEFAULT_RECOVER_CALLBACKS 0
1063#define DEFAULT_CLUSTER_NAME ""
1050 1064
1051struct dlm_config_info dlm_config = { 1065struct dlm_config_info dlm_config = {
1052 .ci_tcp_port = DEFAULT_TCP_PORT, 1066 .ci_tcp_port = DEFAULT_TCP_PORT,
@@ -1060,6 +1074,8 @@ struct dlm_config_info dlm_config = {
1060 .ci_protocol = DEFAULT_PROTOCOL, 1074 .ci_protocol = DEFAULT_PROTOCOL,
1061 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, 1075 .ci_timewarn_cs = DEFAULT_TIMEWARN_CS,
1062 .ci_waitwarn_us = DEFAULT_WAITWARN_US, 1076 .ci_waitwarn_us = DEFAULT_WAITWARN_US,
1063 .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT 1077 .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT,
1078 .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS,
1079 .ci_cluster_name = DEFAULT_CLUSTER_NAME
1064}; 1080};
1065 1081
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 3099d0dd26c0..9f5e3663bb0c 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,13 @@
14#ifndef __CONFIG_DOT_H__ 14#ifndef __CONFIG_DOT_H__
15#define __CONFIG_DOT_H__ 15#define __CONFIG_DOT_H__
16 16
17struct dlm_config_node {
18 int nodeid;
19 int weight;
20 int new;
21 uint32_t comm_seq;
22};
23
17#define DLM_MAX_ADDR_COUNT 3 24#define DLM_MAX_ADDR_COUNT 3
18 25
19struct dlm_config_info { 26struct dlm_config_info {
@@ -29,15 +36,17 @@ struct dlm_config_info {
29 int ci_timewarn_cs; 36 int ci_timewarn_cs;
30 int ci_waitwarn_us; 37 int ci_waitwarn_us;
31 int ci_new_rsb_count; 38 int ci_new_rsb_count;
39 int ci_recover_callbacks;
40 char ci_cluster_name[DLM_LOCKSPACE_LEN];
32}; 41};
33 42
34extern struct dlm_config_info dlm_config; 43extern struct dlm_config_info dlm_config;
35 44
36int dlm_config_init(void); 45int dlm_config_init(void);
37void dlm_config_exit(void); 46void dlm_config_exit(void);
38int dlm_node_weight(char *lsname, int nodeid); 47int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out,
39int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out, 48 int *count_out);
40 int **new_out, int *new_count_out); 49int dlm_comm_seq(int nodeid, uint32_t *seq);
41int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); 50int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
42int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); 51int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
43int dlm_our_nodeid(void); 52int dlm_our_nodeid(void);
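
A hedged caller sketch for the dlm_config_nodes() API declared above, which replaces dlm_nodeid_list() and dlm_node_weight(); per the "caller must free mem" comment in config.c, the returned array belongs to the caller. Assumes the usual fs/dlm context (dlm_internal.h, config.h):

        static int log_current_members(struct dlm_ls *ls)
        {
                struct dlm_config_node *nodes;
                int count, i, error;

                error = dlm_config_nodes(ls->ls_name, &nodes, &count);
                if (error)
                        return error;

                for (i = 0; i < count; i++)
                        log_debug(ls, "member %d weight %d new %d seq %u",
                                  nodes[i].nodeid, nodes[i].weight,
                                  nodes[i].new, nodes[i].comm_seq);

                kfree(nodes);
                return 0;
        }
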
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 59779237e2b4..3dca2b39e83f 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -393,6 +393,7 @@ static const struct seq_operations format3_seq_ops;
393 393
394static void *table_seq_start(struct seq_file *seq, loff_t *pos) 394static void *table_seq_start(struct seq_file *seq, loff_t *pos)
395{ 395{
396 struct rb_node *node;
396 struct dlm_ls *ls = seq->private; 397 struct dlm_ls *ls = seq->private;
397 struct rsbtbl_iter *ri; 398 struct rsbtbl_iter *ri;
398 struct dlm_rsb *r; 399 struct dlm_rsb *r;
@@ -418,9 +419,10 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
418 ri->format = 3; 419 ri->format = 3;
419 420
420 spin_lock(&ls->ls_rsbtbl[bucket].lock); 421 spin_lock(&ls->ls_rsbtbl[bucket].lock);
421 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { 422 if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
422 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, 423 for (node = rb_first(&ls->ls_rsbtbl[bucket].keep); node;
423 res_hashchain) { 424 node = rb_next(node)) {
425 r = rb_entry(node, struct dlm_rsb, res_hashnode);
424 if (!entry--) { 426 if (!entry--) {
425 dlm_hold_rsb(r); 427 dlm_hold_rsb(r);
426 ri->rsb = r; 428 ri->rsb = r;
@@ -449,9 +451,9 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
449 } 451 }
450 452
451 spin_lock(&ls->ls_rsbtbl[bucket].lock); 453 spin_lock(&ls->ls_rsbtbl[bucket].lock);
452 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { 454 if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
453 r = list_first_entry(&ls->ls_rsbtbl[bucket].list, 455 node = rb_first(&ls->ls_rsbtbl[bucket].keep);
454 struct dlm_rsb, res_hashchain); 456 r = rb_entry(node, struct dlm_rsb, res_hashnode);
455 dlm_hold_rsb(r); 457 dlm_hold_rsb(r);
456 ri->rsb = r; 458 ri->rsb = r;
457 ri->bucket = bucket; 459 ri->bucket = bucket;
@@ -467,7 +469,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
467{ 469{
468 struct dlm_ls *ls = seq->private; 470 struct dlm_ls *ls = seq->private;
469 struct rsbtbl_iter *ri = iter_ptr; 471 struct rsbtbl_iter *ri = iter_ptr;
470 struct list_head *next; 472 struct rb_node *next;
471 struct dlm_rsb *r, *rp; 473 struct dlm_rsb *r, *rp;
472 loff_t n = *pos; 474 loff_t n = *pos;
473 unsigned bucket; 475 unsigned bucket;
@@ -480,10 +482,10 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
480 482
481 spin_lock(&ls->ls_rsbtbl[bucket].lock); 483 spin_lock(&ls->ls_rsbtbl[bucket].lock);
482 rp = ri->rsb; 484 rp = ri->rsb;
483 next = rp->res_hashchain.next; 485 next = rb_next(&rp->res_hashnode);
484 486
485 if (next != &ls->ls_rsbtbl[bucket].list) { 487 if (next) {
486 r = list_entry(next, struct dlm_rsb, res_hashchain); 488 r = rb_entry(next, struct dlm_rsb, res_hashnode);
487 dlm_hold_rsb(r); 489 dlm_hold_rsb(r);
488 ri->rsb = r; 490 ri->rsb = r;
489 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 491 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
@@ -511,9 +513,9 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos)
511 } 513 }
512 514
513 spin_lock(&ls->ls_rsbtbl[bucket].lock); 515 spin_lock(&ls->ls_rsbtbl[bucket].lock);
514 if (!list_empty(&ls->ls_rsbtbl[bucket].list)) { 516 if (!RB_EMPTY_ROOT(&ls->ls_rsbtbl[bucket].keep)) {
515 r = list_first_entry(&ls->ls_rsbtbl[bucket].list, 517 next = rb_first(&ls->ls_rsbtbl[bucket].keep);
516 struct dlm_rsb, res_hashchain); 518 r = rb_entry(next, struct dlm_rsb, res_hashnode);
517 dlm_hold_rsb(r); 519 dlm_hold_rsb(r);
518 ri->rsb = r; 520 ri->rsb = r;
519 ri->bucket = bucket; 521 ri->bucket = bucket;
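
The seq_file iterators above swap list traversal for an rbtree walk: rb_first()/rb_next() visit resources in sorted order, rb_entry() recovers the container, and a NULL from rb_next() replaces the old list-head sentinel test. A minimal illustration with a hypothetical item type:

        #include <linux/rbtree.h>

        struct item {
                struct rb_node node;
                int key;
        };

        static int count_items(struct rb_root *root)
        {
                struct rb_node *n;
                int count = 0;

                for (n = rb_first(root); n; n = rb_next(n)) {
                        struct item *it = rb_entry(n, struct item, node);

                        (void)it;       /* visited in ascending key order */
                        count++;
                }
                return count;
        }
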
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 7b84c1dbc82e..83641574b016 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -290,7 +290,6 @@ int dlm_recover_directory(struct dlm_ls *ls)
290 290
291 out_status: 291 out_status:
292 error = 0; 292 error = 0;
293 dlm_set_recover_status(ls, DLM_RS_DIR);
294 log_debug(ls, "dlm_recover_directory %d entries", count); 293 log_debug(ls, "dlm_recover_directory %d entries", count);
295 out_free: 294 out_free:
296 kfree(last_name); 295 kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index fe2860c02449..3a564d197e99 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2010 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -103,8 +103,8 @@ struct dlm_dirtable {
103}; 103};
104 104
105struct dlm_rsbtable { 105struct dlm_rsbtable {
106 struct list_head list; 106 struct rb_root keep;
107 struct list_head toss; 107 struct rb_root toss;
108 spinlock_t lock; 108 spinlock_t lock;
109}; 109};
110 110
@@ -117,6 +117,10 @@ struct dlm_member {
117 struct list_head list; 117 struct list_head list;
118 int nodeid; 118 int nodeid;
119 int weight; 119 int weight;
120 int slot;
121 int slot_prev;
122 int comm_seq;
123 uint32_t generation;
120}; 124};
121 125
122/* 126/*
@@ -125,10 +129,8 @@ struct dlm_member {
125 129
126struct dlm_recover { 130struct dlm_recover {
127 struct list_head list; 131 struct list_head list;
128 int *nodeids; /* nodeids of all members */ 132 struct dlm_config_node *nodes;
129 int node_count; 133 int nodes_count;
130 int *new; /* nodeids of new members */
131 int new_count;
132 uint64_t seq; 134 uint64_t seq;
133}; 135};
134 136
@@ -285,7 +287,10 @@ struct dlm_rsb {
285 unsigned long res_toss_time; 287 unsigned long res_toss_time;
286 uint32_t res_first_lkid; 288 uint32_t res_first_lkid;
287 struct list_head res_lookup; /* lkbs waiting on first */ 289 struct list_head res_lookup; /* lkbs waiting on first */
288 struct list_head res_hashchain; /* rsbtbl */ 290 union {
291 struct list_head res_hashchain;
292 struct rb_node res_hashnode; /* rsbtbl */
293 };
289 struct list_head res_grantqueue; 294 struct list_head res_grantqueue;
290 struct list_head res_convertqueue; 295 struct list_head res_convertqueue;
291 struct list_head res_waitqueue; 296 struct list_head res_waitqueue;
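
The union above lets an rsb carry either linkage in the same storage: res_hashchain while a pre-allocated rsb sits on ls_new_rsb, res_hashnode once it is inserted into the keep/toss trees; get_rsb_struct() makes the switch by zeroing the node, as the fs/dlm/lock.c hunks further down show. A reduced sketch of the idiom:

        #include <linux/list.h>
        #include <linux/rbtree.h>
        #include <linux/string.h>

        struct obj {
                union {
                        struct list_head free_list;     /* while on the free pool */
                        struct rb_node tree_node;       /* once linked into a tree */
                };
        };

        /* caller holds the pool lock and guarantees the pool is non-empty */
        static struct obj *pool_take(struct list_head *pool)
        {
                struct obj *o = list_first_entry(pool, struct obj, free_list);

                list_del(&o->free_list);
                /* same conversion as get_rsb_struct(): reuse the bytes as an
                   empty rb_node before rb_link_node()/rb_insert_color() */
                memset(&o->tree_node, 0, sizeof(o->tree_node));
                return o;
        }
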
@@ -334,7 +339,9 @@ static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
334/* dlm_header is first element of all structs sent between nodes */ 339/* dlm_header is first element of all structs sent between nodes */
335 340
336#define DLM_HEADER_MAJOR 0x00030000 341#define DLM_HEADER_MAJOR 0x00030000
337#define DLM_HEADER_MINOR 0x00000000 342#define DLM_HEADER_MINOR 0x00000001
343
344#define DLM_HEADER_SLOTS 0x00000001
338 345
339#define DLM_MSG 1 346#define DLM_MSG 1
340#define DLM_RCOM 2 347#define DLM_RCOM 2
@@ -422,10 +429,34 @@ union dlm_packet {
422 struct dlm_rcom rcom; 429 struct dlm_rcom rcom;
423}; 430};
424 431
432#define DLM_RSF_NEED_SLOTS 0x00000001
433
434/* RCOM_STATUS data */
435struct rcom_status {
436 __le32 rs_flags;
437 __le32 rs_unused1;
438 __le64 rs_unused2;
439};
440
441/* RCOM_STATUS_REPLY data */
425struct rcom_config { 442struct rcom_config {
426 __le32 rf_lvblen; 443 __le32 rf_lvblen;
427 __le32 rf_lsflags; 444 __le32 rf_lsflags;
428 __le64 rf_unused; 445
446 /* DLM_HEADER_SLOTS adds: */
447 __le32 rf_flags;
448 __le16 rf_our_slot;
449 __le16 rf_num_slots;
450 __le32 rf_generation;
451 __le32 rf_unused1;
452 __le64 rf_unused2;
453};
454
455struct rcom_slot {
456 __le32 ro_nodeid;
457 __le16 ro_slot;
458 __le16 ro_unused1;
459 __le64 ro_unused2;
429}; 460};
430 461
431struct rcom_lock { 462struct rcom_lock {
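
A sketch of filling the extended RCOM_STATUS_REPLY payload defined above; all fields are little-endian on the wire, hence the cpu_to_le*() conversions. This is an assumption-laden reduction: buffer setup and the trailing rcom_slot array are omitted, and the rf_flags handling is this sketch's choice, not confirmed by the diff:

        static void fill_rcom_config(struct dlm_ls *ls, struct rcom_config *rf)
        {
                rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
                rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
                rf->rf_flags = cpu_to_le32(0);  /* no reply flags set in this sketch */
                rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
                rf->rf_num_slots = cpu_to_le16(ls->ls_num_slots);
                rf->rf_generation = cpu_to_le32(ls->ls_generation);
        }
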
@@ -452,6 +483,7 @@ struct dlm_ls {
452 struct list_head ls_list; /* list of lockspaces */ 483 struct list_head ls_list; /* list of lockspaces */
453 dlm_lockspace_t *ls_local_handle; 484 dlm_lockspace_t *ls_local_handle;
454 uint32_t ls_global_id; /* global unique lockspace ID */ 485 uint32_t ls_global_id; /* global unique lockspace ID */
486 uint32_t ls_generation;
455 uint32_t ls_exflags; 487 uint32_t ls_exflags;
456 int ls_lvblen; 488 int ls_lvblen;
457 int ls_count; /* refcount of processes in 489 int ls_count; /* refcount of processes in
@@ -490,6 +522,11 @@ struct dlm_ls {
490 int ls_total_weight; 522 int ls_total_weight;
491 int *ls_node_array; 523 int *ls_node_array;
492 524
525 int ls_slot;
526 int ls_num_slots;
527 int ls_slots_size;
528 struct dlm_slot *ls_slots;
529
493 struct dlm_rsb ls_stub_rsb; /* for returning errors */ 530 struct dlm_rsb ls_stub_rsb; /* for returning errors */
494 struct dlm_lkb ls_stub_lkb; /* for returning errors */ 531 struct dlm_lkb ls_stub_lkb; /* for returning errors */
495 struct dlm_message ls_stub_ms; /* for faking a reply */ 532 struct dlm_message ls_stub_ms; /* for faking a reply */
@@ -537,6 +574,9 @@ struct dlm_ls {
537 struct list_head ls_root_list; /* root resources */ 574 struct list_head ls_root_list; /* root resources */
538 struct rw_semaphore ls_root_sem; /* protect root_list */ 575 struct rw_semaphore ls_root_sem; /* protect root_list */
539 576
577 const struct dlm_lockspace_ops *ls_ops;
578 void *ls_ops_arg;
579
540 int ls_namelen; 580 int ls_namelen;
541 char ls_name[1]; 581 char ls_name[1];
542}; 582};
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 83b5e32514e1..d47183043c59 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -56,6 +56,7 @@
56 L: receive_xxxx_reply() <- R: send_xxxx_reply() 56 L: receive_xxxx_reply() <- R: send_xxxx_reply()
57*/ 57*/
58#include <linux/types.h> 58#include <linux/types.h>
59#include <linux/rbtree.h>
59#include <linux/slab.h> 60#include <linux/slab.h>
60#include "dlm_internal.h" 61#include "dlm_internal.h"
61#include <linux/dlm_device.h> 62#include <linux/dlm_device.h>
@@ -380,6 +381,8 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
380 381
381 r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain); 382 r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
382 list_del(&r->res_hashchain); 383 list_del(&r->res_hashchain);
384 /* Convert the empty list_head to a NULL rb_node for tree usage: */
385 memset(&r->res_hashnode, 0, sizeof(struct rb_node));
383 ls->ls_new_rsb_count--; 386 ls->ls_new_rsb_count--;
384 spin_unlock(&ls->ls_new_rsb_spin); 387 spin_unlock(&ls->ls_new_rsb_spin);
385 388
@@ -388,7 +391,6 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
388 memcpy(r->res_name, name, len); 391 memcpy(r->res_name, name, len);
389 mutex_init(&r->res_mutex); 392 mutex_init(&r->res_mutex);
390 393
391 INIT_LIST_HEAD(&r->res_hashchain);
392 INIT_LIST_HEAD(&r->res_lookup); 394 INIT_LIST_HEAD(&r->res_lookup);
393 INIT_LIST_HEAD(&r->res_grantqueue); 395 INIT_LIST_HEAD(&r->res_grantqueue);
394 INIT_LIST_HEAD(&r->res_convertqueue); 396 INIT_LIST_HEAD(&r->res_convertqueue);
@@ -400,14 +402,31 @@ static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
400 return 0; 402 return 0;
401} 403}
402 404
403static int search_rsb_list(struct list_head *head, char *name, int len, 405static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
406{
407 char maxname[DLM_RESNAME_MAXLEN];
408
409 memset(maxname, 0, DLM_RESNAME_MAXLEN);
410 memcpy(maxname, name, nlen);
411 return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
412}
413
414static int search_rsb_tree(struct rb_root *tree, char *name, int len,
404 unsigned int flags, struct dlm_rsb **r_ret) 415 unsigned int flags, struct dlm_rsb **r_ret)
405{ 416{
417 struct rb_node *node = tree->rb_node;
406 struct dlm_rsb *r; 418 struct dlm_rsb *r;
407 int error = 0; 419 int error = 0;
408 420 int rc;
409 list_for_each_entry(r, head, res_hashchain) { 421
410 if (len == r->res_length && !memcmp(name, r->res_name, len)) 422 while (node) {
423 r = rb_entry(node, struct dlm_rsb, res_hashnode);
424 rc = rsb_cmp(r, name, len);
425 if (rc < 0)
426 node = node->rb_left;
427 else if (rc > 0)
428 node = node->rb_right;
429 else
411 goto found; 430 goto found;
412 } 431 }
413 *r_ret = NULL; 432 *r_ret = NULL;
@@ -420,22 +439,54 @@ static int search_rsb_list(struct list_head *head, char *name, int len,
420 return error; 439 return error;
421} 440}
422 441
442static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
443{
444 struct rb_node **newn = &tree->rb_node;
445 struct rb_node *parent = NULL;
446 int rc;
447
448 while (*newn) {
449 struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb,
450 res_hashnode);
451
452 parent = *newn;
453 rc = rsb_cmp(cur, rsb->res_name, rsb->res_length);
454 if (rc < 0)
455 newn = &parent->rb_left;
456 else if (rc > 0)
457 newn = &parent->rb_right;
458 else {
459 log_print("rsb_insert match");
460 dlm_dump_rsb(rsb);
461 dlm_dump_rsb(cur);
462 return -EEXIST;
463 }
464 }
465
466 rb_link_node(&rsb->res_hashnode, parent, newn);
467 rb_insert_color(&rsb->res_hashnode, tree);
468 return 0;
469}
470
423static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b, 471static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
424 unsigned int flags, struct dlm_rsb **r_ret) 472 unsigned int flags, struct dlm_rsb **r_ret)
425{ 473{
426 struct dlm_rsb *r; 474 struct dlm_rsb *r;
427 int error; 475 int error;
428 476
429 error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r); 477 error = search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, flags, &r);
430 if (!error) { 478 if (!error) {
431 kref_get(&r->res_ref); 479 kref_get(&r->res_ref);
432 goto out; 480 goto out;
433 } 481 }
434 error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r); 482 error = search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
435 if (error) 483 if (error)
436 goto out; 484 goto out;
437 485
438 list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list); 486 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
487 error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
488 if (error)
489 return error;
439 490
440 if (dlm_no_directory(ls)) 491 if (dlm_no_directory(ls))
441 goto out; 492 goto out;
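
rsb_cmp() above compares fixed-width keys: the lookup name is zero-padded to DLM_RESNAME_MAXLEN, so memcmp() over the full width yields a total order in which a shorter name sorts like its zero-extended form. search_rsb_tree() and rsb_insert() apply the same convention, which is all the tree needs to stay consistent. A standalone check of the padding behaviour:

        #include <stdio.h>
        #include <string.h>

        #define MAXLEN 64       /* stand-in for DLM_RESNAME_MAXLEN */

        static int cmp_padded(const char *stored, const char *name, int nlen)
        {
                char maxname[MAXLEN];

                memset(maxname, 0, MAXLEN);
                memcpy(maxname, name, nlen);
                return memcmp(stored, maxname, MAXLEN);
        }

        int main(void)
        {
                char stored[MAXLEN] = "ab";     /* zero-padded, like res_name */

                printf("%d\n", cmp_padded(stored, "ab", 2) == 0);  /* 1: exact match */
                printf("%d\n", cmp_padded(stored, "abc", 3) < 0);  /* 1: "ab" sorts first */
                return 0;
        }
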
@@ -527,8 +578,7 @@ static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
527 nodeid = 0; 578 nodeid = 0;
528 r->res_nodeid = nodeid; 579 r->res_nodeid = nodeid;
529 } 580 }
530 list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list); 581 error = rsb_insert(r, &ls->ls_rsbtbl[bucket].keep);
531 error = 0;
532 out_unlock: 582 out_unlock:
533 spin_unlock(&ls->ls_rsbtbl[bucket].lock); 583 spin_unlock(&ls->ls_rsbtbl[bucket].lock);
534 out: 584 out:
@@ -556,7 +606,8 @@ static void toss_rsb(struct kref *kref)
556 606
557 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r);); 607 DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
558 kref_init(&r->res_ref); 608 kref_init(&r->res_ref);
559 list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss); 609 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
610 rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
560 r->res_toss_time = jiffies; 611 r->res_toss_time = jiffies;
561 if (r->res_lvbptr) { 612 if (r->res_lvbptr) {
562 dlm_free_lvb(r->res_lvbptr); 613 dlm_free_lvb(r->res_lvbptr);
@@ -1082,19 +1133,19 @@ static void dir_remove(struct dlm_rsb *r)
1082 r->res_name, r->res_length); 1133 r->res_name, r->res_length);
1083} 1134}
1084 1135
1085/* FIXME: shouldn't this be able to exit as soon as one non-due rsb is 1136/* FIXME: make this more efficient */
1086 found since they are in order of newest to oldest? */
1087 1137
1088static int shrink_bucket(struct dlm_ls *ls, int b) 1138static int shrink_bucket(struct dlm_ls *ls, int b)
1089{ 1139{
1140 struct rb_node *n;
1090 struct dlm_rsb *r; 1141 struct dlm_rsb *r;
1091 int count = 0, found; 1142 int count = 0, found;
1092 1143
1093 for (;;) { 1144 for (;;) {
1094 found = 0; 1145 found = 0;
1095 spin_lock(&ls->ls_rsbtbl[b].lock); 1146 spin_lock(&ls->ls_rsbtbl[b].lock);
1096 list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss, 1147 for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = rb_next(n)) {
1097 res_hashchain) { 1148 r = rb_entry(n, struct dlm_rsb, res_hashnode);
1098 if (!time_after_eq(jiffies, r->res_toss_time + 1149 if (!time_after_eq(jiffies, r->res_toss_time +
1099 dlm_config.ci_toss_secs * HZ)) 1150 dlm_config.ci_toss_secs * HZ))
1100 continue; 1151 continue;
@@ -1108,7 +1159,7 @@ static int shrink_bucket(struct dlm_ls *ls, int b)
1108 } 1159 }
1109 1160
1110 if (kref_put(&r->res_ref, kill_rsb)) { 1161 if (kref_put(&r->res_ref, kill_rsb)) {
1111 list_del(&r->res_hashchain); 1162 rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
1112 spin_unlock(&ls->ls_rsbtbl[b].lock); 1163 spin_unlock(&ls->ls_rsbtbl[b].lock);
1113 1164
1114 if (is_master(r)) 1165 if (is_master(r))
@@ -4441,10 +4492,12 @@ int dlm_purge_locks(struct dlm_ls *ls)
4441 4492
4442static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket) 4493static struct dlm_rsb *find_purged_rsb(struct dlm_ls *ls, int bucket)
4443{ 4494{
4495 struct rb_node *n;
4444 struct dlm_rsb *r, *r_ret = NULL; 4496 struct dlm_rsb *r, *r_ret = NULL;
4445 4497
4446 spin_lock(&ls->ls_rsbtbl[bucket].lock); 4498 spin_lock(&ls->ls_rsbtbl[bucket].lock);
4447 list_for_each_entry(r, &ls->ls_rsbtbl[bucket].list, res_hashchain) { 4499 for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
4500 r = rb_entry(n, struct dlm_rsb, res_hashnode);
4448 if (!rsb_flag(r, RSB_LOCKS_PURGED)) 4501 if (!rsb_flag(r, RSB_LOCKS_PURGED))
4449 continue; 4502 continue;
4450 hold_rsb(r); 4503 hold_rsb(r);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index a1d8f1af144b..a1ea25face82 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -386,12 +386,15 @@ static void threads_stop(void)
386 dlm_lowcomms_stop(); 386 dlm_lowcomms_stop();
387} 387}
388 388
389static int new_lockspace(const char *name, int namelen, void **lockspace, 389static int new_lockspace(const char *name, const char *cluster,
390 uint32_t flags, int lvblen) 390 uint32_t flags, int lvblen,
391 const struct dlm_lockspace_ops *ops, void *ops_arg,
392 int *ops_result, dlm_lockspace_t **lockspace)
391{ 393{
392 struct dlm_ls *ls; 394 struct dlm_ls *ls;
393 int i, size, error; 395 int i, size, error;
394 int do_unreg = 0; 396 int do_unreg = 0;
397 int namelen = strlen(name);
395 398
396 if (namelen > DLM_LOCKSPACE_LEN) 399 if (namelen > DLM_LOCKSPACE_LEN)
397 return -EINVAL; 400 return -EINVAL;
@@ -403,8 +406,24 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
403 return -EINVAL; 406 return -EINVAL;
404 407
405 if (!dlm_user_daemon_available()) { 408 if (!dlm_user_daemon_available()) {
406 module_put(THIS_MODULE); 409 log_print("dlm user daemon not available");
407 return -EUNATCH; 410 error = -EUNATCH;
411 goto out;
412 }
413
414 if (ops && ops_result) {
415 if (!dlm_config.ci_recover_callbacks)
416 *ops_result = -EOPNOTSUPP;
417 else
418 *ops_result = 0;
419 }
420
421 if (dlm_config.ci_recover_callbacks && cluster &&
422 strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) {
423 log_print("dlm cluster name %s mismatch %s",
424 dlm_config.ci_cluster_name, cluster);
425 error = -EBADR;
426 goto out;
408 } 427 }
409 428
410 error = 0; 429 error = 0;
@@ -442,6 +461,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
442 ls->ls_flags = 0; 461 ls->ls_flags = 0;
443 ls->ls_scan_time = jiffies; 462 ls->ls_scan_time = jiffies;
444 463
464 if (ops && dlm_config.ci_recover_callbacks) {
465 ls->ls_ops = ops;
466 ls->ls_ops_arg = ops_arg;
467 }
468
445 if (flags & DLM_LSFL_TIMEWARN) 469 if (flags & DLM_LSFL_TIMEWARN)
446 set_bit(LSFL_TIMEWARN, &ls->ls_flags); 470 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
447 471
@@ -457,8 +481,8 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
457 if (!ls->ls_rsbtbl) 481 if (!ls->ls_rsbtbl)
458 goto out_lsfree; 482 goto out_lsfree;
459 for (i = 0; i < size; i++) { 483 for (i = 0; i < size; i++) {
460 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); 484 ls->ls_rsbtbl[i].keep.rb_node = NULL;
461 INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); 485 ls->ls_rsbtbl[i].toss.rb_node = NULL;
462 spin_lock_init(&ls->ls_rsbtbl[i].lock); 486 spin_lock_init(&ls->ls_rsbtbl[i].lock);
463 } 487 }
464 488
@@ -525,6 +549,11 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
525 if (!ls->ls_recover_buf) 549 if (!ls->ls_recover_buf)
526 goto out_dirfree; 550 goto out_dirfree;
527 551
552 ls->ls_slot = 0;
553 ls->ls_num_slots = 0;
554 ls->ls_slots_size = 0;
555 ls->ls_slots = NULL;
556
528 INIT_LIST_HEAD(&ls->ls_recover_list); 557 INIT_LIST_HEAD(&ls->ls_recover_list);
529 spin_lock_init(&ls->ls_recover_list_lock); 558 spin_lock_init(&ls->ls_recover_list_lock);
530 ls->ls_recover_list_count = 0; 559 ls->ls_recover_list_count = 0;
@@ -614,8 +643,10 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
614 return error; 643 return error;
615} 644}
616 645
617int dlm_new_lockspace(const char *name, int namelen, void **lockspace, 646int dlm_new_lockspace(const char *name, const char *cluster,
618 uint32_t flags, int lvblen) 647 uint32_t flags, int lvblen,
648 const struct dlm_lockspace_ops *ops, void *ops_arg,
649 int *ops_result, dlm_lockspace_t **lockspace)
619{ 650{
620 int error = 0; 651 int error = 0;
621 652
@@ -625,7 +656,8 @@ int dlm_new_lockspace(const char *name, int namelen, void **lockspace,
625 if (error) 656 if (error)
626 goto out; 657 goto out;
627 658
628 error = new_lockspace(name, namelen, lockspace, flags, lvblen); 659 error = new_lockspace(name, cluster, flags, lvblen, ops, ops_arg,
660 ops_result, lockspace);
629 if (!error) 661 if (!error)
630 ls_count++; 662 ls_count++;
631 if (error > 0) 663 if (error > 0)
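
A hedged caller sketch for the extended dlm_new_lockspace() signature above; the myfs names and fields are hypothetical. ops_result reports whether the recovery callbacks will actually be delivered (-EOPNOTSUPP when recover_callbacks is off), and the cluster argument must match ci_cluster_name or the join fails with -EBADR:

        #include <linux/dlm.h>

        struct myfs_sb {                        /* hypothetical per-sb state */
                char cluster_name[32];
                dlm_lockspace_t *lockspace;
        };

        static void myfs_recover_prep(void *arg)
        {
                /* quiesce lock activity before recovery begins */
        }

        static void myfs_recover_slot(void *arg, struct dlm_slot *slot)
        {
                /* slot->nodeid / slot->slot identify the failed member */
        }

        static void myfs_recover_done(void *arg, struct dlm_slot *slots,
                                      int num_slots, int our_slot,
                                      uint32_t generation)
        {
                /* membership is final: slots[], our_slot, new generation */
        }

        static const struct dlm_lockspace_ops myfs_dlm_ops = {
                .recover_prep = myfs_recover_prep,
                .recover_slot = myfs_recover_slot,
                .recover_done = myfs_recover_done,
        };

        static int myfs_join_lockspace(struct myfs_sb *sbi)
        {
                int ops_result;

                return dlm_new_lockspace("myfs_locks", sbi->cluster_name,
                                         DLM_LSFL_FS, 32, &myfs_dlm_ops, sbi,
                                         &ops_result, &sbi->lockspace);
        }
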
@@ -685,7 +717,7 @@ static int lockspace_busy(struct dlm_ls *ls, int force)
685static int release_lockspace(struct dlm_ls *ls, int force) 717static int release_lockspace(struct dlm_ls *ls, int force)
686{ 718{
687 struct dlm_rsb *rsb; 719 struct dlm_rsb *rsb;
688 struct list_head *head; 720 struct rb_node *n;
689 int i, busy, rv; 721 int i, busy, rv;
690 722
691 busy = lockspace_busy(ls, force); 723 busy = lockspace_busy(ls, force);
@@ -746,20 +778,15 @@ static int release_lockspace(struct dlm_ls *ls, int force)
746 */ 778 */
747 779
748 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 780 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
749 head = &ls->ls_rsbtbl[i].list; 781 while ((n = rb_first(&ls->ls_rsbtbl[i].keep))) {
750 while (!list_empty(head)) { 782 rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
751 rsb = list_entry(head->next, struct dlm_rsb, 783 rb_erase(n, &ls->ls_rsbtbl[i].keep);
752 res_hashchain);
753
754 list_del(&rsb->res_hashchain);
755 dlm_free_rsb(rsb); 784 dlm_free_rsb(rsb);
756 } 785 }
757 786
758 head = &ls->ls_rsbtbl[i].toss; 787 while ((n = rb_first(&ls->ls_rsbtbl[i].toss))) {
759 while (!list_empty(head)) { 788 rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
760 rsb = list_entry(head->next, struct dlm_rsb, 789 rb_erase(n, &ls->ls_rsbtbl[i].toss);
761 res_hashchain);
762 list_del(&rsb->res_hashchain);
763 dlm_free_rsb(rsb); 790 dlm_free_rsb(rsb);
764 } 791 }
765 } 792 }
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b12532e553f8..862640a36d5c 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2009 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -19,6 +19,280 @@
19#include "config.h" 19#include "config.h"
20#include "lowcomms.h" 20#include "lowcomms.h"
21 21
22int dlm_slots_version(struct dlm_header *h)
23{
24 if ((h->h_version & 0x0000FFFF) < DLM_HEADER_SLOTS)
25 return 0;
26 return 1;
27}
28
29void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
30 struct dlm_member *memb)
31{
32 struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
33
34 if (!dlm_slots_version(&rc->rc_header))
35 return;
36
37 memb->slot = le16_to_cpu(rf->rf_our_slot);
38 memb->generation = le32_to_cpu(rf->rf_generation);
39}
40
41void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
42{
43 struct dlm_slot *slot;
44 struct rcom_slot *ro;
45 int i;
46
47 ro = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
48
49 /* ls_slots array is sparse, but not rcom_slots */
50
51 for (i = 0; i < ls->ls_slots_size; i++) {
52 slot = &ls->ls_slots[i];
53 if (!slot->nodeid)
54 continue;
55 ro->ro_nodeid = cpu_to_le32(slot->nodeid);
56 ro->ro_slot = cpu_to_le16(slot->slot);
57 ro++;
58 }
59}
60
61#define SLOT_DEBUG_LINE 128
62
63static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
64 struct rcom_slot *ro0, struct dlm_slot *array,
65 int array_size)
66{
67 char line[SLOT_DEBUG_LINE];
68 int len = SLOT_DEBUG_LINE - 1;
69 int pos = 0;
70 int ret, i;
71
72 if (!dlm_config.ci_log_debug)
73 return;
74
75 memset(line, 0, sizeof(line));
76
77 if (array) {
78 for (i = 0; i < array_size; i++) {
79 if (!array[i].nodeid)
80 continue;
81
82 ret = snprintf(line + pos, len - pos, " %d:%d",
83 array[i].slot, array[i].nodeid);
84 if (ret >= len - pos)
85 break;
86 pos += ret;
87 }
88 } else if (ro0) {
89 for (i = 0; i < num_slots; i++) {
90 ret = snprintf(line + pos, len - pos, " %d:%d",
91 ro0[i].ro_slot, ro0[i].ro_nodeid);
92 if (ret >= len - pos)
93 break;
94 pos += ret;
95 }
96 }
97
98 log_debug(ls, "generation %u slots %d%s", gen, num_slots, line);
99}
100
101int dlm_slots_copy_in(struct dlm_ls *ls)
102{
103 struct dlm_member *memb;
104 struct dlm_rcom *rc = ls->ls_recover_buf;
105 struct rcom_config *rf = (struct rcom_config *)rc->rc_buf;
106 struct rcom_slot *ro0, *ro;
107 int our_nodeid = dlm_our_nodeid();
108 int i, num_slots;
109 uint32_t gen;
110
111 if (!dlm_slots_version(&rc->rc_header))
112 return -1;
113
114 gen = le32_to_cpu(rf->rf_generation);
115 if (gen <= ls->ls_generation) {
116 log_error(ls, "dlm_slots_copy_in gen %u old %u",
117 gen, ls->ls_generation);
118 }
119 ls->ls_generation = gen;
120
121 num_slots = le16_to_cpu(rf->rf_num_slots);
122 if (!num_slots)
123 return -1;
124
125 ro0 = (struct rcom_slot *)(rc->rc_buf + sizeof(struct rcom_config));
126
127 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
128 ro->ro_nodeid = le32_to_cpu(ro->ro_nodeid);
129 ro->ro_slot = le16_to_cpu(ro->ro_slot);
130 }
131
132 log_debug_slots(ls, gen, num_slots, ro0, NULL, 0);
133
134 list_for_each_entry(memb, &ls->ls_nodes, list) {
135 for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
136 if (ro->ro_nodeid != memb->nodeid)
137 continue;
138 memb->slot = ro->ro_slot;
139 memb->slot_prev = memb->slot;
140 break;
141 }
142
143 if (memb->nodeid == our_nodeid) {
144 if (ls->ls_slot && ls->ls_slot != memb->slot) {
145 log_error(ls, "dlm_slots_copy_in our slot "
146 "changed %d %d", ls->ls_slot,
147 memb->slot);
148 return -1;
149 }
150
151 if (!ls->ls_slot)
152 ls->ls_slot = memb->slot;
153 }
154
155 if (!memb->slot) {
156 log_error(ls, "dlm_slots_copy_in nodeid %d no slot",
157 memb->nodeid);
158 return -1;
159 }
160 }
161
162 return 0;
163}
164
165/* for any nodes that do not support slots, we will not have set memb->slot
166 in wait_status_all(), so memb->slot will remain -1, and we will not
167 assign slots or set ls_num_slots here */
168
169int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
170 struct dlm_slot **slots_out, uint32_t *gen_out)
171{
172 struct dlm_member *memb;
173 struct dlm_slot *array;
174 int our_nodeid = dlm_our_nodeid();
175 int array_size, max_slots, i;
176 int need = 0;
177 int max = 0;
178 int num = 0;
179 uint32_t gen = 0;
180
181 /* our own memb struct will have slot -1 gen 0 */
182
183 list_for_each_entry(memb, &ls->ls_nodes, list) {
184 if (memb->nodeid == our_nodeid) {
185 memb->slot = ls->ls_slot;
186 memb->generation = ls->ls_generation;
187 break;
188 }
189 }
190
191 list_for_each_entry(memb, &ls->ls_nodes, list) {
192 if (memb->generation > gen)
193 gen = memb->generation;
194
195 /* node doesn't support slots */
196
197 if (memb->slot == -1)
198 return -1;
199
200 /* node needs a slot assigned */
201
202 if (!memb->slot)
203 need++;
204
205 /* node has a slot assigned */
206
207 num++;
208
209 if (!max || max < memb->slot)
210 max = memb->slot;
211
212 /* sanity check, once slot is assigned it shouldn't change */
213
214 if (memb->slot_prev && memb->slot && memb->slot_prev != memb->slot) {
215 log_error(ls, "nodeid %d slot changed %d %d",
216 memb->nodeid, memb->slot_prev, memb->slot);
217 return -1;
218 }
219 memb->slot_prev = memb->slot;
220 }
221
222 array_size = max + need;
223
224 array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS);
225 if (!array)
226 return -ENOMEM;
227
228 num = 0;
229
230 /* fill in slots (offsets) that are used */
231
232 list_for_each_entry(memb, &ls->ls_nodes, list) {
233 if (!memb->slot)
234 continue;
235
236 if (memb->slot > array_size) {
237 log_error(ls, "invalid slot number %d", memb->slot);
238 kfree(array);
239 return -1;
240 }
241
242 array[memb->slot - 1].nodeid = memb->nodeid;
243 array[memb->slot - 1].slot = memb->slot;
244 num++;
245 }
246
247 /* assign new slots from unused offsets */
248
249 list_for_each_entry(memb, &ls->ls_nodes, list) {
250 if (memb->slot)
251 continue;
252
253 for (i = 0; i < array_size; i++) {
254 if (array[i].nodeid)
255 continue;
256
257 memb->slot = i + 1;
258 memb->slot_prev = memb->slot;
259 array[i].nodeid = memb->nodeid;
260 array[i].slot = memb->slot;
261 num++;
262
263 if (!ls->ls_slot && memb->nodeid == our_nodeid)
264 ls->ls_slot = memb->slot;
265 break;
266 }
267
268 if (!memb->slot) {
269 log_error(ls, "no free slot found");
270 kfree(array);
271 return -1;
272 }
273 }
274
275 gen++;
276
277 log_debug_slots(ls, gen, num, NULL, array, array_size);
278
279 max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
280 sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
281
282 if (num > max_slots) {
283 log_error(ls, "num_slots %d exceeds max_slots %d",
284 num, max_slots);
285 kfree(array);
286 return -1;
287 }
288
289 *gen_out = gen;
290 *slots_out = array;
291 *slots_size = array_size;
292 *num_slots = num;
293 return 0;
294}
295
22static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 296static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
23{ 297{
24 struct dlm_member *memb = NULL; 298 struct dlm_member *memb = NULL;
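
dlm_slots_assign() above maintains a 1-based slot space over a zero-based array, with array[slot - 1] holding {nodeid, slot} and zeroed entries marking free offsets that new or re-joining members can claim. A reduced sketch of the claim step:

        static int claim_first_free_slot(struct dlm_slot *array, int array_size,
                                         int nodeid)
        {
                int i;

                for (i = 0; i < array_size; i++) {
                        if (array[i].nodeid)
                                continue;       /* offset already owned */
                        array[i].nodeid = nodeid;
                        array[i].slot = i + 1;  /* slots are 1-based */
                        return i + 1;
                }
                return 0;                       /* no free slot */
        }
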
@@ -43,59 +317,51 @@ static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new)
43 } 317 }
44} 318}
45 319
46static int dlm_add_member(struct dlm_ls *ls, int nodeid) 320static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node)
47{ 321{
48 struct dlm_member *memb; 322 struct dlm_member *memb;
49 int w, error; 323 int error;
50 324
51 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); 325 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
52 if (!memb) 326 if (!memb)
53 return -ENOMEM; 327 return -ENOMEM;
54 328
55 w = dlm_node_weight(ls->ls_name, nodeid); 329 error = dlm_lowcomms_connect_node(node->nodeid);
56 if (w < 0) {
57 kfree(memb);
58 return w;
59 }
60
61 error = dlm_lowcomms_connect_node(nodeid);
62 if (error < 0) { 330 if (error < 0) {
63 kfree(memb); 331 kfree(memb);
64 return error; 332 return error;
65 } 333 }
66 334
67 memb->nodeid = nodeid; 335 memb->nodeid = node->nodeid;
68 memb->weight = w; 336 memb->weight = node->weight;
337 memb->comm_seq = node->comm_seq;
69 add_ordered_member(ls, memb); 338 add_ordered_member(ls, memb);
70 ls->ls_num_nodes++; 339 ls->ls_num_nodes++;
71 return 0; 340 return 0;
72} 341}
73 342
74static void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) 343static struct dlm_member *find_memb(struct list_head *head, int nodeid)
75{
76 list_move(&memb->list, &ls->ls_nodes_gone);
77 ls->ls_num_nodes--;
78}
79
80int dlm_is_member(struct dlm_ls *ls, int nodeid)
81{ 344{
82 struct dlm_member *memb; 345 struct dlm_member *memb;
83 346
84 list_for_each_entry(memb, &ls->ls_nodes, list) { 347 list_for_each_entry(memb, head, list) {
85 if (memb->nodeid == nodeid) 348 if (memb->nodeid == nodeid)
86 return 1; 349 return memb;
87 } 350 }
351 return NULL;
352}
353
354int dlm_is_member(struct dlm_ls *ls, int nodeid)
355{
356 if (find_memb(&ls->ls_nodes, nodeid))
357 return 1;
88 return 0; 358 return 0;
89} 359}
90 360
91int dlm_is_removed(struct dlm_ls *ls, int nodeid) 361int dlm_is_removed(struct dlm_ls *ls, int nodeid)
92{ 362{
93 struct dlm_member *memb; 363 if (find_memb(&ls->ls_nodes_gone, nodeid))
94 364 return 1;
95 list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
96 if (memb->nodeid == nodeid)
97 return 1;
98 }
99 return 0; 365 return 0;
100} 366}
101 367
@@ -176,7 +442,7 @@ static int ping_members(struct dlm_ls *ls)
176 error = dlm_recovery_stopped(ls); 442 error = dlm_recovery_stopped(ls);
177 if (error) 443 if (error)
178 break; 444 break;
179 error = dlm_rcom_status(ls, memb->nodeid); 445 error = dlm_rcom_status(ls, memb->nodeid, 0);
180 if (error) 446 if (error)
181 break; 447 break;
182 } 448 }
@@ -186,10 +452,88 @@ static int ping_members(struct dlm_ls *ls)
186 return error; 452 return error;
187} 453}
188 454
455static void dlm_lsop_recover_prep(struct dlm_ls *ls)
456{
457 if (!ls->ls_ops || !ls->ls_ops->recover_prep)
458 return;
459 ls->ls_ops->recover_prep(ls->ls_ops_arg);
460}
461
462static void dlm_lsop_recover_slot(struct dlm_ls *ls, struct dlm_member *memb)
463{
464 struct dlm_slot slot;
465 uint32_t seq;
466 int error;
467
468 if (!ls->ls_ops || !ls->ls_ops->recover_slot)
469 return;
470
471 /* if there is no comms connection with this node
472 or the present comms connection is newer
473 than the one when this member was added, then
474 we consider the node to have failed (versus
475 being removed due to dlm_release_lockspace) */
476
477 error = dlm_comm_seq(memb->nodeid, &seq);
478
479 if (!error && seq == memb->comm_seq)
480 return;
481
482 slot.nodeid = memb->nodeid;
483 slot.slot = memb->slot;
484
485 ls->ls_ops->recover_slot(ls->ls_ops_arg, &slot);
486}
487
488void dlm_lsop_recover_done(struct dlm_ls *ls)
489{
490 struct dlm_member *memb;
491 struct dlm_slot *slots;
492 int i, num;
493
494 if (!ls->ls_ops || !ls->ls_ops->recover_done)
495 return;
496
497 num = ls->ls_num_nodes;
498
499 slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL);
500 if (!slots)
501 return;
502
503 i = 0;
504 list_for_each_entry(memb, &ls->ls_nodes, list) {
505 if (i == num) {
506 log_error(ls, "dlm_lsop_recover_done bad num %d", num);
507 goto out;
508 }
509 slots[i].nodeid = memb->nodeid;
510 slots[i].slot = memb->slot;
511 i++;
512 }
513
514 ls->ls_ops->recover_done(ls->ls_ops_arg, slots, num,
515 ls->ls_slot, ls->ls_generation);
516 out:
517 kfree(slots);
518}
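
Taken together, the three dlm_lsop_* wrappers define the callback surface a lockspace user can register. The signatures below are read directly off the call sites above (recover_prep(arg), recover_slot(arg, slot), recover_done(arg, slots, num_slots, our_slot, generation)); the skeletal user is a sketch, and everything outside the callback arguments is a placeholder name:

	struct dlm_slot {
		int nodeid;
		int slot;
	};

	struct dlm_lockspace_ops {
		void (*recover_prep)(void *ops_arg);
		void (*recover_slot)(void *ops_arg, struct dlm_slot *slot);
		void (*recover_done)(void *ops_arg, struct dlm_slot *slots,
				     int num_slots, int our_slot,
				     uint32_t generation);
	};

	static void my_recover_prep(void *arg)
	{
		/* recovery has started; membership is in flux */
	}

	static void my_recover_slot(void *arg, struct dlm_slot *slot)
	{
		/* slot->nodeid failed (vs. leaving cleanly); e.g. arrange
		   to replay its journal before recover_done arrives */
	}

	static void my_recover_done(void *arg, struct dlm_slot *slots,
				    int num_slots, int our_slot,
				    uint32_t generation)
	{
		/* membership is stable again; slots[] maps the survivors */
	}

	static const struct dlm_lockspace_ops my_ops = {
		.recover_prep = my_recover_prep,
		.recover_slot = my_recover_slot,
		.recover_done = my_recover_done,
	};
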
519
520static struct dlm_config_node *find_config_node(struct dlm_recover *rv,
521 int nodeid)
522{
523 int i;
524
525 for (i = 0; i < rv->nodes_count; i++) {
526 if (rv->nodes[i].nodeid == nodeid)
527 return &rv->nodes[i];
528 }
529 return NULL;
530}
531
189int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) 532int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
190{ 533{
191 struct dlm_member *memb, *safe; 534 struct dlm_member *memb, *safe;
192 int i, error, found, pos = 0, neg = 0, low = -1; 535 struct dlm_config_node *node;
536 int i, error, neg = 0, low = -1;
193 537
194 /* previously removed members that we've not finished removing need to 538 /* previously removed members that we've not finished removing need to
195 count as a negative change so the "neg" recovery steps will happen */ 539 count as a negative change so the "neg" recovery steps will happen */
@@ -202,46 +546,32 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
202 /* move departed members from ls_nodes to ls_nodes_gone */ 546 /* move departed members from ls_nodes to ls_nodes_gone */
203 547
204 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { 548 list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) {
205 found = 0; 549 node = find_config_node(rv, memb->nodeid);
206 for (i = 0; i < rv->node_count; i++) { 550 if (node && !node->new)
207 if (memb->nodeid == rv->nodeids[i]) { 551 continue;
208 found = 1;
209 break;
210 }
211 }
212 552
213 if (!found) { 553 if (!node) {
214 neg++;
215 dlm_remove_member(ls, memb);
216 log_debug(ls, "remove member %d", memb->nodeid); 554 log_debug(ls, "remove member %d", memb->nodeid);
555 } else {
556 /* removed and re-added */
557 log_debug(ls, "remove member %d comm_seq %u %u",
558 memb->nodeid, memb->comm_seq, node->comm_seq);
217 } 559 }
218 }
219
220 /* Add an entry to ls_nodes_gone for members that were removed and
221 then added again, so that previous state for these nodes will be
222 cleared during recovery. */
223
224 for (i = 0; i < rv->new_count; i++) {
225 if (!dlm_is_member(ls, rv->new[i]))
226 continue;
227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
228 560
229 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
230 if (!memb)
231 return -ENOMEM;
232 memb->nodeid = rv->new[i];
233 list_add_tail(&memb->list, &ls->ls_nodes_gone);
234 neg++; 561 neg++;
562 list_move(&memb->list, &ls->ls_nodes_gone);
563 ls->ls_num_nodes--;
564 dlm_lsop_recover_slot(ls, memb);
235 } 565 }
236 566
237 /* add new members to ls_nodes */ 567 /* add new members to ls_nodes */
238 568
239 for (i = 0; i < rv->node_count; i++) { 569 for (i = 0; i < rv->nodes_count; i++) {
240 if (dlm_is_member(ls, rv->nodeids[i])) 570 node = &rv->nodes[i];
571 if (dlm_is_member(ls, node->nodeid))
241 continue; 572 continue;
242 dlm_add_member(ls, rv->nodeids[i]); 573 dlm_add_member(ls, node);
243 pos++; 574 log_debug(ls, "add member %d", node->nodeid);
244 log_debug(ls, "add member %d", rv->nodeids[i]);
245 } 575 }
246 576
247 list_for_each_entry(memb, &ls->ls_nodes, list) { 577 list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -251,7 +581,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
251 ls->ls_low_nodeid = low; 581 ls->ls_low_nodeid = low;
252 582
253 make_member_array(ls); 583 make_member_array(ls);
254 dlm_set_recover_status(ls, DLM_RS_NODES);
255 *neg_out = neg; 584 *neg_out = neg;
256 585
257 error = ping_members(ls); 586 error = ping_members(ls);
@@ -261,12 +590,8 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
261 ls->ls_members_result = error; 590 ls->ls_members_result = error;
262 complete(&ls->ls_members_done); 591 complete(&ls->ls_members_done);
263 } 592 }
264 if (error)
265 goto out;
266 593
267 error = dlm_recover_members_wait(ls); 594 log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
268 out:
269 log_debug(ls, "total members %d error %d", ls->ls_num_nodes, error);
270 return error; 595 return error;
271} 596}
272 597
@@ -327,26 +652,35 @@ int dlm_ls_stop(struct dlm_ls *ls)
327 */ 652 */
328 653
329 dlm_recoverd_suspend(ls); 654 dlm_recoverd_suspend(ls);
655
656 spin_lock(&ls->ls_recover_lock);
657 kfree(ls->ls_slots);
658 ls->ls_slots = NULL;
659 ls->ls_num_slots = 0;
660 ls->ls_slots_size = 0;
330 ls->ls_recover_status = 0; 661 ls->ls_recover_status = 0;
662 spin_unlock(&ls->ls_recover_lock);
663
331 dlm_recoverd_resume(ls); 664 dlm_recoverd_resume(ls);
332 665
333 if (!ls->ls_recover_begin) 666 if (!ls->ls_recover_begin)
334 ls->ls_recover_begin = jiffies; 667 ls->ls_recover_begin = jiffies;
668
669 dlm_lsop_recover_prep(ls);
335 return 0; 670 return 0;
336} 671}
337 672
338int dlm_ls_start(struct dlm_ls *ls) 673int dlm_ls_start(struct dlm_ls *ls)
339{ 674{
340 struct dlm_recover *rv = NULL, *rv_old; 675 struct dlm_recover *rv = NULL, *rv_old;
341 int *ids = NULL, *new = NULL; 676 struct dlm_config_node *nodes;
342 int error, ids_count = 0, new_count = 0; 677 int error, count;
343 678
344 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); 679 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
345 if (!rv) 680 if (!rv)
346 return -ENOMEM; 681 return -ENOMEM;
347 682
348 error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count, 683 error = dlm_config_nodes(ls->ls_name, &nodes, &count);
349 &new, &new_count);
350 if (error < 0) 684 if (error < 0)
351 goto fail; 685 goto fail;
352 686
@@ -361,10 +695,8 @@ int dlm_ls_start(struct dlm_ls *ls)
361 goto fail; 695 goto fail;
362 } 696 }
363 697
364 rv->nodeids = ids; 698 rv->nodes = nodes;
365 rv->node_count = ids_count; 699 rv->nodes_count = count;
366 rv->new = new;
367 rv->new_count = new_count;
368 rv->seq = ++ls->ls_recover_seq; 700 rv->seq = ++ls->ls_recover_seq;
369 rv_old = ls->ls_recover_args; 701 rv_old = ls->ls_recover_args;
370 ls->ls_recover_args = rv; 702 ls->ls_recover_args = rv;
@@ -372,9 +704,8 @@ int dlm_ls_start(struct dlm_ls *ls)
372 704
373 if (rv_old) { 705 if (rv_old) {
374 log_error(ls, "unused recovery %llx %d", 706 log_error(ls, "unused recovery %llx %d",
375 (unsigned long long)rv_old->seq, rv_old->node_count); 707 (unsigned long long)rv_old->seq, rv_old->nodes_count);
376 kfree(rv_old->nodeids); 708 kfree(rv_old->nodes);
377 kfree(rv_old->new);
378 kfree(rv_old); 709 kfree(rv_old);
379 } 710 }
380 711
@@ -383,8 +714,7 @@ int dlm_ls_start(struct dlm_ls *ls)
383 714
384 fail: 715 fail:
385 kfree(rv); 716 kfree(rv);
386 kfree(ids); 717 kfree(nodes);
387 kfree(new);
388 return error; 718 return error;
389} 719}
390 720
diff --git a/fs/dlm/member.h b/fs/dlm/member.h
index 7a26fca1e0b5..3deb70661c69 100644
--- a/fs/dlm/member.h
+++ b/fs/dlm/member.h
@@ -1,7 +1,7 @@
1/****************************************************************************** 1/******************************************************************************
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 4** Copyright (C) 2005-2011 Red Hat, Inc. All rights reserved.
5** 5**
6** This copyrighted material is made available to anyone wishing to use, 6** This copyrighted material is made available to anyone wishing to use,
7** modify, copy, or redistribute it subject to the terms and conditions 7** modify, copy, or redistribute it subject to the terms and conditions
@@ -20,6 +20,14 @@ void dlm_clear_members_gone(struct dlm_ls *ls);
20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out); 20int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv,int *neg_out);
21int dlm_is_removed(struct dlm_ls *ls, int nodeid); 21int dlm_is_removed(struct dlm_ls *ls, int nodeid);
22int dlm_is_member(struct dlm_ls *ls, int nodeid); 22int dlm_is_member(struct dlm_ls *ls, int nodeid);
23int dlm_slots_version(struct dlm_header *h);
24void dlm_slot_save(struct dlm_ls *ls, struct dlm_rcom *rc,
25 struct dlm_member *memb);
26void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc);
27int dlm_slots_copy_in(struct dlm_ls *ls);
28int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
29 struct dlm_slot **slots_out, uint32_t *gen_out);
30void dlm_lsop_recover_done(struct dlm_ls *ls);
23 31
24#endif /* __MEMBER_DOT_H__ */ 32#endif /* __MEMBER_DOT_H__ */
25 33
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index f10a50f24e8f..ac5c616c9696 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -23,6 +23,7 @@
23#include "memory.h" 23#include "memory.h"
24#include "lock.h" 24#include "lock.h"
25#include "util.h" 25#include "util.h"
26#include "member.h"
26 27
27 28
28static int rcom_response(struct dlm_ls *ls) 29static int rcom_response(struct dlm_ls *ls)
@@ -72,20 +73,30 @@ static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
72 dlm_lowcomms_commit_buffer(mh); 73 dlm_lowcomms_commit_buffer(mh);
73} 74}
74 75
76static void set_rcom_status(struct dlm_ls *ls, struct rcom_status *rs,
77 uint32_t flags)
78{
79 rs->rs_flags = cpu_to_le32(flags);
80}
81
75/* When replying to a status request, a node also sends back its 82/* When replying to a status request, a node also sends back its
76 configuration values. The requesting node then checks that the remote 83 configuration values. The requesting node then checks that the remote
77 node is configured the same way as itself. */ 84 node is configured the same way as itself. */
78 85
79static void make_config(struct dlm_ls *ls, struct rcom_config *rf) 86static void set_rcom_config(struct dlm_ls *ls, struct rcom_config *rf,
87 uint32_t num_slots)
80{ 88{
81 rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen); 89 rf->rf_lvblen = cpu_to_le32(ls->ls_lvblen);
82 rf->rf_lsflags = cpu_to_le32(ls->ls_exflags); 90 rf->rf_lsflags = cpu_to_le32(ls->ls_exflags);
91
92 rf->rf_our_slot = cpu_to_le16(ls->ls_slot);
93 rf->rf_num_slots = cpu_to_le16(num_slots);
94 rf->rf_generation = cpu_to_le32(ls->ls_generation);
83} 95}
84 96
85static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) 97static int check_rcom_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
86{ 98{
87 struct rcom_config *rf = (struct rcom_config *) rc->rc_buf; 99 struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
88 size_t conf_size = sizeof(struct dlm_rcom) + sizeof(struct rcom_config);
89 100
90 if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) { 101 if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
91 log_error(ls, "version mismatch: %x nodeid %d: %x", 102 log_error(ls, "version mismatch: %x nodeid %d: %x",
@@ -94,12 +105,6 @@ static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
94 return -EPROTO; 105 return -EPROTO;
95 } 106 }
96 107
97 if (rc->rc_header.h_length < conf_size) {
98 log_error(ls, "config too short: %d nodeid %d",
99 rc->rc_header.h_length, nodeid);
100 return -EPROTO;
101 }
102
103 if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen || 108 if (le32_to_cpu(rf->rf_lvblen) != ls->ls_lvblen ||
104 le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) { 109 le32_to_cpu(rf->rf_lsflags) != ls->ls_exflags) {
105 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", 110 log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -127,7 +132,18 @@ static void disallow_sync_reply(struct dlm_ls *ls)
127 spin_unlock(&ls->ls_rcom_spin); 132 spin_unlock(&ls->ls_rcom_spin);
128} 133}
129 134
130int dlm_rcom_status(struct dlm_ls *ls, int nodeid) 135/*
136 * low nodeid gathers one slot value at a time from each node.
137 * it sets need_slots=0, and saves rf_our_slot returned from each
138 * rcom_config.
139 *
140 * other nodes gather all slot values at once from the low nodeid.
141 * they set need_slots=1, and ignore the rf_our_slot returned from each
142 * rcom_config. they use the rf_num_slots returned from the low
143 * node's rcom_config.
144 */
145
146int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags)
131{ 147{
132 struct dlm_rcom *rc; 148 struct dlm_rcom *rc;
133 struct dlm_mhandle *mh; 149 struct dlm_mhandle *mh;
@@ -141,10 +157,13 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
141 goto out; 157 goto out;
142 } 158 }
143 159
144 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); 160 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS,
161 sizeof(struct rcom_status), &rc, &mh);
145 if (error) 162 if (error)
146 goto out; 163 goto out;
147 164
165 set_rcom_status(ls, (struct rcom_status *)rc->rc_buf, status_flags);
166
148 allow_sync_reply(ls, &rc->rc_id); 167 allow_sync_reply(ls, &rc->rc_id);
149 memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size); 168 memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
150 169
@@ -161,8 +180,11 @@ int dlm_rcom_status(struct dlm_ls *ls, int nodeid)
161 /* we pretend the remote lockspace exists with 0 status */ 180 /* we pretend the remote lockspace exists with 0 status */
162 log_debug(ls, "remote node %d not ready", nodeid); 181 log_debug(ls, "remote node %d not ready", nodeid);
163 rc->rc_result = 0; 182 rc->rc_result = 0;
164 } else 183 error = 0;
165 error = check_config(ls, rc, nodeid); 184 } else {
185 error = check_rcom_config(ls, rc, nodeid);
186 }
187
166 /* the caller looks at rc_result for the remote recovery status */ 188 /* the caller looks at rc_result for the remote recovery status */
167 out: 189 out:
168 return error; 190 return error;
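
DLM_RCOM_STATUS now carries a small payload instead of an empty buffer. A sketch of that payload as implied by set_rcom_status() above; only rs_flags is written there, and any trailing padding is an assumption (the real layout is in fs/dlm/dlm_internal.h):

	struct rcom_status {
		__le32 rs_flags;	/* 0, or DLM_RSF_NEED_SLOTS */
	};
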
@@ -172,17 +194,60 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
172{ 194{
173 struct dlm_rcom *rc; 195 struct dlm_rcom *rc;
174 struct dlm_mhandle *mh; 196 struct dlm_mhandle *mh;
175 int error, nodeid = rc_in->rc_header.h_nodeid; 197 struct rcom_status *rs;
198 uint32_t status;
199 int nodeid = rc_in->rc_header.h_nodeid;
200 int len = sizeof(struct rcom_config);
201 int num_slots = 0;
202 int error;
203
204 if (!dlm_slots_version(&rc_in->rc_header)) {
205 status = dlm_recover_status(ls);
206 goto do_create;
207 }
208
209 rs = (struct rcom_status *)rc_in->rc_buf;
176 210
211 if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) {
212 status = dlm_recover_status(ls);
213 goto do_create;
214 }
215
216 spin_lock(&ls->ls_recover_lock);
217 status = ls->ls_recover_status;
218 num_slots = ls->ls_num_slots;
219 spin_unlock(&ls->ls_recover_lock);
220 len += num_slots * sizeof(struct rcom_slot);
221
222 do_create:
177 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, 223 error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY,
178 sizeof(struct rcom_config), &rc, &mh); 224 len, &rc, &mh);
179 if (error) 225 if (error)
180 return; 226 return;
227
181 rc->rc_id = rc_in->rc_id; 228 rc->rc_id = rc_in->rc_id;
182 rc->rc_seq_reply = rc_in->rc_seq; 229 rc->rc_seq_reply = rc_in->rc_seq;
183 rc->rc_result = dlm_recover_status(ls); 230 rc->rc_result = status;
184 make_config(ls, (struct rcom_config *) rc->rc_buf); 231
232 set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, num_slots);
233
234 if (!num_slots)
235 goto do_send;
236
237 spin_lock(&ls->ls_recover_lock);
238 if (ls->ls_num_slots != num_slots) {
239 spin_unlock(&ls->ls_recover_lock);
240 log_debug(ls, "receive_rcom_status num_slots %d to %d",
241 num_slots, ls->ls_num_slots);
242 rc->rc_result = 0;
243 set_rcom_config(ls, (struct rcom_config *)rc->rc_buf, 0);
244 goto do_send;
245 }
246
247 dlm_slots_copy_out(ls, rc);
248 spin_unlock(&ls->ls_recover_lock);
185 249
250 do_send:
186 send_rcom(ls, mh, rc); 251 send_rcom(ls, mh, rc);
187} 252}
188 253
diff --git a/fs/dlm/rcom.h b/fs/dlm/rcom.h
index b09abd29ba38..206723ab744d 100644
--- a/fs/dlm/rcom.h
+++ b/fs/dlm/rcom.h
@@ -14,7 +14,7 @@
14#ifndef __RCOM_DOT_H__ 14#ifndef __RCOM_DOT_H__
15#define __RCOM_DOT_H__ 15#define __RCOM_DOT_H__
16 16
17int dlm_rcom_status(struct dlm_ls *ls, int nodeid); 17int dlm_rcom_status(struct dlm_ls *ls, int nodeid, uint32_t status_flags);
18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); 18int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len);
19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); 19int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid);
20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 20int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 14638235f7b2..34d5adf1fce7 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -85,14 +85,20 @@ uint32_t dlm_recover_status(struct dlm_ls *ls)
85 return status; 85 return status;
86} 86}
87 87
88static void _set_recover_status(struct dlm_ls *ls, uint32_t status)
89{
90 ls->ls_recover_status |= status;
91}
92
88void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status) 93void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
89{ 94{
90 spin_lock(&ls->ls_recover_lock); 95 spin_lock(&ls->ls_recover_lock);
91 ls->ls_recover_status |= status; 96 _set_recover_status(ls, status);
92 spin_unlock(&ls->ls_recover_lock); 97 spin_unlock(&ls->ls_recover_lock);
93} 98}
94 99
95static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status) 100static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status,
101 int save_slots)
96{ 102{
97 struct dlm_rcom *rc = ls->ls_recover_buf; 103 struct dlm_rcom *rc = ls->ls_recover_buf;
98 struct dlm_member *memb; 104 struct dlm_member *memb;
@@ -106,10 +112,13 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
106 goto out; 112 goto out;
107 } 113 }
108 114
109 error = dlm_rcom_status(ls, memb->nodeid); 115 error = dlm_rcom_status(ls, memb->nodeid, 0);
110 if (error) 116 if (error)
111 goto out; 117 goto out;
112 118
119 if (save_slots)
120 dlm_slot_save(ls, rc, memb);
121
113 if (rc->rc_result & wait_status) 122 if (rc->rc_result & wait_status)
114 break; 123 break;
115 if (delay < 1000) 124 if (delay < 1000)
@@ -121,7 +130,8 @@ static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
121 return error; 130 return error;
122} 131}
123 132
124static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status) 133static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status,
134 uint32_t status_flags)
125{ 135{
126 struct dlm_rcom *rc = ls->ls_recover_buf; 136 struct dlm_rcom *rc = ls->ls_recover_buf;
127 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid; 137 int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;
@@ -132,7 +142,7 @@ static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
132 goto out; 142 goto out;
133 } 143 }
134 144
135 error = dlm_rcom_status(ls, nodeid); 145 error = dlm_rcom_status(ls, nodeid, status_flags);
136 if (error) 146 if (error)
137 break; 147 break;
138 148
@@ -152,18 +162,56 @@ static int wait_status(struct dlm_ls *ls, uint32_t status)
152 int error; 162 int error;
153 163
154 if (ls->ls_low_nodeid == dlm_our_nodeid()) { 164 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
155 error = wait_status_all(ls, status); 165 error = wait_status_all(ls, status, 0);
156 if (!error) 166 if (!error)
157 dlm_set_recover_status(ls, status_all); 167 dlm_set_recover_status(ls, status_all);
158 } else 168 } else
159 error = wait_status_low(ls, status_all); 169 error = wait_status_low(ls, status_all, 0);
160 170
161 return error; 171 return error;
162} 172}
163 173
164int dlm_recover_members_wait(struct dlm_ls *ls) 174int dlm_recover_members_wait(struct dlm_ls *ls)
165{ 175{
166 return wait_status(ls, DLM_RS_NODES); 176 struct dlm_member *memb;
177 struct dlm_slot *slots;
178 int num_slots, slots_size;
179 int error, rv;
180 uint32_t gen;
181
182 list_for_each_entry(memb, &ls->ls_nodes, list) {
183 memb->slot = -1;
184 memb->generation = 0;
185 }
186
187 if (ls->ls_low_nodeid == dlm_our_nodeid()) {
188 error = wait_status_all(ls, DLM_RS_NODES, 1);
189 if (error)
190 goto out;
191
192 /* slots array is sparse, slots_size may be > num_slots */
193
194 rv = dlm_slots_assign(ls, &num_slots, &slots_size, &slots, &gen);
195 if (!rv) {
196 spin_lock(&ls->ls_recover_lock);
197 _set_recover_status(ls, DLM_RS_NODES_ALL);
198 ls->ls_num_slots = num_slots;
199 ls->ls_slots_size = slots_size;
200 ls->ls_slots = slots;
201 ls->ls_generation = gen;
202 spin_unlock(&ls->ls_recover_lock);
203 } else {
204 dlm_set_recover_status(ls, DLM_RS_NODES_ALL);
205 }
206 } else {
207 error = wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS);
208 if (error)
209 goto out;
210
211 dlm_slots_copy_in(ls);
212 }
213 out:
214 return error;
167} 215}
168 216
169int dlm_recover_directory_wait(struct dlm_ls *ls) 217int dlm_recover_directory_wait(struct dlm_ls *ls)
@@ -542,8 +590,6 @@ int dlm_recover_locks(struct dlm_ls *ls)
542 out: 590 out:
543 if (error) 591 if (error)
544 recover_list_clear(ls); 592 recover_list_clear(ls);
545 else
546 dlm_set_recover_status(ls, DLM_RS_LOCKS);
547 return error; 593 return error;
548} 594}
549 595
@@ -715,6 +761,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
715 761
716int dlm_create_root_list(struct dlm_ls *ls) 762int dlm_create_root_list(struct dlm_ls *ls)
717{ 763{
764 struct rb_node *n;
718 struct dlm_rsb *r; 765 struct dlm_rsb *r;
719 int i, error = 0; 766 int i, error = 0;
720 767
@@ -727,7 +774,8 @@ int dlm_create_root_list(struct dlm_ls *ls)
727 774
728 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 775 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
729 spin_lock(&ls->ls_rsbtbl[i].lock); 776 spin_lock(&ls->ls_rsbtbl[i].lock);
730 list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { 777 for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
778 r = rb_entry(n, struct dlm_rsb, res_hashnode);
731 list_add(&r->res_root_list, &ls->ls_root_list); 779 list_add(&r->res_root_list, &ls->ls_root_list);
732 dlm_hold_rsb(r); 780 dlm_hold_rsb(r);
733 } 781 }
@@ -741,7 +789,8 @@ int dlm_create_root_list(struct dlm_ls *ls)
741 continue; 789 continue;
742 } 790 }
743 791
744 list_for_each_entry(r, &ls->ls_rsbtbl[i].toss, res_hashchain) { 792 for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = rb_next(n)) {
793 r = rb_entry(n, struct dlm_rsb, res_hashnode);
745 list_add(&r->res_root_list, &ls->ls_root_list); 794 list_add(&r->res_root_list, &ls->ls_root_list);
746 dlm_hold_rsb(r); 795 dlm_hold_rsb(r);
747 } 796 }
@@ -771,16 +820,18 @@ void dlm_release_root_list(struct dlm_ls *ls)
771 820
772void dlm_clear_toss_list(struct dlm_ls *ls) 821void dlm_clear_toss_list(struct dlm_ls *ls)
773{ 822{
774 struct dlm_rsb *r, *safe; 823 struct rb_node *n, *next;
824 struct dlm_rsb *rsb;
775 int i; 825 int i;
776 826
777 for (i = 0; i < ls->ls_rsbtbl_size; i++) { 827 for (i = 0; i < ls->ls_rsbtbl_size; i++) {
778 spin_lock(&ls->ls_rsbtbl[i].lock); 828 spin_lock(&ls->ls_rsbtbl[i].lock);
779 list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss, 829 for (n = rb_first(&ls->ls_rsbtbl[i].toss); n; n = next) {
780 res_hashchain) { 830 next = rb_next(n);
781 if (dlm_no_directory(ls) || !is_master(r)) { 831 rsb = rb_entry(n, struct dlm_rsb, res_hashnode);
782 list_del(&r->res_hashchain); 832 if (dlm_no_directory(ls) || !is_master(rsb)) {
783 dlm_free_rsb(r); 833 rb_erase(n, &ls->ls_rsbtbl[i].toss);
834 dlm_free_rsb(rsb);
784 } 835 }
785 } 836 }
786 spin_unlock(&ls->ls_rsbtbl[i].lock); 837 spin_unlock(&ls->ls_rsbtbl[i].lock);
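
The toss-list walk is the standard safe-erase pattern for rbtrees: rb_next() must be fetched before rb_erase() unlinks the node. The same pattern in isolation, as a generic kernel-side sketch using the stock <linux/rbtree.h> API:

	static void prune(struct rb_root *root,
			  bool (*dead)(struct rb_node *n))
	{
		struct rb_node *n, *next;

		for (n = rb_first(root); n; n = next) {
			next = rb_next(n);	/* fetch before erasing */
			if (dead(n))
				rb_erase(n, root);
		}
	}
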
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 774da3cf92c6..3780caf7ae0c 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -2,7 +2,7 @@
2******************************************************************************* 2*******************************************************************************
3** 3**
4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 4** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 5** Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved.
6** 6**
7** This copyrighted material is made available to anyone wishing to use, 7** This copyrighted material is made available to anyone wishing to use,
8** modify, copy, or redistribute it subject to the terms and conditions 8** modify, copy, or redistribute it subject to the terms and conditions
@@ -54,7 +54,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
54 unsigned long start; 54 unsigned long start;
55 int error, neg = 0; 55 int error, neg = 0;
56 56
57 log_debug(ls, "recover %llx", (unsigned long long)rv->seq); 57 log_debug(ls, "dlm_recover %llx", (unsigned long long)rv->seq);
58 58
59 mutex_lock(&ls->ls_recoverd_active); 59 mutex_lock(&ls->ls_recoverd_active);
60 60
@@ -76,14 +76,22 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
76 76
77 /* 77 /*
78 * Add or remove nodes from the lockspace's ls_nodes list. 78 * Add or remove nodes from the lockspace's ls_nodes list.
79 * Also waits for all nodes to complete dlm_recover_members.
80 */ 79 */
81 80
82 error = dlm_recover_members(ls, rv, &neg); 81 error = dlm_recover_members(ls, rv, &neg);
83 if (error) { 82 if (error) {
84 log_debug(ls, "recover_members failed %d", error); 83 log_debug(ls, "dlm_recover_members error %d", error);
85 goto fail; 84 goto fail;
86 } 85 }
86
87 dlm_set_recover_status(ls, DLM_RS_NODES);
88
89 error = dlm_recover_members_wait(ls);
90 if (error) {
91 log_debug(ls, "dlm_recover_members_wait error %d", error);
92 goto fail;
93 }
94
87 start = jiffies; 95 start = jiffies;
88 96
89 /* 97 /*
@@ -93,17 +101,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
93 101
94 error = dlm_recover_directory(ls); 102 error = dlm_recover_directory(ls);
95 if (error) { 103 if (error) {
96 log_debug(ls, "recover_directory failed %d", error); 104 log_debug(ls, "dlm_recover_directory error %d", error);
97 goto fail; 105 goto fail;
98 } 106 }
99 107
100 /* 108 dlm_set_recover_status(ls, DLM_RS_DIR);
101 * Wait for all nodes to complete directory rebuild.
102 */
103 109
104 error = dlm_recover_directory_wait(ls); 110 error = dlm_recover_directory_wait(ls);
105 if (error) { 111 if (error) {
106 log_debug(ls, "recover_directory_wait failed %d", error); 112 log_debug(ls, "dlm_recover_directory_wait error %d", error);
107 goto fail; 113 goto fail;
108 } 114 }
109 115
@@ -133,7 +139,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
133 139
134 error = dlm_recover_masters(ls); 140 error = dlm_recover_masters(ls);
135 if (error) { 141 if (error) {
136 log_debug(ls, "recover_masters failed %d", error); 142 log_debug(ls, "dlm_recover_masters error %d", error);
137 goto fail; 143 goto fail;
138 } 144 }
139 145
@@ -143,13 +149,15 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
143 149
144 error = dlm_recover_locks(ls); 150 error = dlm_recover_locks(ls);
145 if (error) { 151 if (error) {
146 log_debug(ls, "recover_locks failed %d", error); 152 log_debug(ls, "dlm_recover_locks error %d", error);
147 goto fail; 153 goto fail;
148 } 154 }
149 155
156 dlm_set_recover_status(ls, DLM_RS_LOCKS);
157
150 error = dlm_recover_locks_wait(ls); 158 error = dlm_recover_locks_wait(ls);
151 if (error) { 159 if (error) {
152 log_debug(ls, "recover_locks_wait failed %d", error); 160 log_debug(ls, "dlm_recover_locks_wait error %d", error);
153 goto fail; 161 goto fail;
154 } 162 }
155 163
@@ -170,7 +178,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
170 178
171 error = dlm_recover_locks_wait(ls); 179 error = dlm_recover_locks_wait(ls);
172 if (error) { 180 if (error) {
173 log_debug(ls, "recover_locks_wait failed %d", error); 181 log_debug(ls, "dlm_recover_locks_wait error %d", error);
174 goto fail; 182 goto fail;
175 } 183 }
176 } 184 }
@@ -186,9 +194,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
186 dlm_purge_requestqueue(ls); 194 dlm_purge_requestqueue(ls);
187 195
188 dlm_set_recover_status(ls, DLM_RS_DONE); 196 dlm_set_recover_status(ls, DLM_RS_DONE);
197
189 error = dlm_recover_done_wait(ls); 198 error = dlm_recover_done_wait(ls);
190 if (error) { 199 if (error) {
191 log_debug(ls, "recover_done_wait failed %d", error); 200 log_debug(ls, "dlm_recover_done_wait error %d", error);
192 goto fail; 201 goto fail;
193 } 202 }
194 203
@@ -200,34 +209,35 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
200 209
201 error = enable_locking(ls, rv->seq); 210 error = enable_locking(ls, rv->seq);
202 if (error) { 211 if (error) {
203 log_debug(ls, "enable_locking failed %d", error); 212 log_debug(ls, "enable_locking error %d", error);
204 goto fail; 213 goto fail;
205 } 214 }
206 215
207 error = dlm_process_requestqueue(ls); 216 error = dlm_process_requestqueue(ls);
208 if (error) { 217 if (error) {
209 log_debug(ls, "process_requestqueue failed %d", error); 218 log_debug(ls, "dlm_process_requestqueue error %d", error);
210 goto fail; 219 goto fail;
211 } 220 }
212 221
213 error = dlm_recover_waiters_post(ls); 222 error = dlm_recover_waiters_post(ls);
214 if (error) { 223 if (error) {
215 log_debug(ls, "recover_waiters_post failed %d", error); 224 log_debug(ls, "dlm_recover_waiters_post error %d", error);
216 goto fail; 225 goto fail;
217 } 226 }
218 227
219 dlm_grant_after_purge(ls); 228 dlm_grant_after_purge(ls);
220 229
221 log_debug(ls, "recover %llx done: %u ms", 230 log_debug(ls, "dlm_recover %llx generation %u done: %u ms",
222 (unsigned long long)rv->seq, 231 (unsigned long long)rv->seq, ls->ls_generation,
223 jiffies_to_msecs(jiffies - start)); 232 jiffies_to_msecs(jiffies - start));
224 mutex_unlock(&ls->ls_recoverd_active); 233 mutex_unlock(&ls->ls_recoverd_active);
225 234
235 dlm_lsop_recover_done(ls);
226 return 0; 236 return 0;
227 237
228 fail: 238 fail:
229 dlm_release_root_list(ls); 239 dlm_release_root_list(ls);
230 log_debug(ls, "recover %llx error %d", 240 log_debug(ls, "dlm_recover %llx error %d",
231 (unsigned long long)rv->seq, error); 241 (unsigned long long)rv->seq, error);
232 mutex_unlock(&ls->ls_recoverd_active); 242 mutex_unlock(&ls->ls_recoverd_active);
233 return error; 243 return error;
@@ -250,8 +260,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
250 260
251 if (rv) { 261 if (rv) {
252 ls_recover(ls, rv); 262 ls_recover(ls, rv);
253 kfree(rv->nodeids); 263 kfree(rv->nodes);
254 kfree(rv->new);
255 kfree(rv); 264 kfree(rv);
256 } 265 }
257} 266}
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index d8ea60756403..eb4ed9ba3098 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -392,8 +392,9 @@ static int device_create_lockspace(struct dlm_lspace_params *params)
392 if (!capable(CAP_SYS_ADMIN)) 392 if (!capable(CAP_SYS_ADMIN))
393 return -EPERM; 393 return -EPERM;
394 394
395 error = dlm_new_lockspace(params->name, strlen(params->name), 395 error = dlm_new_lockspace(params->name, NULL, params->flags,
396 &lockspace, params->flags, DLM_USER_LVB_LEN); 396 DLM_USER_LVB_LEN, NULL, NULL, NULL,
397 &lockspace);
397 if (error) 398 if (error)
398 return error; 399 return error;
399 400
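
The userspace device path passes NULL for all of the new arguments, which pins down the extended signature: name, cluster name, flags, lvblen, ops, ops_arg, ops_result, and the lockspace handle out-parameter. A kernel caller that does want the slot callbacks would look roughly like this (my_ops/my_arg as sketched earlier, lvblen of 32 arbitrary, and the exact ops_result semantics an assumption):

	dlm_lockspace_t *ls;
	int ops_result, error;

	error = dlm_new_lockspace("myfs", "mycluster", 0, 32,
				  &my_ops, my_arg, &ops_result, &ls);
	if (!error && ops_result < 0)
		; /* lockspace exists, but recovery callbacks unavailable */
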
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 2a834255c75d..63ab24510649 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -417,17 +417,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
417 (unsigned long long)(extent_base + extent_offset), rc); 417 (unsigned long long)(extent_base + extent_offset), rc);
418 goto out; 418 goto out;
419 } 419 }
420 if (unlikely(ecryptfs_verbosity > 0)) {
421 ecryptfs_printk(KERN_DEBUG, "Encrypting extent "
422 "with iv:\n");
423 ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
424 ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
425 "encryption:\n");
426 ecryptfs_dump_hex((char *)
427 (page_address(page)
428 + (extent_offset * crypt_stat->extent_size)),
429 8);
430 }
431 rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0, 420 rc = ecryptfs_encrypt_page_offset(crypt_stat, enc_extent_page, 0,
432 page, (extent_offset 421 page, (extent_offset
433 * crypt_stat->extent_size), 422 * crypt_stat->extent_size),
@@ -440,14 +429,6 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page,
440 goto out; 429 goto out;
441 } 430 }
442 rc = 0; 431 rc = 0;
443 if (unlikely(ecryptfs_verbosity > 0)) {
444 ecryptfs_printk(KERN_DEBUG, "Encrypt extent [0x%.16llx]; "
445 "rc = [%d]\n",
446 (unsigned long long)(extent_base + extent_offset), rc);
447 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
448 "encryption:\n");
449 ecryptfs_dump_hex((char *)(page_address(enc_extent_page)), 8);
450 }
451out: 432out:
452 return rc; 433 return rc;
453} 434}
@@ -543,17 +524,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
543 (unsigned long long)(extent_base + extent_offset), rc); 524 (unsigned long long)(extent_base + extent_offset), rc);
544 goto out; 525 goto out;
545 } 526 }
546 if (unlikely(ecryptfs_verbosity > 0)) {
547 ecryptfs_printk(KERN_DEBUG, "Decrypting extent "
548 "with iv:\n");
549 ecryptfs_dump_hex(extent_iv, crypt_stat->iv_bytes);
550 ecryptfs_printk(KERN_DEBUG, "First 8 bytes before "
551 "decryption:\n");
552 ecryptfs_dump_hex((char *)
553 (page_address(enc_extent_page)
554 + (extent_offset * crypt_stat->extent_size)),
555 8);
556 }
557 rc = ecryptfs_decrypt_page_offset(crypt_stat, page, 527 rc = ecryptfs_decrypt_page_offset(crypt_stat, page,
558 (extent_offset 528 (extent_offset
559 * crypt_stat->extent_size), 529 * crypt_stat->extent_size),
@@ -567,16 +537,6 @@ static int ecryptfs_decrypt_extent(struct page *page,
567 goto out; 537 goto out;
568 } 538 }
569 rc = 0; 539 rc = 0;
570 if (unlikely(ecryptfs_verbosity > 0)) {
571 ecryptfs_printk(KERN_DEBUG, "Decrypt extent [0x%.16llx]; "
572 "rc = [%d]\n",
573 (unsigned long long)(extent_base + extent_offset), rc);
574 ecryptfs_printk(KERN_DEBUG, "First 8 bytes after "
575 "decryption:\n");
576 ecryptfs_dump_hex((char *)(page_address(page)
577 + (extent_offset
578 * crypt_stat->extent_size)), 8);
579 }
580out: 540out:
581 return rc; 541 return rc;
582} 542}
@@ -1590,8 +1550,8 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry,
1590 */ 1550 */
1591int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) 1551int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1592{ 1552{
1593 int rc = 0; 1553 int rc;
1594 char *page_virt = NULL; 1554 char *page_virt;
1595 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 1555 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
1596 struct ecryptfs_crypt_stat *crypt_stat = 1556 struct ecryptfs_crypt_stat *crypt_stat =
1597 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 1557 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
@@ -1616,11 +1576,13 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1616 ecryptfs_dentry, 1576 ecryptfs_dentry,
1617 ECRYPTFS_VALIDATE_HEADER_SIZE); 1577 ECRYPTFS_VALIDATE_HEADER_SIZE);
1618 if (rc) { 1578 if (rc) {
1579 /* metadata is not in the file header, so try xattrs */
1619 memset(page_virt, 0, PAGE_CACHE_SIZE); 1580 memset(page_virt, 0, PAGE_CACHE_SIZE);
1620 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); 1581 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
1621 if (rc) { 1582 if (rc) {
1622 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1583 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
1623 "file header region or xattr region\n"); 1584 "file header region or xattr region, inode %lu\n",
1585 ecryptfs_inode->i_ino);
1624 rc = -EINVAL; 1586 rc = -EINVAL;
1625 goto out; 1587 goto out;
1626 } 1588 }
@@ -1629,7 +1591,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1629 ECRYPTFS_DONT_VALIDATE_HEADER_SIZE); 1591 ECRYPTFS_DONT_VALIDATE_HEADER_SIZE);
1630 if (rc) { 1592 if (rc) {
1631 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1593 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
1632 "file xattr region either\n"); 1594 "file xattr region either, inode %lu\n",
1595 ecryptfs_inode->i_ino);
1633 rc = -EINVAL; 1596 rc = -EINVAL;
1634 } 1597 }
1635 if (crypt_stat->mount_crypt_stat->flags 1598 if (crypt_stat->mount_crypt_stat->flags
@@ -1640,7 +1603,8 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1640 "crypto metadata only in the extended attribute " 1603 "crypto metadata only in the extended attribute "
1641 "region, but eCryptfs was mounted without " 1604 "region, but eCryptfs was mounted without "
1642 "xattr support enabled. eCryptfs will not treat " 1605 "xattr support enabled. eCryptfs will not treat "
1643 "this like an encrypted file.\n"); 1606 "this like an encrypted file, inode %lu\n",
1607 ecryptfs_inode->i_ino);
1644 rc = -EINVAL; 1608 rc = -EINVAL;
1645 } 1609 }
1646 } 1610 }
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index a9f29b12fbf2..a2362df58ae8 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -151,6 +151,11 @@ ecryptfs_get_key_payload_data(struct key *key)
151 * dentry name */ 151 * dentry name */
152#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as 152#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as
153 * metadata */ 153 * metadata */
154#define ECRYPTFS_MIN_PKT_LEN_SIZE 1 /* Min size to specify packet length */
155#define ECRYPTFS_MAX_PKT_LEN_SIZE 2 /* Pass at least this many bytes to
156 * ecryptfs_parse_packet_length() and
157 * ecryptfs_write_packet_length()
158 */
154/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= 159/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >=
155 * ECRYPTFS_MAX_IV_BYTES */ 160 * ECRYPTFS_MAX_IV_BYTES */
156#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 161#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 19a8ca4ab1dd..19892d7d2ed1 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -822,18 +822,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
822 size_t num_zeros = (PAGE_CACHE_SIZE 822 size_t num_zeros = (PAGE_CACHE_SIZE
823 - (ia->ia_size & ~PAGE_CACHE_MASK)); 823 - (ia->ia_size & ~PAGE_CACHE_MASK));
824 824
825
826 /*
827 * XXX(truncate) this should really happen at the begginning
828 * of ->setattr. But the code is too messy to that as part
829 * of a larger patch. ecryptfs is also totally missing out
830 * on the inode_change_ok check at the beginning of
831 * ->setattr while would include this.
832 */
833 rc = inode_newsize_ok(inode, ia->ia_size);
834 if (rc)
835 goto out;
836
837 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 825 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
838 truncate_setsize(inode, ia->ia_size); 826 truncate_setsize(inode, ia->ia_size);
839 lower_ia->ia_size = ia->ia_size; 827 lower_ia->ia_size = ia->ia_size;
@@ -883,6 +871,28 @@ out:
883 return rc; 871 return rc;
884} 872}
885 873
874static int ecryptfs_inode_newsize_ok(struct inode *inode, loff_t offset)
875{
876 struct ecryptfs_crypt_stat *crypt_stat;
877 loff_t lower_oldsize, lower_newsize;
878
879 crypt_stat = &ecryptfs_inode_to_private(inode)->crypt_stat;
880 lower_oldsize = upper_size_to_lower_size(crypt_stat,
881 i_size_read(inode));
882 lower_newsize = upper_size_to_lower_size(crypt_stat, offset);
883 if (lower_newsize > lower_oldsize) {
884 /*
885 * The eCryptfs inode and the new *lower* size are mixed here
886 * because we may not have the lower i_mutex held and/or it may
887 * not be appropriate to call inode_newsize_ok() with inodes
888 * from other filesystems.
889 */
890 return inode_newsize_ok(inode, lower_newsize);
891 }
892
893 return 0;
894}
895
886/** 896/**
887 * ecryptfs_truncate 897 * ecryptfs_truncate
888 * @dentry: The ecryptfs layer dentry 898 * @dentry: The ecryptfs layer dentry
@@ -899,6 +909,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
899 struct iattr lower_ia = { .ia_valid = 0 }; 909 struct iattr lower_ia = { .ia_valid = 0 };
900 int rc; 910 int rc;
901 911
912 rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length);
913 if (rc)
914 return rc;
915
902 rc = truncate_upper(dentry, &ia, &lower_ia); 916 rc = truncate_upper(dentry, &ia, &lower_ia);
903 if (!rc && lower_ia.ia_valid & ATTR_SIZE) { 917 if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
904 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); 918 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
@@ -978,6 +992,16 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
978 } 992 }
979 } 993 }
980 mutex_unlock(&crypt_stat->cs_mutex); 994 mutex_unlock(&crypt_stat->cs_mutex);
995
996 rc = inode_change_ok(inode, ia);
997 if (rc)
998 goto out;
999 if (ia->ia_valid & ATTR_SIZE) {
1000 rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size);
1001 if (rc)
1002 goto out;
1003 }
1004
981 if (S_ISREG(inode->i_mode)) { 1005 if (S_ISREG(inode->i_mode)) {
982 rc = filemap_write_and_wait(inode->i_mapping); 1006 rc = filemap_write_and_wait(inode->i_mapping);
983 if (rc) 1007 if (rc)
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ac1ad48c2376..8e3b943e330f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -109,7 +109,7 @@ int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
109 (*size) += ((unsigned char)(data[1]) + 192); 109 (*size) += ((unsigned char)(data[1]) + 192);
110 (*length_size) = 2; 110 (*length_size) = 2;
111 } else if (data[0] == 255) { 111 } else if (data[0] == 255) {
112 /* Five-byte length; we're not supposed to see this */ 112 /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
113 ecryptfs_printk(KERN_ERR, "Five-byte packet length not " 113 ecryptfs_printk(KERN_ERR, "Five-byte packet length not "
114 "supported\n"); 114 "supported\n");
115 rc = -EINVAL; 115 rc = -EINVAL;
@@ -126,7 +126,7 @@ out:
126/** 126/**
127 * ecryptfs_write_packet_length 127 * ecryptfs_write_packet_length
128 * @dest: The byte array target into which to write the length. Must 128 * @dest: The byte array target into which to write the length. Must
129 * have at least 5 bytes allocated. 129 * have at least ECRYPTFS_MAX_PKT_LEN_SIZE bytes allocated.
130 * @size: The length to write. 130 * @size: The length to write.
131 * @packet_size_length: The number of bytes used to encode the packet 131 * @packet_size_length: The number of bytes used to encode the packet
132 * length is written to this address. 132 * length is written to this address.
@@ -146,6 +146,7 @@ int ecryptfs_write_packet_length(char *dest, size_t size,
146 dest[1] = ((size - 192) % 256); 146 dest[1] = ((size - 192) % 256);
147 (*packet_size_length) = 2; 147 (*packet_size_length) = 2;
148 } else { 148 } else {
149 /* If support is added, adjust ECRYPTFS_MAX_PKT_LEN_SIZE */
149 rc = -EINVAL; 150 rc = -EINVAL;
150 ecryptfs_printk(KERN_WARNING, 151 ecryptfs_printk(KERN_WARNING,
151 "Unsupported packet size: [%zd]\n", size); 152 "Unsupported packet size: [%zd]\n", size);
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 940a82e63dc3..349209dc6a91 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -218,6 +218,29 @@ out_unlock:
218 return rc; 218 return rc;
219} 219}
220 220
221/*
222 * miscdevfs packet format:
223 * Octet 0: Type
224 * Octets 1-4: network byte order msg_ctx->counter
225 * Octets 5-N0: Size of struct ecryptfs_message to follow
226 * Octets N0-N1: struct ecryptfs_message (including data)
227 *
228 * Octets 5-N1 not written if the packet type does not include a message
229 */
230#define PKT_TYPE_SIZE 1
231#define PKT_CTR_SIZE 4
232#define MIN_NON_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE)
233#define MIN_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
234 + ECRYPTFS_MIN_PKT_LEN_SIZE)
235/* 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES comes from tag 65 packet format */
236#define MAX_MSG_PKT_SIZE (PKT_TYPE_SIZE + PKT_CTR_SIZE \
237 + ECRYPTFS_MAX_PKT_LEN_SIZE \
238 + sizeof(struct ecryptfs_message) \
239 + 4 + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES)
240#define PKT_TYPE_OFFSET 0
241#define PKT_CTR_OFFSET PKT_TYPE_SIZE
242#define PKT_LEN_OFFSET (PKT_TYPE_SIZE + PKT_CTR_SIZE)
243
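
With the format spelled out once, the offsets become named constants instead of bare 1s and 4s scattered through read and write. From userspace, the daemon side of the protocol would assemble the same fixed header; a hypothetical helper (the daemon itself is not part of this diff):

	#include <arpa/inet.h>
	#include <stdint.h>
	#include <string.h>

	static size_t build_pkt_header(unsigned char *buf, uint8_t type,
				       uint32_t counter)
	{
		uint32_t ctr_nbo = htonl(counter); /* octets 1-4, big-endian */

		buf[0] = type;                     /* octet 0 */
		memcpy(&buf[1], &ctr_nbo, sizeof(ctr_nbo));
		return 5;                          /* == PKT_LEN_OFFSET */
	}
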
221/** 244/**
222 * ecryptfs_miscdev_read - format and send message from queue 245 * ecryptfs_miscdev_read - format and send message from queue
223 * @file: fs/ecryptfs/euid miscdevfs handle (ignored) 246 * @file: fs/ecryptfs/euid miscdevfs handle (ignored)
@@ -237,7 +260,7 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
237 struct ecryptfs_daemon *daemon; 260 struct ecryptfs_daemon *daemon;
238 struct ecryptfs_msg_ctx *msg_ctx; 261 struct ecryptfs_msg_ctx *msg_ctx;
239 size_t packet_length_size; 262 size_t packet_length_size;
240 char packet_length[3]; 263 char packet_length[ECRYPTFS_MAX_PKT_LEN_SIZE];
241 size_t i; 264 size_t i;
242 size_t total_length; 265 size_t total_length;
243 uid_t euid = current_euid(); 266 uid_t euid = current_euid();
@@ -305,15 +328,8 @@ check_list:
305 packet_length_size = 0; 328 packet_length_size = 0;
306 msg_ctx->msg_size = 0; 329 msg_ctx->msg_size = 0;
307 } 330 }
308 /* miscdevfs packet format: 331 total_length = (PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_length_size
309 * Octet 0: Type 332 + msg_ctx->msg_size);
310 * Octets 1-4: network byte order msg_ctx->counter
311 * Octets 5-N0: Size of struct ecryptfs_message to follow
312 * Octets N0-N1: struct ecryptfs_message (including data)
313 *
314 * Octets 5-N1 not written if the packet type does not
315 * include a message */
316 total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size);
317 if (count < total_length) { 333 if (count < total_length) {
318 rc = 0; 334 rc = 0;
319 printk(KERN_WARNING "%s: Only given user buffer of " 335 printk(KERN_WARNING "%s: Only given user buffer of "
@@ -324,9 +340,10 @@ check_list:
324 rc = -EFAULT; 340 rc = -EFAULT;
325 if (put_user(msg_ctx->type, buf)) 341 if (put_user(msg_ctx->type, buf))
326 goto out_unlock_msg_ctx; 342 goto out_unlock_msg_ctx;
327 if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1))) 343 if (put_user(cpu_to_be32(msg_ctx->counter),
344 (__be32 __user *)(&buf[PKT_CTR_OFFSET])))
328 goto out_unlock_msg_ctx; 345 goto out_unlock_msg_ctx;
329 i = 5; 346 i = PKT_TYPE_SIZE + PKT_CTR_SIZE;
330 if (msg_ctx->msg) { 347 if (msg_ctx->msg) {
331 if (copy_to_user(&buf[i], packet_length, packet_length_size)) 348 if (copy_to_user(&buf[i], packet_length, packet_length_size))
332 goto out_unlock_msg_ctx; 349 goto out_unlock_msg_ctx;
@@ -391,12 +408,6 @@ out:
391 * @count: Amount of data in @buf 408 * @count: Amount of data in @buf
392 * @ppos: Pointer to offset in file (ignored) 409 * @ppos: Pointer to offset in file (ignored)
393 * 410 *
394 * miscdevfs packet format:
395 * Octet 0: Type
396 * Octets 1-4: network byte order msg_ctx->counter (0's for non-response)
397 * Octets 5-N0: Size of struct ecryptfs_message to follow
398 * Octets N0-N1: struct ecryptfs_message (including data)
399 *
400 * Returns the number of bytes read from @buf 411 * Returns the number of bytes read from @buf
401 */ 412 */
402static ssize_t 413static ssize_t
@@ -405,60 +416,78 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
405{ 416{
406 __be32 counter_nbo; 417 __be32 counter_nbo;
407 u32 seq; 418 u32 seq;
408 size_t packet_size, packet_size_length, i; 419 size_t packet_size, packet_size_length;
409 ssize_t sz = 0;
410 char *data; 420 char *data;
411 uid_t euid = current_euid(); 421 uid_t euid = current_euid();
412 int rc; 422 unsigned char packet_size_peek[ECRYPTFS_MAX_PKT_LEN_SIZE];
423 ssize_t rc;
413 424
414 if (count == 0) 425 if (count == 0) {
415 goto out; 426 return 0;
427 } else if (count == MIN_NON_MSG_PKT_SIZE) {
428 /* Likely a harmless MSG_HELO or MSG_QUIT - no packet length */
429 goto memdup;
430 } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) {
431 printk(KERN_WARNING "%s: Acceptable packet size range is "
432 "[%d-%lu], but amount of data written is [%zu].",
433 __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count);
434 return -EINVAL;
435 }
436
437 if (copy_from_user(packet_size_peek, &buf[PKT_LEN_OFFSET],
438 sizeof(packet_size_peek))) {
439 printk(KERN_WARNING "%s: Error while inspecting packet size\n",
440 __func__);
441 return -EFAULT;
442 }
416 443
444 rc = ecryptfs_parse_packet_length(packet_size_peek, &packet_size,
445 &packet_size_length);
446 if (rc) {
447 printk(KERN_WARNING "%s: Error parsing packet length; "
448 "rc = [%zd]\n", __func__, rc);
449 return rc;
450 }
451
452 if ((PKT_TYPE_SIZE + PKT_CTR_SIZE + packet_size_length + packet_size)
453 != count) {
454 printk(KERN_WARNING "%s: Invalid packet size [%zu]\n", __func__,
455 packet_size);
456 return -EINVAL;
457 }
458
459memdup:
417 data = memdup_user(buf, count); 460 data = memdup_user(buf, count);
418 if (IS_ERR(data)) { 461 if (IS_ERR(data)) {
419 printk(KERN_ERR "%s: memdup_user returned error [%ld]\n", 462 printk(KERN_ERR "%s: memdup_user returned error [%ld]\n",
420 __func__, PTR_ERR(data)); 463 __func__, PTR_ERR(data));
421 goto out; 464 return PTR_ERR(data);
422 } 465 }
423 sz = count; 466 switch (data[PKT_TYPE_OFFSET]) {
424 i = 0;
425 switch (data[i++]) {
426 case ECRYPTFS_MSG_RESPONSE: 467 case ECRYPTFS_MSG_RESPONSE:
427 if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { 468 if (count < (MIN_MSG_PKT_SIZE
469 + sizeof(struct ecryptfs_message))) {
428 printk(KERN_WARNING "%s: Minimum acceptable packet " 470 printk(KERN_WARNING "%s: Minimum acceptable packet "
429 "size is [%zd], but amount of data written is " 471 "size is [%zd], but amount of data written is "
430 "only [%zd]. Discarding response packet.\n", 472 "only [%zd]. Discarding response packet.\n",
431 __func__, 473 __func__,
432 (1 + 4 + 1 + sizeof(struct ecryptfs_message)), 474 (MIN_MSG_PKT_SIZE
433 count); 475 + sizeof(struct ecryptfs_message)), count);
476 rc = -EINVAL;
434 goto out_free; 477 goto out_free;
435 } 478 }
436 memcpy(&counter_nbo, &data[i], 4); 479 memcpy(&counter_nbo, &data[PKT_CTR_OFFSET], PKT_CTR_SIZE);
437 seq = be32_to_cpu(counter_nbo); 480 seq = be32_to_cpu(counter_nbo);
438 i += 4; 481 rc = ecryptfs_miscdev_response(
439 rc = ecryptfs_parse_packet_length(&data[i], &packet_size, 482 &data[PKT_LEN_OFFSET + packet_size_length],
440 &packet_size_length); 483 packet_size, euid, current_user_ns(),
484 task_pid(current), seq);
441 if (rc) { 485 if (rc) {
442 printk(KERN_WARNING "%s: Error parsing packet length; "
443 "rc = [%d]\n", __func__, rc);
444 goto out_free;
445 }
446 i += packet_size_length;
447 if ((1 + 4 + packet_size_length + packet_size) != count) {
448 printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])"
449 " + packet_size([%zd]))([%zd]) != "
450 "count([%zd]). Invalid packet format.\n",
451 __func__, packet_size_length, packet_size,
452 (1 + packet_size_length + packet_size), count);
453 goto out_free;
454 }
455 rc = ecryptfs_miscdev_response(&data[i], packet_size,
456 euid, current_user_ns(),
457 task_pid(current), seq);
458 if (rc)
459 printk(KERN_WARNING "%s: Failed to deliver miscdev " 486 printk(KERN_WARNING "%s: Failed to deliver miscdev "
460 "response to requesting operation; rc = [%d]\n", 487 "response to requesting operation; rc = [%zd]\n",
461 __func__, rc); 488 __func__, rc);
489 goto out_free;
490 }
462 break; 491 break;
463 case ECRYPTFS_MSG_HELO: 492 case ECRYPTFS_MSG_HELO:
464 case ECRYPTFS_MSG_QUIT: 493 case ECRYPTFS_MSG_QUIT:
@@ -467,12 +496,13 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
467 ecryptfs_printk(KERN_WARNING, "Dropping miscdev " 496 ecryptfs_printk(KERN_WARNING, "Dropping miscdev "
468 "message of unrecognized type [%d]\n", 497 "message of unrecognized type [%d]\n",
469 data[0]); 498 data[0]);
470 break; 499 rc = -EINVAL;
500 goto out_free;
471 } 501 }
502 rc = count;
472out_free: 503out_free:
473 kfree(data); 504 kfree(data);
474out: 505 return rc;
475 return sz;
476} 506}
477 507
478 508
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 6a44148c5fb9..10ec695ccd68 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -57,6 +57,10 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
57 * @page: Page that is locked before this call is made 57 * @page: Page that is locked before this call is made
58 * 58 *
59 * Returns zero on success; non-zero otherwise 59 * Returns zero on success; non-zero otherwise
60 *
61 * This is where we encrypt the data and pass the encrypted data to
62 * the lower filesystem. In OpenPGP-compatible mode, we operate on
63 * entire underlying packets.
60 */ 64 */
61static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc) 65static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
62{ 66{
@@ -481,10 +485,6 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode)
481 * @copied: The amount of data copied 485 * @copied: The amount of data copied
482 * @page: The eCryptfs page 486 * @page: The eCryptfs page
483 * @fsdata: The fsdata (unused) 487 * @fsdata: The fsdata (unused)
484 *
485 * This is where we encrypt the data and pass the encrypted data to
486 * the lower filesystem. In OpenPGP-compatible mode, we operate on
487 * entire underlying packets.
488 */ 488 */
489static int ecryptfs_write_end(struct file *file, 489static int ecryptfs_write_end(struct file *file,
490 struct address_space *mapping, 490 struct address_space *mapping,
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3745f7c2b9c2..5c0106f75775 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -130,13 +130,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
130 pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT); 130 pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
131 size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK); 131 size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
132 size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page); 132 size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
133 size_t total_remaining_bytes = ((offset + size) - pos); 133 loff_t total_remaining_bytes = ((offset + size) - pos);
134
135 if (fatal_signal_pending(current)) {
136 rc = -EINTR;
137 break;
138 }
134 139
135 if (num_bytes > total_remaining_bytes) 140 if (num_bytes > total_remaining_bytes)
136 num_bytes = total_remaining_bytes; 141 num_bytes = total_remaining_bytes;
137 if (pos < offset) { 142 if (pos < offset) {
138 /* remaining zeros to write, up to destination offset */ 143 /* remaining zeros to write, up to destination offset */
139 size_t total_remaining_zeros = (offset - pos); 144 loff_t total_remaining_zeros = (offset - pos);
140 145
141 if (num_bytes > total_remaining_zeros) 146 if (num_bytes > total_remaining_zeros)
142 num_bytes = total_remaining_zeros; 147 num_bytes = total_remaining_zeros;
@@ -193,15 +198,19 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
193 } 198 }
194 pos += num_bytes; 199 pos += num_bytes;
195 } 200 }
196 if ((offset + size) > ecryptfs_file_size) { 201 if (pos > ecryptfs_file_size) {
197 i_size_write(ecryptfs_inode, (offset + size)); 202 i_size_write(ecryptfs_inode, pos);
198 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) { 203 if (crypt_stat->flags & ECRYPTFS_ENCRYPTED) {
199 rc = ecryptfs_write_inode_size_to_metadata( 204 int rc2;
205
206 rc2 = ecryptfs_write_inode_size_to_metadata(
200 ecryptfs_inode); 207 ecryptfs_inode);
201 if (rc) { 208 if (rc2) {
202 printk(KERN_ERR "Problem with " 209 printk(KERN_ERR "Problem with "
203 "ecryptfs_write_inode_size_to_metadata; " 210 "ecryptfs_write_inode_size_to_metadata; "
204 "rc = [%d]\n", rc); 211 "rc = [%d]\n", rc2);
212 if (!rc)
213 rc = rc2;
205 goto out; 214 goto out;
206 } 215 }
207 } 216 }
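
The rc2 dance preserves the first error: a failed metadata update is logged and reported, but only promoted into rc when the data write itself succeeded. The idiom in isolation (write_data/write_metadata are hypothetical stand-ins):

	static int do_both(void)
	{
		int rc = write_data();		/* primary result */
		int rc2 = write_metadata();	/* secondary step */

		if (rc2 && !rc)
			rc = rc2;	/* report it without clobbering rc */
		return rc;
	}
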
@@ -273,76 +282,3 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
273 flush_dcache_page(page_for_ecryptfs); 282 flush_dcache_page(page_for_ecryptfs);
274 return rc; 283 return rc;
275} 284}
276
277#if 0
278/**
279 * ecryptfs_read
280 * @data: The virtual address into which to write the data read (and
281 * possibly decrypted) from the lower file
282 * @offset: The offset in the decrypted view of the file from which to
283 * read into @data
284 * @size: The number of bytes to read into @data
285 * @ecryptfs_file: The eCryptfs file from which to read
286 *
287 * Read an arbitrary amount of data from an arbitrary location in the
288 * eCryptfs page cache. This is done on an extent-by-extent basis;
289 * individual extents are decrypted and read from the lower page
290 * cache (via VFS reads). This function takes care of all the
291 * address translation to locations in the lower filesystem.
292 *
293 * Returns zero on success; non-zero otherwise
294 */
295int ecryptfs_read(char *data, loff_t offset, size_t size,
296 struct file *ecryptfs_file)
297{
298 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
299 struct page *ecryptfs_page;
300 char *ecryptfs_page_virt;
301 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
302 loff_t data_offset = 0;
303 loff_t pos;
304 int rc = 0;
305
306 if ((offset + size) > ecryptfs_file_size) {
307 rc = -EINVAL;
308 printk(KERN_ERR "%s: Attempt to read data past the end of the "
309 "file; offset = [%lld]; size = [%td]; "
310 "ecryptfs_file_size = [%lld]\n",
311 __func__, offset, size, ecryptfs_file_size);
312 goto out;
313 }
314 pos = offset;
315 while (pos < (offset + size)) {
316 pgoff_t ecryptfs_page_idx = (pos >> PAGE_CACHE_SHIFT);
317 size_t start_offset_in_page = (pos & ~PAGE_CACHE_MASK);
318 size_t num_bytes = (PAGE_CACHE_SIZE - start_offset_in_page);
319 size_t total_remaining_bytes = ((offset + size) - pos);
320
321 if (num_bytes > total_remaining_bytes)
322 num_bytes = total_remaining_bytes;
323 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
324 ecryptfs_page_idx);
325 if (IS_ERR(ecryptfs_page)) {
326 rc = PTR_ERR(ecryptfs_page);
327 printk(KERN_ERR "%s: Error getting page at "
328 "index [%ld] from eCryptfs inode "
329 "mapping; rc = [%d]\n", __func__,
330 ecryptfs_page_idx, rc);
331 goto out;
332 }
333 ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
334 memcpy((data + data_offset),
335 ((char *)ecryptfs_page_virt + start_offset_in_page),
336 num_bytes);
337 kunmap_atomic(ecryptfs_page_virt, KM_USER0);
338 flush_dcache_page(ecryptfs_page);
339 SetPageUptodate(ecryptfs_page);
340 unlock_page(ecryptfs_page);
341 page_cache_release(ecryptfs_page);
342 pos += num_bytes;
343 data_offset += num_bytes;
344 }
345out:
346 return rc;
347}
348#endif /* 0 */
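
The hunk above makes three related fixes to ecryptfs_write(): the remaining-byte counts become loff_t so the arithmetic cannot be truncated through a 32-bit size_t, the copy loop bails out with -EINTR when a fatal signal is pending, and the inode size is updated from pos (what was actually written) rather than offset + size. A minimal userspace sketch of the same loop shape follows; write_with_hole and its arguments are hypothetical, and the in-kernel fatal_signal_pending() bailout has no analogue here.

#include <stdint.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Copy "size" bytes to "offset", zero-filling the hole from "pos" up to
 * "offset" first. All remaining-byte math uses int64_t (mirroring loff_t)
 * so "(offset + size) - pos" cannot be truncated on a 32-bit build.
 */
static void write_with_hole(unsigned char *dst, const unsigned char *data,
			    int64_t offset, int64_t size, int64_t pos)
{
	while (pos < offset + size) {
		int64_t in_page = pos % PAGE_SIZE;
		int64_t num_bytes = PAGE_SIZE - in_page;
		int64_t remaining = (offset + size) - pos;

		if (num_bytes > remaining)
			num_bytes = remaining;
		if (pos < offset) {
			/* still inside the hole: emit zeros up to offset */
			int64_t zeros = offset - pos;

			if (num_bytes > zeros)
				num_bytes = zeros;
			memset(dst + pos, 0, num_bytes);
		} else {
			memcpy(dst + pos, data + (pos - offset), num_bytes);
		}
		pos += num_bytes;
	}
}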
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 828e750af23a..aabdfc38cf24 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -197,6 +197,12 @@ struct eventpoll {
197 197
198 /* The user that created the eventpoll descriptor */ 198 /* The user that created the eventpoll descriptor */
199 struct user_struct *user; 199 struct user_struct *user;
200
201 struct file *file;
202
203 /* used to optimize loop detection check */
204 int visited;
205 struct list_head visited_list_link;
200}; 206};
201 207
202/* Wait structure used by the poll hooks */ 208/* Wait structure used by the poll hooks */
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly;
255/* Slab cache used to allocate "struct eppoll_entry" */ 261/* Slab cache used to allocate "struct eppoll_entry" */
256static struct kmem_cache *pwq_cache __read_mostly; 262static struct kmem_cache *pwq_cache __read_mostly;
257 263
264/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
265static LIST_HEAD(visited_list);
266
267/*
268 * List of files with newly added links, where we may need to limit the number
269 * of emanating paths. Protected by the epmutex.
270 */
271static LIST_HEAD(tfile_check_list);
272
258#ifdef CONFIG_SYSCTL 273#ifdef CONFIG_SYSCTL
259 274
260#include <linux/sysctl.h> 275#include <linux/sysctl.h>
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = {
276}; 291};
277#endif /* CONFIG_SYSCTL */ 292#endif /* CONFIG_SYSCTL */
278 293
294static const struct file_operations eventpoll_fops;
295
296static inline int is_file_epoll(struct file *f)
297{
298 return f->f_op == &eventpoll_fops;
299}
279 300
280/* Setup the structure that is used as key for the RB tree */ 301/* Setup the structure that is used as key for the RB tree */
281static inline void ep_set_ffd(struct epoll_filefd *ffd, 302static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = {
711 .llseek = noop_llseek, 732 .llseek = noop_llseek,
712}; 733};
713 734
714/* Fast test to see if the file is an eventpoll file */
715static inline int is_file_epoll(struct file *f)
716{
717 return f->f_op == &eventpoll_fops;
718}
719
720/* 735/*
721 * This is called from eventpoll_release() to unlink files from the eventpoll 736 * This is called from eventpoll_release() to unlink files from the eventpoll
722 * interface. We need to have this facility to cleanup correctly files that are 737 * interface. We need to have this facility to cleanup correctly files that are
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
926 rb_insert_color(&epi->rbn, &ep->rbr); 941 rb_insert_color(&epi->rbn, &ep->rbr);
927} 942}
928 943
944
945
946#define PATH_ARR_SIZE 5
947/*
948 * These are the number paths of length 1 to 5, that we are allowing to emanate
949 * from a single file of interest. For example, we allow 1000 paths of length
950 * 1, to emanate from each file of interest. This essentially represents the
951 * potential wakeup paths, which need to be limited in order to avoid massive
952 * uncontrolled wakeup storms. The common use case should be a single ep which
953 * is connected to n file sources. In this case each file source has 1 path
954 * of length 1. Thus, the numbers below should be more than sufficient. These
955 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
956 * and delete can't add additional paths. Protected by the epmutex.
957 */
958static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
959static int path_count[PATH_ARR_SIZE];
960
961static int path_count_inc(int nests)
962{
963 if (++path_count[nests] > path_limits[nests])
964 return -1;
965 return 0;
966}
967
968static void path_count_init(void)
969{
970 int i;
971
972 for (i = 0; i < PATH_ARR_SIZE; i++)
973 path_count[i] = 0;
974}
975
976static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
977{
978 int error = 0;
979 struct file *file = priv;
980 struct file *child_file;
981 struct epitem *epi;
982
983 list_for_each_entry(epi, &file->f_ep_links, fllink) {
984 child_file = epi->ep->file;
985 if (is_file_epoll(child_file)) {
986 if (list_empty(&child_file->f_ep_links)) {
987 if (path_count_inc(call_nests)) {
988 error = -1;
989 break;
990 }
991 } else {
992 error = ep_call_nested(&poll_loop_ncalls,
993 EP_MAX_NESTS,
994 reverse_path_check_proc,
995 child_file, child_file,
996 current);
997 }
998 if (error != 0)
999 break;
1000 } else {
1001 printk(KERN_ERR "reverse_path_check_proc: "
1002 "file is not an ep!\n");
1003 }
1004 }
1005 return error;
1006}
1007
1008/**
1009 * reverse_path_check - The tfile_check_list is a list of file *, which have
1010 * links that are proposed to be newly added. We need to
1011 * make sure that those added links don't add too many
1012 * paths such that we will spend all our time waking up
1013 * eventpoll objects.
1014 *
1015 * Returns: Returns zero if the proposed links don't create too many paths,
1016 * -1 otherwise.
1017 */
1018static int reverse_path_check(void)
1019{
1020 int length = 0;
1021 int error = 0;
1022 struct file *current_file;
1023
1024 /* let's call this for all tfiles */
1025 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1026 length++;
1027 path_count_init();
1028 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1029 reverse_path_check_proc, current_file,
1030 current_file, current);
1031 if (error)
1032 break;
1033 }
1034 return error;
1035}
1036
929/* 1037/*
930 * Must be called with "mtx" held. 1038 * Must be called with "mtx" held.
931 */ 1039 */
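
A small, runnable demonstration of the counting scheme above: path_count_inc() is called once per discovered wakeup path at nesting depth call_nests, and the add fails once the per-depth cap in path_limits[] is exceeded. The main() driver here is a hypothetical illustration, not kernel code.

#include <stdio.h>

#define PATH_ARR_SIZE 5
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

static int path_count_inc(int nests)
{
	if (++path_count[nests] > path_limits[nests])
		return -1;
	return 0;
}

int main(void)
{
	int n;

	/*
	 * Simulate 60 distinct wakeup paths of length 4 (index 3) reaching
	 * one target file: the 51st exceeds path_limits[3] == 50, and the
	 * EPOLL_CTL_ADD creating it would be rejected.
	 */
	for (n = 1; n <= 60; n++) {
		if (path_count_inc(3)) {
			printf("path %d rejected at depth 4\n", n);
			break;
		}
	}
	return 0;
}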
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
987 */ 1095 */
988 ep_rbtree_insert(ep, epi); 1096 ep_rbtree_insert(ep, epi);
989 1097
1098 /* now check if we've created too many backpaths */
1099 error = -EINVAL;
1100 if (reverse_path_check())
1101 goto error_remove_epi;
1102
990 /* We have to drop the new item inside our item list to keep track of it */ 1103 /* We have to drop the new item inside our item list to keep track of it */
991 spin_lock_irqsave(&ep->lock, flags); 1104 spin_lock_irqsave(&ep->lock, flags);
992 1105
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1011 1124
1012 return 0; 1125 return 0;
1013 1126
1127error_remove_epi:
1128 spin_lock(&tfile->f_lock);
1129 if (ep_is_linked(&epi->fllink))
1130 list_del_init(&epi->fllink);
1131 spin_unlock(&tfile->f_lock);
1132
1133 rb_erase(&epi->rbn, &ep->rbr);
1134
1014error_unregister: 1135error_unregister:
1015 ep_unregister_pollwait(ep, epi); 1136 ep_unregister_pollwait(ep, epi);
1016 1137
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1275 int error = 0; 1396 int error = 0;
1276 struct file *file = priv; 1397 struct file *file = priv;
1277 struct eventpoll *ep = file->private_data; 1398 struct eventpoll *ep = file->private_data;
1399 struct eventpoll *ep_tovisit;
1278 struct rb_node *rbp; 1400 struct rb_node *rbp;
1279 struct epitem *epi; 1401 struct epitem *epi;
1280 1402
1281 mutex_lock_nested(&ep->mtx, call_nests + 1); 1403 mutex_lock_nested(&ep->mtx, call_nests + 1);
1404 ep->visited = 1;
1405 list_add(&ep->visited_list_link, &visited_list);
1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1406 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1283 epi = rb_entry(rbp, struct epitem, rbn); 1407 epi = rb_entry(rbp, struct epitem, rbn);
1284 if (unlikely(is_file_epoll(epi->ffd.file))) { 1408 if (unlikely(is_file_epoll(epi->ffd.file))) {
1409 ep_tovisit = epi->ffd.file->private_data;
1410 if (ep_tovisit->visited)
1411 continue;
1285 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1412 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1286 ep_loop_check_proc, epi->ffd.file, 1413 ep_loop_check_proc, epi->ffd.file,
1287 epi->ffd.file->private_data, current); 1414 ep_tovisit, current);
1288 if (error != 0) 1415 if (error != 0)
1289 break; 1416 break;
1417 } else {
1418 /*
1419 * If we've reached a file that is not associated with
1420 * an ep, then we need to check if the newly added
1421 * links are going to add too many wakeup paths. We do
1422 * this by adding it to the tfile_check_list, if it's
1423 * not already there, and calling reverse_path_check()
1424 * during ep_insert().
1425 */
1426 if (list_empty(&epi->ffd.file->f_tfile_llink))
1427 list_add(&epi->ffd.file->f_tfile_llink,
1428 &tfile_check_list);
1290 } 1429 }
1291 } 1430 }
1292 mutex_unlock(&ep->mtx); 1431 mutex_unlock(&ep->mtx);
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1307 */ 1446 */
1308static int ep_loop_check(struct eventpoll *ep, struct file *file) 1447static int ep_loop_check(struct eventpoll *ep, struct file *file)
1309{ 1448{
1310 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1449 int ret;
1450 struct eventpoll *ep_cur, *ep_next;
1451
1452 ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1311 ep_loop_check_proc, file, ep, current); 1453 ep_loop_check_proc, file, ep, current);
1454 /* clear visited list */
1455 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1456 visited_list_link) {
1457 ep_cur->visited = 0;
1458 list_del(&ep_cur->visited_list_link);
1459 }
1460 return ret;
1461}
1462
1463static void clear_tfile_check_list(void)
1464{
1465 struct file *file;
1466
1467 /* first clear the tfile_check_list */
1468 while (!list_empty(&tfile_check_list)) {
1469 file = list_first_entry(&tfile_check_list, struct file,
1470 f_tfile_llink);
1471 list_del_init(&file->f_tfile_llink);
1472 }
1473 INIT_LIST_HEAD(&tfile_check_list);
1312} 1474}
1313 1475
1314/* 1476/*
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file)
1316 */ 1478 */
1317SYSCALL_DEFINE1(epoll_create1, int, flags) 1479SYSCALL_DEFINE1(epoll_create1, int, flags)
1318{ 1480{
1319 int error; 1481 int error, fd;
1320 struct eventpoll *ep = NULL; 1482 struct eventpoll *ep = NULL;
1483 struct file *file;
1321 1484
1322 /* Check the EPOLL_* constant for consistency. */ 1485 /* Check the EPOLL_* constant for consistency. */
1323 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1486 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1334 * Creates all the items needed to setup an eventpoll file. That is, 1497 * Creates all the items needed to setup an eventpoll file. That is,
1335 * a file structure and a free file descriptor. 1498 * a file structure and a free file descriptor.
1336 */ 1499 */
1337 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1500 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
1501 if (fd < 0) {
1502 error = fd;
1503 goto out_free_ep;
1504 }
1505 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1338 O_RDWR | (flags & O_CLOEXEC)); 1506 O_RDWR | (flags & O_CLOEXEC));
1339 if (error < 0) 1507 if (IS_ERR(file)) {
1340 ep_free(ep); 1508 error = PTR_ERR(file);
1341 1509 goto out_free_fd;
1510 }
1511 fd_install(fd, file);
1512 ep->file = file;
1513 return fd;
1514
1515out_free_fd:
1516 put_unused_fd(fd);
1517out_free_ep:
1518 ep_free(ep);
1342 return error; 1519 return error;
1343} 1520}
1344 1521
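
The rewritten epoll_create1() above splits anon_inode_getfd() into its component steps so that ep->file can be recorded before the descriptor is published. A hedged sketch of that reserve/create/publish idiom, kernel-style and illustrative only (example_create_fd and example_fops are placeholder names, not eventpoll ones):

static const struct file_operations example_fops;

static int example_create_fd(void *priv, int flags)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
	if (fd < 0)
		return fd;
	file = anon_inode_getfile("[example]", &example_fops, priv,
				  O_RDWR | (flags & O_CLOEXEC));
	if (IS_ERR(file)) {
		put_unused_fd(fd);	/* fd was never visible to userspace */
		return PTR_ERR(file);
	}
	/* Any setup that must see the file happens here, before publishing. */
	fd_install(fd, file);		/* point of no return: the fd is live */
	return fd;
}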
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1404 /* 1581 /*
1405 * When we insert an epoll file descriptor, inside another epoll file 1582 * When we insert an epoll file descriptor, inside another epoll file
1406 * descriptor, there is the chance of creating closed loops, which are 1583 * descriptor, there is the chance of creating closed loops, which are
1407 * better be handled here, than in more critical paths. 1584 * better be handled here, than in more critical paths. While we are
1585 * checking for loops we also determine the list of files reachable
1586 * and hang them on the tfile_check_list, so we can check that we
1587 * haven't created too many possible wakeup paths.
1408 * 1588 *
1409 * We hold epmutex across the loop check and the insert in this case, in 1589 * We need to hold the epmutex across both ep_insert and ep_remove
1410 * order to prevent two separate inserts from racing and each doing the 1590 * because we want to make sure we are looking at a coherent view of
1411 * insert "at the same time" such that ep_loop_check passes on both 1591 * the epoll network.
1412 * before either one does the insert, thereby creating a cycle.
1413 */ 1592 */
1414 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { 1593 if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
1415 mutex_lock(&epmutex); 1594 mutex_lock(&epmutex);
1416 did_lock_epmutex = 1; 1595 did_lock_epmutex = 1;
1417 error = -ELOOP;
1418 if (ep_loop_check(ep, tfile) != 0)
1419 goto error_tgt_fput;
1420 } 1596 }
1421 1597 if (op == EPOLL_CTL_ADD) {
1598 if (is_file_epoll(tfile)) {
1599 error = -ELOOP;
1600 if (ep_loop_check(ep, tfile) != 0)
1601 goto error_tgt_fput;
1602 } else
1603 list_add(&tfile->f_tfile_llink, &tfile_check_list);
1604 }
1422 1605
1423 mutex_lock_nested(&ep->mtx, 0); 1606 mutex_lock_nested(&ep->mtx, 0);
1424 1607
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1437 error = ep_insert(ep, &epds, tfile, fd); 1620 error = ep_insert(ep, &epds, tfile, fd);
1438 } else 1621 } else
1439 error = -EEXIST; 1622 error = -EEXIST;
1623 clear_tfile_check_list();
1440 break; 1624 break;
1441 case EPOLL_CTL_DEL: 1625 case EPOLL_CTL_DEL:
1442 if (epi) 1626 if (epi)
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1455 mutex_unlock(&ep->mtx); 1639 mutex_unlock(&ep->mtx);
1456 1640
1457error_tgt_fput: 1641error_tgt_fput:
1458 if (unlikely(did_lock_epmutex)) 1642 if (did_lock_epmutex)
1459 mutex_unlock(&epmutex); 1643 mutex_unlock(&epmutex);
1460 1644
1461 fput(tfile); 1645 fput(tfile);
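
Seen from userspace, the ep_loop_check() path above is what turns a circular EPOLL_CTL_ADD into -ELOOP. A short, runnable demonstration, assuming a kernel with these checks applied:

#include <errno.h>
#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
	int a = epoll_create1(0);
	int b = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	/* a watches b: a plain nested epoll, allowed */
	epoll_ctl(a, EPOLL_CTL_ADD, b, &ev);
	/* b watching a would close the cycle a -> b -> a */
	if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev) < 0 && errno == ELOOP)
		printf("circular add rejected with ELOOP\n");
	return 0;
}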
diff --git a/fs/exec.c b/fs/exec.c
index 3f64b9f26e7d..aeb135c7ff5c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -59,6 +59,8 @@
59#include <asm/uaccess.h> 59#include <asm/uaccess.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61#include <asm/tlb.h> 61#include <asm/tlb.h>
62
63#include <trace/events/task.h>
62#include "internal.h" 64#include "internal.h"
63 65
64int core_uses_pid; 66int core_uses_pid;
@@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf)
1054{ 1056{
1055 task_lock(tsk); 1057 task_lock(tsk);
1056 1058
1059 trace_task_rename(tsk, buf);
1060
1057 /* 1061 /*
1058 * Threads may access current->comm without holding 1062 * Threads may access current->comm without holding
1059 * the task lock, so write the string carefully. 1063 * the task lock, so write the string carefully.
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
index da42f32c49be..86194b2f799d 100644
--- a/fs/exofs/Kconfig
+++ b/fs/exofs/Kconfig
@@ -1,14 +1,3 @@
1# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
2# for every ORE user we do it like this. Any user should add itself here
3# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
4# selected here, and we default to "ON". So in effect it is like been
5# selected by any of the users.
6config ORE
7 tristate
8 depends on EXOFS_FS || PNFS_OBJLAYOUT
9 select ASYNC_XOR
10 default SCSI_OSD_ULD
11
12config EXOFS_FS 1config EXOFS_FS
13 tristate "exofs: OSD based file system support" 2 tristate "exofs: OSD based file system support"
14 depends on SCSI_OSD_ULD 3 depends on SCSI_OSD_ULD
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
new file mode 100644
index 000000000000..1ca7fb7b6ba8
--- /dev/null
+++ b/fs/exofs/Kconfig.ore
@@ -0,0 +1,12 @@
1# ORE - Objects Raid Engine (libore.ko)
2#
3# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects
4# for every ORE user we do it like this. Any user should add itself here
5# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are
6# selected here, and we default to "ON". So in effect it is like been
7# selected by any of the users.
8config ORE
9 tristate
10 depends on EXOFS_FS || PNFS_OBJLAYOUT
11 select ASYNC_XOR
12 default SCSI_OSD_ULD
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index d271ad837202..49cf230554a2 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -266,7 +266,7 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
266 266
267 /* first/last seg is split */ 267 /* first/last seg is split */
268 num_raid_units += layout->group_width; 268 num_raid_units += layout->group_width;
269 sgs_per_dev = div_u64(num_raid_units, data_devs); 269 sgs_per_dev = div_u64(num_raid_units, data_devs) + 2;
270 } else { 270 } else {
271 /* For Writes add parity pages array. */ 271 /* For Writes add parity pages array. */
272 max_par_pages = num_raid_units * pages_in_unit * 272 max_par_pages = num_raid_units * pages_in_unit *
@@ -445,10 +445,10 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
445 u64 residual = ios->reading ? 445 u64 residual = ios->reading ?
446 or->in.residual : or->out.residual; 446 or->in.residual : or->out.residual;
447 u64 offset = (ios->offset + ios->length) - residual; 447 u64 offset = (ios->offset + ios->length) - residual;
448 struct ore_dev *od = ios->oc->ods[ 448 unsigned dev = per_dev->dev - ios->oc->first_dev;
449 per_dev->dev - ios->oc->first_dev]; 449 struct ore_dev *od = ios->oc->ods[dev];
450 450
451 on_dev_error(ios, od, per_dev->dev, osi.osd_err_pri, 451 on_dev_error(ios, od, dev, osi.osd_err_pri,
452 offset, residual); 452 offset, residual);
453 } 453 }
454 if (osi.osd_err_pri >= acumulated_osd_err) { 454 if (osi.osd_err_pri >= acumulated_osd_err) {
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 29c47e5c4a86..d222c77cfa1b 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -328,8 +328,8 @@ static int _alloc_read_4_write(struct ore_io_state *ios)
328/* @si contains info of the to-be-inserted page. Update of @si should be 328/* @si contains info of the to-be-inserted page. Update of @si should be
329 * maintained by caller. Specifically si->dev, si->obj_offset, ... 329 * maintained by caller. Specifically si->dev, si->obj_offset, ...
330 */ 330 */
331static int _add_to_read_4_write(struct ore_io_state *ios, 331static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si,
332 struct ore_striping_info *si, struct page *page) 332 struct page *page, unsigned pg_len)
333{ 333{
334 struct request_queue *q; 334 struct request_queue *q;
335 struct ore_per_dev_state *per_dev; 335 struct ore_per_dev_state *per_dev;
@@ -366,17 +366,60 @@ static int _add_to_read_4_write(struct ore_io_state *ios,
366 _ore_add_sg_seg(per_dev, gap, true); 366 _ore_add_sg_seg(per_dev, gap, true);
367 } 367 }
368 q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); 368 q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev));
369 added_len = bio_add_pc_page(q, per_dev->bio, page, PAGE_SIZE, 0); 369 added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len,
370 if (unlikely(added_len != PAGE_SIZE)) { 370 si->obj_offset % PAGE_SIZE);
371 if (unlikely(added_len != pg_len)) {
371 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", 372 ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n",
372 per_dev->bio->bi_vcnt); 373 per_dev->bio->bi_vcnt);
373 return -ENOMEM; 374 return -ENOMEM;
374 } 375 }
375 376
376 per_dev->length += PAGE_SIZE; 377 per_dev->length += pg_len;
377 return 0; 378 return 0;
378} 379}
379 380
381/* read the beginning of an unaligned first page */
382static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page)
383{
384 struct ore_striping_info si;
385 unsigned pg_len;
386
387 ore_calc_stripe_info(ios->layout, ios->offset, 0, &si);
388
389 pg_len = si.obj_offset % PAGE_SIZE;
390 si.obj_offset -= pg_len;
391
392 ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n",
393 _LLU(si.obj_offset), pg_len, page->index, si.dev);
394
395 return _add_to_r4w(ios, &si, page, pg_len);
396}
397
398/* read the end of an incomplete last page */
399static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
400{
401 struct ore_striping_info si;
402 struct page *page;
403 unsigned pg_len, p, c;
404
405 ore_calc_stripe_info(ios->layout, *offset, 0, &si);
406
407 p = si.unit_off / PAGE_SIZE;
408 c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
409 ios->layout->mirrors_p1, si.par_dev, si.dev);
410 page = ios->sp2d->_1p_stripes[p].pages[c];
411
412 pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
413 *offset += pg_len;
414
415 ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n",
416 p, c, _LLU(*offset), pg_len, si.dev, si.par_dev);
417
418 BUG_ON(!page);
419
420 return _add_to_r4w(ios, &si, page, pg_len);
421}
422
380static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) 423static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
381{ 424{
382 struct bio_vec *bv; 425 struct bio_vec *bv;
@@ -444,9 +487,13 @@ static int _read_4_write(struct ore_io_state *ios)
444 struct page **pp = &_1ps->pages[c]; 487 struct page **pp = &_1ps->pages[c];
445 bool uptodate; 488 bool uptodate;
446 489
447 if (*pp) 490 if (*pp) {
491 if (ios->offset % PAGE_SIZE)
492 /* Read the remainder of the page */
493 _add_to_r4w_first_page(ios, *pp);
448 /* to-be-written pages start here */ 494 /* to-be-written pages start here */
449 goto read_last_stripe; 495 goto read_last_stripe;
496 }
450 497
451 *pp = ios->r4w->get_page(ios->private, offset, 498 *pp = ios->r4w->get_page(ios->private, offset,
452 &uptodate); 499 &uptodate);
@@ -454,7 +501,7 @@ static int _read_4_write(struct ore_io_state *ios)
454 return -ENOMEM; 501 return -ENOMEM;
455 502
456 if (!uptodate) 503 if (!uptodate)
457 _add_to_read_4_write(ios, &read_si, *pp); 504 _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE);
458 505
459 /* Mark read-pages to be cache_released */ 506 /* Mark read-pages to be cache_released */
460 _1ps->page_is_read[c] = true; 507 _1ps->page_is_read[c] = true;
@@ -465,8 +512,11 @@ static int _read_4_write(struct ore_io_state *ios)
465 } 512 }
466 513
467read_last_stripe: 514read_last_stripe:
468 offset = ios->offset + (ios->length + PAGE_SIZE - 1) / 515 offset = ios->offset + ios->length;
469 PAGE_SIZE * PAGE_SIZE; 516 if (offset % PAGE_SIZE)
517 _add_to_r4w_last_page(ios, &offset);
518 /* offset will be aligned to next page */
519
470 last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) 520 last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe)
471 * bytes_in_stripe; 521 * bytes_in_stripe;
472 if (offset == last_stripe_end) /* Optimize for the aligned case */ 522 if (offset == last_stripe_end) /* Optimize for the aligned case */
@@ -503,7 +553,7 @@ read_last_stripe:
503 /* Mark read-pages to be cache_released */ 553 /* Mark read-pages to be cache_released */
504 _1ps->page_is_read[c] = true; 554 _1ps->page_is_read[c] = true;
505 if (!uptodate) 555 if (!uptodate)
506 _add_to_read_4_write(ios, &read_si, page); 556 _add_to_r4w(ios, &read_si, page, PAGE_SIZE);
507 } 557 }
508 558
509 offset += PAGE_SIZE; 559 offset += PAGE_SIZE;
@@ -551,7 +601,11 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
551 unsigned cur_len) 601 unsigned cur_len)
552{ 602{
553 if (ios->reading) { 603 if (ios->reading) {
554 BUG_ON(per_dev->cur_sg >= ios->sgs_per_dev); 604 if (per_dev->cur_sg >= ios->sgs_per_dev) {
605 ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" ,
606 per_dev->cur_sg, ios->sgs_per_dev);
607 return -ENOMEM;
608 }
555 _ore_add_sg_seg(per_dev, cur_len, true); 609 _ore_add_sg_seg(per_dev, cur_len, true);
556 } else { 610 } else {
557 struct __stripe_pages_2d *sp2d = ios->sp2d; 611 struct __stripe_pages_2d *sp2d = ios->sp2d;
@@ -612,8 +666,6 @@ int _ore_post_alloc_raid_stuff(struct ore_io_state *ios)
612 return -ENOMEM; 666 return -ENOMEM;
613 } 667 }
614 668
615 BUG_ON(ios->offset % PAGE_SIZE);
616
617 /* Round io down to last full strip */ 669 /* Round io down to last full strip */
618 first_stripe = div_u64(ios->offset, stripe_size); 670 first_stripe = div_u64(ios->offset, stripe_size);
619 last_stripe = div_u64(ios->offset + ios->length, stripe_size); 671 last_stripe = div_u64(ios->offset + ios->length, stripe_size);
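
The _add_to_r4w_first_page()/_add_to_r4w_last_page() helpers above read back only the unaligned fragments of the boundary pages before a parity read-modify-write: the head fragment is offset % PAGE_SIZE bytes, the tail fragment PAGE_SIZE - (offset % PAGE_SIZE). A tiny self-contained check of that arithmetic, with hypothetical I/O bounds:

#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096

int main(void)
{
	/* hypothetical unaligned I/O covering bytes [5000, 13000) */
	uint64_t start = 5000, end = 13000;

	uint64_t head = start % PAGE_SIZE;             /* 904 bytes before start */
	uint64_t tail = PAGE_SIZE - (end % PAGE_SIZE); /* 3384 bytes after end */

	assert(head == 904);
	assert(tail == 3384);
	/* reading those fragments makes both boundaries page aligned */
	assert((start - head) % PAGE_SIZE == 0);
	assert((end + tail) % PAGE_SIZE == 0);
	return 0;
}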
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8addfe314dc7..d22cd168c6ee 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -838,6 +838,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
838 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); 838 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
839 if (ret) { 839 if (ret) {
840 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); 840 EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
841 dput(sb->s_root);
842 sb->s_root = NULL;
841 goto free_sbi; 843 goto free_sbi;
842 } 844 }
843 845
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index cd7f5f424a75..8b15cf8cef37 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -573,8 +573,11 @@ got:
573 inode->i_generation = sbi->s_next_generation++; 573 inode->i_generation = sbi->s_next_generation++;
574 spin_unlock(&sbi->s_next_gen_lock); 574 spin_unlock(&sbi->s_next_gen_lock);
575 if (insert_inode_locked(inode) < 0) { 575 if (insert_inode_locked(inode) < 0) {
576 err = -EINVAL; 576 ext2_error(sb, "ext2_new_inode",
577 goto fail_drop; 577 "inode number already in use - inode=%lu",
578 (unsigned long) ino);
579 err = -EIO;
580 goto fail;
578 } 581 }
579 582
580 dquot_initialize(inode); 583 dquot_initialize(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 91a6945af6d8..740cad8dcd8d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,7 +26,6 @@
26#include <linux/highuid.h> 26#include <linux/highuid.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/quotaops.h> 28#include <linux/quotaops.h>
29#include <linux/module.h>
30#include <linux/writeback.h> 29#include <linux/writeback.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/mpage.h> 31#include <linux/mpage.h>
@@ -36,10 +35,6 @@
36#include "acl.h" 35#include "acl.h"
37#include "xip.h" 36#include "xip.h"
38 37
39MODULE_AUTHOR("Remy Card and others");
40MODULE_DESCRIPTION("Second Extended Filesystem");
41MODULE_LICENSE("GPL");
42
43static int __ext2_write_inode(struct inode *inode, int do_sync); 38static int __ext2_write_inode(struct inode *inode, int do_sync);
44 39
45/* 40/*
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 1089f760c847..2de655f5d625 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -77,10 +77,11 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
77 flags = flags & EXT2_FL_USER_MODIFIABLE; 77 flags = flags & EXT2_FL_USER_MODIFIABLE;
78 flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE; 78 flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
79 ei->i_flags = flags; 79 ei->i_flags = flags;
80 mutex_unlock(&inode->i_mutex);
81 80
82 ext2_set_inode_flags(inode); 81 ext2_set_inode_flags(inode);
83 inode->i_ctime = CURRENT_TIME_SEC; 82 inode->i_ctime = CURRENT_TIME_SEC;
83 mutex_unlock(&inode->i_mutex);
84
84 mark_inode_dirty(inode); 85 mark_inode_dirty(inode);
85setflags_out: 86setflags_out:
86 mnt_drop_write_file(filp); 87 mnt_drop_write_file(filp);
@@ -88,20 +89,29 @@ setflags_out:
88 } 89 }
89 case EXT2_IOC_GETVERSION: 90 case EXT2_IOC_GETVERSION:
90 return put_user(inode->i_generation, (int __user *) arg); 91 return put_user(inode->i_generation, (int __user *) arg);
91 case EXT2_IOC_SETVERSION: 92 case EXT2_IOC_SETVERSION: {
93 __u32 generation;
94
92 if (!inode_owner_or_capable(inode)) 95 if (!inode_owner_or_capable(inode))
93 return -EPERM; 96 return -EPERM;
94 ret = mnt_want_write_file(filp); 97 ret = mnt_want_write_file(filp);
95 if (ret) 98 if (ret)
96 return ret; 99 return ret;
97 if (get_user(inode->i_generation, (int __user *) arg)) { 100 if (get_user(generation, (int __user *) arg)) {
98 ret = -EFAULT; 101 ret = -EFAULT;
99 } else { 102 goto setversion_out;
100 inode->i_ctime = CURRENT_TIME_SEC;
101 mark_inode_dirty(inode);
102 } 103 }
104
105 mutex_lock(&inode->i_mutex);
106 inode->i_ctime = CURRENT_TIME_SEC;
107 inode->i_generation = generation;
108 mutex_unlock(&inode->i_mutex);
109
110 mark_inode_dirty(inode);
111setversion_out:
103 mnt_drop_write_file(filp); 112 mnt_drop_write_file(filp);
104 return ret; 113 return ret;
114 }
105 case EXT2_IOC_GETRSVSZ: 115 case EXT2_IOC_GETRSVSZ:
106 if (test_opt(inode->i_sb, RESERVATION) 116 if (test_opt(inode->i_sb, RESERVATION)
107 && S_ISREG(inode->i_mode) 117 && S_ISREG(inode->i_mode)
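
The EXT2_IOC_SETVERSION fix above copies the user-supplied value into a local with get_user() before touching the inode, so a faulting copy can no longer leave i_generation half-updated, and i_ctime and i_generation are then set together under i_mutex. A hedged, kernel-style sketch of the shape, with a hypothetical handler name:

/*
 * Copy in, then lock and commit -- never get_user() straight into a
 * live inode field. Illustrative only.
 */
static long example_setversion(struct inode *inode, __u32 __user *arg)
{
	__u32 generation;

	if (get_user(generation, arg))
		return -EFAULT;		/* inode untouched on a faulting copy */

	mutex_lock(&inode->i_mutex);
	inode->i_ctime = CURRENT_TIME_SEC;
	inode->i_generation = generation;
	mutex_unlock(&inode->i_mutex);

	mark_inode_dirty(inode);
	return 0;
}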
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9b403f064ce0..0090595beb28 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1520,5 +1520,8 @@ static void __exit exit_ext2_fs(void)
1520 exit_ext2_xattr(); 1520 exit_ext2_xattr();
1521} 1521}
1522 1522
1523MODULE_AUTHOR("Remy Card and others");
1524MODULE_DESCRIPTION("Second Extended Filesystem");
1525MODULE_LICENSE("GPL");
1523module_init(init_ext2_fs) 1526module_init(init_ext2_fs)
1524module_exit(exit_ext2_fs) 1527module_exit(exit_ext2_fs)
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index d27b71f1d183..6dcafc7efdfd 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -54,7 +54,6 @@
54 */ 54 */
55 55
56#include <linux/buffer_head.h> 56#include <linux/buffer_head.h>
57#include <linux/module.h>
58#include <linux/init.h> 57#include <linux/init.h>
59#include <linux/slab.h> 58#include <linux/slab.h>
60#include <linux/mbcache.h> 59#include <linux/mbcache.h>
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index c922adc8ef41..be7a8d02c9a7 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -3,7 +3,6 @@
3 * Handler for storing security labels as extended attributes. 3 * Handler for storing security labels as extended attributes.
4 */ 4 */
5 5
6#include <linux/module.h>
7#include <linux/slab.h> 6#include <linux/slab.h>
8#include <linux/string.h> 7#include <linux/string.h>
9#include <linux/fs.h> 8#include <linux/fs.h>
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 667e46a8d62d..2989467d3595 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/fs.h> 10#include <linux/fs.h>
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 099d20f47163..f470e44c4b8d 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/module.h>
10#include <linux/string.h> 9#include <linux/string.h>
11#include "ext2.h" 10#include "ext2.h"
12#include "xattr.h" 11#include "xattr.h"
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 92cc86dfa23d..1cde28438014 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -525,8 +525,12 @@ got:
525 if (IS_DIRSYNC(inode)) 525 if (IS_DIRSYNC(inode))
526 handle->h_sync = 1; 526 handle->h_sync = 1;
527 if (insert_inode_locked(inode) < 0) { 527 if (insert_inode_locked(inode) < 0) {
528 err = -EINVAL; 528 /*
529 goto fail_drop; 529 * Likely a bitmap corruption causing inode to be allocated
530 * twice.
531 */
532 err = -EIO;
533 goto fail;
530 } 534 }
531 spin_lock(&sbi->s_next_gen_lock); 535 spin_lock(&sbi->s_next_gen_lock);
532 inode->i_generation = sbi->s_next_generation++; 536 inode->i_generation = sbi->s_next_generation++;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 15cb47088aac..2d0afeca0b47 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -22,7 +22,6 @@
22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 22 * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23 */ 23 */
24 24
25#include <linux/module.h>
26#include <linux/fs.h> 25#include <linux/fs.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/ext3_jbd.h> 27#include <linux/ext3_jbd.h>
@@ -223,8 +222,12 @@ void ext3_evict_inode (struct inode *inode)
223 * 222 *
224 * Note that directories do not have this problem because they don't 223 * Note that directories do not have this problem because they don't
225 * use page cache. 224 * use page cache.
225 *
226 * The s_journal check handles the case when ext3_get_journal() fails
227 * and puts the journal inode.
226 */ 228 */
227 if (inode->i_nlink && ext3_should_journal_data(inode) && 229 if (inode->i_nlink && ext3_should_journal_data(inode) &&
230 EXT3_SB(inode->i_sb)->s_journal &&
228 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) { 231 (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode))) {
229 tid_t commit_tid = atomic_read(&ei->i_datasync_tid); 232 tid_t commit_tid = atomic_read(&ei->i_datasync_tid);
230 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 233 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
@@ -1132,9 +1135,11 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1132 bh = ext3_getblk(handle, inode, block, create, err); 1135 bh = ext3_getblk(handle, inode, block, create, err);
1133 if (!bh) 1136 if (!bh)
1134 return bh; 1137 return bh;
1135 if (buffer_uptodate(bh)) 1138 if (bh_uptodate_or_lock(bh))
1136 return bh; 1139 return bh;
1137 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); 1140 get_bh(bh);
1141 bh->b_end_io = end_buffer_read_sync;
1142 submit_bh(READ | REQ_META | REQ_PRIO, bh);
1138 wait_on_buffer(bh); 1143 wait_on_buffer(bh);
1139 if (buffer_uptodate(bh)) 1144 if (buffer_uptodate(bh))
1140 return bh; 1145 return bh;
@@ -1617,7 +1622,13 @@ static int ext3_ordered_writepage(struct page *page,
1617 int err; 1622 int err;
1618 1623
1619 J_ASSERT(PageLocked(page)); 1624 J_ASSERT(PageLocked(page));
1620 WARN_ON_ONCE(IS_RDONLY(inode)); 1625 /*
1626 * We don't want to warn for emergency remount. The condition is
1627 * ordered to avoid dereferencing inode->i_sb in non-error case to
1628 * avoid slow-downs.
1629 */
1630 WARN_ON_ONCE(IS_RDONLY(inode) &&
1631 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1621 1632
1622 /* 1633 /*
1623 * We give up here if we're reentered, because it might be for a 1634 * We give up here if we're reentered, because it might be for a
@@ -1692,7 +1703,13 @@ static int ext3_writeback_writepage(struct page *page,
1692 int err; 1703 int err;
1693 1704
1694 J_ASSERT(PageLocked(page)); 1705 J_ASSERT(PageLocked(page));
1695 WARN_ON_ONCE(IS_RDONLY(inode)); 1706 /*
1707 * We don't want to warn for emergency remount. The condition is
1708 * ordered to avoid dereferencing inode->i_sb in non-error case to
1709 * avoid slow-downs.
1710 */
1711 WARN_ON_ONCE(IS_RDONLY(inode) &&
1712 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1696 1713
1697 if (ext3_journal_current_handle()) 1714 if (ext3_journal_current_handle())
1698 goto out_fail; 1715 goto out_fail;
@@ -1735,7 +1752,13 @@ static int ext3_journalled_writepage(struct page *page,
1735 int err; 1752 int err;
1736 1753
1737 J_ASSERT(PageLocked(page)); 1754 J_ASSERT(PageLocked(page));
1738 WARN_ON_ONCE(IS_RDONLY(inode)); 1755 /*
1756 * We don't want to warn for emergency remount. The condition is
1757 * ordered to avoid dereferencing inode->i_sb in non-error case to
1758 * avoid slow-downs.
1759 */
1760 WARN_ON_ONCE(IS_RDONLY(inode) &&
1761 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
1739 1762
1740 if (ext3_journal_current_handle()) 1763 if (ext3_journal_current_handle())
1741 goto no_write; 1764 goto no_write;
@@ -2064,12 +2087,10 @@ static int ext3_block_truncate_page(struct inode *inode, loff_t from)
2064 if (PageUptodate(page)) 2087 if (PageUptodate(page))
2065 set_buffer_uptodate(bh); 2088 set_buffer_uptodate(bh);
2066 2089
2067 if (!buffer_uptodate(bh)) { 2090 if (!bh_uptodate_or_lock(bh)) {
2068 err = -EIO; 2091 err = bh_submit_read(bh);
2069 ll_rw_block(READ, 1, &bh);
2070 wait_on_buffer(bh);
2071 /* Uhhuh. Read error. Complain and punt. */ 2092 /* Uhhuh. Read error. Complain and punt. */
2072 if (!buffer_uptodate(bh)) 2093 if (err)
2073 goto unlock; 2094 goto unlock;
2074 } 2095 }
2075 2096
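
The ext3_bread() and ext3_block_truncate_page() hunks above, like the namei.c and super.c hunks below, replace the ll_rw_block()/wait_on_buffer() pair with bh_uptodate_or_lock() followed by an explicit submit. ll_rw_block() silently skips a buffer it cannot lock, so the old code could wait on a buffer that was never submitted; taking the lock first closes that window. A hedged, kernel-style sketch of the resulting pattern (example_read_bh is a placeholder name):

/*
 * bh_uptodate_or_lock() returns 1 if the buffer is already uptodate;
 * otherwise it returns 0 with the buffer locked, so the submit below
 * can neither be skipped nor raced against another reader.
 */
static int example_read_bh(struct buffer_head *bh)
{
	if (bh_uptodate_or_lock(bh))
		return 0;
	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return 0;
	return -EIO;	/* read failed; bh_submit_read() behaves the same way */
}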
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 8e37c41a071b..4af574ce4a46 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -134,10 +134,11 @@ flags_out:
134 goto setversion_out; 134 goto setversion_out;
135 } 135 }
136 136
137 mutex_lock(&inode->i_mutex);
137 handle = ext3_journal_start(inode, 1); 138 handle = ext3_journal_start(inode, 1);
138 if (IS_ERR(handle)) { 139 if (IS_ERR(handle)) {
139 err = PTR_ERR(handle); 140 err = PTR_ERR(handle);
140 goto setversion_out; 141 goto unlock_out;
141 } 142 }
142 err = ext3_reserve_inode_write(handle, inode, &iloc); 143 err = ext3_reserve_inode_write(handle, inode, &iloc);
143 if (err == 0) { 144 if (err == 0) {
@@ -146,6 +147,9 @@ flags_out:
146 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 147 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
147 } 148 }
148 ext3_journal_stop(handle); 149 ext3_journal_stop(handle);
150
151unlock_out:
152 mutex_unlock(&inode->i_mutex);
149setversion_out: 153setversion_out:
150 mnt_drop_write_file(filp); 154 mnt_drop_write_file(filp);
151 return err; 155 return err;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 4f35b2f315d4..e8e211795e9f 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -921,9 +921,12 @@ restart:
921 num++; 921 num++;
922 bh = ext3_getblk(NULL, dir, b++, 0, &err); 922 bh = ext3_getblk(NULL, dir, b++, 0, &err);
923 bh_use[ra_max] = bh; 923 bh_use[ra_max] = bh;
924 if (bh) 924 if (bh && !bh_uptodate_or_lock(bh)) {
925 ll_rw_block(READ | REQ_META | REQ_PRIO, 925 get_bh(bh);
926 1, &bh); 926 bh->b_end_io = end_buffer_read_sync;
927 submit_bh(READ | REQ_META | REQ_PRIO,
928 bh);
929 }
927 } 930 }
928 } 931 }
929 if ((bh = bh_use[ra_ptr++]) == NULL) 932 if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -2272,7 +2275,7 @@ retry:
2272 err = PTR_ERR(handle); 2275 err = PTR_ERR(handle);
2273 goto err_drop_inode; 2276 goto err_drop_inode;
2274 } 2277 }
2275 inc_nlink(inode); 2278 set_nlink(inode, 1);
2276 err = ext3_orphan_del(handle, inode); 2279 err = ext3_orphan_del(handle, inode);
2277 if (err) { 2280 if (err) {
2278 ext3_journal_stop(handle); 2281 ext3_journal_stop(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3a10b884e1be..726c7ef6cdf1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2059,9 +2059,10 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2059 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; 2059 EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
2060 ext3_orphan_cleanup(sb, es); 2060 ext3_orphan_cleanup(sb, es);
2061 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; 2061 EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
2062 if (needs_recovery) 2062 if (needs_recovery) {
2063 ext3_mark_recovery_complete(sb, es);
2063 ext3_msg(sb, KERN_INFO, "recovery complete"); 2064 ext3_msg(sb, KERN_INFO, "recovery complete");
2064 ext3_mark_recovery_complete(sb, es); 2065 }
2065 ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode", 2066 ext3_msg(sb, KERN_INFO, "mounted filesystem with %s data mode",
2066 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": 2067 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
2067 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": 2068 test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
@@ -2229,11 +2230,11 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
2229 goto out_bdev; 2230 goto out_bdev;
2230 } 2231 }
2231 journal->j_private = sb; 2232 journal->j_private = sb;
2232 ll_rw_block(READ, 1, &journal->j_sb_buffer); 2233 if (!bh_uptodate_or_lock(journal->j_sb_buffer)) {
2233 wait_on_buffer(journal->j_sb_buffer); 2234 if (bh_submit_read(journal->j_sb_buffer)) {
2234 if (!buffer_uptodate(journal->j_sb_buffer)) { 2235 ext3_msg(sb, KERN_ERR, "I/O error on journal device");
2235 ext3_msg(sb, KERN_ERR, "I/O error on journal device"); 2236 goto out_journal;
2236 goto out_journal; 2237 }
2237 } 2238 }
2238 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { 2239 if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
2239 ext3_msg(sb, KERN_ERR, 2240 ext3_msg(sb, KERN_ERR,
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3c218b8a51d4..ea26f2acab94 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -3,7 +3,6 @@
3 * Handler for storing security labels as extended attributes. 3 * Handler for storing security labels as extended attributes.
4 */ 4 */
5 5
6#include <linux/module.h>
7#include <linux/slab.h> 6#include <linux/slab.h>
8#include <linux/string.h> 7#include <linux/string.h>
9#include <linux/fs.h> 8#include <linux/fs.h>
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index dc8edda9ffe0..2526a8829de8 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/fs.h> 10#include <linux/fs.h>
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 7a321974d584..b32e473a1e33 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/ext3_jbd.h> 10#include <linux/ext3_jbd.h>
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 12ccacda44e0..f9e2cd8cf711 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -23,6 +23,8 @@
23 23
24#include <trace/events/ext4.h> 24#include <trace/events/ext4.h>
25 25
26static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
27 ext4_group_t block_group);
26/* 28/*
27 * balloc.c contains the blocks allocation and deallocation routines 29 * balloc.c contains the blocks allocation and deallocation routines
28 */ 30 */
@@ -668,7 +670,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
668 * This function returns the number of file system metadata clusters at 670 * This function returns the number of file system metadata clusters at
669 * the beginning of a block group, including the reserved gdt blocks. 671 * the beginning of a block group, including the reserved gdt blocks.
670 */ 672 */
671unsigned ext4_num_base_meta_clusters(struct super_block *sb, 673static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
672 ext4_group_t block_group) 674 ext4_group_t block_group)
673{ 675{
674 struct ext4_sb_info *sbi = EXT4_SB(sb); 676 struct ext4_sb_info *sbi = EXT4_SB(sb);
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 8efb2f0a3447..3f11656bd72e 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -13,7 +13,6 @@
13#include <linux/namei.h> 13#include <linux/namei.h>
14#include <linux/quotaops.h> 14#include <linux/quotaops.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/module.h>
17#include <linux/swap.h> 16#include <linux/swap.h>
18#include <linux/pagemap.h> 17#include <linux/pagemap.h>
19#include <linux/blkdev.h> 18#include <linux/blkdev.h>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1554b15f91bc..513004fc3d84 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -511,6 +511,14 @@ struct ext4_new_group_data {
511 __u32 free_blocks_count; 511 __u32 free_blocks_count;
512}; 512};
513 513
514/* Indexes used to index group tables in ext4_new_group_data */
515enum {
516 BLOCK_BITMAP = 0, /* block bitmap */
517 INODE_BITMAP, /* inode bitmap */
518 INODE_TABLE, /* inode tables */
519 GROUP_TABLE_COUNT,
520};
521
514/* 522/*
515 * Flags used by ext4_map_blocks() 523 * Flags used by ext4_map_blocks()
516 */ 524 */
@@ -575,6 +583,7 @@ struct ext4_new_group_data {
575 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ 583 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
576#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 584#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
577#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 585#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
586#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
578 587
579#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 588#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
580/* 589/*
@@ -957,12 +966,13 @@ struct ext4_inode_info {
957#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ 966#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
958 EXT4_MOUNT2_##opt) 967 EXT4_MOUNT2_##opt)
959 968
960#define ext4_set_bit __test_and_set_bit_le 969#define ext4_test_and_set_bit __test_and_set_bit_le
970#define ext4_set_bit __set_bit_le
961#define ext4_set_bit_atomic ext2_set_bit_atomic 971#define ext4_set_bit_atomic ext2_set_bit_atomic
962#define ext4_clear_bit __test_and_clear_bit_le 972#define ext4_test_and_clear_bit __test_and_clear_bit_le
973#define ext4_clear_bit __clear_bit_le
963#define ext4_clear_bit_atomic ext2_clear_bit_atomic 974#define ext4_clear_bit_atomic ext2_clear_bit_atomic
964#define ext4_test_bit test_bit_le 975#define ext4_test_bit test_bit_le
965#define ext4_find_first_zero_bit find_first_zero_bit_le
966#define ext4_find_next_zero_bit find_next_zero_bit_le 976#define ext4_find_next_zero_bit find_next_zero_bit_le
967#define ext4_find_next_bit find_next_bit_le 977#define ext4_find_next_bit find_next_bit_le
968 978
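
The renames above matter because __set_bit_le() returns void while __test_and_set_bit_le() returns the bit's previous value; the ialloc.c hunks further down (ext4_free_inode(), ext4_claim_inode()) rely on that return value to detect whether the bit was already claimed. A minimal userspace stand-in showing why the distinction matters:

#include <assert.h>

/*
 * Userspace stand-in for the kernel's non-atomic bitops: plain "set"
 * tells you nothing, "test_and_set" reports whether the bit was taken.
 */
static int test_and_set_bit(unsigned nr, unsigned long *map)
{
	unsigned long mask = 1UL << (nr % (8 * sizeof(*map)));
	unsigned long *p = map + nr / (8 * sizeof(*map));
	int old = (*p & mask) != 0;

	*p |= mask;
	return old;
}

int main(void)
{
	unsigned long bitmap[2] = { 0, 0 };

	assert(test_and_set_bit(17, bitmap) == 0);	/* we claimed bit 17 */
	assert(test_and_set_bit(17, bitmap) == 1);	/* already in use */
	return 0;
}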
@@ -1397,6 +1407,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1397#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 1407#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040
1398#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 1408#define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100
1399#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 1409#define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200
1410#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
1400 1411
1401#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 1412#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
1402#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 1413#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1409,6 +1420,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1409#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 1420#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
1410#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ 1421#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
1411#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ 1422#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
1423#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x2000 /* data in inode */
1424#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
1412 1425
1413#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR 1426#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
1414#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ 1427#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1790,8 +1803,6 @@ extern void ext4_init_block_bitmap(struct super_block *sb,
1790extern unsigned ext4_free_clusters_after_init(struct super_block *sb, 1803extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
1791 ext4_group_t block_group, 1804 ext4_group_t block_group,
1792 struct ext4_group_desc *gdp); 1805 struct ext4_group_desc *gdp);
1793extern unsigned ext4_num_base_meta_clusters(struct super_block *sb,
1794 ext4_group_t block_group);
1795extern unsigned ext4_num_overhead_clusters(struct super_block *sb, 1806extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
1796 ext4_group_t block_group, 1807 ext4_group_t block_group,
1797 struct ext4_group_desc *gdp); 1808 struct ext4_group_desc *gdp);
@@ -1880,16 +1891,9 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
1880extern void ext4_set_aops(struct inode *inode); 1891extern void ext4_set_aops(struct inode *inode);
1881extern int ext4_writepage_trans_blocks(struct inode *); 1892extern int ext4_writepage_trans_blocks(struct inode *);
1882extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 1893extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1883extern int ext4_block_truncate_page(handle_t *handle,
1884 struct address_space *mapping, loff_t from);
1885extern int ext4_block_zero_page_range(handle_t *handle,
1886 struct address_space *mapping, loff_t from, loff_t length);
1887extern int ext4_discard_partial_page_buffers(handle_t *handle, 1894extern int ext4_discard_partial_page_buffers(handle_t *handle,
1888 struct address_space *mapping, loff_t from, 1895 struct address_space *mapping, loff_t from,
1889 loff_t length, int flags); 1896 loff_t length, int flags);
1890extern int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
1891 struct inode *inode, struct page *page, loff_t from,
1892 loff_t length, int flags);
1893extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 1897extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
1894extern qsize_t *ext4_get_reserved_space(struct inode *inode); 1898extern qsize_t *ext4_get_reserved_space(struct inode *inode);
1895extern void ext4_da_update_reserve_space(struct inode *inode, 1899extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -1924,6 +1928,7 @@ extern int ext4_group_add(struct super_block *sb,
1924extern int ext4_group_extend(struct super_block *sb, 1928extern int ext4_group_extend(struct super_block *sb,
1925 struct ext4_super_block *es, 1929 struct ext4_super_block *es,
1926 ext4_fsblk_t n_blocks_count); 1930 ext4_fsblk_t n_blocks_count);
1931extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
1927 1932
1928/* super.c */ 1933/* super.c */
1929extern void *ext4_kvmalloc(size_t size, gfp_t flags); 1934extern void *ext4_kvmalloc(size_t size, gfp_t flags);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 607b1557d292..74f23c292e1b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -29,7 +29,6 @@
29 * - smart tree reduction 29 * - smart tree reduction
30 */ 30 */
31 31
32#include <linux/module.h>
33#include <linux/fs.h> 32#include <linux/fs.h>
34#include <linux/time.h> 33#include <linux/time.h>
35#include <linux/jbd2.h> 34#include <linux/jbd2.h>
@@ -3281,6 +3280,9 @@ static int ext4_find_delalloc_range(struct inode *inode,
3281 ext4_lblk_t i, pg_lblk; 3280 ext4_lblk_t i, pg_lblk;
3282 pgoff_t index; 3281 pgoff_t index;
3283 3282
3283 if (!test_opt(inode->i_sb, DELALLOC))
3284 return 0;
3285
3284 /* reverse search won't work if fs block size is less than page size */ 3286 /* reverse search won't work if fs block size is less than page size */
3285 if (inode->i_blkbits < PAGE_CACHE_SHIFT) 3287 if (inode->i_blkbits < PAGE_CACHE_SHIFT)
3286 search_hint_reverse = 0; 3288 search_hint_reverse = 0;
@@ -3453,8 +3455,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3453 int err = 0; 3455 int err = 0;
3454 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3456 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3455 3457
3456 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3458 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
3457 "block %llu, max_blocks %u, flags %d, allocated %u", 3459 "block %llu, max_blocks %u, flags %x, allocated %u\n",
3458 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, 3460 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3459 flags, allocated); 3461 flags, allocated);
3460 ext4_ext_show_leaf(inode, path); 3462 ext4_ext_show_leaf(inode, path);
@@ -3625,7 +3627,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
3625 struct ext4_sb_info *sbi = EXT4_SB(sb); 3627 struct ext4_sb_info *sbi = EXT4_SB(sb);
3626 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1); 3628 ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
3627 ext4_lblk_t ex_cluster_start, ex_cluster_end; 3629 ext4_lblk_t ex_cluster_start, ex_cluster_end;
3628 ext4_lblk_t rr_cluster_start, rr_cluster_end; 3630 ext4_lblk_t rr_cluster_start;
3629 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block); 3631 ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
3630 ext4_fsblk_t ee_start = ext4_ext_pblock(ex); 3632 ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
3631 unsigned short ee_len = ext4_ext_get_actual_len(ex); 3633 unsigned short ee_len = ext4_ext_get_actual_len(ex);
@@ -3636,7 +3638,6 @@ static int get_implied_cluster_alloc(struct super_block *sb,
3636 3638
3637 /* The requested region passed into ext4_map_blocks() */ 3639 /* The requested region passed into ext4_map_blocks() */
3638 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk); 3640 rr_cluster_start = EXT4_B2C(sbi, map->m_lblk);
3639 rr_cluster_end = EXT4_B2C(sbi, map->m_lblk + map->m_len - 1);
3640 3641
3641 if ((rr_cluster_start == ex_cluster_end) || 3642 if ((rr_cluster_start == ex_cluster_end) ||
3642 (rr_cluster_start == ex_cluster_start)) { 3643 (rr_cluster_start == ex_cluster_start)) {
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4637af036d9c..25d8c9781ad9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,7 +252,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
252 fatal = ext4_journal_get_write_access(handle, bh2); 252 fatal = ext4_journal_get_write_access(handle, bh2);
253 } 253 }
254 ext4_lock_group(sb, block_group); 254 ext4_lock_group(sb, block_group);
255 cleared = ext4_clear_bit(bit, bitmap_bh->b_data); 255 cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data);
256 if (fatal || !cleared) { 256 if (fatal || !cleared) {
257 ext4_unlock_group(sb, block_group); 257 ext4_unlock_group(sb, block_group);
258 goto out; 258 goto out;
@@ -358,7 +358,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
358 struct ext4_sb_info *sbi = EXT4_SB(sb); 358 struct ext4_sb_info *sbi = EXT4_SB(sb);
359 ext4_group_t real_ngroups = ext4_get_groups_count(sb); 359 ext4_group_t real_ngroups = ext4_get_groups_count(sb);
360 int inodes_per_group = EXT4_INODES_PER_GROUP(sb); 360 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
361 unsigned int freei, avefreei; 361 unsigned int freei, avefreei, grp_free;
362 ext4_fsblk_t freeb, avefreec; 362 ext4_fsblk_t freeb, avefreec;
363 unsigned int ndirs; 363 unsigned int ndirs;
364 int max_dirs, min_inodes; 364 int max_dirs, min_inodes;
@@ -477,8 +477,8 @@ fallback_retry:
477 for (i = 0; i < ngroups; i++) { 477 for (i = 0; i < ngroups; i++) {
478 grp = (parent_group + i) % ngroups; 478 grp = (parent_group + i) % ngroups;
479 desc = ext4_get_group_desc(sb, grp, NULL); 479 desc = ext4_get_group_desc(sb, grp, NULL);
480 if (desc && ext4_free_inodes_count(sb, desc) && 480 grp_free = ext4_free_inodes_count(sb, desc);
481 ext4_free_inodes_count(sb, desc) >= avefreei) { 481 if (desc && grp_free && grp_free >= avefreei) {
482 *group = grp; 482 *group = grp;
483 return 0; 483 return 0;
484 } 484 }
@@ -618,7 +618,7 @@ static int ext4_claim_inode(struct super_block *sb,
618 */ 618 */
619 down_read(&grp->alloc_sem); 619 down_read(&grp->alloc_sem);
620 ext4_lock_group(sb, group); 620 ext4_lock_group(sb, group);
621 if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { 621 if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
622 /* not a free inode */ 622 /* not a free inode */
623 retval = 1; 623 retval = 1;
624 goto err_ret; 624 goto err_ret;
@@ -885,8 +885,12 @@ got:
885 if (IS_DIRSYNC(inode)) 885 if (IS_DIRSYNC(inode))
886 ext4_handle_sync(handle); 886 ext4_handle_sync(handle);
887 if (insert_inode_locked(inode) < 0) { 887 if (insert_inode_locked(inode) < 0) {
888 err = -EINVAL; 888 /*
889 goto fail_drop; 889 * Likely a bitmap corruption causing inode to be allocated
890 * twice.
891 */
892 err = -EIO;
893 goto fail;
890 } 894 }
891 spin_lock(&sbi->s_next_gen_lock); 895 spin_lock(&sbi->s_next_gen_lock);
892 inode->i_generation = sbi->s_next_generation++; 896 inode->i_generation = sbi->s_next_generation++;
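
Both ialloc.c bitmap hunks above replace a plain bit operation with its value-returning test_and_* variant so the caller can detect corruption: a clear that finds the bit already zero is a double free, and a set that finds the bit already one means the inode was never free. A non-atomic userspace model of the idea (the kernel's ext4_test_and_clear_bit() additionally carries the real bitops' atomicity and endianness guarantees):

    #include <stdio.h>

    #define BITS_PER_LONG (8 * sizeof(unsigned long))

    static int test_and_clear_bit_model(unsigned int nr, unsigned long *addr)
    {
            unsigned long mask = 1UL << (nr % BITS_PER_LONG);
            unsigned long *word = addr + nr / BITS_PER_LONG;
            int old = (*word & mask) != 0;

            *word &= ~mask;
            return old;             /* 0: the bit was already clear */
    }

    int main(void)
    {
            unsigned long bitmap[1] = { 1UL << 5 };

            /* The first free succeeds; the second reports the double free. */
            printf("cleared: %d\n", test_and_clear_bit_model(5, bitmap));  /* 1 */
            printf("cleared: %d\n", test_and_clear_bit_model(5, bitmap));  /* 0 */
            return 0;
    }
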
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3cfc73fbca8e..830e1b2bf145 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,7 +20,6 @@
20 * (sct@redhat.com), 1993, 1998 20 * (sct@redhat.com), 1993, 1998
21 */ 21 */
22 22
23#include <linux/module.h>
24#include "ext4_jbd2.h" 23#include "ext4_jbd2.h"
25#include "truncate.h" 24#include "truncate.h"
26 25
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 7dbcc3e84570..feaa82fe629d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -18,7 +18,6 @@
18 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 18 * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
19 */ 19 */
20 20
21#include <linux/module.h>
22#include <linux/fs.h> 21#include <linux/fs.h>
23#include <linux/time.h> 22#include <linux/time.h>
24#include <linux/jbd2.h> 23#include <linux/jbd2.h>
@@ -72,6 +71,9 @@ static int ext4_set_bh_endio(struct buffer_head *bh, struct inode *inode);
72static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate); 71static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
73static int __ext4_journalled_writepage(struct page *page, unsigned int len); 72static int __ext4_journalled_writepage(struct page *page, unsigned int len);
74static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); 73static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
74static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
75 struct inode *inode, struct page *page, loff_t from,
76 loff_t length, int flags);
75 77
76/* 78/*
77 * Test whether an inode is a fast symlink. 79 * Test whether an inode is a fast symlink.
@@ -2760,7 +2762,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2760 if (!io_end || !size) 2762 if (!io_end || !size)
2761 goto out; 2763 goto out;
2762 2764
2763 ext_debug("ext4_end_io_dio(): io_end 0x%p" 2765 ext_debug("ext4_end_io_dio(): io_end 0x%p "
2764 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", 2766 "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
2765 iocb->private, io_end->inode->i_ino, iocb, offset, 2767 iocb->private, io_end->inode->i_ino, iocb, offset,
2766 size); 2768 size);
@@ -3161,7 +3163,7 @@ int ext4_discard_partial_page_buffers(handle_t *handle,
3161 * 3163 *
3162 * Returns zero on success or negative on failure. 3164 * Returns zero on success or negative on failure.
3163 */ 3165 */
3164int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3166static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3165 struct inode *inode, struct page *page, loff_t from, 3167 struct inode *inode, struct page *page, loff_t from,
3166 loff_t length, int flags) 3168 loff_t length, int flags)
3167{ 3169{
@@ -3301,126 +3303,6 @@ next:
3301 return err; 3303 return err;
3302} 3304}
3303 3305
3304/*
3305 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3306 * up to the end of the block which corresponds to `from'.
3307 * This required during truncate. We need to physically zero the tail end
3308 * of that block so it doesn't yield old data if the file is later grown.
3309 */
3310int ext4_block_truncate_page(handle_t *handle,
3311 struct address_space *mapping, loff_t from)
3312{
3313 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3314 unsigned length;
3315 unsigned blocksize;
3316 struct inode *inode = mapping->host;
3317
3318 blocksize = inode->i_sb->s_blocksize;
3319 length = blocksize - (offset & (blocksize - 1));
3320
3321 return ext4_block_zero_page_range(handle, mapping, from, length);
3322}
3323
3324/*
3325 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3326 * starting from file offset 'from'. The range to be zero'd must
3327 * be contained with in one block. If the specified range exceeds
3328 * the end of the block it will be shortened to end of the block
3329 * that cooresponds to 'from'
3330 */
3331int ext4_block_zero_page_range(handle_t *handle,
3332 struct address_space *mapping, loff_t from, loff_t length)
3333{
3334 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3335 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3336 unsigned blocksize, max, pos;
3337 ext4_lblk_t iblock;
3338 struct inode *inode = mapping->host;
3339 struct buffer_head *bh;
3340 struct page *page;
3341 int err = 0;
3342
3343 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3344 mapping_gfp_mask(mapping) & ~__GFP_FS);
3345 if (!page)
3346 return -ENOMEM;
3347
3348 blocksize = inode->i_sb->s_blocksize;
3349 max = blocksize - (offset & (blocksize - 1));
3350
3351 /*
3352 * correct length if it does not fall between
3353 * 'from' and the end of the block
3354 */
3355 if (length > max || length < 0)
3356 length = max;
3357
3358 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3359
3360 if (!page_has_buffers(page))
3361 create_empty_buffers(page, blocksize, 0);
3362
3363 /* Find the buffer that contains "offset" */
3364 bh = page_buffers(page);
3365 pos = blocksize;
3366 while (offset >= pos) {
3367 bh = bh->b_this_page;
3368 iblock++;
3369 pos += blocksize;
3370 }
3371
3372 err = 0;
3373 if (buffer_freed(bh)) {
3374 BUFFER_TRACE(bh, "freed: skip");
3375 goto unlock;
3376 }
3377
3378 if (!buffer_mapped(bh)) {
3379 BUFFER_TRACE(bh, "unmapped");
3380 ext4_get_block(inode, iblock, bh, 0);
3381 /* unmapped? It's a hole - nothing to do */
3382 if (!buffer_mapped(bh)) {
3383 BUFFER_TRACE(bh, "still unmapped");
3384 goto unlock;
3385 }
3386 }
3387
3388 /* Ok, it's mapped. Make sure it's up-to-date */
3389 if (PageUptodate(page))
3390 set_buffer_uptodate(bh);
3391
3392 if (!buffer_uptodate(bh)) {
3393 err = -EIO;
3394 ll_rw_block(READ, 1, &bh);
3395 wait_on_buffer(bh);
3396 /* Uhhuh. Read error. Complain and punt. */
3397 if (!buffer_uptodate(bh))
3398 goto unlock;
3399 }
3400
3401 if (ext4_should_journal_data(inode)) {
3402 BUFFER_TRACE(bh, "get write access");
3403 err = ext4_journal_get_write_access(handle, bh);
3404 if (err)
3405 goto unlock;
3406 }
3407
3408 zero_user(page, offset, length);
3409
3410 BUFFER_TRACE(bh, "zeroed end of block");
3411
3412 err = 0;
3413 if (ext4_should_journal_data(inode)) {
3414 err = ext4_handle_dirty_metadata(handle, inode, bh);
3415 } else
3416 mark_buffer_dirty(bh);
3417
3418unlock:
3419 unlock_page(page);
3420 page_cache_release(page);
3421 return err;
3422}
3423
3424int ext4_can_truncate(struct inode *inode) 3306int ext4_can_truncate(struct inode *inode)
3425{ 3307{
3426 if (S_ISREG(inode->i_mode)) 3308 if (S_ISREG(inode->i_mode))
@@ -4647,9 +4529,19 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4647 return 0; 4529 return 0;
4648 if (is_journal_aborted(journal)) 4530 if (is_journal_aborted(journal))
4649 return -EROFS; 4531 return -EROFS;
4532 /* We have to allocate physical blocks for delalloc blocks
                                   4533 * before flushing the journal; otherwise delalloc blocks cannot
                                   4534 * be allocated any more. Worse, a truncate on delalloc blocks
                                   4535 * could trigger a BUG by flushing delalloc blocks in the journal.
4536 * There is no delalloc block in non-journal data mode.
4537 */
4538 if (val && test_opt(inode->i_sb, DELALLOC)) {
4539 err = ext4_alloc_da_blocks(inode);
4540 if (err < 0)
4541 return err;
4542 }
4650 4543
4651 jbd2_journal_lock_updates(journal); 4544 jbd2_journal_lock_updates(journal);
4652 jbd2_journal_flush(journal);
4653 4545
4654 /* 4546 /*
4655 * OK, there are no updates running now, and all cached data is 4547 * OK, there are no updates running now, and all cached data is
@@ -4661,8 +4553,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4661 4553
4662 if (val) 4554 if (val)
4663 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 4555 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4664 else 4556 else {
4557 jbd2_journal_flush(journal);
4665 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 4558 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
4559 }
4666 ext4_set_aops(inode); 4560 ext4_set_aops(inode);
4667 4561
4668 jbd2_journal_unlock_updates(journal); 4562 jbd2_journal_unlock_updates(journal);
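
The last two inode.c hunks reorder ext4_change_inode_journal_flag(): delalloc blocks must be allocated before jbd2_journal_lock_updates() (allocation is impossible once updates are locked), and the journal is now flushed only when leaving data=journal mode. A condensed userspace sketch of the resulting control flow, with stub functions standing in for the jbd2/ext4 calls (the ordering is the point, not the stubs):

    #include <stdbool.h>
    #include <stdio.h>

    static int  alloc_da_blocks(void)        { puts("allocate delalloc blocks"); return 0; }
    static void journal_lock_updates(void)   { puts("lock journal updates"); }
    static void journal_flush(void)          { puts("flush journal"); }
    static void journal_unlock_updates(void) { puts("unlock journal updates"); }

    static int change_journal_flag(bool to_data_journal, bool delalloc)
    {
            /* Step 1: allocate delalloc blocks *before* locking the journal. */
            if (to_data_journal && delalloc) {
                    int err = alloc_da_blocks();
                    if (err < 0)
                            return err;
            }

            journal_lock_updates();
            if (!to_data_journal)
                    journal_flush();   /* flush only when leaving data=journal */
            /* ... set or clear the JOURNAL_DATA inode flag here ... */
            journal_unlock_updates();
            return 0;
    }

    int main(void) { return change_journal_flag(true, true); }
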
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d37b3bb2a3b8..6eee25591b81 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -18,6 +18,8 @@
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
20 20
21#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
22
21long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
22{ 24{
23 struct inode *inode = filp->f_dentry->d_inode; 25 struct inode *inode = filp->f_dentry->d_inode;
@@ -158,10 +160,11 @@ flags_out:
158 goto setversion_out; 160 goto setversion_out;
159 } 161 }
160 162
163 mutex_lock(&inode->i_mutex);
161 handle = ext4_journal_start(inode, 1); 164 handle = ext4_journal_start(inode, 1);
162 if (IS_ERR(handle)) { 165 if (IS_ERR(handle)) {
163 err = PTR_ERR(handle); 166 err = PTR_ERR(handle);
164 goto setversion_out; 167 goto unlock_out;
165 } 168 }
166 err = ext4_reserve_inode_write(handle, inode, &iloc); 169 err = ext4_reserve_inode_write(handle, inode, &iloc);
167 if (err == 0) { 170 if (err == 0) {
@@ -170,6 +173,9 @@ flags_out:
170 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 173 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
171 } 174 }
172 ext4_journal_stop(handle); 175 ext4_journal_stop(handle);
176
177unlock_out:
178 mutex_unlock(&inode->i_mutex);
173setversion_out: 179setversion_out:
174 mnt_drop_write_file(filp); 180 mnt_drop_write_file(filp);
175 return err; 181 return err;
@@ -182,19 +188,22 @@ setversion_out:
182 if (err) 188 if (err)
183 return err; 189 return err;
184 190
185 if (get_user(n_blocks_count, (__u32 __user *)arg)) 191 if (get_user(n_blocks_count, (__u32 __user *)arg)) {
186 return -EFAULT; 192 err = -EFAULT;
193 goto group_extend_out;
194 }
187 195
188 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 196 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
189 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { 197 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
190 ext4_msg(sb, KERN_ERR, 198 ext4_msg(sb, KERN_ERR,
191 "Online resizing not supported with bigalloc"); 199 "Online resizing not supported with bigalloc");
192 return -EOPNOTSUPP; 200 err = -EOPNOTSUPP;
201 goto group_extend_out;
193 } 202 }
194 203
195 err = mnt_want_write_file(filp); 204 err = mnt_want_write_file(filp);
196 if (err) 205 if (err)
197 return err; 206 goto group_extend_out;
198 207
199 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 208 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
200 if (EXT4_SB(sb)->s_journal) { 209 if (EXT4_SB(sb)->s_journal) {
@@ -205,8 +214,8 @@ setversion_out:
205 if (err == 0) 214 if (err == 0)
206 err = err2; 215 err = err2;
207 mnt_drop_write_file(filp); 216 mnt_drop_write_file(filp);
217group_extend_out:
208 ext4_resize_end(sb); 218 ext4_resize_end(sb);
209
210 return err; 219 return err;
211 } 220 }
212 221
@@ -247,8 +256,7 @@ setversion_out:
247 err = ext4_move_extents(filp, donor_filp, me.orig_start, 256 err = ext4_move_extents(filp, donor_filp, me.orig_start,
248 me.donor_start, me.len, &me.moved_len); 257 me.donor_start, me.len, &me.moved_len);
249 mnt_drop_write_file(filp); 258 mnt_drop_write_file(filp);
250 if (me.moved_len > 0) 259 mnt_drop_write(filp->f_path.mnt);
251 file_remove_suid(donor_filp);
252 260
253 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
254 &me, sizeof(me))) 262 &me, sizeof(me)))
@@ -267,19 +275,22 @@ mext_out:
267 return err; 275 return err;
268 276
269 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, 277 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
270 sizeof(input))) 278 sizeof(input))) {
271 return -EFAULT; 279 err = -EFAULT;
280 goto group_add_out;
281 }
272 282
273 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 283 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
274 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { 284 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
275 ext4_msg(sb, KERN_ERR, 285 ext4_msg(sb, KERN_ERR,
276 "Online resizing not supported with bigalloc"); 286 "Online resizing not supported with bigalloc");
277 return -EOPNOTSUPP; 287 err = -EOPNOTSUPP;
288 goto group_add_out;
278 } 289 }
279 290
280 err = mnt_want_write_file(filp); 291 err = mnt_want_write_file(filp);
281 if (err) 292 if (err)
282 return err; 293 goto group_add_out;
283 294
284 err = ext4_group_add(sb, &input); 295 err = ext4_group_add(sb, &input);
285 if (EXT4_SB(sb)->s_journal) { 296 if (EXT4_SB(sb)->s_journal) {
@@ -290,8 +301,8 @@ mext_out:
290 if (err == 0) 301 if (err == 0)
291 err = err2; 302 err = err2;
292 mnt_drop_write_file(filp); 303 mnt_drop_write_file(filp);
304group_add_out:
293 ext4_resize_end(sb); 305 ext4_resize_end(sb);
294
295 return err; 306 return err;
296 } 307 }
297 308
@@ -331,6 +342,60 @@ mext_out:
331 return err; 342 return err;
332 } 343 }
333 344
345 case EXT4_IOC_RESIZE_FS: {
346 ext4_fsblk_t n_blocks_count;
347 struct super_block *sb = inode->i_sb;
348 int err = 0, err2 = 0;
349
350 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
351 EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
352 ext4_msg(sb, KERN_ERR,
353 "Online resizing not (yet) supported with bigalloc");
354 return -EOPNOTSUPP;
355 }
356
357 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
358 EXT4_FEATURE_INCOMPAT_META_BG)) {
359 ext4_msg(sb, KERN_ERR,
360 "Online resizing not (yet) supported with meta_bg");
361 return -EOPNOTSUPP;
362 }
363
364 if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
365 sizeof(__u64))) {
366 return -EFAULT;
367 }
368
369 if (n_blocks_count > MAX_32_NUM &&
370 !EXT4_HAS_INCOMPAT_FEATURE(sb,
371 EXT4_FEATURE_INCOMPAT_64BIT)) {
372 ext4_msg(sb, KERN_ERR,
373 "File system only supports 32-bit block numbers");
374 return -EOPNOTSUPP;
375 }
376
377 err = ext4_resize_begin(sb);
378 if (err)
379 return err;
380
381 err = mnt_want_write(filp->f_path.mnt);
382 if (err)
383 goto resizefs_out;
384
385 err = ext4_resize_fs(sb, n_blocks_count);
386 if (EXT4_SB(sb)->s_journal) {
387 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
388 err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
389 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
390 }
391 if (err == 0)
392 err = err2;
393 mnt_drop_write(filp->f_path.mnt);
394resizefs_out:
395 ext4_resize_end(sb);
396 return err;
397 }
398
334 case FITRIM: 399 case FITRIM:
335 { 400 {
336 struct request_queue *q = bdev_get_queue(sb->s_bdev); 401 struct request_queue *q = bdev_get_queue(sb->s_bdev);
@@ -429,6 +494,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
429 } 494 }
430 case EXT4_IOC_MOVE_EXT: 495 case EXT4_IOC_MOVE_EXT:
431 case FITRIM: 496 case FITRIM:
497 case EXT4_IOC_RESIZE_FS:
432 break; 498 break;
433 default: 499 default:
434 return -ENOIOCTLCMD; 500 return -ENOIOCTLCMD;
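
The new EXT4_IOC_RESIZE_FS command takes a single __u64, the desired filesystem size in blocks, and subsumes the older GROUP_EXTEND/GROUP_ADD sequence for online growth. From userspace the whole interface is one ioctl on any open descriptor on the filesystem, typically the mount point. A sketch (the EXT4_IOC_RESIZE_FS definition lives in ext4.h outside these hunks; the fallback value below is an assumption):

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #ifndef EXT4_IOC_RESIZE_FS
    #define EXT4_IOC_RESIZE_FS _IOW('f', 16, uint64_t)  /* assumed; see ext4.h */
    #endif

    int main(int argc, char **argv)
    {
            uint64_t n_blocks_count;
            int fd;

            if (argc != 3) {
                    fprintf(stderr, "usage: %s <file on fs> <new size in blocks>\n",
                            argv[0]);
                    return 1;
            }

            n_blocks_count = strtoull(argv[2], NULL, 0);
            fd = open(argv[1], O_RDONLY);
            if (fd < 0 || ioctl(fd, EXT4_IOC_RESIZE_FS, &n_blocks_count) < 0) {
                    perror("EXT4_IOC_RESIZE_FS");
                    return 1;
            }
            close(fd);
            return 0;
    }

Note the guards in the hunk above: bigalloc and meta_bg filesystems are rejected, and a size above 2^32 blocks requires the 64bit incompat feature.
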
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e2d8be8f28bf..cb990b21c698 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3671,7 +3671,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3671 ext4_group_t group; 3671 ext4_group_t group;
3672 ext4_grpblk_t bit; 3672 ext4_grpblk_t bit;
3673 3673
3674 trace_ext4_mb_release_group_pa(pa); 3674 trace_ext4_mb_release_group_pa(sb, pa);
3675 BUG_ON(pa->pa_deleted == 0); 3675 BUG_ON(pa->pa_deleted == 0);
3676 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3676 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3677 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3677 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 16ac228dbec6..e7d6bb0acfa6 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -12,7 +12,6 @@
12 * 12 *
13 */ 13 */
14 14
15#include <linux/module.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include "ext4_jbd2.h" 16#include "ext4_jbd2.h"
18 17
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 86edc45b52a4..2043f482375d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2315,7 +2315,7 @@ retry:
2315 err = PTR_ERR(handle); 2315 err = PTR_ERR(handle);
2316 goto err_drop_inode; 2316 goto err_drop_inode;
2317 } 2317 }
2318 inc_nlink(inode); 2318 set_nlink(inode, 1);
2319 err = ext4_orphan_del(handle, inode); 2319 err = ext4_orphan_del(handle, inode);
2320 if (err) { 2320 if (err) {
2321 ext4_journal_stop(handle); 2321 ext4_journal_stop(handle);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7e106c810c62..475851896518 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -6,7 +6,6 @@
6 * Written by Theodore Ts'o, 2010. 6 * Written by Theodore Ts'o, 2010.
7 */ 7 */
8 8
9#include <linux/module.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include <linux/time.h> 10#include <linux/time.h>
12#include <linux/jbd2.h> 11#include <linux/jbd2.h>
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 996780ab4f4e..f9d948f0eb86 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -134,6 +134,172 @@ static int verify_group_input(struct super_block *sb,
134 return err; 134 return err;
135} 135}
136 136
137/*
138 * ext4_new_flex_group_data is used by 64bit-resize interface to add a flex
139 * group each time.
140 */
141struct ext4_new_flex_group_data {
142 struct ext4_new_group_data *groups; /* new_group_data for groups
143 in the flex group */
144 __u16 *bg_flags; /* block group flags of groups
145 in @groups */
146 ext4_group_t count; /* number of groups in @groups
147 */
148};
149
150/*
 151 * alloc_flex_gd() allocates an ext4_new_flex_group_data with size of
152 * @flexbg_size.
153 *
 154 * Returns NULL on failure, otherwise the address of the allocated structure.
155 */
156static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
157{
158 struct ext4_new_flex_group_data *flex_gd;
159
160 flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
161 if (flex_gd == NULL)
162 goto out3;
163
164 flex_gd->count = flexbg_size;
165
166 flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
167 flexbg_size, GFP_NOFS);
168 if (flex_gd->groups == NULL)
169 goto out2;
170
171 flex_gd->bg_flags = kmalloc(flexbg_size * sizeof(__u16), GFP_NOFS);
172 if (flex_gd->bg_flags == NULL)
173 goto out1;
174
175 return flex_gd;
176
177out1:
178 kfree(flex_gd->groups);
179out2:
180 kfree(flex_gd);
181out3:
182 return NULL;
183}
184
185static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
186{
187 kfree(flex_gd->bg_flags);
188 kfree(flex_gd->groups);
189 kfree(flex_gd);
190}
191
192/*
193 * ext4_alloc_group_tables() allocates block bitmaps, inode bitmaps
194 * and inode tables for a flex group.
195 *
196 * This function is used by 64bit-resize. Note that this function allocates
197 * group tables from the 1st group of groups contained by @flexgd, which may
198 * be a partial of a flex group.
199 *
200 * @sb: super block of fs to which the groups belongs
201 */
202static void ext4_alloc_group_tables(struct super_block *sb,
203 struct ext4_new_flex_group_data *flex_gd,
204 int flexbg_size)
205{
206 struct ext4_new_group_data *group_data = flex_gd->groups;
207 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
208 ext4_fsblk_t start_blk;
209 ext4_fsblk_t last_blk;
210 ext4_group_t src_group;
211 ext4_group_t bb_index = 0;
212 ext4_group_t ib_index = 0;
213 ext4_group_t it_index = 0;
214 ext4_group_t group;
215 ext4_group_t last_group;
216 unsigned overhead;
217
218 BUG_ON(flex_gd->count == 0 || group_data == NULL);
219
220 src_group = group_data[0].group;
221 last_group = src_group + flex_gd->count - 1;
222
223 BUG_ON((flexbg_size > 1) && ((src_group & ~(flexbg_size - 1)) !=
224 (last_group & ~(flexbg_size - 1))));
225next_group:
226 group = group_data[0].group;
227 start_blk = ext4_group_first_block_no(sb, src_group);
228 last_blk = start_blk + group_data[src_group - group].blocks_count;
229
230 overhead = ext4_bg_has_super(sb, src_group) ?
231 (1 + ext4_bg_num_gdb(sb, src_group) +
232 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
233
234 start_blk += overhead;
235
236 BUG_ON(src_group >= group_data[0].group + flex_gd->count);
 237 /* We collect as many contiguous blocks as possible. */
238 src_group++;
239 for (; src_group <= last_group; src_group++)
240 if (!ext4_bg_has_super(sb, src_group))
241 last_blk += group_data[src_group - group].blocks_count;
242 else
243 break;
244
245 /* Allocate block bitmaps */
246 for (; bb_index < flex_gd->count; bb_index++) {
247 if (start_blk >= last_blk)
248 goto next_group;
249 group_data[bb_index].block_bitmap = start_blk++;
250 ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
251 group -= group_data[0].group;
252 group_data[group].free_blocks_count--;
253 if (flexbg_size > 1)
254 flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
255 }
256
257 /* Allocate inode bitmaps */
258 for (; ib_index < flex_gd->count; ib_index++) {
259 if (start_blk >= last_blk)
260 goto next_group;
261 group_data[ib_index].inode_bitmap = start_blk++;
262 ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
263 group -= group_data[0].group;
264 group_data[group].free_blocks_count--;
265 if (flexbg_size > 1)
266 flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
267 }
268
269 /* Allocate inode tables */
270 for (; it_index < flex_gd->count; it_index++) {
271 if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
272 goto next_group;
273 group_data[it_index].inode_table = start_blk;
274 ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
275 group -= group_data[0].group;
276 group_data[group].free_blocks_count -=
277 EXT4_SB(sb)->s_itb_per_group;
278 if (flexbg_size > 1)
279 flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
280
281 start_blk += EXT4_SB(sb)->s_itb_per_group;
282 }
283
284 if (test_opt(sb, DEBUG)) {
285 int i;
286 group = group_data[0].group;
287
288 printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
289 "%d groups, flexbg size is %d:\n", flex_gd->count,
290 flexbg_size);
291
292 for (i = 0; i < flex_gd->count; i++) {
293 printk(KERN_DEBUG "adding %s group %u: %u "
294 "blocks (%d free)\n",
295 ext4_bg_has_super(sb, group + i) ? "normal" :
296 "no-super", group + i,
297 group_data[i].blocks_count,
298 group_data[i].free_blocks_count);
299 }
300 }
301}
302
137static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, 303static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
138 ext4_fsblk_t blk) 304 ext4_fsblk_t blk)
139{ 305{
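
alloc_flex_gd()/free_flex_gd() above use the kernel's staged-goto unwind idiom: each failure label frees exactly what the earlier stages allocated, in reverse order, so there is a single success path and no leak on any partial failure. A minimal userspace rendering of the same pattern (the element size is hypothetical, and GFP_NOFS has no userspace analogue):

    #include <stdlib.h>

    struct flex_gd_model {
            void           *groups;    /* stands in for ext4_new_group_data[] */
            unsigned short *bg_flags;
            unsigned long   count;
    };

    static struct flex_gd_model *alloc_model(unsigned long flexbg_size)
    {
            struct flex_gd_model *gd = malloc(sizeof(*gd));

            if (gd == NULL)
                    goto out3;
            gd->count = flexbg_size;

            gd->groups = calloc(flexbg_size, 64);      /* hypothetical size */
            if (gd->groups == NULL)
                    goto out2;

            gd->bg_flags = calloc(flexbg_size, sizeof(*gd->bg_flags));
            if (gd->bg_flags == NULL)
                    goto out1;
            return gd;

    out1:                      /* unwind in reverse allocation order */
            free(gd->groups);
    out2:
            free(gd);
    out3:
            return NULL;
    }

    int main(void)
    {
            struct flex_gd_model *gd = alloc_model(16);

            if (gd != NULL) {
                    free(gd->bg_flags);
                    free(gd->groups);
                    free(gd);
            }
            return 0;
    }
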
@@ -179,131 +345,250 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh)
179} 345}
180 346
181/* 347/*
 182 * Set up the block and inode bitmaps, and the inode table for the new group. 348 * set_flexbg_block_bitmap() marks @count blocks starting at @block as used.
349 *
 350 * Helper function for setup_new_flex_group_blocks().
351 *
352 * @sb: super block
353 * @handle: journal handle
354 * @flex_gd: flex group data
355 */
356static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
357 struct ext4_new_flex_group_data *flex_gd,
358 ext4_fsblk_t block, ext4_group_t count)
359{
360 ext4_group_t count2;
361
362 ext4_debug("mark blocks [%llu/%u] used\n", block, count);
363 for (count2 = count; count > 0; count -= count2, block += count2) {
364 ext4_fsblk_t start;
365 struct buffer_head *bh;
366 ext4_group_t group;
367 int err;
368
369 ext4_get_group_no_and_offset(sb, block, &group, NULL);
370 start = ext4_group_first_block_no(sb, group);
371 group -= flex_gd->groups[0].group;
372
373 count2 = sb->s_blocksize * 8 - (block - start);
374 if (count2 > count)
375 count2 = count;
376
377 if (flex_gd->bg_flags[group] & EXT4_BG_BLOCK_UNINIT) {
378 BUG_ON(flex_gd->count > 1);
379 continue;
380 }
381
382 err = extend_or_restart_transaction(handle, 1);
383 if (err)
384 return err;
385
386 bh = sb_getblk(sb, flex_gd->groups[group].block_bitmap);
387 if (!bh)
388 return -EIO;
389
390 err = ext4_journal_get_write_access(handle, bh);
391 if (err)
392 return err;
393 ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n", block,
394 block - start, count2);
395 ext4_set_bits(bh->b_data, block - start, count2);
396
397 err = ext4_handle_dirty_metadata(handle, NULL, bh);
398 if (unlikely(err))
399 return err;
400 brelse(bh);
401 }
402
403 return 0;
404}
405
406/*
407 * Set up the block and inode bitmaps, and the inode table for the new groups.
183 * This doesn't need to be part of the main transaction, since we are only 408 * This doesn't need to be part of the main transaction, since we are only
184 * changing blocks outside the actual filesystem. We still do journaling to 409 * changing blocks outside the actual filesystem. We still do journaling to
185 * ensure the recovery is correct in case of a failure just after resize. 410 * ensure the recovery is correct in case of a failure just after resize.
186 * If any part of this fails, we simply abort the resize. 411 * If any part of this fails, we simply abort the resize.
412 *
 413 * setup_new_flex_group_blocks handles a flex group as follows:
 414 * 1. copy super block and GDT, and initialize group tables if necessary.
 415 * In this step, we only set bits in block bitmaps for blocks taken by
 416 * the super block and GDT.
 417 * 2. allocate group tables in block bitmaps, that is, set bits in the block
 418 * bitmaps for the blocks taken by the group tables.
187 */ 419 */
188static int setup_new_group_blocks(struct super_block *sb, 420static int setup_new_flex_group_blocks(struct super_block *sb,
189 struct ext4_new_group_data *input) 421 struct ext4_new_flex_group_data *flex_gd)
190{ 422{
423 int group_table_count[] = {1, 1, EXT4_SB(sb)->s_itb_per_group};
424 ext4_fsblk_t start;
425 ext4_fsblk_t block;
191 struct ext4_sb_info *sbi = EXT4_SB(sb); 426 struct ext4_sb_info *sbi = EXT4_SB(sb);
192 ext4_fsblk_t start = ext4_group_first_block_no(sb, input->group); 427 struct ext4_super_block *es = sbi->s_es;
193 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 428 struct ext4_new_group_data *group_data = flex_gd->groups;
194 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; 429 __u16 *bg_flags = flex_gd->bg_flags;
195 unsigned long gdblocks = ext4_bg_num_gdb(sb, input->group);
196 struct buffer_head *bh;
197 handle_t *handle; 430 handle_t *handle;
198 ext4_fsblk_t block; 431 ext4_group_t group, count;
199 ext4_grpblk_t bit; 432 struct buffer_head *bh = NULL;
200 int i; 433 int reserved_gdb, i, j, err = 0, err2;
201 int err = 0, err2; 434
435 BUG_ON(!flex_gd->count || !group_data ||
436 group_data[0].group != sbi->s_groups_count);
437
438 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
202 439
203 /* This transaction may be extended/restarted along the way */ 440 /* This transaction may be extended/restarted along the way */
204 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA); 441 handle = ext4_journal_start_sb(sb, EXT4_MAX_TRANS_DATA);
205
206 if (IS_ERR(handle)) 442 if (IS_ERR(handle))
207 return PTR_ERR(handle); 443 return PTR_ERR(handle);
208 444
209 BUG_ON(input->group != sbi->s_groups_count); 445 group = group_data[0].group;
446 for (i = 0; i < flex_gd->count; i++, group++) {
447 unsigned long gdblocks;
210 448
211 /* Copy all of the GDT blocks into the backup in this group */ 449 gdblocks = ext4_bg_num_gdb(sb, group);
212 for (i = 0, bit = 1, block = start + 1; 450 start = ext4_group_first_block_no(sb, group);
213 i < gdblocks; i++, block++, bit++) {
214 struct buffer_head *gdb;
215 451
216 ext4_debug("update backup group %#04llx (+%d)\n", block, bit); 452 /* Copy all of the GDT blocks into the backup in this group */
217 err = extend_or_restart_transaction(handle, 1); 453 for (j = 0, block = start + 1; j < gdblocks; j++, block++) {
218 if (err) 454 struct buffer_head *gdb;
219 goto exit_journal;
220 455
221 gdb = sb_getblk(sb, block); 456 ext4_debug("update backup group %#04llx\n", block);
222 if (!gdb) { 457 err = extend_or_restart_transaction(handle, 1);
223 err = -EIO; 458 if (err)
224 goto exit_journal; 459 goto out;
225 } 460
226 if ((err = ext4_journal_get_write_access(handle, gdb))) { 461 gdb = sb_getblk(sb, block);
462 if (!gdb) {
463 err = -EIO;
464 goto out;
465 }
466
467 err = ext4_journal_get_write_access(handle, gdb);
468 if (err) {
469 brelse(gdb);
470 goto out;
471 }
472 memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
473 gdb->b_size);
474 set_buffer_uptodate(gdb);
475
476 err = ext4_handle_dirty_metadata(handle, NULL, gdb);
477 if (unlikely(err)) {
478 brelse(gdb);
479 goto out;
480 }
227 brelse(gdb); 481 brelse(gdb);
228 goto exit_journal;
229 } 482 }
230 memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); 483
231 set_buffer_uptodate(gdb); 484 /* Zero out all of the reserved backup group descriptor
232 err = ext4_handle_dirty_metadata(handle, NULL, gdb); 485 * table blocks
233 if (unlikely(err)) { 486 */
234 brelse(gdb); 487 if (ext4_bg_has_super(sb, group)) {
235 goto exit_journal; 488 err = sb_issue_zeroout(sb, gdblocks + start + 1,
489 reserved_gdb, GFP_NOFS);
490 if (err)
491 goto out;
236 } 492 }
237 brelse(gdb);
238 }
239 493
 240 /* Zero out all of the reserved backup group descriptor table blocks */ 494 /* Initialize group tables of the group @group */
241 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", 495 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED))
242 block, sbi->s_itb_per_group); 496 goto handle_bb;
243 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
244 GFP_NOFS);
245 if (err)
246 goto exit_journal;
247 497
248 err = extend_or_restart_transaction(handle, 2); 498 /* Zero out all of the inode table blocks */
249 if (err) 499 block = group_data[i].inode_table;
250 goto exit_journal; 500 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
501 block, sbi->s_itb_per_group);
502 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group,
503 GFP_NOFS);
504 if (err)
505 goto out;
251 506
252 bh = bclean(handle, sb, input->block_bitmap); 507handle_bb:
253 if (IS_ERR(bh)) { 508 if (bg_flags[i] & EXT4_BG_BLOCK_UNINIT)
254 err = PTR_ERR(bh); 509 goto handle_ib;
255 goto exit_journal;
256 }
257 510
258 if (ext4_bg_has_super(sb, input->group)) { 511 /* Initialize block bitmap of the @group */
259 ext4_debug("mark backup group tables %#04llx (+0)\n", start); 512 block = group_data[i].block_bitmap;
260 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb + 1); 513 err = extend_or_restart_transaction(handle, 1);
261 } 514 if (err)
515 goto out;
262 516
263 ext4_debug("mark block bitmap %#04llx (+%llu)\n", input->block_bitmap, 517 bh = bclean(handle, sb, block);
264 input->block_bitmap - start); 518 if (IS_ERR(bh)) {
265 ext4_set_bit(input->block_bitmap - start, bh->b_data); 519 err = PTR_ERR(bh);
266 ext4_debug("mark inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, 520 goto out;
267 input->inode_bitmap - start); 521 }
268 ext4_set_bit(input->inode_bitmap - start, bh->b_data); 522 if (ext4_bg_has_super(sb, group)) {
269 523 ext4_debug("mark backup superblock %#04llx (+0)\n",
270 /* Zero out all of the inode table blocks */ 524 start);
271 block = input->inode_table; 525 ext4_set_bits(bh->b_data, 0, gdblocks + reserved_gdb +
272 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n", 526 1);
273 block, sbi->s_itb_per_group); 527 }
274 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 528 ext4_mark_bitmap_end(group_data[i].blocks_count,
275 if (err) 529 sb->s_blocksize * 8, bh->b_data);
276 goto exit_bh; 530 err = ext4_handle_dirty_metadata(handle, NULL, bh);
277 ext4_set_bits(bh->b_data, input->inode_table - start, 531 if (err)
278 sbi->s_itb_per_group); 532 goto out;
533 brelse(bh);
279 534
535handle_ib:
536 if (bg_flags[i] & EXT4_BG_INODE_UNINIT)
537 continue;
280 538
281 ext4_mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, 539 /* Initialize inode bitmap of the @group */
282 bh->b_data); 540 block = group_data[i].inode_bitmap;
283 err = ext4_handle_dirty_metadata(handle, NULL, bh); 541 err = extend_or_restart_transaction(handle, 1);
284 if (unlikely(err)) { 542 if (err)
285 ext4_std_error(sb, err); 543 goto out;
286 goto exit_bh; 544 /* Mark unused entries in inode bitmap used */
545 bh = bclean(handle, sb, block);
546 if (IS_ERR(bh)) {
547 err = PTR_ERR(bh);
548 goto out;
549 }
550
551 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb),
552 sb->s_blocksize * 8, bh->b_data);
553 err = ext4_handle_dirty_metadata(handle, NULL, bh);
554 if (err)
555 goto out;
556 brelse(bh);
287 } 557 }
288 brelse(bh); 558 bh = NULL;
289 /* Mark unused entries in inode bitmap used */ 559
290 ext4_debug("clear inode bitmap %#04llx (+%llu)\n", 560 /* Mark group tables in block bitmap */
291 input->inode_bitmap, input->inode_bitmap - start); 561 for (j = 0; j < GROUP_TABLE_COUNT; j++) {
292 if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { 562 count = group_table_count[j];
293 err = PTR_ERR(bh); 563 start = (&group_data[0].block_bitmap)[j];
294 goto exit_journal; 564 block = start;
565 for (i = 1; i < flex_gd->count; i++) {
566 block += group_table_count[j];
567 if (block == (&group_data[i].block_bitmap)[j]) {
568 count += group_table_count[j];
569 continue;
570 }
571 err = set_flexbg_block_bitmap(sb, handle,
572 flex_gd, start, count);
573 if (err)
574 goto out;
575 count = group_table_count[j];
576 start = group_data[i].block_bitmap;
577 block = start;
578 }
579
580 if (count) {
581 err = set_flexbg_block_bitmap(sb, handle,
582 flex_gd, start, count);
583 if (err)
584 goto out;
585 }
295 } 586 }
296 587
297 ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, 588out:
298 bh->b_data);
299 err = ext4_handle_dirty_metadata(handle, NULL, bh);
300 if (unlikely(err))
301 ext4_std_error(sb, err);
302exit_bh:
303 brelse(bh); 589 brelse(bh);
304 590 err2 = ext4_journal_stop(handle);
305exit_journal: 591 if (err2 && !err)
306 if ((err2 = ext4_journal_stop(handle)) && !err)
307 err = err2; 592 err = err2;
308 593
309 return err; 594 return err;
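
set_flexbg_block_bitmap() above splits a run [block, block + count) across per-group block bitmaps, clamping each chunk to the end of the current group's bitmap (one bitmap covers blocksize * 8 blocks). A standalone model of the chunking with hypothetical numbers: 4 KiB blocks, and group boundaries assumed to fall at exact multiples of 32768 (i.e. ignoring s_first_data_block):

    #include <stdio.h>

    int main(void)
    {
            const unsigned int blocks_per_group = 4096 * 8; /* 4 KiB blocksize */
            unsigned long long block = 32760;               /* straddles a boundary */
            unsigned int count = 20, count2;

            /* Mirrors the loop structure in set_flexbg_block_bitmap(). */
            for (count2 = count; count > 0; count -= count2, block += count2) {
                    unsigned long long group = block / blocks_per_group;
                    unsigned long long start = group * blocks_per_group;

                    count2 = blocks_per_group - (unsigned int)(block - start);
                    if (count2 > count)
                            count2 = count;
                    printf("group %llu: set bits [%llu, +%u)\n",
                           group, block - start, count2);
            }
            return 0;
    }

With these inputs the run is split into [32760, +8) in group 0 and [0, +12) in group 1.
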
@@ -351,10 +636,10 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
351 * groups in current filesystem that have BACKUPS, or -ve error code. 636 * groups in current filesystem that have BACKUPS, or -ve error code.
352 */ 637 */
353static int verify_reserved_gdb(struct super_block *sb, 638static int verify_reserved_gdb(struct super_block *sb,
639 ext4_group_t end,
354 struct buffer_head *primary) 640 struct buffer_head *primary)
355{ 641{
356 const ext4_fsblk_t blk = primary->b_blocknr; 642 const ext4_fsblk_t blk = primary->b_blocknr;
357 const ext4_group_t end = EXT4_SB(sb)->s_groups_count;
358 unsigned three = 1; 643 unsigned three = 1;
359 unsigned five = 5; 644 unsigned five = 5;
360 unsigned seven = 7; 645 unsigned seven = 7;
@@ -429,7 +714,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
429 if (!gdb_bh) 714 if (!gdb_bh)
430 return -EIO; 715 return -EIO;
431 716
432 gdbackups = verify_reserved_gdb(sb, gdb_bh); 717 gdbackups = verify_reserved_gdb(sb, group, gdb_bh);
433 if (gdbackups < 0) { 718 if (gdbackups < 0) {
434 err = gdbackups; 719 err = gdbackups;
435 goto exit_bh; 720 goto exit_bh;
@@ -592,7 +877,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
592 err = -EIO; 877 err = -EIO;
593 goto exit_bh; 878 goto exit_bh;
594 } 879 }
595 if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { 880 gdbackups = verify_reserved_gdb(sb, group, primary[res]);
881 if (gdbackups < 0) {
596 brelse(primary[res]); 882 brelse(primary[res]);
597 err = gdbackups; 883 err = gdbackups;
598 goto exit_bh; 884 goto exit_bh;
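
verify_reserved_gdb() now takes the group count as an explicit 'end' parameter: during a resize the backups must be checked against the group count the caller is working with, not whatever s_groups_count happens to hold at that instant. The backups it walks (via ext4_list_backups(), named in the hunk context above) live in group 1 and in every group whose number is a power of 3, 5 or 7. A small model of that enumeration:

    #include <stdio.h>

    int main(void)
    {
            unsigned end = 1024;    /* hypothetical group count ("end") */
            unsigned three = 1, five = 5, seven = 7;

            /* Emits the sparse_super backup groups below 'end' in order:
             * 1, 3, 5, 7, 9, 25, 27, 49, 81, ... */
            for (;;) {
                    unsigned *min = &three;

                    if (five < *min)
                            min = &five;
                    if (seven < *min)
                            min = &seven;
                    if (*min >= end)
                            break;
                    printf("backup superblock/GDT in group %u\n", *min);
                    *min *= (min == &three ? 3 : min == &five ? 5 : 7);
            }
            return 0;
    }
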
@@ -735,6 +1021,348 @@ exit_err:
735 } 1021 }
736} 1022}
737 1023
1024/*
 1025 * ext4_add_new_descs() adds @count group descriptors of groups
1026 * starting at @group
1027 *
1028 * @handle: journal handle
1029 * @sb: super block
1030 * @group: the group no. of the first group desc to be added
1031 * @resize_inode: the resize inode
1032 * @count: number of group descriptors to be added
1033 */
1034static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
1035 ext4_group_t group, struct inode *resize_inode,
1036 ext4_group_t count)
1037{
1038 struct ext4_sb_info *sbi = EXT4_SB(sb);
1039 struct ext4_super_block *es = sbi->s_es;
1040 struct buffer_head *gdb_bh;
1041 int i, gdb_off, gdb_num, err = 0;
1042
1043 for (i = 0; i < count; i++, group++) {
1044 int reserved_gdb = ext4_bg_has_super(sb, group) ?
1045 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
1046
1047 gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
1048 gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
1049
1050 /*
1051 * We will only either add reserved group blocks to a backup group
1052 * or remove reserved blocks for the first group in a new group block.
 1053 * Doing both would mean more complex code, and sane people don't
1054 * use non-sparse filesystems anymore. This is already checked above.
1055 */
1056 if (gdb_off) {
1057 gdb_bh = sbi->s_group_desc[gdb_num];
1058 err = ext4_journal_get_write_access(handle, gdb_bh);
1059
1060 if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
1061 err = reserve_backup_gdb(handle, resize_inode, group);
1062 } else
1063 err = add_new_gdb(handle, resize_inode, group);
1064 if (err)
1065 break;
1066 }
1067 return err;
1068}
1069
1070/*
 1071 * ext4_setup_new_descs() will set up the group descriptors of a flex bg
1072 */
1073static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
1074 struct ext4_new_flex_group_data *flex_gd)
1075{
1076 struct ext4_new_group_data *group_data = flex_gd->groups;
1077 struct ext4_group_desc *gdp;
1078 struct ext4_sb_info *sbi = EXT4_SB(sb);
1079 struct buffer_head *gdb_bh;
1080 ext4_group_t group;
1081 __u16 *bg_flags = flex_gd->bg_flags;
1082 int i, gdb_off, gdb_num, err = 0;
1083
1084
1085 for (i = 0; i < flex_gd->count; i++, group_data++, bg_flags++) {
1086 group = group_data->group;
1087
1088 gdb_off = group % EXT4_DESC_PER_BLOCK(sb);
1089 gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
1090
1091 /*
1092 * get_write_access() has been called on gdb_bh by ext4_add_new_desc().
1093 */
1094 gdb_bh = sbi->s_group_desc[gdb_num];
1095 /* Update group descriptor block for new group */
1096 gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
1097 gdb_off * EXT4_DESC_SIZE(sb));
1098
1099 memset(gdp, 0, EXT4_DESC_SIZE(sb));
1100 ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
1101 ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
1102 ext4_inode_table_set(sb, gdp, group_data->inode_table);
1103 ext4_free_group_clusters_set(sb, gdp,
1104 EXT4_B2C(sbi, group_data->free_blocks_count));
1105 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
1106 gdp->bg_flags = cpu_to_le16(*bg_flags);
1107 gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
1108
1109 err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
1110 if (unlikely(err)) {
1111 ext4_std_error(sb, err);
1112 break;
1113 }
1114
1115 /*
1116 * We can allocate memory for mb_alloc based on the new group
1117 * descriptor
1118 */
1119 err = ext4_mb_add_groupinfo(sb, group, gdp);
1120 if (err)
1121 break;
1122 }
1123 return err;
1124}
1125
1126/*
1127 * ext4_update_super() updates the super block so that the newly added
1128 * groups can be seen by the filesystem.
1129 *
1130 * @sb: super block
1131 * @flex_gd: new added groups
1132 */
1133static void ext4_update_super(struct super_block *sb,
1134 struct ext4_new_flex_group_data *flex_gd)
1135{
1136 ext4_fsblk_t blocks_count = 0;
1137 ext4_fsblk_t free_blocks = 0;
1138 ext4_fsblk_t reserved_blocks = 0;
1139 struct ext4_new_group_data *group_data = flex_gd->groups;
1140 struct ext4_sb_info *sbi = EXT4_SB(sb);
1141 struct ext4_super_block *es = sbi->s_es;
1142 int i;
1143
1144 BUG_ON(flex_gd->count == 0 || group_data == NULL);
1145 /*
1146 * Make the new blocks and inodes valid next. We do this before
1147 * increasing the group count so that once the group is enabled,
1148 * all of its blocks and inodes are already valid.
1149 *
1150 * We always allocate group-by-group, then block-by-block or
1151 * inode-by-inode within a group, so enabling these
1152 * blocks/inodes before the group is live won't actually let us
1153 * allocate the new space yet.
1154 */
1155 for (i = 0; i < flex_gd->count; i++) {
1156 blocks_count += group_data[i].blocks_count;
1157 free_blocks += group_data[i].free_blocks_count;
1158 }
1159
1160 reserved_blocks = ext4_r_blocks_count(es) * 100;
1161 do_div(reserved_blocks, ext4_blocks_count(es));
1162 reserved_blocks *= blocks_count;
1163 do_div(reserved_blocks, 100);
1164
1165 ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
1166 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
1167 flex_gd->count);
1168
1169 /*
1170 * We need to protect s_groups_count against other CPUs seeing
1171 * inconsistent state in the superblock.
1172 *
1173 * The precise rules we use are:
1174 *
1175 * * Writers must perform a smp_wmb() after updating all
1176 * dependent data and before modifying the groups count
1177 *
1178 * * Readers must perform an smp_rmb() after reading the groups
1179 * count and before reading any dependent data.
1180 *
1181 * NB. These rules can be relaxed when checking the group count
1182 * while freeing data, as we can only allocate from a block
1183 * group after serialising against the group count, and we can
1184 * only then free after serialising in turn against that
1185 * allocation.
1186 */
1187 smp_wmb();
1188
1189 /* Update the global fs size fields */
1190 sbi->s_groups_count += flex_gd->count;
1191
1192 /* Update the reserved block counts only once the new group is
1193 * active. */
1194 ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
1195 reserved_blocks);
1196
1197 /* Update the free space counts */
1198 percpu_counter_add(&sbi->s_freeclusters_counter,
1199 EXT4_B2C(sbi, free_blocks));
1200 percpu_counter_add(&sbi->s_freeinodes_counter,
1201 EXT4_INODES_PER_GROUP(sb) * flex_gd->count);
1202
1203 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
1204 EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
1205 sbi->s_log_groups_per_flex) {
1206 ext4_group_t flex_group;
1207 flex_group = ext4_flex_group(sbi, group_data[0].group);
1208 atomic_add(EXT4_B2C(sbi, free_blocks),
1209 &sbi->s_flex_groups[flex_group].free_clusters);
1210 atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
1211 &sbi->s_flex_groups[flex_group].free_inodes);
1212 }
1213
1214 if (test_opt(sb, DEBUG))
1215 printk(KERN_DEBUG "EXT4-fs: added group %u:"
1216 "%llu blocks(%llu free %llu reserved)\n", flex_gd->count,
1217 blocks_count, free_blocks, reserved_blocks);
1218}
1219
1220/* Add a flex group to an fs. Ensure we handle all possible error conditions
1221 * _before_ we start modifying the filesystem, because we cannot abort the
1222 * transaction and not have it write the data to disk.
1223 */
1224static int ext4_flex_group_add(struct super_block *sb,
1225 struct inode *resize_inode,
1226 struct ext4_new_flex_group_data *flex_gd)
1227{
1228 struct ext4_sb_info *sbi = EXT4_SB(sb);
1229 struct ext4_super_block *es = sbi->s_es;
1230 ext4_fsblk_t o_blocks_count;
1231 ext4_grpblk_t last;
1232 ext4_group_t group;
1233 handle_t *handle;
1234 unsigned reserved_gdb;
1235 int err = 0, err2 = 0, credit;
1236
1237 BUG_ON(!flex_gd->count || !flex_gd->groups || !flex_gd->bg_flags);
1238
1239 reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
1240 o_blocks_count = ext4_blocks_count(es);
1241 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1242 BUG_ON(last);
1243
1244 err = setup_new_flex_group_blocks(sb, flex_gd);
1245 if (err)
1246 goto exit;
1247 /*
1248 * We will always be modifying at least the superblock and GDT
1249 * block. If we are adding a group past the last current GDT block,
1250 * we will also modify the inode and the dindirect block. If we
1251 * are adding a group with superblock/GDT backups we will also
1252 * modify each of the reserved GDT dindirect blocks.
1253 */
1254 credit = flex_gd->count * 4 + reserved_gdb;
1255 handle = ext4_journal_start_sb(sb, credit);
1256 if (IS_ERR(handle)) {
1257 err = PTR_ERR(handle);
1258 goto exit;
1259 }
1260
1261 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
1262 if (err)
1263 goto exit_journal;
1264
1265 group = flex_gd->groups[0].group;
1266 BUG_ON(group != EXT4_SB(sb)->s_groups_count);
1267 err = ext4_add_new_descs(handle, sb, group,
1268 resize_inode, flex_gd->count);
1269 if (err)
1270 goto exit_journal;
1271
1272 err = ext4_setup_new_descs(handle, sb, flex_gd);
1273 if (err)
1274 goto exit_journal;
1275
1276 ext4_update_super(sb, flex_gd);
1277
1278 err = ext4_handle_dirty_super(handle, sb);
1279
1280exit_journal:
1281 err2 = ext4_journal_stop(handle);
1282 if (!err)
1283 err = err2;
1284
1285 if (!err) {
1286 int i;
1287 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
1288 sizeof(struct ext4_super_block));
1289 for (i = 0; i < flex_gd->count; i++, group++) {
1290 struct buffer_head *gdb_bh;
1291 int gdb_num;
1292 gdb_num = group / EXT4_BLOCKS_PER_GROUP(sb);
1293 gdb_bh = sbi->s_group_desc[gdb_num];
1294 update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
1295 gdb_bh->b_size);
1296 }
1297 }
1298exit:
1299 return err;
1300}
1301
1302static int ext4_setup_next_flex_gd(struct super_block *sb,
1303 struct ext4_new_flex_group_data *flex_gd,
1304 ext4_fsblk_t n_blocks_count,
1305 unsigned long flexbg_size)
1306{
1307 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1308 struct ext4_new_group_data *group_data = flex_gd->groups;
1309 ext4_fsblk_t o_blocks_count;
1310 ext4_group_t n_group;
1311 ext4_group_t group;
1312 ext4_group_t last_group;
1313 ext4_grpblk_t last;
1314 ext4_grpblk_t blocks_per_group;
1315 unsigned long i;
1316
1317 blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
1318
1319 o_blocks_count = ext4_blocks_count(es);
1320
1321 if (o_blocks_count == n_blocks_count)
1322 return 0;
1323
1324 ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
1325 BUG_ON(last);
1326 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
1327
1328 last_group = group | (flexbg_size - 1);
1329 if (last_group > n_group)
1330 last_group = n_group;
1331
1332 flex_gd->count = last_group - group + 1;
1333
1334 for (i = 0; i < flex_gd->count; i++) {
1335 int overhead;
1336
1337 group_data[i].group = group + i;
1338 group_data[i].blocks_count = blocks_per_group;
1339 overhead = ext4_bg_has_super(sb, group + i) ?
1340 (1 + ext4_bg_num_gdb(sb, group + i) +
1341 le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
1342 group_data[i].free_blocks_count = blocks_per_group - overhead;
1343 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
1344 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
1345 flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
1346 EXT4_BG_INODE_UNINIT;
1347 else
1348 flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
1349 }
1350
1351 if (last_group == n_group &&
1352 EXT4_HAS_RO_COMPAT_FEATURE(sb,
1353 EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
1354 /* We need to initialize block bitmap of last group. */
1355 flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
1356
1357 if ((last_group == n_group) && (last != blocks_per_group - 1)) {
1358 group_data[i - 1].blocks_count = last + 1;
1359 group_data[i - 1].free_blocks_count -= blocks_per_group-
1360 last - 1;
1361 }
1362
1363 return 1;
1364}
1365
738/* Add group descriptor data to an existing or new group descriptor block. 1366/* Add group descriptor data to an existing or new group descriptor block.
739 * Ensure we handle all possible error conditions _before_ we start modifying 1367 * Ensure we handle all possible error conditions _before_ we start modifying
740 * the filesystem, because we cannot abort the transaction and not have it 1368 * the filesystem, because we cannot abort the transaction and not have it
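
The helpers added in this hunk compose into the main loop of ext4_resize_fs(), whose body begins at the end of this diff (and is truncated there): the range [o_blocks_count, n_blocks_count) is carved into flex-group-sized chunks which are added one chunk at a time. In outline (a sketch, not a verbatim excerpt):

    flex_gd = alloc_flex_gd(flexbg_size);
    if (flex_gd == NULL) {
            err = -ENOMEM;
            goto out;
    }

    /* Add one flex group per iteration until the fs reaches
     * n_blocks_count; ext4_setup_next_flex_gd() returns 0 once
     * there is nothing left to add. */
    while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
                                   flexbg_size)) {
            ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
            err = ext4_flex_group_add(sb, resize_inode, flex_gd);
            if (err)
                    break;
    }

    if (flex_gd)
            free_flex_gd(flex_gd);
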
@@ -750,16 +1378,15 @@ exit_err:
750 */ 1378 */
751int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) 1379int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
752{ 1380{
1381 struct ext4_new_flex_group_data flex_gd;
753 struct ext4_sb_info *sbi = EXT4_SB(sb); 1382 struct ext4_sb_info *sbi = EXT4_SB(sb);
754 struct ext4_super_block *es = sbi->s_es; 1383 struct ext4_super_block *es = sbi->s_es;
755 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 1384 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
756 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1385 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
757 struct buffer_head *primary = NULL;
758 struct ext4_group_desc *gdp;
759 struct inode *inode = NULL; 1386 struct inode *inode = NULL;
760 handle_t *handle;
761 int gdb_off, gdb_num; 1387 int gdb_off, gdb_num;
762 int err, err2; 1388 int err;
1389 __u16 bg_flags = 0;
763 1390
764 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); 1391 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
765 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); 1392 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
@@ -798,175 +1425,69 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
798 } 1425 }
799 1426
800 1427
801 if ((err = verify_group_input(sb, input))) 1428 err = verify_group_input(sb, input);
802 goto exit_put; 1429 if (err)
1430 goto out;
803 1431
804 if ((err = setup_new_group_blocks(sb, input))) 1432 flex_gd.count = 1;
805 goto exit_put; 1433 flex_gd.groups = input;
1434 flex_gd.bg_flags = &bg_flags;
1435 err = ext4_flex_group_add(sb, inode, &flex_gd);
1436out:
1437 iput(inode);
1438 return err;
1439} /* ext4_group_add */
806 1440
807 /* 1441/*
808 * We will always be modifying at least the superblock and a GDT 1442 * extend a group without checking assuming that checking has been done.
809 * block. If we are adding a group past the last current GDT block, 1443 */
810 * we will also modify the inode and the dindirect block. If we 1444static int ext4_group_extend_no_check(struct super_block *sb,
811 * are adding a group with superblock/GDT backups we will also 1445 ext4_fsblk_t o_blocks_count, ext4_grpblk_t add)
812 * modify each of the reserved GDT dindirect blocks. 1446{
1447 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
1448 handle_t *handle;
1449 int err = 0, err2;
1450
1451 /* We will update the superblock, one block bitmap, and
1452 * one group descriptor via ext4_group_add_blocks().
813 */ 1453 */
814 handle = ext4_journal_start_sb(sb, 1454 handle = ext4_journal_start_sb(sb, 3);
815 ext4_bg_has_super(sb, input->group) ?
816 3 + reserved_gdb : 4);
817 if (IS_ERR(handle)) { 1455 if (IS_ERR(handle)) {
818 err = PTR_ERR(handle); 1456 err = PTR_ERR(handle);
819 goto exit_put; 1457 ext4_warning(sb, "error %d on journal start", err);
1458 return err;
820 } 1459 }
821 1460
822 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) 1461 err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
823 goto exit_journal; 1462 if (err) {
824 1463 ext4_warning(sb, "error %d on journal write access", err);
825 /* 1464 goto errout;
826 * We will only either add reserved group blocks to a backup group
827 * or remove reserved blocks for the first group in a new group block.
 828 * Doing both would mean more complex code, and sane people don't
829 * use non-sparse filesystems anymore. This is already checked above.
830 */
831 if (gdb_off) {
832 primary = sbi->s_group_desc[gdb_num];
833 if ((err = ext4_journal_get_write_access(handle, primary)))
834 goto exit_journal;
835
836 if (reserved_gdb && ext4_bg_num_gdb(sb, input->group)) {
837 err = reserve_backup_gdb(handle, inode, input->group);
838 if (err)
839 goto exit_journal;
840 }
841 } else {
842 /*
843 * Note that we can access new group descriptor block safely
844 * only if add_new_gdb() succeeds.
845 */
846 err = add_new_gdb(handle, inode, input->group);
847 if (err)
848 goto exit_journal;
849 primary = sbi->s_group_desc[gdb_num];
850 } 1465 }
851 1466
852 /* 1467 ext4_blocks_count_set(es, o_blocks_count + add);
853 * OK, now we've set up the new group. Time to make it active. 1468 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
854 * 1469 o_blocks_count + add);
855 * so we have to be safe wrt. concurrent accesses the group 1470 /* We add the blocks to the bitmap and set the group need init bit */
856 * data. So we need to be careful to set all of the relevant 1471 err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
857 * group descriptor data etc. *before* we enable the group.
858 *
859 * The key field here is sbi->s_groups_count: as long as
860 * that retains its old value, nobody is going to access the new
861 * group.
862 *
863 * So first we update all the descriptor metadata for the new
864 * group; then we update the total disk blocks count; then we
865 * update the groups count to enable the group; then finally we
866 * update the free space counts so that the system can start
867 * using the new disk blocks.
868 */
869
870 /* Update group descriptor block for new group */
871 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
872 gdb_off * EXT4_DESC_SIZE(sb));
873
874 memset(gdp, 0, EXT4_DESC_SIZE(sb));
875 ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
876 ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
877 ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */
878 ext4_free_group_clusters_set(sb, gdp, input->free_blocks_count);
879 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
880 gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED);
881 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
882
883 /*
884 * We can allocate memory for mb_alloc based on the new group
885 * descriptor
886 */
887 err = ext4_mb_add_groupinfo(sb, input->group, gdp);
888 if (err) 1472 if (err)
889 goto exit_journal; 1473 goto errout;
890
891 /*
892 * Make the new blocks and inodes valid next. We do this before
893 * increasing the group count so that once the group is enabled,
894 * all of its blocks and inodes are already valid.
895 *
896 * We always allocate group-by-group, then block-by-block or
897 * inode-by-inode within a group, so enabling these
898 * blocks/inodes before the group is live won't actually let us
899 * allocate the new space yet.
900 */
901 ext4_blocks_count_set(es, ext4_blocks_count(es) +
902 input->blocks_count);
903 le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb));
904
905 /*
906 * We need to protect s_groups_count against other CPUs seeing
907 * inconsistent state in the superblock.
908 *
909 * The precise rules we use are:
910 *
911 * * Writers must perform a smp_wmb() after updating all dependent
912 * data and before modifying the groups count
913 *
914 * * Readers must perform an smp_rmb() after reading the groups count
915 * and before reading any dependent data.
916 *
917 * NB. These rules can be relaxed when checking the group count
918 * while freeing data, as we can only allocate from a block
919 * group after serialising against the group count, and we can
920 * only then free after serialising in turn against that
921 * allocation.
922 */
923 smp_wmb();
924
925 /* Update the global fs size fields */
926 sbi->s_groups_count++;
927
928 err = ext4_handle_dirty_metadata(handle, NULL, primary);
929 if (unlikely(err)) {
930 ext4_std_error(sb, err);
931 goto exit_journal;
932 }
933
934 /* Update the reserved block counts only once the new group is
935 * active. */
936 ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) +
937 input->reserved_blocks);
938
939 /* Update the free space counts */
940 percpu_counter_add(&sbi->s_freeclusters_counter,
941 EXT4_B2C(sbi, input->free_blocks_count));
942 percpu_counter_add(&sbi->s_freeinodes_counter,
943 EXT4_INODES_PER_GROUP(sb));
944
945 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
946 sbi->s_log_groups_per_flex) {
947 ext4_group_t flex_group;
948 flex_group = ext4_flex_group(sbi, input->group);
949 atomic_add(EXT4_B2C(sbi, input->free_blocks_count),
950 &sbi->s_flex_groups[flex_group].free_clusters);
951 atomic_add(EXT4_INODES_PER_GROUP(sb),
952 &sbi->s_flex_groups[flex_group].free_inodes);
953 }
954
955 ext4_handle_dirty_super(handle, sb); 1474 ext4_handle_dirty_super(handle, sb);
956 1475 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1476 o_blocks_count + add);
957exit_journal: 1477errout:
958 if ((err2 = ext4_journal_stop(handle)) && !err) 1478 err2 = ext4_journal_stop(handle);
1479 if (err2 && !err)
959 err = err2; 1480 err = err2;
1481
960 if (!err && primary) { 1482 if (!err) {
1483 if (test_opt(sb, DEBUG))
1484 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1485 "blocks\n", ext4_blocks_count(es));
961 update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, 1486 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
962 sizeof(struct ext4_super_block)); 1487 sizeof(struct ext4_super_block));
963 update_backups(sb, primary->b_blocknr, primary->b_data,
964 primary->b_size);
965 } 1488 }
966exit_put:
967 iput(inode);
968 return err; 1489 return err;
969} /* ext4_group_add */ 1490}
970 1491
971/* 1492/*
972 * Extend the filesystem to the new number of blocks specified. This entry 1493 * Extend the filesystem to the new number of blocks specified. This entry
@@ -985,8 +1506,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
985 ext4_grpblk_t last; 1506 ext4_grpblk_t last;
986 ext4_grpblk_t add; 1507 ext4_grpblk_t add;
987 struct buffer_head *bh; 1508 struct buffer_head *bh;
988 handle_t *handle; 1509 int err;
989 int err, err2;
990 ext4_group_t group; 1510 ext4_group_t group;
991 1511
992 o_blocks_count = ext4_blocks_count(es); 1512 o_blocks_count = ext4_blocks_count(es);
@@ -1042,42 +1562,119 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
1042 } 1562 }
1043 brelse(bh); 1563 brelse(bh);
1044 1564
1045 /* We will update the superblock, one block bitmap, and 1565 err = ext4_group_extend_no_check(sb, o_blocks_count, add);
1046 * one group descriptor via ext4_free_blocks(). 1566 return err;
1047 */ 1567} /* ext4_group_extend */
1048 handle = ext4_journal_start_sb(sb, 3); 1568
1049 if (IS_ERR(handle)) { 1569/*
1050 err = PTR_ERR(handle); 1570 * ext4_resize_fs() resizes a fs to a new size specified by @n_blocks_count
1051 ext4_warning(sb, "error %d on journal start", err); 1571 *
1052 goto exit_put; 1572 * @sb: super block of the fs to be resized
1573 * @n_blocks_count: the number of blocks in the resized fs
1574 */
1575int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
1576{
1577 struct ext4_new_flex_group_data *flex_gd = NULL;
1578 struct ext4_sb_info *sbi = EXT4_SB(sb);
1579 struct ext4_super_block *es = sbi->s_es;
1580 struct buffer_head *bh;
1581 struct inode *resize_inode;
1582 ext4_fsblk_t o_blocks_count;
1583 ext4_group_t o_group;
1584 ext4_group_t n_group;
1585 ext4_grpblk_t offset;
1586 unsigned long n_desc_blocks;
1587 unsigned long o_desc_blocks;
1588 unsigned long desc_blocks;
1589 int err = 0, flexbg_size = 1;
1590
1591 o_blocks_count = ext4_blocks_count(es);
1592
1593 if (test_opt(sb, DEBUG))
1594 printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu "
1595 "upto %llu blocks\n", o_blocks_count, n_blocks_count);
1596
1597 if (n_blocks_count < o_blocks_count) {
1598 /* On-line shrinking not supported */
1599 ext4_warning(sb, "can't shrink FS - resize aborted");
1600 return -EINVAL;
1053 } 1601 }
1054 1602
1055 if ((err = ext4_journal_get_write_access(handle, 1603 if (n_blocks_count == o_blocks_count)
1056 EXT4_SB(sb)->s_sbh))) { 1604 /* Nothing to do */
1057 ext4_warning(sb, "error %d on journal write access", err); 1605 return 0;
1058 ext4_journal_stop(handle); 1606
1059 goto exit_put; 1607 ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
1608 ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset);
1609
1610 n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
1611 EXT4_DESC_PER_BLOCK(sb);
1612 o_desc_blocks = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
1613 EXT4_DESC_PER_BLOCK(sb);
1614 desc_blocks = n_desc_blocks - o_desc_blocks;
1615
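Both expressions are the usual round-up division idiom (equivalent to DIV_ROUND_UP). A worked example with illustrative numbers:

	/* With 4 KiB blocks and 32-byte descriptors, EXT4_DESC_PER_BLOCK(sb)
	 * is 128.  Growing from 100 groups to 2000 groups (n_group == 1999):
	 *   n_desc_blocks = (1999 + 128) / 128      == 16
	 *   o_desc_blocks = (100 + 128 - 1) / 128   ==  1
	 *   desc_blocks   = 16 - 1                  == 15
	 * and those 15 extra GDT blocks must fit in the reserved
	 * s_reserved_gdt_blocks checked just below. */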
1616 if (desc_blocks &&
1617 (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE) ||
1618 le16_to_cpu(es->s_reserved_gdt_blocks) < desc_blocks)) {
1619 ext4_warning(sb, "No reserved GDT blocks, can't resize");
1620 return -EPERM;
1060 } 1621 }
1061 ext4_blocks_count_set(es, o_blocks_count + add);
1062 ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
1063 o_blocks_count + add);
1064 /* We add the blocks to the bitmap and set the group need init bit */
1065 err = ext4_group_add_blocks(handle, sb, o_blocks_count, add);
1066 ext4_handle_dirty_super(handle, sb);
1067 ext4_debug("freed blocks %llu through %llu\n", o_blocks_count,
1068 o_blocks_count + add);
1069 err2 = ext4_journal_stop(handle);
1070 if (!err && err2)
1071 err = err2;
1072 1622
1073 if (err) 1623 resize_inode = ext4_iget(sb, EXT4_RESIZE_INO);
1074 goto exit_put; 1624 if (IS_ERR(resize_inode)) {
1625 ext4_warning(sb, "Error opening resize inode");
1626 return PTR_ERR(resize_inode);
1627 }
1075 1628
1629 /* See if the device is actually as big as what was requested */
1630 bh = sb_bread(sb, n_blocks_count - 1);
1631 if (!bh) {
1632 ext4_warning(sb, "can't read last block, resize aborted");
1633 return -ENOSPC;
1634 }
1635 brelse(bh);
1636
1637 if (offset != 0) {
1638 /* extend the last group */
1639 ext4_grpblk_t add;
1640 add = EXT4_BLOCKS_PER_GROUP(sb) - offset;
1641 err = ext4_group_extend_no_check(sb, o_blocks_count, add);
1642 if (err)
1643 goto out;
1644 }
1645
1646 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
1647 es->s_log_groups_per_flex)
1648 flexbg_size = 1 << es->s_log_groups_per_flex;
1649
1650 o_blocks_count = ext4_blocks_count(es);
1651 if (o_blocks_count == n_blocks_count)
1652 goto out;
1653
1654 flex_gd = alloc_flex_gd(flexbg_size);
1655 if (flex_gd == NULL) {
1656 err = -ENOMEM;
1657 goto out;
1658 }
1659
1660 /* Add flex groups. Note that a regular group is a
1661 * flex group with 1 group.
1662 */
1663 while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
1664 flexbg_size)) {
1665 ext4_alloc_group_tables(sb, flex_gd, flexbg_size);
1666 err = ext4_flex_group_add(sb, resize_inode, flex_gd);
1667 if (unlikely(err))
1668 break;
1669 }
1670
1671out:
1672 if (flex_gd)
1673 free_flex_gd(flex_gd);
1674
1675 iput(resize_inode);
1076 if (test_opt(sb, DEBUG)) 1676 if (test_opt(sb, DEBUG))
1077 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", 1677 printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu "
1078 ext4_blocks_count(es)); 1678 "up to %llu blocks\n", o_blocks_count, n_blocks_count);
1079 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr, (char *)es,
1080 sizeof(struct ext4_super_block));
1081exit_put:
1082 return err; 1679 return err;
1083} /* ext4_group_extend */ 1680}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 64e2529ae9bb..502c61fd7392 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1095,7 +1095,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root)
1095 } 1095 }
1096 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { 1096 if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
1097 seq_printf(seq, ",max_batch_time=%u", 1097 seq_printf(seq, ",max_batch_time=%u",
1098 (unsigned) sbi->s_min_batch_time); 1098 (unsigned) sbi->s_max_batch_time);
1099 } 1099 }
1100 1100
1101 /* 1101 /*
@@ -2005,17 +2005,16 @@ static int ext4_fill_flex_info(struct super_block *sb)
2005 struct ext4_group_desc *gdp = NULL; 2005 struct ext4_group_desc *gdp = NULL;
2006 ext4_group_t flex_group_count; 2006 ext4_group_t flex_group_count;
2007 ext4_group_t flex_group; 2007 ext4_group_t flex_group;
2008 int groups_per_flex = 0; 2008 unsigned int groups_per_flex = 0;
2009 size_t size; 2009 size_t size;
2010 int i; 2010 int i;
2011 2011
2012 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 2012 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
2013 groups_per_flex = 1 << sbi->s_log_groups_per_flex; 2013 if (sbi->s_log_groups_per_flex < 1 || sbi->s_log_groups_per_flex > 31) {
2014
2015 if (groups_per_flex < 2) {
2016 sbi->s_log_groups_per_flex = 0; 2014 sbi->s_log_groups_per_flex = 0;
2017 return 1; 2015 return 1;
2018 } 2016 }
2017 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
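The new [1, 31] bound matters because s_log_groups_per_flex is read straight from the on-disk superblock, and in C a 32-bit shift by 32 or more is undefined behaviour, so the old code could turn a corrupted value into a bogus groups_per_flex. A sketch of the failure mode:

	unsigned int log = 36;		/* corrupted on-disk value */
	unsigned int gpf = 1 << log;	/* undefined behaviour; x86 masks the
					   shift count, yielding 1 << 4 == 16 */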
2019 2018
2020 /* We allocate both existing and potentially added groups */ 2019 /* We allocate both existing and potentially added groups */
2021 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 2020 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
@@ -3506,7 +3505,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3506 * of the filesystem. 3505 * of the filesystem.
3507 */ 3506 */
3508 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { 3507 if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
3509 ext4_msg(sb, KERN_WARNING, "bad geometry: first data" 3508 ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
3510 "block %u is beyond end of filesystem (%llu)", 3509 "block %u is beyond end of filesystem (%llu)",
3511 le32_to_cpu(es->s_first_data_block), 3510 le32_to_cpu(es->s_first_data_block),
3512 ext4_blocks_count(es)); 3511 ext4_blocks_count(es));
@@ -3733,10 +3732,12 @@ no_journal:
3733 } 3732 }
3734 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 3733 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3735 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 3734 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3735 iput(root);
3736 goto failed_mount4; 3736 goto failed_mount4;
3737 } 3737 }
3738 sb->s_root = d_alloc_root(root); 3738 sb->s_root = d_alloc_root(root);
3739 if (!sb->s_root) { 3739 if (!sb->s_root) {
3740 iput(root);
3740 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 3741 ext4_msg(sb, KERN_ERR, "get root dentry failed");
3741 ret = -ENOMEM; 3742 ret = -ENOMEM;
3742 goto failed_mount4; 3743 goto failed_mount4;
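Both added iput(root) calls plug the same leak: ext4_iget() returns a referenced inode, and d_alloc_root() consumes that reference only on success, so once the reworked labels below stop doing iput(root) at failed_mount4, each failure before s_root is set has to drop the reference itself (later failures go through failed_mount4a, which releases it via dput(sb->s_root)).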
@@ -3773,7 +3774,7 @@ no_journal:
3773 if (err) { 3774 if (err) {
3774 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3775 ext4_msg(sb, KERN_ERR, "failed to initialize system "
3775 "zone (%d)", err); 3776 "zone (%d)", err);
3776 goto failed_mount4; 3777 goto failed_mount4a;
3777 } 3778 }
3778 3779
3779 ext4_ext_init(sb); 3780 ext4_ext_init(sb);
@@ -3830,13 +3831,14 @@ cantfind_ext4:
3830failed_mount7: 3831failed_mount7:
3831 ext4_unregister_li_request(sb); 3832 ext4_unregister_li_request(sb);
3832failed_mount6: 3833failed_mount6:
3833 ext4_ext_release(sb);
3834failed_mount5:
3835 ext4_mb_release(sb); 3834 ext4_mb_release(sb);
3835failed_mount5:
3836 ext4_ext_release(sb);
3836 ext4_release_system_zone(sb); 3837 ext4_release_system_zone(sb);
3837failed_mount4: 3838failed_mount4a:
3838 iput(root); 3839 dput(sb->s_root);
3839 sb->s_root = NULL; 3840 sb->s_root = NULL;
3841failed_mount4:
3840 ext4_msg(sb, KERN_ERR, "mount failed"); 3842 ext4_msg(sb, KERN_ERR, "mount failed");
3841 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3843 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3842failed_mount_wq: 3844failed_mount_wq:
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 34e4350dd4d9..d2a200624af5 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -3,7 +3,6 @@
3 * Handler for storing security labels as extended attributes. 3 * Handler for storing security labels as extended attributes.
4 */ 4 */
5 5
6#include <linux/module.h>
7#include <linux/string.h> 6#include <linux/string.h>
8#include <linux/fs.h> 7#include <linux/fs.h>
9#include <linux/security.h> 8#include <linux/security.h>
@@ -48,8 +47,9 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
48 name, value, size, flags); 47 name, value, size, flags);
49} 48}
50 49
51int ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array, 50static int
52 void *fs_info) 51ext4_initxattrs(struct inode *inode, const struct xattr *xattr_array,
52 void *fs_info)
53{ 53{
54 const struct xattr *xattr; 54 const struct xattr *xattr;
55 handle_t *handle = fs_info; 55 handle_t *handle = fs_info;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 37e6ebca2cc3..95f1f4ab59a4 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2003 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/capability.h> 9#include <linux/capability.h>
11#include <linux/fs.h> 10#include <linux/fs.h>
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 98c375352d0e..0edb7611ffbe 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -5,7 +5,6 @@
5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org> 5 * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
6 */ 6 */
7 7
8#include <linux/module.h>
9#include <linux/string.h> 8#include <linux/string.h>
10#include <linux/fs.h> 9#include <linux/fs.h>
11#include "ext4_jbd2.h" 10#include "ext4_jbd2.h"
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 3a444b4e2368..a81eb2367d39 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -512,7 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
512 int charlen; 512 int charlen;
513 513
514 if (utf8) { 514 if (utf8) {
515 *outlen = utf8s_to_utf16s(name, len, (wchar_t *)outname); 515 *outlen = utf8s_to_utf16s(name, len, UTF16_HOST_ENDIAN,
516 (wchar_t *) outname, FAT_LFN_LEN + 2);
516 if (*outlen < 0) 517 if (*outlen < 0)
517 return *outlen; 518 return *outlen;
518 else if (*outlen > FAT_LFN_LEN) 519 else if (*outlen > FAT_LFN_LEN)
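The two new arguments make the conversion explicit about byte order and bound the output, so an over-long UTF-8 name can no longer overrun outname; FAT_LFN_LEN + 2 leaves enough slack to detect the over-length case checked just below. Judging from this call site, the updated prototype is along these lines (a reconstruction, not quoted from the NLS header):

	int utf8s_to_utf16s(const u8 *s, int len, enum utf16_endian endian,
			    wchar_t *pwcs, int maxout);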
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e2951506434d..f855916657ba 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/mm.h> 22#include <linux/mm.h>
23#include <linux/pagemap.h>
23#include <linux/kthread.h> 24#include <linux/kthread.h>
24#include <linux/freezer.h> 25#include <linux/freezer.h>
25#include <linux/writeback.h> 26#include <linux/writeback.h>
@@ -29,6 +30,11 @@
29#include "internal.h" 30#include "internal.h"
30 31
31/* 32/*
33 * 4MB minimal write chunk size
34 */
35#define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10))
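The constant is 4 MiB expressed in pages: 4096UL is the chunk size in KiB, and the right shift by (PAGE_CACHE_SHIFT - 10) converts KiB to pages. For example:

	/* With 4 KiB pages, PAGE_CACHE_SHIFT == 12, so
	 * MIN_WRITEBACK_PAGES = 4096 >> (12 - 10) = 1024 pages = 4 MiB. */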
36
37/*
32 * Passed into wb_writeback(), essentially a subset of writeback_control 38 * Passed into wb_writeback(), essentially a subset of writeback_control
33 */ 39 */
34struct wb_writeback_work { 40struct wb_writeback_work {
@@ -742,11 +748,17 @@ static long wb_writeback(struct bdi_writeback *wb,
742 if (work->for_background && !over_bground_thresh(wb->bdi)) 748 if (work->for_background && !over_bground_thresh(wb->bdi))
743 break; 749 break;
744 750
751 /*
752 * Kupdate and background works are special and we want to
753 * include all inodes that need writing. Livelock avoidance is
754 * handled by these works yielding to any other work so we are
755 * safe.
756 */
745 if (work->for_kupdate) { 757 if (work->for_kupdate) {
746 oldest_jif = jiffies - 758 oldest_jif = jiffies -
747 msecs_to_jiffies(dirty_expire_interval * 10); 759 msecs_to_jiffies(dirty_expire_interval * 10);
748 work->older_than_this = &oldest_jif; 760 } else if (work->for_background)
749 } 761 oldest_jif = jiffies;
750 762
751 trace_writeback_start(wb->bdi, work); 763 trace_writeback_start(wb->bdi, work);
752 if (list_empty(&wb->b_io)) 764 if (list_empty(&wb->b_io))
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 2aaf3eaaf13d..5f3368ab0fa9 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1378,7 +1378,59 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
1378 down_read(&fc->killsb); 1378 down_read(&fc->killsb);
1379 err = -ENOENT; 1379 err = -ENOENT;
1380 if (fc->sb) 1380 if (fc->sb)
1381 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name); 1381 err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
1382 up_read(&fc->killsb);
1383 kfree(buf);
1384 return err;
1385
1386err:
1387 kfree(buf);
1388 fuse_copy_finish(cs);
1389 return err;
1390}
1391
1392static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
1393 struct fuse_copy_state *cs)
1394{
1395 struct fuse_notify_delete_out outarg;
1396 int err = -ENOMEM;
1397 char *buf;
1398 struct qstr name;
1399
1400 buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
1401 if (!buf)
1402 goto err;
1403
1404 err = -EINVAL;
1405 if (size < sizeof(outarg))
1406 goto err;
1407
1408 err = fuse_copy_one(cs, &outarg, sizeof(outarg));
1409 if (err)
1410 goto err;
1411
1412 err = -ENAMETOOLONG;
1413 if (outarg.namelen > FUSE_NAME_MAX)
1414 goto err;
1415
1416 err = -EINVAL;
1417 if (size != sizeof(outarg) + outarg.namelen + 1)
1418 goto err;
1419
1420 name.name = buf;
1421 name.len = outarg.namelen;
1422 err = fuse_copy_one(cs, buf, outarg.namelen + 1);
1423 if (err)
1424 goto err;
1425 fuse_copy_finish(cs);
1426 buf[outarg.namelen] = 0;
1427 name.hash = full_name_hash(name.name, name.len);
1428
1429 down_read(&fc->killsb);
1430 err = -ENOENT;
1431 if (fc->sb)
1432 err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
1433 outarg.child, &name);
1382 up_read(&fc->killsb); 1434 up_read(&fc->killsb);
1383 kfree(buf); 1435 kfree(buf);
1384 return err; 1436 return err;
@@ -1597,6 +1649,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
1597 case FUSE_NOTIFY_RETRIEVE: 1649 case FUSE_NOTIFY_RETRIEVE:
1598 return fuse_notify_retrieve(fc, size, cs); 1650 return fuse_notify_retrieve(fc, size, cs);
1599 1651
1652 case FUSE_NOTIFY_DELETE:
1653 return fuse_notify_delete(fc, size, cs);
1654
1600 default: 1655 default:
1601 fuse_copy_finish(cs); 1656 fuse_copy_finish(cs);
1602 return -EINVAL; 1657 return -EINVAL;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5ddd6ea8f839..206632887bb4 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -868,7 +868,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
868} 868}
869 869
870int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, 870int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
871 struct qstr *name) 871 u64 child_nodeid, struct qstr *name)
872{ 872{
873 int err = -ENOTDIR; 873 int err = -ENOTDIR;
874 struct inode *parent; 874 struct inode *parent;
@@ -895,8 +895,36 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
895 895
896 fuse_invalidate_attr(parent); 896 fuse_invalidate_attr(parent);
897 fuse_invalidate_entry(entry); 897 fuse_invalidate_entry(entry);
898
899 if (child_nodeid != 0 && entry->d_inode) {
900 mutex_lock(&entry->d_inode->i_mutex);
901 if (get_node_id(entry->d_inode) != child_nodeid) {
902 err = -ENOENT;
903 goto badentry;
904 }
905 if (d_mountpoint(entry)) {
906 err = -EBUSY;
907 goto badentry;
908 }
909 if (S_ISDIR(entry->d_inode->i_mode)) {
910 shrink_dcache_parent(entry);
911 if (!simple_empty(entry)) {
912 err = -ENOTEMPTY;
913 goto badentry;
914 }
915 entry->d_inode->i_flags |= S_DEAD;
916 }
917 dont_mount(entry);
918 clear_nlink(entry->d_inode);
919 err = 0;
920 badentry:
921 mutex_unlock(&entry->d_inode->i_mutex);
922 if (!err)
923 d_delete(entry);
924 } else {
925 err = 0;
926 }
898 dput(entry); 927 dput(entry);
899 err = 0;
900 928
901 unlock: 929 unlock:
902 mutex_unlock(&parent->i_mutex); 930 mutex_unlock(&parent->i_mutex);
@@ -1182,6 +1210,30 @@ static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
1182 return fuse_fsync_common(file, start, end, datasync, 1); 1210 return fuse_fsync_common(file, start, end, datasync, 1);
1183} 1211}
1184 1212
1213static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
1214 unsigned long arg)
1215{
1216 struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
1217
1218 /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
1219 if (fc->minor < 18)
1220 return -ENOTTY;
1221
1222 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
1223}
1224
1225static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
1226 unsigned long arg)
1227{
1228 struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
1229
1230 if (fc->minor < 18)
1231 return -ENOTTY;
1232
1233 return fuse_ioctl_common(file, cmd, arg,
1234 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
1235}
1236
1185static bool update_mtime(unsigned ivalid) 1237static bool update_mtime(unsigned ivalid)
1186{ 1238{
1187 /* Always update if mtime is explicitly set */ 1239 /* Always update if mtime is explicitly set */
@@ -1596,6 +1648,8 @@ static const struct file_operations fuse_dir_operations = {
1596 .open = fuse_dir_open, 1648 .open = fuse_dir_open,
1597 .release = fuse_dir_release, 1649 .release = fuse_dir_release,
1598 .fsync = fuse_dir_fsync, 1650 .fsync = fuse_dir_fsync,
1651 .unlocked_ioctl = fuse_dir_ioctl,
1652 .compat_ioctl = fuse_dir_compat_ioctl,
1599}; 1653};
1600 1654
1601static const struct inode_operations fuse_common_inode_operations = { 1655static const struct inode_operations fuse_common_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0c84100acd44..4a199fd93fbd 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1555,48 +1555,16 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin)
1555 loff_t retval; 1555 loff_t retval;
1556 struct inode *inode = file->f_path.dentry->d_inode; 1556 struct inode *inode = file->f_path.dentry->d_inode;
1557 1557
1558 mutex_lock(&inode->i_mutex); 1558 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
1559 if (origin != SEEK_CUR && origin != SEEK_SET) { 1559 if (origin == SEEK_CUR || origin == SEEK_SET)
1560 retval = fuse_update_attributes(inode, NULL, file, NULL); 1560 return generic_file_llseek(file, offset, origin);
1561 if (retval)
1562 goto exit;
1563 }
1564 1561
1565 switch (origin) { 1562 mutex_lock(&inode->i_mutex);
1566 case SEEK_END: 1563 retval = fuse_update_attributes(inode, NULL, file, NULL);
1567 offset += i_size_read(inode); 1564 if (!retval)
1568 break; 1565 retval = generic_file_llseek(file, offset, origin);
1569 case SEEK_CUR:
1570 if (offset == 0) {
1571 retval = file->f_pos;
1572 goto exit;
1573 }
1574 offset += file->f_pos;
1575 break;
1576 case SEEK_DATA:
1577 if (offset >= i_size_read(inode)) {
1578 retval = -ENXIO;
1579 goto exit;
1580 }
1581 break;
1582 case SEEK_HOLE:
1583 if (offset >= i_size_read(inode)) {
1584 retval = -ENXIO;
1585 goto exit;
1586 }
1587 offset = i_size_read(inode);
1588 break;
1589 }
1590 retval = -EINVAL;
1591 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
1592 if (offset != file->f_pos) {
1593 file->f_pos = offset;
1594 file->f_version = 0;
1595 }
1596 retval = offset;
1597 }
1598exit:
1599 mutex_unlock(&inode->i_mutex); 1566 mutex_unlock(&inode->i_mutex);
1567
1600 return retval; 1568 return retval;
1601} 1569}
1602 1570
@@ -1808,7 +1776,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1808 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 1776 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
1809 1777
1810 err = -ENOMEM; 1778 err = -ENOMEM;
1811 pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); 1779 pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
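The switch to kcalloc() is not cosmetic: kcalloc(n, size, flags) returns NULL if n * size would overflow, whereas the open-coded kzalloc(n * size, flags) silently wraps the multiplication.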
1812 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); 1780 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
1813 if (!pages || !iov_page) 1781 if (!pages || !iov_page)
1814 goto out; 1782 goto out;
@@ -1958,8 +1926,8 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
1958} 1926}
1959EXPORT_SYMBOL_GPL(fuse_do_ioctl); 1927EXPORT_SYMBOL_GPL(fuse_do_ioctl);
1960 1928
1961static long fuse_file_ioctl_common(struct file *file, unsigned int cmd, 1929long fuse_ioctl_common(struct file *file, unsigned int cmd,
1962 unsigned long arg, unsigned int flags) 1930 unsigned long arg, unsigned int flags)
1963{ 1931{
1964 struct inode *inode = file->f_dentry->d_inode; 1932 struct inode *inode = file->f_dentry->d_inode;
1965 struct fuse_conn *fc = get_fuse_conn(inode); 1933 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1976,13 +1944,13 @@ static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
1976static long fuse_file_ioctl(struct file *file, unsigned int cmd, 1944static long fuse_file_ioctl(struct file *file, unsigned int cmd,
1977 unsigned long arg) 1945 unsigned long arg)
1978{ 1946{
1979 return fuse_file_ioctl_common(file, cmd, arg, 0); 1947 return fuse_ioctl_common(file, cmd, arg, 0);
1980} 1948}
1981 1949
1982static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 1950static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
1983 unsigned long arg) 1951 unsigned long arg)
1984{ 1952{
1985 return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); 1953 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
1986} 1954}
1987 1955
1988/* 1956/*
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1964da0257d9..572cefc78012 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -755,9 +755,15 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
755/** 755/**
756 * File-system tells the kernel to invalidate parent attributes and 756 * File-system tells the kernel to invalidate parent attributes and
757 * the dentry matching parent/name. 757 * the dentry matching parent/name.
758 *
759 * If the child_nodeid is non-zero and:
760 * - matches the inode number for the dentry matching parent/name,
761 * - is not a mount point
762 * - is a file or an empty directory
763 * then the dentry is unhashed (d_delete()).
758 */ 764 */
759int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, 765int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
760 struct qstr *name); 766 u64 child_nodeid, struct qstr *name);
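For context, this kernel path is driven by the FUSE_NOTIFY_DELETE message added in this series; on the daemon side, libfuse (2.9-era, if memory serves) exposes it roughly as below, where ch, parent_ino, child_ino and name are the daemon's own variables (an illustrative sketch, not a quote from the library):

	fuse_lowlevel_notify_delete(ch, parent_ino, child_ino,
				    name, strlen(name));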
761 767
762int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 768int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
763 bool isdir); 769 bool isdir);
@@ -765,6 +771,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf,
765 size_t count, loff_t *ppos, int write); 771 size_t count, loff_t *ppos, int write);
766long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, 772long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
767 unsigned int flags); 773 unsigned int flags);
774long fuse_ioctl_common(struct file *file, unsigned int cmd,
775 unsigned long arg, unsigned int flags);
768unsigned fuse_file_poll(struct file *file, poll_table *wait); 776unsigned fuse_file_poll(struct file *file, poll_table *wait);
769int fuse_dev_release(struct inode *inode, struct file *file); 777int fuse_dev_release(struct inode *inode, struct file *file);
770 778
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 88e8a23d0026..376816fcd040 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1353,7 +1353,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
1353 spin_lock(&gl->gl_spin); 1353 spin_lock(&gl->gl_spin);
1354 gl->gl_reply = ret; 1354 gl->gl_reply = ret;
1355 1355
1356 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) { 1356 if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
1357 if (gfs2_should_freeze(gl)) { 1357 if (gfs2_should_freeze(gl)) {
1358 set_bit(GLF_FROZEN, &gl->gl_flags); 1358 set_bit(GLF_FROZEN, &gl->gl_flags);
1359 spin_unlock(&gl->gl_spin); 1359 spin_unlock(&gl->gl_spin);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2553b858a72e..307ac31df781 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -121,8 +121,11 @@ enum {
121 121
122struct lm_lockops { 122struct lm_lockops {
123 const char *lm_proto_name; 123 const char *lm_proto_name;
124 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 124 int (*lm_mount) (struct gfs2_sbd *sdp, const char *table);
125 void (*lm_unmount) (struct gfs2_sbd *sdp); 125 void (*lm_first_done) (struct gfs2_sbd *sdp);
126 void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid,
127 unsigned int result);
128 void (*lm_unmount) (struct gfs2_sbd *sdp);
126 void (*lm_withdraw) (struct gfs2_sbd *sdp); 129 void (*lm_withdraw) (struct gfs2_sbd *sdp);
127 void (*lm_put_lock) (struct gfs2_glock *gl); 130 void (*lm_put_lock) (struct gfs2_glock *gl);
128 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, 131 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e1d3bb59945c..97742a7ea9cc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -139,8 +139,45 @@ struct gfs2_bufdata {
139#define GDLM_STRNAME_BYTES 25 139#define GDLM_STRNAME_BYTES 25
140#define GDLM_LVB_SIZE 32 140#define GDLM_LVB_SIZE 32
141 141
142/*
143 * ls_recover_flags:
144 *
145 * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been
146 * held by failed nodes whose journals need recovery. Those locks should
147 * only be used for journal recovery until the journal recovery is done.
148 * This is set by the dlm recover_prep callback and cleared by the
149 * gfs2_control thread when journal recovery is complete. To avoid
150 * races between recover_prep setting and gfs2_control clearing, recover_spin
151 * is held while changing this bit and reading/writing recover_block
152 * and recover_start.
153 *
154 * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used.
155 *
156 * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing
157 * recovery of all journals before allowing other nodes to mount the fs.
158 * This is cleared when FIRST_MOUNT_DONE is set.
159 *
160 * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished
161 * recovery of all journals, and now allows other nodes to mount the fs.
162 *
163 * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared
164 * BLOCK_LOCKS for the first time. The gfs2_control thread should now
165 * control clearing BLOCK_LOCKS for further recoveries.
166 *
167 * DFL_UNMOUNT: gdlm_unmount sets this to keep sdp off gfs2_control_wq.
168 *
169 * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep()
170 * and recover_done(), i.e. set while recover_block == recover_start.
171 */
172
142enum { 173enum {
143 DFL_BLOCK_LOCKS = 0, 174 DFL_BLOCK_LOCKS = 0,
175 DFL_NO_DLM_OPS = 1,
176 DFL_FIRST_MOUNT = 2,
177 DFL_FIRST_MOUNT_DONE = 3,
178 DFL_MOUNT_DONE = 4,
179 DFL_UNMOUNT = 5,
180 DFL_DLM_RECOVERY = 6,
144}; 181};
145 182
146struct lm_lockname { 183struct lm_lockname {
@@ -392,6 +429,7 @@ struct gfs2_jdesc {
392#define JDF_RECOVERY 1 429#define JDF_RECOVERY 1
393 unsigned int jd_jid; 430 unsigned int jd_jid;
394 unsigned int jd_blocks; 431 unsigned int jd_blocks;
432 int jd_recover_error;
395}; 433};
396 434
397struct gfs2_statfs_change_host { 435struct gfs2_statfs_change_host {
@@ -461,6 +499,7 @@ enum {
461 SDF_NORECOVERY = 4, 499 SDF_NORECOVERY = 4,
462 SDF_DEMOTE = 5, 500 SDF_DEMOTE = 5,
463 SDF_NOJOURNALID = 6, 501 SDF_NOJOURNALID = 6,
502 SDF_RORECOVERY = 7, /* read only recovery */
464}; 503};
465 504
466#define GFS2_FSNAME_LEN 256 505#define GFS2_FSNAME_LEN 256
@@ -499,14 +538,26 @@ struct gfs2_sb_host {
499struct lm_lockstruct { 538struct lm_lockstruct {
500 int ls_jid; 539 int ls_jid;
501 unsigned int ls_first; 540 unsigned int ls_first;
502 unsigned int ls_first_done;
503 unsigned int ls_nodir; 541 unsigned int ls_nodir;
504 const struct lm_lockops *ls_ops; 542 const struct lm_lockops *ls_ops;
505 unsigned long ls_flags;
506 dlm_lockspace_t *ls_dlm; 543 dlm_lockspace_t *ls_dlm;
507 544
508 int ls_recover_jid_done; 545 int ls_recover_jid_done; /* These two are deprecated, */
509 int ls_recover_jid_status; 546 int ls_recover_jid_status; /* used previously by gfs_controld */
547
548 struct dlm_lksb ls_mounted_lksb; /* mounted_lock */
549 struct dlm_lksb ls_control_lksb; /* control_lock */
550 char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
551 struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
552
553 spinlock_t ls_recover_spin; /* protects following fields */
554 unsigned long ls_recover_flags; /* DFL_ */
555 uint32_t ls_recover_mount; /* gen in first recover_done cb */
556 uint32_t ls_recover_start; /* gen in last recover_done cb */
557 uint32_t ls_recover_block; /* copy recover_start in last recover_prep */
558 uint32_t ls_recover_size; /* size of recover_submit, recover_result */
559 uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */
560 uint32_t *ls_recover_result; /* result of last jid recovery */
510}; 561};
511 562
512struct gfs2_sbd { 563struct gfs2_sbd {
@@ -544,6 +595,7 @@ struct gfs2_sbd {
544 wait_queue_head_t sd_glock_wait; 595 wait_queue_head_t sd_glock_wait;
545 atomic_t sd_glock_disposal; 596 atomic_t sd_glock_disposal;
546 struct completion sd_locking_init; 597 struct completion sd_locking_init;
598 struct delayed_work sd_control_work;
547 599
548 /* Inode Stuff */ 600 /* Inode Stuff */
549 601
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 017960cf1d7a..a7d611b93f0f 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -599,9 +599,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
599 error = gfs2_meta_inode_buffer(ip, &dibh); 599 error = gfs2_meta_inode_buffer(ip, &dibh);
600 if (error) 600 if (error)
601 goto fail_end_trans; 601 goto fail_end_trans;
602 inc_nlink(&ip->i_inode); 602 set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1);
603 if (S_ISDIR(ip->i_inode.i_mode))
604 inc_nlink(&ip->i_inode);
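The replacement states the invariant directly: a freshly linked directory has exactly two links ("." plus the parent's entry for it) and a regular file has one, so set_nlink() to the known value is more robust than incrementing whatever count the inode happened to carry.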
605 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 603 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
606 gfs2_dinode_out(ip, dibh->b_data); 604 gfs2_dinode_out(ip, dibh->b_data);
607 brelse(dibh); 605 brelse(dibh);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 98c80d8c2a62..8944d1e32ab5 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. 3 * Copyright 2004-2011 Red Hat, Inc.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -11,12 +11,15 @@
11#include <linux/dlm.h> 11#include <linux/dlm.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/delay.h>
14#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
15 16
16#include "incore.h" 17#include "incore.h"
17#include "glock.h" 18#include "glock.h"
18#include "util.h" 19#include "util.h"
20#include "sys.h"
19 21
22extern struct workqueue_struct *gfs2_control_wq;
20 23
21static void gdlm_ast(void *arg) 24static void gdlm_ast(void *arg)
22{ 25{
@@ -185,34 +188,1002 @@ static void gdlm_cancel(struct gfs2_glock *gl)
185 dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); 188 dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
186} 189}
187 190
188static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname) 191/*
192 * dlm/gfs2 recovery coordination using dlm_recover callbacks
193 *
194 * 1. dlm_controld sees lockspace members change
195 * 2. dlm_controld blocks dlm-kernel locking activity
196 * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep)
197 * 4. dlm_controld starts and finishes its own user level recovery
198 * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery
199 * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot)
200 * 7. dlm_recoverd does its own lock recovery
201 * 8. dlm_recoverd unblocks dlm-kernel locking activity
202 * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation)
203 * 10. gfs2_control updates control_lock lvb with new generation and jid bits
204 * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none)
205 * 12. gfs2_recover dequeues and recovers journals of failed nodes
206 * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result)
207 * 14. gfs2_control updates control_lock lvb jid bits for recovered journals
208 * 15. gfs2_control unblocks normal locking when all journals are recovered
209 *
210 * - failures during recovery
211 *
212 * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control
213 * clears BLOCK_LOCKS (step 15), e.g. another node fails while still
214 * recovering for a prior failure. gfs2_control needs a way to detect
215 * this so it can leave BLOCK_LOCKS set in step 15. This is managed using
216 * the recover_block and recover_start values.
217 *
218 * recover_done() provides a new lockspace generation number each time it
219 * is called (step 9). This generation number is saved as recover_start.
220 * When recover_prep() is called, it sets BLOCK_LOCKS and sets
221 * recover_block = recover_start. So, while recover_block is equal to
222 * recover_start, BLOCK_LOCKS should remain set. (recover_spin must
223 * be held around the BLOCK_LOCKS/recover_block/recover_start logic.)
224 *
225 * - more specific gfs2 steps in sequence above
226 *
227 * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start
228 * 6. recover_slot records any failed jids (maybe none)
229 * 9. recover_done sets recover_start = new generation number
230 * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids
231 * 12. gfs2_recover does journal recoveries for failed jids identified above
232 * 14. gfs2_control clears control_lock lvb bits for recovered jids
233 * 15. gfs2_control checks if recover_block == recover_start (step 3 occurred
234 * again) then do nothing, otherwise if recover_start > recover_block
235 * then clear BLOCK_LOCKS.
236 *
237 * - parallel recovery steps across all nodes
238 *
239 * All nodes attempt to update the control_lock lvb with the new generation
240 * number and jid bits, but only the first to get the control_lock EX will
241 * do so; others will see that it's already done (lvb already contains new
242 * generation number.)
243 *
244 * . All nodes get the same recover_prep/recover_slot/recover_done callbacks
245 * . All nodes attempt to set control_lock lvb gen + bits for the new gen
246 * . One node gets control_lock first and writes the lvb, others see it's done
247 * . All nodes attempt to recover jids for which they see control_lock bits set
248 * . One node succeeds for a jid, and that one clears the jid bit in the lvb
249 * . All nodes will eventually see all lvb bits clear and unblock locks
250 *
251 * - is there a problem with clearing an lvb bit that should be set
252 * and missing a journal recovery?
253 *
254 * 1. jid fails
255 * 2. lvb bit set for step 1
256 * 3. jid recovered for step 1
257 * 4. jid taken again (new mount)
258 * 5. jid fails (for step 4)
259 * 6. lvb bit set for step 5 (will already be set)
260 * 7. lvb bit cleared for step 3
261 *
262 * This is not a problem because the failure in step 5 does not
263 * require recovery, because the mount in step 4 could not have
264 * progressed far enough to unblock locks and access the fs. The
265 * control_mount() function waits for all recoveries to be complete
266 * for the latest lockspace generation before ever unblocking locks
267 * and returning. The mount in step 4 waits until the recovery in
268 * step 1 is done.
269 *
270 * - special case of first mounter: first node to mount the fs
271 *
272 * The first node to mount a gfs2 fs needs to check all the journals
273 * and recover any that need recovery before other nodes are allowed
274 * to mount the fs. (Others may begin mounting, but they must wait
275 * for the first mounter to be done before taking locks on the fs
276 * or accessing the fs.) This has two parts:
277 *
278 * 1. The mounted_lock tells a node it's the first to mount the fs.
279 * Each node holds the mounted_lock in PR while it's mounted.
280 * Each node tries to acquire the mounted_lock in EX when it mounts.
281 * If a node is granted the mounted_lock EX it means there are no
282 * other mounted nodes (no PR locks exist), and it is the first mounter.
283 * The mounted_lock is demoted to PR when first recovery is done, so
284 * others will fail to get an EX lock, but will get a PR lock.
285 *
286 * 2. The control_lock blocks others in control_mount() while the first
287 * mounter is doing first mount recovery of all journals.
288 * A mounting node needs to acquire control_lock in EX mode before
289 * it can proceed. The first mounter holds control_lock in EX while doing
290 * the first mount recovery, blocking mounts from other nodes, then demotes
291 * control_lock to NL when it's done (others_may_mount/first_done),
292 * allowing other nodes to continue mounting.
293 *
294 * first mounter:
295 * control_lock EX/NOQUEUE success
296 * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters)
297 * set first=1
298 * do first mounter recovery
299 * mounted_lock EX->PR
300 * control_lock EX->NL, write lvb generation
301 *
302 * other mounter:
303 * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry)
304 * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR)
305 * mounted_lock PR/NOQUEUE success
306 * read lvb generation
307 * control_lock EX->NL
308 * set first=0
309 *
310 * - mount during recovery
311 *
312 * If a node mounts while others are doing recovery (not first mounter),
313 * the mounting node will get its initial recover_done() callback without
314 * having seen any previous failures/callbacks.
315 *
316 * It must wait for all recoveries preceding its mount to be finished
317 * before it unblocks locks. It does this by repeating the "other mounter"
318 * steps above until the lvb generation number is >= its mount generation
319 * number (from initial recover_done) and all lvb bits are clear.
320 *
321 * - control_lock lvb format
322 *
323 * 4 bytes generation number: the latest dlm lockspace generation number
324 * from recover_done callback. Indicates the jid bitmap has been updated
325 * to reflect all slot failures through that generation.
326 * 4 bytes unused.
327 * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates
328 * that jid N needs recovery.
329 */
330
331#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
332
333static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
334 char *lvb_bits)
335{
336 uint32_t gen;
337 memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
338 memcpy(&gen, lvb_bits, sizeof(uint32_t));
339 *lvb_gen = le32_to_cpu(gen);
340}
341
342static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
343 char *lvb_bits)
344{
345 uint32_t gen;
346 memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
347 gen = cpu_to_le32(lvb_gen);
348 memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t));
349}
350
351static int all_jid_bits_clear(char *lvb)
352{
353 int i;
354 for (i = JID_BITMAP_OFFSET; i < GDLM_LVB_SIZE; i++) {
355 if (lvb[i])
356 return 0;
357 }
358 return 1;
359}
360
361static void sync_wait_cb(void *arg)
362{
363 struct lm_lockstruct *ls = arg;
364 complete(&ls->ls_sync_wait);
365}
366
367static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
189{ 368{
190 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 369 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
191 int error; 370 int error;
192 371
193 if (fsname == NULL) { 372 error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
194 fs_info(sdp, "no fsname found\n"); 373 if (error) {
195 return -EINVAL; 374 fs_err(sdp, "%s lkid %x error %d\n",
375 name, lksb->sb_lkid, error);
376 return error;
377 }
378
379 wait_for_completion(&ls->ls_sync_wait);
380
381 if (lksb->sb_status != -DLM_EUNLOCK) {
382 fs_err(sdp, "%s lkid %x status %d\n",
383 name, lksb->sb_lkid, lksb->sb_status);
384 return -1;
385 }
386 return 0;
387}
388
389static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
390 unsigned int num, struct dlm_lksb *lksb, char *name)
391{
392 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
393 char strname[GDLM_STRNAME_BYTES];
394 int error, status;
395
396 memset(strname, 0, GDLM_STRNAME_BYTES);
397 snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
398
399 error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
400 strname, GDLM_STRNAME_BYTES - 1,
401 0, sync_wait_cb, ls, NULL);
402 if (error) {
403 fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
404 name, lksb->sb_lkid, flags, mode, error);
405 return error;
406 }
407
408 wait_for_completion(&ls->ls_sync_wait);
409
410 status = lksb->sb_status;
411
412 if (status && status != -EAGAIN) {
413 fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
414 name, lksb->sb_lkid, flags, mode, status);
415 }
416
417 return status;
418}
419
420static int mounted_unlock(struct gfs2_sbd *sdp)
421{
422 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
423 return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock");
424}
425
426static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
427{
428 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
429 return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK,
430 &ls->ls_mounted_lksb, "mounted_lock");
431}
432
433static int control_unlock(struct gfs2_sbd *sdp)
434{
435 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
436 return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock");
437}
438
439static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
440{
441 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
442 return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK,
443 &ls->ls_control_lksb, "control_lock");
444}
445
446static void gfs2_control_func(struct work_struct *work)
447{
448 struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work);
449 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
450 char lvb_bits[GDLM_LVB_SIZE];
451 uint32_t block_gen, start_gen, lvb_gen, flags;
452 int recover_set = 0;
453 int write_lvb = 0;
454 int recover_size;
455 int i, error;
456
457 spin_lock(&ls->ls_recover_spin);
458 /*
459 * No MOUNT_DONE means we're still mounting; control_mount()
460 * will set this flag, after which this thread will take over
461 * all further clearing of BLOCK_LOCKS.
462 *
463 * FIRST_MOUNT means this node is doing first mounter recovery,
464 * for which recovery control is handled by
465 * control_mount()/control_first_done(), not this thread.
466 */
467 if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
468 test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
469 spin_unlock(&ls->ls_recover_spin);
470 return;
471 }
472 block_gen = ls->ls_recover_block;
473 start_gen = ls->ls_recover_start;
474 spin_unlock(&ls->ls_recover_spin);
475
476 /*
477 * Equal block_gen and start_gen implies we are between
478 * recover_prep and recover_done callbacks, which means
479 * dlm recovery is in progress and dlm locking is blocked.
480 * There's no point trying to do any work until recover_done.
481 */
482
483 if (block_gen == start_gen)
484 return;
485
486 /*
487 * Propagate recover_submit[] and recover_result[] to lvb:
488 * dlm_recoverd adds to recover_submit[] jids needing recovery
489 * gfs2_recover adds to recover_result[] journal recovery results
490 *
491 * set lvb bit for jids in recover_submit[] if the lvb has not
492 * yet been updated for the generation of the failure
493 *
494 * clear lvb bit for jids in recover_result[] if the result of
495 * the journal recovery is SUCCESS
496 */
497
498 error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
499 if (error) {
500 fs_err(sdp, "control lock EX error %d\n", error);
501 return;
502 }
503
504 control_lvb_read(ls, &lvb_gen, lvb_bits);
505
506 spin_lock(&ls->ls_recover_spin);
507 if (block_gen != ls->ls_recover_block ||
508 start_gen != ls->ls_recover_start) {
509 fs_info(sdp, "recover generation %u block1 %u %u\n",
510 start_gen, block_gen, ls->ls_recover_block);
511 spin_unlock(&ls->ls_recover_spin);
512 control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
513 return;
514 }
515
516 recover_size = ls->ls_recover_size;
517
518 if (lvb_gen <= start_gen) {
519 /*
520 * Clear lvb bits for jids we've successfully recovered.
521 * Because all nodes attempt to recover failed journals,
522 * a journal can be recovered multiple times successfully
523 * in succession. Only the first will really do recovery,
524 * the others find it clean, but still report a successful
525 * recovery. So, another node may have already recovered
526 * the jid and cleared the lvb bit for it.
527 */
528 for (i = 0; i < recover_size; i++) {
529 if (ls->ls_recover_result[i] != LM_RD_SUCCESS)
530 continue;
531
532 ls->ls_recover_result[i] = 0;
533
534 if (!test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET))
535 continue;
536
537 __clear_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
538 write_lvb = 1;
539 }
540 }
541
542 if (lvb_gen == start_gen) {
543 /*
544 * Failed slots before start_gen are already set in lvb.
545 */
546 for (i = 0; i < recover_size; i++) {
547 if (!ls->ls_recover_submit[i])
548 continue;
549 if (ls->ls_recover_submit[i] < lvb_gen)
550 ls->ls_recover_submit[i] = 0;
551 }
552 } else if (lvb_gen < start_gen) {
553 /*
554 * Failed slots before start_gen are not yet set in lvb.
555 */
556 for (i = 0; i < recover_size; i++) {
557 if (!ls->ls_recover_submit[i])
558 continue;
559 if (ls->ls_recover_submit[i] < start_gen) {
560 ls->ls_recover_submit[i] = 0;
561 __set_bit_le(i, lvb_bits + JID_BITMAP_OFFSET);
562 }
563 }
564 /* even if there are no bits to set, we need to write the
565 latest generation to the lvb */
566 write_lvb = 1;
567 } else {
568 /*
569 * we should be getting a recover_done() for lvb_gen soon
570 */
571 }
572 spin_unlock(&ls->ls_recover_spin);
573
574 if (write_lvb) {
575 control_lvb_write(ls, start_gen, lvb_bits);
576 flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
577 } else {
578 flags = DLM_LKF_CONVERT;
579 }
580
581 error = control_lock(sdp, DLM_LOCK_NL, flags);
582 if (error) {
583 fs_err(sdp, "control lock NL error %d\n", error);
584 return;
585 }
586
587 /*
588 * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
589 * and clear a jid bit in the lvb if the recovery is a success.
590 * Eventually all journals will be recovered, all jid bits will
591 * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
592 */
593
594 for (i = 0; i < recover_size; i++) {
595 if (test_bit_le(i, lvb_bits + JID_BITMAP_OFFSET)) {
596 fs_info(sdp, "recover generation %u jid %d\n",
597 start_gen, i);
598 gfs2_recover_set(sdp, i);
599 recover_set++;
600 }
601 }
602 if (recover_set)
603 return;
604
605 /*
606 * No more jid bits set in lvb, all recovery is done, unblock locks
607 * (unless a new recover_prep callback has occurred blocking locks
608 * again while working above)
609 */
610
611 spin_lock(&ls->ls_recover_spin);
612 if (ls->ls_recover_block == block_gen &&
613 ls->ls_recover_start == start_gen) {
614 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
615 spin_unlock(&ls->ls_recover_spin);
616 fs_info(sdp, "recover generation %u done\n", start_gen);
617 gfs2_glock_thaw(sdp);
618 } else {
619 fs_info(sdp, "recover generation %u block2 %u %u\n",
620 start_gen, block_gen, ls->ls_recover_block);
621 spin_unlock(&ls->ls_recover_spin);
622 }
623}
624
625static int control_mount(struct gfs2_sbd *sdp)
626{
627 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
628 char lvb_bits[GDLM_LVB_SIZE];
629 uint32_t start_gen, block_gen, mount_gen, lvb_gen;
630 int mounted_mode;
631 int retries = 0;
632 int error;
633
634 memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
635 memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
636 memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
637 ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
638 init_completion(&ls->ls_sync_wait);
639
640 set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
641
642 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
643 if (error) {
644 fs_err(sdp, "control_mount control_lock NL error %d\n", error);
645 return error;
646 }
647
648 error = mounted_lock(sdp, DLM_LOCK_NL, 0);
649 if (error) {
650 fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
651 control_unlock(sdp);
652 return error;
653 }
654 mounted_mode = DLM_LOCK_NL;
655
656restart:
657 if (retries++ && signal_pending(current)) {
658 error = -EINTR;
659 goto fail;
660 }
661
662 /*
663 * We always start with both locks in NL. control_lock is
664 * demoted to NL below so we don't need to do it here.
665 */
666
667 if (mounted_mode != DLM_LOCK_NL) {
668 error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
669 if (error)
670 goto fail;
671 mounted_mode = DLM_LOCK_NL;
672 }
673
674 /*
675 * Other nodes need to do some work in dlm recovery and gfs2_control
676 * before the recover_done and control_lock will be ready for us below.
677 * A delay here is not required but often avoids having to retry.
678 */
679
680 msleep_interruptible(500);
681
682 /*
683 * Acquire control_lock in EX and mounted_lock in either EX or PR.
684 * control_lock lvb keeps track of any pending journal recoveries.
685 * mounted_lock indicates if any other nodes have the fs mounted.
686 */
687
688 error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK);
689 if (error == -EAGAIN) {
690 goto restart;
691 } else if (error) {
692 fs_err(sdp, "control_mount control_lock EX error %d\n", error);
693 goto fail;
694 }
695
696 error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
697 if (!error) {
698 mounted_mode = DLM_LOCK_EX;
699 goto locks_done;
700 } else if (error != -EAGAIN) {
701 fs_err(sdp, "control_mount mounted_lock EX error %d\n", error);
702 goto fail;
703 }
704
705 error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE);
706 if (!error) {
707 mounted_mode = DLM_LOCK_PR;
708 goto locks_done;
709 } else {
710 /* not even -EAGAIN should happen here */
711 fs_err(sdp, "control_mount mounted_lock PR error %d\n", error);
712 goto fail;
713 }
714
715locks_done:
716 /*
717 * If we got both locks above in EX, then we're the first mounter.
718 * If not, then we need to wait for the control_lock lvb to be
719 * updated by other mounted nodes to reflect our mount generation.
720 *
721 * In simple first mounter cases, first mounter will see zero lvb_gen,
722 * but in cases where all existing nodes leave/fail before mounting
723 * nodes finish control_mount, then all nodes will be mounting and
724 * lvb_gen will be non-zero.
725 */
726
727 control_lvb_read(ls, &lvb_gen, lvb_bits);
728
729 if (lvb_gen == 0xFFFFFFFF) {
730 /* special value to force mount attempts to fail */
731 fs_err(sdp, "control_mount control_lock disabled\n");
732 error = -EINVAL;
733 goto fail;
734 }
735
736 if (mounted_mode == DLM_LOCK_EX) {
737 /* first mounter, keep both EX while doing first recovery */
738 spin_lock(&ls->ls_recover_spin);
739 clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
740 set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
741 set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
742 spin_unlock(&ls->ls_recover_spin);
743 fs_info(sdp, "first mounter control generation %u\n", lvb_gen);
744 return 0;
745 }
746
747 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT);
748 if (error)
749 goto fail;
750
751 /*
752 * We are not first mounter, now we need to wait for the control_lock
753 * lvb generation to be >= the generation from our first recover_done
754 * and all lvb bits to be clear (no pending journal recoveries.)
755 */
756
757 if (!all_jid_bits_clear(lvb_bits)) {
758 /* journals need recovery, wait until all are clear */
759 fs_info(sdp, "control_mount wait for journal recovery\n");
760 goto restart;
761 }
762
763 spin_lock(&ls->ls_recover_spin);
764 block_gen = ls->ls_recover_block;
765 start_gen = ls->ls_recover_start;
766 mount_gen = ls->ls_recover_mount;
767
768 if (lvb_gen < mount_gen) {
769 /* wait for mounted nodes to update control_lock lvb to our
770 generation, which might include new recovery bits set */
771 fs_info(sdp, "control_mount wait1 block %u start %u mount %u "
772 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
773 lvb_gen, ls->ls_recover_flags);
774 spin_unlock(&ls->ls_recover_spin);
775 goto restart;
776 }
777
778 if (lvb_gen != start_gen) {
779 /* wait for mounted nodes to update control_lock lvb to the
780 latest recovery generation */
781 fs_info(sdp, "control_mount wait2 block %u start %u mount %u "
782 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
783 lvb_gen, ls->ls_recover_flags);
784 spin_unlock(&ls->ls_recover_spin);
785 goto restart;
786 }
787
788 if (block_gen == start_gen) {
789 /* dlm recovery in progress, wait for it to finish */
790 fs_info(sdp, "control_mount wait3 block %u start %u mount %u "
791 "lvb %u flags %lx\n", block_gen, start_gen, mount_gen,
792 lvb_gen, ls->ls_recover_flags);
793 spin_unlock(&ls->ls_recover_spin);
794 goto restart;
795	}
796
-	error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm,
-				  DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
-				  (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
-				  GDLM_LVB_SIZE);
797	clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
798	set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags);
799	memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
800	memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
801 spin_unlock(&ls->ls_recover_spin);
802 return 0;
803
804fail:
805 mounted_unlock(sdp);
806 control_unlock(sdp);
807 return error;
808}
809
810static int dlm_recovery_wait(void *word)
811{
812 schedule();
813 return 0;
814}
815
816static int control_first_done(struct gfs2_sbd *sdp)
817{
818 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
819 char lvb_bits[GDLM_LVB_SIZE];
820 uint32_t start_gen, block_gen;
821 int error;
822
823restart:
824 spin_lock(&ls->ls_recover_spin);
825 start_gen = ls->ls_recover_start;
826 block_gen = ls->ls_recover_block;
827
828 if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) ||
829 !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
830 !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
831 /* sanity check, should not happen */
832 fs_err(sdp, "control_first_done start %u block %u flags %lx\n",
833 start_gen, block_gen, ls->ls_recover_flags);
834 spin_unlock(&ls->ls_recover_spin);
835 control_unlock(sdp);
836 return -1;
837 }
838
839 if (start_gen == block_gen) {
840 /*
841 * Wait for the end of a dlm recovery cycle to switch from
842 * first mounter recovery. We can ignore any recover_slot
843 * callbacks between the recover_prep and next recover_done
844 * because we are still the first mounter and any failed nodes
845 * have not fully mounted, so they don't need recovery.
846 */
847 spin_unlock(&ls->ls_recover_spin);
848 fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
849
850 wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
851 dlm_recovery_wait, TASK_UNINTERRUPTIBLE);
852 goto restart;
853 }
854
855 clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
856 set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags);
857 memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t));
858 memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t));
859 spin_unlock(&ls->ls_recover_spin);
860
861 memset(lvb_bits, 0, sizeof(lvb_bits));
862 control_lvb_write(ls, start_gen, lvb_bits);
863
864 error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT);
865 if (error)
866 fs_err(sdp, "control_first_done mounted PR error %d\n", error);
867
868 error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
869	if (error)
-		printk(KERN_ERR "dlm_new_lockspace error %d", error);
870		fs_err(sdp, "control_first_done control NL error %d\n", error);
871
872	return error;
873}
874
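For readers unfamiliar with the pre-3.17 bit-wait API used by control_first_done() above: wait_on_bit() takes an action callback that decides how to sleep, and the waker must order its clear_bit() before wake_up_bit(). A condensed, illustrative sketch of both sides follows; the example_* names are invented here, DFL_DLM_RECOVERY is the flag from this patch.

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

/* action callback: how to sleep while the bit is still set */
static int example_recovery_wait(void *word)
{
        schedule();
        return 0;       /* returning nonzero would abort the wait */
}

/* waiter side: block until DFL_DLM_RECOVERY is cleared */
static void example_wait_recovery_done(unsigned long *flags)
{
        wait_on_bit(flags, DFL_DLM_RECOVERY, example_recovery_wait,
                    TASK_UNINTERRUPTIBLE);
}

/* waker side: clear the bit, order the store, then wake bit-waiters */
static void example_recovery_done(unsigned long *flags)
{
        clear_bit(DFL_DLM_RECOVERY, flags);
        smp_mb__after_clear_bit();
        wake_up_bit(flags, DFL_DLM_RECOVERY);
}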
875/*
876 * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC)
877 * to accommodate the largest slot number. (NB dlm slot numbers start at 1,
878 * gfs2 jids start at 0, so jid = slot - 1)
879 */
880
881#define RECOVER_SIZE_INC 16
882
883static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
884 int num_slots)
885{
886 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
887 uint32_t *submit = NULL;
888 uint32_t *result = NULL;
889 uint32_t old_size, new_size;
890 int i, max_jid;
891
892 max_jid = 0;
893 for (i = 0; i < num_slots; i++) {
894 if (max_jid < slots[i].slot - 1)
895 max_jid = slots[i].slot - 1;
896 }
897
898 old_size = ls->ls_recover_size;
899
900 if (old_size >= max_jid + 1)
901 return 0;
902
903 new_size = old_size + RECOVER_SIZE_INC;
904
905 submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
906 result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
907 if (!submit || !result) {
908 kfree(submit);
909 kfree(result);
910 return -ENOMEM;
911 }
912
913 spin_lock(&ls->ls_recover_spin);
914 memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t));
915 memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t));
916 kfree(ls->ls_recover_submit);
917 kfree(ls->ls_recover_result);
918 ls->ls_recover_submit = submit;
919 ls->ls_recover_result = result;
920 ls->ls_recover_size = new_size;
921 spin_unlock(&ls->ls_recover_spin);
922 return 0;
923}
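set_recover_size() above follows a common grow-then-swap pattern: allocate the larger arrays outside the spinlock (a GFP_NOFS allocation may sleep or fail), then copy the old contents and switch the pointers under the lock, so concurrent readers always see either the old arrays or the new ones, never a half-built table. A minimal sketch of the same shape — the struct and names here are invented for illustration, not the gfs2 code itself:

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

struct jid_table {
        spinlock_t lock;
        uint32_t *vals;
        uint32_t size;
};

static int jid_table_grow(struct jid_table *t, uint32_t new_size)
{
        uint32_t *vals;

        if (new_size <= t->size)
                return 0;

        /* allocate outside the lock; this may fail but never blocks readers */
        vals = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
        if (!vals)
                return -ENOMEM;

        spin_lock(&t->lock);
        memcpy(vals, t->vals, t->size * sizeof(uint32_t));
        kfree(t->vals);         /* kfree is safe under a spinlock */
        t->vals = vals;
        t->size = new_size;
        spin_unlock(&t->lock);
        return 0;
}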
924
925static void free_recover_size(struct lm_lockstruct *ls)
926{
927 kfree(ls->ls_recover_submit);
928 kfree(ls->ls_recover_result);
929 ls->ls_recover_submit = NULL;
930 ls->ls_recover_result = NULL;
931 ls->ls_recover_size = 0;
932}
933
934/* dlm calls before it does lock recovery */
935
936static void gdlm_recover_prep(void *arg)
937{
938 struct gfs2_sbd *sdp = arg;
939 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
940
941 spin_lock(&ls->ls_recover_spin);
942 ls->ls_recover_block = ls->ls_recover_start;
943 set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
944
945 if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) ||
946 test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
947 spin_unlock(&ls->ls_recover_spin);
948 return;
949 }
950 set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
951 spin_unlock(&ls->ls_recover_spin);
952}
953
954/* dlm calls after recover_prep has been completed on all lockspace members;
955 identifies slot/jid of failed member */
956
957static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
958{
959 struct gfs2_sbd *sdp = arg;
960 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
961 int jid = slot->slot - 1;
962
963 spin_lock(&ls->ls_recover_spin);
964 if (ls->ls_recover_size < jid + 1) {
965 fs_err(sdp, "recover_slot jid %d gen %u short size %d",
966 jid, ls->ls_recover_block, ls->ls_recover_size);
967 spin_unlock(&ls->ls_recover_spin);
968 return;
969 }
970
971 if (ls->ls_recover_submit[jid]) {
972 fs_info(sdp, "recover_slot jid %d gen %u prev %u",
973 jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
974 }
975 ls->ls_recover_submit[jid] = ls->ls_recover_block;
976 spin_unlock(&ls->ls_recover_spin);
977}
978
979/* dlm calls after recover_slot and after it completes lock recovery */
980
981static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
982 int our_slot, uint32_t generation)
983{
984 struct gfs2_sbd *sdp = arg;
985 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
986
987 /* ensure the ls jid arrays are large enough */
988 set_recover_size(sdp, slots, num_slots);
989
990 spin_lock(&ls->ls_recover_spin);
991 ls->ls_recover_start = generation;
992
993 if (!ls->ls_recover_mount) {
994 ls->ls_recover_mount = generation;
995 ls->ls_jid = our_slot - 1;
996 }
997
998 if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
999 queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
1000
1001 clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
1002 smp_mb__after_clear_bit();
1003 wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
1004 spin_unlock(&ls->ls_recover_spin);
1005}
1006
1007/* gfs2_recover thread has a journal recovery result */
1008
1009static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
1010 unsigned int result)
1011{
1012 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1013
1014 if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
1015 return;
1016
1017	/* don't care about the recovery of our own journal during mount */
1018 if (jid == ls->ls_jid)
1019 return;
1020
1021 spin_lock(&ls->ls_recover_spin);
1022 if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) {
1023 spin_unlock(&ls->ls_recover_spin);
1024 return;
1025 }
1026 if (ls->ls_recover_size < jid + 1) {
1027 fs_err(sdp, "recovery_result jid %d short size %d",
1028 jid, ls->ls_recover_size);
1029 spin_unlock(&ls->ls_recover_spin);
1030 return;
1031 }
1032
1033 fs_info(sdp, "recover jid %d result %s\n", jid,
1034 result == LM_RD_GAVEUP ? "busy" : "success");
1035
1036 ls->ls_recover_result[jid] = result;
1037
1038 /* GAVEUP means another node is recovering the journal; delay our
1039 next attempt to recover it, to give the other node a chance to
1040 finish before trying again */
1041
1042 if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags))
1043 queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work,
1044 result == LM_RD_GAVEUP ? HZ : 0);
1045 spin_unlock(&ls->ls_recover_spin);
1046}
1047
1048const struct dlm_lockspace_ops gdlm_lockspace_ops = {
1049 .recover_prep = gdlm_recover_prep,
1050 .recover_slot = gdlm_recover_slot,
1051 .recover_done = gdlm_recover_done,
1052};
1053
1054static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
1055{
1056 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1057 char cluster[GFS2_LOCKNAME_LEN];
1058 const char *fsname;
1059 uint32_t flags;
1060 int error, ops_result;
1061
1062 /*
1063 * initialize everything
1064 */
1065
1066 INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func);
1067 spin_lock_init(&ls->ls_recover_spin);
1068 ls->ls_recover_flags = 0;
1069 ls->ls_recover_mount = 0;
1070 ls->ls_recover_start = 0;
1071 ls->ls_recover_block = 0;
1072 ls->ls_recover_size = 0;
1073 ls->ls_recover_submit = NULL;
1074 ls->ls_recover_result = NULL;
1075
1076 error = set_recover_size(sdp, NULL, 0);
1077 if (error)
1078 goto fail;
1079
1080 /*
1081 * prepare dlm_new_lockspace args
1082 */
1083
1084 fsname = strchr(table, ':');
1085 if (!fsname) {
1086 fs_info(sdp, "no fsname found\n");
1087 error = -EINVAL;
1088 goto fail_free;
1089 }
1090 memset(cluster, 0, sizeof(cluster));
1091 memcpy(cluster, table, strlen(table) - strlen(fsname));
1092 fsname++;
1093
1094 flags = DLM_LSFL_FS | DLM_LSFL_NEWEXCL;
1095 if (ls->ls_nodir)
1096 flags |= DLM_LSFL_NODIR;
1097
1098 /*
1099 * create/join lockspace
1100 */
1101
1102 error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE,
1103 &gdlm_lockspace_ops, sdp, &ops_result,
1104 &ls->ls_dlm);
1105 if (error) {
1106 fs_err(sdp, "dlm_new_lockspace error %d\n", error);
1107 goto fail_free;
1108 }
1109
1110 if (ops_result < 0) {
1111 /*
1112 * dlm does not support ops callbacks,
1113 * old dlm_controld/gfs_controld are used, try without ops.
1114 */
1115 fs_info(sdp, "dlm lockspace ops not used\n");
1116 free_recover_size(ls);
1117 set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags);
1118 return 0;
1119 }
1120
1121 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) {
1122 fs_err(sdp, "dlm lockspace ops disallow jid preset\n");
1123 error = -EINVAL;
1124 goto fail_release;
1125 }
1126
1127 /*
1128 * control_mount() uses control_lock to determine first mounter,
1129 * and for later mounts, waits for any recoveries to be cleared.
1130 */
1131
1132 error = control_mount(sdp);
1133 if (error) {
1134 fs_err(sdp, "mount control error %d\n", error);
1135 goto fail_release;
1136 }
1137
1138 ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
1139 clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
1140 smp_mb__after_clear_bit();
1141 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
1142 return 0;
1143
1144fail_release:
1145 dlm_release_lockspace(ls->ls_dlm, 2);
1146fail_free:
1147 free_recover_size(ls);
1148fail:
1149 return error;
1150}
1151
1152static void gdlm_first_done(struct gfs2_sbd *sdp)
1153{
1154 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
1155 int error;
1156
1157 if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
1158 return;
1159
1160 error = control_first_done(sdp);
1161 if (error)
1162 fs_err(sdp, "mount first_done error %d\n", error);
1163}
1164
 static void gdlm_unmount(struct gfs2_sbd *sdp)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
+	if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+		goto release;
+
+	/* wait for gfs2_control_wq to be done with this mount */
+
+	spin_lock(&ls->ls_recover_spin);
+	set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
+	spin_unlock(&ls->ls_recover_spin);
+	flush_delayed_work_sync(&sdp->sd_control_work);
+
+	/* mounted_lock and control_lock will be purged in dlm recovery */
+release:
 	if (ls->ls_dlm) {
 		dlm_release_lockspace(ls->ls_dlm, 2);
 		ls->ls_dlm = NULL;
 	}
+
+	free_recover_size(ls);
 }
 
 static const match_table_t dlm_tokens = {
@@ -226,6 +1197,8 @@ static const match_table_t dlm_tokens = {
 const struct lm_lockops gfs2_dlm_ops = {
 	.lm_proto_name = "lock_dlm",
 	.lm_mount = gdlm_mount,
+	.lm_first_done = gdlm_first_done,
+	.lm_recovery_result = gdlm_recovery_result,
 	.lm_unmount = gdlm_unmount,
 	.lm_put_lock = gdlm_put_lock,
 	.lm_lock = gdlm_lock,
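For reference, the eight-argument dlm_new_lockspace() calling convention that gdlm_mount() adopts above, reduced to its skeleton. This is only a sketch of the API shape visible in this patch; the example_* names and the fallback policy are illustrative:

#include <linux/dlm.h>

static void example_recover_prep(void *arg) { /* quiesce locking */ }
static void example_recover_slot(void *arg, struct dlm_slot *slot) { /* record failed slot */ }
static void example_recover_done(void *arg, struct dlm_slot *slots,
                                 int num_slots, int our_slot,
                                 uint32_t generation) { /* kick recovery work */ }

static const struct dlm_lockspace_ops example_ops = {
        .recover_prep = example_recover_prep,
        .recover_slot = example_recover_slot,
        .recover_done = example_recover_done,
};

static int example_join(void *arg, dlm_lockspace_t **lockspace)
{
        int ops_result;
        int error;

        error = dlm_new_lockspace("fsname", "cluster",
                                  DLM_LSFL_FS | DLM_LSFL_NEWEXCL, 32,
                                  &example_ops, arg, &ops_result, lockspace);
        if (error)
                return error;
        if (ops_result < 0) {
                /* running dlm does not support ops callbacks; the caller
                   may fall back to running without them, as gdlm_mount()
                   does for old dlm_controld/gfs_controld */
        }
        return 0;
}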
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index c150298e2d8e..a8d9bcd0e19c 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -28,6 +28,8 @@
28#include "recovery.h" 28#include "recovery.h"
29#include "dir.h" 29#include "dir.h"
30 30
31struct workqueue_struct *gfs2_control_wq;
32
31static struct shrinker qd_shrinker = { 33static struct shrinker qd_shrinker = {
32 .shrink = gfs2_shrink_qd_memory, 34 .shrink = gfs2_shrink_qd_memory,
33 .seeks = DEFAULT_SEEKS, 35 .seeks = DEFAULT_SEEKS,
@@ -146,12 +148,19 @@ static int __init init_gfs2_fs(void)
146 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
147 goto fail_wq; 149 goto fail_wq;
148 150
151 gfs2_control_wq = alloc_workqueue("gfs2_control",
152 WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
153 if (!gfs2_control_wq)
154 goto fail_control;
155
149 gfs2_register_debugfs(); 156 gfs2_register_debugfs();
150 157
151 printk("GFS2 installed\n"); 158 printk("GFS2 installed\n");
152 159
153 return 0; 160 return 0;
154 161
162fail_control:
163 destroy_workqueue(gfs_recovery_wq);
155fail_wq: 164fail_wq:
156 unregister_filesystem(&gfs2meta_fs_type); 165 unregister_filesystem(&gfs2meta_fs_type);
157fail_unregister: 166fail_unregister:
@@ -195,6 +204,7 @@ static void __exit exit_gfs2_fs(void)
195 unregister_filesystem(&gfs2_fs_type); 204 unregister_filesystem(&gfs2_fs_type);
196 unregister_filesystem(&gfs2meta_fs_type); 205 unregister_filesystem(&gfs2meta_fs_type);
197 destroy_workqueue(gfs_recovery_wq); 206 destroy_workqueue(gfs_recovery_wq);
207 destroy_workqueue(gfs2_control_wq);
198 208
199 rcu_barrier(); 209 rcu_barrier();
200 210
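The main.c hunks above pair every alloc_workqueue() with a destroy_workqueue(), both in the init-failure unwind path and in module exit. A minimal sketch of that pairing (module and queue names invented here):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int __init example_init(void)
{
        example_wq = alloc_workqueue("example_wq",
                                     WQ_UNBOUND | WQ_FREEZABLE, 0);
        if (!example_wq)
                return -ENOMEM; /* unwind anything set up before this point */
        return 0;
}

static void __exit example_exit(void)
{
        destroy_workqueue(example_wq); /* drains queued work before freeing */
}

module_init(example_init);
module_exit(example_exit);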
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index fe72e79e6ff9..6aacf3f230a2 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
 {
 	char *message = "FIRSTMOUNT=Done";
 	char *envp[] = { message, NULL };
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	ls->ls_first_done = 1;
+
+	fs_info(sdp, "first mount done, others may mount\n");
+
+	if (sdp->sd_lockstruct.ls_ops->lm_first_done)
+		sdp->sd_lockstruct.ls_ops->lm_first_done(sdp);
+
 	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
@@ -944,7 +948,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 	struct gfs2_args *args = &sdp->sd_args;
 	const char *proto = sdp->sd_proto_name;
 	const char *table = sdp->sd_table_name;
-	const char *fsname;
 	char *o, *options;
 	int ret;
 
@@ -1004,21 +1007,12 @@ hostdata_error:
 		}
 	}
 
-	if (sdp->sd_args.ar_spectator)
-		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
-	else
-		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
-			 sdp->sd_lockstruct.ls_jid);
-
-	fsname = strchr(table, ':');
-	if (fsname)
-		fsname++;
 	if (lm->lm_mount == NULL) {
 		fs_info(sdp, "Now mounting FS...\n");
 		complete_all(&sdp->sd_locking_init);
 		return 0;
 	}
-	ret = lm->lm_mount(sdp, fsname);
+	ret = lm->lm_mount(sdp, table);
 	if (ret == 0)
 		fs_info(sdp, "Joined cluster. Now mounting FS...\n");
 	complete_all(&sdp->sd_locking_init);
@@ -1084,7 +1078,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 
 	if (sdp->sd_args.ar_spectator) {
 		sb->s_flags |= MS_RDONLY;
-		set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+		set_bit(SDF_RORECOVERY, &sdp->sd_flags);
 	}
 	if (sdp->sd_args.ar_posix_acl)
 		sb->s_flags |= MS_POSIXACL;
@@ -1124,6 +1118,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	if (error)
 		goto fail;
 
+	snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
+
 	gfs2_create_debugfs_file(sdp);
 
 	error = gfs2_sys_fs_add(sdp);
@@ -1160,6 +1156,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 		goto fail_sb;
 	}
 
+	if (sdp->sd_args.ar_spectator)
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
+			 sdp->sd_table_name);
+	else
+		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
+			 sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
+
 	error = init_inodes(sdp, DO);
 	if (error)
 		goto fail_sb;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f2a02edcac8f..963b2d75200c 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
 	char env_status[20];
 	char *envp[] = { env_jid, env_status, NULL };
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
 	ls->ls_recover_jid_done = jid;
 	ls->ls_recover_jid_status = message;
 	sprintf(env_jid, "JID=%d", jid);
 	sprintf(env_status, "RECOVERY=%s",
 		message == LM_RD_SUCCESS ? "Done" : "Failed");
 	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
+
+	if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
+		sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
 }
 
 void gfs2_recover_func(struct work_struct *work)
@@ -512,7 +516,9 @@ void gfs2_recover_func(struct work_struct *work)
 	if (error)
 		goto fail_gunlock_ji;
 
-	if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
+	if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) {
+		ro = 1;
+	} else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) {
 		if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
 			ro = 1;
 	} else {
@@ -577,6 +583,7 @@ fail_gunlock_j:
 
 	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
 fail:
+	jd->jd_recover_error = error;
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
 done:
 	clear_bit(JDF_RECOVERY, &jd->jd_flags);
@@ -605,6 +612,6 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
 	wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait,
 		    TASK_UNINTERRUPTIBLE);
 
-	return 0;
+	return wait ? jd->jd_recover_error : 0;
 }
 
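gdlm_recovery_result() in the lock_dlm.c hunks above turns these per-journal results into a simple retry policy: a busy (LM_RD_GAVEUP) journal is retried after HZ jiffies to let the other node finish, while success re-queues the control work immediately. The core of that policy shown in isolation (helper name invented):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void example_requeue_control(struct workqueue_struct *wq,
                                    struct delayed_work *dwork, bool busy)
{
        /* HZ jiffies == one second of back-off before the next attempt */
        queue_delayed_work(wq, dwork, busy ? HZ : 0);
}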
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 22234627f684..981bfa32121a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1108,9 +1108,9 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 {
 	struct gfs2_blkreserv *rs = ip->i_res;
 
-	gfs2_blkrsv_put(ip);
 	if (rs->rs_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+	gfs2_blkrsv_put(ip);
 }
 
 /**
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 443cabcfcd23..d33172c291ba 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
 	ssize_t ret;
 	int val = 0;
 
-	if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))
 		val = 1;
 	ret = sprintf(buf, "%d\n", val);
 	return ret;
@@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	val = simple_strtol(buf, NULL, 0);
 
 	if (val == 1)
-		set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+		set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
 	else if (val == 0) {
-		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
 		smp_mb__after_clear_bit();
 		gfs2_glock_thaw(sdp);
 	} else {
@@ -350,8 +350,8 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 		goto out;
 	if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
 		goto out;
 	sdp->sd_lockstruct.ls_first = first;
 	rv = 0;
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
 	return rv ? rv : len;
@@ -360,19 +360,14 @@ out:
 static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 {
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%d\n", ls->ls_first_done);
+	return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags));
 }
 
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
 {
-	unsigned jid;
 	struct gfs2_jdesc *jd;
 	int rv;
 
-	rv = sscanf(buf, "%u", &jid);
-	if (rv != 1)
-		return -EINVAL;
-
 	rv = -ESHUTDOWN;
 	spin_lock(&sdp->sd_jindex_spin);
 	if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
@@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 	}
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
+	return rv;
+}
+
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	unsigned jid;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
+
+	rv = gfs2_recover_set(sdp, jid);
+
 	return rv ? rv : len;
 }
 
diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h
index e94560e836d7..79182d6ad6ac 100644
--- a/fs/gfs2/sys.h
+++ b/fs/gfs2/sys.h
@@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
 int gfs2_sys_init(void);
 void gfs2_sys_uninit(void);
 
+int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid);
+
 #endif /* __SYS_DOT_H__ */
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index edf0a801446b..427682ca9e48 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -499,9 +499,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	if (!sbi->hidden_dir) {
 		mutex_lock(&sbi->vh_mutex);
 		sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-		hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
-				   sbi->hidden_dir);
+		if (!sbi->hidden_dir) {
+			mutex_unlock(&sbi->vh_mutex);
+			err = -ENOMEM;
+			goto out_put_root;
+		}
+		err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,
+					 &str, sbi->hidden_dir);
 		mutex_unlock(&sbi->vh_mutex);
+		if (err)
+			goto out_put_hidden_dir;
 
 		hfsplus_mark_inode_dirty(sbi->hidden_dir,
 					 HFSPLUS_I_CAT_DIRTY);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index e425ad9d0490..1e85a7ac0217 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 }
 
 static int hugetlbfs_migrate_page(struct address_space *mapping,
-				struct page *newpage, struct page *page)
+				struct page *newpage, struct page *page,
+				enum migrate_mode mode)
 {
 	int rc;
 
diff --git a/fs/inode.c b/fs/inode.c
index 87535753ab04..fb10d86ffad7 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -322,9 +322,6 @@ EXPORT_SYMBOL(clear_nlink);
 void set_nlink(struct inode *inode, unsigned int nlink)
 {
 	if (!nlink) {
-		printk_ratelimited(KERN_INFO
-			"set_nlink() clearing i_nlink on %s inode %li\n",
-			inode->i_sb->s_type->name, inode->i_ino);
 		clear_nlink(inode);
 	} else {
 		/* Yes, some filesystems do change nlink from zero to one */
@@ -776,6 +773,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan)
 	else
 		__count_vm_events(PGINODESTEAL, reap);
 	spin_unlock(&sb->s_inode_lru_lock);
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += reap;
 
 	dispose_list(&freeable);
 }
diff --git a/fs/ioprio.c b/fs/ioprio.c
index f79dab83e17b..f84b380d65e5 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -48,28 +48,12 @@ int set_task_ioprio(struct task_struct *task, int ioprio)
 	if (err)
 		return err;
 
-	task_lock(task);
-	do {
-		ioc = task->io_context;
-		/* see wmb() in current_io_context() */
-		smp_read_barrier_depends();
-		if (ioc)
-			break;
-
-		ioc = alloc_io_context(GFP_ATOMIC, -1);
-		if (!ioc) {
-			err = -ENOMEM;
-			break;
-		}
-		task->io_context = ioc;
-	} while (1);
-
-	if (!err) {
-		ioc->ioprio = ioprio;
-		ioc->ioprio_changed = 1;
+	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc) {
+		ioc_ioprio_changed(ioc, ioprio);
+		put_io_context(ioc, NULL);
 	}
 
-	task_unlock(task);
 	return err;
 }
 EXPORT_SYMBOL_GPL(set_task_ioprio);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 7b99f5f460be..bd62c76fb5df 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -948,8 +948,11 @@ root_found:
 
 	/* get the root dentry */
 	s->s_root = d_alloc_root(inode);
-	if (!(s->s_root))
-		goto out_no_root;
+	if (!(s->s_root)) {
+		iput(inode);
+		error = -ENOMEM;
+		goto out_no_inode;
+	}
 
 	kfree(opt.iocharset);
 
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 5d1a00a5041b..05f0754f2b46 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -453,8 +453,6 @@ out:
 *
 * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
 *
- * Called with the journal lock held.
- *
 * This is the only part of the journaling code which really needs to be
 * aware of transaction aborts.  Checkpointing involves writing to the
 * main filesystem area rather than to the journal, so it can proceed
@@ -472,13 +470,14 @@ int cleanup_journal_tail(journal_t *journal)
 	if (is_journal_aborted(journal))
 		return 1;
 
-	/* OK, work out the oldest transaction remaining in the log, and
+	/*
+	 * OK, work out the oldest transaction remaining in the log, and
 	 * the log block it starts at.
 	 *
 	 * If the log is now empty, we need to work out which is the
 	 * next transaction ID we will write, and where it will
-	 * start. */
-
+	 * start.
+	 */
 	spin_lock(&journal->j_state_lock);
 	spin_lock(&journal->j_list_lock);
 	transaction = journal->j_checkpoint_transactions;
@@ -504,7 +503,25 @@ int cleanup_journal_tail(journal_t *journal)
 		spin_unlock(&journal->j_state_lock);
 		return 1;
 	}
+	spin_unlock(&journal->j_state_lock);
+
+	/*
+	 * We need to make sure that any blocks that were recently written out
+	 * --- perhaps by log_do_checkpoint() --- are flushed out before we
+	 * drop the transactions from the journal. It's unlikely this will be
+	 * necessary, especially with an appropriately sized journal, but we
+	 * need this to guarantee correctness. Fortunately
+	 * cleanup_journal_tail() doesn't get called all that often.
+	 */
+	if (journal->j_flags & JFS_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
+	spin_lock(&journal->j_state_lock);
+	if (!tid_gt(first_tid, journal->j_tail_sequence)) {
+		spin_unlock(&journal->j_state_lock);
+		/* Someone else cleaned up journal so return 0 */
+		return 0;
+	}
 	/* OK, update the superblock to recover the freed space.
 	 * Physical blocks come first: have we wrapped beyond the end of
 	 * the log? */
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 8799207df058..f2b9a571f4cf 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -392,6 +392,12 @@ void journal_commit_transaction(journal_t *journal)
 	jbd_debug (3, "JBD: commit phase 1\n");
 
 	/*
+	 * Clear the revoked flag to reflect that there are no revoked
+	 * buffers in the next transaction which is going to be started.
+	 */
+	journal_clear_buffer_revoked_flags(journal);
+
+	/*
 	 * Switch to a new revoke table.
 	 */
 	journal_switch_revoke_table(journal);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index a96cff0c5f1d..59c09f9541b5 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -721,7 +721,6 @@ static journal_t * journal_init_common (void)
 	init_waitqueue_head(&journal->j_wait_checkpoint);
 	init_waitqueue_head(&journal->j_wait_commit);
 	init_waitqueue_head(&journal->j_wait_updates);
-	mutex_init(&journal->j_barrier);
 	mutex_init(&journal->j_checkpoint_mutex);
 	spin_lock_init(&journal->j_revoke_lock);
 	spin_lock_init(&journal->j_list_lock);
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c
index 5b43e96788e6..008bf062fd26 100644
--- a/fs/jbd/recovery.c
+++ b/fs/jbd/recovery.c
@@ -20,6 +20,7 @@
 #include <linux/fs.h>
 #include <linux/jbd.h>
 #include <linux/errno.h>
+#include <linux/blkdev.h>
 #endif
 
 /*
@@ -263,6 +264,9 @@ int journal_recover(journal_t *journal)
 	err2 = sync_blockdev(journal->j_fs_dev);
 	if (!err)
 		err = err2;
+	/* Flush disk caches to get replayed data on the permanent storage */
+	if (journal->j_flags & JFS_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 
 	return err;
 }
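Both jbd hunks above use the 3.x-era blkdev_issue_flush() signature (block device, gfp mask, optional error-sector pointer), which issues a cache-flush request to the device and waits for it to complete. A minimal sketch of the guarded call pattern (helper name invented):

#include <linux/blkdev.h>
#include <linux/jbd.h>

static int example_flush_journal_dev(journal_t *journal)
{
        /* only flush when the journal was mounted with barriers enabled */
        if (journal->j_flags & JFS_BARRIER)
                return blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
        return 0;
}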
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 305a90763154..25c713e7071c 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -47,6 +47,10 @@
 * overwriting the new data.  We don't even need to clear the revoke
 * bit here.
 *
+ * We cache the revoke status of a buffer in the current transaction in its
+ * b_state bits.  As the name says, the revokevalid flag indicates that the
+ * cached revoke status of a buffer is valid and we can rely on it.
+ *
 * Revoke information on buffers is a tri-state value:
 *
 * RevokeValid clear:	no cached revoke status, need to look it up
@@ -479,6 +483,36 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	return did_revoke;
 }
 
+/*
+ * journal_clear_buffer_revoked_flags clears the revoked flag of buffers in
+ * the revoke table to reflect that there are no revoked buffers in the
+ * next transaction which is going to be started.
+ */
+void journal_clear_buffer_revoked_flags(journal_t *journal)
+{
+	struct jbd_revoke_table_s *revoke = journal->j_revoke;
+	int i = 0;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		struct list_head *hash_list;
+		struct list_head *list_entry;
+		hash_list = &revoke->hash_table[i];
+
+		list_for_each(list_entry, hash_list) {
+			struct jbd_revoke_record_s *record;
+			struct buffer_head *bh;
+			record = (struct jbd_revoke_record_s *)list_entry;
+			bh = __find_get_block(journal->j_fs_dev,
+					      record->blocknr,
+					      journal->j_blocksize);
+			if (bh) {
+				clear_buffer_revoked(bh);
+				__brelse(bh);
+			}
+		}
+	}
+}
+
 /* journal_switch_revoke table select j_revoke for next transaction
  * we do not want to suspend any processing until all revokes are
  * written -bzzz
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 7e59c6e66f9b..7fce94b04bc3 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -426,17 +426,34 @@ int journal_restart(handle_t *handle, int nblocks)
 * void journal_lock_updates () - establish a transaction barrier.
 * @journal:  Journal to establish a barrier on.
 *
- * This locks out any further updates from being started, and blocks
- * until all existing updates have completed, returning only once the
- * journal is in a quiescent state with no updates running.
- *
- * The journal lock should not be held on entry.
+ * This locks out any further updates from being started, and blocks until all
+ * existing updates have completed, returning only once the journal is in a
+ * quiescent state with no updates running.
+ *
+ * We do not use a simple mutex for synchronization as there are syscalls
+ * which want to return with the filesystem locked and that trips up lockdep.
+ * Also hibernate needs to lock the filesystem but a locked mutex then blocks
+ * hibernation.  Since locking the filesystem is a rare operation, we use a
+ * simple counter and waitqueue for locking.
 */
 void journal_lock_updates(journal_t *journal)
 {
 	DEFINE_WAIT(wait);
 
+wait:
+	/* Wait for previous locked operation to finish */
+	wait_event(journal->j_wait_transaction_locked,
+		   journal->j_barrier_count == 0);
+
 	spin_lock(&journal->j_state_lock);
+	/*
+	 * Check reliably under the lock whether we are the ones winning the
+	 * race and locking the journal
+	 */
+	if (journal->j_barrier_count > 0) {
+		spin_unlock(&journal->j_state_lock);
+		goto wait;
+	}
 	++journal->j_barrier_count;
 
 	/* Wait until there are no running updates */
@@ -460,14 +477,6 @@ void journal_lock_updates(journal_t *journal)
 		spin_lock(&journal->j_state_lock);
 	}
 	spin_unlock(&journal->j_state_lock);
-
-	/*
-	 * We have now established a barrier against other normal updates, but
-	 * we also need to barrier against other journal_lock_updates() calls
-	 * to make sure that we serialise special journal-locked operations
-	 * too.
-	 */
-	mutex_lock(&journal->j_barrier);
 }
 
 /**
@@ -475,14 +484,11 @@ void journal_lock_updates(journal_t *journal)
 * @journal:  Journal to release the barrier on.
 *
 * Release a transaction barrier obtained with journal_lock_updates().
- *
- * Should be called without the journal lock held.
 */
 void journal_unlock_updates (journal_t *journal)
 {
 	J_ASSERT(journal->j_barrier_count != 0);
 
-	mutex_unlock(&journal->j_barrier);
 	spin_lock(&journal->j_state_lock);
 	--journal->j_barrier_count;
 	spin_unlock(&journal->j_state_lock);
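The journal_lock_updates() rework above replaces a mutex held across the whole locked section with a counter plus waitqueue, so nothing stays "held" when a syscall returns with the filesystem still locked. The same scheme in isolation, as an illustrative sketch (not the jbd code itself):

#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/sched.h>

struct example_barrier {
        spinlock_t lock;
        int count;
        wait_queue_head_t wq;
};

static void example_barrier_init(struct example_barrier *b)
{
        spin_lock_init(&b->lock);
        b->count = 0;
        init_waitqueue_head(&b->wq);
}

static void example_barrier_lock(struct example_barrier *b)
{
repeat:
        /* unlocked wait; the recheck under the lock below catches races */
        wait_event(b->wq, b->count == 0);

        spin_lock(&b->lock);
        if (b->count > 0) {
                /* someone else won the race; go back to waiting */
                spin_unlock(&b->lock);
                goto repeat;
        }
        b->count++;
        spin_unlock(&b->lock);
}

static void example_barrier_unlock(struct example_barrier *b)
{
        spin_lock(&b->lock);
        b->count--;
        spin_unlock(&b->lock);
        wake_up(&b->wq);
}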
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 68d704db787f..5069b8475150 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -430,6 +430,12 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	jbd_debug(3, "JBD2: commit phase 1\n");
 
 	/*
+	 * Clear the revoked flag to reflect that there are no revoked
+	 * buffers in the next transaction which is going to be started.
+	 */
+	jbd2_clear_buffer_revoked_flags(journal);
+
+	/*
 	 * Switch to a new revoke table.
 	 */
 	jbd2_journal_switch_revoke_table(journal);
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 69fd93588118..30b2867d6cc9 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -47,6 +47,10 @@
 * overwriting the new data.  We don't even need to clear the revoke
 * bit here.
 *
+ * We cache the revoke status of a buffer in the current transaction in its
+ * b_state bits.  As the name says, the revokevalid flag indicates that the
+ * cached revoke status of a buffer is valid and we can rely on it.
+ *
 * Revoke information on buffers is a tri-state value:
 *
 * RevokeValid clear:	no cached revoke status, need to look it up
@@ -478,6 +482,36 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	return did_revoke;
 }
 
+/*
+ * jbd2_clear_buffer_revoked_flags clears the revoked flag of buffers in
+ * the revoke table to reflect that there are no revoked buffers in the
+ * next transaction which is going to be started.
+ */
+void jbd2_clear_buffer_revoked_flags(journal_t *journal)
+{
+	struct jbd2_revoke_table_s *revoke = journal->j_revoke;
+	int i = 0;
+
+	for (i = 0; i < revoke->hash_size; i++) {
+		struct list_head *hash_list;
+		struct list_head *list_entry;
+		hash_list = &revoke->hash_table[i];
+
+		list_for_each(list_entry, hash_list) {
+			struct jbd2_revoke_record_s *record;
+			struct buffer_head *bh;
+			record = (struct jbd2_revoke_record_s *)list_entry;
+			bh = __find_get_block(journal->j_fs_dev,
+					      record->blocknr,
+					      journal->j_blocksize);
+			if (bh) {
+				clear_buffer_revoked(bh);
+				__brelse(bh);
+			}
+		}
+	}
+}
+
 /* journal_switch_revoke table select j_revoke for next transaction
  * we do not want to suspend any processing until all revokes are
  * written -bzzz
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index a0e41a4c080e..35ae096bed5d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -517,12 +517,13 @@ void jbd2_journal_lock_updates(journal_t *journal)
 			break;
 
 		spin_lock(&transaction->t_handle_lock);
+		prepare_to_wait(&journal->j_wait_updates, &wait,
+				TASK_UNINTERRUPTIBLE);
 		if (!atomic_read(&transaction->t_updates)) {
 			spin_unlock(&transaction->t_handle_lock);
+			finish_wait(&journal->j_wait_updates, &wait);
 			break;
 		}
-		prepare_to_wait(&journal->j_wait_updates, &wait,
-				TASK_UNINTERRUPTIBLE);
 		spin_unlock(&transaction->t_handle_lock);
 		write_unlock(&journal->j_state_lock);
 		schedule();
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index e513f1913c15..a01cdad6aad1 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -74,7 +74,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
 	((struct erase_priv_struct *)instr->priv)->jeb = jeb;
 	((struct erase_priv_struct *)instr->priv)->c = c;
 
-	ret = c->mtd->erase(c->mtd, instr);
+	ret = mtd_erase(c->mtd, instr);
 	if (!ret)
 		return;
 
@@ -336,12 +336,11 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 	uint32_t ofs;
 	size_t retlen;
 	int ret = -EIO;
+	unsigned long *wordebuf;
 
-	if (c->mtd->point) {
-		unsigned long *wordebuf;
-
-		ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size,
-				    &retlen, &ebuf, NULL);
+	ret = mtd_point(c->mtd, jeb->offset, c->sector_size, &retlen,
+			&ebuf, NULL);
+	if (ret != -EOPNOTSUPP) {
 		if (ret) {
 			D1(printk(KERN_DEBUG "MTD point failed %d\n", ret));
 			goto do_flash_read;
@@ -349,7 +348,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 		if (retlen < c->sector_size) {
 			/* Don't muck about if it won't let us point to the whole erase sector */
 			D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen));
-			c->mtd->unpoint(c->mtd, jeb->offset, retlen);
+			mtd_unpoint(c->mtd, jeb->offset, retlen);
 			goto do_flash_read;
 		}
 		wordebuf = ebuf-sizeof(*wordebuf);
@@ -358,7 +357,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 			if (*++wordebuf != ~0)
 				break;
 		} while(--retlen);
-		c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size);
+		mtd_unpoint(c->mtd, jeb->offset, c->sector_size);
 		if (retlen) {
 			printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n",
 			       *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf));
@@ -381,7 +380,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl
 
 	*bad_offset = ofs;
 
-	ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf);
+	ret = mtd_read(c->mtd, ofs, readlen, &retlen, ebuf);
 	if (ret) {
 		printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret);
 		ret = -EIO;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 4b8afe39a87f..2e0123867cb1 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -466,7 +466,6 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
 
 	if (insert_inode_locked(inode) < 0) {
 		make_bad_inode(inode);
-		unlock_new_inode(inode);
 		iput(inode);
 		return ERR_PTR(-EINVAL);
 	}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index ee57bac1ba6d..3093ac4fb24c 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -62,17 +62,15 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 #ifndef __ECOS
 	/* TODO: instead, encapsulate point() stuff to jffs2_flash_read(),
 	 * adding a jffs2_flash_read_end() interface. */
-	if (c->mtd->point) {
-		err = c->mtd->point(c->mtd, ofs, len, &retlen,
-				    (void **)&buffer, NULL);
-		if (!err && retlen < len) {
-			JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
-			c->mtd->unpoint(c->mtd, ofs, retlen);
-		} else if (err)
+	err = mtd_point(c->mtd, ofs, len, &retlen, (void **)&buffer, NULL);
+	if (!err && retlen < len) {
+		JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize);
+		mtd_unpoint(c->mtd, ofs, retlen);
+	} else if (err) {
+		if (err != -EOPNOTSUPP)
 			JFFS2_WARNING("MTD point failed: error code %d.\n", err);
-	else
+	} else
 		pointed = 1; /* successfully pointed to device */
-	}
 #endif
 
 	if (!pointed) {
@@ -101,7 +99,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info
 		kfree(buffer);
 #ifndef __ECOS
 	else
-		c->mtd->unpoint(c->mtd, ofs, len);
+		mtd_unpoint(c->mtd, ofs, len);
 #endif
 
 	if (crc != tn->data_crc) {
@@ -137,7 +135,7 @@ free_out:
 	kfree(buffer);
 #ifndef __ECOS
 	else
-		c->mtd->unpoint(c->mtd, ofs, len);
+		mtd_unpoint(c->mtd, ofs, len);
 #endif
 	return err;
 }
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 28107ca136e4..f99464833bb2 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -97,15 +97,15 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 	size_t pointlen, try_size;
 
 	if (c->mtd->point) {
-		ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen,
+		ret = mtd_point(c->mtd, 0, c->mtd->size, &pointlen,
 				(void **)&flashbuf, NULL);
 		if (!ret && pointlen < c->mtd->size) {
 			/* Don't muck about if it won't let us point to the whole flash */
 			D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen));
-			c->mtd->unpoint(c->mtd, 0, pointlen);
+			mtd_unpoint(c->mtd, 0, pointlen);
 			flashbuf = NULL;
 		}
-		if (ret)
+		if (ret && ret != -EOPNOTSUPP)
 			D1(printk(KERN_DEBUG "MTD point failed %d\n", ret));
 	}
 #endif
@@ -273,7 +273,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		kfree(flashbuf);
 #ifndef __ECOS
 	else
-		c->mtd->unpoint(c->mtd, 0, c->mtd->size);
+		mtd_unpoint(c->mtd, 0, c->mtd->size);
 #endif
 	kfree(s);
 	return ret;
@@ -455,7 +455,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 	if (jffs2_cleanmarker_oob(c)) {
 		int ret;
 
-		if (c->mtd->block_isbad(c->mtd, jeb->offset))
+		if (mtd_block_isbad(c->mtd, jeb->offset))
 			return BLK_STATE_BADBLOCK;
 
 		ret = jffs2_check_nand_cleanmarker(c, jeb);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 8be4925296cf..f2d96b5e64f6 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -335,9 +335,7 @@ static void jffs2_put_super (struct super_block *sb)
 	jffs2_flash_cleanup(c);
 	kfree(c->inocache_list);
 	jffs2_clear_xattr_subsystem(c);
-	if (c->mtd->sync)
-		c->mtd->sync(c->mtd);
-
+	mtd_sync(c->mtd);
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
 
343 341
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index b09e51d2f81f..30e8f47e8a23 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -228,7 +228,7 @@ static int jffs2_verify_write(struct jffs2_sb_info *c, unsigned char *buf,
228 size_t retlen; 228 size_t retlen;
229 char *eccstr; 229 char *eccstr;
230 230
231 ret = c->mtd->read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify); 231 ret = mtd_read(c->mtd, ofs, c->wbuf_pagesize, &retlen, c->wbuf_verify);
232 if (ret && ret != -EUCLEAN && ret != -EBADMSG) { 232 if (ret && ret != -EUCLEAN && ret != -EBADMSG) {
233 printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret); 233 printk(KERN_WARNING "jffs2_verify_write(): Read back of page at %08x failed: %d\n", c->wbuf_ofs, ret);
234 return ret; 234 return ret;
@@ -337,7 +337,8 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
337 } 337 }
338 338
339 /* Do the read... */ 339 /* Do the read... */
340 ret = c->mtd->read(c->mtd, start, c->wbuf_ofs - start, &retlen, buf); 340 ret = mtd_read(c->mtd, start, c->wbuf_ofs - start, &retlen,
341 buf);
341 342
342 /* ECC recovered ? */ 343 /* ECC recovered ? */
343 if ((ret == -EUCLEAN || ret == -EBADMSG) && 344 if ((ret == -EUCLEAN || ret == -EBADMSG) &&
@@ -413,13 +414,12 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
413 if (breakme++ == 20) { 414 if (breakme++ == 20) {
414 printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs); 415 printk(KERN_NOTICE "Faking write error at 0x%08x\n", ofs);
415 breakme = 0; 416 breakme = 0;
416 c->mtd->write(c->mtd, ofs, towrite, &retlen, 417 mtd_write(c->mtd, ofs, towrite, &retlen, brokenbuf);
417 brokenbuf);
418 ret = -EIO; 418 ret = -EIO;
419 } else 419 } else
420#endif 420#endif
421 ret = c->mtd->write(c->mtd, ofs, towrite, &retlen, 421 ret = mtd_write(c->mtd, ofs, towrite, &retlen,
422 rewrite_buf); 422 rewrite_buf);
423 423
424 if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) { 424 if (ret || retlen != towrite || jffs2_verify_write(c, rewrite_buf, ofs)) {
425 /* Argh. We tried. Really we did. */ 425 /* Argh. We tried. Really we did. */
@@ -619,13 +619,14 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
619 if (breakme++ == 20) { 619 if (breakme++ == 20) {
620 printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs); 620 printk(KERN_NOTICE "Faking write error at 0x%08x\n", c->wbuf_ofs);
621 breakme = 0; 621 breakme = 0;
622 c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, 622 mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen,
623 brokenbuf); 623 brokenbuf);
624 ret = -EIO; 624 ret = -EIO;
625 } else 625 } else
626#endif 626#endif
627 627
628 ret = c->mtd->write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize, &retlen, c->wbuf); 628 ret = mtd_write(c->mtd, c->wbuf_ofs, c->wbuf_pagesize,
629 &retlen, c->wbuf);
629 630
630 if (ret) { 631 if (ret) {
631 printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret); 632 printk(KERN_WARNING "jffs2_flush_wbuf(): Write failed with %d\n", ret);
@@ -861,8 +862,8 @@ int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs,
861 v += wbuf_retlen; 862 v += wbuf_retlen;
862 863
863 if (vlen >= c->wbuf_pagesize) { 864 if (vlen >= c->wbuf_pagesize) {
864 ret = c->mtd->write(c->mtd, outvec_to, PAGE_DIV(vlen), 865 ret = mtd_write(c->mtd, outvec_to, PAGE_DIV(vlen),
865 &wbuf_retlen, v); 866 &wbuf_retlen, v);
866 if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen)) 867 if (ret < 0 || wbuf_retlen != PAGE_DIV(vlen))
867 goto outfile; 868 goto outfile;
868 869
@@ -948,11 +949,11 @@ int jffs2_flash_read(struct jffs2_sb_info *c, loff_t ofs, size_t len, size_t *re
948 int ret; 949 int ret;
949 950
950 if (!jffs2_is_writebuffered(c)) 951 if (!jffs2_is_writebuffered(c))
951 return c->mtd->read(c->mtd, ofs, len, retlen, buf); 952 return mtd_read(c->mtd, ofs, len, retlen, buf);
952 953
953 /* Read flash */ 954 /* Read flash */
954 down_read(&c->wbuf_sem); 955 down_read(&c->wbuf_sem);
955 ret = c->mtd->read(c->mtd, ofs, len, retlen, buf); 956 ret = mtd_read(c->mtd, ofs, len, retlen, buf);
956 957
957 if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) { 958 if ( (ret == -EBADMSG || ret == -EUCLEAN) && (*retlen == len) ) {
958 if (ret == -EBADMSG) 959 if (ret == -EBADMSG)
@@ -1031,7 +1032,7 @@ int jffs2_check_oob_empty(struct jffs2_sb_info *c,
1031 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1032 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
1032 ops.datbuf = NULL; 1033 ops.datbuf = NULL;
1033 1034
1034 ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); 1035 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1035 if (ret || ops.oobretlen != ops.ooblen) { 1036 if (ret || ops.oobretlen != ops.ooblen) {
1036 printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" 1037 printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
1037 " bytes, read %zd bytes, error %d\n", 1038 " bytes, read %zd bytes, error %d\n",
@@ -1074,7 +1075,7 @@ int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
1074 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1075 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
1075 ops.datbuf = NULL; 1076 ops.datbuf = NULL;
1076 1077
1077 ret = c->mtd->read_oob(c->mtd, jeb->offset, &ops); 1078 ret = mtd_read_oob(c->mtd, jeb->offset, &ops);
1078 if (ret || ops.oobretlen != ops.ooblen) { 1079 if (ret || ops.oobretlen != ops.ooblen) {
1079 printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd" 1080 printk(KERN_ERR "cannot read OOB for EB at %08x, requested %zd"
1080 " bytes, read %zd bytes, error %d\n", 1081 " bytes, read %zd bytes, error %d\n",
@@ -1100,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1100 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0; 1101 ops.len = ops.ooboffs = ops.retlen = ops.oobretlen = 0;
1101 ops.datbuf = NULL; 1102 ops.datbuf = NULL;
1102 1103
1103 ret = c->mtd->write_oob(c->mtd, jeb->offset, &ops); 1104 ret = mtd_write_oob(c->mtd, jeb->offset, &ops);
1104 if (ret || ops.oobretlen != ops.ooblen) { 1105 if (ret || ops.oobretlen != ops.ooblen) {
1105 printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd" 1106 printk(KERN_ERR "cannot write OOB for EB at %08x, requested %zd"
1106 " bytes, read %zd bytes, error %d\n", 1107 " bytes, read %zd bytes, error %d\n",
@@ -1129,11 +1130,8 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
1129 if( ++jeb->bad_count < MAX_ERASE_FAILURES) 1130 if( ++jeb->bad_count < MAX_ERASE_FAILURES)
1130 return 0; 1131 return 0;
1131 1132
1132 if (!c->mtd->block_markbad)
1133 return 1; // What else can we do?
1134
1135 printk(KERN_WARNING "JFFS2: marking eraseblock at %08x as bad\n", bad_offset); 1133 printk(KERN_WARNING "JFFS2: marking eraseblock at %08x as bad\n", bad_offset);
1136 ret = c->mtd->block_markbad(c->mtd, bad_offset); 1134 ret = mtd_block_markbad(c->mtd, bad_offset);
1137 1135
1138 if (ret) { 1136 if (ret) {
1139 D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret)); 1137 D1(printk(KERN_WARNING "jffs2_write_nand_badblock(): Write failed for block at %08x: error %d\n", jeb->offset, ret));
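The mtd_read()/mtd_write()/mtd_block_markbad() calls introduced above are the new static-inline wrappers around the struct mtd_info method pointers. A minimal sketch of the wrapper pattern, assuming the 3.3-era include/linux/mtd/mtd.h (bodies here are illustrative, not part of this diff):

	/* Each mtd_*() helper forwards to the matching method pointer, so
	 * callers no longer dereference c->mtd->read directly. */
	static inline int mtd_read(struct mtd_info *mtd, loff_t from, size_t len,
				   size_t *retlen, u_char *buf)
	{
		return mtd->read(mtd, from, len, retlen, buf);
	}

	/* The wrapper also absorbs NULL-method checks, which is why the
	 * open-coded "if (!c->mtd->block_markbad) return 1;" test above
	 * could be deleted from jffs2_write_nand_badblock(). */
	static inline int mtd_block_markbad(struct mtd_info *mtd, loff_t ofs)
	{
		if (!mtd->block_markbad)
			return -EOPNOTSUPP;
		return mtd->block_markbad(mtd, ofs);
	}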
diff --git a/fs/jffs2/writev.c b/fs/jffs2/writev.c
index b9276b11bac6..a1bda9dab3f8 100644
--- a/fs/jffs2/writev.c
+++ b/fs/jffs2/writev.c
@@ -13,30 +13,6 @@
13#include <linux/mtd/mtd.h> 13#include <linux/mtd/mtd.h>
14#include "nodelist.h" 14#include "nodelist.h"
15 15
16/* This ought to be in core MTD code. All registered MTD devices
17 without writev should have this put in place. Bug the MTD
18 maintainer */
19static inline int mtd_fake_writev(struct mtd_info *mtd, const struct kvec *vecs,
20 unsigned long count, loff_t to, size_t *retlen)
21{
22 unsigned long i;
23 size_t totlen = 0, thislen;
24 int ret = 0;
25
26 for (i=0; i<count; i++) {
27 if (!vecs[i].iov_len)
28 continue;
29 ret = mtd->write(mtd, to, vecs[i].iov_len, &thislen, vecs[i].iov_base);
30 totlen += thislen;
31 if (ret || thislen != vecs[i].iov_len)
32 break;
33 to += vecs[i].iov_len;
34 }
35 if (retlen)
36 *retlen = totlen;
37 return ret;
38}
39
40int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs, 16int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs,
41 unsigned long count, loff_t to, size_t *retlen) 17 unsigned long count, loff_t to, size_t *retlen)
42{ 18{
@@ -50,18 +26,14 @@ int jffs2_flash_direct_writev(struct jffs2_sb_info *c, const struct kvec *vecs,
50 } 26 }
51 } 27 }
52 28
53 if (c->mtd->writev) 29 return mtd_writev(c->mtd, vecs, count, to, retlen);
54 return c->mtd->writev(c->mtd, vecs, count, to, retlen);
55 else {
56 return mtd_fake_writev(c->mtd, vecs, count, to, retlen);
57 }
58} 30}
59 31
60int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len, 32int jffs2_flash_direct_write(struct jffs2_sb_info *c, loff_t ofs, size_t len,
61 size_t *retlen, const u_char *buf) 33 size_t *retlen, const u_char *buf)
62{ 34{
63 int ret; 35 int ret;
64 ret = c->mtd->write(c->mtd, ofs, len, retlen, buf); 36 ret = mtd_write(c->mtd, ofs, len, retlen, buf);
65 37
66 if (jffs2_sum_active()) { 38 if (jffs2_sum_active()) {
67 struct kvec vecs[1]; 39 struct kvec vecs[1];
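With mtd_fake_writev() deleted, jffs2 relies on the core mtd_writev() helper to provide the same fallback. A sketch of that helper, assuming it mirrors the loop removed above (the exact core-MTD body is an assumption):

	int mtd_writev(struct mtd_info *mtd, const struct kvec *vecs,
		       unsigned long count, loff_t to, size_t *retlen)
	{
		unsigned long i;
		size_t totlen = 0, thislen;
		int ret = 0;

		*retlen = 0;
		if (mtd->writev)	/* driver provides native vectored write */
			return mtd->writev(mtd, vecs, count, to, retlen);

		/* Otherwise emulate it with a loop of plain writes, exactly
		 * as the deleted jffs2-local fallback did. */
		for (i = 0; i < count; i++) {
			if (!vecs[i].iov_len)
				continue;
			ret = mtd_write(mtd, to, vecs[i].iov_len, &thislen,
					vecs[i].iov_base);
			totlen += thislen;
			if (ret || thislen != vecs[i].iov_len)
				break;
			to += vecs[i].iov_len;
		}
		*retlen = totlen;
		return ret;
	}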
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 23d7451b2938..65ba36b80a9e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(nsm_lock);
55 * Local NSM state 55 * Local NSM state
56 */ 56 */
57u32 __read_mostly nsm_local_state; 57u32 __read_mostly nsm_local_state;
58int __read_mostly nsm_use_hostnames; 58bool __read_mostly nsm_use_hostnames;
59 59
60static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) 60static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
61{ 61{
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 339e17e9133d..9c501449450d 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -13,13 +13,14 @@
13 13
14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
15 15
16static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) 16static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len,
17 void *buf)
17{ 18{
18 struct mtd_info *mtd = logfs_super(sb)->s_mtd; 19 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
19 size_t retlen; 20 size_t retlen;
20 int ret; 21 int ret;
21 22
22 ret = mtd->read(mtd, ofs, len, &retlen, buf); 23 ret = mtd_read(mtd, ofs, len, &retlen, buf);
23 BUG_ON(ret == -EINVAL); 24 BUG_ON(ret == -EINVAL);
24 if (ret) 25 if (ret)
25 return ret; 26 return ret;
@@ -31,7 +32,8 @@ static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
31 return 0; 32 return 0;
32} 33}
33 34
34static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) 35static int logfs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
36 void *buf)
35{ 37{
36 struct logfs_super *super = logfs_super(sb); 38 struct logfs_super *super = logfs_super(sb);
37 struct mtd_info *mtd = super->s_mtd; 39 struct mtd_info *mtd = super->s_mtd;
@@ -47,7 +49,7 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
47 BUG_ON(len > PAGE_CACHE_SIZE); 49 BUG_ON(len > PAGE_CACHE_SIZE);
48 page_start = ofs & PAGE_CACHE_MASK; 50 page_start = ofs & PAGE_CACHE_MASK;
49 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; 51 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
50 ret = mtd->write(mtd, ofs, len, &retlen, buf); 52 ret = mtd_write(mtd, ofs, len, &retlen, buf);
51 if (ret || (retlen != len)) 53 if (ret || (retlen != len))
52 return -EIO; 54 return -EIO;
53 55
@@ -60,14 +62,15 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
60 * asynchronous properties. So just to prevent the first implementor of such 62 * asynchronous properties. So just to prevent the first implementor of such
61 * a thing from breaking logfs in 2350, we do the usual pointless dance to 63 * a thing from breaking logfs in 2350, we do the usual pointless dance to
62 * declare a completion variable and wait for completion before returning 64 * declare a completion variable and wait for completion before returning
63 * from mtd_erase(). What an exercise in futility! 65 * from logfs_mtd_erase(). What an exercise in futility!
64 */ 66 */
65static void logfs_erase_callback(struct erase_info *ei) 67static void logfs_erase_callback(struct erase_info *ei)
66{ 68{
67 complete((struct completion *)ei->priv); 69 complete((struct completion *)ei->priv);
68} 70}
69 71
70static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) 72static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
73 size_t len)
71{ 74{
72 struct logfs_super *super = logfs_super(sb); 75 struct logfs_super *super = logfs_super(sb);
73 struct address_space *mapping = super->s_mapping_inode->i_mapping; 76 struct address_space *mapping = super->s_mapping_inode->i_mapping;
@@ -84,7 +87,7 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
84 return 0; 87 return 0;
85} 88}
86 89
87static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, 90static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
88 int ensure_write) 91 int ensure_write)
89{ 92{
90 struct mtd_info *mtd = logfs_super(sb)->s_mtd; 93 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
@@ -102,30 +105,29 @@ static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
102 ei.len = len; 105 ei.len = len;
103 ei.callback = logfs_erase_callback; 106 ei.callback = logfs_erase_callback;
104 ei.priv = (long)&complete; 107 ei.priv = (long)&complete;
105 ret = mtd->erase(mtd, &ei); 108 ret = mtd_erase(mtd, &ei);
106 if (ret) 109 if (ret)
107 return -EIO; 110 return -EIO;
108 111
109 wait_for_completion(&complete); 112 wait_for_completion(&complete);
110 if (ei.state != MTD_ERASE_DONE) 113 if (ei.state != MTD_ERASE_DONE)
111 return -EIO; 114 return -EIO;
112 return mtd_erase_mapping(sb, ofs, len); 115 return logfs_mtd_erase_mapping(sb, ofs, len);
113} 116}
114 117
115static void mtd_sync(struct super_block *sb) 118static void logfs_mtd_sync(struct super_block *sb)
116{ 119{
117 struct mtd_info *mtd = logfs_super(sb)->s_mtd; 120 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
118 121
119 if (mtd->sync) 122 mtd_sync(mtd);
120 mtd->sync(mtd);
121} 123}
122 124
123static int mtd_readpage(void *_sb, struct page *page) 125static int logfs_mtd_readpage(void *_sb, struct page *page)
124{ 126{
125 struct super_block *sb = _sb; 127 struct super_block *sb = _sb;
126 int err; 128 int err;
127 129
128 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, 130 err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
129 page_address(page)); 131 page_address(page));
130 if (err == -EUCLEAN || err == -EBADMSG) { 132 if (err == -EUCLEAN || err == -EBADMSG) {
131 /* -EBADMSG happens regularly on power failures */ 133 /* -EBADMSG happens regularly on power failures */
@@ -143,18 +145,15 @@ static int mtd_readpage(void *_sb, struct page *page)
143 return err; 145 return err;
144} 146}
145 147
146static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) 148static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
147{ 149{
148 struct logfs_super *super = logfs_super(sb); 150 struct logfs_super *super = logfs_super(sb);
149 struct address_space *mapping = super->s_mapping_inode->i_mapping; 151 struct address_space *mapping = super->s_mapping_inode->i_mapping;
150 filler_t *filler = mtd_readpage; 152 filler_t *filler = logfs_mtd_readpage;
151 struct mtd_info *mtd = super->s_mtd; 153 struct mtd_info *mtd = super->s_mtd;
152 154
153 if (!mtd->block_isbad)
154 return NULL;
155
156 *ofs = 0; 155 *ofs = 0;
157 while (mtd->block_isbad(mtd, *ofs)) { 156 while (mtd_block_isbad(mtd, *ofs)) {
158 *ofs += mtd->erasesize; 157 *ofs += mtd->erasesize;
159 if (*ofs >= mtd->size) 158 if (*ofs >= mtd->size)
160 return NULL; 159 return NULL;
@@ -163,18 +162,15 @@ static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
163 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); 162 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
164} 163}
165 164
166static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) 165static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
167{ 166{
168 struct logfs_super *super = logfs_super(sb); 167 struct logfs_super *super = logfs_super(sb);
169 struct address_space *mapping = super->s_mapping_inode->i_mapping; 168 struct address_space *mapping = super->s_mapping_inode->i_mapping;
170 filler_t *filler = mtd_readpage; 169 filler_t *filler = logfs_mtd_readpage;
171 struct mtd_info *mtd = super->s_mtd; 170 struct mtd_info *mtd = super->s_mtd;
172 171
173 if (!mtd->block_isbad)
174 return NULL;
175
176 *ofs = mtd->size - mtd->erasesize; 172 *ofs = mtd->size - mtd->erasesize;
177 while (mtd->block_isbad(mtd, *ofs)) { 173 while (mtd_block_isbad(mtd, *ofs)) {
178 *ofs -= mtd->erasesize; 174 *ofs -= mtd->erasesize;
179 if (*ofs <= 0) 175 if (*ofs <= 0)
180 return NULL; 176 return NULL;
@@ -184,7 +180,7 @@ static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
184 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); 180 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
185} 181}
186 182
187static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, 183static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
188 size_t nr_pages) 184 size_t nr_pages)
189{ 185{
190 struct logfs_super *super = logfs_super(sb); 186 struct logfs_super *super = logfs_super(sb);
@@ -196,8 +192,8 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
196 page = find_lock_page(mapping, index + i); 192 page = find_lock_page(mapping, index + i);
197 BUG_ON(!page); 193 BUG_ON(!page);
198 194
199 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, 195 err = logfs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
200 page_address(page)); 196 page_address(page));
201 unlock_page(page); 197 unlock_page(page);
202 page_cache_release(page); 198 page_cache_release(page);
203 if (err) 199 if (err)
@@ -206,7 +202,7 @@ static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
206 return 0; 202 return 0;
207} 203}
208 204
209static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) 205static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
210{ 206{
211 struct logfs_super *super = logfs_super(sb); 207 struct logfs_super *super = logfs_super(sb);
212 int head; 208 int head;
@@ -227,15 +223,15 @@ static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
227 len += head; 223 len += head;
228 } 224 }
229 len = PAGE_ALIGN(len); 225 len = PAGE_ALIGN(len);
230 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 226 __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
231} 227}
232 228
233static void mtd_put_device(struct logfs_super *s) 229static void logfs_mtd_put_device(struct logfs_super *s)
234{ 230{
235 put_mtd_device(s->s_mtd); 231 put_mtd_device(s->s_mtd);
236} 232}
237 233
238static int mtd_can_write_buf(struct super_block *sb, u64 ofs) 234static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs)
239{ 235{
240 struct logfs_super *super = logfs_super(sb); 236 struct logfs_super *super = logfs_super(sb);
241 void *buf; 237 void *buf;
@@ -244,7 +240,7 @@ static int mtd_can_write_buf(struct super_block *sb, u64 ofs)
244 buf = kmalloc(super->s_writesize, GFP_KERNEL); 240 buf = kmalloc(super->s_writesize, GFP_KERNEL);
245 if (!buf) 241 if (!buf)
246 return -ENOMEM; 242 return -ENOMEM;
247 err = mtd_read(sb, ofs, super->s_writesize, buf); 243 err = logfs_mtd_read(sb, ofs, super->s_writesize, buf);
248 if (err) 244 if (err)
249 goto out; 245 goto out;
250 if (memchr_inv(buf, 0xff, super->s_writesize)) 246 if (memchr_inv(buf, 0xff, super->s_writesize))
@@ -255,14 +251,14 @@ out:
255} 251}
256 252
257static const struct logfs_device_ops mtd_devops = { 253static const struct logfs_device_ops mtd_devops = {
258 .find_first_sb = mtd_find_first_sb, 254 .find_first_sb = logfs_mtd_find_first_sb,
259 .find_last_sb = mtd_find_last_sb, 255 .find_last_sb = logfs_mtd_find_last_sb,
260 .readpage = mtd_readpage, 256 .readpage = logfs_mtd_readpage,
261 .writeseg = mtd_writeseg, 257 .writeseg = logfs_mtd_writeseg,
262 .erase = mtd_erase, 258 .erase = logfs_mtd_erase,
263 .can_write_buf = mtd_can_write_buf, 259 .can_write_buf = logfs_mtd_can_write_buf,
264 .sync = mtd_sync, 260 .sync = logfs_mtd_sync,
265 .put_device = mtd_put_device, 261 .put_device = logfs_mtd_put_device,
266}; 262};
267 263
268int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) 264int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 501043e8966c..3de7a32cadbe 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -71,7 +71,7 @@ static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
71 71
72static int write_inode(struct inode *inode) 72static int write_inode(struct inode *inode)
73{ 73{
74 return __logfs_write_inode(inode, WF_LOCK); 74 return __logfs_write_inode(inode, NULL, WF_LOCK);
75} 75}
76 76
77static s64 dir_seek_data(struct inode *inode, s64 pos) 77static s64 dir_seek_data(struct inode *inode, s64 pos)
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index b548c87a86f1..3886cded283c 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -230,7 +230,9 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
230 return ret; 230 return ret;
231 231
232 mutex_lock(&inode->i_mutex); 232 mutex_lock(&inode->i_mutex);
233 logfs_get_wblocks(sb, NULL, WF_LOCK);
233 logfs_write_anchor(sb); 234 logfs_write_anchor(sb);
235 logfs_put_wblocks(sb, NULL, WF_LOCK);
234 mutex_unlock(&inode->i_mutex); 236 mutex_unlock(&inode->i_mutex);
235 237
236 return 0; 238 return 0;
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index caa4419285dc..d4efb061bdc5 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -367,7 +367,7 @@ static struct gc_candidate *get_candidate(struct super_block *sb)
367 int i, max_dist; 367 int i, max_dist;
368 struct gc_candidate *cand = NULL, *this; 368 struct gc_candidate *cand = NULL, *this;
369 369
370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS); 370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
371 371
372 for (i = max_dist; i >= 0; i--) { 372 for (i = max_dist; i >= 0; i--) {
373 this = first_in_list(&super->s_low_list[i]); 373 this = first_in_list(&super->s_low_list[i]);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 388df1aa35e5..a422f42238b2 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -286,7 +286,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
286 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) 286 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
287 return 0; 287 return 0;
288 288
289 ret = __logfs_write_inode(inode, flags); 289 ret = __logfs_write_inode(inode, NULL, flags);
290 LOGFS_BUG_ON(ret, inode->i_sb); 290 LOGFS_BUG_ON(ret, inode->i_sb);
291 return ret; 291 return ret;
292} 292}
@@ -363,7 +363,9 @@ static void logfs_init_once(void *_li)
363 363
364static int logfs_sync_fs(struct super_block *sb, int wait) 364static int logfs_sync_fs(struct super_block *sb, int wait)
365{ 365{
366 logfs_get_wblocks(sb, NULL, WF_LOCK);
366 logfs_write_anchor(sb); 367 logfs_write_anchor(sb);
368 logfs_put_wblocks(sb, NULL, WF_LOCK);
367 return 0; 369 return 0;
368} 370}
369 371
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 9da29706f91c..1e1c369df22b 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -612,7 +612,6 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
612 if (len == 0) 612 if (len == 0)
613 return logfs_write_header(super, header, 0, type); 613 return logfs_write_header(super, header, 0, type);
614 614
615 BUG_ON(len > sb->s_blocksize);
616 compr_len = logfs_compress(buf, data, len, sb->s_blocksize); 615 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
617 if (compr_len < 0 || type == JE_ANCHOR) { 616 if (compr_len < 0 || type == JE_ANCHOR) {
618 memcpy(data, buf, len); 617 memcpy(data, buf, len);
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 926373866a55..5f0937609465 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -528,7 +528,7 @@ void logfs_destroy_inode_cache(void);
528void logfs_set_blocks(struct inode *inode, u64 no); 528void logfs_set_blocks(struct inode *inode, u64 no);
529/* these logically belong into inode.c but actually reside in readwrite.c */ 529/* these logically belong into inode.c but actually reside in readwrite.c */
530int logfs_read_inode(struct inode *inode); 530int logfs_read_inode(struct inode *inode);
531int __logfs_write_inode(struct inode *inode, long flags); 531int __logfs_write_inode(struct inode *inode, struct page *, long flags);
532void logfs_evict_inode(struct inode *inode); 532void logfs_evict_inode(struct inode *inode);
533 533
534/* journal.c */ 534/* journal.c */
@@ -577,6 +577,8 @@ void initialize_block_counters(struct page *page, struct logfs_block *block,
577 __be64 *array, int page_is_empty); 577 __be64 *array, int page_is_empty);
578int logfs_exist_block(struct inode *inode, u64 bix); 578int logfs_exist_block(struct inode *inode, u64 bix);
579int get_page_reserve(struct inode *inode, struct page *page); 579int get_page_reserve(struct inode *inode, struct page *page);
580void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
581void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
580extern struct logfs_block_ops indirect_block_ops; 582extern struct logfs_block_ops indirect_block_ops;
581 583
582/* segment.c */ 584/* segment.c */
@@ -594,6 +596,7 @@ int logfs_init_mapping(struct super_block *sb);
594void logfs_sync_area(struct logfs_area *area); 596void logfs_sync_area(struct logfs_area *area);
595void logfs_sync_segments(struct super_block *sb); 597void logfs_sync_segments(struct super_block *sb);
596void freeseg(struct super_block *sb, u32 segno); 598void freeseg(struct super_block *sb, u32 segno);
599void free_areas(struct super_block *sb);
597 600
598/* area handling */ 601/* area handling */
599int logfs_init_areas(struct super_block *sb); 602int logfs_init_areas(struct super_block *sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 2ac4217b7901..4153e65b0148 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -244,8 +244,7 @@ static void preunlock_page(struct super_block *sb, struct page *page, int lock)
244 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked 244 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
245 * in addition to PG_locked. 245 * in addition to PG_locked.
246 */ 246 */
247static void logfs_get_wblocks(struct super_block *sb, struct page *page, 247void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
248 int lock)
249{ 248{
250 struct logfs_super *super = logfs_super(sb); 249 struct logfs_super *super = logfs_super(sb);
251 250
@@ -260,8 +259,7 @@ static void logfs_get_wblocks(struct super_block *sb, struct page *page,
260 } 259 }
261} 260}
262 261
263static void logfs_put_wblocks(struct super_block *sb, struct page *page, 262void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
264 int lock)
265{ 263{
266 struct logfs_super *super = logfs_super(sb); 264 struct logfs_super *super = logfs_super(sb);
267 265
@@ -424,7 +422,7 @@ static void inode_write_block(struct logfs_block *block)
424 if (inode->i_ino == LOGFS_INO_MASTER) 422 if (inode->i_ino == LOGFS_INO_MASTER)
425 logfs_write_anchor(inode->i_sb); 423 logfs_write_anchor(inode->i_sb);
426 else { 424 else {
427 ret = __logfs_write_inode(inode, 0); 425 ret = __logfs_write_inode(inode, NULL, 0);
428 /* see indirect_write_block comment */ 426 /* see indirect_write_block comment */
429 BUG_ON(ret); 427 BUG_ON(ret);
430 } 428 }
@@ -560,8 +558,13 @@ static void inode_free_block(struct super_block *sb, struct logfs_block *block)
560static void indirect_free_block(struct super_block *sb, 558static void indirect_free_block(struct super_block *sb,
561 struct logfs_block *block) 559 struct logfs_block *block)
562{ 560{
563 ClearPagePrivate(block->page); 561 struct page *page = block->page;
564 block->page->private = 0; 562
563 if (PagePrivate(page)) {
564 ClearPagePrivate(page);
565 page_cache_release(page);
566 set_page_private(page, 0);
567 }
565 __free_block(sb, block); 568 __free_block(sb, block);
566} 569}
567 570
@@ -650,8 +653,11 @@ static void alloc_data_block(struct inode *inode, struct page *page)
650 logfs_unpack_index(page->index, &bix, &level); 653 logfs_unpack_index(page->index, &bix, &level);
651 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); 654 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
652 block->page = page; 655 block->page = page;
656
653 SetPagePrivate(page); 657 SetPagePrivate(page);
654 page->private = (unsigned long)block; 658 page_cache_get(page);
659 set_page_private(page, (unsigned long) block);
660
655 block->ops = &indirect_block_ops; 661 block->ops = &indirect_block_ops;
656} 662}
657 663
@@ -1570,11 +1576,15 @@ int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1570static int __logfs_delete(struct inode *inode, struct page *page) 1576static int __logfs_delete(struct inode *inode, struct page *page)
1571{ 1577{
1572 long flags = WF_DELETE; 1578 long flags = WF_DELETE;
1579 int err;
1573 1580
1574 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 1581 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1575 1582
1576 if (page->index < I0_BLOCKS) 1583 if (page->index < I0_BLOCKS)
1577 return logfs_write_direct(inode, page, flags); 1584 return logfs_write_direct(inode, page, flags);
1585 err = grow_inode(inode, page->index, 0);
1586 if (err)
1587 return err;
1578 return logfs_write_rec(inode, page, page->index, 0, flags); 1588 return logfs_write_rec(inode, page, page->index, 0, flags);
1579} 1589}
1580 1590
@@ -1623,7 +1633,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1623 if (inode->i_ino == LOGFS_INO_MASTER) 1633 if (inode->i_ino == LOGFS_INO_MASTER)
1624 logfs_write_anchor(inode->i_sb); 1634 logfs_write_anchor(inode->i_sb);
1625 else { 1635 else {
1626 err = __logfs_write_inode(inode, flags); 1636 err = __logfs_write_inode(inode, page, flags);
1627 } 1637 }
1628 } 1638 }
1629 } 1639 }
@@ -1873,7 +1883,7 @@ int logfs_truncate(struct inode *inode, u64 target)
1873 logfs_get_wblocks(sb, NULL, 1); 1883 logfs_get_wblocks(sb, NULL, 1);
1874 err = __logfs_truncate(inode, size); 1884 err = __logfs_truncate(inode, size);
1875 if (!err) 1885 if (!err)
1876 err = __logfs_write_inode(inode, 0); 1886 err = __logfs_write_inode(inode, NULL, 0);
1877 logfs_put_wblocks(sb, NULL, 1); 1887 logfs_put_wblocks(sb, NULL, 1);
1878 } 1888 }
1879 1889
@@ -1901,8 +1911,11 @@ static void move_page_to_inode(struct inode *inode, struct page *page)
1901 li->li_block = block; 1911 li->li_block = block;
1902 1912
1903 block->page = NULL; 1913 block->page = NULL;
1904 page->private = 0; 1914 if (PagePrivate(page)) {
1905 ClearPagePrivate(page); 1915 ClearPagePrivate(page);
1916 page_cache_release(page);
1917 set_page_private(page, 0);
1918 }
1906} 1919}
1907 1920
1908static void move_inode_to_page(struct page *page, struct inode *inode) 1921static void move_inode_to_page(struct page *page, struct inode *inode)
@@ -1918,8 +1931,12 @@ static void move_inode_to_page(struct page *page, struct inode *inode)
1918 BUG_ON(PagePrivate(page)); 1931 BUG_ON(PagePrivate(page));
1919 block->ops = &indirect_block_ops; 1932 block->ops = &indirect_block_ops;
1920 block->page = page; 1933 block->page = page;
1921 page->private = (unsigned long)block; 1934
1922 SetPagePrivate(page); 1935 if (!PagePrivate(page)) {
1936 SetPagePrivate(page);
1937 page_cache_get(page);
1938 set_page_private(page, (unsigned long) block);
1939 }
1923 1940
1924 block->inode = NULL; 1941 block->inode = NULL;
1925 li->li_block = NULL; 1942 li->li_block = NULL;
@@ -2106,14 +2123,14 @@ void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2106 ec_level); 2123 ec_level);
2107} 2124}
2108 2125
2109int __logfs_write_inode(struct inode *inode, long flags) 2126int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
2110{ 2127{
2111 struct super_block *sb = inode->i_sb; 2128 struct super_block *sb = inode->i_sb;
2112 int ret; 2129 int ret;
2113 2130
2114 logfs_get_wblocks(sb, NULL, flags & WF_LOCK); 2131 logfs_get_wblocks(sb, page, flags & WF_LOCK);
2115 ret = do_write_inode(inode); 2132 ret = do_write_inode(inode);
2116 logfs_put_wblocks(sb, NULL, flags & WF_LOCK); 2133 logfs_put_wblocks(sb, page, flags & WF_LOCK);
2117 return ret; 2134 return ret;
2118} 2135}
2119 2136
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 9d5187353255..ab798ed1cc88 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -86,7 +86,11 @@ int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
86 BUG_ON(!page); /* FIXME: reserve a pool */ 86 BUG_ON(!page); /* FIXME: reserve a pool */
87 SetPageUptodate(page); 87 SetPageUptodate(page);
88 memcpy(page_address(page) + offset, buf, copylen); 88 memcpy(page_address(page) + offset, buf, copylen);
89 SetPagePrivate(page); 89
90 if (!PagePrivate(page)) {
91 SetPagePrivate(page);
92 page_cache_get(page);
93 }
90 page_cache_release(page); 94 page_cache_release(page);
91 95
92 buf += copylen; 96 buf += copylen;
@@ -110,7 +114,10 @@ static void pad_partial_page(struct logfs_area *area)
110 page = get_mapping_page(sb, index, 0); 114 page = get_mapping_page(sb, index, 0);
111 BUG_ON(!page); /* FIXME: reserve a pool */ 115 BUG_ON(!page); /* FIXME: reserve a pool */
112 memset(page_address(page) + offset, 0xff, len); 116 memset(page_address(page) + offset, 0xff, len);
113 SetPagePrivate(page); 117 if (!PagePrivate(page)) {
118 SetPagePrivate(page);
119 page_cache_get(page);
120 }
114 page_cache_release(page); 121 page_cache_release(page);
115 } 122 }
116} 123}
@@ -130,7 +137,10 @@ static void pad_full_pages(struct logfs_area *area)
130 BUG_ON(!page); /* FIXME: reserve a pool */ 137 BUG_ON(!page); /* FIXME: reserve a pool */
131 SetPageUptodate(page); 138 SetPageUptodate(page);
132 memset(page_address(page), 0xff, PAGE_CACHE_SIZE); 139 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
133 SetPagePrivate(page); 140 if (!PagePrivate(page)) {
141 SetPagePrivate(page);
142 page_cache_get(page);
143 }
134 page_cache_release(page); 144 page_cache_release(page);
135 index++; 145 index++;
136 no_indizes--; 146 no_indizes--;
@@ -485,8 +495,12 @@ static void move_btree_to_page(struct inode *inode, struct page *page,
485 mempool_free(item, super->s_alias_pool); 495 mempool_free(item, super->s_alias_pool);
486 } 496 }
487 block->page = page; 497 block->page = page;
488 SetPagePrivate(page); 498
489 page->private = (unsigned long)block; 499 if (!PagePrivate(page)) {
500 SetPagePrivate(page);
501 page_cache_get(page);
502 set_page_private(page, (unsigned long) block);
503 }
490 block->ops = &indirect_block_ops; 504 block->ops = &indirect_block_ops;
491 initialize_block_counters(page, block, data, 0); 505 initialize_block_counters(page, block, data, 0);
492} 506}
@@ -536,8 +550,12 @@ void move_page_to_btree(struct page *page)
536 list_add(&item->list, &block->item_list); 550 list_add(&item->list, &block->item_list);
537 } 551 }
538 block->page = NULL; 552 block->page = NULL;
539 ClearPagePrivate(page); 553
540 page->private = 0; 554 if (PagePrivate(page)) {
555 ClearPagePrivate(page);
556 page_cache_release(page);
557 set_page_private(page, 0);
558 }
541 block->ops = &btree_block_ops; 559 block->ops = &btree_block_ops;
542 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, 560 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
543 block); 561 block);
@@ -702,7 +720,10 @@ void freeseg(struct super_block *sb, u32 segno)
702 page = find_get_page(mapping, ofs >> PAGE_SHIFT); 720 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
703 if (!page) 721 if (!page)
704 continue; 722 continue;
705 ClearPagePrivate(page); 723 if (PagePrivate(page)) {
724 ClearPagePrivate(page);
725 page_cache_release(page);
726 }
706 page_cache_release(page); 727 page_cache_release(page);
707 } 728 }
708} 729}
@@ -841,6 +862,16 @@ static void free_area(struct logfs_area *area)
841 kfree(area); 862 kfree(area);
842} 863}
843 864
865void free_areas(struct super_block *sb)
866{
867 struct logfs_super *super = logfs_super(sb);
868 int i;
869
870 for_each_area(i)
871 free_area(super->s_area[i]);
872 free_area(super->s_journal_area);
873}
874
844static struct logfs_area *alloc_area(struct super_block *sb) 875static struct logfs_area *alloc_area(struct super_block *sb)
845{ 876{
846 struct logfs_area *area; 877 struct logfs_area *area;
@@ -923,10 +954,6 @@ err:
923void logfs_cleanup_areas(struct super_block *sb) 954void logfs_cleanup_areas(struct super_block *sb)
924{ 955{
925 struct logfs_super *super = logfs_super(sb); 956 struct logfs_super *super = logfs_super(sb);
926 int i;
927 957
928 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); 958 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
929 for_each_area(i)
930 free_area(super->s_area[i]);
931 free_area(super->s_journal_area);
932} 959}
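The logfs hunks above repeat one idiom in several places: taking PG_private must be paired with an extra page reference, and dropping it must release that reference. A hypothetical helper pair capturing the idiom, in the spirit of attach_page_buffers() (names and helpers are not part of this commit; sketch only):

	static void logfs_attach_block(struct page *page, struct logfs_block *block)
	{
		if (!PagePrivate(page)) {
			SetPagePrivate(page);
			page_cache_get(page);	/* reference owned by PG_private */
			set_page_private(page, (unsigned long)block);
		}
	}

	static void logfs_detach_block(struct page *page)
	{
		if (PagePrivate(page)) {
			ClearPagePrivate(page);
			page_cache_release(page);	/* drop the PG_private reference */
			set_page_private(page, 0);
		}
	}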
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index e795c234ea33..c9ee7f5d1caf 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -486,14 +486,15 @@ static void logfs_kill_sb(struct super_block *sb)
486 /* Alias entries slow down mount, so evict as many as possible */ 486 /* Alias entries slow down mount, so evict as many as possible */
487 sync_filesystem(sb); 487 sync_filesystem(sb);
488 logfs_write_anchor(sb); 488 logfs_write_anchor(sb);
489 free_areas(sb);
489 490
490 /* 491 /*
491 * From this point on alias entries are simply dropped - and any 492 * From this point on alias entries are simply dropped - and any
492 * writes to the object store are considered bugs. 493 * writes to the object store are considered bugs.
493 */ 494 */
494 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
495 log_super("LogFS: Now in shutdown\n"); 495 log_super("LogFS: Now in shutdown\n");
496 generic_shutdown_super(sb); 496 generic_shutdown_super(sb);
497 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
497 498
498 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); 499 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
499 500
diff --git a/fs/mpage.c b/fs/mpage.c
index fdfae9fa98cd..643e9f55ef29 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -371,9 +371,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
371 sector_t last_block_in_bio = 0; 371 sector_t last_block_in_bio = 0;
372 struct buffer_head map_bh; 372 struct buffer_head map_bh;
373 unsigned long first_logical_block = 0; 373 unsigned long first_logical_block = 0;
374 struct blk_plug plug;
375
376 blk_start_plug(&plug);
377 374
378 map_bh.b_state = 0; 375 map_bh.b_state = 0;
379 map_bh.b_size = 0; 376 map_bh.b_size = 0;
@@ -395,7 +392,6 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
395 BUG_ON(!list_empty(pages)); 392 BUG_ON(!list_empty(pages));
396 if (bio) 393 if (bio)
397 mpage_bio_submit(READ, bio); 394 mpage_bio_submit(READ, bio);
398 blk_finish_plug(&plug);
399 return 0; 395 return 0;
400} 396}
401EXPORT_SYMBOL(mpage_readpages); 397EXPORT_SYMBOL(mpage_readpages);
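Dropping the plug here assumes the readahead caller already brackets the whole batch. A sketch of the expected caller shape, modelled on mm/readahead.c of this era (simplified; the real caller also handles the per-page ->readpage fallback):

	static int read_pages(struct address_space *mapping, struct file *filp,
			      struct list_head *pages, unsigned nr_pages)
	{
		struct blk_plug plug;
		int ret = 0;

		blk_start_plug(&plug);
		if (mapping->a_ops->readpages)
			ret = mapping->a_ops->readpages(filp, mapping,
							pages, nr_pages);
		blk_finish_plug(&plug);	/* one flush for the whole batch */
		return ret;
	}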
diff --git a/fs/namei.c b/fs/namei.c
index c283a1ec008e..208c6aa4a989 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -140,21 +140,19 @@ static int do_getname(const char __user *filename, char *page)
140 140
141static char *getname_flags(const char __user *filename, int flags, int *empty) 141static char *getname_flags(const char __user *filename, int flags, int *empty)
142{ 142{
143 char *tmp, *result; 143 char *result = __getname();
144 144 int retval;
145 result = ERR_PTR(-ENOMEM); 145
146 tmp = __getname(); 146 if (!result)
147 if (tmp) { 147 return ERR_PTR(-ENOMEM);
148 int retval = do_getname(filename, tmp); 148
149 149 retval = do_getname(filename, result);
150 result = tmp; 150 if (retval < 0) {
151 if (retval < 0) { 151 if (retval == -ENOENT && empty)
152 if (retval == -ENOENT && empty) 152 *empty = 1;
153 *empty = 1; 153 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
154 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) { 154 __putname(result);
155 __putname(tmp); 155 return ERR_PTR(retval);
156 result = ERR_PTR(retval);
157 }
158 } 156 }
159 } 157 }
160 audit_getname(result); 158 audit_getname(result);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 281ae95932c9..48cfac31f64c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -90,9 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
90 */ 90 */
91struct parallel_io { 91struct parallel_io {
92 struct kref refcnt; 92 struct kref refcnt;
93 struct rpc_call_ops call_ops; 93 void (*pnfs_callback) (void *data, int num_se);
94 void (*pnfs_callback) (void *data);
95 void *data; 94 void *data;
95 int bse_count;
96}; 96};
97 97
98static inline struct parallel_io *alloc_parallel(void *data) 98static inline struct parallel_io *alloc_parallel(void *data)
@@ -103,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
103 if (rv) { 103 if (rv) {
104 rv->data = data; 104 rv->data = data;
105 kref_init(&rv->refcnt); 105 kref_init(&rv->refcnt);
106 rv->bse_count = 0;
106 } 107 }
107 return rv; 108 return rv;
108} 109}
@@ -117,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
117 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 118 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
118 119
119 dprintk("%s enter\n", __func__); 120 dprintk("%s enter\n", __func__);
120 p->pnfs_callback(p->data); 121 p->pnfs_callback(p->data, p->bse_count);
121 kfree(p); 122 kfree(p);
122} 123}
123 124
@@ -146,14 +147,19 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
146{ 147{
147 struct bio *bio; 148 struct bio *bio;
148 149
150 npg = min(npg, BIO_MAX_PAGES);
149 bio = bio_alloc(GFP_NOIO, npg); 151 bio = bio_alloc(GFP_NOIO, npg);
150 if (!bio) 152 if (!bio && (current->flags & PF_MEMALLOC)) {
151 return NULL; 153 while (!bio && (npg /= 2))
154 bio = bio_alloc(GFP_NOIO, npg);
155 }
152 156
153 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset; 157 if (bio) {
154 bio->bi_bdev = be->be_mdev; 158 bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
155 bio->bi_end_io = end_io; 159 bio->bi_bdev = be->be_mdev;
156 bio->bi_private = par; 160 bio->bi_end_io = end_io;
161 bio->bi_private = par;
162 }
157 return bio; 163 return bio;
158} 164}
159 165
@@ -212,22 +218,15 @@ static void bl_read_cleanup(struct work_struct *work)
212} 218}
213 219
214static void 220static void
215bl_end_par_io_read(void *data) 221bl_end_par_io_read(void *data, int unused)
216{ 222{
217 struct nfs_read_data *rdata = data; 223 struct nfs_read_data *rdata = data;
218 224
225 rdata->task.tk_status = rdata->pnfs_error;
219 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); 226 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
220 schedule_work(&rdata->task.u.tk_work); 227 schedule_work(&rdata->task.u.tk_work);
221} 228}
222 229
223/* We don't want normal .rpc_call_done callback used, so we replace it
224 * with this stub.
225 */
226static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
227{
228 return;
229}
230
231static enum pnfs_try_status 230static enum pnfs_try_status
232bl_read_pagelist(struct nfs_read_data *rdata) 231bl_read_pagelist(struct nfs_read_data *rdata)
233{ 232{
@@ -247,8 +246,6 @@ bl_read_pagelist(struct nfs_read_data *rdata)
247 par = alloc_parallel(rdata); 246 par = alloc_parallel(rdata);
248 if (!par) 247 if (!par)
249 goto use_mds; 248 goto use_mds;
250 par->call_ops = *rdata->mds_ops;
251 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
252 par->pnfs_callback = bl_end_par_io_read; 249 par->pnfs_callback = bl_end_par_io_read;
253 /* At this point, we can no longer jump to use_mds */ 250 /* At this point, we can no longer jump to use_mds */
254 251
@@ -322,6 +319,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
322{ 319{
323 sector_t isect, end; 320 sector_t isect, end;
324 struct pnfs_block_extent *be; 321 struct pnfs_block_extent *be;
322 struct pnfs_block_short_extent *se;
325 323
326 dprintk("%s(%llu, %u)\n", __func__, offset, count); 324 dprintk("%s(%llu, %u)\n", __func__, offset, count);
327 if (count == 0) 325 if (count == 0)
@@ -334,8 +332,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
334 be = bl_find_get_extent(bl, isect, NULL); 332 be = bl_find_get_extent(bl, isect, NULL);
335 BUG_ON(!be); /* FIXME */ 333 BUG_ON(!be); /* FIXME */
336 len = min(end, be->be_f_offset + be->be_length) - isect; 334 len = min(end, be->be_f_offset + be->be_length) - isect;
337 if (be->be_state == PNFS_BLOCK_INVALID_DATA) 335 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
338 bl_mark_for_commit(be, isect, len); /* What if fails? */ 336 se = bl_pop_one_short_extent(be->be_inval);
337 BUG_ON(!se);
338 bl_mark_for_commit(be, isect, len, se);
339 }
339 isect += len; 340 isect += len;
340 bl_put_extent(be); 341 bl_put_extent(be);
341 } 342 }
@@ -357,7 +358,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
357 end_page_writeback(page); 358 end_page_writeback(page);
358 page_cache_release(page); 359 page_cache_release(page);
359 } while (bvec >= bio->bi_io_vec); 360 } while (bvec >= bio->bi_io_vec);
360 if (!uptodate) { 361
362 if (unlikely(!uptodate)) {
361 if (!wdata->pnfs_error) 363 if (!wdata->pnfs_error)
362 wdata->pnfs_error = -EIO; 364 wdata->pnfs_error = -EIO;
363 pnfs_set_lo_fail(wdata->lseg); 365 pnfs_set_lo_fail(wdata->lseg);
@@ -366,7 +368,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
366 put_parallel(par); 368 put_parallel(par);
367} 369}
368 370
369/* This is basically copied from mpage_end_io_read */
370static void bl_end_io_write(struct bio *bio, int err) 371static void bl_end_io_write(struct bio *bio, int err)
371{ 372{
372 struct parallel_io *par = bio->bi_private; 373 struct parallel_io *par = bio->bi_private;
@@ -392,7 +393,7 @@ static void bl_write_cleanup(struct work_struct *work)
392 dprintk("%s enter\n", __func__); 393 dprintk("%s enter\n", __func__);
393 task = container_of(work, struct rpc_task, u.tk_work); 394 task = container_of(work, struct rpc_task, u.tk_work);
394 wdata = container_of(task, struct nfs_write_data, task); 395 wdata = container_of(task, struct nfs_write_data, task);
395 if (!wdata->pnfs_error) { 396 if (likely(!wdata->pnfs_error)) {
396 /* Marks for LAYOUTCOMMIT */ 397 /* Marks for LAYOUTCOMMIT */
397 mark_extents_written(BLK_LSEG2EXT(wdata->lseg), 398 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
398 wdata->args.offset, wdata->args.count); 399 wdata->args.offset, wdata->args.count);
@@ -401,11 +402,16 @@ static void bl_write_cleanup(struct work_struct *work)
401} 402}
402 403
403/* Called when last of bios associated with a bl_write_pagelist call finishes */ 404/* Called when last of bios associated with a bl_write_pagelist call finishes */
404static void bl_end_par_io_write(void *data) 405static void bl_end_par_io_write(void *data, int num_se)
405{ 406{
406 struct nfs_write_data *wdata = data; 407 struct nfs_write_data *wdata = data;
407 408
408 wdata->task.tk_status = 0; 409 if (unlikely(wdata->pnfs_error)) {
410 bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
411 num_se);
412 }
413
414 wdata->task.tk_status = wdata->pnfs_error;
409 wdata->verf.committed = NFS_FILE_SYNC; 415 wdata->verf.committed = NFS_FILE_SYNC;
410 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup); 416 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
411 schedule_work(&wdata->task.u.tk_work); 417 schedule_work(&wdata->task.u.tk_work);
@@ -484,6 +490,55 @@ cleanup:
484 return ret; 490 return ret;
485} 491}
486 492
493/* Find or create a zeroing page and mark it as under writeback.
494 * Returns ERR_PTR on error, NULL to skip this page, or the page
495 * itself to indicate it should be written out.
496 */
497static struct page *
498bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
499 struct pnfs_block_extent *cow_read)
500{
501 struct page *page;
502 int locked = 0;
503 page = find_get_page(inode->i_mapping, index);
504 if (page)
505 goto check_page;
506
507 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
508 if (unlikely(!page)) {
509 dprintk("%s oom\n", __func__);
510 return ERR_PTR(-ENOMEM);
511 }
512 locked = 1;
513
514check_page:
515 /* PageDirty: Other will write this out
516 * PageWriteback: Other is writing this out
517 * PageUptodate: It was read before
518 */
519 if (PageDirty(page) || PageWriteback(page)) {
520 print_page(page);
521 if (locked)
522 unlock_page(page);
523 page_cache_release(page);
524 return NULL;
525 }
526
527 if (!locked) {
528 lock_page(page);
529 locked = 1;
530 goto check_page;
531 }
532 if (!PageUptodate(page)) {
533 /* New page, readin or zero it */
534 init_page_for_write(page, cow_read);
535 }
536 set_page_writeback(page);
537 unlock_page(page);
538
539 return page;
540}
541
487static enum pnfs_try_status 542static enum pnfs_try_status
488bl_write_pagelist(struct nfs_write_data *wdata, int sync) 543bl_write_pagelist(struct nfs_write_data *wdata, int sync)
489{ 544{
@@ -508,9 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
508 */ 563 */
509 par = alloc_parallel(wdata); 564 par = alloc_parallel(wdata);
510 if (!par) 565 if (!par)
511 return PNFS_NOT_ATTEMPTED; 566 goto out_mds;
512 par->call_ops = *wdata->mds_ops;
513 par->call_ops.rpc_call_done = bl_rpc_do_nothing;
514 par->pnfs_callback = bl_end_par_io_write; 567 par->pnfs_callback = bl_end_par_io_write;
515 /* At this point, have to be more careful with error handling */ 568 /* At this point, have to be more careful with error handling */
516 569
@@ -518,12 +571,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
518 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read); 571 be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
519 if (!be || !is_writable(be, isect)) { 572 if (!be || !is_writable(be, isect)) {
520 dprintk("%s no matching extents!\n", __func__); 573 dprintk("%s no matching extents!\n", __func__);
521 wdata->pnfs_error = -EINVAL; 574 goto out_mds;
522 goto out;
523 } 575 }
524 576
525 /* First page inside INVALID extent */ 577 /* First page inside INVALID extent */
526 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 578 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
579 if (likely(!bl_push_one_short_extent(be->be_inval)))
580 par->bse_count++;
581 else
582 goto out_mds;
527 temp = offset >> PAGE_CACHE_SHIFT; 583 temp = offset >> PAGE_CACHE_SHIFT;
528 npg_zero = do_div(temp, npg_per_block); 584 npg_zero = do_div(temp, npg_per_block);
529 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & 585 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
@@ -543,36 +599,16 @@ fill_invalid_ext:
543 dprintk("%s zero %dth page: index %lu isect %llu\n", 599 dprintk("%s zero %dth page: index %lu isect %llu\n",
544 __func__, npg_zero, index, 600 __func__, npg_zero, index,
545 (unsigned long long)isect); 601 (unsigned long long)isect);
546 page = 602 page = bl_find_get_zeroing_page(wdata->inode, index,
547 find_or_create_page(wdata->inode->i_mapping, index, 603 cow_read);
548 GFP_NOFS); 604 if (unlikely(IS_ERR(page))) {
549 if (!page) { 605 wdata->pnfs_error = PTR_ERR(page);
550 dprintk("%s oom\n", __func__);
551 wdata->pnfs_error = -ENOMEM;
552 goto out; 606 goto out;
553 } 607 } else if (page == NULL)
554
555 /* PageDirty: Other will write this out
556 * PageWriteback: Other is writing this out
557 * PageUptodate: It was read before
558 * sector_initialized: already written out
559 */
560 if (PageDirty(page) || PageWriteback(page)) {
561 print_page(page);
562 unlock_page(page);
563 page_cache_release(page);
564 goto next_page; 608 goto next_page;
565 }
566 if (!PageUptodate(page)) {
567 /* New page, readin or zero it */
568 init_page_for_write(page, cow_read);
569 }
570 set_page_writeback(page);
571 unlock_page(page);
572 609
573 ret = bl_mark_sectors_init(be->be_inval, isect, 610 ret = bl_mark_sectors_init(be->be_inval, isect,
574 PAGE_CACHE_SECTORS, 611 PAGE_CACHE_SECTORS);
575 NULL);
576 if (unlikely(ret)) { 612 if (unlikely(ret)) {
577 dprintk("%s bl_mark_sectors_init fail %d\n", 613 dprintk("%s bl_mark_sectors_init fail %d\n",
578 __func__, ret); 614 __func__, ret);
@@ -581,6 +617,19 @@ fill_invalid_ext:
581 wdata->pnfs_error = ret; 617 wdata->pnfs_error = ret;
582 goto out; 618 goto out;
583 } 619 }
620 if (likely(!bl_push_one_short_extent(be->be_inval)))
621 par->bse_count++;
622 else {
623 end_page_writeback(page);
624 page_cache_release(page);
625 wdata->pnfs_error = -ENOMEM;
626 goto out;
627 }
628 /* FIXME: This should be done in bi_end_io */
629 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
630 page->index << PAGE_CACHE_SHIFT,
631 PAGE_CACHE_SIZE);
632
584 bio = bl_add_page_to_bio(bio, npg_zero, WRITE, 633 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
585 isect, page, be, 634 isect, page, be,
586 bl_end_io_write_zero, par); 635 bl_end_io_write_zero, par);
@@ -589,10 +638,6 @@ fill_invalid_ext:
589 bio = NULL; 638 bio = NULL;
590 goto out; 639 goto out;
591 } 640 }
592 /* FIXME: This should be done in bi_end_io */
593 mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
594 page->index << PAGE_CACHE_SHIFT,
595 PAGE_CACHE_SIZE);
596next_page: 641next_page:
597 isect += PAGE_CACHE_SECTORS; 642 isect += PAGE_CACHE_SECTORS;
598 extent_length -= PAGE_CACHE_SECTORS; 643 extent_length -= PAGE_CACHE_SECTORS;
@@ -616,13 +661,21 @@ next_page:
616 wdata->pnfs_error = -EINVAL; 661 wdata->pnfs_error = -EINVAL;
617 goto out; 662 goto out;
618 } 663 }
664 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
665 if (likely(!bl_push_one_short_extent(
666 be->be_inval)))
667 par->bse_count++;
668 else {
669 wdata->pnfs_error = -ENOMEM;
670 goto out;
671 }
672 }
619 extent_length = be->be_length - 673 extent_length = be->be_length -
620 (isect - be->be_f_offset); 674 (isect - be->be_f_offset);
621 } 675 }
622 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 676 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
623 ret = bl_mark_sectors_init(be->be_inval, isect, 677 ret = bl_mark_sectors_init(be->be_inval, isect,
624 PAGE_CACHE_SECTORS, 678 PAGE_CACHE_SECTORS);
625 NULL);
626 if (unlikely(ret)) { 679 if (unlikely(ret)) {
627 dprintk("%s bl_mark_sectors_init fail %d\n", 680 dprintk("%s bl_mark_sectors_init fail %d\n",
628 __func__, ret); 681 __func__, ret);
@@ -664,6 +717,10 @@ out:
664 bl_submit_bio(WRITE, bio); 717 bl_submit_bio(WRITE, bio);
665 put_parallel(par); 718 put_parallel(par);
666 return PNFS_ATTEMPTED; 719 return PNFS_ATTEMPTED;
720out_mds:
721 bl_put_extent(be);
722 kfree(par);
723 return PNFS_NOT_ATTEMPTED;
667} 724}
668 725
669/* FIXME - range ignored */ 726/* FIXME - range ignored */
@@ -690,11 +747,17 @@ static void
690release_inval_marks(struct pnfs_inval_markings *marks) 747release_inval_marks(struct pnfs_inval_markings *marks)
691{ 748{
692 struct pnfs_inval_tracking *pos, *temp; 749 struct pnfs_inval_tracking *pos, *temp;
750 struct pnfs_block_short_extent *se, *stemp;
693 751
694 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { 752 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
695 list_del(&pos->it_link); 753 list_del(&pos->it_link);
696 kfree(pos); 754 kfree(pos);
697 } 755 }
756
757 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
758 list_del(&se->bse_node);
759 kfree(se);
760 }
698 return; 761 return;
699} 762}
700 763
@@ -779,16 +842,13 @@ bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
779static void free_blk_mountid(struct block_mount_id *mid) 842static void free_blk_mountid(struct block_mount_id *mid)
780{ 843{
781 if (mid) { 844 if (mid) {
782 struct pnfs_block_dev *dev; 845 struct pnfs_block_dev *dev, *tmp;
783 spin_lock(&mid->bm_lock); 846
784 while (!list_empty(&mid->bm_devlist)) { 847 /* No need to take bm_lock as we are last user freeing bm_devlist */
785 dev = list_first_entry(&mid->bm_devlist, 848 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
786 struct pnfs_block_dev,
787 bm_node);
788 list_del(&dev->bm_node); 849 list_del(&dev->bm_node);
789 bl_free_block_dev(dev); 850 bl_free_block_dev(dev);
790 } 851 }
791 spin_unlock(&mid->bm_lock);
792 kfree(mid); 852 kfree(mid);
793 } 853 }
794} 854}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 42acf7ef5992..e31a2df28e70 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -70,6 +70,7 @@ struct pnfs_inval_markings {
70 spinlock_t im_lock; 70 spinlock_t im_lock;
71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 71 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */
72 sector_t im_block_size; /* Server blocksize in sectors */ 72 sector_t im_block_size; /* Server blocksize in sectors */
73 struct list_head im_extents; /* Short extents for INVAL->RW conversion */
73}; 74};
74 75
75struct pnfs_inval_tracking { 76struct pnfs_inval_tracking {
@@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
105{ 106{
106 spin_lock_init(&marks->im_lock); 107 spin_lock_init(&marks->im_lock);
107 INIT_LIST_HEAD(&marks->im_tree.mtt_stub); 108 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
109 INIT_LIST_HEAD(&marks->im_extents);
108 marks->im_block_size = blocksize; 110 marks->im_block_size = blocksize;
109 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, 111 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
110 blocksize); 112 blocksize);
@@ -186,8 +188,7 @@ struct pnfs_block_extent *
186bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
187 struct pnfs_block_extent **cow_read); 189 struct pnfs_block_extent **cow_read);
188int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 190int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
189 sector_t offset, sector_t length, 191 sector_t offset, sector_t length);
190 sector_t **pages);
191void bl_put_extent(struct pnfs_block_extent *be); 192void bl_put_extent(struct pnfs_block_extent *be);
192struct pnfs_block_extent *bl_alloc_extent(void); 193struct pnfs_block_extent *bl_alloc_extent(void);
193int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 194int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect);
@@ -200,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
200int bl_add_merge_extent(struct pnfs_block_layout *bl, 201int bl_add_merge_extent(struct pnfs_block_layout *bl,
201 struct pnfs_block_extent *new); 202 struct pnfs_block_extent *new);
202int bl_mark_for_commit(struct pnfs_block_extent *be, 203int bl_mark_for_commit(struct pnfs_block_extent *be,
203 sector_t offset, sector_t length); 204 sector_t offset, sector_t length,
205 struct pnfs_block_short_extent *new);
206int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
207struct pnfs_block_short_extent *
208bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
209void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
204 210
205#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 211#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
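The new bl_push_one_short_extent()/bl_pop_one_short_extent() prototypes preallocate pnfs_block_short_extent entries on marks->im_extents while allocation can still fail safely, so mark_extents_written() can later pop one without allocating under the bh-safe spinlock. A sketch of plausible implementations (bodies are assumptions; only the prototypes come from this diff):

	int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
	{
		struct pnfs_block_short_extent *new;

		new = kmalloc(sizeof(*new), GFP_NOFS);
		if (unlikely(!new))
			return -ENOMEM;

		spin_lock_bh(&marks->im_lock);
		list_add(&new->bse_node, &marks->im_extents);
		spin_unlock_bh(&marks->im_lock);
		return 0;
	}

	struct pnfs_block_short_extent *
	bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
	{
		struct pnfs_block_short_extent *rv = NULL;

		spin_lock_bh(&marks->im_lock);
		if (!list_empty(&marks->im_extents)) {
			rv = list_first_entry(&marks->im_extents,
					      struct pnfs_block_short_extent,
					      bse_node);
			list_del_init(&rv->bse_node);
		}
		spin_unlock_bh(&marks->im_lock);
		return rv;
	}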
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 19fa7b0b8c00..1abac09f7cd5 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -110,13 +110,7 @@ static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
110 return 0; 110 return 0;
111 } else { 111 } else {
112 struct pnfs_inval_tracking *new; 112 struct pnfs_inval_tracking *new;
113 if (storage) 113 new = storage;
114 new = storage;
115 else {
116 new = kmalloc(sizeof(*new), GFP_NOFS);
117 if (!new)
118 return -ENOMEM;
119 }
120 new->it_sector = s; 114 new->it_sector = s;
121 new->it_tags = (1 << tag); 115 new->it_tags = (1 << tag);
122 list_add(&new->it_link, &pos->it_link); 116 list_add(&new->it_link, &pos->it_link);
@@ -139,11 +133,13 @@ static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
139} 133}
140 134
141/* Ensure that future operations on given range of tree will not malloc */ 135/* Ensure that future operations on given range of tree will not malloc */
142static int _preload_range(struct my_tree *tree, u64 offset, u64 length) 136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
143{ 138{
144 u64 start, end, s; 139 u64 start, end, s;
145 int count, i, used = 0, status = -ENOMEM; 140 int count, i, used = 0, status = -ENOMEM;
146 struct pnfs_inval_tracking **storage; 141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
147 143
148 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); 144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
149 start = normalize(offset, tree->mtt_step_size); 145 start = normalize(offset, tree->mtt_step_size);
@@ -161,12 +157,11 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
161 goto out_cleanup; 157 goto out_cleanup;
162 } 158 }
163 159
164 /* Now need lock - HOW??? */ 160 spin_lock_bh(&marks->im_lock);
165
166 for (s = start; s < end; s += tree->mtt_step_size) 161 for (s = start; s < end; s += tree->mtt_step_size)
167 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); 162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
168 164
169 /* Unlock - HOW??? */
170 status = 0; 165 status = 0;
171 166
172 out_cleanup: 167 out_cleanup:
@@ -179,41 +174,14 @@ static int _preload_range(struct my_tree *tree, u64 offset, u64 length)
179 return status; 174 return status;
180} 175}
181 176
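The rework above turns _preload_range() into the classic allocate-then-lock pattern: every tracking node the range could possibly need is kmalloc'd with GFP_NOFS up front, so the insertion pass under spin_lock_bh() can neither sleep nor fail. A minimal user-space sketch of the same idea, with a pthread mutex standing in for im_lock (all names below are illustrative, not part of the patch):

#include <pthread.h>
#include <stdlib.h>

struct node { long sector; struct node *next; };

static struct node *tree_head;          /* stand-in for the interval tree */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns 1 if the preallocated entry was consumed, 0 if one already existed. */
static int add_entry_locked(struct node *new, long sector)
{
	struct node *p;

	for (p = tree_head; p; p = p->next)
		if (p->sector == sector)
			return 0;       /* already present, storage stays unused */
	new->sector = sector;
	new->next = tree_head;
	tree_head = new;
	return 1;
}

static int preload_range(long start, long end, long step)
{
	long s;
	int count = (end - start) / step, i, used = 0, status = -1;
	struct node **storage;

	/* Allocate outside the lock; insertion below can then never fail. */
	storage = calloc(count, sizeof(*storage));
	if (!storage)
		return -1;
	for (i = 0; i < count; i++) {
		storage[i] = malloc(sizeof(*storage[i]));
		if (!storage[i])
			goto out_cleanup;
	}

	pthread_mutex_lock(&lock);
	for (s = start; s < end; s += step)
		used += add_entry_locked(storage[used], s);
	pthread_mutex_unlock(&lock);
	status = 0;

out_cleanup:
	for (i = used; i < count && storage[i]; i++)
		free(storage[i]);       /* release whatever went unused */
	free(storage);
	return status;
}

int main(void)
{
	return preload_range(0, 64, 8);
}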
182static void set_needs_init(sector_t *array, sector_t offset)
183{
184 sector_t *p = array;
185
186 dprintk("%s enter\n", __func__);
187 if (!p)
188 return;
189 while (*p < offset)
190 p++;
191 if (*p == offset)
192 return;
193 else if (*p == ~0) {
194 *p++ = offset;
195 *p = ~0;
196 return;
197 } else {
198 sector_t *save = p;
199 dprintk("%s Adding %llu\n", __func__, (u64)offset);
200 while (*p != ~0)
201 p++;
202 p++;
203 memmove(save + 1, save, (char *)p - (char *)save);
204 *save = offset;
205 return;
206 }
207}
208
209/* We are relying on the page lock to serialize this */ 177
210int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) 178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
211{ 179{
212 int rv; 180 int rv;
213 181
214 spin_lock(&marks->im_lock); 182 spin_lock_bh(&marks->im_lock);
215 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); 183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
216 spin_unlock(&marks->im_lock); 184 spin_unlock_bh(&marks->im_lock);
217 return rv; 185 return rv;
218} 186}
219 187
@@ -253,78 +221,39 @@ static int is_range_written(struct pnfs_inval_markings *marks,
253{ 221{
254 int rv; 222 int rv;
255 223
256 spin_lock(&marks->im_lock); 224 spin_lock_bh(&marks->im_lock);
257 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); 225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
258 spin_unlock(&marks->im_lock); 226 spin_unlock_bh(&marks->im_lock);
259 return rv; 227 return rv;
260} 228}
261 229
262/* Marks sectors in [offset, offset + length) as having been initialized. 230
263 * All lengths are step-aligned, where step is min(pagesize, blocksize). 231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
264 * Notes where partial block is initialized, and helps prepare it for 232 * Currently assumes offset is page-aligned
265 * complete initialization later.
266 */ 233 */
267/* Currently assumes offset is page-aligned */
268int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
269 sector_t offset, sector_t length, 235 sector_t offset, sector_t length)
270 sector_t **pages)
271{ 236{
272 sector_t s, start, end; 237 sector_t start, end;
273 sector_t *array = NULL; /* Pages to mark */
274 238
275 dprintk("%s(offset=%llu,len=%llu) enter\n", 239 dprintk("%s(offset=%llu,len=%llu) enter\n",
276 __func__, (u64)offset, (u64)length); 240 __func__, (u64)offset, (u64)length);
277 s = max((sector_t) 3,
278 2 * (marks->im_block_size / (PAGE_CACHE_SECTORS)));
279 dprintk("%s set max=%llu\n", __func__, (u64)s);
280 if (pages) {
281 array = kmalloc(s * sizeof(sector_t), GFP_NOFS);
282 if (!array)
283 goto outerr;
284 array[0] = ~0;
285 }
286 241
287 start = normalize(offset, marks->im_block_size); 242 start = normalize(offset, marks->im_block_size);
288 end = normalize_up(offset + length, marks->im_block_size); 243 end = normalize_up(offset + length, marks->im_block_size);
289 if (_preload_range(&marks->im_tree, start, end - start)) 244 if (_preload_range(marks, start, end - start))
290 goto outerr; 245 goto outerr;
291 246
292 spin_lock(&marks->im_lock); 247 spin_lock_bh(&marks->im_lock);
293
294 for (s = normalize_up(start, PAGE_CACHE_SECTORS);
295 s < offset; s += PAGE_CACHE_SECTORS) {
296 dprintk("%s pre-area pages\n", __func__);
297 /* Portion of used block is not initialized */
298 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
299 set_needs_init(array, s);
300 }
301 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) 248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
302 goto out_unlock; 249 goto out_unlock;
303 for (s = normalize_up(offset + length, PAGE_CACHE_SECTORS); 250 spin_unlock_bh(&marks->im_lock);
304 s < end; s += PAGE_CACHE_SECTORS) {
305 dprintk("%s post-area pages\n", __func__);
306 if (!_has_tag(&marks->im_tree, s, EXTENT_INITIALIZED))
307 set_needs_init(array, s);
308 }
309
310 spin_unlock(&marks->im_lock);
311 251
312 if (pages) {
313 if (array[0] == ~0) {
314 kfree(array);
315 *pages = NULL;
316 } else
317 *pages = array;
318 }
319 return 0; 252 return 0;
320 253
321 out_unlock: 254out_unlock:
322 spin_unlock(&marks->im_lock); 255 spin_unlock_bh(&marks->im_lock);
323 outerr: 256outerr:
324 if (pages) {
325 kfree(array);
326 *pages = NULL;
327 }
328 return -ENOMEM; 257 return -ENOMEM;
329} 258}
330 259
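bl_mark_sectors_init() is reduced above to pure block alignment plus one tagging pass. Assuming normalize() rounds down to a multiple of its base and normalize_up() rounds up, which matches how they are used here (their definitions are outside this hunk), the alignment arithmetic amounts to:

#include <stdio.h>
#include <stdint.h>

/* Assumed behaviour of the normalize()/normalize_up() helpers. */
static uint64_t normalize(uint64_t s, uint64_t base)
{
	return s - (s % base);
}

static uint64_t normalize_up(uint64_t s, uint64_t base)
{
	return normalize(s + base - 1, base);
}

int main(void)
{
	uint64_t block = 8, offset = 13, length = 3;
	uint64_t start = normalize(offset, block);              /* 8  */
	uint64_t end = normalize_up(offset + length, block);    /* 16 */

	printf("preload [%llu, %llu) to cover [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)offset,
	       (unsigned long long)(offset + length));
	return 0;
}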
@@ -338,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks,
338 267
339 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, 268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
340 (u64)offset, (u64)length); 269 (u64)offset, (u64)length);
341 spin_lock(&marks->im_lock); 270 spin_lock_bh(&marks->im_lock);
342 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); 271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
343 spin_unlock(&marks->im_lock); 272 spin_unlock_bh(&marks->im_lock);
344 return status; 273 return status;
345} 274}
346 275
@@ -440,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
440 369
441/* Note the range described by offset, length is guaranteed to be contained 370/* Note the range described by offset, length is guaranteed to be contained
442 * within be. 371 * within be.
 372 * new will be freed, either by this function or by add_to_commitlist if
 373 * it decides not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
443 */ 374 */
444int bl_mark_for_commit(struct pnfs_block_extent *be, 375int bl_mark_for_commit(struct pnfs_block_extent *be,
445 sector_t offset, sector_t length) 376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
446{ 378{
447 sector_t new_end, end = offset + length; 379 sector_t new_end, end = offset + length;
448 struct pnfs_block_short_extent *new;
449 struct pnfs_block_layout *bl = container_of(be->be_inval, 380 struct pnfs_block_layout *bl = container_of(be->be_inval,
450 struct pnfs_block_layout, 381 struct pnfs_block_layout,
451 bl_inval); 382 bl_inval);
452 383
453 new = kmalloc(sizeof(*new), GFP_NOFS);
454 if (!new)
455 return -ENOMEM;
456
457 mark_written_sectors(be->be_inval, offset, length); 384 mark_written_sectors(be->be_inval, offset, length);
458 /* We want to add the range to commit list, but it must be 385 /* We want to add the range to commit list, but it must be
459 * block-normalized, and verified that the normalized range has 386 * block-normalized, and verified that the normalized range has
@@ -483,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
483 new->bse_mdev = be->be_mdev; 410 new->bse_mdev = be->be_mdev;
484 411
485 spin_lock(&bl->bl_ext_lock); 412 spin_lock(&bl->bl_ext_lock);
486 /* new will be freed, either by add_to_commitlist if it decides not
487 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
488 */
489 add_to_commitlist(bl, new); 413 add_to_commitlist(bl, new);
490 spin_unlock(&bl->bl_ext_lock); 414 spin_unlock(&bl->bl_ext_lock);
491 return 0; 415 return 0;
@@ -933,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
933 } 857 }
934 } 858 }
935} 859}
860
861int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
862{
863 struct pnfs_block_short_extent *new;
864
865 new = kmalloc(sizeof(*new), GFP_NOFS);
866 if (unlikely(!new))
867 return -ENOMEM;
868
869 spin_lock_bh(&marks->im_lock);
870 list_add(&new->bse_node, &marks->im_extents);
871 spin_unlock_bh(&marks->im_lock);
872
873 return 0;
874}
875
876struct pnfs_block_short_extent *
877bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
878{
879 struct pnfs_block_short_extent *rv = NULL;
880
881 spin_lock_bh(&marks->im_lock);
882 if (!list_empty(&marks->im_extents)) {
883 rv = list_entry((&marks->im_extents)->next,
884 struct pnfs_block_short_extent, bse_node);
885 list_del_init(&rv->bse_node);
886 }
887 spin_unlock_bh(&marks->im_lock);
888
889 return rv;
890}
891
892void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
893{
894 struct pnfs_block_short_extent *se = NULL, *tmp;
895
896 if (num_to_free <= 0)
897 return;
898
 899 spin_lock_bh(&marks->im_lock);
900 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
901 list_del(&se->bse_node);
902 kfree(se);
903 if (--num_to_free == 0)
904 break;
905 }
 906 spin_unlock_bh(&marks->im_lock);
907
908 BUG_ON(num_to_free > 0);
909}
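Together the three helpers above form a reservation pool, which is what the "Short extents for INVAL->RW conversion" field added to pnfs_inval_markings is for: callers push one pre-allocated short extent per anticipated conversion while allocation may still sleep, pop entries later from contexts that must not fail, and free any leftovers. A user-space analogue of that life cycle (the pthread lock and names are illustrative):

#include <pthread.h>
#include <stdlib.h>

struct short_extent { struct short_extent *next; };

static struct short_extent *pool;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

static int push_one(void)                 /* mirrors bl_push_one_short_extent */
{
	struct short_extent *new = malloc(sizeof(*new));

	if (!new)
		return -1;
	pthread_mutex_lock(&pool_lock);
	new->next = pool;
	pool = new;
	pthread_mutex_unlock(&pool_lock);
	return 0;
}

static struct short_extent *pop_one(void) /* mirrors bl_pop_one_short_extent */
{
	struct short_extent *rv;

	pthread_mutex_lock(&pool_lock);
	rv = pool;
	if (rv)
		pool = rv->next;
	pthread_mutex_unlock(&pool_lock);
	return rv;                        /* NULL only if nothing was pushed */
}

static void free_extents(int num_to_free) /* mirrors bl_free_short_extents */
{
	struct short_extent *se;

	while (num_to_free-- > 0 && (se = pop_one()))
		free(se);
}

int main(void)
{
	if (push_one() || push_one())
		return 1;
	free(pop_one());                  /* consume one reservation */
	free_extents(1);                  /* release the leftover */
	return 0;
}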
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 07df5f1d85e5..c89d3b9e483c 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -162,7 +162,7 @@ struct cb_layoutrecallargs {
162 }; 162 };
163}; 163};
164 164
165extern unsigned nfs4_callback_layoutrecall( 165extern __be32 nfs4_callback_layoutrecall(
166 struct cb_layoutrecallargs *args, 166 struct cb_layoutrecallargs *args,
167 void *dummy, struct cb_process_state *cps); 167 void *dummy, struct cb_process_state *cps);
168 168
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 43926add945b..54cea8ad5a76 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -339,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
339 dprintk("%s enter. slotid %d seqid %d\n", 339 dprintk("%s enter. slotid %d seqid %d\n",
340 __func__, args->csa_slotid, args->csa_sequenceid); 340 __func__, args->csa_slotid, args->csa_sequenceid);
341 341
342 if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS) 342 if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
343 return htonl(NFS4ERR_BADSLOT); 343 return htonl(NFS4ERR_BADSLOT);
344 344
345 slot = tbl->slots + args->csa_slotid; 345 slot = tbl->slots + args->csa_slotid;
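The one-character change above closes an off-by-one: valid indices into a table of NFS41_BC_MAX_CALLBACKS slots run from 0 to MAX - 1, so an index equal to MAX must be rejected too, which the old '>' test failed to do before slots + csa_slotid was dereferenced. A minimal demonstration of the rule:

#include <assert.h>

#define MAX_SLOTS 16

static int slot_valid(unsigned int slotid)
{
	/* '>' would wrongly accept slotid == MAX_SLOTS and index one
	 * element past the end of the slot array; '>=', i.e. '<' here,
	 * rejects it. */
	return slotid < MAX_SLOTS;
}

int main(void)
{
	assert(slot_valid(0));
	assert(slot_valid(MAX_SLOTS - 1));
	assert(!slot_valid(MAX_SLOTS)); /* the case the old check let through */
	return 0;
}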
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 726e59a9e50f..d50b2742f23b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -305,6 +305,10 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
305 n = ntohl(*p++); 305 n = ntohl(*p++);
306 if (n <= 0) 306 if (n <= 0)
307 goto out; 307 goto out;
308 if (n > ULONG_MAX / sizeof(*args->devs)) {
309 status = htonl(NFS4ERR_BADXDR);
310 goto out;
311 }
308 312
309 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); 313 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
310 if (!args->devs) { 314 if (!args->devs) {
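The new check guards the n * sizeof(*args->devs) multiplication inside kmalloc(): a large count read from the wire could otherwise wrap the computed allocation size and yield an undersized buffer. The same idiom in portable C, with SIZE_MAX in the role of ULONG_MAX:

#include <stdint.h>
#include <stdlib.h>

struct dev_notify { uint64_t id[2]; };

static struct dev_notify *alloc_devs(size_t n)
{
	/* Reject counts that would overflow the size computation before
	 * they reach malloc(), as the XDR decoder now does with
	 * NFS4ERR_BADXDR. */
	if (n == 0 || n > SIZE_MAX / sizeof(struct dev_notify))
		return NULL;
	return malloc(n * sizeof(struct dev_notify));
}

int main(void)
{
	free(alloc_devs(4));                     /* sane count succeeds */
	return alloc_devs(SIZE_MAX / 2) != NULL; /* overflow case rejected */
}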
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 873bf00d51a2..31778f74357d 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -84,7 +84,7 @@ retry:
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS 85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */ 86 */
87static int nfs4_disable_idmapping = 0; 87static bool nfs4_disable_idmapping = true;
88 88
89/* 89/*
90 * RPC cruft for NFS 90 * RPC cruft for NFS
@@ -185,7 +185,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
185 clp->cl_minorversion = cl_init->minorversion; 185 clp->cl_minorversion = cl_init->minorversion;
186 clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; 186 clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
187#endif 187#endif
188 cred = rpc_lookup_machine_cred(); 188 cred = rpc_lookup_machine_cred("*");
189 if (!IS_ERR(cred)) 189 if (!IS_ERR(cred))
190 clp->cl_machine_cred = cred; 190 clp->cl_machine_cred = cred;
191 nfs_fscache_get_client_cookie(clp); 191 nfs_fscache_get_client_cookie(clp);
@@ -250,6 +250,11 @@ static void pnfs_init_server(struct nfs_server *server)
250 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); 250 rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
251} 251}
252 252
253static void nfs4_destroy_server(struct nfs_server *server)
254{
255 nfs4_purge_state_owners(server);
256}
257
253#else 258#else
254static void nfs4_shutdown_client(struct nfs_client *clp) 259static void nfs4_shutdown_client(struct nfs_client *clp)
255{ 260{
@@ -1065,6 +1070,7 @@ static struct nfs_server *nfs_alloc_server(void)
1065 INIT_LIST_HEAD(&server->master_link); 1070 INIT_LIST_HEAD(&server->master_link);
1066 INIT_LIST_HEAD(&server->delegations); 1071 INIT_LIST_HEAD(&server->delegations);
1067 INIT_LIST_HEAD(&server->layouts); 1072 INIT_LIST_HEAD(&server->layouts);
1073 INIT_LIST_HEAD(&server->state_owners_lru);
1068 1074
1069 atomic_set(&server->active, 0); 1075 atomic_set(&server->active, 0);
1070 1076
@@ -1538,6 +1544,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1538 1544
1539 nfs_server_insert_lists(server); 1545 nfs_server_insert_lists(server);
1540 server->mount_time = jiffies; 1546 server->mount_time = jiffies;
1547 server->destroy = nfs4_destroy_server;
1541out: 1548out:
1542 nfs_free_fattr(fattr); 1549 nfs_free_fattr(fattr);
1543 return error; 1550 return error;
@@ -1719,6 +1726,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
1719 1726
1720 /* Copy data from the source */ 1727 /* Copy data from the source */
1721 server->nfs_client = source->nfs_client; 1728 server->nfs_client = source->nfs_client;
1729 server->destroy = source->destroy;
1722 atomic_inc(&server->nfs_client->cl_count); 1730 atomic_inc(&server->nfs_client->cl_count);
1723 nfs_server_copy_userdata(server, source); 1731 nfs_server_copy_userdata(server, source);
1724 1732
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 606ef0f20aed..c43a452f7da2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -272,13 +272,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
272 datasync); 272 datasync);
273 273
274 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 274 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
275 if (ret)
276 return ret;
277 mutex_lock(&inode->i_mutex); 275 mutex_lock(&inode->i_mutex);
278 276
279 nfs_inc_stats(inode, NFSIOS_VFSFSYNC); 277 nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
280 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 278 have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
281 status = nfs_commit_inode(inode, FLUSH_SYNC); 279 status = nfs_commit_inode(inode, FLUSH_SYNC);
280 if (status >= 0 && ret < 0)
281 status = ret;
282 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); 282 have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
283 if (have_error) 283 if (have_error)
284 ret = xchg(&ctx->error, 0); 284 ret = xchg(&ctx->error, 0);
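The nfs_file_fsync() change above stops returning early when the pre-flush fails: the commit is still attempted, and the earlier writeback error only overrides a successful commit status. The control flow, reduced to its skeleton with placeholder function names:

#include <stdio.h>

static int flush_dirty_pages(void) { return -5; } /* pretend -EIO from writeback */
static int commit_to_server(void)  { return 0; }  /* commit itself succeeds */

static int fsync_like(void)
{
	int ret = flush_dirty_pages();
	int status = commit_to_server(); /* always attempted, even after error */

	if (status >= 0 && ret < 0)
		status = ret;            /* surface the first failure */
	return status;
}

int main(void)
{
	printf("fsync result: %d\n", fsync_like()); /* -5, not silently 0 */
	return 0;
}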
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 47d1c6ff2d8e..2c05f1991e1e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -38,6 +38,89 @@
38#include <linux/kernel.h> 38#include <linux/kernel.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/nfs_idmap.h> 40#include <linux/nfs_idmap.h>
41#include <linux/nfs_fs.h>
42
43/**
44 * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
45 * @fattr: fully initialised struct nfs_fattr
46 * @owner_name: owner name string cache
47 * @group_name: group name string cache
48 */
49void nfs_fattr_init_names(struct nfs_fattr *fattr,
50 struct nfs4_string *owner_name,
51 struct nfs4_string *group_name)
52{
53 fattr->owner_name = owner_name;
54 fattr->group_name = group_name;
55}
56
57static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
58{
59 fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
60 kfree(fattr->owner_name->data);
61}
62
63static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
64{
65 fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
66 kfree(fattr->group_name->data);
67}
68
69static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
70{
71 struct nfs4_string *owner = fattr->owner_name;
72 __u32 uid;
73
74 if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
75 return false;
76 if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) {
77 fattr->uid = uid;
78 fattr->valid |= NFS_ATTR_FATTR_OWNER;
79 }
80 return true;
81}
82
83static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
84{
85 struct nfs4_string *group = fattr->group_name;
86 __u32 gid;
87
88 if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
89 return false;
90 if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) {
91 fattr->gid = gid;
92 fattr->valid |= NFS_ATTR_FATTR_GROUP;
93 }
94 return true;
95}
96
97/**
98 * nfs_fattr_free_names - free up the NFSv4 owner and group strings
99 * @fattr: a fully initialised nfs_fattr structure
100 */
101void nfs_fattr_free_names(struct nfs_fattr *fattr)
102{
103 if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)
104 nfs_fattr_free_owner_name(fattr);
105 if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)
106 nfs_fattr_free_group_name(fattr);
107}
108
109/**
110 * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free
111 * @server: pointer to the filesystem nfs_server structure
112 * @fattr: a fully initialised nfs_fattr structure
113 *
114 * This helper maps the cached NFSv4 owner/group strings in fattr into
115 * their numeric uid/gid equivalents, and then frees the cached strings.
116 */
117void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
118{
119 if (nfs_fattr_map_owner_name(server, fattr))
120 nfs_fattr_free_owner_name(fattr);
121 if (nfs_fattr_map_group_name(server, fattr))
122 nfs_fattr_free_group_name(fattr);
123}
41 124
42static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) 125static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
43{ 126{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 81db25e92e10..f649fba8c384 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -57,7 +57,7 @@
57#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 57#define NFS_64_BIT_INODE_NUMBERS_ENABLED 1
58 58
59/* Default is to see 64-bit inode numbers */ 59/* Default is to see 64-bit inode numbers */
60static int enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; 60static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
61 61
62static void nfs_invalidate_inode(struct inode *); 62static void nfs_invalidate_inode(struct inode *);
63static int nfs_update_inode(struct inode *, struct nfs_fattr *); 63static int nfs_update_inode(struct inode *, struct nfs_fattr *);
@@ -1020,6 +1020,8 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
1020 fattr->valid = 0; 1020 fattr->valid = 0;
1021 fattr->time_start = jiffies; 1021 fattr->time_start = jiffies;
1022 fattr->gencount = nfs_inc_attr_generation_counter(); 1022 fattr->gencount = nfs_inc_attr_generation_counter();
1023 fattr->owner_name = NULL;
1024 fattr->group_name = NULL;
1023} 1025}
1024 1026
1025struct nfs_fattr *nfs_alloc_fattr(void) 1027struct nfs_fattr *nfs_alloc_fattr(void)
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3f4d95751d52..8102db9b926c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -307,6 +307,8 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
307/* write.c */ 307/* write.c */
308extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc, 308extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
309 struct list_head *head); 309 struct list_head *head);
310extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
311 struct inode *inode, int ioflags);
310extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); 312extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
311extern void nfs_writedata_release(struct nfs_write_data *wdata); 313extern void nfs_writedata_release(struct nfs_write_data *wdata);
312extern void nfs_commit_free(struct nfs_write_data *p); 314extern void nfs_commit_free(struct nfs_write_data *p);
@@ -330,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data);
330 332
331#ifdef CONFIG_MIGRATION 333#ifdef CONFIG_MIGRATION
332extern int nfs_migrate_page(struct address_space *, 334extern int nfs_migrate_page(struct address_space *,
333 struct page *, struct page *); 335 struct page *, struct page *, enum migrate_mode);
334#else 336#else
335#define nfs_migrate_page NULL 337#define nfs_migrate_page NULL
336#endif 338#endif
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 693ae22f8731..4d7d0aedc101 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -94,6 +94,8 @@ struct nfs_unique_id {
94struct nfs4_state_owner { 94struct nfs4_state_owner {
95 struct nfs_unique_id so_owner_id; 95 struct nfs_unique_id so_owner_id;
96 struct nfs_server *so_server; 96 struct nfs_server *so_server;
97 struct list_head so_lru;
98 unsigned long so_expires;
97 struct rb_node so_server_node; 99 struct rb_node so_server_node;
98 100
99 struct rpc_cred *so_cred; /* Associated cred */ 101 struct rpc_cred *so_cred; /* Associated cred */
@@ -319,6 +321,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
319 321
320extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 322extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
321extern void nfs4_put_state_owner(struct nfs4_state_owner *); 323extern void nfs4_put_state_owner(struct nfs4_state_owner *);
324extern void nfs4_purge_state_owners(struct nfs_server *);
322extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); 325extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
323extern void nfs4_put_open_state(struct nfs4_state *); 326extern void nfs4_put_open_state(struct nfs4_state *);
324extern void nfs4_close_state(struct nfs4_state *, fmode_t); 327extern void nfs4_close_state(struct nfs4_state *, fmode_t);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index a62d36b9a99e..71ec08617e23 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -49,13 +49,14 @@ filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
49 loff_t offset) 49 loff_t offset)
50{ 50{
51 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count; 51 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
52 u64 tmp; 52 u64 stripe_no;
53 u32 rem;
53 54
54 offset -= flseg->pattern_offset; 55 offset -= flseg->pattern_offset;
55 tmp = offset; 56 stripe_no = div_u64(offset, stripe_width);
56 do_div(tmp, stripe_width); 57 div_u64_rem(offset, flseg->stripe_unit, &rem);
57 58
58 return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit); 59 return stripe_no * flseg->stripe_unit + rem;
59} 60}
60 61
61/* This function is used by the layout driver to calculate the 62/* This function is used by the layout driver to calculate the
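do_div() divides its 64-bit argument in place and returns the remainder, an interface that is easy to misuse in a compound expression; the rewrite with div_u64()/div_u64_rem() makes quotient and remainder explicit. In plain C the dense-offset computation reduces to ordinary '/' and '%' (the parameter values below are examples only):

#include <stdio.h>
#include <stdint.h>

/* Dense (packed) offset within one data server's file, given a file
 * offset into the striped layout. */
static uint64_t dense_offset(uint64_t offset, uint32_t stripe_unit,
			     uint32_t stripe_count, uint64_t pattern_offset)
{
	uint64_t stripe_width = (uint64_t)stripe_unit * stripe_count;
	uint64_t stripe_no, rem;

	offset -= pattern_offset;
	stripe_no = offset / stripe_width;  /* div_u64() in the kernel */
	rem = offset % stripe_unit;         /* div_u64_rem() in the kernel */

	return stripe_no * stripe_unit + rem;
}

int main(void)
{
	/* 4 data servers, 64k stripe unit: byte 300000 of the file lands
	 * 300000 % 65536 = 37856 bytes into stripe 1 (width 262144),
	 * so the dense offset is 65536 + 37856 = 103392. */
	printf("%llu\n", (unsigned long long)dense_offset(300000, 65536, 4, 0));
	return 0;
}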
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index ed388aae9689..8ae91908f5aa 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -382,7 +382,7 @@ decode_ds_addr(struct xdr_stream *streamp, gfp_t gfp_flags)
382{ 382{
383 struct nfs4_pnfs_ds_addr *da = NULL; 383 struct nfs4_pnfs_ds_addr *da = NULL;
384 char *buf, *portstr; 384 char *buf, *portstr;
385 u32 port; 385 __be16 port;
386 int nlen, rlen; 386 int nlen, rlen;
387 int tmp[2]; 387 int tmp[2];
388 __be32 *p; 388 __be32 *p;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dcda0ba7af60..f0c849c98fe4 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -52,6 +52,7 @@
52#include <linux/namei.h> 52#include <linux/namei.h>
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/module.h> 54#include <linux/module.h>
55#include <linux/nfs_idmap.h>
55#include <linux/sunrpc/bc_xprt.h> 56#include <linux/sunrpc/bc_xprt.h>
56#include <linux/xattr.h> 57#include <linux/xattr.h>
57#include <linux/utsname.h> 58#include <linux/utsname.h>
@@ -364,9 +365,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
364 * Must be called while holding tbl->slot_tbl_lock 365 * Must be called while holding tbl->slot_tbl_lock
365 */ 366 */
366static void 367static void
367nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot) 368nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
368{ 369{
369 int free_slotid = free_slot - tbl->slots;
370 int slotid = free_slotid; 370 int slotid = free_slotid;
371 371
372 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE); 372 BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
@@ -431,7 +431,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
431 } 431 }
432 432
433 spin_lock(&tbl->slot_tbl_lock); 433 spin_lock(&tbl->slot_tbl_lock);
434 nfs4_free_slot(tbl, res->sr_slot); 434 nfs4_free_slot(tbl, res->sr_slot - tbl->slots);
435 nfs4_check_drain_fc_complete(res->sr_session); 435 nfs4_check_drain_fc_complete(res->sr_session);
436 spin_unlock(&tbl->slot_tbl_lock); 436 spin_unlock(&tbl->slot_tbl_lock);
437 res->sr_slot = NULL; 437 res->sr_slot = NULL;
@@ -554,13 +554,10 @@ int nfs41_setup_sequence(struct nfs4_session *session,
554 spin_lock(&tbl->slot_tbl_lock); 554 spin_lock(&tbl->slot_tbl_lock);
555 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && 555 if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
556 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { 556 !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
557 /* 557 /* The state manager will wait until the slot table is empty */
558 * The state manager will wait until the slot table is empty.
559 * Schedule the reset thread
560 */
561 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL); 558 rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
562 spin_unlock(&tbl->slot_tbl_lock); 559 spin_unlock(&tbl->slot_tbl_lock);
563 dprintk("%s Schedule Session Reset\n", __func__); 560 dprintk("%s session is draining\n", __func__);
564 return -EAGAIN; 561 return -EAGAIN;
565 } 562 }
566 563
@@ -765,6 +762,8 @@ struct nfs4_opendata {
765 struct nfs_openres o_res; 762 struct nfs_openres o_res;
766 struct nfs_open_confirmargs c_arg; 763 struct nfs_open_confirmargs c_arg;
767 struct nfs_open_confirmres c_res; 764 struct nfs_open_confirmres c_res;
765 struct nfs4_string owner_name;
766 struct nfs4_string group_name;
768 struct nfs_fattr f_attr; 767 struct nfs_fattr f_attr;
769 struct nfs_fattr dir_attr; 768 struct nfs_fattr dir_attr;
770 struct dentry *dir; 769 struct dentry *dir;
@@ -788,6 +787,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
788 p->o_res.server = p->o_arg.server; 787 p->o_res.server = p->o_arg.server;
789 nfs_fattr_init(&p->f_attr); 788 nfs_fattr_init(&p->f_attr);
790 nfs_fattr_init(&p->dir_attr); 789 nfs_fattr_init(&p->dir_attr);
790 nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
791} 791}
792 792
793static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, 793static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
@@ -819,6 +819,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
819 p->o_arg.name = &dentry->d_name; 819 p->o_arg.name = &dentry->d_name;
820 p->o_arg.server = server; 820 p->o_arg.server = server;
821 p->o_arg.bitmask = server->attr_bitmask; 821 p->o_arg.bitmask = server->attr_bitmask;
822 p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
822 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; 823 p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
823 if (flags & O_CREAT) { 824 if (flags & O_CREAT) {
824 u32 *s; 825 u32 *s;
@@ -855,6 +856,7 @@ static void nfs4_opendata_free(struct kref *kref)
855 dput(p->dir); 856 dput(p->dir);
856 dput(p->dentry); 857 dput(p->dentry);
857 nfs_sb_deactive(sb); 858 nfs_sb_deactive(sb);
859 nfs_fattr_free_names(&p->f_attr);
858 kfree(p); 860 kfree(p);
859} 861}
860 862
@@ -1579,6 +1581,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
1579 if (status != 0 || !data->rpc_done) 1581 if (status != 0 || !data->rpc_done)
1580 return status; 1582 return status;
1581 1583
1584 nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
1585
1582 nfs_refresh_inode(dir, o_res->dir_attr); 1586 nfs_refresh_inode(dir, o_res->dir_attr);
1583 1587
1584 if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1588 if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -1611,6 +1615,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1611 return status; 1615 return status;
1612 } 1616 }
1613 1617
1618 nfs_fattr_map_and_free_names(server, &data->f_attr);
1619
1614 if (o_arg->open_flags & O_CREAT) { 1620 if (o_arg->open_flags & O_CREAT) {
1615 update_changeattr(dir, &o_res->cinfo); 1621 update_changeattr(dir, &o_res->cinfo);
1616 nfs_post_op_update_inode(dir, o_res->dir_attr); 1622 nfs_post_op_update_inode(dir, o_res->dir_attr);
@@ -3431,19 +3437,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
3431 */ 3437 */
3432#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT) 3438#define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT)
3433 3439
3434static void buf_to_pages(const void *buf, size_t buflen,
3435 struct page **pages, unsigned int *pgbase)
3436{
3437 const void *p = buf;
3438
3439 *pgbase = offset_in_page(buf);
3440 p -= *pgbase;
3441 while (p < buf + buflen) {
3442 *(pages++) = virt_to_page(p);
3443 p += PAGE_CACHE_SIZE;
3444 }
3445}
3446
3447static int buf_to_pages_noslab(const void *buf, size_t buflen, 3440static int buf_to_pages_noslab(const void *buf, size_t buflen,
3448 struct page **pages, unsigned int *pgbase) 3441 struct page **pages, unsigned int *pgbase)
3449{ 3442{
@@ -3540,9 +3533,19 @@ out:
3540 nfs4_set_cached_acl(inode, acl); 3533 nfs4_set_cached_acl(inode, acl);
3541} 3534}
3542 3535
3536/*
3537 * The getxattr API returns the required buffer length when called with a
3538 * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
3538 * the required buf. On a NULL buf, we supply a page-sized receive buffer,
3540 * guessing that the ACL request can be serviced by a page. If so, we cache
3541 * up to the page of ACL data, and the 2nd call to getxattr is serviced by
3542 * the cache. If not so, we throw away the page, and cache the required
3543 * length. The next getxattr call will then produce another round trip to
3544 * the server, this time with the input buf of the required size.
3545 */
3543static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen) 3546static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
3544{ 3547{
3545 struct page *pages[NFS4ACL_MAXPAGES]; 3548 struct page *pages[NFS4ACL_MAXPAGES] = {NULL, };
3546 struct nfs_getaclargs args = { 3549 struct nfs_getaclargs args = {
3547 .fh = NFS_FH(inode), 3550 .fh = NFS_FH(inode),
3548 .acl_pages = pages, 3551 .acl_pages = pages,
@@ -3557,41 +3560,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3557 .rpc_argp = &args, 3560 .rpc_argp = &args,
3558 .rpc_resp = &res, 3561 .rpc_resp = &res,
3559 }; 3562 };
3560 struct page *localpage = NULL; 3563 int ret = -ENOMEM, npages, i, acl_len = 0;
3561 int ret;
3562 3564
3563 if (buflen < PAGE_SIZE) { 3565 npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
3564 /* As long as we're doing a round trip to the server anyway, 3566 /* As long as we're doing a round trip to the server anyway,
3565 * let's be prepared for a page of acl data. */ 3567 * let's be prepared for a page of acl data. */
3566 localpage = alloc_page(GFP_KERNEL); 3568 if (npages == 0)
3567 resp_buf = page_address(localpage); 3569 npages = 1;
3568 if (localpage == NULL) 3570
3569 return -ENOMEM; 3571 for (i = 0; i < npages; i++) {
3570 args.acl_pages[0] = localpage; 3572 pages[i] = alloc_page(GFP_KERNEL);
3571 args.acl_pgbase = 0; 3573 if (!pages[i])
3572 args.acl_len = PAGE_SIZE; 3574 goto out_free;
3573 } else {
3574 resp_buf = buf;
3575 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
3576 } 3575 }
3577 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0); 3576 if (npages > 1) {
3577 /* for decoding across pages */
3578 args.acl_scratch = alloc_page(GFP_KERNEL);
3579 if (!args.acl_scratch)
3580 goto out_free;
3581 }
3582 args.acl_len = npages * PAGE_SIZE;
3583 args.acl_pgbase = 0;
3584 /* Let decode_getfacl know not to fail if the ACL data is larger than
3585 * the page we send as a guess */
3586 if (buf == NULL)
3587 res.acl_flags |= NFS4_ACL_LEN_REQUEST;
3588 resp_buf = page_address(pages[0]);
3589
3590 dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
3591 __func__, buf, buflen, npages, args.acl_len);
3592 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
3593 &msg, &args.seq_args, &res.seq_res, 0);
3578 if (ret) 3594 if (ret)
3579 goto out_free; 3595 goto out_free;
3580 if (res.acl_len > args.acl_len) 3596
3581 nfs4_write_cached_acl(inode, NULL, res.acl_len); 3597 acl_len = res.acl_len - res.acl_data_offset;
3598 if (acl_len > args.acl_len)
3599 nfs4_write_cached_acl(inode, NULL, acl_len);
3582 else 3600 else
3583 nfs4_write_cached_acl(inode, resp_buf, res.acl_len); 3601 nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset,
3602 acl_len);
3584 if (buf) { 3603 if (buf) {
3585 ret = -ERANGE; 3604 ret = -ERANGE;
3586 if (res.acl_len > buflen) 3605 if (acl_len > buflen)
3587 goto out_free; 3606 goto out_free;
3588 if (localpage) 3607 _copy_from_pages(buf, pages, res.acl_data_offset,
3589 memcpy(buf, resp_buf, res.acl_len); 3608 res.acl_len);
3590 } 3609 }
3591 ret = res.acl_len; 3610 ret = acl_len;
3592out_free: 3611out_free:
3593 if (localpage) 3612 for (i = 0; i < npages; i++)
3594 __free_page(localpage); 3613 if (pages[i])
3614 __free_page(pages[i]);
3615 if (args.acl_scratch)
3616 __free_page(args.acl_scratch);
3595 return ret; 3617 return ret;
3596} 3618}
3597 3619
@@ -3622,6 +3644,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
3622 nfs_zap_acl_cache(inode); 3644 nfs_zap_acl_cache(inode);
3623 ret = nfs4_read_cached_acl(inode, buf, buflen); 3645 ret = nfs4_read_cached_acl(inode, buf, buflen);
3624 if (ret != -ENOENT) 3646 if (ret != -ENOENT)
3647 /* -ENOENT is returned if there is no ACL or if there is an ACL
3648 * but no cached acl data, just the acl length */
3625 return ret; 3649 return ret;
3626 return nfs4_get_acl_uncached(inode, buf, buflen); 3650 return nfs4_get_acl_uncached(inode, buf, buflen);
3627} 3651}
@@ -5022,23 +5046,6 @@ out:
5022 return ret; 5046 return ret;
5023} 5047}
5024 5048
5025/*
5026 * Reset the forechannel and backchannel slot tables
5027 */
5028static int nfs4_reset_slot_tables(struct nfs4_session *session)
5029{
5030 int status;
5031
5032 status = nfs4_reset_slot_table(&session->fc_slot_table,
5033 session->fc_attrs.max_reqs, 1);
5034 if (status)
5035 return status;
5036
5037 status = nfs4_reset_slot_table(&session->bc_slot_table,
5038 session->bc_attrs.max_reqs, 0);
5039 return status;
5040}
5041
5042/* Destroy the slot table */ 5049/* Destroy the slot table */
5043static void nfs4_destroy_slot_tables(struct nfs4_session *session) 5050static void nfs4_destroy_slot_tables(struct nfs4_session *session)
5044{ 5051{
@@ -5084,29 +5091,35 @@ out:
5084} 5091}
5085 5092
5086/* 5093/*
5087 * Initialize the forechannel and backchannel tables 5094 * Initialize or reset the forechannel and backchannel tables
5088 */ 5095 */
5089static int nfs4_init_slot_tables(struct nfs4_session *session) 5096static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
5090{ 5097{
5091 struct nfs4_slot_table *tbl; 5098 struct nfs4_slot_table *tbl;
5092 int status = 0; 5099 int status;
5093 5100
5094 tbl = &session->fc_slot_table; 5101 dprintk("--> %s\n", __func__);
5102 /* Fore channel */
5103 tbl = &ses->fc_slot_table;
5095 if (tbl->slots == NULL) { 5104 if (tbl->slots == NULL) {
5096 status = nfs4_init_slot_table(tbl, 5105 status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
5097 session->fc_attrs.max_reqs, 1); 5106 if (status) /* -ENOMEM */
5107 return status;
5108 } else {
5109 status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
5098 if (status) 5110 if (status)
5099 return status; 5111 return status;
5100 } 5112 }
5101 5113 /* Back channel */
5102 tbl = &session->bc_slot_table; 5114 tbl = &ses->bc_slot_table;
5103 if (tbl->slots == NULL) { 5115 if (tbl->slots == NULL) {
5104 status = nfs4_init_slot_table(tbl, 5116 status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
5105 session->bc_attrs.max_reqs, 0);
5106 if (status) 5117 if (status)
5107 nfs4_destroy_slot_tables(session); 5118 /* Fore and back channel share a connection so get
5108 } 5119 * both slot tables or neither */
5109 5120 nfs4_destroy_slot_tables(ses);
5121 } else
5122 status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
5110 return status; 5123 return status;
5111} 5124}
5112 5125
@@ -5294,13 +5307,9 @@ int nfs4_proc_create_session(struct nfs_client *clp)
5294 if (status) 5307 if (status)
5295 goto out; 5308 goto out;
5296 5309
5297 /* Init and reset the fore channel */ 5310 /* Init or reset the session slot tables */
5298 status = nfs4_init_slot_tables(session); 5311 status = nfs4_setup_session_slot_tables(session);
5299 dprintk("slot table initialization returned %d\n", status); 5312 dprintk("slot table setup returned %d\n", status);
5300 if (status)
5301 goto out;
5302 status = nfs4_reset_slot_tables(session);
5303 dprintk("slot table reset returned %d\n", status);
5304 if (status) 5313 if (status)
5305 goto out; 5314 goto out;
5306 5315
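The comment added before __nfs4_get_acl_uncached() describes the standard two-call getxattr dance that this server round-trip caching is tuned for. From user space it is the real getxattr(2) API; the path below is an example mount point:

#include <stdio.h>
#include <stdlib.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/nfs/file";   /* example path */
	ssize_t len;
	char *buf;

	/* Call 1: NULL buffer, size 0 -- just ask for the required length. */
	len = getxattr(path, "system.nfs4_acl", NULL, 0);
	if (len < 0) {
		perror("getxattr");
		return 1;
	}

	buf = malloc(len);
	if (!buf)
		return 1;

	/* Call 2: fetch into a buffer of the advertised size.  With the
	 * caching described above, this is usually served from the cached
	 * page without another round trip. */
	len = getxattr(path, "system.nfs4_acl", buf, len);
	printf("acl is %zd bytes\n", len);
	free(buf);
	return len < 0;
}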
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6a7107ae6b72..a53f33b4ac3a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
49#include <linux/ratelimit.h> 49#include <linux/ratelimit.h>
50#include <linux/workqueue.h> 50#include <linux/workqueue.h>
51#include <linux/bitops.h> 51#include <linux/bitops.h>
52#include <linux/jiffies.h>
52 53
53#include "nfs4_fs.h" 54#include "nfs4_fs.h"
54#include "callback.h" 55#include "callback.h"
@@ -377,31 +378,24 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
377{ 378{
378 struct rb_node **p = &server->state_owners.rb_node, 379 struct rb_node **p = &server->state_owners.rb_node,
379 *parent = NULL; 380 *parent = NULL;
380 struct nfs4_state_owner *sp, *res = NULL; 381 struct nfs4_state_owner *sp;
381 382
382 while (*p != NULL) { 383 while (*p != NULL) {
383 parent = *p; 384 parent = *p;
384 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node); 385 sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
385 386
386 if (server < sp->so_server) {
387 p = &parent->rb_left;
388 continue;
389 }
390 if (server > sp->so_server) {
391 p = &parent->rb_right;
392 continue;
393 }
394 if (cred < sp->so_cred) 387 if (cred < sp->so_cred)
395 p = &parent->rb_left; 388 p = &parent->rb_left;
396 else if (cred > sp->so_cred) 389 else if (cred > sp->so_cred)
397 p = &parent->rb_right; 390 p = &parent->rb_right;
398 else { 391 else {
392 if (!list_empty(&sp->so_lru))
393 list_del_init(&sp->so_lru);
399 atomic_inc(&sp->so_count); 394 atomic_inc(&sp->so_count);
400 res = sp; 395 return sp;
401 break;
402 } 396 }
403 } 397 }
404 return res; 398 return NULL;
405} 399}
406 400
407static struct nfs4_state_owner * 401static struct nfs4_state_owner *
@@ -421,6 +415,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
421 else if (new->so_cred > sp->so_cred) 415 else if (new->so_cred > sp->so_cred)
422 p = &parent->rb_right; 416 p = &parent->rb_right;
423 else { 417 else {
418 if (!list_empty(&sp->so_lru))
419 list_del_init(&sp->so_lru);
424 atomic_inc(&sp->so_count); 420 atomic_inc(&sp->so_count);
425 return sp; 421 return sp;
426 } 422 }
@@ -462,6 +458,7 @@ nfs4_alloc_state_owner(void)
462 spin_lock_init(&sp->so_sequence.lock); 458 spin_lock_init(&sp->so_sequence.lock);
463 INIT_LIST_HEAD(&sp->so_sequence.list); 459 INIT_LIST_HEAD(&sp->so_sequence.list);
464 atomic_set(&sp->so_count, 1); 460 atomic_set(&sp->so_count, 1);
461 INIT_LIST_HEAD(&sp->so_lru);
465 return sp; 462 return sp;
466} 463}
467 464
@@ -479,6 +476,38 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
479 } 476 }
480} 477}
481 478
479static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
480{
481 rpc_destroy_wait_queue(&sp->so_sequence.wait);
482 put_rpccred(sp->so_cred);
483 kfree(sp);
484}
485
486static void nfs4_gc_state_owners(struct nfs_server *server)
487{
488 struct nfs_client *clp = server->nfs_client;
489 struct nfs4_state_owner *sp, *tmp;
490 unsigned long time_min, time_max;
491 LIST_HEAD(doomed);
492
493 spin_lock(&clp->cl_lock);
494 time_max = jiffies;
495 time_min = (long)time_max - (long)clp->cl_lease_time;
496 list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
497 /* NB: LRU is sorted so that oldest is at the head */
498 if (time_in_range(sp->so_expires, time_min, time_max))
499 break;
500 list_move(&sp->so_lru, &doomed);
501 nfs4_remove_state_owner_locked(sp);
502 }
503 spin_unlock(&clp->cl_lock);
504
505 list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
506 list_del(&sp->so_lru);
507 nfs4_free_state_owner(sp);
508 }
509}
510
482/** 511/**
483 * nfs4_get_state_owner - Look up a state owner given a credential 512 * nfs4_get_state_owner - Look up a state owner given a credential
484 * @server: nfs_server to search 513 * @server: nfs_server to search
@@ -496,10 +525,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
496 sp = nfs4_find_state_owner_locked(server, cred); 525 sp = nfs4_find_state_owner_locked(server, cred);
497 spin_unlock(&clp->cl_lock); 526 spin_unlock(&clp->cl_lock);
498 if (sp != NULL) 527 if (sp != NULL)
499 return sp; 528 goto out;
500 new = nfs4_alloc_state_owner(); 529 new = nfs4_alloc_state_owner();
501 if (new == NULL) 530 if (new == NULL)
502 return NULL; 531 goto out;
503 new->so_server = server; 532 new->so_server = server;
504 new->so_cred = cred; 533 new->so_cred = cred;
505 spin_lock(&clp->cl_lock); 534 spin_lock(&clp->cl_lock);
@@ -511,26 +540,58 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
511 rpc_destroy_wait_queue(&new->so_sequence.wait); 540 rpc_destroy_wait_queue(&new->so_sequence.wait);
512 kfree(new); 541 kfree(new);
513 } 542 }
543out:
544 nfs4_gc_state_owners(server);
514 return sp; 545 return sp;
515} 546}
516 547
517/** 548/**
518 * nfs4_put_state_owner - Release a nfs4_state_owner 549 * nfs4_put_state_owner - Release a nfs4_state_owner
519 * @sp: state owner data to release 550 * @sp: state owner data to release
520 *
521 */ 551 */
522void nfs4_put_state_owner(struct nfs4_state_owner *sp) 552void nfs4_put_state_owner(struct nfs4_state_owner *sp)
523{ 553{
524 struct nfs_client *clp = sp->so_server->nfs_client; 554 struct nfs_server *server = sp->so_server;
525 struct rpc_cred *cred = sp->so_cred; 555 struct nfs_client *clp = server->nfs_client;
526 556
527 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) 557 if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
528 return; 558 return;
529 nfs4_remove_state_owner_locked(sp); 559
560 if (!RB_EMPTY_NODE(&sp->so_server_node)) {
561 sp->so_expires = jiffies;
562 list_add_tail(&sp->so_lru, &server->state_owners_lru);
563 spin_unlock(&clp->cl_lock);
564 } else {
565 nfs4_remove_state_owner_locked(sp);
566 spin_unlock(&clp->cl_lock);
567 nfs4_free_state_owner(sp);
568 }
569}
570
571/**
572 * nfs4_purge_state_owners - Release all cached state owners
573 * @server: nfs_server with cached state owners to release
574 *
575 * Called at umount time. Remaining state owners will be on
576 * the LRU with ref count of zero.
577 */
578void nfs4_purge_state_owners(struct nfs_server *server)
579{
580 struct nfs_client *clp = server->nfs_client;
581 struct nfs4_state_owner *sp, *tmp;
582 LIST_HEAD(doomed);
583
584 spin_lock(&clp->cl_lock);
585 list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
586 list_move(&sp->so_lru, &doomed);
587 nfs4_remove_state_owner_locked(sp);
588 }
530 spin_unlock(&clp->cl_lock); 589 spin_unlock(&clp->cl_lock);
531 rpc_destroy_wait_queue(&sp->so_sequence.wait); 590
532 put_rpccred(cred); 591 list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
533 kfree(sp); 592 list_del(&sp->so_lru);
593 nfs4_free_state_owner(sp);
594 }
534} 595}
535 596
536static struct nfs4_state * 597static struct nfs4_state *
@@ -1402,6 +1463,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
1402restart: 1463restart:
1403 rcu_read_lock(); 1464 rcu_read_lock();
1404 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { 1465 list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
1466 nfs4_purge_state_owners(server);
1405 spin_lock(&clp->cl_lock); 1467 spin_lock(&clp->cl_lock);
1406 for (pos = rb_first(&server->state_owners); 1468 for (pos = rb_first(&server->state_owners);
1407 pos != NULL; 1469 pos != NULL;
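nfs4_gc_state_owners() above uses a pattern worth noting: expired entries are unlinked onto a private doomed list while cl_lock is held, and the frees happen only after the lock is dropped, since nfs4_free_state_owner() must not run under cl_lock. A condensed user-space sketch of the same reaping scheme, with jiffies and the lease time replaced by plain integers (the simplified expiry test stands in for time_in_range()):

#include <pthread.h>
#include <stdlib.h>

struct owner {
	long expires;           /* stand-in for sp->so_expires (jiffies) */
	struct owner *next;
};

static struct owner *lru;       /* oldest first, like state_owners_lru */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void gc_owners(long now, long lease_time)
{
	struct owner *doomed = NULL, *sp;

	pthread_mutex_lock(&lock);
	while ((sp = lru) && sp->expires < now - lease_time) {
		/* LRU is sorted oldest-first, so stop at the first entry
		 * still inside its lease window. */
		lru = sp->next;
		sp->next = doomed;
		doomed = sp;
	}
	pthread_mutex_unlock(&lock);

	/* Free outside the lock. */
	while ((sp = doomed)) {
		doomed = sp->next;
		free(sp);
	}
}

int main(void)
{
	struct owner *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

	if (!a || !b)
		return 1;
	a->expires = 10; a->next = b;
	b->expires = 90; b->next = NULL;
	lru = a;

	gc_owners(100, 50);     /* reaps a (expired at 10), keeps b */
	return lru != b;
}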
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e6161b213ed1..95e92e438407 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2298,7 +2298,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
2298 encode_getfh(xdr, &hdr); 2298 encode_getfh(xdr, &hdr);
2299 encode_getfattr(xdr, args->bitmask, &hdr); 2299 encode_getfattr(xdr, args->bitmask, &hdr);
2300 encode_restorefh(xdr, &hdr); 2300 encode_restorefh(xdr, &hdr);
2301 encode_getfattr(xdr, args->bitmask, &hdr); 2301 encode_getfattr(xdr, args->dir_bitmask, &hdr);
2302 encode_nops(&hdr); 2302 encode_nops(&hdr);
2303} 2303}
2304 2304
@@ -2517,11 +2517,13 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
2517 encode_compound_hdr(xdr, req, &hdr); 2517 encode_compound_hdr(xdr, req, &hdr);
2518 encode_sequence(xdr, &args->seq_args, &hdr); 2518 encode_sequence(xdr, &args->seq_args, &hdr);
2519 encode_putfh(xdr, args->fh, &hdr); 2519 encode_putfh(xdr, args->fh, &hdr);
2520 replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1; 2520 replen = hdr.replen + op_decode_hdr_maxsz + 1;
2521 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr); 2521 encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
2522 2522
2523 xdr_inline_pages(&req->rq_rcv_buf, replen << 2, 2523 xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
2524 args->acl_pages, args->acl_pgbase, args->acl_len); 2524 args->acl_pages, args->acl_pgbase, args->acl_len);
2525 xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);
2526
2525 encode_nops(&hdr); 2527 encode_nops(&hdr);
2526} 2528}
2527 2529
@@ -3790,7 +3792,8 @@ out_overflow:
3790} 3792}
3791 3793
3792static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, 3794static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3793 const struct nfs_server *server, uint32_t *uid, int may_sleep) 3795 const struct nfs_server *server, uint32_t *uid,
3796 struct nfs4_string *owner_name)
3794{ 3797{
3795 uint32_t len; 3798 uint32_t len;
3796 __be32 *p; 3799 __be32 *p;
@@ -3807,8 +3810,12 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3807 p = xdr_inline_decode(xdr, len); 3810 p = xdr_inline_decode(xdr, len);
3808 if (unlikely(!p)) 3811 if (unlikely(!p))
3809 goto out_overflow; 3812 goto out_overflow;
3810 if (!may_sleep) { 3813 if (owner_name != NULL) {
3811 /* do nothing */ 3814 owner_name->data = kmemdup(p, len, GFP_NOWAIT);
3815 if (owner_name->data != NULL) {
3816 owner_name->len = len;
3817 ret = NFS_ATTR_FATTR_OWNER_NAME;
3818 }
3812 } else if (len < XDR_MAX_NETOBJ) { 3819 } else if (len < XDR_MAX_NETOBJ) {
3813 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0) 3820 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
3814 ret = NFS_ATTR_FATTR_OWNER; 3821 ret = NFS_ATTR_FATTR_OWNER;
@@ -3828,7 +3835,8 @@ out_overflow:
3828} 3835}
3829 3836
3830static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, 3837static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3831 const struct nfs_server *server, uint32_t *gid, int may_sleep) 3838 const struct nfs_server *server, uint32_t *gid,
3839 struct nfs4_string *group_name)
3832{ 3840{
3833 uint32_t len; 3841 uint32_t len;
3834 __be32 *p; 3842 __be32 *p;
@@ -3845,8 +3853,12 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3845 p = xdr_inline_decode(xdr, len); 3853 p = xdr_inline_decode(xdr, len);
3846 if (unlikely(!p)) 3854 if (unlikely(!p))
3847 goto out_overflow; 3855 goto out_overflow;
3848 if (!may_sleep) { 3856 if (group_name != NULL) {
3849 /* do nothing */ 3857 group_name->data = kmemdup(p, len, GFP_NOWAIT);
3858 if (group_name->data != NULL) {
3859 group_name->len = len;
3860 ret = NFS_ATTR_FATTR_GROUP_NAME;
3861 }
3850 } else if (len < XDR_MAX_NETOBJ) { 3862 } else if (len < XDR_MAX_NETOBJ) {
3851 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0) 3863 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
3852 ret = NFS_ATTR_FATTR_GROUP; 3864 ret = NFS_ATTR_FATTR_GROUP;
@@ -4283,7 +4295,7 @@ xdr_error:
4283 4295
4284static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap, 4296static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4285 struct nfs_fattr *fattr, struct nfs_fh *fh, 4297 struct nfs_fattr *fattr, struct nfs_fh *fh,
4286 const struct nfs_server *server, int may_sleep) 4298 const struct nfs_server *server)
4287{ 4299{
4288 int status; 4300 int status;
4289 umode_t fmode = 0; 4301 umode_t fmode = 0;
@@ -4350,12 +4362,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
4350 goto xdr_error; 4362 goto xdr_error;
4351 fattr->valid |= status; 4363 fattr->valid |= status;
4352 4364
4353 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep); 4365 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name);
4354 if (status < 0) 4366 if (status < 0)
4355 goto xdr_error; 4367 goto xdr_error;
4356 fattr->valid |= status; 4368 fattr->valid |= status;
4357 4369
4358 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep); 4370 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name);
4359 if (status < 0) 4371 if (status < 0)
4360 goto xdr_error; 4372 goto xdr_error;
4361 fattr->valid |= status; 4373 fattr->valid |= status;
@@ -4396,7 +4408,7 @@ xdr_error:
4396} 4408}
4397 4409
4398static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4410static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4399 struct nfs_fh *fh, const struct nfs_server *server, int may_sleep) 4411 struct nfs_fh *fh, const struct nfs_server *server)
4400{ 4412{
4401 __be32 *savep; 4413 __be32 *savep;
4402 uint32_t attrlen, 4414 uint32_t attrlen,
@@ -4415,7 +4427,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
4415 if (status < 0) 4427 if (status < 0)
4416 goto xdr_error; 4428 goto xdr_error;
4417 4429
4418 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep); 4430 status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server);
4419 if (status < 0) 4431 if (status < 0)
4420 goto xdr_error; 4432 goto xdr_error;
4421 4433
@@ -4426,9 +4438,9 @@ xdr_error:
4426} 4438}
4427 4439
4428static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, 4440static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
4429 const struct nfs_server *server, int may_sleep) 4441 const struct nfs_server *server)
4430{ 4442{
4431 return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep); 4443 return decode_getfattr_generic(xdr, fattr, NULL, server);
4432} 4444}
4433 4445
4434/* 4446/*
@@ -4957,17 +4969,18 @@ decode_restorefh(struct xdr_stream *xdr)
4957} 4969}
4958 4970
4959static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req, 4971static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4960 size_t *acl_len) 4972 struct nfs_getaclres *res)
4961{ 4973{
4962 __be32 *savep; 4974 __be32 *savep, *bm_p;
4963 uint32_t attrlen, 4975 uint32_t attrlen,
4964 bitmap[3] = {0}; 4976 bitmap[3] = {0};
4965 struct kvec *iov = req->rq_rcv_buf.head; 4977 struct kvec *iov = req->rq_rcv_buf.head;
4966 int status; 4978 int status;
4967 4979
4968 *acl_len = 0; 4980 res->acl_len = 0;
4969 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0) 4981 if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
4970 goto out; 4982 goto out;
4983 bm_p = xdr->p;
4971 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0) 4984 if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
4972 goto out; 4985 goto out;
4973 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0) 4986 if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
@@ -4979,18 +4992,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
4979 size_t hdrlen; 4992 size_t hdrlen;
4980 u32 recvd; 4993 u32 recvd;
4981 4994
4995 /* The bitmap (xdr len + bitmaps) and the attr xdr len words
4996 * are stored with the acl data to handle the problem of
4997 * variable length bitmaps. */
4998 xdr->p = bm_p;
4999 res->acl_data_offset = be32_to_cpup(bm_p) + 2;
5000 res->acl_data_offset <<= 2;
5001
4982 /* We ignore &savep and don't do consistency checks on 5002 /* We ignore &savep and don't do consistency checks on
4983 * the attr length. Let userspace figure it out.... */ 5003 * the attr length. Let userspace figure it out.... */
4984 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base; 5004 hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
5005 attrlen += res->acl_data_offset;
4985 recvd = req->rq_rcv_buf.len - hdrlen; 5006 recvd = req->rq_rcv_buf.len - hdrlen;
4986 if (attrlen > recvd) { 5007 if (attrlen > recvd) {
4987 dprintk("NFS: server cheating in getattr" 5008 if (res->acl_flags & NFS4_ACL_LEN_REQUEST) {
4988 " acl reply: attrlen %u > recvd %u\n", 5009 /* getxattr interface called with a NULL buf */
5010 res->acl_len = attrlen;
5011 goto out;
5012 }
5013 dprintk("NFS: acl reply: attrlen %u > recvd %u\n",
4989 attrlen, recvd); 5014 attrlen, recvd);
4990 return -EINVAL; 5015 return -EINVAL;
4991 } 5016 }
4992 xdr_read_pages(xdr, attrlen); 5017 xdr_read_pages(xdr, attrlen);
4993 *acl_len = attrlen; 5018 res->acl_len = attrlen;
4994 } else 5019 } else
4995 status = -EOPNOTSUPP; 5020 status = -EOPNOTSUPP;
4996 5021
@@ -5696,8 +5721,7 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
5696 status = decode_open_downgrade(xdr, res); 5721 status = decode_open_downgrade(xdr, res);
5697 if (status != 0) 5722 if (status != 0)
5698 goto out; 5723 goto out;
5699 decode_getfattr(xdr, res->fattr, res->server, 5724 decode_getfattr(xdr, res->fattr, res->server);
5700 !RPC_IS_ASYNC(rqstp->rq_task));
5701out: 5725out:
5702 return status; 5726 return status;
5703} 5727}
@@ -5723,8 +5747,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5723 status = decode_access(xdr, res); 5747 status = decode_access(xdr, res);
5724 if (status != 0) 5748 if (status != 0)
5725 goto out; 5749 goto out;
5726 decode_getfattr(xdr, res->fattr, res->server, 5750 decode_getfattr(xdr, res->fattr, res->server);
5727 !RPC_IS_ASYNC(rqstp->rq_task));
5728out: 5751out:
5729 return status; 5752 return status;
5730} 5753}
@@ -5753,8 +5776,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5753 status = decode_getfh(xdr, res->fh); 5776 status = decode_getfh(xdr, res->fh);
5754 if (status) 5777 if (status)
5755 goto out; 5778 goto out;
5756 status = decode_getfattr(xdr, res->fattr, res->server 5779 status = decode_getfattr(xdr, res->fattr, res->server);
5757 ,!RPC_IS_ASYNC(rqstp->rq_task));
5758out: 5780out:
5759 return status; 5781 return status;
5760} 5782}
@@ -5780,8 +5802,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
5780 goto out; 5802 goto out;
5781 status = decode_getfh(xdr, res->fh); 5803 status = decode_getfh(xdr, res->fh);
5782 if (status == 0) 5804 if (status == 0)
5783 status = decode_getfattr(xdr, res->fattr, res->server, 5805 status = decode_getfattr(xdr, res->fattr, res->server);
5784 !RPC_IS_ASYNC(rqstp->rq_task));
5785out: 5806out:
5786 return status; 5807 return status;
5787} 5808}
@@ -5807,8 +5828,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5807 status = decode_remove(xdr, &res->cinfo); 5828 status = decode_remove(xdr, &res->cinfo);
5808 if (status) 5829 if (status)
5809 goto out; 5830 goto out;
5810 decode_getfattr(xdr, res->dir_attr, res->server, 5831 decode_getfattr(xdr, res->dir_attr, res->server);
5811 !RPC_IS_ASYNC(rqstp->rq_task));
5812out: 5832out:
5813 return status; 5833 return status;
5814} 5834}
@@ -5841,14 +5861,12 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5841 if (status) 5861 if (status)
5842 goto out; 5862 goto out;
5843 /* Current FH is target directory */ 5863 /* Current FH is target directory */
5844 if (decode_getfattr(xdr, res->new_fattr, res->server, 5864 if (decode_getfattr(xdr, res->new_fattr, res->server))
5845 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5846 goto out; 5865 goto out;
5847 status = decode_restorefh(xdr); 5866 status = decode_restorefh(xdr);
5848 if (status) 5867 if (status)
5849 goto out; 5868 goto out;
5850 decode_getfattr(xdr, res->old_fattr, res->server, 5869 decode_getfattr(xdr, res->old_fattr, res->server);
5851 !RPC_IS_ASYNC(rqstp->rq_task));
5852out: 5870out:
5853 return status; 5871 return status;
5854} 5872}
@@ -5884,14 +5902,12 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5884 * Note order: OP_LINK leaves the directory as the current 5902 * Note order: OP_LINK leaves the directory as the current
5885 * filehandle. 5903 * filehandle.
5886 */ 5904 */
5887 if (decode_getfattr(xdr, res->dir_attr, res->server, 5905 if (decode_getfattr(xdr, res->dir_attr, res->server))
5888 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5889 goto out; 5906 goto out;
5890 status = decode_restorefh(xdr); 5907 status = decode_restorefh(xdr);
5891 if (status) 5908 if (status)
5892 goto out; 5909 goto out;
5893 decode_getfattr(xdr, res->fattr, res->server, 5910 decode_getfattr(xdr, res->fattr, res->server);
5894 !RPC_IS_ASYNC(rqstp->rq_task));
5895out: 5911out:
5896 return status; 5912 return status;
5897} 5913}
@@ -5923,14 +5939,12 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5923 status = decode_getfh(xdr, res->fh); 5939 status = decode_getfh(xdr, res->fh);
5924 if (status) 5940 if (status)
5925 goto out; 5941 goto out;
5926 if (decode_getfattr(xdr, res->fattr, res->server, 5942 if (decode_getfattr(xdr, res->fattr, res->server))
5927 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
5928 goto out; 5943 goto out;
5929 status = decode_restorefh(xdr); 5944 status = decode_restorefh(xdr);
5930 if (status) 5945 if (status)
5931 goto out; 5946 goto out;
5932 decode_getfattr(xdr, res->dir_fattr, res->server, 5947 decode_getfattr(xdr, res->dir_fattr, res->server);
5933 !RPC_IS_ASYNC(rqstp->rq_task));
5934out: 5948out:
5935 return status; 5949 return status;
5936} 5950}
@@ -5962,8 +5976,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5962 status = decode_putfh(xdr); 5976 status = decode_putfh(xdr);
5963 if (status) 5977 if (status)
5964 goto out; 5978 goto out;
5965 status = decode_getfattr(xdr, res->fattr, res->server, 5979 status = decode_getfattr(xdr, res->fattr, res->server);
5966 !RPC_IS_ASYNC(rqstp->rq_task));
5967out: 5980out:
5968 return status; 5981 return status;
5969} 5982}
@@ -6028,7 +6041,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6028 status = decode_putfh(xdr); 6041 status = decode_putfh(xdr);
6029 if (status) 6042 if (status)
6030 goto out; 6043 goto out;
6031 status = decode_getacl(xdr, rqstp, &res->acl_len); 6044 status = decode_getacl(xdr, rqstp, res);
6032 6045
6033out: 6046out:
6034 return status; 6047 return status;
@@ -6061,8 +6074,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6061 * an ESTALE error. Shouldn't be a problem, 6074 * an ESTALE error. Shouldn't be a problem,
6062 * though, since fattr->valid will remain unset. 6075 * though, since fattr->valid will remain unset.
6063 */ 6076 */
6064 decode_getfattr(xdr, res->fattr, res->server, 6077 decode_getfattr(xdr, res->fattr, res->server);
6065 !RPC_IS_ASYNC(rqstp->rq_task));
6066out: 6078out:
6067 return status; 6079 return status;
6068} 6080}
@@ -6093,13 +6105,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6093 goto out; 6105 goto out;
6094 if (decode_getfh(xdr, &res->fh) != 0) 6106 if (decode_getfh(xdr, &res->fh) != 0)
6095 goto out; 6107 goto out;
6096 if (decode_getfattr(xdr, res->f_attr, res->server, 6108 if (decode_getfattr(xdr, res->f_attr, res->server) != 0)
6097 !RPC_IS_ASYNC(rqstp->rq_task)) != 0)
6098 goto out; 6109 goto out;
6099 if (decode_restorefh(xdr) != 0) 6110 if (decode_restorefh(xdr) != 0)
6100 goto out; 6111 goto out;
6101 decode_getfattr(xdr, res->dir_attr, res->server, 6112 decode_getfattr(xdr, res->dir_attr, res->server);
6102 !RPC_IS_ASYNC(rqstp->rq_task));
6103out: 6113out:
6104 return status; 6114 return status;
6105} 6115}
@@ -6147,8 +6157,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
6147 status = decode_open(xdr, res); 6157 status = decode_open(xdr, res);
6148 if (status) 6158 if (status)
6149 goto out; 6159 goto out;
6150 decode_getfattr(xdr, res->f_attr, res->server, 6160 decode_getfattr(xdr, res->f_attr, res->server);
6151 !RPC_IS_ASYNC(rqstp->rq_task));
6152out: 6161out:
6153 return status; 6162 return status;
6154} 6163}
@@ -6175,8 +6184,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
6175 status = decode_setattr(xdr); 6184 status = decode_setattr(xdr);
6176 if (status) 6185 if (status)
6177 goto out; 6186 goto out;
6178 decode_getfattr(xdr, res->fattr, res->server, 6187 decode_getfattr(xdr, res->fattr, res->server);
6179 !RPC_IS_ASYNC(rqstp->rq_task));
6180out: 6188out:
6181 return status; 6189 return status;
6182} 6190}
@@ -6356,8 +6364,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6356 if (status) 6364 if (status)
6357 goto out; 6365 goto out;
6358 if (res->fattr) 6366 if (res->fattr)
6359 decode_getfattr(xdr, res->fattr, res->server, 6367 decode_getfattr(xdr, res->fattr, res->server);
6360 !RPC_IS_ASYNC(rqstp->rq_task));
6361 if (!status) 6368 if (!status)
6362 status = res->count; 6369 status = res->count;
6363out: 6370out:
@@ -6386,8 +6393,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
6386 if (status) 6393 if (status)
6387 goto out; 6394 goto out;
6388 if (res->fattr) 6395 if (res->fattr)
6389 decode_getfattr(xdr, res->fattr, res->server, 6396 decode_getfattr(xdr, res->fattr, res->server);
6390 !RPC_IS_ASYNC(rqstp->rq_task));
6391out: 6397out:
6392 return status; 6398 return status;
6393} 6399}
@@ -6546,8 +6552,7 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
6546 status = decode_delegreturn(xdr); 6552 status = decode_delegreturn(xdr);
6547 if (status != 0) 6553 if (status != 0)
6548 goto out; 6554 goto out;
6549 decode_getfattr(xdr, res->fattr, res->server, 6555 decode_getfattr(xdr, res->fattr, res->server);
6550 !RPC_IS_ASYNC(rqstp->rq_task));
6551out: 6556out:
6552 return status; 6557 return status;
6553} 6558}
@@ -6576,8 +6581,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
6576 goto out; 6581 goto out;
6577 xdr_enter_page(xdr, PAGE_SIZE); 6582 xdr_enter_page(xdr, PAGE_SIZE);
6578 status = decode_getfattr(xdr, &res->fs_locations->fattr, 6583 status = decode_getfattr(xdr, &res->fs_locations->fattr,
6579 res->fs_locations->server, 6584 res->fs_locations->server);
6580 !RPC_IS_ASYNC(req->rq_task));
6581out: 6585out:
6582 return status; 6586 return status;
6583} 6587}
@@ -6826,8 +6830,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
6826 status = decode_layoutcommit(xdr, rqstp, res); 6830 status = decode_layoutcommit(xdr, rqstp, res);
6827 if (status) 6831 if (status)
6828 goto out; 6832 goto out;
6829 decode_getfattr(xdr, res->fattr, res->server, 6833 decode_getfattr(xdr, res->fattr, res->server);
6830 !RPC_IS_ASYNC(rqstp->rq_task));
6831out: 6834out:
6832 return status; 6835 return status;
6833} 6836}
@@ -6958,7 +6961,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
6958 goto out_overflow; 6961 goto out_overflow;
6959 6962
6960 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh, 6963 if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
6961 entry->server, 1) < 0) 6964 entry->server) < 0)
6962 goto out_overflow; 6965 goto out_overflow;
6963 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) 6966 if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
6964 entry->ino = entry->fattr->mounted_on_fileid; 6967 entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c807ab93140e..55d01280a609 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -551,7 +551,8 @@ static const struct nfs_pageio_ops objio_pg_write_ops = {
551static struct pnfs_layoutdriver_type objlayout_type = { 551static struct pnfs_layoutdriver_type objlayout_type = {
552 .id = LAYOUT_OSD2_OBJECTS, 552 .id = LAYOUT_OSD2_OBJECTS,
553 .name = "LAYOUT_OSD2_OBJECTS", 553 .name = "LAYOUT_OSD2_OBJECTS",
554 .flags = PNFS_LAYOUTRET_ON_SETATTR, 554 .flags = PNFS_LAYOUTRET_ON_SETATTR |
555 PNFS_LAYOUTRET_ON_ERROR,
555 556
556 .alloc_layout_hdr = objlayout_alloc_layout_hdr, 557 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
557 .free_layout_hdr = objlayout_free_layout_hdr, 558 .free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 72074e3a04f9..b3c29039f5b8 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -254,6 +254,8 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
254 oir->status = rdata->task.tk_status = status; 254 oir->status = rdata->task.tk_status = status;
255 if (status >= 0) 255 if (status >= 0)
256 rdata->res.count = status; 256 rdata->res.count = status;
257 else
258 rdata->pnfs_error = status;
257 objlayout_iodone(oir); 259 objlayout_iodone(oir);
258 /* must not use oir after this point */ 260 /* must not use oir after this point */
259 261
@@ -334,6 +336,8 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
334 if (status >= 0) { 336 if (status >= 0) {
335 wdata->res.count = status; 337 wdata->res.count = status;
336 wdata->verf.committed = oir->committed; 338 wdata->verf.committed = oir->committed;
339 } else {
340 wdata->pnfs_error = status;
337 } 341 }
338 objlayout_iodone(oir); 342 objlayout_iodone(oir);
339 /* must not use oir after this point */ 343 /* must not use oir after this point */
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8e672a2b2d69..17149a490065 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1166,6 +1166,33 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1166} 1166}
1167EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); 1167EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1168 1168
1169static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head)
1170{
1171 struct nfs_pageio_descriptor pgio;
1172 LIST_HEAD(failed);
1173
1174 /* Resend all requests through the MDS */
1175 nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE);
1176 while (!list_empty(head)) {
1177 struct nfs_page *req = nfs_list_entry(head->next);
1178
1179 nfs_list_remove_request(req);
1180 if (!nfs_pageio_add_request(&pgio, req))
1181 nfs_list_add_request(req, &failed);
1182 }
1183 nfs_pageio_complete(&pgio);
1184
1185 if (!list_empty(&failed)) {
 1186 /* For some reason our attempt to resend pages through the MDS
 1187 * failed. Mark the overall send request as having failed, and
 1188 * let nfs_writeback_release_full deal with the error.
 1189 */
1190 list_move(&failed, head);
1191 return -EIO;
1192 }
1193 return 0;
1194}
1195
1169/* 1196/*
1170 * Called by non rpc-based layout drivers 1197 * Called by non rpc-based layout drivers
1171 */ 1198 */
@@ -1175,9 +1202,17 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
1175 pnfs_set_layoutcommit(data); 1202 pnfs_set_layoutcommit(data);
1176 data->mds_ops->rpc_call_done(&data->task, data); 1203 data->mds_ops->rpc_call_done(&data->task, data);
1177 } else { 1204 } else {
1178 put_lseg(data->lseg);
1179 data->lseg = NULL;
1180 dprintk("pnfs write error = %d\n", data->pnfs_error); 1205 dprintk("pnfs write error = %d\n", data->pnfs_error);
1206 if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
1207 PNFS_LAYOUTRET_ON_ERROR) {
 1208 /* Don't layoutcommit on error; the server will need to
 1209 * perform file recovery.
 1210 */
1211 clear_bit(NFS_INO_LAYOUTCOMMIT,
1212 &NFS_I(data->inode)->flags);
1213 pnfs_return_layout(data->inode);
1214 }
1215 data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
1181 } 1216 }
1182 data->mds_ops->rpc_release(data); 1217 data->mds_ops->rpc_release(data);
1183} 1218}
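
pnfs_write_done_resend_to_mds() above walks the request list, moves anything that cannot be re-queued onto a local failed list, and splices the failures back for the caller. The same shape in a standalone sketch (struct req, submit(), and the literal error value are stand-ins, not kernel APIs):

#include <stdbool.h>
#include <stddef.h>

struct req { struct req *next; int id; };

static bool submit(struct req *r) { return r->id >= 0; }	/* stand-in */

static int resend_all(struct req **head)
{
	struct req *failed = NULL, *r;

	while ((r = *head) != NULL) {
		*head = r->next;	/* nfs_list_remove_request() */
		if (!submit(r)) {	/* nfs_pageio_add_request() failed */
			r->next = failed;
			failed = r;	/* nfs_list_add_request(req, &failed) */
		}
	}
	if (failed) {
		*head = failed;		/* list_move(&failed, head) */
		return -5;		/* -EIO */
	}
	return 0;
}
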
@@ -1267,6 +1302,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
1267 put_lseg(data->lseg); 1302 put_lseg(data->lseg);
1268 data->lseg = NULL; 1303 data->lseg = NULL;
1269 dprintk("pnfs write error = %d\n", data->pnfs_error); 1304 dprintk("pnfs write error = %d\n", data->pnfs_error);
1305 if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
1306 PNFS_LAYOUTRET_ON_ERROR)
1307 pnfs_return_layout(data->inode);
1270 1308
1271 nfs_pageio_init_read_mds(&pgio, data->inode); 1309 nfs_pageio_init_read_mds(&pgio, data->inode);
1272 1310
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1509530cb111..53d593a0a4f2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -68,6 +68,7 @@ enum {
68enum layoutdriver_policy_flags { 68enum layoutdriver_policy_flags {
69 /* Should the pNFS client commit and return the layout upon a setattr */ 69 /* Should the pNFS client commit and return the layout upon a setattr */
70 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, 70 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
71 PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
71}; 72};
72 73
73struct nfs4_deviceid_node; 74struct nfs4_deviceid_node;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e463967aafb8..3dfa4f112c0a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -908,10 +908,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
908 data->auth_flavor_len = 1; 908 data->auth_flavor_len = 1;
909 data->version = version; 909 data->version = version;
910 data->minorversion = 0; 910 data->minorversion = 0;
911 security_init_mnt_opts(&data->lsm_opts);
911 } 912 }
912 return data; 913 return data;
913} 914}
914 915
916static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data)
917{
918 if (data) {
919 kfree(data->client_address);
920 kfree(data->mount_server.hostname);
921 kfree(data->nfs_server.export_path);
922 kfree(data->nfs_server.hostname);
923 kfree(data->fscache_uniq);
924 security_free_mnt_opts(&data->lsm_opts);
925 kfree(data);
926 }
927}
928
915/* 929/*
916 * Sanity-check a server address provided by the mount command. 930 * Sanity-check a server address provided by the mount command.
917 * 931 *
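
nfs_free_parsed_mount_data() above folds four scattered cleanup paths into one helper that, like kfree(), accepts NULL so every exit label can call it unconditionally. The same idiom in a standalone sketch (the struct is a stand-in):

#include <stdlib.h>

struct parsed_opts {
	char *host;
	char *path;
};

static void free_parsed_opts(struct parsed_opts *o)
{
	if (o) {
		free(o->host);	/* free(NULL) is a no-op, like kfree(NULL) */
		free(o->path);
		free(o);
	}
}
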
@@ -2219,9 +2233,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2219 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); 2233 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2220 mntfh = nfs_alloc_fhandle(); 2234 mntfh = nfs_alloc_fhandle();
2221 if (data == NULL || mntfh == NULL) 2235 if (data == NULL || mntfh == NULL)
2222 goto out_free_fh; 2236 goto out;
2223
2224 security_init_mnt_opts(&data->lsm_opts);
2225 2237
2226 /* Validate the mount data */ 2238 /* Validate the mount data */
2227 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); 2239 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
@@ -2233,8 +2245,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2233#ifdef CONFIG_NFS_V4 2245#ifdef CONFIG_NFS_V4
2234 if (data->version == 4) { 2246 if (data->version == 4) {
2235 mntroot = nfs4_try_mount(flags, dev_name, data); 2247 mntroot = nfs4_try_mount(flags, dev_name, data);
2236 kfree(data->client_address);
2237 kfree(data->nfs_server.export_path);
2238 goto out; 2248 goto out;
2239 } 2249 }
2240#endif /* CONFIG_NFS_V4 */ 2250#endif /* CONFIG_NFS_V4 */
@@ -2289,13 +2299,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2289 s->s_flags |= MS_ACTIVE; 2299 s->s_flags |= MS_ACTIVE;
2290 2300
2291out: 2301out:
2292 kfree(data->nfs_server.hostname); 2302 nfs_free_parsed_mount_data(data);
2293 kfree(data->mount_server.hostname);
2294 kfree(data->fscache_uniq);
2295 security_free_mnt_opts(&data->lsm_opts);
2296out_free_fh:
2297 nfs_free_fhandle(mntfh); 2303 nfs_free_fhandle(mntfh);
2298 kfree(data);
2299 return mntroot; 2304 return mntroot;
2300 2305
2301out_err_nosb: 2306out_err_nosb:
@@ -2622,9 +2627,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2622 2627
2623 mntfh = nfs_alloc_fhandle(); 2628 mntfh = nfs_alloc_fhandle();
2624 if (data == NULL || mntfh == NULL) 2629 if (data == NULL || mntfh == NULL)
2625 goto out_free_fh; 2630 goto out;
2626
2627 security_init_mnt_opts(&data->lsm_opts);
2628 2631
2629 /* Get a volume representation */ 2632 /* Get a volume representation */
2630 server = nfs4_create_server(data, mntfh); 2633 server = nfs4_create_server(data, mntfh);
@@ -2676,13 +2679,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2676 2679
2677 s->s_flags |= MS_ACTIVE; 2680 s->s_flags |= MS_ACTIVE;
2678 2681
2679 security_free_mnt_opts(&data->lsm_opts);
2680 nfs_free_fhandle(mntfh); 2682 nfs_free_fhandle(mntfh);
2681 return mntroot; 2683 return mntroot;
2682 2684
2683out: 2685out:
2684 security_free_mnt_opts(&data->lsm_opts);
2685out_free_fh:
2686 nfs_free_fhandle(mntfh); 2686 nfs_free_fhandle(mntfh);
2687 return ERR_PTR(error); 2687 return ERR_PTR(error);
2688 2688
@@ -2839,7 +2839,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2839 2839
2840 data = nfs_alloc_parsed_mount_data(4); 2840 data = nfs_alloc_parsed_mount_data(4);
2841 if (data == NULL) 2841 if (data == NULL)
2842 goto out_free_data; 2842 goto out;
2843 2843
2844 /* Validate the mount data */ 2844 /* Validate the mount data */
2845 error = nfs4_validate_mount_data(raw_data, data, dev_name); 2845 error = nfs4_validate_mount_data(raw_data, data, dev_name);
@@ -2853,12 +2853,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2853 error = PTR_ERR(res); 2853 error = PTR_ERR(res);
2854 2854
2855out: 2855out:
2856 kfree(data->client_address); 2856 nfs_free_parsed_mount_data(data);
2857 kfree(data->nfs_server.export_path);
2858 kfree(data->nfs_server.hostname);
2859 kfree(data->fscache_uniq);
2860out_free_data:
2861 kfree(data);
2862 dprintk("<-- nfs4_mount() = %d%s\n", error, 2857 dprintk("<-- nfs4_mount() = %d%s\n", error,
2863 error != 0 ? " [error]" : ""); 2858 error != 0 ? " [error]" : "");
2864 return res; 2859 return res;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1dda78db6a73..834f0fe96f89 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1052,7 +1052,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
1052 .pg_doio = nfs_generic_pg_writepages, 1052 .pg_doio = nfs_generic_pg_writepages,
1053}; 1053};
1054 1054
1055static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio, 1055void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
1056 struct inode *inode, int ioflags) 1056 struct inode *inode, int ioflags)
1057{ 1057{
1058 nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, 1058 nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
@@ -1166,13 +1166,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
1166static void nfs_writeback_release_full(void *calldata) 1166static void nfs_writeback_release_full(void *calldata)
1167{ 1167{
1168 struct nfs_write_data *data = calldata; 1168 struct nfs_write_data *data = calldata;
1169 int ret, status = data->task.tk_status; 1169 int status = data->task.tk_status;
1170 struct nfs_pageio_descriptor pgio;
1171
1172 if (data->pnfs_error) {
1173 nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
1174 pgio.pg_recoalesce = 1;
1175 }
1176 1170
1177 /* Update attributes as result of writeback. */ 1171 /* Update attributes as result of writeback. */
1178 while (!list_empty(&data->pages)) { 1172 while (!list_empty(&data->pages)) {
@@ -1188,11 +1182,6 @@ static void nfs_writeback_release_full(void *calldata)
1188 req->wb_bytes, 1182 req->wb_bytes,
1189 (long long)req_offset(req)); 1183 (long long)req_offset(req));
1190 1184
1191 if (data->pnfs_error) {
1192 dprintk(", pnfs error = %d\n", data->pnfs_error);
1193 goto next;
1194 }
1195
1196 if (status < 0) { 1185 if (status < 0) {
1197 nfs_set_pageerror(page); 1186 nfs_set_pageerror(page);
1198 nfs_context_set_write_error(req->wb_context, status); 1187 nfs_context_set_write_error(req->wb_context, status);
@@ -1212,19 +1201,7 @@ remove_request:
1212 next: 1201 next:
1213 nfs_clear_page_tag_locked(req); 1202 nfs_clear_page_tag_locked(req);
1214 nfs_end_page_writeback(page); 1203 nfs_end_page_writeback(page);
1215 if (data->pnfs_error) {
1216 lock_page(page);
1217 nfs_pageio_cond_complete(&pgio, page->index);
1218 ret = nfs_page_async_flush(&pgio, page, 0);
1219 if (ret) {
1220 nfs_set_pageerror(page);
1221 dprintk("rewrite to MDS error = %d\n", ret);
1222 }
1223 unlock_page(page);
1224 }
1225 } 1204 }
1226 if (data->pnfs_error)
1227 nfs_pageio_complete(&pgio);
1228 nfs_writedata_release(calldata); 1205 nfs_writedata_release(calldata);
1229} 1206}
1230 1207
@@ -1711,7 +1688,7 @@ out_error:
1711 1688
1712#ifdef CONFIG_MIGRATION 1689#ifdef CONFIG_MIGRATION
1713int nfs_migrate_page(struct address_space *mapping, struct page *newpage, 1690int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1714 struct page *page) 1691 struct page *page, enum migrate_mode mode)
1715{ 1692{
1716 /* 1693 /*
1717 * If PagePrivate is set, then the page is currently associated with 1694 * If PagePrivate is set, then the page is currently associated with
@@ -1726,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1726 1703
1727 nfs_fscache_release_page(page, GFP_KERNEL); 1704 nfs_fscache_release_page(page, GFP_KERNEL);
1728 1705
1729 return migrate_page(mapping, newpage, page); 1706 return migrate_page(mapping, newpage, page, mode);
1730} 1707}
1731#endif 1708#endif
1732 1709
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 10e6366608f2..8df1ea4a6ff9 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -80,3 +80,13 @@ config NFSD_V4
80 available from http://linux-nfs.org/. 80 available from http://linux-nfs.org/.
81 81
82 If unsure, say N. 82 If unsure, say N.
83
84config NFSD_FAULT_INJECTION
85 bool "NFS server manual fault injection"
86 depends on NFSD_V4 && DEBUG_KERNEL
87 help
88 This option enables support for manually injecting faults
89 into the NFS server. This is intended to be used for
90 testing error recovery on the NFS client.
91
92 If unsure, say N.
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9b118ee20193..af32ef06b4fe 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_NFSD) += nfsd.o
6 6
7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ 7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o 8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
9nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
9nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o 10nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
10nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o 11nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
11nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 12nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 62f3b9074e84..cf8a6bd062fa 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -87,7 +87,7 @@ static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
87 struct svc_expkey key; 87 struct svc_expkey key;
88 struct svc_expkey *ek = NULL; 88 struct svc_expkey *ek = NULL;
89 89
90 if (mesg[mlen-1] != '\n') 90 if (mlen < 1 || mesg[mlen-1] != '\n')
91 return -EINVAL; 91 return -EINVAL;
92 mesg[mlen-1] = 0; 92 mesg[mlen-1] = 0;
93 93
@@ -1226,12 +1226,12 @@ nfsd_export_init(void)
1226 int rv; 1226 int rv;
1227 dprintk("nfsd: initializing export module.\n"); 1227 dprintk("nfsd: initializing export module.\n");
1228 1228
1229 rv = cache_register(&svc_export_cache); 1229 rv = cache_register_net(&svc_export_cache, &init_net);
1230 if (rv) 1230 if (rv)
1231 return rv; 1231 return rv;
1232 rv = cache_register(&svc_expkey_cache); 1232 rv = cache_register_net(&svc_expkey_cache, &init_net);
1233 if (rv) 1233 if (rv)
1234 cache_unregister(&svc_export_cache); 1234 cache_unregister_net(&svc_export_cache, &init_net);
1235 return rv; 1235 return rv;
1236 1236
1237} 1237}
@@ -1255,8 +1255,8 @@ nfsd_export_shutdown(void)
1255 1255
1256 dprintk("nfsd: shutting down export module.\n"); 1256 dprintk("nfsd: shutting down export module.\n");
1257 1257
1258 cache_unregister(&svc_expkey_cache); 1258 cache_unregister_net(&svc_expkey_cache, &init_net);
1259 cache_unregister(&svc_export_cache); 1259 cache_unregister_net(&svc_export_cache, &init_net);
1260 svcauth_unix_purge(); 1260 svcauth_unix_purge();
1261 1261
1262 dprintk("nfsd: export shutdown complete.\n"); 1262 dprintk("nfsd: export shutdown complete.\n");
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
new file mode 100644
index 000000000000..ce7f0758d84c
--- /dev/null
+++ b/fs/nfsd/fault_inject.c
@@ -0,0 +1,91 @@
1/*
2 * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
3 *
4 * Uses debugfs to create fault injection points for client testing
5 */
6
7#include <linux/types.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/module.h>
11
12#include "state.h"
13#include "fault_inject.h"
14
15struct nfsd_fault_inject_op {
16 char *file;
17 void (*func)(u64);
18};
19
20static struct nfsd_fault_inject_op inject_ops[] = {
21 {
22 .file = "forget_clients",
23 .func = nfsd_forget_clients,
24 },
25 {
26 .file = "forget_locks",
27 .func = nfsd_forget_locks,
28 },
29 {
30 .file = "forget_openowners",
31 .func = nfsd_forget_openowners,
32 },
33 {
34 .file = "forget_delegations",
35 .func = nfsd_forget_delegations,
36 },
37 {
38 .file = "recall_delegations",
39 .func = nfsd_recall_delegations,
40 },
41};
42
43static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
44static struct dentry *debug_dir;
45
46static int nfsd_inject_set(void *op_ptr, u64 val)
47{
48 struct nfsd_fault_inject_op *op = op_ptr;
49
50 if (val == 0)
51 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
52 else
53 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
54
55 op->func(val);
56 return 0;
57}
58
59static int nfsd_inject_get(void *data, u64 *val)
60{
61 return 0;
62}
63
64DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n");
65
66void nfsd_fault_inject_cleanup(void)
67{
68 debugfs_remove_recursive(debug_dir);
69}
70
71int nfsd_fault_inject_init(void)
72{
73 unsigned int i;
74 struct nfsd_fault_inject_op *op;
75 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
76
77 debug_dir = debugfs_create_dir("nfsd", NULL);
78 if (!debug_dir)
79 goto fail;
80
81 for (i = 0; i < NUM_INJECT_OPS; i++) {
82 op = &inject_ops[i];
83 if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd))
84 goto fail;
85 }
86 return 0;
87
88fail:
89 nfsd_fault_inject_cleanup();
90 return -ENOMEM;
91}
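
Once CONFIG_NFSD_FAULT_INJECTION is enabled, each op above appears as a writable file under debugfs; writing 0 acts on all matching state, writing n on up to n entries. A sketch of triggering one injection point from userspace, assuming debugfs is mounted at the conventional /sys/kernel/debug:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/nfsd/forget_delegations", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%llu\n", 2ULL);	/* act on up to two delegations */
	fclose(f);
	return 0;
}
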
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
new file mode 100644
index 000000000000..90bd0570956c
--- /dev/null
+++ b/fs/nfsd/fault_inject.h
@@ -0,0 +1,28 @@
1/*
2 * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
3 *
4 * Function definitions for fault injection
5 */
6
7#ifndef LINUX_NFSD_FAULT_INJECT_H
8#define LINUX_NFSD_FAULT_INJECT_H
9
10#ifdef CONFIG_NFSD_FAULT_INJECTION
11int nfsd_fault_inject_init(void);
12void nfsd_fault_inject_cleanup(void);
13void nfsd_forget_clients(u64);
14void nfsd_forget_locks(u64);
15void nfsd_forget_openowners(u64);
16void nfsd_forget_delegations(u64);
17void nfsd_recall_delegations(u64);
18#else /* CONFIG_NFSD_FAULT_INJECTION */
19static inline int nfsd_fault_inject_init(void) { return 0; }
20static inline void nfsd_fault_inject_cleanup(void) {}
21static inline void nfsd_forget_clients(u64 num) {}
22static inline void nfsd_forget_locks(u64 num) {}
23static inline void nfsd_forget_openowners(u64 num) {}
24static inline void nfsd_forget_delegations(u64 num) {}
25static inline void nfsd_recall_delegations(u64 num) {}
26#endif /* CONFIG_NFSD_FAULT_INJECTION */
27
28#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7748d6a18d97..6f3ebb48b12f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -718,7 +718,7 @@ int set_callback_cred(void)
718{ 718{
719 if (callback_cred) 719 if (callback_cred)
720 return 0; 720 return 0;
721 callback_cred = rpc_lookup_machine_cred(); 721 callback_cred = rpc_lookup_machine_cred("nfs");
722 if (!callback_cred) 722 if (!callback_cred)
723 return -ENOMEM; 723 return -ENOMEM;
724 return 0; 724 return 0;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 55780a22fdbd..94096273cd6c 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -36,6 +36,7 @@
36#include <linux/seq_file.h> 36#include <linux/seq_file.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/slab.h> 38#include <linux/slab.h>
39#include <net/net_namespace.h>
39#include "idmap.h" 40#include "idmap.h"
40#include "nfsd.h" 41#include "nfsd.h"
41 42
@@ -466,20 +467,20 @@ nfsd_idmap_init(void)
466{ 467{
467 int rv; 468 int rv;
468 469
469 rv = cache_register(&idtoname_cache); 470 rv = cache_register_net(&idtoname_cache, &init_net);
470 if (rv) 471 if (rv)
471 return rv; 472 return rv;
472 rv = cache_register(&nametoid_cache); 473 rv = cache_register_net(&nametoid_cache, &init_net);
473 if (rv) 474 if (rv)
474 cache_unregister(&idtoname_cache); 475 cache_unregister_net(&idtoname_cache, &init_net);
475 return rv; 476 return rv;
476} 477}
477 478
478void 479void
479nfsd_idmap_shutdown(void) 480nfsd_idmap_shutdown(void)
480{ 481{
481 cache_unregister(&idtoname_cache); 482 cache_unregister_net(&idtoname_cache, &init_net);
482 cache_unregister(&nametoid_cache); 483 cache_unregister_net(&nametoid_cache, &init_net);
483} 484}
484 485
485static int 486static int
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c5e28ed8bca0..896da74ec563 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -266,10 +266,6 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
266{ 266{
267 __be32 status; 267 __be32 status;
268 268
269 /* Only reclaims from previously confirmed clients are valid */
270 if ((status = nfs4_check_open_reclaim(&open->op_clientid)))
271 return status;
272
273 /* We don't know the target directory, and therefore can not 269 /* We don't know the target directory, and therefore can not
274 * set the change info 270 * set the change info
275 */ 271 */
@@ -373,6 +369,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
373 break; 369 break;
374 case NFS4_OPEN_CLAIM_PREVIOUS: 370 case NFS4_OPEN_CLAIM_PREVIOUS:
375 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 371 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
372 status = nfs4_check_open_reclaim(&open->op_clientid);
373 if (status)
374 goto out;
376 case NFS4_OPEN_CLAIM_FH: 375 case NFS4_OPEN_CLAIM_FH:
377 case NFS4_OPEN_CLAIM_DELEG_CUR_FH: 376 case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
378 status = do_open_fhandle(rqstp, &cstate->current_fh, 377 status = do_open_fhandle(rqstp, &cstate->current_fh,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 80a0be9ed008..0b3e875d1abd 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -117,8 +117,7 @@ out_no_tfm:
117 return status; 117 return status;
118} 118}
119 119
120int 120void nfsd4_create_clid_dir(struct nfs4_client *clp)
121nfsd4_create_clid_dir(struct nfs4_client *clp)
122{ 121{
123 const struct cred *original_cred; 122 const struct cred *original_cred;
124 char *dname = clp->cl_recdir; 123 char *dname = clp->cl_recdir;
@@ -127,13 +126,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
127 126
128 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 127 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
129 128
130 if (!rec_file || clp->cl_firststate) 129 if (clp->cl_firststate)
131 return 0; 130 return;
132
133 clp->cl_firststate = 1; 131 clp->cl_firststate = 1;
132 if (!rec_file)
133 return;
134 status = nfs4_save_creds(&original_cred); 134 status = nfs4_save_creds(&original_cred);
135 if (status < 0) 135 if (status < 0)
136 return status; 136 return;
137 137
138 dir = rec_file->f_path.dentry; 138 dir = rec_file->f_path.dentry;
139 /* lock the parent */ 139 /* lock the parent */
@@ -144,8 +144,15 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
144 status = PTR_ERR(dentry); 144 status = PTR_ERR(dentry);
145 goto out_unlock; 145 goto out_unlock;
146 } 146 }
147 status = -EEXIST;
148 if (dentry->d_inode) 147 if (dentry->d_inode)
148 /*
149 * In the 4.1 case, where we're called from
150 * reclaim_complete(), records from the previous reboot
151 * may still be left, so this is OK.
152 *
153 * In the 4.0 case, we should never get here; but we may
154 * as well be forgiving and just succeed silently.
155 */
149 goto out_put; 156 goto out_put;
150 status = mnt_want_write_file(rec_file); 157 status = mnt_want_write_file(rec_file);
151 if (status) 158 if (status)
@@ -164,7 +171,6 @@ out_unlock:
164 " and is writeable", status, 171 " and is writeable", status,
165 user_recovery_dirname); 172 user_recovery_dirname);
166 nfs4_reset_creds(original_cred); 173 nfs4_reset_creds(original_cred);
167 return status;
168} 174}
169 175
170typedef int (recdir_func)(struct dentry *, struct dentry *); 176typedef int (recdir_func)(struct dentry *, struct dentry *);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 9ca16dc09e04..e8c98f009670 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -49,12 +49,20 @@
49time_t nfsd4_lease = 90; /* default lease time */ 49time_t nfsd4_lease = 90; /* default lease time */
50time_t nfsd4_grace = 90; 50time_t nfsd4_grace = 90;
51static time_t boot_time; 51static time_t boot_time;
52static stateid_t zerostateid; /* bits all 0 */ 52
53static stateid_t onestateid; /* bits all 1 */ 53#define all_ones {{~0,~0},~0}
54static const stateid_t one_stateid = {
55 .si_generation = ~0,
56 .si_opaque = all_ones,
57};
58static const stateid_t zero_stateid = {
59 /* all fields zero */
60};
61
54static u64 current_sessionid = 1; 62static u64 current_sessionid = 1;
55 63
56#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 64#define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t)))
57#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 65#define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t)))
58 66
59/* forward declarations */ 67/* forward declarations */
60static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); 68static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner);
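
The memset() in nfs4_state_init() goes away because the special stateids above become compile-time constants built with designated initializers. A standalone sketch of the same trick (field names shortened; not the kernel's struct layout):

#include <assert.h>
#include <string.h>

typedef struct {
	unsigned gen;
	unsigned opaque[3];
} stateid_t;

#define all_ones { ~0u, ~0u, ~0u }

static const stateid_t one_stateid = {
	.gen	= ~0u,
	.opaque	= all_ones,
};
static const stateid_t zero_stateid = { 0 };	/* all fields zero */

int main(void)
{
	stateid_t ones;

	memset(&ones, 0xff, sizeof(ones));	/* the old runtime init */
	assert(memcmp(&one_stateid, &ones, sizeof(ones)) == 0);
	assert(zero_stateid.gen == 0);
	return 0;
}
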
@@ -133,21 +141,21 @@ unsigned int max_delegations;
133 * Open owner state (share locks) 141 * Open owner state (share locks)
134 */ 142 */
135 143
136/* hash tables for open owners */ 144/* hash tables for lock and open owners */
137#define OPEN_OWNER_HASH_BITS 8 145#define OWNER_HASH_BITS 8
138#define OPEN_OWNER_HASH_SIZE (1 << OPEN_OWNER_HASH_BITS) 146#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
139#define OPEN_OWNER_HASH_MASK (OPEN_OWNER_HASH_SIZE - 1) 147#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
140 148
141static unsigned int open_ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) 149static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
142{ 150{
143 unsigned int ret; 151 unsigned int ret;
144 152
145 ret = opaque_hashval(ownername->data, ownername->len); 153 ret = opaque_hashval(ownername->data, ownername->len);
146 ret += clientid; 154 ret += clientid;
147 return ret & OPEN_OWNER_HASH_MASK; 155 return ret & OWNER_HASH_MASK;
148} 156}
149 157
150static struct list_head open_ownerstr_hashtbl[OPEN_OWNER_HASH_SIZE]; 158static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
151 159
152/* hash table for nfs4_file */ 160/* hash table for nfs4_file */
153#define FILE_HASH_BITS 8 161#define FILE_HASH_BITS 8
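
With open owners and lock owners now sharing one table, the bucket index above is (hash of the opaque owner string + client id) masked to the table size. A standalone sketch with a stand-in byte hash (the kernel's opaque_hashval differs):

#include <stddef.h>

#define OWNER_HASH_BITS		8
#define OWNER_HASH_SIZE		(1 << OWNER_HASH_BITS)
#define OWNER_HASH_MASK		(OWNER_HASH_SIZE - 1)

/* Stand-in byte hash, for illustration only. */
static unsigned int byte_hash(const unsigned char *p, size_t len)
{
	unsigned int h = 0;

	while (len--)
		h = h * 31 + *p++;
	return h;
}

static unsigned int owner_bucket(unsigned int clientid,
				 const unsigned char *owner, size_t len)
{
	return (byte_hash(owner, len) + clientid) & OWNER_HASH_MASK;
}
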
@@ -514,6 +522,7 @@ static void unhash_lockowner(struct nfs4_lockowner *lo)
514 522
515 list_del(&lo->lo_owner.so_strhash); 523 list_del(&lo->lo_owner.so_strhash);
516 list_del(&lo->lo_perstateid); 524 list_del(&lo->lo_perstateid);
525 list_del(&lo->lo_owner_ino_hash);
517 while (!list_empty(&lo->lo_owner.so_stateids)) { 526 while (!list_empty(&lo->lo_owner.so_stateids)) {
518 stp = list_first_entry(&lo->lo_owner.so_stateids, 527 stp = list_first_entry(&lo->lo_owner.so_stateids,
519 struct nfs4_ol_stateid, st_perstateowner); 528 struct nfs4_ol_stateid, st_perstateowner);
@@ -985,12 +994,11 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
985 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); 994 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
986 if (clp == NULL) 995 if (clp == NULL)
987 return NULL; 996 return NULL;
988 clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); 997 clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
989 if (clp->cl_name.data == NULL) { 998 if (clp->cl_name.data == NULL) {
990 kfree(clp); 999 kfree(clp);
991 return NULL; 1000 return NULL;
992 } 1001 }
993 memcpy(clp->cl_name.data, name.data, name.len);
994 clp->cl_name.len = name.len; 1002 clp->cl_name.len = name.len;
995 return clp; 1003 return clp;
996} 1004}
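
kmemdup() collapses the kmalloc()+memcpy() pair above into one call. A userspace equivalent for illustration (the kernel version additionally takes gfp flags):

#include <stdlib.h>
#include <string.h>

static void *memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);
	return p;
}
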
@@ -1058,7 +1066,6 @@ expire_client(struct nfs4_client *clp)
1058 spin_unlock(&recall_lock); 1066 spin_unlock(&recall_lock);
1059 while (!list_empty(&reaplist)) { 1067 while (!list_empty(&reaplist)) {
1060 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); 1068 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
1061 list_del_init(&dp->dl_recall_lru);
1062 unhash_delegation(dp); 1069 unhash_delegation(dp);
1063 } 1070 }
1064 while (!list_empty(&clp->cl_openowners)) { 1071 while (!list_empty(&clp->cl_openowners)) {
@@ -2301,7 +2308,7 @@ nfsd4_free_slabs(void)
2301 nfsd4_free_slab(&deleg_slab); 2308 nfsd4_free_slab(&deleg_slab);
2302} 2309}
2303 2310
2304static int 2311int
2305nfsd4_init_slabs(void) 2312nfsd4_init_slabs(void)
2306{ 2313{
2307 openowner_slab = kmem_cache_create("nfsd4_openowners", 2314 openowner_slab = kmem_cache_create("nfsd4_openowners",
@@ -2373,7 +2380,7 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
2373 2380
2374static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) 2381static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
2375{ 2382{
2376 list_add(&oo->oo_owner.so_strhash, &open_ownerstr_hashtbl[strhashval]); 2383 list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
2377 list_add(&oo->oo_perclient, &clp->cl_openowners); 2384 list_add(&oo->oo_perclient, &clp->cl_openowners);
2378} 2385}
2379 2386
@@ -2436,7 +2443,9 @@ find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open)
2436 struct nfs4_stateowner *so; 2443 struct nfs4_stateowner *so;
2437 struct nfs4_openowner *oo; 2444 struct nfs4_openowner *oo;
2438 2445
2439 list_for_each_entry(so, &open_ownerstr_hashtbl[hashval], so_strhash) { 2446 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) {
2447 if (!so->so_is_open_owner)
2448 continue;
2440 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 2449 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
2441 oo = openowner(so); 2450 oo = openowner(so);
2442 renew_client(oo->oo_owner.so_client); 2451 renew_client(oo->oo_owner.so_client);
@@ -2580,7 +2589,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2580 if (open->op_file == NULL) 2589 if (open->op_file == NULL)
2581 return nfserr_jukebox; 2590 return nfserr_jukebox;
2582 2591
2583 strhashval = open_ownerstr_hashval(clientid->cl_id, &open->op_owner); 2592 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
2584 oo = find_openstateowner_str(strhashval, open); 2593 oo = find_openstateowner_str(strhashval, open);
2585 open->op_openowner = oo; 2594 open->op_openowner = oo;
2586 if (!oo) { 2595 if (!oo) {
@@ -3123,7 +3132,6 @@ nfs4_laundromat(void)
3123 spin_unlock(&recall_lock); 3132 spin_unlock(&recall_lock);
3124 list_for_each_safe(pos, next, &reaplist) { 3133 list_for_each_safe(pos, next, &reaplist) {
3125 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 3134 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
3126 list_del_init(&dp->dl_recall_lru);
3127 unhash_delegation(dp); 3135 unhash_delegation(dp);
3128 } 3136 }
3129 test_val = nfsd4_lease; 3137 test_val = nfsd4_lease;
@@ -3718,13 +3726,11 @@ out:
3718} 3726}
3719 3727
3720 3728
3721/*
3722 * Lock owner state (byte-range locks)
3723 */
3724#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) 3729#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
3725#define LOCK_HASH_BITS 8 3730
3726#define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) 3731#define LOCKOWNER_INO_HASH_BITS 8
3727#define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) 3732#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
3733#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
3728 3734
3729static inline u64 3735static inline u64
3730end_offset(u64 start, u64 len) 3736end_offset(u64 start, u64 len)
@@ -3746,16 +3752,14 @@ last_byte_offset(u64 start, u64 len)
3746 return end > start ? end - 1: NFS4_MAX_UINT64; 3752 return end > start ? end - 1: NFS4_MAX_UINT64;
3747} 3753}
3748 3754
3749static inline unsigned int 3755static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername)
3750lock_ownerstr_hashval(struct inode *inode, u32 cl_id,
3751 struct xdr_netobj *ownername)
3752{ 3756{
3753 return (file_hashval(inode) + cl_id 3757 return (file_hashval(inode) + cl_id
3754 + opaque_hashval(ownername->data, ownername->len)) 3758 + opaque_hashval(ownername->data, ownername->len))
3755 & LOCK_HASH_MASK; 3759 & LOCKOWNER_INO_HASH_MASK;
3756} 3760}
3757 3761
3758static struct list_head lock_ownerstr_hashtbl[LOCK_HASH_SIZE]; 3762static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
3759 3763
3760/* 3764/*
3761 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that 3765 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
@@ -3809,23 +3813,39 @@ nevermind:
3809 deny->ld_type = NFS4_WRITE_LT; 3813 deny->ld_type = NFS4_WRITE_LT;
3810} 3814}
3811 3815
3816static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner)
3817{
3818 struct nfs4_ol_stateid *lst;
3819
3820 if (!same_owner_str(&lo->lo_owner, owner, clid))
3821 return false;
3822 lst = list_first_entry(&lo->lo_owner.so_stateids,
3823 struct nfs4_ol_stateid, st_perstateowner);
3824 return lst->st_file->fi_inode == inode;
3825}
3826
3812static struct nfs4_lockowner * 3827static struct nfs4_lockowner *
3813find_lockowner_str(struct inode *inode, clientid_t *clid, 3828find_lockowner_str(struct inode *inode, clientid_t *clid,
3814 struct xdr_netobj *owner) 3829 struct xdr_netobj *owner)
3815{ 3830{
3816 unsigned int hashval = lock_ownerstr_hashval(inode, clid->cl_id, owner); 3831 unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
3817 struct nfs4_stateowner *op; 3832 struct nfs4_lockowner *lo;
3818 3833
3819 list_for_each_entry(op, &lock_ownerstr_hashtbl[hashval], so_strhash) { 3834 list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
3820 if (same_owner_str(op, owner, clid)) 3835 if (same_lockowner_ino(lo, inode, clid, owner))
3821 return lockowner(op); 3836 return lo;
3822 } 3837 }
3823 return NULL; 3838 return NULL;
3824} 3839}
3825 3840
3826static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp) 3841static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
3827{ 3842{
3828 list_add(&lo->lo_owner.so_strhash, &lock_ownerstr_hashtbl[strhashval]); 3843 struct inode *inode = open_stp->st_file->fi_inode;
3844 unsigned int inohash = lockowner_ino_hashval(inode,
3845 clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
3846
3847 list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]);
3848 list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]);
3829 list_add(&lo->lo_perstateid, &open_stp->st_lockowners); 3849 list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
3830} 3850}
3831 3851
@@ -3834,7 +3854,7 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
3834 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 3854 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
3835 * occurred. 3855 * occurred.
3836 * 3856 *
3837 * strhashval = lock_ownerstr_hashval 3857 * strhashval = ownerstr_hashval
3838 */ 3858 */
3839 3859
3840static struct nfs4_lockowner * 3860static struct nfs4_lockowner *
@@ -3892,6 +3912,37 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
3892 __set_bit(access, &lock_stp->st_access_bmap); 3912 __set_bit(access, &lock_stp->st_access_bmap);
3893} 3913}
3894 3914
3915__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
3916{
3917 struct nfs4_file *fi = ost->st_file;
3918 struct nfs4_openowner *oo = openowner(ost->st_stateowner);
3919 struct nfs4_client *cl = oo->oo_owner.so_client;
3920 struct nfs4_lockowner *lo;
3921 unsigned int strhashval;
3922
3923 lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner);
3924 if (lo) {
3925 if (!cstate->minorversion)
3926 return nfserr_bad_seqid;
3927 /* XXX: a lockowner always has exactly one stateid: */
3928 *lst = list_first_entry(&lo->lo_owner.so_stateids,
3929 struct nfs4_ol_stateid, st_perstateowner);
3930 return nfs_ok;
3931 }
3932 strhashval = ownerstr_hashval(cl->cl_clientid.cl_id,
3933 &lock->v.new.owner);
3934 lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
3935 if (lo == NULL)
3936 return nfserr_jukebox;
3937 *lst = alloc_init_lock_stateid(lo, fi, ost);
3938 if (*lst == NULL) {
3939 release_lockowner(lo);
3940 return nfserr_jukebox;
3941 }
3942 *new = true;
3943 return nfs_ok;
3944}
3945
3895/* 3946/*
3896 * LOCK operation 3947 * LOCK operation
3897 */ 3948 */
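
lookup_or_create_lock_state() above is a find-or-create helper. The same shape in a standalone sketch (the table, key, and single bucket are stand-ins for the kernel's hashed lists):

#include <stdlib.h>

struct entry {
	struct entry *next;
	int key;
};

static struct entry *table;	/* single bucket for brevity */

static struct entry *find_entry(int key)
{
	struct entry *e;

	for (e = table; e; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

/*
 * Return the existing entry or create and link a new one; *is_new
 * tells the caller whether cleanup is needed on a later failure,
 * like the new_state flag threaded through nfsd4_lock().
 */
static struct entry *lookup_or_create(int key, int *is_new)
{
	struct entry *e = find_entry(key);

	*is_new = 0;
	if (e)
		return e;
	e = calloc(1, sizeof(*e));
	if (!e)
		return NULL;
	e->key = key;
	e->next = table;
	table = e;
	*is_new = 1;
	return e;
}
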
@@ -3907,7 +3958,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3907 struct file_lock file_lock; 3958 struct file_lock file_lock;
3908 struct file_lock conflock; 3959 struct file_lock conflock;
3909 __be32 status = 0; 3960 __be32 status = 0;
3910 unsigned int strhashval; 3961 bool new_state = false;
3911 int lkflg; 3962 int lkflg;
3912 int err; 3963 int err;
3913 3964
@@ -3933,10 +3984,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3933 * lock stateid. 3984 * lock stateid.
3934 */ 3985 */
3935 struct nfs4_ol_stateid *open_stp = NULL; 3986 struct nfs4_ol_stateid *open_stp = NULL;
3936 3987
3988 if (nfsd4_has_session(cstate))
3989 /* See rfc 5661 18.10.3: given clientid is ignored: */
3990 memcpy(&lock->v.new.clientid,
3991 &cstate->session->se_client->cl_clientid,
3992 sizeof(clientid_t));
3993
3937 status = nfserr_stale_clientid; 3994 status = nfserr_stale_clientid;
3938 if (!nfsd4_has_session(cstate) && 3995 if (STALE_CLIENTID(&lock->lk_new_clientid))
3939 STALE_CLIENTID(&lock->lk_new_clientid))
3940 goto out; 3996 goto out;
3941 3997
3942 /* validate and update open stateid and open seqid */ 3998 /* validate and update open stateid and open seqid */
@@ -3948,25 +4004,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3948 goto out; 4004 goto out;
3949 open_sop = openowner(open_stp->st_stateowner); 4005 open_sop = openowner(open_stp->st_stateowner);
3950 status = nfserr_bad_stateid; 4006 status = nfserr_bad_stateid;
3951 if (!nfsd4_has_session(cstate) && 4007 if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
3952 !same_clid(&open_sop->oo_owner.so_client->cl_clientid,
3953 &lock->v.new.clientid)) 4008 &lock->v.new.clientid))
3954 goto out; 4009 goto out;
3955 /* create lockowner and lock stateid */ 4010 status = lookup_or_create_lock_state(cstate, open_stp, lock,
3956 fp = open_stp->st_file; 4011 &lock_stp, &new_state);
3957 strhashval = lock_ownerstr_hashval(fp->fi_inode, 4012 if (status)
3958 open_sop->oo_owner.so_client->cl_clientid.cl_id,
3959 &lock->v.new.owner);
3960 /* XXX: Do we need to check for duplicate stateowners on
3961 * the same file, or should they just be allowed (and
3962 * create new stateids)? */
3963 status = nfserr_jukebox;
3964 lock_sop = alloc_init_lock_stateowner(strhashval,
3965 open_sop->oo_owner.so_client, open_stp, lock);
3966 if (lock_sop == NULL)
3967 goto out;
3968 lock_stp = alloc_init_lock_stateid(lock_sop, fp, open_stp);
3969 if (lock_stp == NULL)
3970 goto out; 4013 goto out;
3971 } else { 4014 } else {
3972 /* lock (lock owner + lock stateid) already exists */ 4015 /* lock (lock owner + lock stateid) already exists */
@@ -3976,10 +4019,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3976 NFS4_LOCK_STID, &lock_stp); 4019 NFS4_LOCK_STID, &lock_stp);
3977 if (status) 4020 if (status)
3978 goto out; 4021 goto out;
3979 lock_sop = lockowner(lock_stp->st_stateowner);
3980 fp = lock_stp->st_file;
3981 } 4022 }
3982 /* lock_sop and lock_stp have been created or found */ 4023 lock_sop = lockowner(lock_stp->st_stateowner);
4024 fp = lock_stp->st_file;
3983 4025
3984 lkflg = setlkflg(lock->lk_type); 4026 lkflg = setlkflg(lock->lk_type);
3985 status = nfs4_check_openmode(lock_stp, lkflg); 4027 status = nfs4_check_openmode(lock_stp, lkflg);
@@ -4054,7 +4096,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4054 break; 4096 break;
4055 } 4097 }
4056out: 4098out:
4057 if (status && lock->lk_is_new && lock_sop) 4099 if (status && new_state)
4058 release_lockowner(lock_sop); 4100 release_lockowner(lock_sop);
4059 if (!cstate->replay_owner) 4101 if (!cstate->replay_owner)
4060 nfs4_unlock_state(); 4102 nfs4_unlock_state();
@@ -4251,7 +4293,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4251 struct nfs4_ol_stateid *stp; 4293 struct nfs4_ol_stateid *stp;
4252 struct xdr_netobj *owner = &rlockowner->rl_owner; 4294 struct xdr_netobj *owner = &rlockowner->rl_owner;
4253 struct list_head matches; 4295 struct list_head matches;
4254 int i; 4296 unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
4255 __be32 status; 4297 __be32 status;
4256 4298
4257 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", 4299 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
@@ -4266,22 +4308,19 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4266 nfs4_lock_state(); 4308 nfs4_lock_state();
4267 4309
4268 status = nfserr_locks_held; 4310 status = nfserr_locks_held;
4269 /* XXX: we're doing a linear search through all the lockowners.
4270 * Yipes! For now we'll just hope clients aren't really using
4271 * release_lockowner much, but eventually we have to fix these
4272 * data structures. */
4273 INIT_LIST_HEAD(&matches); 4311 INIT_LIST_HEAD(&matches);
4274 for (i = 0; i < LOCK_HASH_SIZE; i++) { 4312
4275 list_for_each_entry(sop, &lock_ownerstr_hashtbl[i], so_strhash) { 4313 list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) {
4276 if (!same_owner_str(sop, owner, clid)) 4314 if (sop->so_is_open_owner)
4277 continue; 4315 continue;
4278 list_for_each_entry(stp, &sop->so_stateids, 4316 if (!same_owner_str(sop, owner, clid))
4279 st_perstateowner) { 4317 continue;
4280 lo = lockowner(sop); 4318 list_for_each_entry(stp, &sop->so_stateids,
4281 if (check_for_locks(stp->st_file, lo)) 4319 st_perstateowner) {
4282 goto out; 4320 lo = lockowner(sop);
4283 list_add(&lo->lo_list, &matches); 4321 if (check_for_locks(stp->st_file, lo))
4284 } 4322 goto out;
4323 list_add(&lo->lo_list, &matches);
4285 } 4324 }
4286 } 4325 }
4287 /* Clients probably won't expect us to return with some (but not all) 4326 /* Clients probably won't expect us to return with some (but not all)
@@ -4394,16 +4433,127 @@ nfs4_check_open_reclaim(clientid_t *clid)
 	return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
 }
 
+#ifdef CONFIG_NFSD_FAULT_INJECTION
+
+void nfsd_forget_clients(u64 num)
+{
+	struct nfs4_client *clp, *next;
+	int count = 0;
+
+	nfs4_lock_state();
+	list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
+		nfsd4_remove_clid_dir(clp);
+		expire_client(clp);
+		if (++count == num)
+			break;
+	}
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Forgot %d clients", count);
+}
+
+static void release_lockowner_sop(struct nfs4_stateowner *sop)
+{
+	release_lockowner(lockowner(sop));
+}
+
+static void release_openowner_sop(struct nfs4_stateowner *sop)
+{
+	release_openowner(openowner(sop));
+}
+
+static int nfsd_release_n_owners(u64 num, bool is_open_owner,
+				void (*release_sop)(struct nfs4_stateowner *))
+{
+	int i, count = 0;
+	struct nfs4_stateowner *sop, *next;
+
+	for (i = 0; i < OWNER_HASH_SIZE; i++) {
+		list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) {
+			if (sop->so_is_open_owner != is_open_owner)
+				continue;
+			release_sop(sop);
+			if (++count == num)
+				return count;
+		}
+	}
+	return count;
+}
+
+void nfsd_forget_locks(u64 num)
+{
+	int count;
+
+	nfs4_lock_state();
+	count = nfsd_release_n_owners(num, false, release_lockowner_sop);
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Forgot %d locks", count);
+}
+
+void nfsd_forget_openowners(u64 num)
+{
+	int count;
+
+	nfs4_lock_state();
+	count = nfsd_release_n_owners(num, true, release_openowner_sop);
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Forgot %d open owners", count);
+}
+
+int nfsd_process_n_delegations(u64 num, void (*deleg_func)(struct nfs4_delegation *))
+{
+	int i, count = 0;
+	struct nfs4_file *fp, *fnext;
+	struct nfs4_delegation *dp, *dnext;
+
+	for (i = 0; i < FILE_HASH_SIZE; i++) {
+		list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) {
+			list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) {
+				deleg_func(dp);
+				if (++count == num)
+					return count;
+			}
+		}
+	}
+
+	return count;
+}
+
+void nfsd_forget_delegations(u64 num)
+{
+	unsigned int count;
+
+	nfs4_lock_state();
+	count = nfsd_process_n_delegations(num, unhash_delegation);
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Forgot %d delegations", count);
+}
+
+void nfsd_recall_delegations(u64 num)
+{
+	unsigned int count;
+
+	nfs4_lock_state();
+	spin_lock(&recall_lock);
+	count = nfsd_process_n_delegations(num, nfsd_break_one_deleg);
+	spin_unlock(&recall_lock);
+	nfs4_unlock_state();
+
+	printk(KERN_INFO "NFSD: Recalled %d delegations", count);
+}
+
+#endif /* CONFIG_NFSD_FAULT_INJECTION */
+
 /* initialization to perform at module load time: */
 
-int
+void
 nfs4_state_init(void)
 {
-	int i, status;
+	int i;
 
-	status = nfsd4_init_slabs();
-	if (status)
-		return status;
 	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&conf_id_hashtbl[i]);
 		INIT_LIST_HEAD(&conf_str_hashtbl[i]);
@@ -4416,18 +4566,15 @@ nfs4_state_init(void)
 	for (i = 0; i < FILE_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&file_hashtbl[i]);
 	}
-	for (i = 0; i < OPEN_OWNER_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&open_ownerstr_hashtbl[i]);
-	}
-	for (i = 0; i < LOCK_HASH_SIZE; i++) {
-		INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
-	}
-	memset(&onestateid, ~0, sizeof(stateid_t));
+	for (i = 0; i < OWNER_HASH_SIZE; i++) {
+		INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
+	}
+	for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
 	INIT_LIST_HEAD(&close_lru);
 	INIT_LIST_HEAD(&client_lru);
 	INIT_LIST_HEAD(&del_recall_lru);
 	reclaim_str_hashtbl_size = 0;
-	return 0;
 }
 
 static void
@@ -4526,7 +4673,6 @@ __nfs4_state_shutdown(void)
 	spin_unlock(&recall_lock);
 	list_for_each_safe(pos, next, &reaplist) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		list_del_init(&dp->dl_recall_lru);
 		unhash_delegation(dp);
 	}
 
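Note: the forget/recall helpers above are compiled only under CONFIG_NFSD_FAULT_INJECTION and are meant to be driven from userspace via control files created in fs/nfsd/fault_inject.c, which this diff does not show. A minimal sketch of the kind of debugfs wiring involved; every name below is an illustrative assumption, not the commit's actual code:

/* Hypothetical debugfs hook for nfsd_forget_clients(); the real
 * control files live in fs/nfsd/fault_inject.c (not shown here). */
#include <linux/debugfs.h>
#include <linux/fs.h>

static int forget_clients_set(void *data, u64 val)
{
	nfsd_forget_clients(val);	/* expire up to 'val' clients */
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(forget_clients_fops, NULL, forget_clients_set, "%llu\n");

static int __init fault_inject_sketch_init(void)
{
	struct dentry *dir = debugfs_create_dir("nfsd", NULL);

	if (!dir)
		return -ENOMEM;
	/* trigger with: echo 1 > /sys/kernel/debug/nfsd/forget_clients */
	debugfs_create_file("forget_clients", 0200, dir, NULL,
			    &forget_clients_fops);
	return 0;
}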
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b6fa792d6b85..0ec5a1b9700e 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -215,10 +215,9 @@ defer_free(struct nfsd4_compoundargs *argp,
 static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
 {
 	if (p == argp->tmp) {
-		p = kmalloc(nbytes, GFP_KERNEL);
+		p = kmemdup(argp->tmp, nbytes, GFP_KERNEL);
 		if (!p)
 			return NULL;
-		memcpy(p, argp->tmp, nbytes);
 	} else {
 		BUG_ON(p != argp->tmpp);
 		argp->tmpp = NULL;
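The savemem() change above folds an open-coded allocate-and-copy pair into kmemdup(). Sketch of the equivalence (not part of the patch):

/* kmemdup(src, len, GFP_KERNEL) behaves like this open-coded helper: */
void *dup_open_coded(const void *src, size_t len)
{
	void *p = kmalloc(len, GFP_KERNEL);

	if (p)
		memcpy(p, src, len);
	return p;
}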
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bb4a11d58a5a..748eda93ce59 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -18,6 +18,7 @@
 #include "idmap.h"
 #include "nfsd.h"
 #include "cache.h"
+#include "fault_inject.h"
 
 /*
  * We have a single directory with several nodes in it.
@@ -1128,9 +1129,13 @@ static int __init init_nfsd(void)
 	int retval;
 	printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
 
-	retval = nfs4_state_init(); /* nfs4 locking state */
+	retval = nfsd4_init_slabs();
 	if (retval)
 		return retval;
+	nfs4_state_init();
+	retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+	if (retval)
+		goto out_free_slabs;
 	nfsd_stat_init();	/* Statistics */
 	retval = nfsd_reply_cache_init();
 	if (retval)
@@ -1161,6 +1166,8 @@ out_free_cache:
 	nfsd_reply_cache_shutdown();
 out_free_stat:
 	nfsd_stat_shutdown();
+	nfsd_fault_inject_cleanup();
+out_free_slabs:
 	nfsd4_free_slabs();
 	return retval;
 }
@@ -1175,6 +1182,7 @@ static void __exit exit_nfsd(void)
 	nfsd_lockd_shutdown();
 	nfsd_idmap_shutdown();
 	nfsd4_free_slabs();
+	nfsd_fault_inject_cleanup();
 	unregister_filesystem(&nfsd_fs_type);
 }
 
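init_nfsd() now allocates the nfs4 slabs before anything else and unwinds in strict reverse order on failure; note how the new out_free_slabs label sits below nfsd_fault_inject_cleanup() so a later failure tears down both. A generic sketch of the goto-unwind idiom used here (all names are placeholders):

static int __init my_init(void)
{
	int err;

	err = init_a();
	if (err)
		return err;
	err = init_b();
	if (err)
		goto out_a;
	err = init_c();
	if (err)
		goto out_b;
	return 0;

out_b:	/* undo in reverse order of initialization */
	cleanup_b();
out_a:
	cleanup_a();
	return err;
}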
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 58134a23fdfb..1d1e8589b4ce 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -104,14 +104,16 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
  */
 #ifdef CONFIG_NFSD_V4
 extern unsigned int max_delegations;
-int nfs4_state_init(void);
+void nfs4_state_init(void);
+int nfsd4_init_slabs(void);
 void nfsd4_free_slabs(void);
 int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
 void nfs4_reset_lease(time_t leasetime);
 int nfs4_reset_recoverydir(char *recdir);
 #else
-static inline int nfs4_state_init(void) { return 0; }
+static inline void nfs4_state_init(void) { }
+static inline int nfsd4_init_slabs(void) { return 0; }
 static inline void nfsd4_free_slabs(void) { }
 static inline int nfs4_state_start(void) { return 0; }
 static inline void nfs4_state_shutdown(void) { }
@@ -338,15 +340,15 @@ static inline u32 nfsd_suppattrs2(u32 minorversion)
 }
 
 /* These will return ERR_INVAL if specified in GETATTR or READDIR. */
 #define NFSD_WRITEONLY_ATTRS_WORD1 \
-(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+	(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
 
 /* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
 #define NFSD_WRITEABLE_ATTRS_WORD0 \
-(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL )
+	(FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
 #define NFSD_WRITEABLE_ATTRS_WORD1 \
-(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
-	| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+	(FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+	| FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
 #define NFSD_WRITEABLE_ATTRS_WORD2 0
 
 #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index a3cf38476a1b..ffb5df1db94f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -366,6 +366,7 @@ struct nfs4_openowner {
 
 struct nfs4_lockowner {
 	struct nfs4_stateowner	lo_owner; /* must be first element */
+	struct list_head	lo_owner_ino_hash; /* hash by owner,file */
 	struct list_head	lo_perstateid; /* for lockowners only */
 	struct list_head	lo_list; /* for temporary uses */
 };
@@ -482,7 +483,7 @@ extern void nfsd4_shutdown_recdir(void);
 extern int nfs4_client_to_reclaim(const char *name);
 extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
 extern void nfsd4_recdir_purge_old(void);
-extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
+extern void nfsd4_create_clid_dir(struct nfs4_client *clp);
 extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
 extern void release_session_client(struct nfsd4_session *);
 extern __be32 nfs4_validate_stateid(struct nfs4_client *, stateid_t *);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d25a723b68ad..edf6d3ed8777 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -594,8 +594,19 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	return error;
 }
 
-#define NFSD_XATTR_JUNCTION_PREFIX XATTR_TRUSTED_PREFIX "junction."
-#define NFSD_XATTR_JUNCTION_TYPE NFSD_XATTR_JUNCTION_PREFIX "type"
+/*
+ * NFS junction information is stored in an extended attribute.
+ */
+#define NFSD_JUNCTION_XATTR_NAME XATTR_TRUSTED_PREFIX "junction.nfs"
+
+/**
+ * nfsd4_is_junction - Test if an object could be an NFS junction
+ *
+ * @dentry: object to test
+ *
+ * Returns 1 if "dentry" appears to contain NFS junction information.
+ * Otherwise 0 is returned.
+ */
 int nfsd4_is_junction(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
@@ -606,7 +617,7 @@ int nfsd4_is_junction(struct dentry *dentry)
 		return 0;
 	if (!(inode->i_mode & S_ISVTX))
 		return 0;
-	if (vfs_getxattr(dentry, NFSD_XATTR_JUNCTION_TYPE, NULL, 0) <= 0)
+	if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0)
 		return 0;
 	return 1;
 }
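Calling vfs_getxattr() with a NULL buffer and zero size asks only for the attribute's length, so the test above is a cheap existence probe for the junction xattr. The userspace getxattr(2) call follows the same convention; an illustrative probe (the xattr name matches the macro above; the helper itself is hypothetical):

#include <sys/xattr.h>

/* Returns nonzero if 'path' carries the NFS junction xattr; a size
 * query with a NULL buffer returns the value length, or -1 with
 * errno == ENODATA when the attribute is absent. */
int has_junction_xattr(const char *path)
{
	return getxattr(path, "trusted.junction.nfs", NULL, 0) > 0;
}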
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 44a88a9fa2c8..fea6bd5831dc 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -52,7 +52,7 @@ static const struct utf8_table utf8_table[] =
 #define SURROGATE_LOW	0x00000400
 #define SURROGATE_BITS	0x000003ff
 
-int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
+int utf8_to_utf32(const u8 *s, int inlen, unicode_t *pu)
 {
 	unsigned long l;
 	int c0, c, nc;
@@ -71,7 +71,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
 			*pu = (unicode_t) l;
 			return nc;
 		}
-		if (len <= nc)
+		if (inlen <= nc)
 			return -1;
 		s++;
 		c = (*s ^ 0x80) & 0xFF;
@@ -83,7 +83,7 @@ int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
 }
 EXPORT_SYMBOL(utf8_to_utf32);
 
-int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
+int utf32_to_utf8(unicode_t u, u8 *s, int maxout)
 {
 	unsigned long l;
 	int c, nc;
@@ -97,7 +97,7 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
 		return -1;
 
 	nc = 0;
-	for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
+	for (t = utf8_table; t->cmask && maxout; t++, maxout--) {
 		nc++;
 		if (l <= t->lmask) {
 			c = t->shift;
@@ -114,34 +114,57 @@ int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
 }
 EXPORT_SYMBOL(utf32_to_utf8);
 
-int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
+static inline void put_utf16(wchar_t *s, unsigned c, enum utf16_endian endian)
+{
+	switch (endian) {
+	default:
+		*s = (wchar_t) c;
+		break;
+	case UTF16_LITTLE_ENDIAN:
+		*s = __cpu_to_le16(c);
+		break;
+	case UTF16_BIG_ENDIAN:
+		*s = __cpu_to_be16(c);
+		break;
+	}
+}
+
+int utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian,
+		wchar_t *pwcs, int maxout)
 {
 	u16 *op;
 	int size;
 	unicode_t u;
 
 	op = pwcs;
-	while (*s && len > 0) {
+	while (inlen > 0 && maxout > 0 && *s) {
 		if (*s & 0x80) {
-			size = utf8_to_utf32(s, len, &u);
+			size = utf8_to_utf32(s, inlen, &u);
 			if (size < 0)
 				return -EINVAL;
+			s += size;
+			inlen -= size;
 
 			if (u >= PLANE_SIZE) {
+				if (maxout < 2)
+					break;
 				u -= PLANE_SIZE;
-				*op++ = (wchar_t) (SURROGATE_PAIR |
-						((u >> 10) & SURROGATE_BITS));
-				*op++ = (wchar_t) (SURROGATE_PAIR |
+				put_utf16(op++, SURROGATE_PAIR |
+						((u >> 10) & SURROGATE_BITS),
+						endian);
+				put_utf16(op++, SURROGATE_PAIR |
 						SURROGATE_LOW |
-						(u & SURROGATE_BITS));
+						(u & SURROGATE_BITS),
+						endian);
+				maxout -= 2;
 			} else {
-				*op++ = (wchar_t) u;
+				put_utf16(op++, u, endian);
+				maxout--;
 			}
-			s += size;
-			len -= size;
 		} else {
-			*op++ = *s++;
-			len--;
+			put_utf16(op++, *s++, endian);
+			inlen--;
+			maxout--;
 		}
 	}
 	return op - pwcs;
@@ -160,27 +183,27 @@ static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
 	}
 }
 
-int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
-		u8 *s, int maxlen)
+int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian,
+		u8 *s, int maxout)
 {
 	u8 *op;
 	int size;
 	unsigned long u, v;
 
 	op = s;
-	while (len > 0 && maxlen > 0) {
+	while (inlen > 0 && maxout > 0) {
 		u = get_utf16(*pwcs, endian);
 		if (!u)
 			break;
 		pwcs++;
-		len--;
+		inlen--;
 		if (u > 0x7f) {
 			if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
 				if (u & SURROGATE_LOW) {
 					/* Ignore character and move on */
 					continue;
 				}
-				if (len <= 0)
+				if (inlen <= 0)
 					break;
 				v = get_utf16(*pwcs, endian);
 				if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
@@ -191,18 +214,18 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
 				u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
 					+ (v & SURROGATE_BITS);
 				pwcs++;
-				len--;
+				inlen--;
 			}
-			size = utf32_to_utf8(u, op, maxlen);
+			size = utf32_to_utf8(u, op, maxout);
 			if (size == -1) {
 				/* Ignore character and move on */
 			} else {
 				op += size;
-				maxlen -= size;
+				maxout -= size;
 			}
 		} else {
 			*op++ = (u8) u;
-			maxlen--;
+			maxout--;
 		}
 	}
 	return op - s;
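utf8s_to_utf16s() now takes an explicit endianness plus an output limit, mirroring utf16s_to_utf8s(); callers that want CPU byte order pass UTF16_HOST_ENDIAN. A caller sketch against the new signature (the wrapper name is illustrative):

/* Convert a bounded UTF-8 name to little-endian UTF-16, as a VFAT-style
 * caller would. Returns code units written, or -EINVAL on bad UTF-8. */
static int name_to_utf16le(const char *name, int namelen,
			   wchar_t *out, int outlen)
{
	return utf8s_to_utf16s((const u8 *)name, namelen,
			       UTF16_LITTLE_ENDIAN, out, outlen);
}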
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index e14587d55689..f104d565b682 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -135,9 +135,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 
 	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
 
-	/* 1 from caller and 1 for being on i_list/g_list */
-	BUG_ON(atomic_read(&mark->refcnt) < 2);
-
 	spin_lock(&group->mark_lock);
 
 	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
@@ -182,6 +179,11 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark)
 	iput(inode);
 
 	/*
+	 * We don't necessarily have a ref on mark from caller so the above iput
+	 * may have already destroyed it. Don't touch from now on.
+	 */
+
+	/*
 	 * it's possible that this group tried to destroy itself, but this
 	 * this mark was simultaneously being freed by inode. If that's the
 	 * case, we finish freeing the group here.
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 608be4516091..5a4a8af5c406 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3198,7 +3198,7 @@ MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparm
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
-module_param(debug_msgs, bool, 0);
+module_param(debug_msgs, bint, 0);
 MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
 #endif
 
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index a5ebe421195f..286edf1e231f 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -827,8 +827,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 		goto out;
 	}
 
-	rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
-			       &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
+	rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
+			       NULL, NULL, NULL, &fsdlm);
 	if (rc) {
 		ocfs2_live_connection_drop(control);
 		goto out;
diff --git a/fs/pipe.c b/fs/pipe.c
index f0e485d54e64..a932ced92a16 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 	if (nr_pages < pipe->nrbufs)
 		return -EBUSY;
 
-	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
+	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
 	if (unlikely(!bufs))
 		return -ENOMEM;
 
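__GFP_NOWARN matters here because nr_pages derives from a user-supplied fcntl(F_SETPIPE_SZ) argument, so an unprivileged caller can legitimately request an allocation big enough to fail; the failure now surfaces as ENOMEM instead of a kernel allocation warning. Userspace view of the same path (illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];

	if (pipe(fds))
		return 1;
	/* an over-large size now fails quietly with ENOMEM/EPERM */
	if (fcntl(fds[0], F_SETPIPE_SZ, 1 << 20) < 0)
		perror("F_SETPIPE_SZ");
	printf("pipe size: %d\n", fcntl(fds[0], F_GETPIPE_SZ));
	return 0;
}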
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8c344f037bd0..c602b8d20f06 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -380,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
-	permitted = ptrace_may_access(task, PTRACE_MODE_READ);
+	permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
 	mm = get_task_mm(task);
 	if (mm) {
 		vsize = task_vsize(mm);
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
 %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
 		pid_nr_ns(pid, ns),
 		tcomm,
 		state,
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		task->policy,
 		(unsigned long long)delayacct_blkio_ticks(task),
 		cputime_to_clock_t(gtime),
-		cputime_to_clock_t(cgtime));
+		cputime_to_clock_t(cgtime),
+		(mm && permitted) ? mm->start_data : 0,
+		(mm && permitted) ? mm->end_data : 0,
+		(mm && permitted) ? mm->start_brk : 0);
 	if (mm)
 		mmput(mm);
 	return 0;
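The format-string change appends three fields to /proc/<pid>/stat: start_data, end_data and start_brk, reported as 0 unless the reader passes the ptrace check. Counting from pid as field 1, they land at positions 45-47. An illustrative reader; the field positions assume exactly the layout shown above:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char buf[4096], *p, *tok;
	unsigned long v[3] = { 0, 0, 0 };
	int field = 2;	/* the token after "(comm)" is field 3 */
	FILE *f = fopen("/proc/self/stat", "r");

	if (!f || !fgets(buf, sizeof(buf), f))
		return 1;
	fclose(f);
	p = strrchr(buf, ')') + 2;	/* skip "pid (comm) " */
	for (tok = strtok(p, " "); tok; tok = strtok(NULL, " ")) {
		field++;
		if (field >= 45 && field <= 47)
			v[field - 45] = strtoul(tok, NULL, 10);
	}
	printf("start_data=%lu end_data=%lu start_brk=%lu\n",
	       v[0], v[1], v[2]);
	return 0;
}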
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a1dddda999f2..d4548dd49b02 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,9 +83,11 @@
 #include <linux/pid_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
+#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
+#include <trace/events/oom.h>
 #include "internal.h"
 
 /* NOTE:
@@ -133,6 +135,8 @@ struct pid_entry {
 		NULL, &proc_single_file_operations,	\
 		{ .proc_show = show } )
 
+static int proc_fd_permission(struct inode *inode, int mask);
+
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
@@ -165,9 +169,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
 	return result;
 }
 
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -182,9 +186,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
 	return result;
 }
 
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
 {
-	struct task_struct *task = get_proc_task(inode);
+	struct task_struct *task = get_proc_task(dentry->d_inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -194,82 +198,9 @@ static int proc_root_link(struct inode *inode, struct path *path)
 	return result;
 }
 
-static struct mm_struct *__check_mem_permission(struct task_struct *task)
-{
-	struct mm_struct *mm;
-
-	mm = get_task_mm(task);
-	if (!mm)
-		return ERR_PTR(-EINVAL);
-
-	/*
-	 * A task can always look at itself, in case it chooses
-	 * to use system calls instead of load instructions.
-	 */
-	if (task == current)
-		return mm;
-
-	/*
-	 * If current is actively ptrace'ing, and would also be
-	 * permitted to freshly attach with ptrace now, permit it.
-	 */
-	if (task_is_stopped_or_traced(task)) {
-		int match;
-		rcu_read_lock();
-		match = (ptrace_parent(task) == current);
-		rcu_read_unlock();
-		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
-			return mm;
-	}
-
-	/*
-	 * No one else is allowed.
-	 */
-	mmput(mm);
-	return ERR_PTR(-EPERM);
-}
-
-/*
- * If current may access user memory in @task return a reference to the
- * corresponding mm, otherwise ERR_PTR.
- */
-static struct mm_struct *check_mem_permission(struct task_struct *task)
-{
-	struct mm_struct *mm;
-	int err;
-
-	/*
-	 * Avoid racing if task exec's as we might get a new mm but validate
-	 * against old credentials.
-	 */
-	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
-	if (err)
-		return ERR_PTR(err);
-
-	mm = __check_mem_permission(task);
-	mutex_unlock(&task->signal->cred_guard_mutex);
-
-	return mm;
-}
-
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-	struct mm_struct *mm;
-	int err;
-
-	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
-	if (err)
-		return ERR_PTR(err);
-
-	mm = get_task_mm(task);
-	if (mm && mm != current->mm &&
-			!ptrace_may_access(task, PTRACE_MODE_READ)) {
-		mmput(mm);
-		mm = ERR_PTR(-EACCES);
-	}
-	mutex_unlock(&task->signal->cred_guard_mutex);
-
-	return mm;
+	return mm_access(task, PTRACE_MODE_READ);
 }
 
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
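mm_for_maps() collapses into a call to mm_access(), a helper consolidated on the kernel/fork.c side of this merge; it centralizes the cred_guard_mutex plus ptrace_may_access() dance the deleted code open-coded. Reconstructed from the removed body, its shape is approximately as follows (the real definition is not part of this diff):

struct mm_struct *mm_access_sketch(struct task_struct *task, unsigned int mode)
{
	struct mm_struct *mm;
	int err;

	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
	if (err)
		return ERR_PTR(err);

	mm = get_task_mm(task);
	if (mm && mm != current->mm && !ptrace_may_access(task, mode)) {
		mmput(mm);
		mm = ERR_PTR(-EACCES);
	}
	mutex_unlock(&task->signal->cred_guard_mutex);
	return mm;
}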
@@ -627,6 +558,52 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
 	return 0;
 }
 
+/*
+ * May current process learn task's sched/cmdline info (for hide_pid_min=1)
+ * or euid/egid (for hide_pid_min=2)?
+ */
+static bool has_pid_permissions(struct pid_namespace *pid,
+				 struct task_struct *task,
+				 int hide_pid_min)
+{
+	if (pid->hide_pid < hide_pid_min)
+		return true;
+	if (in_group_p(pid->pid_gid))
+		return true;
+	return ptrace_may_access(task, PTRACE_MODE_READ);
+}
+
+
+static int proc_pid_permission(struct inode *inode, int mask)
+{
+	struct pid_namespace *pid = inode->i_sb->s_fs_info;
+	struct task_struct *task;
+	bool has_perms;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+	has_perms = has_pid_permissions(pid, task, 1);
+	put_task_struct(task);
+
+	if (!has_perms) {
+		if (pid->hide_pid == 2) {
+			/*
+			 * Let's make getdents(), stat(), and open()
+			 * consistent with each other. If a process
+			 * may not stat() a file, it shouldn't be seen
+			 * in procfs at all.
+			 */
+			return -ENOENT;
+		}
+
+		return -EPERM;
+	}
+	return generic_permission(inode, mask);
+}
+
+
+
 static const struct inode_operations proc_def_inode_operations = {
 	.setattr	= proc_setattr,
 };
@@ -702,133 +679,96 @@ static const struct file_operations proc_single_file_operations = {
 
 static int mem_open(struct inode* inode, struct file* file)
 {
-	file->private_data = (void*)((long)current->self_exec_id);
-	/* OK to pass negative loff_t, we can catch out-of-range */
-	file->f_mode |= FMODE_UNSIGNED_OFFSET;
-	return 0;
-}
-
-static ssize_t mem_read(struct file * file, char __user * buf,
-			size_t count, loff_t *ppos)
-{
 	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	char *page;
-	unsigned long src = *ppos;
-	int ret = -ESRCH;
 	struct mm_struct *mm;
 
 	if (!task)
-		goto out_no_task;
+		return -ESRCH;
 
-	ret = -ENOMEM;
-	page = (char *)__get_free_page(GFP_TEMPORARY);
-	if (!page)
-		goto out;
+	mm = mm_access(task, PTRACE_MODE_ATTACH);
+	put_task_struct(task);
 
-	mm = check_mem_permission(task);
-	ret = PTR_ERR(mm);
 	if (IS_ERR(mm))
-		goto out_free;
-
-	ret = -EIO;
-
-	if (file->private_data != (void*)((long)current->self_exec_id))
-		goto out_put;
+		return PTR_ERR(mm);
 
-	ret = 0;
-
-	while (count > 0) {
-		int this_len, retval;
-
-		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-		retval = access_remote_vm(mm, src, page, this_len, 0);
-		if (!retval) {
-			if (!ret)
-				ret = -EIO;
-			break;
-		}
-
-		if (copy_to_user(buf, page, retval)) {
-			ret = -EFAULT;
-			break;
-		}
-
-		ret += retval;
-		src += retval;
-		buf += retval;
-		count -= retval;
+	if (mm) {
+		/* ensure this mm_struct can't be freed */
+		atomic_inc(&mm->mm_count);
+		/* but do not pin its memory */
+		mmput(mm);
 	}
-	*ppos = src;
 
-out_put:
-	mmput(mm);
-out_free:
-	free_page((unsigned long) page);
-out:
-	put_task_struct(task);
-out_no_task:
-	return ret;
+	/* OK to pass negative loff_t, we can catch out-of-range */
+	file->f_mode |= FMODE_UNSIGNED_OFFSET;
+	file->private_data = mm;
+
+	return 0;
 }
 
-static ssize_t mem_write(struct file * file, const char __user *buf,
-			 size_t count, loff_t *ppos)
+static ssize_t mem_rw(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos, int write)
 {
-	int copied;
+	struct mm_struct *mm = file->private_data;
+	unsigned long addr = *ppos;
+	ssize_t copied;
 	char *page;
-	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
-	unsigned long dst = *ppos;
-	struct mm_struct *mm;
 
-	copied = -ESRCH;
-	if (!task)
-		goto out_no_task;
+	if (!mm)
+		return 0;
 
-	copied = -ENOMEM;
 	page = (char *)__get_free_page(GFP_TEMPORARY);
 	if (!page)
-		goto out_task;
-
-	mm = check_mem_permission(task);
-	copied = PTR_ERR(mm);
-	if (IS_ERR(mm))
-		goto out_free;
-
-	copied = -EIO;
-	if (file->private_data != (void *)((long)current->self_exec_id))
-		goto out_mm;
+		return -ENOMEM;
 
 	copied = 0;
+	if (!atomic_inc_not_zero(&mm->mm_users))
+		goto free;
+
 	while (count > 0) {
-		int this_len, retval;
+		int this_len = min_t(int, count, PAGE_SIZE);
 
-		this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
-		if (copy_from_user(page, buf, this_len)) {
+		if (write && copy_from_user(page, buf, this_len)) {
 			copied = -EFAULT;
 			break;
 		}
-		retval = access_remote_vm(mm, dst, page, this_len, 1);
-		if (!retval) {
+
+		this_len = access_remote_vm(mm, addr, page, this_len, write);
+		if (!this_len) {
 			if (!copied)
 				copied = -EIO;
 			break;
 		}
-		copied += retval;
-		buf += retval;
-		dst += retval;
-		count -= retval;
+
+		if (!write && copy_to_user(buf, page, this_len)) {
+			copied = -EFAULT;
+			break;
+		}
+
+		buf += this_len;
+		addr += this_len;
+		copied += this_len;
+		count -= this_len;
 	}
-	*ppos = dst;
+	*ppos = addr;
 
-out_mm:
 	mmput(mm);
-out_free:
+free:
 	free_page((unsigned long) page);
-out_task:
-	put_task_struct(task);
-out_no_task:
 	return copied;
 }
 
+static ssize_t mem_read(struct file *file, char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	return mem_rw(file, buf, count, ppos, 0);
+}
+
+static ssize_t mem_write(struct file *file, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	return mem_rw(file, (char __user*)buf, count, ppos, 1);
+}
+
 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 {
 	switch (orig) {
@@ -845,11 +785,20 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 	return file->f_pos;
 }
 
+static int mem_release(struct inode *inode, struct file *file)
+{
+	struct mm_struct *mm = file->private_data;
+	if (mm)
+		mmdrop(mm);
+	return 0;
+}
+
 static const struct file_operations proc_mem_operations = {
 	.llseek		= mem_lseek,
 	.read		= mem_read,
 	.write		= mem_write,
 	.open		= mem_open,
+	.release	= mem_release,
 };
 
 static ssize_t environ_read(struct file *file, char __user *buf,
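With the permission check moved to open time (mm_access() at ATTACH level) and the copying centralized in mem_rw(), /proc/<pid>/mem behaves like an ordinary pread/pwrite target for as long as the file stays open. Illustrative userspace reader:

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

/* Read 'len' bytes at 'addr' from another process's address space.
 * open() performs the ptrace-attach permission check. */
ssize_t peek(pid_t pid, uint64_t addr, void *out, size_t len)
{
	char path[64];
	ssize_t n;
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return -1;
	n = pread(fd, out, len, (off_t)addr);
	close(fd);
	return n;
}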
@@ -1010,6 +959,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	else
 		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
 								-OOM_DISABLE;
+	trace_oom_score_adj_update(task);
 err_sighand:
 	unlock_task_sighand(task, &flags);
 err_task_lock:
@@ -1097,6 +1047,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	task->signal->oom_score_adj = oom_score_adj;
 	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
 		task->signal->oom_score_adj_min = oom_score_adj;
+	trace_oom_score_adj_update(task);
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 	 * always attainable.
@@ -1147,9 +1098,6 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 	ssize_t length;
 	uid_t loginuid;
 
-	if (!capable(CAP_AUDIT_CONTROL))
-		return -EPERM;
-
 	rcu_read_lock();
 	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
 		rcu_read_unlock();
@@ -1178,7 +1126,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
 		goto out_free_page;
 
 	}
-	length = audit_set_loginuid(current, loginuid);
+	length = audit_set_loginuid(loginuid);
 	if (likely(length == 0))
 		length = count;
 
@@ -1453,13 +1401,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
 	.release	= single_release,
 };
 
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 {
 	struct task_struct *task;
 	struct mm_struct *mm;
 	struct file *exe_file;
 
-	task = get_proc_task(inode);
+	task = get_proc_task(dentry->d_inode);
 	if (!task)
 		return -ENOENT;
 	mm = get_task_mm(task);
@@ -1489,7 +1437,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
 out:
 	return ERR_PTR(error);
 }
@@ -1528,7 +1476,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
 	if (!proc_fd_access_allowed(inode))
 		goto out;
 
-	error = PROC_I(inode)->op.proc_get_link(inode, &path);
+	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
 	if (error)
 		goto out;
 
@@ -1609,6 +1557,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	struct inode *inode = dentry->d_inode;
 	struct task_struct *task;
 	const struct cred *cred;
+	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
 
 	generic_fillattr(inode, stat);
 
@@ -1617,6 +1566,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	stat->gid = 0;
 	task = pid_task(proc_pid(inode), PIDTYPE_PID);
 	if (task) {
+		if (!has_pid_permissions(pid, task, 2)) {
+			rcu_read_unlock();
+			/*
+			 * This doesn't prevent learning whether PID exists,
+			 * it only makes getattr() consistent with readdir().
+			 */
+			return -ENOENT;
+		}
 		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
 		    task_dumpable(task)) {
 			cred = __task_cred(task);
@@ -1820,9 +1777,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
 	return -ENOENT;
 }
 
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-	return proc_fd_info(inode, path, NULL);
+	return proc_fd_info(dentry->d_inode, path, NULL);
 }
 
 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
@@ -2043,6 +2000,355 @@ static const struct file_operations proc_fd_operations = {
 	.llseek		= default_llseek,
 };
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+
+/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	bool exact_vma_exists = false;
+	struct mm_struct *mm = NULL;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct inode *inode;
+	int status = 0;
+
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		status = -EACCES;
+		goto out_notask;
+	}
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out_notask;
+
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto out;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out;
+
+	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+		down_read(&mm->mmap_sem);
+		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+		up_read(&mm->mmap_sem);
+	}
+
+	mmput(mm);
+
+	if (exact_vma_exists) {
+		if (task_dumpable(task)) {
+			rcu_read_lock();
+			cred = __task_cred(task);
+			inode->i_uid = cred->euid;
+			inode->i_gid = cred->egid;
+			rcu_read_unlock();
+		} else {
+			inode->i_uid = 0;
+			inode->i_gid = 0;
+		}
+		security_task_to_inode(task, inode);
+		status = 1;
+	}
+
+out:
+	put_task_struct(task);
+
+out_notask:
+	if (status <= 0)
+		d_drop(dentry);
+
+	return status;
+}
+
+static const struct dentry_operations tid_map_files_dentry_operations = {
+	.d_revalidate	= map_files_d_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc;
+
+	rc = -ENOENT;
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+struct map_files_info {
+	struct file	*file;
+	unsigned long	len;
+	unsigned char	name[4*sizeof(long)+2];	/* max: %lx-%lx\0 */
+};
+
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *file = ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	if (!file)
+		return ERR_PTR(-ENOENT);
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		return ERR_PTR(-ENOENT);
+
+	ei = PROC_I(inode);
+	ei->op.proc_get_link = proc_map_files_get_link;
+
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+	inode->i_mode = S_IFLNK;
+
+	if (file->f_mode & FMODE_READ)
+		inode->i_mode |= S_IRUSR;
+	if (file->f_mode & FMODE_WRITE)
+		inode->i_mode |= S_IWUSR;
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, inode);
+
+	return NULL;
+}
+
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+		struct dentry *dentry, struct nameidata *nd)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct dentry *result;
+	struct mm_struct *mm;
+
+	result = ERR_PTR(-EACCES);
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	result = ERR_PTR(-ENOENT);
+	task = get_proc_task(dir);
+	if (!task)
+		goto out;
+
+	result = ERR_PTR(-EACCES);
+	if (lock_trace(task))
+		goto out_put_task;
+
+	result = ERR_PTR(-ENOENT);
+	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+		goto out_unlock;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out_unlock;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (!vma)
+		goto out_no_vma;
+
+	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out_unlock:
+	unlock_trace(task);
+out_put_task:
+	put_task_struct(task);
+out:
+	return result;
+}
+
+static const struct inode_operations proc_map_files_inode_operations = {
+	.lookup		= proc_map_files_lookup,
+	.permission	= proc_fd_permission,
+	.setattr	= proc_setattr,
+};
+
+static int
+proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	ino_t ino;
+	int ret;
+
+	ret = -EACCES;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	ret = -ENOENT;
+	task = get_proc_task(inode);
+	if (!task)
+		goto out;
+
+	ret = -EACCES;
+	if (lock_trace(task))
+		goto out_put_task;
+
+	ret = 0;
+	switch (filp->f_pos) {
+	case 0:
+		ino = inode->i_ino;
+		if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+			goto out_unlock;
+		filp->f_pos++;
+	case 1:
+		ino = parent_ino(dentry);
+		if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+			goto out_unlock;
+		filp->f_pos++;
+	default:
+	{
+		unsigned long nr_files, pos, i;
+		struct flex_array *fa = NULL;
+		struct map_files_info info;
+		struct map_files_info *p;
+
+		mm = get_task_mm(task);
+		if (!mm)
+			goto out_unlock;
+		down_read(&mm->mmap_sem);
+
+		nr_files = 0;
+
+		/*
+		 * We need two passes here:
+		 *
+		 *  1) Collect vmas of mapped files with mmap_sem taken
+		 *  2) Release mmap_sem and instantiate entries
+		 *
+		 * otherwise we get lockdep complained, since filldir()
+		 * routine might require mmap_sem taken in might_fault().
+		 */
+
+		for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+			if (vma->vm_file && ++pos > filp->f_pos)
+				nr_files++;
+		}
+
+		if (nr_files) {
+			fa = flex_array_alloc(sizeof(info), nr_files,
+						GFP_KERNEL);
+			if (!fa || flex_array_prealloc(fa, 0, nr_files,
+							GFP_KERNEL)) {
+				ret = -ENOMEM;
+				if (fa)
+					flex_array_free(fa);
+				up_read(&mm->mmap_sem);
+				mmput(mm);
+				goto out_unlock;
+			}
+			for (i = 0, vma = mm->mmap, pos = 2; vma;
+					vma = vma->vm_next) {
+				if (!vma->vm_file)
+					continue;
+				if (++pos <= filp->f_pos)
+					continue;
+
+				get_file(vma->vm_file);
+				info.file = vma->vm_file;
+				info.len = snprintf(info.name,
+						sizeof(info.name), "%lx-%lx",
+						vma->vm_start, vma->vm_end);
+				if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+					BUG();
+			}
+		}
+		up_read(&mm->mmap_sem);
+
+		for (i = 0; i < nr_files; i++) {
+			p = flex_array_get(fa, i);
+			ret = proc_fill_cache(filp, dirent, filldir,
+					      p->name, p->len,
+					      proc_map_files_instantiate,
+					      task, p->file);
+			if (ret)
+				break;
+			filp->f_pos++;
+			fput(p->file);
+		}
+		for (; i < nr_files; i++) {
+			/*
+			 * In case of error don't forget
+			 * to put rest of file refs.
+			 */
+			p = flex_array_get(fa, i);
+			fput(p->file);
+		}
+		if (fa)
+			flex_array_free(fa);
+		mmput(mm);
+	}
+	}
+
+out_unlock:
+	unlock_trace(task);
+out_put_task:
+	put_task_struct(task);
+out:
+	return ret;
+}
+
+static const struct file_operations proc_map_files_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_map_files_readdir,
+	.llseek		= default_llseek,
+};
+
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
 /*
  * /proc/pid/fd needs a special permission handler so that a process can still
  * access /proc/self/fd after it has executed a setuid().
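Each entry under /proc/<pid>/map_files is a symlink named start-end for one file-backed VMA, resolving to the mapped file; the lookup/readdir pair above materializes the entries on demand and requires CAP_SYS_ADMIN. Illustrative enumeration from userspace:

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

void list_map_files(int pid)
{
	char dir[64], link[PATH_MAX], target[PATH_MAX];
	struct dirent *de;
	ssize_t n;
	DIR *d;

	snprintf(dir, sizeof(dir), "/proc/%d/map_files", pid);
	d = opendir(dir);
	if (!d)
		return;
	while ((de = readdir(d)) != NULL) {
		if (de->d_name[0] == '.')
			continue;
		snprintf(link, sizeof(link), "%s/%s", dir, de->d_name);
		n = readlink(link, target, sizeof(target) - 1);
		if (n < 0)
			continue;
		target[n] = '\0';
		printf("%s -> %s\n", de->d_name, target);
	}
	closedir(d);
}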
@@ -2658,6 +2964,9 @@ static const struct inode_operations proc_task_inode_operations;
 static const struct pid_entry tgid_base_stuff[] = {
 	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
 	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
+#endif
 	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
 #ifdef CONFIG_NET
@@ -2761,6 +3070,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
 	.lookup		= proc_tgid_base_lookup,
 	.getattr	= pid_getattr,
 	.setattr	= proc_setattr,
+	.permission	= proc_pid_permission,
 };
 
 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -2964,6 +3274,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
 		proc_pid_instantiate, iter.task, NULL);
 }
 
+static int fake_filldir(void *buf, const char *name, int namelen,
+			loff_t offset, u64 ino, unsigned d_type)
+{
+	return 0;
+}
+
 /* for the /proc/ directory itself, after non-process stuff has been done */
 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
@@ -2971,6 +3287,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	struct task_struct *reaper;
 	struct tgid_iter iter;
 	struct pid_namespace *ns;
+	filldir_t __filldir;
 
 	if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
 		goto out_no_task;
@@ -2992,8 +3309,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	for (iter = next_tgid(ns, iter);
 	     iter.task;
 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
+		if (has_pid_permissions(ns, iter.task, 2))
+			__filldir = filldir;
+		else
+			__filldir = fake_filldir;
+
 		filp->f_pos = iter.tgid + TGID_OFFSET;
-		if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
+		if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) {
 			put_task_struct(iter.task);
 			goto out;
 		}
@@ -3328,6 +3650,7 @@ static const struct inode_operations proc_task_inode_operations = {
 	.lookup		= proc_task_lookup,
 	.getattr	= proc_task_getattr,
 	.setattr	= proc_setattr,
+	.permission	= proc_pid_permission,
 };
 
 static const struct file_operations proc_task_operations = {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 51a176622b8f..84fd3235a590 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -7,6 +7,7 @@
 #include <linux/time.h>
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
+#include <linux/pid_namespace.h>
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/stat.h>
@@ -17,7 +18,9 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
+#include <linux/seq_file.h>
 #include <linux/slab.h>
+#include <linux/mount.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -101,12 +104,27 @@ void __init proc_init_inodecache(void)
 					init_once);
 }
 
+static int proc_show_options(struct seq_file *seq, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct pid_namespace *pid = sb->s_fs_info;
+
+	if (pid->pid_gid)
+		seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid);
+	if (pid->hide_pid != 0)
+		seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+
+	return 0;
+}
+
 static const struct super_operations proc_sops = {
 	.alloc_inode	= proc_alloc_inode,
 	.destroy_inode	= proc_destroy_inode,
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= proc_evict_inode,
 	.statfs		= simple_statfs,
+	.remount_fs	= proc_remount,
+	.show_options	= proc_show_options,
 };
 
 static void __pde_users_dec(struct proc_dir_entry *pde)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7838e5cfec14..292577531ad1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde);
 
 int proc_fill_super(struct super_block *);
 struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
+int proc_remount(struct super_block *sb, int *flags, char *data);
 
 /*
  * These are generic /proc routines that use the internal
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 03102d978180..46a15d8a29ca 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,6 +18,7 @@
 #include <linux/bitops.h>
 #include <linux/mount.h>
 #include <linux/pid_namespace.h>
+#include <linux/parser.h>
 
 #include "internal.h"
 
@@ -36,6 +37,63 @@ static int proc_set_super(struct super_block *sb, void *data)
 	return err;
 }
 
+enum {
+	Opt_gid, Opt_hidepid, Opt_err,
+};
+
+static const match_table_t tokens = {
+	{Opt_hidepid, "hidepid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_err, NULL},
+};
+
+static int proc_parse_options(char *options, struct pid_namespace *pid)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+
+	if (!options)
+		return 1;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		args[0].to = args[0].from = 0;
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+			pid->pid_gid = option;
+			break;
+		case Opt_hidepid:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > 2) {
+				pr_err("proc: hidepid value must be between 0 and 2.\n");
+				return 0;
+			}
+			pid->hide_pid = option;
+			break;
+		default:
+			pr_err("proc: unrecognized mount option \"%s\" "
+			       "or missing value\n", p);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+int proc_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct pid_namespace *pid = sb->s_fs_info;
+	return !proc_parse_options(data, pid);
+}
+
 static struct dentry *proc_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
@@ -43,11 +101,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	struct super_block *sb;
 	struct pid_namespace *ns;
 	struct proc_inode *ei;
+	char *options;
 
-	if (flags & MS_KERNMOUNT)
+	if (flags & MS_KERNMOUNT) {
 		ns = (struct pid_namespace *)data;
-	else
+		options = NULL;
+	} else {
 		ns = current->nsproxy->pid_ns;
+		options = data;
+	}
 
 	sb = sget(fs_type, proc_test_super, proc_set_super, ns);
 	if (IS_ERR(sb))
@@ -55,6 +117,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 
 	if (!sb->s_root) {
 		sb->s_flags = flags;
+		if (!proc_parse_options(options, ns)) {
+			deactivate_locked_super(sb);
+			return ERR_PTR(-EINVAL);
+		}
 		err = proc_fill_super(sb);
 		if (err) {
 			deactivate_locked_super(sb);
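Together with proc_pid_permission() and the pid_getattr() change earlier in this diff, gid= and hidepid= become first-class proc mount options: hidepid=1 hides other users' /proc/<pid> contents, hidepid=2 additionally hides the directories themselves, and gid= names a group exempt from both. Illustrative remount (the shell equivalent is "mount -o remount,hidepid=2,gid=adm /proc"; the numeric gid below is an arbitrary example):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* requires CAP_SYS_ADMIN */
	if (mount(NULL, "/proc", NULL, MS_REMOUNT, "hidepid=2,gid=4")) {
		perror("remount /proc");
		return 1;
	}
	return 0;
}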
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index d76ca6ae2b1b..121f77cfef76 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -77,6 +77,8 @@ static int show_stat(struct seq_file *p, void *v)
 		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
 		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
 		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
+		sum += kstat_cpu_irqs_sum(i);
+		sum += arch_irq_stat_cpu(i);
 
 		for (j = 0; j < NR_SOFTIRQS; j++) {
 			unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e418c5abdb0e..7dcd2a250495 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -518,6 +518,9 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!page)
 			continue;
 
+		if (PageReserved(page))
+			continue;
+
 		/* Clear accessed and referenced bits. */
 		ptep_test_and_clear_young(vma, addr, pte);
 		ClearPageReferenced(page);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2bfd987f4853..6b009548d2e0 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -179,47 +179,33 @@ static const char *qnx4_checkroot(struct super_block *sb)
179 struct qnx4_inode_entry *rootdir; 179 struct qnx4_inode_entry *rootdir;
180 int rd, rl; 180 int rd, rl;
181 int i, j; 181 int i, j;
182 int found = 0;
183 182
184 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { 183 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
185 return "no qnx4 filesystem (no root dir)."; 184 return "no qnx4 filesystem (no root dir).";
186 } else { 185 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
187 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); 186 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
188 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; 187 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
189 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); 188 for (j = 0; j < rl; j++) {
190 for (j = 0; j < rl; j++) { 189 bh = sb_bread(sb, rd + j); /* root dir, first block */
191 bh = sb_bread(sb, rd + j); /* root dir, first block */ 190 if (bh == NULL)
192 if (bh == NULL) { 191 return "unable to read root entry.";
193 return "unable to read root entry."; 192 rootdir = (struct qnx4_inode_entry *) bh->b_data;
194 } 193 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) {
195 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { 194 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
196 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); 195 if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0)
197 if (rootdir->di_fname != NULL) { 196 continue;
198 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); 197 qnx4_sb(sb)->BitMap = kmemdup(rootdir,
199 if (!strcmp(rootdir->di_fname, 198 sizeof(struct qnx4_inode_entry),
200 QNX4_BMNAME)) { 199 GFP_KERNEL);
201 found = 1;
202 qnx4_sb(sb)->BitMap = kmemdup(rootdir,
203 sizeof(struct qnx4_inode_entry),
204 GFP_KERNEL);
205 if (!qnx4_sb(sb)->BitMap) {
206 brelse (bh);
207 return "not enough memory for bitmap inode";
208 }/* keep bitmap inode known */
209 break;
210 }
211 }
212 }
213 brelse(bh); 200 brelse(bh);
214 if (found != 0) { 201 if (!qnx4_sb(sb)->BitMap)
215 break; 202 return "not enough memory for bitmap inode";
216 } 203 /* keep bitmap inode known */
217 } 204 return NULL;
218 if (found == 0) {
219 return "bitmap file not found.";
220 } 205 }
206 brelse(bh);
221 } 207 }
222 return NULL; 208 return "bitmap file not found.";
223} 209}
224 210
225static int qnx4_fill_super(struct super_block *s, void *data, int silent) 211static int qnx4_fill_super(struct super_block *s, void *data, int silent)
@@ -270,7 +256,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
270 if (IS_ERR(root)) { 256 if (IS_ERR(root)) {
271 printk(KERN_ERR "qnx4: get inode failed\n"); 257 printk(KERN_ERR "qnx4: get inode failed\n");
272 ret = PTR_ERR(root); 258 ret = PTR_ERR(root);
273 goto out; 259 goto outb;
274 } 260 }
275 261
276 ret = -ENOMEM; 262 ret = -ENOMEM;
@@ -283,6 +269,8 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
283 269
284 outi: 270 outi:
285 iput(root); 271 iput(root);
272 outb:
273 kfree(qs->BitMap);
286 out: 274 out:
287 brelse(bh); 275 brelse(bh);
288 outnobh: 276 outnobh:
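
The qnx4 hunks above flatten the nested root-directory scan and add an outb: label so the duplicated bitmap inode is freed when the root inode cannot be read. A generic sketch of that goto-unwind ordering, where labels release resources in reverse order of acquisition (all names and error values here are illustrative):

#include <stdlib.h>

static int fill_super_sketch(int fail_at_root)
{
	char *bh, *bitmap;
	int ret = -12;				/* -ENOMEM analogue */

	bh = malloc(512);			/* sb_bread() stand-in */
	if (!bh)
		goto outnobh;

	bitmap = malloc(64);			/* kmemdup() of the bitmap inode */
	if (!bitmap)
		goto out;

	if (fail_at_root) {			/* iget() failed */
		ret = -5;			/* -EIO analogue */
		goto outb;
	}

	free(bh);				/* success keeps only the bitmap */
	return 0;

outb:
	free(bitmap);				/* the new label added above */
out:
	free(bh);
outnobh:
	return ret;
}

int main(void)
{
	return fill_super_sketch(1) == -5 ? 0 : 1;
}
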
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 5ec59b20cf76..46741970371b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2125,6 +2125,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
2125 mutex_unlock(&dqopt->dqio_mutex); 2125 mutex_unlock(&dqopt->dqio_mutex);
2126 goto out_file_init; 2126 goto out_file_init;
2127 } 2127 }
2128 if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
2129 dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
2128 mutex_unlock(&dqopt->dqio_mutex); 2130 mutex_unlock(&dqopt->dqio_mutex);
2129 spin_lock(&dq_state_lock); 2131 spin_lock(&dq_state_lock);
2130 dqopt->flags |= dquot_state_flag(flags, type); 2132 dqopt->flags |= dquot_state_flag(flags, type);
@@ -2464,7 +2466,7 @@ int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2464 spin_lock(&dq_data_lock); 2466 spin_lock(&dq_data_lock);
2465 ii->dqi_bgrace = mi->dqi_bgrace; 2467 ii->dqi_bgrace = mi->dqi_bgrace;
2466 ii->dqi_igrace = mi->dqi_igrace; 2468 ii->dqi_igrace = mi->dqi_igrace;
2467 ii->dqi_flags = mi->dqi_flags & DQF_MASK; 2469 ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK;
2468 ii->dqi_valid = IIF_ALL; 2470 ii->dqi_valid = IIF_ALL;
2469 spin_unlock(&dq_data_lock); 2471 spin_unlock(&dq_data_lock);
2470 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2472 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
@@ -2490,8 +2492,8 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2490 if (ii->dqi_valid & IIF_IGRACE) 2492 if (ii->dqi_valid & IIF_IGRACE)
2491 mi->dqi_igrace = ii->dqi_igrace; 2493 mi->dqi_igrace = ii->dqi_igrace;
2492 if (ii->dqi_valid & IIF_FLAGS) 2494 if (ii->dqi_valid & IIF_FLAGS)
2493 mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | 2495 mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) |
2494 (ii->dqi_flags & DQF_MASK); 2496 (ii->dqi_flags & DQF_SETINFO_MASK);
2495 spin_unlock(&dq_data_lock); 2497 spin_unlock(&dq_data_lock);
2496 mark_info_dirty(sb, type); 2498 mark_info_dirty(sb, type);
2497 /* Force write to disk */ 2499 /* Force write to disk */
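
The dquot hunks split the single DQF_MASK into separate get and set masks so that kernel-internal bits such as the new DQF_SYS_FILE are reported to userspace but can never be toggled by it. A sketch of the idea with invented bit values:

#include <stdio.h>

#define DQF_ROOT_SQUASH 0x01u	/* user-settable */
#define DQF_SYS_FILE    0x02u	/* kernel-internal: visible, not settable */

#define GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE)
#define SETINFO_MASK (DQF_ROOT_SQUASH)

int main(void)
{
	unsigned int mi_flags = DQF_SYS_FILE;	/* in-core state */
	unsigned int user_flags = DQF_ROOT_SQUASH | DQF_SYS_FILE;

	/* get: both bits are exposed to userspace */
	printf("get -> %#x\n", mi_flags & GETINFO_MASK);

	/* set: only user-settable bits change; DQF_SYS_FILE survives */
	mi_flags = (mi_flags & ~SETINFO_MASK) | (user_flags & SETINFO_MASK);
	printf("set -> %#x\n", mi_flags);
	return 0;
}
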
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index a945cd265228..70de42f09f1d 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1364,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
1364 struct reiserfs_bitmap_info *bitmap; 1364 struct reiserfs_bitmap_info *bitmap;
1365 unsigned int bmap_nr = reiserfs_bmap_count(sb); 1365 unsigned int bmap_nr = reiserfs_bmap_count(sb);
1366 1366
1367 /* Avoid lock recursion in fault case */
1368 reiserfs_write_unlock(sb);
1369 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); 1367 bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
1370 reiserfs_write_lock(sb);
1371 if (bitmap == NULL) 1368 if (bitmap == NULL)
1372 return -ENOMEM; 1369 return -ENOMEM;
1373 1370
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index eb711060a6f2..c3cf54fd4de3 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2678 char b[BDEVNAME_SIZE]; 2678 char b[BDEVNAME_SIZE];
2679 int ret; 2679 int ret;
2680 2680
2681 /*
2682 * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
2683 * dependency inversion warnings.
2684 */
2685 reiserfs_write_unlock(sb);
2686 journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); 2681 journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
2687 if (!journal) { 2682 if (!journal) {
2688 reiserfs_warning(sb, "journal-1256", 2683 reiserfs_warning(sb, "journal-1256",
2689 "unable to get memory for journal structure"); 2684 "unable to get memory for journal structure");
2690 reiserfs_write_lock(sb);
2691 return 1; 2685 return 1;
2692 } 2686 }
2693 INIT_LIST_HEAD(&journal->j_bitmap_nodes); 2687 INIT_LIST_HEAD(&journal->j_bitmap_nodes);
@@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2695 INIT_LIST_HEAD(&journal->j_working_list); 2689 INIT_LIST_HEAD(&journal->j_working_list);
2696 INIT_LIST_HEAD(&journal->j_journal_list); 2690 INIT_LIST_HEAD(&journal->j_journal_list);
2697 journal->j_persistent_trans = 0; 2691 journal->j_persistent_trans = 0;
2698 ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, 2692 if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
2699 reiserfs_bmap_count(sb)); 2693 reiserfs_bmap_count(sb)))
2700 reiserfs_write_lock(sb);
2701 if (ret)
2702 goto free_and_return; 2694 goto free_and_return;
2703 2695
2704 allocate_bitmap_nodes(sb); 2696 allocate_bitmap_nodes(sb);
@@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2727 goto free_and_return; 2719 goto free_and_return;
2728 } 2720 }
2729 2721
2730 /*
2731 * We need to unlock here to avoid creating the following
2732 * dependency:
2733 * reiserfs_lock -> sysfs_mutex
2734 * Because the reiserfs mmap path creates the following dependency:
2735 * mm->mmap -> reiserfs_lock, hence we have
2736 * mm->mmap -> reiserfs_lock ->sysfs_mutex
2737 * This would ends up in a circular dependency with sysfs readdir path
2738 * which does sysfs_mutex -> mm->mmap_sem
2739 * This is fine because the reiserfs lock is useless in mount path,
2740 * at least until we call journal_begin. We keep it for paranoid
2741 * reasons.
2742 */
2743 reiserfs_write_unlock(sb);
2744 if (journal_init_dev(sb, journal, j_dev_name) != 0) { 2722 if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2745 reiserfs_write_lock(sb);
2746 reiserfs_warning(sb, "sh-462", 2723 reiserfs_warning(sb, "sh-462",
2747 "unable to initialize jornal device"); 2724 "unable to initialize jornal device");
2748 goto free_and_return; 2725 goto free_and_return;
2749 } 2726 }
2750 reiserfs_write_lock(sb);
2751 2727
2752 rs = SB_DISK_SUPER_BLOCK(sb); 2728 rs = SB_DISK_SUPER_BLOCK(sb);
2753 2729
@@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2829 journal->j_mount_id = 10; 2805 journal->j_mount_id = 10;
2830 journal->j_state = 0; 2806 journal->j_state = 0;
2831 atomic_set(&(journal->j_jlock), 0); 2807 atomic_set(&(journal->j_jlock), 0);
2832 reiserfs_write_unlock(sb);
2833 journal->j_cnode_free_list = allocate_cnodes(num_cnodes); 2808 journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
2834 reiserfs_write_lock(sb);
2835 journal->j_cnode_free_orig = journal->j_cnode_free_list; 2809 journal->j_cnode_free_orig = journal->j_cnode_free_list;
2836 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; 2810 journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
2837 journal->j_cnode_used = 0; 2811 journal->j_cnode_used = 0;
@@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2848 2822
2849 init_journal_hash(sb); 2823 init_journal_hash(sb);
2850 jl = journal->j_current_jl; 2824 jl = journal->j_current_jl;
2825
2826 /*
2827 * get_list_bitmap() may call flush_commit_list() which
2828 * requires the lock. Calling flush_commit_list() shouldn't happen
2829 * this early but I like to be paranoid.
2830 */
2831 reiserfs_write_lock(sb);
2851 jl->j_list_bitmap = get_list_bitmap(sb, jl); 2832 jl->j_list_bitmap = get_list_bitmap(sb, jl);
2833 reiserfs_write_unlock(sb);
2852 if (!jl->j_list_bitmap) { 2834 if (!jl->j_list_bitmap) {
2853 reiserfs_warning(sb, "journal-2005", 2835 reiserfs_warning(sb, "journal-2005",
2854 "get_list_bitmap failed for journal list 0"); 2836 "get_list_bitmap failed for journal list 0");
2855 goto free_and_return; 2837 goto free_and_return;
2856 } 2838 }
2857 if (journal_read(sb) < 0) { 2839
2840 /*
2841 * journal_read() needs to be inspected in order to push down
2842 * the lock further inside (or even remove it).
2843 */
2844 reiserfs_write_lock(sb);
2845 ret = journal_read(sb);
2846 reiserfs_write_unlock(sb);
2847 if (ret < 0) {
2858 reiserfs_warning(sb, "reiserfs-2006", 2848 reiserfs_warning(sb, "reiserfs-2006",
2859 "Replay Failure, unable to mount"); 2849 "Replay Failure, unable to mount");
2860 goto free_and_return; 2850 goto free_and_return;
2861 } 2851 }
2862 2852
2863 reiserfs_mounted_fs_count++; 2853 reiserfs_mounted_fs_count++;
2864 if (reiserfs_mounted_fs_count <= 1) { 2854 if (reiserfs_mounted_fs_count <= 1)
2865 reiserfs_write_unlock(sb);
2866 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); 2855 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2867 reiserfs_write_lock(sb);
2868 }
2869 2856
2870 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); 2857 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2871 journal->j_work_sb = sb; 2858 journal->j_work_sb = sb;
@@ -2896,14 +2883,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
2896 journal->j_cnode_free < (journal->j_trans_max * 3)) { 2883 journal->j_cnode_free < (journal->j_trans_max * 3)) {
2897 return 1; 2884 return 1;
2898 } 2885 }
2899 /* protected by the BKL here */ 2886
2900 journal->j_len_alloc += new_alloc; 2887 journal->j_len_alloc += new_alloc;
2901 th->t_blocks_allocated += new_alloc ; 2888 th->t_blocks_allocated += new_alloc ;
2902 return 0; 2889 return 0;
2903} 2890}
2904 2891
2905/* this must be called inside a transaction, and requires the 2892/* this must be called inside a transaction
2906** kernel_lock to be held
2907*/ 2893*/
2908void reiserfs_block_writes(struct reiserfs_transaction_handle *th) 2894void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
2909{ 2895{
@@ -2914,8 +2900,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
2914 return; 2900 return;
2915} 2901}
2916 2902
2917/* this must be called without a transaction started, and does not 2903/* this must be called without a transaction started
2918** require BKL
2919*/ 2904*/
2920void reiserfs_allow_writes(struct super_block *s) 2905void reiserfs_allow_writes(struct super_block *s)
2921{ 2906{
@@ -2924,8 +2909,7 @@ void reiserfs_allow_writes(struct super_block *s)
2924 wake_up(&journal->j_join_wait); 2909 wake_up(&journal->j_join_wait);
2925} 2910}
2926 2911
2927/* this must be called without a transaction started, and does not 2912/* this must be called without a transaction started
2928** require BKL
2929*/ 2913*/
2930void reiserfs_wait_on_write_block(struct super_block *s) 2914void reiserfs_wait_on_write_block(struct super_block *s)
2931{ 2915{
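
The journal_init() hunks above push the big reiserfs write lock down so that allocations and device setup run unlocked, and only the calls that genuinely still need it (get_list_bitmap(), journal_read()) take it briefly. A userspace sketch of the same pushdown using a pthread mutex (compile with -lpthread; all names are stand-ins):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;

static int journal_read_sketch(void)
{
	/* stand-in for journal_read(sb); still expects the lock held */
	return 0;
}

static int journal_init_sketch(void)
{
	void *journal;
	int ret;

	/* allocation can sleep and reclaim memory: do it unlocked */
	journal = calloc(1, 4096);
	if (!journal)
		return 1;

	/* only the replay step needs the big lock, so take it just there */
	pthread_mutex_lock(&fs_lock);
	ret = journal_read_sketch();
	pthread_mutex_unlock(&fs_lock);

	free(journal);
	return ret < 0 ? 1 : 0;
}

int main(void)
{
	return journal_init_sketch();
}
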
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 19c454e61b79..e12d8b97cd4d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -455,16 +455,20 @@ int remove_save_link(struct inode *inode, int truncate)
455static void reiserfs_kill_sb(struct super_block *s) 455static void reiserfs_kill_sb(struct super_block *s)
456{ 456{
457 if (REISERFS_SB(s)) { 457 if (REISERFS_SB(s)) {
458 if (REISERFS_SB(s)->xattr_root) { 458 /*
459 d_invalidate(REISERFS_SB(s)->xattr_root); 459 * Force any pending inode evictions to occur now. Any
460 dput(REISERFS_SB(s)->xattr_root); 460 * inodes to be removed that have extended attributes
461 REISERFS_SB(s)->xattr_root = NULL; 461 * associated with them need to clean them up before
462 } 462 * we can release the extended attribute root dentries.
463 if (REISERFS_SB(s)->priv_root) { 463 * shrink_dcache_for_umount will BUG if we don't release
464 d_invalidate(REISERFS_SB(s)->priv_root); 464 * those before it's called so ->put_super is too late.
465 dput(REISERFS_SB(s)->priv_root); 465 */
466 REISERFS_SB(s)->priv_root = NULL; 466 shrink_dcache_sb(s);
467 } 467
468 dput(REISERFS_SB(s)->xattr_root);
469 REISERFS_SB(s)->xattr_root = NULL;
470 dput(REISERFS_SB(s)->priv_root);
471 REISERFS_SB(s)->priv_root = NULL;
468 } 472 }
469 473
470 kill_block_super(s); 474 kill_block_super(s);
@@ -1249,7 +1253,8 @@ static void handle_quota_files(struct super_block *s, char **qf_names,
1249 kfree(REISERFS_SB(s)->s_qf_names[i]); 1253 kfree(REISERFS_SB(s)->s_qf_names[i]);
1250 REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; 1254 REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
1251 } 1255 }
1252 REISERFS_SB(s)->s_jquota_fmt = *qfmt; 1256 if (*qfmt)
1257 REISERFS_SB(s)->s_jquota_fmt = *qfmt;
1253} 1258}
1254#endif 1259#endif
1255 1260
@@ -1514,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset)
1514static int reread_meta_blocks(struct super_block *s) 1519static int reread_meta_blocks(struct super_block *s)
1515{ 1520{
1516 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); 1521 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
1517 reiserfs_write_unlock(s);
1518 wait_on_buffer(SB_BUFFER_WITH_SB(s)); 1522 wait_on_buffer(SB_BUFFER_WITH_SB(s));
1519 reiserfs_write_lock(s);
1520 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { 1523 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
1521 reiserfs_warning(s, "reiserfs-2504", "error reading the super"); 1524 reiserfs_warning(s, "reiserfs-2504", "error reading the super");
1522 return 1; 1525 return 1;
@@ -1741,22 +1744,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1741 mutex_init(&REISERFS_SB(s)->lock); 1744 mutex_init(&REISERFS_SB(s)->lock);
1742 REISERFS_SB(s)->lock_depth = -1; 1745 REISERFS_SB(s)->lock_depth = -1;
1743 1746
1744 /*
1745 * This function is called with the bkl, which also was the old
1746 * locking used here.
1747 * do_journal_begin() will soon check if we hold the lock (ie: was the
1748 * bkl). This is likely because do_journal_begin() has several another
1749 * callers because at this time, it doesn't seem to be necessary to
1750 * protect against anything.
1751 * Anyway, let's be conservative and lock for now.
1752 */
1753 reiserfs_write_lock(s);
1754
1755 jdev_name = NULL; 1747 jdev_name = NULL;
1756 if (reiserfs_parse_options 1748 if (reiserfs_parse_options
1757 (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, 1749 (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
1758 &commit_max_age, qf_names, &qfmt) == 0) { 1750 &commit_max_age, qf_names, &qfmt) == 0) {
1759 goto error; 1751 goto error_unlocked;
1760 } 1752 }
1761 if (jdev_name && jdev_name[0]) { 1753 if (jdev_name && jdev_name[0]) {
1762 REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); 1754 REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
@@ -1772,7 +1764,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1772 1764
1773 if (blocks) { 1765 if (blocks) {
1774 SWARN(silent, s, "jmacd-7", "resize option for remount only"); 1766 SWARN(silent, s, "jmacd-7", "resize option for remount only");
1775 goto error; 1767 goto error_unlocked;
1776 } 1768 }
1777 1769
1778 /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ 1770 /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
@@ -1782,7 +1774,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1782 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { 1774 else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
1783 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", 1775 SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
1784 reiserfs_bdevname(s)); 1776 reiserfs_bdevname(s));
1785 goto error; 1777 goto error_unlocked;
1786 } 1778 }
1787 1779
1788 rs = SB_DISK_SUPER_BLOCK(s); 1780 rs = SB_DISK_SUPER_BLOCK(s);
@@ -1798,7 +1790,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1798 "or increase size of your LVM partition"); 1790 "or increase size of your LVM partition");
1799 SWARN(silent, s, "", "Or may be you forgot to " 1791 SWARN(silent, s, "", "Or may be you forgot to "
1800 "reboot after fdisk when it told you to"); 1792 "reboot after fdisk when it told you to");
1801 goto error; 1793 goto error_unlocked;
1802 } 1794 }
1803 1795
1804 sbi->s_mount_state = SB_REISERFS_STATE(s); 1796 sbi->s_mount_state = SB_REISERFS_STATE(s);
@@ -1806,8 +1798,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1806 1798
1807 if ((errval = reiserfs_init_bitmap_cache(s))) { 1799 if ((errval = reiserfs_init_bitmap_cache(s))) {
1808 SWARN(silent, s, "jmacd-8", "unable to read bitmap"); 1800 SWARN(silent, s, "jmacd-8", "unable to read bitmap");
1809 goto error; 1801 goto error_unlocked;
1810 } 1802 }
1803
1811 errval = -EINVAL; 1804 errval = -EINVAL;
1812#ifdef CONFIG_REISERFS_CHECK 1805#ifdef CONFIG_REISERFS_CHECK
1813 SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); 1806 SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON");
@@ -1830,24 +1823,26 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1830 if (reiserfs_barrier_flush(s)) { 1823 if (reiserfs_barrier_flush(s)) {
1831 printk("reiserfs: using flush barriers\n"); 1824 printk("reiserfs: using flush barriers\n");
1832 } 1825 }
1826
1833 // set_device_ro(s->s_dev, 1) ; 1827 // set_device_ro(s->s_dev, 1) ;
1834 if (journal_init(s, jdev_name, old_format, commit_max_age)) { 1828 if (journal_init(s, jdev_name, old_format, commit_max_age)) {
1835 SWARN(silent, s, "sh-2022", 1829 SWARN(silent, s, "sh-2022",
1836 "unable to initialize journal space"); 1830 "unable to initialize journal space");
1837 goto error; 1831 goto error_unlocked;
1838 } else { 1832 } else {
1839 jinit_done = 1; /* once this is set, journal_release must be called 1833 jinit_done = 1; /* once this is set, journal_release must be called
1840 ** if we error out of the mount 1834 ** if we error out of the mount
1841 */ 1835 */
1842 } 1836 }
1837
1843 if (reread_meta_blocks(s)) { 1838 if (reread_meta_blocks(s)) {
1844 SWARN(silent, s, "jmacd-9", 1839 SWARN(silent, s, "jmacd-9",
1845 "unable to reread meta blocks after journal init"); 1840 "unable to reread meta blocks after journal init");
1846 goto error; 1841 goto error_unlocked;
1847 } 1842 }
1848 1843
1849 if (replay_only(s)) 1844 if (replay_only(s))
1850 goto error; 1845 goto error_unlocked;
1851 1846
1852 if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { 1847 if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
1853 SWARN(silent, s, "clm-7000", 1848 SWARN(silent, s, "clm-7000",
@@ -1861,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1861 reiserfs_init_locked_inode, (void *)(&args)); 1856 reiserfs_init_locked_inode, (void *)(&args));
1862 if (!root_inode) { 1857 if (!root_inode) {
1863 SWARN(silent, s, "jmacd-10", "get root inode failed"); 1858 SWARN(silent, s, "jmacd-10", "get root inode failed");
1864 goto error; 1859 goto error_unlocked;
1865 } 1860 }
1866 1861
1862 /*
1863 * This path was assumed to be called with the BKL held in the old days.
1864 * Now we have inherited the big reiserfs lock from it and many
1865 * reiserfs helpers called in the mount path and elsewhere require
1866 * this lock to be held even if it's not always necessary. Let's be
1867 * conservative and hold it early. The window can be reduced after
1868 * careful review of the code.
1869 */
1870 reiserfs_write_lock(s);
1871
1867 if (root_inode->i_state & I_NEW) { 1872 if (root_inode->i_state & I_NEW) {
1868 reiserfs_read_locked_inode(root_inode, &args); 1873 reiserfs_read_locked_inode(root_inode, &args);
1869 unlock_new_inode(root_inode); 1874 unlock_new_inode(root_inode);
@@ -1990,12 +1995,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1990 return (0); 1995 return (0);
1991 1996
1992error: 1997error:
1993 if (jinit_done) { /* kill the commit thread, free journal ram */ 1998 reiserfs_write_unlock(s);
1999
2000error_unlocked:
2001 /* kill the commit thread, free journal ram */
2002 if (jinit_done) {
2003 reiserfs_write_lock(s);
1994 journal_release_error(NULL, s); 2004 journal_release_error(NULL, s);
2005 reiserfs_write_unlock(s);
1995 } 2006 }
1996 2007
1997 reiserfs_write_unlock(s);
1998
1999 reiserfs_free_bitmap_cache(s); 2008 reiserfs_free_bitmap_cache(s);
2000 if (SB_BUFFER_WITH_SB(s)) 2009 if (SB_BUFFER_WITH_SB(s))
2001 brelse(SB_BUFFER_WITH_SB(s)); 2010 brelse(SB_BUFFER_WITH_SB(s));
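
reiserfs_fill_super() above gains a second error label: failures before the write lock is taken jump to error_unlocked, while later failures fall through error and drop the lock exactly once. A minimal sketch of the two-label pattern (compile with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t s_lock = PTHREAD_MUTEX_INITIALIZER;

static int fill_super_sketch(int fail_early, int fail_late)
{
	if (fail_early)
		goto error_unlocked;	/* lock not taken yet */

	pthread_mutex_lock(&s_lock);

	if (fail_late)
		goto error;		/* lock held: must unlock first */

	pthread_mutex_unlock(&s_lock);
	return 0;

error:
	pthread_mutex_unlock(&s_lock);
error_unlocked:
	fprintf(stderr, "mount failed\n");
	return -1;
}

int main(void)
{
	return fill_super_sketch(0, 1) == -1 ? 0 : 1;
}
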
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index eed99428f104..e1a7779dd3cb 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -28,9 +28,10 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
28 struct inode *inode = file->f_mapping->host; 28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd; 29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset, maxpages, lpages; 30 unsigned long isize, offset, maxpages, lpages;
31 int ret;
31 32
32 if (!mtd) 33 if (!mtd)
33 goto cant_map_directly; 34 return (unsigned long) -ENOSYS;
34 35
35 /* the mapping mustn't extend beyond the EOF */ 36 /* the mapping mustn't extend beyond the EOF */
36 lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 37 lpages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -41,23 +42,20 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
41 if ((pgoff >= maxpages) || (maxpages - pgoff < lpages)) 42 if ((pgoff >= maxpages) || (maxpages - pgoff < lpages))
42 return (unsigned long) -EINVAL; 43 return (unsigned long) -EINVAL;
43 44
44 /* we need to call down to the MTD layer to do the actual mapping */ 45 if (addr != 0)
45 if (mtd->get_unmapped_area) { 46 return (unsigned long) -EINVAL;
46 if (addr != 0)
47 return (unsigned long) -EINVAL;
48
49 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
50 return (unsigned long) -EINVAL;
51 47
52 offset += ROMFS_I(inode)->i_dataoffset; 48 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
53 if (offset > mtd->size - len) 49 return (unsigned long) -EINVAL;
54 return (unsigned long) -EINVAL;
55 50
56 return mtd->get_unmapped_area(mtd, len, offset, flags); 51 offset += ROMFS_I(inode)->i_dataoffset;
57 } 52 if (offset > mtd->size - len)
53 return (unsigned long) -EINVAL;
58 54
59cant_map_directly: 55 ret = mtd_get_unmapped_area(mtd, len, offset, flags);
60 return (unsigned long) -ENOSYS; 56 if (ret == -EOPNOTSUPP)
57 ret = -ENOSYS;
58 return (unsigned long) ret;
61} 59}
62 60
63/* 61/*
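
The romfs hunk replaces the open-coded ->get_unmapped_area check with the mtd_get_unmapped_area() wrapper and translates its -EOPNOTSUPP into the -ENOSYS this hook is expected to return. A sketch of that errno translation, with a stub standing in for the MTD call:

#include <errno.h>
#include <stdio.h>

static long mtd_get_unmapped_area_stub(void)
{
	return -EOPNOTSUPP;		/* hypothetical: no direct mapping */
}

static unsigned long get_unmapped_area_sketch(void)
{
	long ret = mtd_get_unmapped_area_stub();

	if (ret == -EOPNOTSUPP)		/* callers expect -ENOSYS here */
		ret = -ENOSYS;
	return (unsigned long)ret;
}

int main(void)
{
	printf("%ld\n", (long)get_unmapped_area_sketch());
	return 0;
}
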
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f744be98cd5a..af0b73802592 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -70,11 +70,15 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
70 spin_lock(&cache->lock); 70 spin_lock(&cache->lock);
71 71
72 while (1) { 72 while (1) {
73 for (i = 0; i < cache->entries; i++) 73 for (i = cache->curr_blk, n = 0; n < cache->entries; n++) {
74 if (cache->entry[i].block == block) 74 if (cache->entry[i].block == block) {
75 cache->curr_blk = i;
75 break; 76 break;
77 }
78 i = (i + 1) % cache->entries;
79 }
76 80
77 if (i == cache->entries) { 81 if (n == cache->entries) {
78 /* 82 /*
79 * Block not in cache, if all cache entries are used 83 * Block not in cache, if all cache entries are used
80 * go to sleep waiting for one to become available. 84 * go to sleep waiting for one to become available.
@@ -245,6 +249,7 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
245 goto cleanup; 249 goto cleanup;
246 } 250 }
247 251
252 cache->curr_blk = 0;
248 cache->next_blk = 0; 253 cache->next_blk = 0;
249 cache->unused = entries; 254 cache->unused = entries;
250 cache->entries = entries; 255 cache->entries = entries;
@@ -332,17 +337,20 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
332 u64 *block, int *offset, int length) 337 u64 *block, int *offset, int length)
333{ 338{
334 struct squashfs_sb_info *msblk = sb->s_fs_info; 339 struct squashfs_sb_info *msblk = sb->s_fs_info;
335 int bytes, copied = length; 340 int bytes, res = length;
336 struct squashfs_cache_entry *entry; 341 struct squashfs_cache_entry *entry;
337 342
338 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); 343 TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset);
339 344
340 while (length) { 345 while (length) {
341 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); 346 entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0);
342 if (entry->error) 347 if (entry->error) {
343 return entry->error; 348 res = entry->error;
344 else if (*offset >= entry->length) 349 goto error;
345 return -EIO; 350 } else if (*offset >= entry->length) {
351 res = -EIO;
352 goto error;
353 }
346 354
347 bytes = squashfs_copy_data(buffer, entry, *offset, length); 355 bytes = squashfs_copy_data(buffer, entry, *offset, length);
348 if (buffer) 356 if (buffer)
@@ -358,7 +366,11 @@ int squashfs_read_metadata(struct super_block *sb, void *buffer,
358 squashfs_cache_put(entry); 366 squashfs_cache_put(entry);
359 } 367 }
360 368
361 return copied; 369 return res;
370
371error:
372 squashfs_cache_put(entry);
373 return res;
362} 374}
363 375
364 376
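
The squashfs cache hunks turn the linear lookup into a circular scan that starts at the most recent hit (curr_blk), which pays off when consecutive reads land on neighbouring cache slots. A sketch of the wrap-around scan, visiting each of the slots exactly once:

#include <stdio.h>

#define ENTRIES 8

int main(void)
{
	long cache[ENTRIES] = { 3, 9, 27, 81, 243, 729, 2187, 6561 };
	int curr_blk = 5;		/* remembered from the last lookup */
	long block = 2187;
	int i = curr_blk, n;

	for (n = 0; n < ENTRIES; n++) {
		if (cache[i] == block) {
			curr_blk = i;	/* remember the hit for next time */
			break;
		}
		i = (i + 1) % ENTRIES;	/* wrap past the end of the array */
	}
	if (n == ENTRIES)
		printf("miss\n");
	else
		printf("hit at slot %d\n", i);
	return 0;
}
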
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index fd7b3b3bda13..81afbccfa843 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -208,8 +208,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
208 inode->i_op = &squashfs_inode_ops; 208 inode->i_op = &squashfs_inode_ops;
209 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
210 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
211 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = (inode->i_size -
212 le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; 212 le64_to_cpu(sqsh_ino->sparse) + 511) >> 9;
213 213
214 squashfs_i(inode)->fragment_block = frag_blk; 214 squashfs_i(inode)->fragment_block = frag_blk;
215 squashfs_i(inode)->fragment_size = frag_size; 215 squashfs_i(inode)->fragment_size = frag_size;
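
The inode.c hunk swaps ((size - sparse - 1) >> 9) + 1 for (size - sparse + 511) >> 9. Both round up to 512-byte blocks for non-zero sizes, but the old form underflows when size - sparse is 0 (a fully sparse file), yielding an enormous block count instead of 0. A quick check of the two formulas:

#include <stdio.h>

int main(void)
{
	unsigned long long sizes[] = { 0, 1, 511, 512, 513 };

	for (int i = 0; i < 5; i++) {
		unsigned long long s = sizes[i];
		unsigned long long newf = (s + 511) >> 9;
		/* old form: ((s - 1) >> 9) + 1 -- wraps around when s == 0 */
		unsigned long long oldf = ((s - 1) >> 9) + 1;

		printf("size %llu: new %llu old %llu\n", s, newf, oldf);
	}
	return 0;
}
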
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 651f0b31d296..52934a22f296 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -28,6 +28,7 @@
28struct squashfs_cache { 28struct squashfs_cache {
29 char *name; 29 char *name;
30 int entries; 30 int entries;
31 int curr_blk;
31 int next_blk; 32 int next_blk;
32 int num_waiters; 33 int num_waiters;
33 int unused; 34 int unused;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index d0858c2d9a47..ecaa2f7bdb8f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -290,7 +290,7 @@ handle_fragments:
290 290
291check_directory_table: 291check_directory_table:
292 /* Sanity check directory_table */ 292 /* Sanity check directory_table */
293 if (msblk->directory_table >= next_table) { 293 if (msblk->directory_table > next_table) {
294 err = -EINVAL; 294 err = -EINVAL;
295 goto failed_mount; 295 goto failed_mount;
296 } 296 }
diff --git a/fs/super.c b/fs/super.c
index de41e1e46f09..6015c02296b7 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1186,6 +1186,8 @@ int freeze_super(struct super_block *sb)
1186 printk(KERN_ERR 1186 printk(KERN_ERR
1187 "VFS:Filesystem freeze failed\n"); 1187 "VFS:Filesystem freeze failed\n");
1188 sb->s_frozen = SB_UNFROZEN; 1188 sb->s_frozen = SB_UNFROZEN;
1189 smp_wmb();
1190 wake_up(&sb->s_wait_unfrozen);
1189 deactivate_locked_super(sb); 1191 deactivate_locked_super(sb);
1190 return ret; 1192 return ret;
1191 } 1193 }
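
The freeze_super() hunk stores SB_UNFROZEN and issues smp_wmb() before waking anyone sleeping on s_wait_unfrozen, so a waiter cannot observe the wakeup without also seeing the new state. In userspace the mutex/condvar pair provides that publish-then-wake ordering; a sketch (compile with -lpthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unfrozen = PTHREAD_COND_INITIALIZER;
static int s_frozen = 1;

static void *waiter(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (s_frozen)			/* sleep until thawed */
		pthread_cond_wait(&unfrozen, &lock);
	pthread_mutex_unlock(&lock);
	puts("thawed");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	pthread_mutex_lock(&lock);
	s_frozen = 0;				/* publish the new state... */
	pthread_cond_broadcast(&unfrozen);	/* ...then wake the sleepers */
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}
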
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 62f4fb37789e..00012e31829d 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -493,6 +493,12 @@ int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
493 const void *ns = NULL; 493 const void *ns = NULL;
494 int err; 494 int err;
495 495
496 if (!dir_sd) {
497 WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
498 kobject_name(kobj));
499 return -ENOENT;
500 }
501
496 err = 0; 502 err = 0;
497 if (!sysfs_ns_type(dir_sd)) 503 if (!sysfs_ns_type(dir_sd))
498 goto out; 504 goto out;
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 4a802b4a9056..85eb81683a29 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -318,8 +318,11 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
318 struct sysfs_addrm_cxt acxt; 318 struct sysfs_addrm_cxt acxt;
319 struct sysfs_dirent *sd; 319 struct sysfs_dirent *sd;
320 320
321 if (!dir_sd) 321 if (!dir_sd) {
322 WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
323 name);
322 return -ENOENT; 324 return -ENOENT;
325 }
323 326
324 sysfs_addrm_start(&acxt, dir_sd); 327 sysfs_addrm_start(&acxt, dir_sd);
325 328
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index b09ba2dd8b62..f922cbacdb96 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -38,9 +38,6 @@
38 38
39DEFINE_SPINLOCK(dbg_lock); 39DEFINE_SPINLOCK(dbg_lock);
40 40
41static char dbg_key_buf0[128];
42static char dbg_key_buf1[128];
43
44static const char *get_key_fmt(int fmt) 41static const char *get_key_fmt(int fmt)
45{ 42{
46 switch (fmt) { 43 switch (fmt) {
@@ -103,8 +100,8 @@ static const char *get_dent_type(int type)
103 } 100 }
104} 101}
105 102
106static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, 103const char *dbg_snprintf_key(const struct ubifs_info *c,
107 char *buffer) 104 const union ubifs_key *key, char *buffer, int len)
108{ 105{
109 char *p = buffer; 106 char *p = buffer;
110 int type = key_type(c, key); 107 int type = key_type(c, key);
@@ -112,45 +109,34 @@ static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
112 if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { 109 if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
113 switch (type) { 110 switch (type) {
114 case UBIFS_INO_KEY: 111 case UBIFS_INO_KEY:
115 sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key), 112 len -= snprintf(p, len, "(%lu, %s)",
116 get_key_type(type)); 113 (unsigned long)key_inum(c, key),
114 get_key_type(type));
117 break; 115 break;
118 case UBIFS_DENT_KEY: 116 case UBIFS_DENT_KEY:
119 case UBIFS_XENT_KEY: 117 case UBIFS_XENT_KEY:
120 sprintf(p, "(%lu, %s, %#08x)", 118 len -= snprintf(p, len, "(%lu, %s, %#08x)",
121 (unsigned long)key_inum(c, key), 119 (unsigned long)key_inum(c, key),
122 get_key_type(type), key_hash(c, key)); 120 get_key_type(type), key_hash(c, key));
123 break; 121 break;
124 case UBIFS_DATA_KEY: 122 case UBIFS_DATA_KEY:
125 sprintf(p, "(%lu, %s, %u)", 123 len -= snprintf(p, len, "(%lu, %s, %u)",
126 (unsigned long)key_inum(c, key), 124 (unsigned long)key_inum(c, key),
127 get_key_type(type), key_block(c, key)); 125 get_key_type(type), key_block(c, key));
128 break; 126 break;
129 case UBIFS_TRUN_KEY: 127 case UBIFS_TRUN_KEY:
130 sprintf(p, "(%lu, %s)", 128 len -= snprintf(p, len, "(%lu, %s)",
131 (unsigned long)key_inum(c, key), 129 (unsigned long)key_inum(c, key),
132 get_key_type(type)); 130 get_key_type(type));
133 break; 131 break;
134 default: 132 default:
135 sprintf(p, "(bad key type: %#08x, %#08x)", 133 len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
136 key->u32[0], key->u32[1]); 134 key->u32[0], key->u32[1]);
137 } 135 }
138 } else 136 } else
139 sprintf(p, "bad key format %d", c->key_fmt); 137 len -= snprintf(p, len, "bad key format %d", c->key_fmt);
140} 138 ubifs_assert(len > 0);
141 139 return p;
142const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
143{
144 /* dbg_lock must be held */
145 sprintf_key(c, key, dbg_key_buf0);
146 return dbg_key_buf0;
147}
148
149const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
150{
151 /* dbg_lock must be held */
152 sprintf_key(c, key, dbg_key_buf1);
153 return dbg_key_buf1;
154} 140}
155 141
156const char *dbg_ntype(int type) 142const char *dbg_ntype(int type)
@@ -319,6 +305,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
319 int i, n; 305 int i, n;
320 union ubifs_key key; 306 union ubifs_key key;
321 const struct ubifs_ch *ch = node; 307 const struct ubifs_ch *ch = node;
308 char key_buf[DBG_KEY_BUF_LEN];
322 309
323 if (dbg_is_tst_rcvry(c)) 310 if (dbg_is_tst_rcvry(c))
324 return; 311 return;
@@ -474,7 +461,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
474 const struct ubifs_ino_node *ino = node; 461 const struct ubifs_ino_node *ino = node;
475 462
476 key_read(c, &ino->key, &key); 463 key_read(c, &ino->key, &key);
477 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 464 printk(KERN_DEBUG "\tkey %s\n",
465 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
478 printk(KERN_DEBUG "\tcreat_sqnum %llu\n", 466 printk(KERN_DEBUG "\tcreat_sqnum %llu\n",
479 (unsigned long long)le64_to_cpu(ino->creat_sqnum)); 467 (unsigned long long)le64_to_cpu(ino->creat_sqnum));
480 printk(KERN_DEBUG "\tsize %llu\n", 468 printk(KERN_DEBUG "\tsize %llu\n",
@@ -517,7 +505,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
517 int nlen = le16_to_cpu(dent->nlen); 505 int nlen = le16_to_cpu(dent->nlen);
518 506
519 key_read(c, &dent->key, &key); 507 key_read(c, &dent->key, &key);
520 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 508 printk(KERN_DEBUG "\tkey %s\n",
509 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
521 printk(KERN_DEBUG "\tinum %llu\n", 510 printk(KERN_DEBUG "\tinum %llu\n",
522 (unsigned long long)le64_to_cpu(dent->inum)); 511 (unsigned long long)le64_to_cpu(dent->inum));
523 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); 512 printk(KERN_DEBUG "\ttype %d\n", (int)dent->type);
@@ -541,7 +530,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
541 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; 530 int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
542 531
543 key_read(c, &dn->key, &key); 532 key_read(c, &dn->key, &key);
544 printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); 533 printk(KERN_DEBUG "\tkey %s\n",
534 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
545 printk(KERN_DEBUG "\tsize %u\n", 535 printk(KERN_DEBUG "\tsize %u\n",
546 le32_to_cpu(dn->size)); 536 le32_to_cpu(dn->size));
547 printk(KERN_DEBUG "\tcompr_typ %d\n", 537 printk(KERN_DEBUG "\tcompr_typ %d\n",
@@ -582,7 +572,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
582 key_read(c, &br->key, &key); 572 key_read(c, &br->key, &key);
583 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", 573 printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
584 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), 574 i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
585 le32_to_cpu(br->len), DBGKEY(&key)); 575 le32_to_cpu(br->len),
576 dbg_snprintf_key(c, &key, key_buf,
577 DBG_KEY_BUF_LEN));
586 } 578 }
587 break; 579 break;
588 } 580 }
@@ -934,6 +926,7 @@ void dbg_dump_znode(const struct ubifs_info *c,
934{ 926{
935 int n; 927 int n;
936 const struct ubifs_zbranch *zbr; 928 const struct ubifs_zbranch *zbr;
929 char key_buf[DBG_KEY_BUF_LEN];
937 930
938 spin_lock(&dbg_lock); 931 spin_lock(&dbg_lock);
939 if (znode->parent) 932 if (znode->parent)
@@ -958,12 +951,16 @@ void dbg_dump_znode(const struct ubifs_info *c,
958 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " 951 printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
959 "%s\n", n, zbr->znode, zbr->lnum, 952 "%s\n", n, zbr->znode, zbr->lnum,
960 zbr->offs, zbr->len, 953 zbr->offs, zbr->len,
961 DBGKEY(&zbr->key)); 954 dbg_snprintf_key(c, &zbr->key,
955 key_buf,
956 DBG_KEY_BUF_LEN));
962 else 957 else
963 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " 958 printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
964 "%s\n", n, zbr->znode, zbr->lnum, 959 "%s\n", n, zbr->znode, zbr->lnum,
965 zbr->offs, zbr->len, 960 zbr->offs, zbr->len,
966 DBGKEY(&zbr->key)); 961 dbg_snprintf_key(c, &zbr->key,
962 key_buf,
963 DBG_KEY_BUF_LEN));
967 } 964 }
968 spin_unlock(&dbg_lock); 965 spin_unlock(&dbg_lock);
969} 966}
@@ -1260,6 +1257,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1260 int err, nlen1, nlen2, cmp; 1257 int err, nlen1, nlen2, cmp;
1261 struct ubifs_dent_node *dent1, *dent2; 1258 struct ubifs_dent_node *dent1, *dent2;
1262 union ubifs_key key; 1259 union ubifs_key key;
1260 char key_buf[DBG_KEY_BUF_LEN];
1263 1261
1264 ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key)); 1262 ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
1265 dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); 1263 dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
@@ -1290,9 +1288,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1290 key_read(c, &dent1->key, &key); 1288 key_read(c, &dent1->key, &key);
1291 if (keys_cmp(c, &zbr1->key, &key)) { 1289 if (keys_cmp(c, &zbr1->key, &key)) {
1292 dbg_err("1st entry at %d:%d has key %s", zbr1->lnum, 1290 dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
1293 zbr1->offs, DBGKEY(&key)); 1291 zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
1292 DBG_KEY_BUF_LEN));
1294 dbg_err("but it should have key %s according to tnc", 1293 dbg_err("but it should have key %s according to tnc",
1295 DBGKEY(&zbr1->key)); 1294 dbg_snprintf_key(c, &zbr1->key, key_buf,
1295 DBG_KEY_BUF_LEN));
1296 dbg_dump_node(c, dent1); 1296 dbg_dump_node(c, dent1);
1297 goto out_free; 1297 goto out_free;
1298 } 1298 }
@@ -1300,9 +1300,11 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1300 key_read(c, &dent2->key, &key); 1300 key_read(c, &dent2->key, &key);
1301 if (keys_cmp(c, &zbr2->key, &key)) { 1301 if (keys_cmp(c, &zbr2->key, &key)) {
1302 dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum, 1302 dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
1303 zbr1->offs, DBGKEY(&key)); 1303 zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
1304 DBG_KEY_BUF_LEN));
1304 dbg_err("but it should have key %s according to tnc", 1305 dbg_err("but it should have key %s according to tnc",
1305 DBGKEY(&zbr2->key)); 1306 dbg_snprintf_key(c, &zbr2->key, key_buf,
1307 DBG_KEY_BUF_LEN));
1306 dbg_dump_node(c, dent2); 1308 dbg_dump_node(c, dent2);
1307 goto out_free; 1309 goto out_free;
1308 } 1310 }
@@ -1319,7 +1321,7 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
1319 dbg_err("2 xent/dent nodes with the same name"); 1321 dbg_err("2 xent/dent nodes with the same name");
1320 else 1322 else
1321 dbg_err("bad order of colliding key %s", 1323 dbg_err("bad order of colliding key %s",
1322 DBGKEY(&key)); 1324 dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
1323 1325
1324 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); 1326 ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
1325 dbg_dump_node(c, dent1); 1327 dbg_dump_node(c, dent1);
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 8d9c46810189..ad1a6fee6010 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -169,40 +169,39 @@ struct ubifs_global_debug_info {
169 spin_unlock(&dbg_lock); \ 169 spin_unlock(&dbg_lock); \
170} while (0) 170} while (0)
171 171
172const char *dbg_key_str0(const struct ubifs_info *c, 172#define ubifs_dbg_msg(type, fmt, ...) \
173 const union ubifs_key *key); 173 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
174const char *dbg_key_str1(const struct ubifs_info *c, 174
175 const union ubifs_key *key); 175#define DBG_KEY_BUF_LEN 32
176 176#define ubifs_dbg_msg_key(type, key, fmt, ...) do { \
177/* 177 char __tmp_key_buf[DBG_KEY_BUF_LEN]; \
178 * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message 178 pr_debug("UBIFS DBG " type ": " fmt "%s\n", ##__VA_ARGS__, \
179 * macros. 179 dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN)); \
180 */
181#define DBGKEY(key) dbg_key_str0(c, (key))
182#define DBGKEY1(key) dbg_key_str1(c, (key))
183
184extern spinlock_t dbg_lock;
185
186#define ubifs_dbg_msg(type, fmt, ...) do { \
187 spin_lock(&dbg_lock); \
188 pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \
189 spin_unlock(&dbg_lock); \
190} while (0) 180} while (0)
191 181
192/* Just debugging messages not related to any specific UBIFS subsystem */ 182/* Just debugging messages not related to any specific UBIFS subsystem */
193#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) 183#define dbg_msg(fmt, ...) \
184 printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \
185 __func__, ##__VA_ARGS__)
186
194/* General messages */ 187/* General messages */
195#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) 188#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
196/* Additional journal messages */ 189/* Additional journal messages */
197#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) 190#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
191#define dbg_jnlk(key, fmt, ...) \
192 ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
198/* Additional TNC messages */ 193/* Additional TNC messages */
199#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) 194#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
195#define dbg_tnck(key, fmt, ...) \
196 ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
200/* Additional lprops messages */ 197/* Additional lprops messages */
201#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) 198#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
202/* Additional LEB find messages */ 199/* Additional LEB find messages */
203#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) 200#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
204/* Additional mount messages */ 201/* Additional mount messages */
205#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) 202#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
203#define dbg_mntk(key, fmt, ...) \
204 ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
206/* Additional I/O messages */ 205/* Additional I/O messages */
207#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) 206#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
208/* Additional commit messages */ 207/* Additional commit messages */
@@ -218,6 +217,7 @@ extern spinlock_t dbg_lock;
218/* Additional recovery messages */ 217/* Additional recovery messages */
219#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) 218#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__)
220 219
220extern spinlock_t dbg_lock;
221extern struct ubifs_global_debug_info ubifs_dbg; 221extern struct ubifs_global_debug_info ubifs_dbg;
222 222
223static inline int dbg_is_chk_gen(const struct ubifs_info *c) 223static inline int dbg_is_chk_gen(const struct ubifs_info *c)
@@ -258,6 +258,8 @@ const char *dbg_cstate(int cmt_state);
258const char *dbg_jhead(int jhead); 258const char *dbg_jhead(int jhead);
259const char *dbg_get_key_dump(const struct ubifs_info *c, 259const char *dbg_get_key_dump(const struct ubifs_info *c,
260 const union ubifs_key *key); 260 const union ubifs_key *key);
261const char *dbg_snprintf_key(const struct ubifs_info *c,
262 const union ubifs_key *key, char *buffer, int len);
261void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode); 263void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
262void dbg_dump_node(const struct ubifs_info *c, const void *node); 264void dbg_dump_node(const struct ubifs_info *c, const void *node);
263void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, 265void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
@@ -345,20 +347,23 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
345#define dbg_dump_stack() 347#define dbg_dump_stack()
346#define ubifs_assert_cmt_locked(c) 348#define ubifs_assert_cmt_locked(c)
347 349
348#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 350#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
349#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 351#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
350#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 352#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
351#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 353#define dbg_jnlk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
352#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 354#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
353#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 355#define dbg_tnck(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
354#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 356#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
355#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 357#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
356#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 358#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
357#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 359#define dbg_mntk(key, fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
358#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 360#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
359#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 361#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
360#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 362#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
361#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) 363#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
364#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
365#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
366#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
362 367
363static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; } 368static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
364static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; } 369static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
@@ -368,6 +373,10 @@ static inline const char *dbg_jhead(int jhead) { return ""; }
368static inline const char * 373static inline const char *
369dbg_get_key_dump(const struct ubifs_info *c, 374dbg_get_key_dump(const struct ubifs_info *c,
370 const union ubifs_key *key) { return ""; } 375 const union ubifs_key *key) { return ""; }
376static inline const char *
377dbg_snprintf_key(const struct ubifs_info *c,
378 const union ubifs_key *key, char *buffer,
379 int len) { return ""; }
371static inline void dbg_dump_inode(struct ubifs_info *c, 380static inline void dbg_dump_inode(struct ubifs_info *c,
372 const struct inode *inode) { return; } 381 const struct inode *inode) { return; }
373static inline void dbg_dump_node(const struct ubifs_info *c, 382static inline void dbg_dump_node(const struct ubifs_info *c,
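
The UBIFS hunks retire the two static dbg_key_buf* scratch buffers (and the dbg_lock serialization they forced) in favour of dbg_snprintf_key(), which formats into a caller-supplied buffer and is therefore reentrant. A sketch of the pattern with a toy key type; note that two distinct buffers let two keys appear in one message, which the old pair of static buffers only managed under the lock:

#include <stdio.h>

#define KEY_BUF_LEN 32

struct key { unsigned long inum; unsigned int block; };

static const char *snprintf_key(const struct key *k, char *buf, int len)
{
	snprintf(buf, len, "(%lu, %u)", k->inum, k->block);
	return buf;			/* convenient for use inside printf() */
}

int main(void)
{
	struct key a = { 7, 1 }, b = { 7, 2 };
	char buf1[KEY_BUF_LEN], buf2[KEY_BUF_LEN];

	printf("looked for %s, found %s\n",
	       snprintf_key(&a, buf1, sizeof(buf1)),
	       snprintf_key(&b, buf2, sizeof(buf2)));
	return 0;
}
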
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index cef0460f4c54..2f438ab2e7a2 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -697,9 +697,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
697 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; 697 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
698 struct ubifs_inode *ui = ubifs_inode(inode); 698 struct ubifs_inode *ui = ubifs_inode(inode);
699 699
700 dbg_jnl("ino %lu, blk %u, len %d, key %s", 700 dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
701 (unsigned long)key_inum(c, key), key_block(c, key), len, 701 (unsigned long)key_inum(c, key), key_block(c, key), len);
702 DBGKEY(key));
703 ubifs_assert(len <= UBIFS_BLOCK_SIZE); 702 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
704 703
705 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); 704 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
@@ -1177,7 +1176,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
1177 dn = (void *)trun + UBIFS_TRUN_NODE_SZ; 1176 dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
1178 blk = new_size >> UBIFS_BLOCK_SHIFT; 1177 blk = new_size >> UBIFS_BLOCK_SHIFT;
1179 data_key_init(c, &key, inum, blk); 1178 data_key_init(c, &key, inum, blk);
1180 dbg_jnl("last block key %s", DBGKEY(&key)); 1179 dbg_jnlk(&key, "last block key ");
1181 err = ubifs_tnc_lookup(c, &key, dn); 1180 err = ubifs_tnc_lookup(c, &key, dn);
1182 if (err == -ENOENT) 1181 if (err == -ENOENT)
1183 dlen = 0; /* Not found (so it is a hole) */ 1182 dlen = 0; /* Not found (so it is a hole) */
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 6189c74d97f0..66d59d0a1402 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1986,12 +1986,11 @@ again:
1986 1986
1987 if (path[h].in_tree) 1987 if (path[h].in_tree)
1988 continue; 1988 continue;
1989 nnode = kmalloc(sz, GFP_NOFS); 1989 nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS);
1990 if (!nnode) { 1990 if (!nnode) {
1991 err = -ENOMEM; 1991 err = -ENOMEM;
1992 goto out; 1992 goto out;
1993 } 1993 }
1994 memcpy(nnode, &path[h].nnode, sz);
1995 parent = nnode->parent; 1994 parent = nnode->parent;
1996 parent->nbranch[nnode->iip].nnode = nnode; 1995 parent->nbranch[nnode->iip].nnode = nnode;
1997 path[h].ptr.nnode = nnode; 1996 path[h].ptr.nnode = nnode;
@@ -2004,12 +2003,11 @@ again:
2004 const size_t sz = sizeof(struct ubifs_pnode); 2003 const size_t sz = sizeof(struct ubifs_pnode);
2005 struct ubifs_nnode *parent; 2004 struct ubifs_nnode *parent;
2006 2005
2007 pnode = kmalloc(sz, GFP_NOFS); 2006 pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS);
2008 if (!pnode) { 2007 if (!pnode) {
2009 err = -ENOMEM; 2008 err = -ENOMEM;
2010 goto out; 2009 goto out;
2011 } 2010 }
2012 memcpy(pnode, &path[h].pnode, sz);
2013 parent = pnode->parent; 2011 parent = pnode->parent;
2014 parent->nbranch[pnode->iip].pnode = pnode; 2012 parent->nbranch[pnode->iip].pnode = pnode;
2015 path[h].ptr.pnode = pnode; 2013 path[h].ptr.pnode = pnode;
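
The lpt.c hunks (and the tnc.c one below) collapse kmalloc() followed by memcpy() into a single kmemdup() call, which also removes the window where the buffer exists uninitialized. A userspace sketch of the same helper:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* userspace analogue of kmemdup(): allocate-and-copy in one call */
static void *memdup(const void *src, size_t len)
{
	void *p = malloc(len);

	if (p)
		memcpy(p, src, len);
	return p;
}

int main(void)
{
	const char node[] = "pnode payload";
	char *copy = memdup(node, sizeof(node));

	if (!copy)
		return 1;		/* -ENOMEM analogue */
	puts(copy);
	free(copy);
	return 0;
}
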
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ccabaf1164b3..b007637f0406 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -221,8 +221,8 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
221{ 221{
222 int err; 222 int err;
223 223
224 dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, 224 dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
225 r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); 225 r->lnum, r->offs, r->len, r->deletion, r->sqnum);
226 226
227 /* Set c->replay_sqnum to help deal with dangling branches. */ 227 /* Set c->replay_sqnum to help deal with dangling branches. */
228 c->replay_sqnum = r->sqnum; 228 c->replay_sqnum = r->sqnum;
@@ -361,7 +361,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
361{ 361{
362 struct replay_entry *r; 362 struct replay_entry *r;
363 363
364 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); 364 dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
365 365
366 if (key_inum(c, key) >= c->highest_inum) 366 if (key_inum(c, key) >= c->highest_inum)
367 c->highest_inum = key_inum(c, key); 367 c->highest_inum = key_inum(c, key);
@@ -409,7 +409,7 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
409 struct replay_entry *r; 409 struct replay_entry *r;
410 char *nbuf; 410 char *nbuf;
411 411
412 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); 412 dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
413 if (key_inum(c, key) >= c->highest_inum) 413 if (key_inum(c, key) >= c->highest_inum)
414 c->highest_inum = key_inum(c, key); 414 c->highest_inum = key_inum(c, key);
415 415
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 066738647685..16ad84d8402f 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -344,12 +344,11 @@ static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr,
344 return err; 344 return err;
345 } 345 }
346 346
347 lnc_node = kmalloc(zbr->len, GFP_NOFS); 347 lnc_node = kmemdup(node, zbr->len, GFP_NOFS);
348 if (!lnc_node) 348 if (!lnc_node)
349 /* We don't have to have the cache, so no error */ 349 /* We don't have to have the cache, so no error */
350 return 0; 350 return 0;
351 351
352 memcpy(lnc_node, node, zbr->len);
353 zbr->leaf = lnc_node; 352 zbr->leaf = lnc_node;
354 return 0; 353 return 0;
355} 354}
@@ -506,7 +505,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
506{ 505{
507 int ret; 506 int ret;
508 507
509 dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); 508 dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
510 509
511 ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, 510 ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
512 zbr->offs); 511 zbr->offs);
@@ -520,8 +519,8 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
520 ret = 0; 519 ret = 0;
521 } 520 }
522 if (ret == 0 && c->replaying) 521 if (ret == 0 && c->replaying)
523 dbg_mnt("dangling branch LEB %d:%d len %d, key %s", 522 dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
524 zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); 523 zbr->lnum, zbr->offs, zbr->len);
525 return ret; 524 return ret;
526} 525}
527 526
@@ -996,9 +995,9 @@ static int fallible_resolve_collision(struct ubifs_info *c,
996 if (adding || !o_znode) 995 if (adding || !o_znode)
997 return 0; 996 return 0;
998 997
999 dbg_mnt("dangling match LEB %d:%d len %d %s", 998 dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
1000 o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, 999 o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
1001 o_znode->zbranch[o_n].len, DBGKEY(key)); 1000 o_znode->zbranch[o_n].len);
1002 *zn = o_znode; 1001 *zn = o_znode;
1003 *n = o_n; 1002 *n = o_n;
1004 return 1; 1003 return 1;
@@ -1180,7 +1179,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1180 struct ubifs_znode *znode; 1179 struct ubifs_znode *znode;
1181 unsigned long time = get_seconds(); 1180 unsigned long time = get_seconds();
1182 1181
1183 dbg_tnc("search key %s", DBGKEY(key)); 1182 dbg_tnck(key, "search key ");
1184 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY); 1183 ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
1185 1184
1186 znode = c->zroot.znode; 1185 znode = c->zroot.znode;
@@ -1316,7 +1315,7 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key,
1316 struct ubifs_znode *znode; 1315 struct ubifs_znode *znode;
1317 unsigned long time = get_seconds(); 1316 unsigned long time = get_seconds();
1318 1317
1319 dbg_tnc("search and dirty key %s", DBGKEY(key)); 1318 dbg_tnck(key, "search and dirty key ");
1320 1319
1321 znode = c->zroot.znode; 1320 znode = c->zroot.znode;
1322 if (unlikely(!znode)) { 1321 if (unlikely(!znode)) {
@@ -1723,8 +1722,8 @@ static int validate_data_node(struct ubifs_info *c, void *buf,
1723 if (!keys_eq(c, &zbr->key, &key1)) { 1722 if (!keys_eq(c, &zbr->key, &key1)) {
1724 ubifs_err("bad key in node at LEB %d:%d", 1723 ubifs_err("bad key in node at LEB %d:%d",
1725 zbr->lnum, zbr->offs); 1724 zbr->lnum, zbr->offs);
1726 dbg_tnc("looked for key %s found node's key %s", 1725 dbg_tnck(&zbr->key, "looked for key ");
1727 DBGKEY(&zbr->key), DBGKEY1(&key1)); 1726 dbg_tnck(&key1, "found node's key ");
1728 goto out_err; 1727 goto out_err;
1729 } 1728 }
1730 1729
@@ -1777,7 +1776,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu)
1777 ubifs_err("failed to read from LEB %d:%d, error %d", 1776 ubifs_err("failed to read from LEB %d:%d, error %d",
1778 lnum, offs, err); 1777 lnum, offs, err);
1779 dbg_dump_stack(); 1778 dbg_dump_stack();
1780 dbg_tnc("key %s", DBGKEY(&bu->key)); 1779 dbg_tnck(&bu->key, "key ");
1781 return err; 1780 return err;
1782 } 1781 }
1783 1782
@@ -1812,7 +1811,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key,
1812 int found, n, err; 1811 int found, n, err;
1813 struct ubifs_znode *znode; 1812 struct ubifs_znode *znode;
1814 1813
1815 dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); 1814 dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
1816 mutex_lock(&c->tnc_mutex); 1815 mutex_lock(&c->tnc_mutex);
1817 found = ubifs_lookup_level0(c, key, &znode, &n); 1816 found = ubifs_lookup_level0(c, key, &znode, &n);
1818 if (!found) { 1817 if (!found) {
@@ -1986,8 +1985,7 @@ again:
1986 zp = znode->parent; 1985 zp = znode->parent;
1987 if (znode->child_cnt < c->fanout) { 1986 if (znode->child_cnt < c->fanout) {
1988 ubifs_assert(n != c->fanout); 1987 ubifs_assert(n != c->fanout);
1989 dbg_tnc("inserted at %d level %d, key %s", n, znode->level, 1988 dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
1990 DBGKEY(key));
1991 1989
1992 insert_zbranch(znode, zbr, n); 1990 insert_zbranch(znode, zbr, n);
1993 1991
@@ -2002,7 +2000,7 @@ again:
2002 * Unfortunately, @znode does not have more empty slots and we have to 2000 * Unfortunately, @znode does not have more empty slots and we have to
2003 * split it. 2001 * split it.
2004 */ 2002 */
2005 dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); 2003 dbg_tnck(key, "splitting level %d, key ", znode->level);
2006 2004
2007 if (znode->alt) 2005 if (znode->alt)
2008 /* 2006 /*
@@ -2096,7 +2094,7 @@ do_split:
2096 } 2094 }
2097 2095
2098 /* Insert new key and branch */ 2096 /* Insert new key and branch */
2099 dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); 2097 dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
2100 2098
2101 insert_zbranch(zi, zbr, n); 2099 insert_zbranch(zi, zbr, n);
2102 2100
@@ -2172,7 +2170,7 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum,
2172 struct ubifs_znode *znode; 2170 struct ubifs_znode *znode;
2173 2171
2174 mutex_lock(&c->tnc_mutex); 2172 mutex_lock(&c->tnc_mutex);
2175 dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); 2173 dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
2176 found = lookup_level0_dirty(c, key, &znode, &n); 2174 found = lookup_level0_dirty(c, key, &znode, &n);
2177 if (!found) { 2175 if (!found) {
2178 struct ubifs_zbranch zbr; 2176 struct ubifs_zbranch zbr;
@@ -2221,8 +2219,8 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
2221 struct ubifs_znode *znode; 2219 struct ubifs_znode *znode;
2222 2220
2223 mutex_lock(&c->tnc_mutex); 2221 mutex_lock(&c->tnc_mutex);
2224 dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, 2222 dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
2225 old_offs, lnum, offs, len, DBGKEY(key)); 2223 old_offs, lnum, offs, len);
2226 found = lookup_level0_dirty(c, key, &znode, &n); 2224 found = lookup_level0_dirty(c, key, &znode, &n);
2227 if (found < 0) { 2225 if (found < 0) {
2228 err = found; 2226 err = found;
@@ -2304,8 +2302,8 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
2304 struct ubifs_znode *znode; 2302 struct ubifs_znode *znode;
2305 2303
2306 mutex_lock(&c->tnc_mutex); 2304 mutex_lock(&c->tnc_mutex);
2307 dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, 2305 dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
2308 DBGKEY(key)); 2306 lnum, offs, nm->len, nm->name);
2309 found = lookup_level0_dirty(c, key, &znode, &n); 2307 found = lookup_level0_dirty(c, key, &znode, &n);
2310 if (found < 0) { 2308 if (found < 0) {
2311 err = found; 2309 err = found;
@@ -2398,7 +2396,7 @@ static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n)
2398 /* Delete without merge for now */ 2396 /* Delete without merge for now */
2399 ubifs_assert(znode->level == 0); 2397 ubifs_assert(znode->level == 0);
2400 ubifs_assert(n >= 0 && n < c->fanout); 2398 ubifs_assert(n >= 0 && n < c->fanout);
2401 dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); 2399 dbg_tnck(&znode->zbranch[n].key, "deleting key ");
2402 2400
2403 zbr = &znode->zbranch[n]; 2401 zbr = &znode->zbranch[n];
2404 lnc_free(zbr); 2402 lnc_free(zbr);
@@ -2508,7 +2506,7 @@ int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key)
2508 struct ubifs_znode *znode; 2506 struct ubifs_znode *znode;
2509 2507
2510 mutex_lock(&c->tnc_mutex); 2508 mutex_lock(&c->tnc_mutex);
2511 dbg_tnc("key %s", DBGKEY(key)); 2509 dbg_tnck(key, "key ");
2512 found = lookup_level0_dirty(c, key, &znode, &n); 2510 found = lookup_level0_dirty(c, key, &znode, &n);
2513 if (found < 0) { 2511 if (found < 0) {
2514 err = found; 2512 err = found;
@@ -2539,7 +2537,7 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key,
2539 struct ubifs_znode *znode; 2537 struct ubifs_znode *znode;
2540 2538
2541 mutex_lock(&c->tnc_mutex); 2539 mutex_lock(&c->tnc_mutex);
2542 dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); 2540 dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
2543 err = lookup_level0_dirty(c, key, &znode, &n); 2541 err = lookup_level0_dirty(c, key, &znode, &n);
2544 if (err < 0) 2542 if (err < 0)
2545 goto out_unlock; 2543 goto out_unlock;
@@ -2654,7 +2652,7 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
2654 dbg_dump_znode(c, znode); 2652 dbg_dump_znode(c, znode);
2655 goto out_unlock; 2653 goto out_unlock;
2656 } 2654 }
2657 dbg_tnc("removing %s", DBGKEY(key)); 2655 dbg_tnck(key, "removing key ");
2658 } 2656 }
2659 if (k) { 2657 if (k) {
2660 for (i = n + 1 + k; i < znode->child_cnt; i++) 2658 for (i = n + 1 + k; i < znode->child_cnt; i++)
@@ -2774,7 +2772,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
2774 struct ubifs_zbranch *zbr; 2772 struct ubifs_zbranch *zbr;
2775 union ubifs_key *dkey; 2773 union ubifs_key *dkey;
2776 2774
2777 dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); 2775 dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
2778 ubifs_assert(is_hash_key(c, key)); 2776 ubifs_assert(is_hash_key(c, key));
2779 2777
2780 mutex_lock(&c->tnc_mutex); 2778 mutex_lock(&c->tnc_mutex);
@@ -3333,9 +3331,9 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
3333 3331
3334out_dump: 3332out_dump:
3335 block = key_block(c, key); 3333 block = key_block(c, key);
3336 ubifs_err("inode %lu has size %lld, but there are data at offset %lld " 3334 ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
3337 "(data key %s)", (unsigned long)inode->i_ino, size, 3335 (unsigned long)inode->i_ino, size,
3338 ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key)); 3336 ((loff_t)block) << UBIFS_BLOCK_SHIFT);
3339 mutex_unlock(&c->tnc_mutex); 3337 mutex_unlock(&c->tnc_mutex);
3340 dbg_dump_inode(c, inode); 3338 dbg_dump_inode(c, inode);
3341 dbg_dump_stack(); 3339 dbg_dump_stack();
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index b48db999903e..dc28fe6ec07a 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -328,8 +328,8 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len,
328 case UBIFS_XENT_KEY: 328 case UBIFS_XENT_KEY:
329 break; 329 break;
330 default: 330 default:
331 dbg_msg("bad key type at slot %d: %s", i, 331 dbg_msg("bad key type at slot %d: %d",
332 DBGKEY(&zbr->key)); 332 i, key_type(c, &zbr->key));
333 err = 3; 333 err = 3;
334 goto out_dump; 334 goto out_dump;
335 } 335 }
@@ -475,7 +475,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
475 zbr->offs); 475 zbr->offs);
476 476
477 if (err) { 477 if (err) {
478 dbg_tnc("key %s", DBGKEY(key)); 478 dbg_tnck(key, "key ");
479 return err; 479 return err;
480 } 480 }
481 481
@@ -484,8 +484,8 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
484 if (!keys_eq(c, key, &key1)) { 484 if (!keys_eq(c, key, &key1)) {
485 ubifs_err("bad key in node at LEB %d:%d", 485 ubifs_err("bad key in node at LEB %d:%d",
486 zbr->lnum, zbr->offs); 486 zbr->lnum, zbr->offs);
487 dbg_tnc("looked for key %s found node's key %s", 487 dbg_tnck(key, "looked for key ");
488 DBGKEY(key), DBGKEY1(&key1)); 488 dbg_tnck(&key1, "but found node's key ");
489 dbg_dump_node(c, node); 489 dbg_dump_node(c, node);
490 return -EINVAL; 490 return -EINVAL;
491 } 491 }
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index bf18f7a04544..85b272268754 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -138,12 +138,11 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
138 ui = ubifs_inode(inode); 138 ui = ubifs_inode(inode);
139 ui->xattr = 1; 139 ui->xattr = 1;
140 ui->flags |= UBIFS_XATTR_FL; 140 ui->flags |= UBIFS_XATTR_FL;
141 ui->data = kmalloc(size, GFP_NOFS); 141 ui->data = kmemdup(value, size, GFP_NOFS);
142 if (!ui->data) { 142 if (!ui->data) {
143 err = -ENOMEM; 143 err = -ENOMEM;
144 goto out_free; 144 goto out_free;
145 } 145 }
146 memcpy(ui->data, value, size);
147 inode->i_size = ui->ui_size = size; 146 inode->i_size = ui->ui_size = size;
148 ui->data_len = size; 147 ui->data_len = size;
149 148
@@ -204,12 +203,11 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
204 return err; 203 return err;
205 204
206 kfree(ui->data); 205 kfree(ui->data);
207 ui->data = kmalloc(size, GFP_NOFS); 206 ui->data = kmemdup(value, size, GFP_NOFS);
208 if (!ui->data) { 207 if (!ui->data) {
209 err = -ENOMEM; 208 err = -ENOMEM;
210 goto out_free; 209 goto out_free;
211 } 210 }
212 memcpy(ui->data, value, size);
213 inode->i_size = ui->ui_size = size; 211 inode->i_size = ui->ui_size = size;
214 ui->data_len = size; 212 ui->data_len = size;
215 213
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d8ffa7cc661d..dca0c3881e82 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -125,7 +125,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
125 err = udf_expand_file_adinicb(inode); 125 err = udf_expand_file_adinicb(inode);
126 if (err) { 126 if (err) {
127 udf_debug("udf_expand_adinicb: err=%d\n", err); 127 udf_debug("udf_expand_adinicb: err=%d\n", err);
128 up_write(&iinfo->i_data_sem);
129 return err; 128 return err;
130 } 129 }
131 } else { 130 } else {
@@ -133,9 +132,10 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
133 iinfo->i_lenAlloc = pos + count; 132 iinfo->i_lenAlloc = pos + count;
134 else 133 else
135 iinfo->i_lenAlloc = inode->i_size; 134 iinfo->i_lenAlloc = inode->i_size;
135 up_write(&iinfo->i_data_sem);
136 } 136 }
137 } 137 } else
138 up_write(&iinfo->i_data_sem); 138 up_write(&iinfo->i_data_sem);
139 139
140 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos); 140 retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
141 if (retval > 0) 141 if (retval > 0)
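
The error path above no longer drops i_data_sem because, as the fs/udf/inode.c hunks below document, udf_expand_file_adinicb() now releases the semaphore itself on every path. A sketch of the resulting caller contract, with write_fits_in_icb() standing in for the real capacity check (a hypothetical name, used only for illustration):

        down_write(&iinfo->i_data_sem);
        if (!write_fits_in_icb(inode, pos, count)) {
                /* releases i_data_sem itself, on success and on error */
                err = udf_expand_file_adinicb(inode);
                if (err)
                        return err;
        } else {
                /* in-ICB write fits: bump the allocation length, unlock */
                if (pos + count > inode->i_size)
                        iinfo->i_lenAlloc = pos + count;
                else
                        iinfo->i_lenAlloc = inode->i_size;
                up_write(&iinfo->i_data_sem);
        }

The point of the reshuffle is that the expansion path needs to drop and retake the semaphore internally (page lock ordering, see the inode.c comments below), so ownership is clearer when it takes charge of releasing it.
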
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 4598904be1bb..7699df7b3198 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -53,8 +53,7 @@ static int udf_update_inode(struct inode *, int);
53static void udf_fill_inode(struct inode *, struct buffer_head *); 53static void udf_fill_inode(struct inode *, struct buffer_head *);
54static int udf_sync_inode(struct inode *inode); 54static int udf_sync_inode(struct inode *inode);
55static int udf_alloc_i_data(struct inode *inode, size_t size); 55static int udf_alloc_i_data(struct inode *inode, size_t size);
56static struct buffer_head *inode_getblk(struct inode *, sector_t, int *, 56static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
57 sector_t *, int *);
58static int8_t udf_insert_aext(struct inode *, struct extent_position, 57static int8_t udf_insert_aext(struct inode *, struct extent_position,
59 struct kernel_lb_addr, uint32_t); 58 struct kernel_lb_addr, uint32_t);
60static void udf_split_extents(struct inode *, int *, int, int, 59static void udf_split_extents(struct inode *, int *, int, int,
@@ -151,6 +150,12 @@ const struct address_space_operations udf_aops = {
151 .bmap = udf_bmap, 150 .bmap = udf_bmap,
152}; 151};
153 152
153/*
154 * Expand file stored in ICB to a normal one-block-file
155 *
156 * This function requires i_data_sem held for writing and releases it.
157 * It also requires i_mutex to be held.
158 */
154int udf_expand_file_adinicb(struct inode *inode) 159int udf_expand_file_adinicb(struct inode *inode)
155{ 160{
156 struct page *page; 161 struct page *page;
@@ -169,9 +174,15 @@ int udf_expand_file_adinicb(struct inode *inode)
169 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 174 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
170 /* from now on we have normal address_space methods */ 175 /* from now on we have normal address_space methods */
171 inode->i_data.a_ops = &udf_aops; 176 inode->i_data.a_ops = &udf_aops;
177 up_write(&iinfo->i_data_sem);
172 mark_inode_dirty(inode); 178 mark_inode_dirty(inode);
173 return 0; 179 return 0;
174 } 180 }
181 /*
182 * Release i_data_sem so that we can lock a page - page lock ranks
183 * above i_data_sem. i_mutex still protects us against file changes.
184 */
185 up_write(&iinfo->i_data_sem);
175 186
176 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); 187 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
177 if (!page) 188 if (!page)
@@ -187,6 +198,7 @@ int udf_expand_file_adinicb(struct inode *inode)
187 SetPageUptodate(page); 198 SetPageUptodate(page);
188 kunmap(page); 199 kunmap(page);
189 } 200 }
201 down_write(&iinfo->i_data_sem);
190 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, 202 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00,
191 iinfo->i_lenAlloc); 203 iinfo->i_lenAlloc);
192 iinfo->i_lenAlloc = 0; 204 iinfo->i_lenAlloc = 0;
@@ -196,17 +208,20 @@ int udf_expand_file_adinicb(struct inode *inode)
196 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 208 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
197 /* from now on we have normal address_space methods */ 209 /* from now on we have normal address_space methods */
198 inode->i_data.a_ops = &udf_aops; 210 inode->i_data.a_ops = &udf_aops;
211 up_write(&iinfo->i_data_sem);
199 err = inode->i_data.a_ops->writepage(page, &udf_wbc); 212 err = inode->i_data.a_ops->writepage(page, &udf_wbc);
200 if (err) { 213 if (err) {
201 /* Restore everything back so that we don't lose data... */ 214 /* Restore everything back so that we don't lose data... */
202 lock_page(page); 215 lock_page(page);
203 kaddr = kmap(page); 216 kaddr = kmap(page);
217 down_write(&iinfo->i_data_sem);
204 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, 218 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
205 inode->i_size); 219 inode->i_size);
206 kunmap(page); 220 kunmap(page);
207 unlock_page(page); 221 unlock_page(page);
208 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; 222 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
209 inode->i_data.a_ops = &udf_adinicb_aops; 223 inode->i_data.a_ops = &udf_adinicb_aops;
224 up_write(&iinfo->i_data_sem);
210 } 225 }
211 page_cache_release(page); 226 page_cache_release(page);
212 mark_inode_dirty(inode); 227 mark_inode_dirty(inode);
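
Two constraints shape the rework visible above: the page lock ranks above i_data_sem, so the semaphore must be dropped before touching pages, and a failed ->writepage() must restore the in-ICB state. Reduced to its skeleton (copy steps elided), the post-patch flow is:

        /* entered with i_data_sem held for write and i_mutex held */
        up_write(&iinfo->i_data_sem);      /* page lock ranks above it */

        page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
        /* ... copy the in-ICB data into the page ... */

        down_write(&iinfo->i_data_sem);    /* retake to flip ICB state */
        iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
        inode->i_data.a_ops = &udf_aops;
        up_write(&iinfo->i_data_sem);      /* drop again before the I/O */

        err = inode->i_data.a_ops->writepage(page, &udf_wbc);
        if (err) {
                /* undo: relock the page, retake the semaphore, and
                 * copy the data back into the ICB */
                lock_page(page);
                down_write(&iinfo->i_data_sem);
                /* ... memcpy page contents back, kunmap ... */
                unlock_page(page);
                iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
                inode->i_data.a_ops = &udf_adinicb_aops;
                up_write(&iinfo->i_data_sem);
        }

i_mutex, held throughout, is what keeps the file from changing while i_data_sem is temporarily dropped.
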
@@ -310,7 +325,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
310 struct buffer_head *bh_result, int create) 325 struct buffer_head *bh_result, int create)
311{ 326{
312 int err, new; 327 int err, new;
313 struct buffer_head *bh;
314 sector_t phys = 0; 328 sector_t phys = 0;
315 struct udf_inode_info *iinfo; 329 struct udf_inode_info *iinfo;
316 330
@@ -323,7 +337,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
323 337
324 err = -EIO; 338 err = -EIO;
325 new = 0; 339 new = 0;
326 bh = NULL;
327 iinfo = UDF_I(inode); 340 iinfo = UDF_I(inode);
328 341
329 down_write(&iinfo->i_data_sem); 342 down_write(&iinfo->i_data_sem);
@@ -332,13 +345,10 @@ static int udf_get_block(struct inode *inode, sector_t block,
332 iinfo->i_next_alloc_goal++; 345 iinfo->i_next_alloc_goal++;
333 } 346 }
334 347
335 err = 0;
336 348
337 bh = inode_getblk(inode, block, &err, &phys, &new); 349 phys = inode_getblk(inode, block, &err, &new);
338 BUG_ON(bh); 350 if (!phys)
339 if (err)
340 goto abort; 351 goto abort;
341 BUG_ON(!phys);
342 352
343 if (new) 353 if (new)
344 set_buffer_new(bh_result); 354 set_buffer_new(bh_result);
@@ -547,11 +557,10 @@ out:
547 return err; 557 return err;
548} 558}
549 559
550static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, 560static sector_t inode_getblk(struct inode *inode, sector_t block,
551 int *err, sector_t *phys, int *new) 561 int *err, int *new)
552{ 562{
553 static sector_t last_block; 563 static sector_t last_block;
554 struct buffer_head *result = NULL;
555 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; 564 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE];
556 struct extent_position prev_epos, cur_epos, next_epos; 565 struct extent_position prev_epos, cur_epos, next_epos;
557 int count = 0, startnum = 0, endnum = 0; 566 int count = 0, startnum = 0, endnum = 0;
@@ -566,6 +575,8 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
566 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; 575 int goal = 0, pgoal = iinfo->i_location.logicalBlockNum;
567 int lastblock = 0; 576 int lastblock = 0;
568 577
578 *err = 0;
579 *new = 0;
569 prev_epos.offset = udf_file_entry_alloc_offset(inode); 580 prev_epos.offset = udf_file_entry_alloc_offset(inode);
570 prev_epos.block = iinfo->i_location; 581 prev_epos.block = iinfo->i_location;
571 prev_epos.bh = NULL; 582 prev_epos.bh = NULL;
@@ -635,8 +646,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
635 brelse(cur_epos.bh); 646 brelse(cur_epos.bh);
636 brelse(next_epos.bh); 647 brelse(next_epos.bh);
637 newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); 648 newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
638 *phys = newblock; 649 return newblock;
639 return NULL;
640 } 650 }
641 651
642 last_block = block; 652 last_block = block;
@@ -664,7 +674,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
664 brelse(cur_epos.bh); 674 brelse(cur_epos.bh);
665 brelse(next_epos.bh); 675 brelse(next_epos.bh);
666 *err = ret; 676 *err = ret;
667 return NULL; 677 return 0;
668 } 678 }
669 c = 0; 679 c = 0;
670 offset = 0; 680 offset = 0;
@@ -729,7 +739,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
729 if (!newblocknum) { 739 if (!newblocknum) {
730 brelse(prev_epos.bh); 740 brelse(prev_epos.bh);
731 *err = -ENOSPC; 741 *err = -ENOSPC;
732 return NULL; 742 return 0;
733 } 743 }
734 iinfo->i_lenExtents += inode->i_sb->s_blocksize; 744 iinfo->i_lenExtents += inode->i_sb->s_blocksize;
735 } 745 }
@@ -761,10 +771,10 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
761 771
762 newblock = udf_get_pblock(inode->i_sb, newblocknum, 772 newblock = udf_get_pblock(inode->i_sb, newblocknum,
763 iinfo->i_location.partitionReferenceNum, 0); 773 iinfo->i_location.partitionReferenceNum, 0);
764 if (!newblock) 774 if (!newblock) {
765 return NULL; 775 *err = -EIO;
766 *phys = newblock; 776 return 0;
767 *err = 0; 777 }
768 *new = 1; 778 *new = 1;
769 iinfo->i_next_alloc_block = block; 779 iinfo->i_next_alloc_block = block;
770 iinfo->i_next_alloc_goal = newblocknum; 780 iinfo->i_next_alloc_goal = newblocknum;
@@ -775,7 +785,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
775 else 785 else
776 mark_inode_dirty(inode); 786 mark_inode_dirty(inode);
777 787
778 return result; 788 return newblock;
779} 789}
780 790
781static void udf_split_extents(struct inode *inode, int *c, int offset, 791static void udf_split_extents(struct inode *inode, int *c, int offset,
@@ -1111,10 +1121,9 @@ int udf_setsize(struct inode *inode, loff_t newsize)
1111 if (bsize < 1121 if (bsize <
1112 (udf_file_entry_alloc_offset(inode) + newsize)) { 1122 (udf_file_entry_alloc_offset(inode) + newsize)) {
1113 err = udf_expand_file_adinicb(inode); 1123 err = udf_expand_file_adinicb(inode);
1114 if (err) { 1124 if (err)
1115 up_write(&iinfo->i_data_sem);
1116 return err; 1125 return err;
1117 } 1126 down_write(&iinfo->i_data_sem);
1118 } else 1127 } else
1119 iinfo->i_lenAlloc = newsize; 1128 iinfo->i_lenAlloc = newsize;
1120 } 1129 }
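
Alongside the locking rework, this file changes inode_getblk()'s calling convention: instead of returning an always-NULL buffer_head and passing the block back through a pointer, it now returns the physical block directly, with 0 meaning failure and *err carrying the reason. The caller side shrinks accordingly:

        int err = 0, new = 0;
        sector_t phys;

        phys = inode_getblk(inode, block, &err, &new);
        if (!phys)
                goto abort;     /* err holds the reason */

        if (new)
                set_buffer_new(bh_result);

The callee also takes over initialising *err and *new, so callers no longer have to pre-clear them.
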
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 0c33225647a0..c09a84daaf50 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1798,6 +1798,12 @@ static void udf_close_lvid(struct super_block *sb)
1798 le16_to_cpu(lvid->descTag.descCRCLength))); 1798 le16_to_cpu(lvid->descTag.descCRCLength)));
1799 1799
1800 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1800 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1801 /*
1802 * We set buffer uptodate unconditionally here to avoid spurious
1803 * warnings from mark_buffer_dirty() when a previous EIO has marked
1804 * the buffer as !uptodate
1805 */
1806 set_buffer_uptodate(bh);
1801 mark_buffer_dirty(bh); 1807 mark_buffer_dirty(bh);
1802 sbi->s_lvid_dirty = 0; 1808 sbi->s_lvid_dirty = 0;
1803 mutex_unlock(&sbi->s_alloc_mutex); 1809 mutex_unlock(&sbi->s_alloc_mutex);
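
mark_buffer_dirty() warns when asked to dirty a buffer that is not uptodate, and a past I/O error (EIO) can leave the LVID buffer in exactly that state. Because udf_close_lvid() has just rewritten the descriptor in full, the contents are valid by construction and the flag can be forced first:

        lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
        set_buffer_uptodate(bh);   /* clear stale !uptodate from old EIO */
        mark_buffer_dirty(bh);     /* would otherwise WARN on !uptodate */

The ordering matters only in that the uptodate bit must be set before the dirty call; nothing here reissues or retries the failed I/O.
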
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index b1d4488b0f14..d7c6dbe4194b 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -41,10 +41,16 @@ static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
41 pc = (struct pathComponent *)(from + elen); 41 pc = (struct pathComponent *)(from + elen);
42 switch (pc->componentType) { 42 switch (pc->componentType) {
43 case 1: 43 case 1:
44 if (pc->lengthComponentIdent == 0) { 44 /*
45 p = to; 45 * Symlink points to some place which should be agreed
46 *p++ = '/'; 46 * upon between originator and receiver of the media. Ignore.
47 } 47 */
48 if (pc->lengthComponentIdent > 0)
49 break;
50 /* Fall through */
51 case 2:
52 p = to;
53 *p++ = '/';
48 break; 54 break;
49 case 3: 55 case 3:
50 memcpy(p, "../", 3); 56 memcpy(p, "../", 3);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 574d4ee9b625..74b9baf36ac3 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,8 +111,7 @@ xfs_ioend_new_eof(
111 xfs_fsize_t bsize; 111 xfs_fsize_t bsize;
112 112
113 bsize = ioend->io_offset + ioend->io_size; 113 bsize = ioend->io_offset + ioend->io_size;
114 isize = MAX(ip->i_size, ip->i_new_size); 114 isize = MIN(i_size_read(VFS_I(ip)), bsize);
115 isize = MIN(isize, bsize);
116 return isize > ip->i_d.di_size ? isize : 0; 115 return isize > ip->i_d.di_size ? isize : 0;
117} 116}
118 117
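
With the i_new_size field gone, the completion path no longer folds an "intended" size into the calculation; it clamps the end of this I/O against the VFS inode size and reports a new on-disk size only if that would grow it:

        xfs_fsize_t bsize = ioend->io_offset + ioend->io_size;
        xfs_fsize_t isize = MIN(i_size_read(VFS_I(ip)), bsize);

        /* non-zero only when the on-disk size actually needs updating */
        return isize > ip->i_d.di_size ? isize : 0;

This only works if in-flight writes keep the VFS i_size current, which is what the direct I/O hunk below and the xfs_file.c changes further down arrange.
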
@@ -126,11 +125,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
126} 125}
127 126
128/* 127/*
129 * Update on-disk file size now that data has been written to disk. The 128 * Update on-disk file size now that data has been written to disk.
130 * current in-memory file size is i_size. If a write is beyond eof i_new_size
131 * will be the intended file size until i_size is updated. If this write does
132 * not extend all the way to the valid file size then restrict this update to
133 * the end of the write.
134 * 129 *
135 * This function does not block as blocking on the inode lock in IO completion 130 * This function does not block as blocking on the inode lock in IO completion
136 * can lead to IO completion order dependency deadlocks.. If it can't get the 131 * can lead to IO completion order dependency deadlocks.. If it can't get the
@@ -1279,6 +1274,15 @@ xfs_end_io_direct_write(
1279 struct xfs_ioend *ioend = iocb->private; 1274 struct xfs_ioend *ioend = iocb->private;
1280 1275
1281 /* 1276 /*
1277 * While the generic direct I/O code updates the inode size, it does
1278 * so only after the end_io handler is called, which means our
1279 * end_io handler thinks the on-disk size is beyond the in-core
1280 * size. To prevent this, just update it a little earlier here.
1281 */
1282 if (offset + size > i_size_read(ioend->io_inode))
1283 i_size_write(ioend->io_inode, offset + size);
1284
1285 /*
1282 * blockdev_direct_IO can return an error even after the I/O 1286 * blockdev_direct_IO can return an error even after the I/O
1283 * completion handler was called. Thus we need to protect 1287 * completion handler was called. Thus we need to protect
1284 * against double-freeing. 1288 * against double-freeing.
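
The generic direct I/O code updates i_size only after invoking the filesystem's end_io callback, so without the early update, xfs_ioend_new_eof() above would see an on-disk size beyond the in-core one. The guard in isolation:

        if (offset + size > i_size_read(ioend->io_inode))
                i_size_write(ioend->io_inode, offset + size);

i_size_read()/i_size_write() are the torn-read-safe accessors for inode->i_size, which is why the hunk uses them rather than assigning to the field directly.
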
@@ -1340,12 +1344,11 @@ xfs_vm_write_failed(
1340 1344
1341 if (to > inode->i_size) { 1345 if (to > inode->i_size) {
1342 /* 1346 /*
1343 * punch out the delalloc blocks we have already allocated. We 1347 * Punch out the delalloc blocks we have already allocated.
1344 * don't call xfs_setattr() to do this as we may be in the 1348 *
1345 * middle of a multi-iovec write and so the vfs inode->i_size 1349 * Don't bother with xfs_setattr given that nothing can have
1346 * will not match the xfs ip->i_size and so it will zero too 1350 * made it to disk yet as the page is still locked at this
1347 * much. Hence we jus truncate the page cache to zero what is 1351 * point.
1348 * necessary and punch the delalloc blocks directly.
1349 */ 1352 */
1350 struct xfs_inode *ip = XFS_I(inode); 1353 struct xfs_inode *ip = XFS_I(inode);
1351 xfs_fileoff_t start_fsb; 1354 xfs_fileoff_t start_fsb;
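
What the function does instead of xfs_setattr() is punch the delayed allocations for the failed range directly. The hunk cuts off before that code; the sketch below is reconstructed from the surrounding kernel code of this era, so treat the exact conversions and the helper name as assumptions to verify against the full file:

        /* range: from the current EOF to the end of the failed write */
        start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size);
        end_fsb = XFS_B_TO_FSB(ip->i_mount, to);

        if (end_fsb > start_fsb) {
                xfs_ilock(ip, XFS_ILOCK_SHARED);
                error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
                                                end_fsb - start_fsb);
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
        }

Since the page is still locked, none of this data can have reached disk, which is the observation the new comment leans on.
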
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 1e5d97f86ea8..08b9ac644c31 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -827,10 +827,6 @@ xfs_attr_inactive(xfs_inode_t *dp)
827 if (error) 827 if (error)
828 goto out; 828 goto out;
829 829
830 /*
831 * Commit the last in the sequence of transactions.
832 */
833 xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE);
834 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); 830 error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
835 xfs_iunlock(dp, XFS_ILOCK_EXCL); 831 xfs_iunlock(dp, XFS_ILOCK_EXCL);
836 832
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index c1b55e596551..d25eafd4d28d 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -271,10 +271,6 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff)
271 dp = args->dp; 271 dp = args->dp;
272 mp = dp->i_mount; 272 mp = dp->i_mount;
273 dp->i_d.di_forkoff = forkoff; 273 dp->i_d.di_forkoff = forkoff;
274 dp->i_df.if_ext_max =
275 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
276 dp->i_afp->if_ext_max =
277 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
278 274
279 ifp = dp->i_afp; 275 ifp = dp->i_afp;
280 ASSERT(ifp->if_flags & XFS_IFINLINE); 276 ASSERT(ifp->if_flags & XFS_IFINLINE);
@@ -326,7 +322,6 @@ xfs_attr_fork_reset(
326 ASSERT(ip->i_d.di_anextents == 0); 322 ASSERT(ip->i_d.di_anextents == 0);
327 ASSERT(ip->i_afp == NULL); 323 ASSERT(ip->i_afp == NULL);
328 324
329 ip->i_df.if_ext_max = XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
330 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 325 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
331} 326}
332 327
@@ -389,10 +384,6 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
389 (args->op_flags & XFS_DA_OP_ADDNAME) || 384 (args->op_flags & XFS_DA_OP_ADDNAME) ||
390 !(mp->m_flags & XFS_MOUNT_ATTR2) || 385 !(mp->m_flags & XFS_MOUNT_ATTR2) ||
391 dp->i_d.di_format == XFS_DINODE_FMT_BTREE); 386 dp->i_d.di_format == XFS_DINODE_FMT_BTREE);
392 dp->i_afp->if_ext_max =
393 XFS_IFORK_ASIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
394 dp->i_df.if_ext_max =
395 XFS_IFORK_DSIZE(dp) / (uint)sizeof(xfs_bmbt_rec_t);
396 xfs_trans_log_inode(args->trans, dp, 387 xfs_trans_log_inode(args->trans, dp,
397 XFS_ILOG_CORE | XFS_ILOG_ADATA); 388 XFS_ILOG_CORE | XFS_ILOG_ADATA);
398 } 389 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index d0ab78837057..188ef2fbd628 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -249,7 +249,27 @@ xfs_bmbt_lookup_ge(
249} 249}
250 250
251/* 251/*
252* Update the record referred to by cur to the value given 252 * Check if the inode needs to be converted to btree format.
253 */
254static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
255{
256 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
257 XFS_IFORK_NEXTENTS(ip, whichfork) >
258 XFS_IFORK_MAXEXT(ip, whichfork);
259}
260
261/*
262 * Check if the inode should be converted to extent format.
263 */
264static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
265{
266 return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
267 XFS_IFORK_NEXTENTS(ip, whichfork) <=
268 XFS_IFORK_MAXEXT(ip, whichfork);
269}
270
271/*
272 * Update the record referred to by cur to the value given
253 * by [off, bno, len, state]. 273 * by [off, bno, len, state].
254 * This either works (return 0) or gets an EFSCORRUPTED error. 274 * This either works (return 0) or gets an EFSCORRUPTED error.
255 */ 275 */
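
These two predicates centralise a check that the rest of this patch deletes from every conversion site. Judging by the removals below, XFS_IFORK_MAXEXT() computes the same bound the per-fork if_ext_max field used to cache, i.e. XFS_IFORK_SIZE() divided by sizeof(xfs_bmbt_rec_t) (assumed here; the macro's definition is not in this hunk). The mechanical rewrite at each site:

        /* open-coded, pre-patch */
        if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
            XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
                /* ... convert extents to btree ... */
        }

        /* with the helper */
        if (xfs_bmap_needs_btree(ip, whichfork)) {
                /* ... convert extents to btree ... */
        }

Dropping the cached field is what lets the patch also delete the if_ext_max fixups scattered through xfs_attr_leaf.c and xfs_dfrag.c.
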
@@ -683,8 +703,8 @@ xfs_bmap_add_extent_delay_real(
683 goto done; 703 goto done;
684 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 704 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
685 } 705 }
686 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 706
687 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 707 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
688 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 708 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
689 bma->firstblock, bma->flist, 709 bma->firstblock, bma->flist,
690 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); 710 &bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
@@ -767,8 +787,8 @@ xfs_bmap_add_extent_delay_real(
767 goto done; 787 goto done;
768 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 788 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
769 } 789 }
770 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 790
771 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 791 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
772 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 792 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
773 bma->firstblock, bma->flist, &bma->cur, 1, 793 bma->firstblock, bma->flist, &bma->cur, 1,
774 &tmp_rval, XFS_DATA_FORK); 794 &tmp_rval, XFS_DATA_FORK);
@@ -836,8 +856,8 @@ xfs_bmap_add_extent_delay_real(
836 goto done; 856 goto done;
837 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 857 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
838 } 858 }
839 if (bma->ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 859
840 bma->ip->i_d.di_nextents > bma->ip->i_df.if_ext_max) { 860 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
841 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, 861 error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
842 bma->firstblock, bma->flist, &bma->cur, 862 bma->firstblock, bma->flist, &bma->cur,
843 1, &tmp_rval, XFS_DATA_FORK); 863 1, &tmp_rval, XFS_DATA_FORK);
@@ -884,8 +904,7 @@ xfs_bmap_add_extent_delay_real(
884 } 904 }
885 905
886 /* convert to a btree if necessary */ 906 /* convert to a btree if necessary */
887 if (XFS_IFORK_FORMAT(bma->ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && 907 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
888 XFS_IFORK_NEXTENTS(bma->ip, XFS_DATA_FORK) > ifp->if_ext_max) {
889 int tmp_logflags; /* partial log flag return val */ 908 int tmp_logflags; /* partial log flag return val */
890 909
891 ASSERT(bma->cur == NULL); 910 ASSERT(bma->cur == NULL);
@@ -1421,8 +1440,7 @@ xfs_bmap_add_extent_unwritten_real(
1421 } 1440 }
1422 1441
1423 /* convert to a btree if necessary */ 1442 /* convert to a btree if necessary */
1424 if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_EXTENTS && 1443 if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
1425 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > ifp->if_ext_max) {
1426 int tmp_logflags; /* partial log flag return val */ 1444 int tmp_logflags; /* partial log flag return val */
1427 1445
1428 ASSERT(cur == NULL); 1446 ASSERT(cur == NULL);
@@ -1812,8 +1830,7 @@ xfs_bmap_add_extent_hole_real(
1812 } 1830 }
1813 1831
1814 /* convert to a btree if necessary */ 1832 /* convert to a btree if necessary */
1815 if (XFS_IFORK_FORMAT(bma->ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 1833 if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
1816 XFS_IFORK_NEXTENTS(bma->ip, whichfork) > ifp->if_ext_max) {
1817 int tmp_logflags; /* partial log flag return val */ 1834 int tmp_logflags; /* partial log flag return val */
1818 1835
1819 ASSERT(bma->cur == NULL); 1836 ASSERT(bma->cur == NULL);
@@ -3037,8 +3054,7 @@ xfs_bmap_extents_to_btree(
3037 3054
3038 ifp = XFS_IFORK_PTR(ip, whichfork); 3055 ifp = XFS_IFORK_PTR(ip, whichfork);
3039 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS); 3056 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
3040 ASSERT(ifp->if_ext_max == 3057
3041 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
3042 /* 3058 /*
3043 * Make space in the inode incore. 3059 * Make space in the inode incore.
3044 */ 3060 */
@@ -3184,13 +3200,8 @@ xfs_bmap_forkoff_reset(
3184 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) { 3200 ip->i_d.di_format != XFS_DINODE_FMT_BTREE) {
3185 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3; 3201 uint dfl_forkoff = xfs_default_attroffset(ip) >> 3;
3186 3202
3187 if (dfl_forkoff > ip->i_d.di_forkoff) { 3203 if (dfl_forkoff > ip->i_d.di_forkoff)
3188 ip->i_d.di_forkoff = dfl_forkoff; 3204 ip->i_d.di_forkoff = dfl_forkoff;
3189 ip->i_df.if_ext_max =
3190 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t);
3191 ip->i_afp->if_ext_max =
3192 XFS_IFORK_ASIZE(ip) / sizeof(xfs_bmbt_rec_t);
3193 }
3194 } 3205 }
3195} 3206}
3196 3207
@@ -3430,8 +3441,6 @@ xfs_bmap_add_attrfork(
3430 int error; /* error return value */ 3441 int error; /* error return value */
3431 3442
3432 ASSERT(XFS_IFORK_Q(ip) == 0); 3443 ASSERT(XFS_IFORK_Q(ip) == 0);
3433 ASSERT(ip->i_df.if_ext_max ==
3434 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3435 3444
3436 mp = ip->i_mount; 3445 mp = ip->i_mount;
3437 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 3446 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
@@ -3486,12 +3495,9 @@ xfs_bmap_add_attrfork(
3486 error = XFS_ERROR(EINVAL); 3495 error = XFS_ERROR(EINVAL);
3487 goto error1; 3496 goto error1;
3488 } 3497 }
3489 ip->i_df.if_ext_max = 3498
3490 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3491 ASSERT(ip->i_afp == NULL); 3499 ASSERT(ip->i_afp == NULL);
3492 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP); 3500 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
3493 ip->i_afp->if_ext_max =
3494 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
3495 ip->i_afp->if_flags = XFS_IFEXTENTS; 3501 ip->i_afp->if_flags = XFS_IFEXTENTS;
3496 logflags = 0; 3502 logflags = 0;
3497 xfs_bmap_init(&flist, &firstblock); 3503 xfs_bmap_init(&flist, &firstblock);
@@ -3535,20 +3541,17 @@ xfs_bmap_add_attrfork(
3535 } else 3541 } else
3536 spin_unlock(&mp->m_sb_lock); 3542 spin_unlock(&mp->m_sb_lock);
3537 } 3543 }
3538 if ((error = xfs_bmap_finish(&tp, &flist, &committed))) 3544
3545 error = xfs_bmap_finish(&tp, &flist, &committed);
3546 if (error)
3539 goto error2; 3547 goto error2;
3540 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 3548 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3541 ASSERT(ip->i_df.if_ext_max ==
3542 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3543 return error;
3544error2: 3549error2:
3545 xfs_bmap_cancel(&flist); 3550 xfs_bmap_cancel(&flist);
3546error1: 3551error1:
3547 xfs_iunlock(ip, XFS_ILOCK_EXCL); 3552 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3548error0: 3553error0:
3549 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 3554 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
3550 ASSERT(ip->i_df.if_ext_max ==
3551 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
3552 return error; 3555 return error;
3553} 3556}
3554 3557
@@ -3994,11 +3997,8 @@ xfs_bmap_one_block(
3994 xfs_bmbt_irec_t s; /* internal version of extent */ 3997 xfs_bmbt_irec_t s; /* internal version of extent */
3995 3998
3996#ifndef DEBUG 3999#ifndef DEBUG
3997 if (whichfork == XFS_DATA_FORK) { 4000 if (whichfork == XFS_DATA_FORK)
3998 return S_ISREG(ip->i_d.di_mode) ? 4001 return XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize;
3999 (ip->i_size == ip->i_mount->m_sb.sb_blocksize) :
4000 (ip->i_d.di_size == ip->i_mount->m_sb.sb_blocksize);
4001 }
4002#endif /* !DEBUG */ 4002#endif /* !DEBUG */
4003 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1) 4003 if (XFS_IFORK_NEXTENTS(ip, whichfork) != 1)
4004 return 0; 4004 return 0;
@@ -4010,7 +4010,7 @@ xfs_bmap_one_block(
4010 xfs_bmbt_get_all(ep, &s); 4010 xfs_bmbt_get_all(ep, &s);
4011 rval = s.br_startoff == 0 && s.br_blockcount == 1; 4011 rval = s.br_startoff == 0 && s.br_blockcount == 1;
4012 if (rval && whichfork == XFS_DATA_FORK) 4012 if (rval && whichfork == XFS_DATA_FORK)
4013 ASSERT(ip->i_size == ip->i_mount->m_sb.sb_blocksize); 4013 ASSERT(XFS_ISIZE(ip) == ip->i_mount->m_sb.sb_blocksize);
4014 return rval; 4014 return rval;
4015} 4015}
4016 4016
@@ -4379,8 +4379,6 @@ xfs_bmapi_read(
4379 XFS_STATS_INC(xs_blk_mapr); 4379 XFS_STATS_INC(xs_blk_mapr);
4380 4380
4381 ifp = XFS_IFORK_PTR(ip, whichfork); 4381 ifp = XFS_IFORK_PTR(ip, whichfork);
4382 ASSERT(ifp->if_ext_max ==
4383 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4384 4382
4385 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 4383 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
4386 error = xfs_iread_extents(NULL, ip, whichfork); 4384 error = xfs_iread_extents(NULL, ip, whichfork);
@@ -4871,8 +4869,6 @@ xfs_bmapi_write(
4871 return XFS_ERROR(EIO); 4869 return XFS_ERROR(EIO);
4872 4870
4873 ifp = XFS_IFORK_PTR(ip, whichfork); 4871 ifp = XFS_IFORK_PTR(ip, whichfork);
4874 ASSERT(ifp->if_ext_max ==
4875 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4876 4872
4877 XFS_STATS_INC(xs_blk_mapw); 4873 XFS_STATS_INC(xs_blk_mapw);
4878 4874
@@ -4981,8 +4977,7 @@ xfs_bmapi_write(
4981 /* 4977 /*
4982 * Transform from btree to extents, give it cur. 4978 * Transform from btree to extents, give it cur.
4983 */ 4979 */
4984 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 4980 if (xfs_bmap_wants_extents(ip, whichfork)) {
4985 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
4986 int tmp_logflags = 0; 4981 int tmp_logflags = 0;
4987 4982
4988 ASSERT(bma.cur); 4983 ASSERT(bma.cur);
@@ -4992,10 +4987,10 @@ xfs_bmapi_write(
4992 if (error) 4987 if (error)
4993 goto error0; 4988 goto error0;
4994 } 4989 }
4995 ASSERT(ifp->if_ext_max == 4990
4996 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
4997 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE || 4991 ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE ||
4998 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max); 4992 XFS_IFORK_NEXTENTS(ip, whichfork) >
4993 XFS_IFORK_MAXEXT(ip, whichfork));
4999 error = 0; 4994 error = 0;
5000error0: 4995error0:
5001 /* 4996 /*
@@ -5095,8 +5090,7 @@ xfs_bunmapi(
5095 5090
5096 ASSERT(len > 0); 5091 ASSERT(len > 0);
5097 ASSERT(nexts >= 0); 5092 ASSERT(nexts >= 0);
5098 ASSERT(ifp->if_ext_max == 5093
5099 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5100 if (!(ifp->if_flags & XFS_IFEXTENTS) && 5094 if (!(ifp->if_flags & XFS_IFEXTENTS) &&
5101 (error = xfs_iread_extents(tp, ip, whichfork))) 5095 (error = xfs_iread_extents(tp, ip, whichfork)))
5102 return error; 5096 return error;
@@ -5322,7 +5316,8 @@ xfs_bunmapi(
5322 */ 5316 */
5323 if (!wasdel && xfs_trans_get_block_res(tp) == 0 && 5317 if (!wasdel && xfs_trans_get_block_res(tp) == 0 &&
5324 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 5318 XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
5325 XFS_IFORK_NEXTENTS(ip, whichfork) >= ifp->if_ext_max && 5319 XFS_IFORK_NEXTENTS(ip, whichfork) >= /* Note the >= */
5320 XFS_IFORK_MAXEXT(ip, whichfork) &&
5326 del.br_startoff > got.br_startoff && 5321 del.br_startoff > got.br_startoff &&
5327 del.br_startoff + del.br_blockcount < 5322 del.br_startoff + del.br_blockcount <
5328 got.br_startoff + got.br_blockcount) { 5323 got.br_startoff + got.br_blockcount) {
@@ -5353,13 +5348,11 @@ nodelete:
5353 } 5348 }
5354 } 5349 }
5355 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0; 5350 *done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
5356 ASSERT(ifp->if_ext_max == 5351
5357 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5358 /* 5352 /*
5359 * Convert to a btree if necessary. 5353 * Convert to a btree if necessary.
5360 */ 5354 */
5361 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS && 5355 if (xfs_bmap_needs_btree(ip, whichfork)) {
5362 XFS_IFORK_NEXTENTS(ip, whichfork) > ifp->if_ext_max) {
5363 ASSERT(cur == NULL); 5356 ASSERT(cur == NULL);
5364 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist, 5357 error = xfs_bmap_extents_to_btree(tp, ip, firstblock, flist,
5365 &cur, 0, &tmp_logflags, whichfork); 5358 &cur, 0, &tmp_logflags, whichfork);
@@ -5370,8 +5363,7 @@ nodelete:
5370 /* 5363 /*
5371 * transform from btree to extents, give it cur 5364 * transform from btree to extents, give it cur
5372 */ 5365 */
5373 else if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE && 5366 else if (xfs_bmap_wants_extents(ip, whichfork)) {
5374 XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max) {
5375 ASSERT(cur != NULL); 5367 ASSERT(cur != NULL);
5376 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags, 5368 error = xfs_bmap_btree_to_extents(tp, ip, cur, &tmp_logflags,
5377 whichfork); 5369 whichfork);
@@ -5382,8 +5374,6 @@ nodelete:
5382 /* 5374 /*
5383 * transform from extents to local? 5375 * transform from extents to local?
5384 */ 5376 */
5385 ASSERT(ifp->if_ext_max ==
5386 XFS_IFORK_SIZE(ip, whichfork) / (uint)sizeof(xfs_bmbt_rec_t));
5387 error = 0; 5377 error = 0;
5388error0: 5378error0:
5389 /* 5379 /*
@@ -5434,7 +5424,7 @@ xfs_getbmapx_fix_eof_hole(
5434 if (startblock == HOLESTARTBLOCK) { 5424 if (startblock == HOLESTARTBLOCK) {
5435 mp = ip->i_mount; 5425 mp = ip->i_mount;
5436 out->bmv_block = -1; 5426 out->bmv_block = -1;
5437 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, ip->i_size)); 5427 fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
5438 fixlen -= out->bmv_offset; 5428 fixlen -= out->bmv_offset;
5439 if (prealloced && out->bmv_offset + out->bmv_length == end) { 5429 if (prealloced && out->bmv_offset + out->bmv_length == end) {
5440 /* Came to hole at EOF. Trim it. */ 5430 /* Came to hole at EOF. Trim it. */
@@ -5522,7 +5512,7 @@ xfs_getbmap(
5522 fixlen = XFS_MAXIOFFSET(mp); 5512 fixlen = XFS_MAXIOFFSET(mp);
5523 } else { 5513 } else {
5524 prealloced = 0; 5514 prealloced = 0;
5525 fixlen = ip->i_size; 5515 fixlen = XFS_ISIZE(ip);
5526 } 5516 }
5527 } 5517 }
5528 5518
@@ -5551,7 +5541,7 @@ xfs_getbmap(
5551 5541
5552 xfs_ilock(ip, XFS_IOLOCK_SHARED); 5542 xfs_ilock(ip, XFS_IOLOCK_SHARED);
5553 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { 5543 if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
5554 if (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size) { 5544 if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
5555 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); 5545 error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
5556 if (error) 5546 if (error)
5557 goto out_unlock_iolock; 5547 goto out_unlock_iolock;
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 654dc6f05bac..dd974a55c77d 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -163,12 +163,14 @@ xfs_swap_extents_check_format(
163 163
164 /* Check temp in extent form to max in target */ 164 /* Check temp in extent form to max in target */
165 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 165 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
166 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) 166 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
167 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
167 return EINVAL; 168 return EINVAL;
168 169
169 /* Check target in extent form to max in temp */ 170 /* Check target in extent form to max in temp */
170 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 171 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
171 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) 172 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
173 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
172 return EINVAL; 174 return EINVAL;
173 175
174 /* 176 /*
@@ -180,18 +182,25 @@ xfs_swap_extents_check_format(
180 * (a common defrag case) which will occur when the temp inode is in 182 * (a common defrag case) which will occur when the temp inode is in
181 * extent format... 183 * extent format...
182 */ 184 */
183 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && 185 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
184 ((XFS_IFORK_BOFF(ip) && 186 if (XFS_IFORK_BOFF(ip) &&
185 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || 187 tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
186 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) 188 return EINVAL;
187 return EINVAL; 189 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
190 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
191 return EINVAL;
192 }
188 193
189 /* Reciprocal target->temp btree format checks */ 194 /* Reciprocal target->temp btree format checks */
190 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && 195 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
191 ((XFS_IFORK_BOFF(tip) && 196 if (XFS_IFORK_BOFF(tip) &&
192 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || 197 ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
193 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) 198 return EINVAL;
194 return EINVAL; 199
200 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
201 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
202 return EINVAL;
203 }
195 204
196 return 0; 205 return 0;
197} 206}
@@ -349,16 +358,6 @@ xfs_swap_extents(
349 *tifp = *tempifp; /* struct copy */ 358 *tifp = *tempifp; /* struct copy */
350 359
351 /* 360 /*
352 * Fix the in-memory data fork values that are dependent on the fork
353 * offset in the inode. We can't assume they remain the same as attr2
354 * has dynamic fork offsets.
355 */
356 ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) /
357 (uint)sizeof(xfs_bmbt_rec_t);
358 tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) /
359 (uint)sizeof(xfs_bmbt_rec_t);
360
361 /*
362 * Fix the on-disk inode values 361 * Fix the on-disk inode values
363 */ 362 */
364 tmp = (__uint64_t)ip->i_d.di_nblocks; 363 tmp = (__uint64_t)ip->i_d.di_nblocks;
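
The dfrag change is the same logic unrolled for readability: one compound condition per direction becomes a guarded block with two early returns, with XFS_IFORK_MAXEXT() replacing the cached if_ext_max. Side by side for the temp-to-target direction:

        /* before: one compound test */
        if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
            ((XFS_IFORK_BOFF(ip) &&
              tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
             XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
             XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)))
                return EINVAL;

        /* after: the same conditions, one per return */
        if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
                if (XFS_IFORK_BOFF(ip) &&
                    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
                        return EINVAL;
                if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
                    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
                        return EINVAL;
        }

(The "before" shown already substitutes XFS_IFORK_MAXEXT() for if_ext_max so the two sides compare like for like.)
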
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 8a24f0c6c860..286a051f12cf 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -68,7 +68,7 @@ xfs_trim_extents(
68 * Look up the longest btree in the AGF and start with it. 68 * Look up the longest btree in the AGF and start with it.
69 */ 69 */
70 error = xfs_alloc_lookup_le(cur, 0, 70 error = xfs_alloc_lookup_le(cur, 0,
71 XFS_BUF_TO_AGF(agbp)->agf_longest, &i); 71 be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest), &i);
72 if (error) 72 if (error)
73 goto out_del_cursor; 73 goto out_del_cursor;
74 74
@@ -84,7 +84,7 @@ xfs_trim_extents(
84 if (error) 84 if (error)
85 goto out_del_cursor; 85 goto out_del_cursor;
86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); 86 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
87 ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest); 87 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
88 88
89 /* 89 /*
90 * Too small? Give up. 90 * Too small? Give up.
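
agf_longest lives on disk in big-endian byte order (a __be32), so using it raw as a CPU integer is only correct on big-endian hosts; on little-endian machines the unconverted value is byte-swapped garbage. The fix pattern, in isolation:

        /* on-disk field, big-endian */
        __be32 raw = XFS_BUF_TO_AGF(agbp)->agf_longest;

        /* convert before any arithmetic or comparison */
        xfs_extlen_t longest = be32_to_cpu(raw);

        error = xfs_alloc_lookup_le(cur, 0, longest, &i);

Both uses in this function, the lookup seed and the ASSERT, needed the same be32_to_cpu() treatment.
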
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f675f3d9d7b3..7e5bc872f2b4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -327,7 +327,7 @@ xfs_file_aio_read(
327 mp->m_rtdev_targp : mp->m_ddev_targp; 327 mp->m_rtdev_targp : mp->m_ddev_targp;
328 if ((iocb->ki_pos & target->bt_smask) || 328 if ((iocb->ki_pos & target->bt_smask) ||
329 (size & target->bt_smask)) { 329 (size & target->bt_smask)) {
330 if (iocb->ki_pos == ip->i_size) 330 if (iocb->ki_pos == i_size_read(inode))
331 return 0; 331 return 0;
332 return -XFS_ERROR(EINVAL); 332 return -XFS_ERROR(EINVAL);
333 } 333 }
@@ -412,51 +412,6 @@ xfs_file_splice_read(
412 return ret; 412 return ret;
413} 413}
414 414
415STATIC void
416xfs_aio_write_isize_update(
417 struct inode *inode,
418 loff_t *ppos,
419 ssize_t bytes_written)
420{
421 struct xfs_inode *ip = XFS_I(inode);
422 xfs_fsize_t isize = i_size_read(inode);
423
424 if (bytes_written > 0)
425 XFS_STATS_ADD(xs_write_bytes, bytes_written);
426
427 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
428 *ppos > isize))
429 *ppos = isize;
430
431 if (*ppos > ip->i_size) {
432 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
433 if (*ppos > ip->i_size)
434 ip->i_size = *ppos;
435 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
436 }
437}
438
439/*
440 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
441 * part of the I/O may have been written to disk before the error occurred. In
442 * this case the on-disk file size may have been adjusted beyond the in-memory
443 * file size and now needs to be truncated back.
444 */
445STATIC void
446xfs_aio_write_newsize_update(
447 struct xfs_inode *ip,
448 xfs_fsize_t new_size)
449{
450 if (new_size == ip->i_new_size) {
451 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
452 if (new_size == ip->i_new_size)
453 ip->i_new_size = 0;
454 if (ip->i_d.di_size > ip->i_size)
455 ip->i_d.di_size = ip->i_size;
456 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
457 }
458}
459
460/* 415/*
461 * xfs_file_splice_write() does not use xfs_rw_ilock() because 416 * xfs_file_splice_write() does not use xfs_rw_ilock() because
462 * generic_file_splice_write() takes the i_mutex itself. This, in theory, 417 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
@@ -475,7 +430,6 @@ xfs_file_splice_write(
475{ 430{
476 struct inode *inode = outfilp->f_mapping->host; 431 struct inode *inode = outfilp->f_mapping->host;
477 struct xfs_inode *ip = XFS_I(inode); 432 struct xfs_inode *ip = XFS_I(inode);
478 xfs_fsize_t new_size;
479 int ioflags = 0; 433 int ioflags = 0;
480 ssize_t ret; 434 ssize_t ret;
481 435
@@ -489,19 +443,12 @@ xfs_file_splice_write(
489 443
490 xfs_ilock(ip, XFS_IOLOCK_EXCL); 444 xfs_ilock(ip, XFS_IOLOCK_EXCL);
491 445
492 new_size = *ppos + count;
493
494 xfs_ilock(ip, XFS_ILOCK_EXCL);
495 if (new_size > ip->i_size)
496 ip->i_new_size = new_size;
497 xfs_iunlock(ip, XFS_ILOCK_EXCL);
498
499 trace_xfs_file_splice_write(ip, count, *ppos, ioflags); 446 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
500 447
501 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 448 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
449 if (ret > 0)
450 XFS_STATS_ADD(xs_write_bytes, ret);
502 451
503 xfs_aio_write_isize_update(inode, ppos, ret);
504 xfs_aio_write_newsize_update(ip, new_size);
505 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 452 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
506 return ret; 453 return ret;
507} 454}
@@ -689,28 +636,26 @@ out_lock:
689/* 636/*
690 * Common pre-write limit and setup checks. 637 * Common pre-write limit and setup checks.
691 * 638 *
692 * Returns with iolock held according to @iolock. 639 * Called with the iolock held either shared or exclusive according to
640 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
641 * if called for a direct write beyond i_size.
693 */ 642 */
694STATIC ssize_t 643STATIC ssize_t
695xfs_file_aio_write_checks( 644xfs_file_aio_write_checks(
696 struct file *file, 645 struct file *file,
697 loff_t *pos, 646 loff_t *pos,
698 size_t *count, 647 size_t *count,
699 xfs_fsize_t *new_sizep,
700 int *iolock) 648 int *iolock)
701{ 649{
702 struct inode *inode = file->f_mapping->host; 650 struct inode *inode = file->f_mapping->host;
703 struct xfs_inode *ip = XFS_I(inode); 651 struct xfs_inode *ip = XFS_I(inode);
704 xfs_fsize_t new_size;
705 int error = 0; 652 int error = 0;
706 653
707 xfs_rw_ilock(ip, XFS_ILOCK_EXCL); 654 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
708 *new_sizep = 0;
709restart: 655restart:
710 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); 656 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
711 if (error) { 657 if (error) {
712 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 658 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
713 *iolock = 0;
714 return error; 659 return error;
715 } 660 }
716 661
@@ -720,36 +665,21 @@ restart:
720 /* 665 /*
721 * If the offset is beyond the size of the file, we need to zero any 666 * If the offset is beyond the size of the file, we need to zero any
722 * blocks that fall between the existing EOF and the start of this 667 * blocks that fall between the existing EOF and the start of this
723 * write. There is no need to issue zeroing if another in-flght IO ends 668 * write. If zeroing is needed and we are currently holding the
724 * at or before this one If zeronig is needed and we are currently 669 * iolock shared, we need to update it to exclusive which involves
725 * holding the iolock shared, we need to update it to exclusive which 670 * dropping all locks and relocking to maintain correct locking order.
726 * involves dropping all locks and relocking to maintain correct locking 671 * If we do this, restart the function to ensure all checks and values
727 * order. If we do this, restart the function to ensure all checks and 672 * are still valid.
728 * values are still valid.
729 */ 673 */
730 if ((ip->i_new_size && *pos > ip->i_new_size) || 674 if (*pos > i_size_read(inode)) {
731 (!ip->i_new_size && *pos > ip->i_size)) {
732 if (*iolock == XFS_IOLOCK_SHARED) { 675 if (*iolock == XFS_IOLOCK_SHARED) {
733 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock); 676 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
734 *iolock = XFS_IOLOCK_EXCL; 677 *iolock = XFS_IOLOCK_EXCL;
735 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock); 678 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
736 goto restart; 679 goto restart;
737 } 680 }
738 error = -xfs_zero_eof(ip, *pos, ip->i_size); 681 error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
739 } 682 }
740
741 /*
742 * If this IO extends beyond EOF, we may need to update ip->i_new_size.
743 * We have already zeroed space beyond EOF (if necessary). Only update
744 * ip->i_new_size if this IO ends beyond any other in-flight writes.
745 */
746 new_size = *pos + *count;
747 if (new_size > ip->i_size) {
748 if (new_size > ip->i_new_size)
749 ip->i_new_size = new_size;
750 *new_sizep = new_size;
751 }
752
753 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL); 683 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
754 if (error) 684 if (error)
755 return error; 685 return error;
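
The checks helper now carries less state but a subtler locking rule: if the write starts beyond EOF while only the shared iolock is held, it must upgrade to exclusive, and since that cannot be done atomically it drops every lock, retakes, and jumps back to restart so that all decisions are re-made under the new locks. The core of the pattern, condensed from the post-patch code above:

        xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
restart:
        error = generic_write_checks(file, pos, count,
                                     S_ISBLK(inode->i_mode));
        if (error) {
                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
                return error;
        }

        if (*pos > i_size_read(inode)) {
                if (*iolock == XFS_IOLOCK_SHARED) {
                        /* drop both, retake with exclusive iolock */
                        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
                        *iolock = XFS_IOLOCK_EXCL;
                        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
                        goto restart;   /* nothing decided so far holds */
                }
                error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
        }
        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);

The goto after the relock is the load-bearing part: the i_size comparison and the generic checks are only trustworthy for the lock mode they ran under.
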
@@ -794,9 +724,7 @@ xfs_file_dio_aio_write(
794 const struct iovec *iovp, 724 const struct iovec *iovp,
795 unsigned long nr_segs, 725 unsigned long nr_segs,
796 loff_t pos, 726 loff_t pos,
797 size_t ocount, 727 size_t ocount)
798 xfs_fsize_t *new_size,
799 int *iolock)
800{ 728{
801 struct file *file = iocb->ki_filp; 729 struct file *file = iocb->ki_filp;
802 struct address_space *mapping = file->f_mapping; 730 struct address_space *mapping = file->f_mapping;
@@ -806,10 +734,10 @@ xfs_file_dio_aio_write(
806 ssize_t ret = 0; 734 ssize_t ret = 0;
807 size_t count = ocount; 735 size_t count = ocount;
808 int unaligned_io = 0; 736 int unaligned_io = 0;
737 int iolock;
809 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 738 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
810 mp->m_rtdev_targp : mp->m_ddev_targp; 739 mp->m_rtdev_targp : mp->m_ddev_targp;
811 740
812 *iolock = 0;
813 if ((pos & target->bt_smask) || (count & target->bt_smask)) 741 if ((pos & target->bt_smask) || (count & target->bt_smask))
814 return -XFS_ERROR(EINVAL); 742 return -XFS_ERROR(EINVAL);
815 743
@@ -824,31 +752,31 @@ xfs_file_dio_aio_write(
824 * EOF zeroing cases and fill out the new inode size as appropriate. 752 * EOF zeroing cases and fill out the new inode size as appropriate.
825 */ 753 */
826 if (unaligned_io || mapping->nrpages) 754 if (unaligned_io || mapping->nrpages)
827 *iolock = XFS_IOLOCK_EXCL; 755 iolock = XFS_IOLOCK_EXCL;
828 else 756 else
829 *iolock = XFS_IOLOCK_SHARED; 757 iolock = XFS_IOLOCK_SHARED;
830 xfs_rw_ilock(ip, *iolock); 758 xfs_rw_ilock(ip, iolock);
831 759
832 /* 760 /*
833 * Recheck if there are cached pages that need invalidate after we got 761 * Recheck if there are cached pages that need invalidate after we got
834 * the iolock to protect against other threads adding new pages while 762 * the iolock to protect against other threads adding new pages while
835 * we were waiting for the iolock. 763 * we were waiting for the iolock.
836 */ 764 */
837 if (mapping->nrpages && *iolock == XFS_IOLOCK_SHARED) { 765 if (mapping->nrpages && iolock == XFS_IOLOCK_SHARED) {
838 xfs_rw_iunlock(ip, *iolock); 766 xfs_rw_iunlock(ip, iolock);
839 *iolock = XFS_IOLOCK_EXCL; 767 iolock = XFS_IOLOCK_EXCL;
840 xfs_rw_ilock(ip, *iolock); 768 xfs_rw_ilock(ip, iolock);
841 } 769 }
842 770
843 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); 771 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
844 if (ret) 772 if (ret)
845 return ret; 773 goto out;
846 774
847 if (mapping->nrpages) { 775 if (mapping->nrpages) {
848 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, 776 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
849 FI_REMAPF_LOCKED); 777 FI_REMAPF_LOCKED);
850 if (ret) 778 if (ret)
851 return ret; 779 goto out;
852 } 780 }
853 781
854 /* 782 /*
@@ -857,15 +785,18 @@ xfs_file_dio_aio_write(
857 */ 785 */
858 if (unaligned_io) 786 if (unaligned_io)
859 inode_dio_wait(inode); 787 inode_dio_wait(inode);
860 else if (*iolock == XFS_IOLOCK_EXCL) { 788 else if (iolock == XFS_IOLOCK_EXCL) {
861 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 789 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
862 *iolock = XFS_IOLOCK_SHARED; 790 iolock = XFS_IOLOCK_SHARED;
863 } 791 }
864 792
865 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 793 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
866 ret = generic_file_direct_write(iocb, iovp, 794 ret = generic_file_direct_write(iocb, iovp,
867 &nr_segs, pos, &iocb->ki_pos, count, ocount); 795 &nr_segs, pos, &iocb->ki_pos, count, ocount);
868 796
797out:
798 xfs_rw_iunlock(ip, iolock);
799
869 /* No fallback to buffered IO on errors for XFS. */ 800 /* No fallback to buffered IO on errors for XFS. */
870 ASSERT(ret < 0 || ret == count); 801 ASSERT(ret < 0 || ret == count);
871 return ret; 802 return ret;
@@ -877,9 +808,7 @@ xfs_file_buffered_aio_write(
877 const struct iovec *iovp, 808 const struct iovec *iovp,
878 unsigned long nr_segs, 809 unsigned long nr_segs,
879 loff_t pos, 810 loff_t pos,
880 size_t ocount, 811 size_t ocount)
881 xfs_fsize_t *new_size,
882 int *iolock)
883{ 812{
884 struct file *file = iocb->ki_filp; 813 struct file *file = iocb->ki_filp;
885 struct address_space *mapping = file->f_mapping; 814 struct address_space *mapping = file->f_mapping;
@@ -887,14 +816,14 @@ xfs_file_buffered_aio_write(
887 struct xfs_inode *ip = XFS_I(inode); 816 struct xfs_inode *ip = XFS_I(inode);
888 ssize_t ret; 817 ssize_t ret;
889 int enospc = 0; 818 int enospc = 0;
819 int iolock = XFS_IOLOCK_EXCL;
890 size_t count = ocount; 820 size_t count = ocount;
891 821
892 *iolock = XFS_IOLOCK_EXCL; 822 xfs_rw_ilock(ip, iolock);
893 xfs_rw_ilock(ip, *iolock);
894 823
895 ret = xfs_file_aio_write_checks(file, &pos, &count, new_size, iolock); 824 ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
896 if (ret) 825 if (ret)
897 return ret; 826 goto out;
898 827
899 /* We can write back this queue in page reclaim */ 828 /* We can write back this queue in page reclaim */
900 current->backing_dev_info = mapping->backing_dev_info; 829 current->backing_dev_info = mapping->backing_dev_info;
@@ -908,13 +837,15 @@ write_retry:
908 * page locks and retry *once* 837 * page locks and retry *once*
909 */ 838 */
910 if (ret == -ENOSPC && !enospc) { 839 if (ret == -ENOSPC && !enospc) {
911 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
912 if (ret)
913 return ret;
914 enospc = 1; 840 enospc = 1;
915 goto write_retry; 841 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
842 if (!ret)
843 goto write_retry;
916 } 844 }
845
917 current->backing_dev_info = NULL; 846 current->backing_dev_info = NULL;
847out:
848 xfs_rw_iunlock(ip, iolock);
918 return ret; 849 return ret;
919} 850}
920 851
@@ -930,9 +861,7 @@ xfs_file_aio_write(
930 struct inode *inode = mapping->host; 861 struct inode *inode = mapping->host;
931 struct xfs_inode *ip = XFS_I(inode); 862 struct xfs_inode *ip = XFS_I(inode);
932 ssize_t ret; 863 ssize_t ret;
933 int iolock;
934 size_t ocount = 0; 864 size_t ocount = 0;
935 xfs_fsize_t new_size = 0;
936 865
937 XFS_STATS_INC(xs_write_calls); 866 XFS_STATS_INC(xs_write_calls);
938 867
@@ -951,33 +880,22 @@ xfs_file_aio_write(
951 return -EIO; 880 return -EIO;
952 881
953 if (unlikely(file->f_flags & O_DIRECT)) 882 if (unlikely(file->f_flags & O_DIRECT))
954 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, 883 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
955 ocount, &new_size, &iolock);
956 else 884 else
957 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos, 885 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
958 ocount, &new_size, &iolock); 886 ocount);
959
960 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
961 887
962 if (ret <= 0) 888 if (ret > 0) {
963 goto out_unlock; 889 ssize_t err;
964 890
965 /* Handle various SYNC-type writes */ 891 XFS_STATS_ADD(xs_write_bytes, ret);
966 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
967 loff_t end = pos + ret - 1;
968 int error;
969 892
970 xfs_rw_iunlock(ip, iolock); 893 /* Handle various SYNC-type writes */
971 error = xfs_file_fsync(file, pos, end, 894 err = generic_write_sync(file, pos, ret);
972 (file->f_flags & __O_SYNC) ? 0 : 1); 895 if (err < 0)
973 xfs_rw_ilock(ip, iolock); 896 ret = err;
974 if (error)
975 ret = error;
976 } 897 }
977 898
978out_unlock:
979 xfs_aio_write_newsize_update(ip, new_size);
980 xfs_rw_iunlock(ip, iolock);
981 return ret; 899 return ret;
982} 900}
983 901
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index ed88ed16811c..652b875a9d4c 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -90,7 +90,7 @@ xfs_wait_on_pages(
90 90
91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { 91 if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
92 return -filemap_fdatawait_range(mapping, first, 92 return -filemap_fdatawait_range(mapping, first,
93 last == -1 ? ip->i_size - 1 : last); 93 last == -1 ? XFS_ISIZE(ip) - 1 : last);
94 } 94 }
95 return 0; 95 return 0;
96} 96}
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 3960a066d7ff..8c3e46394d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -77,7 +77,7 @@ xfs_inode_alloc(
77 77
78 ASSERT(atomic_read(&ip->i_pincount) == 0); 78 ASSERT(atomic_read(&ip->i_pincount) == 0);
79 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 79 ASSERT(!spin_is_locked(&ip->i_flags_lock));
80 ASSERT(completion_done(&ip->i_flush)); 80 ASSERT(!xfs_isiflocked(ip));
81 ASSERT(ip->i_ino == 0); 81 ASSERT(ip->i_ino == 0);
82 82
83 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 83 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
@@ -94,8 +94,6 @@ xfs_inode_alloc(
94 ip->i_update_core = 0; 94 ip->i_update_core = 0;
95 ip->i_delayed_blks = 0; 95 ip->i_delayed_blks = 0;
96 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 96 memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
97 ip->i_size = 0;
98 ip->i_new_size = 0;
99 97
100 return ip; 98 return ip;
101} 99}
@@ -150,7 +148,7 @@ xfs_inode_free(
150 /* asserts to verify all state is correct here */ 148 /* asserts to verify all state is correct here */
151 ASSERT(atomic_read(&ip->i_pincount) == 0); 149 ASSERT(atomic_read(&ip->i_pincount) == 0);
152 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 150 ASSERT(!spin_is_locked(&ip->i_flags_lock));
153 ASSERT(completion_done(&ip->i_flush)); 151 ASSERT(!xfs_isiflocked(ip));
154 152
155 /* 153 /*
156 * Because we use RCU freeing we need to ensure the inode always 154 * Because we use RCU freeing we need to ensure the inode always
@@ -450,8 +448,6 @@ again:
450 448
451 *ipp = ip; 449 *ipp = ip;
452 450
453 ASSERT(ip->i_df.if_ext_max ==
454 XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
455 /* 451 /*
456 * If we have a real type for an on-disk inode, we can set ops(&unlock) 452 * If we have a real type for an on-disk inode, we can set ops(&unlock)
457 * now. If it's a new inode being created, xfs_ialloc will handle it. 453 * now. If it's a new inode being created, xfs_ialloc will handle it.
@@ -715,3 +711,19 @@ xfs_isilocked(
715 return 0; 711 return 0;
716} 712}
717#endif 713#endif
714
715void
716__xfs_iflock(
717 struct xfs_inode *ip)
718{
719 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
720 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
721
722 do {
723 prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
724 if (xfs_isiflocked(ip))
725 io_schedule();
726 } while (!xfs_iflock_nowait(ip));
727
728 finish_wait(wq, &wait.wait);
729}
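
__xfs_iflock() open-codes its wait loop because the flush "lock" is just a flag bit: whoever wins xfs_iflock_nowait() owns it, and prepare_to_wait_exclusive() ensures only one sleeper is woken per release. A userspace analogue of the same structure using pthreads, illustrative only (all names here are invented):

        #include <pthread.h>
        #include <stdbool.h>

        struct flush_lock {
                pthread_mutex_t mtx;
                pthread_cond_t  cv;
                bool            flocked;        /* the XFS_IFLOCK analogue */
        };

        static struct flush_lock fl = {
                .mtx = PTHREAD_MUTEX_INITIALIZER,
                .cv  = PTHREAD_COND_INITIALIZER,
        };

        static bool flush_trylock(struct flush_lock *l)  /* xfs_iflock_nowait() */
        {
                pthread_mutex_lock(&l->mtx);
                bool won = !l->flocked;
                l->flocked = true;              /* no-op if already set */
                pthread_mutex_unlock(&l->mtx);
                return won;
        }

        static void flush_lock(struct flush_lock *l)     /* __xfs_iflock() */
        {
                pthread_mutex_lock(&l->mtx);
                while (l->flocked)
                        pthread_cond_wait(&l->cv, &l->mtx);
                l->flocked = true;
                pthread_mutex_unlock(&l->mtx);
        }

        static void flush_unlock(struct flush_lock *l)   /* xfs_ifunlock() */
        {
                pthread_mutex_lock(&l->mtx);
                l->flocked = false;
                pthread_cond_signal(&l->cv);    /* wake one, like the exclusive
                                                   bit wait in the kernel code */
                pthread_mutex_unlock(&l->mtx);
        }
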
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 9dda7cc32848..b21022499c2e 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -299,11 +299,8 @@ xfs_iformat(
299{ 299{
300 xfs_attr_shortform_t *atp; 300 xfs_attr_shortform_t *atp;
301 int size; 301 int size;
302 int error; 302 int error = 0;
303 xfs_fsize_t di_size; 303 xfs_fsize_t di_size;
304 ip->i_df.if_ext_max =
305 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
306 error = 0;
307 304
308 if (unlikely(be32_to_cpu(dip->di_nextents) + 305 if (unlikely(be32_to_cpu(dip->di_nextents) +
309 be16_to_cpu(dip->di_anextents) > 306 be16_to_cpu(dip->di_anextents) >
@@ -350,7 +347,6 @@ xfs_iformat(
350 return XFS_ERROR(EFSCORRUPTED); 347 return XFS_ERROR(EFSCORRUPTED);
351 } 348 }
352 ip->i_d.di_size = 0; 349 ip->i_d.di_size = 0;
353 ip->i_size = 0;
354 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip); 350 ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
355 break; 351 break;
356 352
@@ -409,10 +405,10 @@ xfs_iformat(
409 } 405 }
410 if (!XFS_DFORK_Q(dip)) 406 if (!XFS_DFORK_Q(dip))
411 return 0; 407 return 0;
408
412 ASSERT(ip->i_afp == NULL); 409 ASSERT(ip->i_afp == NULL);
413 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS); 410 ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);
414 ip->i_afp->if_ext_max = 411
415 XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
416 switch (dip->di_aformat) { 412 switch (dip->di_aformat) {
417 case XFS_DINODE_FMT_LOCAL: 413 case XFS_DINODE_FMT_LOCAL:
418 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip); 414 atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
@@ -604,10 +600,11 @@ xfs_iformat_btree(
604 * or the number of extents is greater than the number of 600 * or the number of extents is greater than the number of
605 * blocks. 601 * blocks.
606 */ 602 */
607 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max 603 if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
608 || XFS_BMDR_SPACE_CALC(nrecs) > 604 XFS_IFORK_MAXEXT(ip, whichfork) ||
609 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 605 XFS_BMDR_SPACE_CALC(nrecs) >
610 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 606 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
607 XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
611 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).", 608 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
612 (unsigned long long) ip->i_ino); 609 (unsigned long long) ip->i_ino);
613 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 610 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -835,12 +832,6 @@ xfs_iread(
835 * with the uninitialized part of it. 832 * with the uninitialized part of it.
836 */ 833 */
837 ip->i_d.di_mode = 0; 834 ip->i_d.di_mode = 0;
838 /*
839 * Initialize the per-fork minima and maxima for a new
840 * inode here. xfs_iformat will do it for old inodes.
841 */
842 ip->i_df.if_ext_max =
843 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
844 } 835 }
845 836
846 /* 837 /*
@@ -861,7 +852,6 @@ xfs_iread(
861 } 852 }
862 853
863 ip->i_delayed_blks = 0; 854 ip->i_delayed_blks = 0;
864 ip->i_size = ip->i_d.di_size;
865 855
866 /* 856 /*
867 * Mark the buffer containing the inode as something to keep 857 * Mark the buffer containing the inode as something to keep
@@ -1051,7 +1041,6 @@ xfs_ialloc(
1051 } 1041 }
1052 1042
1053 ip->i_d.di_size = 0; 1043 ip->i_d.di_size = 0;
1054 ip->i_size = 0;
1055 ip->i_d.di_nextents = 0; 1044 ip->i_d.di_nextents = 0;
1056 ASSERT(ip->i_d.di_nblocks == 0); 1045 ASSERT(ip->i_d.di_nblocks == 0);
1057 1046
@@ -1166,52 +1155,6 @@ xfs_ialloc(
1166} 1155}
1167 1156
1168/* 1157/*
1169 * Check to make sure that there are no blocks allocated to the
1170 * file beyond the size of the file. We don't check this for
1171 * files with fixed size extents or real time extents, but we
1172 * at least do it for regular files.
1173 */
1174#ifdef DEBUG
1175STATIC void
1176xfs_isize_check(
1177 struct xfs_inode *ip,
1178 xfs_fsize_t isize)
1179{
1180 struct xfs_mount *mp = ip->i_mount;
1181 xfs_fileoff_t map_first;
1182 int nimaps;
1183 xfs_bmbt_irec_t imaps[2];
1184 int error;
1185
1186 if (!S_ISREG(ip->i_d.di_mode))
1187 return;
1188
1189 if (XFS_IS_REALTIME_INODE(ip))
1190 return;
1191
1192 if (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
1193 return;
1194
1195 nimaps = 2;
1196 map_first = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
1197 /*
1198 * The filesystem could be shutting down, so bmapi may return
1199 * an error.
1200 */
1201 error = xfs_bmapi_read(ip, map_first,
1202 (XFS_B_TO_FSB(mp,
1203 (xfs_ufsize_t)XFS_MAXIOFFSET(mp)) - map_first),
1204 imaps, &nimaps, XFS_BMAPI_ENTIRE);
1205 if (error)
1206 return;
1207 ASSERT(nimaps == 1);
1208 ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
1209}
1210#else /* DEBUG */
1211#define xfs_isize_check(ip, isize)
1212#endif /* DEBUG */
1213
1214/*
1215 * Free up the underlying blocks past new_size. The new size must be smaller 1158 * Free up the underlying blocks past new_size. The new size must be smaller
1216 * than the current size. This routine can be used both for the attribute and 1159 * than the current size. This routine can be used both for the attribute and
1217 * data fork, and does not modify the inode size, which is left to the caller. 1160 * data fork, and does not modify the inode size, which is left to the caller.
@@ -1252,12 +1195,14 @@ xfs_itruncate_extents(
1252 int done = 0; 1195 int done = 0;
1253 1196
1254 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 1197 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
1255 ASSERT(new_size <= ip->i_size); 1198 ASSERT(new_size <= XFS_ISIZE(ip));
1256 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 1199 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1257 ASSERT(ip->i_itemp != NULL); 1200 ASSERT(ip->i_itemp != NULL);
1258 ASSERT(ip->i_itemp->ili_lock_flags == 0); 1201 ASSERT(ip->i_itemp->ili_lock_flags == 0);
1259 ASSERT(!XFS_NOT_DQATTACHED(mp, ip)); 1202 ASSERT(!XFS_NOT_DQATTACHED(mp, ip));
1260 1203
1204 trace_xfs_itruncate_extents_start(ip, new_size);
1205
1261 /* 1206 /*
1262 * Since it is possible for space to become allocated beyond 1207 * Since it is possible for space to become allocated beyond
1263 * the end of the file (in a crash where the space is allocated 1208 * the end of the file (in a crash where the space is allocated
@@ -1325,6 +1270,14 @@ xfs_itruncate_extents(
1325 goto out; 1270 goto out;
1326 } 1271 }
1327 1272
1273 /*
1274 * Always re-log the inode so that our permanent transaction can keep
1275 * on rolling it forward in the log.
1276 */
1277 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1278
1279 trace_xfs_itruncate_extents_end(ip, new_size);
1280
1328out: 1281out:
1329 *tpp = tp; 1282 *tpp = tp;
1330 return error; 1283 return error;
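
The new xfs_trans_log_inode() call implements the comment above it: xfs_itruncate_extents() runs as a permanent transaction that is committed and rolled repeatedly while extents are unmapped a few at a time, and re-logging the inode each round moves its log item forward so it never pins the log tail. A condensed sketch of that shape, with hypothetical helper names (the real loop lives earlier in this function):

        do {
                error = unmap_some_extents(tp, ip, &done);      /* hypothetical */
                if (error)
                        break;
                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);     /* keep the log
                                                                   item moving */
                error = xfs_trans_roll(&tp, ip);        /* commit, start next */
        } while (!done && !error);
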
@@ -1338,74 +1291,6 @@ out_bmap_cancel:
1338 goto out; 1291 goto out;
1339} 1292}
1340 1293
1341int
1342xfs_itruncate_data(
1343 struct xfs_trans **tpp,
1344 struct xfs_inode *ip,
1345 xfs_fsize_t new_size)
1346{
1347 int error;
1348
1349 trace_xfs_itruncate_data_start(ip, new_size);
1350
1351 /*
1352 * The first thing we do is set the size to new_size permanently on
1353 * disk. This way we don't have to worry about anyone ever being able
1354 * to look at the data being freed even in the face of a crash.
1355 * What we're getting around here is the case where we free a block, it
1356 * is allocated to another file, it is written to, and then we crash.
1357 * If the new data gets written to the file but the log buffers
1358 * containing the free and reallocation don't, then we'd end up with
1359 * garbage in the blocks being freed. As long as we make the new_size
1360 * permanent before actually freeing any blocks it doesn't matter if
1361 * they get written to.
1362 */
1363 if (ip->i_d.di_nextents > 0) {
1364 /*
1365 * If we are not changing the file size then do not update
1366 * the on-disk file size - we may be called from
1367 * xfs_inactive_free_eofblocks(). If we update the on-disk
1368 * file size and then the system crashes before the contents
1369 * of the file are flushed to disk then the files may be
1370 * full of holes (ie NULL files bug).
1371 */
1372 if (ip->i_size != new_size) {
1373 ip->i_d.di_size = new_size;
1374 ip->i_size = new_size;
1375 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1376 }
1377 }
1378
1379 error = xfs_itruncate_extents(tpp, ip, XFS_DATA_FORK, new_size);
1380 if (error)
1381 return error;
1382
1383 /*
1384 * If we are not changing the file size then do not update the on-disk
1385 * file size - we may be called from xfs_inactive_free_eofblocks().
1386 * If we update the on-disk file size and then the system crashes
1387 * before the contents of the file are flushed to disk then the files
1388 * may be full of holes (ie NULL files bug).
1389 */
1390 xfs_isize_check(ip, new_size);
1391 if (ip->i_size != new_size) {
1392 ip->i_d.di_size = new_size;
1393 ip->i_size = new_size;
1394 }
1395
1396 ASSERT(new_size != 0 || ip->i_delayed_blks == 0);
1397 ASSERT(new_size != 0 || ip->i_d.di_nextents == 0);
1398
1399 /*
1400 * Always re-log the inode so that our permanent transaction can keep
1401 * on rolling it forward in the log.
1402 */
1403 xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1404
1405 trace_xfs_itruncate_data_end(ip, new_size);
1406 return 0;
1407}
1408
1409/* 1294/*
1410 * This is called when the inode's link count goes to 0. 1295 * This is called when the inode's link count goes to 0.
1411 * We place the on-disk inode on a list in the AGI. It 1296 * We place the on-disk inode on a list in the AGI. It
@@ -1824,8 +1709,7 @@ xfs_ifree(
1824 ASSERT(ip->i_d.di_nlink == 0); 1709 ASSERT(ip->i_d.di_nlink == 0);
1825 ASSERT(ip->i_d.di_nextents == 0); 1710 ASSERT(ip->i_d.di_nextents == 0);
1826 ASSERT(ip->i_d.di_anextents == 0); 1711 ASSERT(ip->i_d.di_anextents == 0);
1827 ASSERT((ip->i_d.di_size == 0 && ip->i_size == 0) || 1712 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1828 (!S_ISREG(ip->i_d.di_mode)));
1829 ASSERT(ip->i_d.di_nblocks == 0); 1713 ASSERT(ip->i_d.di_nblocks == 0);
1830 1714
1831 /* 1715 /*
@@ -1844,8 +1728,6 @@ xfs_ifree(
1844 ip->i_d.di_flags = 0; 1728 ip->i_d.di_flags = 0;
1845 ip->i_d.di_dmevmask = 0; 1729 ip->i_d.di_dmevmask = 0;
1846 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */ 1730 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
1847 ip->i_df.if_ext_max =
1848 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
1849 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; 1731 ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1850 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; 1732 ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1851 /* 1733 /*
@@ -2151,7 +2033,7 @@ xfs_idestroy_fork(
2151 * once someone is waiting for it to be unpinned. 2033 * once someone is waiting for it to be unpinned.
2152 */ 2034 */
2153static void 2035static void
2154xfs_iunpin_nowait( 2036xfs_iunpin(
2155 struct xfs_inode *ip) 2037 struct xfs_inode *ip)
2156{ 2038{
2157 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2039 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
@@ -2163,14 +2045,29 @@ xfs_iunpin_nowait(
2163 2045
2164} 2046}
2165 2047
2048static void
2049__xfs_iunpin_wait(
2050 struct xfs_inode *ip)
2051{
2052 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2053 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2054
2055 xfs_iunpin(ip);
2056
2057 do {
2058 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2059 if (xfs_ipincount(ip))
2060 io_schedule();
2061 } while (xfs_ipincount(ip));
2062 finish_wait(wq, &wait.wait);
2063}
2064
2166void 2065void
2167xfs_iunpin_wait( 2066xfs_iunpin_wait(
2168 struct xfs_inode *ip) 2067 struct xfs_inode *ip)
2169{ 2068{
2170 if (xfs_ipincount(ip)) { 2069 if (xfs_ipincount(ip))
2171 xfs_iunpin_nowait(ip); 2070 __xfs_iunpin_wait(ip);
2172 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2173 }
2174} 2071}
2175 2072
2176/* 2073/*
@@ -2510,9 +2407,9 @@ xfs_iflush(
2510 XFS_STATS_INC(xs_iflush_count); 2407 XFS_STATS_INC(xs_iflush_count);
2511 2408
2512 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2409 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2513 ASSERT(!completion_done(&ip->i_flush)); 2410 ASSERT(xfs_isiflocked(ip));
2514 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2411 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2515 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2412 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2516 2413
2517 iip = ip->i_itemp; 2414 iip = ip->i_itemp;
2518 mp = ip->i_mount; 2415 mp = ip->i_mount;
@@ -2529,7 +2426,7 @@ xfs_iflush(
2529 * out for us if they occur after the log force completes. 2426 * out for us if they occur after the log force completes.
2530 */ 2427 */
2531 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) { 2428 if (!(flags & SYNC_WAIT) && xfs_ipincount(ip)) {
2532 xfs_iunpin_nowait(ip); 2429 xfs_iunpin(ip);
2533 xfs_ifunlock(ip); 2430 xfs_ifunlock(ip);
2534 return EAGAIN; 2431 return EAGAIN;
2535 } 2432 }
@@ -2626,9 +2523,9 @@ xfs_iflush_int(
2626#endif 2523#endif
2627 2524
2628 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2525 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2629 ASSERT(!completion_done(&ip->i_flush)); 2526 ASSERT(xfs_isiflocked(ip));
2630 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || 2527 ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2631 ip->i_d.di_nextents > ip->i_df.if_ext_max); 2528 ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2632 2529
2633 iip = ip->i_itemp; 2530 iip = ip->i_itemp;
2634 mp = ip->i_mount; 2531 mp = ip->i_mount;
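
Note the deliberate asymmetry between the two wait loops added in this file and in xfs_iget.c: __xfs_iflock() sleeps with prepare_to_wait_exclusive() because the flush lock hands off to exactly one waiter per xfs_ifunlock(), while __xfs_iunpin_wait() uses plain prepare_to_wait() because the pin count reaching zero is a broadcast event that every waiter may act on. Side by side:

        /* flush lock: hand-off, wake one waiter per release */
        prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);

        /* pin count: broadcast, all waiters proceed once it hits zero */
        prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
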
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f0e6b151ba37..2f27b7454085 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -66,7 +66,6 @@ typedef struct xfs_ifork {
66 struct xfs_btree_block *if_broot; /* file's incore btree root */ 66 struct xfs_btree_block *if_broot; /* file's incore btree root */
67 short if_broot_bytes; /* bytes allocated for root */ 67 short if_broot_bytes; /* bytes allocated for root */
68 unsigned char if_flags; /* per-fork flags */ 68 unsigned char if_flags; /* per-fork flags */
69 unsigned char if_ext_max; /* max # of extent records */
70 union { 69 union {
71 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */ 70 xfs_bmbt_rec_host_t *if_extents;/* linear map file exts */
72 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */ 71 xfs_ext_irec_t *if_ext_irec; /* irec map file exts */
@@ -206,12 +205,12 @@ typedef struct xfs_icdinode {
206 ((w) == XFS_DATA_FORK ? \ 205 ((w) == XFS_DATA_FORK ? \
207 ((ip)->i_d.di_nextents = (n)) : \ 206 ((ip)->i_d.di_nextents = (n)) : \
208 ((ip)->i_d.di_anextents = (n))) 207 ((ip)->i_d.di_anextents = (n)))
209 208#define XFS_IFORK_MAXEXT(ip, w) \
209 (XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
210 210
211 211
212#ifdef __KERNEL__ 212#ifdef __KERNEL__
213 213
214struct bhv_desc;
215struct xfs_buf; 214struct xfs_buf;
216struct xfs_bmap_free; 215struct xfs_bmap_free;
217struct xfs_bmbt_irec; 216struct xfs_bmbt_irec;
@@ -220,12 +219,6 @@ struct xfs_mount;
220struct xfs_trans; 219struct xfs_trans;
221struct xfs_dquot; 220struct xfs_dquot;
222 221
223typedef struct dm_attrs_s {
224 __uint32_t da_dmevmask; /* DMIG event mask */
225 __uint16_t da_dmstate; /* DMIG state info */
226 __uint16_t da_pad; /* DMIG extra padding */
227} dm_attrs_t;
228
229typedef struct xfs_inode { 222typedef struct xfs_inode {
230 /* Inode linking and identification information. */ 223 /* Inode linking and identification information. */
231 struct xfs_mount *i_mount; /* fs mount struct ptr */ 224 struct xfs_mount *i_mount; /* fs mount struct ptr */
@@ -244,27 +237,19 @@ typedef struct xfs_inode {
244 struct xfs_inode_log_item *i_itemp; /* logging information */ 237 struct xfs_inode_log_item *i_itemp; /* logging information */
245 mrlock_t i_lock; /* inode lock */ 238 mrlock_t i_lock; /* inode lock */
246 mrlock_t i_iolock; /* inode IO lock */ 239 mrlock_t i_iolock; /* inode IO lock */
247 struct completion i_flush; /* inode flush completion q */
248 atomic_t i_pincount; /* inode pin count */ 240 atomic_t i_pincount; /* inode pin count */
249 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
250 spinlock_t i_flags_lock; /* inode i_flags lock */ 241 spinlock_t i_flags_lock; /* inode i_flags lock */
251 /* Miscellaneous state. */ 242 /* Miscellaneous state. */
252 unsigned short i_flags; /* see defined flags below */ 243 unsigned long i_flags; /* see defined flags below */
253 unsigned char i_update_core; /* timestamps/size is dirty */ 244 unsigned char i_update_core; /* timestamps/size is dirty */
254 unsigned int i_delayed_blks; /* count of delay alloc blks */ 245 unsigned int i_delayed_blks; /* count of delay alloc blks */
255 246
256 xfs_icdinode_t i_d; /* most of ondisk inode */ 247 xfs_icdinode_t i_d; /* most of ondisk inode */
257 248
258 xfs_fsize_t i_size; /* in-memory size */
259 xfs_fsize_t i_new_size; /* size when write completes */
260
261 /* VFS inode */ 249 /* VFS inode */
262 struct inode i_vnode; /* embedded VFS inode */ 250 struct inode i_vnode; /* embedded VFS inode */
263} xfs_inode_t; 251} xfs_inode_t;
264 252
265#define XFS_ISIZE(ip) S_ISREG((ip)->i_d.di_mode) ? \
266 (ip)->i_size : (ip)->i_d.di_size;
267
268/* Convert from vfs inode to xfs inode */ 253/* Convert from vfs inode to xfs inode */
269static inline struct xfs_inode *XFS_I(struct inode *inode) 254static inline struct xfs_inode *XFS_I(struct inode *inode)
270{ 255{
@@ -278,6 +263,18 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
278} 263}
279 264
280/* 265/*
266 * For regular files we only update the on-disk filesize when actually
267 * writing data back to disk. Until then only the copy in the VFS inode
268 * is uptodate.
269 */
270static inline xfs_fsize_t XFS_ISIZE(struct xfs_inode *ip)
271{
272 if (S_ISREG(ip->i_d.di_mode))
273 return i_size_read(VFS_I(ip));
274 return ip->i_d.di_size;
275}
276
277/*
281 * i_flags helper functions 278 * i_flags helper functions
282 */ 279 */
283static inline void 280static inline void
@@ -331,6 +328,19 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
331 return ret; 328 return ret;
332} 329}
333 330
331static inline int
332xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
333{
334 int ret;
335
336 spin_lock(&ip->i_flags_lock);
337 ret = ip->i_flags & flags;
338 if (!ret)
339 ip->i_flags |= flags;
340 spin_unlock(&ip->i_flags_lock);
341 return ret;
342}
343
334/* 344/*
335 * Project quota id helpers (previously projid was 16bit only 345 * Project quota id helpers (previously projid was 16bit only
336 * and using two 16bit values to hold new 32bit projid was chosen 346 * and using two 16bit values to hold new 32bit projid was chosen
@@ -351,35 +361,19 @@ xfs_set_projid(struct xfs_inode *ip,
351} 361}
352 362
353/* 363/*
354 * Manage the i_flush queue embedded in the inode. This completion
355 * queue synchronizes processes attempting to flush the in-core
356 * inode back to disk.
357 */
358static inline void xfs_iflock(xfs_inode_t *ip)
359{
360 wait_for_completion(&ip->i_flush);
361}
362
363static inline int xfs_iflock_nowait(xfs_inode_t *ip)
364{
365 return try_wait_for_completion(&ip->i_flush);
366}
367
368static inline void xfs_ifunlock(xfs_inode_t *ip)
369{
370 complete(&ip->i_flush);
371}
372
373/*
374 * In-core inode flags. 364 * In-core inode flags.
375 */ 365 */
376#define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */ 366#define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */
377#define XFS_ISTALE 0x0002 /* inode has been staled */ 367#define XFS_ISTALE (1 << 1) /* inode has been staled */
378#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ 368#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
379#define XFS_INEW 0x0008 /* inode has just been allocated */ 369#define XFS_INEW (1 << 3) /* inode has just been allocated */
380#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ 370#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
381#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ 371#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
382#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */ 372#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
373#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
374#define XFS_IFLOCK (1 << __XFS_IFLOCK_BIT)
375#define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */
376#define XFS_IPINNED (1 << __XFS_IPINNED_BIT)
383 377
384/* 378/*
385 * Per-lifetime flags need to be reset when re-using a reclaimable inode during 379 * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -392,6 +386,34 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
392 XFS_IFILESTREAM); 386 XFS_IFILESTREAM);
393 387
394/* 388/*
389 * Synchronize processes attempting to flush the in-core inode back to disk.
390 */
391
392extern void __xfs_iflock(struct xfs_inode *ip);
393
394static inline int xfs_iflock_nowait(struct xfs_inode *ip)
395{
396 return !xfs_iflags_test_and_set(ip, XFS_IFLOCK);
397}
398
399static inline void xfs_iflock(struct xfs_inode *ip)
400{
401 if (!xfs_iflock_nowait(ip))
402 __xfs_iflock(ip);
403}
404
405static inline void xfs_ifunlock(struct xfs_inode *ip)
406{
407 xfs_iflags_clear(ip, XFS_IFLOCK);
408 wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT);
409}
410
411static inline int xfs_isiflocked(struct xfs_inode *ip)
412{
413 return xfs_iflags_test(ip, XFS_IFLOCK);
414}
415
416/*
395 * Flags for inode locking. 417 * Flags for inode locking.
396 * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) 418 * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield)
397 * 1<<16 - 1<<32-1 -- lockdep annotation (integers) 419 * 1<<16 - 1<<32-1 -- lockdep annotation (integers)
@@ -491,8 +513,6 @@ int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
491 struct xfs_bmap_free *); 513 struct xfs_bmap_free *);
492int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, 514int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
493 int, xfs_fsize_t); 515 int, xfs_fsize_t);
494int xfs_itruncate_data(struct xfs_trans **, struct xfs_inode *,
495 xfs_fsize_t);
496int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 516int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
497 517
498void xfs_iext_realloc(xfs_inode_t *, int, int); 518void xfs_iext_realloc(xfs_inode_t *, int, int);
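
Two details in this header are worth calling out. First, i_flags widens from unsigned short to unsigned long, presumably because the bit-wait helpers (DEFINE_WAIT_BIT(), bit_waitqueue(), wake_up_bit()) key their shared waitqueue hash off the address of a flag word and conventionally operate on a full unsigned long. Second, xfs_iflags_test_and_set() gives the flush lock its trylock semantics: the caller owns the lock iff its set observed the flag clear. The usage shape, using only the helpers defined above:

        xfs_iflock(ip);         /* trylock fast path, else sleep in
                                   __xfs_iflock() until we win the bit */
        /* ... flush work under the flush lock ... */
        xfs_ifunlock(ip);       /* clear the bit, wake a waiter */
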
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cfd6c7f8cc3c..91d71dcd4852 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -79,8 +79,6 @@ xfs_inode_item_size(
79 break; 79 break;
80 80
81 case XFS_DINODE_FMT_BTREE: 81 case XFS_DINODE_FMT_BTREE:
82 ASSERT(ip->i_df.if_ext_max ==
83 XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t));
84 iip->ili_format.ilf_fields &= 82 iip->ili_format.ilf_fields &=
85 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | 83 ~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
86 XFS_ILOG_DEV | XFS_ILOG_UUID); 84 XFS_ILOG_DEV | XFS_ILOG_UUID);
@@ -557,7 +555,7 @@ xfs_inode_item_unpin(
557 trace_xfs_inode_unpin(ip, _RET_IP_); 555 trace_xfs_inode_unpin(ip, _RET_IP_);
558 ASSERT(atomic_read(&ip->i_pincount) > 0); 556 ASSERT(atomic_read(&ip->i_pincount) > 0);
559 if (atomic_dec_and_test(&ip->i_pincount)) 557 if (atomic_dec_and_test(&ip->i_pincount))
560 wake_up(&ip->i_ipin_wait); 558 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
561} 559}
562 560
563/* 561/*
@@ -719,7 +717,7 @@ xfs_inode_item_pushbuf(
719 * If a flush is not in progress anymore, chances are that the 717 * If a flush is not in progress anymore, chances are that the
720 * inode was taken off the AIL. So, just get out. 718 * inode was taken off the AIL. So, just get out.
721 */ 719 */
722 if (completion_done(&ip->i_flush) || 720 if (!xfs_isiflocked(ip) ||
723 !(lip->li_flags & XFS_LI_IN_AIL)) { 721 !(lip->li_flags & XFS_LI_IN_AIL)) {
724 xfs_iunlock(ip, XFS_ILOCK_SHARED); 722 xfs_iunlock(ip, XFS_ILOCK_SHARED);
725 return true; 723 return true;
@@ -752,7 +750,7 @@ xfs_inode_item_push(
752 struct xfs_inode *ip = iip->ili_inode; 750 struct xfs_inode *ip = iip->ili_inode;
753 751
754 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 752 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
755 ASSERT(!completion_done(&ip->i_flush)); 753 ASSERT(xfs_isiflocked(ip));
756 754
757 /* 755 /*
758 * Since we were able to lock the inode's flush lock and 756 * Since we were able to lock the inode's flush lock and
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9afa282aa937..246c7d57c6f9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -57,26 +57,26 @@ xfs_iomap_eof_align_last_fsb(
57 xfs_fileoff_t *last_fsb) 57 xfs_fileoff_t *last_fsb)
58{ 58{
59 xfs_fileoff_t new_last_fsb = 0; 59 xfs_fileoff_t new_last_fsb = 0;
60 xfs_extlen_t align; 60 xfs_extlen_t align = 0;
61 int eof, error; 61 int eof, error;
62 62
63 if (XFS_IS_REALTIME_INODE(ip)) 63 if (!XFS_IS_REALTIME_INODE(ip)) {
64 ; 64 /*
65 /* 65 * Round up the allocation request to a stripe unit
66 * If mounted with the "-o swalloc" option, roundup the allocation 66 * (m_dalign) boundary if the file size is >= stripe unit
67 * request to a stripe width boundary if the file size is >= 67 * size, and we are allocating past the allocation eof.
68 * stripe width and we are allocating past the allocation eof. 68 *
69 */ 69 * If mounted with the "-o swalloc" option the alignment is
70 else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) && 70 * increased from the strip unit size to the stripe width.
71 (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth))) 71 */
72 new_last_fsb = roundup_64(*last_fsb, mp->m_swidth); 72 if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
73 /* 73 align = mp->m_swidth;
74 * Roundup the allocation request to a stripe unit (m_dalign) boundary 74 else if (mp->m_dalign)
75 * if the file size is >= stripe unit size, and we are allocating past 75 align = mp->m_dalign;
76 * the allocation eof. 76
77 */ 77 if (align && XFS_ISIZE(ip) >= XFS_FSB_TO_B(mp, align))
78 else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign))) 78 new_last_fsb = roundup_64(*last_fsb, align);
79 new_last_fsb = roundup_64(*last_fsb, mp->m_dalign); 79 }
80 80
81 /* 81 /*
82 * Always round up the allocation request to an extent boundary 82 * Always round up the allocation request to an extent boundary
@@ -154,7 +154,7 @@ xfs_iomap_write_direct(
154 154
155 offset_fsb = XFS_B_TO_FSBT(mp, offset); 155 offset_fsb = XFS_B_TO_FSBT(mp, offset);
156 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); 156 last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
157 if ((offset + count) > ip->i_size) { 157 if ((offset + count) > XFS_ISIZE(ip)) {
158 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); 158 error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
159 if (error) 159 if (error)
160 goto error_out; 160 goto error_out;
@@ -211,7 +211,7 @@ xfs_iomap_write_direct(
211 xfs_trans_ijoin(tp, ip, 0); 211 xfs_trans_ijoin(tp, ip, 0);
212 212
213 bmapi_flag = 0; 213 bmapi_flag = 0;
214 if (offset < ip->i_size || extsz) 214 if (offset < XFS_ISIZE(ip) || extsz)
215 bmapi_flag |= XFS_BMAPI_PREALLOC; 215 bmapi_flag |= XFS_BMAPI_PREALLOC;
216 216
217 /* 217 /*
@@ -286,7 +286,7 @@ xfs_iomap_eof_want_preallocate(
286 int found_delalloc = 0; 286 int found_delalloc = 0;
287 287
288 *prealloc = 0; 288 *prealloc = 0;
289 if ((offset + count) <= ip->i_size) 289 if (offset + count <= XFS_ISIZE(ip))
290 return 0; 290 return 0;
291 291
292 /* 292 /*
@@ -340,7 +340,7 @@ xfs_iomap_prealloc_size(
340 * if we pass in alloc_blocks = 0. Hence the "+ 1" to 340 * if we pass in alloc_blocks = 0. Hence the "+ 1" to
341 * ensure we always pass in a non-zero value. 341 * ensure we always pass in a non-zero value.
342 */ 342 */
343 alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size) + 1; 343 alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1;
344 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, 344 alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
345 rounddown_pow_of_two(alloc_blocks)); 345 rounddown_pow_of_two(alloc_blocks));
346 346
@@ -564,7 +564,7 @@ xfs_iomap_write_allocate(
564 * back.... 564 * back....
565 */ 565 */
566 nimaps = 1; 566 nimaps = 1;
567 end_fsb = XFS_B_TO_FSB(mp, ip->i_size); 567 end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
568 error = xfs_bmap_last_offset(NULL, ip, &last_block, 568 error = xfs_bmap_last_offset(NULL, ip, &last_block,
569 XFS_DATA_FORK); 569 XFS_DATA_FORK);
570 if (error) 570 if (error)
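
The first hunk in this file replaces an if/else-if chain (which also had a dangling realtime arm that did nothing) with a single alignment selection. Condensed into a standalone helper it reads as follows; the function is hypothetical, the patch open-codes this inside xfs_iomap_eof_align_last_fsb():

        /* Pick the EOF allocation alignment: stripe width when mounted
         * with -o swalloc, else stripe unit; only apply it once the file
         * is at least that large, and never for realtime inodes. */
        static xfs_extlen_t eof_align_fsb(struct xfs_mount *mp,
                                          struct xfs_inode *ip)
        {
                xfs_extlen_t align = 0;

                if (XFS_IS_REALTIME_INODE(ip))
                        return 0;
                if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
                        align = mp->m_swidth;
                else if (mp->m_dalign)
                        align = mp->m_dalign;
                if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align))
                        align = 0;
                return align;
        }
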
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f9babd179223..ab302539e5b9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -750,6 +750,7 @@ xfs_setattr_size(
750 struct xfs_mount *mp = ip->i_mount; 750 struct xfs_mount *mp = ip->i_mount;
751 struct inode *inode = VFS_I(ip); 751 struct inode *inode = VFS_I(ip);
752 int mask = iattr->ia_valid; 752 int mask = iattr->ia_valid;
753 xfs_off_t oldsize, newsize;
753 struct xfs_trans *tp; 754 struct xfs_trans *tp;
754 int error; 755 int error;
755 uint lock_flags; 756 uint lock_flags;
@@ -777,11 +778,13 @@ xfs_setattr_size(
777 lock_flags |= XFS_IOLOCK_EXCL; 778 lock_flags |= XFS_IOLOCK_EXCL;
778 xfs_ilock(ip, lock_flags); 779 xfs_ilock(ip, lock_flags);
779 780
781 oldsize = inode->i_size;
782 newsize = iattr->ia_size;
783
780 /* 784 /*
781 * Short circuit the truncate case for zero length files. 785 * Short circuit the truncate case for zero length files.
782 */ 786 */
783 if (iattr->ia_size == 0 && 787 if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
784 ip->i_size == 0 && ip->i_d.di_nextents == 0) {
785 if (!(mask & (ATTR_CTIME|ATTR_MTIME))) 788 if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
786 goto out_unlock; 789 goto out_unlock;
787 790
@@ -807,14 +810,14 @@ xfs_setattr_size(
807 * the inode to the transaction, because the inode cannot be unlocked 810 * the inode to the transaction, because the inode cannot be unlocked
808 * once it is a part of the transaction. 811 * once it is a part of the transaction.
809 */ 812 */
810 if (iattr->ia_size > ip->i_size) { 813 if (newsize > oldsize) {
811 /* 814 /*
812 * Do the first part of growing a file: zero any data in the 815 * Do the first part of growing a file: zero any data in the
813 * last block that is beyond the old EOF. We need to do this 816 * last block that is beyond the old EOF. We need to do this
814 * before the inode is joined to the transaction to modify 817 * before the inode is joined to the transaction to modify
815 * i_size. 818 * i_size.
816 */ 819 */
817 error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size); 820 error = xfs_zero_eof(ip, newsize, oldsize);
818 if (error) 821 if (error)
819 goto out_unlock; 822 goto out_unlock;
820 } 823 }
@@ -833,8 +836,8 @@ xfs_setattr_size(
833 * here and prevents waiting for other data not within the range we 836 * here and prevents waiting for other data not within the range we
834 * care about here. 837 * care about here.
835 */ 838 */
836 if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) { 839 if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
837 error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size, 0, 840 error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
838 FI_NONE); 841 FI_NONE);
839 if (error) 842 if (error)
840 goto out_unlock; 843 goto out_unlock;
@@ -845,8 +848,7 @@ xfs_setattr_size(
845 */ 848 */
846 inode_dio_wait(inode); 849 inode_dio_wait(inode);
847 850
848 error = -block_truncate_page(inode->i_mapping, iattr->ia_size, 851 error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
849 xfs_get_blocks);
850 if (error) 852 if (error)
851 goto out_unlock; 853 goto out_unlock;
852 854
@@ -857,7 +859,7 @@ xfs_setattr_size(
857 if (error) 859 if (error)
858 goto out_trans_cancel; 860 goto out_trans_cancel;
859 861
860 truncate_setsize(inode, iattr->ia_size); 862 truncate_setsize(inode, newsize);
861 863
862 commit_flags = XFS_TRANS_RELEASE_LOG_RES; 864 commit_flags = XFS_TRANS_RELEASE_LOG_RES;
863 lock_flags |= XFS_ILOCK_EXCL; 865 lock_flags |= XFS_ILOCK_EXCL;
@@ -876,19 +878,29 @@ xfs_setattr_size(
876 * these flags set. For all other operations the VFS set these flags 878 * these flags set. For all other operations the VFS set these flags
877 * explicitly if it wants a timestamp update. 879 * explicitly if it wants a timestamp update.
878 */ 880 */
879 if (iattr->ia_size != ip->i_size && 881 if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
880 (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
881 iattr->ia_ctime = iattr->ia_mtime = 882 iattr->ia_ctime = iattr->ia_mtime =
882 current_fs_time(inode->i_sb); 883 current_fs_time(inode->i_sb);
883 mask |= ATTR_CTIME | ATTR_MTIME; 884 mask |= ATTR_CTIME | ATTR_MTIME;
884 } 885 }
885 886
886 if (iattr->ia_size > ip->i_size) { 887 /*
887 ip->i_d.di_size = iattr->ia_size; 888 * The first thing we do is set the size to new_size permanently on
888 ip->i_size = iattr->ia_size; 889 * disk. This way we don't have to worry about anyone ever being able
889 } else if (iattr->ia_size <= ip->i_size || 890 * to look at the data being freed even in the face of a crash.
890 (iattr->ia_size == 0 && ip->i_d.di_nextents)) { 891 * What we're getting around here is the case where we free a block, it
891 error = xfs_itruncate_data(&tp, ip, iattr->ia_size); 892 * is allocated to another file, it is written to, and then we crash.
893 * If the new data gets written to the file but the log buffers
894 * containing the free and reallocation don't, then we'd end up with
895 * garbage in the blocks being freed. As long as we make the new size
896 * permanent before actually freeing any blocks it doesn't matter if
897 * they get written to.
898 */
899 ip->i_d.di_size = newsize;
900 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
901
902 if (newsize <= oldsize) {
903 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
892 if (error) 904 if (error)
893 goto out_trans_abort; 905 goto out_trans_abort;
894 906
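
With xfs_itruncate_data() gone (see its removal in xfs_inode.c above), its crash-safety contract is restated at each call site: the new size must be made permanent in the log before any blocks are freed, so a crash can never expose freed-and-reallocated blocks through a stale larger size. The three-step sequence now repeated here, in xfs_inactive() and in xfs_qm_scall_trunc_qfile():

        ip->i_d.di_size = newsize;                      /* 1. new on-disk size */
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);     /* 2. logged first     */
        error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize);
                                                        /* 3. then free blocks */
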
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 5cc3dde1bc90..eafbcff81f3a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -31,6 +31,7 @@
31#include "xfs_mount.h" 31#include "xfs_mount.h"
32#include "xfs_bmap_btree.h" 32#include "xfs_bmap_btree.h"
33#include "xfs_inode.h" 33#include "xfs_inode.h"
34#include "xfs_inode_item.h"
34#include "xfs_itable.h" 35#include "xfs_itable.h"
35#include "xfs_bmap.h" 36#include "xfs_bmap.h"
36#include "xfs_rtalloc.h" 37#include "xfs_rtalloc.h"
@@ -263,13 +264,18 @@ xfs_qm_scall_trunc_qfile(
263 xfs_ilock(ip, XFS_ILOCK_EXCL); 264 xfs_ilock(ip, XFS_ILOCK_EXCL);
264 xfs_trans_ijoin(tp, ip, 0); 265 xfs_trans_ijoin(tp, ip, 0);
265 266
266 error = xfs_itruncate_data(&tp, ip, 0); 267 ip->i_d.di_size = 0;
268 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
269
270 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
267 if (error) { 271 if (error) {
268 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | 272 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
269 XFS_TRANS_ABORT); 273 XFS_TRANS_ABORT);
270 goto out_unlock; 274 goto out_unlock;
271 } 275 }
272 276
277 ASSERT(ip->i_d.di_nextents == 0);
278
273 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 279 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
274 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 280 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
275 281
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 281961c1d81a..ee5b695c99a7 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -828,14 +828,6 @@ xfs_fs_inode_init_once(
828 /* xfs inode */ 828 /* xfs inode */
829 atomic_set(&ip->i_pincount, 0); 829 atomic_set(&ip->i_pincount, 0);
830 spin_lock_init(&ip->i_flags_lock); 830 spin_lock_init(&ip->i_flags_lock);
831 init_waitqueue_head(&ip->i_ipin_wait);
832 /*
833 * Because we want to use a counting completion, complete
834 * the flush completion once to allow a single access to
835 * the flush completion without blocking.
836 */
837 init_completion(&ip->i_flush);
838 complete(&ip->i_flush);
839 831
840 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 832 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
841 "xfsino", ip->i_ino); 833 "xfsino", ip->i_ino);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 72c01a1c16e7..40b75eecd2b4 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -707,14 +707,13 @@ xfs_reclaim_inode_grab(
707 return 1; 707 return 1;
708 708
709 /* 709 /*
710 * do some unlocked checks first to avoid unnecessary lock traffic. 710 * If we are asked for non-blocking operation, do unlocked checks to
 711 * The first is a flush lock check, the second is an already-in-reclaim 711 * see if the inode is already being flushed or in reclaim to avoid
712 * check. Only do these checks if we are not going to block on locks. 712 * lock traffic.
713 */ 713 */
714 if ((flags & SYNC_TRYLOCK) && 714 if ((flags & SYNC_TRYLOCK) &&
715 (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { 715 __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
716 return 1; 716 return 1;
717 }
718 717
719 /* 718 /*
720 * The radix tree lock here protects a thread in xfs_iget from racing 719 * The radix tree lock here protects a thread in xfs_iget from racing
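
The rewritten check folds the two lockless tests into one mask: __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM) is nonzero if either bit is set, which is exactly the any-of semantics wanted here. Reading i_flags without i_flags_lock is racy by design; it only serves the SYNC_TRYLOCK fast path, and a stale answer costs at most a skipped reclaim attempt, since callers that proceed revalidate under the lock:

        /* racy fast path: either flag set means "come back later" */
        if ((flags & SYNC_TRYLOCK) &&
            __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
                return 1;
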
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index a9d5b1e06efe..6b6df5802e95 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -891,7 +891,6 @@ DECLARE_EVENT_CLASS(xfs_file_class,
891 __field(dev_t, dev) 891 __field(dev_t, dev)
892 __field(xfs_ino_t, ino) 892 __field(xfs_ino_t, ino)
893 __field(xfs_fsize_t, size) 893 __field(xfs_fsize_t, size)
894 __field(xfs_fsize_t, new_size)
895 __field(loff_t, offset) 894 __field(loff_t, offset)
896 __field(size_t, count) 895 __field(size_t, count)
897 __field(int, flags) 896 __field(int, flags)
@@ -900,17 +899,15 @@ DECLARE_EVENT_CLASS(xfs_file_class,
900 __entry->dev = VFS_I(ip)->i_sb->s_dev; 899 __entry->dev = VFS_I(ip)->i_sb->s_dev;
901 __entry->ino = ip->i_ino; 900 __entry->ino = ip->i_ino;
902 __entry->size = ip->i_d.di_size; 901 __entry->size = ip->i_d.di_size;
903 __entry->new_size = ip->i_new_size;
904 __entry->offset = offset; 902 __entry->offset = offset;
905 __entry->count = count; 903 __entry->count = count;
906 __entry->flags = flags; 904 __entry->flags = flags;
907 ), 905 ),
908 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 906 TP_printk("dev %d:%d ino 0x%llx size 0x%llx "
909 "offset 0x%llx count 0x%zx ioflags %s", 907 "offset 0x%llx count 0x%zx ioflags %s",
910 MAJOR(__entry->dev), MINOR(__entry->dev), 908 MAJOR(__entry->dev), MINOR(__entry->dev),
911 __entry->ino, 909 __entry->ino,
912 __entry->size, 910 __entry->size,
913 __entry->new_size,
914 __entry->offset, 911 __entry->offset,
915 __entry->count, 912 __entry->count,
916 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) 913 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
@@ -978,7 +975,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
978 __field(dev_t, dev) 975 __field(dev_t, dev)
979 __field(xfs_ino_t, ino) 976 __field(xfs_ino_t, ino)
980 __field(loff_t, size) 977 __field(loff_t, size)
981 __field(loff_t, new_size)
982 __field(loff_t, offset) 978 __field(loff_t, offset)
983 __field(size_t, count) 979 __field(size_t, count)
984 __field(int, type) 980 __field(int, type)
@@ -990,7 +986,6 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
990 __entry->dev = VFS_I(ip)->i_sb->s_dev; 986 __entry->dev = VFS_I(ip)->i_sb->s_dev;
991 __entry->ino = ip->i_ino; 987 __entry->ino = ip->i_ino;
992 __entry->size = ip->i_d.di_size; 988 __entry->size = ip->i_d.di_size;
993 __entry->new_size = ip->i_new_size;
994 __entry->offset = offset; 989 __entry->offset = offset;
995 __entry->count = count; 990 __entry->count = count;
996 __entry->type = type; 991 __entry->type = type;
@@ -998,13 +993,11 @@ DECLARE_EVENT_CLASS(xfs_imap_class,
998 __entry->startblock = irec ? irec->br_startblock : 0; 993 __entry->startblock = irec ? irec->br_startblock : 0;
999 __entry->blockcount = irec ? irec->br_blockcount : 0; 994 __entry->blockcount = irec ? irec->br_blockcount : 0;
1000 ), 995 ),
1001 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " 996 TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd "
1002 "offset 0x%llx count %zd type %s " 997 "type %s startoff 0x%llx startblock %lld blockcount 0x%llx",
1003 "startoff 0x%llx startblock %lld blockcount 0x%llx",
1004 MAJOR(__entry->dev), MINOR(__entry->dev), 998 MAJOR(__entry->dev), MINOR(__entry->dev),
1005 __entry->ino, 999 __entry->ino,
1006 __entry->size, 1000 __entry->size,
1007 __entry->new_size,
1008 __entry->offset, 1001 __entry->offset,
1009 __entry->count, 1002 __entry->count,
1010 __print_symbolic(__entry->type, XFS_IO_TYPES), 1003 __print_symbolic(__entry->type, XFS_IO_TYPES),
@@ -1031,26 +1024,23 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class,
1031 __field(xfs_ino_t, ino) 1024 __field(xfs_ino_t, ino)
1032 __field(loff_t, isize) 1025 __field(loff_t, isize)
1033 __field(loff_t, disize) 1026 __field(loff_t, disize)
1034 __field(loff_t, new_size)
1035 __field(loff_t, offset) 1027 __field(loff_t, offset)
1036 __field(size_t, count) 1028 __field(size_t, count)
1037 ), 1029 ),
1038 TP_fast_assign( 1030 TP_fast_assign(
1039 __entry->dev = VFS_I(ip)->i_sb->s_dev; 1031 __entry->dev = VFS_I(ip)->i_sb->s_dev;
1040 __entry->ino = ip->i_ino; 1032 __entry->ino = ip->i_ino;
1041 __entry->isize = ip->i_size; 1033 __entry->isize = VFS_I(ip)->i_size;
1042 __entry->disize = ip->i_d.di_size; 1034 __entry->disize = ip->i_d.di_size;
1043 __entry->new_size = ip->i_new_size;
1044 __entry->offset = offset; 1035 __entry->offset = offset;
1045 __entry->count = count; 1036 __entry->count = count;
1046 ), 1037 ),
1047 TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx " 1038 TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx "
1048 "offset 0x%llx count %zd", 1039 "offset 0x%llx count %zd",
1049 MAJOR(__entry->dev), MINOR(__entry->dev), 1040 MAJOR(__entry->dev), MINOR(__entry->dev),
1050 __entry->ino, 1041 __entry->ino,
1051 __entry->isize, 1042 __entry->isize,
1052 __entry->disize, 1043 __entry->disize,
1053 __entry->new_size,
1054 __entry->offset, 1044 __entry->offset,
1055 __entry->count) 1045 __entry->count)
1056); 1046);
@@ -1090,8 +1080,8 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class,
1090DEFINE_EVENT(xfs_itrunc_class, name, \ 1080DEFINE_EVENT(xfs_itrunc_class, name, \
1091 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ 1081 TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
1092 TP_ARGS(ip, new_size)) 1082 TP_ARGS(ip, new_size))
1093DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start); 1083DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_start);
1094DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end); 1084DEFINE_ITRUNC_EVENT(xfs_itruncate_extents_end);
1095 1085
1096TRACE_EVENT(xfs_pagecache_inval, 1086TRACE_EVENT(xfs_pagecache_inval,
1097 TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish), 1087 TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
@@ -1568,7 +1558,6 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1568 __field(xfs_ino_t, ino) 1558 __field(xfs_ino_t, ino)
1569 __field(int, format) 1559 __field(int, format)
1570 __field(int, nex) 1560 __field(int, nex)
1571 __field(int, max_nex)
1572 __field(int, broot_size) 1561 __field(int, broot_size)
1573 __field(int, fork_off) 1562 __field(int, fork_off)
1574 ), 1563 ),
@@ -1578,18 +1567,16 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class,
1578 __entry->ino = ip->i_ino; 1567 __entry->ino = ip->i_ino;
1579 __entry->format = ip->i_d.di_format; 1568 __entry->format = ip->i_d.di_format;
1580 __entry->nex = ip->i_d.di_nextents; 1569 __entry->nex = ip->i_d.di_nextents;
1581 __entry->max_nex = ip->i_df.if_ext_max;
1582 __entry->broot_size = ip->i_df.if_broot_bytes; 1570 __entry->broot_size = ip->i_df.if_broot_bytes;
1583 __entry->fork_off = XFS_IFORK_BOFF(ip); 1571 __entry->fork_off = XFS_IFORK_BOFF(ip);
1584 ), 1572 ),
1585 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " 1573 TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
1586 "Max in-fork extents %d, broot size %d, fork offset %d", 1574 "broot size %d, fork offset %d",
1587 MAJOR(__entry->dev), MINOR(__entry->dev), 1575 MAJOR(__entry->dev), MINOR(__entry->dev),
1588 __entry->ino, 1576 __entry->ino,
1589 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), 1577 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
1590 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR), 1578 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
1591 __entry->nex, 1579 __entry->nex,
1592 __entry->max_nex,
1593 __entry->broot_size, 1580 __entry->broot_size,
1594 __entry->fork_off) 1581 __entry->fork_off)
1595) 1582)
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index f2fea868d4db..ebdb88840a47 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -131,7 +131,8 @@ xfs_readlink(
131 __func__, (unsigned long long) ip->i_ino, 131 __func__, (unsigned long long) ip->i_ino,
132 (long long) pathlen); 132 (long long) pathlen);
133 ASSERT(0); 133 ASSERT(0);
134 return XFS_ERROR(EFSCORRUPTED); 134 error = XFS_ERROR(EFSCORRUPTED);
135 goto out;
135 } 136 }
136 137
137 138
@@ -175,7 +176,7 @@ xfs_free_eofblocks(
175 * Figure out if there are any blocks beyond the end 176 * Figure out if there are any blocks beyond the end
176 * of the file. If not, then there is nothing to do. 177 * of the file. If not, then there is nothing to do.
177 */ 178 */
178 end_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)ip->i_size)); 179 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
179 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp)); 180 last_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
180 if (last_fsb <= end_fsb) 181 if (last_fsb <= end_fsb)
181 return 0; 182 return 0;
@@ -226,7 +227,14 @@ xfs_free_eofblocks(
226 xfs_ilock(ip, XFS_ILOCK_EXCL); 227 xfs_ilock(ip, XFS_ILOCK_EXCL);
227 xfs_trans_ijoin(tp, ip, 0); 228 xfs_trans_ijoin(tp, ip, 0);
228 229
229 error = xfs_itruncate_data(&tp, ip, ip->i_size); 230 /*
231 * Do not update the on-disk file size. If we update the
232 * on-disk file size and then the system crashes before the
233 * contents of the file are flushed to disk then the files
234 * may be full of holes (ie NULL files bug).
235 */
236 error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
237 XFS_ISIZE(ip));
230 if (error) { 238 if (error) {
231 /* 239 /*
232 * If we get an error at this point we simply don't 240 * If we get an error at this point we simply don't
@@ -540,8 +548,8 @@ xfs_release(
                 return 0;
 
         if ((S_ISREG(ip->i_d.di_mode) &&
-             ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-               ip->i_delayed_blks > 0)) &&
+             (VFS_I(ip)->i_size > 0 ||
+              (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
             (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
             (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
 
@@ -618,7 +626,7 @@ xfs_inactive(
         * only one with a reference to the inode.
         */
        truncate = ((ip->i_d.di_nlink == 0) &&
-            ((ip->i_d.di_size != 0) || (ip->i_size != 0) ||
+            ((ip->i_d.di_size != 0) || XFS_ISIZE(ip) != 0 ||
             (ip->i_d.di_nextents > 0) || (ip->i_delayed_blks > 0)) &&
            S_ISREG(ip->i_d.di_mode));
 
@@ -632,12 +640,12 @@ xfs_inactive(
 
         if (ip->i_d.di_nlink != 0) {
                 if ((S_ISREG(ip->i_d.di_mode) &&
-                     ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
-                       ip->i_delayed_blks > 0)) &&
+                     (VFS_I(ip)->i_size > 0 ||
+                      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
                      (ip->i_df.if_flags & XFS_IFEXTENTS) &&
                      (!(ip->i_d.di_flags &
                                 (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-                      (ip->i_delayed_blks != 0)))) {
+                      ip->i_delayed_blks != 0))) {
                         error = xfs_free_eofblocks(mp, ip, 0);
                         if (error)
                                 return VN_INACTIVE_CACHE;
@@ -670,13 +678,18 @@ xfs_inactive(
                 xfs_ilock(ip, XFS_ILOCK_EXCL);
                 xfs_trans_ijoin(tp, ip, 0);
 
-                error = xfs_itruncate_data(&tp, ip, 0);
+                ip->i_d.di_size = 0;
+                xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+                error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
                 if (error) {
                         xfs_trans_cancel(tp,
                                 XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
                         xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
                         return VN_INACTIVE_CACHE;
                 }
+
+                ASSERT(ip->i_d.di_nextents == 0);
         } else if (S_ISLNK(ip->i_d.di_mode)) {
 
                 /*
@@ -1961,11 +1974,11 @@ xfs_zero_remaining_bytes(
         * since nothing can read beyond eof. The space will
         * be zeroed when the file is extended anyway.
         */
-        if (startoff >= ip->i_size)
+        if (startoff >= XFS_ISIZE(ip))
                 return 0;
 
-        if (endoff > ip->i_size)
-                endoff = ip->i_size;
+        if (endoff > XFS_ISIZE(ip))
+                endoff = XFS_ISIZE(ip);
 
         bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
                                 mp->m_rtdev_targp : mp->m_ddev_targp,
@@ -2260,7 +2273,7 @@ xfs_change_file_space(
                 bf->l_start += offset;
                 break;
         case 2: /*SEEK_END*/
-                bf->l_start += ip->i_size;
+                bf->l_start += XFS_ISIZE(ip);
                 break;
         default:
                 return XFS_ERROR(EINVAL);
@@ -2277,7 +2290,7 @@ xfs_change_file_space(
         bf->l_whence = 0;
 
         startoffset = bf->l_start;
-        fsize = ip->i_size;
+        fsize = XFS_ISIZE(ip);
 
         /*
         * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve