Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.h | 1
-rw-r--r--  fs/9p/vfs_addr.c | 2
-rw-r--r--  fs/9p/vfs_file.c | 10
-rw-r--r--  fs/Kconfig | 2
-rw-r--r--  fs/btrfs/async-thread.c | 4
-rw-r--r--  fs/btrfs/async-thread.h | 2
-rw-r--r--  fs/btrfs/backref.c | 4
-rw-r--r--  fs/btrfs/btrfs_inode.h | 14
-rw-r--r--  fs/btrfs/check-integrity.c | 9
-rw-r--r--  fs/btrfs/compression.c | 4
-rw-r--r--  fs/btrfs/compression.h | 4
-rw-r--r--  fs/btrfs/ctree.c | 62
-rw-r--r--  fs/btrfs/ctree.h | 46
-rw-r--r--  fs/btrfs/delayed-inode.c | 9
-rw-r--r--  fs/btrfs/delayed-ref.c | 22
-rw-r--r--  fs/btrfs/delayed-ref.h | 10
-rw-r--r--  fs/btrfs/dev-replace.c | 6
-rw-r--r--  fs/btrfs/disk-io.c | 570
-rw-r--r--  fs/btrfs/disk-io.h | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 476
-rw-r--r--  fs/btrfs/extent_io.c | 5
-rw-r--r--  fs/btrfs/extent_io.h | 2
-rw-r--r--  fs/btrfs/file-item.c | 6
-rw-r--r--  fs/btrfs/file.c | 65
-rw-r--r--  fs/btrfs/free-space-cache.c | 301
-rw-r--r--  fs/btrfs/free-space-cache.h | 9
-rw-r--r--  fs/btrfs/inode-map.c | 2
-rw-r--r--  fs/btrfs/inode.c | 146
-rw-r--r--  fs/btrfs/ioctl.c | 33
-rw-r--r--  fs/btrfs/lzo.c | 2
-rw-r--r--  fs/btrfs/math.h | 6
-rw-r--r--  fs/btrfs/props.c | 2
-rw-r--r--  fs/btrfs/qgroup.c | 348
-rw-r--r--  fs/btrfs/qgroup.h | 3
-rw-r--r--  fs/btrfs/raid56.c | 16
-rw-r--r--  fs/btrfs/relocation.c | 11
-rw-r--r--  fs/btrfs/scrub.c | 25
-rw-r--r--  fs/btrfs/send.c | 83
-rw-r--r--  fs/btrfs/super.c | 23
-rw-r--r--  fs/btrfs/sysfs.c | 2
-rw-r--r--  fs/btrfs/sysfs.h | 22
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 54
-rw-r--r--  fs/btrfs/transaction.h | 12
-rw-r--r--  fs/btrfs/tree-log.c | 382
-rw-r--r--  fs/btrfs/tree-log.h | 2
-rw-r--r--  fs/btrfs/volumes.c | 140
-rw-r--r--  fs/btrfs/volumes.h | 3
-rw-r--r--  fs/btrfs/xattr.c | 53
-rw-r--r--  fs/btrfs/zlib.c | 2
-rw-r--r--  fs/ceph/addr.c | 38
-rw-r--r--  fs/ceph/caps.c | 51
-rw-r--r--  fs/ceph/dir.c | 48
-rw-r--r--  fs/ceph/mds_client.c | 61
-rw-r--r--  fs/ceph/strings.c | 1
-rw-r--r--  fs/ceph/super.c | 56
-rw-r--r--  fs/ceph/super.h | 4
-rw-r--r--  fs/ceph/xattr.c | 23
-rw-r--r--  fs/exec.c | 76
-rw-r--r--  fs/ext4/Kconfig | 17
-rw-r--r--  fs/ext4/Makefile | 4
-rw-r--r--  fs/ext4/acl.c | 5
-rw-r--r--  fs/ext4/balloc.c | 3
-rw-r--r--  fs/ext4/bitmap.c | 1
-rw-r--r--  fs/ext4/block_validity.c | 1
-rw-r--r--  fs/ext4/crypto.c | 558
-rw-r--r--  fs/ext4/crypto_fname.c | 709
-rw-r--r--  fs/ext4/crypto_key.c | 165
-rw-r--r--  fs/ext4/crypto_policy.c | 194
-rw-r--r--  fs/ext4/dir.c | 81
-rw-r--r--  fs/ext4/ext4.h | 169
-rw-r--r--  fs/ext4/ext4_crypto.h | 147
-rw-r--r--  fs/ext4/extents.c | 81
-rw-r--r--  fs/ext4/extents_status.c | 2
-rw-r--r--  fs/ext4/file.c | 19
-rw-r--r--  fs/ext4/fsync.c | 1
-rw-r--r--  fs/ext4/hash.c | 1
-rw-r--r--  fs/ext4/ialloc.c | 28
-rw-r--r--  fs/ext4/inline.c | 16
-rw-r--r--  fs/ext4/inode.c | 130
-rw-r--r--  fs/ext4/ioctl.c | 86
-rw-r--r--  fs/ext4/namei.c | 637
-rw-r--r--  fs/ext4/page-io.c | 47
-rw-r--r--  fs/ext4/readpage.c | 328
-rw-r--r--  fs/ext4/super.c | 56
-rw-r--r--  fs/ext4/symlink.c | 97
-rw-r--r--  fs/ext4/xattr.c | 4
-rw-r--r--  fs/ext4/xattr.h | 3
-rw-r--r--  fs/f2fs/Kconfig | 2
-rw-r--r--  fs/f2fs/acl.c | 14
-rw-r--r--  fs/f2fs/checkpoint.c | 38
-rw-r--r--  fs/f2fs/data.c | 742
-rw-r--r--  fs/f2fs/debug.c | 22
-rw-r--r--  fs/f2fs/dir.c | 93
-rw-r--r--  fs/f2fs/f2fs.h | 174
-rw-r--r--  fs/f2fs/file.c | 64
-rw-r--r--  fs/f2fs/gc.c | 6
-rw-r--r--  fs/f2fs/inline.c | 69
-rw-r--r--  fs/f2fs/inode.c | 25
-rw-r--r--  fs/f2fs/namei.c | 81
-rw-r--r--  fs/f2fs/node.c | 18
-rw-r--r--  fs/f2fs/node.h | 1
-rw-r--r--  fs/f2fs/recovery.c | 76
-rw-r--r--  fs/f2fs/segment.c | 17
-rw-r--r--  fs/f2fs/segment.h | 3
-rw-r--r--  fs/f2fs/super.c | 40
-rw-r--r--  fs/f2fs/xattr.c | 4
-rw-r--r--  fs/fs_pin.c | 4
-rw-r--r--  fs/jffs2/xattr.c | 1
-rw-r--r--  fs/lockd/svcsubs.c | 2
-rw-r--r--  fs/namespace.c | 142
-rw-r--r--  fs/nfsd/Kconfig | 2
-rw-r--r--  fs/nfsd/export.c | 4
-rw-r--r--  fs/nfsd/nfs4acl.c | 50
-rw-r--r--  fs/nfsd/nfs4proc.c | 12
-rw-r--r--  fs/nfsd/nfs4state.c | 5
-rw-r--r--  fs/nfsd/nfs4xdr.c | 16
-rw-r--r--  fs/nfsd/nfsctl.c | 16
-rw-r--r--  fs/nfsd/nfsd.h | 2
-rw-r--r--  fs/nfsd/xdr4.h | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 42
-rw-r--r--  fs/ocfs2/dlmglue.c | 2
-rw-r--r--  fs/ocfs2/super.c | 9
-rw-r--r--  fs/open.c | 8
-rw-r--r--  fs/pnode.c | 60
-rw-r--r--  fs/pnode.h | 7
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 104
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 150
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h | 6
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 554
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 13
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c | 24
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c | 8
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h | 14
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c | 39
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 62
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c | 48
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 20
-rw-r--r--  fs/xfs/xfs_aops.c | 270
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 3
-rw-r--r--  fs/xfs/xfs_attr_list.c | 9
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 164
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 2
-rw-r--r--  fs/xfs/xfs_buf_item.c | 4
-rw-r--r--  fs/xfs/xfs_discard.c | 2
-rw-r--r--  fs/xfs/xfs_error.c | 2
-rw-r--r--  fs/xfs/xfs_error.h | 8
-rw-r--r--  fs/xfs/xfs_file.c | 161
-rw-r--r--  fs/xfs/xfs_filestream.c | 2
-rw-r--r--  fs/xfs/xfs_fsops.c | 20
-rw-r--r--  fs/xfs/xfs_icache.c | 4
-rw-r--r--  fs/xfs/xfs_inode.c | 558
-rw-r--r--  fs/xfs/xfs_inode.h | 49
-rw-r--r--  fs/xfs/xfs_ioctl.c | 7
-rw-r--r--  fs/xfs/xfs_iomap.c | 3
-rw-r--r--  fs/xfs/xfs_iops.c | 91
-rw-r--r--  fs/xfs/xfs_iops.h | 2
-rw-r--r--  fs/xfs/xfs_itable.c | 2
-rw-r--r--  fs/xfs/xfs_linux.h | 9
-rw-r--r--  fs/xfs/xfs_log_recover.c | 4
-rw-r--r--  fs/xfs/xfs_mount.c | 918
-rw-r--r--  fs/xfs/xfs_mount.h | 95
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 2
-rw-r--r--  fs/xfs/xfs_pnfs.c | 7
-rw-r--r--  fs/xfs/xfs_pnfs.h | 5
-rw-r--r--  fs/xfs/xfs_qm.c | 13
-rw-r--r--  fs/xfs/xfs_super.c | 132
-rw-r--r--  fs/xfs/xfs_super.h | 2
-rw-r--r--  fs/xfs/xfs_symlink.c | 58
-rw-r--r--  fs/xfs/xfs_trace.h | 29
-rw-r--r--  fs/xfs/xfs_trans.c | 234
171 files changed, 9130 insertions(+), 3632 deletions(-)
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 099c7712631c..fb9ffcb43277 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -78,7 +78,6 @@ enum p9_cache_modes {
  * @cache: cache mode of type &p9_cache_modes
  * @cachetag: the tag of the cache associated with this session
  * @fscache: session cookie associated with FS-Cache
- * @options: copy of options string given by user
  * @uname: string user name to mount hierarchy as
  * @aname: mount specifier for remote hierarchy
  * @maxdata: maximum data to be sent/recvd per protocol message
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index be35d05a4d0e..e9e04376c52c 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -231,9 +231,7 @@ static int v9fs_launder_page(struct page *page)
 /**
  * v9fs_direct_IO - 9P address space operation for direct I/O
  * @iocb: target I/O control block
- * @iov: array of vectors that define I/O buffer
  * @pos: offset in file to begin the operation
- * @nr_segs: size of iovec array
  *
  * The presence of v9fs_direct_IO() in the address space ops vector
  * allowes open() O_DIRECT flags which would have failed otherwise.
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2a9dd37dc426..1ef16bd8280b 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -151,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 {
 	struct p9_flock flock;
 	struct p9_fid *fid;
-	uint8_t status;
+	uint8_t status = P9_LOCK_ERROR;
 	int res = 0;
 	unsigned char fl_type;
 
@@ -196,7 +196,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	for (;;) {
 		res = p9_client_lock_dotl(fid, &flock, &status);
 		if (res < 0)
-			break;
+			goto out_unlock;
 
 		if (status != P9_LOCK_BLOCKED)
 			break;
@@ -214,14 +214,16 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	case P9_LOCK_BLOCKED:
 		res = -EAGAIN;
 		break;
+	default:
+		WARN_ONCE(1, "unknown lock status code: %d\n", status);
+		/* fallthough */
 	case P9_LOCK_ERROR:
 	case P9_LOCK_GRACE:
 		res = -ENOLCK;
 		break;
-	default:
-		BUG();
 	}
 
+out_unlock:
 	/*
 	 * incase server returned error for lock request, revert
 	 * it locally
diff --git a/fs/Kconfig b/fs/Kconfig
index ec35851e5b71..011f43365d7b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -32,6 +32,7 @@ source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
+source "fs/f2fs/Kconfig"
 
 config FS_DAX
 	bool "Direct Access (DAX) support"
@@ -217,7 +218,6 @@ source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/f2fs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 4dabeb893b7c..df9932b00d08 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -87,7 +87,7 @@ BTRFS_WORK_HELPER(scrubwrc_helper);
 BTRFS_WORK_HELPER(scrubnc_helper);
 
 static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active,
 			 int thresh)
 {
 	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
@@ -132,7 +132,7 @@ static inline void
 __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
-					      int flags,
+					      unsigned int flags,
 					      int max_active,
 					      int thresh)
 {
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index e386c29ef1f6..ec2ee477f8ba 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -66,7 +66,7 @@ BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
 BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
 
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
-					      int flags,
+					      unsigned int flags,
 					      int max_active,
 					      int thresh);
 void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f55721ff9385..9de772ee0031 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1206,7 +1206,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
 	struct ulist *roots = NULL;
 	struct ulist_iterator uiter;
 	struct ulist_node *node;
-	struct seq_list elem = {};
+	struct seq_list elem = SEQ_LIST_INIT(elem);
 	int ret = 0;
 
 	tmp = ulist_alloc(GFP_NOFS);
@@ -1610,7 +1610,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
-	struct seq_list tree_mod_seq_elem = {};
+	struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
 	struct ulist_iterator ref_uiter;
 	struct ulist_iterator root_uiter;
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index de5e4f2adfea..0ef5cc13fae2 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,7 +66,11 @@ struct btrfs_inode {
 	 */
 	struct btrfs_key location;
 
-	/* Lock for counters */
+	/*
+	 * Lock for counters and all fields used to determine if the inode is in
+	 * the log or not (last_trans, last_sub_trans, last_log_commit,
+	 * logged_trans).
+	 */
 	spinlock_t lock;
 
 	/* the extent_tree has caches of all the extent mappings to disk */
@@ -250,6 +254,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode)
 
 static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 {
+	int ret = 0;
+
+	spin_lock(&BTRFS_I(inode)->lock);
 	if (BTRFS_I(inode)->logged_trans == generation &&
 	    BTRFS_I(inode)->last_sub_trans <=
 	    BTRFS_I(inode)->last_log_commit &&
@@ -263,9 +270,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	 */
 	smp_mb();
 	if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
-		return 1;
+		ret = 1;
 	}
-	return 0;
+	spin_unlock(&BTRFS_I(inode)->lock);
+	return ret;
 }
 
 #define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index d897ef803b3b..ce7dec88f4b8 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio)
 				       (unsigned long long)bio->bi_iter.bi_sector,
 				       dev_bytenr, bio->bi_bdev);
 
-		mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
-				       GFP_NOFS);
+		mapped_datav = kmalloc_array(bio->bi_vcnt,
+					     sizeof(*mapped_datav), GFP_NOFS);
 		if (!mapped_datav)
 			goto leave;
 		cur_bytenr = dev_bytenr;
@@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root,
 
 	mutex_unlock(&btrfsic_mutex);
 
-	if (is_vmalloc_addr(state))
-		vfree(state);
-	else
-		kfree(state);
+	kvfree(state);
 }
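
The btrfsic_unmount() hunk replaces the open-coded is_vmalloc_addr()/vfree()/kfree() branch with kvfree(), which inspects the pointer itself and frees via the right allocator. A minimal sketch of the idiom, with a hypothetical state_alloc()/state_free() pair:

	#include <linux/mm.h>      /* kvfree() */
	#include <linux/slab.h>    /* kzalloc() */
	#include <linux/vmalloc.h> /* vzalloc() */

	static void *state_alloc(size_t size)
	{
		/* try the cheap physically-contiguous allocation first */
		void *state = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);

		if (!state)
			state = vzalloc(size); /* fall back for large sizes */
		return state;
	}

	static void state_free(void *state)
	{
		kvfree(state); /* handles both kmalloc and vmalloc memory */
	}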
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index e9df8862012c..ce62324c78e7 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->orig_bio = bio;
 
 	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
-	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
+	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
 				       GFP_NOFS);
 	if (!cb->compressed_pages)
 		goto fail1;
@@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
 static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
 static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
 
-static struct btrfs_compress_op *btrfs_compress_op[] = {
+static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
 	&btrfs_lzo_compress,
 };
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d181f70caae0..13a4dc0436c9 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -77,7 +77,7 @@ struct btrfs_compress_op {
 			  size_t srclen, size_t destlen);
 };
 
-extern struct btrfs_compress_op btrfs_zlib_compress;
-extern struct btrfs_compress_op btrfs_lzo_compress;
+extern const struct btrfs_compress_op btrfs_zlib_compress;
+extern const struct btrfs_compress_op btrfs_lzo_compress;
 
 #endif
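
Constifying btrfs_compress_op (both the extern declarations here and the pointer table in compression.c) lets the compiler place the function-pointer structures in read-only data, so they cannot be overwritten at runtime. A minimal sketch of the pattern, using hypothetical demo_* names:

	struct demo_ops {
		int (*compress)(const void *src, void *dst);
	};

	extern const struct demo_ops demo_zlib_ops;  /* struct is read-only */

	/* both the array of pointers and the structs they point to
	 * end up in .rodata */
	static const struct demo_ops * const demo_ops_table[] = {
		&demo_zlib_ops,
	};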
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6d67f32e648d..0f11ebc92f02 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	if (!tree_mod_need_log(fs_info, eb))
 		return 0;
 
-	tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
+	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
 	if (!tm_list)
 		return -ENOMEM;
 
@@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 
 	if (log_removal && btrfs_header_level(old_root) > 0) {
 		nritems = btrfs_header_nritems(old_root);
-		tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
 				  flags);
 		if (!tm_list) {
 			ret = -ENOMEM;
@@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
 		return 0;
 
-	tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
+	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
 			  GFP_NOFS);
 	if (!tm_list)
 		return -ENOMEM;
@@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 		return 0;
 
 	nritems = btrfs_header_nritems(eb);
-	tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
-			  GFP_NOFS);
+	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
 	if (!tm_list)
 		return -ENOMEM;
 
@@ -1073,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			ret = btrfs_dec_ref(trans, root, buf, 1);
 			BUG_ON(ret); /* -ENOMEM */
 		}
-		clean_tree_block(trans, root, buf);
+		clean_tree_block(trans, root->fs_info, buf);
 		*last_ref = 1;
 	}
 	return 0;
@@ -1678,7 +1677,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
-		cur = btrfs_find_tree_block(root, blocknr);
+		cur = btrfs_find_tree_block(root->fs_info, blocknr);
 		if (cur)
 			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
 		else
@@ -1943,7 +1942,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		path->locks[level] = 0;
 		path->nodes[level] = NULL;
-		clean_tree_block(trans, root, mid);
+		clean_tree_block(trans, root->fs_info, mid);
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
@@ -1997,7 +1996,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			clean_tree_block(trans, root, right);
+			clean_tree_block(trans, root->fs_info, right);
 			btrfs_tree_unlock(right);
 			del_ptr(root, path, level + 1, pslot + 1);
 			root_sub_used(root, right->len);
@@ -2041,7 +2040,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			BUG_ON(wret == 1);
 		}
 		if (btrfs_header_nritems(mid) == 0) {
-			clean_tree_block(trans, root, mid);
+			clean_tree_block(trans, root->fs_info, mid);
 			btrfs_tree_unlock(mid);
 			del_ptr(root, path, level + 1, pslot);
 			root_sub_used(root, mid->len);
@@ -2259,7 +2258,7 @@ static void reada_for_search(struct btrfs_root *root,
 
 	search = btrfs_node_blockptr(node, slot);
 	blocksize = root->nodesize;
-	eb = btrfs_find_tree_block(root, search);
+	eb = btrfs_find_tree_block(root->fs_info, search);
 	if (eb) {
 		free_extent_buffer(eb);
 		return;
@@ -2319,7 +2318,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 	if (slot > 0) {
 		block1 = btrfs_node_blockptr(parent, slot - 1);
 		gen = btrfs_node_ptr_generation(parent, slot - 1);
-		eb = btrfs_find_tree_block(root, block1);
+		eb = btrfs_find_tree_block(root->fs_info, block1);
 		/*
 		 * if we get -eagain from btrfs_buffer_uptodate, we
 		 * don't want to return eagain here. That will loop
@@ -2332,7 +2331,7 @@ static noinline void reada_for_balance(struct btrfs_root *root,
 	if (slot + 1 < nritems) {
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
-		eb = btrfs_find_tree_block(root, block2);
+		eb = btrfs_find_tree_block(root->fs_info, block2);
 		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
 			block2 = 0;
 		free_extent_buffer(eb);
@@ -2450,7 +2449,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	blocknr = btrfs_node_blockptr(b, slot);
 	gen = btrfs_node_ptr_generation(b, slot);
 
-	tmp = btrfs_find_tree_block(root, blocknr);
+	tmp = btrfs_find_tree_block(root->fs_info, blocknr);
 	if (tmp) {
 		/* first we do an atomic uptodate check */
 		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -3126,7 +3125,8 @@ again:
  * higher levels
  *
  */
-static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
+static void fixup_low_keys(struct btrfs_fs_info *fs_info,
+			   struct btrfs_path *path,
 			   struct btrfs_disk_key *key, int level)
 {
 	int i;
@@ -3137,7 +3137,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
 		if (!path->nodes[i])
 			break;
 		t = path->nodes[i];
-		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
+		tree_mod_log_set_node_key(fs_info, t, tslot, 1);
 		btrfs_set_node_key(t, key, tslot);
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
@@ -3151,7 +3151,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
  * This function isn't completely safe. It's the caller's responsibility
  * that the new key won't break the order
  */
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path,
 			     struct btrfs_key *new_key)
 {
 	struct btrfs_disk_key disk_key;
@@ -3173,7 +3174,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
 	btrfs_set_item_key(eb, &disk_key, slot);
 	btrfs_mark_buffer_dirty(eb);
 	if (slot == 0)
-		fixup_low_keys(root, path, &disk_key, 1);
+		fixup_low_keys(fs_info, path, &disk_key, 1);
 }
 
 /*
@@ -3692,7 +3693,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (left_nritems)
 		btrfs_mark_buffer_dirty(left);
 	else
-		clean_tree_block(trans, root, left);
+		clean_tree_block(trans, root->fs_info, left);
 
 	btrfs_mark_buffer_dirty(right);
 
@@ -3704,7 +3705,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (path->slots[0] >= left_nritems) {
 		path->slots[0] -= left_nritems;
 		if (btrfs_header_nritems(path->nodes[0]) == 0)
-			clean_tree_block(trans, root, path->nodes[0]);
+			clean_tree_block(trans, root->fs_info, path->nodes[0]);
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = right;
@@ -3928,10 +3929,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
 	else
-		clean_tree_block(trans, root, right);
+		clean_tree_block(trans, root->fs_info, right);
 
 	btrfs_item_key(right, &disk_key, 0);
-	fixup_low_keys(root, path, &disk_key, 1);
+	fixup_low_keys(root->fs_info, path, &disk_key, 1);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
@@ -4168,6 +4169,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int mid;
 	int slot;
 	struct extent_buffer *right;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret = 0;
 	int wret;
 	int split;
@@ -4271,10 +4273,10 @@ again:
 	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
-	write_extent_buffer(right, root->fs_info->fsid,
+	write_extent_buffer(right, fs_info->fsid,
 			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
 
-	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
+	write_extent_buffer(right, fs_info->chunk_tree_uuid,
 			    btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
 
@@ -4297,7 +4299,7 @@ again:
 		path->nodes[0] = right;
 		path->slots[0] = 0;
 		if (path->slots[1] == 0)
-			fixup_low_keys(root, path, &disk_key, 1);
+			fixup_low_keys(fs_info, path, &disk_key, 1);
 	}
 	btrfs_mark_buffer_dirty(right);
 	return ret;
@@ -4615,7 +4617,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
 		btrfs_set_item_key(leaf, &disk_key, slot);
 		if (slot == 0)
-			fixup_low_keys(root, path, &disk_key, 1);
+			fixup_low_keys(root->fs_info, path, &disk_key, 1);
 	}
 
 	item = btrfs_item_nr(slot);
@@ -4716,7 +4718,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
 
 	if (path->slots[0] == 0) {
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		fixup_low_keys(root, path, &disk_key, 1);
+		fixup_low_keys(root->fs_info, path, &disk_key, 1);
 	}
 	btrfs_unlock_up_safe(path, 1);
 
@@ -4888,7 +4890,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
 		struct btrfs_disk_key disk_key;
 
 		btrfs_node_key(parent, &disk_key, 0);
-		fixup_low_keys(root, path, &disk_key, level + 1);
+		fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
 	}
 	btrfs_mark_buffer_dirty(parent);
 }
@@ -4981,7 +4983,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			btrfs_set_header_level(leaf, 0);
 		} else {
 			btrfs_set_path_blocking(path);
-			clean_tree_block(trans, root, leaf);
+			clean_tree_block(trans, root->fs_info, leaf);
 			btrfs_del_leaf(trans, root, path, leaf);
 		}
 	} else {
@@ -4990,7 +4992,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			struct btrfs_disk_key disk_key;
 
 			btrfs_item_key(leaf, &disk_key, 0);
-			fixup_low_keys(root, path, &disk_key, 1);
+			fixup_low_keys(root->fs_info, path, &disk_key, 1);
 		}
 
 		/* delete the leaf if it is mostly empty */
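
Several ctree.c hunks convert kzalloc(n * size, ...) into kcalloc(n, size, ...) (check-integrity.c makes the same move with kmalloc_array()). The functional difference is overflow safety: kcalloc() returns NULL when n * size would wrap around, instead of silently allocating a short buffer. A hedged sketch of the pattern, with a hypothetical alloc_tm_list() wrapper:

	#include <linux/slab.h>

	struct tree_mod_elem;

	static struct tree_mod_elem **alloc_tm_list(int nr_items, gfp_t flags)
	{
		/*
		 * kzalloc(nr_items * sizeof(ptr), flags) can wrap for huge
		 * nr_items and hand back a too-small zeroed buffer;
		 * kcalloc() fails cleanly with NULL on overflow.
		 */
		return kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
	}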
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9c89cae39ee..6f364e1d8d3d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1061,6 +1061,12 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+#define BTRFS_QGROUP_LEVEL_SHIFT	48
+static inline u64 btrfs_qgroup_level(u64 qgroupid)
+{
+	return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
+}
+
 /*
  * is subvolume quota turned on?
  */
@@ -1256,6 +1262,20 @@ struct btrfs_caching_control {
 	atomic_t count;
 };
 
+struct btrfs_io_ctl {
+	void *cur, *orig;
+	struct page *page;
+	struct page **pages;
+	struct btrfs_root *root;
+	struct inode *inode;
+	unsigned long size;
+	int index;
+	int num_pages;
+	int entries;
+	int bitmaps;
+	unsigned check_crcs:1;
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
@@ -1321,6 +1341,9 @@ struct btrfs_block_group_cache {
 
 	/* For dirty block groups */
 	struct list_head dirty_list;
+	struct list_head io_list;
+
+	struct btrfs_io_ctl io_ctl;
 };
 
 /* delayed seq elem */
@@ -1329,6 +1352,8 @@ struct seq_list {
 	u64 seq;
 };
 
+#define SEQ_LIST_INIT(name)	{ .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+
 enum btrfs_orphan_cleanup_state {
 	ORPHAN_CLEANUP_STARTED	= 1,
 	ORPHAN_CLEANUP_DONE	= 2,
@@ -1472,6 +1497,12 @@ struct btrfs_fs_info {
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
 
+	/*
+	 * this is taken to make sure we don't set block groups ro after
+	 * the free space cache has been allocated on them
+	 */
+	struct mutex ro_block_group_mutex;
+
 	/* this is used during read/modify/write to make sure
 	 * no two ios are trying to mod the same stripe at the same
 	 * time
@@ -1513,6 +1544,7 @@ struct btrfs_fs_info {
 
 	spinlock_t delayed_iput_lock;
 	struct list_head delayed_iputs;
+	struct rw_semaphore delayed_iput_sem;
 
 	/* this protects tree_mod_seq_list */
 	spinlock_t tree_mod_seq_lock;
@@ -3295,6 +3327,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
 }
 
 /* extent-tree.c */
+
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes);
+
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
 {
@@ -3385,6 +3420,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 u64 bytenr, u64 num_bytes, u64 parent,
 			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
 
+int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root);
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
@@ -3417,7 +3454,7 @@ enum btrfs_reserve_flush_enum {
 	BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root);
@@ -3440,6 +3477,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 					      unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
+void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			enum btrfs_reserve_flush_enum flush);
@@ -3486,7 +3524,8 @@ int btrfs_previous_item(struct btrfs_root *root,
 			 int type);
 int btrfs_previous_extent_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid);
-void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path,
 			     struct btrfs_key *new_key);
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
@@ -4180,7 +4219,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
-	    (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+	    ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID &&
+	     !btrfs_qgroup_level(rootid)))
 		return 1;
 	return 0;
 }
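
For context on the new btrfs_qgroup_level() helper and the tightened is_fstree() check: a qgroupid packs the qgroup level into the top 16 bits of the u64 and the subvolume/object id into the low 48 bits, so an id with a nonzero level can no longer be mistaken for a subvolume tree. A small sketch of the encoding (make_qgroupid() is hypothetical):

	#include <linux/types.h>

	#define BTRFS_QGROUP_LEVEL_SHIFT 48

	static inline u64 make_qgroupid(u16 level, u64 id)
	{
		return ((u64)level << BTRFS_QGROUP_LEVEL_SHIFT) | id;
	}

	/*
	 * make_qgroupid(0, 257) tracks subvolume 257 directly;
	 * make_qgroupid(1, 100) is the level-1 group "1/100", for which
	 * btrfs_qgroup_level() returns 1 and is_fstree() now returns 0.
	 */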
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 82f0c7c95474..cde698a07d21 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1383,7 +1383,7 @@ out:
 
 
 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
-				     struct btrfs_root *root, int nr)
+				     struct btrfs_fs_info *fs_info, int nr)
 {
 	struct btrfs_async_delayed_work *async_work;
 
@@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 			btrfs_async_run_delayed_root, NULL, NULL);
 	async_work->nr = nr;
 
-	btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
+	btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
 	return 0;
 }
 
@@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
 void btrfs_balance_delayed_items(struct btrfs_root *root)
 {
 	struct btrfs_delayed_root *delayed_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
@@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
 
 		seq = atomic_read(&delayed_root->items_seq);
 
-		ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
+		ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0);
 		if (ret)
 			return;
 
@@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root)
 			return;
 	}
 
-	btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
+	btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH);
 }
 
 /* Will return 0 or -ENOMEM */
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 6d16bea94e1c..8f8ed7d20bac 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -489,11 +489,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
  * existing and update must have the same bytenr
  */
 static noinline void
-update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
+			 struct btrfs_delayed_ref_node *existing,
 			 struct btrfs_delayed_ref_node *update)
 {
 	struct btrfs_delayed_ref_head *existing_ref;
 	struct btrfs_delayed_ref_head *ref;
+	int old_ref_mod;
 
 	existing_ref = btrfs_delayed_node_to_head(existing);
 	ref = btrfs_delayed_node_to_head(update);
@@ -541,7 +543,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 	 * only need the lock for this case cause we could be processing it
 	 * currently, for refs we just added we know we're a-ok.
 	 */
+	old_ref_mod = existing_ref->total_ref_mod;
 	existing->ref_mod += update->ref_mod;
+	existing_ref->total_ref_mod += update->ref_mod;
+
+	/*
+	 * If we are going to from a positive ref mod to a negative or vice
+	 * versa we need to make sure to adjust pending_csums accordingly.
+	 */
+	if (existing_ref->is_data) {
+		if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0)
+			delayed_refs->pending_csums -= existing->num_bytes;
+		if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0)
+			delayed_refs->pending_csums += existing->num_bytes;
+	}
 	spin_unlock(&existing_ref->lock);
 }
 
@@ -605,6 +620,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	head_ref->is_data = is_data;
 	head_ref->ref_root = RB_ROOT;
 	head_ref->processing = 0;
+	head_ref->total_ref_mod = count_mod;
 
 	spin_lock_init(&head_ref->lock);
 	mutex_init(&head_ref->mutex);
@@ -614,7 +630,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	existing = htree_insert(&delayed_refs->href_root,
 				&head_ref->href_node);
 	if (existing) {
-		update_existing_head_ref(&existing->node, ref);
+		update_existing_head_ref(delayed_refs, &existing->node, ref);
 		/*
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
@@ -622,6 +638,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 		head_ref = existing;
 	} else {
+		if (is_data && count_mod < 0)
+			delayed_refs->pending_csums += num_bytes;
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		atomic_inc(&delayed_refs->num_entries);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index a764e2340d48..5eb0892396d0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -88,6 +88,14 @@ struct btrfs_delayed_ref_head {
 	struct rb_node href_node;
 
 	struct btrfs_delayed_extent_op *extent_op;
+
+	/*
+	 * This is used to track the final ref_mod from all the refs associated
+	 * with this head ref, this is not adjusted as delayed refs are run,
+	 * this is meant to track if we need to do the csum accounting or not.
+	 */
+	int total_ref_mod;
+
 	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
@@ -138,6 +146,8 @@ struct btrfs_delayed_ref_root {
 	/* total number of head nodes ready for processing */
 	unsigned long num_heads_ready;
 
+	u64 pending_csums;
+
 	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5ec03d999c37..0573848c7333 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -670,8 +670,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 		srcdev = dev_replace->srcdev;
-		args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
-			div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
+		args->status.progress_1000 = div_u64(dev_replace->cursor_left,
+			div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
 		break;
 	}
 	btrfs_dev_replace_unlock(dev_replace);
@@ -806,7 +806,7 @@ static int btrfs_dev_replace_kthread(void *data)
 	btrfs_dev_replace_status(fs_info, status_args);
 	progress = status_args->status.progress_1000;
 	kfree(status_args);
-	do_div(progress, 10);
+	progress = div_u64(progress, 10);
 	printk_in_rcu(KERN_INFO
 		"BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
 		dev_replace->srcdev->missing ? "<missing disk>" :
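
The dev-replace.c hunks move from do_div()/div64_u64() to div_u64(). do_div(x, base) is a macro that overwrites x with the quotient and returns the remainder, which is easy to misread; div_u64() is a plain function that returns the quotient and is cheaper than div64_u64() when the divisor fits in 32 bits. A hedged sketch of the two call styles (percent_done() is hypothetical):

	#include <linux/math64.h>

	static u64 percent_done(u64 done, u32 total)
	{
		u64 pct = done * 100;
		u32 rem;

		rem = do_div(pct, total);  /* pct now holds the quotient */
		(void)rem;                 /* remainder returned, often unused */

		/* equivalent, and harder to misuse: quotient is returned */
		return div_u64(done * 100, total);
	}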
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 639f2663ed3f..2ef9a4b72d06 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -54,7 +54,7 @@
 #include <asm/cpufeature.h>
 #endif
 
-static struct extent_io_ops btree_extent_io_ops;
+static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
@@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result)
  * compute the csum for a btree block, and either verify it or write it
  * into the csum field of the block.
  */
-static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
+static int csum_tree_block(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *buf,
 			   int verify)
 {
-	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
@@ -302,7 +303,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 		offset += cur_len;
 	}
 	if (csum_size > sizeof(inline_result)) {
-		result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
+		result = kzalloc(csum_size, GFP_NOFS);
 		if (!result)
 			return 1;
 	} else {
@@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			printk_ratelimited(KERN_WARNING
 				"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
 				"level %d\n",
-				root->fs_info->sb->s_id, buf->start,
+				fs_info->sb->s_id, buf->start,
 				val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
 				kfree(result);
@@ -418,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
 
 		if (memcmp(raw_disk_sb, result, csum_size))
 			ret = 1;
-
-		if (ret && btrfs_super_generation(disk_sb) < 10) {
-			printk(KERN_WARNING
-			       "BTRFS: super block crcs don't match, older mkfs detected\n");
-			ret = 0;
-		}
 	}
 
 	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
@@ -501,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
  * we only fill in the checksum field in the first page of a multi-page block
  */
 
-static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
+static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
 {
 	u64 start = page_offset(page);
 	u64 found_start;
@@ -513,14 +508,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 	found_start = btrfs_header_bytenr(eb);
 	if (WARN_ON(found_start != start || !PageUptodate(page)))
 		return 0;
-	csum_tree_block(root, eb, 0);
+	csum_tree_block(fs_info, eb, 0);
 	return 0;
 }
 
-static int check_tree_block_fsid(struct btrfs_root *root,
+static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
 				 struct extent_buffer *eb)
 {
-	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	u8 fsid[BTRFS_UUID_SIZE];
 	int ret = 1;
 
@@ -640,7 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 		ret = -EIO;
 		goto err;
 	}
-	if (check_tree_block_fsid(root, eb)) {
+	if (check_tree_block_fsid(root->fs_info, eb)) {
 		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
 			       eb->fs_info->sb->s_id, eb->start);
 		ret = -EIO;
@@ -657,7 +652,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
 				       eb, found_level);
 
-	ret = csum_tree_block(root, eb, 1);
+	ret = csum_tree_block(root->fs_info, eb, 1);
 	if (ret) {
 		ret = -EIO;
 		goto err;
@@ -882,7 +877,7 @@ static int btree_csum_one_bio(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, i) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
-		ret = csum_dirty_buffer(root, bvec->bv_page);
+		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
 		if (ret)
 			break;
 	}
@@ -1119,10 +1114,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
 	return 0;
 }
 
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
+struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 					    u64 bytenr)
 {
-	return find_extent_buffer(root->fs_info, bytenr);
+	return find_extent_buffer(fs_info, bytenr);
 }
 
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -1165,11 +1160,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 
 }
 
-void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+void clean_tree_block(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info,
 		      struct extent_buffer *buf)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-
 	if (btrfs_header_generation(buf) ==
 	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
@@ -2146,6 +2140,267 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
2146 } 2140 }
2147} 2141}
2148 2142
2143static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
2144{
2145 mutex_init(&fs_info->scrub_lock);
2146 atomic_set(&fs_info->scrubs_running, 0);
2147 atomic_set(&fs_info->scrub_pause_req, 0);
2148 atomic_set(&fs_info->scrubs_paused, 0);
2149 atomic_set(&fs_info->scrub_cancel_req, 0);
2150 init_waitqueue_head(&fs_info->scrub_pause_wait);
2151 fs_info->scrub_workers_refcnt = 0;
2152}
2153
2154static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
2155{
2156 spin_lock_init(&fs_info->balance_lock);
2157 mutex_init(&fs_info->balance_mutex);
2158 atomic_set(&fs_info->balance_running, 0);
2159 atomic_set(&fs_info->balance_pause_req, 0);
2160 atomic_set(&fs_info->balance_cancel_req, 0);
2161 fs_info->balance_ctl = NULL;
2162 init_waitqueue_head(&fs_info->balance_wait_q);
2163}
2164
2165static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
2166 struct btrfs_root *tree_root)
2167{
2168 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
2169 set_nlink(fs_info->btree_inode, 1);
2170 /*
2171 * we set the i_size on the btree inode to the max possible int.
2172 * the real end of the address space is determined by all of
2173 * the devices in the system
2174 */
2175 fs_info->btree_inode->i_size = OFFSET_MAX;
2176 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2177
2178 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2179 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
2180 fs_info->btree_inode->i_mapping);
2181 BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
2182 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
2183
2184 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
2185
2186 BTRFS_I(fs_info->btree_inode)->root = tree_root;
2187 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2188 sizeof(struct btrfs_key));
2189 set_bit(BTRFS_INODE_DUMMY,
2190 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2191 btrfs_insert_inode_hash(fs_info->btree_inode);
2192}
2193
2194static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
2195{
2196 fs_info->dev_replace.lock_owner = 0;
2197 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2198 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2199 mutex_init(&fs_info->dev_replace.lock_management_lock);
2200 mutex_init(&fs_info->dev_replace.lock);
2201 init_waitqueue_head(&fs_info->replace_wait);
2202}
2203
2204static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
2205{
2206 spin_lock_init(&fs_info->qgroup_lock);
2207 mutex_init(&fs_info->qgroup_ioctl_lock);
2208 fs_info->qgroup_tree = RB_ROOT;
2209 fs_info->qgroup_op_tree = RB_ROOT;
2210 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2211 fs_info->qgroup_seq = 1;
2212 fs_info->quota_enabled = 0;
2213 fs_info->pending_quota_state = 0;
2214 fs_info->qgroup_ulist = NULL;
2215 mutex_init(&fs_info->qgroup_rescan_lock);
2216}
2217
2218static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
2219 struct btrfs_fs_devices *fs_devices)
2220{
2221 int max_active = fs_info->thread_pool_size;
2222 unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2223
2224 fs_info->workers =
2225 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
2226 max_active, 16);
2227
2228 fs_info->delalloc_workers =
2229 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2230
2231 fs_info->flush_workers =
2232 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2233
2234 fs_info->caching_workers =
2235 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2236
2237 /*
2238 * a higher idle thresh on the submit workers makes it much more
2239 * likely that bios will be sent down in a sane order to the
2240 * devices
2241 */
2242 fs_info->submit_workers =
2243 btrfs_alloc_workqueue("submit", flags,
2244 min_t(u64, fs_devices->num_devices,
2245 max_active), 64);
2246
2247 fs_info->fixup_workers =
2248 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2249
2250 /*
2251 * endios are largely parallel and should have a very
2252 * low idle thresh
2253 */
2254 fs_info->endio_workers =
2255 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2256 fs_info->endio_meta_workers =
2257 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2258 fs_info->endio_meta_write_workers =
2259 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2260 fs_info->endio_raid56_workers =
2261 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2262 fs_info->endio_repair_workers =
2263 btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
2264 fs_info->rmw_workers =
2265 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2266 fs_info->endio_write_workers =
2267 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2268 fs_info->endio_freespace_worker =
2269 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2270 fs_info->delayed_workers =
2271 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2272 fs_info->readahead_workers =
2273 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2274 fs_info->qgroup_rescan_workers =
2275 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2276 fs_info->extent_workers =
2277 btrfs_alloc_workqueue("extent-refs", flags,
2278 min_t(u64, fs_devices->num_devices,
2279 max_active), 8);
2280
2281 if (!(fs_info->workers && fs_info->delalloc_workers &&
2282 fs_info->submit_workers && fs_info->flush_workers &&
2283 fs_info->endio_workers && fs_info->endio_meta_workers &&
2284 fs_info->endio_meta_write_workers &&
2285 fs_info->endio_repair_workers &&
2286 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2287 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2288 fs_info->caching_workers && fs_info->readahead_workers &&
2289 fs_info->fixup_workers && fs_info->delayed_workers &&
2290 fs_info->extent_workers &&
2291 fs_info->qgroup_rescan_workers)) {
2292 return -ENOMEM;
2293 }
2294
2295 return 0;
2296}
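Two sizing details in btrfs_init_workqueues deserve a note: the submit queue is capped at one active worker per device so bios reach each disk in order, and the trailing argument to btrfs_alloc_workqueue is the idle threshold the comments refer to (high for submit to encourage batching, low for the already-parallel endio queues). A userspace sketch of the device cap, mirroring the min_t(u64, ...) expression above:

#include <stdio.h>
#include <stdint.h>

/* sketch of how the submit queue is sized: at most one active worker
 * per device, capped by the mount-wide thread_pool_size */
static uint64_t submit_max_active(uint64_t num_devices, uint64_t max_active)
{
        return num_devices < max_active ? num_devices : max_active;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)submit_max_active(2, 8));  /* 2 */
        printf("%llu\n", (unsigned long long)submit_max_active(24, 8)); /* 8 */
        return 0;
}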
2297
2298static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2299 struct btrfs_fs_devices *fs_devices)
2300{
2301 int ret;
2302 struct btrfs_root *tree_root = fs_info->tree_root;
2303 struct btrfs_root *log_tree_root;
2304 struct btrfs_super_block *disk_super = fs_info->super_copy;
2305 u64 bytenr = btrfs_super_log_root(disk_super);
2306
2307 if (fs_devices->rw_devices == 0) {
2308 printk(KERN_WARNING "BTRFS: log replay required "
2309 "on RO media\n");
2310 return -EIO;
2311 }
2312
2313 log_tree_root = btrfs_alloc_root(fs_info);
2314 if (!log_tree_root)
2315 return -ENOMEM;
2316
2317 __setup_root(tree_root->nodesize, tree_root->sectorsize,
2318 tree_root->stripesize, log_tree_root, fs_info,
2319 BTRFS_TREE_LOG_OBJECTID);
2320
2321 log_tree_root->node = read_tree_block(tree_root, bytenr,
2322 fs_info->generation + 1);
2323 if (!log_tree_root->node ||
2324 !extent_buffer_uptodate(log_tree_root->node)) {
2325 printk(KERN_ERR "BTRFS: failed to read log tree\n");
2326 free_extent_buffer(log_tree_root->node);
2327 kfree(log_tree_root);
2328 return -EIO;
2329 }
2330 /* returns with log_tree_root freed on success */
2331 ret = btrfs_recover_log_trees(log_tree_root);
2332 if (ret) {
2333 btrfs_error(tree_root->fs_info, ret,
2334 "Failed to recover log tree");
2335 free_extent_buffer(log_tree_root->node);
2336 kfree(log_tree_root);
2337 return ret;
2338 }
2339
2340 if (fs_info->sb->s_flags & MS_RDONLY) {
2341 ret = btrfs_commit_super(tree_root);
2342 if (ret)
2343 return ret;
2344 }
2345
2346 return 0;
2347}
2348
2349static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
2350 struct btrfs_root *tree_root)
2351{
2352 struct btrfs_root *root;
2353 struct btrfs_key location;
2354 int ret;
2355
2356 location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
2357 location.type = BTRFS_ROOT_ITEM_KEY;
2358 location.offset = 0;
2359
2360 root = btrfs_read_tree_root(tree_root, &location);
2361 if (IS_ERR(root))
2362 return PTR_ERR(root);
2363 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2364 fs_info->extent_root = root;
2365
2366 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2367 root = btrfs_read_tree_root(tree_root, &location);
2368 if (IS_ERR(root))
2369 return PTR_ERR(root);
2370 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2371 fs_info->dev_root = root;
2372 btrfs_init_devices_late(fs_info);
2373
2374 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2375 root = btrfs_read_tree_root(tree_root, &location);
2376 if (IS_ERR(root))
2377 return PTR_ERR(root);
2378 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2379 fs_info->csum_root = root;
2380
2381 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2382 root = btrfs_read_tree_root(tree_root, &location);
2383 if (!IS_ERR(root)) {
2384 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2385 fs_info->quota_enabled = 1;
2386 fs_info->pending_quota_state = 1;
2387 fs_info->quota_root = root;
2388 }
2389
2390 location.objectid = BTRFS_UUID_TREE_OBJECTID;
2391 root = btrfs_read_tree_root(tree_root, &location);
2392 if (IS_ERR(root)) {
2393 ret = PTR_ERR(root);
2394 if (ret != -ENOENT)
2395 return ret;
2396 } else {
2397 set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
2398 fs_info->uuid_root = root;
2399 }
2400
2401 return 0;
2402}
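Note the three error policies in btrfs_read_roots above: the extent, dev and csum trees are mandatory and any read failure aborts the mount; the quota tree is fully optional (an error just leaves quotas off); and the UUID tree tolerates only -ENOENT, so that open_ctree can create it later. Condensed, the three shapes are:

/* mandatory tree: any error fails the mount */
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root))
        return PTR_ERR(root);

/* quota tree: best effort, success switches quotas on */
root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(root))
        fs_info->quota_root = root;

/* uuid tree: only "not there yet" is acceptable */
root = btrfs_read_tree_root(tree_root, &location);
if (IS_ERR(root) && PTR_ERR(root) != -ENOENT)
        return PTR_ERR(root);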
2403
2149int open_ctree(struct super_block *sb, 2404int open_ctree(struct super_block *sb,
2150 struct btrfs_fs_devices *fs_devices, 2405 struct btrfs_fs_devices *fs_devices,
2151 char *options) 2406 char *options)
@@ -2160,21 +2415,12 @@ int open_ctree(struct super_block *sb,
2160 struct btrfs_super_block *disk_super; 2415 struct btrfs_super_block *disk_super;
2161 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 2416 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
2162 struct btrfs_root *tree_root; 2417 struct btrfs_root *tree_root;
2163 struct btrfs_root *extent_root;
2164 struct btrfs_root *csum_root;
2165 struct btrfs_root *chunk_root; 2418 struct btrfs_root *chunk_root;
2166 struct btrfs_root *dev_root;
2167 struct btrfs_root *quota_root;
2168 struct btrfs_root *uuid_root;
2169 struct btrfs_root *log_tree_root;
2170 int ret; 2419 int ret;
2171 int err = -EINVAL; 2420 int err = -EINVAL;
2172 int num_backups_tried = 0; 2421 int num_backups_tried = 0;
2173 int backup_index = 0; 2422 int backup_index = 0;
2174 int max_active; 2423 int max_active;
2175 int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
2176 bool create_uuid_tree;
2177 bool check_uuid_tree;
2178 2424
2179 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); 2425 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
2180 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); 2426 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
@@ -2241,11 +2487,12 @@ int open_ctree(struct super_block *sb,
2241 spin_lock_init(&fs_info->qgroup_op_lock); 2487 spin_lock_init(&fs_info->qgroup_op_lock);
2242 spin_lock_init(&fs_info->buffer_lock); 2488 spin_lock_init(&fs_info->buffer_lock);
2243 spin_lock_init(&fs_info->unused_bgs_lock); 2489 spin_lock_init(&fs_info->unused_bgs_lock);
2244 mutex_init(&fs_info->unused_bg_unpin_mutex);
2245 rwlock_init(&fs_info->tree_mod_log_lock); 2490 rwlock_init(&fs_info->tree_mod_log_lock);
2491 mutex_init(&fs_info->unused_bg_unpin_mutex);
2246 mutex_init(&fs_info->reloc_mutex); 2492 mutex_init(&fs_info->reloc_mutex);
2247 mutex_init(&fs_info->delalloc_root_mutex); 2493 mutex_init(&fs_info->delalloc_root_mutex);
2248 seqlock_init(&fs_info->profiles_lock); 2494 seqlock_init(&fs_info->profiles_lock);
2495 init_rwsem(&fs_info->delayed_iput_sem);
2249 2496
2250 init_completion(&fs_info->kobj_unregister); 2497 init_completion(&fs_info->kobj_unregister);
2251 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 2498 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
@@ -2276,7 +2523,7 @@ int open_ctree(struct super_block *sb,
2276 fs_info->free_chunk_space = 0; 2523 fs_info->free_chunk_space = 0;
2277 fs_info->tree_mod_log = RB_ROOT; 2524 fs_info->tree_mod_log = RB_ROOT;
2278 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 2525 fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
2279 fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); 2526 fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
2280 /* readahead state */ 2527 /* readahead state */
2281 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); 2528 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
2282 spin_lock_init(&fs_info->reada_lock); 2529 spin_lock_init(&fs_info->reada_lock);
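The div64_u64() above becomes a shift because the divisor is a power of two: for unsigned arithmetic, x >> 6 is exactly x / 64, and it sidesteps the 64-bit division helper that 32-bit builds would otherwise call. A quick userspace check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint64_t nsec_per_sec = 1000000000ULL;

        /* right shift by 6 is exact unsigned division by 64 */
        assert((nsec_per_sec >> 6) == nsec_per_sec / 64);
        return 0;
}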
@@ -2294,55 +2541,18 @@ int open_ctree(struct super_block *sb,
2294 } 2541 }
2295 btrfs_init_delayed_root(fs_info->delayed_root); 2542 btrfs_init_delayed_root(fs_info->delayed_root);
2296 2543
2297 mutex_init(&fs_info->scrub_lock); 2544 btrfs_init_scrub(fs_info);
2298 atomic_set(&fs_info->scrubs_running, 0);
2299 atomic_set(&fs_info->scrub_pause_req, 0);
2300 atomic_set(&fs_info->scrubs_paused, 0);
2301 atomic_set(&fs_info->scrub_cancel_req, 0);
2302 init_waitqueue_head(&fs_info->replace_wait);
2303 init_waitqueue_head(&fs_info->scrub_pause_wait);
2304 fs_info->scrub_workers_refcnt = 0;
2305#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 2545#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
2306 fs_info->check_integrity_print_mask = 0; 2546 fs_info->check_integrity_print_mask = 0;
2307#endif 2547#endif
2308 2548 btrfs_init_balance(fs_info);
2309 spin_lock_init(&fs_info->balance_lock);
2310 mutex_init(&fs_info->balance_mutex);
2311 atomic_set(&fs_info->balance_running, 0);
2312 atomic_set(&fs_info->balance_pause_req, 0);
2313 atomic_set(&fs_info->balance_cancel_req, 0);
2314 fs_info->balance_ctl = NULL;
2315 init_waitqueue_head(&fs_info->balance_wait_q);
2316 btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); 2549 btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
2317 2550
2318 sb->s_blocksize = 4096; 2551 sb->s_blocksize = 4096;
2319 sb->s_blocksize_bits = blksize_bits(4096); 2552 sb->s_blocksize_bits = blksize_bits(4096);
2320 sb->s_bdi = &fs_info->bdi; 2553 sb->s_bdi = &fs_info->bdi;
2321 2554
2322 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; 2555 btrfs_init_btree_inode(fs_info, tree_root);
2323 set_nlink(fs_info->btree_inode, 1);
2324 /*
2325 * we set the i_size on the btree inode to the max possible int.
2326 * the real end of the address space is determined by all of
2327 * the devices in the system
2328 */
2329 fs_info->btree_inode->i_size = OFFSET_MAX;
2330 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2331
2332 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2333 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
2334 fs_info->btree_inode->i_mapping);
2335 BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
2336 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
2337
2338 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
2339
2340 BTRFS_I(fs_info->btree_inode)->root = tree_root;
2341 memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
2342 sizeof(struct btrfs_key));
2343 set_bit(BTRFS_INODE_DUMMY,
2344 &BTRFS_I(fs_info->btree_inode)->runtime_flags);
2345 btrfs_insert_inode_hash(fs_info->btree_inode);
2346 2556
2347 spin_lock_init(&fs_info->block_group_cache_lock); 2557 spin_lock_init(&fs_info->block_group_cache_lock);
2348 fs_info->block_group_cache_tree = RB_ROOT; 2558 fs_info->block_group_cache_tree = RB_ROOT;
@@ -2363,26 +2573,14 @@ int open_ctree(struct super_block *sb,
2363 mutex_init(&fs_info->transaction_kthread_mutex); 2573 mutex_init(&fs_info->transaction_kthread_mutex);
2364 mutex_init(&fs_info->cleaner_mutex); 2574 mutex_init(&fs_info->cleaner_mutex);
2365 mutex_init(&fs_info->volume_mutex); 2575 mutex_init(&fs_info->volume_mutex);
2576 mutex_init(&fs_info->ro_block_group_mutex);
2366 init_rwsem(&fs_info->commit_root_sem); 2577 init_rwsem(&fs_info->commit_root_sem);
2367 init_rwsem(&fs_info->cleanup_work_sem); 2578 init_rwsem(&fs_info->cleanup_work_sem);
2368 init_rwsem(&fs_info->subvol_sem); 2579 init_rwsem(&fs_info->subvol_sem);
2369 sema_init(&fs_info->uuid_tree_rescan_sem, 1); 2580 sema_init(&fs_info->uuid_tree_rescan_sem, 1);
2370 fs_info->dev_replace.lock_owner = 0;
2371 atomic_set(&fs_info->dev_replace.nesting_level, 0);
2372 mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
2373 mutex_init(&fs_info->dev_replace.lock_management_lock);
2374 mutex_init(&fs_info->dev_replace.lock);
2375 2581
2376 spin_lock_init(&fs_info->qgroup_lock); 2582 btrfs_init_dev_replace_locks(fs_info);
2377 mutex_init(&fs_info->qgroup_ioctl_lock); 2583 btrfs_init_qgroup(fs_info);
2378 fs_info->qgroup_tree = RB_ROOT;
2379 fs_info->qgroup_op_tree = RB_ROOT;
2380 INIT_LIST_HEAD(&fs_info->dirty_qgroups);
2381 fs_info->qgroup_seq = 1;
2382 fs_info->quota_enabled = 0;
2383 fs_info->pending_quota_state = 0;
2384 fs_info->qgroup_ulist = NULL;
2385 mutex_init(&fs_info->qgroup_rescan_lock);
2386 2584
2387 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); 2585 btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
2388 btrfs_init_free_cluster(&fs_info->data_alloc_cluster); 2586 btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2554,75 +2752,9 @@ int open_ctree(struct super_block *sb,
2554 2752
2555 max_active = fs_info->thread_pool_size; 2753 max_active = fs_info->thread_pool_size;
2556 2754
2557 fs_info->workers = 2755 ret = btrfs_init_workqueues(fs_info, fs_devices);
2558 btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, 2756 if (ret) {
2559 max_active, 16); 2757 err = ret;
2560
2561 fs_info->delalloc_workers =
2562 btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
2563
2564 fs_info->flush_workers =
2565 btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
2566
2567 fs_info->caching_workers =
2568 btrfs_alloc_workqueue("cache", flags, max_active, 0);
2569
2570 /*
2571 * a higher idle thresh on the submit workers makes it much more
2572 * likely that bios will be sent down in a sane order to the
2573 * devices
2574 */
2575 fs_info->submit_workers =
2576 btrfs_alloc_workqueue("submit", flags,
2577 min_t(u64, fs_devices->num_devices,
2578 max_active), 64);
2579
2580 fs_info->fixup_workers =
2581 btrfs_alloc_workqueue("fixup", flags, 1, 0);
2582
2583 /*
2584 * endios are largely parallel and should have a very
2585 * low idle thresh
2586 */
2587 fs_info->endio_workers =
2588 btrfs_alloc_workqueue("endio", flags, max_active, 4);
2589 fs_info->endio_meta_workers =
2590 btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
2591 fs_info->endio_meta_write_workers =
2592 btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
2593 fs_info->endio_raid56_workers =
2594 btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
2595 fs_info->endio_repair_workers =
2596 btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
2597 fs_info->rmw_workers =
2598 btrfs_alloc_workqueue("rmw", flags, max_active, 2);
2599 fs_info->endio_write_workers =
2600 btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
2601 fs_info->endio_freespace_worker =
2602 btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
2603 fs_info->delayed_workers =
2604 btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
2605 fs_info->readahead_workers =
2606 btrfs_alloc_workqueue("readahead", flags, max_active, 2);
2607 fs_info->qgroup_rescan_workers =
2608 btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
2609 fs_info->extent_workers =
2610 btrfs_alloc_workqueue("extent-refs", flags,
2611 min_t(u64, fs_devices->num_devices,
2612 max_active), 8);
2613
2614 if (!(fs_info->workers && fs_info->delalloc_workers &&
2615 fs_info->submit_workers && fs_info->flush_workers &&
2616 fs_info->endio_workers && fs_info->endio_meta_workers &&
2617 fs_info->endio_meta_write_workers &&
2618 fs_info->endio_repair_workers &&
2619 fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
2620 fs_info->endio_freespace_worker && fs_info->rmw_workers &&
2621 fs_info->caching_workers && fs_info->readahead_workers &&
2622 fs_info->fixup_workers && fs_info->delayed_workers &&
2623 fs_info->extent_workers &&
2624 fs_info->qgroup_rescan_workers)) {
2625 err = -ENOMEM;
2626 goto fail_sb_buffer; 2758 goto fail_sb_buffer;
2627 } 2759 }
2628 2760
@@ -2688,7 +2820,7 @@ int open_ctree(struct super_block *sb,
2688 * keep the device that is marked to be the target device for the 2820 * keep the device that is marked to be the target device for the
2689 * dev_replace procedure 2821 * dev_replace procedure
2690 */ 2822 */
2691 btrfs_close_extra_devices(fs_info, fs_devices, 0); 2823 btrfs_close_extra_devices(fs_devices, 0);
2692 2824
2693 if (!fs_devices->latest_bdev) { 2825 if (!fs_devices->latest_bdev) {
2694 printk(KERN_ERR "BTRFS: failed to read devices on %s\n", 2826 printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
@@ -2714,61 +2846,9 @@ retry_root_backup:
2714 tree_root->commit_root = btrfs_root_node(tree_root); 2846 tree_root->commit_root = btrfs_root_node(tree_root);
2715 btrfs_set_root_refs(&tree_root->root_item, 1); 2847 btrfs_set_root_refs(&tree_root->root_item, 1);
2716 2848
2717 location.objectid = BTRFS_EXTENT_TREE_OBJECTID; 2849 ret = btrfs_read_roots(fs_info, tree_root);
2718 location.type = BTRFS_ROOT_ITEM_KEY; 2850 if (ret)
2719 location.offset = 0;
2720
2721 extent_root = btrfs_read_tree_root(tree_root, &location);
2722 if (IS_ERR(extent_root)) {
2723 ret = PTR_ERR(extent_root);
2724 goto recovery_tree_root;
2725 }
2726 set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
2727 fs_info->extent_root = extent_root;
2728
2729 location.objectid = BTRFS_DEV_TREE_OBJECTID;
2730 dev_root = btrfs_read_tree_root(tree_root, &location);
2731 if (IS_ERR(dev_root)) {
2732 ret = PTR_ERR(dev_root);
2733 goto recovery_tree_root;
2734 }
2735 set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
2736 fs_info->dev_root = dev_root;
2737 btrfs_init_devices_late(fs_info);
2738
2739 location.objectid = BTRFS_CSUM_TREE_OBJECTID;
2740 csum_root = btrfs_read_tree_root(tree_root, &location);
2741 if (IS_ERR(csum_root)) {
2742 ret = PTR_ERR(csum_root);
2743 goto recovery_tree_root; 2851 goto recovery_tree_root;
2744 }
2745 set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
2746 fs_info->csum_root = csum_root;
2747
2748 location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
2749 quota_root = btrfs_read_tree_root(tree_root, &location);
2750 if (!IS_ERR(quota_root)) {
2751 set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
2752 fs_info->quota_enabled = 1;
2753 fs_info->pending_quota_state = 1;
2754 fs_info->quota_root = quota_root;
2755 }
2756
2757 location.objectid = BTRFS_UUID_TREE_OBJECTID;
2758 uuid_root = btrfs_read_tree_root(tree_root, &location);
2759 if (IS_ERR(uuid_root)) {
2760 ret = PTR_ERR(uuid_root);
2761 if (ret != -ENOENT)
2762 goto recovery_tree_root;
2763 create_uuid_tree = true;
2764 check_uuid_tree = false;
2765 } else {
2766 set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
2767 fs_info->uuid_root = uuid_root;
2768 create_uuid_tree = false;
2769 check_uuid_tree =
2770 generation != btrfs_super_uuid_tree_generation(disk_super);
2771 }
2772 2852
2773 fs_info->generation = generation; 2853 fs_info->generation = generation;
2774 fs_info->last_trans_committed = generation; 2854 fs_info->last_trans_committed = generation;
@@ -2792,7 +2872,7 @@ retry_root_backup:
2792 goto fail_block_groups; 2872 goto fail_block_groups;
2793 } 2873 }
2794 2874
2795 btrfs_close_extra_devices(fs_info, fs_devices, 1); 2875 btrfs_close_extra_devices(fs_devices, 1);
2796 2876
2797 ret = btrfs_sysfs_add_one(fs_info); 2877 ret = btrfs_sysfs_add_one(fs_info);
2798 if (ret) { 2878 if (ret) {
@@ -2806,7 +2886,7 @@ retry_root_backup:
2806 goto fail_sysfs; 2886 goto fail_sysfs;
2807 } 2887 }
2808 2888
2809 ret = btrfs_read_block_groups(extent_root); 2889 ret = btrfs_read_block_groups(fs_info->extent_root);
2810 if (ret) { 2890 if (ret) {
2811 printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); 2891 printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
2812 goto fail_sysfs; 2892 goto fail_sysfs;
@@ -2864,48 +2944,11 @@ retry_root_backup:
2864 2944
2865 /* do not make disk changes in broken FS */ 2945 /* do not make disk changes in broken FS */
2866 if (btrfs_super_log_root(disk_super) != 0) { 2946 if (btrfs_super_log_root(disk_super) != 0) {
2867 u64 bytenr = btrfs_super_log_root(disk_super); 2947 ret = btrfs_replay_log(fs_info, fs_devices);
2868
2869 if (fs_devices->rw_devices == 0) {
2870 printk(KERN_WARNING "BTRFS: log replay required "
2871 "on RO media\n");
2872 err = -EIO;
2873 goto fail_qgroup;
2874 }
2875
2876 log_tree_root = btrfs_alloc_root(fs_info);
2877 if (!log_tree_root) {
2878 err = -ENOMEM;
2879 goto fail_qgroup;
2880 }
2881
2882 __setup_root(nodesize, sectorsize, stripesize,
2883 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
2884
2885 log_tree_root->node = read_tree_block(tree_root, bytenr,
2886 generation + 1);
2887 if (!log_tree_root->node ||
2888 !extent_buffer_uptodate(log_tree_root->node)) {
2889 printk(KERN_ERR "BTRFS: failed to read log tree\n");
2890 free_extent_buffer(log_tree_root->node);
2891 kfree(log_tree_root);
2892 goto fail_qgroup;
2893 }
2894 /* returns with log_tree_root freed on success */
2895 ret = btrfs_recover_log_trees(log_tree_root);
2896 if (ret) { 2948 if (ret) {
2897 btrfs_error(tree_root->fs_info, ret, 2949 err = ret;
2898 "Failed to recover log tree");
2899 free_extent_buffer(log_tree_root->node);
2900 kfree(log_tree_root);
2901 goto fail_qgroup; 2950 goto fail_qgroup;
2902 } 2951 }
2903
2904 if (sb->s_flags & MS_RDONLY) {
2905 ret = btrfs_commit_super(tree_root);
2906 if (ret)
2907 goto fail_qgroup;
2908 }
2909 } 2952 }
2910 2953
2911 ret = btrfs_find_orphan_roots(tree_root); 2954 ret = btrfs_find_orphan_roots(tree_root);
@@ -2966,7 +3009,7 @@ retry_root_backup:
2966 3009
2967 btrfs_qgroup_rescan_resume(fs_info); 3010 btrfs_qgroup_rescan_resume(fs_info);
2968 3011
2969 if (create_uuid_tree) { 3012 if (!fs_info->uuid_root) {
2970 pr_info("BTRFS: creating UUID tree\n"); 3013 pr_info("BTRFS: creating UUID tree\n");
2971 ret = btrfs_create_uuid_tree(fs_info); 3014 ret = btrfs_create_uuid_tree(fs_info);
2972 if (ret) { 3015 if (ret) {
@@ -2975,8 +3018,9 @@ retry_root_backup:
2975 close_ctree(tree_root); 3018 close_ctree(tree_root);
2976 return ret; 3019 return ret;
2977 } 3020 }
2978 } else if (check_uuid_tree || 3021 } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
2979 btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { 3022 fs_info->generation !=
3023 btrfs_super_uuid_tree_generation(disk_super)) {
2980 pr_info("BTRFS: checking UUID tree\n"); 3024 pr_info("BTRFS: checking UUID tree\n");
2981 ret = btrfs_check_uuid_tree(fs_info); 3025 ret = btrfs_check_uuid_tree(fs_info);
2982 if (ret) { 3026 if (ret) {
@@ -3668,7 +3712,7 @@ void close_ctree(struct btrfs_root *root)
3668 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 3712 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
3669 ret = btrfs_commit_super(root); 3713 ret = btrfs_commit_super(root);
3670 if (ret) 3714 if (ret)
3671 btrfs_err(root->fs_info, "commit super ret %d", ret); 3715 btrfs_err(fs_info, "commit super ret %d", ret);
3672 } 3716 }
3673 3717
3674 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) 3718 if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
@@ -3680,10 +3724,10 @@ void close_ctree(struct btrfs_root *root)
3680 fs_info->closing = 2; 3724 fs_info->closing = 2;
3681 smp_mb(); 3725 smp_mb();
3682 3726
3683 btrfs_free_qgroup_config(root->fs_info); 3727 btrfs_free_qgroup_config(fs_info);
3684 3728
3685 if (percpu_counter_sum(&fs_info->delalloc_bytes)) { 3729 if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
3686 btrfs_info(root->fs_info, "at unmount delalloc count %lld", 3730 btrfs_info(fs_info, "at unmount delalloc count %lld",
3687 percpu_counter_sum(&fs_info->delalloc_bytes)); 3731 percpu_counter_sum(&fs_info->delalloc_bytes));
3688 } 3732 }
3689 3733
@@ -3723,7 +3767,7 @@ void close_ctree(struct btrfs_root *root)
3723 3767
3724 btrfs_free_stripe_hash_table(fs_info); 3768 btrfs_free_stripe_hash_table(fs_info);
3725 3769
3726 btrfs_free_block_rsv(root, root->orphan_block_rsv); 3770 __btrfs_free_block_rsv(root->orphan_block_rsv);
3727 root->orphan_block_rsv = NULL; 3771 root->orphan_block_rsv = NULL;
3728 3772
3729 lock_chunks(root); 3773 lock_chunks(root);
@@ -4134,7 +4178,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
4134 4178
4135 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 4179 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
4136 while (start <= end) { 4180 while (start <= end) {
4137 eb = btrfs_find_tree_block(root, start); 4181 eb = btrfs_find_tree_block(root->fs_info, start);
4138 start += root->nodesize; 4182 start += root->nodesize;
4139 if (!eb) 4183 if (!eb)
4140 continue; 4184 continue;
@@ -4285,7 +4329,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
4285 return 0; 4329 return 0;
4286} 4330}
4287 4331
4288static struct extent_io_ops btree_extent_io_ops = { 4332static const struct extent_io_ops btree_extent_io_ops = {
4289 .readpage_end_io_hook = btree_readpage_end_io_hook, 4333 .readpage_end_io_hook = btree_readpage_end_io_hook,
4290 .readpage_io_failed_hook = btree_io_failed_hook, 4334 .readpage_io_failed_hook = btree_io_failed_hook,
4291 .submit_bio_hook = btree_submit_bio_hook, 4335 .submit_bio_hook = btree_submit_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 27d44c0fd236..d4cbfeeeedd4 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
53 u64 bytenr); 53 u64 bytenr);
54void clean_tree_block(struct btrfs_trans_handle *trans, 54void clean_tree_block(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, struct extent_buffer *buf); 55 struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
56int open_ctree(struct super_block *sb, 56int open_ctree(struct super_block *sb,
57 struct btrfs_fs_devices *fs_devices, 57 struct btrfs_fs_devices *fs_devices,
58 char *options); 58 char *options);
@@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, int max_mirrors); 61 struct btrfs_root *root, int max_mirrors);
62struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 62struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_commit_super(struct btrfs_root *root); 63int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 64struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
65 u64 bytenr); 65 u64 bytenr);
66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, 66struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
67 struct btrfs_key *location); 67 struct btrfs_key *location);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8b353ad02f03..1eef4ee01d1a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2538,6 +2538,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2538 * list before we release it. 2538 * list before we release it.
2539 */ 2539 */
2540 if (btrfs_delayed_ref_is_head(ref)) { 2540 if (btrfs_delayed_ref_is_head(ref)) {
2541 if (locked_ref->is_data &&
2542 locked_ref->total_ref_mod < 0) {
2543 spin_lock(&delayed_refs->lock);
2544 delayed_refs->pending_csums -= ref->num_bytes;
2545 spin_unlock(&delayed_refs->lock);
2546 }
2541 btrfs_delayed_ref_unlock(locked_ref); 2547 btrfs_delayed_ref_unlock(locked_ref);
2542 locked_ref = NULL; 2548 locked_ref = NULL;
2543 } 2549 }
@@ -2561,8 +2567,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2561 */ 2567 */
2562 spin_lock(&delayed_refs->lock); 2568 spin_lock(&delayed_refs->lock);
2563 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; 2569 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2564 avg = div64_u64(avg, 4); 2570 fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
2565 fs_info->avg_delayed_ref_runtime = avg;
2566 spin_unlock(&delayed_refs->lock); 2571 spin_unlock(&delayed_refs->lock);
2567 } 2572 }
2568 return 0; 2573 return 0;
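The hunk above keeps avg_delayed_ref_runtime as an exponential moving average, new = (3 * old + sample) / 4, with the divide-by-4 again folded into a shift. A userspace sketch of the update rule:

#include <stdio.h>
#include <stdint.h>

/* weighted average: 3 parts history, 1 part new sample, divided by 4 */
static uint64_t update_avg(uint64_t avg, uint64_t runtime)
{
        return (avg * 3 + runtime) >> 2;
}

int main(void)
{
        uint64_t avg = 1000;

        /* 1250: the average moves a quarter of the way toward the sample */
        avg = update_avg(avg, 2000);
        printf("%llu\n", (unsigned long long)avg);
        return 0;
}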
@@ -2624,7 +2629,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
2624 * We don't ever fill up leaves all the way so multiply by 2 just to be 2629 * We don't ever fill up leaves all the way so multiply by 2 just to be
2625 * closer to what we're really going to want to use. 2630 * closer to what we're really going to want to use.
2626 */ 2631 */
2627 return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); 2632 return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
2633}
2634
2635/*
2636 * Takes the number of bytes to be checksummed and figures out how many leaves it
2637 * would require to store the csums for that many bytes.
2638 */
2639u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
2640{
2641 u64 csum_size;
2642 u64 num_csums_per_leaf;
2643 u64 num_csums;
2644
2645 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
2646 num_csums_per_leaf = div64_u64(csum_size,
2647 (u64)btrfs_super_csum_size(root->fs_info->super_copy));
2648 num_csums = div64_u64(csum_bytes, root->sectorsize);
2649 num_csums += num_csums_per_leaf - 1;
2650 num_csums = div64_u64(num_csums, num_csums_per_leaf);
2651 return num_csums;
2628} 2652}
2629 2653
2630int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2654int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
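btrfs_csum_bytes_to_leaves is two ceiling divisions: bytes become one checksum per sector, and the checksums are then packed into leaves. A standalone sketch with illustrative geometry, assuming 16K nodes, 4K sectors and 4-byte crc32c items (the real leaf, item and csum sizes come from the node size and the superblock, so treat these constants as examples only):

#include <stdio.h>
#include <stdint.h>

static uint64_t csum_bytes_to_leaves(uint64_t csum_bytes, uint64_t leaf_data,
                                     uint64_t item_size, uint64_t csum_size,
                                     uint64_t sectorsize)
{
        uint64_t per_leaf = (leaf_data - item_size) / csum_size;
        uint64_t num_csums = csum_bytes / sectorsize;

        /* round up: a partially filled leaf still costs a full leaf */
        return (num_csums + per_leaf - 1) / per_leaf;
}

int main(void)
{
        /* 1GiB of data with the illustrative geometry: ~65 leaves */
        printf("%llu\n", (unsigned long long)
               csum_bytes_to_leaves(1ULL << 30, 16283, 25, 4, 4096));
        return 0;
}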
@@ -2632,7 +2656,9 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2632{ 2656{
2633 struct btrfs_block_rsv *global_rsv; 2657 struct btrfs_block_rsv *global_rsv;
2634 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; 2658 u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
2635 u64 num_bytes; 2659 u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
2660 u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
2661 u64 num_bytes, num_dirty_bgs_bytes;
2636 int ret = 0; 2662 int ret = 0;
2637 2663
2638 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 2664 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
@@ -2640,17 +2666,22 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
2640 if (num_heads > 1) 2666 if (num_heads > 1)
2641 num_bytes += (num_heads - 1) * root->nodesize; 2667 num_bytes += (num_heads - 1) * root->nodesize;
2642 num_bytes <<= 1; 2668 num_bytes <<= 1;
2669 num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
2670 num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
2671 num_dirty_bgs);
2643 global_rsv = &root->fs_info->global_block_rsv; 2672 global_rsv = &root->fs_info->global_block_rsv;
2644 2673
2645 /* 2674 /*
2646 * If we can't allocate any more chunks, let's make sure we have _lots_ of 2675 * If we can't allocate any more chunks, let's make sure we have _lots_ of
2647 * wiggle room since running delayed refs can create more delayed refs. 2676 * wiggle room since running delayed refs can create more delayed refs.
2648 */ 2677 */
2649 if (global_rsv->space_info->full) 2678 if (global_rsv->space_info->full) {
2679 num_dirty_bgs_bytes <<= 1;
2650 num_bytes <<= 1; 2680 num_bytes <<= 1;
2681 }
2651 2682
2652 spin_lock(&global_rsv->lock); 2683 spin_lock(&global_rsv->lock);
2653 if (global_rsv->reserved <= num_bytes) 2684 if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
2654 ret = 1; 2685 ret = 1;
2655 spin_unlock(&global_rsv->lock); 2686 spin_unlock(&global_rsv->lock);
2656 return ret; 2687 return ret;
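With the csum and dirty-block-group terms added, the reservation estimate above has three parts: metadata for the ref heads themselves, leaves for the checksums that pending data refs may touch, and the cost of rewriting num_dirty_bgs block group items, with everything padded further once chunk allocation is exhausted. A condensed userspace sketch of the final comparison, where head_cost stands in for btrfs_calc_trans_metadata_size:

#include <stdint.h>

/* sketch: nonzero means "flush delayed refs before taking on more" */
static int need_flush(uint64_t num_heads, uint64_t csum_leaves,
                      uint64_t bg_bytes, uint64_t nodesize,
                      uint64_t head_cost, uint64_t reserved, int full)
{
        uint64_t num_bytes = head_cost;

        if (num_heads > 1)
                num_bytes += (num_heads - 1) * nodesize;
        num_bytes <<= 1;                        /* wiggle room */
        num_bytes += csum_leaves * nodesize;
        if (full) {                             /* can't grow: double up */
                bg_bytes <<= 1;
                num_bytes <<= 1;
        }
        return reserved <= num_bytes + bg_bytes;
}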
@@ -3193,7 +3224,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3193 struct inode *inode = NULL; 3224 struct inode *inode = NULL;
3194 u64 alloc_hint = 0; 3225 u64 alloc_hint = 0;
3195 int dcs = BTRFS_DC_ERROR; 3226 int dcs = BTRFS_DC_ERROR;
3196 int num_pages = 0; 3227 u64 num_pages = 0;
3197 int retries = 0; 3228 int retries = 0;
3198 int ret = 0; 3229 int ret = 0;
3199 3230
@@ -3267,7 +3298,7 @@ again:
3267 if (ret) 3298 if (ret)
3268 goto out_put; 3299 goto out_put;
3269 3300
3270 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3301 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
3271 if (ret) 3302 if (ret)
3272 goto out_put; 3303 goto out_put;
3273 } 3304 }
@@ -3293,14 +3324,14 @@ again:
3293 * taking up quite a bit since it's not folded into the other space 3324 * taking up quite a bit since it's not folded into the other space
3294 * cache. 3325 * cache.
3295 */ 3326 */
3296 num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); 3327 num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
3297 if (!num_pages) 3328 if (!num_pages)
3298 num_pages = 1; 3329 num_pages = 1;
3299 3330
3300 num_pages *= 16; 3331 num_pages *= 16;
3301 num_pages *= PAGE_CACHE_SIZE; 3332 num_pages *= PAGE_CACHE_SIZE;
3302 3333
3303 ret = btrfs_check_data_free_space(inode, num_pages); 3334 ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
3304 if (ret) 3335 if (ret)
3305 goto out_put; 3336 goto out_put;
3306 3337
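The free space cache is sized at 16 pages per 256MB of block group, with a one-chunk floor. A quick userspace version of the sizing, assuming 4K pages for the example:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE_BYTES 4096ULL /* illustrative; the kernel uses PAGE_CACHE_SIZE */

static uint64_t cache_bytes(uint64_t block_group_len)
{
        uint64_t num_pages = block_group_len / (256ULL * 1024 * 1024);

        if (!num_pages)
                num_pages = 1;          /* small groups still get one chunk */
        return num_pages * 16 * PAGE_SIZE_BYTES;
}

int main(void)
{
        /* a 1GiB block group reserves 256KiB of cache space */
        printf("%llu\n", (unsigned long long)cache_bytes(1ULL << 30));
        return 0;
}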
@@ -3351,16 +3382,156 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
3351 return 0; 3382 return 0;
3352} 3383}
3353 3384
3354int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 3385/*
3386 * transaction commit does final block group cache writeback during a
3387 * critical section where nothing is allowed to change the FS. This is
3388 * required in order for the cache to actually match the block group,
3389 * but can introduce a lot of latency into the commit.
3390 *
3391 * So, btrfs_start_dirty_block_groups is here to kick off block group
3392 * cache IO. There's a chance we'll have to redo some of it if the
3393 * block group changes again during the commit, but it greatly reduces
3394 * the commit latency by getting rid of the easy block groups while
3395 * we're still allowing others to join the commit.
3396 */
3397int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3355 struct btrfs_root *root) 3398 struct btrfs_root *root)
3356{ 3399{
3357 struct btrfs_block_group_cache *cache; 3400 struct btrfs_block_group_cache *cache;
3358 struct btrfs_transaction *cur_trans = trans->transaction; 3401 struct btrfs_transaction *cur_trans = trans->transaction;
3359 int ret = 0; 3402 int ret = 0;
3360 struct btrfs_path *path; 3403 int should_put;
3404 struct btrfs_path *path = NULL;
3405 LIST_HEAD(dirty);
3406 struct list_head *io = &cur_trans->io_bgs;
3407 int num_started = 0;
3408 int loops = 0;
3409
3410 spin_lock(&cur_trans->dirty_bgs_lock);
3411 if (!list_empty(&cur_trans->dirty_bgs)) {
3412 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3413 }
3414 spin_unlock(&cur_trans->dirty_bgs_lock);
3361 3415
3362 if (list_empty(&cur_trans->dirty_bgs)) 3416again:
3417 if (list_empty(&dirty)) {
3418 btrfs_free_path(path);
3363 return 0; 3419 return 0;
3420 }
3421
3422 /*
3423 * make sure all the block groups on our dirty list actually
3424 * exist
3425 */
3426 btrfs_create_pending_block_groups(trans, root);
3427
3428 if (!path) {
3429 path = btrfs_alloc_path();
3430 if (!path)
3431 return -ENOMEM;
3432 }
3433
3434 while (!list_empty(&dirty)) {
3435 cache = list_first_entry(&dirty,
3436 struct btrfs_block_group_cache,
3437 dirty_list);
3438
3439 /*
3440 * cache_write_mutex is here only to save us from balance
3441 * deleting this block group while we are writing out the
3442 * cache
3443 */
3444 mutex_lock(&trans->transaction->cache_write_mutex);
3445
3446 /*
3447 * this can happen if something re-dirties a block
3448 * group that is already under IO. Just wait for it to
3449 * finish and then do it all again
3450 */
3451 if (!list_empty(&cache->io_list)) {
3452 list_del_init(&cache->io_list);
3453 btrfs_wait_cache_io(root, trans, cache,
3454 &cache->io_ctl, path,
3455 cache->key.objectid);
3456 btrfs_put_block_group(cache);
3457 }
3458
3459
3460 /*
3461 * btrfs_wait_cache_io uses the cache->dirty_list to decide
3462 * if it should update the cache_state. Don't delete
3463 * until after we wait.
3464 *
3465 * Since we're not running in the commit critical section
3466 * we need the dirty_bgs_lock to protect from update_block_group
3467 */
3468 spin_lock(&cur_trans->dirty_bgs_lock);
3469 list_del_init(&cache->dirty_list);
3470 spin_unlock(&cur_trans->dirty_bgs_lock);
3471
3472 should_put = 1;
3473
3474 cache_save_setup(cache, trans, path);
3475
3476 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3477 cache->io_ctl.inode = NULL;
3478 ret = btrfs_write_out_cache(root, trans, cache, path);
3479 if (ret == 0 && cache->io_ctl.inode) {
3480 num_started++;
3481 should_put = 0;
3482
3483 /*
3484 * the cache_write_mutex is protecting
3485 * the io_list
3486 */
3487 list_add_tail(&cache->io_list, io);
3488 } else {
3489 /*
3490 * if we failed to write the cache, the
3491 * generation will be bad and life goes on
3492 */
3493 ret = 0;
3494 }
3495 }
3496 if (!ret)
3497 ret = write_one_cache_group(trans, root, path, cache);
3498 mutex_unlock(&trans->transaction->cache_write_mutex);
3499
3500 /* if it's not on the io list, we need to put the block group */
3501 if (should_put)
3502 btrfs_put_block_group(cache);
3503
3504 if (ret)
3505 break;
3506 }
3507
3508 /*
3509 * go through delayed refs for all the stuff we've just kicked off
3510 * and then loop back (just once)
3511 */
3512 ret = btrfs_run_delayed_refs(trans, root, 0);
3513 if (!ret && loops == 0) {
3514 loops++;
3515 spin_lock(&cur_trans->dirty_bgs_lock);
3516 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3517 spin_unlock(&cur_trans->dirty_bgs_lock);
3518 goto again;
3519 }
3520
3521 btrfs_free_path(path);
3522 return ret;
3523}
3524
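The list handling that makes the early writeback safe is easy to miss in the body above, so here it is reduced to its skeleton (a sketch, not compilable on its own): the shared dirty_bgs list is spliced to a private list under dirty_bgs_lock, each group is written out, and the whole thing loops exactly once more to pick up groups re-dirtied by the delayed-ref run.

LIST_HEAD(dirty);
int loops = 0;

spin_lock(&cur_trans->dirty_bgs_lock);
list_splice_init(&cur_trans->dirty_bgs, &dirty);
spin_unlock(&cur_trans->dirty_bgs_lock);
again:
        while (!list_empty(&dirty)) {
                /* wait out any in-flight cache IO, write the group out,
                 * dropping its dirty_list membership under the lock */
        }
        btrfs_run_delayed_refs(trans, root, 0);
        if (loops++ == 0) {
                spin_lock(&cur_trans->dirty_bgs_lock);
                list_splice_init(&cur_trans->dirty_bgs, &dirty);
                spin_unlock(&cur_trans->dirty_bgs_lock);
                goto again;
        }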
3525int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3526 struct btrfs_root *root)
3527{
3528 struct btrfs_block_group_cache *cache;
3529 struct btrfs_transaction *cur_trans = trans->transaction;
3530 int ret = 0;
3531 int should_put;
3532 struct btrfs_path *path;
3533 struct list_head *io = &cur_trans->io_bgs;
3534 int num_started = 0;
3364 3535
3365 path = btrfs_alloc_path(); 3536 path = btrfs_alloc_path();
3366 if (!path) 3537 if (!path)
@@ -3376,16 +3547,61 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3376 cache = list_first_entry(&cur_trans->dirty_bgs, 3547 cache = list_first_entry(&cur_trans->dirty_bgs,
3377 struct btrfs_block_group_cache, 3548 struct btrfs_block_group_cache,
3378 dirty_list); 3549 dirty_list);
3550
3551 /*
3552 * this can happen if cache_save_setup re-dirties a block
3553 * group that is already under IO. Just wait for it to
3554 * finish and then do it all again
3555 */
3556 if (!list_empty(&cache->io_list)) {
3557 list_del_init(&cache->io_list);
3558 btrfs_wait_cache_io(root, trans, cache,
3559 &cache->io_ctl, path,
3560 cache->key.objectid);
3561 btrfs_put_block_group(cache);
3562 }
3563
3564 /*
3565 * don't remove from the dirty list until after we've waited
3566 * on any pending IO
3567 */
3379 list_del_init(&cache->dirty_list); 3568 list_del_init(&cache->dirty_list);
3380 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3569 should_put = 1;
3381 cache_save_setup(cache, trans, path); 3570
3571 cache_save_setup(cache, trans, path);
3572
3382 if (!ret) 3573 if (!ret)
3383 ret = btrfs_run_delayed_refs(trans, root, 3574 ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
3384 (unsigned long) -1); 3575
3385 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) 3576 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3386 btrfs_write_out_cache(root, trans, cache, path); 3577 cache->io_ctl.inode = NULL;
3578 ret = btrfs_write_out_cache(root, trans, cache, path);
3579 if (ret == 0 && cache->io_ctl.inode) {
3580 num_started++;
3581 should_put = 0;
3582 list_add_tail(&cache->io_list, io);
3583 } else {
3584 /*
3585 * if we failed to write the cache, the
3586 * generation will be bad and life goes on
3587 */
3588 ret = 0;
3589 }
3590 }
3387 if (!ret) 3591 if (!ret)
3388 ret = write_one_cache_group(trans, root, path, cache); 3592 ret = write_one_cache_group(trans, root, path, cache);
3593
3594 /* if it's not on the io list, we need to put the block group */
3595 if (should_put)
3596 btrfs_put_block_group(cache);
3597 }
3598
3599 while (!list_empty(io)) {
3600 cache = list_first_entry(io, struct btrfs_block_group_cache,
3601 io_list);
3602 list_del_init(&cache->io_list);
3603 btrfs_wait_cache_io(root, trans, cache,
3604 &cache->io_ctl, path, cache->key.objectid);
3389 btrfs_put_block_group(cache); 3605 btrfs_put_block_group(cache);
3390 } 3606 }
3391 3607
@@ -3635,19 +3851,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3635 * This will check the space that the inode allocates from to make sure we have 3851 * This will check the space that the inode allocates from to make sure we have
3636 * enough space for bytes. 3852 * enough space for bytes.
3637 */ 3853 */
3638int btrfs_check_data_free_space(struct inode *inode, u64 bytes) 3854int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
3639{ 3855{
3640 struct btrfs_space_info *data_sinfo; 3856 struct btrfs_space_info *data_sinfo;
3641 struct btrfs_root *root = BTRFS_I(inode)->root; 3857 struct btrfs_root *root = BTRFS_I(inode)->root;
3642 struct btrfs_fs_info *fs_info = root->fs_info; 3858 struct btrfs_fs_info *fs_info = root->fs_info;
3643 u64 used; 3859 u64 used;
3644 int ret = 0, committed = 0, alloc_chunk = 1; 3860 int ret = 0;
3861 int need_commit = 2;
3862 int have_pinned_space;
3645 3863
3646 /* make sure bytes are sectorsize aligned */ 3864 /* make sure bytes are sectorsize aligned */
3647 bytes = ALIGN(bytes, root->sectorsize); 3865 bytes = ALIGN(bytes, root->sectorsize);
3648 3866
3649 if (btrfs_is_free_space_inode(inode)) { 3867 if (btrfs_is_free_space_inode(inode)) {
3650 committed = 1; 3868 need_commit = 0;
3651 ASSERT(current->journal_info); 3869 ASSERT(current->journal_info);
3652 } 3870 }
3653 3871
@@ -3669,7 +3887,7 @@ again:
3669 * if we don't have enough free bytes in this space then we need 3887 * if we don't have enough free bytes in this space then we need
3670 * to alloc a new chunk. 3888 * to alloc a new chunk.
3671 */ 3889 */
3672 if (!data_sinfo->full && alloc_chunk) { 3890 if (!data_sinfo->full) {
3673 u64 alloc_target; 3891 u64 alloc_target;
3674 3892
3675 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; 3893 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
@@ -3697,8 +3915,10 @@ alloc:
3697 if (ret < 0) { 3915 if (ret < 0) {
3698 if (ret != -ENOSPC) 3916 if (ret != -ENOSPC)
3699 return ret; 3917 return ret;
3700 else 3918 else {
3919 have_pinned_space = 1;
3701 goto commit_trans; 3920 goto commit_trans;
3921 }
3702 } 3922 }
3703 3923
3704 if (!data_sinfo) 3924 if (!data_sinfo)
@@ -3709,26 +3929,39 @@ alloc:
3709 3929
3710 /* 3930 /*
3711 * If we don't have enough pinned space to deal with this 3931 * If we don't have enough pinned space to deal with this
3712 * allocation don't bother committing the transaction. 3932 * allocation, and no chunk was removed in the current transaction,
3933 * don't bother committing the transaction.
3713 */ 3934 */
3714 if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, 3935 have_pinned_space = percpu_counter_compare(
3715 bytes) < 0) 3936 &data_sinfo->total_bytes_pinned,
3716 committed = 1; 3937 used + bytes - data_sinfo->total_bytes);
3717 spin_unlock(&data_sinfo->lock); 3938 spin_unlock(&data_sinfo->lock);
3718 3939
3719 /* commit the current transaction and try again */ 3940 /* commit the current transaction and try again */
3720commit_trans: 3941commit_trans:
3721 if (!committed && 3942 if (need_commit &&
3722 !atomic_read(&root->fs_info->open_ioctl_trans)) { 3943 !atomic_read(&root->fs_info->open_ioctl_trans)) {
3723 committed = 1; 3944 need_commit--;
3724 3945
3725 trans = btrfs_join_transaction(root); 3946 trans = btrfs_join_transaction(root);
3726 if (IS_ERR(trans)) 3947 if (IS_ERR(trans))
3727 return PTR_ERR(trans); 3948 return PTR_ERR(trans);
3728 ret = btrfs_commit_transaction(trans, root); 3949 if (have_pinned_space >= 0 ||
3729 if (ret) 3950 trans->transaction->have_free_bgs ||
3730 return ret; 3951 need_commit > 0) {
3731 goto again; 3952 ret = btrfs_commit_transaction(trans, root);
3953 if (ret)
3954 return ret;
3955 /*
3956 * make sure that all running delayed iput are
3957 * done
3958 */
3959 down_write(&root->fs_info->delayed_iput_sem);
3960 up_write(&root->fs_info->delayed_iput_sem);
3961 goto again;
3962 } else {
3963 btrfs_end_transaction(trans, root);
3964 }
3732 } 3965 }
3733 3966
3734 trace_btrfs_space_reservation(root->fs_info, 3967 trace_btrfs_space_reservation(root->fs_info,
@@ -3736,12 +3969,16 @@ commit_trans:
3736 data_sinfo->flags, bytes, 1); 3969 data_sinfo->flags, bytes, 1);
3737 return -ENOSPC; 3970 return -ENOSPC;
3738 } 3971 }
3972 ret = btrfs_qgroup_reserve(root, write_bytes);
3973 if (ret)
3974 goto out;
3739 data_sinfo->bytes_may_use += bytes; 3975 data_sinfo->bytes_may_use += bytes;
3740 trace_btrfs_space_reservation(root->fs_info, "space_info", 3976 trace_btrfs_space_reservation(root->fs_info, "space_info",
3741 data_sinfo->flags, bytes, 1); 3977 data_sinfo->flags, bytes, 1);
3978out:
3742 spin_unlock(&data_sinfo->lock); 3979 spin_unlock(&data_sinfo->lock);
3743 3980
3744 return 0; 3981 return ret;
3745} 3982}
3746 3983
3747/* 3984/*
@@ -4298,8 +4535,13 @@ out:
4298static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, 4535static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4299 struct btrfs_fs_info *fs_info, u64 used) 4536 struct btrfs_fs_info *fs_info, u64 used)
4300{ 4537{
4301 return (used >= div_factor_fine(space_info->total_bytes, 98) && 4538 u64 thresh = div_factor_fine(space_info->total_bytes, 98);
4302 !btrfs_fs_closing(fs_info) && 4539
4540 /* If we're just plain full then async reclaim just slows us down. */
4541 if (space_info->bytes_used >= thresh)
4542 return 0;
4543
4544 return (used >= thresh && !btrfs_fs_closing(fs_info) &&
4303 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); 4545 !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
4304} 4546}
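need_do_async_reclaim still fires at 98% of the space_info, but now bails out when bytes_used alone is already past the threshold: if the space is genuinely full rather than merely over-reserved, flushing cannot free anything and only adds latency. Assuming div_factor_fine(total, 98) is total * 98 / 100, a userspace check of the two cases:

#include <stdio.h>
#include <stdint.h>

/* mirrors btrfs's div_factor_fine(): num * factor / 100 */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
        return num * factor / 100;
}

static int need_async_reclaim(uint64_t total, uint64_t used,
                              uint64_t bytes_used)
{
        uint64_t thresh = div_factor_fine(total, 98);

        if (bytes_used >= thresh)       /* plain full: reclaim can't help */
                return 0;
        return used >= thresh;          /* over-committed: try flushing */
}

int main(void)
{
        printf("%d\n", need_async_reclaim(100, 99, 50)); /* 1: reservations high */
        printf("%d\n", need_async_reclaim(100, 99, 99)); /* 0: actually full */
        return 0;
}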
4305 4547
@@ -4354,10 +4596,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
4354 if (!btrfs_need_do_async_reclaim(space_info, fs_info, 4596 if (!btrfs_need_do_async_reclaim(space_info, fs_info,
4355 flush_state)) 4597 flush_state))
4356 return; 4598 return;
4357 } while (flush_state <= COMMIT_TRANS); 4599 } while (flush_state < COMMIT_TRANS);
4358
4359 if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
4360 queue_work(system_unbound_wq, work);
4361} 4600}
4362 4601
4363void btrfs_init_async_reclaim_work(struct work_struct *work) 4602void btrfs_init_async_reclaim_work(struct work_struct *work)
@@ -4700,6 +4939,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
4700 kfree(rsv); 4939 kfree(rsv);
4701} 4940}
4702 4941
4942void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
4943{
4944 kfree(rsv);
4945}
4946
4703int btrfs_block_rsv_add(struct btrfs_root *root, 4947int btrfs_block_rsv_add(struct btrfs_root *root,
4704 struct btrfs_block_rsv *block_rsv, u64 num_bytes, 4948 struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4705 enum btrfs_reserve_flush_enum flush) 4949 enum btrfs_reserve_flush_enum flush)
@@ -4812,10 +5056,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4812 5056
4813 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * 5057 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4814 csum_size * 2; 5058 csum_size * 2;
4815 num_bytes += div64_u64(data_used + meta_used, 50); 5059 num_bytes += div_u64(data_used + meta_used, 50);
4816 5060
4817 if (num_bytes * 3 > meta_used) 5061 if (num_bytes * 3 > meta_used)
4818 num_bytes = div64_u64(meta_used, 3); 5062 num_bytes = div_u64(meta_used, 3);
4819 5063
4820 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); 5064 return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
4821} 5065}
@@ -4998,8 +5242,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4998 u64 qgroup_reserved) 5242 u64 qgroup_reserved)
4999{ 5243{
5000 btrfs_block_rsv_release(root, rsv, (u64)-1); 5244 btrfs_block_rsv_release(root, rsv, (u64)-1);
5001 if (qgroup_reserved)
5002 btrfs_qgroup_free(root, qgroup_reserved);
5003} 5245}
5004 5246
5005/** 5247/**
@@ -5066,30 +5308,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
5066 int reserve) 5308 int reserve)
5067{ 5309{
5068 struct btrfs_root *root = BTRFS_I(inode)->root; 5310 struct btrfs_root *root = BTRFS_I(inode)->root;
5069 u64 csum_size; 5311 u64 old_csums, num_csums;
5070 int num_csums_per_leaf;
5071 int num_csums;
5072 int old_csums;
5073 5312
5074 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && 5313 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
5075 BTRFS_I(inode)->csum_bytes == 0) 5314 BTRFS_I(inode)->csum_bytes == 0)
5076 return 0; 5315 return 0;
5077 5316
5078 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); 5317 old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5079 if (reserve) 5318 if (reserve)
5080 BTRFS_I(inode)->csum_bytes += num_bytes; 5319 BTRFS_I(inode)->csum_bytes += num_bytes;
5081 else 5320 else
5082 BTRFS_I(inode)->csum_bytes -= num_bytes; 5321 BTRFS_I(inode)->csum_bytes -= num_bytes;
5083 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); 5322 num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
5084 num_csums_per_leaf = (int)div64_u64(csum_size,
5085 sizeof(struct btrfs_csum_item) +
5086 sizeof(struct btrfs_disk_key));
5087 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
5088 num_csums = num_csums + num_csums_per_leaf - 1;
5089 num_csums = num_csums / num_csums_per_leaf;
5090
5091 old_csums = old_csums + num_csums_per_leaf - 1;
5092 old_csums = old_csums / num_csums_per_leaf;
5093 5323
5094 /* No change, no need to reserve more */ 5324 /* No change, no need to reserve more */
5095 if (old_csums == num_csums) 5325 if (old_csums == num_csums)
@@ -5163,8 +5393,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5163 spin_unlock(&BTRFS_I(inode)->lock); 5393 spin_unlock(&BTRFS_I(inode)->lock);
5164 5394
5165 if (root->fs_info->quota_enabled) { 5395 if (root->fs_info->quota_enabled) {
5166 ret = btrfs_qgroup_reserve(root, num_bytes + 5396 ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
5167 nr_extents * root->nodesize);
5168 if (ret) 5397 if (ret)
5169 goto out_fail; 5398 goto out_fail;
5170 } 5399 }
@@ -5172,8 +5401,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5172 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5401 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5173 if (unlikely(ret)) { 5402 if (unlikely(ret)) {
5174 if (root->fs_info->quota_enabled) 5403 if (root->fs_info->quota_enabled)
5175 btrfs_qgroup_free(root, num_bytes + 5404 btrfs_qgroup_free(root, nr_extents * root->nodesize);
5176 nr_extents * root->nodesize);
5177 goto out_fail; 5405 goto out_fail;
5178 } 5406 }
5179 5407
@@ -5290,10 +5518,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5290 5518
5291 trace_btrfs_space_reservation(root->fs_info, "delalloc", 5519 trace_btrfs_space_reservation(root->fs_info, "delalloc",
5292 btrfs_ino(inode), to_free, 0); 5520 btrfs_ino(inode), to_free, 0);
5293 if (root->fs_info->quota_enabled) {
5294 btrfs_qgroup_free(root, num_bytes +
5295 dropped * root->nodesize);
5296 }
5297 5521
5298 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 5522 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5299 to_free); 5523 to_free);
@@ -5318,7 +5542,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5318{ 5542{
5319 int ret; 5543 int ret;
5320 5544
5321 ret = btrfs_check_data_free_space(inode, num_bytes); 5545 ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
5322 if (ret) 5546 if (ret)
5323 return ret; 5547 return ret;
5324 5548
@@ -5390,14 +5614,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
5390 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5614 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5391 cache_block_group(cache, 1); 5615 cache_block_group(cache, 1);
5392 5616
5393 spin_lock(&trans->transaction->dirty_bgs_lock);
5394 if (list_empty(&cache->dirty_list)) {
5395 list_add_tail(&cache->dirty_list,
5396 &trans->transaction->dirty_bgs);
5397 btrfs_get_block_group(cache);
5398 }
5399 spin_unlock(&trans->transaction->dirty_bgs_lock);
5400
5401 byte_in_group = bytenr - cache->key.objectid; 5617 byte_in_group = bytenr - cache->key.objectid;
5402 WARN_ON(byte_in_group > cache->key.offset); 5618 WARN_ON(byte_in_group > cache->key.offset);
5403 5619
@@ -5446,6 +5662,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
5446 spin_unlock(&info->unused_bgs_lock); 5662 spin_unlock(&info->unused_bgs_lock);
5447 } 5663 }
5448 } 5664 }
5665
5666 spin_lock(&trans->transaction->dirty_bgs_lock);
5667 if (list_empty(&cache->dirty_list)) {
5668 list_add_tail(&cache->dirty_list,
5669 &trans->transaction->dirty_bgs);
5670 trans->transaction->num_dirty_bgs++;
5671 btrfs_get_block_group(cache);
5672 }
5673 spin_unlock(&trans->transaction->dirty_bgs_lock);
5674
5449 btrfs_put_block_group(cache); 5675 btrfs_put_block_group(cache);
5450 total -= num_bytes; 5676 total -= num_bytes;
5451 bytenr += num_bytes; 5677 bytenr += num_bytes;
@@ -6956,15 +7182,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6956 return -ENOSPC; 7182 return -ENOSPC;
6957 } 7183 }
6958 7184
6959 if (btrfs_test_opt(root, DISCARD))
6960 ret = btrfs_discard_extent(root, start, len, NULL);
6961
6962 if (pin) 7185 if (pin)
6963 pin_down_extent(root, cache, start, len, 1); 7186 pin_down_extent(root, cache, start, len, 1);
6964 else { 7187 else {
7188 if (btrfs_test_opt(root, DISCARD))
7189 ret = btrfs_discard_extent(root, start, len, NULL);
6965 btrfs_add_free_space(cache, start, len); 7190 btrfs_add_free_space(cache, start, len);
6966 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); 7191 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
6967 } 7192 }
7193
6968 btrfs_put_block_group(cache); 7194 btrfs_put_block_group(cache);
6969 7195
6970 trace_btrfs_reserved_extent_free(root, start, len); 7196 trace_btrfs_reserved_extent_free(root, start, len);
@@ -7095,9 +7321,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7095 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 7321 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
7096 ins, size); 7322 ins, size);
7097 if (ret) { 7323 if (ret) {
7324 btrfs_free_path(path);
7098 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7325 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7099 root->nodesize); 7326 root->nodesize);
7100 btrfs_free_path(path);
7101 return ret; 7327 return ret;
7102 } 7328 }
7103 7329
@@ -7217,7 +7443,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7217 btrfs_set_header_generation(buf, trans->transid); 7443 btrfs_set_header_generation(buf, trans->transid);
7218 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); 7444 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
7219 btrfs_tree_lock(buf); 7445 btrfs_tree_lock(buf);
7220 clean_tree_block(trans, root, buf); 7446 clean_tree_block(trans, root->fs_info, buf);
7221 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); 7447 clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
7222 7448
7223 btrfs_set_lock_blocking(buf); 7449 btrfs_set_lock_blocking(buf);
@@ -7815,7 +8041,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7815 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); 8041 bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7816 blocksize = root->nodesize; 8042 blocksize = root->nodesize;
7817 8043
7818 next = btrfs_find_tree_block(root, bytenr); 8044 next = btrfs_find_tree_block(root->fs_info, bytenr);
7819 if (!next) { 8045 if (!next) {
7820 next = btrfs_find_create_tree_block(root, bytenr); 8046 next = btrfs_find_create_tree_block(root, bytenr);
7821 if (!next) 8047 if (!next)
@@ -8016,7 +8242,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8016 btrfs_set_lock_blocking(eb); 8242 btrfs_set_lock_blocking(eb);
8017 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; 8243 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
8018 } 8244 }
8019 clean_tree_block(trans, root, eb); 8245 clean_tree_block(trans, root->fs_info, eb);
8020 } 8246 }
8021 8247
8022 if (eb == root->node) { 8248 if (eb == root->node) {
@@ -8533,10 +8759,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
8533 8759
8534 BUG_ON(cache->ro); 8760 BUG_ON(cache->ro);
8535 8761
8762again:
8536 trans = btrfs_join_transaction(root); 8763 trans = btrfs_join_transaction(root);
8537 if (IS_ERR(trans)) 8764 if (IS_ERR(trans))
8538 return PTR_ERR(trans); 8765 return PTR_ERR(trans);
8539 8766
8767 /*
8768 * we're not allowed to set block groups readonly after the dirty
8769 * block groups cache has started writing. If it already started,
8770 * back off and let this transaction commit
8771 */
8772 mutex_lock(&root->fs_info->ro_block_group_mutex);
8773 if (trans->transaction->dirty_bg_run) {
8774 u64 transid = trans->transid;
8775
8776 mutex_unlock(&root->fs_info->ro_block_group_mutex);
8777 btrfs_end_transaction(trans, root);
8778
8779 ret = btrfs_wait_for_commit(root, transid);
8780 if (ret)
8781 return ret;
8782 goto again;
8783 }
8784
8785
8540 ret = set_block_group_ro(cache, 0); 8786 ret = set_block_group_ro(cache, 0);
8541 if (!ret) 8787 if (!ret)
8542 goto out; 8788 goto out;
@@ -8551,6 +8797,7 @@ out:
8551 alloc_flags = update_block_group_flags(root, cache->flags); 8797 alloc_flags = update_block_group_flags(root, cache->flags);
8552 check_system_chunk(trans, root, alloc_flags); 8798 check_system_chunk(trans, root, alloc_flags);
8553 } 8799 }
8800 mutex_unlock(&root->fs_info->ro_block_group_mutex);
8554 8801
8555 btrfs_end_transaction(trans, root); 8802 btrfs_end_transaction(trans, root);
8556 return ret; 8803 return ret;
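
The again: loop added here is an optimistic back-off: if the running transaction has already begun writing out dirty block group caches (dirty_bg_run), it is too late to flip a block group read-only in it, so the code drops the mutex, ends its handle, waits for that transaction to commit, and retries with a fresh one. A compilable sketch of that shape, with trivial stand-ins for the transaction calls:

#include <errno.h>
#include <pthread.h>

struct txn { int dirty_bg_run; long id; };

static pthread_mutex_t ro_lock = PTHREAD_MUTEX_INITIALIZER;
static struct txn current_txn;

/* trivial stand-ins so the sketch is self-contained */
static struct txn *join_transaction(void) { return &current_txn; }
static int end_transaction(struct txn *t) { (void)t; return 0; }
static int wait_for_commit(long id) { (void)id; return 0; }

static int set_block_group_readonly(void)
{
	for (;;) {
		struct txn *t = join_transaction();

		pthread_mutex_lock(&ro_lock);
		if (t->dirty_bg_run) {
			/* cache writeback already started: too late for this
			 * transaction, wait it out and try the next one */
			long id = t->id;

			pthread_mutex_unlock(&ro_lock);
			end_transaction(t);
			if (wait_for_commit(id))
				return -EIO;
			continue;
		}
		/* ... safe to flip the block group read-only here ... */
		pthread_mutex_unlock(&ro_lock);
		return end_transaction(t);
	}
}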
@@ -8720,7 +8967,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
8720 min_free <<= 1; 8967 min_free <<= 1;
8721 } else if (index == BTRFS_RAID_RAID0) { 8968 } else if (index == BTRFS_RAID_RAID0) {
8722 dev_min = fs_devices->rw_devices; 8969 dev_min = fs_devices->rw_devices;
8723 do_div(min_free, dev_min); 8970 min_free = div64_u64(min_free, dev_min);
8724 } 8971 }
8725 8972
8726 /* We need to do this so that we can look at pending chunks */ 8973 /* We need to do this so that we can look at pending chunks */
@@ -8992,6 +9239,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
8992 INIT_LIST_HEAD(&cache->bg_list); 9239 INIT_LIST_HEAD(&cache->bg_list);
8993 INIT_LIST_HEAD(&cache->ro_list); 9240 INIT_LIST_HEAD(&cache->ro_list);
8994 INIT_LIST_HEAD(&cache->dirty_list); 9241 INIT_LIST_HEAD(&cache->dirty_list);
9242 INIT_LIST_HEAD(&cache->io_list);
8995 btrfs_init_free_space_ctl(cache); 9243 btrfs_init_free_space_ctl(cache);
8996 atomic_set(&cache->trimming, 0); 9244 atomic_set(&cache->trimming, 0);
8997 9245
@@ -9355,7 +9603,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9355 goto out; 9603 goto out;
9356 } 9604 }
9357 9605
9606 /*
9607 * get the inode first so any iput calls done for the io_list
9608 * aren't the final iput (no unlinks allowed now)
9609 */
9358 inode = lookup_free_space_inode(tree_root, block_group, path); 9610 inode = lookup_free_space_inode(tree_root, block_group, path);
9611
9612 mutex_lock(&trans->transaction->cache_write_mutex);
9613 /*
 9614 * make sure our free space cache IO is done before removing
 9615 * the free space inode
9616 */
9617 spin_lock(&trans->transaction->dirty_bgs_lock);
9618 if (!list_empty(&block_group->io_list)) {
9619 list_del_init(&block_group->io_list);
9620
9621 WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
9622
9623 spin_unlock(&trans->transaction->dirty_bgs_lock);
9624 btrfs_wait_cache_io(root, trans, block_group,
9625 &block_group->io_ctl, path,
9626 block_group->key.objectid);
9627 btrfs_put_block_group(block_group);
9628 spin_lock(&trans->transaction->dirty_bgs_lock);
9629 }
9630
9631 if (!list_empty(&block_group->dirty_list)) {
9632 list_del_init(&block_group->dirty_list);
9633 btrfs_put_block_group(block_group);
9634 }
9635 spin_unlock(&trans->transaction->dirty_bgs_lock);
9636 mutex_unlock(&trans->transaction->cache_write_mutex);
9637
9359 if (!IS_ERR(inode)) { 9638 if (!IS_ERR(inode)) {
9360 ret = btrfs_orphan_add(trans, inode); 9639 ret = btrfs_orphan_add(trans, inode);
9361 if (ret) { 9640 if (ret) {
@@ -9448,18 +9727,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9448 9727
9449 spin_lock(&trans->transaction->dirty_bgs_lock); 9728 spin_lock(&trans->transaction->dirty_bgs_lock);
9450 if (!list_empty(&block_group->dirty_list)) { 9729 if (!list_empty(&block_group->dirty_list)) {
9451 list_del_init(&block_group->dirty_list); 9730 WARN_ON(1);
9452 btrfs_put_block_group(block_group); 9731 }
9732 if (!list_empty(&block_group->io_list)) {
9733 WARN_ON(1);
9453 } 9734 }
9454 spin_unlock(&trans->transaction->dirty_bgs_lock); 9735 spin_unlock(&trans->transaction->dirty_bgs_lock);
9455
9456 btrfs_remove_free_space_cache(block_group); 9736 btrfs_remove_free_space_cache(block_group);
9457 9737
9458 spin_lock(&block_group->space_info->lock); 9738 spin_lock(&block_group->space_info->lock);
9459 list_del_init(&block_group->ro_list); 9739 list_del_init(&block_group->ro_list);
9740
9741 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
9742 WARN_ON(block_group->space_info->total_bytes
9743 < block_group->key.offset);
9744 WARN_ON(block_group->space_info->bytes_readonly
9745 < block_group->key.offset);
9746 WARN_ON(block_group->space_info->disk_total
9747 < block_group->key.offset * factor);
9748 }
9460 block_group->space_info->total_bytes -= block_group->key.offset; 9749 block_group->space_info->total_bytes -= block_group->key.offset;
9461 block_group->space_info->bytes_readonly -= block_group->key.offset; 9750 block_group->space_info->bytes_readonly -= block_group->key.offset;
9462 block_group->space_info->disk_total -= block_group->key.offset * factor; 9751 block_group->space_info->disk_total -= block_group->key.offset * factor;
9752
9463 spin_unlock(&block_group->space_info->lock); 9753 spin_unlock(&block_group->space_info->lock);
9464 9754
9465 memcpy(&key, &block_group->key, sizeof(key)); 9755 memcpy(&key, &block_group->key, sizeof(key));
@@ -9647,8 +9937,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9647 mutex_unlock(&fs_info->unused_bg_unpin_mutex); 9937 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9648 9938
9649 /* Reset pinned so btrfs_put_block_group doesn't complain */ 9939 /* Reset pinned so btrfs_put_block_group doesn't complain */
9940 spin_lock(&space_info->lock);
9941 spin_lock(&block_group->lock);
9942
9943 space_info->bytes_pinned -= block_group->pinned;
9944 space_info->bytes_readonly += block_group->pinned;
9945 percpu_counter_add(&space_info->total_bytes_pinned,
9946 -block_group->pinned);
9650 block_group->pinned = 0; 9947 block_group->pinned = 0;
9651 9948
9949 spin_unlock(&block_group->lock);
9950 spin_unlock(&space_info->lock);
9951
9652 /* 9952 /*
9653 * Btrfs_remove_chunk will abort the transaction if things go 9953 * Btrfs_remove_chunk will abort the transaction if things go
9654 * horribly wrong. 9954 * horribly wrong.
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d688cfe5d496..782f3bc4651d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4514,8 +4514,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4514 } 4514 }
4515 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 4515 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
4516 em_len, flags); 4516 em_len, flags);
4517 if (ret) 4517 if (ret) {
4518 if (ret == 1)
4519 ret = 0;
4518 goto out_free; 4520 goto out_free;
4521 }
4519 } 4522 }
4520out_free: 4523out_free:
4521 free_extent_map(em); 4524 free_extent_map(em);
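
fiemap_fill_next_extent() returns 1 once the caller's extent buffer is full, which is a stop signal rather than a failure; the hunk above maps that 1 to 0 so userspace no longer sees a spurious error. A small model of the consuming loop:

/* fill_next() models fiemap_fill_next_extent(): it returns 1, not an
 * error, once the user-supplied extent buffer is full */
static int fill_next(int *slots_left)
{
	if (*slots_left == 0)
		return 1;
	(*slots_left)--;
	return 0;
}

static int walk_extents(int extents, int slots)
{
	int ret = 0;

	while (extents--) {
		ret = fill_next(&slots);
		if (ret) {
			if (ret == 1)
				ret = 0;   /* a full buffer ends the walk cleanly */
			break;
		}
	}
	return ret;
}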
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 695b0ccfb755..c668f36898d3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -97,7 +97,7 @@ struct extent_io_tree {
97 u64 dirty_bytes; 97 u64 dirty_bytes;
98 int track_uptodate; 98 int track_uptodate;
99 spinlock_t lock; 99 spinlock_t lock;
100 struct extent_io_ops *ops; 100 const struct extent_io_ops *ops;
101}; 101};
102 102
103struct extent_state { 103struct extent_state {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 84a2d1868271..58ece6558430 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
185 nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; 185 nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
186 if (!dst) { 186 if (!dst) {
187 if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { 187 if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
188 btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, 188 btrfs_bio->csum_allocated = kmalloc_array(nblocks,
189 GFP_NOFS); 189 csum_size, GFP_NOFS);
190 if (!btrfs_bio->csum_allocated) { 190 if (!btrfs_bio->csum_allocated) {
191 btrfs_free_path(path); 191 btrfs_free_path(path);
192 return -ENOMEM; 192 return -ENOMEM;
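
The switch to kmalloc_array() here (and kcalloc() elsewhere in this series) is about overflow: kmalloc(nblocks * csum_size, ...) silently wraps if the product overflows, yielding an undersized buffer. The userspace equivalent of the check those helpers perform:

#include <stdint.h>
#include <stdlib.h>

/* overflow-checked array allocation, the idea behind kmalloc_array() */
static void *alloc_array(size_t n, size_t size)
{
	if (size && n > SIZE_MAX / size)
		return NULL;            /* n * size would wrap around */
	return malloc(n * size);
}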
@@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root,
553 btrfs_truncate_item(root, path, new_size, 0); 553 btrfs_truncate_item(root, path, new_size, 0);
554 554
555 key->offset = end_byte; 555 key->offset = end_byte;
556 btrfs_set_item_key_safe(root, path, key); 556 btrfs_set_item_key_safe(root->fs_info, path, key);
557 } else { 557 } else {
558 BUG(); 558 BUG();
559 } 559 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index faa7d390841b..467620a3b1f9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
273 defrag = rb_entry(node, struct inode_defrag, rb_node); 273 defrag = rb_entry(node, struct inode_defrag, rb_node);
274 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 274 kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
275 275
276 if (need_resched()) { 276 cond_resched_lock(&fs_info->defrag_inodes_lock);
277 spin_unlock(&fs_info->defrag_inodes_lock);
278 cond_resched();
279 spin_lock(&fs_info->defrag_inodes_lock);
280 }
281 277
282 node = rb_first(&fs_info->defrag_inodes); 278 node = rb_first(&fs_info->defrag_inodes);
283 } 279 }
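
cond_resched_lock() collapses the hand-rolled need_resched()/unlock/resched/relock dance into a single helper that only cycles the lock when a reschedule is actually pending. What each old call site spelled out, modeled with pthreads:

#include <pthread.h>
#include <sched.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static int should_yield(void) { return 0; }  /* stand-in for need_resched() */

/* what the old code repeated by hand at every call site */
static void cond_resched_lock_model(pthread_mutex_t *l)
{
	if (should_yield()) {
		pthread_mutex_unlock(l);
		sched_yield();          /* give the scheduler a chance */
		pthread_mutex_lock(l);
	}
}

static void drain(void)
{
	pthread_mutex_lock(&tree_lock);
	for (int i = 0; i < 128; i++) {
		/* ... free one cached item ... */
		cond_resched_lock_model(&tree_lock);  /* one call, five lines gone */
	}
	pthread_mutex_unlock(&tree_lock);
}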
@@ -868,7 +864,7 @@ next_slot:
868 864
869 memcpy(&new_key, &key, sizeof(new_key)); 865 memcpy(&new_key, &key, sizeof(new_key));
870 new_key.offset = end; 866 new_key.offset = end;
871 btrfs_set_item_key_safe(root, path, &new_key); 867 btrfs_set_item_key_safe(root->fs_info, path, &new_key);
872 868
873 extent_offset += end - key.offset; 869 extent_offset += end - key.offset;
874 btrfs_set_file_extent_offset(leaf, fi, extent_offset); 870 btrfs_set_file_extent_offset(leaf, fi, extent_offset);
@@ -1126,7 +1122,7 @@ again:
1126 ino, bytenr, orig_offset, 1122 ino, bytenr, orig_offset,
1127 &other_start, &other_end)) { 1123 &other_start, &other_end)) {
1128 new_key.offset = end; 1124 new_key.offset = end;
1129 btrfs_set_item_key_safe(root, path, &new_key); 1125 btrfs_set_item_key_safe(root->fs_info, path, &new_key);
1130 fi = btrfs_item_ptr(leaf, path->slots[0], 1126 fi = btrfs_item_ptr(leaf, path->slots[0],
1131 struct btrfs_file_extent_item); 1127 struct btrfs_file_extent_item);
1132 btrfs_set_file_extent_generation(leaf, fi, 1128 btrfs_set_file_extent_generation(leaf, fi,
@@ -1160,7 +1156,7 @@ again:
1160 trans->transid); 1156 trans->transid);
1161 path->slots[0]++; 1157 path->slots[0]++;
1162 new_key.offset = start; 1158 new_key.offset = start;
1163 btrfs_set_item_key_safe(root, path, &new_key); 1159 btrfs_set_item_key_safe(root->fs_info, path, &new_key);
1164 1160
1165 fi = btrfs_item_ptr(leaf, path->slots[0], 1161 fi = btrfs_item_ptr(leaf, path->slots[0],
1166 struct btrfs_file_extent_item); 1162 struct btrfs_file_extent_item);
@@ -1485,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1485 PAGE_CACHE_SIZE / (sizeof(struct page *))); 1481 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1486 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1482 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1487 nrptrs = max(nrptrs, 8); 1483 nrptrs = max(nrptrs, 8);
1488 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1484 pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1489 if (!pages) 1485 if (!pages)
1490 return -ENOMEM; 1486 return -ENOMEM;
1491 1487
@@ -1514,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1514 } 1510 }
1515 1511
1516 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1512 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1517 ret = btrfs_check_data_free_space(inode, reserve_bytes); 1513 ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
1518 if (ret == -ENOSPC && 1514 if (ret == -ENOSPC &&
1519 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | 1515 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1520 BTRFS_INODE_PREALLOC))) { 1516 BTRFS_INODE_PREALLOC))) {
@@ -1635,8 +1631,8 @@ again:
1635 btrfs_end_write_no_snapshoting(root); 1631 btrfs_end_write_no_snapshoting(root);
1636 1632
1637 if (only_release_metadata && copied > 0) { 1633 if (only_release_metadata && copied > 0) {
1638 u64 lockstart = round_down(pos, root->sectorsize); 1634 lockstart = round_down(pos, root->sectorsize);
1639 u64 lockend = lockstart + 1635 lockend = lockstart +
1640 (dirty_pages << PAGE_CACHE_SHIFT) - 1; 1636 (dirty_pages << PAGE_CACHE_SHIFT) - 1;
1641 1637
1642 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 1638 set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
@@ -1809,7 +1805,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1809 * otherwise subsequent syncs to a file that's been synced in this 1805 * otherwise subsequent syncs to a file that's been synced in this
1810 * transaction will appear to have already occurred. 1806 * transaction will appear to have already occurred.
1811 */ 1807 */
1808 spin_lock(&BTRFS_I(inode)->lock);
1812 BTRFS_I(inode)->last_sub_trans = root->log_transid; 1809 BTRFS_I(inode)->last_sub_trans = root->log_transid;
1810 spin_unlock(&BTRFS_I(inode)->lock);
1813 if (num_written > 0) { 1811 if (num_written > 0) {
1814 err = generic_write_sync(file, pos, num_written); 1812 err = generic_write_sync(file, pos, num_written);
1815 if (err < 0) 1813 if (err < 0)
@@ -2162,7 +2160,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2162 u64 num_bytes; 2160 u64 num_bytes;
2163 2161
2164 key.offset = offset; 2162 key.offset = offset;
2165 btrfs_set_item_key_safe(root, path, &key); 2163 btrfs_set_item_key_safe(root->fs_info, path, &key);
2166 fi = btrfs_item_ptr(leaf, path->slots[0], 2164 fi = btrfs_item_ptr(leaf, path->slots[0],
2167 struct btrfs_file_extent_item); 2165 struct btrfs_file_extent_item);
2168 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - 2166 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
@@ -2545,7 +2543,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2545{ 2543{
2546 struct inode *inode = file_inode(file); 2544 struct inode *inode = file_inode(file);
2547 struct extent_state *cached_state = NULL; 2545 struct extent_state *cached_state = NULL;
2548 struct btrfs_root *root = BTRFS_I(inode)->root;
2549 u64 cur_offset; 2546 u64 cur_offset;
2550 u64 last_byte; 2547 u64 last_byte;
2551 u64 alloc_start; 2548 u64 alloc_start;
@@ -2570,14 +2567,9 @@ static long btrfs_fallocate(struct file *file, int mode,
2570 * Make sure we have enough space before we do the 2567 * Make sure we have enough space before we do the
2571 * allocation. 2568 * allocation.
2572 */ 2569 */
2573 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); 2570 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
2574 if (ret) 2571 if (ret)
2575 return ret; 2572 return ret;
2576 if (root->fs_info->quota_enabled) {
2577 ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
2578 if (ret)
2579 goto out_reserve_fail;
2580 }
2581 2573
2582 mutex_lock(&inode->i_mutex); 2574 mutex_lock(&inode->i_mutex);
2583 ret = inode_newsize_ok(inode, alloc_end); 2575 ret = inode_newsize_ok(inode, alloc_end);
@@ -2667,23 +2659,35 @@ static long btrfs_fallocate(struct file *file, int mode,
2667 1 << inode->i_blkbits, 2659 1 << inode->i_blkbits,
2668 offset + len, 2660 offset + len,
2669 &alloc_hint); 2661 &alloc_hint);
2670
2671 if (ret < 0) {
2672 free_extent_map(em);
2673 break;
2674 }
2675 } else if (actual_end > inode->i_size && 2662 } else if (actual_end > inode->i_size &&
2676 !(mode & FALLOC_FL_KEEP_SIZE)) { 2663 !(mode & FALLOC_FL_KEEP_SIZE)) {
2664 struct btrfs_trans_handle *trans;
2665 struct btrfs_root *root = BTRFS_I(inode)->root;
2666
2677 /* 2667 /*
2678 * We didn't need to allocate any more space, but we 2668 * We didn't need to allocate any more space, but we
2679 * still extended the size of the file so we need to 2669 * still extended the size of the file so we need to
2680 * update i_size. 2670 * update i_size and the inode item.
2681 */ 2671 */
2682 inode->i_ctime = CURRENT_TIME; 2672 trans = btrfs_start_transaction(root, 1);
2683 i_size_write(inode, actual_end); 2673 if (IS_ERR(trans)) {
2684 btrfs_ordered_update_i_size(inode, actual_end, NULL); 2674 ret = PTR_ERR(trans);
2675 } else {
2676 inode->i_ctime = CURRENT_TIME;
2677 i_size_write(inode, actual_end);
2678 btrfs_ordered_update_i_size(inode, actual_end,
2679 NULL);
2680 ret = btrfs_update_inode(trans, root, inode);
2681 if (ret)
2682 btrfs_end_transaction(trans, root);
2683 else
2684 ret = btrfs_end_transaction(trans,
2685 root);
2686 }
2685 } 2687 }
2686 free_extent_map(em); 2688 free_extent_map(em);
2689 if (ret < 0)
2690 break;
2687 2691
2688 cur_offset = last_byte; 2692 cur_offset = last_byte;
2689 if (cur_offset >= alloc_end) { 2693 if (cur_offset >= alloc_end) {
@@ -2695,9 +2699,6 @@ static long btrfs_fallocate(struct file *file, int mode,
2695 &cached_state, GFP_NOFS); 2699 &cached_state, GFP_NOFS);
2696out: 2700out:
2697 mutex_unlock(&inode->i_mutex); 2701 mutex_unlock(&inode->i_mutex);
2698 if (root->fs_info->quota_enabled)
2699 btrfs_qgroup_free(root, alloc_end - alloc_start);
2700out_reserve_fail:
2701 /* Let go of our reservation. */ 2702 /* Let go of our reservation. */
2702 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 2703 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
2703 return ret; 2704 return ret;
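
Note the error handling around the new transaction in the fallocate hunk: btrfs_end_transaction() must run on both paths, but when btrfs_update_inode() has already failed, the end call's return value must not clobber that first error. The pattern, reduced to a sketch with stand-in calls:

/* stand-ins so the sketch is self-contained */
static int update_inode(void) { return 0; }
static int end_transaction(void) { return 0; }

static int extend_isize(void)
{
	int ret = update_inode();

	if (ret)
		end_transaction();        /* best effort; the first error wins */
	else
		ret = end_transaction();  /* its failure is now the result */
	return ret;
}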
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index a71978578fa7..253cb74b0e27 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
85 } 85 }
86 86
87 mapping_set_gfp_mask(inode->i_mapping, 87 mapping_set_gfp_mask(inode->i_mapping,
88 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 88 mapping_gfp_mask(inode->i_mapping) &
89 ~(GFP_NOFS & ~__GFP_HIGHMEM));
89 90
90 return inode; 91 return inode;
91} 92}
@@ -170,13 +171,13 @@ static int __create_free_space_inode(struct btrfs_root *root,
170 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 171 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
171 key.offset = offset; 172 key.offset = offset;
172 key.type = 0; 173 key.type = 0;
173
174 ret = btrfs_insert_empty_item(trans, root, path, &key, 174 ret = btrfs_insert_empty_item(trans, root, path, &key,
175 sizeof(struct btrfs_free_space_header)); 175 sizeof(struct btrfs_free_space_header));
176 if (ret < 0) { 176 if (ret < 0) {
177 btrfs_release_path(path); 177 btrfs_release_path(path);
178 return ret; 178 return ret;
179 } 179 }
180
180 leaf = path->nodes[0]; 181 leaf = path->nodes[0];
181 header = btrfs_item_ptr(leaf, path->slots[0], 182 header = btrfs_item_ptr(leaf, path->slots[0],
182 struct btrfs_free_space_header); 183 struct btrfs_free_space_header);
@@ -225,9 +226,37 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
225 226
226int btrfs_truncate_free_space_cache(struct btrfs_root *root, 227int btrfs_truncate_free_space_cache(struct btrfs_root *root,
227 struct btrfs_trans_handle *trans, 228 struct btrfs_trans_handle *trans,
229 struct btrfs_block_group_cache *block_group,
228 struct inode *inode) 230 struct inode *inode)
229{ 231{
230 int ret = 0; 232 int ret = 0;
233 struct btrfs_path *path = btrfs_alloc_path();
234
235 if (!path) {
236 ret = -ENOMEM;
237 goto fail;
238 }
239
240 if (block_group) {
241 mutex_lock(&trans->transaction->cache_write_mutex);
242 if (!list_empty(&block_group->io_list)) {
243 list_del_init(&block_group->io_list);
244
245 btrfs_wait_cache_io(root, trans, block_group,
246 &block_group->io_ctl, path,
247 block_group->key.objectid);
248 btrfs_put_block_group(block_group);
249 }
250
251 /*
 252 * now that we've truncated the cache away, it's no longer
 253 * set up or written
254 */
255 spin_lock(&block_group->lock);
256 block_group->disk_cache_state = BTRFS_DC_CLEAR;
257 spin_unlock(&block_group->lock);
258 }
259 btrfs_free_path(path);
231 260
232 btrfs_i_size_write(inode, 0); 261 btrfs_i_size_write(inode, 0);
233 truncate_pagecache(inode, 0); 262 truncate_pagecache(inode, 0);
@@ -235,15 +264,23 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
235 /* 264 /*
236 * We don't need an orphan item because truncating the free space cache 265 * We don't need an orphan item because truncating the free space cache
237 * will never be split across transactions. 266 * will never be split across transactions.
267 * We don't need to check for -EAGAIN because we're a free space
268 * cache inode
238 */ 269 */
239 ret = btrfs_truncate_inode_items(trans, root, inode, 270 ret = btrfs_truncate_inode_items(trans, root, inode,
240 0, BTRFS_EXTENT_DATA_KEY); 271 0, BTRFS_EXTENT_DATA_KEY);
241 if (ret) { 272 if (ret) {
273 mutex_unlock(&trans->transaction->cache_write_mutex);
242 btrfs_abort_transaction(trans, root, ret); 274 btrfs_abort_transaction(trans, root, ret);
243 return ret; 275 return ret;
244 } 276 }
245 277
246 ret = btrfs_update_inode(trans, root, inode); 278 ret = btrfs_update_inode(trans, root, inode);
279
280 if (block_group)
281 mutex_unlock(&trans->transaction->cache_write_mutex);
282
283fail:
247 if (ret) 284 if (ret)
248 btrfs_abort_transaction(trans, root, ret); 285 btrfs_abort_transaction(trans, root, ret);
249 286
@@ -269,18 +306,7 @@ static int readahead_cache(struct inode *inode)
269 return 0; 306 return 0;
270} 307}
271 308
272struct io_ctl { 309static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
273 void *cur, *orig;
274 struct page *page;
275 struct page **pages;
276 struct btrfs_root *root;
277 unsigned long size;
278 int index;
279 int num_pages;
280 unsigned check_crcs:1;
281};
282
283static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
284 struct btrfs_root *root, int write) 310 struct btrfs_root *root, int write)
285{ 311{
286 int num_pages; 312 int num_pages;
@@ -296,45 +322,46 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
296 (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) 322 (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
297 return -ENOSPC; 323 return -ENOSPC;
298 324
299 memset(io_ctl, 0, sizeof(struct io_ctl)); 325 memset(io_ctl, 0, sizeof(struct btrfs_io_ctl));
300 326
301 io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); 327 io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS);
302 if (!io_ctl->pages) 328 if (!io_ctl->pages)
303 return -ENOMEM; 329 return -ENOMEM;
304 330
305 io_ctl->num_pages = num_pages; 331 io_ctl->num_pages = num_pages;
306 io_ctl->root = root; 332 io_ctl->root = root;
307 io_ctl->check_crcs = check_crcs; 333 io_ctl->check_crcs = check_crcs;
334 io_ctl->inode = inode;
308 335
309 return 0; 336 return 0;
310} 337}
311 338
312static void io_ctl_free(struct io_ctl *io_ctl) 339static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
313{ 340{
314 kfree(io_ctl->pages); 341 kfree(io_ctl->pages);
342 io_ctl->pages = NULL;
315} 343}
316 344
317static void io_ctl_unmap_page(struct io_ctl *io_ctl) 345static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
318{ 346{
319 if (io_ctl->cur) { 347 if (io_ctl->cur) {
320 kunmap(io_ctl->page);
321 io_ctl->cur = NULL; 348 io_ctl->cur = NULL;
322 io_ctl->orig = NULL; 349 io_ctl->orig = NULL;
323 } 350 }
324} 351}
325 352
326static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) 353static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear)
327{ 354{
328 ASSERT(io_ctl->index < io_ctl->num_pages); 355 ASSERT(io_ctl->index < io_ctl->num_pages);
329 io_ctl->page = io_ctl->pages[io_ctl->index++]; 356 io_ctl->page = io_ctl->pages[io_ctl->index++];
330 io_ctl->cur = kmap(io_ctl->page); 357 io_ctl->cur = page_address(io_ctl->page);
331 io_ctl->orig = io_ctl->cur; 358 io_ctl->orig = io_ctl->cur;
332 io_ctl->size = PAGE_CACHE_SIZE; 359 io_ctl->size = PAGE_CACHE_SIZE;
333 if (clear) 360 if (clear)
334 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); 361 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
335} 362}
336 363
337static void io_ctl_drop_pages(struct io_ctl *io_ctl) 364static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
338{ 365{
339 int i; 366 int i;
340 367
@@ -349,7 +376,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl)
349 } 376 }
350} 377}
351 378
352static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, 379static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode,
353 int uptodate) 380 int uptodate)
354{ 381{
355 struct page *page; 382 struct page *page;
@@ -383,7 +410,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
383 return 0; 410 return 0;
384} 411}
385 412
386static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) 413static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
387{ 414{
388 __le64 *val; 415 __le64 *val;
389 416
@@ -406,7 +433,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
406 io_ctl->cur += sizeof(u64); 433 io_ctl->cur += sizeof(u64);
407} 434}
408 435
409static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) 436static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
410{ 437{
411 __le64 *gen; 438 __le64 *gen;
412 439
@@ -435,7 +462,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
435 return 0; 462 return 0;
436} 463}
437 464
438static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) 465static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index)
439{ 466{
440 u32 *tmp; 467 u32 *tmp;
441 u32 crc = ~(u32)0; 468 u32 crc = ~(u32)0;
@@ -453,13 +480,12 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
453 PAGE_CACHE_SIZE - offset); 480 PAGE_CACHE_SIZE - offset);
454 btrfs_csum_final(crc, (char *)&crc); 481 btrfs_csum_final(crc, (char *)&crc);
455 io_ctl_unmap_page(io_ctl); 482 io_ctl_unmap_page(io_ctl);
456 tmp = kmap(io_ctl->pages[0]); 483 tmp = page_address(io_ctl->pages[0]);
457 tmp += index; 484 tmp += index;
458 *tmp = crc; 485 *tmp = crc;
459 kunmap(io_ctl->pages[0]);
460} 486}
461 487
462static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) 488static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
463{ 489{
464 u32 *tmp, val; 490 u32 *tmp, val;
465 u32 crc = ~(u32)0; 491 u32 crc = ~(u32)0;
@@ -473,10 +499,9 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
473 if (index == 0) 499 if (index == 0)
474 offset = sizeof(u32) * io_ctl->num_pages; 500 offset = sizeof(u32) * io_ctl->num_pages;
475 501
476 tmp = kmap(io_ctl->pages[0]); 502 tmp = page_address(io_ctl->pages[0]);
477 tmp += index; 503 tmp += index;
478 val = *tmp; 504 val = *tmp;
479 kunmap(io_ctl->pages[0]);
480 505
481 io_ctl_map_page(io_ctl, 0); 506 io_ctl_map_page(io_ctl, 0);
482 crc = btrfs_csum_data(io_ctl->orig + offset, crc, 507 crc = btrfs_csum_data(io_ctl->orig + offset, crc,
@@ -492,7 +517,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
492 return 0; 517 return 0;
493} 518}
494 519
495static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, 520static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes,
496 void *bitmap) 521 void *bitmap)
497{ 522{
498 struct btrfs_free_space_entry *entry; 523 struct btrfs_free_space_entry *entry;
@@ -522,7 +547,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
522 return 0; 547 return 0;
523} 548}
524 549
525static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) 550static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap)
526{ 551{
527 if (!io_ctl->cur) 552 if (!io_ctl->cur)
528 return -ENOSPC; 553 return -ENOSPC;
@@ -545,7 +570,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
545 return 0; 570 return 0;
546} 571}
547 572
548static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) 573static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl)
549{ 574{
550 /* 575 /*
551 * If we're not on the boundary we know we've modified the page and we 576 * If we're not on the boundary we know we've modified the page and we
@@ -562,7 +587,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
562 } 587 }
563} 588}
564 589
565static int io_ctl_read_entry(struct io_ctl *io_ctl, 590static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl,
566 struct btrfs_free_space *entry, u8 *type) 591 struct btrfs_free_space *entry, u8 *type)
567{ 592{
568 struct btrfs_free_space_entry *e; 593 struct btrfs_free_space_entry *e;
@@ -589,7 +614,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl,
589 return 0; 614 return 0;
590} 615}
591 616
592static int io_ctl_read_bitmap(struct io_ctl *io_ctl, 617static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl,
593 struct btrfs_free_space *entry) 618 struct btrfs_free_space *entry)
594{ 619{
595 int ret; 620 int ret;
@@ -648,7 +673,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
648{ 673{
649 struct btrfs_free_space_header *header; 674 struct btrfs_free_space_header *header;
650 struct extent_buffer *leaf; 675 struct extent_buffer *leaf;
651 struct io_ctl io_ctl; 676 struct btrfs_io_ctl io_ctl;
652 struct btrfs_key key; 677 struct btrfs_key key;
653 struct btrfs_free_space *e, *n; 678 struct btrfs_free_space *e, *n;
654 LIST_HEAD(bitmaps); 679 LIST_HEAD(bitmaps);
@@ -877,7 +902,7 @@ out:
877} 902}
878 903
879static noinline_for_stack 904static noinline_for_stack
880int write_cache_extent_entries(struct io_ctl *io_ctl, 905int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl,
881 struct btrfs_free_space_ctl *ctl, 906 struct btrfs_free_space_ctl *ctl,
882 struct btrfs_block_group_cache *block_group, 907 struct btrfs_block_group_cache *block_group,
883 int *entries, int *bitmaps, 908 int *entries, int *bitmaps,
@@ -885,6 +910,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
885{ 910{
886 int ret; 911 int ret;
887 struct btrfs_free_cluster *cluster = NULL; 912 struct btrfs_free_cluster *cluster = NULL;
913 struct btrfs_free_cluster *cluster_locked = NULL;
888 struct rb_node *node = rb_first(&ctl->free_space_offset); 914 struct rb_node *node = rb_first(&ctl->free_space_offset);
889 struct btrfs_trim_range *trim_entry; 915 struct btrfs_trim_range *trim_entry;
890 916
@@ -896,6 +922,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
896 } 922 }
897 923
898 if (!node && cluster) { 924 if (!node && cluster) {
925 cluster_locked = cluster;
926 spin_lock(&cluster_locked->lock);
899 node = rb_first(&cluster->root); 927 node = rb_first(&cluster->root);
900 cluster = NULL; 928 cluster = NULL;
901 } 929 }
@@ -919,9 +947,15 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
919 node = rb_next(node); 947 node = rb_next(node);
920 if (!node && cluster) { 948 if (!node && cluster) {
921 node = rb_first(&cluster->root); 949 node = rb_first(&cluster->root);
950 cluster_locked = cluster;
951 spin_lock(&cluster_locked->lock);
922 cluster = NULL; 952 cluster = NULL;
923 } 953 }
924 } 954 }
955 if (cluster_locked) {
956 spin_unlock(&cluster_locked->lock);
957 cluster_locked = NULL;
958 }
925 959
926 /* 960 /*
927 * Make sure we don't miss any range that was removed from our rbtree 961 * Make sure we don't miss any range that was removed from our rbtree
@@ -939,6 +973,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl,
939 973
940 return 0; 974 return 0;
941fail: 975fail:
976 if (cluster_locked)
977 spin_unlock(&cluster_locked->lock);
942 return -ENOSPC; 978 return -ENOSPC;
943} 979}
944 980
@@ -1000,7 +1036,7 @@ fail:
1000static noinline_for_stack int 1036static noinline_for_stack int
1001write_pinned_extent_entries(struct btrfs_root *root, 1037write_pinned_extent_entries(struct btrfs_root *root,
1002 struct btrfs_block_group_cache *block_group, 1038 struct btrfs_block_group_cache *block_group,
1003 struct io_ctl *io_ctl, 1039 struct btrfs_io_ctl *io_ctl,
1004 int *entries) 1040 int *entries)
1005{ 1041{
1006 u64 start, extent_start, extent_end, len; 1042 u64 start, extent_start, extent_end, len;
@@ -1050,7 +1086,7 @@ write_pinned_extent_entries(struct btrfs_root *root,
1050} 1086}
1051 1087
1052static noinline_for_stack int 1088static noinline_for_stack int
1053write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) 1089write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
1054{ 1090{
1055 struct list_head *pos, *n; 1091 struct list_head *pos, *n;
1056 int ret; 1092 int ret;
@@ -1084,7 +1120,7 @@ static int flush_dirty_cache(struct inode *inode)
1084 1120
1085static void noinline_for_stack 1121static void noinline_for_stack
1086cleanup_write_cache_enospc(struct inode *inode, 1122cleanup_write_cache_enospc(struct inode *inode,
1087 struct io_ctl *io_ctl, 1123 struct btrfs_io_ctl *io_ctl,
1088 struct extent_state **cached_state, 1124 struct extent_state **cached_state,
1089 struct list_head *bitmap_list) 1125 struct list_head *bitmap_list)
1090{ 1126{
@@ -1101,6 +1137,70 @@ cleanup_write_cache_enospc(struct inode *inode,
1101 GFP_NOFS); 1137 GFP_NOFS);
1102} 1138}
1103 1139
1140int btrfs_wait_cache_io(struct btrfs_root *root,
1141 struct btrfs_trans_handle *trans,
1142 struct btrfs_block_group_cache *block_group,
1143 struct btrfs_io_ctl *io_ctl,
1144 struct btrfs_path *path, u64 offset)
1145{
1146 int ret;
1147 struct inode *inode = io_ctl->inode;
1148
1149 if (!inode)
1150 return 0;
1151
1152 root = root->fs_info->tree_root;
1153
1154 /* Flush the dirty pages in the cache file. */
1155 ret = flush_dirty_cache(inode);
1156 if (ret)
1157 goto out;
1158
1159 /* Update the cache item to tell everyone this cache file is valid. */
1160 ret = update_cache_item(trans, root, inode, path, offset,
1161 io_ctl->entries, io_ctl->bitmaps);
1162out:
1163 io_ctl_free(io_ctl);
1164 if (ret) {
1165 invalidate_inode_pages2(inode->i_mapping);
1166 BTRFS_I(inode)->generation = 0;
1167 if (block_group) {
1168#ifdef DEBUG
1169 btrfs_err(root->fs_info,
1170 "failed to write free space cache for block group %llu",
1171 block_group->key.objectid);
1172#endif
1173 }
1174 }
1175 btrfs_update_inode(trans, root, inode);
1176
1177 if (block_group) {
1178 /* the dirty list is protected by the dirty_bgs_lock */
1179 spin_lock(&trans->transaction->dirty_bgs_lock);
1180
1181 /* the disk_cache_state is protected by the block group lock */
1182 spin_lock(&block_group->lock);
1183
1184 /*
1185 * only mark this as written if we didn't get put back on
1186 * the dirty list while waiting for IO. Otherwise our
1187 * cache state won't be right, and we won't get written again
1188 */
1189 if (!ret && list_empty(&block_group->dirty_list))
1190 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1191 else if (ret)
1192 block_group->disk_cache_state = BTRFS_DC_ERROR;
1193
1194 spin_unlock(&block_group->lock);
1195 spin_unlock(&trans->transaction->dirty_bgs_lock);
1196 io_ctl->inode = NULL;
1197 iput(inode);
1198 }
1199
1200 return ret;
1201
1202}
1203
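
btrfs_wait_cache_io() is the second half of a submit/wait split: __btrfs_write_out_cache() now returns as soon as the cache pages are under writeback, so a caller can kick off several cache writes and only then wait on each. The same two-phase shape in plain POSIX AIO, purely as an illustration (link with -lrt on older glibc):

#include <aio.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static char buf[4096];
	struct aiocb cb;
	const struct aiocb *list[1] = { &cb };
	int ok, fd = open("cache.bin", O_WRONLY | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;
	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = sizeof(buf);

	if (aio_write(&cb)) {            /* phase 1: submit, return at once */
		close(fd);
		return 1;
	}
	/* ... submit more writes, do unrelated work ... */

	aio_suspend(list, 1, NULL);      /* phase 2: wait for completion */
	ok = (aio_error(&cb) == 0 && aio_return(&cb) == (ssize_t)sizeof(buf));
	close(fd);
	return ok ? 0 : 1;
}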
1104/** 1204/**
1105 * __btrfs_write_out_cache - write out cached info to an inode 1205 * __btrfs_write_out_cache - write out cached info to an inode
1106 * @root - the root the inode belongs to 1206 * @root - the root the inode belongs to
@@ -1117,20 +1217,22 @@ cleanup_write_cache_enospc(struct inode *inode,
1117static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1217static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1118 struct btrfs_free_space_ctl *ctl, 1218 struct btrfs_free_space_ctl *ctl,
1119 struct btrfs_block_group_cache *block_group, 1219 struct btrfs_block_group_cache *block_group,
1220 struct btrfs_io_ctl *io_ctl,
1120 struct btrfs_trans_handle *trans, 1221 struct btrfs_trans_handle *trans,
1121 struct btrfs_path *path, u64 offset) 1222 struct btrfs_path *path, u64 offset)
1122{ 1223{
1123 struct extent_state *cached_state = NULL; 1224 struct extent_state *cached_state = NULL;
1124 struct io_ctl io_ctl;
1125 LIST_HEAD(bitmap_list); 1225 LIST_HEAD(bitmap_list);
1126 int entries = 0; 1226 int entries = 0;
1127 int bitmaps = 0; 1227 int bitmaps = 0;
1128 int ret; 1228 int ret;
1229 int must_iput = 0;
1129 1230
1130 if (!i_size_read(inode)) 1231 if (!i_size_read(inode))
1131 return -1; 1232 return -1;
1132 1233
1133 ret = io_ctl_init(&io_ctl, inode, root, 1); 1234 WARN_ON(io_ctl->pages);
1235 ret = io_ctl_init(io_ctl, inode, root, 1);
1134 if (ret) 1236 if (ret)
1135 return -1; 1237 return -1;
1136 1238
@@ -1143,24 +1245,27 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1143 up_write(&block_group->data_rwsem); 1245 up_write(&block_group->data_rwsem);
1144 BTRFS_I(inode)->generation = 0; 1246 BTRFS_I(inode)->generation = 0;
1145 ret = 0; 1247 ret = 0;
1248 must_iput = 1;
1146 goto out; 1249 goto out;
1147 } 1250 }
1148 spin_unlock(&block_group->lock); 1251 spin_unlock(&block_group->lock);
1149 } 1252 }
1150 1253
1151 /* Lock all pages first so we can lock the extent safely. */ 1254 /* Lock all pages first so we can lock the extent safely. */
1152 io_ctl_prepare_pages(&io_ctl, inode, 0); 1255 io_ctl_prepare_pages(io_ctl, inode, 0);
1153 1256
1154 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 1257 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1155 0, &cached_state); 1258 0, &cached_state);
1156 1259
1157 io_ctl_set_generation(&io_ctl, trans->transid); 1260 io_ctl_set_generation(io_ctl, trans->transid);
1158 1261
1159 mutex_lock(&ctl->cache_writeout_mutex); 1262 mutex_lock(&ctl->cache_writeout_mutex);
1160 /* Write out the extent entries in the free space cache */ 1263 /* Write out the extent entries in the free space cache */
1161 ret = write_cache_extent_entries(&io_ctl, ctl, 1264 spin_lock(&ctl->tree_lock);
1265 ret = write_cache_extent_entries(io_ctl, ctl,
1162 block_group, &entries, &bitmaps, 1266 block_group, &entries, &bitmaps,
1163 &bitmap_list); 1267 &bitmap_list);
1268 spin_unlock(&ctl->tree_lock);
1164 if (ret) { 1269 if (ret) {
1165 mutex_unlock(&ctl->cache_writeout_mutex); 1270 mutex_unlock(&ctl->cache_writeout_mutex);
1166 goto out_nospc; 1271 goto out_nospc;
@@ -1170,8 +1275,11 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1170 * Some spaces that are freed in the current transaction are pinned, 1275 * Some spaces that are freed in the current transaction are pinned,
1171 * they will be added into free space cache after the transaction is 1276 * they will be added into free space cache after the transaction is
1172 * committed, we shouldn't lose them. 1277 * committed, we shouldn't lose them.
1278 *
1279 * If this changes while we are working, we'll get added back to
1280 * the dirty list and redo it. No locking is needed
1173 */ 1281 */
1174 ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); 1282 ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
1175 if (ret) { 1283 if (ret) {
1176 mutex_unlock(&ctl->cache_writeout_mutex); 1284 mutex_unlock(&ctl->cache_writeout_mutex);
1177 goto out_nospc; 1285 goto out_nospc;
@@ -1182,16 +1290,18 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1182 * locked while doing it because a concurrent trim can be manipulating 1290 * locked while doing it because a concurrent trim can be manipulating
1183 * or freeing the bitmap. 1291 * or freeing the bitmap.
1184 */ 1292 */
1185 ret = write_bitmap_entries(&io_ctl, &bitmap_list); 1293 spin_lock(&ctl->tree_lock);
1294 ret = write_bitmap_entries(io_ctl, &bitmap_list);
1295 spin_unlock(&ctl->tree_lock);
1186 mutex_unlock(&ctl->cache_writeout_mutex); 1296 mutex_unlock(&ctl->cache_writeout_mutex);
1187 if (ret) 1297 if (ret)
1188 goto out_nospc; 1298 goto out_nospc;
1189 1299
1190 /* Zero out the rest of the pages just to make sure */ 1300 /* Zero out the rest of the pages just to make sure */
1191 io_ctl_zero_remaining_pages(&io_ctl); 1301 io_ctl_zero_remaining_pages(io_ctl);
1192 1302
1193 /* Everything is written out, now we dirty the pages in the file. */ 1303 /* Everything is written out, now we dirty the pages in the file. */
1194 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, 1304 ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
1195 0, i_size_read(inode), &cached_state); 1305 0, i_size_read(inode), &cached_state);
1196 if (ret) 1306 if (ret)
1197 goto out_nospc; 1307 goto out_nospc;
@@ -1202,30 +1312,39 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1202 * Release the pages and unlock the extent, we will flush 1312 * Release the pages and unlock the extent, we will flush
1203 * them out later 1313 * them out later
1204 */ 1314 */
1205 io_ctl_drop_pages(&io_ctl); 1315 io_ctl_drop_pages(io_ctl);
1206 1316
1207 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 1317 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1208 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 1318 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1209 1319
1210 /* Flush the dirty pages in the cache file. */ 1320 /*
1211 ret = flush_dirty_cache(inode); 1321 * at this point the pages are under IO and we're happy.
1322 * The caller is responsible for waiting on them and updating
1323 * the cache and the inode
1324 */
1325 io_ctl->entries = entries;
1326 io_ctl->bitmaps = bitmaps;
1327
1328 ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
1212 if (ret) 1329 if (ret)
1213 goto out; 1330 goto out;
1214 1331
1215 /* Update the cache item to tell everyone this cache file is valid. */ 1332 return 0;
1216 ret = update_cache_item(trans, root, inode, path, offset, 1333
1217 entries, bitmaps);
1218out: 1334out:
1219 io_ctl_free(&io_ctl); 1335 io_ctl->inode = NULL;
1336 io_ctl_free(io_ctl);
1220 if (ret) { 1337 if (ret) {
1221 invalidate_inode_pages2(inode->i_mapping); 1338 invalidate_inode_pages2(inode->i_mapping);
1222 BTRFS_I(inode)->generation = 0; 1339 BTRFS_I(inode)->generation = 0;
1223 } 1340 }
1224 btrfs_update_inode(trans, root, inode); 1341 btrfs_update_inode(trans, root, inode);
1342 if (must_iput)
1343 iput(inode);
1225 return ret; 1344 return ret;
1226 1345
1227out_nospc: 1346out_nospc:
1228 cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); 1347 cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);
1229 1348
1230 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1349 if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1231 up_write(&block_group->data_rwsem); 1350 up_write(&block_group->data_rwsem);
@@ -1241,7 +1360,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1241 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1360 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1242 struct inode *inode; 1361 struct inode *inode;
1243 int ret = 0; 1362 int ret = 0;
1244 enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
1245 1363
1246 root = root->fs_info->tree_root; 1364 root = root->fs_info->tree_root;
1247 1365
@@ -1250,34 +1368,34 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1250 spin_unlock(&block_group->lock); 1368 spin_unlock(&block_group->lock);
1251 return 0; 1369 return 0;
1252 } 1370 }
1253
1254 if (block_group->delalloc_bytes) {
1255 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1256 spin_unlock(&block_group->lock);
1257 return 0;
1258 }
1259 spin_unlock(&block_group->lock); 1371 spin_unlock(&block_group->lock);
1260 1372
1261 inode = lookup_free_space_inode(root, block_group, path); 1373 inode = lookup_free_space_inode(root, block_group, path);
1262 if (IS_ERR(inode)) 1374 if (IS_ERR(inode))
1263 return 0; 1375 return 0;
1264 1376
1265 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1377 ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
1378 &block_group->io_ctl, trans,
1266 path, block_group->key.objectid); 1379 path, block_group->key.objectid);
1267 if (ret) { 1380 if (ret) {
1268 dcs = BTRFS_DC_ERROR;
1269 ret = 0;
1270#ifdef DEBUG 1381#ifdef DEBUG
1271 btrfs_err(root->fs_info, 1382 btrfs_err(root->fs_info,
1272 "failed to write free space cache for block group %llu", 1383 "failed to write free space cache for block group %llu",
1273 block_group->key.objectid); 1384 block_group->key.objectid);
1274#endif 1385#endif
1386 spin_lock(&block_group->lock);
1387 block_group->disk_cache_state = BTRFS_DC_ERROR;
1388 spin_unlock(&block_group->lock);
1389
1390 block_group->io_ctl.inode = NULL;
1391 iput(inode);
1275 } 1392 }
1276 1393
1277 spin_lock(&block_group->lock); 1394 /*
1278 block_group->disk_cache_state = dcs; 1395 * if ret == 0 the caller is expected to call btrfs_wait_cache_io
1279 spin_unlock(&block_group->lock); 1396 * to wait for IO and put the inode
1280 iput(inode); 1397 */
1398
1281 return ret; 1399 return ret;
1282} 1400}
1283 1401
@@ -1298,11 +1416,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1298 u64 offset) 1416 u64 offset)
1299{ 1417{
1300 u64 bitmap_start; 1418 u64 bitmap_start;
1301 u64 bytes_per_bitmap; 1419 u32 bytes_per_bitmap;
1302 1420
1303 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; 1421 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1304 bitmap_start = offset - ctl->start; 1422 bitmap_start = offset - ctl->start;
1305 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 1423 bitmap_start = div_u64(bitmap_start, bytes_per_bitmap);
1306 bitmap_start *= bytes_per_bitmap; 1424 bitmap_start *= bytes_per_bitmap;
1307 bitmap_start += ctl->start; 1425 bitmap_start += ctl->start;
1308 1426
@@ -1521,10 +1639,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1521 u64 bitmap_bytes; 1639 u64 bitmap_bytes;
1522 u64 extent_bytes; 1640 u64 extent_bytes;
1523 u64 size = block_group->key.offset; 1641 u64 size = block_group->key.offset;
1524 u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; 1642 u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1525 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); 1643 u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);
1526 1644
1527 max_bitmaps = max(max_bitmaps, 1); 1645 max_bitmaps = max_t(u32, max_bitmaps, 1);
1528 1646
1529 ASSERT(ctl->total_bitmaps <= max_bitmaps); 1647 ASSERT(ctl->total_bitmaps <= max_bitmaps);
1530 1648
@@ -1537,7 +1655,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1537 max_bytes = MAX_CACHE_BYTES_PER_GIG; 1655 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1538 else 1656 else
1539 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1657 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1540 div64_u64(size, 1024 * 1024 * 1024); 1658 div_u64(size, 1024 * 1024 * 1024);
1541 1659
1542 /* 1660 /*
1543 * we want to account for 1 more bitmap than what we have so we can make 1661 * we want to account for 1 more bitmap than what we have so we can make
@@ -1552,14 +1670,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1552 } 1670 }
1553 1671
1554 /* 1672 /*
1555 * we want the extent entry threshold to always be at most 1/2 the maxw 1673 * we want the extent entry threshold to always be at most 1/2 the max
1556 * bytes we can have, or whatever is less than that. 1674 * bytes we can have, or whatever is less than that.
1557 */ 1675 */
1558 extent_bytes = max_bytes - bitmap_bytes; 1676 extent_bytes = max_bytes - bitmap_bytes;
1559 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); 1677 extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1);
1560 1678
1561 ctl->extents_thresh = 1679 ctl->extents_thresh =
1562 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1680 div_u64(extent_bytes, sizeof(struct btrfs_free_space));
1563} 1681}
1564 1682
1565static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1683static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
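
The div_u64()/div64_u64() conversions in this hunk are 32-bit portability work: a bare / on a u64 drags in a libgcc call on 32-bit kernels, and div_u64() (u32 divisor) is cheaper than div64_u64() (u64 divisor) when the divisor is known to fit in 32 bits, which BITS_PER_BITMAP * ctl->unit now does. A userspace model of the helpers' semantics:

#include <stdint.h>
#include <stdio.h>

/* semantics-only stand-ins for the kernel helpers */
static uint64_t div_u64(uint64_t dividend, uint32_t divisor)
{
	return dividend / divisor;      /* 64/32: cheap on 32-bit CPUs */
}

static uint64_t div64_u64(uint64_t dividend, uint64_t divisor)
{
	return dividend / divisor;      /* full 64/64 division */
}

int main(void)
{
	uint64_t size = 3ULL << 30;              /* a 3 GiB block group */
	uint32_t bytes_per_bg = 32768 * 4096;    /* BITS_PER_BITMAP * unit */
	uint64_t max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg);

	printf("%llu bitmaps\n", (unsigned long long)max_bitmaps);
	return 0;
}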
@@ -1673,7 +1791,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1673 */ 1791 */
1674 if (*bytes >= align) { 1792 if (*bytes >= align) {
1675 tmp = entry->offset - ctl->start + align - 1; 1793 tmp = entry->offset - ctl->start + align - 1;
1676 do_div(tmp, align); 1794 tmp = div64_u64(tmp, align);
1677 tmp = tmp * align + ctl->start; 1795 tmp = tmp * align + ctl->start;
1678 align_off = tmp - entry->offset; 1796 align_off = tmp - entry->offset;
1679 } else { 1797 } else {
@@ -2402,11 +2520,8 @@ static void __btrfs_remove_free_space_cache_locked(
2402 } else { 2520 } else {
2403 free_bitmap(ctl, info); 2521 free_bitmap(ctl, info);
2404 } 2522 }
2405 if (need_resched()) { 2523
2406 spin_unlock(&ctl->tree_lock); 2524 cond_resched_lock(&ctl->tree_lock);
2407 cond_resched();
2408 spin_lock(&ctl->tree_lock);
2409 }
2410 } 2525 }
2411} 2526}
2412 2527
@@ -2431,11 +2546,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2431 2546
2432 WARN_ON(cluster->block_group != block_group); 2547 WARN_ON(cluster->block_group != block_group);
2433 __btrfs_return_cluster_to_free_space(block_group, cluster); 2548 __btrfs_return_cluster_to_free_space(block_group, cluster);
2434 if (need_resched()) { 2549
2435 spin_unlock(&ctl->tree_lock); 2550 cond_resched_lock(&ctl->tree_lock);
2436 cond_resched();
2437 spin_lock(&ctl->tree_lock);
2438 }
2439 } 2551 }
2440 __btrfs_remove_free_space_cache_locked(ctl); 2552 __btrfs_remove_free_space_cache_locked(ctl);
2441 spin_unlock(&ctl->tree_lock); 2553 spin_unlock(&ctl->tree_lock);
@@ -3346,11 +3458,14 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
3346{ 3458{
3347 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 3459 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3348 int ret; 3460 int ret;
3461 struct btrfs_io_ctl io_ctl;
3349 3462
3350 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 3463 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
3351 return 0; 3464 return 0;
3352 3465
3353 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 3466 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
3467 trans, path, 0) ||
3468 btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
3354 if (ret) { 3469 if (ret) {
3355 btrfs_delalloc_release_metadata(inode, inode->i_size); 3470 btrfs_delalloc_release_metadata(inode, inode->i_size);
3356#ifdef DEBUG 3471#ifdef DEBUG
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 88b2238a0aed..a16a029ad3b1 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -48,6 +48,8 @@ struct btrfs_free_space_op {
48 struct btrfs_free_space *info); 48 struct btrfs_free_space *info);
49}; 49};
50 50
51struct btrfs_io_ctl;
52
51struct inode *lookup_free_space_inode(struct btrfs_root *root, 53struct inode *lookup_free_space_inode(struct btrfs_root *root,
52 struct btrfs_block_group_cache 54 struct btrfs_block_group_cache
53 *block_group, struct btrfs_path *path); 55 *block_group, struct btrfs_path *path);
@@ -60,14 +62,19 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
60 struct btrfs_block_rsv *rsv); 62 struct btrfs_block_rsv *rsv);
61int btrfs_truncate_free_space_cache(struct btrfs_root *root, 63int btrfs_truncate_free_space_cache(struct btrfs_root *root,
62 struct btrfs_trans_handle *trans, 64 struct btrfs_trans_handle *trans,
65 struct btrfs_block_group_cache *block_group,
63 struct inode *inode); 66 struct inode *inode);
64int load_free_space_cache(struct btrfs_fs_info *fs_info, 67int load_free_space_cache(struct btrfs_fs_info *fs_info,
65 struct btrfs_block_group_cache *block_group); 68 struct btrfs_block_group_cache *block_group);
69int btrfs_wait_cache_io(struct btrfs_root *root,
70 struct btrfs_trans_handle *trans,
71 struct btrfs_block_group_cache *block_group,
72 struct btrfs_io_ctl *io_ctl,
73 struct btrfs_path *path, u64 offset);
66int btrfs_write_out_cache(struct btrfs_root *root, 74int btrfs_write_out_cache(struct btrfs_root *root,
67 struct btrfs_trans_handle *trans, 75 struct btrfs_trans_handle *trans,
68 struct btrfs_block_group_cache *block_group, 76 struct btrfs_block_group_cache *block_group,
69 struct btrfs_path *path); 77 struct btrfs_path *path);
70
71struct inode *lookup_free_ino_inode(struct btrfs_root *root, 78struct inode *lookup_free_ino_inode(struct btrfs_root *root,
72 struct btrfs_path *path); 79 struct btrfs_path *path);
73int create_free_ino_inode(struct btrfs_root *root, 80int create_free_ino_inode(struct btrfs_root *root,
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 74faea3a516e..f6a596d5a637 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -456,7 +456,7 @@ again:
456 } 456 }
457 457
458 if (i_size_read(inode) > 0) { 458 if (i_size_read(inode) > 0) {
459 ret = btrfs_truncate_free_space_cache(root, trans, inode); 459 ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
460 if (ret) { 460 if (ret) {
461 if (ret != -ENOSPC) 461 if (ret != -ENOSPC)
462 btrfs_abort_transaction(trans, root, ret); 462 btrfs_abort_transaction(trans, root, ret);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43192e10cc43..56f00a25c003 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -59,6 +59,7 @@
59#include "backref.h" 59#include "backref.h"
60#include "hash.h" 60#include "hash.h"
61#include "props.h" 61#include "props.h"
62#include "qgroup.h"
62 63
63struct btrfs_iget_args { 64struct btrfs_iget_args {
64 struct btrfs_key *location; 65 struct btrfs_key *location;
@@ -470,7 +471,7 @@ again:
470 */ 471 */
471 if (inode_need_compress(inode)) { 472 if (inode_need_compress(inode)) {
472 WARN_ON(pages); 473 WARN_ON(pages);
473 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 474 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
474 if (!pages) { 475 if (!pages) {
475 /* just bail out to the uncompressed code */ 476 /* just bail out to the uncompressed code */
476 goto cont; 477 goto cont;
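This kzalloc-to-kcalloc conversion (and the matching ones in the raid56.c and scrub.c hunks further down) is about overflow safety: kcalloc(n, size, flags) performs the multiplication itself and returns NULL on overflow, instead of silently allocating a short buffer when the caller's n * size wraps. A userspace sketch using calloc as a stand-in for kcalloc (modern libcs check the multiplication the same way):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        size_t n = (size_t)-1 / 4;          /* big enough that n * 8 wraps */

        /* calloc, like kcalloc, refuses when n * size overflows ... */
        void *p = calloc(n, sizeof(void *));
        printf("calloc: %p\n", p);          /* NULL on overflow */

        /* ... while a hand-written multiply silently wraps around */
        size_t bytes = n * sizeof(void *);
        printf("wrapped size: %zu\n", bytes);

        free(p);
        return 0;
}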
@@ -752,7 +753,6 @@ retry:
752 } 753 }
753 goto out_free; 754 goto out_free;
754 } 755 }
755
756 /* 756 /*
757 * here we're doing allocation and writeback of the 757 * here we're doing allocation and writeback of the
758 * compressed pages 758 * compressed pages
@@ -3110,6 +3110,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
3110 if (empty) 3110 if (empty)
3111 return; 3111 return;
3112 3112
3113 down_read(&fs_info->delayed_iput_sem);
3114
3113 spin_lock(&fs_info->delayed_iput_lock); 3115 spin_lock(&fs_info->delayed_iput_lock);
3114 list_splice_init(&fs_info->delayed_iputs, &list); 3116 list_splice_init(&fs_info->delayed_iputs, &list);
3115 spin_unlock(&fs_info->delayed_iput_lock); 3117 spin_unlock(&fs_info->delayed_iput_lock);
@@ -3120,6 +3122,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
3120 iput(delayed->inode); 3122 iput(delayed->inode);
3121 kfree(delayed); 3123 kfree(delayed);
3122 } 3124 }
3125
3126 up_read(&root->fs_info->delayed_iput_sem);
3123} 3127}
3124 3128
3125/* 3129/*
@@ -4162,6 +4166,21 @@ out:
4162 return err; 4166 return err;
4163} 4167}
4164 4168
4169static int truncate_space_check(struct btrfs_trans_handle *trans,
4170 struct btrfs_root *root,
4171 u64 bytes_deleted)
4172{
4173 int ret;
4174
4175 bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
4176 ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
4177 bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4178 if (!ret)
4179 trans->bytes_reserved += bytes_deleted;
4180 return ret;
4181
4182}
4183
4165/* 4184/*
4166 * this can truncate away extent items, csum items and directory items. 4185 * this can truncate away extent items, csum items and directory items.
4167 * It starts at a high offset and removes keys until it can't find 4186 * It starts at a high offset and removes keys until it can't find
@@ -4197,9 +4216,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4197 int ret; 4216 int ret;
4198 int err = 0; 4217 int err = 0;
4199 u64 ino = btrfs_ino(inode); 4218 u64 ino = btrfs_ino(inode);
4219 u64 bytes_deleted = 0;
4220 bool be_nice = 0;
4221 bool should_throttle = 0;
4222 bool should_end = 0;
4200 4223
4201 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 4224 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4202 4225
4226 /*
4227 * for non-free space inodes and ref cows, we want to back off from
4228 * time to time
4229 */
4230 if (!btrfs_is_free_space_inode(inode) &&
4231 test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4232 be_nice = 1;
4233
4203 path = btrfs_alloc_path(); 4234 path = btrfs_alloc_path();
4204 if (!path) 4235 if (!path)
4205 return -ENOMEM; 4236 return -ENOMEM;
@@ -4229,6 +4260,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4229 key.type = (u8)-1; 4260 key.type = (u8)-1;
4230 4261
4231search_again: 4262search_again:
4263 /*
4264 * with a 16K leaf size and 128MB extents, you can actually queue
 4265 * up a huge file in a single leaf. Whenever bytes_deleted is > 0,
 4266 * it is usually already huge by the time we get here
4267 */
4268 if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
4269 if (btrfs_should_end_transaction(trans, root)) {
4270 err = -EAGAIN;
4271 goto error;
4272 }
4273 }
4274
4275
4232 path->leave_spinning = 1; 4276 path->leave_spinning = 1;
4233 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 4277 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4234 if (ret < 0) { 4278 if (ret < 0) {
@@ -4371,22 +4415,39 @@ delete:
4371 } else { 4415 } else {
4372 break; 4416 break;
4373 } 4417 }
4418 should_throttle = 0;
4419
4374 if (found_extent && 4420 if (found_extent &&
4375 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || 4421 (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4376 root == root->fs_info->tree_root)) { 4422 root == root->fs_info->tree_root)) {
4377 btrfs_set_path_blocking(path); 4423 btrfs_set_path_blocking(path);
4424 bytes_deleted += extent_num_bytes;
4378 ret = btrfs_free_extent(trans, root, extent_start, 4425 ret = btrfs_free_extent(trans, root, extent_start,
4379 extent_num_bytes, 0, 4426 extent_num_bytes, 0,
4380 btrfs_header_owner(leaf), 4427 btrfs_header_owner(leaf),
4381 ino, extent_offset, 0); 4428 ino, extent_offset, 0);
4382 BUG_ON(ret); 4429 BUG_ON(ret);
4430 if (btrfs_should_throttle_delayed_refs(trans, root))
4431 btrfs_async_run_delayed_refs(root,
4432 trans->delayed_ref_updates * 2, 0);
4433 if (be_nice) {
4434 if (truncate_space_check(trans, root,
4435 extent_num_bytes)) {
4436 should_end = 1;
4437 }
4438 if (btrfs_should_throttle_delayed_refs(trans,
4439 root)) {
4440 should_throttle = 1;
4441 }
4442 }
4383 } 4443 }
4384 4444
4385 if (found_type == BTRFS_INODE_ITEM_KEY) 4445 if (found_type == BTRFS_INODE_ITEM_KEY)
4386 break; 4446 break;
4387 4447
4388 if (path->slots[0] == 0 || 4448 if (path->slots[0] == 0 ||
4389 path->slots[0] != pending_del_slot) { 4449 path->slots[0] != pending_del_slot ||
4450 should_throttle || should_end) {
4390 if (pending_del_nr) { 4451 if (pending_del_nr) {
4391 ret = btrfs_del_items(trans, root, path, 4452 ret = btrfs_del_items(trans, root, path,
4392 pending_del_slot, 4453 pending_del_slot,
@@ -4399,6 +4460,23 @@ delete:
4399 pending_del_nr = 0; 4460 pending_del_nr = 0;
4400 } 4461 }
4401 btrfs_release_path(path); 4462 btrfs_release_path(path);
4463 if (should_throttle) {
4464 unsigned long updates = trans->delayed_ref_updates;
4465 if (updates) {
4466 trans->delayed_ref_updates = 0;
4467 ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4468 if (ret && !err)
4469 err = ret;
4470 }
4471 }
4472 /*
4473 * if we failed to refill our space rsv, bail out
4474 * and let the transaction restart
4475 */
4476 if (should_end) {
4477 err = -EAGAIN;
4478 goto error;
4479 }
4402 goto search_again; 4480 goto search_again;
4403 } else { 4481 } else {
4404 path->slots[0]--; 4482 path->slots[0]--;
@@ -4415,7 +4493,18 @@ error:
4415 if (last_size != (u64)-1 && 4493 if (last_size != (u64)-1 &&
4416 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) 4494 root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
4417 btrfs_ordered_update_i_size(inode, last_size, NULL); 4495 btrfs_ordered_update_i_size(inode, last_size, NULL);
4496
4418 btrfs_free_path(path); 4497 btrfs_free_path(path);
4498
4499 if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
4500 unsigned long updates = trans->delayed_ref_updates;
4501 if (updates) {
4502 trans->delayed_ref_updates = 0;
4503 ret = btrfs_run_delayed_refs(trans, root, updates * 2);
4504 if (ret && !err)
4505 err = ret;
4506 }
4507 }
4419 return err; 4508 return err;
4420} 4509}
4421 4510
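Taken together, the btrfs_truncate_inode_items() hunks above implement a cooperative-truncate pattern: track how much the loop has freed (bytes_deleted) and, at each batch boundary, either flush the accumulated delayed refs (should_throttle) or bail out with -EAGAIN so the caller can restart the transaction (should_end, set when the space reservation cannot be refilled). A stripped-down sketch of that control flow (stub predicates and made-up batch sizes, not btrfs code):

#include <stdbool.h>
#include <stdio.h>

/* illustrative stand-ins for the kernel-side checks */
static bool should_end_transaction(void) { return false; }
static bool delayed_refs_piling_up(void) { return true; }
static int flush_delayed_refs(void) { return 0; }
static bool refill_space_rsv(void) { return true; }

static int truncate_batches(unsigned long long to_free)
{
        unsigned long long bytes_deleted = 0;

        while (bytes_deleted < to_free) {
                /* back off if this transaction has already freed a lot */
                if (bytes_deleted > 32ULL * 1024 * 1024 &&
                    should_end_transaction())
                        return -11;     /* -EAGAIN: caller restarts the transaction */

                /* ... delete one batch of items ... */
                bytes_deleted += 16 * 1024;

                /* should_end: refill the reservation or stop the loop */
                if (!refill_space_rsv())
                        return -11;

                /* should_throttle: flush delayed refs, then keep going */
                if (delayed_refs_piling_up() && flush_delayed_refs())
                        return -5;      /* some other error */
        }
        return 0;
}

int main(void)
{
        printf("ret = %d\n", truncate_batches(64 * 1024));
        return 0;
}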
@@ -4924,6 +5013,7 @@ void btrfs_evict_inode(struct inode *inode)
4924 struct btrfs_trans_handle *trans; 5013 struct btrfs_trans_handle *trans;
4925 struct btrfs_root *root = BTRFS_I(inode)->root; 5014 struct btrfs_root *root = BTRFS_I(inode)->root;
4926 struct btrfs_block_rsv *rsv, *global_rsv; 5015 struct btrfs_block_rsv *rsv, *global_rsv;
5016 int steal_from_global = 0;
4927 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); 5017 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4928 int ret; 5018 int ret;
4929 5019
@@ -4991,9 +5081,20 @@ void btrfs_evict_inode(struct inode *inode)
4991 * hard as possible to get this to work. 5081 * hard as possible to get this to work.
4992 */ 5082 */
4993 if (ret) 5083 if (ret)
4994 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); 5084 steal_from_global++;
5085 else
5086 steal_from_global = 0;
5087 ret = 0;
4995 5088
4996 if (ret) { 5089 /*
5090 * steal_from_global == 0: we reserved stuff, hooray!
5091 * steal_from_global == 1: we didn't reserve stuff, boo!
5092 * steal_from_global == 2: we've committed, still not a lot of
5093 * room but maybe we'll have room in the global reserve this
5094 * time.
5095 * steal_from_global == 3: abandon all hope!
5096 */
5097 if (steal_from_global > 2) {
4997 btrfs_warn(root->fs_info, 5098 btrfs_warn(root->fs_info,
4998 "Could not get space for a delete, will truncate on mount %d", 5099 "Could not get space for a delete, will truncate on mount %d",
4999 ret); 5100 ret);
@@ -5009,10 +5110,40 @@ void btrfs_evict_inode(struct inode *inode)
5009 goto no_delete; 5110 goto no_delete;
5010 } 5111 }
5011 5112
5113 /*
 5114 * We can't just steal from the global reserve, we need to make
5115 * sure there is room to do it, if not we need to commit and try
5116 * again.
5117 */
5118 if (steal_from_global) {
5119 if (!btrfs_check_space_for_delayed_refs(trans, root))
5120 ret = btrfs_block_rsv_migrate(global_rsv, rsv,
5121 min_size);
5122 else
5123 ret = -ENOSPC;
5124 }
5125
5126 /*
5127 * Couldn't steal from the global reserve, we have too much
5128 * pending stuff built up, commit the transaction and try it
5129 * again.
5130 */
5131 if (ret) {
5132 ret = btrfs_commit_transaction(trans, root);
5133 if (ret) {
5134 btrfs_orphan_del(NULL, inode);
5135 btrfs_free_block_rsv(root, rsv);
5136 goto no_delete;
5137 }
5138 continue;
5139 } else {
5140 steal_from_global = 0;
5141 }
5142
5012 trans->block_rsv = rsv; 5143 trans->block_rsv = rsv;
5013 5144
5014 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 5145 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5015 if (ret != -ENOSPC) 5146 if (ret != -ENOSPC && ret != -EAGAIN)
5016 break; 5147 break;
5017 5148
5018 trans->block_rsv = &root->fs_info->trans_block_rsv; 5149 trans->block_rsv = &root->fs_info->trans_block_rsv;
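The steal_from_global counter introduced above is a small retry state machine, exactly as the comment enumerates: 0 means the normal reservation succeeded, 1 means try migrating space out of the global reserve, 2 means commit the transaction first and retry, and anything above 2 gives up and falls back to truncating on the next mount. A compact sketch of the escalation (illustrative stubs, outside the kernel):

#include <stdio.h>

static int reserve(void)            { return -1; } /* pretend reservation fails */
static int steal_from_reserve(void) { return -1; } /* pretend stealing fails too */
static int commit_transaction(void) { return 0; }

int main(void)
{
        int steal_from_global = 0;

        for (;;) {
                if (reserve() == 0)
                        steal_from_global = 0;  /* reserved: no stealing needed */
                else
                        steal_from_global++;    /* escalate one step per retry */

                if (steal_from_global > 2) {
                        printf("give up, truncate on next mount\n");
                        return 0;
                }

                if (steal_from_global && steal_from_reserve() != 0) {
                        /* too much pending work: commit and try once more */
                        if (commit_transaction() != 0)
                                return 1;
                        continue;
                }

                printf("got the space, do the truncate\n");
                return 0;
        }
}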
@@ -8581,7 +8712,7 @@ static int btrfs_truncate(struct inode *inode)
8581 ret = btrfs_truncate_inode_items(trans, root, inode, 8712 ret = btrfs_truncate_inode_items(trans, root, inode,
8582 inode->i_size, 8713 inode->i_size,
8583 BTRFS_EXTENT_DATA_KEY); 8714 BTRFS_EXTENT_DATA_KEY);
8584 if (ret != -ENOSPC) { 8715 if (ret != -ENOSPC && ret != -EAGAIN) {
8585 err = ret; 8716 err = ret;
8586 break; 8717 break;
8587 } 8718 }
@@ -9451,6 +9582,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9451 btrfs_end_transaction(trans, root); 9582 btrfs_end_transaction(trans, root);
9452 break; 9583 break;
9453 } 9584 }
9585
9454 btrfs_drop_extent_cache(inode, cur_offset, 9586 btrfs_drop_extent_cache(inode, cur_offset,
9455 cur_offset + ins.offset -1, 0); 9587 cur_offset + ins.offset -1, 0);
9456 9588
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 74609b931ba5..ca5d968f4c37 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -456,6 +456,13 @@ static noinline int create_subvol(struct inode *dir,
456 if (ret) 456 if (ret)
457 return ret; 457 return ret;
458 458
459 /*
 460 * Don't create a subvolume whose level is not zero. Otherwise qgroup
 461 * will be screwed up since it assumes the subvolume qgroup's level is 0.
462 */
463 if (btrfs_qgroup_level(objectid))
464 return -ENOSPC;
465
459 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 466 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
460 /* 467 /*
461 * The same as the snapshot creation, please see the comment 468 * The same as the snapshot creation, please see the comment
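The btrfs_qgroup_level() check above relies on the qgroupid encoding: the top 16 bits of the 64-bit id carry the qgroup level and the low 48 bits the object id, so a subvolume qgroup must be level 0 (the same >> 48 shift appears in the inherit check later in this series). Roughly:

#include <stdint.h>
#include <stdio.h>

#define QGROUP_LEVEL_SHIFT 48   /* mirrors BTRFS_QGROUP_LEVEL_SHIFT */

static uint64_t qgroup_level(uint64_t qgroupid)
{
        return qgroupid >> QGROUP_LEVEL_SHIFT;                  /* high 16 bits */
}

static uint64_t qgroup_subvid(uint64_t qgroupid)
{
        return qgroupid & ((1ULL << QGROUP_LEVEL_SHIFT) - 1);   /* low 48 bits */
}

int main(void)
{
        uint64_t id = (1ULL << QGROUP_LEVEL_SHIFT) | 256;       /* qgroup "1/256" */
        printf("level %llu, id %llu\n",
               (unsigned long long)qgroup_level(id),
               (unsigned long long)qgroup_subvid(id));
        return 0;
}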
@@ -1564,7 +1571,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1564 goto out_free; 1571 goto out_free;
1565 } 1572 }
1566 1573
1567 do_div(new_size, root->sectorsize); 1574 new_size = div_u64(new_size, root->sectorsize);
1568 new_size *= root->sectorsize; 1575 new_size *= root->sectorsize;
1569 1576
1570 printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", 1577 printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
@@ -2897,6 +2904,9 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
2897 if (src == dst) 2904 if (src == dst)
2898 return -EINVAL; 2905 return -EINVAL;
2899 2906
2907 if (len == 0)
2908 return 0;
2909
2900 btrfs_double_lock(src, loff, dst, dst_loff, len); 2910 btrfs_double_lock(src, loff, dst, dst_loff, len);
2901 2911
2902 ret = extent_same_check_offsets(src, loff, len); 2912 ret = extent_same_check_offsets(src, loff, len);
@@ -3039,7 +3049,7 @@ out:
3039static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3049static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3040 u64 disko) 3050 u64 disko)
3041{ 3051{
3042 struct seq_list tree_mod_seq_elem = {}; 3052 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
3043 struct ulist *roots; 3053 struct ulist *roots;
3044 struct ulist_iterator uiter; 3054 struct ulist_iterator uiter;
3045 struct ulist_node *root_node = NULL; 3055 struct ulist_node *root_node = NULL;
@@ -3202,6 +3212,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3202 key.offset = off; 3212 key.offset = off;
3203 3213
3204 while (1) { 3214 while (1) {
3215 u64 next_key_min_offset = key.offset + 1;
3216
3205 /* 3217 /*
3206 * note the key will change type as we walk through the 3218 * note the key will change type as we walk through the
3207 * tree. 3219 * tree.
@@ -3282,7 +3294,7 @@ process_slot:
3282 } else if (key.offset >= off + len) { 3294 } else if (key.offset >= off + len) {
3283 break; 3295 break;
3284 } 3296 }
3285 3297 next_key_min_offset = key.offset + datal;
3286 size = btrfs_item_size_nr(leaf, slot); 3298 size = btrfs_item_size_nr(leaf, slot);
3287 read_extent_buffer(leaf, buf, 3299 read_extent_buffer(leaf, buf,
3288 btrfs_item_ptr_offset(leaf, slot), 3300 btrfs_item_ptr_offset(leaf, slot),
@@ -3497,7 +3509,7 @@ process_slot:
3497 break; 3509 break;
3498 } 3510 }
3499 btrfs_release_path(path); 3511 btrfs_release_path(path);
3500 key.offset++; 3512 key.offset = next_key_min_offset;
3501 } 3513 }
3502 ret = 0; 3514 ret = 0;
3503 3515
@@ -3626,6 +3638,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
3626 if (off + len == src->i_size) 3638 if (off + len == src->i_size)
3627 len = ALIGN(src->i_size, bs) - off; 3639 len = ALIGN(src->i_size, bs) - off;
3628 3640
3641 if (len == 0) {
3642 ret = 0;
3643 goto out_unlock;
3644 }
3645
3629 /* verify the end result is block aligned */ 3646 /* verify the end result is block aligned */
3630 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || 3647 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
3631 !IS_ALIGNED(destoff, bs)) 3648 !IS_ALIGNED(destoff, bs))
@@ -4624,6 +4641,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
4624 sa->src, sa->dst); 4641 sa->src, sa->dst);
4625 } 4642 }
4626 4643
4644 /* update qgroup status and info */
4645 err = btrfs_run_qgroups(trans, root->fs_info);
4646 if (err < 0)
 4647 btrfs_error(root->fs_info, err,
4648 "failed to update qgroup status and info\n");
4627 err = btrfs_end_transaction(trans, root); 4649 err = btrfs_end_transaction(trans, root);
4628 if (err && !ret) 4650 if (err && !ret)
4629 ret = err; 4651 ret = err;
@@ -4669,8 +4691,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
4669 4691
4670 /* FIXME: check if the IDs really exist */ 4692 /* FIXME: check if the IDs really exist */
4671 if (sa->create) { 4693 if (sa->create) {
4672 ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid, 4694 ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid);
4673 NULL);
4674 } else { 4695 } else {
4675 ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); 4696 ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
4676 } 4697 }
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 617553cdb7d3..a2f051347731 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -434,7 +434,7 @@ out:
434 return ret; 434 return ret;
435} 435}
436 436
437struct btrfs_compress_op btrfs_lzo_compress = { 437const struct btrfs_compress_op btrfs_lzo_compress = {
438 .alloc_workspace = lzo_alloc_workspace, 438 .alloc_workspace = lzo_alloc_workspace,
439 .free_workspace = lzo_free_workspace, 439 .free_workspace = lzo_free_workspace,
440 .compress_pages = lzo_compress_pages, 440 .compress_pages = lzo_compress_pages,
diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h
index b7816cefbd13..1b10a3cd1195 100644
--- a/fs/btrfs/math.h
+++ b/fs/btrfs/math.h
@@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor)
28 if (factor == 10) 28 if (factor == 10)
29 return num; 29 return num;
30 num *= factor; 30 num *= factor;
31 do_div(num, 10); 31 return div_u64(num, 10);
32 return num;
33} 32}
34 33
35static inline u64 div_factor_fine(u64 num, int factor) 34static inline u64 div_factor_fine(u64 num, int factor)
@@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor)
37 if (factor == 100) 36 if (factor == 100)
38 return num; 37 return num;
39 num *= factor; 38 num *= factor;
40 do_div(num, 100); 39 return div_u64(num, 100);
41 return num;
42} 40}
43 41
44#endif 42#endif
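These conversions (and the similar ones in ioctl.c, scrub.c and super.c) hinge on the helpers' differing calling conventions: do_div(n, base) is a macro that divides n in place and evaluates to the remainder, whereas div_u64() simply returns the quotient, which is what lets the two-statement bodies collapse into a single return. Userspace stand-ins showing the two shapes (the statement-expression macro is a GCC/clang extension, as in the kernel):

#include <stdint.h>
#include <stdio.h>

/* stand-in with div_u64's shape: returns the quotient */
static uint64_t div_u64(uint64_t n, uint32_t base)
{
        return n / base;
}

/* stand-in with do_div's shape: divides in place, yields the remainder */
#define do_div(n, base) \
        ({ uint32_t __rem = (n) % (base); (n) /= (base); __rem; })

int main(void)
{
        uint64_t num = 1234;

        uint32_t rem = do_div(num, 10);   /* num is now 123, rem is 4 */
        printf("do_div: num=%llu rem=%u\n", (unsigned long long)num, rem);

        printf("div_u64: %llu\n", (unsigned long long)div_u64(1234, 10));
        return 0;
}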
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 129b1dd28527..dca137b04095 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode)
425 425
426 return NULL; 426 return NULL;
427} 427}
428
429
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 058c79eecbfb..3d6546581bb9 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -644,9 +644,8 @@ out:
644} 644}
645 645
646static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, 646static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
647 struct btrfs_root *root, u64 qgroupid, 647 struct btrfs_root *root,
648 u64 flags, u64 max_rfer, u64 max_excl, 648 struct btrfs_qgroup *qgroup)
649 u64 rsv_rfer, u64 rsv_excl)
650{ 649{
651 struct btrfs_path *path; 650 struct btrfs_path *path;
652 struct btrfs_key key; 651 struct btrfs_key key;
@@ -657,7 +656,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
657 656
658 key.objectid = 0; 657 key.objectid = 0;
659 key.type = BTRFS_QGROUP_LIMIT_KEY; 658 key.type = BTRFS_QGROUP_LIMIT_KEY;
660 key.offset = qgroupid; 659 key.offset = qgroup->qgroupid;
661 660
662 path = btrfs_alloc_path(); 661 path = btrfs_alloc_path();
663 if (!path) 662 if (!path)
@@ -673,11 +672,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
673 l = path->nodes[0]; 672 l = path->nodes[0];
674 slot = path->slots[0]; 673 slot = path->slots[0];
675 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); 674 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
676 btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); 675 btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
677 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); 676 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
678 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); 677 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
679 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); 678 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
680 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); 679 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);
681 680
682 btrfs_mark_buffer_dirty(l); 681 btrfs_mark_buffer_dirty(l);
683 682
@@ -967,6 +966,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
967 fs_info->pending_quota_state = 0; 966 fs_info->pending_quota_state = 0;
968 quota_root = fs_info->quota_root; 967 quota_root = fs_info->quota_root;
969 fs_info->quota_root = NULL; 968 fs_info->quota_root = NULL;
969 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
970 spin_unlock(&fs_info->qgroup_lock); 970 spin_unlock(&fs_info->qgroup_lock);
971 971
972 btrfs_free_qgroup_config(fs_info); 972 btrfs_free_qgroup_config(fs_info);
@@ -982,7 +982,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
982 list_del(&quota_root->dirty_list); 982 list_del(&quota_root->dirty_list);
983 983
984 btrfs_tree_lock(quota_root->node); 984 btrfs_tree_lock(quota_root->node);
985 clean_tree_block(trans, tree_root, quota_root->node); 985 clean_tree_block(trans, tree_root->fs_info, quota_root->node);
986 btrfs_tree_unlock(quota_root->node); 986 btrfs_tree_unlock(quota_root->node);
987 btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); 987 btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
988 988
@@ -1001,6 +1001,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info,
1001 list_add(&qgroup->dirty, &fs_info->dirty_qgroups); 1001 list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
1002} 1002}
1003 1003
1004/*
 1005 * The easy accounting: if we are adding/removing the only ref for an extent
 1006 * then this qgroup and all of the parent qgroups get their reference and
1007 * exclusive counts adjusted.
1008 *
1009 * Caller should hold fs_info->qgroup_lock.
1010 */
1011static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1012 struct ulist *tmp, u64 ref_root,
1013 u64 num_bytes, int sign)
1014{
1015 struct btrfs_qgroup *qgroup;
1016 struct btrfs_qgroup_list *glist;
1017 struct ulist_node *unode;
1018 struct ulist_iterator uiter;
1019 int ret = 0;
1020
1021 qgroup = find_qgroup_rb(fs_info, ref_root);
1022 if (!qgroup)
1023 goto out;
1024
1025 qgroup->rfer += sign * num_bytes;
1026 qgroup->rfer_cmpr += sign * num_bytes;
1027
1028 WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1029 qgroup->excl += sign * num_bytes;
1030 qgroup->excl_cmpr += sign * num_bytes;
1031 if (sign > 0)
1032 qgroup->reserved -= num_bytes;
1033
1034 qgroup_dirty(fs_info, qgroup);
1035
1036 /* Get all of the parent groups that contain this qgroup */
1037 list_for_each_entry(glist, &qgroup->groups, next_group) {
1038 ret = ulist_add(tmp, glist->group->qgroupid,
1039 ptr_to_u64(glist->group), GFP_ATOMIC);
1040 if (ret < 0)
1041 goto out;
1042 }
1043
1044 /* Iterate all of the parents and adjust their reference counts */
1045 ULIST_ITER_INIT(&uiter);
1046 while ((unode = ulist_next(tmp, &uiter))) {
1047 qgroup = u64_to_ptr(unode->aux);
1048 qgroup->rfer += sign * num_bytes;
1049 qgroup->rfer_cmpr += sign * num_bytes;
1050 WARN_ON(sign < 0 && qgroup->excl < num_bytes);
1051 qgroup->excl += sign * num_bytes;
1052 if (sign > 0)
1053 qgroup->reserved -= num_bytes;
1054 qgroup->excl_cmpr += sign * num_bytes;
1055 qgroup_dirty(fs_info, qgroup);
1056
1057 /* Add any parents of the parents */
1058 list_for_each_entry(glist, &qgroup->groups, next_group) {
1059 ret = ulist_add(tmp, glist->group->qgroupid,
1060 ptr_to_u64(glist->group), GFP_ATOMIC);
1061 if (ret < 0)
1062 goto out;
1063 }
1064 }
1065 ret = 0;
1066out:
1067 return ret;
1068}
1069
1070
1071/*
1072 * Quick path for updating qgroup with only excl refs.
1073 *
 1074 * In that case, just updating all parents will be enough.
 1075 * Otherwise we need to do a full rescan.
1076 * Caller should also hold fs_info->qgroup_lock.
1077 *
 1078 * Return 0 for a quick update; return >0 when a full rescan is needed
 1079 * (the INCONSISTENT flag gets set).
1080 * Return < 0 for other error.
1081 */
1082static int quick_update_accounting(struct btrfs_fs_info *fs_info,
1083 struct ulist *tmp, u64 src, u64 dst,
1084 int sign)
1085{
1086 struct btrfs_qgroup *qgroup;
1087 int ret = 1;
1088 int err = 0;
1089
1090 qgroup = find_qgroup_rb(fs_info, src);
1091 if (!qgroup)
1092 goto out;
1093 if (qgroup->excl == qgroup->rfer) {
1094 ret = 0;
1095 err = __qgroup_excl_accounting(fs_info, tmp, dst,
1096 qgroup->excl, sign);
1097 if (err < 0) {
1098 ret = err;
1099 goto out;
1100 }
1101 }
1102out:
1103 if (ret)
1104 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1105 return ret;
1106}
1107
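The fast path added above only applies when the source qgroup's numbers are all-exclusive (excl == rfer): in that case attaching or detaching it simply adds or subtracts its exclusive bytes from every ancestor, and anything else falls back to marking the status inconsistent so a rescan fixes things up. The decision, in miniature (stub types, not the kernel structures):

#include <stdio.h>

struct qgroup { unsigned long long rfer, excl; };

/* returns 0 if the parents were fixed up inline, 1 if a rescan is needed */
static int quick_update(const struct qgroup *src,
                        unsigned long long *parent_rfer,
                        unsigned long long *parent_excl, int sign)
{
        if (src->excl != src->rfer)
                return 1;       /* shared extents involved: mark inconsistent */

        /* all bytes are exclusive: propagate them to every ancestor */
        *parent_rfer += sign * src->excl;
        *parent_excl += sign * src->excl;
        return 0;
}

int main(void)
{
        struct qgroup leaf = { .rfer = 4096, .excl = 4096 };
        unsigned long long prfer = 0, pexcl = 0;

        int rescan = quick_update(&leaf, &prfer, &pexcl, +1);
        printf("rescan=%d parent rfer=%llu excl=%llu\n", rescan, prfer, pexcl);
        return 0;
}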
1004int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, 1108int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
1005 struct btrfs_fs_info *fs_info, u64 src, u64 dst) 1109 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1006{ 1110{
@@ -1008,8 +1112,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
1008 struct btrfs_qgroup *parent; 1112 struct btrfs_qgroup *parent;
1009 struct btrfs_qgroup *member; 1113 struct btrfs_qgroup *member;
1010 struct btrfs_qgroup_list *list; 1114 struct btrfs_qgroup_list *list;
1115 struct ulist *tmp;
1011 int ret = 0; 1116 int ret = 0;
1012 1117
 1118 /* Check the level of src and dst first */
 1119 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
 1120 return -EINVAL;
 1121
 1122 tmp = ulist_alloc(GFP_NOFS);
 1123 if (!tmp)
 1124 return -ENOMEM;
1125
1013 mutex_lock(&fs_info->qgroup_ioctl_lock); 1126 mutex_lock(&fs_info->qgroup_ioctl_lock);
1014 quota_root = fs_info->quota_root; 1127 quota_root = fs_info->quota_root;
1015 if (!quota_root) { 1128 if (!quota_root) {
@@ -1043,23 +1156,33 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
1043 1156
1044 spin_lock(&fs_info->qgroup_lock); 1157 spin_lock(&fs_info->qgroup_lock);
1045 ret = add_relation_rb(quota_root->fs_info, src, dst); 1158 ret = add_relation_rb(quota_root->fs_info, src, dst);
1159 if (ret < 0) {
1160 spin_unlock(&fs_info->qgroup_lock);
1161 goto out;
1162 }
1163 ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
1046 spin_unlock(&fs_info->qgroup_lock); 1164 spin_unlock(&fs_info->qgroup_lock);
1047out: 1165out:
1048 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1166 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1167 ulist_free(tmp);
1049 return ret; 1168 return ret;
1050} 1169}
1051 1170
1052int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, 1171int __del_qgroup_relation(struct btrfs_trans_handle *trans,
1053 struct btrfs_fs_info *fs_info, u64 src, u64 dst) 1172 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1054{ 1173{
1055 struct btrfs_root *quota_root; 1174 struct btrfs_root *quota_root;
1056 struct btrfs_qgroup *parent; 1175 struct btrfs_qgroup *parent;
1057 struct btrfs_qgroup *member; 1176 struct btrfs_qgroup *member;
1058 struct btrfs_qgroup_list *list; 1177 struct btrfs_qgroup_list *list;
1178 struct ulist *tmp;
1059 int ret = 0; 1179 int ret = 0;
1060 int err; 1180 int err;
1061 1181
1062 mutex_lock(&fs_info->qgroup_ioctl_lock); 1182 tmp = ulist_alloc(GFP_NOFS);
1183 if (!tmp)
1184 return -ENOMEM;
1185
1063 quota_root = fs_info->quota_root; 1186 quota_root = fs_info->quota_root;
1064 if (!quota_root) { 1187 if (!quota_root) {
1065 ret = -EINVAL; 1188 ret = -EINVAL;
@@ -1088,14 +1211,27 @@ exist:
1088 1211
1089 spin_lock(&fs_info->qgroup_lock); 1212 spin_lock(&fs_info->qgroup_lock);
1090 del_relation_rb(fs_info, src, dst); 1213 del_relation_rb(fs_info, src, dst);
1214 ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
1091 spin_unlock(&fs_info->qgroup_lock); 1215 spin_unlock(&fs_info->qgroup_lock);
1092out: 1216out:
1217 ulist_free(tmp);
1218 return ret;
1219}
1220
1221int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
1222 struct btrfs_fs_info *fs_info, u64 src, u64 dst)
1223{
1224 int ret = 0;
1225
1226 mutex_lock(&fs_info->qgroup_ioctl_lock);
1227 ret = __del_qgroup_relation(trans, fs_info, src, dst);
1093 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1228 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1229
1094 return ret; 1230 return ret;
1095} 1231}
1096 1232
1097int btrfs_create_qgroup(struct btrfs_trans_handle *trans, 1233int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
1098 struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) 1234 struct btrfs_fs_info *fs_info, u64 qgroupid)
1099{ 1235{
1100 struct btrfs_root *quota_root; 1236 struct btrfs_root *quota_root;
1101 struct btrfs_qgroup *qgroup; 1237 struct btrfs_qgroup *qgroup;
@@ -1133,6 +1269,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
1133{ 1269{
1134 struct btrfs_root *quota_root; 1270 struct btrfs_root *quota_root;
1135 struct btrfs_qgroup *qgroup; 1271 struct btrfs_qgroup *qgroup;
1272 struct btrfs_qgroup_list *list;
1136 int ret = 0; 1273 int ret = 0;
1137 1274
1138 mutex_lock(&fs_info->qgroup_ioctl_lock); 1275 mutex_lock(&fs_info->qgroup_ioctl_lock);
@@ -1147,15 +1284,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
1147 ret = -ENOENT; 1284 ret = -ENOENT;
1148 goto out; 1285 goto out;
1149 } else { 1286 } else {
1150 /* check if there are no relations to this qgroup */ 1287 /* check if there are no children of this qgroup */
1151 if (!list_empty(&qgroup->groups) || 1288 if (!list_empty(&qgroup->members)) {
1152 !list_empty(&qgroup->members)) {
1153 ret = -EBUSY; 1289 ret = -EBUSY;
1154 goto out; 1290 goto out;
1155 } 1291 }
1156 } 1292 }
1157 ret = del_qgroup_item(trans, quota_root, qgroupid); 1293 ret = del_qgroup_item(trans, quota_root, qgroupid);
1158 1294
1295 while (!list_empty(&qgroup->groups)) {
1296 list = list_first_entry(&qgroup->groups,
1297 struct btrfs_qgroup_list, next_group);
1298 ret = __del_qgroup_relation(trans, fs_info,
1299 qgroupid,
1300 list->group->qgroupid);
1301 if (ret)
1302 goto out;
1303 }
1304
1159 spin_lock(&fs_info->qgroup_lock); 1305 spin_lock(&fs_info->qgroup_lock);
1160 del_qgroup_rb(quota_root->fs_info, qgroupid); 1306 del_qgroup_rb(quota_root->fs_info, qgroupid);
1161 spin_unlock(&fs_info->qgroup_lock); 1307 spin_unlock(&fs_info->qgroup_lock);
@@ -1184,23 +1330,27 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
1184 ret = -ENOENT; 1330 ret = -ENOENT;
1185 goto out; 1331 goto out;
1186 } 1332 }
1187 ret = update_qgroup_limit_item(trans, quota_root, qgroupid, 1333
1188 limit->flags, limit->max_rfer, 1334 spin_lock(&fs_info->qgroup_lock);
1189 limit->max_excl, limit->rsv_rfer, 1335 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
1190 limit->rsv_excl); 1336 qgroup->max_rfer = limit->max_rfer;
1337 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
1338 qgroup->max_excl = limit->max_excl;
1339 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
1340 qgroup->rsv_rfer = limit->rsv_rfer;
1341 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
1342 qgroup->rsv_excl = limit->rsv_excl;
1343 qgroup->lim_flags |= limit->flags;
1344
1345 spin_unlock(&fs_info->qgroup_lock);
1346
1347 ret = update_qgroup_limit_item(trans, quota_root, qgroup);
1191 if (ret) { 1348 if (ret) {
1192 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1349 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
1193 btrfs_info(fs_info, "unable to update quota limit for %llu", 1350 btrfs_info(fs_info, "unable to update quota limit for %llu",
1194 qgroupid); 1351 qgroupid);
1195 } 1352 }
1196 1353
1197 spin_lock(&fs_info->qgroup_lock);
1198 qgroup->lim_flags = limit->flags;
1199 qgroup->max_rfer = limit->max_rfer;
1200 qgroup->max_excl = limit->max_excl;
1201 qgroup->rsv_rfer = limit->rsv_rfer;
1202 qgroup->rsv_excl = limit->rsv_excl;
1203 spin_unlock(&fs_info->qgroup_lock);
1204out: 1354out:
1205 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1355 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1206 return ret; 1356 return ret;
@@ -1256,14 +1406,14 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
1256 return -1; 1406 return -1;
1257 if (oper1->bytenr > oper2->bytenr) 1407 if (oper1->bytenr > oper2->bytenr)
1258 return 1; 1408 return 1;
1259 if (oper1->seq < oper2->seq)
1260 return -1;
1261 if (oper1->seq > oper2->seq)
1262 return 1;
1263 if (oper1->ref_root < oper2->ref_root) 1409 if (oper1->ref_root < oper2->ref_root)
1264 return -1; 1410 return -1;
1265 if (oper1->ref_root > oper2->ref_root) 1411 if (oper1->ref_root > oper2->ref_root)
1266 return 1; 1412 return 1;
1413 if (oper1->seq < oper2->seq)
1414 return -1;
1415 if (oper1->seq > oper2->seq)
1416 return 1;
1267 if (oper1->type < oper2->type) 1417 if (oper1->type < oper2->type)
1268 return -1; 1418 return -1;
1269 if (oper1->type > oper2->type) 1419 if (oper1->type > oper2->type)
@@ -1372,19 +1522,10 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1372 return 0; 1522 return 0;
1373} 1523}
1374 1524
1375/*
1376 * The easy accounting, if we are adding/removing the only ref for an extent
1377 * then this qgroup and all of the parent qgroups get their refrence and
1378 * exclusive counts adjusted.
1379 */
1380static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, 1525static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1381 struct btrfs_qgroup_operation *oper) 1526 struct btrfs_qgroup_operation *oper)
1382{ 1527{
1383 struct btrfs_qgroup *qgroup;
1384 struct ulist *tmp; 1528 struct ulist *tmp;
1385 struct btrfs_qgroup_list *glist;
1386 struct ulist_node *unode;
1387 struct ulist_iterator uiter;
1388 int sign = 0; 1529 int sign = 0;
1389 int ret = 0; 1530 int ret = 0;
1390 1531
@@ -1395,9 +1536,7 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1395 spin_lock(&fs_info->qgroup_lock); 1536 spin_lock(&fs_info->qgroup_lock);
1396 if (!fs_info->quota_root) 1537 if (!fs_info->quota_root)
1397 goto out; 1538 goto out;
1398 qgroup = find_qgroup_rb(fs_info, oper->ref_root); 1539
1399 if (!qgroup)
1400 goto out;
1401 switch (oper->type) { 1540 switch (oper->type) {
1402 case BTRFS_QGROUP_OPER_ADD_EXCL: 1541 case BTRFS_QGROUP_OPER_ADD_EXCL:
1403 sign = 1; 1542 sign = 1;
@@ -1408,43 +1547,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1408 default: 1547 default:
1409 ASSERT(0); 1548 ASSERT(0);
1410 } 1549 }
1411 qgroup->rfer += sign * oper->num_bytes; 1550 ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
1412 qgroup->rfer_cmpr += sign * oper->num_bytes; 1551 oper->num_bytes, sign);
1413
1414 WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
1415 qgroup->excl += sign * oper->num_bytes;
1416 qgroup->excl_cmpr += sign * oper->num_bytes;
1417
1418 qgroup_dirty(fs_info, qgroup);
1419
1420 /* Get all of the parent groups that contain this qgroup */
1421 list_for_each_entry(glist, &qgroup->groups, next_group) {
1422 ret = ulist_add(tmp, glist->group->qgroupid,
1423 ptr_to_u64(glist->group), GFP_ATOMIC);
1424 if (ret < 0)
1425 goto out;
1426 }
1427
1428 /* Iterate all of the parents and adjust their reference counts */
1429 ULIST_ITER_INIT(&uiter);
1430 while ((unode = ulist_next(tmp, &uiter))) {
1431 qgroup = u64_to_ptr(unode->aux);
1432 qgroup->rfer += sign * oper->num_bytes;
1433 qgroup->rfer_cmpr += sign * oper->num_bytes;
1434 WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
1435 qgroup->excl += sign * oper->num_bytes;
1436 qgroup->excl_cmpr += sign * oper->num_bytes;
1437 qgroup_dirty(fs_info, qgroup);
1438
1439 /* Add any parents of the parents */
1440 list_for_each_entry(glist, &qgroup->groups, next_group) {
1441 ret = ulist_add(tmp, glist->group->qgroupid,
1442 ptr_to_u64(glist->group), GFP_ATOMIC);
1443 if (ret < 0)
1444 goto out;
1445 }
1446 }
1447 ret = 0;
1448out: 1552out:
1449 spin_unlock(&fs_info->qgroup_lock); 1553 spin_unlock(&fs_info->qgroup_lock);
1450 ulist_free(tmp); 1554 ulist_free(tmp);
@@ -1845,7 +1949,7 @@ static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
1845 struct ulist *roots = NULL; 1949 struct ulist *roots = NULL;
1846 struct ulist *qgroups, *tmp; 1950 struct ulist *qgroups, *tmp;
1847 struct btrfs_qgroup *qgroup; 1951 struct btrfs_qgroup *qgroup;
1848 struct seq_list elem = {}; 1952 struct seq_list elem = SEQ_LIST_INIT(elem);
1849 u64 seq; 1953 u64 seq;
1850 int old_roots = 0; 1954 int old_roots = 0;
1851 int new_roots = 0; 1955 int new_roots = 0;
@@ -1967,7 +2071,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
1967 int err; 2071 int err;
1968 struct btrfs_qgroup *qg; 2072 struct btrfs_qgroup *qg;
1969 u64 root_obj = 0; 2073 u64 root_obj = 0;
1970 struct seq_list elem = {}; 2074 struct seq_list elem = SEQ_LIST_INIT(elem);
1971 2075
1972 parents = ulist_alloc(GFP_NOFS); 2076 parents = ulist_alloc(GFP_NOFS);
1973 if (!parents) 2077 if (!parents)
@@ -2156,6 +2260,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
2156 if (ret) 2260 if (ret)
2157 fs_info->qgroup_flags |= 2261 fs_info->qgroup_flags |=
2158 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2262 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2263 ret = update_qgroup_limit_item(trans, quota_root, qgroup);
2264 if (ret)
2265 fs_info->qgroup_flags |=
2266 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2159 spin_lock(&fs_info->qgroup_lock); 2267 spin_lock(&fs_info->qgroup_lock);
2160 } 2268 }
2161 if (fs_info->quota_enabled) 2269 if (fs_info->quota_enabled)
@@ -2219,6 +2327,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2219 ret = -EINVAL; 2327 ret = -EINVAL;
2220 goto out; 2328 goto out;
2221 } 2329 }
2330
2331 if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
2332 ret = -EINVAL;
2333 goto out;
2334 }
2222 ++i_qgroups; 2335 ++i_qgroups;
2223 } 2336 }
2224 } 2337 }
@@ -2230,17 +2343,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2230 if (ret) 2343 if (ret)
2231 goto out; 2344 goto out;
2232 2345
2233 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2234 ret = update_qgroup_limit_item(trans, quota_root, objectid,
2235 inherit->lim.flags,
2236 inherit->lim.max_rfer,
2237 inherit->lim.max_excl,
2238 inherit->lim.rsv_rfer,
2239 inherit->lim.rsv_excl);
2240 if (ret)
2241 goto out;
2242 }
2243
2244 if (srcid) { 2346 if (srcid) {
2245 struct btrfs_root *srcroot; 2347 struct btrfs_root *srcroot;
2246 struct btrfs_key srckey; 2348 struct btrfs_key srckey;
@@ -2286,6 +2388,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2286 goto unlock; 2388 goto unlock;
2287 } 2389 }
2288 2390
2391 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
2392 dstgroup->lim_flags = inherit->lim.flags;
2393 dstgroup->max_rfer = inherit->lim.max_rfer;
2394 dstgroup->max_excl = inherit->lim.max_excl;
2395 dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
2396 dstgroup->rsv_excl = inherit->lim.rsv_excl;
2397
2398 ret = update_qgroup_limit_item(trans, quota_root, dstgroup);
2399 if (ret) {
2400 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2401 btrfs_info(fs_info, "unable to update quota limit for %llu",
2402 dstgroup->qgroupid);
2403 goto unlock;
2404 }
2405 }
2406
2289 if (srcid) { 2407 if (srcid) {
2290 srcgroup = find_qgroup_rb(fs_info, srcid); 2408 srcgroup = find_qgroup_rb(fs_info, srcid);
2291 if (!srcgroup) 2409 if (!srcgroup)
@@ -2302,6 +2420,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2302 dstgroup->excl_cmpr = level_size; 2420 dstgroup->excl_cmpr = level_size;
2303 srcgroup->excl = level_size; 2421 srcgroup->excl = level_size;
2304 srcgroup->excl_cmpr = level_size; 2422 srcgroup->excl_cmpr = level_size;
2423
2424 /* inherit the limit info */
2425 dstgroup->lim_flags = srcgroup->lim_flags;
2426 dstgroup->max_rfer = srcgroup->max_rfer;
2427 dstgroup->max_excl = srcgroup->max_excl;
2428 dstgroup->rsv_rfer = srcgroup->rsv_rfer;
2429 dstgroup->rsv_excl = srcgroup->rsv_excl;
2430
2305 qgroup_dirty(fs_info, dstgroup); 2431 qgroup_dirty(fs_info, dstgroup);
2306 qgroup_dirty(fs_info, srcgroup); 2432 qgroup_dirty(fs_info, srcgroup);
2307 } 2433 }
@@ -2358,12 +2484,6 @@ out:
2358 return ret; 2484 return ret;
2359} 2485}
2360 2486
2361/*
2362 * reserve some space for a qgroup and all its parents. The reservation takes
2363 * place with start_transaction or dealloc_reserve, similar to ENOSPC
2364 * accounting. If not enough space is available, EDQUOT is returned.
2365 * We assume that the requested space is new for all qgroups.
2366 */
2367int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) 2487int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
2368{ 2488{
2369 struct btrfs_root *quota_root; 2489 struct btrfs_root *quota_root;
@@ -2513,7 +2633,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
2513 2633
2514/* 2634/*
2515 * returns < 0 on error, 0 when more leafs are to be scanned. 2635 * returns < 0 on error, 0 when more leafs are to be scanned.
2516 * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. 2636 * returns 1 when done.
2517 */ 2637 */
2518static int 2638static int
2519qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 2639qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
@@ -2522,7 +2642,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2522{ 2642{
2523 struct btrfs_key found; 2643 struct btrfs_key found;
2524 struct ulist *roots = NULL; 2644 struct ulist *roots = NULL;
2525 struct seq_list tree_mod_seq_elem = {}; 2645 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
2526 u64 num_bytes; 2646 u64 num_bytes;
2527 u64 seq; 2647 u64 seq;
2528 int new_roots; 2648 int new_roots;
@@ -2618,6 +2738,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2618 struct ulist *tmp = NULL, *qgroups = NULL; 2738 struct ulist *tmp = NULL, *qgroups = NULL;
2619 struct extent_buffer *scratch_leaf = NULL; 2739 struct extent_buffer *scratch_leaf = NULL;
2620 int err = -ENOMEM; 2740 int err = -ENOMEM;
2741 int ret = 0;
2621 2742
2622 path = btrfs_alloc_path(); 2743 path = btrfs_alloc_path();
2623 if (!path) 2744 if (!path)
@@ -2660,7 +2781,7 @@ out:
2660 mutex_lock(&fs_info->qgroup_rescan_lock); 2781 mutex_lock(&fs_info->qgroup_rescan_lock);
2661 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2782 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2662 2783
2663 if (err == 2 && 2784 if (err > 0 &&
2664 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 2785 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
2665 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2786 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
2666 } else if (err < 0) { 2787 } else if (err < 0) {
@@ -2668,13 +2789,33 @@ out:
2668 } 2789 }
2669 mutex_unlock(&fs_info->qgroup_rescan_lock); 2790 mutex_unlock(&fs_info->qgroup_rescan_lock);
2670 2791
2792 /*
 2793 * only update status, since the previous part has already updated the
2794 * qgroup info.
2795 */
2796 trans = btrfs_start_transaction(fs_info->quota_root, 1);
2797 if (IS_ERR(trans)) {
2798 err = PTR_ERR(trans);
2799 btrfs_err(fs_info,
2800 "fail to start transaction for status update: %d\n",
2801 err);
2802 goto done;
2803 }
2804 ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
2805 if (ret < 0) {
2806 err = ret;
2807 btrfs_err(fs_info, "fail to update qgroup status: %d\n", err);
2808 }
2809 btrfs_end_transaction(trans, fs_info->quota_root);
2810
2671 if (err >= 0) { 2811 if (err >= 0) {
2672 btrfs_info(fs_info, "qgroup scan completed%s", 2812 btrfs_info(fs_info, "qgroup scan completed%s",
2673 err == 2 ? " (inconsistency flag cleared)" : ""); 2813 err > 0 ? " (inconsistency flag cleared)" : "");
2674 } else { 2814 } else {
2675 btrfs_err(fs_info, "qgroup scan failed with %d", err); 2815 btrfs_err(fs_info, "qgroup scan failed with %d", err);
2676 } 2816 }
2677 2817
2818done:
2678 complete_all(&fs_info->qgroup_rescan_completion); 2819 complete_all(&fs_info->qgroup_rescan_completion);
2679} 2820}
2680 2821
@@ -2709,7 +2850,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2709 mutex_unlock(&fs_info->qgroup_rescan_lock); 2850 mutex_unlock(&fs_info->qgroup_rescan_lock);
2710 goto err; 2851 goto err;
2711 } 2852 }
2712
2713 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 2853 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN;
2714 } 2854 }
2715 2855
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 18cc68ca3090..c5242aa9a4b2 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -70,8 +70,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
70int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, 70int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
71 struct btrfs_fs_info *fs_info, u64 src, u64 dst); 71 struct btrfs_fs_info *fs_info, u64 src, u64 dst);
72int btrfs_create_qgroup(struct btrfs_trans_handle *trans, 72int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
73 struct btrfs_fs_info *fs_info, u64 qgroupid, 73 struct btrfs_fs_info *fs_info, u64 qgroupid);
74 char *name);
75int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, 74int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
76 struct btrfs_fs_info *fs_info, u64 qgroupid); 75 struct btrfs_fs_info *fs_info, u64 qgroupid);
77int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, 76int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5264858ed768..fa72068bd256 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
237 } 237 }
238 238
239 x = cmpxchg(&info->stripe_hash_table, NULL, table); 239 x = cmpxchg(&info->stripe_hash_table, NULL, table);
240 if (x) { 240 if (x)
241 if (is_vmalloc_addr(x)) 241 kvfree(x);
242 vfree(x);
243 else
244 kfree(x);
245 }
246 return 0; 242 return 0;
247} 243}
248 244
@@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
453 if (!info->stripe_hash_table) 449 if (!info->stripe_hash_table)
454 return; 450 return;
455 btrfs_clear_rbio_cache(info); 451 btrfs_clear_rbio_cache(info);
456 if (is_vmalloc_addr(info->stripe_hash_table)) 452 kvfree(info->stripe_hash_table);
457 vfree(info->stripe_hash_table);
458 else
459 kfree(info->stripe_hash_table);
460 info->stripe_hash_table = NULL; 453 info->stripe_hash_table = NULL;
461} 454}
462 455
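kvfree() subsumes the open-coded is_vmalloc_addr() dance removed in the two hunks above: it inspects the pointer itself and routes to vfree() or kfree() as appropriate, making it the natural release for "try kmalloc, fall back to vmalloc" allocations like the stripe hash table. The pairing, sketched in kernel idiom (not the surrounding btrfs code itself):

/* allocate: try kmalloc first, quietly fall back to vmalloc */
table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN);
if (!table)
        table = vzalloc(table_size);
if (!table)
        return -ENOMEM;

/* ... use table ... */

/* release: kvfree() picks vfree() or kfree() from the address itself */
kvfree(table);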
@@ -1807,8 +1800,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1807 int err; 1800 int err;
1808 int i; 1801 int i;
1809 1802
1810 pointers = kzalloc(rbio->real_stripes * sizeof(void *), 1803 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1811 GFP_NOFS);
1812 if (!pointers) { 1804 if (!pointers) {
1813 err = -ENOMEM; 1805 err = -ENOMEM;
1814 goto cleanup_io; 1806 goto cleanup_io;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d83085381bcc..74b24b01d574 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3027,7 +3027,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
3027 mutex_lock(&inode->i_mutex); 3027 mutex_lock(&inode->i_mutex);
3028 3028
3029 ret = btrfs_check_data_free_space(inode, cluster->end + 3029 ret = btrfs_check_data_free_space(inode, cluster->end +
3030 1 - cluster->start); 3030 1 - cluster->start, 0);
3031 if (ret) 3031 if (ret)
3032 goto out; 3032 goto out;
3033 3033
@@ -3430,7 +3430,9 @@ static int block_use_full_backref(struct reloc_control *rc,
3430} 3430}
3431 3431
3432static int delete_block_group_cache(struct btrfs_fs_info *fs_info, 3432static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3433 struct inode *inode, u64 ino) 3433 struct btrfs_block_group_cache *block_group,
3434 struct inode *inode,
3435 u64 ino)
3434{ 3436{
3435 struct btrfs_key key; 3437 struct btrfs_key key;
3436 struct btrfs_root *root = fs_info->tree_root; 3438 struct btrfs_root *root = fs_info->tree_root;
@@ -3463,7 +3465,7 @@ truncate:
3463 goto out; 3465 goto out;
3464 } 3466 }
3465 3467
3466 ret = btrfs_truncate_free_space_cache(root, trans, inode); 3468 ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode);
3467 3469
3468 btrfs_end_transaction(trans, root); 3470 btrfs_end_transaction(trans, root);
3469 btrfs_btree_balance_dirty(root); 3471 btrfs_btree_balance_dirty(root);
@@ -3509,6 +3511,7 @@ static int find_data_references(struct reloc_control *rc,
3509 */ 3511 */
3510 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { 3512 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3511 ret = delete_block_group_cache(rc->extent_root->fs_info, 3513 ret = delete_block_group_cache(rc->extent_root->fs_info,
3514 rc->block_group,
3512 NULL, ref_objectid); 3515 NULL, ref_objectid);
3513 if (ret != -ENOENT) 3516 if (ret != -ENOENT)
3514 return ret; 3517 return ret;
@@ -4223,7 +4226,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4223 btrfs_free_path(path); 4226 btrfs_free_path(path);
4224 4227
4225 if (!IS_ERR(inode)) 4228 if (!IS_ERR(inode))
4226 ret = delete_block_group_cache(fs_info, inode, 0); 4229 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4227 else 4230 else
4228 ret = PTR_ERR(inode); 4231 ret = PTR_ERR(inode);
4229 4232
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index ec57687c9a4d..ab5811545a98 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
964 * the statistics. 964 * the statistics.
965 */ 965 */
966 966
967 sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * 967 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
968 sizeof(*sblocks_for_recheck), 968 sizeof(*sblocks_for_recheck), GFP_NOFS);
969 GFP_NOFS);
970 if (!sblocks_for_recheck) { 969 if (!sblocks_for_recheck) {
971 spin_lock(&sctx->stat_lock); 970 spin_lock(&sctx->stat_lock);
972 sctx->stat.malloc_errors++; 971 sctx->stat.malloc_errors++;
@@ -2319,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2319 unsigned long *bitmap, 2318 unsigned long *bitmap,
2320 u64 start, u64 len) 2319 u64 start, u64 len)
2321{ 2320{
2322 int offset; 2321 u32 offset;
2323 int nsectors; 2322 int nsectors;
2324 int sectorsize = sparity->sctx->dev_root->sectorsize; 2323 int sectorsize = sparity->sctx->dev_root->sectorsize;
2325 2324
@@ -2329,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2329 } 2328 }
2330 2329
2331 start -= sparity->logic_start; 2330 start -= sparity->logic_start;
2332 offset = (int)do_div(start, sparity->stripe_len); 2331 start = div_u64_rem(start, sparity->stripe_len, &offset);
2333 offset /= sectorsize; 2332 offset /= sectorsize;
2334 nsectors = (int)len / sectorsize; 2333 nsectors = (int)len / sectorsize;
2335 2334
@@ -2612,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num,
2612 int j = 0; 2611 int j = 0;
2613 u64 stripe_nr; 2612 u64 stripe_nr;
2614 u64 last_offset; 2613 u64 last_offset;
2615 int stripe_index; 2614 u32 stripe_index;
2616 int rot; 2615 u32 rot;
2617 2616
2618 last_offset = (physical - map->stripes[num].physical) * 2617 last_offset = (physical - map->stripes[num].physical) *
2619 nr_data_stripes(map); 2618 nr_data_stripes(map);
@@ -2624,12 +2623,11 @@ static int get_raid56_logic_offset(u64 physical, int num,
2624 for (i = 0; i < nr_data_stripes(map); i++) { 2623 for (i = 0; i < nr_data_stripes(map); i++) {
2625 *offset = last_offset + i * map->stripe_len; 2624 *offset = last_offset + i * map->stripe_len;
2626 2625
2627 stripe_nr = *offset; 2626 stripe_nr = div_u64(*offset, map->stripe_len);
2628 do_div(stripe_nr, map->stripe_len); 2627 stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2629 do_div(stripe_nr, nr_data_stripes(map));
2630 2628
2631 /* Work out the disk rotation on this stripe-set */ 2629 /* Work out the disk rotation on this stripe-set */
2632 rot = do_div(stripe_nr, map->num_stripes); 2630 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2633 /* calculate which stripe this data locates */ 2631 /* calculate which stripe this data locates */
2634 rot += i; 2632 rot += i;
2635 stripe_index = rot % map->num_stripes; 2633 stripe_index = rot % map->num_stripes;
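div_u64_rem(), used above, returns the quotient and hands the remainder back through a u32 pointer, which is what lets rot (the disk rotation within the stripe set) fall out of the same division that advances stripe_nr. A userspace stand-in with the same shape:

#include <stdint.h>
#include <stdio.h>

/* stand-in with div_u64_rem's shape: quotient returned, remainder by pointer */
static uint64_t div_u64_rem(uint64_t dividend, uint32_t divisor, uint32_t *rem)
{
        *rem = dividend % divisor;
        return dividend / divisor;
}

int main(void)
{
        uint32_t rot;
        uint64_t stripe_nr = div_u64_rem(17, 5, &rot);

        /* 17 = 3 * 5 + 2: stripe_nr advances, rot gives the rotation */
        printf("stripe_nr=%llu rot=%u\n", (unsigned long long)stripe_nr, rot);
        return 0;
}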
@@ -2995,10 +2993,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2995 int extent_mirror_num; 2993 int extent_mirror_num;
2996 int stop_loop = 0; 2994 int stop_loop = 0;
2997 2995
2998 nstripes = length;
2999 physical = map->stripes[num].physical; 2996 physical = map->stripes[num].physical;
3000 offset = 0; 2997 offset = 0;
3001 do_div(nstripes, map->stripe_len); 2998 nstripes = div_u64(length, map->stripe_len);
3002 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 2999 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3003 offset = map->stripe_len * num; 3000 offset = map->stripe_len * num;
3004 increment = map->stripe_len * map->num_stripes; 3001 increment = map->stripe_len * map->num_stripes;
@@ -3563,7 +3560,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3563 int is_dev_replace) 3560 int is_dev_replace)
3564{ 3561{
3565 int ret = 0; 3562 int ret = 0;
3566 int flags = WQ_FREEZABLE | WQ_UNBOUND; 3563 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
3567 int max_active = fs_info->thread_pool_size; 3564 int max_active = fs_info->thread_pool_size;
3568 3565
3569 if (fs_info->scrub_workers_refcnt == 0) { 3566 if (fs_info->scrub_workers_refcnt == 0) {
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d6033f540cc7..a1216f9b4917 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -3067,48 +3067,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3067 return NULL; 3067 return NULL;
3068} 3068}
3069 3069
3070static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3071 u64 ino, u64 gen, u64 *ancestor_ino)
3072{
3073 int ret = 0;
3074 u64 parent_inode = 0;
3075 u64 parent_gen = 0;
3076 u64 start_ino = ino;
3077
3078 *ancestor_ino = 0;
3079 while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3080 fs_path_reset(name);
3081
3082 if (is_waiting_for_rm(sctx, ino))
3083 break;
3084 if (is_waiting_for_move(sctx, ino)) {
3085 if (*ancestor_ino == 0)
3086 *ancestor_ino = ino;
3087 ret = get_first_ref(sctx->parent_root, ino,
3088 &parent_inode, &parent_gen, name);
3089 } else {
3090 ret = __get_cur_name_and_parent(sctx, ino, gen,
3091 &parent_inode,
3092 &parent_gen, name);
3093 if (ret > 0) {
3094 ret = 0;
3095 break;
3096 }
3097 }
3098 if (ret < 0)
3099 break;
3100 if (parent_inode == start_ino) {
3101 ret = 1;
3102 if (*ancestor_ino == 0)
3103 *ancestor_ino = ino;
3104 break;
3105 }
3106 ino = parent_inode;
3107 gen = parent_gen;
3108 }
3109 return ret;
3110}
3111
3112static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) 3070static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3113{ 3071{
3114 struct fs_path *from_path = NULL; 3072 struct fs_path *from_path = NULL;
@@ -3120,7 +3078,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3120 struct waiting_dir_move *dm = NULL; 3078 struct waiting_dir_move *dm = NULL;
3121 u64 rmdir_ino = 0; 3079 u64 rmdir_ino = 0;
3122 int ret; 3080 int ret;
3123 u64 ancestor = 0;
3124 3081
3125 name = fs_path_alloc(); 3082 name = fs_path_alloc();
3126 from_path = fs_path_alloc(); 3083 from_path = fs_path_alloc();
@@ -3152,22 +3109,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3152 goto out; 3109 goto out;
3153 3110
3154 sctx->send_progress = sctx->cur_ino + 1; 3111 sctx->send_progress = sctx->cur_ino + 1;
3155 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3156 if (ret) {
3157 LIST_HEAD(deleted_refs);
3158 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3159 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3160 &pm->update_refs, &deleted_refs,
3161 pm->is_orphan);
3162 if (ret < 0)
3163 goto out;
3164 if (rmdir_ino) {
3165 dm = get_waiting_dir_move(sctx, pm->ino);
3166 ASSERT(dm);
3167 dm->rmdir_ino = rmdir_ino;
3168 }
3169 goto out;
3170 }
3171 fs_path_reset(name); 3112 fs_path_reset(name);
3172 to_path = name; 3113 to_path = name;
3173 name = NULL; 3114 name = NULL;
@@ -3610,10 +3551,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
3610 if (ret < 0) 3551 if (ret < 0)
3611 goto out; 3552 goto out;
3612 if (ret) { 3553 if (ret) {
3554 struct name_cache_entry *nce;
3555
3613 ret = orphanize_inode(sctx, ow_inode, ow_gen, 3556 ret = orphanize_inode(sctx, ow_inode, ow_gen,
3614 cur->full_path); 3557 cur->full_path);
3615 if (ret < 0) 3558 if (ret < 0)
3616 goto out; 3559 goto out;
3560 /*
3561 * Make sure we clear our orphanized inode's
3562 * name from the name cache. This is because the
3563 * inode ow_inode might be an ancestor of some
3564 * other inode that will be orphanized as well
3565 * later and has an inode number greater than
3566 * sctx->send_progress. We need to prevent
3567 * future name lookups from using the old name
3568 * and get instead the orphan name.
3569 */
3570 nce = name_cache_search(sctx, ow_inode, ow_gen);
3571 if (nce) {
3572 name_cache_delete(sctx, nce);
3573 kfree(nce);
3574 }
3617 } else { 3575 } else {
3618 ret = send_unlink(sctx, cur->full_path); 3576 ret = send_unlink(sctx, cur->full_path);
3619 if (ret < 0) 3577 if (ret < 0)
@@ -5852,19 +5810,20 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 			ret = PTR_ERR(clone_root);
 			goto out;
 		}
-		clone_sources_to_rollback = i + 1;
 		spin_lock(&clone_root->root_item_lock);
-		clone_root->send_in_progress++;
-		if (!btrfs_root_readonly(clone_root)) {
+		if (!btrfs_root_readonly(clone_root) ||
+		    btrfs_root_dead(clone_root)) {
 			spin_unlock(&clone_root->root_item_lock);
 			srcu_read_unlock(&fs_info->subvol_srcu, index);
 			ret = -EPERM;
 			goto out;
 		}
+		clone_root->send_in_progress++;
 		spin_unlock(&clone_root->root_item_lock);
 		srcu_read_unlock(&fs_info->subvol_srcu, index);

 		sctx->clone_roots[i].root = clone_root;
+		clone_sources_to_rollback = i + 1;
 	}
 	vfree(clone_sources_tmp);
 	clone_sources_tmp = NULL;
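The reordering in this hunk follows a common locking idiom: perform every check that can fail while holding the lock, and only take the reference once all checks pass, so the error path has nothing to unwind. A self-contained pthread sketch of the idiom; the struct and field names are illustrative, not btrfs code.

	#include <pthread.h>
	#include <errno.h>

	struct root {
		pthread_spinlock_t lock;	/* assumed initialized */
		int readonly;
		int dead;
		int send_in_progress;
	};

	static int start_send(struct root *r)
	{
		pthread_spin_lock(&r->lock);
		if (!r->readonly || r->dead) {
			pthread_spin_unlock(&r->lock);
			return -EPERM;		/* nothing to roll back */
		}
		r->send_in_progress++;		/* accounted only on success */
		pthread_spin_unlock(&r->lock);
		return 0;
	}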
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 05fef198ff94..f2c9f9db3b19 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -901,6 +901,15 @@ find_root:
 	if (IS_ERR(new_root))
 		return ERR_CAST(new_root);

+	if (!(sb->s_flags & MS_RDONLY)) {
+		int ret;
+		down_read(&fs_info->cleanup_work_sem);
+		ret = btrfs_orphan_cleanup(new_root);
+		up_read(&fs_info->cleanup_work_sem);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
 	dir_id = btrfs_root_dirid(&new_root->root_item);
 setup_root:
 	location.objectid = dir_id;
@@ -1714,7 +1723,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 		avail_space = device->total_bytes - device->bytes_used;

 		/* align with stripe_len */
-		do_div(avail_space, BTRFS_STRIPE_LEN);
+		avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN);
 		avail_space *= BTRFS_STRIPE_LEN;

 		/*
@@ -1908,6 +1917,17 @@ static struct file_system_type btrfs_fs_type = {
 };
 MODULE_ALIAS_FS("btrfs");

+static int btrfs_control_open(struct inode *inode, struct file *file)
+{
+	/*
+	 * The control file's private_data is used to hold the
+	 * transaction when it is started and is used to keep
+	 * track of whether a transaction is already in progress.
+	 */
+	file->private_data = NULL;
+	return 0;
+}
+
 /*
  * used by btrfsctl to scan devices when no FS is mounted
  */
@@ -2009,6 +2029,7 @@ static const struct super_operations btrfs_super_ops = {
 };

 static const struct file_operations btrfs_ctl_fops = {
+	.open = btrfs_control_open,
 	.unlocked_ioctl	 = btrfs_control_ioctl,
 	.compat_ioctl = btrfs_control_ioctl,
 	.owner	 = THIS_MODULE,
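For context on the new .open hook: the misc-device layer can preload file->private_data, and the control device's ioctls use that field for transaction state, so resetting it at open time keeps the two from clashing. A minimal sketch of the convention with hypothetical demo_* names, not the btrfs code.

	#include <linux/fs.h>
	#include <linux/module.h>

	static int demo_open(struct inode *inode, struct file *file)
	{
		/* claim private_data at open so later ioctls can rely on
		 * NULL meaning "no transaction started yet" */
		file->private_data = NULL;
		return 0;
	}

	static long demo_ioctl(struct file *file, unsigned int cmd,
			       unsigned long arg)
	{
		return file->private_data ? -EBUSY : 0;
	}

	static const struct file_operations demo_fops = {
		.owner		= THIS_MODULE,
		.open		= demo_open,
		.unlocked_ioctl	= demo_ioctl,
	};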
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 94edb0a2a026..e8a4c86d274d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -459,7 +459,7 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
 static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
 static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];

-static u64 supported_feature_masks[3] = {
+static const u64 supported_feature_masks[3] = {
 	[FEAT_COMPAT]    = BTRFS_FEATURE_COMPAT_SUPP,
 	[FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
 	[FEAT_INCOMPAT]  = BTRFS_FEATURE_INCOMPAT_SUPP,
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index f7dd298b3cf6..3a4bbed723fd 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -61,11 +61,23 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \
 	BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)

 /* convert from attribute */
-#define to_btrfs_feature_attr(a) \
-			container_of(a, struct btrfs_feature_attr, kobj_attr)
-#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
-#define attr_to_btrfs_feature_attr(a) \
-			to_btrfs_feature_attr(attr_to_btrfs_attr(a))
+static inline struct btrfs_feature_attr *
+to_btrfs_feature_attr(struct kobj_attribute *a)
+{
+	return container_of(a, struct btrfs_feature_attr, kobj_attr);
+}
+
+static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr)
+{
+	return container_of(attr, struct kobj_attribute, attr);
+}
+
+static inline struct btrfs_feature_attr *
+attr_to_btrfs_feature_attr(struct attribute *attr)
+{
+	return to_btrfs_feature_attr(attr_to_btrfs_attr(attr));
+}
+
 char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
 extern const char * const btrfs_feature_set_names[3];
 extern struct kobj_type space_info_ktype;
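The macro-to-static-inline conversion above buys type checking: a container_of macro accepts any pointer silently, while the inline function rejects a mismatched argument at compile time. A self-contained illustration with made-up inner/outer types, not the btrfs structures:

	#include <stddef.h>

	struct inner { int x; };
	struct outer { int y; struct inner in; };

	#define to_outer_macro(p) \
		((struct outer *)((char *)(p) - offsetof(struct outer, in)))

	static inline struct outer *to_outer(struct inner *p)
	{
		return (struct outer *)((char *)p - offsetof(struct outer, in));
	}

	/* to_outer_macro(&o.y) compiles even though &o.y is an int *;
	 * to_outer(&o.y) is rejected with a type error. */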
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 73f299ebdabb..c32a7ba76bca 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -232,7 +232,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 	init_dummy_trans(&trans);

 	test_msg("Qgroup basic add\n");
-	ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+	ret = btrfs_create_qgroup(NULL, fs_info, 5);
 	if (ret) {
 		test_msg("Couldn't create a qgroup %d\n", ret);
 		return ret;
@@ -301,7 +301,7 @@ static int test_multiple_refs(struct btrfs_root *root)
 	test_msg("Qgroup multiple refs test\n");

 	/* We have 5 created already from the previous test */
-	ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+	ret = btrfs_create_qgroup(NULL, fs_info, 256);
 	if (ret) {
 		test_msg("Couldn't create a qgroup %d\n", ret);
 		return ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8be4278e25e8..5628e25250c0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -35,7 +35,7 @@

 #define BTRFS_ROOT_TRANS_TAG 0

-static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
 	[TRANS_STATE_RUNNING]		= 0U,
 	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
 					   __TRANS_START),
@@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 	if (atomic_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
 		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+		if (transaction->delayed_refs.pending_csums)
+			printk(KERN_ERR "pending csums is %llu\n",
+			       transaction->delayed_refs.pending_csums);
 		while (!list_empty(&transaction->pending_chunks)) {
 			struct extent_map *em;

@@ -93,11 +96,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree)
 		 */
 		ASSERT(!waitqueue_active(&state->wq));
 		free_extent_state(state);
-		if (need_resched()) {
-			spin_unlock(&tree->lock);
-			cond_resched();
-			spin_lock(&tree->lock);
-		}
+
+		cond_resched_lock(&tree->lock);
 	}
 	spin_unlock(&tree->lock);
 }
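cond_resched_lock() used above bundles the removed open-coded sequence: drop the spinlock, yield if a reschedule is pending, and reacquire the lock. Roughly, as a simplified sketch of its semantics rather than the kernel's exact implementation (the real helper also yields when spin_needbreak() reports lock contention):

	static inline int cond_resched_lock_sketch(spinlock_t *lock)
	{
		/* simplified; see the kernel's cond_resched_lock() */
		if (need_resched()) {
			spin_unlock(lock);
			cond_resched();
			spin_lock(lock);
			return 1;	/* the lock was dropped */
		}
		return 0;
	}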
@@ -222,10 +222,12 @@ loop:
 	atomic_set(&cur_trans->use_count, 2);
 	cur_trans->have_free_bgs = 0;
 	cur_trans->start_time = get_seconds();
+	cur_trans->dirty_bg_run = 0;

 	cur_trans->delayed_refs.href_root = RB_ROOT;
 	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
 	cur_trans->delayed_refs.num_heads_ready = 0;
+	cur_trans->delayed_refs.pending_csums = 0;
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
@@ -250,6 +252,9 @@ loop:
 	INIT_LIST_HEAD(&cur_trans->switch_commits);
 	INIT_LIST_HEAD(&cur_trans->pending_ordered);
 	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+	INIT_LIST_HEAD(&cur_trans->io_bgs);
+	mutex_init(&cur_trans->cache_write_mutex);
+	cur_trans->num_dirty_bgs = 0;
 	spin_lock_init(&cur_trans->dirty_bgs_lock);
 	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
@@ -721,7 +726,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 	updates = trans->delayed_ref_updates;
 	trans->delayed_ref_updates = 0;
 	if (updates) {
-		err = btrfs_run_delayed_refs(trans, root, updates);
+		err = btrfs_run_delayed_refs(trans, root, updates * 2);
 		if (err) /* Error code will also eval true */
 			return err;
 	}
@@ -1057,6 +1062,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
+	struct list_head *io_bgs = &trans->transaction->io_bgs;
 	struct list_head *next;
 	struct extent_buffer *eb;
 	int ret;
@@ -1110,7 +1116,7 @@ again:
 		return ret;
 	}

-	while (!list_empty(dirty_bgs)) {
+	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
 		ret = btrfs_write_dirty_block_groups(trans, root);
 		if (ret)
 			return ret;
@@ -1810,6 +1816,37 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 		return ret;
 	}

+	if (!cur_trans->dirty_bg_run) {
+		int run_it = 0;
+
+		/* this mutex is also taken before trying to set
+		 * block groups readonly.  We need to make sure
+		 * that nobody has set a block group readonly
+		 * after extents from that block group have been
+		 * allocated for cache files.  btrfs_set_block_group_ro
+		 * will wait for the transaction to commit if it
+		 * finds dirty_bg_run = 1
+		 *
+		 * The dirty_bg_run flag is also used to make sure only
+		 * one process starts all the block group IO.  It wouldn't
+		 * hurt to have more than one go through, but there's no
+		 * real advantage to it either.
+		 */
+		mutex_lock(&root->fs_info->ro_block_group_mutex);
+		if (!cur_trans->dirty_bg_run) {
+			run_it = 1;
+			cur_trans->dirty_bg_run = 1;
+		}
+		mutex_unlock(&root->fs_info->ro_block_group_mutex);
+
+		if (run_it)
+			ret = btrfs_start_dirty_block_groups(trans, root);
+	}
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
 	spin_lock(&root->fs_info->trans_lock);
 	list_splice(&trans->ordered, &cur_trans->pending_ordered);
 	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
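The dirty_bg_run logic above is a run-once guard: an unlocked test avoids the mutex in the common case, a locked retest elects exactly one winner, and the actual work runs outside the critical section. A self-contained pthread sketch of the same shape, with illustrative names:

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;
	static bool work_started;

	static void run_once(void (*work)(void))
	{
		bool run_it = false;

		if (!work_started) {		/* cheap unlocked check */
			pthread_mutex_lock(&guard);
			if (!work_started) {	/* recheck under the lock */
				work_started = true;
				run_it = true;
			}
			pthread_mutex_unlock(&guard);
		}
		if (run_it)
			work();			/* only one caller gets here */
	}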
@@ -2003,6 +2040,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,

 	assert_qgroups_uptodate(trans);
 	ASSERT(list_empty(&cur_trans->dirty_bgs));
+	ASSERT(list_empty(&cur_trans->io_bgs));
 	update_super_roots(root);

 	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 937050a2b68e..0b24755596ba 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -64,9 +64,19 @@ struct btrfs_transaction {
 	struct list_head pending_ordered;
 	struct list_head switch_commits;
 	struct list_head dirty_bgs;
+	struct list_head io_bgs;
+	u64 num_dirty_bgs;
+
+	/*
+	 * we need to make sure block group deletion doesn't race with
+	 * free space cache writeout.  This mutex keeps them from stomping
+	 * on each other
+	 */
+	struct mutex cache_write_mutex;
 	spinlock_t dirty_bgs_lock;
 	struct btrfs_delayed_ref_root delayed_refs;
 	int aborted;
+	int dirty_bg_run;
 };

 #define __TRANS_FREEZABLE	(1U << 0)
@@ -136,9 +146,11 @@ struct btrfs_pending_snapshot {
 static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 					      struct inode *inode)
 {
+	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+	spin_unlock(&BTRFS_I(inode)->lock);
 }

 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
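Wrapping the three assignments in BTRFS_I(inode)->lock makes them a single consistent snapshot for readers that take the same lock, instead of three independently torn updates. The idiom in a self-contained pthread sketch; the struct here is illustrative, not btrfs_inode:

	#include <pthread.h>

	struct trans_marks {
		pthread_spinlock_t lock;	/* assumed initialized */
		unsigned long long last_trans;
		unsigned long long last_sub_trans;
		unsigned long long last_log_commit;
	};

	static void set_marks(struct trans_marks *m, unsigned long long trans,
			      unsigned long long sub, unsigned long long commit)
	{
		pthread_spin_lock(&m->lock);
		m->last_trans = trans;		/* all three move together... */
		m->last_sub_trans = sub;
		m->last_log_commit = commit;
		pthread_spin_unlock(&m->lock);	/* ...or not at all */
	}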
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c5b8ba37f88e..a089b5944efc 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -492,11 +492,19 @@ insert:

 	if (btrfs_inode_generation(eb, src_item) == 0) {
 		struct extent_buffer *dst_eb = path->nodes[0];
+		const u64 ino_size = btrfs_inode_size(eb, src_item);

+		/*
+		 * For regular files an ino_size == 0 is used only when
+		 * logging that an inode exists, as part of a directory
+		 * fsync, and the inode wasn't fsynced before. In this
+		 * case don't set the size of the inode in the fs/subvol
+		 * tree, otherwise we would be throwing valid data away.
+		 */
 		if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
-		    S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+		    S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
+		    ino_size != 0) {
 			struct btrfs_map_token token;
-			u64 ino_size = btrfs_inode_size(eb, src_item);

 			btrfs_init_map_token(&token);
 			btrfs_set_token_inode_size(dst_eb, dst_item,
@@ -1951,6 +1959,104 @@ out:
 	return ret;
 }

+static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_root *log,
+			       struct btrfs_path *path,
+			       const u64 ino)
+{
+	struct btrfs_key search_key;
+	struct btrfs_path *log_path;
+	int i;
+	int nritems;
+	int ret;
+
+	log_path = btrfs_alloc_path();
+	if (!log_path)
+		return -ENOMEM;
+
+	search_key.objectid = ino;
+	search_key.type = BTRFS_XATTR_ITEM_KEY;
+	search_key.offset = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+process_leaf:
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	for (i = path->slots[0]; i < nritems; i++) {
+		struct btrfs_key key;
+		struct btrfs_dir_item *di;
+		struct btrfs_dir_item *log_di;
+		u32 total_size;
+		u32 cur;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, i);
+		if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
+			ret = 0;
+			goto out;
+		}
+
+		di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
+		total_size = btrfs_item_size_nr(path->nodes[0], i);
+		cur = 0;
+		while (cur < total_size) {
+			u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
+			u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
+			u32 this_len = sizeof(*di) + name_len + data_len;
+			char *name;
+
+			name = kmalloc(name_len, GFP_NOFS);
+			if (!name) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			read_extent_buffer(path->nodes[0], name,
+					   (unsigned long)(di + 1), name_len);
+
+			log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
+						    name, name_len, 0);
+			btrfs_release_path(log_path);
+			if (!log_di) {
+				/* Doesn't exist in log tree, so delete it. */
+				btrfs_release_path(path);
+				di = btrfs_lookup_xattr(trans, root, path, ino,
+							name, name_len, -1);
+				kfree(name);
+				if (IS_ERR(di)) {
+					ret = PTR_ERR(di);
+					goto out;
+				}
+				ASSERT(di);
+				ret = btrfs_delete_one_dir_name(trans, root,
+								path, di);
+				if (ret)
+					goto out;
+				btrfs_release_path(path);
+				search_key = key;
+				goto again;
+			}
+			kfree(name);
+			if (IS_ERR(log_di)) {
+				ret = PTR_ERR(log_di);
+				goto out;
+			}
+			cur += this_len;
+			di = (struct btrfs_dir_item *)((char *)di + this_len);
+		}
+	}
+	ret = btrfs_next_leaf(root, path);
+	if (ret > 0)
+		ret = 0;
+	else if (ret == 0)
+		goto process_leaf;
+out:
+	btrfs_free_path(log_path);
+	btrfs_release_path(path);
+	return ret;
+}
+
+
 /*
  * deletion replay happens before we copy any new directory items
  * out of the log or out of backreferences from inodes. It
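Conceptually, replay_xattr_deletes() above computes a set difference: every xattr present on the inode in the fs tree but absent from the log tree is deleted, so replay converges on exactly the logged xattr set. A userspace sketch of that shape, with illustrative types — the real code walks btree leaves rather than arrays:

	#include <stdbool.h>
	#include <string.h>

	struct xattr { const char *name; };

	static bool in_log(const struct xattr *log, int nlog, const char *name)
	{
		for (int i = 0; i < nlog; i++)
			if (!strcmp(log[i].name, name))
				return true;
		return false;
	}

	/* compact fs[] in place, keeping only entries the log also has;
	 * returns how many survive (the rest are the "deletes to replay") */
	static int drop_missing(struct xattr *fs, int nfs,
				const struct xattr *log, int nlog)
	{
		int kept = 0;

		for (int i = 0; i < nfs; i++)
			if (in_log(log, nlog, fs[i].name))
				fs[kept++] = fs[i];
		return kept;
	}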
@@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,

 			inode_item = btrfs_item_ptr(eb, i,
 					    struct btrfs_inode_item);
+			ret = replay_xattr_deletes(wc->trans, root, log,
+						   path, key.objectid);
+			if (ret)
+				break;
 			mode = btrfs_inode_mode(eb, inode_item);
 			if (S_ISDIR(mode)) {
 				ret = replay_dir_deletes(wc->trans,
@@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		if (trans) {
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
-			clean_tree_block(trans, root, next);
+			clean_tree_block(trans, root->fs_info,
+					next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 		}
@@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 		if (trans) {
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
-			clean_tree_block(trans, root, next);
+			clean_tree_block(trans, root->fs_info,
+					next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 		}
@@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 		if (trans) {
 			btrfs_tree_lock(next);
 			btrfs_set_lock_blocking(next);
-			clean_tree_block(trans, log, next);
+			clean_tree_block(trans, log->fs_info, next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 		}
@@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct inode *inode,
 			  struct btrfs_path *path,
 			  struct btrfs_path *dst_path, int key_type,
+			  struct btrfs_log_ctx *ctx,
 			  u64 min_offset, u64 *last_offset_ret)
 {
 	struct btrfs_key min_key;
@@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	src = path->nodes[0];
 	nritems = btrfs_header_nritems(src);
 	for (i = path->slots[0]; i < nritems; i++) {
+		struct btrfs_dir_item *di;
+
 		btrfs_item_key_to_cpu(src, &min_key, i);

 		if (min_key.objectid != ino || min_key.type != key_type)
@@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 			err = ret;
 			goto done;
 		}
+
+		/*
+		 * We must make sure that when we log a directory entry,
+		 * the corresponding inode, after log replay, has a
+		 * matching link count. For example:
+		 *
+		 * touch foo
+		 * mkdir mydir
+		 * sync
+		 * ln foo mydir/bar
+		 * xfs_io -c "fsync" mydir
+		 * <crash>
+		 * <mount fs and log replay>
+		 *
+		 * Would result in an fsync log that when replayed, our
+		 * file inode would have a link count of 1, but we get
+		 * two directory entries pointing to the same inode.
+		 * After removing one of the names, it would not be
+		 * possible to remove the other name, which always
+		 * resulted in stale file handle errors, and would not
+		 * be possible to rmdir the parent directory, since
+		 * its i_size could never decrement to the value
+		 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+		 */
+		di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
+		btrfs_dir_item_key_to_cpu(src, di, &tmp);
+		if (ctx &&
+		    (btrfs_dir_transid(src, di) == trans->transid ||
+		     btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
+		    tmp.type != BTRFS_ROOT_ITEM_KEY)
+			ctx->log_new_dentries = true;
 	}
 	path->slots[0] = nritems;

@@ -3175,7 +3321,8 @@ done:
 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct inode *inode,
 			  struct btrfs_path *path,
-			  struct btrfs_path *dst_path)
+			  struct btrfs_path *dst_path,
+			  struct btrfs_log_ctx *ctx)
 {
 	u64 min_key;
 	u64 max_key;
@@ -3187,7 +3334,7 @@ again:
 	max_key = 0;
 	while (1) {
 		ret = log_dir_items(trans, root, inode, path,
-				    dst_path, key_type, min_key,
+				    dst_path, key_type, ctx, min_key,
 				    &max_key);
 		if (ret)
 			return ret;
@@ -3963,7 +4110,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
 	if (ret < 0) {
 		return ret;
 	} else if (ret > 0) {
-		*size_ret = i_size_read(inode);
+		*size_ret = 0;
 	} else {
 		struct btrfs_inode_item *item;

@@ -4070,10 +4217,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	if (S_ISDIR(inode->i_mode)) {
 		int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;

-		if (inode_only == LOG_INODE_EXISTS) {
-			max_key_type = BTRFS_INODE_EXTREF_KEY;
-			max_key.type = max_key_type;
-		}
+		if (inode_only == LOG_INODE_EXISTS)
+			max_key_type = BTRFS_XATTR_ITEM_KEY;
 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
 	} else {
 		if (inode_only == LOG_INODE_EXISTS) {
@@ -4098,7 +4243,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags)) {
 			if (inode_only == LOG_INODE_EXISTS) {
-				max_key.type = BTRFS_INODE_EXTREF_KEY;
+				max_key.type = BTRFS_XATTR_ITEM_KEY;
 				ret = drop_objectid_items(trans, log, path, ino,
 							  max_key.type);
 			} else {
@@ -4106,20 +4251,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 					  &BTRFS_I(inode)->runtime_flags);
 			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 				  &BTRFS_I(inode)->runtime_flags);
-			ret = btrfs_truncate_inode_items(trans, log,
-							 inode, 0, 0);
+			while(1) {
+				ret = btrfs_truncate_inode_items(trans,
+						 log, inode, 0, 0);
+				if (ret != -EAGAIN)
+					break;
+			}
 		}
-	} else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
+	} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
 			    &BTRFS_I(inode)->runtime_flags) ||
 		   inode_only == LOG_INODE_EXISTS) {
-		if (inode_only == LOG_INODE_ALL) {
-			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
-				  &BTRFS_I(inode)->runtime_flags);
+		if (inode_only == LOG_INODE_ALL)
 			fast_search = true;
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
-		} else {
-			max_key.type = BTRFS_INODE_EXTREF_KEY;
-		}
 		ret = drop_objectid_items(trans, log, path, ino,
 					  max_key.type);
 	} else {
@@ -4277,15 +4421,18 @@ log_extents:
 	}

 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-		ret = log_directory_changes(trans, root, inode, path, dst_path);
+		ret = log_directory_changes(trans, root, inode, path, dst_path,
+					    ctx);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
 		}
 	}

+	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+	spin_unlock(&BTRFS_I(inode)->lock);
 out_unlock:
 	if (unlikely(err))
 		btrfs_put_logged_extents(&logged_list);
@@ -4372,6 +4519,181 @@ out:
 	return ret;
 }

+struct btrfs_dir_list {
+	u64 ino;
+	struct list_head list;
+};
+
+/*
+ * Log the inodes of the new dentries of a directory. See log_dir_items() for
+ * details about why it is needed.
+ * This is a recursive operation - if an existing dentry corresponds to a
+ * directory, that directory's new entries are logged too (same behaviour as
+ * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
+ * the dentries point to we do not lock their i_mutex, otherwise lockdep
+ * complains about the following circular lock dependency / possible deadlock:
+ *
+ *           CPU0                                        CPU1
+ *           ----                                        ----
+ * lock(&type->i_mutex_dir_key#3/2);
+ *                                            lock(sb_internal#2);
+ *                                            lock(&type->i_mutex_dir_key#3/2);
+ * lock(&sb->s_type->i_mutex_key#14);
+ *
+ * Where sb_internal is the lock (a counter that works as a lock) acquired by
+ * sb_start_intwrite() in btrfs_start_transaction().
+ * Not locking i_mutex of the inodes is still safe because:
+ *
+ * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
+ *    that while logging the inode new references (names) are added or removed
+ *    from the inode, leaving the logged inode item with a link count that does
+ *    not match the number of logged inode reference items. This is fine because
+ *    at log replay time we compute the real number of links and correct the
+ *    link count in the inode item (see replay_one_buffer() and
+ *    link_to_fixup_dir());
+ *
+ * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
+ *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
+ *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
+ *    has a size that doesn't match the sum of the lengths of all the logged
+ *    names. This does not result in a problem because if a dir_item key is
+ *    logged but its matching dir_index key is not logged, at log replay time we
+ *    don't use it to replay the respective name (see replay_one_name()). On the
+ *    other hand if only the dir_index key ends up being logged, the respective
+ *    name is added to the fs/subvol tree with both the dir_item and dir_index
+ *    keys created (see replay_one_name()).
+ *    The directory's inode item with a wrong i_size is not a problem as well,
+ *    since we don't use it at log replay time to set the i_size in the inode
+ *    item of the fs/subvol tree (see overwrite_item()).
+ */
+static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct inode *start_inode,
+				struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_path *path;
+	LIST_HEAD(dir_list);
+	struct btrfs_dir_list *dir_elem;
+	int ret = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
+	if (!dir_elem) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+	dir_elem->ino = btrfs_ino(start_inode);
+	list_add_tail(&dir_elem->list, &dir_list);
+
+	while (!list_empty(&dir_list)) {
+		struct extent_buffer *leaf;
+		struct btrfs_key min_key;
+		int nritems;
+		int i;
+
+		dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
+					    list);
+		if (ret)
+			goto next_dir_inode;
+
+		min_key.objectid = dir_elem->ino;
+		min_key.type = BTRFS_DIR_ITEM_KEY;
+		min_key.offset = 0;
+again:
+		btrfs_release_path(path);
+		ret = btrfs_search_forward(log, &min_key, path, trans->transid);
+		if (ret < 0) {
+			goto next_dir_inode;
+		} else if (ret > 0) {
+			ret = 0;
+			goto next_dir_inode;
+		}
+
+process_leaf:
+		leaf = path->nodes[0];
+		nritems = btrfs_header_nritems(leaf);
+		for (i = path->slots[0]; i < nritems; i++) {
+			struct btrfs_dir_item *di;
+			struct btrfs_key di_key;
+			struct inode *di_inode;
+			struct btrfs_dir_list *new_dir_elem;
+			int log_mode = LOG_INODE_EXISTS;
+			int type;
+
+			btrfs_item_key_to_cpu(leaf, &min_key, i);
+			if (min_key.objectid != dir_elem->ino ||
+			    min_key.type != BTRFS_DIR_ITEM_KEY)
+				goto next_dir_inode;
+
+			di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
+			type = btrfs_dir_type(leaf, di);
+			if (btrfs_dir_transid(leaf, di) < trans->transid &&
+			    type != BTRFS_FT_DIR)
+				continue;
+			btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
+			if (di_key.type == BTRFS_ROOT_ITEM_KEY)
+				continue;
+
+			di_inode = btrfs_iget(root->fs_info->sb, &di_key,
+					      root, NULL);
+			if (IS_ERR(di_inode)) {
+				ret = PTR_ERR(di_inode);
+				goto next_dir_inode;
+			}
+
+			if (btrfs_inode_in_log(di_inode, trans->transid)) {
+				iput(di_inode);
+				continue;
+			}
+
+			ctx->log_new_dentries = false;
+			if (type == BTRFS_FT_DIR)
+				log_mode = LOG_INODE_ALL;
+			btrfs_release_path(path);
+			ret = btrfs_log_inode(trans, root, di_inode,
+					      log_mode, 0, LLONG_MAX, ctx);
+			iput(di_inode);
+			if (ret)
+				goto next_dir_inode;
+			if (ctx->log_new_dentries) {
+				new_dir_elem = kmalloc(sizeof(*new_dir_elem),
+						       GFP_NOFS);
+				if (!new_dir_elem) {
+					ret = -ENOMEM;
+					goto next_dir_inode;
+				}
+				new_dir_elem->ino = di_key.objectid;
+				list_add_tail(&new_dir_elem->list, &dir_list);
+			}
+			break;
+		}
+		if (i == nritems) {
+			ret = btrfs_next_leaf(log, path);
+			if (ret < 0) {
+				goto next_dir_inode;
+			} else if (ret > 0) {
+				ret = 0;
+				goto next_dir_inode;
+			}
+			goto process_leaf;
+		}
+		if (min_key.offset < (u64)-1) {
+			min_key.offset++;
+			goto again;
+		}
+next_dir_inode:
+		list_del(&dir_elem->list);
+		kfree(dir_elem);
+	}
+
+	btrfs_free_path(path);
+	return ret;
+}
+
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log. A minimal inode and backref
@@ -4394,6 +4716,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	const struct dentry * const first_parent = parent;
 	const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
 				 last_committed);
+	bool log_dentries = false;
+	struct inode *orig_inode = inode;

 	sb = inode->i_sb;

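log_new_dir_dentries(), added earlier in this file and wired up here through log_dentries/orig_inode, is at heart a FIFO worklist walk: start from one directory inode and enqueue any subdirectory whose logging flags more new dentries. A minimal userspace sketch of that traversal skeleton, with an illustrative callback-based shape and allocation checks elided; none of these are btrfs structures:

	#include <stdlib.h>

	struct work_item {
		unsigned long long ino;
		struct work_item *next;
	};

	/* visit() logs one directory; returns > 0 and fills *child when it
	 * discovers a subdirectory that must be walked too */
	static void walk_dirs(unsigned long long start_ino,
			      int (*visit)(unsigned long long ino,
					   unsigned long long *child))
	{
		struct work_item *head, *tail, *it;

		head = tail = malloc(sizeof(*head));	/* checks elided */
		head->ino = start_ino;
		head->next = NULL;

		while (head) {
			unsigned long long child;

			if (visit(head->ino, &child) > 0) {
				it = malloc(sizeof(*it));
				it->ino = child;
				it->next = NULL;
				tail->next = it;	/* enqueue at tail: FIFO */
				tail = it;
			}
			it = head;			/* dequeue from head */
			head = head->next;
			free(it);
		}
	}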
@@ -4449,6 +4773,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		goto end_trans;
 	}

+	if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries)
+		log_dentries = true;
+
 	while (1) {
 		if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
 			break;
@@ -4485,7 +4812,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 		dput(old_parent);
 		old_parent = parent;
 	}
-	ret = 0;
+	if (log_dentries)
+		ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
+	else
+		ret = 0;
 end_trans:
 	dput(old_parent);
 	if (ret < 0) {
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 154990c26dcb..6916a781ea02 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -29,6 +29,7 @@ struct btrfs_log_ctx {
 	int log_ret;
 	int log_transid;
 	int io_err;
+	bool log_new_dentries;
 	struct list_head list;
 };

@@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
 	ctx->log_ret = 0;
 	ctx->log_transid = 0;
 	ctx->io_err = 0;
+	ctx->log_new_dentries = false;
 	INIT_LIST_HEAD(&ctx->list);
 }

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8222f6f74147..8bcd2a007517 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,8 +366,8 @@ loop_lock:
 		btrfsic_submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
-		if (need_resched())
-			cond_resched();
+
+		cond_resched();

 		/*
 		 * we made progress, there is more work to do and the bdi
@@ -400,8 +400,7 @@ loop_lock:
 			 * against it before looping
 			 */
 			last_waited = ioc->last_waited;
-			if (need_resched())
-				cond_resched();
+			cond_resched();
 			continue;
 		}
 		spin_lock(&device->io_lock);
@@ -609,8 +608,7 @@ error:
 	return ERR_PTR(-ENOMEM);
 }

-void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
-			       struct btrfs_fs_devices *fs_devices, int step)
+void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
 {
 	struct btrfs_device *device, *next;
 	struct btrfs_device *latest_dev = NULL;
@@ -1136,11 +1134,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-again:
+
 	max_hole_start = search_start;
 	max_hole_size = 0;
-	hole_size = 0;

+again:
 	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
 		ret = -ENOSPC;
 		goto out;
@@ -1233,21 +1231,23 @@ next:
 	 * allocated dev extents, and when shrinking the device,
 	 * search_end may be smaller than search_start.
 	 */
-	if (search_end > search_start)
+	if (search_end > search_start) {
 		hole_size = search_end - search_start;

-	if (hole_size > max_hole_size) {
-		max_hole_start = search_start;
-		max_hole_size = hole_size;
-	}
+		if (contains_pending_extent(trans, device, &search_start,
+					    hole_size)) {
+			btrfs_release_path(path);
+			goto again;
+		}

-	if (contains_pending_extent(trans, device, &search_start, hole_size)) {
-		btrfs_release_path(path);
-		goto again;
+		if (hole_size > max_hole_size) {
+			max_hole_start = search_start;
+			max_hole_size = hole_size;
+		}
 	}

 	/* See above. */
-	if (hole_size < num_bytes)
+	if (max_hole_size < num_bytes)
 		ret = -ENOSPC;
 	else
 		ret = 0;
@@ -2487,8 +2487,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 }

 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    u64 chunk_tree, u64 chunk_objectid,
+			    struct btrfs_root *root, u64 chunk_objectid,
 			    u64 chunk_offset)
 {
 	int ret;
@@ -2580,7 +2579,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	struct map_lookup *map;
 	u64 dev_extent_len = 0;
 	u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
-	u64 chunk_tree = root->fs_info->chunk_root->objectid;
 	int i, ret = 0;

 	/* Just in case */
@@ -2634,8 +2632,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 			}
 		}
 	}
-	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
-			       chunk_offset);
+	ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
 	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
 		goto out;
@@ -2664,8 +2661,8 @@ out:
 }

 static int btrfs_relocate_chunk(struct btrfs_root *root,
-				u64 chunk_tree, u64 chunk_objectid,
+				u64 chunk_objectid,
 				u64 chunk_offset)
 {
 	struct btrfs_root *extent_root;
 	struct btrfs_trans_handle *trans;
@@ -2707,7 +2704,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
 	struct btrfs_chunk *chunk;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	u64 chunk_tree = chunk_root->root_key.objectid;
 	u64 chunk_type;
 	bool retried = false;
 	int failed = 0;
@@ -2744,7 +2740,7 @@ again:
 		btrfs_release_path(path);

 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
-			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
+			ret = btrfs_relocate_chunk(chunk_root,
 						   found_key.objectid,
 						   found_key.offset);
 			if (ret == -ENOSPC)
@@ -3022,7 +3018,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf,

 	stripe_offset = btrfs_stripe_offset(leaf, stripe);
 	stripe_length = btrfs_chunk_length(leaf, chunk);
-	do_div(stripe_length, factor);
+	stripe_length = div_u64(stripe_length, factor);

 	if (stripe_offset < bargs->pend &&
 	    stripe_offset + stripe_length > bargs->pstart)
@@ -3255,7 +3251,6 @@ again:
 		}

 		ret = btrfs_relocate_chunk(chunk_root,
-					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
 		if (ret && ret != -ENOSPC)
@@ -3957,7 +3952,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
 	u64 length;
-	u64 chunk_tree;
 	u64 chunk_objectid;
 	u64 chunk_offset;
 	int ret;
@@ -4027,13 +4021,11 @@ again:
 			break;
 		}

-		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 		btrfs_release_path(path);

-		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
-					   chunk_offset);
+		ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset);
 		if (ret && ret != -ENOSPC)
 			goto done;
 		if (ret == -ENOSPC)
@@ -4131,7 +4123,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 	return 0;
 }

-static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = {
 		.sub_stripes	= 2,
 		.dev_stripes	= 1,
@@ -4289,7 +4281,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
 			     max_chunk_size);

-	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
+	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
 			       GFP_NOFS);
 	if (!devices_info)
 		return -ENOMEM;
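The kzalloc-to-kcalloc switch above (and the one later in btrfs_rmap_block()) is about the multiplication: kcalloc(n, size, flags) fails cleanly when n * size would overflow, whereas the open-coded sizeof(*p) * n can wrap and under-allocate. A userspace analogue of the check calloc and kcalloc perform internally:

	#include <stdlib.h>
	#include <stdint.h>

	static void *alloc_array(size_t n, size_t size)
	{
		/* reject n * size overflow instead of wrapping */
		if (size && n > SIZE_MAX / size)
			return NULL;
		return calloc(n, size);	/* zeroed, like kcalloc */
	}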
@@ -4400,8 +4392,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	 */
 	if (stripe_size * data_stripes > max_chunk_size) {
 		u64 mask = (1ULL << 24) - 1;
-		stripe_size = max_chunk_size;
-		do_div(stripe_size, data_stripes);
+
+		stripe_size = div_u64(max_chunk_size, data_stripes);

 		/* bump the answer up to a 16MB boundary */
 		stripe_size = (stripe_size + mask) & ~mask;
@@ -4413,10 +4405,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		stripe_size = devices_info[ndevs-1].max_avail;
 	}

-	do_div(stripe_size, dev_stripes);
+	stripe_size = div_u64(stripe_size, dev_stripes);

 	/* align to BTRFS_STRIPE_LEN */
-	do_div(stripe_size, raid_stripe_len);
+	stripe_size = div_u64(stripe_size, raid_stripe_len);
 	stripe_size *= raid_stripe_len;

 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
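All the do_div conversions in this file follow one rule worth spelling out: do_div(n, base) divides n in place and returns the remainder, while div_u64()/div64_u64() return the quotient (and div_u64_rem() hands the remainder back through a pointer), which is why each call site becomes an assignment. A kernel-style sketch of the equivalence; the helper is hypothetical, for illustration only:

	#include <linux/math64.h>

	static void div_examples(u64 bytes, u32 stripe_len)
	{
		u64 a = bytes, b;
		u32 rem_a, rem_b;

		/* old style: a becomes the quotient, remainder returned */
		rem_a = do_div(a, stripe_len);

		/* new style: quotient returned, remainder via pointer */
		b = div_u64_rem(bytes, stripe_len, &rem_b);

		/* here a == b and rem_a == rem_b */
	}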
@@ -4954,7 +4946,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
 	u64 stripe_len;
-	int stripe_index;
+	u32 stripe_index;
 	int i;
 	int ret = 0;
 	int num_stripes;
@@ -4995,7 +4987,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, stripe_len);
+	stripe_nr = div64_u64(stripe_nr, stripe_len);

 	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
@@ -5011,7 +5003,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		/* allow a write of a full stripe, but make sure we don't
 		 * allow straddling of stripes
 		 */
-		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
+				full_stripe_len);
 		raid56_full_stripe_start *= full_stripe_len;
 	}

@@ -5136,7 +5129,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		stripe_index = 0;
 		stripe_nr_orig = stripe_nr;
 		stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
-		do_div(stripe_nr_end, map->stripe_len);
+		stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len);
 		stripe_end_offset = stripe_nr_end * map->stripe_len -
 				    (offset + *length);

@@ -5144,7 +5137,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
 					    stripe_nr_end - stripe_nr_orig);
-		stripe_index = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
 		if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
 			mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
@@ -5170,9 +5164,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		}

 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-		int factor = map->num_stripes / map->sub_stripes;
+		u32 factor = map->num_stripes / map->sub_stripes;

-		stripe_index = do_div(stripe_nr, factor);
+		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
 		stripe_index *= map->sub_stripes;

 		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
@@ -5198,8 +5192,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			   ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 			    mirror_num > 1)) {
 			/* push stripe_nr back to the start of the full stripe */
-			stripe_nr = raid56_full_stripe_start;
-			do_div(stripe_nr, stripe_len * nr_data_stripes(map));
+			stripe_nr = div_u64(raid56_full_stripe_start,
+					stripe_len * nr_data_stripes(map));

 			/* RAID[56] write or recovery. Return all stripes */
 			num_stripes = map->num_stripes;
@@ -5209,32 +5203,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			stripe_index = 0;
 			stripe_offset = 0;
 		} else {
-			u64 tmp;
-
 			/*
 			 * Mirror #0 or #1 means the original data block.
 			 * Mirror #2 is RAID5 parity block.
 			 * Mirror #3 is RAID6 Q block.
 			 */
-			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			stripe_nr = div_u64_rem(stripe_nr,
+					nr_data_stripes(map), &stripe_index);
 			if (mirror_num > 1)
 				stripe_index = nr_data_stripes(map) +
 						mirror_num - 2;

 			/* We distribute the parity blocks across stripes */
-			tmp = stripe_nr + stripe_index;
-			stripe_index = do_div(tmp, map->num_stripes);
+			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
+					&stripe_index);
 			if (!(rw & (REQ_WRITE | REQ_DISCARD |
 			    REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
 				mirror_num = 1;
 		}
 	} else {
 		/*
-		 * after this do_div call, stripe_nr is the number of stripes
-		 * on this device we have to walk to find the data, and
-		 * stripe_index is the number of our device in the stripe array
+		 * after this, stripe_nr is the number of stripes on this
+		 * device we have to walk to find the data, and stripe_index is
+		 * the number of our device in the stripe array
 		 */
-		stripe_index = do_div(stripe_nr, map->num_stripes);
+		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
+				&stripe_index);
 		mirror_num = stripe_index + 1;
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
@@ -5261,7 +5255,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	    need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
 	    mirror_num > 1)) {
 		u64 tmp;
-		int i, rot;
+		unsigned rot;

 		bbio->raid_map = (u64 *)((void *)bbio->stripes +
 				 sizeof(struct btrfs_bio_stripe) *
@@ -5269,8 +5263,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 				 sizeof(int) * tgtdev_indexes);

 		/* Work out the disk rotation on this stripe-set */
-		tmp = stripe_nr;
-		rot = do_div(tmp, num_stripes);
+		div_u64_rem(stripe_nr, num_stripes, &rot);

 		/* Fill in the logical address of each stripe */
 		tmp = stripe_nr * nr_data_stripes(map);
@@ -5285,8 +5278,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	}

 	if (rw & REQ_DISCARD) {
-		int factor = 0;
-		int sub_stripes = 0;
+		u32 factor = 0;
+		u32 sub_stripes = 0;
 		u64 stripes_per_dev = 0;
 		u32 remaining_stripes = 0;
 		u32 last_stripe = 0;
@@ -5437,9 +5430,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			}
 		}
 		if (found) {
-			u64 length = map->stripe_len;
-
-			if (physical_of_found + length <=
+			if (physical_of_found + map->stripe_len <=
 			    dev_replace->cursor_left) {
 				struct btrfs_bio_stripe *tgtdev_stripe =
 					bbio->stripes + num_stripes;
@@ -5535,15 +5526,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	rmap_len = map->stripe_len;

 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
-		do_div(length, map->num_stripes / map->sub_stripes);
+		length = div_u64(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
-		do_div(length, map->num_stripes);
+		length = div_u64(length, map->num_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
-		do_div(length, nr_data_stripes(map));
+		length = div_u64(length, nr_data_stripes(map));
 		rmap_len = map->stripe_len * nr_data_stripes(map);
 	}

-	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
 	BUG_ON(!buf); /* -ENOMEM */

 	for (i = 0; i < map->num_stripes; i++) {
@@ -5554,11 +5545,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5554 continue; 5545 continue;
5555 5546
5556 stripe_nr = physical - map->stripes[i].physical; 5547 stripe_nr = physical - map->stripes[i].physical;
5557 do_div(stripe_nr, map->stripe_len); 5548 stripe_nr = div_u64(stripe_nr, map->stripe_len);
5558 5549
5559 if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 5550 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5560 stripe_nr = stripe_nr * map->num_stripes + i; 5551 stripe_nr = stripe_nr * map->num_stripes + i;
5561 do_div(stripe_nr, map->sub_stripes); 5552 stripe_nr = div_u64(stripe_nr, map->sub_stripes);
5562 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 5553 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5563 stripe_nr = stripe_nr * map->num_stripes + i; 5554 stripe_nr = stripe_nr * map->num_stripes + i;
5564 } /* else if RAID[56], multiply by nr_data_stripes(). 5555 } /* else if RAID[56], multiply by nr_data_stripes().
@@ -5835,8 +5826,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5835 u64 length = 0; 5826 u64 length = 0;
5836 u64 map_length; 5827 u64 map_length;
5837 int ret; 5828 int ret;
5838 int dev_nr = 0; 5829 int dev_nr;
5839 int total_devs = 1; 5830 int total_devs;
5840 struct btrfs_bio *bbio = NULL; 5831 struct btrfs_bio *bbio = NULL;
5841 5832
5842 length = bio->bi_iter.bi_size; 5833 length = bio->bi_iter.bi_size;
@@ -5877,11 +5868,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5877 BUG(); 5868 BUG();
5878 } 5869 }
5879 5870
5880 while (dev_nr < total_devs) { 5871 for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
5881 dev = bbio->stripes[dev_nr].dev; 5872 dev = bbio->stripes[dev_nr].dev;
5882 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { 5873 if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
5883 bbio_error(bbio, first_bio, logical); 5874 bbio_error(bbio, first_bio, logical);
5884 dev_nr++;
5885 continue; 5875 continue;
5886 } 5876 }
5887 5877
@@ -5894,7 +5884,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5894 ret = breakup_stripe_bio(root, bbio, first_bio, dev, 5884 ret = breakup_stripe_bio(root, bbio, first_bio, dev,
5895 dev_nr, rw, async_submit); 5885 dev_nr, rw, async_submit);
5896 BUG_ON(ret); 5886 BUG_ON(ret);
5897 dev_nr++;
5898 continue; 5887 continue;
5899 } 5888 }
5900 5889
@@ -5909,7 +5898,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5909 submit_stripe_bio(root, bbio, bio, 5898 submit_stripe_bio(root, bbio, bio,
5910 bbio->stripes[dev_nr].physical, dev_nr, rw, 5899 bbio->stripes[dev_nr].physical, dev_nr, rw,
5911 async_submit); 5900 async_submit);
5912 dev_nr++;
5913 } 5901 }
5914 btrfs_bio_counter_dec(root->fs_info); 5902 btrfs_bio_counter_dec(root->fs_info);
5915 return 0; 5903 return 0;
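
The do_div() conversions in this file trade an awkward idiom for an explicit one: do_div(n, base) divides its u64 argument in place and returns the remainder, which forced callers to copy the dividend into a temporary whenever both results were needed, while div_u64_rem() returns the quotient and stores the remainder through a pointer, and div_u64() returns just the quotient. A standalone sketch of the semantics (the helper bodies below are stand-ins with the assumed math64.h behavior, not the kernel implementation):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for the kernel's math64.h helpers. */
    static uint64_t div_u64_rem(uint64_t dividend, uint32_t divisor,
                                uint32_t *remainder)
    {
            *remainder = (uint32_t)(dividend % divisor);
            return dividend / divisor;
    }

    static uint64_t div_u64(uint64_t dividend, uint32_t divisor)
    {
            uint32_t rem;
            return div_u64_rem(dividend, divisor, &rem);
    }

    int main(void)
    {
            uint64_t stripe_nr = 1000003;
            uint32_t num_stripes = 6, stripe_index;

            /* Old: stripe_index = do_div(stripe_nr, num_stripes) divided
             * stripe_nr in place and returned the remainder.  New form: */
            stripe_nr = div_u64_rem(stripe_nr, num_stripes, &stripe_index);
            assert(stripe_nr == 166667 && stripe_index == 1);

            /* Quotient-only callers, as in btrfs_rmap_block() above: */
            printf("%llu\n", (unsigned long long)div_u64(1024, 3)); /* 341 */
            return 0;
    }

The same series also tightens types (unsigned rot, u32 factor/sub_stripes) now that the values no longer pass through the do_div() macro.
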
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 83069dec6898..ebc31331a837 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -421,8 +421,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
421int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 421int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
422 struct btrfs_fs_devices **fs_devices_ret); 422 struct btrfs_fs_devices **fs_devices_ret);
423int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 423int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
424void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, 424void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
425 struct btrfs_fs_devices *fs_devices, int step);
426int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 425int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
427 char *device_path, 426 char *device_path,
428 struct btrfs_device **device); 427 struct btrfs_device **device);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 883b93623bc5..45ea704be030 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = {
364/* 364/*
365 * Check if the attribute is in a supported namespace. 365 * Check if the attribute is in a supported namespace.
366 * 366 *
367 * This applied after the check for the synthetic attributes in the system 367 * This is applied after the check for the synthetic attributes in the system
368 * namespace. 368 * namespace.
369 */ 369 */
370static bool btrfs_is_valid_xattr(const char *name) 370static int btrfs_is_valid_xattr(const char *name)
371{ 371{
372 return !strncmp(name, XATTR_SECURITY_PREFIX, 372 int len = strlen(name);
373 XATTR_SECURITY_PREFIX_LEN) || 373 int prefixlen = 0;
374 !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || 374
375 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 375 if (!strncmp(name, XATTR_SECURITY_PREFIX,
376 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || 376 XATTR_SECURITY_PREFIX_LEN))
377 !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); 377 prefixlen = XATTR_SECURITY_PREFIX_LEN;
378 else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
379 prefixlen = XATTR_SYSTEM_PREFIX_LEN;
380 else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
381 prefixlen = XATTR_TRUSTED_PREFIX_LEN;
382 else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
383 prefixlen = XATTR_USER_PREFIX_LEN;
384 else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
385 prefixlen = XATTR_BTRFS_PREFIX_LEN;
386 else
387 return -EOPNOTSUPP;
388
389 /*
 390 * The name cannot consist of just the prefix.
391 */
392 if (len <= prefixlen)
393 return -EINVAL;
394
395 return 0;
378} 396}
379 397
380ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, 398ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
381 void *buffer, size_t size) 399 void *buffer, size_t size)
382{ 400{
401 int ret;
402
383 /* 403 /*
384 * If this is a request for a synthetic attribute in the system.* 404 * If this is a request for a synthetic attribute in the system.*
385 * namespace use the generic infrastructure to resolve a handler 405 * namespace use the generic infrastructure to resolve a handler
@@ -388,8 +408,9 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
388 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 408 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
389 return generic_getxattr(dentry, name, buffer, size); 409 return generic_getxattr(dentry, name, buffer, size);
390 410
391 if (!btrfs_is_valid_xattr(name)) 411 ret = btrfs_is_valid_xattr(name);
392 return -EOPNOTSUPP; 412 if (ret)
413 return ret;
393 return __btrfs_getxattr(dentry->d_inode, name, buffer, size); 414 return __btrfs_getxattr(dentry->d_inode, name, buffer, size);
394} 415}
395 416
@@ -397,6 +418,7 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
397 size_t size, int flags) 418 size_t size, int flags)
398{ 419{
399 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; 420 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
421 int ret;
400 422
401 /* 423 /*
402 * The permission on security.* and system.* is not checked 424 * The permission on security.* and system.* is not checked
@@ -413,8 +435,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
413 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 435 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
414 return generic_setxattr(dentry, name, value, size, flags); 436 return generic_setxattr(dentry, name, value, size, flags);
415 437
416 if (!btrfs_is_valid_xattr(name)) 438 ret = btrfs_is_valid_xattr(name);
417 return -EOPNOTSUPP; 439 if (ret)
440 return ret;
418 441
419 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) 442 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
420 return btrfs_set_prop(dentry->d_inode, name, 443 return btrfs_set_prop(dentry->d_inode, name,
@@ -430,6 +453,7 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
430int btrfs_removexattr(struct dentry *dentry, const char *name) 453int btrfs_removexattr(struct dentry *dentry, const char *name)
431{ 454{
432 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; 455 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
456 int ret;
433 457
434 /* 458 /*
435 * The permission on security.* and system.* is not checked 459 * The permission on security.* and system.* is not checked
@@ -446,8 +470,9 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
446 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 470 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
447 return generic_removexattr(dentry, name); 471 return generic_removexattr(dentry, name);
448 472
449 if (!btrfs_is_valid_xattr(name)) 473 ret = btrfs_is_valid_xattr(name);
450 return -EOPNOTSUPP; 474 if (ret)
475 return ret;
451 476
452 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) 477 if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
453 return btrfs_set_prop(dentry->d_inode, name, 478 return btrfs_set_prop(dentry->d_inode, name,
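
The reworked btrfs_is_valid_xattr() separates two failures the old boolean collapsed: a name in an unknown namespace still produces -EOPNOTSUPP, while a name that is nothing more than a supported prefix (such as "user." with no suffix) now produces -EINVAL, and getxattr/setxattr/removexattr propagate whichever error comes back. A standalone sketch of the resulting contract (the prefix strings mirror the XATTR_*_PREFIX macros; the function name here is illustrative):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    static int is_valid_xattr_name(const char *name)
    {
            static const char *prefixes[] = {
                    "security.", "system.", "trusted.", "user.", "btrfs."
            };
            size_t i;

            for (i = 0; i < sizeof(prefixes) / sizeof(prefixes[0]); i++) {
                    size_t plen = strlen(prefixes[i]);

                    if (!strncmp(name, prefixes[i], plen))
                            /* a bare prefix is not a valid name */
                            return strlen(name) > plen ? 0 : -EINVAL;
            }
            return -EOPNOTSUPP;     /* unsupported namespace */
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   is_valid_xattr_name("user.comment"), /* 0 */
                   is_valid_xattr_name("user."),        /* -EINVAL */
                   is_valid_xattr_name("foo.bar"));     /* -EOPNOTSUPP */
            return 0;
    }
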
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index fb22fd8d8fb8..82990b8f872b 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -403,7 +403,7 @@ next:
403 return ret; 403 return ret;
404} 404}
405 405
406struct btrfs_compress_op btrfs_zlib_compress = { 406const struct btrfs_compress_op btrfs_zlib_compress = {
407 .alloc_workspace = zlib_alloc_workspace, 407 .alloc_workspace = zlib_alloc_workspace,
408 .free_workspace = zlib_free_workspace, 408 .free_workspace = zlib_free_workspace,
409 .compress_pages = zlib_compress_pages, 409 .compress_pages = zlib_compress_pages,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 155ab9c0246b..e162bcd105ee 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1146,6 +1146,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1146 inode, page, (int)pos, (int)len); 1146 inode, page, (int)pos, (int)len);
1147 1147
1148 r = ceph_update_writeable_page(file, pos, len, page); 1148 r = ceph_update_writeable_page(file, pos, len, page);
1149 if (r < 0)
1150 page_cache_release(page);
1151 else
1152 *pagep = page;
1149 } while (r == -EAGAIN); 1153 } while (r == -EAGAIN);
1150 1154
1151 return r; 1155 return r;
@@ -1534,19 +1538,27 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
1534 1538
1535 osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); 1539 osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
1536 1540
1537 err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, 1541 {
1538 "inline_version", &inline_version, 1542 __le64 xattr_buf = cpu_to_le64(inline_version);
1539 sizeof(inline_version), 1543 err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
1540 CEPH_OSD_CMPXATTR_OP_GT, 1544 "inline_version", &xattr_buf,
1541 CEPH_OSD_CMPXATTR_MODE_U64); 1545 sizeof(xattr_buf),
1542 if (err) 1546 CEPH_OSD_CMPXATTR_OP_GT,
1543 goto out_put; 1547 CEPH_OSD_CMPXATTR_MODE_U64);
1544 1548 if (err)
1545 err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, 1549 goto out_put;
1546 "inline_version", &inline_version, 1550 }
1547 sizeof(inline_version), 0, 0); 1551
1548 if (err) 1552 {
1549 goto out_put; 1553 char xattr_buf[32];
1554 int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
1555 "%llu", inline_version);
1556 err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
1557 "inline_version",
1558 xattr_buf, xattr_len, 0, 0);
1559 if (err)
1560 goto out_put;
1561 }
1550 1562
1551 ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); 1563 ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
1552 err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1564 err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
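
This hunk stops handing the raw in-memory u64 to both xattr ops. The CMPXATTR op runs in CEPH_OSD_CMPXATTR_MODE_U64, so it now compares against an explicit little-endian encoding; the SETXATTR op stores the version as a decimal string, presumably to match how the attribute is represented on the OSD side. A standalone sketch of the two encodings (cpu_to_le64() spelled out by hand):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t inline_version = 7;
            unsigned char cmp_buf[8];
            char set_buf[32];
            int i, set_len;

            /* CMPXATTR, mode U64: fixed-width little-endian integer. */
            for (i = 0; i < 8; i++)
                    cmp_buf[i] = (inline_version >> (8 * i)) & 0xff;

            /* SETXATTR: human-readable decimal, length from snprintf(). */
            set_len = snprintf(set_buf, sizeof(set_buf), "%" PRIu64,
                               inline_version);

            printf("cmp: %zu bytes, set: \"%s\" (%d bytes)\n",
                   sizeof(cmp_buf), set_buf, set_len);
            return 0;
    }
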
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 8172775428a0..11631c4c7d14 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -896,6 +896,18 @@ int ceph_is_any_caps(struct inode *inode)
896 return ret; 896 return ret;
897} 897}
898 898
899static void drop_inode_snap_realm(struct ceph_inode_info *ci)
900{
901 struct ceph_snap_realm *realm = ci->i_snap_realm;
902 spin_lock(&realm->inodes_with_caps_lock);
903 list_del_init(&ci->i_snap_realm_item);
904 ci->i_snap_realm_counter++;
905 ci->i_snap_realm = NULL;
906 spin_unlock(&realm->inodes_with_caps_lock);
907 ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
908 realm);
909}
910
899/* 911/*
900 * Remove a cap. Take steps to deal with a racing iterate_session_caps. 912 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
901 * 913 *
@@ -946,15 +958,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
946 if (removed) 958 if (removed)
947 ceph_put_cap(mdsc, cap); 959 ceph_put_cap(mdsc, cap);
948 960
 949 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 961 /* when a reconnect is denied, we remove session caps forcibly;
 950 struct ceph_snap_realm *realm = ci->i_snap_realm; 962 * i_wr_ref can be non-zero. If there are ongoing writes,
951 spin_lock(&realm->inodes_with_caps_lock); 963 * keep i_snap_realm.
952 list_del_init(&ci->i_snap_realm_item); 964 */
953 ci->i_snap_realm_counter++; 965 if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm)
954 ci->i_snap_realm = NULL; 966 drop_inode_snap_realm(ci);
955 spin_unlock(&realm->inodes_with_caps_lock); 967
956 ceph_put_snap_realm(mdsc, realm);
957 }
958 if (!__ceph_is_any_real_caps(ci)) 968 if (!__ceph_is_any_real_caps(ci))
959 __cap_delay_cancel(mdsc, ci); 969 __cap_delay_cancel(mdsc, ci);
960} 970}
@@ -1394,6 +1404,13 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1394 int was = ci->i_dirty_caps; 1404 int was = ci->i_dirty_caps;
1395 int dirty = 0; 1405 int dirty = 0;
1396 1406
1407 if (!ci->i_auth_cap) {
1408 pr_warn("__mark_dirty_caps %p %llx mask %s, "
1409 "but no auth cap (session was closed?)\n",
1410 inode, ceph_ino(inode), ceph_cap_string(mask));
1411 return 0;
1412 }
1413
1397 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, 1414 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1398 ceph_cap_string(mask), ceph_cap_string(was), 1415 ceph_cap_string(mask), ceph_cap_string(was),
1399 ceph_cap_string(was | mask)); 1416 ceph_cap_string(was | mask));
@@ -1404,7 +1421,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1404 ci->i_snap_realm->cached_context); 1421 ci->i_snap_realm->cached_context);
1405 dout(" inode %p now dirty snapc %p auth cap %p\n", 1422 dout(" inode %p now dirty snapc %p auth cap %p\n",
1406 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); 1423 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
1407 WARN_ON(!ci->i_auth_cap);
1408 BUG_ON(!list_empty(&ci->i_dirty_item)); 1424 BUG_ON(!list_empty(&ci->i_dirty_item));
1409 spin_lock(&mdsc->cap_dirty_lock); 1425 spin_lock(&mdsc->cap_dirty_lock);
1410 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1426 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -1545,7 +1561,19 @@ retry_locked:
1545 if (!mdsc->stopping && inode->i_nlink > 0) { 1561 if (!mdsc->stopping && inode->i_nlink > 0) {
1546 if (want) { 1562 if (want) {
1547 retain |= CEPH_CAP_ANY; /* be greedy */ 1563 retain |= CEPH_CAP_ANY; /* be greedy */
1564 } else if (S_ISDIR(inode->i_mode) &&
1565 (issued & CEPH_CAP_FILE_SHARED) &&
1566 __ceph_dir_is_complete(ci)) {
1567 /*
1568 * If a directory is complete, we want to keep
 1569 * the exclusive cap, so that the MDS does not end up
1570 * revoking the shared cap on every create/unlink
1571 * operation.
1572 */
1573 want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1574 retain |= want;
1548 } else { 1575 } else {
1576
1549 retain |= CEPH_CAP_ANY_SHARED; 1577 retain |= CEPH_CAP_ANY_SHARED;
1550 /* 1578 /*
1551 * keep RD only if we didn't have the file open RW, 1579 * keep RD only if we didn't have the file open RW,
@@ -2309,6 +2337,9 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2309 wake = 1; 2337 wake = 1;
2310 } 2338 }
2311 } 2339 }
2340 /* see comment in __ceph_remove_cap() */
2341 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm)
2342 drop_inode_snap_realm(ci);
2312 } 2343 }
2313 spin_unlock(&ci->i_ceph_lock); 2344 spin_unlock(&ci->i_ceph_lock);
2314 2345
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 83e9976f7189..e729b79812b4 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -281,6 +281,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
281 /* can we use the dcache? */ 281 /* can we use the dcache? */
282 spin_lock(&ci->i_ceph_lock); 282 spin_lock(&ci->i_ceph_lock);
283 if ((ctx->pos == 2 || fi->dentry) && 283 if ((ctx->pos == 2 || fi->dentry) &&
284 ceph_test_mount_opt(fsc, DCACHE) &&
284 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 285 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
285 ceph_snap(inode) != CEPH_SNAPDIR && 286 ceph_snap(inode) != CEPH_SNAPDIR &&
286 __ceph_dir_is_complete_ordered(ci) && 287 __ceph_dir_is_complete_ordered(ci) &&
@@ -336,16 +337,23 @@ more:
336 ceph_mdsc_put_request(req); 337 ceph_mdsc_put_request(req);
337 return err; 338 return err;
338 } 339 }
339 req->r_inode = inode;
340 ihold(inode);
341 req->r_dentry = dget(file->f_path.dentry);
342 /* hints to request -> mds selection code */ 340 /* hints to request -> mds selection code */
343 req->r_direct_mode = USE_AUTH_MDS; 341 req->r_direct_mode = USE_AUTH_MDS;
344 req->r_direct_hash = ceph_frag_value(frag); 342 req->r_direct_hash = ceph_frag_value(frag);
345 req->r_direct_is_hash = true; 343 req->r_direct_is_hash = true;
346 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); 344 if (fi->last_name) {
345 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
346 if (!req->r_path2) {
347 ceph_mdsc_put_request(req);
348 return -ENOMEM;
349 }
350 }
347 req->r_readdir_offset = fi->next_offset; 351 req->r_readdir_offset = fi->next_offset;
348 req->r_args.readdir.frag = cpu_to_le32(frag); 352 req->r_args.readdir.frag = cpu_to_le32(frag);
353
354 req->r_inode = inode;
355 ihold(inode);
356 req->r_dentry = dget(file->f_path.dentry);
349 err = ceph_mdsc_do_request(mdsc, NULL, req); 357 err = ceph_mdsc_do_request(mdsc, NULL, req);
350 if (err < 0) { 358 if (err < 0) {
351 ceph_mdsc_put_request(req); 359 ceph_mdsc_put_request(req);
@@ -629,6 +637,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
629 fsc->mount_options->snapdir_name, 637 fsc->mount_options->snapdir_name,
630 dentry->d_name.len) && 638 dentry->d_name.len) &&
631 !is_root_ceph_dentry(dir, dentry) && 639 !is_root_ceph_dentry(dir, dentry) &&
640 ceph_test_mount_opt(fsc, DCACHE) &&
632 __ceph_dir_is_complete(ci) && 641 __ceph_dir_is_complete(ci) &&
633 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 642 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
634 spin_unlock(&ci->i_ceph_lock); 643 spin_unlock(&ci->i_ceph_lock);
@@ -755,10 +764,15 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
755 err = PTR_ERR(req); 764 err = PTR_ERR(req);
756 goto out; 765 goto out;
757 } 766 }
758 req->r_dentry = dget(dentry);
759 req->r_num_caps = 2;
760 req->r_path2 = kstrdup(dest, GFP_NOFS); 767 req->r_path2 = kstrdup(dest, GFP_NOFS);
768 if (!req->r_path2) {
769 err = -ENOMEM;
770 ceph_mdsc_put_request(req);
771 goto out;
772 }
761 req->r_locked_dir = dir; 773 req->r_locked_dir = dir;
774 req->r_dentry = dget(dentry);
775 req->r_num_caps = 2;
762 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 776 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
763 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 777 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
764 err = ceph_mdsc_do_request(mdsc, dir, req); 778 err = ceph_mdsc_do_request(mdsc, dir, req);
@@ -933,16 +947,20 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
933 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); 947 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
934 struct ceph_mds_client *mdsc = fsc->mdsc; 948 struct ceph_mds_client *mdsc = fsc->mdsc;
935 struct ceph_mds_request *req; 949 struct ceph_mds_request *req;
950 int op = CEPH_MDS_OP_RENAME;
936 int err; 951 int err;
937 952
938 if (ceph_snap(old_dir) != ceph_snap(new_dir)) 953 if (ceph_snap(old_dir) != ceph_snap(new_dir))
939 return -EXDEV; 954 return -EXDEV;
940 if (ceph_snap(old_dir) != CEPH_NOSNAP || 955 if (ceph_snap(old_dir) != CEPH_NOSNAP) {
941 ceph_snap(new_dir) != CEPH_NOSNAP) 956 if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR)
942 return -EROFS; 957 op = CEPH_MDS_OP_RENAMESNAP;
958 else
959 return -EROFS;
960 }
943 dout("rename dir %p dentry %p to dir %p dentry %p\n", 961 dout("rename dir %p dentry %p to dir %p dentry %p\n",
944 old_dir, old_dentry, new_dir, new_dentry); 962 old_dir, old_dentry, new_dir, new_dentry);
945 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS); 963 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
946 if (IS_ERR(req)) 964 if (IS_ERR(req))
947 return PTR_ERR(req); 965 return PTR_ERR(req);
948 ihold(old_dir); 966 ihold(old_dir);
@@ -1240,11 +1258,12 @@ static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end,
1240 dout("dir_fsync %p wait on tid %llu (until %llu)\n", 1258 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1241 inode, req->r_tid, last_tid); 1259 inode, req->r_tid, last_tid);
1242 if (req->r_timeout) { 1260 if (req->r_timeout) {
1243 ret = wait_for_completion_timeout( 1261 unsigned long time_left = wait_for_completion_timeout(
1244 &req->r_safe_completion, req->r_timeout); 1262 &req->r_safe_completion,
1245 if (ret > 0) 1263 req->r_timeout);
1264 if (time_left > 0)
1246 ret = 0; 1265 ret = 0;
1247 else if (ret == 0) 1266 else
1248 ret = -EIO; /* timed out */ 1267 ret = -EIO; /* timed out */
1249 } else { 1268 } else {
1250 wait_for_completion(&req->r_safe_completion); 1269 wait_for_completion(&req->r_safe_completion);
@@ -1372,6 +1391,7 @@ const struct inode_operations ceph_snapdir_iops = {
1372 .getattr = ceph_getattr, 1391 .getattr = ceph_getattr,
1373 .mkdir = ceph_mkdir, 1392 .mkdir = ceph_mkdir,
1374 .rmdir = ceph_unlink, 1393 .rmdir = ceph_unlink,
1394 .rename = ceph_rename,
1375}; 1395};
1376 1396
1377const struct dentry_operations ceph_dentry_ops = { 1397const struct dentry_operations ceph_dentry_ops = {
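
Several hunks in this file, and matching ones in super.c and xattr.c further down, add the same hardening: every kstrdup() feeding an MDS request is checked, the half-built request is released with ceph_mdsc_put_request() on failure, and the reference-taking assignments (r_inode with ihold(), r_dentry with dget()) are moved after the allocations that can fail. A toy standalone version of the shape (the request type and helpers are stand-ins, not the ceph API):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct request {
            char *path2;
    };

    static struct request *create_request(void)
    {
            return calloc(1, sizeof(struct request));
    }

    static void put_request(struct request *req)
    {
            if (req) {
                    free(req->path2);
                    free(req);
            }
    }

    int main(void)
    {
            struct request *req = create_request();

            if (!req)
                    return ENOMEM;

            req->path2 = strdup("last_name");  /* kstrdup() in the hunks */
            if (!req->path2) {
                    put_request(req);  /* drop the half-built request */
                    return ENOMEM;
            }

            /* Reference-taking fields are filled only past this point. */
            puts("request fully built");
            put_request(req);
            return 0;
    }
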
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 71c073f38e54..0a2eb32ffe43 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1021,6 +1021,33 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
1021 spin_unlock(&session->s_cap_lock); 1021 spin_unlock(&session->s_cap_lock);
1022} 1022}
1023 1023
1024static void cleanup_session_requests(struct ceph_mds_client *mdsc,
1025 struct ceph_mds_session *session)
1026{
1027 struct ceph_mds_request *req;
1028 struct rb_node *p;
1029
1030 dout("cleanup_session_requests mds%d\n", session->s_mds);
1031 mutex_lock(&mdsc->mutex);
1032 while (!list_empty(&session->s_unsafe)) {
1033 req = list_first_entry(&session->s_unsafe,
1034 struct ceph_mds_request, r_unsafe_item);
1035 list_del_init(&req->r_unsafe_item);
1036 pr_info(" dropping unsafe request %llu\n", req->r_tid);
1037 __unregister_request(mdsc, req);
1038 }
1039 /* zero r_attempts, so kick_requests() will re-send requests */
1040 p = rb_first(&mdsc->request_tree);
1041 while (p) {
1042 req = rb_entry(p, struct ceph_mds_request, r_node);
1043 p = rb_next(p);
1044 if (req->r_session &&
1045 req->r_session->s_mds == session->s_mds)
1046 req->r_attempts = 0;
1047 }
1048 mutex_unlock(&mdsc->mutex);
1049}
1050
1024/* 1051/*
1025 * Helper to safely iterate over all caps associated with a session, with 1052 * Helper to safely iterate over all caps associated with a session, with
1026 * special care taken to handle a racing __ceph_remove_cap(). 1053 * special care taken to handle a racing __ceph_remove_cap().
@@ -1098,7 +1125,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1098 cap, ci, &ci->vfs_inode); 1125 cap, ci, &ci->vfs_inode);
1099 spin_lock(&ci->i_ceph_lock); 1126 spin_lock(&ci->i_ceph_lock);
1100 __ceph_remove_cap(cap, false); 1127 __ceph_remove_cap(cap, false);
1101 if (!__ceph_is_any_real_caps(ci)) { 1128 if (!ci->i_auth_cap) {
1102 struct ceph_mds_client *mdsc = 1129 struct ceph_mds_client *mdsc =
1103 ceph_sb_to_client(inode->i_sb)->mdsc; 1130 ceph_sb_to_client(inode->i_sb)->mdsc;
1104 1131
@@ -1120,13 +1147,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1120 mdsc->num_cap_flushing--; 1147 mdsc->num_cap_flushing--;
1121 drop = 1; 1148 drop = 1;
1122 } 1149 }
1123 if (drop && ci->i_wrbuffer_ref) {
1124 pr_info(" dropping dirty data for %p %lld\n",
1125 inode, ceph_ino(inode));
1126 ci->i_wrbuffer_ref = 0;
1127 ci->i_wrbuffer_ref_head = 0;
1128 drop++;
1129 }
1130 spin_unlock(&mdsc->cap_dirty_lock); 1150 spin_unlock(&mdsc->cap_dirty_lock);
1131 } 1151 }
1132 spin_unlock(&ci->i_ceph_lock); 1152 spin_unlock(&ci->i_ceph_lock);
@@ -1853,7 +1873,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1853 */ 1873 */
1854static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, 1874static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1855 struct ceph_mds_request *req, 1875 struct ceph_mds_request *req,
1856 int mds) 1876 int mds, bool drop_cap_releases)
1857{ 1877{
1858 struct ceph_msg *msg; 1878 struct ceph_msg *msg;
1859 struct ceph_mds_request_head *head; 1879 struct ceph_mds_request_head *head;
@@ -1937,6 +1957,12 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1937 releases += ceph_encode_inode_release(&p, 1957 releases += ceph_encode_inode_release(&p,
1938 req->r_old_dentry->d_inode, 1958 req->r_old_dentry->d_inode,
1939 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); 1959 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1960
1961 if (drop_cap_releases) {
1962 releases = 0;
1963 p = msg->front.iov_base + req->r_request_release_offset;
1964 }
1965
1940 head->num_releases = cpu_to_le16(releases); 1966 head->num_releases = cpu_to_le16(releases);
1941 1967
1942 /* time stamp */ 1968 /* time stamp */
@@ -1989,7 +2015,7 @@ static void complete_request(struct ceph_mds_client *mdsc,
1989 */ 2015 */
1990static int __prepare_send_request(struct ceph_mds_client *mdsc, 2016static int __prepare_send_request(struct ceph_mds_client *mdsc,
1991 struct ceph_mds_request *req, 2017 struct ceph_mds_request *req,
1992 int mds) 2018 int mds, bool drop_cap_releases)
1993{ 2019{
1994 struct ceph_mds_request_head *rhead; 2020 struct ceph_mds_request_head *rhead;
1995 struct ceph_msg *msg; 2021 struct ceph_msg *msg;
@@ -2048,7 +2074,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
2048 ceph_msg_put(req->r_request); 2074 ceph_msg_put(req->r_request);
2049 req->r_request = NULL; 2075 req->r_request = NULL;
2050 } 2076 }
2051 msg = create_request_message(mdsc, req, mds); 2077 msg = create_request_message(mdsc, req, mds, drop_cap_releases);
2052 if (IS_ERR(msg)) { 2078 if (IS_ERR(msg)) {
2053 req->r_err = PTR_ERR(msg); 2079 req->r_err = PTR_ERR(msg);
2054 complete_request(mdsc, req); 2080 complete_request(mdsc, req);
@@ -2132,7 +2158,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
2132 if (req->r_request_started == 0) /* note request start time */ 2158 if (req->r_request_started == 0) /* note request start time */
2133 req->r_request_started = jiffies; 2159 req->r_request_started = jiffies;
2134 2160
2135 err = __prepare_send_request(mdsc, req, mds); 2161 err = __prepare_send_request(mdsc, req, mds, false);
2136 if (!err) { 2162 if (!err) {
2137 ceph_msg_get(req->r_request); 2163 ceph_msg_get(req->r_request);
2138 ceph_con_send(&session->s_con, req->r_request); 2164 ceph_con_send(&session->s_con, req->r_request);
@@ -2590,6 +2616,7 @@ static void handle_session(struct ceph_mds_session *session,
2590 case CEPH_SESSION_CLOSE: 2616 case CEPH_SESSION_CLOSE:
2591 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 2617 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2592 pr_info("mds%d reconnect denied\n", session->s_mds); 2618 pr_info("mds%d reconnect denied\n", session->s_mds);
2619 cleanup_session_requests(mdsc, session);
2593 remove_session_caps(session); 2620 remove_session_caps(session);
2594 wake = 2; /* for good measure */ 2621 wake = 2; /* for good measure */
2595 wake_up_all(&mdsc->session_close_wq); 2622 wake_up_all(&mdsc->session_close_wq);
@@ -2658,7 +2685,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2658 2685
2659 mutex_lock(&mdsc->mutex); 2686 mutex_lock(&mdsc->mutex);
2660 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { 2687 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2661 err = __prepare_send_request(mdsc, req, session->s_mds); 2688 err = __prepare_send_request(mdsc, req, session->s_mds, true);
2662 if (!err) { 2689 if (!err) {
2663 ceph_msg_get(req->r_request); 2690 ceph_msg_get(req->r_request);
2664 ceph_con_send(&session->s_con, req->r_request); 2691 ceph_con_send(&session->s_con, req->r_request);
@@ -2679,7 +2706,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2679 continue; /* only old requests */ 2706 continue; /* only old requests */
2680 if (req->r_session && 2707 if (req->r_session &&
2681 req->r_session->s_mds == session->s_mds) { 2708 req->r_session->s_mds == session->s_mds) {
2682 err = __prepare_send_request(mdsc, req, session->s_mds); 2709 err = __prepare_send_request(mdsc, req,
2710 session->s_mds, true);
2683 if (!err) { 2711 if (!err) {
2684 ceph_msg_get(req->r_request); 2712 ceph_msg_get(req->r_request);
2685 ceph_con_send(&session->s_con, req->r_request); 2713 ceph_con_send(&session->s_con, req->r_request);
@@ -2864,7 +2892,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2864 spin_unlock(&session->s_cap_lock); 2892 spin_unlock(&session->s_cap_lock);
2865 2893
2866 /* trim unused caps to reduce MDS's cache rejoin time */ 2894 /* trim unused caps to reduce MDS's cache rejoin time */
2867 shrink_dcache_parent(mdsc->fsc->sb->s_root); 2895 if (mdsc->fsc->sb->s_root)
2896 shrink_dcache_parent(mdsc->fsc->sb->s_root);
2868 2897
2869 ceph_con_close(&session->s_con); 2898 ceph_con_close(&session->s_con);
2870 ceph_con_open(&session->s_con, 2899 ceph_con_open(&session->s_con,
@@ -3133,7 +3162,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
3133 di->lease_renew_from && 3162 di->lease_renew_from &&
3134 di->lease_renew_after == 0) { 3163 di->lease_renew_after == 0) {
3135 unsigned long duration = 3164 unsigned long duration =
3136 le32_to_cpu(h->duration_ms) * HZ / 1000; 3165 msecs_to_jiffies(le32_to_cpu(h->duration_ms));
3137 3166
3138 di->lease_seq = seq; 3167 di->lease_seq = seq;
3139 dentry->d_time = di->lease_renew_from + duration; 3168 dentry->d_time = di->lease_renew_from + duration;
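
The final hunk replaces the open-coded "duration_ms * HZ / 1000" with msecs_to_jiffies(). Beyond readability, the helper sidesteps 32-bit overflow of the intermediate product (in the kernel it also rounds up and saturates at MAX_JIFFY_OFFSET, which this sketch omits). A standalone illustration with HZ pinned to 1000:

    #include <stdint.h>
    #include <stdio.h>

    #define HZ 1000U

    int main(void)
    {
            uint32_t ms = 5000000;  /* ~83 minutes of lease duration */

            /* 5e9 does not fit in 32 bits, so this silently wraps. */
            uint32_t open_coded = ms * HZ / 1000;
            /* What a correct conversion computes. */
            uint64_t widened = (uint64_t)ms * HZ / 1000;

            printf("open-coded: %u, widened: %llu\n",
                   open_coded, (unsigned long long)widened);
            return 0;
    }
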
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 51cc23e48111..89e6bc321df3 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -75,6 +75,7 @@ const char *ceph_mds_op_name(int op)
75 case CEPH_MDS_OP_LSSNAP: return "lssnap"; 75 case CEPH_MDS_OP_LSSNAP: return "lssnap";
76 case CEPH_MDS_OP_MKSNAP: return "mksnap"; 76 case CEPH_MDS_OP_MKSNAP: return "mksnap";
77 case CEPH_MDS_OP_RMSNAP: return "rmsnap"; 77 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
78 case CEPH_MDS_OP_RENAMESNAP: return "renamesnap";
78 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; 79 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
79 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; 80 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
80 } 81 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index a63997b8bcff..e463ebd69a9c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -345,6 +345,11 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
345 fsopt->rsize = CEPH_RSIZE_DEFAULT; 345 fsopt->rsize = CEPH_RSIZE_DEFAULT;
346 fsopt->rasize = CEPH_RASIZE_DEFAULT; 346 fsopt->rasize = CEPH_RASIZE_DEFAULT;
347 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 347 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
348 if (!fsopt->snapdir_name) {
349 err = -ENOMEM;
350 goto out;
351 }
352
348 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 353 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
349 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 354 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
350 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 355 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
@@ -406,31 +411,20 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
406{ 411{
407 struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); 412 struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb);
408 struct ceph_mount_options *fsopt = fsc->mount_options; 413 struct ceph_mount_options *fsopt = fsc->mount_options;
409 struct ceph_options *opt = fsc->client->options; 414 size_t pos;
410 415 int ret;
411 if (opt->flags & CEPH_OPT_FSID) 416
412 seq_printf(m, ",fsid=%pU", &opt->fsid); 417 /* a comma between MNT/MS and client options */
413 if (opt->flags & CEPH_OPT_NOSHARE) 418 seq_putc(m, ',');
414 seq_puts(m, ",noshare"); 419 pos = m->count;
415 if (opt->flags & CEPH_OPT_NOCRC) 420
416 seq_puts(m, ",nocrc"); 421 ret = ceph_print_client_options(m, fsc->client);
417 if (opt->flags & CEPH_OPT_NOMSGAUTH) 422 if (ret)
418 seq_puts(m, ",nocephx_require_signatures"); 423 return ret;
419 if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) 424
420 seq_puts(m, ",notcp_nodelay"); 425 /* retract our comma if no client options */
421 426 if (m->count == pos)
422 if (opt->name) 427 m->count--;
423 seq_printf(m, ",name=%s", opt->name);
424 if (opt->key)
425 seq_puts(m, ",secret=<hidden>");
426
427 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
428 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
429 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
430 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
431 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
432 seq_printf(m, ",osdkeepalivetimeout=%d",
433 opt->osd_keepalive_timeout);
434 428
435 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) 429 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
436 seq_puts(m, ",dirstat"); 430 seq_puts(m, ",dirstat");
@@ -438,14 +432,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
438 seq_puts(m, ",norbytes"); 432 seq_puts(m, ",norbytes");
439 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) 433 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
440 seq_puts(m, ",noasyncreaddir"); 434 seq_puts(m, ",noasyncreaddir");
441 if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) 435 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
442 seq_puts(m, ",dcache");
443 else
444 seq_puts(m, ",nodcache"); 436 seq_puts(m, ",nodcache");
445 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) 437 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE)
446 seq_puts(m, ",fsc"); 438 seq_puts(m, ",fsc");
447 else
448 seq_puts(m, ",nofsc");
449 439
450#ifdef CONFIG_CEPH_FS_POSIX_ACL 440#ifdef CONFIG_CEPH_FS_POSIX_ACL
451 if (fsopt->sb_flags & MS_POSIXACL) 441 if (fsopt->sb_flags & MS_POSIXACL)
@@ -477,6 +467,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
477 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); 467 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
478 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 468 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
479 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); 469 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
470
480 return 0; 471 return 0;
481} 472}
482 473
@@ -730,6 +721,11 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
730 if (IS_ERR(req)) 721 if (IS_ERR(req))
731 return ERR_CAST(req); 722 return ERR_CAST(req);
732 req->r_path1 = kstrdup(path, GFP_NOFS); 723 req->r_path1 = kstrdup(path, GFP_NOFS);
724 if (!req->r_path1) {
725 root = ERR_PTR(-ENOMEM);
726 goto out;
727 }
728
733 req->r_ino1.ino = CEPH_INO_ROOT; 729 req->r_ino1.ino = CEPH_INO_ROOT;
734 req->r_ino1.snap = CEPH_NOSNAP; 730 req->r_ino1.snap = CEPH_NOSNAP;
735 req->r_started = started; 731 req->r_started = started;
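
The show_options rewrite hands the libceph options to ceph_print_client_options() and deals with the separator optimistically: emit a comma, remember m->count, and retract the comma if the helper printed nothing. A standalone analogue of that trick (the seq structure and helpers are simplified stand-ins for seq_file):

    #include <stdio.h>
    #include <string.h>

    struct seq {
            char buf[128];
            size_t count;
    };

    static void seq_putc(struct seq *m, char c)
    {
            m->buf[m->count++] = c;
    }

    static void seq_puts(struct seq *m, const char *s)
    {
            memcpy(m->buf + m->count, s, strlen(s));
            m->count += strlen(s);
    }

    /* Stand-in for ceph_print_client_options(); may print nothing. */
    static void print_client_options(struct seq *m, int have_opts)
    {
            if (have_opts)
                    seq_puts(m, "name=admin");
    }

    int main(void)
    {
            struct seq m = { .count = 0 };
            size_t pos;

            seq_putc(&m, ',');              /* optimistic separator */
            pos = m.count;
            print_client_options(&m, 0);    /* nothing printed this time */
            if (m.count == pos)
                    m.count--;              /* retract the comma */

            m.buf[m.count] = '\0';
            printf("[%s]\n", m.buf);        /* prints "[]" */
            return 0;
    }
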
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 04c8124ed30e..fa20e1318939 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -36,7 +36,8 @@
36#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ 36#define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */
37#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ 37#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
38 38
39#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) 39#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \
40 CEPH_MOUNT_OPT_DCACHE)
40 41
41#define ceph_set_mount_opt(fsc, opt) \ 42#define ceph_set_mount_opt(fsc, opt) \
42 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; 43 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
@@ -881,7 +882,6 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
881 882
882/* file.c */ 883/* file.c */
883extern const struct file_operations ceph_file_fops; 884extern const struct file_operations ceph_file_fops;
884extern const struct address_space_operations ceph_aops;
885 885
886extern int ceph_open(struct inode *inode, struct file *file); 886extern int ceph_open(struct inode *inode, struct file *file);
887extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, 887extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 5a492caf34cb..5c4c9c256931 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -877,16 +877,23 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
877 err = PTR_ERR(req); 877 err = PTR_ERR(req);
878 goto out; 878 goto out;
879 } 879 }
880 req->r_inode = inode; 880
881 ihold(inode);
882 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
883 req->r_num_caps = 1;
884 req->r_args.setxattr.flags = cpu_to_le32(flags); 881 req->r_args.setxattr.flags = cpu_to_le32(flags);
885 req->r_path2 = kstrdup(name, GFP_NOFS); 882 req->r_path2 = kstrdup(name, GFP_NOFS);
883 if (!req->r_path2) {
884 ceph_mdsc_put_request(req);
885 err = -ENOMEM;
886 goto out;
887 }
886 888
887 req->r_pagelist = pagelist; 889 req->r_pagelist = pagelist;
888 pagelist = NULL; 890 pagelist = NULL;
889 891
892 req->r_inode = inode;
893 ihold(inode);
894 req->r_num_caps = 1;
895 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
896
890 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 897 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
891 err = ceph_mdsc_do_request(mdsc, NULL, req); 898 err = ceph_mdsc_do_request(mdsc, NULL, req);
892 ceph_mdsc_put_request(req); 899 ceph_mdsc_put_request(req);
@@ -1019,12 +1026,14 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
1019 USE_AUTH_MDS); 1026 USE_AUTH_MDS);
1020 if (IS_ERR(req)) 1027 if (IS_ERR(req))
1021 return PTR_ERR(req); 1028 return PTR_ERR(req);
1029 req->r_path2 = kstrdup(name, GFP_NOFS);
1030 if (!req->r_path2)
1031 return -ENOMEM;
1032
1022 req->r_inode = inode; 1033 req->r_inode = inode;
1023 ihold(inode); 1034 ihold(inode);
1024 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
1025 req->r_num_caps = 1; 1035 req->r_num_caps = 1;
1026 req->r_path2 = kstrdup(name, GFP_NOFS); 1036 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
1027
1028 err = ceph_mdsc_do_request(mdsc, NULL, req); 1037 err = ceph_mdsc_do_request(mdsc, NULL, req);
1029 ceph_mdsc_put_request(req); 1038 ceph_mdsc_put_request(req);
1030 return err; 1039 return err;
diff --git a/fs/exec.c b/fs/exec.c
index 02bfd980a40c..49a1c61433b7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1275,6 +1275,53 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1275 spin_unlock(&p->fs->lock); 1275 spin_unlock(&p->fs->lock);
1276} 1276}
1277 1277
1278static void bprm_fill_uid(struct linux_binprm *bprm)
1279{
1280 struct inode *inode;
1281 unsigned int mode;
1282 kuid_t uid;
1283 kgid_t gid;
1284
1285 /* clear any previous set[ug]id data from a previous binary */
1286 bprm->cred->euid = current_euid();
1287 bprm->cred->egid = current_egid();
1288
1289 if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
1290 return;
1291
1292 if (task_no_new_privs(current))
1293 return;
1294
1295 inode = file_inode(bprm->file);
1296 mode = READ_ONCE(inode->i_mode);
1297 if (!(mode & (S_ISUID|S_ISGID)))
1298 return;
1299
1300 /* Be careful if suid/sgid is set */
1301 mutex_lock(&inode->i_mutex);
1302
1303 /* reload atomically mode/uid/gid now that lock held */
1304 mode = inode->i_mode;
1305 uid = inode->i_uid;
1306 gid = inode->i_gid;
1307 mutex_unlock(&inode->i_mutex);
1308
1309 /* We ignore suid/sgid if there are no mappings for them in the ns */
1310 if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
1311 !kgid_has_mapping(bprm->cred->user_ns, gid))
1312 return;
1313
1314 if (mode & S_ISUID) {
1315 bprm->per_clear |= PER_CLEAR_ON_SETID;
1316 bprm->cred->euid = uid;
1317 }
1318
1319 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1320 bprm->per_clear |= PER_CLEAR_ON_SETID;
1321 bprm->cred->egid = gid;
1322 }
1323}
1324
1278/* 1325/*
1279 * Fill the binprm structure from the inode. 1326 * Fill the binprm structure from the inode.
1280 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes 1327 * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
@@ -1283,36 +1330,9 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1283 */ 1330 */
1284int prepare_binprm(struct linux_binprm *bprm) 1331int prepare_binprm(struct linux_binprm *bprm)
1285{ 1332{
1286 struct inode *inode = file_inode(bprm->file);
1287 umode_t mode = inode->i_mode;
1288 int retval; 1333 int retval;
1289 1334
1290 1335 bprm_fill_uid(bprm);
1291 /* clear any previous set[ug]id data from a previous binary */
1292 bprm->cred->euid = current_euid();
1293 bprm->cred->egid = current_egid();
1294
1295 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
1296 !task_no_new_privs(current) &&
1297 kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
1298 kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
1299 /* Set-uid? */
1300 if (mode & S_ISUID) {
1301 bprm->per_clear |= PER_CLEAR_ON_SETID;
1302 bprm->cred->euid = inode->i_uid;
1303 }
1304
1305 /* Set-gid? */
1306 /*
1307 * If setgid is set but no group execute bit then this
1308 * is a candidate for mandatory locking, not a setgid
1309 * executable.
1310 */
1311 if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
1312 bprm->per_clear |= PER_CLEAR_ON_SETID;
1313 bprm->cred->egid = inode->i_gid;
1314 }
1315 }
1316 1336
1317 /* fill in binprm security blob */ 1337 /* fill in binprm security blob */
1318 retval = security_bprm_set_creds(bprm); 1338 retval = security_bprm_set_creds(bprm);
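
This refactor is not purely cosmetic: the removed code sampled i_mode, i_uid and i_gid with no locking, so a concurrent chmod()/chown() could be observed half-applied. bprm_fill_uid() keeps a lockless READ_ONCE() fast path for the common non-setid case, then re-reads all three fields as one snapshot under i_mutex before honoring the bits (the MNT_NOSUID, no_new_privs and id-mapping bail-outs are unchanged). The setgid test still requires S_IXGRP, since setgid without group-execute marks a mandatory-locking candidate rather than a setgid executable. A compressed standalone sketch of the check-then-recheck shape, with a pthread mutex standing in for i_mutex (build with -pthread):

    #include <pthread.h>
    #include <stdio.h>

    #define MY_ISUID 04000
    #define MY_ISGID 02000
    #define MY_IXGRP 00010

    struct inode_like {
            pthread_mutex_t lock;
            unsigned int mode, uid, gid;
    };

    static void fill_uid(struct inode_like *inode,
                         unsigned int *euid, unsigned int *egid)
    {
            /* Lockless fast path: most binaries are not setid at all. */
            unsigned int mode = __atomic_load_n(&inode->mode,
                                                __ATOMIC_RELAXED);
            unsigned int uid, gid;

            if (!(mode & (MY_ISUID | MY_ISGID)))
                    return;

            /* Slow path: reload mode/uid/gid as one consistent snapshot. */
            pthread_mutex_lock(&inode->lock);
            mode = inode->mode;
            uid = inode->uid;
            gid = inode->gid;
            pthread_mutex_unlock(&inode->lock);

            if (mode & MY_ISUID)
                    *euid = uid;
            if ((mode & (MY_ISGID | MY_IXGRP)) == (MY_ISGID | MY_IXGRP))
                    *egid = gid;
    }

    int main(void)
    {
            struct inode_like inode = {
                    PTHREAD_MUTEX_INITIALIZER, 04755, 0, 0
            };
            unsigned int euid = 1000, egid = 1000;

            fill_uid(&inode, &euid, &egid);
            printf("euid=%u egid=%u\n", euid, egid);  /* euid becomes 0 */
            return 0;
    }
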
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index efea5d5c44ce..18228c201f7f 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -64,6 +64,23 @@ config EXT4_FS_SECURITY
64 If you are not using a security module that requires using 64 If you are not using a security module that requires using
65 extended attributes for file security labels, say N. 65 extended attributes for file security labels, say N.
66 66
67config EXT4_FS_ENCRYPTION
68 bool "Ext4 Encryption"
69 depends on EXT4_FS
70 select CRYPTO_AES
71 select CRYPTO_CBC
72 select CRYPTO_ECB
73 select CRYPTO_XTS
74 select CRYPTO_CTS
75 select CRYPTO_SHA256
76 select KEYS
77 select ENCRYPTED_KEYS
78 help
79 Enable encryption of ext4 files and directories. This
80 feature is similar to ecryptfs, but it is more memory
81 efficient since it avoids caching the encrypted and
82 decrypted pages in the page cache.
83
67config EXT4_DEBUG 84config EXT4_DEBUG
68 bool "EXT4 debugging support" 85 bool "EXT4 debugging support"
69 depends on EXT4_FS 86 depends on EXT4_FS
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 0310fec2ee3d..75285ea9aa05 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,9 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ 8 ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ 9 ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
10 mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ 10 mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
11 xattr_trusted.o inline.o 11 xattr_trusted.o inline.o readpage.o
12 12
13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o 13ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
14ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o 14ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
15ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \
16 crypto_key.o crypto_fname.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index d40c8dbbb0d6..69b1e73026a5 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -4,11 +4,6 @@
4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> 4 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
5 */ 5 */
6 6
7#include <linux/init.h>
8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/capability.h>
11#include <linux/fs.h>
12#include "ext4_jbd2.h" 7#include "ext4_jbd2.h"
13#include "ext4.h" 8#include "ext4.h"
14#include "xattr.h" 9#include "xattr.h"
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 83a6f497c4e0..955bf49a7945 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -14,7 +14,6 @@
14#include <linux/time.h> 14#include <linux/time.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/quotaops.h> 17#include <linux/quotaops.h>
19#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
20#include "ext4.h" 19#include "ext4.h"
@@ -641,8 +640,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
641 * fail EDQUOT for metadata, but we do account for it. 640 * fail EDQUOT for metadata, but we do account for it.
642 */ 641 */
643 if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { 642 if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) {
644 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
645 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
646 dquot_alloc_block_nofail(inode, 643 dquot_alloc_block_nofail(inode,
647 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); 644 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
648 } 645 }
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index b610779a958c..4a606afb171f 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -8,7 +8,6 @@
8 */ 8 */
9 9
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/jbd2.h>
12#include "ext4.h" 11#include "ext4.h"
13 12
14unsigned int ext4_count_free(char *bitmap, unsigned int numchars) 13unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 41eb9dcfac7e..3522340c7a99 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -16,7 +16,6 @@
16#include <linux/swap.h> 16#include <linux/swap.h>
17#include <linux/pagemap.h> 17#include <linux/pagemap.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/mutex.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include "ext4.h" 20#include "ext4.h"
22 21
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
new file mode 100644
index 000000000000..8ff15273ab0c
--- /dev/null
+++ b/fs/ext4/crypto.c
@@ -0,0 +1,558 @@
1/*
2 * linux/fs/ext4/crypto.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption functions for ext4
7 *
8 * Written by Michael Halcrow, 2014.
9 *
10 * Filename encryption additions
11 * Uday Savagaonkar, 2014
12 * Encryption policy handling additions
13 * Ildar Muslukhov, 2014
14 *
15 * This has not yet undergone a rigorous security audit.
16 *
17 * The usage of AES-XTS should conform to recommendations in NIST
18 * Special Publication 800-38E and IEEE P1619/D16.
19 */
20
21#include <crypto/hash.h>
22#include <crypto/sha.h>
23#include <keys/user-type.h>
24#include <keys/encrypted-type.h>
25#include <linux/crypto.h>
26#include <linux/ecryptfs.h>
27#include <linux/gfp.h>
28#include <linux/kernel.h>
29#include <linux/key.h>
30#include <linux/list.h>
31#include <linux/mempool.h>
32#include <linux/module.h>
33#include <linux/mutex.h>
34#include <linux/random.h>
35#include <linux/scatterlist.h>
36#include <linux/spinlock_types.h>
37
38#include "ext4_extents.h"
39#include "xattr.h"
40
41/* Encryption added and removed here! (L: */
42
43static unsigned int num_prealloc_crypto_pages = 32;
44static unsigned int num_prealloc_crypto_ctxs = 128;
45
46module_param(num_prealloc_crypto_pages, uint, 0444);
47MODULE_PARM_DESC(num_prealloc_crypto_pages,
48 "Number of crypto pages to preallocate");
49module_param(num_prealloc_crypto_ctxs, uint, 0444);
50MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
51 "Number of crypto contexts to preallocate");
52
53static mempool_t *ext4_bounce_page_pool;
54
55static LIST_HEAD(ext4_free_crypto_ctxs);
56static DEFINE_SPINLOCK(ext4_crypto_ctx_lock);
57
58/**
59 * ext4_release_crypto_ctx() - Releases an encryption context
60 * @ctx: The encryption context to release.
61 *
62 * If the encryption context was allocated from the pre-allocated pool, returns
63 * it to that pool. Else, frees it.
64 *
65 * If there's a bounce page in the context, this frees that.
66 */
67void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx)
68{
69 unsigned long flags;
70
71 if (ctx->bounce_page) {
72 if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL)
73 __free_page(ctx->bounce_page);
74 else
75 mempool_free(ctx->bounce_page, ext4_bounce_page_pool);
76 ctx->bounce_page = NULL;
77 }
78 ctx->control_page = NULL;
79 if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) {
80 if (ctx->tfm)
81 crypto_free_tfm(ctx->tfm);
82 kfree(ctx);
83 } else {
84 spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
85 list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
86 spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
87 }
88}
89
90/**
91 * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context
92 * @mask: The allocation mask.
93 *
94 * Return: An allocated and initialized encryption context on success. An error
95 * value or NULL otherwise.
96 */
97static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask)
98{
99 struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx),
100 mask);
101
102 if (!ctx)
103 return ERR_PTR(-ENOMEM);
104 return ctx;
105}
106
107/**
108 * ext4_get_crypto_ctx() - Gets an encryption context
109 * @inode: The inode for which we are doing the crypto
110 *
111 * Allocates and initializes an encryption context.
112 *
113 * Return: An allocated and initialized encryption context on success; error
114 * value or NULL otherwise.
115 */
116struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode)
117{
118 struct ext4_crypto_ctx *ctx = NULL;
119 int res = 0;
120 unsigned long flags;
121 struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key;
122
123 if (!ext4_read_workqueue)
124 ext4_init_crypto();
125
126 /*
127 * We first try getting the ctx from a free list because in
128 * the common case the ctx will have an allocated and
129 * initialized crypto tfm, so it's probably a worthwhile
130 * optimization. For the bounce page, we first try getting it
131 * from the kernel allocator because that's just about as fast
132 * as getting it from a list and because a cache of free pages
133 * should generally be a "last resort" option for a filesystem
134 * to be able to do its job.
135 */
136 spin_lock_irqsave(&ext4_crypto_ctx_lock, flags);
137 ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs,
138 struct ext4_crypto_ctx, free_list);
139 if (ctx)
140 list_del(&ctx->free_list);
141 spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags);
142 if (!ctx) {
143 ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS);
144 if (IS_ERR(ctx)) {
145 res = PTR_ERR(ctx);
146 goto out;
147 }
148 ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
149 } else {
150 ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL;
151 }
152
153 /* Allocate a new Crypto API context if we don't already have
154 * one or if it isn't the right mode. */
155 BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID);
156 if (ctx->tfm && (ctx->mode != key->mode)) {
157 crypto_free_tfm(ctx->tfm);
158 ctx->tfm = NULL;
159 ctx->mode = EXT4_ENCRYPTION_MODE_INVALID;
160 }
161 if (!ctx->tfm) {
162 switch (key->mode) {
163 case EXT4_ENCRYPTION_MODE_AES_256_XTS:
164 ctx->tfm = crypto_ablkcipher_tfm(
165 crypto_alloc_ablkcipher("xts(aes)", 0, 0));
166 break;
167 case EXT4_ENCRYPTION_MODE_AES_256_GCM:
168 /* TODO(mhalcrow): AEAD w/ gcm(aes);
169 * crypto_aead_setauthsize() */
170 ctx->tfm = ERR_PTR(-ENOTSUPP);
171 break;
172 default:
173 BUG();
174 }
175 if (IS_ERR_OR_NULL(ctx->tfm)) {
176 res = PTR_ERR(ctx->tfm);
177 ctx->tfm = NULL;
178 goto out;
179 }
180 ctx->mode = key->mode;
181 }
182 BUG_ON(key->size != ext4_encryption_key_size(key->mode));
183
184 /* There shouldn't be a bounce page attached to the crypto
185 * context at this point. */
186 BUG_ON(ctx->bounce_page);
187
188out:
189 if (res) {
190 if (!IS_ERR_OR_NULL(ctx))
191 ext4_release_crypto_ctx(ctx);
192 ctx = ERR_PTR(res);
193 }
194 return ctx;
195}
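
Taken together, ext4_get_crypto_ctx() and ext4_release_crypto_ctx() form a two-tier allocator: the spinlock-protected free list is tried first, because a recycled context usually still carries a usable tfm, with kzalloc() as the fallback, and EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL records which tier a context came from so that release can return it to the right place. A minimal standalone model of the flag-tracked scheme (locking omitted for brevity):

    #include <stdlib.h>

    #define REQUIRES_FREE 0x1

    struct ctx {
            int flags;
            struct ctx *next;       /* free-list linkage */
    };

    static struct ctx *free_list;

    static struct ctx *get_ctx(void)
    {
            struct ctx *c = free_list;

            if (c) {
                    free_list = c->next;
                    c->flags &= ~REQUIRES_FREE;     /* from the pool */
            } else {
                    c = calloc(1, sizeof(*c));
                    if (c)
                            c->flags |= REQUIRES_FREE;  /* heap fallback */
            }
            return c;
    }

    static void release_ctx(struct ctx *c)
    {
            if (c->flags & REQUIRES_FREE) {
                    free(c);                /* heap-born: really free it */
            } else {
                    c->next = free_list;    /* pool-born: recycle */
                    free_list = c;
            }
    }

    int main(void)
    {
            struct ctx *c = get_ctx();      /* heap: pool starts empty */

            if (!c)
                    return 1;
            release_ctx(c);                 /* freed, flag was set */
            return 0;
    }
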
196
197struct workqueue_struct *ext4_read_workqueue;
198static DEFINE_MUTEX(crypto_init);
199
200/**
201 * ext4_exit_crypto() - Shutdown the ext4 encryption system
202 */
203void ext4_exit_crypto(void)
204{
205 struct ext4_crypto_ctx *pos, *n;
206
207 list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) {
208 if (pos->bounce_page) {
209 if (pos->flags &
210 EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) {
211 __free_page(pos->bounce_page);
212 } else {
213 mempool_free(pos->bounce_page,
214 ext4_bounce_page_pool);
215 }
216 }
217 if (pos->tfm)
218 crypto_free_tfm(pos->tfm);
219 kfree(pos);
220 }
221 INIT_LIST_HEAD(&ext4_free_crypto_ctxs);
222 if (ext4_bounce_page_pool)
223 mempool_destroy(ext4_bounce_page_pool);
224 ext4_bounce_page_pool = NULL;
225 if (ext4_read_workqueue)
226 destroy_workqueue(ext4_read_workqueue);
227 ext4_read_workqueue = NULL;
228}
229
230/**
231 * ext4_init_crypto() - Set up for ext4 encryption.
232 *
233 * We only call this when we start accessing encrypted files, since it
234 * results in memory getting allocated that wouldn't otherwise be used.
235 *
236 * Return: Zero on success, non-zero otherwise.
237 */
238int ext4_init_crypto(void)
239{
240 int i, res;
241
242 mutex_lock(&crypto_init);
243 if (ext4_read_workqueue)
244 goto already_initialized;
245 ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0);
246 if (!ext4_read_workqueue) {
247 res = -ENOMEM;
248 goto fail;
249 }
250
251 for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
252 struct ext4_crypto_ctx *ctx;
253
254 ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL);
255 if (IS_ERR(ctx)) {
256 res = PTR_ERR(ctx);
257 goto fail;
258 }
259 list_add(&ctx->free_list, &ext4_free_crypto_ctxs);
260 }
261
262 ext4_bounce_page_pool =
263 mempool_create_page_pool(num_prealloc_crypto_pages, 0);
264 if (!ext4_bounce_page_pool) {
265 res = -ENOMEM;
266 goto fail;
267 }
268already_initialized:
269 mutex_unlock(&crypto_init);
270 return 0;
271fail:
272 ext4_exit_crypto();
273 mutex_unlock(&crypto_init);
274 return res;
275}
276
277void ext4_restore_control_page(struct page *data_page)
278{
279 struct ext4_crypto_ctx *ctx =
280 (struct ext4_crypto_ctx *)page_private(data_page);
281
282 set_page_private(data_page, (unsigned long)NULL);
283 ClearPagePrivate(data_page);
284 unlock_page(data_page);
285 ext4_release_crypto_ctx(ctx);
286}
287
288/**
289 * ext4_crypt_complete() - The completion callback for page encryption
290 * @req: The asynchronous encryption request context
291 * @res: The result of the encryption operation
292 */
293static void ext4_crypt_complete(struct crypto_async_request *req, int res)
294{
295 struct ext4_completion_result *ecr = req->data;
296
297 if (res == -EINPROGRESS)
298 return;
299 ecr->res = res;
300 complete(&ecr->completion);
301}
302
303typedef enum {
304 EXT4_DECRYPT = 0,
305 EXT4_ENCRYPT,
306} ext4_direction_t;
307
308static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
309 struct inode *inode,
310 ext4_direction_t rw,
311 pgoff_t index,
312 struct page *src_page,
313 struct page *dest_page)
314
315{
316 u8 xts_tweak[EXT4_XTS_TWEAK_SIZE];
317 struct ablkcipher_request *req = NULL;
318 DECLARE_EXT4_COMPLETION_RESULT(ecr);
319 struct scatterlist dst, src;
320 struct ext4_inode_info *ei = EXT4_I(inode);
321 struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm);
322 int res = 0;
323
324 BUG_ON(!ctx->tfm);
325 BUG_ON(ctx->mode != ei->i_encryption_key.mode);
326
327 if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) {
328 printk_ratelimited(KERN_ERR
329 "%s: unsupported crypto algorithm: %d\n",
330 __func__, ctx->mode);
331 return -ENOTSUPP;
332 }
333
334 crypto_ablkcipher_clear_flags(atfm, ~0);
335 crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY);
336
337 res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw,
338 ei->i_encryption_key.size);
339 if (res) {
340 printk_ratelimited(KERN_ERR
341 "%s: crypto_ablkcipher_setkey() failed\n",
342 __func__);
343 return res;
344 }
345 req = ablkcipher_request_alloc(atfm, GFP_NOFS);
346 if (!req) {
347 printk_ratelimited(KERN_ERR
348 "%s: ablkcipher_request_alloc() failed\n",
349 __func__);
350 return -ENOMEM;
351 }
352 ablkcipher_request_set_callback(
353 req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
354 ext4_crypt_complete, &ecr);
355
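/* The XTS tweak is simply the page index, zero-padded out to
 * EXT4_XTS_TWEAK_SIZE, so identical plaintext pages at different file
 * offsets produce different ciphertext under the same key. */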
356 BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index));
357 memcpy(xts_tweak, &index, sizeof(index));
358 memset(&xts_tweak[sizeof(index)], 0,
359 EXT4_XTS_TWEAK_SIZE - sizeof(index));
360
361 sg_init_table(&dst, 1);
362 sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
363 sg_init_table(&src, 1);
364 sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
365 ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
366 xts_tweak);
367 if (rw == EXT4_DECRYPT)
368 res = crypto_ablkcipher_decrypt(req);
369 else
370 res = crypto_ablkcipher_encrypt(req);
371 if (res == -EINPROGRESS || res == -EBUSY) {
372 BUG_ON(req->base.data != &ecr);
373 wait_for_completion(&ecr.completion);
374 res = ecr.res;
375 }
376 ablkcipher_request_free(req);
377 if (res) {
378 printk_ratelimited(
379 KERN_ERR
380 "%s: crypto_ablkcipher_encrypt/decrypt() returned %d\n",
381 __func__, res);
382 return res;
383 }
384 return 0;
385}
386
387/**
388 * ext4_encrypt() - Encrypts a page
389 * @inode: The inode for which the encryption should take place
390 * @plaintext_page: The page to encrypt. Must be locked.
391 *
392 * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
393 * encryption context.
394 *
395 * Called on the page write path. The caller must call
396 * ext4_restore_control_page() on the returned ciphertext page to
397 * release the bounce buffer and the encryption context.
398 *
399 * Return: An allocated page with the encrypted content on success. Else, an
400 * ERR_PTR()-encoded error value.
401 */
402struct page *ext4_encrypt(struct inode *inode,
403 struct page *plaintext_page)
404{
405 struct ext4_crypto_ctx *ctx;
406 struct page *ciphertext_page = NULL;
407 int err;
408
409 BUG_ON(!PageLocked(plaintext_page));
410
411 ctx = ext4_get_crypto_ctx(inode);
412 if (IS_ERR(ctx))
413 return (struct page *) ctx;
414
415 /* The encryption operation will require a bounce page. */
416 ciphertext_page = alloc_page(GFP_NOFS);
417 if (!ciphertext_page) {
418 /* This is a potential bottleneck, but at least we'll have
419 * forward progress. */
420 ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
421 GFP_NOFS);
422 if (WARN_ON_ONCE(!ciphertext_page)) {
423 ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
424 GFP_NOFS | __GFP_WAIT);
425 }
426 ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
427 } else {
428 ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
429 }
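/* The flag set above records which allocator the bounce page came
 * from, so that ext4_release_crypto_ctx() can free it back to the
 * right place. */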
430 ctx->bounce_page = ciphertext_page;
431 ctx->control_page = plaintext_page;
432 err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
433 plaintext_page, ciphertext_page);
434 if (err) {
435 ext4_release_crypto_ctx(ctx);
436 return ERR_PTR(err);
437 }
438 SetPagePrivate(ciphertext_page);
439 set_page_private(ciphertext_page, (unsigned long)ctx);
440 lock_page(ciphertext_page);
441 return ciphertext_page;
442}
443
444/**
445 * ext4_decrypt() - Decrypts a page in-place
446 * @ctx: The encryption context.
447 * @page: The page to decrypt. Must be locked.
448 *
449 * Decrypts page in-place using the ctx encryption context.
450 *
451 * Called from the read completion callback.
452 *
453 * Return: Zero on success, non-zero otherwise.
454 */
455int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page)
456{
457 BUG_ON(!PageLocked(page));
458
459 return ext4_page_crypto(ctx, page->mapping->host,
460 EXT4_DECRYPT, page->index, page, page);
461}
462
463/*
464 * Convenience function which takes care of allocating and
465 * deallocating the encryption context
466 */
467int ext4_decrypt_one(struct inode *inode, struct page *page)
468{
469 int ret;
470
471 struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
472
473 if (IS_ERR(ctx))
474 return PTR_ERR(ctx);
475 ret = ext4_decrypt(ctx, page);
476 ext4_release_crypto_ctx(ctx);
477 return ret;
478}
479
480int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
481{
482 struct ext4_crypto_ctx *ctx;
483 struct page *ciphertext_page = NULL;
484 struct bio *bio;
485 ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
486 ext4_fsblk_t pblk = ext4_ext_pblock(ex);
487 unsigned int len = ext4_ext_get_actual_len(ex);
488 int err = 0;
489
490 BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
491
492 ctx = ext4_get_crypto_ctx(inode);
493 if (IS_ERR(ctx))
494 return PTR_ERR(ctx);
495
496 ciphertext_page = alloc_page(GFP_NOFS);
497 if (!ciphertext_page) {
498 /* This is a potential bottleneck, but at least we'll have
499 * forward progress. */
500 ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
501 GFP_NOFS);
502 if (WARN_ON_ONCE(!ciphertext_page)) {
503 ciphertext_page = mempool_alloc(ext4_bounce_page_pool,
504 GFP_NOFS | __GFP_WAIT);
505 }
506 ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
507 } else {
508 ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL;
509 }
510 ctx->bounce_page = ciphertext_page;
511
512 while (len--) {
513 err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
514 ZERO_PAGE(0), ciphertext_page);
515 if (err)
516 goto errout;
517
518 bio = bio_alloc(GFP_KERNEL, 1);
519 if (!bio) {
520 err = -ENOMEM;
521 goto errout;
522 }
523 bio->bi_bdev = inode->i_sb->s_bdev;
524 bio->bi_iter.bi_sector = pblk;
525 err = bio_add_page(bio, ciphertext_page,
526 inode->i_sb->s_blocksize, 0);
527 if (err != inode->i_sb->s_blocksize) {
528 bio_put(bio); err = -EIO; goto errout;
529 }
530 err = submit_bio_wait(WRITE, bio);
531 bio_put(bio);
532 if (err)
533 goto errout;
534 lblk++; pblk++;
535 }
536errout:
537 ext4_release_crypto_ctx(ctx);
538 return err;
539}
540
541bool ext4_valid_contents_enc_mode(uint32_t mode)
542{
543 return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS);
544}
545
546/**
547 * ext4_validate_encryption_key_size() - Validate the encryption key size
548 * @mode: The key mode.
549 * @size: The key size to validate.
550 *
551 * Return: The validated key size for @mode. Zero if invalid.
552 */
553uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
554{
555 if (size == ext4_encryption_key_size(mode))
556 return size;
557 return 0;
558}
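/*
 * Minimal usage sketch (illustrative only, not part of this file): a
 * caller validating a key fetched from the keyring might do
 *
 *	if (!ext4_validate_encryption_key_size(mode, key->size))
 *		return -EINVAL;
 *
 * where "mode" and "key" are hypothetical locals holding an
 * EXT4_ENCRYPTION_MODE_* value and an ext4_encryption_key.
 */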
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
new file mode 100644
index 000000000000..ca2f5948c1ac
--- /dev/null
+++ b/fs/ext4/crypto_fname.c
@@ -0,0 +1,709 @@
1/*
2 * linux/fs/ext4/crypto_fname.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains functions for filename crypto management in ext4
7 *
8 * Written by Uday Savagaonkar, 2014.
9 *
10 * This has not yet undergone a rigorous security audit.
11 *
12 */
13
14#include <crypto/hash.h>
15#include <crypto/sha.h>
16#include <keys/encrypted-type.h>
17#include <keys/user-type.h>
18#include <linux/crypto.h>
19#include <linux/gfp.h>
20#include <linux/kernel.h>
21#include <linux/key.h>
23#include <linux/list.h>
24#include <linux/mempool.h>
25#include <linux/random.h>
26#include <linux/scatterlist.h>
27#include <linux/spinlock_types.h>
28
29#include "ext4.h"
30#include "ext4_crypto.h"
31#include "xattr.h"
32
33/**
34 * ext4_dir_crypt_complete() - The completion callback for filename crypto requests
35 */
36static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res)
37{
38 struct ext4_completion_result *ecr = req->data;
39
40 if (res == -EINPROGRESS)
41 return;
42 ecr->res = res;
43 complete(&ecr->completion);
44}
45
46bool ext4_valid_filenames_enc_mode(uint32_t mode)
47{
48 return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS);
49}
50
51/**
52 * ext4_fname_encrypt() - Encrypt a filename
53 *
54 * This function encrypts the input filename and returns the length of the
55 * ciphertext. Errors are returned as negative numbers. We trust the caller
56 * to have allocated sufficient memory for the oname string.
57 */
58static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx,
59 const struct qstr *iname,
60 struct ext4_str *oname)
61{
62 u32 ciphertext_len;
63 struct ablkcipher_request *req = NULL;
64 DECLARE_EXT4_COMPLETION_RESULT(ecr);
65 struct crypto_ablkcipher *tfm = ctx->ctfm;
66 int res = 0;
67 char iv[EXT4_CRYPTO_BLOCK_SIZE];
68 struct scatterlist sg[1];
69 char *workbuf;
70
71 if (iname->len <= 0 || iname->len > ctx->lim)
72 return -EIO;
73
74 ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ?
75 EXT4_CRYPTO_BLOCK_SIZE : iname->len;
76 ciphertext_len = (ciphertext_len > ctx->lim)
77 ? ctx->lim : ciphertext_len;
78
79 /* Allocate request */
80 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
81 if (!req) {
82 printk_ratelimited(
83 KERN_ERR "%s: ablkcipher_request_alloc() failed\n", __func__);
84 return -ENOMEM;
85 }
86 ablkcipher_request_set_callback(req,
87 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
88 ext4_dir_crypt_complete, &ecr);
89
90 /* Map the workpage */
91 workbuf = kmap(ctx->workpage);
92
93 /* Copy the input */
94 memcpy(workbuf, iname->name, iname->len);
95 if (iname->len < ciphertext_len)
96 memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
97
98 /* Initialize IV */
99 memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
100
101 /* Create encryption request */
102 sg_init_table(sg, 1);
103 sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
104 ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv);
105 res = crypto_ablkcipher_encrypt(req);
106 if (res == -EINPROGRESS || res == -EBUSY) {
107 BUG_ON(req->base.data != &ecr);
108 wait_for_completion(&ecr.completion);
109 res = ecr.res;
110 }
111 if (res >= 0) {
112 /* Copy the result to output */
113 memcpy(oname->name, workbuf, ciphertext_len);
114 res = ciphertext_len;
115 }
116 kunmap(ctx->workpage);
117 ablkcipher_request_free(req);
118 if (res < 0) {
119 printk_ratelimited(
120 KERN_ERR "%s: Error (error code %d)\n", __func__, res);
121 }
122 oname->len = ciphertext_len;
123 return res;
124}
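/*
 * Length example (illustrative): cts(cbc(aes)) is length-preserving for
 * inputs of at least one AES block, so a 5-byte name is zero-padded to
 * EXT4_CRYPTO_BLOCK_SIZE and yields a 16-byte ciphertext, while a
 * 20-byte name yields a 20-byte ciphertext (always capped at ctx->lim).
 */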
125
126/*
127 * ext4_fname_decrypt() - Decrypt a filename
128 * This function decrypts the input filename and returns
129 * the length of the plaintext.
130 * Errors are returned as negative numbers.
131 * We trust the caller to have allocated sufficient memory for the oname string.
132 */
133static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx,
134 const struct ext4_str *iname,
135 struct ext4_str *oname)
136{
138 struct ablkcipher_request *req = NULL;
139 DECLARE_EXT4_COMPLETION_RESULT(ecr);
140 struct scatterlist sg[1];
141 struct crypto_ablkcipher *tfm = ctx->ctfm;
142 int res = 0;
143 char iv[EXT4_CRYPTO_BLOCK_SIZE];
144 char *workbuf;
145
146 if (iname->len <= 0 || iname->len > ctx->lim)
147 return -EIO;
148
152
153 /* Allocate request */
154 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
155 if (!req) {
156 printk_ratelimited(
157 KERN_ERR "%s: ablkcipher_request_alloc() failed\n", __func__);
158 return -ENOMEM;
159 }
160 ablkcipher_request_set_callback(req,
161 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
162 ext4_dir_crypt_complete, &ecr);
163
164 /* Map the workpage */
165 workbuf = kmap(ctx->workpage);
166
167 /* Copy the input */
168 memcpy(workbuf, iname->name, iname->len);
169
170 /* Initialize IV */
171 memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE);
172
173 /* Create decryption request */
174 sg_init_table(sg, 1);
175 sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0);
176 ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv);
177 res = crypto_ablkcipher_decrypt(req);
178 if (res == -EINPROGRESS || res == -EBUSY) {
179 BUG_ON(req->base.data != &ecr);
180 wait_for_completion(&ecr.completion);
181 res = ecr.res;
182 }
183 if (res >= 0) {
184 /* Copy the result to output */
185 memcpy(oname->name, workbuf, iname->len);
186 res = iname->len;
187 }
188 kunmap(ctx->workpage);
189 ablkcipher_request_free(req);
190 if (res < 0) {
191 printk_ratelimited(
192 KERN_ERR "%s: Error in ext4_fname_decrypt (error code %d)\n",
193 __func__, res);
194 return res;
195 }
196
197 oname->len = strnlen(oname->name, iname->len);
198 return oname->len;
199}
200
201/**
202 * ext4_fname_encode_digest() - Encode a digest as a printable string
203 *
204 * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
205 * The encoded string is roughly 4/3 times the size of the input string.
206 */
207int ext4_fname_encode_digest(char *dst, char *src, u32 len)
208{
209 static const char *lookup_table =
210 "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_+";
211 u32 current_chunk, num_chunks, i;
212 char tmp_buf[3];
213 u32 c0, c1, c2, c3;
214
215 current_chunk = 0;
216 num_chunks = len/3;
217 for (i = 0; i < num_chunks; i++) {
218 c0 = src[3*i] & 0x3f;
219 c1 = (((src[3*i]>>6)&0x3) | ((src[3*i+1] & 0xf)<<2)) & 0x3f;
220 c2 = (((src[3*i+1]>>4)&0xf) | ((src[3*i+2] & 0x3)<<4)) & 0x3f;
221 c3 = (src[3*i+2]>>2) & 0x3f;
222 dst[4*i] = lookup_table[c0];
223 dst[4*i+1] = lookup_table[c1];
224 dst[4*i+2] = lookup_table[c2];
225 dst[4*i+3] = lookup_table[c3];
226 }
227 if (i*3 < len) {
228 memset(tmp_buf, 0, 3);
229 memcpy(tmp_buf, &src[3*i], len-3*i);
230 c0 = tmp_buf[0] & 0x3f;
231 c1 = (((tmp_buf[0]>>6)&0x3) | ((tmp_buf[1] & 0xf)<<2)) & 0x3f;
232 c2 = (((tmp_buf[1]>>4)&0xf) | ((tmp_buf[2] & 0x3)<<4)) & 0x3f;
233 c3 = (tmp_buf[2]>>2) & 0x3f;
234 dst[4*i] = lookup_table[c0];
235 dst[4*i+1] = lookup_table[c1];
236 dst[4*i+2] = lookup_table[c2];
237 dst[4*i+3] = lookup_table[c3];
238 i++;
239 }
240 return (i * 4);
241}
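/*
 * Worked example (illustrative): a 16-byte input splits into five full
 * 3-byte chunks plus one leftover byte; each chunk emits 4 characters,
 * so the encoded length is 5*4 + 4 = 24 characters.
 */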
242
243/**
244 * ext4_fname_hash() - Compute and encode the hash of a filename
245 *
246 * This function computes the hash of the input filename, and sets the output
247 * buffer to the *encoded* digest. It returns the length of the digest as its
248 * return value. Errors are returned as negative numbers. We trust the caller
249 * to allocate sufficient memory to oname string.
250 */
251static int ext4_fname_hash(struct ext4_fname_crypto_ctx *ctx,
252 const struct ext4_str *iname,
253 struct ext4_str *oname)
254{
255 struct scatterlist sg;
256 struct hash_desc desc = {
257 .tfm = (struct crypto_hash *)ctx->htfm,
258 .flags = CRYPTO_TFM_REQ_MAY_SLEEP
259 };
260 int res = 0;
261
262 if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) {
263 res = ext4_fname_encode_digest(oname->name, iname->name,
264 iname->len);
265 oname->len = res;
266 return res;
267 }
268
269 sg_init_one(&sg, iname->name, iname->len);
270 res = crypto_hash_init(&desc);
271 if (res) {
272 printk(KERN_ERR
273 "%s: Error initializing crypto hash; res = [%d]\n",
274 __func__, res);
275 goto out;
276 }
277 res = crypto_hash_update(&desc, &sg, iname->len);
278 if (res) {
279 printk(KERN_ERR
280 "%s: Error updating crypto hash; res = [%d]\n",
281 __func__, res);
282 goto out;
283 }
284 res = crypto_hash_final(&desc,
285 &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE]);
286 if (res) {
287 printk(KERN_ERR
288 "%s: Error finalizing crypto hash; res = [%d]\n",
289 __func__, res);
290 goto out;
291 }
292 /* Encode the digest as a printable string--this will increase the
293 * size of the digest */
294 oname->name[0] = 'I';
295 res = ext4_fname_encode_digest(oname->name+1,
296 &oname->name[EXT4_FNAME_CRYPTO_DIGEST_SIZE],
297 EXT4_FNAME_CRYPTO_DIGEST_SIZE) + 1;
298 oname->len = res;
299out:
300 return res;
301}
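/*
 * Worked example (illustrative, assuming EXT4_FNAME_CRYPTO_DIGEST_SIZE
 * is 32 for SHA-256): a long name hashes to a 32-byte digest, which
 * ext4_fname_encode_digest() expands to 44 characters; with the leading
 * 'I' marker the presented name is 45 characters long.
 */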
302
303/**
304 * ext4_free_fname_crypto_ctx() - Free a filename crypto context
305 *
306 * Frees up a crypto context.
307 */
308void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx)
309{
310 if (ctx == NULL || IS_ERR(ctx))
311 return;
312
313 if (ctx->ctfm && !IS_ERR(ctx->ctfm))
314 crypto_free_ablkcipher(ctx->ctfm);
315 if (ctx->htfm && !IS_ERR(ctx->htfm))
316 crypto_free_hash(ctx->htfm);
317 if (ctx->workpage && !IS_ERR(ctx->workpage))
318 __free_page(ctx->workpage);
319 kfree(ctx);
320}
321
322/**
323 * ext4_put_fname_crypto_ctx() - Release a filename crypto context
324 *
325 * Returns the crypto context to a free list. If the free list is above a
326 * threshold, completely frees up the context and returns the memory.
327 *
328 * TODO: Currently we directly free the crypto context. Eventually we should
329 * add code to return it to the free list. Such an approach will increase
330 * the efficiency of directory lookups.
331 */
332void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx)
333{
334 if (*ctx == NULL || IS_ERR(*ctx))
335 return;
336 ext4_free_fname_crypto_ctx(*ctx);
337 *ctx = NULL;
338}
339
340/**
341 * ext4_search_fname_crypto_ctx() - Look up a context for @key (stub; returns NULL)
342 */
343static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx(
344 const struct ext4_encryption_key *key)
345{
346 return NULL;
347}
348
349/**
350 * ext4_alloc_fname_crypto_ctx() - Allocate a filename crypto context
351 */
352struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx(
353 const struct ext4_encryption_key *key)
354{
355 struct ext4_fname_crypto_ctx *ctx;
356
357 ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS);
358 if (ctx == NULL)
359 return ERR_PTR(-ENOMEM);
360 if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) {
361 /* This will automatically set the key mode to invalid,
362 * since the enum value of EXT4_ENCRYPTION_MODE_INVALID is zero */
363 memset(&ctx->key, 0, sizeof(ctx->key));
364 } else {
365 memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key));
366 }
367 ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode)
368 ? 0 : 1;
369 ctx->ctfm_key_is_ready = 0;
370 ctx->ctfm = NULL;
371 ctx->htfm = NULL;
372 ctx->workpage = NULL;
373 return ctx;
374}
375
376/**
377 * ext4_get_fname_crypto_ctx() - Get a filename crypto context for an inode
378 *
379 * Allocates a free crypto context and initializes it to hold
380 * the crypto material for the inode.
381 *
382 * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise.
383 */
384struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(
385 struct inode *inode, u32 max_ciphertext_len)
386{
387 struct ext4_fname_crypto_ctx *ctx;
388 struct ext4_inode_info *ei = EXT4_I(inode);
389 int res;
390
391 /* Check if the crypto policy is set on the inode */
392 res = ext4_encrypted_inode(inode);
393 if (res == 0)
394 return NULL;
395
396 if (!ext4_has_encryption_key(inode))
397 ext4_generate_encryption_key(inode);
398
399 /* Get a crypto context based on the key.
400 * A new context is allocated if no context matches the requested key.
401 */
402 ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key));
403 if (ctx == NULL)
404 ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key));
405 if (IS_ERR(ctx))
406 return ctx;
407
408 if (ctx->has_valid_key) {
409 if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) {
410 printk_once(KERN_WARNING
411 "ext4: unsupported key mode %d\n",
412 ctx->key.mode);
413 return ERR_PTR(-ENOKEY);
414 }
415
416 /* As a first cut, we allocate a new tfm on every call.
417 * Later, we will keep the tfm around in case the key
418 * gets re-used */
419 if (ctx->ctfm == NULL) {
420 ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))",
421 0, 0);
422 }
423 if (IS_ERR(ctx->ctfm)) {
424 res = PTR_ERR(ctx->ctfm);
425 printk(
426 KERN_DEBUG "%s: error (%d) allocating crypto tfm\n",
427 __func__, res);
428 ctx->ctfm = NULL;
429 ext4_put_fname_crypto_ctx(&ctx);
430 return ERR_PTR(res);
431 }
432 if (ctx->ctfm == NULL) {
433 printk(
434 KERN_DEBUG "%s: could not allocate crypto tfm\n",
435 __func__);
436 ext4_put_fname_crypto_ctx(&ctx);
437 return ERR_PTR(-ENOMEM);
438 }
439 if (ctx->workpage == NULL)
440 ctx->workpage = alloc_page(GFP_NOFS);
441 if (IS_ERR(ctx->workpage)) {
442 res = PTR_ERR(ctx->workpage);
443 printk(
444 KERN_DEBUG "%s: error (%d) allocating work page\n",
445 __func__, res);
446 ctx->workpage = NULL;
447 ext4_put_fname_crypto_ctx(&ctx);
448 return ERR_PTR(res);
449 }
450 if (ctx->workpage == NULL) {
451 printk(
452 KERN_DEBUG "%s: could not allocate work page\n",
453 __func__);
454 ext4_put_fname_crypto_ctx(&ctx);
455 return ERR_PTR(-ENOMEM);
456 }
457 ctx->lim = max_ciphertext_len;
458 crypto_ablkcipher_clear_flags(ctx->ctfm, ~0);
459 crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm),
460 CRYPTO_TFM_REQ_WEAK_KEY);
461
462 /* If we are lucky, we will get a context that is already
463 * set up with the right key. Else, we will have to
464 * set the key */
465 if (!ctx->ctfm_key_is_ready) {
466 /* Since our crypto objectives for filename
467 * encryption are pretty weak, we directly use
468 * the inode master key */
469 res = crypto_ablkcipher_setkey(ctx->ctfm,
470 ctx->key.raw, ctx->key.size);
471 if (res) {
472 ext4_put_fname_crypto_ctx(&ctx);
473 return ERR_PTR(-EIO);
474 }
475 ctx->ctfm_key_is_ready = 1;
476 } else {
477 /* In the current implementation, key should never be
478 * marked "ready" for a context that has just been
479 * allocated. So we should never reach here */
480 BUG();
481 }
482 }
483 if (ctx->htfm == NULL)
484 ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC);
485 if (IS_ERR(ctx->htfm)) {
486 res = PTR_ERR(ctx->htfm);
487 printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n",
488 __func__, res);
489 ctx->htfm = NULL;
490 ext4_put_fname_crypto_ctx(&ctx);
491 return ERR_PTR(res);
492 }
493 if (ctx->htfm == NULL) {
494 printk(KERN_DEBUG "%s: could not allocate hash tfm\n",
495 __func__);
496 ext4_put_fname_crypto_ctx(&ctx);
497 return ERR_PTR(-ENOMEM);
498 }
499
500 return ctx;
501}
502
503/**
504 * ext4_fname_crypto_round_up() - Round a size up to the crypto block size
505 *
506 * Return: @size rounded up to the next multiple of @blksize
507 */
508u32 ext4_fname_crypto_round_up(u32 size, u32 blksize)
509{
510 return ((size+blksize-1)/blksize)*blksize;
511}
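/*
 * Examples (illustrative): ext4_fname_crypto_round_up(1, 16) == 16 and
 * ext4_fname_crypto_round_up(17, 16) == 32; a size that is already a
 * multiple of blksize is returned unchanged.
 */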
512
513/**
514 * ext4_fname_crypto_namelen_on_disk() - Compute the on-disk (ciphertext) name length
515 */
516int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
517 u32 namelen)
518{
519 u32 ciphertext_len;
520
521 if (ctx == NULL)
522 return -EIO;
523 if (!(ctx->has_valid_key))
524 return -EACCES;
525 ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ?
526 EXT4_CRYPTO_BLOCK_SIZE : namelen;
527 ciphertext_len = (ciphertext_len > ctx->lim)
528 ? ctx->lim : ciphertext_len;
529 return (int) ciphertext_len;
530}
531
532/**
533 * ext4_fname_crypto_alloc_buffer() - Allocate an output buffer
534 *
535 * Allocates an output buffer that is sufficient for the crypto operation
536 * specified by the context and the direction.
537 */
538int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
539 u32 ilen, struct ext4_str *crypto_str)
540{
541 unsigned int olen;
542
543 if (!ctx)
544 return -EIO;
545 olen = ext4_fname_crypto_round_up(ilen, EXT4_CRYPTO_BLOCK_SIZE);
546 crypto_str->len = olen;
547 if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2)
548 olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2;
549 /* Allocated buffer can hold one more character to null-terminate the
550 * string */
551 crypto_str->name = kmalloc(olen+1, GFP_NOFS);
552 if (!(crypto_str->name))
553 return -ENOMEM;
554 return 0;
555}
556
557/**
558 * ext4_fname_crypto_free_buffer() - Free a crypto output buffer
559 *
560 * Frees the buffer allocated for crypto operation.
561 */
562void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str)
563{
564 if (!crypto_str)
565 return;
566 kfree(crypto_str->name);
567 crypto_str->name = NULL;
568}
569
570/**
571 * _ext4_fname_disk_to_usr() - converts a filename from disk space to user space
572 */
573int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
574 const struct ext4_str *iname,
575 struct ext4_str *oname)
576{
577 if (ctx == NULL)
578 return -EIO;
579 if (iname->len < 3) {
580 /* Check for . and .. */
581 if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') {
582 oname->name[0] = '.';
583 oname->name[iname->len-1] = '.';
584 oname->len = iname->len;
585 return oname->len;
586 }
587 }
588 if (ctx->has_valid_key)
589 return ext4_fname_decrypt(ctx, iname, oname);
590 else
591 return ext4_fname_hash(ctx, iname, oname);
592}
593
594int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
595 const struct ext4_dir_entry_2 *de,
596 struct ext4_str *oname)
597{
598 struct ext4_str iname = {.name = (unsigned char *) de->name,
599 .len = de->name_len };
600
601 return _ext4_fname_disk_to_usr(ctx, &iname, oname);
602}
603
604
605/**
606 * ext4_fname_usr_to_disk() - converts a filename from user space to disk space
607 */
608int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
609 const struct qstr *iname,
610 struct ext4_str *oname)
611{
612 int res;
613
614 if (ctx == NULL)
615 return -EIO;
616 if (iname->len < 3) {
617 /* Check for . and .. */
618 if (iname->name[0] == '.' &&
619 iname->name[iname->len-1] == '.') {
620 oname->name[0] = '.';
621 oname->name[iname->len-1] = '.';
622 oname->len = iname->len;
623 return oname->len;
624 }
625 }
626 if (ctx->has_valid_key) {
627 res = ext4_fname_encrypt(ctx, iname, oname);
628 return res;
629 }
630 /* Without a proper key, a user is not allowed to modify the filenames
631 * in a directory. Consequently, a user space name cannot be mapped to
632 * a disk-space name */
633 return -EACCES;
634}
635
636/*
637 * Calculate the htree hash from a filename from user space
638 */
639int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
640 const struct qstr *iname,
641 struct dx_hash_info *hinfo)
642{
643 struct ext4_str tmp, tmp2;
644 int ret = 0;
645
646 if (!ctx || !ctx->has_valid_key ||
647 ((iname->name[0] == '.') &&
648 ((iname->len == 1) ||
649 ((iname->name[1] == '.') && (iname->len == 2))))) {
650 ext4fs_dirhash(iname->name, iname->len, hinfo);
651 return 0;
652 }
653
654 /* First encrypt the plaintext name */
655 ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp);
656 if (ret < 0)
657 return ret;
658
659 ret = ext4_fname_encrypt(ctx, iname, &tmp);
660 if (ret < 0)
661 goto out;
662
663 tmp2.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1;
664 tmp2.name = kmalloc(tmp2.len + 1, GFP_KERNEL);
665 if (tmp2.name == NULL) {
666 ret = -ENOMEM;
667 goto out;
668 }
669
670 ret = ext4_fname_hash(ctx, &tmp, &tmp2);
671 if (ret > 0)
672 ext4fs_dirhash(tmp2.name, tmp2.len, hinfo);
673 ext4_fname_crypto_free_buffer(&tmp2);
674out:
675 ext4_fname_crypto_free_buffer(&tmp);
676 return ret;
677}
678
679/**
680 * ext4_fname_disk_to_hash() - Calculate the htree hash from a filename on disk
681 */
682int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx,
683 const struct ext4_dir_entry_2 *de,
684 struct dx_hash_info *hinfo)
685{
686 struct ext4_str iname = {.name = (unsigned char *) de->name,
687 .len = de->name_len};
688 struct ext4_str tmp;
689 int ret;
690
691 if (!ctx ||
692 ((iname.name[0] == '.') &&
693 ((iname.len == 1) ||
694 ((iname.name[1] == '.') && (iname.len == 2))))) {
695 ext4fs_dirhash(iname.name, iname.len, hinfo);
696 return 0;
697 }
698
699 tmp.len = (4 * ((EXT4_FNAME_CRYPTO_DIGEST_SIZE + 2) / 3)) + 1;
700 tmp.name = kmalloc(tmp.len + 1, GFP_KERNEL);
701 if (tmp.name == NULL)
702 return -ENOMEM;
703
704 ret = ext4_fname_hash(ctx, &iname, &tmp);
705 if (ret > 0)
706 ext4fs_dirhash(tmp.name, tmp.len, hinfo);
707 ext4_fname_crypto_free_buffer(&tmp);
708 return ret;
709}
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
new file mode 100644
index 000000000000..c8392af8abbb
--- /dev/null
+++ b/fs/ext4/crypto_key.c
@@ -0,0 +1,165 @@
1/*
2 * linux/fs/ext4/crypto_key.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption key functions for ext4
7 *
8 * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
9 */
10
11#include <keys/encrypted-type.h>
12#include <keys/user-type.h>
13#include <linux/random.h>
14#include <linux/scatterlist.h>
15#include <uapi/linux/keyctl.h>
16
17#include "ext4.h"
18#include "xattr.h"
19
20static void derive_crypt_complete(struct crypto_async_request *req, int rc)
21{
22 struct ext4_completion_result *ecr = req->data;
23
24 if (rc == -EINPROGRESS)
25 return;
26
27 ecr->res = rc;
28 complete(&ecr->completion);
29}
30
31/**
32 * ext4_derive_key_aes() - Derive a key using AES-128-ECB
33 * @deriving_key: Encryption key used for derivation.
34 * @source_key: Source key to which to apply derivation.
35 * @derived_key: Derived key.
36 *
37 * Return: Zero on success; non-zero otherwise.
38 */
39static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
40 char source_key[EXT4_AES_256_XTS_KEY_SIZE],
41 char derived_key[EXT4_AES_256_XTS_KEY_SIZE])
42{
43 int res = 0;
44 struct ablkcipher_request *req = NULL;
45 DECLARE_EXT4_COMPLETION_RESULT(ecr);
46 struct scatterlist src_sg, dst_sg;
47 struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
48 0);
49
50 if (IS_ERR(tfm)) {
51 res = PTR_ERR(tfm);
52 tfm = NULL;
53 goto out;
54 }
55 crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
56 req = ablkcipher_request_alloc(tfm, GFP_NOFS);
57 if (!req) {
58 res = -ENOMEM;
59 goto out;
60 }
61 ablkcipher_request_set_callback(req,
62 CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
63 derive_crypt_complete, &ecr);
64 res = crypto_ablkcipher_setkey(tfm, deriving_key,
65 EXT4_AES_128_ECB_KEY_SIZE);
66 if (res < 0)
67 goto out;
68 sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE);
69 sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE);
70 ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
71 EXT4_AES_256_XTS_KEY_SIZE, NULL);
72 res = crypto_ablkcipher_encrypt(req);
73 if (res == -EINPROGRESS || res == -EBUSY) {
74 BUG_ON(req->base.data != &ecr);
75 wait_for_completion(&ecr.completion);
76 res = ecr.res;
77 }
78
79out:
80 if (req)
81 ablkcipher_request_free(req);
82 if (tfm)
83 crypto_free_ablkcipher(tfm);
84 return res;
85}
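/*
 * Sketch of the scheme implemented above: the derived per-inode key is
 * AES-128-ECB(key = deriving_key, data = source_key), i.e. the 16-byte
 * nonce from the inode's xattr keys a single encryption pass over the
 * 64-byte master key.
 */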
86
87/**
88 * ext4_generate_encryption_key() - generates an encryption key
89 * @inode: The inode to generate the encryption key for.
90 */
91int ext4_generate_encryption_key(struct inode *inode)
92{
93 struct ext4_inode_info *ei = EXT4_I(inode);
94 struct ext4_encryption_key *crypt_key = &ei->i_encryption_key;
95 char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
96 (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1];
97 struct key *keyring_key = NULL;
98 struct ext4_encryption_key *master_key;
99 struct ext4_encryption_context ctx;
100 struct user_key_payload *ukp;
101 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
102 int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
103 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
104 &ctx, sizeof(ctx));
105
106 if (res != sizeof(ctx)) {
107 if (res > 0)
108 res = -EINVAL;
109 goto out;
110 }
111 res = 0;
112
113 if (S_ISREG(inode->i_mode))
114 crypt_key->mode = ctx.contents_encryption_mode;
115 else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
116 crypt_key->mode = ctx.filenames_encryption_mode;
117 else {
118 printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n");
119 BUG();
120 }
121 crypt_key->size = ext4_encryption_key_size(crypt_key->mode);
122 BUG_ON(!crypt_key->size);
123 if (DUMMY_ENCRYPTION_ENABLED(sbi)) {
124 memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE);
125 goto out;
126 }
127 memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX,
128 EXT4_KEY_DESC_PREFIX_SIZE);
129 sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE,
130 "%*phN", EXT4_KEY_DESCRIPTOR_SIZE,
131 ctx.master_key_descriptor);
132 full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE +
133 (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0';
134 keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
135 if (IS_ERR(keyring_key)) {
136 res = PTR_ERR(keyring_key);
137 keyring_key = NULL;
138 goto out;
139 }
140 BUG_ON(keyring_key->type != &key_type_logon);
141 ukp = ((struct user_key_payload *)keyring_key->payload.data);
142 if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
143 res = -EINVAL;
144 goto out;
145 }
146 master_key = (struct ext4_encryption_key *)ukp->data;
147 BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
148 EXT4_KEY_DERIVATION_NONCE_SIZE);
149 BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
150 res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw);
151out:
152 if (keyring_key)
153 key_put(keyring_key);
154 if (res < 0)
155 crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID;
156 return res;
157}
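/*
 * Example (illustrative): for a master key descriptor of eight 0x42
 * bytes, the logon key requested above would be named
 * "ext4:4242424242424242", assuming EXT4_KEY_DESC_PREFIX is "ext4:" and
 * EXT4_KEY_DESCRIPTOR_SIZE is 8.
 */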
158
159int ext4_has_encryption_key(struct inode *inode)
160{
161 struct ext4_inode_info *ei = EXT4_I(inode);
162 struct ext4_encryption_key *crypt_key = &ei->i_encryption_key;
163
164 return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID);
165}
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
new file mode 100644
index 000000000000..30eaf9e9864a
--- /dev/null
+++ b/fs/ext4/crypto_policy.c
@@ -0,0 +1,194 @@
1/*
2 * linux/fs/ext4/crypto_policy.c
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption policy functions for ext4
7 *
8 * Written by Michael Halcrow, 2015.
9 */
10
11#include <linux/random.h>
12#include <linux/string.h>
13#include <linux/types.h>
14
15#include "ext4.h"
16#include "xattr.h"
17
18static int ext4_inode_has_encryption_context(struct inode *inode)
19{
20 int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
21 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0);
22 return (res > 0);
23}
24
25/*
26 * check whether the policy is consistent with the encryption context
27 * for the inode
28 */
29static int ext4_is_encryption_context_consistent_with_policy(
30 struct inode *inode, const struct ext4_encryption_policy *policy)
31{
32 struct ext4_encryption_context ctx;
33 int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
34 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
35 sizeof(ctx));
36 if (res != sizeof(ctx))
37 return 0;
38 return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
39 EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
40 (ctx.contents_encryption_mode ==
41 policy->contents_encryption_mode) &&
42 (ctx.filenames_encryption_mode ==
43 policy->filenames_encryption_mode));
44}
45
46static int ext4_create_encryption_context_from_policy(
47 struct inode *inode, const struct ext4_encryption_policy *policy)
48{
49 struct ext4_encryption_context ctx;
50 int res = 0;
51
52 ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
53 memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
54 EXT4_KEY_DESCRIPTOR_SIZE);
55 if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) {
56 printk(KERN_WARNING
57 "%s: Invalid contents encryption mode %d\n", __func__,
58 policy->contents_encryption_mode);
59 res = -EINVAL;
60 goto out;
61 }
62 if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
63 printk(KERN_WARNING
64 "%s: Invalid filenames encryption mode %d\n", __func__,
65 policy->filenames_encryption_mode);
66 res = -EINVAL;
67 goto out;
68 }
69 ctx.contents_encryption_mode = policy->contents_encryption_mode;
70 ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
71 BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE);
72 get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
73
74 res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION,
75 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
76 sizeof(ctx), 0);
77out:
78 if (!res)
79 ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
80 return res;
81}
82
83int ext4_process_policy(const struct ext4_encryption_policy *policy,
84 struct inode *inode)
85{
86 if (policy->version != 0)
87 return -EINVAL;
88
89 if (!ext4_inode_has_encryption_context(inode)) {
90 if (!ext4_empty_dir(inode))
91 return -ENOTEMPTY;
92 return ext4_create_encryption_context_from_policy(inode,
93 policy);
94 }
95
96 if (ext4_is_encryption_context_consistent_with_policy(inode, policy))
97 return 0;
98
99 printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
100 __func__);
101 return -EINVAL;
102}
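/*
 * Illustrative caller sketch (hypothetical glue; the real handler lives
 * in ioctl.c): the EXT4_IOC_SET_ENCRYPTION_POLICY path is expected to
 * copy the policy in from userspace and hand it to ext4_process_policy():
 *
 *	struct ext4_encryption_policy policy;
 *
 *	if (copy_from_user(&policy, (void __user *)arg, sizeof(policy)))
 *		return -EFAULT;
 *	return ext4_process_policy(&policy, file_inode(filp));
 */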
103
104int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy)
105{
106 struct ext4_encryption_context ctx;
107
108 int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
109 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
110 &ctx, sizeof(ctx));
111 if (res != sizeof(ctx))
112 return -ENOENT;
113 if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1)
114 return -EINVAL;
115 policy->version = 0;
116 policy->contents_encryption_mode = ctx.contents_encryption_mode;
117 policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
118 memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
119 EXT4_KEY_DESCRIPTOR_SIZE);
120 return 0;
121}
122
123int ext4_is_child_context_consistent_with_parent(struct inode *parent,
124 struct inode *child)
125{
126 struct ext4_encryption_context parent_ctx, child_ctx;
127 int res;
128
129 if ((parent == NULL) || (child == NULL)) {
130 pr_err("parent %p child %p\n", parent, child);
131 BUG_ON(1);
132 }
133 /* no restrictions if the parent directory is not encrypted */
134 if (!ext4_encrypted_inode(parent))
135 return 1;
136 res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
137 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
138 &parent_ctx, sizeof(parent_ctx));
139 if (res != sizeof(parent_ctx))
140 return 0;
141 /* if the child directory is not encrypted, this is always a problem */
142 if (!ext4_encrypted_inode(child))
143 return 0;
144 res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION,
145 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
146 &child_ctx, sizeof(child_ctx));
147 if (res != sizeof(child_ctx))
148 return 0;
149 return (memcmp(parent_ctx.master_key_descriptor,
150 child_ctx.master_key_descriptor,
151 EXT4_KEY_DESCRIPTOR_SIZE) == 0 &&
152 (parent_ctx.contents_encryption_mode ==
153 child_ctx.contents_encryption_mode) &&
154 (parent_ctx.filenames_encryption_mode ==
155 child_ctx.filenames_encryption_mode));
156}
157
158/**
159 * ext4_inherit_context() - Sets a child context from its parent
160 * @parent: Parent inode from which the context is inherited.
161 * @child: Child inode that inherits the context from @parent.
162 *
163 * Return: Zero on success, non-zero otherwise
164 */
165int ext4_inherit_context(struct inode *parent, struct inode *child)
166{
167 struct ext4_encryption_context ctx;
168 int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION,
169 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
170 &ctx, sizeof(ctx));
171
172 if (res != sizeof(ctx)) {
173 if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) {
174 ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1;
175 ctx.contents_encryption_mode =
176 EXT4_ENCRYPTION_MODE_AES_256_XTS;
177 ctx.filenames_encryption_mode =
178 EXT4_ENCRYPTION_MODE_AES_256_CTS;
179 memset(ctx.master_key_descriptor, 0x42,
180 EXT4_KEY_DESCRIPTOR_SIZE);
181 res = 0;
182 } else {
183 goto out;
184 }
185 }
186 get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE);
187 res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION,
188 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
189 sizeof(ctx), 0);
190out:
191 if (!res)
192 ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT);
193 return res;
194}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index c24143ea9c08..61db51a5ce4c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -22,10 +22,8 @@
22 */ 22 */
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/jbd2.h>
26#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
27#include <linux/slab.h> 26#include <linux/slab.h>
28#include <linux/rbtree.h>
29#include "ext4.h" 27#include "ext4.h"
30#include "xattr.h" 28#include "xattr.h"
31 29
@@ -110,7 +108,10 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
110 int err; 108 int err;
111 struct inode *inode = file_inode(file); 109 struct inode *inode = file_inode(file);
112 struct super_block *sb = inode->i_sb; 110 struct super_block *sb = inode->i_sb;
111 struct buffer_head *bh = NULL;
113 int dir_has_error = 0; 112 int dir_has_error = 0;
113 struct ext4_fname_crypto_ctx *enc_ctx = NULL;
114 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
114 115
115 if (is_dx_dir(inode)) { 116 if (is_dx_dir(inode)) {
116 err = ext4_dx_readdir(file, ctx); 117 err = ext4_dx_readdir(file, ctx);
@@ -127,17 +128,28 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
127 128
128 if (ext4_has_inline_data(inode)) { 129 if (ext4_has_inline_data(inode)) {
129 int has_inline_data = 1; 130 int has_inline_data = 1;
130 int ret = ext4_read_inline_dir(file, ctx, 131 err = ext4_read_inline_dir(file, ctx,
131 &has_inline_data); 132 &has_inline_data);
132 if (has_inline_data) 133 if (has_inline_data)
133 return ret; 134 return err;
135 }
136
137 enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN);
138 if (IS_ERR(enc_ctx))
139 return PTR_ERR(enc_ctx);
140 if (enc_ctx) {
141 err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN,
142 &fname_crypto_str);
143 if (err < 0) {
144 ext4_put_fname_crypto_ctx(&enc_ctx);
145 return err;
146 }
134 } 147 }
135 148
136 offset = ctx->pos & (sb->s_blocksize - 1); 149 offset = ctx->pos & (sb->s_blocksize - 1);
137 150
138 while (ctx->pos < inode->i_size) { 151 while (ctx->pos < inode->i_size) {
139 struct ext4_map_blocks map; 152 struct ext4_map_blocks map;
140 struct buffer_head *bh = NULL;
141 153
142 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); 154 map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
143 map.m_len = 1; 155 map.m_len = 1;
@@ -180,6 +192,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
180 (unsigned long long)ctx->pos); 192 (unsigned long long)ctx->pos);
181 ctx->pos += sb->s_blocksize - offset; 193 ctx->pos += sb->s_blocksize - offset;
182 brelse(bh); 194 brelse(bh);
195 bh = NULL;
183 continue; 196 continue;
184 } 197 }
185 set_buffer_verified(bh); 198 set_buffer_verified(bh);
@@ -226,25 +239,44 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
226 offset += ext4_rec_len_from_disk(de->rec_len, 239 offset += ext4_rec_len_from_disk(de->rec_len,
227 sb->s_blocksize); 240 sb->s_blocksize);
228 if (le32_to_cpu(de->inode)) { 241 if (le32_to_cpu(de->inode)) {
229 if (!dir_emit(ctx, de->name, 242 if (enc_ctx == NULL) {
230 de->name_len, 243 /* Directory is not encrypted */
231 le32_to_cpu(de->inode), 244 if (!dir_emit(ctx, de->name,
232 get_dtype(sb, de->file_type))) { 245 de->name_len,
233 brelse(bh); 246 le32_to_cpu(de->inode),
234 return 0; 247 get_dtype(sb, de->file_type)))
248 goto done;
249 } else {
250 /* Directory is encrypted */
251 err = ext4_fname_disk_to_usr(enc_ctx,
252 de, &fname_crypto_str);
253 if (err < 0)
254 goto errout;
255 if (!dir_emit(ctx,
256 fname_crypto_str.name, err,
257 le32_to_cpu(de->inode),
258 get_dtype(sb, de->file_type)))
259 goto done;
235 } 260 }
236 } 261 }
237 ctx->pos += ext4_rec_len_from_disk(de->rec_len, 262 ctx->pos += ext4_rec_len_from_disk(de->rec_len,
238 sb->s_blocksize); 263 sb->s_blocksize);
239 } 264 }
240 offset = 0; 265 if ((ctx->pos < inode->i_size) && !dir_relax(inode))
266 goto done;
241 brelse(bh); 267 brelse(bh);
242 if (ctx->pos < inode->i_size) { 268 bh = NULL;
243 if (!dir_relax(inode)) 269 offset = 0;
244 return 0;
245 }
246 } 270 }
247 return 0; 271done:
272 err = 0;
273errout:
274#ifdef CONFIG_EXT4_FS_ENCRYPTION
275 ext4_put_fname_crypto_ctx(&enc_ctx);
276 ext4_fname_crypto_free_buffer(&fname_crypto_str);
277#endif
278 brelse(bh);
279 return err;
248} 280}
249 281
250static inline int is_32bit_api(void) 282static inline int is_32bit_api(void)
@@ -384,10 +416,15 @@ void ext4_htree_free_dir_info(struct dir_private_info *p)
384 416
385/* 417/*
386 * Given a directory entry, enter it into the fname rb tree. 418 * Given a directory entry, enter it into the fname rb tree.
419 *
420 * When filename encryption is enabled, the dirent will hold the
421 * encrypted filename, while the htree will hold the decrypted filename.
422 * The decrypted filename is passed in via the ent_name parameter.
387 */ 423 */
388int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 424int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
389 __u32 minor_hash, 425 __u32 minor_hash,
390 struct ext4_dir_entry_2 *dirent) 426 struct ext4_dir_entry_2 *dirent,
427 struct ext4_str *ent_name)
391{ 428{
392 struct rb_node **p, *parent = NULL; 429 struct rb_node **p, *parent = NULL;
393 struct fname *fname, *new_fn; 430 struct fname *fname, *new_fn;
@@ -398,17 +435,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
398 p = &info->root.rb_node; 435 p = &info->root.rb_node;
399 436
400 /* Create and allocate the fname structure */ 437 /* Create and allocate the fname structure */
401 len = sizeof(struct fname) + dirent->name_len + 1; 438 len = sizeof(struct fname) + ent_name->len + 1;
402 new_fn = kzalloc(len, GFP_KERNEL); 439 new_fn = kzalloc(len, GFP_KERNEL);
403 if (!new_fn) 440 if (!new_fn)
404 return -ENOMEM; 441 return -ENOMEM;
405 new_fn->hash = hash; 442 new_fn->hash = hash;
406 new_fn->minor_hash = minor_hash; 443 new_fn->minor_hash = minor_hash;
407 new_fn->inode = le32_to_cpu(dirent->inode); 444 new_fn->inode = le32_to_cpu(dirent->inode);
408 new_fn->name_len = dirent->name_len; 445 new_fn->name_len = ent_name->len;
409 new_fn->file_type = dirent->file_type; 446 new_fn->file_type = dirent->file_type;
410 memcpy(new_fn->name, dirent->name, dirent->name_len); 447 memcpy(new_fn->name, ent_name->name, ent_name->len);
411 new_fn->name[dirent->name_len] = 0; 448 new_fn->name[ent_name->len] = 0;
412 449
413 while (*p) { 450 while (*p) {
414 parent = *p; 451 parent = *p;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c8eb32eefc3c..ef267adce19a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -422,7 +422,7 @@ enum {
422 EXT4_INODE_DIRTY = 8, 422 EXT4_INODE_DIRTY = 8,
423 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ 423 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
424 EXT4_INODE_NOCOMPR = 10, /* Don't compress */ 424 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
425 EXT4_INODE_ENCRYPT = 11, /* Compression error */ 425 EXT4_INODE_ENCRYPT = 11, /* Encrypted file */
426/* End compression flags --- maybe not all used */ 426/* End compression flags --- maybe not all used */
427 EXT4_INODE_INDEX = 12, /* hash-indexed directory */ 427 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
428 EXT4_INODE_IMAGIC = 13, /* AFS directory */ 428 EXT4_INODE_IMAGIC = 13, /* AFS directory */
@@ -582,6 +582,15 @@ enum {
582#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 582#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
583#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 583#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
584 584
585/* Encryption algorithms */
586#define EXT4_ENCRYPTION_MODE_INVALID 0
587#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1
588#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2
589#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3
590#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4
591
592#include "ext4_crypto.h"
593
585/* 594/*
586 * ioctl commands 595 * ioctl commands
587 */ 596 */
@@ -603,6 +612,9 @@ enum {
603#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) 612#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
604#define EXT4_IOC_SWAP_BOOT _IO('f', 17) 613#define EXT4_IOC_SWAP_BOOT _IO('f', 17)
605#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) 614#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
615#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy)
616#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16])
617#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy)
606 618
607#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 619#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
608/* 620/*
@@ -939,6 +951,11 @@ struct ext4_inode_info {
939 951
940 /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ 952 /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
941 __u32 i_csum_seed; 953 __u32 i_csum_seed;
954
955#ifdef CONFIG_EXT4_FS_ENCRYPTION
956 /* Encryption params */
957 struct ext4_encryption_key i_encryption_key;
958#endif
942}; 959};
943 960
944/* 961/*
@@ -1142,7 +1159,8 @@ struct ext4_super_block {
1142 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ 1159 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
1143 __u8 s_log_groups_per_flex; /* FLEX_BG group size */ 1160 __u8 s_log_groups_per_flex; /* FLEX_BG group size */
1144 __u8 s_checksum_type; /* metadata checksum algorithm used */ 1161 __u8 s_checksum_type; /* metadata checksum algorithm used */
1145 __le16 s_reserved_pad; 1162 __u8 s_encryption_level; /* versioning level for encryption */
1163 __u8 s_reserved_pad; /* Padding to next 32bits */
1146 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ 1164 __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
1147 __le32 s_snapshot_inum; /* Inode number of active snapshot */ 1165 __le32 s_snapshot_inum; /* Inode number of active snapshot */
1148 __le32 s_snapshot_id; /* sequential ID of active snapshot */ 1166 __le32 s_snapshot_id; /* sequential ID of active snapshot */
@@ -1169,7 +1187,9 @@ struct ext4_super_block {
1169 __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ 1187 __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
1170 __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ 1188 __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
1171 __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ 1189 __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
1172 __le32 s_reserved[105]; /* Padding to the end of the block */ 1190 __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
1191 __le32 s_lpf_ino; /* Location of the lost+found inode */
1192 __le32 s_reserved[100]; /* Padding to the end of the block */
1173 __le32 s_checksum; /* crc32c(superblock) */ 1193 __le32 s_checksum; /* crc32c(superblock) */
1174}; 1194};
1175 1195
@@ -1180,8 +1200,16 @@ struct ext4_super_block {
1180/* 1200/*
1181 * run-time mount flags 1201 * run-time mount flags
1182 */ 1202 */
1183#define EXT4_MF_MNTDIR_SAMPLED 0x0001 1203#define EXT4_MF_MNTDIR_SAMPLED 0x0001
1184#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ 1204#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */
1205#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004
1206
1207#ifdef CONFIG_EXT4_FS_ENCRYPTION
1208#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \
1209 EXT4_MF_TEST_DUMMY_ENCRYPTION))
1210#else
1211#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
1212#endif
1185 1213
1186/* Number of quota types we support */ 1214/* Number of quota types we support */
1187#define EXT4_MAXQUOTAS 2 1215#define EXT4_MAXQUOTAS 2
@@ -1351,6 +1379,12 @@ struct ext4_sb_info {
1351 struct ratelimit_state s_err_ratelimit_state; 1379 struct ratelimit_state s_err_ratelimit_state;
1352 struct ratelimit_state s_warning_ratelimit_state; 1380 struct ratelimit_state s_warning_ratelimit_state;
1353 struct ratelimit_state s_msg_ratelimit_state; 1381 struct ratelimit_state s_msg_ratelimit_state;
1382
1383#ifdef CONFIG_EXT4_FS_ENCRYPTION
1384 /* Encryption */
1385 uint32_t s_file_encryption_mode;
1386 uint32_t s_dir_encryption_mode;
1387#endif
1354}; 1388};
1355 1389
1356static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) 1390static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1466,6 +1500,18 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1466#define EXT4_SB(sb) (sb) 1500#define EXT4_SB(sb) (sb)
1467#endif 1501#endif
1468 1502
1503/*
1504 * Returns true if the inode is encrypted
1505 */
1506static inline int ext4_encrypted_inode(struct inode *inode)
1507{
1508#ifdef CONFIG_EXT4_FS_ENCRYPTION
1509 return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT);
1510#else
1511 return 0;
1512#endif
1513}
1514
1469#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime 1515#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
1470 1516
1471/* 1517/*
@@ -1575,8 +1621,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
1575 EXT4_FEATURE_INCOMPAT_EXTENTS| \ 1621 EXT4_FEATURE_INCOMPAT_EXTENTS| \
1576 EXT4_FEATURE_INCOMPAT_64BIT| \ 1622 EXT4_FEATURE_INCOMPAT_64BIT| \
1577 EXT4_FEATURE_INCOMPAT_FLEX_BG| \ 1623 EXT4_FEATURE_INCOMPAT_FLEX_BG| \
1578 EXT4_FEATURE_INCOMPAT_MMP | \ 1624 EXT4_FEATURE_INCOMPAT_MMP | \
1579 EXT4_FEATURE_INCOMPAT_INLINE_DATA) 1625 EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
1626 EXT4_FEATURE_INCOMPAT_ENCRYPT)
1580#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ 1627#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
1581 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ 1628 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
1582 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ 1629 EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -2001,6 +2048,99 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
2001 struct ext4_group_desc *gdp); 2048 struct ext4_group_desc *gdp);
2002ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); 2049ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
2003 2050
2051/* crypto_policy.c */
2052int ext4_is_child_context_consistent_with_parent(struct inode *parent,
2053 struct inode *child);
2054int ext4_inherit_context(struct inode *parent, struct inode *child);
2055void ext4_to_hex(char *dst, char *src, size_t src_size);
2056int ext4_process_policy(const struct ext4_encryption_policy *policy,
2057 struct inode *inode);
2058int ext4_get_policy(struct inode *inode,
2059 struct ext4_encryption_policy *policy);
2060
2061/* crypto.c */
2062bool ext4_valid_contents_enc_mode(uint32_t mode);
2063uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size);
2064extern struct workqueue_struct *ext4_read_workqueue;
2065struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode);
2066void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
2067void ext4_restore_control_page(struct page *data_page);
2068struct page *ext4_encrypt(struct inode *inode,
2069 struct page *plaintext_page);
2070int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page);
2071int ext4_decrypt_one(struct inode *inode, struct page *page);
2072int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
2073
2074#ifdef CONFIG_EXT4_FS_ENCRYPTION
2075int ext4_init_crypto(void);
2076void ext4_exit_crypto(void);
2077static inline int ext4_sb_has_crypto(struct super_block *sb)
2078{
2079 return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
2080}
2081#else
2082static inline int ext4_init_crypto(void) { return 0; }
2083static inline void ext4_exit_crypto(void) { }
2084static inline int ext4_sb_has_crypto(struct super_block *sb)
2085{
2086 return 0;
2087}
2088#endif
2089
2090/* crypto_fname.c */
2091bool ext4_valid_filenames_enc_mode(uint32_t mode);
2092u32 ext4_fname_crypto_round_up(u32 size, u32 blksize);
2093int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx,
2094 u32 ilen, struct ext4_str *crypto_str);
2095int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
2096 const struct ext4_str *iname,
2097 struct ext4_str *oname);
2098int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx,
2099 const struct ext4_dir_entry_2 *de,
2100 struct ext4_str *oname);
2101int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx,
2102 const struct qstr *iname,
2103 struct ext4_str *oname);
2104int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx,
2105 const struct qstr *iname,
2106 struct dx_hash_info *hinfo);
2107int ext4_fname_disk_to_hash(struct ext4_fname_crypto_ctx *ctx,
2108 const struct ext4_dir_entry_2 *de,
2109 struct dx_hash_info *hinfo);
2110int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx,
2111 u32 namelen);
2112
2113#ifdef CONFIG_EXT4_FS_ENCRYPTION
2114void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx);
2115struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode,
2116 u32 max_len);
2117void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str);
2118#else
2119static inline
2120void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { }
2121static inline
2122struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode,
2123 u32 max_len)
2124{
2125 return NULL;
2126}
2127static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { }
2128#endif
2129
2130
2131/* crypto_key.c */
2132int ext4_generate_encryption_key(struct inode *inode);
2133
2134#ifdef CONFIG_EXT4_FS_ENCRYPTION
2135int ext4_has_encryption_key(struct inode *inode);
2136#else
2137static inline int ext4_has_encryption_key(struct inode *inode)
2138{
2139 return 0;
2140}
2141#endif
2142
2143
2004/* dir.c */ 2144/* dir.c */
2005extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, 2145extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
2006 struct file *, 2146 struct file *,
@@ -2011,17 +2151,20 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
2011 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ 2151 unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
2012 (de), (bh), (buf), (size), (offset))) 2152 (de), (bh), (buf), (size), (offset)))
2013extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, 2153extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
2014 __u32 minor_hash, 2154 __u32 minor_hash,
2015 struct ext4_dir_entry_2 *dirent); 2155 struct ext4_dir_entry_2 *dirent,
2156 struct ext4_str *ent_name);
2016extern void ext4_htree_free_dir_info(struct dir_private_info *p); 2157extern void ext4_htree_free_dir_info(struct dir_private_info *p);
2017extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, 2158extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
2018 struct buffer_head *bh, 2159 struct buffer_head *bh,
2019 void *buf, int buf_size, 2160 void *buf, int buf_size,
2020 const char *name, int namelen, 2161 const char *name, int namelen,
2021 struct ext4_dir_entry_2 **dest_de); 2162 struct ext4_dir_entry_2 **dest_de);
2022void ext4_insert_dentry(struct inode *inode, 2163int ext4_insert_dentry(struct inode *dir,
2164 struct inode *inode,
2023 struct ext4_dir_entry_2 *de, 2165 struct ext4_dir_entry_2 *de,
2024 int buf_size, 2166 int buf_size,
2167 const struct qstr *iname,
2025 const char *name, int namelen); 2168 const char *name, int namelen);
2026static inline void ext4_update_dx_flag(struct inode *inode) 2169static inline void ext4_update_dx_flag(struct inode *inode)
2027{ 2170{
@@ -2099,6 +2242,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
2099extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); 2242extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
2100 2243
2101/* inode.c */ 2244/* inode.c */
2245int ext4_inode_is_fast_symlink(struct inode *inode);
2102struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); 2246struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
2103struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); 2247struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
2104int ext4_get_block_write(struct inode *inode, sector_t iblock, 2248int ext4_get_block_write(struct inode *inode, sector_t iblock,
@@ -2189,6 +2333,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
2189 void *entry_buf, 2333 void *entry_buf,
2190 int buf_size, 2334 int buf_size,
2191 int csum_size); 2335 int csum_size);
2336extern int ext4_empty_dir(struct inode *inode);
2192 2337
2193/* resize.c */ 2338/* resize.c */
2194extern int ext4_group_add(struct super_block *sb, 2339extern int ext4_group_add(struct super_block *sb,
@@ -2698,6 +2843,10 @@ static inline void ext4_set_de_type(struct super_block *sb,
2698 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; 2843 de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
2699} 2844}
2700 2845
2846/* readpages.c */
2847extern int ext4_mpage_readpages(struct address_space *mapping,
2848 struct list_head *pages, struct page *page,
2849 unsigned nr_pages);
2701 2850
2702/* symlink.c */ 2851/* symlink.c */
2703extern const struct inode_operations ext4_symlink_inode_operations; 2852extern const struct inode_operations ext4_symlink_inode_operations;
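
The ext4.h hunks above follow the kernel's usual config-stub idiom: under CONFIG_EXT4_FS_ENCRYPTION each crypto entry point gets a real prototype, while the #else branch supplies static inline no-ops (ext4_init_crypto() returning 0, ext4_sb_has_crypto() returning 0), so call sites compile unchanged either way. A minimal, self-contained sketch of that idiom; the CONFIG_FOO_CRYPTO macro and foo_* names are illustrative, not the kernel's:

/* stub_idiom.c - illustrative only; mirrors the #ifdef pattern above */
#include <stdio.h>

/* #define CONFIG_FOO_CRYPTO 1 */	/* toggled at build time */

#ifdef CONFIG_FOO_CRYPTO
int foo_init_crypto(void);		/* real implementation linked in elsewhere */
#else
static inline int foo_init_crypto(void) { return 0; }	/* no-op stub */
#endif

int main(void)
{
	/* the caller needs no #ifdef of its own; the stub keeps this line valid */
	printf("init: %d\n", foo_init_crypto());
	return 0;
}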
diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h
new file mode 100644
index 000000000000..c2ba35a914b6
--- /dev/null
+++ b/fs/ext4/ext4_crypto.h
@@ -0,0 +1,147 @@
1/*
2 * linux/fs/ext4/ext4_crypto.h
3 *
4 * Copyright (C) 2015, Google, Inc.
5 *
6 * This contains encryption header content for ext4
7 *
8 * Written by Michael Halcrow, 2015.
9 */
10
11#ifndef _EXT4_CRYPTO_H
12#define _EXT4_CRYPTO_H
13
14#include <linux/fs.h>
15
16#define EXT4_KEY_DESCRIPTOR_SIZE 8
17
18/* Policy provided via an ioctl on the topmost directory */
19struct ext4_encryption_policy {
20 char version;
21 char contents_encryption_mode;
22 char filenames_encryption_mode;
23 char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
24} __attribute__((__packed__));
25
26#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1
27#define EXT4_KEY_DERIVATION_NONCE_SIZE 16
28
29/**
30 * Encryption context for inode
31 *
32 * Protector format:
33 * 1 byte: Protector format (1 = this version)
34 * 1 byte: File contents encryption mode
35 * 1 byte: File names encryption mode
36 * 1 byte: Reserved
37 * 8 bytes: Master Key descriptor
38 * 16 bytes: Encryption Key derivation nonce
39 */
40struct ext4_encryption_context {
41 char format;
42 char contents_encryption_mode;
43 char filenames_encryption_mode;
44 char reserved;
45 char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE];
46 char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE];
47} __attribute__((__packed__));
48
49/* Encryption parameters */
50#define EXT4_XTS_TWEAK_SIZE 16
51#define EXT4_AES_128_ECB_KEY_SIZE 16
52#define EXT4_AES_256_GCM_KEY_SIZE 32
53#define EXT4_AES_256_CBC_KEY_SIZE 32
54#define EXT4_AES_256_CTS_KEY_SIZE 32
55#define EXT4_AES_256_XTS_KEY_SIZE 64
56#define EXT4_MAX_KEY_SIZE 64
57
58#define EXT4_KEY_DESC_PREFIX "ext4:"
59#define EXT4_KEY_DESC_PREFIX_SIZE 5
60
61struct ext4_encryption_key {
62 uint32_t mode;
63 char raw[EXT4_MAX_KEY_SIZE];
64 uint32_t size;
65};
66
67#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001
68#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002
69
70struct ext4_crypto_ctx {
71 struct crypto_tfm *tfm; /* Crypto API context */
72 struct page *bounce_page; /* Ciphertext page on write path */
73 struct page *control_page; /* Original page on write path */
74 struct bio *bio; /* The bio for this context */
75 struct work_struct work; /* Work queue for read complete path */
76 struct list_head free_list; /* Free list */
77 int flags; /* Flags */
78 int mode; /* Encryption mode for tfm */
79};
80
81struct ext4_completion_result {
82 struct completion completion;
83 int res;
84};
85
86#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \
87 struct ext4_completion_result ecr = { \
88 COMPLETION_INITIALIZER((ecr).completion), 0 }
89
90static inline int ext4_encryption_key_size(int mode)
91{
92 switch (mode) {
93 case EXT4_ENCRYPTION_MODE_AES_256_XTS:
94 return EXT4_AES_256_XTS_KEY_SIZE;
95 case EXT4_ENCRYPTION_MODE_AES_256_GCM:
96 return EXT4_AES_256_GCM_KEY_SIZE;
97 case EXT4_ENCRYPTION_MODE_AES_256_CBC:
98 return EXT4_AES_256_CBC_KEY_SIZE;
99 case EXT4_ENCRYPTION_MODE_AES_256_CTS:
100 return EXT4_AES_256_CTS_KEY_SIZE;
101 default:
102 BUG();
103 }
104 return 0;
105}
106
107#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4
108#define EXT4_CRYPTO_BLOCK_SIZE 16
109#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32
110
111struct ext4_str {
112 unsigned char *name;
113 u32 len;
114};
115
116struct ext4_fname_crypto_ctx {
117 u32 lim;
118 char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE];
119 struct crypto_ablkcipher *ctfm;
120 struct crypto_hash *htfm;
121 struct page *workpage;
122 struct ext4_encryption_key key;
123 unsigned has_valid_key : 1;
124 unsigned ctfm_key_is_ready : 1;
125};
126
127/**
128 * For encrypted symlinks, the ciphertext length is stored at the beginning
129 * of the string in little-endian format.
130 */
131struct ext4_encrypted_symlink_data {
132 __le16 len;
133 char encrypted_path[1];
134} __attribute__((__packed__));
135
136/**
137 * This function is used to calculate the disk space required to
138 * store a filename of length l in encrypted symlink format.
139 */
140static inline u32 encrypted_symlink_data_len(u32 l)
141{
142 if (l < EXT4_CRYPTO_BLOCK_SIZE)
143 l = EXT4_CRYPTO_BLOCK_SIZE;
144 return (l + sizeof(struct ext4_encrypted_symlink_data) - 1);
145}
146
147#endif /* _EXT4_CRYPTO_H */
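
The context layout documented above packs to 1 + 1 + 1 + 1 + 8 + 16 = 28 bytes, and encrypted_symlink_data_len() rounds short targets up to one 16-byte cipher block before adding the header (sizeof(struct ext4_encrypted_symlink_data) is 3, minus the 1-byte placeholder array, so the net overhead is 2 bytes). A userspace sketch that checks both, re-declaring the structures locally rather than including kernel headers:

/* layout_check.c - userspace sketch; layouts copied from ext4_crypto.h above */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define KEY_DESCRIPTOR_SIZE	8
#define DERIVATION_NONCE_SIZE	16
#define CRYPTO_BLOCK_SIZE	16

struct encryption_context {
	char format;
	char contents_encryption_mode;
	char filenames_encryption_mode;
	char reserved;
	char master_key_descriptor[KEY_DESCRIPTOR_SIZE];
	char nonce[DERIVATION_NONCE_SIZE];
} __attribute__((__packed__));

struct encrypted_symlink_data {
	uint16_t len;			/* __le16 on disk */
	char encrypted_path[1];
} __attribute__((__packed__));

static uint32_t encrypted_symlink_data_len(uint32_t l)
{
	if (l < CRYPTO_BLOCK_SIZE)
		l = CRYPTO_BLOCK_SIZE;
	return l + sizeof(struct encrypted_symlink_data) - 1;
}

int main(void)
{
	assert(sizeof(struct encryption_context) == 28);	/* 1+1+1+1+8+16 */
	printf("len(5)  = %u\n", encrypted_symlink_data_len(5));	/* 18: padded to a block */
	printf("len(40) = %u\n", encrypted_symlink_data_len(40));	/* 42: 40 + 2-byte header */
	return 0;
}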
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bed43081720f..973816bfe4a9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1717,12 +1717,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
1717{ 1717{
1718 unsigned short ext1_ee_len, ext2_ee_len; 1718 unsigned short ext1_ee_len, ext2_ee_len;
1719 1719
1720 /*
1721 * Make sure that both extents are initialized. We don't merge
1722 * unwritten extents so that we can be sure that end_io code has
1723 * the extent that was written properly split out and conversion to
1724 * initialized is trivial.
1725 */
1726 if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) 1720 if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
1727 return 0; 1721 return 0;
1728 1722
@@ -3128,6 +3122,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
3128 ee_len = ext4_ext_get_actual_len(ex); 3122 ee_len = ext4_ext_get_actual_len(ex);
3129 ee_pblock = ext4_ext_pblock(ex); 3123 ee_pblock = ext4_ext_pblock(ex);
3130 3124
3125 if (ext4_encrypted_inode(inode))
3126 return ext4_encrypted_zeroout(inode, ex);
3127
3131 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); 3128 ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
3132 if (ret > 0) 3129 if (ret > 0)
3133 ret = 0; 3130 ret = 0;
@@ -4535,19 +4532,7 @@ got_allocated_blocks:
4535 */ 4532 */
4536 reserved_clusters = get_reserved_cluster_alloc(inode, 4533 reserved_clusters = get_reserved_cluster_alloc(inode,
4537 map->m_lblk, allocated); 4534 map->m_lblk, allocated);
4538 if (map_from_cluster) { 4535 if (!map_from_cluster) {
4539 if (reserved_clusters) {
4540 /*
4541 * We have clusters reserved for this range.
4542 * But since we are not doing actual allocation
4543 * and are simply using blocks from previously
4544 * allocated cluster, we should release the
4545 * reservation and not claim quota.
4546 */
4547 ext4_da_update_reserve_space(inode,
4548 reserved_clusters, 0);
4549 }
4550 } else {
4551 BUG_ON(allocated_clusters < reserved_clusters); 4536 BUG_ON(allocated_clusters < reserved_clusters);
4552 if (reserved_clusters < allocated_clusters) { 4537 if (reserved_clusters < allocated_clusters) {
4553 struct ext4_inode_info *ei = EXT4_I(inode); 4538 struct ext4_inode_info *ei = EXT4_I(inode);
@@ -4803,12 +4788,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4803 else 4788 else
4804 max_blocks -= lblk; 4789 max_blocks -= lblk;
4805 4790
4806 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
4807 EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4808 EXT4_EX_NOCACHE;
4809 if (mode & FALLOC_FL_KEEP_SIZE)
4810 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4811
4812 mutex_lock(&inode->i_mutex); 4791 mutex_lock(&inode->i_mutex);
4813 4792
4814 /* 4793 /*
@@ -4825,15 +4804,28 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4825 ret = inode_newsize_ok(inode, new_size); 4804 ret = inode_newsize_ok(inode, new_size);
4826 if (ret) 4805 if (ret)
4827 goto out_mutex; 4806 goto out_mutex;
4828 /*
4829 * If we have a partial block after EOF we have to allocate
4830 * the entire block.
4831 */
4832 if (partial_end)
4833 max_blocks += 1;
4834 } 4807 }
4835 4808
4809 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
4810 if (mode & FALLOC_FL_KEEP_SIZE)
4811 flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
4812
4813 /* Preallocate the range including the unaligned edges */
4814 if (partial_begin || partial_end) {
4815 ret = ext4_alloc_file_blocks(file,
4816 round_down(offset, 1 << blkbits) >> blkbits,
4817 (round_up((offset + len), 1 << blkbits) -
4818 round_down(offset, 1 << blkbits)) >> blkbits,
4819 new_size, flags, mode);
4820 if (ret)
4821 goto out_mutex;
4822
4823 }
4824
4825 /* Zero range excluding the unaligned edges */
4836 if (max_blocks > 0) { 4826 if (max_blocks > 0) {
4827 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
4828 EXT4_EX_NOCACHE);
4837 4829
4838 /* Now release the pages and zero block aligned part of pages*/ 4830 /* Now release the pages and zero block aligned part of pages*/
4839 truncate_pagecache_range(inode, start, end - 1); 4831 truncate_pagecache_range(inode, start, end - 1);
@@ -4847,19 +4839,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4847 flags, mode); 4839 flags, mode);
4848 if (ret) 4840 if (ret)
4849 goto out_dio; 4841 goto out_dio;
4850 /*
4851 * Remove entire range from the extent status tree.
4852 *
4853 * ext4_es_remove_extent(inode, lblk, max_blocks) is
4854 * NOT sufficient. I'm not sure why this is the case,
4855 * but let's be conservative and remove the extent
4856 * status tree for the entire inode. There should be
4857 * no outstanding delalloc extents thanks to the
4858 * filemap_write_and_wait_range() call above.
4859 */
4860 ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
4861 if (ret)
4862 goto out_dio;
4863 } 4842 }
4864 if (!partial_begin && !partial_end) 4843 if (!partial_begin && !partial_end)
4865 goto out_dio; 4844 goto out_dio;
@@ -4922,6 +4901,20 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4922 ext4_lblk_t lblk; 4901 ext4_lblk_t lblk;
4923 unsigned int blkbits = inode->i_blkbits; 4902 unsigned int blkbits = inode->i_blkbits;
4924 4903
4904 /*
4905 * Encrypted inodes can't handle collapse range or insert
4906 * range since we would need to re-encrypt blocks with a
4907 * different IV or XTS tweak (which are based on the logical
4908 * block number).
4909 *
4910 * XXX It's not clear why zero range isn't working, but we'll
4911 * leave it disabled for encrypted inodes for now. This is a
4912 * bug we should fix....
4913 */
4914 if (ext4_encrypted_inode(inode) &&
4915 (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
4916 return -EOPNOTSUPP;
4917
4925 /* Return error if mode is not supported */ 4918 /* Return error if mode is not supported */
4926 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 4919 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
4927 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) 4920 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
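
Because the XTS tweak is derived from the logical block number, moving ciphertext to a different logical offset would no longer decrypt correctly, which is why the hunk above rejects collapse range (and, pending the noted bug, zero range) on encrypted inodes. A hedged userspace illustration of what callers should expect; the mount path is hypothetical:

/* falloc_check.c - expects EOPNOTSUPP on an encrypted ext4 file */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/crypt/file", O_RDWR);	/* hypothetical encrypted file */
	if (fd < 0) { perror("open"); return 1; }
	/* collapsing 4 KiB at offset 0 would shift ciphertext onto new
	 * logical blocks, so the kernel refuses up front */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, 4096) < 0 &&
	    errno == EOPNOTSUPP)
		fprintf(stderr, "collapse range rejected, as expected\n");
	close(fd);
	return 0;
}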
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e04d45733976..d33d5a6852b9 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -9,12 +9,10 @@
9 * 9 *
10 * Ext4 extents status tree core functions. 10 * Ext4 extents status tree core functions.
11 */ 11 */
12#include <linux/rbtree.h>
13#include <linux/list_sort.h> 12#include <linux/list_sort.h>
14#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
15#include <linux/seq_file.h> 14#include <linux/seq_file.h>
16#include "ext4.h" 15#include "ext4.h"
17#include "extents_status.h"
18 16
19#include <trace/events/ext4.h> 17#include <trace/events/ext4.h>
20 18
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e576d682b353..0613c256c344 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -20,7 +20,6 @@
20 20
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/jbd2.h>
24#include <linux/mount.h> 23#include <linux/mount.h>
25#include <linux/path.h> 24#include <linux/path.h>
26#include <linux/quotaops.h> 25#include <linux/quotaops.h>
@@ -221,6 +220,13 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
221 220
222static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 221static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
223{ 222{
223 struct inode *inode = file->f_mapping->host;
224
225 if (ext4_encrypted_inode(inode)) {
226 int err = ext4_generate_encryption_key(inode);
227 if (err)
228 return 0;
229 }
224 file_accessed(file); 230 file_accessed(file);
225 if (IS_DAX(file_inode(file))) { 231 if (IS_DAX(file_inode(file))) {
226 vma->vm_ops = &ext4_dax_vm_ops; 232 vma->vm_ops = &ext4_dax_vm_ops;
@@ -238,6 +244,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
238 struct vfsmount *mnt = filp->f_path.mnt; 244 struct vfsmount *mnt = filp->f_path.mnt;
239 struct path path; 245 struct path path;
240 char buf[64], *cp; 246 char buf[64], *cp;
247 int ret;
241 248
242 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && 249 if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
243 !(sb->s_flags & MS_RDONLY))) { 250 !(sb->s_flags & MS_RDONLY))) {
@@ -276,11 +283,17 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
276 * writing and the journal is present 283 * writing and the journal is present
277 */ 284 */
278 if (filp->f_mode & FMODE_WRITE) { 285 if (filp->f_mode & FMODE_WRITE) {
279 int ret = ext4_inode_attach_jinode(inode); 286 ret = ext4_inode_attach_jinode(inode);
280 if (ret < 0) 287 if (ret < 0)
281 return ret; 288 return ret;
282 } 289 }
283 return dquot_file_open(inode, filp); 290 ret = dquot_file_open(inode, filp);
291 if (!ret && ext4_encrypted_inode(inode)) {
292 ret = ext4_generate_encryption_key(inode);
293 if (ret)
294 ret = -EACCES;
295 }
296 return ret;
284} 297}
285 298
286/* 299/*
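
With the hunk above, ext4_file_open() attempts key setup only after dquot_file_open() succeeds, and any failure in ext4_generate_encryption_key() (typically no usable key in the keyring) is reported as -EACCES. From userspace that surfaces as an ordinary permission error; a short sketch with a hypothetical path:

/* open_nokey.c - opening an encrypted file without its key yields EACCES */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	int fd = open("/mnt/crypt/secret.txt", O_RDONLY);	/* hypothetical path */
	if (fd < 0 && errno == EACCES)
		fprintf(stderr, "no usable encryption key for this inode\n");
	return fd < 0 ? 1 : 0;
}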
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index a8bc47f75fa0..e9d632e9aa4b 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -26,7 +26,6 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/writeback.h> 28#include <linux/writeback.h>
29#include <linux/jbd2.h>
30#include <linux/blkdev.h> 29#include <linux/blkdev.h>
31 30
32#include "ext4.h" 31#include "ext4.h"
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 3d586f02883e..e026aa941fd5 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -10,7 +10,6 @@
10 */ 10 */
11 11
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/jbd2.h>
14#include <linux/cryptohash.h> 13#include <linux/cryptohash.h>
15#include "ext4.h" 14#include "ext4.h"
16 15
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index ac644c31ca67..2cf18a2d5c72 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -14,7 +14,6 @@
14 14
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/jbd2.h>
18#include <linux/stat.h> 17#include <linux/stat.h>
19#include <linux/string.h> 18#include <linux/string.h>
20#include <linux/quotaops.h> 19#include <linux/quotaops.h>
@@ -997,6 +996,12 @@ got:
997 ei->i_block_group = group; 996 ei->i_block_group = group;
998 ei->i_last_alloc_group = ~0; 997 ei->i_last_alloc_group = ~0;
999 998
 999 /* If the directory is encrypted, then we should encrypt the inode. */
1000 if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) &&
1001 (ext4_encrypted_inode(dir) ||
1002 DUMMY_ENCRYPTION_ENABLED(sbi)))
1003 ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
1004
1000 ext4_set_inode_flags(inode); 1005 ext4_set_inode_flags(inode);
1001 if (IS_DIRSYNC(inode)) 1006 if (IS_DIRSYNC(inode))
1002 ext4_handle_sync(handle); 1007 ext4_handle_sync(handle);
@@ -1029,11 +1034,28 @@ got:
1029 ext4_set_inode_state(inode, EXT4_STATE_NEW); 1034 ext4_set_inode_state(inode, EXT4_STATE_NEW);
1030 1035
1031 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; 1036 ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
1032 1037#ifdef CONFIG_EXT4_FS_ENCRYPTION
1038 if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) &&
1039 (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) {
1040 ei->i_inline_off = 0;
1041 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
1042 EXT4_FEATURE_INCOMPAT_INLINE_DATA))
1043 ext4_set_inode_state(inode,
1044 EXT4_STATE_MAY_INLINE_DATA);
1045 } else {
 1046 /* Inline data and encryption are incompatible;
 1047 * we turn off inline data since encryption is enabled. */
1048 ei->i_inline_off = 1;
1049 if (EXT4_HAS_INCOMPAT_FEATURE(sb,
1050 EXT4_FEATURE_INCOMPAT_INLINE_DATA))
1051 ext4_clear_inode_state(inode,
1052 EXT4_STATE_MAY_INLINE_DATA);
1053 }
1054#else
1033 ei->i_inline_off = 0; 1055 ei->i_inline_off = 0;
1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) 1056 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
1035 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1057 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
1036 1058#endif
1037 ret = inode; 1059 ret = inode;
1038 err = dquot_alloc_inode(inode); 1060 err = dquot_alloc_inode(inode);
1039 if (err) 1061 if (err)
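
The allocation-side policy above boils down to two decisions: new regular files, directories, and symlinks inherit EXT4_INODE_ENCRYPT from an encrypted (or dummy-encrypted) parent, and an inode on a filesystem with encryption modes configured never gets EXT4_STATE_MAY_INLINE_DATA. A standalone sketch of that decision table; it simplifies the sbi mode check in the hunk to a single boolean:

/* inherit.c - sketch of the inode-allocation decisions above (simplified) */
#include <stdbool.h>
#include <stdio.h>
#include <sys/stat.h>

struct new_inode_flags {
	bool encrypt;		/* EXT4_INODE_ENCRYPT */
	bool may_inline;	/* EXT4_STATE_MAY_INLINE_DATA */
};

static struct new_inode_flags decide(mode_t mode, bool parent_encrypted,
				     bool encryption_configured,
				     bool fs_supports_inline)
{
	struct new_inode_flags f = { false, false };

	/* only files, directories and symlinks carry encryption state */
	if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) &&
	    parent_encrypted)
		f.encrypt = true;
	/* inline data and encryption are mutually exclusive */
	f.may_inline = fs_supports_inline && !encryption_configured;
	return f;
}

int main(void)
{
	struct new_inode_flags f = decide(S_IFREG | 0644, true, true, true);
	printf("encrypt=%d may_inline=%d\n", f.encrypt, f.may_inline);
	return 0;
}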
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 4b143febf21f..feb2cafbeace 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -11,11 +11,13 @@
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 */ 13 */
14
15#include <linux/fiemap.h>
16
14#include "ext4_jbd2.h" 17#include "ext4_jbd2.h"
15#include "ext4.h" 18#include "ext4.h"
16#include "xattr.h" 19#include "xattr.h"
17#include "truncate.h" 20#include "truncate.h"
18#include <linux/fiemap.h>
19 21
20#define EXT4_XATTR_SYSTEM_DATA "data" 22#define EXT4_XATTR_SYSTEM_DATA "data"
21#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) 23#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
@@ -972,7 +974,7 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
972 offset = 0; 974 offset = 0;
973 while ((void *)de < dlimit) { 975 while ((void *)de < dlimit) {
974 de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); 976 de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
975 trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", 977 trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n",
976 offset, de_len, de->name_len, de->name, 978 offset, de_len, de->name_len, de->name,
977 de->name_len, le32_to_cpu(de->inode)); 979 de->name_len, le32_to_cpu(de->inode));
978 if (ext4_check_dir_entry(dir, NULL, de, bh, 980 if (ext4_check_dir_entry(dir, NULL, de, bh,
@@ -1014,7 +1016,8 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
1014 err = ext4_journal_get_write_access(handle, iloc->bh); 1016 err = ext4_journal_get_write_access(handle, iloc->bh);
1015 if (err) 1017 if (err)
1016 return err; 1018 return err;
1017 ext4_insert_dentry(inode, de, inline_size, name, namelen); 1019 ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name,
1020 name, namelen);
1018 1021
1019 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); 1022 ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
1020 1023
@@ -1327,6 +1330,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
1327 struct ext4_iloc iloc; 1330 struct ext4_iloc iloc;
1328 void *dir_buf = NULL; 1331 void *dir_buf = NULL;
1329 struct ext4_dir_entry_2 fake; 1332 struct ext4_dir_entry_2 fake;
1333 struct ext4_str tmp_str;
1330 1334
1331 ret = ext4_get_inode_loc(inode, &iloc); 1335 ret = ext4_get_inode_loc(inode, &iloc);
1332 if (ret) 1336 if (ret)
@@ -1398,8 +1402,10 @@ int htree_inlinedir_to_tree(struct file *dir_file,
1398 continue; 1402 continue;
1399 if (de->inode == 0) 1403 if (de->inode == 0)
1400 continue; 1404 continue;
1401 err = ext4_htree_store_dirent(dir_file, 1405 tmp_str.name = de->name;
1402 hinfo->hash, hinfo->minor_hash, de); 1406 tmp_str.len = de->name_len;
1407 err = ext4_htree_store_dirent(dir_file, hinfo->hash,
1408 hinfo->minor_hash, de, &tmp_str);
1403 if (err) { 1409 if (err) {
1404 count = err; 1410 count = err;
1405 goto out; 1411 goto out;
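
ext4_htree_store_dirent() now takes the entry's name as a separate ext4_str, so a caller can hand it either the raw on-disk bytes (as the inline-dir path above does) or a decrypted buffer of different length. The pattern is a plain pointer/length pair borrowing the dirent's storage; a toy sketch with local stand-in types:

/* ent_name.c - sketch of wrapping a dirent name in a (pointer, length) pair */
#include <stdio.h>

struct str { unsigned char *name; unsigned int len; };
struct dirent2 { unsigned int inode; unsigned char name_len; char name[255]; };

int main(void)
{
	struct dirent2 de = { 11, 5, "hello" };
	/* no copy: tmp_str borrows the bytes that live inside the dirent */
	struct str tmp_str = { (unsigned char *)de.name, de.name_len };
	printf("%.*s\n", (int)tmp_str.len, (const char *)tmp_str.name);
	return 0;
}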
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b49cf6e59953..366476e71e10 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -20,7 +20,6 @@
20 20
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/time.h> 22#include <linux/time.h>
23#include <linux/jbd2.h>
24#include <linux/highuid.h> 23#include <linux/highuid.h>
25#include <linux/pagemap.h> 24#include <linux/pagemap.h>
26#include <linux/quotaops.h> 25#include <linux/quotaops.h>
@@ -36,7 +35,6 @@
36#include <linux/kernel.h> 35#include <linux/kernel.h>
37#include <linux/printk.h> 36#include <linux/printk.h>
38#include <linux/slab.h> 37#include <linux/slab.h>
39#include <linux/ratelimit.h>
40#include <linux/bitops.h> 38#include <linux/bitops.h>
41 39
42#include "ext4_jbd2.h" 40#include "ext4_jbd2.h"
@@ -140,7 +138,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
140/* 138/*
141 * Test whether an inode is a fast symlink. 139 * Test whether an inode is a fast symlink.
142 */ 140 */
143static int ext4_inode_is_fast_symlink(struct inode *inode) 141int ext4_inode_is_fast_symlink(struct inode *inode)
144{ 142{
145 int ea_blocks = EXT4_I(inode)->i_file_acl ? 143 int ea_blocks = EXT4_I(inode)->i_file_acl ?
146 EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; 144 EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
@@ -887,6 +885,95 @@ int do_journal_get_write_access(handle_t *handle,
887 885
888static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, 886static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
889 struct buffer_head *bh_result, int create); 887 struct buffer_head *bh_result, int create);
888
889#ifdef CONFIG_EXT4_FS_ENCRYPTION
890static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
891 get_block_t *get_block)
892{
893 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
894 unsigned to = from + len;
895 struct inode *inode = page->mapping->host;
896 unsigned block_start, block_end;
897 sector_t block;
898 int err = 0;
899 unsigned blocksize = inode->i_sb->s_blocksize;
900 unsigned bbits;
901 struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
902 bool decrypt = false;
903
904 BUG_ON(!PageLocked(page));
905 BUG_ON(from > PAGE_CACHE_SIZE);
906 BUG_ON(to > PAGE_CACHE_SIZE);
907 BUG_ON(from > to);
908
909 if (!page_has_buffers(page))
910 create_empty_buffers(page, blocksize, 0);
911 head = page_buffers(page);
912 bbits = ilog2(blocksize);
913 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
914
915 for (bh = head, block_start = 0; bh != head || !block_start;
916 block++, block_start = block_end, bh = bh->b_this_page) {
917 block_end = block_start + blocksize;
918 if (block_end <= from || block_start >= to) {
919 if (PageUptodate(page)) {
920 if (!buffer_uptodate(bh))
921 set_buffer_uptodate(bh);
922 }
923 continue;
924 }
925 if (buffer_new(bh))
926 clear_buffer_new(bh);
927 if (!buffer_mapped(bh)) {
928 WARN_ON(bh->b_size != blocksize);
929 err = get_block(inode, block, bh, 1);
930 if (err)
931 break;
932 if (buffer_new(bh)) {
933 unmap_underlying_metadata(bh->b_bdev,
934 bh->b_blocknr);
935 if (PageUptodate(page)) {
936 clear_buffer_new(bh);
937 set_buffer_uptodate(bh);
938 mark_buffer_dirty(bh);
939 continue;
940 }
941 if (block_end > to || block_start < from)
942 zero_user_segments(page, to, block_end,
943 block_start, from);
944 continue;
945 }
946 }
947 if (PageUptodate(page)) {
948 if (!buffer_uptodate(bh))
949 set_buffer_uptodate(bh);
950 continue;
951 }
952 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
953 !buffer_unwritten(bh) &&
954 (block_start < from || block_end > to)) {
955 ll_rw_block(READ, 1, &bh);
956 *wait_bh++ = bh;
957 decrypt = ext4_encrypted_inode(inode) &&
958 S_ISREG(inode->i_mode);
959 }
960 }
961 /*
962 * If we issued read requests, let them complete.
963 */
964 while (wait_bh > wait) {
965 wait_on_buffer(*--wait_bh);
966 if (!buffer_uptodate(*wait_bh))
967 err = -EIO;
968 }
969 if (unlikely(err))
970 page_zero_new_buffers(page, from, to);
971 else if (decrypt)
972 err = ext4_decrypt_one(inode, page);
973 return err;
974}
975#endif
976
890static int ext4_write_begin(struct file *file, struct address_space *mapping, 977static int ext4_write_begin(struct file *file, struct address_space *mapping,
891 loff_t pos, unsigned len, unsigned flags, 978 loff_t pos, unsigned len, unsigned flags,
892 struct page **pagep, void **fsdata) 979 struct page **pagep, void **fsdata)
@@ -949,11 +1036,19 @@ retry_journal:
949 /* In case writeback began while the page was unlocked */ 1036 /* In case writeback began while the page was unlocked */
950 wait_for_stable_page(page); 1037 wait_for_stable_page(page);
951 1038
1039#ifdef CONFIG_EXT4_FS_ENCRYPTION
1040 if (ext4_should_dioread_nolock(inode))
1041 ret = ext4_block_write_begin(page, pos, len,
1042 ext4_get_block_write);
1043 else
1044 ret = ext4_block_write_begin(page, pos, len,
1045 ext4_get_block);
1046#else
952 if (ext4_should_dioread_nolock(inode)) 1047 if (ext4_should_dioread_nolock(inode))
953 ret = __block_write_begin(page, pos, len, ext4_get_block_write); 1048 ret = __block_write_begin(page, pos, len, ext4_get_block_write);
954 else 1049 else
955 ret = __block_write_begin(page, pos, len, ext4_get_block); 1050 ret = __block_write_begin(page, pos, len, ext4_get_block);
956 1051#endif
957 if (!ret && ext4_should_journal_data(inode)) { 1052 if (!ret && ext4_should_journal_data(inode)) {
958 ret = ext4_walk_page_buffers(handle, page_buffers(page), 1053 ret = ext4_walk_page_buffers(handle, page_buffers(page),
959 from, to, NULL, 1054 from, to, NULL,
@@ -2575,7 +2670,12 @@ retry_journal:
2575 /* In case writeback began while the page was unlocked */ 2670 /* In case writeback began while the page was unlocked */
2576 wait_for_stable_page(page); 2671 wait_for_stable_page(page);
2577 2672
2673#ifdef CONFIG_EXT4_FS_ENCRYPTION
2674 ret = ext4_block_write_begin(page, pos, len,
2675 ext4_da_get_block_prep);
2676#else
2578 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); 2677 ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
2678#endif
2579 if (ret < 0) { 2679 if (ret < 0) {
2580 unlock_page(page); 2680 unlock_page(page);
2581 ext4_journal_stop(handle); 2681 ext4_journal_stop(handle);
@@ -2821,7 +2921,7 @@ static int ext4_readpage(struct file *file, struct page *page)
2821 ret = ext4_readpage_inline(inode, page); 2921 ret = ext4_readpage_inline(inode, page);
2822 2922
2823 if (ret == -EAGAIN) 2923 if (ret == -EAGAIN)
2824 return mpage_readpage(page, ext4_get_block); 2924 return ext4_mpage_readpages(page->mapping, NULL, page, 1);
2825 2925
2826 return ret; 2926 return ret;
2827} 2927}
@@ -2836,7 +2936,7 @@ ext4_readpages(struct file *file, struct address_space *mapping,
2836 if (ext4_has_inline_data(inode)) 2936 if (ext4_has_inline_data(inode))
2837 return 0; 2937 return 0;
2838 2938
2839 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2939 return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
2840} 2940}
2841 2941
2842static void ext4_invalidatepage(struct page *page, unsigned int offset, 2942static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -3033,6 +3133,9 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3033 get_block_func = ext4_get_block_write; 3133 get_block_func = ext4_get_block_write;
3034 dio_flags = DIO_LOCKING; 3134 dio_flags = DIO_LOCKING;
3035 } 3135 }
3136#ifdef CONFIG_EXT4_FS_ENCRYPTION
3137 BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
3138#endif
3036 if (IS_DAX(inode)) 3139 if (IS_DAX(inode))
3037 ret = dax_do_io(iocb, inode, iter, offset, get_block_func, 3140 ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
3038 ext4_end_io_dio, dio_flags); 3141 ext4_end_io_dio, dio_flags);
@@ -3097,6 +3200,11 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
3097 size_t count = iov_iter_count(iter); 3200 size_t count = iov_iter_count(iter);
3098 ssize_t ret; 3201 ssize_t ret;
3099 3202
3203#ifdef CONFIG_EXT4_FS_ENCRYPTION
3204 if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
3205 return 0;
3206#endif
3207
3100 /* 3208 /*
3101 * If we are doing data journalling we don't support O_DIRECT 3209 * If we are doing data journalling we don't support O_DIRECT
3102 */ 3210 */
@@ -3261,6 +3369,13 @@ static int __ext4_block_zero_page_range(handle_t *handle,
3261 /* Uhhuh. Read error. Complain and punt. */ 3369 /* Uhhuh. Read error. Complain and punt. */
3262 if (!buffer_uptodate(bh)) 3370 if (!buffer_uptodate(bh))
3263 goto unlock; 3371 goto unlock;
3372 if (S_ISREG(inode->i_mode) &&
3373 ext4_encrypted_inode(inode)) {
3374 /* We expect the key to be set. */
3375 BUG_ON(!ext4_has_encryption_key(inode));
3376 BUG_ON(blocksize != PAGE_CACHE_SIZE);
3377 WARN_ON_ONCE(ext4_decrypt_one(inode, page));
3378 }
3264 } 3379 }
3265 if (ext4_should_journal_data(inode)) { 3380 if (ext4_should_journal_data(inode)) {
3266 BUFFER_TRACE(bh, "get write access"); 3381 BUFFER_TRACE(bh, "get write access");
@@ -4096,7 +4211,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4096 inode->i_op = &ext4_dir_inode_operations; 4211 inode->i_op = &ext4_dir_inode_operations;
4097 inode->i_fop = &ext4_dir_operations; 4212 inode->i_fop = &ext4_dir_operations;
4098 } else if (S_ISLNK(inode->i_mode)) { 4213 } else if (S_ISLNK(inode->i_mode)) {
4099 if (ext4_inode_is_fast_symlink(inode)) { 4214 if (ext4_inode_is_fast_symlink(inode) &&
4215 !ext4_encrypted_inode(inode)) {
4100 inode->i_op = &ext4_fast_symlink_inode_operations; 4216 inode->i_op = &ext4_fast_symlink_inode_operations;
4101 nd_terminate_link(ei->i_data, inode->i_size, 4217 nd_terminate_link(ei->i_data, inode->i_size,
4102 sizeof(ei->i_data) - 1); 4218 sizeof(ei->i_data) - 1);
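
Returning 0 from ext4_direct_IO() for encrypted regular files makes the generic code fall back to buffered I/O, since every page must pass through the bounce-page encryption path. Userspace can still open with O_DIRECT; the read simply goes through the page cache. A hedged sketch with a hypothetical path:

/* odirect.c - O_DIRECT on an encrypted file falls back to buffered I/O */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	char *buf;
	int fd = open("/mnt/crypt/data.bin", O_RDONLY | O_DIRECT);	/* hypothetical */
	if (fd < 0) { perror("open"); return 1; }
	/* O_DIRECT normally demands aligned buffers; keep that habit even
	 * though the encrypted path services this read from the page cache */
	if (posix_memalign((void **)&buf, 4096, 4096)) { close(fd); return 1; }
	ssize_t n = read(fd, buf, 4096);
	printf("read %zd bytes\n", n);
	free(buf);
	close(fd);
	return 0;
}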
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index f58a0d106726..2cb9e178d1c5 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -8,12 +8,12 @@
8 */ 8 */
9 9
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/jbd2.h>
12#include <linux/capability.h> 11#include <linux/capability.h>
13#include <linux/time.h> 12#include <linux/time.h>
14#include <linux/compat.h> 13#include <linux/compat.h>
15#include <linux/mount.h> 14#include <linux/mount.h>
16#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/random.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "ext4_jbd2.h" 18#include "ext4_jbd2.h"
19#include "ext4.h" 19#include "ext4.h"
@@ -196,6 +196,16 @@ journal_err_out:
196 return err; 196 return err;
197} 197}
198 198
199static int uuid_is_zero(__u8 u[16])
200{
201 int i;
202
203 for (i = 0; i < 16; i++)
204 if (u[i])
205 return 0;
206 return 1;
207}
208
199long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 209long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
200{ 210{
201 struct inode *inode = file_inode(filp); 211 struct inode *inode = file_inode(filp);
@@ -615,7 +625,78 @@ resizefs_out:
615 } 625 }
616 case EXT4_IOC_PRECACHE_EXTENTS: 626 case EXT4_IOC_PRECACHE_EXTENTS:
617 return ext4_ext_precache(inode); 627 return ext4_ext_precache(inode);
628 case EXT4_IOC_SET_ENCRYPTION_POLICY: {
629#ifdef CONFIG_EXT4_FS_ENCRYPTION
630 struct ext4_encryption_policy policy;
631 int err = 0;
632
633 if (copy_from_user(&policy,
634 (struct ext4_encryption_policy __user *)arg,
635 sizeof(policy))) {
636 err = -EFAULT;
637 goto encryption_policy_out;
638 }
618 639
640 err = ext4_process_policy(&policy, inode);
641encryption_policy_out:
642 return err;
643#else
644 return -EOPNOTSUPP;
645#endif
646 }
647 case EXT4_IOC_GET_ENCRYPTION_PWSALT: {
648 int err, err2;
649 struct ext4_sb_info *sbi = EXT4_SB(sb);
650 handle_t *handle;
651
652 if (!ext4_sb_has_crypto(sb))
653 return -EOPNOTSUPP;
654 if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) {
655 err = mnt_want_write_file(filp);
656 if (err)
657 return err;
658 handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
659 if (IS_ERR(handle)) {
660 err = PTR_ERR(handle);
661 goto pwsalt_err_exit;
662 }
663 err = ext4_journal_get_write_access(handle, sbi->s_sbh);
664 if (err)
665 goto pwsalt_err_journal;
666 generate_random_uuid(sbi->s_es->s_encrypt_pw_salt);
667 err = ext4_handle_dirty_metadata(handle, NULL,
668 sbi->s_sbh);
669 pwsalt_err_journal:
670 err2 = ext4_journal_stop(handle);
671 if (err2 && !err)
672 err = err2;
673 pwsalt_err_exit:
674 mnt_drop_write_file(filp);
675 if (err)
676 return err;
677 }
678 if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt,
679 16))
680 return -EFAULT;
681 return 0;
682 }
683 case EXT4_IOC_GET_ENCRYPTION_POLICY: {
684#ifdef CONFIG_EXT4_FS_ENCRYPTION
685 struct ext4_encryption_policy policy;
686 int err = 0;
687
688 if (!ext4_encrypted_inode(inode))
689 return -ENOENT;
690 err = ext4_get_policy(inode, &policy);
691 if (err)
692 return err;
693 if (copy_to_user((void *)arg, &policy, sizeof(policy)))
694 return -EFAULT;
695 return 0;
696#else
697 return -EOPNOTSUPP;
698#endif
699 }
619 default: 700 default:
620 return -ENOTTY; 701 return -ENOTTY;
621 } 702 }
@@ -680,6 +761,9 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
680 case FITRIM: 761 case FITRIM:
681 case EXT4_IOC_RESIZE_FS: 762 case EXT4_IOC_RESIZE_FS:
682 case EXT4_IOC_PRECACHE_EXTENTS: 763 case EXT4_IOC_PRECACHE_EXTENTS:
764 case EXT4_IOC_SET_ENCRYPTION_POLICY:
765 case EXT4_IOC_GET_ENCRYPTION_PWSALT:
766 case EXT4_IOC_GET_ENCRYPTION_POLICY:
683 break; 767 break;
684 default: 768 default:
685 return -ENOIOCTLCMD; 769 return -ENOIOCTLCMD;
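
A hedged userspace sketch of driving EXT4_IOC_SET_ENCRYPTION_POLICY on an empty directory. The structure mirrors ext4_encryption_policy from ext4_crypto.h above; the mode values (1 = AES-256-XTS for contents, 4 = AES-256-CTS for names) and the _IOR('f', 19, ...) request come from the kernel's ext4.h in this series and are restated here as assumptions:

/* setpolicy.c - mark a directory as encrypted (constants assumed, see above) */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define KEY_DESCRIPTOR_SIZE 8

struct encryption_policy {
	char version;			/* 0 in this series */
	char contents_encryption_mode;	/* assumed: 1 = AES-256-XTS */
	char filenames_encryption_mode;	/* assumed: 4 = AES-256-CTS */
	char master_key_descriptor[KEY_DESCRIPTOR_SIZE];
} __attribute__((__packed__));

/* assumed to match EXT4_IOC_SET_ENCRYPTION_POLICY in fs/ext4/ext4.h */
#define IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct encryption_policy)

int main(void)
{
	struct encryption_policy p = { 0, 1, 4, "testkey1" };	/* 8-byte descriptor */
	int fd = open("/mnt/crypt/newdir", O_RDONLY);	/* hypothetical empty dir */
	if (fd < 0) { perror("open"); return 1; }
	if (ioctl(fd, IOC_SET_ENCRYPTION_POLICY, &p) < 0)
		perror("ioctl");
	close(fd);
	return 0;
}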
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..ef22cd951c0c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -26,7 +26,6 @@
26 26
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/pagemap.h> 28#include <linux/pagemap.h>
29#include <linux/jbd2.h>
30#include <linux/time.h> 29#include <linux/time.h>
31#include <linux/fcntl.h> 30#include <linux/fcntl.h>
32#include <linux/stat.h> 31#include <linux/stat.h>
@@ -254,8 +253,9 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
254 struct dx_hash_info *hinfo, 253 struct dx_hash_info *hinfo,
255 struct dx_frame *frame); 254 struct dx_frame *frame);
256static void dx_release(struct dx_frame *frames); 255static void dx_release(struct dx_frame *frames);
257static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, 256static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
258 struct dx_hash_info *hinfo, struct dx_map_entry map[]); 257 unsigned blocksize, struct dx_hash_info *hinfo,
258 struct dx_map_entry map[]);
259static void dx_sort_map(struct dx_map_entry *map, unsigned count); 259static void dx_sort_map(struct dx_map_entry *map, unsigned count);
260static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, 260static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
261 struct dx_map_entry *offsets, int count, unsigned blocksize); 261 struct dx_map_entry *offsets, int count, unsigned blocksize);
@@ -586,8 +586,10 @@ struct stats
586 unsigned bcount; 586 unsigned bcount;
587}; 587};
588 588
589static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, 589static struct stats dx_show_leaf(struct inode *dir,
590 int size, int show_names) 590 struct dx_hash_info *hinfo,
591 struct ext4_dir_entry_2 *de,
592 int size, int show_names)
591{ 593{
592 unsigned names = 0, space = 0; 594 unsigned names = 0, space = 0;
593 char *base = (char *) de; 595 char *base = (char *) de;
@@ -600,12 +602,80 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
600 { 602 {
601 if (show_names) 603 if (show_names)
602 { 604 {
605#ifdef CONFIG_EXT4_FS_ENCRYPTION
606 int len;
607 char *name;
608 struct ext4_str fname_crypto_str
609 = {.name = NULL, .len = 0};
610 struct ext4_fname_crypto_ctx *ctx = NULL;
611 int res;
612
613 name = de->name;
614 len = de->name_len;
615 ctx = ext4_get_fname_crypto_ctx(dir,
616 EXT4_NAME_LEN);
617 if (IS_ERR(ctx)) {
618 printk(KERN_WARNING "Error acquiring"
619 " crypto ctxt--skipping crypto\n");
620 ctx = NULL;
621 }
622 if (ctx == NULL) {
623 /* Directory is not encrypted */
624 ext4fs_dirhash(de->name,
625 de->name_len, &h);
626 printk("%*.s:(U)%x.%u ", len,
627 name, h.hash,
628 (unsigned) ((char *) de
629 - base));
630 } else {
631 /* Directory is encrypted */
632 res = ext4_fname_crypto_alloc_buffer(
633 ctx, de->name_len,
634 &fname_crypto_str);
635 if (res < 0) {
636 printk(KERN_WARNING "Error "
637 "allocating crypto "
638 "buffer--skipping "
639 "crypto\n");
640 ext4_put_fname_crypto_ctx(&ctx);
641 ctx = NULL;
642 }
643 res = ext4_fname_disk_to_usr(ctx, de,
644 &fname_crypto_str);
645 if (res < 0) {
646 printk(KERN_WARNING "Error "
647 "converting filename "
648 "from disk to usr"
649 "\n");
650 name = "??";
651 len = 2;
652 } else {
653 name = fname_crypto_str.name;
654 len = fname_crypto_str.len;
655 }
656 res = ext4_fname_disk_to_hash(ctx, de,
657 &h);
658 if (res < 0) {
659 printk(KERN_WARNING "Error "
660 "converting filename "
661 "from disk to htree"
662 "\n");
663 h.hash = 0xDEADBEEF;
664 }
665 printk("%*.s:(E)%x.%u ", len, name,
666 h.hash, (unsigned) ((char *) de
667 - base));
668 ext4_put_fname_crypto_ctx(&ctx);
669 ext4_fname_crypto_free_buffer(
670 &fname_crypto_str);
671 }
672#else
603 int len = de->name_len; 673 int len = de->name_len;
604 char *name = de->name; 674 char *name = de->name;
605 while (len--) printk("%c", *name++);
606 ext4fs_dirhash(de->name, de->name_len, &h); 675 ext4fs_dirhash(de->name, de->name_len, &h);
607 printk(":%x.%u ", h.hash, 676 printk("%*.s:%x.%u ", len, name, h.hash,
608 (unsigned) ((char *) de - base)); 677 (unsigned) ((char *) de - base));
678#endif
609 } 679 }
610 space += EXT4_DIR_REC_LEN(de->name_len); 680 space += EXT4_DIR_REC_LEN(de->name_len);
611 names++; 681 names++;
@@ -623,7 +693,6 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
623 unsigned count = dx_get_count(entries), names = 0, space = 0, i; 693 unsigned count = dx_get_count(entries), names = 0, space = 0, i;
624 unsigned bcount = 0; 694 unsigned bcount = 0;
625 struct buffer_head *bh; 695 struct buffer_head *bh;
626 int err;
627 printk("%i indexed blocks...\n", count); 696 printk("%i indexed blocks...\n", count);
628 for (i = 0; i < count; i++, entries++) 697 for (i = 0; i < count; i++, entries++)
629 { 698 {
@@ -637,7 +706,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
637 continue; 706 continue;
638 stats = levels? 707 stats = levels?
639 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): 708 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
640 dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); 709 dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *)
710 bh->b_data, blocksize, 0);
641 names += stats.names; 711 names += stats.names;
642 space += stats.space; 712 space += stats.space;
643 bcount += stats.bcount; 713 bcount += stats.bcount;
@@ -687,8 +757,28 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
687 if (hinfo->hash_version <= DX_HASH_TEA) 757 if (hinfo->hash_version <= DX_HASH_TEA)
688 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 758 hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
689 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; 759 hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
760#ifdef CONFIG_EXT4_FS_ENCRYPTION
761 if (d_name) {
762 struct ext4_fname_crypto_ctx *ctx = NULL;
763 int res;
764
765 /* Check if the directory is encrypted */
766 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
767 if (IS_ERR(ctx)) {
768 ret_err = ERR_PTR(PTR_ERR(ctx));
769 goto fail;
770 }
771 res = ext4_fname_usr_to_hash(ctx, d_name, hinfo);
772 if (res < 0) {
773 ret_err = ERR_PTR(res);
774 goto fail;
775 }
776 ext4_put_fname_crypto_ctx(&ctx);
777 }
778#else
690 if (d_name) 779 if (d_name)
691 ext4fs_dirhash(d_name->name, d_name->len, hinfo); 780 ext4fs_dirhash(d_name->name, d_name->len, hinfo);
781#endif
692 hash = hinfo->hash; 782 hash = hinfo->hash;
693 783
694 if (root->info.unused_flags & 1) { 784 if (root->info.unused_flags & 1) {
@@ -773,6 +863,7 @@ fail:
773 brelse(frame->bh); 863 brelse(frame->bh);
774 frame--; 864 frame--;
775 } 865 }
866
776 if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) 867 if (ret_err == ERR_PTR(ERR_BAD_DX_DIR))
777 ext4_warning(dir->i_sb, 868 ext4_warning(dir->i_sb,
778 "Corrupt dir inode %lu, running e2fsck is " 869 "Corrupt dir inode %lu, running e2fsck is "
@@ -878,6 +969,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
878 struct buffer_head *bh; 969 struct buffer_head *bh;
879 struct ext4_dir_entry_2 *de, *top; 970 struct ext4_dir_entry_2 *de, *top;
880 int err = 0, count = 0; 971 int err = 0, count = 0;
972 struct ext4_fname_crypto_ctx *ctx = NULL;
973 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str;
881 974
882 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", 975 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n",
883 (unsigned long)block)); 976 (unsigned long)block));
@@ -889,6 +982,24 @@ static int htree_dirblock_to_tree(struct file *dir_file,
889 top = (struct ext4_dir_entry_2 *) ((char *) de + 982 top = (struct ext4_dir_entry_2 *) ((char *) de +
890 dir->i_sb->s_blocksize - 983 dir->i_sb->s_blocksize -
891 EXT4_DIR_REC_LEN(0)); 984 EXT4_DIR_REC_LEN(0));
985#ifdef CONFIG_EXT4_FS_ENCRYPTION
986 /* Check if the directory is encrypted */
987 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
988 if (IS_ERR(ctx)) {
989 err = PTR_ERR(ctx);
990 brelse(bh);
991 return err;
992 }
993 if (ctx != NULL) {
994 err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
995 &fname_crypto_str);
996 if (err < 0) {
997 ext4_put_fname_crypto_ctx(&ctx);
998 brelse(bh);
999 return err;
1000 }
1001 }
1002#endif
892 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 1003 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
893 if (ext4_check_dir_entry(dir, NULL, de, bh, 1004 if (ext4_check_dir_entry(dir, NULL, de, bh,
894 bh->b_data, bh->b_size, 1005 bh->b_data, bh->b_size,
@@ -897,21 +1008,52 @@ static int htree_dirblock_to_tree(struct file *dir_file,
897 /* silently ignore the rest of the block */ 1008 /* silently ignore the rest of the block */
898 break; 1009 break;
899 } 1010 }
1011#ifdef CONFIG_EXT4_FS_ENCRYPTION
1012 err = ext4_fname_disk_to_hash(ctx, de, hinfo);
1013 if (err < 0) {
1014 count = err;
1015 goto errout;
1016 }
1017#else
900 ext4fs_dirhash(de->name, de->name_len, hinfo); 1018 ext4fs_dirhash(de->name, de->name_len, hinfo);
1019#endif
901 if ((hinfo->hash < start_hash) || 1020 if ((hinfo->hash < start_hash) ||
902 ((hinfo->hash == start_hash) && 1021 ((hinfo->hash == start_hash) &&
903 (hinfo->minor_hash < start_minor_hash))) 1022 (hinfo->minor_hash < start_minor_hash)))
904 continue; 1023 continue;
905 if (de->inode == 0) 1024 if (de->inode == 0)
906 continue; 1025 continue;
907 if ((err = ext4_htree_store_dirent(dir_file, 1026 if (ctx == NULL) {
908 hinfo->hash, hinfo->minor_hash, de)) != 0) { 1027 /* Directory is not encrypted */
909 brelse(bh); 1028 tmp_str.name = de->name;
910 return err; 1029 tmp_str.len = de->name_len;
1030 err = ext4_htree_store_dirent(dir_file,
1031 hinfo->hash, hinfo->minor_hash, de,
1032 &tmp_str);
1033 } else {
1034 /* Directory is encrypted */
1035 err = ext4_fname_disk_to_usr(ctx, de,
1036 &fname_crypto_str);
1037 if (err < 0) {
1038 count = err;
1039 goto errout;
1040 }
1041 err = ext4_htree_store_dirent(dir_file,
1042 hinfo->hash, hinfo->minor_hash, de,
1043 &fname_crypto_str);
1044 }
1045 if (err != 0) {
1046 count = err;
1047 goto errout;
911 } 1048 }
912 count++; 1049 count++;
913 } 1050 }
1051errout:
914 brelse(bh); 1052 brelse(bh);
1053#ifdef CONFIG_EXT4_FS_ENCRYPTION
1054 ext4_put_fname_crypto_ctx(&ctx);
1055 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1056#endif
915 return count; 1057 return count;
916} 1058}
917 1059
@@ -935,6 +1077,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
935 int count = 0; 1077 int count = 0;
936 int ret, err; 1078 int ret, err;
937 __u32 hashval; 1079 __u32 hashval;
1080 struct ext4_str tmp_str;
938 1081
939 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 1082 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
940 start_hash, start_minor_hash)); 1083 start_hash, start_minor_hash));
@@ -970,14 +1113,22 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
970 /* Add '.' and '..' from the htree header */ 1113 /* Add '.' and '..' from the htree header */
971 if (!start_hash && !start_minor_hash) { 1114 if (!start_hash && !start_minor_hash) {
972 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 1115 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
973 if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) 1116 tmp_str.name = de->name;
1117 tmp_str.len = de->name_len;
1118 err = ext4_htree_store_dirent(dir_file, 0, 0,
1119 de, &tmp_str);
1120 if (err != 0)
974 goto errout; 1121 goto errout;
975 count++; 1122 count++;
976 } 1123 }
977 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { 1124 if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
978 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; 1125 de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
979 de = ext4_next_entry(de, dir->i_sb->s_blocksize); 1126 de = ext4_next_entry(de, dir->i_sb->s_blocksize);
980 if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) 1127 tmp_str.name = de->name;
1128 tmp_str.len = de->name_len;
1129 err = ext4_htree_store_dirent(dir_file, 2, 0,
1130 de, &tmp_str);
1131 if (err != 0)
981 goto errout; 1132 goto errout;
982 count++; 1133 count++;
983 } 1134 }
@@ -1035,17 +1186,33 @@ static inline int search_dirblock(struct buffer_head *bh,
1035 * Create map of hash values, offsets, and sizes, stored at end of block. 1186 * Create map of hash values, offsets, and sizes, stored at end of block.
1036 * Returns number of entries mapped. 1187 * Returns number of entries mapped.
1037 */ 1188 */
1038static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, 1189static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
1039 struct dx_hash_info *hinfo, 1190 unsigned blocksize, struct dx_hash_info *hinfo,
1040 struct dx_map_entry *map_tail) 1191 struct dx_map_entry *map_tail)
1041{ 1192{
1042 int count = 0; 1193 int count = 0;
1043 char *base = (char *) de; 1194 char *base = (char *) de;
1044 struct dx_hash_info h = *hinfo; 1195 struct dx_hash_info h = *hinfo;
1196#ifdef CONFIG_EXT4_FS_ENCRYPTION
1197 struct ext4_fname_crypto_ctx *ctx = NULL;
1198 int err;
1199
1200 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
1201 if (IS_ERR(ctx))
1202 return PTR_ERR(ctx);
1203#endif
1045 1204
1046 while ((char *) de < base + blocksize) { 1205 while ((char *) de < base + blocksize) {
1047 if (de->name_len && de->inode) { 1206 if (de->name_len && de->inode) {
1207#ifdef CONFIG_EXT4_FS_ENCRYPTION
1208 err = ext4_fname_disk_to_hash(ctx, de, &h);
1209 if (err < 0) {
1210 ext4_put_fname_crypto_ctx(&ctx);
1211 return err;
1212 }
1213#else
1048 ext4fs_dirhash(de->name, de->name_len, &h); 1214 ext4fs_dirhash(de->name, de->name_len, &h);
1215#endif
1049 map_tail--; 1216 map_tail--;
1050 map_tail->hash = h.hash; 1217 map_tail->hash = h.hash;
1051 map_tail->offs = ((char *) de - base)>>2; 1218 map_tail->offs = ((char *) de - base)>>2;
@@ -1056,6 +1223,9 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
1056 /* XXX: do we need to check rec_len == 0 case? -Chris */ 1223 /* XXX: do we need to check rec_len == 0 case? -Chris */
1057 de = ext4_next_entry(de, blocksize); 1224 de = ext4_next_entry(de, blocksize);
1058 } 1225 }
1226#ifdef CONFIG_EXT4_FS_ENCRYPTION
1227 ext4_put_fname_crypto_ctx(&ctx);
1228#endif
1059 return count; 1229 return count;
1060} 1230}
1061 1231
@@ -1106,57 +1276,107 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
1106 * `len <= EXT4_NAME_LEN' is guaranteed by caller. 1276 * `len <= EXT4_NAME_LEN' is guaranteed by caller.
1107 * `de != NULL' is guaranteed by caller. 1277 * `de != NULL' is guaranteed by caller.
1108 */ 1278 */
1109static inline int ext4_match (int len, const char * const name, 1279static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx,
1110 struct ext4_dir_entry_2 * de) 1280 struct ext4_str *fname_crypto_str,
1281 int len, const char * const name,
1282 struct ext4_dir_entry_2 *de)
1111{ 1283{
1112 if (len != de->name_len) 1284 int res;
1113 return 0; 1285
1114 if (!de->inode) 1286 if (!de->inode)
1115 return 0; 1287 return 0;
1116 return !memcmp(name, de->name, len); 1288
1289#ifdef CONFIG_EXT4_FS_ENCRYPTION
1290 if (ctx) {
1291 /* Directory is encrypted */
1292 res = ext4_fname_disk_to_usr(ctx, de, fname_crypto_str);
1293 if (res < 0)
1294 return res;
1295 if (len != res)
1296 return 0;
1297 res = memcmp(name, fname_crypto_str->name, len);
1298 return (res == 0) ? 1 : 0;
1299 }
1300#endif
1301 if (len != de->name_len)
1302 return 0;
1303 res = memcmp(name, de->name, len);
1304 return (res == 0) ? 1 : 0;
1117} 1305}
1118 1306
1119/* 1307/*
1120 * Returns 0 if not found, -1 on failure, and 1 on success 1308 * Returns 0 if not found, -1 on failure, and 1 on success
1121 */ 1309 */
1122int search_dir(struct buffer_head *bh, 1310int search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
1123 char *search_buf, 1311 struct inode *dir, const struct qstr *d_name,
1124 int buf_size, 1312 unsigned int offset, struct ext4_dir_entry_2 **res_dir)
1125 struct inode *dir,
1126 const struct qstr *d_name,
1127 unsigned int offset,
1128 struct ext4_dir_entry_2 **res_dir)
1129{ 1313{
1130 struct ext4_dir_entry_2 * de; 1314 struct ext4_dir_entry_2 * de;
1131 char * dlimit; 1315 char * dlimit;
1132 int de_len; 1316 int de_len;
1133 const char *name = d_name->name; 1317 const char *name = d_name->name;
1134 int namelen = d_name->len; 1318 int namelen = d_name->len;
1319 struct ext4_fname_crypto_ctx *ctx = NULL;
1320 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
1321 int res;
1322
1323 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
1324 if (IS_ERR(ctx))
1325 return -1;
1326
1327 if (ctx != NULL) {
1328 /* Allocate buffer to hold maximum name length */
1329 res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
1330 &fname_crypto_str);
1331 if (res < 0) {
1332 ext4_put_fname_crypto_ctx(&ctx);
1333 return -1;
1334 }
1335 }
1135 1336
1136 de = (struct ext4_dir_entry_2 *)search_buf; 1337 de = (struct ext4_dir_entry_2 *)search_buf;
1137 dlimit = search_buf + buf_size; 1338 dlimit = search_buf + buf_size;
1138 while ((char *) de < dlimit) { 1339 while ((char *) de < dlimit) {
1139 /* this code is executed quadratically often */ 1340 /* this code is executed quadratically often */
1140 /* do minimal checking `by hand' */ 1341 /* do minimal checking `by hand' */
1342 if ((char *) de + de->name_len <= dlimit) {
1343 res = ext4_match(ctx, &fname_crypto_str, namelen,
1344 name, de);
1345 if (res < 0) {
1346 res = -1;
1347 goto return_result;
1348 }
1349 if (res > 0) {
1350 /* found a match - just to be sure, do
1351 * a full check */
1352 if (ext4_check_dir_entry(dir, NULL, de, bh,
1353 bh->b_data,
1354 bh->b_size, offset)) {
1355 res = -1;
1356 goto return_result;
1357 }
1358 *res_dir = de;
1359 res = 1;
1360 goto return_result;
1361 }
1141 1362
1142 if ((char *) de + namelen <= dlimit &&
1143 ext4_match (namelen, name, de)) {
1144 /* found a match - just to be sure, do a full check */
1145 if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
1146 bh->b_size, offset))
1147 return -1;
1148 *res_dir = de;
1149 return 1;
1150 } 1363 }
1151 /* prevent looping on a bad block */ 1364 /* prevent looping on a bad block */
1152 de_len = ext4_rec_len_from_disk(de->rec_len, 1365 de_len = ext4_rec_len_from_disk(de->rec_len,
1153 dir->i_sb->s_blocksize); 1366 dir->i_sb->s_blocksize);
1154 if (de_len <= 0) 1367 if (de_len <= 0) {
1155 return -1; 1368 res = -1;
1369 goto return_result;
1370 }
1156 offset += de_len; 1371 offset += de_len;
1157 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); 1372 de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
1158 } 1373 }
1159 return 0; 1374
1375 res = 0;
1376return_result:
1377 ext4_put_fname_crypto_ctx(&ctx);
1378 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1379 return res;
1160} 1380}
1161 1381
1162static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, 1382static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
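
After the rework above, ext4_match() is no longer a boolean: a negative return propagates a decryption error, 0 means no match, and a positive value means the (possibly decrypted) names compared equal, which is why search_dir() funnels every exit through the res/return_result path instead of returning directly. The calling convention in isolation, with a toy matcher standing in for the real one:

/* tristate.c - sketch of the <0 / 0 / >0 convention adopted by ext4_match() */
#include <stdio.h>
#include <string.h>

/* stand-in: a NULL candidate models a decryption failure */
static int match(const char *want, const char *have)
{
	if (!have)
		return -1;				/* error decoding the entry */
	return strcmp(want, have) == 0 ? 1 : 0;		/* 1 = match, 0 = no match */
}

int main(void)
{
	const char *entries[] = { "alpha", NULL, "beta" };
	for (int i = 0; i < 3; i++) {
		int res = match("beta", entries[i]);
		if (res < 0) { fprintf(stderr, "entry %d: error\n", i); continue; }
		if (res > 0) { printf("entry %d matches\n", i); break; }
	}
	return 0;
}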
@@ -1345,6 +1565,9 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
1345 ext4_lblk_t block; 1565 ext4_lblk_t block;
1346 int retval; 1566 int retval;
1347 1567
1568#ifdef CONFIG_EXT4_FS_ENCRYPTION
1569 *res_dir = NULL;
1570#endif
1348 frame = dx_probe(d_name, dir, &hinfo, frames); 1571 frame = dx_probe(d_name, dir, &hinfo, frames);
1349 if (IS_ERR(frame)) 1572 if (IS_ERR(frame))
1350 return (struct buffer_head *) frame; 1573 return (struct buffer_head *) frame;
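Note: once the crypto context is threaded through search_dir(), ext4_match() can no longer simply memcmp() the caller's name against de->name; for a keyed encrypted directory each on-disk name must first be decrypted into the preallocated fname_crypto_str buffer. A rough sketch of that comparison, assuming a hypothetical fname_decrypt_to_buffer() helper (the real logic lives in ext4_match()/crypto_fname.c):

	/* Sketch only: <0 error, 0 no match, >0 match, mirroring the
	 * res convention used by the callers above. */
	static int match_name(struct ext4_fname_crypto_ctx *ctx,
			      struct ext4_str *workbuf, int namelen,
			      const char *name, struct ext4_dir_entry_2 *de)
	{
		int len;

		if (ctx == NULL)	/* unencrypted directory */
			return de->name_len == namelen &&
			       !memcmp(de->name, name, namelen);
		/* hypothetical helper: decrypt de->name into workbuf */
		len = fname_decrypt_to_buffer(ctx, de->name, de->name_len,
					      workbuf);
		if (len < 0)
			return len;
		return len == namelen && !memcmp(workbuf->name, name, namelen);
	}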
@@ -1417,6 +1640,18 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1417 ino); 1640 ino);
1418 return ERR_PTR(-EIO); 1641 return ERR_PTR(-EIO);
1419 } 1642 }
1643 if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
1644 (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1645 S_ISLNK(inode->i_mode)) &&
1646 !ext4_is_child_context_consistent_with_parent(dir,
1647 inode)) {
1648 iput(inode);
1649 ext4_warning(inode->i_sb,
1650 "Inconsistent encryption contexts: %lu/%lu\n",
1651 (unsigned long) dir->i_ino,
1652 (unsigned long) inode->i_ino);
1653 return ERR_PTR(-EPERM);
1654 }
1420 } 1655 }
1421 return d_splice_alias(inode, dentry); 1656 return d_splice_alias(inode, dentry);
1422} 1657}
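Note: the new -EPERM branch in ext4_lookup() refuses to reveal an inode whose encryption policy does not match its parent directory's. A minimal sketch of what such a test has to compare, with the context layout assumed from ext4_crypto.h (the real check is ext4_is_child_context_consistent_with_parent()):

	static int contexts_match(const struct ext4_encryption_context *parent,
				  const struct ext4_encryption_context *child)
	{
		/* same master key and same modes => same policy (sketch) */
		return parent->contents_encryption_mode ==
			       child->contents_encryption_mode &&
		       parent->filenames_encryption_mode ==
			       child->filenames_encryption_mode &&
		       !memcmp(parent->master_key_descriptor,
			       child->master_key_descriptor,
			       EXT4_KEY_DESCRIPTOR_SIZE);
	}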
@@ -1541,7 +1776,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1541 1776
1542 /* create map in the end of data2 block */ 1777 /* create map in the end of data2 block */
1543 map = (struct dx_map_entry *) (data2 + blocksize); 1778 map = (struct dx_map_entry *) (data2 + blocksize);
1544 count = dx_make_map((struct ext4_dir_entry_2 *) data1, 1779 count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
1545 blocksize, hinfo, map); 1780 blocksize, hinfo, map);
1546 map -= count; 1781 map -= count;
1547 dx_sort_map(map, count); 1782 dx_sort_map(map, count);
@@ -1564,7 +1799,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1564 hash2, split, count-split)); 1799 hash2, split, count-split));
1565 1800
1566 /* Fancy dance to stay within two buffers */ 1801 /* Fancy dance to stay within two buffers */
1567 de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); 1802 de2 = dx_move_dirents(data1, data2, map + split, count - split,
1803 blocksize);
1568 de = dx_pack_dirents(data1, blocksize); 1804 de = dx_pack_dirents(data1, blocksize);
1569 de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - 1805 de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
1570 (char *) de, 1806 (char *) de,
@@ -1580,8 +1816,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
1580 initialize_dirent_tail(t, blocksize); 1816 initialize_dirent_tail(t, blocksize);
1581 } 1817 }
1582 1818
1583 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); 1819 dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1,
1584 dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); 1820 blocksize, 1));
1821 dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
1822 blocksize, 1));
1585 1823
1586 /* Which block gets the new entry? */ 1824 /* Which block gets the new entry? */
1587 if (hinfo->hash >= hash2) { 1825 if (hinfo->hash >= hash2) {
@@ -1618,15 +1856,48 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
1618 int nlen, rlen; 1856 int nlen, rlen;
1619 unsigned int offset = 0; 1857 unsigned int offset = 0;
1620 char *top; 1858 char *top;
1859 struct ext4_fname_crypto_ctx *ctx = NULL;
1860 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
1861 int res;
1862
1863 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
1864 if (IS_ERR(ctx))
1865 return -1;
1866
1867 if (ctx != NULL) {
1868 /* Calculate record length needed to store the entry */
1869 res = ext4_fname_crypto_namelen_on_disk(ctx, namelen);
1870 if (res < 0) {
1871 ext4_put_fname_crypto_ctx(&ctx);
1872 return res;
1873 }
1874 reclen = EXT4_DIR_REC_LEN(res);
1875
1876 /* Allocate buffer to hold maximum name length */
1877 res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
1878 &fname_crypto_str);
1879 if (res < 0) {
1880 ext4_put_fname_crypto_ctx(&ctx);
1881 return -1;
1882 }
1883 }
1621 1884
1622 de = (struct ext4_dir_entry_2 *)buf; 1885 de = (struct ext4_dir_entry_2 *)buf;
1623 top = buf + buf_size - reclen; 1886 top = buf + buf_size - reclen;
1624 while ((char *) de <= top) { 1887 while ((char *) de <= top) {
1625 if (ext4_check_dir_entry(dir, NULL, de, bh, 1888 if (ext4_check_dir_entry(dir, NULL, de, bh,
1626 buf, buf_size, offset)) 1889 buf, buf_size, offset)) {
1627 return -EIO; 1890 res = -EIO;
1628 if (ext4_match(namelen, name, de)) 1891 goto return_result;
1629 return -EEXIST; 1892 }
 1893 /* Provide crypto context and crypto buffer to ext4_match() */
1894 res = ext4_match(ctx, &fname_crypto_str, namelen, name, de);
1895 if (res < 0)
1896 goto return_result;
1897 if (res > 0) {
1898 res = -EEXIST;
1899 goto return_result;
1900 }
1630 nlen = EXT4_DIR_REC_LEN(de->name_len); 1901 nlen = EXT4_DIR_REC_LEN(de->name_len);
1631 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 1902 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1632 if ((de->inode ? rlen - nlen : rlen) >= reclen) 1903 if ((de->inode ? rlen - nlen : rlen) >= reclen)
@@ -1634,26 +1905,62 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
1634 de = (struct ext4_dir_entry_2 *)((char *)de + rlen); 1905 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
1635 offset += rlen; 1906 offset += rlen;
1636 } 1907 }
1637 if ((char *) de > top)
1638 return -ENOSPC;
1639 1908
1640 *dest_de = de; 1909 if ((char *) de > top)
1641 return 0; 1910 res = -ENOSPC;
1911 else {
1912 *dest_de = de;
1913 res = 0;
1914 }
1915return_result:
1916 ext4_put_fname_crypto_ctx(&ctx);
1917 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1918 return res;
1642} 1919}
1643 1920
1644void ext4_insert_dentry(struct inode *inode, 1921int ext4_insert_dentry(struct inode *dir,
1645 struct ext4_dir_entry_2 *de, 1922 struct inode *inode,
1646 int buf_size, 1923 struct ext4_dir_entry_2 *de,
1647 const char *name, int namelen) 1924 int buf_size,
1925 const struct qstr *iname,
1926 const char *name, int namelen)
1648{ 1927{
1649 1928
1650 int nlen, rlen; 1929 int nlen, rlen;
1930 struct ext4_fname_crypto_ctx *ctx = NULL;
1931 struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
1932 struct ext4_str tmp_str;
1933 int res;
1934
1935 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
1936 if (IS_ERR(ctx))
1937 return -EIO;
 1938 /* By default, the input name is written to the disk */
1939 tmp_str.name = (unsigned char *)name;
1940 tmp_str.len = namelen;
1941 if (ctx != NULL) {
1942 /* Directory is encrypted */
1943 res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN,
1944 &fname_crypto_str);
1945 if (res < 0) {
1946 ext4_put_fname_crypto_ctx(&ctx);
1947 return -ENOMEM;
1948 }
1949 res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str);
1950 if (res < 0) {
1951 ext4_put_fname_crypto_ctx(&ctx);
1952 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1953 return res;
1954 }
1955 tmp_str.name = fname_crypto_str.name;
1956 tmp_str.len = fname_crypto_str.len;
1957 }
1651 1958
1652 nlen = EXT4_DIR_REC_LEN(de->name_len); 1959 nlen = EXT4_DIR_REC_LEN(de->name_len);
1653 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); 1960 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
1654 if (de->inode) { 1961 if (de->inode) {
1655 struct ext4_dir_entry_2 *de1 = 1962 struct ext4_dir_entry_2 *de1 =
1656 (struct ext4_dir_entry_2 *)((char *)de + nlen); 1963 (struct ext4_dir_entry_2 *)((char *)de + nlen);
1657 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); 1964 de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
1658 de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); 1965 de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
1659 de = de1; 1966 de = de1;
@@ -1661,9 +1968,14 @@ void ext4_insert_dentry(struct inode *inode,
1661 de->file_type = EXT4_FT_UNKNOWN; 1968 de->file_type = EXT4_FT_UNKNOWN;
1662 de->inode = cpu_to_le32(inode->i_ino); 1969 de->inode = cpu_to_le32(inode->i_ino);
1663 ext4_set_de_type(inode->i_sb, de, inode->i_mode); 1970 ext4_set_de_type(inode->i_sb, de, inode->i_mode);
1664 de->name_len = namelen; 1971 de->name_len = tmp_str.len;
1665 memcpy(de->name, name, namelen); 1972
1973 memcpy(de->name, tmp_str.name, tmp_str.len);
1974 ext4_put_fname_crypto_ctx(&ctx);
1975 ext4_fname_crypto_free_buffer(&fname_crypto_str);
1976 return 0;
1666} 1977}
1978
1667/* 1979/*
1668 * Add a new entry into a directory (leaf) block. If de is non-NULL, 1980 * Add a new entry into a directory (leaf) block. If de is non-NULL,
1669 * it points to a directory entry which is guaranteed to be large 1981 * it points to a directory entry which is guaranteed to be large
@@ -1700,8 +2012,12 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1700 return err; 2012 return err;
1701 } 2013 }
1702 2014
1703 /* By now the buffer is marked for journaling */ 2015 /* By now the buffer is marked for journaling. Due to crypto operations,
1704 ext4_insert_dentry(inode, de, blocksize, name, namelen); 2016 * the following function call may fail */
2017 err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name,
2018 name, namelen);
2019 if (err < 0)
2020 return err;
1705 2021
1706 /* 2022 /*
1707 * XXX shouldn't update any times until successful 2023 * XXX shouldn't update any times until successful
@@ -1733,8 +2049,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1733 struct inode *inode, struct buffer_head *bh) 2049 struct inode *inode, struct buffer_head *bh)
1734{ 2050{
1735 struct inode *dir = dentry->d_parent->d_inode; 2051 struct inode *dir = dentry->d_parent->d_inode;
2052#ifdef CONFIG_EXT4_FS_ENCRYPTION
2053 struct ext4_fname_crypto_ctx *ctx = NULL;
2054 int res;
2055#else
1736 const char *name = dentry->d_name.name; 2056 const char *name = dentry->d_name.name;
1737 int namelen = dentry->d_name.len; 2057 int namelen = dentry->d_name.len;
2058#endif
1738 struct buffer_head *bh2; 2059 struct buffer_head *bh2;
1739 struct dx_root *root; 2060 struct dx_root *root;
1740 struct dx_frame frames[2], *frame; 2061 struct dx_frame frames[2], *frame;
@@ -1748,7 +2069,13 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1748 struct dx_hash_info hinfo; 2069 struct dx_hash_info hinfo;
1749 ext4_lblk_t block; 2070 ext4_lblk_t block;
1750 struct fake_dirent *fde; 2071 struct fake_dirent *fde;
1751 int csum_size = 0; 2072 int csum_size = 0;
2073
2074#ifdef CONFIG_EXT4_FS_ENCRYPTION
2075 ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN);
2076 if (IS_ERR(ctx))
2077 return PTR_ERR(ctx);
2078#endif
1752 2079
1753 if (ext4_has_metadata_csum(inode->i_sb)) 2080 if (ext4_has_metadata_csum(inode->i_sb))
1754 csum_size = sizeof(struct ext4_dir_entry_tail); 2081 csum_size = sizeof(struct ext4_dir_entry_tail);
@@ -1815,7 +2142,18 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1815 if (hinfo.hash_version <= DX_HASH_TEA) 2142 if (hinfo.hash_version <= DX_HASH_TEA)
1816 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; 2143 hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
1817 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; 2144 hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
2145#ifdef CONFIG_EXT4_FS_ENCRYPTION
2146 res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo);
2147 if (res < 0) {
2148 ext4_put_fname_crypto_ctx(&ctx);
2149 ext4_mark_inode_dirty(handle, dir);
2150 brelse(bh);
2151 return res;
2152 }
2153 ext4_put_fname_crypto_ctx(&ctx);
2154#else
1818 ext4fs_dirhash(name, namelen, &hinfo); 2155 ext4fs_dirhash(name, namelen, &hinfo);
2156#endif
1819 memset(frames, 0, sizeof(frames)); 2157 memset(frames, 0, sizeof(frames));
1820 frame = frames; 2158 frame = frames;
1821 frame->entries = entries; 2159 frame->entries = entries;
@@ -1865,7 +2203,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1865 struct inode *inode) 2203 struct inode *inode)
1866{ 2204{
1867 struct inode *dir = dentry->d_parent->d_inode; 2205 struct inode *dir = dentry->d_parent->d_inode;
1868 struct buffer_head *bh; 2206 struct buffer_head *bh = NULL;
1869 struct ext4_dir_entry_2 *de; 2207 struct ext4_dir_entry_2 *de;
1870 struct ext4_dir_entry_tail *t; 2208 struct ext4_dir_entry_tail *t;
1871 struct super_block *sb; 2209 struct super_block *sb;
@@ -1889,14 +2227,14 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1889 return retval; 2227 return retval;
1890 if (retval == 1) { 2228 if (retval == 1) {
1891 retval = 0; 2229 retval = 0;
1892 return retval; 2230 goto out;
1893 } 2231 }
1894 } 2232 }
1895 2233
1896 if (is_dx(dir)) { 2234 if (is_dx(dir)) {
1897 retval = ext4_dx_add_entry(handle, dentry, inode); 2235 retval = ext4_dx_add_entry(handle, dentry, inode);
1898 if (!retval || (retval != ERR_BAD_DX_DIR)) 2236 if (!retval || (retval != ERR_BAD_DX_DIR))
1899 return retval; 2237 goto out;
1900 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); 2238 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1901 dx_fallback++; 2239 dx_fallback++;
1902 ext4_mark_inode_dirty(handle, dir); 2240 ext4_mark_inode_dirty(handle, dir);
@@ -1908,14 +2246,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1908 return PTR_ERR(bh); 2246 return PTR_ERR(bh);
1909 2247
1910 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 2248 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1911 if (retval != -ENOSPC) { 2249 if (retval != -ENOSPC)
1912 brelse(bh); 2250 goto out;
1913 return retval;
1914 }
1915 2251
1916 if (blocks == 1 && !dx_fallback && 2252 if (blocks == 1 && !dx_fallback &&
1917 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 2253 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
1918 return make_indexed_dir(handle, dentry, inode, bh); 2254 retval = make_indexed_dir(handle, dentry, inode, bh);
2255 bh = NULL; /* make_indexed_dir releases bh */
2256 goto out;
2257 }
1919 brelse(bh); 2258 brelse(bh);
1920 } 2259 }
1921 bh = ext4_append(handle, dir, &block); 2260 bh = ext4_append(handle, dir, &block);
@@ -1931,6 +2270,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1931 } 2270 }
1932 2271
1933 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 2272 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
2273out:
1934 brelse(bh); 2274 brelse(bh);
1935 if (retval == 0) 2275 if (retval == 0)
1936 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); 2276 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
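Note the ownership handoff above: make_indexed_dir() consumes the buffer head (per the new comment), so ext4_add_entry() nulls its local bh before falling through to the shared brelse() at out:. That turns what used to be several early returns with per-path brelse() calls into a single release point and removes the risk of a double brelse() on the make_indexed_dir() path.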
@@ -2237,7 +2577,20 @@ retry:
2237 inode->i_op = &ext4_file_inode_operations; 2577 inode->i_op = &ext4_file_inode_operations;
2238 inode->i_fop = &ext4_file_operations; 2578 inode->i_fop = &ext4_file_operations;
2239 ext4_set_aops(inode); 2579 ext4_set_aops(inode);
2240 err = ext4_add_nondir(handle, dentry, inode); 2580 err = 0;
2581#ifdef CONFIG_EXT4_FS_ENCRYPTION
2582 if (!err && (ext4_encrypted_inode(dir) ||
2583 DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) {
2584 err = ext4_inherit_context(dir, inode);
2585 if (err) {
2586 clear_nlink(inode);
2587 unlock_new_inode(inode);
2588 iput(inode);
2589 }
2590 }
2591#endif
2592 if (!err)
2593 err = ext4_add_nondir(handle, dentry, inode);
2241 if (!err && IS_DIRSYNC(dir)) 2594 if (!err && IS_DIRSYNC(dir))
2242 ext4_handle_sync(handle); 2595 ext4_handle_sync(handle);
2243 } 2596 }
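Both ext4_create() above and ext4_mkdir() below gate ext4_inherit_context() on DUMMY_ENCRYPTION_ENABLED(). Its presumed definition, keyed off the mount flag set in the super.c hunks later in this patch (the macro itself lands in ext4.h; reproduced here as an assumption):

	#define DUMMY_ENCRYPTION_ENABLED(sbi) \
		(unlikely((sbi)->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION))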
@@ -2418,6 +2771,14 @@ retry:
2418 err = ext4_init_new_dir(handle, dir, inode); 2771 err = ext4_init_new_dir(handle, dir, inode);
2419 if (err) 2772 if (err)
2420 goto out_clear_inode; 2773 goto out_clear_inode;
2774#ifdef CONFIG_EXT4_FS_ENCRYPTION
2775 if (ext4_encrypted_inode(dir) ||
2776 DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) {
2777 err = ext4_inherit_context(dir, inode);
2778 if (err)
2779 goto out_clear_inode;
2780 }
2781#endif
2421 err = ext4_mark_inode_dirty(handle, inode); 2782 err = ext4_mark_inode_dirty(handle, inode);
2422 if (!err) 2783 if (!err)
2423 err = ext4_add_entry(handle, dentry, inode); 2784 err = ext4_add_entry(handle, dentry, inode);
@@ -2450,7 +2811,7 @@ out_stop:
2450/* 2811/*
2451 * routine to check that the specified directory is empty (for rmdir) 2812 * routine to check that the specified directory is empty (for rmdir)
2452 */ 2813 */
2453static int empty_dir(struct inode *inode) 2814int ext4_empty_dir(struct inode *inode)
2454{ 2815{
2455 unsigned int offset; 2816 unsigned int offset;
2456 struct buffer_head *bh; 2817 struct buffer_head *bh;
@@ -2718,7 +3079,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2718 goto end_rmdir; 3079 goto end_rmdir;
2719 3080
2720 retval = -ENOTEMPTY; 3081 retval = -ENOTEMPTY;
2721 if (!empty_dir(inode)) 3082 if (!ext4_empty_dir(inode))
2722 goto end_rmdir; 3083 goto end_rmdir;
2723 3084
2724 handle = ext4_journal_start(dir, EXT4_HT_DIR, 3085 handle = ext4_journal_start(dir, EXT4_HT_DIR,
@@ -2828,16 +3189,25 @@ static int ext4_symlink(struct inode *dir,
2828{ 3189{
2829 handle_t *handle; 3190 handle_t *handle;
2830 struct inode *inode; 3191 struct inode *inode;
2831 int l, err, retries = 0; 3192 int err, len = strlen(symname);
2832 int credits; 3193 int credits;
2833 3194 bool encryption_required;
2834 l = strlen(symname)+1; 3195 struct ext4_str disk_link;
2835 if (l > dir->i_sb->s_blocksize) 3196 struct ext4_encrypted_symlink_data *sd = NULL;
3197
3198 disk_link.len = len + 1;
3199 disk_link.name = (char *) symname;
3200
3201 encryption_required = (ext4_encrypted_inode(dir) ||
3202 DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)));
3203 if (encryption_required)
3204 disk_link.len = encrypted_symlink_data_len(len) + 1;
3205 if (disk_link.len > dir->i_sb->s_blocksize)
2836 return -ENAMETOOLONG; 3206 return -ENAMETOOLONG;
2837 3207
2838 dquot_initialize(dir); 3208 dquot_initialize(dir);
2839 3209
2840 if (l > EXT4_N_BLOCKS * 4) { 3210 if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
2841 /* 3211 /*
2842 * For non-fast symlinks, we just allocate inode and put it on 3212 * For non-fast symlinks, we just allocate inode and put it on
2843 * orphan list in the first transaction => we need bitmap, 3213 * orphan list in the first transaction => we need bitmap,
@@ -2856,16 +3226,49 @@ static int ext4_symlink(struct inode *dir,
2856 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 3226 credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2857 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; 3227 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
2858 } 3228 }
2859retry: 3229
2860 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, 3230 inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
2861 &dentry->d_name, 0, NULL, 3231 &dentry->d_name, 0, NULL,
2862 EXT4_HT_DIR, credits); 3232 EXT4_HT_DIR, credits);
2863 handle = ext4_journal_current_handle(); 3233 handle = ext4_journal_current_handle();
2864 err = PTR_ERR(inode); 3234 if (IS_ERR(inode)) {
2865 if (IS_ERR(inode)) 3235 if (handle)
2866 goto out_stop; 3236 ext4_journal_stop(handle);
3237 return PTR_ERR(inode);
3238 }
3239
3240 if (encryption_required) {
3241 struct ext4_fname_crypto_ctx *ctx = NULL;
3242 struct qstr istr;
3243 struct ext4_str ostr;
3244
3245 sd = kzalloc(disk_link.len, GFP_NOFS);
3246 if (!sd) {
3247 err = -ENOMEM;
3248 goto err_drop_inode;
3249 }
3250 err = ext4_inherit_context(dir, inode);
3251 if (err)
3252 goto err_drop_inode;
3253 ctx = ext4_get_fname_crypto_ctx(inode,
3254 inode->i_sb->s_blocksize);
3255 if (IS_ERR_OR_NULL(ctx)) {
3256 /* We just set the policy, so ctx should not be NULL */
3257 err = (ctx == NULL) ? -EIO : PTR_ERR(ctx);
3258 goto err_drop_inode;
3259 }
3260 istr.name = (const unsigned char *) symname;
3261 istr.len = len;
3262 ostr.name = sd->encrypted_path;
3263 err = ext4_fname_usr_to_disk(ctx, &istr, &ostr);
3264 ext4_put_fname_crypto_ctx(&ctx);
3265 if (err < 0)
3266 goto err_drop_inode;
3267 sd->len = cpu_to_le16(ostr.len);
3268 disk_link.name = (char *) sd;
3269 }
2867 3270
2868 if (l > EXT4_N_BLOCKS * 4) { 3271 if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
2869 inode->i_op = &ext4_symlink_inode_operations; 3272 inode->i_op = &ext4_symlink_inode_operations;
2870 ext4_set_aops(inode); 3273 ext4_set_aops(inode);
2871 /* 3274 /*
@@ -2881,9 +3284,10 @@ retry:
2881 drop_nlink(inode); 3284 drop_nlink(inode);
2882 err = ext4_orphan_add(handle, inode); 3285 err = ext4_orphan_add(handle, inode);
2883 ext4_journal_stop(handle); 3286 ext4_journal_stop(handle);
3287 handle = NULL;
2884 if (err) 3288 if (err)
2885 goto err_drop_inode; 3289 goto err_drop_inode;
2886 err = __page_symlink(inode, symname, l, 1); 3290 err = __page_symlink(inode, disk_link.name, disk_link.len, 1);
2887 if (err) 3291 if (err)
2888 goto err_drop_inode; 3292 goto err_drop_inode;
2889 /* 3293 /*
@@ -2895,34 +3299,37 @@ retry:
2895 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); 3299 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1);
2896 if (IS_ERR(handle)) { 3300 if (IS_ERR(handle)) {
2897 err = PTR_ERR(handle); 3301 err = PTR_ERR(handle);
3302 handle = NULL;
2898 goto err_drop_inode; 3303 goto err_drop_inode;
2899 } 3304 }
2900 set_nlink(inode, 1); 3305 set_nlink(inode, 1);
2901 err = ext4_orphan_del(handle, inode); 3306 err = ext4_orphan_del(handle, inode);
2902 if (err) { 3307 if (err)
2903 ext4_journal_stop(handle);
2904 clear_nlink(inode);
2905 goto err_drop_inode; 3308 goto err_drop_inode;
2906 }
2907 } else { 3309 } else {
2908 /* clear the extent format for fast symlink */ 3310 /* clear the extent format for fast symlink */
2909 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); 3311 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2910 inode->i_op = &ext4_fast_symlink_inode_operations; 3312 inode->i_op = encryption_required ?
2911 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 3313 &ext4_symlink_inode_operations :
2912 inode->i_size = l-1; 3314 &ext4_fast_symlink_inode_operations;
3315 memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name,
3316 disk_link.len);
3317 inode->i_size = disk_link.len - 1;
2913 } 3318 }
2914 EXT4_I(inode)->i_disksize = inode->i_size; 3319 EXT4_I(inode)->i_disksize = inode->i_size;
2915 err = ext4_add_nondir(handle, dentry, inode); 3320 err = ext4_add_nondir(handle, dentry, inode);
2916 if (!err && IS_DIRSYNC(dir)) 3321 if (!err && IS_DIRSYNC(dir))
2917 ext4_handle_sync(handle); 3322 ext4_handle_sync(handle);
2918 3323
2919out_stop:
2920 if (handle) 3324 if (handle)
2921 ext4_journal_stop(handle); 3325 ext4_journal_stop(handle);
2922 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) 3326 kfree(sd);
2923 goto retry;
2924 return err; 3327 return err;
2925err_drop_inode: 3328err_drop_inode:
3329 if (handle)
3330 ext4_journal_stop(handle);
3331 kfree(sd);
3332 clear_nlink(inode);
2926 unlock_new_inode(inode); 3333 unlock_new_inode(inode);
2927 iput(inode); 3334 iput(inode);
2928 return err; 3335 return err;
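For reference, the on-disk layout behind sd->encrypted_path and encrypted_symlink_data_len() in the symlink code above, as presumably declared in ext4_crypto.h (quoted as an assumption): a little-endian ciphertext length followed by the ciphertext itself, which is why disk_link.len is the encrypted length plus header plus NUL.

	struct ext4_encrypted_symlink_data {
		__le16 len;		/* ciphertext length; cpu_to_le16() above */
		char encrypted_path[1];	/* variable-length ciphertext */
	} __attribute__((__packed__));

	#define encrypted_symlink_data_len(l) \
		((l) + sizeof(struct ext4_encrypted_symlink_data) - 1)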
@@ -2937,7 +3344,9 @@ static int ext4_link(struct dentry *old_dentry,
2937 3344
2938 if (inode->i_nlink >= EXT4_LINK_MAX) 3345 if (inode->i_nlink >= EXT4_LINK_MAX)
2939 return -EMLINK; 3346 return -EMLINK;
2940 3347 if (ext4_encrypted_inode(dir) &&
3348 !ext4_is_child_context_consistent_with_parent(dir, inode))
3349 return -EPERM;
2941 dquot_initialize(dir); 3350 dquot_initialize(dir);
2942 3351
2943retry: 3352retry:
@@ -3238,6 +3647,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3238 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) 3647 if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
3239 goto end_rename; 3648 goto end_rename;
3240 3649
3650 if ((old.dir != new.dir) &&
3651 ext4_encrypted_inode(new.dir) &&
3652 !ext4_is_child_context_consistent_with_parent(new.dir,
3653 old.inode)) {
3654 retval = -EPERM;
3655 goto end_rename;
3656 }
3657
3241 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3658 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3242 &new.de, &new.inlined); 3659 &new.de, &new.inlined);
3243 if (IS_ERR(new.bh)) { 3660 if (IS_ERR(new.bh)) {
@@ -3258,12 +3675,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3258 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); 3675 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
3259 if (!(flags & RENAME_WHITEOUT)) { 3676 if (!(flags & RENAME_WHITEOUT)) {
3260 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); 3677 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
3261 if (IS_ERR(handle)) 3678 if (IS_ERR(handle)) {
3262 return PTR_ERR(handle); 3679 retval = PTR_ERR(handle);
3680 handle = NULL;
3681 goto end_rename;
3682 }
3263 } else { 3683 } else {
3264 whiteout = ext4_whiteout_for_rename(&old, credits, &handle); 3684 whiteout = ext4_whiteout_for_rename(&old, credits, &handle);
3265 if (IS_ERR(whiteout)) 3685 if (IS_ERR(whiteout)) {
3266 return PTR_ERR(whiteout); 3686 retval = PTR_ERR(whiteout);
3687 whiteout = NULL;
3688 goto end_rename;
3689 }
3267 } 3690 }
3268 3691
3269 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) 3692 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
@@ -3272,7 +3695,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3272 if (S_ISDIR(old.inode->i_mode)) { 3695 if (S_ISDIR(old.inode->i_mode)) {
3273 if (new.inode) { 3696 if (new.inode) {
3274 retval = -ENOTEMPTY; 3697 retval = -ENOTEMPTY;
3275 if (!empty_dir(new.inode)) 3698 if (!ext4_empty_dir(new.inode))
3276 goto end_rename; 3699 goto end_rename;
3277 } else { 3700 } else {
3278 retval = -EMLINK; 3701 retval = -EMLINK;
@@ -3346,8 +3769,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3346 3769
3347 ext4_dec_count(handle, old.dir); 3770 ext4_dec_count(handle, old.dir);
3348 if (new.inode) { 3771 if (new.inode) {
3349 /* checked empty_dir above, can't have another parent, 3772 /* checked ext4_empty_dir above, can't have another
3350 * ext4_dec_count() won't work for many-linked dirs */ 3773 * parent, ext4_dec_count() won't work for many-linked
3774 * dirs */
3351 clear_nlink(new.inode); 3775 clear_nlink(new.inode);
3352 } else { 3776 } else {
3353 ext4_inc_count(handle, new.dir); 3777 ext4_inc_count(handle, new.dir);
@@ -3427,8 +3851,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3427 handle = ext4_journal_start(old.dir, EXT4_HT_DIR, 3851 handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
3428 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 3852 (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
3429 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); 3853 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
3430 if (IS_ERR(handle)) 3854 if (IS_ERR(handle)) {
3431 return PTR_ERR(handle); 3855 retval = PTR_ERR(handle);
3856 handle = NULL;
3857 goto end_rename;
3858 }
3432 3859
3433 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) 3860 if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
3434 ext4_handle_sync(handle); 3861 ext4_handle_sync(handle);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 464984261e69..5765f88b3904 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -8,7 +8,6 @@
8 8
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/jbd2.h>
12#include <linux/highuid.h> 11#include <linux/highuid.h>
13#include <linux/pagemap.h> 12#include <linux/pagemap.h>
14#include <linux/quotaops.h> 13#include <linux/quotaops.h>
@@ -24,7 +23,6 @@
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/mm.h> 25#include <linux/mm.h>
27#include <linux/ratelimit.h>
28 26
29#include "ext4_jbd2.h" 27#include "ext4_jbd2.h"
30#include "xattr.h" 28#include "xattr.h"
@@ -68,6 +66,10 @@ static void ext4_finish_bio(struct bio *bio)
68 66
69 bio_for_each_segment_all(bvec, bio, i) { 67 bio_for_each_segment_all(bvec, bio, i) {
70 struct page *page = bvec->bv_page; 68 struct page *page = bvec->bv_page;
69#ifdef CONFIG_EXT4_FS_ENCRYPTION
70 struct page *data_page = NULL;
71 struct ext4_crypto_ctx *ctx = NULL;
72#endif
71 struct buffer_head *bh, *head; 73 struct buffer_head *bh, *head;
72 unsigned bio_start = bvec->bv_offset; 74 unsigned bio_start = bvec->bv_offset;
73 unsigned bio_end = bio_start + bvec->bv_len; 75 unsigned bio_end = bio_start + bvec->bv_len;
@@ -77,6 +79,15 @@ static void ext4_finish_bio(struct bio *bio)
77 if (!page) 79 if (!page)
78 continue; 80 continue;
79 81
82#ifdef CONFIG_EXT4_FS_ENCRYPTION
83 if (!page->mapping) {
84 /* The bounce data pages are unmapped. */
85 data_page = page;
86 ctx = (struct ext4_crypto_ctx *)page_private(data_page);
87 page = ctx->control_page;
88 }
89#endif
90
80 if (error) { 91 if (error) {
81 SetPageError(page); 92 SetPageError(page);
82 set_bit(AS_EIO, &page->mapping->flags); 93 set_bit(AS_EIO, &page->mapping->flags);
@@ -101,8 +112,13 @@ static void ext4_finish_bio(struct bio *bio)
101 } while ((bh = bh->b_this_page) != head); 112 } while ((bh = bh->b_this_page) != head);
102 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); 113 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
103 local_irq_restore(flags); 114 local_irq_restore(flags);
104 if (!under_io) 115 if (!under_io) {
116#ifdef CONFIG_EXT4_FS_ENCRYPTION
117 if (ctx)
118 ext4_restore_control_page(data_page);
119#endif
105 end_page_writeback(page); 120 end_page_writeback(page);
121 }
106 } 122 }
107} 123}
108 124
@@ -377,6 +393,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
377 393
378static int io_submit_add_bh(struct ext4_io_submit *io, 394static int io_submit_add_bh(struct ext4_io_submit *io,
379 struct inode *inode, 395 struct inode *inode,
396 struct page *page,
380 struct buffer_head *bh) 397 struct buffer_head *bh)
381{ 398{
382 int ret; 399 int ret;
@@ -390,7 +407,7 @@ submit_and_retry:
390 if (ret) 407 if (ret)
391 return ret; 408 return ret;
392 } 409 }
393 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 410 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
394 if (ret != bh->b_size) 411 if (ret != bh->b_size)
395 goto submit_and_retry; 412 goto submit_and_retry;
396 io->io_next_block++; 413 io->io_next_block++;
@@ -403,6 +420,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
403 struct writeback_control *wbc, 420 struct writeback_control *wbc,
404 bool keep_towrite) 421 bool keep_towrite)
405{ 422{
423 struct page *data_page = NULL;
406 struct inode *inode = page->mapping->host; 424 struct inode *inode = page->mapping->host;
407 unsigned block_start, blocksize; 425 unsigned block_start, blocksize;
408 struct buffer_head *bh, *head; 426 struct buffer_head *bh, *head;
@@ -462,19 +480,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
462 set_buffer_async_write(bh); 480 set_buffer_async_write(bh);
463 } while ((bh = bh->b_this_page) != head); 481 } while ((bh = bh->b_this_page) != head);
464 482
465 /* Now submit buffers to write */
466 bh = head = page_buffers(page); 483 bh = head = page_buffers(page);
484
485 if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
486 data_page = ext4_encrypt(inode, page);
487 if (IS_ERR(data_page)) {
488 ret = PTR_ERR(data_page);
489 data_page = NULL;
490 goto out;
491 }
492 }
493
494 /* Now submit buffers to write */
467 do { 495 do {
468 if (!buffer_async_write(bh)) 496 if (!buffer_async_write(bh))
469 continue; 497 continue;
470 ret = io_submit_add_bh(io, inode, bh); 498 ret = io_submit_add_bh(io, inode,
499 data_page ? data_page : page, bh);
471 if (ret) { 500 if (ret) {
472 /* 501 /*
473 * We only get here on ENOMEM. Not much else 502 * We only get here on ENOMEM. Not much else
474 * we can do but mark the page as dirty, and 503 * we can do but mark the page as dirty, and
475 * better luck next time. 504 * better luck next time.
476 */ 505 */
477 redirty_page_for_writepage(wbc, page);
478 break; 506 break;
479 } 507 }
480 nr_submitted++; 508 nr_submitted++;
@@ -483,6 +511,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
483 511
484 /* Error stopped previous loop? Clean up buffers... */ 512 /* Error stopped previous loop? Clean up buffers... */
485 if (ret) { 513 if (ret) {
514 out:
515 if (data_page)
516 ext4_restore_control_page(data_page);
517 printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret);
518 redirty_page_for_writepage(wbc, page);
486 do { 519 do {
487 clear_buffer_async_write(bh); 520 clear_buffer_async_write(bh);
488 bh = bh->b_this_page; 521 bh = bh->b_this_page;
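The write path above never encrypts in place: ext4_encrypt() returns a separate bounce page holding the ciphertext, and the plaintext cache page stays untouched. A sketch of the linkage that ext4_finish_bio() depends on; only the page_private()/ctx->control_page wiring is taken from the code above, the allocation details are assumptions:

	/* inside ext4_encrypt(), roughly: */
	struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
	struct page *bounce_page = alloc_page(GFP_NOFS);   /* ciphertext */

	ctx->control_page = page;	/* the plaintext cache page */
	set_page_private(bounce_page, (unsigned long)ctx);

	/* ext4_finish_bio() inverts it: page_private(data_page) -> ctx,
	 * ctx->control_page -> cache page to end writeback on, and
	 * ext4_restore_control_page(data_page) frees bounce page + ctx. */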
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
new file mode 100644
index 000000000000..171b9ac4b45e
--- /dev/null
+++ b/fs/ext4/readpage.c
@@ -0,0 +1,328 @@
1/*
2 * linux/fs/ext4/readpage.c
3 *
4 * Copyright (C) 2002, Linus Torvalds.
5 * Copyright (C) 2015, Google, Inc.
6 *
7 * This was originally taken from fs/mpage.c
8 *
 9 * The ext4_mpage_readpages() function here is intended to replace
 10 * mpage_readpages() in the general case, not just for encrypted
 11 * files. It has some limitations (see below), where it will fall
 12 * back to block_read_full_page(), but these limitations should
 13 * only be hit when page_size != block_size.
14 *
15 * This will allow us to attach a callback function to support ext4
16 * encryption.
17 *
18 * If anything unusual happens, such as:
19 *
20 * - encountering a page which has buffers
21 * - encountering a page which has a non-hole after a hole
22 * - encountering a page with non-contiguous blocks
23 *
24 * then this code just gives up and calls the buffer_head-based read function.
25 * It does handle a page which has holes at the end - that is a common case:
26 * the end-of-file on blocksize < PAGE_CACHE_SIZE setups.
27 *
28 */
29
30#include <linux/kernel.h>
31#include <linux/export.h>
32#include <linux/mm.h>
33#include <linux/kdev_t.h>
34#include <linux/gfp.h>
35#include <linux/bio.h>
36#include <linux/fs.h>
37#include <linux/buffer_head.h>
38#include <linux/blkdev.h>
39#include <linux/highmem.h>
40#include <linux/prefetch.h>
41#include <linux/mpage.h>
42#include <linux/writeback.h>
43#include <linux/backing-dev.h>
44#include <linux/pagevec.h>
45#include <linux/cleancache.h>
46
47#include "ext4.h"
48
49/*
50 * Call ext4_decrypt on every single page, reusing the encryption
51 * context.
52 */
53static void completion_pages(struct work_struct *work)
54{
55#ifdef CONFIG_EXT4_FS_ENCRYPTION
56 struct ext4_crypto_ctx *ctx =
57 container_of(work, struct ext4_crypto_ctx, work);
58 struct bio *bio = ctx->bio;
59 struct bio_vec *bv;
60 int i;
61
62 bio_for_each_segment_all(bv, bio, i) {
63 struct page *page = bv->bv_page;
64
65 int ret = ext4_decrypt(ctx, page);
66 if (ret) {
67 WARN_ON_ONCE(1);
68 SetPageError(page);
69 } else
70 SetPageUptodate(page);
71 unlock_page(page);
72 }
73 ext4_release_crypto_ctx(ctx);
74 bio_put(bio);
75#else
76 BUG();
77#endif
78}
79
80static inline bool ext4_bio_encrypted(struct bio *bio)
81{
82#ifdef CONFIG_EXT4_FS_ENCRYPTION
83 return unlikely(bio->bi_private != NULL);
84#else
85 return false;
86#endif
87}
88
89/*
90 * I/O completion handler for multipage BIOs.
91 *
92 * The mpage code never puts partial pages into a BIO (except for end-of-file).
93 * If a page does not map to a contiguous run of blocks then it simply falls
94 * back to block_read_full_page().
95 *
96 * Why is this? If a page's completion depends on a number of different BIOs
97 * which can complete in any order (or at the same time) then determining the
98 * status of that page is hard. See end_buffer_async_read() for the details.
99 * There is no point in duplicating all that complexity.
100 */
101static void mpage_end_io(struct bio *bio, int err)
102{
103 struct bio_vec *bv;
104 int i;
105
106 if (ext4_bio_encrypted(bio)) {
107 struct ext4_crypto_ctx *ctx = bio->bi_private;
108
109 if (err) {
110 ext4_release_crypto_ctx(ctx);
111 } else {
112 INIT_WORK(&ctx->work, completion_pages);
113 ctx->bio = bio;
114 queue_work(ext4_read_workqueue, &ctx->work);
115 return;
116 }
117 }
118 bio_for_each_segment_all(bv, bio, i) {
119 struct page *page = bv->bv_page;
120
121 if (!err) {
122 SetPageUptodate(page);
123 } else {
124 ClearPageUptodate(page);
125 SetPageError(page);
126 }
127 unlock_page(page);
128 }
129
130 bio_put(bio);
131}
132
133int ext4_mpage_readpages(struct address_space *mapping,
134 struct list_head *pages, struct page *page,
135 unsigned nr_pages)
136{
137 struct bio *bio = NULL;
138 unsigned page_idx;
139 sector_t last_block_in_bio = 0;
140
141 struct inode *inode = mapping->host;
142 const unsigned blkbits = inode->i_blkbits;
143 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
144 const unsigned blocksize = 1 << blkbits;
145 sector_t block_in_file;
146 sector_t last_block;
147 sector_t last_block_in_file;
148 sector_t blocks[MAX_BUF_PER_PAGE];
149 unsigned page_block;
150 struct block_device *bdev = inode->i_sb->s_bdev;
151 int length;
152 unsigned relative_block = 0;
153 struct ext4_map_blocks map;
154
155 map.m_pblk = 0;
156 map.m_lblk = 0;
157 map.m_len = 0;
158 map.m_flags = 0;
159
160 for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
161 int fully_mapped = 1;
162 unsigned first_hole = blocks_per_page;
163
164 prefetchw(&page->flags);
165 if (pages) {
166 page = list_entry(pages->prev, struct page, lru);
167 list_del(&page->lru);
168 if (add_to_page_cache_lru(page, mapping,
169 page->index, GFP_KERNEL))
170 goto next_page;
171 }
172
173 if (page_has_buffers(page))
174 goto confused;
175
176 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
177 last_block = block_in_file + nr_pages * blocks_per_page;
178 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
179 if (last_block > last_block_in_file)
180 last_block = last_block_in_file;
181 page_block = 0;
182
183 /*
184 * Map blocks using the previous result first.
185 */
186 if ((map.m_flags & EXT4_MAP_MAPPED) &&
187 block_in_file > map.m_lblk &&
188 block_in_file < (map.m_lblk + map.m_len)) {
189 unsigned map_offset = block_in_file - map.m_lblk;
190 unsigned last = map.m_len - map_offset;
191
192 for (relative_block = 0; ; relative_block++) {
193 if (relative_block == last) {
194 /* needed? */
195 map.m_flags &= ~EXT4_MAP_MAPPED;
196 break;
197 }
198 if (page_block == blocks_per_page)
199 break;
200 blocks[page_block] = map.m_pblk + map_offset +
201 relative_block;
202 page_block++;
203 block_in_file++;
204 }
205 }
206
207 /*
208 * Then do more ext4_map_blocks() calls until we are
209 * done with this page.
210 */
211 while (page_block < blocks_per_page) {
212 if (block_in_file < last_block) {
213 map.m_lblk = block_in_file;
214 map.m_len = last_block - block_in_file;
215
216 if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
217 set_error_page:
218 SetPageError(page);
219 zero_user_segment(page, 0,
220 PAGE_CACHE_SIZE);
221 unlock_page(page);
222 goto next_page;
223 }
224 }
225 if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
226 fully_mapped = 0;
227 if (first_hole == blocks_per_page)
228 first_hole = page_block;
229 page_block++;
230 block_in_file++;
231 continue;
232 }
233 if (first_hole != blocks_per_page)
234 goto confused; /* hole -> non-hole */
235
236 /* Contiguous blocks? */
237 if (page_block && blocks[page_block-1] != map.m_pblk-1)
238 goto confused;
239 for (relative_block = 0; ; relative_block++) {
240 if (relative_block == map.m_len) {
241 /* needed? */
242 map.m_flags &= ~EXT4_MAP_MAPPED;
243 break;
244 } else if (page_block == blocks_per_page)
245 break;
246 blocks[page_block] = map.m_pblk+relative_block;
247 page_block++;
248 block_in_file++;
249 }
250 }
251 if (first_hole != blocks_per_page) {
252 zero_user_segment(page, first_hole << blkbits,
253 PAGE_CACHE_SIZE);
254 if (first_hole == 0) {
255 SetPageUptodate(page);
256 unlock_page(page);
257 goto next_page;
258 }
259 } else if (fully_mapped) {
260 SetPageMappedToDisk(page);
261 }
262 if (fully_mapped && blocks_per_page == 1 &&
263 !PageUptodate(page) && cleancache_get_page(page) == 0) {
264 SetPageUptodate(page);
265 goto confused;
266 }
267
268 /*
269 * This page will go to BIO. Do we need to send this
270 * BIO off first?
271 */
272 if (bio && (last_block_in_bio != blocks[0] - 1)) {
273 submit_and_realloc:
274 submit_bio(READ, bio);
275 bio = NULL;
276 }
277 if (bio == NULL) {
278 struct ext4_crypto_ctx *ctx = NULL;
279
280 if (ext4_encrypted_inode(inode) &&
281 S_ISREG(inode->i_mode)) {
282 ctx = ext4_get_crypto_ctx(inode);
283 if (IS_ERR(ctx))
284 goto set_error_page;
285 }
286 bio = bio_alloc(GFP_KERNEL,
287 min_t(int, nr_pages, bio_get_nr_vecs(bdev)));
288 if (!bio) {
289 if (ctx)
290 ext4_release_crypto_ctx(ctx);
291 goto set_error_page;
292 }
293 bio->bi_bdev = bdev;
294 bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
295 bio->bi_end_io = mpage_end_io;
296 bio->bi_private = ctx;
297 }
298
299 length = first_hole << blkbits;
300 if (bio_add_page(bio, page, length, 0) < length)
301 goto submit_and_realloc;
302
303 if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
304 (relative_block == map.m_len)) ||
305 (first_hole != blocks_per_page)) {
306 submit_bio(READ, bio);
307 bio = NULL;
308 } else
309 last_block_in_bio = blocks[blocks_per_page - 1];
310 goto next_page;
311 confused:
312 if (bio) {
313 submit_bio(READ, bio);
314 bio = NULL;
315 }
316 if (!PageUptodate(page))
317 block_read_full_page(page, ext4_get_block);
318 else
319 unlock_page(page);
320 next_page:
321 if (pages)
322 page_cache_release(page);
323 }
324 BUG_ON(pages && !list_empty(pages));
325 if (bio)
326 submit_bio(READ, bio);
327 return 0;
328}
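Two notes on ext4_mpage_readpages(). First, a worked example of the fallback: with 4 KiB pages and 1 KiB blocks, blkbits = 10 and blocks_per_page = 4096 >> 10 = 4, so a page is sent in a single bio only if all four logical blocks map to consecutive physical blocks; any discontiguity, or data after a hole, takes the confused: path into block_read_full_page(). Second, the completion dispatch, summarized below (the atomic-context rationale is an inference, not stated in the patch):

	/* mpage_end_io() dispatch, as implemented above:
	 *   bio->bi_private == NULL -> plain read: SetPageUptodate()
	 *                              and unlock_page() inline;
	 *   bio->bi_private == ctx  -> encrypted read: defer to
	 *                              ext4_read_workqueue, since
	 *                              ext4_decrypt() may sleep and bio
	 *                              completion runs in atomic context.
	 */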
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d348c7d29d80..821f22dbe825 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -21,7 +21,6 @@
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/time.h> 22#include <linux/time.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/jbd2.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/init.h> 25#include <linux/init.h>
27#include <linux/blkdev.h> 26#include <linux/blkdev.h>
@@ -323,22 +322,6 @@ static void save_error_info(struct super_block *sb, const char *func,
323 ext4_commit_super(sb, 1); 322 ext4_commit_super(sb, 1);
324} 323}
325 324
326/*
327 * The del_gendisk() function uninitializes the disk-specific data
328 * structures, including the bdi structure, without telling anyone
329 * else. Once this happens, any attempt to call mark_buffer_dirty()
330 * (for example, by ext4_commit_super), will cause a kernel OOPS.
331 * This is a kludge to prevent these oops until we can put in a proper
332 * hook in del_gendisk() to inform the VFS and file system layers.
333 */
334static int block_device_ejected(struct super_block *sb)
335{
336 struct inode *bd_inode = sb->s_bdev->bd_inode;
337 struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
338
339 return bdi->dev == NULL;
340}
341
342static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) 325static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
343{ 326{
344 struct super_block *sb = journal->j_private; 327 struct super_block *sb = journal->j_private;
@@ -893,6 +876,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
893 atomic_set(&ei->i_ioend_count, 0); 876 atomic_set(&ei->i_ioend_count, 0);
894 atomic_set(&ei->i_unwritten, 0); 877 atomic_set(&ei->i_unwritten, 0);
895 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 878 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
879#ifdef CONFIG_EXT4_FS_ENCRYPTION
880 ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
881#endif
896 882
897 return &ei->vfs_inode; 883 return &ei->vfs_inode;
898} 884}
@@ -1120,7 +1106,7 @@ enum {
1120 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, 1106 Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
1121 Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, 1107 Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
1122 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, 1108 Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
1123 Opt_data_err_abort, Opt_data_err_ignore, 1109 Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
1124 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1110 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1125 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1111 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1126 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, 1112 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
@@ -1211,6 +1197,7 @@ static const match_table_t tokens = {
1211 {Opt_init_itable, "init_itable"}, 1197 {Opt_init_itable, "init_itable"},
1212 {Opt_noinit_itable, "noinit_itable"}, 1198 {Opt_noinit_itable, "noinit_itable"},
1213 {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, 1199 {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
1200 {Opt_test_dummy_encryption, "test_dummy_encryption"},
1214 {Opt_removed, "check=none"}, /* mount option from ext2/3 */ 1201 {Opt_removed, "check=none"}, /* mount option from ext2/3 */
1215 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ 1202 {Opt_removed, "nocheck"}, /* mount option from ext2/3 */
1216 {Opt_removed, "reservation"}, /* mount option from ext2/3 */ 1203 {Opt_removed, "reservation"}, /* mount option from ext2/3 */
@@ -1412,6 +1399,7 @@ static const struct mount_opts {
1412 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, 1399 {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
1413 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, 1400 {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
1414 {Opt_max_dir_size_kb, 0, MOPT_GTE0}, 1401 {Opt_max_dir_size_kb, 0, MOPT_GTE0},
1402 {Opt_test_dummy_encryption, 0, MOPT_GTE0},
1415 {Opt_err, 0, 0} 1403 {Opt_err, 0, 0}
1416}; 1404};
1417 1405
@@ -1588,6 +1576,15 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1588 } 1576 }
1589 *journal_ioprio = 1577 *journal_ioprio =
1590 IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); 1578 IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
1579 } else if (token == Opt_test_dummy_encryption) {
1580#ifdef CONFIG_EXT4_FS_ENCRYPTION
1581 sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION;
1582 ext4_msg(sb, KERN_WARNING,
1583 "Test dummy encryption mode enabled");
1584#else
1585 ext4_msg(sb, KERN_WARNING,
1586 "Test dummy encryption mount option ignored");
1587#endif
1591 } else if (m->flags & MOPT_DATAJ) { 1588 } else if (m->flags & MOPT_DATAJ) {
1592 if (is_remount) { 1589 if (is_remount) {
1593 if (!sbi->s_journal) 1590 if (!sbi->s_journal)
@@ -2685,11 +2682,13 @@ static struct attribute *ext4_attrs[] = {
2685EXT4_INFO_ATTR(lazy_itable_init); 2682EXT4_INFO_ATTR(lazy_itable_init);
2686EXT4_INFO_ATTR(batched_discard); 2683EXT4_INFO_ATTR(batched_discard);
2687EXT4_INFO_ATTR(meta_bg_resize); 2684EXT4_INFO_ATTR(meta_bg_resize);
2685EXT4_INFO_ATTR(encryption);
2688 2686
2689static struct attribute *ext4_feat_attrs[] = { 2687static struct attribute *ext4_feat_attrs[] = {
2690 ATTR_LIST(lazy_itable_init), 2688 ATTR_LIST(lazy_itable_init),
2691 ATTR_LIST(batched_discard), 2689 ATTR_LIST(batched_discard),
2692 ATTR_LIST(meta_bg_resize), 2690 ATTR_LIST(meta_bg_resize),
2691 ATTR_LIST(encryption),
2693 NULL, 2692 NULL,
2694}; 2693};
2695 2694
@@ -3448,6 +3447,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3448 if (sb->s_bdev->bd_part) 3447 if (sb->s_bdev->bd_part)
3449 sbi->s_sectors_written_start = 3448 sbi->s_sectors_written_start =
3450 part_stat_read(sb->s_bdev->bd_part, sectors[1]); 3449 part_stat_read(sb->s_bdev->bd_part, sectors[1]);
3450#ifdef CONFIG_EXT4_FS_ENCRYPTION
3451 /* Modes of operations for file and directory encryption. */
3452 sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
3453 sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
3454#endif
3451 3455
3452 /* Cleanup superblock name */ 3456 /* Cleanup superblock name */
3453 for (cp = sb->s_id; (cp = strchr(cp, '/'));) 3457 for (cp = sb->s_id; (cp = strchr(cp, '/'));)
@@ -3692,6 +3696,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3692 } 3696 }
3693 } 3697 }
3694 3698
3699 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
3700 es->s_encryption_level) {
3701 ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
3702 es->s_encryption_level);
3703 goto failed_mount;
3704 }
3705
3695 if (sb->s_blocksize != blocksize) { 3706 if (sb->s_blocksize != blocksize) {
3696 /* Validate the filesystem blocksize */ 3707 /* Validate the filesystem blocksize */
3697 if (!sb_set_blocksize(sb, blocksize)) { 3708 if (!sb_set_blocksize(sb, blocksize)) {
@@ -4054,6 +4065,13 @@ no_journal:
4054 } 4065 }
4055 } 4066 }
4056 4067
4068 if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
4069 !(sb->s_flags & MS_RDONLY) &&
4070 !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
4071 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
4072 ext4_commit_super(sb, 1);
4073 }
4074
4057 /* 4075 /*
4058 * Get the # of file system overhead blocks from the 4076 * Get the # of file system overhead blocks from the
4059 * superblock if present. 4077 * superblock if present.
@@ -4570,7 +4588,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4570 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; 4588 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
4571 int error = 0; 4589 int error = 0;
4572 4590
4573 if (!sbh || block_device_ejected(sb)) 4591 if (!sbh)
4574 return error; 4592 return error;
4575 if (buffer_write_io_error(sbh)) { 4593 if (buffer_write_io_error(sbh)) {
4576 /* 4594 /*
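Operational note on the super.c changes: mounting writable with -o test_dummy_encryption permanently sets EXT4_FEATURE_INCOMPAT_ENCRYPT on the superblock (see the EXT4_SET_INCOMPAT_FEATURE() + ext4_commit_super() hunk above), after which kernels without encryption support will refuse to mount the filesystem. As the warning text says, this mode is for testing only.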
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ff3711932018..136ca0e911fd 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -18,13 +18,101 @@
18 */ 18 */
19 19
20#include <linux/fs.h> 20#include <linux/fs.h>
21#include <linux/jbd2.h>
22#include <linux/namei.h> 21#include <linux/namei.h>
23#include "ext4.h" 22#include "ext4.h"
24#include "xattr.h" 23#include "xattr.h"
25 24
25#ifdef CONFIG_EXT4_FS_ENCRYPTION
26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) 26static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
27{ 27{
28 struct page *cpage = NULL;
29 char *caddr, *paddr = NULL;
30 struct ext4_str cstr, pstr;
31 struct inode *inode = dentry->d_inode;
32 struct ext4_fname_crypto_ctx *ctx = NULL;
33 struct ext4_encrypted_symlink_data *sd;
34 loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
35 int res;
36 u32 plen, max_size = inode->i_sb->s_blocksize;
37
38 if (!ext4_encrypted_inode(inode))
39 return page_follow_link_light(dentry, nd);
40
41 ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize);
42 if (IS_ERR(ctx))
43 return ctx;
44
45 if (ext4_inode_is_fast_symlink(inode)) {
46 caddr = (char *) EXT4_I(dentry->d_inode)->i_data;
47 max_size = sizeof(EXT4_I(dentry->d_inode)->i_data);
48 } else {
49 cpage = read_mapping_page(inode->i_mapping, 0, NULL);
50 if (IS_ERR(cpage)) {
51 ext4_put_fname_crypto_ctx(&ctx);
52 return cpage;
53 }
54 caddr = kmap(cpage);
55 caddr[size] = 0;
56 }
57
58 /* Symlink is encrypted */
59 sd = (struct ext4_encrypted_symlink_data *)caddr;
60 cstr.name = sd->encrypted_path;
 61 cstr.len = le16_to_cpu(sd->len); /* sd->len is __le16 */
62 if ((cstr.len +
63 sizeof(struct ext4_encrypted_symlink_data) - 1) >
64 max_size) {
65 /* Symlink data on the disk is corrupted */
66 res = -EIO;
67 goto errout;
68 }
69 plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
70 EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len;
71 paddr = kmalloc(plen + 1, GFP_NOFS);
72 if (!paddr) {
73 res = -ENOMEM;
74 goto errout;
75 }
76 pstr.name = paddr;
77 res = _ext4_fname_disk_to_usr(ctx, &cstr, &pstr);
78 if (res < 0)
79 goto errout;
80 /* Null-terminate the name */
81 if (res <= plen)
82 paddr[res] = '\0';
83 nd_set_link(nd, paddr);
84 ext4_put_fname_crypto_ctx(&ctx);
85 if (cpage) {
86 kunmap(cpage);
87 page_cache_release(cpage);
88 }
89 return NULL;
90errout:
91 ext4_put_fname_crypto_ctx(&ctx);
92 if (cpage) {
93 kunmap(cpage);
94 page_cache_release(cpage);
95 }
96 kfree(paddr);
97 return ERR_PTR(res);
98}
99
100static void ext4_put_link(struct dentry *dentry, struct nameidata *nd,
101 void *cookie)
102{
103 struct page *page = cookie;
104
105 if (!page) {
106 kfree(nd_get_link(nd));
107 } else {
108 kunmap(page);
109 page_cache_release(page);
110 }
111}
112#endif
113
114static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd)
115{
28 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); 116 struct ext4_inode_info *ei = EXT4_I(dentry->d_inode);
29 nd_set_link(nd, (char *) ei->i_data); 117 nd_set_link(nd, (char *) ei->i_data);
30 return NULL; 118 return NULL;
@@ -32,8 +120,13 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd)
32 120
33const struct inode_operations ext4_symlink_inode_operations = { 121const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 122 .readlink = generic_readlink,
123#ifdef CONFIG_EXT4_FS_ENCRYPTION
124 .follow_link = ext4_follow_link,
125 .put_link = ext4_put_link,
126#else
35 .follow_link = page_follow_link_light, 127 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 128 .put_link = page_put_link,
129#endif
37 .setattr = ext4_setattr, 130 .setattr = ext4_setattr,
38 .setxattr = generic_setxattr, 131 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 132 .getxattr = generic_getxattr,
@@ -43,7 +136,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
43 136
44const struct inode_operations ext4_fast_symlink_inode_operations = { 137const struct inode_operations ext4_fast_symlink_inode_operations = {
45 .readlink = generic_readlink, 138 .readlink = generic_readlink,
46 .follow_link = ext4_follow_link, 139 .follow_link = ext4_follow_fast_link,
47 .setattr = ext4_setattr, 140 .setattr = ext4_setattr,
48 .setxattr = generic_setxattr, 141 .setxattr = generic_setxattr,
49 .getxattr = generic_getxattr, 142 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 1e09fc77395c..759842ff8af0 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -55,7 +55,6 @@
55#include <linux/slab.h> 55#include <linux/slab.h>
56#include <linux/mbcache.h> 56#include <linux/mbcache.h>
57#include <linux/quotaops.h> 57#include <linux/quotaops.h>
58#include <linux/rwsem.h>
59#include "ext4_jbd2.h" 58#include "ext4_jbd2.h"
60#include "ext4.h" 59#include "ext4.h"
61#include "xattr.h" 60#include "xattr.h"
@@ -639,8 +638,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
639 free += EXT4_XATTR_LEN(name_len); 638 free += EXT4_XATTR_LEN(name_len);
640 } 639 }
641 if (i->value) { 640 if (i->value) {
642 if (free < EXT4_XATTR_SIZE(i->value_len) || 641 if (free < EXT4_XATTR_LEN(name_len) +
643 free < EXT4_XATTR_LEN(name_len) +
644 EXT4_XATTR_SIZE(i->value_len)) 642 EXT4_XATTR_SIZE(i->value_len))
645 return -ENOSPC; 643 return -ENOSPC;
646 } 644 }
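The dropped first comparison in ext4_xattr_set_entry() was redundant rather than wrong: EXT4_XATTR_LEN(name_len) is always positive, so whenever free < EXT4_XATTR_SIZE(i->value_len) holds, free < EXT4_XATTR_LEN(name_len) + EXT4_XATTR_SIZE(i->value_len) holds as well. The remaining test is the stronger of the two, and the -ENOSPC behaviour is unchanged.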
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 29bedf5589f6..ddc0957760ba 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -23,6 +23,7 @@
23#define EXT4_XATTR_INDEX_SECURITY 6 23#define EXT4_XATTR_INDEX_SECURITY 6
24#define EXT4_XATTR_INDEX_SYSTEM 7 24#define EXT4_XATTR_INDEX_SYSTEM 7
25#define EXT4_XATTR_INDEX_RICHACL 8 25#define EXT4_XATTR_INDEX_RICHACL 8
26#define EXT4_XATTR_INDEX_ENCRYPTION 9
26 27
27struct ext4_xattr_header { 28struct ext4_xattr_header {
28 __le32 h_magic; /* magic number for identification */ 29 __le32 h_magic; /* magic number for identification */
@@ -98,6 +99,8 @@ extern const struct xattr_handler ext4_xattr_user_handler;
98extern const struct xattr_handler ext4_xattr_trusted_handler; 99extern const struct xattr_handler ext4_xattr_trusted_handler;
99extern const struct xattr_handler ext4_xattr_security_handler; 100extern const struct xattr_handler ext4_xattr_security_handler;
100 101
102#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
103
101extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); 104extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
102 105
103extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); 106extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
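The new index/name pair means an inode's encryption context is stored as an ordinary xattr. A sketch of the presumed fetch, using the ext4_xattr_get() prototype above (the context struct layout is an assumption from ext4_crypto.h; the return value is the byte count read, or -errno):

	struct ext4_encryption_context ctx;
	int res;

	res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION,
			     EXT4_XATTR_NAME_ENCRYPTION_CONTEXT,
			     &ctx, sizeof(ctx));
	if (res != sizeof(ctx))
		return res < 0 ? res : -EINVAL;	/* missing or short context */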
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 94e2d2ffabe1..05f0f663f14c 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -1,5 +1,5 @@
1config F2FS_FS 1config F2FS_FS
2 tristate "F2FS filesystem support (EXPERIMENTAL)" 2 tristate "F2FS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 help 4 help
5 F2FS is based on Log-structured File System (LFS), which supports 5 F2FS is based on Log-structured File System (LFS), which supports
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 742202779bd5..4320ffab3495 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -351,13 +351,11 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode,
 
 	*acl = f2fs_acl_clone(p, GFP_NOFS);
 	if (!*acl)
-		return -ENOMEM;
+		goto no_mem;
 
 	ret = f2fs_acl_create_masq(*acl, mode);
-	if (ret < 0) {
-		posix_acl_release(*acl);
-		return -ENOMEM;
-	}
+	if (ret < 0)
+		goto no_mem_clone;
 
 	if (ret == 0) {
 		posix_acl_release(*acl);
@@ -378,6 +376,12 @@ no_acl:
 	*default_acl = NULL;
 	*acl = NULL;
 	return 0;
+
+no_mem_clone:
+	posix_acl_release(*acl);
+no_mem:
+	posix_acl_release(p);
+	return -ENOMEM;
 }
 
 int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
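
The acl hunk above replaces duplicated cleanup with the usual goto-based
unwind, so both the clone (*acl) and the source ACL (p) are released on every
failure path. A minimal sketch of the idiom with hypothetical alloc/release
helpers (not f2fs code):

	#include <stdlib.h>

	static void *alloc_res(void) { return malloc(16); }
	static void release_res(void *p) { free(p); }

	static int do_setup(void)
	{
		void *a, *b;

		a = alloc_res();
		if (!a)
			goto fail;
		b = alloc_res();
		if (!b)
			goto fail_a;		/* unwind in reverse order */

		/* ... use a and b ... */
		release_res(b);
		release_res(a);
		return 0;

	fail_a:
		release_res(a);
	fail:
		return -1;			/* stands in for -ENOMEM */
	}

	int main(void) { return do_setup(); }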
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 7f794b72b3b7..a5e17a2a0781 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -276,7 +276,7 @@ continue_unlock:
 			if (!clear_page_dirty_for_io(page))
 				goto continue_unlock;
 
-			if (f2fs_write_meta_page(page, &wbc)) {
+			if (mapping->a_ops->writepage(page, &wbc)) {
 				unlock_page(page);
 				break;
 			}
@@ -464,20 +464,19 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 
 void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 {
-	block_t start_blk, orphan_blkaddr, i, j;
+	block_t start_blk, orphan_blocks, i, j;
 
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
 		return;
 
 	set_sbi_flag(sbi, SBI_POR_DOING);
 
-	start_blk = __start_cp_addr(sbi) + 1 +
-		le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
-	orphan_blkaddr = __start_sum_addr(sbi) - 1;
+	start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
+	orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
 
-	ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
+	ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP);
 
-	for (i = 0; i < orphan_blkaddr; i++) {
+	for (i = 0; i < orphan_blocks; i++) {
 		struct page *page = get_meta_page(sbi, start_blk + i);
 		struct f2fs_orphan_block *orphan_blk;
 
@@ -615,7 +614,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
 	unsigned long blk_size = sbi->blocksize;
 	unsigned long long cp1_version = 0, cp2_version = 0;
 	unsigned long long cp_start_blk_no;
-	unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+	unsigned int cp_blks = 1 + __cp_payload(sbi);
 	block_t cp_blk_no;
 	int i;
 
@@ -796,6 +795,7 @@ retry:
 		 * writebacking dentry pages in the freeing inode.
 		 */
 		f2fs_submit_merged_bio(sbi, DATA, WRITE);
+		cond_resched();
 	}
 	goto retry;
 }
@@ -884,7 +884,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	__u32 crc32 = 0;
 	void *kaddr;
 	int i;
-	int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+	int cp_payload_blks = __cp_payload(sbi);
 
 	/*
 	 * This avoids to conduct wrong roll-forward operations and uses
@@ -1048,17 +1048,18 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	unsigned long long ckpt_ver;
 
-	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
-
 	mutex_lock(&sbi->cp_mutex);
 
 	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
-		cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT)
+		(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC))
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto out;
 	if (f2fs_readonly(sbi->sb))
 		goto out;
+
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
+
 	if (block_operations(sbi))
 		goto out;
 
@@ -1085,6 +1086,10 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	unblock_operations(sbi);
 	stat_inc_cp_count(sbi->stat_info);
+
+	if (cpc->reason == CP_RECOVERY)
+		f2fs_msg(sbi->sb, KERN_NOTICE,
+			"checkpoint: version = %llx", ckpt_ver);
 out:
 	mutex_unlock(&sbi->cp_mutex);
 	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
@@ -1103,14 +1108,9 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
 		im->ino_num = 0;
 	}
 
-	/*
-	 * considering 512 blocks in a segment 8 blocks are needed for cp
-	 * and log segment summaries. Remaining blocks are used to keep
-	 * orphan entries with the limitation one reserved segment
-	 * for cp pack we can have max 1020*504 orphan entries
-	 */
 	sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
-			NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
+			NR_CURSEG_TYPE - __cp_payload(sbi)) *
+				F2FS_ORPHANS_PER_BLOCK;
 }
 
 int __init create_checkpoint_caches(void)
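
With the change above, max_orphans also accounts for the checkpoint payload
blocks, so the orphan capacity matches what one cp pack can really hold. A
quick userspace check of the formula (the constants — 512 blocks per segment,
2 cp pack blocks, 6 active logs, 1020 orphan entries per block, 1 payload
block — are plausible values for illustration, not read from the headers):

	#include <stdio.h>

	int main(void)
	{
		unsigned blocks_per_seg = 512, cp_packs = 2, nr_curseg = 6;
		unsigned orphans_per_blk = 1020, cp_payload = 1;
		unsigned max_orphans = (blocks_per_seg - cp_packs -
				nr_curseg - cp_payload) * orphans_per_blk;

		printf("max_orphans = %u\n", max_orphans);	/* 513060 */
		return 0;
	}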
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 319eda511c4f..b91b0e10678e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -25,6 +25,9 @@
 #include "trace.h"
 #include <trace/events/f2fs.h>
 
+static struct kmem_cache *extent_tree_slab;
+static struct kmem_cache *extent_node_slab;
+
 static void f2fs_read_end_io(struct bio *bio, int err)
 {
 	struct bio_vec *bvec;
@@ -197,7 +200,7 @@ alloc_new:
  * ->node_page
  *	update block addresses in the node page
  */
-static void __set_data_blkaddr(struct dnode_of_data *dn)
+void set_data_blkaddr(struct dnode_of_data *dn)
 {
 	struct f2fs_node *rn;
 	__le32 *addr_array;
@@ -226,7 +229,7 @@ int reserve_new_block(struct dnode_of_data *dn)
 	trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
 
 	dn->data_blkaddr = NEW_ADDR;
-	__set_data_blkaddr(dn);
+	set_data_blkaddr(dn);
 	mark_inode_dirty(dn->inode);
 	sync_inode_page(dn);
 	return 0;
@@ -248,73 +251,62 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	return err;
 }
 
-static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
-					struct buffer_head *bh_result)
+static void f2fs_map_bh(struct super_block *sb, pgoff_t pgofs,
+			struct extent_info *ei, struct buffer_head *bh_result)
+{
+	unsigned int blkbits = sb->s_blocksize_bits;
+	size_t max_size = bh_result->b_size;
+	size_t mapped_size;
+
+	clear_buffer_new(bh_result);
+	map_bh(bh_result, sb, ei->blk + pgofs - ei->fofs);
+	mapped_size = (ei->fofs + ei->len - pgofs) << blkbits;
+	bh_result->b_size = min(max_size, mapped_size);
+}
+
+static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	pgoff_t start_fofs, end_fofs;
 	block_t start_blkaddr;
 
-	if (is_inode_flag_set(fi, FI_NO_EXTENT))
-		return 0;
-
-	read_lock(&fi->ext.ext_lock);
+	read_lock(&fi->ext_lock);
 	if (fi->ext.len == 0) {
-		read_unlock(&fi->ext.ext_lock);
-		return 0;
+		read_unlock(&fi->ext_lock);
+		return false;
 	}
 
 	stat_inc_total_hit(inode->i_sb);
 
 	start_fofs = fi->ext.fofs;
 	end_fofs = fi->ext.fofs + fi->ext.len - 1;
-	start_blkaddr = fi->ext.blk_addr;
+	start_blkaddr = fi->ext.blk;
 
 	if (pgofs >= start_fofs && pgofs <= end_fofs) {
-		unsigned int blkbits = inode->i_sb->s_blocksize_bits;
-		size_t count;
-
-		set_buffer_new(bh_result);
-		map_bh(bh_result, inode->i_sb,
-				start_blkaddr + pgofs - start_fofs);
-		count = end_fofs - pgofs + 1;
-		if (count < (UINT_MAX >> blkbits))
-			bh_result->b_size = (count << blkbits);
-		else
-			bh_result->b_size = UINT_MAX;
-
+		*ei = fi->ext;
 		stat_inc_read_hit(inode->i_sb);
-		read_unlock(&fi->ext.ext_lock);
-		return 1;
+		read_unlock(&fi->ext_lock);
+		return true;
 	}
-	read_unlock(&fi->ext.ext_lock);
-	return 0;
+	read_unlock(&fi->ext_lock);
+	return false;
 }
 
-void update_extent_cache(struct dnode_of_data *dn)
+static bool update_extent_info(struct inode *inode, pgoff_t fofs,
+							block_t blkaddr)
 {
-	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
-	pgoff_t fofs, start_fofs, end_fofs;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	pgoff_t start_fofs, end_fofs;
 	block_t start_blkaddr, end_blkaddr;
 	int need_update = true;
 
-	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
-
-	/* Update the page address in the parent node */
-	__set_data_blkaddr(dn);
-
-	if (is_inode_flag_set(fi, FI_NO_EXTENT))
-		return;
-
-	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
-							dn->ofs_in_node;
-
-	write_lock(&fi->ext.ext_lock);
+	write_lock(&fi->ext_lock);
 
 	start_fofs = fi->ext.fofs;
 	end_fofs = fi->ext.fofs + fi->ext.len - 1;
-	start_blkaddr = fi->ext.blk_addr;
-	end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
+	start_blkaddr = fi->ext.blk;
+	end_blkaddr = fi->ext.blk + fi->ext.len - 1;
 
 	/* Drop and initialize the matched extent */
 	if (fi->ext.len == 1 && fofs == start_fofs)
@@ -322,24 +314,24 @@ void update_extent_cache(struct dnode_of_data *dn)
 
 	/* Initial extent */
 	if (fi->ext.len == 0) {
-		if (dn->data_blkaddr != NULL_ADDR) {
+		if (blkaddr != NULL_ADDR) {
 			fi->ext.fofs = fofs;
-			fi->ext.blk_addr = dn->data_blkaddr;
+			fi->ext.blk = blkaddr;
 			fi->ext.len = 1;
 		}
 		goto end_update;
 	}
 
 	/* Front merge */
-	if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) {
+	if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) {
 		fi->ext.fofs--;
-		fi->ext.blk_addr--;
+		fi->ext.blk--;
 		fi->ext.len++;
 		goto end_update;
 	}
 
 	/* Back merge */
-	if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) {
+	if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) {
 		fi->ext.len++;
 		goto end_update;
 	}
@@ -351,8 +343,7 @@ void update_extent_cache(struct dnode_of_data *dn)
 			fi->ext.len = fofs - start_fofs;
 		} else {
 			fi->ext.fofs = fofs + 1;
-			fi->ext.blk_addr = start_blkaddr +
-					fofs - start_fofs + 1;
+			fi->ext.blk = start_blkaddr + fofs - start_fofs + 1;
 			fi->ext.len -= fofs - start_fofs + 1;
 		}
 	} else {
@@ -366,27 +357,583 @@ void update_extent_cache(struct dnode_of_data *dn)
 		need_update = true;
 	}
 end_update:
-	write_unlock(&fi->ext.ext_lock);
-	if (need_update)
-		sync_inode_page(dn);
+	write_unlock(&fi->ext_lock);
+	return need_update;
+}
+
+static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct rb_node *parent, struct rb_node **p)
+{
+	struct extent_node *en;
+
+	en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC);
+	if (!en)
+		return NULL;
+
+	en->ei = *ei;
+	INIT_LIST_HEAD(&en->list);
+
+	rb_link_node(&en->rb_node, parent, p);
+	rb_insert_color(&en->rb_node, &et->root);
+	et->count++;
+	atomic_inc(&sbi->total_ext_node);
+	return en;
+}
+
+static void __detach_extent_node(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	rb_erase(&en->rb_node, &et->root);
+	et->count--;
+	atomic_dec(&sbi->total_ext_node);
+
+	if (et->cached_en == en)
+		et->cached_en = NULL;
+}
+
+static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi,
+							nid_t ino)
+{
+	struct extent_tree *et;
+
+	down_read(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+	if (!et) {
+		up_read(&sbi->extent_tree_lock);
+		return NULL;
+	}
+	atomic_inc(&et->refcount);
+	up_read(&sbi->extent_tree_lock);
+
+	return et;
+}
+
+static struct extent_tree *__grab_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	nid_t ino = inode->i_ino;
+
+	down_write(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, ino);
+	if (!et) {
+		et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS);
+		f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et);
+		memset(et, 0, sizeof(struct extent_tree));
+		et->ino = ino;
+		et->root = RB_ROOT;
+		et->cached_en = NULL;
+		rwlock_init(&et->lock);
+		atomic_set(&et->refcount, 0);
+		et->count = 0;
+		sbi->total_ext_tree++;
+	}
+	atomic_inc(&et->refcount);
+	up_write(&sbi->extent_tree_lock);
+
+	return et;
+}
+
+static struct extent_node *__lookup_extent_tree(struct extent_tree *et,
+							unsigned int fofs)
+{
+	struct rb_node *node = et->root.rb_node;
+	struct extent_node *en;
+
+	if (et->cached_en) {
+		struct extent_info *cei = &et->cached_en->ei;
+
+		if (cei->fofs <= fofs && cei->fofs + cei->len > fofs)
+			return et->cached_en;
+	}
+
+	while (node) {
+		en = rb_entry(node, struct extent_node, rb_node);
+
+		if (fofs < en->ei.fofs) {
+			node = node->rb_left;
+		} else if (fofs >= en->ei.fofs + en->ei.len) {
+			node = node->rb_right;
+		} else {
+			et->cached_en = en;
+			return en;
+		}
+	}
+	return NULL;
+}
+
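__lookup_extent_tree above probes cached_en — the most recently hit extent —
before walking the rb-tree, which pays off for sequential reads. A
self-contained sketch of the containment test it relies on (plain struct
instead of the rb-tree; illustrative, not f2fs code):

	#include <stdio.h>
	#include <stdbool.h>

	struct ext { unsigned fofs, blk, len; };

	/* does file offset fofs fall inside extent e? */
	static bool ext_contains(const struct ext *e, unsigned fofs)
	{
		return e->fofs <= fofs && fofs < e->fofs + e->len;
	}

	int main(void)
	{
		struct ext cached = { .fofs = 100, .blk = 5000, .len = 8 };
		unsigned fofs = 104;

		if (ext_contains(&cached, fofs))	/* hit: block 5004 */
			printf("hit: block %u\n",
				cached.blk + (fofs - cached.fofs));
		else
			printf("miss: walk the tree\n");
		return 0;
	}
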
+static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	struct extent_node *prev;
+	struct rb_node *node;
+
+	node = rb_prev(&en->rb_node);
+	if (!node)
+		return NULL;
+
+	prev = rb_entry(node, struct extent_node, rb_node);
+	if (__is_back_mergeable(&en->ei, &prev->ei)) {
+		en->ei.fofs = prev->ei.fofs;
+		en->ei.blk = prev->ei.blk;
+		en->ei.len += prev->ei.len;
+		__detach_extent_node(sbi, et, prev);
+		return prev;
+	}
+	return NULL;
+}
+
+static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_node *en)
+{
+	struct extent_node *next;
+	struct rb_node *node;
+
+	node = rb_next(&en->rb_node);
+	if (!node)
+		return NULL;
+
+	next = rb_entry(node, struct extent_node, rb_node);
+	if (__is_front_mergeable(&en->ei, &next->ei)) {
+		en->ei.len += next->ei.len;
+		__detach_extent_node(sbi, et, next);
+		return next;
+	}
+	return NULL;
+}
+
+static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi,
+				struct extent_tree *et, struct extent_info *ei,
+				struct extent_node **den)
+{
+	struct rb_node **p = &et->root.rb_node;
+	struct rb_node *parent = NULL;
+	struct extent_node *en;
+
+	while (*p) {
+		parent = *p;
+		en = rb_entry(parent, struct extent_node, rb_node);
+
+		if (ei->fofs < en->ei.fofs) {
+			if (__is_front_mergeable(ei, &en->ei)) {
+				f2fs_bug_on(sbi, !den);
+				en->ei.fofs = ei->fofs;
+				en->ei.blk = ei->blk;
+				en->ei.len += ei->len;
+				*den = __try_back_merge(sbi, et, en);
+				return en;
+			}
+			p = &(*p)->rb_left;
+		} else if (ei->fofs >= en->ei.fofs + en->ei.len) {
+			if (__is_back_mergeable(ei, &en->ei)) {
+				f2fs_bug_on(sbi, !den);
+				en->ei.len += ei->len;
+				*den = __try_front_merge(sbi, et, en);
+				return en;
+			}
+			p = &(*p)->rb_right;
+		} else {
+			f2fs_bug_on(sbi, 1);
+		}
+	}
+
+	return __attach_extent_node(sbi, et, ei, parent, p);
+}
+
+static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
+					struct extent_tree *et, bool free_all)
+{
+	struct rb_node *node, *next;
+	struct extent_node *en;
+	unsigned int count = et->count;
+
+	node = rb_first(&et->root);
+	while (node) {
+		next = rb_next(node);
+		en = rb_entry(node, struct extent_node, rb_node);
+
+		if (free_all) {
+			spin_lock(&sbi->extent_lock);
+			if (!list_empty(&en->list))
+				list_del_init(&en->list);
+			spin_unlock(&sbi->extent_lock);
+		}
+
+		if (free_all || list_empty(&en->list)) {
+			__detach_extent_node(sbi, et, en);
+			kmem_cache_free(extent_node_slab, en);
+		}
+		node = next;
+	}
+
+	return count - et->count;
+}
+
+static void f2fs_init_extent_tree(struct inode *inode,
+						struct f2fs_extent *i_ext)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en;
+	struct extent_info ei;
+
+	if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
+		return;
+
+	et = __grab_extent_tree(inode);
+
+	write_lock(&et->lock);
+	if (et->count)
+		goto out;
+
+	set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
+		le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
+
+	en = __insert_extent_tree(sbi, et, &ei, NULL);
+	if (en) {
+		et->cached_en = en;
+
+		spin_lock(&sbi->extent_lock);
+		list_add_tail(&en->list, &sbi->extent_list);
+		spin_unlock(&sbi->extent_lock);
+	}
+out:
+	write_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+}
+
+static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en;
+
+	trace_f2fs_lookup_extent_tree_start(inode, pgofs);
+
+	et = __find_extent_tree(sbi, inode->i_ino);
+	if (!et)
+		return false;
+
+	read_lock(&et->lock);
+	en = __lookup_extent_tree(et, pgofs);
+	if (en) {
+		*ei = en->ei;
+		spin_lock(&sbi->extent_lock);
+		if (!list_empty(&en->list))
+			list_move_tail(&en->list, &sbi->extent_list);
+		spin_unlock(&sbi->extent_lock);
+		stat_inc_read_hit(sbi->sb);
+	}
+	stat_inc_total_hit(sbi->sb);
+	read_unlock(&et->lock);
+
+	trace_f2fs_lookup_extent_tree_end(inode, pgofs, en);
+
+	atomic_dec(&et->refcount);
+	return en ? true : false;
+}
+
+static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs,
+							block_t blkaddr)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
+	struct extent_node *den = NULL;
+	struct extent_info ei, dei;
+	unsigned int endofs;
+
+	trace_f2fs_update_extent_tree(inode, fofs, blkaddr);
+
+	et = __grab_extent_tree(inode);
+
+	write_lock(&et->lock);
+
+	/* 1. lookup and remove existing extent info in cache */
+	en = __lookup_extent_tree(et, fofs);
+	if (!en)
+		goto update_extent;
+
+	dei = en->ei;
+	__detach_extent_node(sbi, et, en);
+
+	/* 2. if extent can be split more, split and insert the left part */
+	if (dei.len > 1) {
+		/* insert left part of split extent into cache */
+		if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+			set_extent_info(&ei, dei.fofs, dei.blk,
+							fofs - dei.fofs);
+			en1 = __insert_extent_tree(sbi, et, &ei, NULL);
+		}
+
+		/* insert right part of split extent into cache */
+		endofs = dei.fofs + dei.len - 1;
+		if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) {
+			set_extent_info(&ei, fofs + 1,
+				fofs - dei.fofs + dei.blk, endofs - fofs);
+			en2 = __insert_extent_tree(sbi, et, &ei, NULL);
+		}
+	}
+
+update_extent:
+	/* 3. update extent in extent cache */
+	if (blkaddr) {
+		set_extent_info(&ei, fofs, blkaddr, 1);
+		en3 = __insert_extent_tree(sbi, et, &ei, &den);
+	}
+
+	/* 4. update in global extent list */
+	spin_lock(&sbi->extent_lock);
+	if (en && !list_empty(&en->list))
+		list_del(&en->list);
+	/*
+	 * en1 and en2 split from en, they will become more and more smaller
+	 * fragments after splitting several times. So if the length is smaller
+	 * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree.
+	 */
+	if (en1)
+		list_add_tail(&en1->list, &sbi->extent_list);
+	if (en2)
+		list_add_tail(&en2->list, &sbi->extent_list);
+	if (en3) {
+		if (list_empty(&en3->list))
+			list_add_tail(&en3->list, &sbi->extent_list);
+		else
+			list_move_tail(&en3->list, &sbi->extent_list);
+	}
+	if (den && !list_empty(&den->list))
+		list_del(&den->list);
+	spin_unlock(&sbi->extent_lock);
+
+	/* 5. release extent node */
+	if (en)
+		kmem_cache_free(extent_node_slab, en);
+	if (den)
+		kmem_cache_free(extent_node_slab, den);
+
+	write_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+}
+
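Step 2 of f2fs_update_extent_tree splits an existing extent when a block
inside it is rewritten: the pieces left and right of the rewritten offset are
re-inserted only if each is still at least F2FS_MIN_EXTENT_LEN blocks. A
self-contained sketch of the split arithmetic (MIN_LEN stands in for
F2FS_MIN_EXTENT_LEN; the values are made up):

	#include <stdio.h>

	#define MIN_LEN	64

	struct ext { unsigned fofs, blk, len; };

	int main(void)
	{
		struct ext dei = { .fofs = 0, .blk = 1000, .len = 200 };
		unsigned fofs = 128;			/* block being rewritten */
		unsigned endofs = dei.fofs + dei.len - 1;

		if (fofs - dei.fofs >= MIN_LEN)		/* left piece [0,127] */
			printf("left: fofs=%u blk=%u len=%u\n",
				dei.fofs, dei.blk, fofs - dei.fofs);
		if (endofs - fofs >= MIN_LEN)		/* right piece [129,199] */
			printf("right: fofs=%u blk=%u len=%u\n",
				fofs + 1, dei.blk + (fofs + 1 - dei.fofs),
				endofs - fofs);
		return 0;
	}
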
+void f2fs_preserve_extent_tree(struct inode *inode)
+{
+	struct extent_tree *et;
+	struct extent_info *ext = &F2FS_I(inode)->ext;
+	bool sync = false;
+
+	if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		return;
+
+	et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino);
+	if (!et) {
+		if (ext->len) {
+			ext->len = 0;
+			update_inode_page(inode);
+		}
+		return;
+	}
+
+	read_lock(&et->lock);
+	if (et->count) {
+		struct extent_node *en;
+
+		if (et->cached_en) {
+			en = et->cached_en;
+		} else {
+			struct rb_node *node = rb_first(&et->root);
+
+			if (!node)
+				node = rb_last(&et->root);
+			en = rb_entry(node, struct extent_node, rb_node);
+		}
+
+		if (__is_extent_same(ext, &en->ei))
+			goto out;
+
+		*ext = en->ei;
+		sync = true;
+	} else if (ext->len) {
+		ext->len = 0;
+		sync = true;
+	}
+out:
+	read_unlock(&et->lock);
+	atomic_dec(&et->refcount);
+
+	if (sync)
+		update_inode_page(inode);
+}
+
+void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
+{
+	struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
+	struct extent_node *en, *tmp;
+	unsigned long ino = F2FS_ROOT_INO(sbi);
+	struct radix_tree_iter iter;
+	void **slot;
+	unsigned int found;
+	unsigned int node_cnt = 0, tree_cnt = 0;
+
+	if (!test_opt(sbi, EXTENT_CACHE))
+		return;
+
+	if (available_free_memory(sbi, EXTENT_CACHE))
+		return;
+
+	spin_lock(&sbi->extent_lock);
+	list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) {
+		if (!nr_shrink--)
+			break;
+		list_del_init(&en->list);
+	}
+	spin_unlock(&sbi->extent_lock);
+
+	down_read(&sbi->extent_tree_lock);
+	while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
+				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
+		unsigned i;
+
+		ino = treevec[found - 1]->ino + 1;
+		for (i = 0; i < found; i++) {
+			struct extent_tree *et = treevec[i];
+
+			atomic_inc(&et->refcount);
+			write_lock(&et->lock);
+			node_cnt += __free_extent_tree(sbi, et, false);
+			write_unlock(&et->lock);
+			atomic_dec(&et->refcount);
+		}
+	}
+	up_read(&sbi->extent_tree_lock);
+
+	down_write(&sbi->extent_tree_lock);
+	radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter,
+							F2FS_ROOT_INO(sbi)) {
+		struct extent_tree *et = (struct extent_tree *)*slot;
+
+		if (!atomic_read(&et->refcount) && !et->count) {
+			radix_tree_delete(&sbi->extent_tree_root, et->ino);
+			kmem_cache_free(extent_tree_slab, et);
+			sbi->total_ext_tree--;
+			tree_cnt++;
+		}
+	}
+	up_write(&sbi->extent_tree_lock);
+
+	trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt);
+}
+
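The shrinker first unhooks the coldest entries from the global LRU under
extent_lock, then frees any node that is no longer on the LRU while holding
each tree's write lock. A condensed userspace sketch of that detach-and-free
pattern on a toy doubly linked list (illustrative only; no locking, and the
surviving hot entries are simply leaked here):

	#include <stdio.h>
	#include <stdlib.h>

	struct node {
		struct node *prev, *next;	/* self-linked means off-LRU */
		int id;
	};

	static void list_del_init(struct node *n)
	{
		n->prev->next = n->next;
		n->next->prev = n->prev;
		n->prev = n->next = n;		/* mark as detached */
	}

	int main(void)
	{
		struct node head = { &head, &head, -1 };
		struct node *n, *next;
		int i, nr_shrink = 2;

		for (i = 0; i < 4; i++) {	/* build a 4-entry LRU */
			n = malloc(sizeof(*n));
			n->id = i;
			n->prev = head.prev;
			n->next = &head;
			head.prev->next = n;
			head.prev = n;
		}

		/* detach the nr_shrink coldest entries, then free them */
		for (n = head.next; n != &head && nr_shrink--; n = next) {
			next = n->next;
			list_del_init(n);
			printf("freeing node %d\n", n->id);
			free(n);
		}
		return 0;
	}
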
+void f2fs_destroy_extent_tree(struct inode *inode)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct extent_tree *et;
+	unsigned int node_cnt = 0;
+
+	if (!test_opt(sbi, EXTENT_CACHE))
+		return;
+
+	et = __find_extent_tree(sbi, inode->i_ino);
+	if (!et)
+		goto out;
+
+	/* free all extent info belonging to this extent tree */
+	write_lock(&et->lock);
+	node_cnt = __free_extent_tree(sbi, et, true);
+	write_unlock(&et->lock);
+
+	atomic_dec(&et->refcount);
+
+	/* try to find and delete extent tree entry in radix tree */
+	down_write(&sbi->extent_tree_lock);
+	et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino);
+	if (!et) {
+		up_write(&sbi->extent_tree_lock);
+		goto out;
+	}
+	f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
+	radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
+	kmem_cache_free(extent_tree_slab, et);
+	sbi->total_ext_tree--;
+	up_write(&sbi->extent_tree_lock);
+out:
+	trace_f2fs_destroy_extent_tree(inode, node_cnt);
 	return;
 }
 
+void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext)
+{
+	if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		f2fs_init_extent_tree(inode, i_ext);
+
+	write_lock(&F2FS_I(inode)->ext_lock);
+	get_extent_info(&F2FS_I(inode)->ext, *i_ext);
+	write_unlock(&F2FS_I(inode)->ext_lock);
+}
+
+static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs,
+							struct extent_info *ei)
+{
+	if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
+		return false;
+
+	if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE))
+		return f2fs_lookup_extent_tree(inode, pgofs, ei);
+
+	return lookup_extent_info(inode, pgofs, ei);
+}
+
+void f2fs_update_extent_cache(struct dnode_of_data *dn)
+{
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
+	pgoff_t fofs;
+
+	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
+
+	if (is_inode_flag_set(fi, FI_NO_EXTENT))
+		return;
+
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+
+	if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE))
+		return f2fs_update_extent_tree(dn->inode, fofs,
+							dn->data_blkaddr);
+
+	if (update_extent_info(dn->inode, fofs, dn->data_blkaddr))
+		sync_inode_page(dn);
+}
+
 struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
 	struct page *page;
+	struct extent_info ei;
 	int err;
 	struct f2fs_io_info fio = {
 		.type = DATA,
 		.rw = sync ? READ_SYNC : READA,
 	};
 
+	/*
+	 * If sync is false, it needs to check its block allocation.
+	 * This is needed and triggered by two flows:
+	 * gc and truncate_partial_data_page.
+	 */
+	if (!sync)
+		goto search;
+
 	page = find_get_page(mapping, index);
 	if (page && PageUptodate(page))
 		return page;
 	f2fs_put_page(page, 0);
+search:
+	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+		dn.data_blkaddr = ei.blk + index - ei.fofs;
+		goto got_it;
+	}
 
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
@@ -401,6 +948,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 	if (unlikely(dn.data_blkaddr == NEW_ADDR))
 		return ERR_PTR(-EINVAL);
 
+got_it:
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
@@ -435,6 +983,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
 	struct page *page;
+	struct extent_info ei;
 	int err;
 	struct f2fs_io_info fio = {
 		.type = DATA,
@@ -445,6 +994,11 @@ repeat:
 	if (!page)
 		return ERR_PTR(-ENOMEM);
 
+	if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+		dn.data_blkaddr = ei.blk + index - ei.fofs;
+		goto got_it;
+	}
+
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
 	err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
 	if (err) {
@@ -458,6 +1012,7 @@ repeat:
 		return ERR_PTR(-ENOENT);
 	}
 
+got_it:
 	if (PageUptodate(page))
 		return page;
 
@@ -569,19 +1124,26 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return -EPERM;
+
+	dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
+	if (dn->data_blkaddr == NEW_ADDR)
+		goto alloc;
+
 	if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
 		return -ENOSPC;
 
+alloc:
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 
 	if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
 		seg = CURSEG_DIRECT_IO;
 
-	allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg);
+	allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr,
+								&sum, seg);
 
 	/* direct IO doesn't use extent cache to maximize the performance */
-	__set_data_blkaddr(dn);
+	set_data_blkaddr(dn);
 
 	/* update i_size */
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -615,7 +1177,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
 		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
 
 		while (dn.ofs_in_node < end_offset && len) {
-			if (dn.data_blkaddr == NULL_ADDR) {
+			block_t blkaddr;
+
+			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+			if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
 				if (__allocate_data_block(&dn))
 					goto sync_out;
 				allocated = true;
@@ -659,13 +1224,16 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
 	pgoff_t pgofs, end_offset;
 	int err = 0, ofs = 1;
+	struct extent_info ei;
 	bool allocated = false;
 
 	/* Get the page offset from the block offset(iblock) */
 	pgofs =	(pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
 
-	if (check_extent_cache(inode, pgofs, bh_result))
+	if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
+		f2fs_map_bh(inode->i_sb, pgofs, &ei, bh_result);
 		goto out;
+	}
 
 	if (create)
 		f2fs_lock_op(F2FS_I_SB(inode));
@@ -682,7 +1250,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto put_out;
 
 	if (dn.data_blkaddr != NULL_ADDR) {
-		set_buffer_new(bh_result);
+		clear_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else if (create) {
 		err = __allocate_data_block(&dn);
@@ -727,6 +1295,7 @@ get_next:
 			if (err)
 				goto sync_out;
 			allocated = true;
+			set_buffer_new(bh_result);
 			blkaddr = dn.data_blkaddr;
 		}
 		/* Give more consecutive addresses for the readahead */
@@ -813,8 +1382,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	fio->blk_addr = dn.data_blkaddr;
 
 	/* This page is already truncated */
-	if (fio->blk_addr == NULL_ADDR)
+	if (fio->blk_addr == NULL_ADDR) {
+		ClearPageUptodate(page);
 		goto out_writepage;
+	}
 
 	set_page_writeback(page);
 
@@ -827,10 +1398,15 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 			need_inplace_update(inode))) {
 		rewrite_data_page(page, fio);
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
+		trace_f2fs_do_write_data_page(page, IPU);
 	} else {
 		write_data_page(page, &dn, fio);
-		update_extent_cache(&dn);
+		set_data_blkaddr(&dn);
+		f2fs_update_extent_cache(&dn);
+		trace_f2fs_do_write_data_page(page, OPU);
 		set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
+		if (page->index == 0)
+			set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
 	}
 out_writepage:
 	f2fs_put_dnode(&dn);
@@ -909,6 +1485,8 @@ done:
 	clear_cold_data(page);
 out:
 	inode_dec_dirty_pages(inode);
+	if (err)
+		ClearPageUptodate(page);
 	unlock_page(page);
 	if (need_balance_fs)
 		f2fs_balance_fs(sbi);
@@ -935,7 +1513,6 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	bool locked = false;
 	int ret;
 	long diff;
 
@@ -950,15 +1527,13 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 			available_free_memory(sbi, DIRTY_DENTS))
 		goto skip_write;
 
+	/* during POR, we don't need to trigger writepage at all. */
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto skip_write;
+
 	diff = nr_pages_to_write(sbi, DATA, wbc);
 
-	if (!S_ISDIR(inode->i_mode)) {
-		mutex_lock(&sbi->writepages);
-		locked = true;
-	}
 	ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
-	if (locked)
-		mutex_unlock(&sbi->writepages);
 
 	f2fs_submit_merged_bio(sbi, DATA, WRITE);
 
@@ -1236,6 +1811,37 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, get_data_block);
 }
 
+void init_extent_cache_info(struct f2fs_sb_info *sbi)
+{
+	INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO);
+	init_rwsem(&sbi->extent_tree_lock);
+	INIT_LIST_HEAD(&sbi->extent_list);
+	spin_lock_init(&sbi->extent_lock);
+	sbi->total_ext_tree = 0;
+	atomic_set(&sbi->total_ext_node, 0);
+}
+
+int __init create_extent_cache(void)
+{
+	extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree",
+			sizeof(struct extent_tree));
+	if (!extent_tree_slab)
+		return -ENOMEM;
+	extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node",
+			sizeof(struct extent_node));
+	if (!extent_node_slab) {
+		kmem_cache_destroy(extent_tree_slab);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void destroy_extent_cache(void)
+{
+	kmem_cache_destroy(extent_node_slab);
+	kmem_cache_destroy(extent_tree_slab);
+}
+
 const struct address_space_operations f2fs_dblock_aops = {
 	.readpage	= f2fs_read_data_page,
 	.readpages	= f2fs_read_data_pages,
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index e671373cc8ab..f5388f37217e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -35,6 +35,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	/* validation check of the segment numbers */
 	si->hit_ext = sbi->read_hit_ext;
 	si->total_ext = sbi->total_hit_ext;
+	si->ext_tree = sbi->total_ext_tree;
+	si->ext_node = atomic_read(&sbi->total_ext_node);
 	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
 	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
 	si->ndirty_dirs = sbi->n_dirty_dirs;
@@ -185,6 +187,9 @@ get_cache:
 	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
 	for (i = 0; i <= UPDATE_INO; i++)
 		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+	si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree);
+	si->cache_mem += atomic_read(&sbi->total_ext_node) *
+						sizeof(struct extent_node);
 
 	si->page_mem = 0;
 	npages = NODE_MAPPING(sbi)->nrpages;
@@ -260,13 +265,20 @@ static int stat_show(struct seq_file *s, void *v)
 	seq_printf(s, "CP calls: %d\n", si->cp_count);
 	seq_printf(s, "GC calls: %d (BG: %d)\n",
 			si->call_count, si->bg_gc);
-	seq_printf(s, "  - data segments : %d\n", si->data_segs);
-	seq_printf(s, "  - node segments : %d\n", si->node_segs);
-	seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
-	seq_printf(s, "  - data blocks : %d\n", si->data_blks);
-	seq_printf(s, "  - node blocks : %d\n", si->node_blks);
+	seq_printf(s, "  - data segments : %d (%d)\n",
+			si->data_segs, si->bg_data_segs);
+	seq_printf(s, "  - node segments : %d (%d)\n",
+			si->node_segs, si->bg_node_segs);
+	seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks,
+			si->bg_data_blks + si->bg_node_blks);
+	seq_printf(s, "  - data blocks : %d (%d)\n", si->data_blks,
+			si->bg_data_blks);
+	seq_printf(s, "  - node blocks : %d (%d)\n", si->node_blks,
+			si->bg_node_blks);
 	seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
 			si->hit_ext, si->total_ext);
+	seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree);
+	seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node);
 	seq_puts(s, "\nBalancing F2FS Async:\n");
 	seq_printf(s, " - inmem: %4d, wb: %4d\n",
 			si->inmem_pages, si->wb_pages);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b74097a7f6d9..3a3302ab7871 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -59,9 +59,8 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFLNK >> S_SHIFT]	= F2FS_FT_SYMLINK,
 };
 
-void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
+void set_de_type(struct f2fs_dir_entry *de, umode_t mode)
 {
-	umode_t mode = inode->i_mode;
 	de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
 }
 
@@ -127,22 +126,19 @@ struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots,
 	*max_slots = 0;
 	while (bit_pos < d->max) {
 		if (!test_bit_le(bit_pos, d->bitmap)) {
-			if (bit_pos == 0)
-				max_len = 1;
-			else if (!test_bit_le(bit_pos - 1, d->bitmap))
-				max_len++;
 			bit_pos++;
+			max_len++;
 			continue;
 		}
+
 		de = &d->dentry[bit_pos];
 		if (early_match_name(name->len, namehash, de) &&
 			!memcmp(d->filename[bit_pos], name->name, name->len))
 			goto found;
 
-		if (max_slots && *max_slots >= 0 && max_len > *max_slots) {
+		if (max_slots && max_len > *max_slots)
 			*max_slots = max_len;
-			max_len = 0;
-		}
+		max_len = 0;
 
 		/* remain bug on condition */
 		if (unlikely(!de->name_len))
@@ -219,14 +215,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 	unsigned int max_depth;
 	unsigned int level;
 
+	*res_page = NULL;
+
 	if (f2fs_has_inline_dentry(dir))
 		return find_in_inline_dir(dir, child, res_page);
 
 	if (npages == 0)
 		return NULL;
 
-	*res_page = NULL;
-
 	name_hash = f2fs_dentry_hash(child);
 	max_depth = F2FS_I(dir)->i_current_depth;
 
@@ -285,7 +281,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	lock_page(page);
 	f2fs_wait_on_page_writeback(page, type);
 	de->ino = cpu_to_le32(inode->i_ino);
-	set_de_type(de, inode);
+	set_de_type(de, inode->i_mode);
 	f2fs_dentry_kunmap(dir, page);
 	set_page_dirty(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -331,14 +327,14 @@ void do_make_empty_dir(struct inode *inode, struct inode *parent,
 	de->hash_code = 0;
 	de->ino = cpu_to_le32(inode->i_ino);
 	memcpy(d->filename[0], ".", 1);
-	set_de_type(de, inode);
+	set_de_type(de, inode->i_mode);
 
 	de = &d->dentry[1];
 	de->hash_code = 0;
 	de->name_len = cpu_to_le16(2);
 	de->ino = cpu_to_le32(parent->i_ino);
 	memcpy(d->filename[1], "..", 2);
-	set_de_type(de, inode);
+	set_de_type(de, parent->i_mode);
 
 	test_and_set_bit_le(0, (void *)d->bitmap);
 	test_and_set_bit_le(1, (void *)d->bitmap);
@@ -435,7 +431,7 @@ error:
 void update_parent_metadata(struct inode *dir, struct inode *inode,
 						unsigned int current_depth)
 {
-	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
+	if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
 		if (S_ISDIR(inode->i_mode)) {
 			inc_nlink(dir);
 			set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
@@ -450,7 +446,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode,
 		set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
 	}
 
-	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
+	if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
 		clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
 }
 
@@ -474,30 +470,47 @@ next:
 	goto next;
 }
 
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d,
+			const struct qstr *name, f2fs_hash_t name_hash,
+			unsigned int bit_pos)
+{
+	struct f2fs_dir_entry *de;
+	int slots = GET_DENTRY_SLOTS(name->len);
+	int i;
+
+	de = &d->dentry[bit_pos];
+	de->hash_code = name_hash;
+	de->name_len = cpu_to_le16(name->len);
+	memcpy(d->filename[bit_pos], name->name, name->len);
+	de->ino = cpu_to_le32(ino);
+	set_de_type(de, mode);
+	for (i = 0; i < slots; i++)
+		test_and_set_bit_le(bit_pos + i, (void *)d->bitmap);
+}
+
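f2fs_update_dentry fills in one entry and then marks every slot the name
occupies in the dentry bitmap, since long names span several slots. A sketch
of the slot math (SLOT_LEN mirrors what F2FS_SLOT_LEN is assumed to be, 8
bytes of name per slot; verify against f2fs.h):

	#include <stdio.h>

	#define SLOT_LEN	8			/* name bytes per slot */
	#define SLOTS(len)	(((len) + SLOT_LEN - 1) / SLOT_LEN)

	int main(void)
	{
		unsigned long bitmap = 0;		/* toy dentry bitmap */
		unsigned bit_pos = 3, len = 20, i;

		/* a 20-byte name occupies ceil(20/8) = 3 consecutive slots */
		for (i = 0; i < SLOTS(len); i++)
			bitmap |= 1UL << (bit_pos + i);
		printf("slots=%u bitmap=%#lx\n", SLOTS(len), bitmap);
		return 0;
	}
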
 /*
  * Caller should grab and release a rwsem by calling f2fs_lock_op() and
  * f2fs_unlock_op().
  */
 int __f2fs_add_link(struct inode *dir, const struct qstr *name,
-				struct inode *inode)
+				struct inode *inode, nid_t ino, umode_t mode)
 {
 	unsigned int bit_pos;
 	unsigned int level;
 	unsigned int current_depth;
 	unsigned long bidx, block;
 	f2fs_hash_t dentry_hash;
-	struct f2fs_dir_entry *de;
 	unsigned int nbucket, nblock;
 	size_t namelen = name->len;
 	struct page *dentry_page = NULL;
 	struct f2fs_dentry_block *dentry_blk = NULL;
+	struct f2fs_dentry_ptr d;
 	int slots = GET_DENTRY_SLOTS(namelen);
-	struct page *page;
+	struct page *page = NULL;
 	int err = 0;
-	int i;
 
 	if (f2fs_has_inline_dentry(dir)) {
-		err = f2fs_add_inline_entry(dir, name, inode);
+		err = f2fs_add_inline_entry(dir, name, inode, ino, mode);
 		if (!err || err != -EAGAIN)
 			return err;
 		else
@@ -547,30 +560,31 @@ start:
 add_dentry:
 	f2fs_wait_on_page_writeback(dentry_page, DATA);
 
-	down_write(&F2FS_I(inode)->i_sem);
-	page = init_inode_metadata(inode, dir, name, NULL);
-	if (IS_ERR(page)) {
-		err = PTR_ERR(page);
-		goto fail;
+	if (inode) {
+		down_write(&F2FS_I(inode)->i_sem);
+		page = init_inode_metadata(inode, dir, name, NULL);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto fail;
+		}
 	}
-	de = &dentry_blk->dentry[bit_pos];
-	de->hash_code = dentry_hash;
-	de->name_len = cpu_to_le16(namelen);
-	memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
-	de->ino = cpu_to_le32(inode->i_ino);
-	set_de_type(de, inode);
-	for (i = 0; i < slots; i++)
-		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+	make_dentry_ptr(&d, (void *)dentry_blk, 1);
+	f2fs_update_dentry(ino, mode, &d, name, dentry_hash, bit_pos);
+
 	set_page_dirty(dentry_page);
 
-	/* we don't need to mark_inode_dirty now */
-	F2FS_I(inode)->i_pino = dir->i_ino;
-	update_inode(inode, page);
-	f2fs_put_page(page, 1);
+	if (inode) {
+		/* we don't need to mark_inode_dirty now */
+		F2FS_I(inode)->i_pino = dir->i_ino;
+		update_inode(inode, page);
+		f2fs_put_page(page, 1);
+	}
 
 	update_parent_metadata(dir, inode, current_depth);
 fail:
-	up_write(&F2FS_I(inode)->i_sem);
+	if (inode)
+		up_write(&F2FS_I(inode)->i_sem);
 
 	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
 		update_inode_page(dir);
@@ -669,6 +683,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	if (bit_pos == NR_DENTRY_IN_BLOCK) {
 		truncate_hole(dir, page->index, page->index + 1);
 		clear_page_dirty_for_io(page);
+		ClearPagePrivate(page);
 		ClearPageUptodate(page);
 		inode_dec_dirty_pages(dir);
 	}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7fa3313ab0e2..c06a25e5cec3 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -50,6 +50,7 @@
 #define F2FS_MOUNT_FLUSH_MERGE		0x00000400
 #define F2FS_MOUNT_NOBARRIER		0x00000800
 #define F2FS_MOUNT_FASTBOOT		0x00001000
+#define F2FS_MOUNT_EXTENT_CACHE		0x00002000
 
 #define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -102,6 +103,7 @@ enum {
 	CP_UMOUNT,
 	CP_FASTBOOT,
 	CP_SYNC,
+	CP_RECOVERY,
 	CP_DISCARD,
 };
 
@@ -216,6 +218,15 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
 #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 
+/*
+ * should be same as XFS_IOC_GOINGDOWN.
+ * Flags for going down operation used by FS_IOC_GOINGDOWN
+ */
+#define F2FS_IOC_SHUTDOWN	_IOR('X', 125, __u32)	/* Shutdown */
+#define F2FS_GOING_DOWN_FULLSYNC	0x0	/* going down with full sync */
+#define F2FS_GOING_DOWN_METASYNC	0x1	/* going down with metadata */
+#define F2FS_GOING_DOWN_NOSYNC		0x2	/* going down */
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
@@ -273,14 +284,34 @@ enum {
 
 #define MAX_DIR_RA_PAGES	4	/* maximum ra pages of dir */
 
+/* vector size for gang look-up from extent cache that consists of radix tree */
+#define EXT_TREE_VEC_SIZE	64
+
 /* for in-memory extent cache entry */
-#define F2FS_MIN_EXTENT_LEN	16	/* minimum extent length */
+#define F2FS_MIN_EXTENT_LEN	64	/* minimum extent length */
+
+/* number of extent info in extent cache we try to shrink */
+#define EXTENT_CACHE_SHRINK_NUMBER	128
 
 struct extent_info {
-	rwlock_t ext_lock;	/* rwlock for consistency */
-	unsigned int fofs;	/* start offset in a file */
-	u32 blk_addr;		/* start block address of the extent */
-	unsigned int len;	/* length of the extent */
+	unsigned int fofs;	/* start offset in a file */
+	u32 blk;		/* start block address of the extent */
+	unsigned int len;	/* length of the extent */
+};
+
+struct extent_node {
+	struct rb_node rb_node;		/* rb node located in rb-tree */
+	struct list_head list;		/* node in global extent list of sbi */
+	struct extent_info ei;		/* extent info */
+};
+
+struct extent_tree {
+	nid_t ino;			/* inode number */
+	struct rb_root root;		/* root of extent info rb-tree */
+	struct extent_node *cached_en;	/* recently accessed extent node */
+	rwlock_t lock;			/* protect extent info rb-tree */
+	atomic_t refcount;		/* reference count of rb-tree */
+	unsigned int count;		/* # of extent node in rb-tree */
 };
 
 /*
@@ -309,6 +340,7 @@ struct f2fs_inode_info {
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct extent_info ext;		/* in-memory extent cache entry */
+	rwlock_t ext_lock;		/* rwlock for single extent cache */
 	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
@@ -319,21 +351,51 @@ struct f2fs_inode_info {
 static inline void get_extent_info(struct extent_info *ext,
 					struct f2fs_extent i_ext)
 {
-	write_lock(&ext->ext_lock);
 	ext->fofs = le32_to_cpu(i_ext.fofs);
-	ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
+	ext->blk = le32_to_cpu(i_ext.blk);
 	ext->len = le32_to_cpu(i_ext.len);
-	write_unlock(&ext->ext_lock);
 }
 
 static inline void set_raw_extent(struct extent_info *ext,
 					struct f2fs_extent *i_ext)
 {
-	read_lock(&ext->ext_lock);
 	i_ext->fofs = cpu_to_le32(ext->fofs);
-	i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
+	i_ext->blk = cpu_to_le32(ext->blk);
 	i_ext->len = cpu_to_le32(ext->len);
-	read_unlock(&ext->ext_lock);
+}
+
+static inline void set_extent_info(struct extent_info *ei, unsigned int fofs,
+					u32 blk, unsigned int len)
+{
+	ei->fofs = fofs;
+	ei->blk = blk;
+	ei->len = len;
+}
+
+static inline bool __is_extent_same(struct extent_info *ei1,
+					struct extent_info *ei2)
+{
+	return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk &&
+					ei1->len == ei2->len);
+}
+
+static inline bool __is_extent_mergeable(struct extent_info *back,
+					struct extent_info *front)
+{
+	return (back->fofs + back->len == front->fofs &&
+			back->blk + back->len == front->blk);
+}
+
+static inline bool __is_back_mergeable(struct extent_info *cur,
+					struct extent_info *back)
+{
+	return __is_extent_mergeable(back, cur);
+}
+
+static inline bool __is_front_mergeable(struct extent_info *cur,
+					struct extent_info *front)
+{
+	return __is_extent_mergeable(cur, front);
 }
338 400
339struct f2fs_nm_info { 401struct f2fs_nm_info {
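Note: the mergeability helpers above encode a single rule: two extents can collapse into one only when both the file-offset range and the on-disk block range are contiguous. A worked example with hypothetical values:

/* back covers file offsets [0, 4) mapped to blocks [100, 104) */
struct extent_info back  = { .fofs = 0, .blk = 100, .len = 4 };
struct extent_info front = { .fofs = 4, .blk = 104, .len = 2 };

/*
 * 0 + 4 == 4 and 100 + 4 == 104, so __is_extent_mergeable(&back, &front)
 * is true and the pair can be replaced by {fofs = 0, blk = 100, len = 6}.
 * Had front started at blk 200 instead, the file offsets would still
 * touch but the blocks would not, and no merge would happen.
 */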
@@ -502,6 +564,10 @@ enum page_type {
 	META,
 	NR_PAGE_TYPE,
 	META_FLUSH,
+	INMEM,		/* the below types are used by tracepoints only. */
+	INMEM_DROP,
+	IPU,
+	OPU,
 };
 
 struct f2fs_io_info {
@@ -559,7 +625,6 @@ struct f2fs_sb_info {
 	struct mutex cp_mutex;			/* checkpoint procedure lock */
 	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
 	struct rw_semaphore node_write;		/* locking node writes */
-	struct mutex writepages;		/* mutex for writepages() */
 	wait_queue_head_t cp_wait;
 
 	struct inode_management im[MAX_INO_ENTRY];	/* manage inode cache */
@@ -571,6 +636,14 @@ struct f2fs_sb_info {
 	struct list_head dir_inode_list;	/* dir inode list */
 	spinlock_t dir_inode_lock;		/* for dir inode list lock */
 
+	/* for extent tree cache */
+	struct radix_tree_root extent_tree_root;/* cache extent cache entries */
+	struct rw_semaphore extent_tree_lock;	/* locking extent radix tree */
+	struct list_head extent_list;		/* lru list for shrinker */
+	spinlock_t extent_lock;			/* locking extent lru list */
+	int total_ext_tree;			/* extent tree count */
+	atomic_t total_ext_node;		/* extent info count */
+
 	/* basic filesystem units */
 	unsigned int log_sectors_per_block;	/* log2 sectors per block */
 	unsigned int log_blocksize;		/* log2 block size */
@@ -920,12 +993,17 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
 	return 0;
 }
 
+static inline block_t __cp_payload(struct f2fs_sb_info *sbi)
+{
+	return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+}
+
 static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	int offset;
 
-	if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) {
+	if (__cp_payload(sbi) > 0) {
 		if (flag == NAT_BITMAP)
 			return &ckpt->sit_nat_version_bitmap;
 		else
@@ -1166,8 +1244,10 @@ enum {
 	FI_NEED_IPU,		/* used for ipu per file */
 	FI_ATOMIC_FILE,		/* indicate atomic file */
 	FI_VOLATILE_FILE,	/* indicate volatile file */
+	FI_FIRST_BLOCK_WRITTEN,	/* indicate #0 data block was written */
 	FI_DROP_CACHE,		/* drop dirty page cache */
 	FI_DATA_EXIST,		/* indicate data exists */
+	FI_INLINE_DOTS,		/* indicate inline dot dentries */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1204,6 +1284,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
 		set_inode_flag(fi, FI_INLINE_DENTRY);
 	if (ri->i_inline & F2FS_DATA_EXIST)
 		set_inode_flag(fi, FI_DATA_EXIST);
+	if (ri->i_inline & F2FS_INLINE_DOTS)
+		set_inode_flag(fi, FI_INLINE_DOTS);
 }
 
 static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -1219,6 +1301,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
 		ri->i_inline |= F2FS_INLINE_DENTRY;
 	if (is_inode_flag_set(fi, FI_DATA_EXIST))
 		ri->i_inline |= F2FS_DATA_EXIST;
+	if (is_inode_flag_set(fi, FI_INLINE_DOTS))
+		ri->i_inline |= F2FS_INLINE_DOTS;
 }
 
 static inline int f2fs_has_inline_xattr(struct inode *inode)
@@ -1264,6 +1348,11 @@ static inline int f2fs_exist_data(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST);
 }
 
+static inline int f2fs_has_inline_dots(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS);
+}
+
 static inline bool f2fs_is_atomic_file(struct inode *inode)
 {
 	return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
@@ -1274,6 +1363,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
 }
 
+static inline bool f2fs_is_first_block_written(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+}
+
 static inline bool f2fs_is_drop_cache(struct inode *inode)
 {
 	return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
@@ -1290,12 +1384,6 @@ static inline int f2fs_has_inline_dentry(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY);
 }
 
-static inline void *inline_dentry_addr(struct page *page)
-{
-	struct f2fs_inode *ri = F2FS_INODE(page);
-	return (void *)&(ri->i_addr[1]);
-}
-
 static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page)
 {
 	if (!f2fs_has_inline_dentry(dir))
@@ -1363,7 +1451,7 @@ struct dentry *f2fs_get_parent(struct dentry *child);
  * dir.c
  */
 extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
-void set_de_type(struct f2fs_dir_entry *, struct inode *);
+void set_de_type(struct f2fs_dir_entry *, umode_t);
 struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *,
 						struct f2fs_dentry_ptr *);
 bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
@@ -1382,7 +1470,10 @@ ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
 void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
 				struct page *, struct inode *);
 int update_dent_inode(struct inode *, const struct qstr *);
+void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *,
+			const struct qstr *, f2fs_hash_t , unsigned int);
-int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
+int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t,
+			umode_t);
 void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *,
 							struct inode *);
 int f2fs_do_tmpfile(struct inode *, struct inode *);
@@ -1392,7 +1483,7 @@ bool f2fs_empty_dir(struct inode *);
 static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
 {
 	return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name,
-				inode);
+				inode, inode->i_ino, inode->i_mode);
 }
 
 /*
@@ -1519,14 +1610,22 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
 				struct f2fs_io_info *);
 void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
 						struct f2fs_io_info *);
+void set_data_blkaddr(struct dnode_of_data *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(struct dnode_of_data *);
+void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
+void f2fs_destroy_extent_tree(struct inode *);
+void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *);
+void f2fs_update_extent_cache(struct dnode_of_data *);
+void f2fs_preserve_extent_tree(struct inode *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct page *, struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void init_extent_cache_info(struct f2fs_sb_info *);
+int __init create_extent_cache(void);
+void destroy_extent_cache(void);
 void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
 int f2fs_release_page(struct page *, gfp_t);
 
@@ -1554,7 +1653,7 @@ struct f2fs_stat_info {
 	struct f2fs_sb_info *sbi;
 	int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
 	int main_area_segs, main_area_sections, main_area_zones;
-	int hit_ext, total_ext;
+	int hit_ext, total_ext, ext_tree, ext_node;
 	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
 	int nats, dirty_nats, sits, dirty_sits, fnids;
 	int total_count, utilization;
@@ -1566,7 +1665,9 @@ struct f2fs_stat_info {
 	int dirty_count, node_pages, meta_pages;
 	int prefree_count, call_count, cp_count;
 	int tot_segs, node_segs, data_segs, free_segs, free_secs;
+	int bg_node_segs, bg_data_segs;
 	int tot_blks, data_blks, node_blks;
+	int bg_data_blks, bg_node_blks;
 	int curseg[NR_CURSEG_TYPE];
 	int cursec[NR_CURSEG_TYPE];
 	int curzone[NR_CURSEG_TYPE];
@@ -1615,31 +1716,36 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 		((sbi)->block_count[(curseg)->alloc_type]++)
 #define stat_inc_inplace_blocks(sbi)					\
 		(atomic_inc(&(sbi)->inplace_count))
-#define stat_inc_seg_count(sbi, type)					\
+#define stat_inc_seg_count(sbi, type, gc_type)				\
 	do {								\
 		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
 		(si)->tot_segs++;					\
-		if (type == SUM_TYPE_DATA)				\
+		if (type == SUM_TYPE_DATA) {				\
 			si->data_segs++;				\
-		else							\
+			si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0;	\
+		} else {						\
 			si->node_segs++;				\
+			si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0;	\
+		}							\
 	} while (0)
 
 #define stat_inc_tot_blk_count(si, blks)				\
 	(si->tot_blks += (blks))
 
-#define stat_inc_data_blk_count(sbi, blks)				\
+#define stat_inc_data_blk_count(sbi, blks, gc_type)			\
 	do {								\
 		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
 		stat_inc_tot_blk_count(si, blks);			\
 		si->data_blks += (blks);				\
+		si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0;	\
 	} while (0)
 
-#define stat_inc_node_blk_count(sbi, blks)				\
+#define stat_inc_node_blk_count(sbi, blks, gc_type)			\
 	do {								\
 		struct f2fs_stat_info *si = F2FS_STAT(sbi);		\
 		stat_inc_tot_blk_count(si, blks);			\
 		si->node_blks += (blks);				\
+		si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0;	\
 	} while (0)
 
 int f2fs_build_stats(struct f2fs_sb_info *);
@@ -1661,10 +1767,10 @@ void f2fs_destroy_root_stats(void);
 #define stat_inc_seg_type(sbi, curseg)
 #define stat_inc_block_count(sbi, curseg)
 #define stat_inc_inplace_blocks(sbi)
-#define stat_inc_seg_count(si, type)
+#define stat_inc_seg_count(sbi, type, gc_type)
 #define stat_inc_tot_blk_count(si, blks)
-#define stat_inc_data_blk_count(si, blks)
-#define stat_inc_node_blk_count(sbi, blks)
+#define stat_inc_data_blk_count(sbi, blks, gc_type)
+#define stat_inc_node_blk_count(sbi, blks, gc_type)
 
 static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
 static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
@@ -1688,6 +1794,7 @@ extern struct kmem_cache *inode_entry_slab;
 */
 bool f2fs_may_inline(struct inode *);
 void read_inline_data(struct page *, struct page *);
+bool truncate_inline_inode(struct page *, u64);
 int f2fs_read_inline_data(struct inode *, struct page *);
 int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
 int f2fs_convert_inline_inode(struct inode *);
@@ -1697,7 +1804,8 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
 							struct page **);
 struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
 int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
-int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *);
+int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
+							nid_t, umode_t);
 void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
 						struct inode *, struct inode *);
 bool f2fs_empty_inline_dir(struct inode *);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index df6a0596eccf..a6f3f6186588 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -241,6 +241,8 @@ go_write:
 		 * will be used only for fsynced inodes after checkpoint.
 		 */
 		try_to_fix_pino(inode);
+		clear_inode_flag(fi, FI_APPEND_WRITE);
+		clear_inode_flag(fi, FI_UPDATE_WRITE);
 		goto out;
 	}
sync_nodes:
@@ -433,8 +435,12 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 			continue;
 
 		dn->data_blkaddr = NULL_ADDR;
-		update_extent_cache(dn);
+		set_data_blkaddr(dn);
+		f2fs_update_extent_cache(dn);
 		invalidate_blocks(sbi, blkaddr);
+		if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
+			clear_inode_flag(F2FS_I(dn->inode),
+						FI_FIRST_BLOCK_WRITTEN);
 		nr_free++;
 	}
 	if (nr_free) {
@@ -454,15 +460,16 @@ void truncate_data_blocks(struct dnode_of_data *dn)
 	truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
 }
 
-static int truncate_partial_data_page(struct inode *inode, u64 from)
+static int truncate_partial_data_page(struct inode *inode, u64 from,
+								bool force)
 {
 	unsigned offset = from & (PAGE_CACHE_SIZE - 1);
 	struct page *page;
 
-	if (!offset)
+	if (!offset && !force)
 		return 0;
 
-	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false);
+	page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, force);
 	if (IS_ERR(page))
 		return 0;
 
@@ -473,7 +480,8 @@ static int truncate_partial_data_page(struct inode *inode, u64 from)
 
 	f2fs_wait_on_page_writeback(page, DATA);
 	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
-	set_page_dirty(page);
+	if (!force)
+		set_page_dirty(page);
out:
 	f2fs_put_page(page, 1);
 	return 0;
@@ -487,6 +495,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
 	pgoff_t free_from;
 	int count = 0, err = 0;
 	struct page *ipage;
+	bool truncate_page = false;
 
 	trace_f2fs_truncate_blocks_enter(inode, from);
 
@@ -502,7 +511,10 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
 	}
 
 	if (f2fs_has_inline_data(inode)) {
+		if (truncate_inline_inode(ipage, from))
+			set_page_dirty(ipage);
 		f2fs_put_page(ipage, 1);
+		truncate_page = true;
 		goto out;
 	}
 
@@ -533,7 +545,7 @@ out:
 
 	/* lastly zero out the first data page */
 	if (!err)
-		err = truncate_partial_data_page(inode, from);
+		err = truncate_partial_data_page(inode, from, truncate_page);
 
 	trace_f2fs_truncate_blocks_exit(inode, err);
 	return err;
@@ -997,6 +1009,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
 	if (!f2fs_is_volatile_file(inode))
 		return 0;
 
+	if (!f2fs_is_first_block_written(inode))
+		return truncate_partial_data_page(inode, 0, true);
+
 	punch_hole(inode, 0, F2FS_BLKSIZE);
 	return 0;
 }
@@ -1029,6 +1044,41 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	return ret;
 }
 
+static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct super_block *sb = sbi->sb;
+	__u32 in;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(in, (__u32 __user *)arg))
+		return -EFAULT;
+
+	switch (in) {
+	case F2FS_GOING_DOWN_FULLSYNC:
+		sb = freeze_bdev(sb->s_bdev);
+		if (sb && !IS_ERR(sb)) {
+			f2fs_stop_checkpoint(sbi);
+			thaw_bdev(sb->s_bdev, sb);
+		}
+		break;
+	case F2FS_GOING_DOWN_METASYNC:
+		/* do checkpoint only */
+		f2fs_sync_fs(sb, 1);
+		f2fs_stop_checkpoint(sbi);
+		break;
+	case F2FS_GOING_DOWN_NOSYNC:
+		f2fs_stop_checkpoint(sbi);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
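Note: a userspace sketch of driving the new shutdown ioctl (assumptions: the handler only uses the descriptor to reach the superblock, so any fd on the mounted f2fs volume works; the mountpoint path is a placeholder; CAP_SYS_ADMIN is required, per the capable() check above):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>

#define F2FS_IOC_SHUTDOWN		_IOR('X', 125, __u32)
#define F2FS_GOING_DOWN_METASYNC	0x1

int main(void)
{
	__u32 how = F2FS_GOING_DOWN_METASYNC;	/* checkpoint, then stop */
	int fd = open("/mnt/f2fs", O_RDONLY);	/* hypothetical mountpoint */

	if (fd < 0 || ioctl(fd, F2FS_IOC_SHUTDOWN, &how) < 0) {
		perror("F2FS_IOC_SHUTDOWN");
		return 1;
	}
	close(fd);
	return 0;
}

FULLSYNC additionally freezes the block device before stopping checkpoints, and NOSYNC stops them with no sync at all, matching the three cases in the switch above.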
@@ -1078,6 +1128,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_release_volatile_write(filp);
 	case F2FS_IOC_ABORT_VOLATILE_WRITE:
 		return f2fs_ioc_abort_volatile_write(filp);
+	case F2FS_IOC_SHUTDOWN:
+		return f2fs_ioc_shutdown(filp, arg);
 	case FITRIM:
 		return f2fs_ioc_fitrim(filp, arg);
 	default:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 76adbc3641f1..ed58211fe79b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -435,7 +435,7 @@ next_step:
 			set_page_dirty(node_page);
 		}
 		f2fs_put_page(node_page, 1);
-		stat_inc_node_blk_count(sbi, 1);
+		stat_inc_node_blk_count(sbi, 1, gc_type);
 	}
 
 	if (initial) {
@@ -622,7 +622,7 @@ next_step:
 			if (IS_ERR(data_page))
 				continue;
 			move_data_page(inode, data_page, gc_type);
-			stat_inc_data_blk_count(sbi, 1);
+			stat_inc_data_blk_count(sbi, 1, gc_type);
 		}
 	}
 
@@ -680,7 +680,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
 	}
 	blk_finish_plug(&plug);
 
-	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
+	stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type);
 	stat_inc_call_count(sbi->stat_info);
 
 	f2fs_put_page(sum_page, 1);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 1484c00133cd..8140e4f0e538 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -21,7 +21,7 @@ bool f2fs_may_inline(struct inode *inode)
 	if (f2fs_is_atomic_file(inode))
 		return false;
 
-	if (!S_ISREG(inode->i_mode))
+	if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
 		return false;
 
 	if (i_size_read(inode) > MAX_INLINE_DATA)
@@ -50,10 +50,19 @@ void read_inline_data(struct page *page, struct page *ipage)
 	SetPageUptodate(page);
 }
 
-static void truncate_inline_data(struct page *ipage)
+bool truncate_inline_inode(struct page *ipage, u64 from)
 {
+	void *addr;
+
+	if (from >= MAX_INLINE_DATA)
+		return false;
+
+	addr = inline_data_addr(ipage);
+
 	f2fs_wait_on_page_writeback(ipage, NODE);
-	memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA);
+	memset(addr + from, 0, MAX_INLINE_DATA - from);
+
+	return true;
 }
 
 int f2fs_read_inline_data(struct inode *inode, struct page *page)
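Note: the new helper generalizes the old all-or-nothing clear: bytes [0, from) of the inline area survive and [from, MAX_INLINE_DATA) are zeroed, with the boolean return telling the caller whether anything was touched. A userspace model of the rule (MAX_INLINE_DATA is assumed here to be the 3488 bytes of the default inode layout):

#include <string.h>

#define MAX_INLINE_DATA_DEMO	3488	/* assumed default inline capacity */

/* Mirror of the zeroing rule: keep [0, from), clear [from, end). */
static int truncate_inline_demo(char *area, unsigned long from)
{
	if (from >= MAX_INLINE_DATA_DEMO)
		return 0;	/* nothing inline to clear */
	memset(area + from, 0, MAX_INLINE_DATA_DEMO - from);
	return 1;	/* caller should mark the node page dirty */
}

A full clear is the from == 0 case, which is why every old truncate_inline_data() call site below becomes truncate_inline_inode(..., 0).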
@@ -122,7 +131,8 @@ no_update:
 	set_page_writeback(page);
 	fio.blk_addr = dn->data_blkaddr;
 	write_data_page(page, dn, &fio);
-	update_extent_cache(dn);
+	set_data_blkaddr(dn);
+	f2fs_update_extent_cache(dn);
 	f2fs_wait_on_page_writeback(page, DATA);
 	if (dirty)
 		inode_dec_dirty_pages(dn->inode);
@@ -131,7 +141,7 @@ no_update:
 	set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
 
 	/* clear inline data and flag after data writeback */
-	truncate_inline_data(dn->inode_page);
+	truncate_inline_inode(dn->inode_page, 0);
clear_out:
 	stat_dec_inline_inode(dn->inode);
 	f2fs_clear_inline_inode(dn->inode);
@@ -245,7 +255,7 @@ process_inline:
 	if (f2fs_has_inline_data(inode)) {
 		ipage = get_node_page(sbi, inode->i_ino);
 		f2fs_bug_on(sbi, IS_ERR(ipage));
-		truncate_inline_data(ipage);
+		truncate_inline_inode(ipage, 0);
 		f2fs_clear_inline_inode(inode);
 		update_inode(inode, ipage);
 		f2fs_put_page(ipage, 1);
@@ -363,7 +373,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
 	set_page_dirty(page);
 
 	/* clear inline dir and flag after data writeback */
-	truncate_inline_data(ipage);
+	truncate_inline_inode(ipage, 0);
 
 	stat_dec_inline_dir(dir);
 	clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
@@ -380,21 +390,18 @@ out:
 }
 
 int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
-						struct inode *inode)
+			struct inode *inode, nid_t ino, umode_t mode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct page *ipage;
 	unsigned int bit_pos;
 	f2fs_hash_t name_hash;
-	struct f2fs_dir_entry *de;
 	size_t namelen = name->len;
 	struct f2fs_inline_dentry *dentry_blk = NULL;
+	struct f2fs_dentry_ptr d;
 	int slots = GET_DENTRY_SLOTS(namelen);
-	struct page *page;
+	struct page *page = NULL;
 	int err = 0;
-	int i;
-
-	name_hash = f2fs_dentry_hash(name);
 
 	ipage = get_node_page(sbi, dir->i_ino);
 	if (IS_ERR(ipage))
@@ -410,32 +417,34 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name,
 		goto out;
 	}
 
-	down_write(&F2FS_I(inode)->i_sem);
-	page = init_inode_metadata(inode, dir, name, ipage);
-	if (IS_ERR(page)) {
-		err = PTR_ERR(page);
-		goto fail;
+	if (inode) {
+		down_write(&F2FS_I(inode)->i_sem);
+		page = init_inode_metadata(inode, dir, name, ipage);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto fail;
+		}
 	}
 
 	f2fs_wait_on_page_writeback(ipage, NODE);
-	de = &dentry_blk->dentry[bit_pos];
-	de->hash_code = name_hash;
-	de->name_len = cpu_to_le16(namelen);
-	memcpy(dentry_blk->filename[bit_pos], name->name, name->len);
-	de->ino = cpu_to_le32(inode->i_ino);
-	set_de_type(de, inode);
-	for (i = 0; i < slots; i++)
-		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
+
+	name_hash = f2fs_dentry_hash(name);
+	make_dentry_ptr(&d, (void *)dentry_blk, 2);
+	f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos);
+
 	set_page_dirty(ipage);
 
 	/* we don't need to mark_inode_dirty now */
-	F2FS_I(inode)->i_pino = dir->i_ino;
-	update_inode(inode, page);
-	f2fs_put_page(page, 1);
+	if (inode) {
+		F2FS_I(inode)->i_pino = dir->i_ino;
+		update_inode(inode, page);
+		f2fs_put_page(page, 1);
+	}
 
 	update_parent_metadata(dir, inode, 0);
fail:
-	up_write(&F2FS_I(inode)->i_sem);
+	if (inode)
+		up_write(&F2FS_I(inode)->i_sem);
 
 	if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
 		update_inode(dir, ipage);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2d002e3738a7..e622ec95409e 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -51,6 +51,15 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
 	}
 }
 
+static bool __written_first_block(struct f2fs_inode *ri)
+{
+	block_t addr = le32_to_cpu(ri->i_addr[0]);
+
+	if (addr != NEW_ADDR && addr != NULL_ADDR)
+		return true;
+	return false;
+}
+
 static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
 {
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -130,7 +139,8 @@ static int do_read_inode(struct inode *inode)
 	fi->i_pino = le32_to_cpu(ri->i_pino);
 	fi->i_dir_level = ri->i_dir_level;
 
-	get_extent_info(&fi->ext, ri->i_ext);
+	f2fs_init_extent_cache(inode, &ri->i_ext);
+
 	get_inline_info(fi, ri);
 
 	/* check data exist */
@@ -140,6 +150,9 @@ static int do_read_inode(struct inode *inode)
 	/* get rdev by using inline_info */
 	__get_inode_rdev(inode, ri);
 
+	if (__written_first_block(ri))
+		set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN);
+
 	f2fs_put_page(node_page, 1);
 
 	stat_inc_inline_inode(inode);
@@ -220,7 +233,11 @@ void update_inode(struct inode *inode, struct page *node_page)
 	ri->i_links = cpu_to_le32(inode->i_nlink);
 	ri->i_size = cpu_to_le64(i_size_read(inode));
 	ri->i_blocks = cpu_to_le64(inode->i_blocks);
+
+	read_lock(&F2FS_I(inode)->ext_lock);
 	set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
+	read_unlock(&F2FS_I(inode)->ext_lock);
+
 	set_raw_inline(F2FS_I(inode), ri);
 
 	ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -328,6 +345,12 @@ void f2fs_evict_inode(struct inode *inode)
no_delete:
 	stat_dec_inline_dir(inode);
 	stat_dec_inline_inode(inode);
+
+	/* update extent info in inode */
+	if (inode->i_nlink)
+		f2fs_preserve_extent_tree(inode);
+	f2fs_destroy_extent_tree(inode);
+
 	invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
 	if (xnid)
 		invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index e79639a9787a..407dde3d7a92 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -14,6 +14,7 @@
 #include <linux/sched.h>
 #include <linux/ctype.h>
 #include <linux/dcache.h>
+#include <linux/namei.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -187,6 +188,44 @@ struct dentry *f2fs_get_parent(struct dentry *child)
 	return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino));
 }
 
+static int __recover_dot_dentries(struct inode *dir, nid_t pino)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
+	struct qstr dot = QSTR_INIT(".", 1);
+	struct qstr dotdot = QSTR_INIT("..", 2);
+	struct f2fs_dir_entry *de;
+	struct page *page;
+	int err = 0;
+
+	f2fs_lock_op(sbi);
+
+	de = f2fs_find_entry(dir, &dot, &page);
+	if (de) {
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+	} else {
+		err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR);
+		if (err)
+			goto out;
+	}
+
+	de = f2fs_find_entry(dir, &dotdot, &page);
+	if (de) {
+		f2fs_dentry_kunmap(dir, page);
+		f2fs_put_page(page, 0);
+	} else {
+		err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
+	}
+out:
+	if (!err) {
+		clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS);
+		mark_inode_dirty(dir);
+	}
+
+	f2fs_unlock_op(sbi);
+	return err;
+}
+
 static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 							unsigned int flags)
 {
@@ -206,6 +245,16 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = f2fs_iget(dir->i_sb, ino);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
+
+		if (f2fs_has_inline_dots(inode)) {
+			int err;
+
+			err = __recover_dot_dentries(inode, dir->i_ino);
+			if (err) {
+				iget_failed(inode);
+				return ERR_PTR(err);
+			}
+		}
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -247,6 +296,23 @@ fail:
 	return err;
 }
 
+static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct page *page;
+
+	page = page_follow_link_light(dentry, nd);
+	if (IS_ERR(page))
+		return page;
+
+	/* this is broken symlink case */
+	if (*nd_get_link(nd) == 0) {
+		kunmap(page);
+		page_cache_release(page);
+		return ERR_PTR(-ENOENT);
+	}
+	return page;
+}
+
 static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 					const char *symname)
 {
@@ -276,6 +342,17 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 	unlock_new_inode(inode);
 
+	/*
+	 * Let's flush symlink data in order to avoid broken symlink as much as
+	 * possible. Nevertheless, fsyncing is the best way, but there is no
+	 * way to get a file descriptor in order to flush that.
+	 *
+	 * Note that, it needs to do dir->fsync to make this recoverable.
+	 * If the symlink path is stored into inline_data, there is no
+	 * performance regression.
+	 */
+	filemap_write_and_wait_range(inode->i_mapping, 0, symlen - 1);
+
 	if (IS_DIRSYNC(dir))
 		f2fs_sync_fs(sbi->sb, 1);
 	return err;
@@ -693,6 +770,8 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	f2fs_unlock_op(sbi);
 
 	alloc_nid_done(sbi, inode->i_ino);
+
+	stat_inc_inline_inode(inode);
 	d_tmpfile(dentry, inode);
 	unlock_new_inode(inode);
 	return 0;
@@ -729,7 +808,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
 
 const struct inode_operations f2fs_symlink_inode_operations = {
 	.readlink       = generic_readlink,
-	.follow_link    = page_follow_link_light,
+	.follow_link    = f2fs_follow_link,
 	.put_link       = page_put_link,
 	.getattr	= f2fs_getattr,
 	.setattr	= f2fs_setattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 97bd9d3db882..8ab0cf1930bd 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -41,7 +41,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 	/* only uses low memory */
 	avail_ram = val.totalram - val.totalhigh;
 
-	/* give 25%, 25%, 50%, 50% memory for each components respectively */
+	/*
+	 * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
+	 */
 	if (type == FREE_NIDS) {
 		mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >>
 							PAGE_CACHE_SHIFT;
@@ -62,6 +64,11 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 		mem_size += (sbi->im[i].ino_num *
 				sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+	} else if (type == EXTENT_CACHE) {
+		mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) +
+				atomic_read(&sbi->total_ext_node) *
+				sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
+		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else {
 		if (sbi->sb->s_bdi->dirty_exceeded)
 			return false;
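Note: a worked instance of the EXTENT_CACHE sizing above, with assumed numbers (struct sizes vary by kernel config). With 4 KiB pages (PAGE_CACHE_SHIFT == 12), say 1,000 cached trees at an assumed 56 bytes each plus 10,000 nodes at an assumed 40 bytes each, the charge is (1000 * 56 + 10000 * 40) >> 12 = 456000 / 4096, about 111 pages; the branch then allows further caching only while that figure stays below half of ram_thresh percent of low memory, exactly mirroring the NAT_ENTRIES and INO_ENTRIES branches.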
@@ -494,7 +501,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
494 501
495 /* if inline_data is set, should not report any block indices */ 502 /* if inline_data is set, should not report any block indices */
496 if (f2fs_has_inline_data(dn->inode) && index) { 503 if (f2fs_has_inline_data(dn->inode) && index) {
497 err = -EINVAL; 504 err = -ENOENT;
498 f2fs_put_page(npage[0], 1); 505 f2fs_put_page(npage[0], 1);
499 goto release_out; 506 goto release_out;
500 } 507 }
@@ -995,6 +1002,7 @@ static int read_node_page(struct page *page, int rw)
995 get_node_info(sbi, page->index, &ni); 1002 get_node_info(sbi, page->index, &ni);
996 1003
997 if (unlikely(ni.blk_addr == NULL_ADDR)) { 1004 if (unlikely(ni.blk_addr == NULL_ADDR)) {
1005 ClearPageUptodate(page);
998 f2fs_put_page(page, 1); 1006 f2fs_put_page(page, 1);
999 return -ENOENT; 1007 return -ENOENT;
1000 } 1008 }
@@ -1306,6 +1314,7 @@ static int f2fs_write_node_page(struct page *page,
1306 1314
1307 /* This page is already truncated */ 1315 /* This page is already truncated */
1308 if (unlikely(ni.blk_addr == NULL_ADDR)) { 1316 if (unlikely(ni.blk_addr == NULL_ADDR)) {
1317 ClearPageUptodate(page);
1309 dec_page_count(sbi, F2FS_DIRTY_NODES); 1318 dec_page_count(sbi, F2FS_DIRTY_NODES);
1310 unlock_page(page); 1319 unlock_page(page);
1311 return 0; 1320 return 0;
@@ -1821,6 +1830,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1821 struct f2fs_nat_block *nat_blk; 1830 struct f2fs_nat_block *nat_blk;
1822 struct nat_entry *ne, *cur; 1831 struct nat_entry *ne, *cur;
1823 struct page *page = NULL; 1832 struct page *page = NULL;
1833 struct f2fs_nm_info *nm_i = NM_I(sbi);
1824 1834
1825 /* 1835 /*
1826 * there are two steps to flush nat entries: 1836 * there are two steps to flush nat entries:
@@ -1874,7 +1884,9 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1874 1884
1875 f2fs_bug_on(sbi, set->entry_cnt); 1885 f2fs_bug_on(sbi, set->entry_cnt);
1876 1886
1887 down_write(&nm_i->nat_tree_lock);
1877 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); 1888 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
1889 up_write(&nm_i->nat_tree_lock);
1878 kmem_cache_free(nat_entry_set_slab, set); 1890 kmem_cache_free(nat_entry_set_slab, set);
1879} 1891}
1880 1892
@@ -1902,6 +1914,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1902 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) 1914 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
1903 remove_nats_in_journal(sbi); 1915 remove_nats_in_journal(sbi);
1904 1916
1917 down_write(&nm_i->nat_tree_lock);
1905 while ((found = __gang_lookup_nat_set(nm_i, 1918 while ((found = __gang_lookup_nat_set(nm_i,
1906 set_idx, SETVEC_SIZE, setvec))) { 1919 set_idx, SETVEC_SIZE, setvec))) {
1907 unsigned idx; 1920 unsigned idx;
@@ -1910,6 +1923,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1910 __adjust_nat_entry_set(setvec[idx], &sets, 1923 __adjust_nat_entry_set(setvec[idx], &sets,
1911 MAX_NAT_JENTRIES(sum)); 1924 MAX_NAT_JENTRIES(sum));
1912 } 1925 }
1926 up_write(&nm_i->nat_tree_lock);
1913 1927
1914 /* flush dirty nats in nat entry set */ 1928 /* flush dirty nats in nat entry set */
1915 list_for_each_entry_safe(set, tmp, &sets, set_list) 1929 list_for_each_entry_safe(set, tmp, &sets, set_list)
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index f405bbf2435a..c56026f1725c 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -120,6 +120,7 @@ enum mem_type {
120 NAT_ENTRIES, /* indicates the cached nat entry */ 120 NAT_ENTRIES, /* indicates the cached nat entry */
121 DIRTY_DENTS, /* indicates dirty dentry pages */ 121 DIRTY_DENTS, /* indicates dirty dentry pages */
122 INO_ENTRIES, /* indicates inode entries */ 122 INO_ENTRIES, /* indicates inode entries */
123 EXTENT_CACHE, /* indicates extent cache */
123 BASE_CHECK, /* check kernel status */ 124 BASE_CHECK, /* check kernel status */
124}; 125};
125 126
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 41afb9534bbd..8d8ea99f2156 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -93,10 +93,9 @@ static int recover_dentry(struct inode *inode, struct page *ipage)
93 } 93 }
94retry: 94retry:
95 de = f2fs_find_entry(dir, &name, &page); 95 de = f2fs_find_entry(dir, &name, &page);
96 if (de && inode->i_ino == le32_to_cpu(de->ino)) { 96 if (de && inode->i_ino == le32_to_cpu(de->ino))
97 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
98 goto out_unmap_put; 97 goto out_unmap_put;
99 } 98
100 if (de) { 99 if (de) {
101 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); 100 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
102 if (IS_ERR(einode)) { 101 if (IS_ERR(einode)) {
@@ -115,7 +114,7 @@ retry:
115 iput(einode); 114 iput(einode);
116 goto retry; 115 goto retry;
117 } 116 }
118 err = __f2fs_add_link(dir, &name, inode); 117 err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode);
119 if (err) 118 if (err)
120 goto out_err; 119 goto out_err;
121 120
@@ -187,11 +186,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
187 goto next; 186 goto next;
188 187
189 entry = get_fsync_inode(head, ino_of_node(page)); 188 entry = get_fsync_inode(head, ino_of_node(page));
190 if (entry) { 189 if (!entry) {
191 if (IS_INODE(page) && is_dent_dnode(page))
192 set_inode_flag(F2FS_I(entry->inode),
193 FI_INC_LINK);
194 } else {
195 if (IS_INODE(page) && is_dent_dnode(page)) { 190 if (IS_INODE(page) && is_dent_dnode(page)) {
196 err = recover_inode_page(sbi, page); 191 err = recover_inode_page(sbi, page);
197 if (err) 192 if (err)
@@ -212,8 +207,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
212 if (IS_ERR(entry->inode)) { 207 if (IS_ERR(entry->inode)) {
213 err = PTR_ERR(entry->inode); 208 err = PTR_ERR(entry->inode);
214 kmem_cache_free(fsync_entry_slab, entry); 209 kmem_cache_free(fsync_entry_slab, entry);
215 if (err == -ENOENT) 210 if (err == -ENOENT) {
211 err = 0;
216 goto next; 212 goto next;
213 }
217 break; 214 break;
218 } 215 }
219 list_add_tail(&entry->list, head); 216 list_add_tail(&entry->list, head);
@@ -256,6 +253,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
256 struct f2fs_summary_block *sum_node; 253 struct f2fs_summary_block *sum_node;
257 struct f2fs_summary sum; 254 struct f2fs_summary sum;
258 struct page *sum_page, *node_page; 255 struct page *sum_page, *node_page;
256 struct dnode_of_data tdn = *dn;
259 nid_t ino, nid; 257 nid_t ino, nid;
260 struct inode *inode; 258 struct inode *inode;
261 unsigned int offset; 259 unsigned int offset;
@@ -283,17 +281,15 @@ got_it:
283 /* Use the locked dnode page and inode */ 281 /* Use the locked dnode page and inode */
284 nid = le32_to_cpu(sum.nid); 282 nid = le32_to_cpu(sum.nid);
285 if (dn->inode->i_ino == nid) { 283 if (dn->inode->i_ino == nid) {
286 struct dnode_of_data tdn = *dn;
287 tdn.nid = nid; 284 tdn.nid = nid;
285 if (!dn->inode_page_locked)
286 lock_page(dn->inode_page);
288 tdn.node_page = dn->inode_page; 287 tdn.node_page = dn->inode_page;
289 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); 288 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
290 truncate_data_blocks_range(&tdn, 1); 289 goto truncate_out;
291 return 0;
292 } else if (dn->nid == nid) { 290 } else if (dn->nid == nid) {
293 struct dnode_of_data tdn = *dn;
294 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); 291 tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
295 truncate_data_blocks_range(&tdn, 1); 292 goto truncate_out;
296 return 0;
297 } 293 }
298 294
299 /* Get the node page */ 295 /* Get the node page */
@@ -317,18 +313,33 @@ got_it:
317 bidx = start_bidx_of_node(offset, F2FS_I(inode)) + 313 bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
318 le16_to_cpu(sum.ofs_in_node); 314 le16_to_cpu(sum.ofs_in_node);
319 315
320 if (ino != dn->inode->i_ino) { 316 /*
321 truncate_hole(inode, bidx, bidx + 1); 317 * if inode page is locked, unlock temporarily, but its reference
318 * count keeps alive.
319 */
320 if (ino == dn->inode->i_ino && dn->inode_page_locked)
321 unlock_page(dn->inode_page);
322
323 set_new_dnode(&tdn, inode, NULL, NULL, 0);
324 if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
325 goto out;
326
327 if (tdn.data_blkaddr == blkaddr)
328 truncate_data_blocks_range(&tdn, 1);
329
330 f2fs_put_dnode(&tdn);
331out:
332 if (ino != dn->inode->i_ino)
322 iput(inode); 333 iput(inode);
323 } else { 334 else if (dn->inode_page_locked)
324 struct dnode_of_data tdn; 335 lock_page(dn->inode_page);
325 set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0); 336 return 0;
326 if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) 337
327 return 0; 338truncate_out:
328 if (tdn.data_blkaddr != NULL_ADDR) 339 if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr)
329 truncate_data_blocks_range(&tdn, 1); 340 truncate_data_blocks_range(&tdn, 1);
330 f2fs_put_page(tdn.node_page, 1); 341 if (dn->inode->i_ino == nid && !dn->inode_page_locked)
331 } 342 unlock_page(dn->inode_page);
332 return 0; 343 return 0;
333} 344}
334 345
@@ -384,7 +395,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
384 src = datablock_addr(dn.node_page, dn.ofs_in_node); 395 src = datablock_addr(dn.node_page, dn.ofs_in_node);
385 dest = datablock_addr(page, dn.ofs_in_node); 396 dest = datablock_addr(page, dn.ofs_in_node);
386 397
387 if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { 398 if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR &&
399 dest >= MAIN_BLKADDR(sbi) && dest < MAX_BLKADDR(sbi)) {
400
388 if (src == NULL_ADDR) { 401 if (src == NULL_ADDR) {
389 err = reserve_new_block(&dn); 402 err = reserve_new_block(&dn);
390 /* We should not get -ENOSPC */ 403 /* We should not get -ENOSPC */
@@ -401,14 +414,13 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
401 /* write dummy data page */ 414 /* write dummy data page */
402 recover_data_page(sbi, NULL, &sum, src, dest); 415 recover_data_page(sbi, NULL, &sum, src, dest);
403 dn.data_blkaddr = dest; 416 dn.data_blkaddr = dest;
404 update_extent_cache(&dn); 417 set_data_blkaddr(&dn);
418 f2fs_update_extent_cache(&dn);
405 recovered++; 419 recovered++;
406 } 420 }
407 dn.ofs_in_node++; 421 dn.ofs_in_node++;
408 } 422 }
409 423
410 /* write node page in place */
411 set_summary(&sum, dn.nid, 0, 0);
412 if (IS_INODE(dn.node_page)) 424 if (IS_INODE(dn.node_page))
413 sync_inode_page(&dn); 425 sync_inode_page(&dn);
414 426
@@ -552,7 +564,7 @@ out:
552 mutex_unlock(&sbi->cp_mutex); 564 mutex_unlock(&sbi->cp_mutex);
553 } else if (need_writecp) { 565 } else if (need_writecp) {
554 struct cp_control cpc = { 566 struct cp_control cpc = {
555 .reason = CP_SYNC, 567 .reason = CP_RECOVERY,
556 }; 568 };
557 mutex_unlock(&sbi->cp_mutex); 569 mutex_unlock(&sbi->cp_mutex);
558 write_checkpoint(sbi, &cpc); 570 write_checkpoint(sbi, &cpc);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index daee4ab913da..f939660941bb 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -205,6 +205,8 @@ retry:
205 list_add_tail(&new->list, &fi->inmem_pages); 205 list_add_tail(&new->list, &fi->inmem_pages);
206 inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); 206 inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
207 mutex_unlock(&fi->inmem_lock); 207 mutex_unlock(&fi->inmem_lock);
208
209 trace_f2fs_register_inmem_page(page, INMEM);
208} 210}
209 211
210void commit_inmem_pages(struct inode *inode, bool abort) 212void commit_inmem_pages(struct inode *inode, bool abort)
@@ -238,11 +240,13 @@ void commit_inmem_pages(struct inode *inode, bool abort)
238 f2fs_wait_on_page_writeback(cur->page, DATA); 240 f2fs_wait_on_page_writeback(cur->page, DATA);
239 if (clear_page_dirty_for_io(cur->page)) 241 if (clear_page_dirty_for_io(cur->page))
240 inode_dec_dirty_pages(inode); 242 inode_dec_dirty_pages(inode);
243 trace_f2fs_commit_inmem_page(cur->page, INMEM);
241 do_write_data_page(cur->page, &fio); 244 do_write_data_page(cur->page, &fio);
242 submit_bio = true; 245 submit_bio = true;
243 } 246 }
244 f2fs_put_page(cur->page, 1); 247 f2fs_put_page(cur->page, 1);
245 } else { 248 } else {
249 trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
246 put_page(cur->page); 250 put_page(cur->page);
247 } 251 }
248 radix_tree_delete(&fi->inmem_root, cur->page->index); 252 radix_tree_delete(&fi->inmem_root, cur->page->index);
@@ -277,6 +281,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
277 281
278void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) 282void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
279{ 283{
284 /* try to shrink extent cache when there is no enough memory */
285 f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER);
286
280 /* check the # of cached NAT entries and prefree segments */ 287 /* check the # of cached NAT entries and prefree segments */
281 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || 288 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
282 excess_prefree_segs(sbi) || 289 excess_prefree_segs(sbi) ||
@@ -549,7 +556,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
549 556
550 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); 557 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
551 558
552 if (end - start < cpc->trim_minlen) 559 if (force && end - start < cpc->trim_minlen)
553 continue; 560 continue;
554 561
555 __add_discard_entry(sbi, cpc, start, end); 562 __add_discard_entry(sbi, cpc, start, end);
@@ -1164,6 +1171,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1164 curseg = CURSEG_I(sbi, type); 1171 curseg = CURSEG_I(sbi, type);
1165 1172
1166 mutex_lock(&curseg->curseg_mutex); 1173 mutex_lock(&curseg->curseg_mutex);
1174 mutex_lock(&sit_i->sentry_lock);
1167 1175
1168 /* direct_io'ed data is aligned to the segment for better performance */ 1176 /* direct_io'ed data is aligned to the segment for better performance */
1169 if (direct_io && curseg->next_blkoff) 1177 if (direct_io && curseg->next_blkoff)
@@ -1178,7 +1186,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1178 */ 1186 */
1179 __add_sum_entry(sbi, type, sum); 1187 __add_sum_entry(sbi, type, sum);
1180 1188
1181 mutex_lock(&sit_i->sentry_lock);
1182 __refresh_next_blkoff(sbi, curseg); 1189 __refresh_next_blkoff(sbi, curseg);
1183 1190
1184 stat_inc_block_count(sbi, curseg); 1191 stat_inc_block_count(sbi, curseg);
@@ -1730,6 +1737,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1730 mutex_lock(&curseg->curseg_mutex); 1737 mutex_lock(&curseg->curseg_mutex);
1731 mutex_lock(&sit_i->sentry_lock); 1738 mutex_lock(&sit_i->sentry_lock);
1732 1739
1740 if (!sit_i->dirty_sentries)
1741 goto out;
1742
1733 /* 1743 /*
1734 * add and account sit entries of dirty bitmap in sit entry 1744 * add and account sit entries of dirty bitmap in sit entry
1735 * set temporarily 1745 * set temporarily
@@ -1744,9 +1754,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1744 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) 1754 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
1745 remove_sits_in_journal(sbi); 1755 remove_sits_in_journal(sbi);
1746 1756
1747 if (!sit_i->dirty_sentries)
1748 goto out;
1749
1750 /* 1757 /*
1751 * there are two steps to flush sit entries: 1758 * there are two steps to flush sit entries:
1752 * #1, flush sit entries to journal in current cold data summary block. 1759 * #1, flush sit entries to journal in current cold data summary block.
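
Two of the segment.c hunks above follow one locking idea: allocate_data_block() now takes sentry_lock immediately after curseg_mutex, so both locks are held in a fixed order before any segment state is consulted, and flush_sit_entries() performs its cheap "nothing dirty" test first thing under those locks. A minimal userspace sketch of that shape, assuming pthreads; seg_state and flush_dirty() are invented for illustration and are not kernel APIs:

/* Sketch only: take both locks up front in a fixed order, then bail
 * out early on the cheap "nothing dirty" test before doing any work. */
#include <pthread.h>
#include <stdio.h>

struct seg_state {
	pthread_mutex_t curseg_mutex;	/* protects the current segment */
	pthread_mutex_t sentry_lock;	/* protects SIT entries */
	int dirty_sentries;
};

static void flush_dirty(struct seg_state *s)
{
	/* fixed acquisition order: curseg_mutex, then sentry_lock */
	pthread_mutex_lock(&s->curseg_mutex);
	pthread_mutex_lock(&s->sentry_lock);

	if (!s->dirty_sentries)
		goto out;		/* early exit before any work */

	printf("flushing %d dirty entries\n", s->dirty_sentries);
	s->dirty_sentries = 0;
out:
	pthread_mutex_unlock(&s->sentry_lock);
	pthread_mutex_unlock(&s->curseg_mutex);
}

int main(void)
{
	struct seg_state s = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, 3 };
	flush_dirty(&s);	/* flushes once */
	flush_dirty(&s);	/* nothing dirty: exits early */
	return 0;
}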
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7fd35111cf62..85d7fa7514b2 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -336,7 +336,8 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
336 clear_bit(segno, free_i->free_segmap); 336 clear_bit(segno, free_i->free_segmap);
337 free_i->free_segments++; 337 free_i->free_segments++;
338 338
339 next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno); 339 next = find_next_bit(free_i->free_segmap,
340 start_segno + sbi->segs_per_sec, start_segno);
340 if (next >= start_segno + sbi->segs_per_sec) { 341 if (next >= start_segno + sbi->segs_per_sec) {
341 clear_bit(secno, free_i->free_secmap); 342 clear_bit(secno, free_i->free_secmap);
342 free_i->free_sections++; 343 free_i->free_sections++;
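
The segment.h change bounds the free-segment scan to the current section (start_segno + sbi->segs_per_sec) instead of scanning the whole main area and only comparing afterwards. A userspace sketch of the same bounded scan; find_next_bit() is reduced to a plain loop that returns its size argument when nothing is found, matching the kernel helper's convention, and the bitmap is simplified to one byte per segment:

/* Sketch only: search just this section's segments, not the whole map. */
#include <stdio.h>

static unsigned find_next_bit(const unsigned char *map, unsigned size,
			      unsigned start)
{
	for (unsigned i = start; i < size; i++)
		if (map[i])
			return i;
	return size;	/* kernel convention: "size" means not found */
}

int main(void)
{
	unsigned char free_segmap[16] = { 0 };	/* set bit = segment in use */
	unsigned start_segno = 4, segs_per_sec = 4;

	free_segmap[9] = 1;	/* a used segment outside our section */

	/* bounded scan: only segments [4, 8) are examined */
	unsigned next = find_next_bit(free_segmap,
				      start_segno + segs_per_sec, start_segno);
	if (next >= start_segno + segs_per_sec)
		printf("whole section is free\n");
	return 0;
}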
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f2fe666a6ea9..160b88346b24 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -57,6 +57,8 @@ enum {
57 Opt_flush_merge, 57 Opt_flush_merge,
58 Opt_nobarrier, 58 Opt_nobarrier,
59 Opt_fastboot, 59 Opt_fastboot,
60 Opt_extent_cache,
61 Opt_noinline_data,
60 Opt_err, 62 Opt_err,
61}; 63};
62 64
@@ -78,6 +80,8 @@ static match_table_t f2fs_tokens = {
78 {Opt_flush_merge, "flush_merge"}, 80 {Opt_flush_merge, "flush_merge"},
79 {Opt_nobarrier, "nobarrier"}, 81 {Opt_nobarrier, "nobarrier"},
80 {Opt_fastboot, "fastboot"}, 82 {Opt_fastboot, "fastboot"},
83 {Opt_extent_cache, "extent_cache"},
84 {Opt_noinline_data, "noinline_data"},
81 {Opt_err, NULL}, 85 {Opt_err, NULL},
82}; 86};
83 87
@@ -367,6 +371,12 @@ static int parse_options(struct super_block *sb, char *options)
367 case Opt_fastboot: 371 case Opt_fastboot:
368 set_opt(sbi, FASTBOOT); 372 set_opt(sbi, FASTBOOT);
369 break; 373 break;
374 case Opt_extent_cache:
375 set_opt(sbi, EXTENT_CACHE);
376 break;
377 case Opt_noinline_data:
378 clear_opt(sbi, INLINE_DATA);
379 break;
370 default: 380 default:
371 f2fs_msg(sb, KERN_ERR, 381 f2fs_msg(sb, KERN_ERR,
372 "Unrecognized mount option \"%s\" or missing value", 382 "Unrecognized mount option \"%s\" or missing value",
@@ -392,7 +402,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
392 atomic_set(&fi->dirty_pages, 0); 402 atomic_set(&fi->dirty_pages, 0);
393 fi->i_current_depth = 1; 403 fi->i_current_depth = 1;
394 fi->i_advise = 0; 404 fi->i_advise = 0;
395 rwlock_init(&fi->ext.ext_lock); 405 rwlock_init(&fi->ext_lock);
396 init_rwsem(&fi->i_sem); 406 init_rwsem(&fi->i_sem);
397 INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); 407 INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS);
398 INIT_LIST_HEAD(&fi->inmem_pages); 408 INIT_LIST_HEAD(&fi->inmem_pages);
@@ -591,6 +601,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
591 seq_puts(seq, ",disable_ext_identify"); 601 seq_puts(seq, ",disable_ext_identify");
592 if (test_opt(sbi, INLINE_DATA)) 602 if (test_opt(sbi, INLINE_DATA))
593 seq_puts(seq, ",inline_data"); 603 seq_puts(seq, ",inline_data");
604 else
605 seq_puts(seq, ",noinline_data");
594 if (test_opt(sbi, INLINE_DENTRY)) 606 if (test_opt(sbi, INLINE_DENTRY))
595 seq_puts(seq, ",inline_dentry"); 607 seq_puts(seq, ",inline_dentry");
596 if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) 608 if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
@@ -599,6 +611,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
599 seq_puts(seq, ",nobarrier"); 611 seq_puts(seq, ",nobarrier");
600 if (test_opt(sbi, FASTBOOT)) 612 if (test_opt(sbi, FASTBOOT))
601 seq_puts(seq, ",fastboot"); 613 seq_puts(seq, ",fastboot");
614 if (test_opt(sbi, EXTENT_CACHE))
615 seq_puts(seq, ",extent_cache");
602 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 616 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
603 617
604 return 0; 618 return 0;
@@ -959,7 +973,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
959 struct buffer_head *raw_super_buf; 973 struct buffer_head *raw_super_buf;
960 struct inode *root; 974 struct inode *root;
961 long err = -EINVAL; 975 long err = -EINVAL;
962 bool retry = true; 976 bool retry = true, need_fsck = false;
963 char *options = NULL; 977 char *options = NULL;
964 int i; 978 int i;
965 979
@@ -984,6 +998,7 @@ try_onemore:
984 sbi->active_logs = NR_CURSEG_TYPE; 998 sbi->active_logs = NR_CURSEG_TYPE;
985 999
986 set_opt(sbi, BG_GC); 1000 set_opt(sbi, BG_GC);
1001 set_opt(sbi, INLINE_DATA);
987 1002
988#ifdef CONFIG_F2FS_FS_XATTR 1003#ifdef CONFIG_F2FS_FS_XATTR
989 set_opt(sbi, XATTR_USER); 1004 set_opt(sbi, XATTR_USER);
@@ -1020,7 +1035,6 @@ try_onemore:
1020 sbi->raw_super = raw_super; 1035 sbi->raw_super = raw_super;
1021 sbi->raw_super_buf = raw_super_buf; 1036 sbi->raw_super_buf = raw_super_buf;
1022 mutex_init(&sbi->gc_mutex); 1037 mutex_init(&sbi->gc_mutex);
1023 mutex_init(&sbi->writepages);
1024 mutex_init(&sbi->cp_mutex); 1038 mutex_init(&sbi->cp_mutex);
1025 init_rwsem(&sbi->node_write); 1039 init_rwsem(&sbi->node_write);
1026 clear_sbi_flag(sbi, SBI_POR_DOING); 1040 clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -1072,6 +1086,8 @@ try_onemore:
1072 INIT_LIST_HEAD(&sbi->dir_inode_list); 1086 INIT_LIST_HEAD(&sbi->dir_inode_list);
1073 spin_lock_init(&sbi->dir_inode_lock); 1087 spin_lock_init(&sbi->dir_inode_lock);
1074 1088
1089 init_extent_cache_info(sbi);
1090
1075 init_ino_entry_info(sbi); 1091 init_ino_entry_info(sbi);
1076 1092
1077 /* setup f2fs internal modules */ 1093 /* setup f2fs internal modules */
@@ -1146,9 +1162,6 @@ try_onemore:
1146 if (err) 1162 if (err)
1147 goto free_proc; 1163 goto free_proc;
1148 1164
1149 if (!retry)
1150 set_sbi_flag(sbi, SBI_NEED_FSCK);
1151
1152 /* recover fsynced data */ 1165 /* recover fsynced data */
1153 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1166 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1154 /* 1167 /*
@@ -1160,8 +1173,13 @@ try_onemore:
1160 err = -EROFS; 1173 err = -EROFS;
1161 goto free_kobj; 1174 goto free_kobj;
1162 } 1175 }
1176
1177 if (need_fsck)
1178 set_sbi_flag(sbi, SBI_NEED_FSCK);
1179
1163 err = recover_fsync_data(sbi); 1180 err = recover_fsync_data(sbi);
1164 if (err) { 1181 if (err) {
1182 need_fsck = true;
1165 f2fs_msg(sb, KERN_ERR, 1183 f2fs_msg(sb, KERN_ERR,
1166 "Cannot recover all fsync data errno=%ld", err); 1184 "Cannot recover all fsync data errno=%ld", err);
1167 goto free_kobj; 1185 goto free_kobj;
@@ -1212,7 +1230,7 @@ free_sbi:
1212 1230
1213 /* give only one more chance */ 1231 /* give only one more chance */
1214 if (retry) { 1232 if (retry) {
1215 retry = 0; 1233 retry = false;
1216 shrink_dcache_sb(sb); 1234 shrink_dcache_sb(sb);
1217 goto try_onemore; 1235 goto try_onemore;
1218 } 1236 }
@@ -1278,10 +1296,13 @@ static int __init init_f2fs_fs(void)
1278 err = create_checkpoint_caches(); 1296 err = create_checkpoint_caches();
1279 if (err) 1297 if (err)
1280 goto free_segment_manager_caches; 1298 goto free_segment_manager_caches;
1299 err = create_extent_cache();
1300 if (err)
1301 goto free_checkpoint_caches;
1281 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); 1302 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
1282 if (!f2fs_kset) { 1303 if (!f2fs_kset) {
1283 err = -ENOMEM; 1304 err = -ENOMEM;
1284 goto free_checkpoint_caches; 1305 goto free_extent_cache;
1285 } 1306 }
1286 err = register_filesystem(&f2fs_fs_type); 1307 err = register_filesystem(&f2fs_fs_type);
1287 if (err) 1308 if (err)
@@ -1292,6 +1313,8 @@ static int __init init_f2fs_fs(void)
1292 1313
1293free_kset: 1314free_kset:
1294 kset_unregister(f2fs_kset); 1315 kset_unregister(f2fs_kset);
1316free_extent_cache:
1317 destroy_extent_cache();
1295free_checkpoint_caches: 1318free_checkpoint_caches:
1296 destroy_checkpoint_caches(); 1319 destroy_checkpoint_caches();
1297free_segment_manager_caches: 1320free_segment_manager_caches:
@@ -1309,6 +1332,7 @@ static void __exit exit_f2fs_fs(void)
1309 remove_proc_entry("fs/f2fs", NULL); 1332 remove_proc_entry("fs/f2fs", NULL);
1310 f2fs_destroy_root_stats(); 1333 f2fs_destroy_root_stats();
1311 unregister_filesystem(&f2fs_fs_type); 1334 unregister_filesystem(&f2fs_fs_type);
1335 destroy_extent_cache();
1312 destroy_checkpoint_caches(); 1336 destroy_checkpoint_caches();
1313 destroy_segment_manager_caches(); 1337 destroy_segment_manager_caches();
1314 destroy_node_manager_caches(); 1338 destroy_node_manager_caches();
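
The super.c hunks show the usual three places a new f2fs mount option touches: a token plus its string in the match table, a case in parse_options() that sets or clears the flag, and a mirror line in show_options() (with the default, INLINE_DATA, established in fill_super). A reduced, table-driven sketch of that flow in plain C; the set/clear bit values and the parsing loop are illustrative, not the kernel's match_token() machinery:

/* Sketch only: table-driven mount-option parsing. */
#include <stdio.h>
#include <string.h>

enum { OPT_EXTENT_CACHE = 1 << 0, OPT_INLINE_DATA = 1 << 1 };

static const struct { const char *name; unsigned set, clear; } tokens[] = {
	{ "extent_cache",  OPT_EXTENT_CACHE, 0 },
	{ "noinline_data", 0, OPT_INLINE_DATA },
	{ NULL, 0, 0 },
};

static int parse_options(unsigned *flags, char *options)
{
	for (char *p = strtok(options, ","); p; p = strtok(NULL, ",")) {
		int hit = 0;
		for (int i = 0; tokens[i].name; i++) {
			if (!strcmp(p, tokens[i].name)) {
				*flags |= tokens[i].set;
				*flags &= ~tokens[i].clear;
				hit = 1;
			}
		}
		if (!hit) {
			fprintf(stderr, "Unrecognized mount option \"%s\"\n", p);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	unsigned flags = OPT_INLINE_DATA;	/* default, as in fill_super */
	char opts[] = "extent_cache,noinline_data";

	if (parse_options(&flags, opts))
		return 1;
	/* show_options mirror: report inline_data or noinline_data */
	printf("%s%s\n", flags & OPT_EXTENT_CACHE ? ",extent_cache" : "",
	       flags & OPT_INLINE_DATA ? ",inline_data" : ",noinline_data");
	return 0;
}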
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 5072bf9ae0ef..b0fd2f2d0716 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -135,7 +135,8 @@ static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
135 if (strcmp(name, "") != 0) 135 if (strcmp(name, "") != 0)
136 return -EINVAL; 136 return -EINVAL;
137 137
138 *((char *)buffer) = F2FS_I(inode)->i_advise; 138 if (buffer)
139 *((char *)buffer) = F2FS_I(inode)->i_advise;
139 return sizeof(char); 140 return sizeof(char);
140} 141}
141 142
@@ -152,6 +153,7 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
152 return -EINVAL; 153 return -EINVAL;
153 154
154 F2FS_I(inode)->i_advise |= *(char *)value; 155 F2FS_I(inode)->i_advise |= *(char *)value;
156 mark_inode_dirty(inode);
155 return 0; 157 return 0;
156} 158}
157 159
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index b06c98796afb..611b5408f6ec 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock);
9void pin_remove(struct fs_pin *pin) 9void pin_remove(struct fs_pin *pin)
10{ 10{
11 spin_lock(&pin_lock); 11 spin_lock(&pin_lock);
12 hlist_del(&pin->m_list); 12 hlist_del_init(&pin->m_list);
13 hlist_del(&pin->s_list); 13 hlist_del_init(&pin->s_list);
14 spin_unlock(&pin_lock); 14 spin_unlock(&pin_lock);
15 spin_lock_irq(&pin->wait.lock); 15 spin_lock_irq(&pin->wait.lock);
16 pin->done = 1; 16 pin->done = 1;
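
The fs_pin fix swaps hlist_del() for hlist_del_init(): after _init the node's pointers are reset, so later code can ask whether the pin is still hashed, and a second removal is a harmless no-op rather than a corruption. A reduced userspace model of the two helpers (the real ones live in <linux/list.h> and also poison the pointers):

/* Sketch only: hlist_del() vs hlist_del_init() semantics. */
#include <stdio.h>
#include <stddef.h>

struct hnode { struct hnode *next, **pprev; };

static int hlist_unhashed(const struct hnode *n) { return !n->pprev; }

static void hlist_del(struct hnode *n)
{
	*n->pprev = n->next;
	if (n->next)
		n->next->pprev = n->pprev;
	/* n->pprev still points into the old list */
}

static void hlist_del_init(struct hnode *n)
{
	if (!hlist_unhashed(n)) {
		hlist_del(n);
		n->next = NULL;
		n->pprev = NULL;	/* mark the node as unhashed */
	}
}

int main(void)
{
	struct hnode *head = NULL;
	struct hnode a = { NULL, &head };

	head = &a;
	hlist_del_init(&a);
	printf("unhashed: %d\n", hlist_unhashed(&a));	/* prints 1 */
	hlist_del_init(&a);	/* safe: no-op on an unhashed node */
	return 0;
}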
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 762c7a3cf43d..2eac55379239 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1266,7 +1266,6 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_
1266 if (rc) { 1266 if (rc) {
1267 JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", 1267 JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n",
1268 __func__, rc, totlen); 1268 __func__, rc, totlen);
1269 rc = rc ? rc : -EBADFD;
1270 goto out; 1269 goto out;
1271 } 1270 }
1272 rc = save_xattr_ref(c, ref); 1271 rc = save_xattr_ref(c, ref);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 665ef5a05183..a563ddbc19e6 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -31,7 +31,7 @@
31static struct hlist_head nlm_files[FILE_NRHASH]; 31static struct hlist_head nlm_files[FILE_NRHASH];
32static DEFINE_MUTEX(nlm_file_mutex); 32static DEFINE_MUTEX(nlm_file_mutex);
33 33
34#ifdef NFSD_DEBUG 34#ifdef CONFIG_SUNRPC_DEBUG
35static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) 35static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
36{ 36{
37 u32 *fhp = (u32*)f->data; 37 u32 *fhp = (u32*)f->data;
diff --git a/fs/namespace.c b/fs/namespace.c
index 82ef1405260e..1f4f9dac6e5a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -632,14 +632,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
632 */ 632 */
633struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) 633struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
634{ 634{
635 struct mount *p, *res; 635 struct mount *p, *res = NULL;
636 res = p = __lookup_mnt(mnt, dentry); 636 p = __lookup_mnt(mnt, dentry);
637 if (!p) 637 if (!p)
638 goto out; 638 goto out;
639 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
640 res = p;
639 hlist_for_each_entry_continue(p, mnt_hash) { 641 hlist_for_each_entry_continue(p, mnt_hash) {
640 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) 642 if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
641 break; 643 break;
642 res = p; 644 if (!(p->mnt.mnt_flags & MNT_UMOUNT))
645 res = p;
643 } 646 }
644out: 647out:
645 return res; 648 return res;
@@ -795,10 +798,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
795/* 798/*
796 * vfsmount lock must be held for write 799 * vfsmount lock must be held for write
797 */ 800 */
798static void detach_mnt(struct mount *mnt, struct path *old_path) 801static void unhash_mnt(struct mount *mnt)
799{ 802{
800 old_path->dentry = mnt->mnt_mountpoint;
801 old_path->mnt = &mnt->mnt_parent->mnt;
802 mnt->mnt_parent = mnt; 803 mnt->mnt_parent = mnt;
803 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 804 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
804 list_del_init(&mnt->mnt_child); 805 list_del_init(&mnt->mnt_child);
@@ -811,6 +812,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
811/* 812/*
812 * vfsmount lock must be held for write 813 * vfsmount lock must be held for write
813 */ 814 */
815static void detach_mnt(struct mount *mnt, struct path *old_path)
816{
817 old_path->dentry = mnt->mnt_mountpoint;
818 old_path->mnt = &mnt->mnt_parent->mnt;
819 unhash_mnt(mnt);
820}
821
822/*
823 * vfsmount lock must be held for write
824 */
825static void umount_mnt(struct mount *mnt)
826{
827 /* old mountpoint will be dropped when we can do that */
828 mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
829 unhash_mnt(mnt);
830}
831
832/*
833 * vfsmount lock must be held for write
834 */
814void mnt_set_mountpoint(struct mount *mnt, 835void mnt_set_mountpoint(struct mount *mnt,
815 struct mountpoint *mp, 836 struct mountpoint *mp,
816 struct mount *child_mnt) 837 struct mount *child_mnt)
@@ -1078,6 +1099,13 @@ static void mntput_no_expire(struct mount *mnt)
1078 rcu_read_unlock(); 1099 rcu_read_unlock();
1079 1100
1080 list_del(&mnt->mnt_instance); 1101 list_del(&mnt->mnt_instance);
1102
1103 if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1104 struct mount *p, *tmp;
1105 list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1106 umount_mnt(p);
1107 }
1108 }
1081 unlock_mount_hash(); 1109 unlock_mount_hash();
1082 1110
1083 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { 1111 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
@@ -1298,17 +1326,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1298 1326
1299static void namespace_unlock(void) 1327static void namespace_unlock(void)
1300{ 1328{
1301 struct hlist_head head = unmounted; 1329 struct hlist_head head;
1302 1330
1303 if (likely(hlist_empty(&head))) { 1331 hlist_move_list(&unmounted, &head);
1304 up_write(&namespace_sem);
1305 return;
1306 }
1307 1332
1308 head.first->pprev = &head.first;
1309 INIT_HLIST_HEAD(&unmounted);
1310 up_write(&namespace_sem); 1333 up_write(&namespace_sem);
1311 1334
1335 if (likely(hlist_empty(&head)))
1336 return;
1337
1312 synchronize_rcu(); 1338 synchronize_rcu();
1313 1339
1314 group_pin_kill(&head); 1340 group_pin_kill(&head);
@@ -1319,49 +1345,63 @@ static inline void namespace_lock(void)
1319 down_write(&namespace_sem); 1345 down_write(&namespace_sem);
1320} 1346}
1321 1347
1348enum umount_tree_flags {
1349 UMOUNT_SYNC = 1,
1350 UMOUNT_PROPAGATE = 2,
1351 UMOUNT_CONNECTED = 4,
1352};
1322/* 1353/*
1323 * mount_lock must be held 1354 * mount_lock must be held
1324 * namespace_sem must be held for write 1355 * namespace_sem must be held for write
1325 * how = 0 => just this tree, don't propagate
1326 * how = 1 => propagate; we know that nobody else has reference to any victims
1327 * how = 2 => lazy umount
1328 */ 1356 */
1329void umount_tree(struct mount *mnt, int how) 1357static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
1330{ 1358{
1331 HLIST_HEAD(tmp_list); 1359 LIST_HEAD(tmp_list);
1332 struct mount *p; 1360 struct mount *p;
1333 1361
1362 if (how & UMOUNT_PROPAGATE)
1363 propagate_mount_unlock(mnt);
1364
1365 /* Gather the mounts to umount */
1334 for (p = mnt; p; p = next_mnt(p, mnt)) { 1366 for (p = mnt; p; p = next_mnt(p, mnt)) {
1335 hlist_del_init_rcu(&p->mnt_hash); 1367 p->mnt.mnt_flags |= MNT_UMOUNT;
1336 hlist_add_head(&p->mnt_hash, &tmp_list); 1368 list_move(&p->mnt_list, &tmp_list);
1337 } 1369 }
1338 1370
1339 hlist_for_each_entry(p, &tmp_list, mnt_hash) 1371 /* Hide the mounts from mnt_mounts */
1372 list_for_each_entry(p, &tmp_list, mnt_list) {
1340 list_del_init(&p->mnt_child); 1373 list_del_init(&p->mnt_child);
1374 }
1341 1375
1342 if (how) 1376 /* Add propagated mounts to the tmp_list */
1377 if (how & UMOUNT_PROPAGATE)
1343 propagate_umount(&tmp_list); 1378 propagate_umount(&tmp_list);
1344 1379
1345 while (!hlist_empty(&tmp_list)) { 1380 while (!list_empty(&tmp_list)) {
1346 p = hlist_entry(tmp_list.first, struct mount, mnt_hash); 1381 bool disconnect;
1347 hlist_del_init_rcu(&p->mnt_hash); 1382 p = list_first_entry(&tmp_list, struct mount, mnt_list);
1348 list_del_init(&p->mnt_expire); 1383 list_del_init(&p->mnt_expire);
1349 list_del_init(&p->mnt_list); 1384 list_del_init(&p->mnt_list);
1350 __touch_mnt_namespace(p->mnt_ns); 1385 __touch_mnt_namespace(p->mnt_ns);
1351 p->mnt_ns = NULL; 1386 p->mnt_ns = NULL;
1352 if (how < 2) 1387 if (how & UMOUNT_SYNC)
1353 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1388 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1354 1389
1355 pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted); 1390 disconnect = !(((how & UMOUNT_CONNECTED) &&
1391 mnt_has_parent(p) &&
1392 (p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) ||
1393 IS_MNT_LOCKED_AND_LAZY(p));
1394
1395 pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
1396 disconnect ? &unmounted : NULL);
1356 if (mnt_has_parent(p)) { 1397 if (mnt_has_parent(p)) {
1357 hlist_del_init(&p->mnt_mp_list);
1358 put_mountpoint(p->mnt_mp);
1359 mnt_add_count(p->mnt_parent, -1); 1398 mnt_add_count(p->mnt_parent, -1);
1360 /* old mountpoint will be dropped when we can do that */ 1399 if (!disconnect) {
1361 p->mnt_ex_mountpoint = p->mnt_mountpoint; 1400 /* Don't forget about p */
1362 p->mnt_mountpoint = p->mnt.mnt_root; 1401 list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
1363 p->mnt_parent = p; 1402 } else {
1364 p->mnt_mp = NULL; 1403 umount_mnt(p);
1404 }
1365 } 1405 }
1366 change_mnt_propagation(p, MS_PRIVATE); 1406 change_mnt_propagation(p, MS_PRIVATE);
1367 } 1407 }
@@ -1447,14 +1487,14 @@ static int do_umount(struct mount *mnt, int flags)
1447 1487
1448 if (flags & MNT_DETACH) { 1488 if (flags & MNT_DETACH) {
1449 if (!list_empty(&mnt->mnt_list)) 1489 if (!list_empty(&mnt->mnt_list))
1450 umount_tree(mnt, 2); 1490 umount_tree(mnt, UMOUNT_PROPAGATE);
1451 retval = 0; 1491 retval = 0;
1452 } else { 1492 } else {
1453 shrink_submounts(mnt); 1493 shrink_submounts(mnt);
1454 retval = -EBUSY; 1494 retval = -EBUSY;
1455 if (!propagate_mount_busy(mnt, 2)) { 1495 if (!propagate_mount_busy(mnt, 2)) {
1456 if (!list_empty(&mnt->mnt_list)) 1496 if (!list_empty(&mnt->mnt_list))
1457 umount_tree(mnt, 1); 1497 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
1458 retval = 0; 1498 retval = 0;
1459 } 1499 }
1460 } 1500 }
@@ -1480,13 +1520,20 @@ void __detach_mounts(struct dentry *dentry)
1480 1520
1481 namespace_lock(); 1521 namespace_lock();
1482 mp = lookup_mountpoint(dentry); 1522 mp = lookup_mountpoint(dentry);
1483 if (!mp) 1523 if (IS_ERR_OR_NULL(mp))
1484 goto out_unlock; 1524 goto out_unlock;
1485 1525
1486 lock_mount_hash(); 1526 lock_mount_hash();
1487 while (!hlist_empty(&mp->m_list)) { 1527 while (!hlist_empty(&mp->m_list)) {
1488 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); 1528 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1489 umount_tree(mnt, 2); 1529 if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
1530 struct mount *p, *tmp;
1531 list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
1532 hlist_add_head(&p->mnt_umount.s_list, &unmounted);
1533 umount_mnt(p);
1534 }
1535 }
1536 else umount_tree(mnt, UMOUNT_CONNECTED);
1490 } 1537 }
1491 unlock_mount_hash(); 1538 unlock_mount_hash();
1492 put_mountpoint(mp); 1539 put_mountpoint(mp);
@@ -1648,7 +1695,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1648out: 1695out:
1649 if (res) { 1696 if (res) {
1650 lock_mount_hash(); 1697 lock_mount_hash();
1651 umount_tree(res, 0); 1698 umount_tree(res, UMOUNT_SYNC);
1652 unlock_mount_hash(); 1699 unlock_mount_hash();
1653 } 1700 }
1654 return q; 1701 return q;
@@ -1660,8 +1707,11 @@ struct vfsmount *collect_mounts(struct path *path)
1660{ 1707{
1661 struct mount *tree; 1708 struct mount *tree;
1662 namespace_lock(); 1709 namespace_lock();
1663 tree = copy_tree(real_mount(path->mnt), path->dentry, 1710 if (!check_mnt(real_mount(path->mnt)))
1664 CL_COPY_ALL | CL_PRIVATE); 1711 tree = ERR_PTR(-EINVAL);
1712 else
1713 tree = copy_tree(real_mount(path->mnt), path->dentry,
1714 CL_COPY_ALL | CL_PRIVATE);
1665 namespace_unlock(); 1715 namespace_unlock();
1666 if (IS_ERR(tree)) 1716 if (IS_ERR(tree))
1667 return ERR_CAST(tree); 1717 return ERR_CAST(tree);
@@ -1672,7 +1722,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
1672{ 1722{
1673 namespace_lock(); 1723 namespace_lock();
1674 lock_mount_hash(); 1724 lock_mount_hash();
1675 umount_tree(real_mount(mnt), 0); 1725 umount_tree(real_mount(mnt), UMOUNT_SYNC);
1676 unlock_mount_hash(); 1726 unlock_mount_hash();
1677 namespace_unlock(); 1727 namespace_unlock();
1678} 1728}
@@ -1855,7 +1905,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
1855 out_cleanup_ids: 1905 out_cleanup_ids:
1856 while (!hlist_empty(&tree_list)) { 1906 while (!hlist_empty(&tree_list)) {
1857 child = hlist_entry(tree_list.first, struct mount, mnt_hash); 1907 child = hlist_entry(tree_list.first, struct mount, mnt_hash);
1858 umount_tree(child, 0); 1908 umount_tree(child, UMOUNT_SYNC);
1859 } 1909 }
1860 unlock_mount_hash(); 1910 unlock_mount_hash();
1861 cleanup_group_ids(source_mnt, NULL); 1911 cleanup_group_ids(source_mnt, NULL);
@@ -2035,7 +2085,7 @@ static int do_loopback(struct path *path, const char *old_name,
2035 err = graft_tree(mnt, parent, mp); 2085 err = graft_tree(mnt, parent, mp);
2036 if (err) { 2086 if (err) {
2037 lock_mount_hash(); 2087 lock_mount_hash();
2038 umount_tree(mnt, 0); 2088 umount_tree(mnt, UMOUNT_SYNC);
2039 unlock_mount_hash(); 2089 unlock_mount_hash();
2040 } 2090 }
2041out2: 2091out2:
@@ -2406,7 +2456,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
2406 while (!list_empty(&graveyard)) { 2456 while (!list_empty(&graveyard)) {
2407 mnt = list_first_entry(&graveyard, struct mount, mnt_expire); 2457 mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
2408 touch_mnt_namespace(mnt->mnt_ns); 2458 touch_mnt_namespace(mnt->mnt_ns);
2409 umount_tree(mnt, 1); 2459 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
2410 } 2460 }
2411 unlock_mount_hash(); 2461 unlock_mount_hash();
2412 namespace_unlock(); 2462 namespace_unlock();
@@ -2477,7 +2527,7 @@ static void shrink_submounts(struct mount *mnt)
2477 m = list_first_entry(&graveyard, struct mount, 2527 m = list_first_entry(&graveyard, struct mount,
2478 mnt_expire); 2528 mnt_expire);
2479 touch_mnt_namespace(m->mnt_ns); 2529 touch_mnt_namespace(m->mnt_ns);
2480 umount_tree(m, 1); 2530 umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
2481 } 2531 }
2482 } 2532 }
2483} 2533}
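
The umount_tree() rework replaces the old numeric how argument (0/1/2, documented only in the comment being deleted) with an or-able enum, so each caller names the behaviour it wants and the callee tests individual bits. The pattern in miniature; the enum values are copied from the diff, the function body is illustrative:

/* Sketch only: "magic int" argument converted to named bit flags. */
#include <stdio.h>

enum umount_tree_flags {
	UMOUNT_SYNC      = 1,
	UMOUNT_PROPAGATE = 2,
	UMOUNT_CONNECTED = 4,
};

static void umount_tree(const char *mnt, enum umount_tree_flags how)
{
	if (how & UMOUNT_PROPAGATE)
		printf("%s: propagate to peers and slaves\n", mnt);
	if (how & UMOUNT_SYNC)
		printf("%s: synchronous, mark MNT_SYNC_UMOUNT\n", mnt);
	if (how & UMOUNT_CONNECTED)
		printf("%s: keep connected children attached\n", mnt);
}

int main(void)
{
	umount_tree("/mnt/a", UMOUNT_PROPAGATE | UMOUNT_SYNC); /* old how=1 */
	umount_tree("/mnt/b", UMOUNT_PROPAGATE);               /* old how=2 */
	return 0;
}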
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index fc2d108f5272..a0b77fc1bd39 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -108,7 +108,7 @@ config NFSD_V4_SECURITY_LABEL
108 108
109config NFSD_FAULT_INJECTION 109config NFSD_FAULT_INJECTION
110 bool "NFS server manual fault injection" 110 bool "NFS server manual fault injection"
111 depends on NFSD_V4 && DEBUG_KERNEL 111 depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS
112 help 112 help
113 This option enables support for manually injecting faults 113 This option enables support for manually injecting faults
114 into the NFS server. This is intended to be used for 114 into the NFS server. This is intended to be used for
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c3e3b6e55ae2..900c3ae94adc 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -691,8 +691,7 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b)
691 struct svc_export *orig = container_of(a, struct svc_export, h); 691 struct svc_export *orig = container_of(a, struct svc_export, h);
692 struct svc_export *new = container_of(b, struct svc_export, h); 692 struct svc_export *new = container_of(b, struct svc_export, h);
693 return orig->ex_client == new->ex_client && 693 return orig->ex_client == new->ex_client &&
694 orig->ex_path.dentry == new->ex_path.dentry && 694 path_equal(&orig->ex_path, &new->ex_path);
695 orig->ex_path.mnt == new->ex_path.mnt;
696} 695}
697 696
698static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) 697static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
@@ -1159,6 +1158,7 @@ static struct flags {
1159 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1158 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
1160 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, 1159 { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
1161 { NFSEXP_V4ROOT, {"v4root", ""}}, 1160 { NFSEXP_V4ROOT, {"v4root", ""}},
1161 { NFSEXP_PNFS, {"pnfs", ""}},
1162 { 0, {"", ""}} 1162 { 0, {"", ""}}
1163}; 1163};
1164 1164
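
svc_export_match() now calls path_equal(), which compares both members of struct path in one place; open-coding the comparison and forgetting one member is exactly the bug class the helper removes. A self-contained sketch, with opaque pointers standing in for the real vfsmount and dentry types:

/* Sketch only: compare both halves of a struct path together. */
#include <stdio.h>

struct path { const void *mnt; const void *dentry; };

static int path_equal(const struct path *a, const struct path *b)
{
	return a->mnt == b->mnt && a->dentry == b->dentry;
}

int main(void)
{
	int m1, d1, d2;
	struct path a = { &m1, &d1 }, b = { &m1, &d2 };

	printf("%d\n", path_equal(&a, &b)); /* 0: same mount, other dentry */
	return 0;
}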
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 59fd76651781..eaf4605a4b9e 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -499,43 +499,13 @@ static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_s
499 state->mask.allow |= astate->allow; 499 state->mask.allow |= astate->allow;
500} 500}
501 501
502/*
503 * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS,
504 * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate
505 * to traditional read/write/execute permissions.
506 *
507 * It's problematic to reject acls that use certain mode bits, because it
508 * places the burden on users to learn the rules about which bits one
509 * particular server sets, without giving the user a lot of help--we return an
510 * error that could mean any number of different things. To make matters
511 * worse, the problematic bits might be introduced by some application that's
512 * automatically mapping from some other acl model.
513 *
514 * So wherever possible we accept anything, possibly erring on the side of
515 * denying more permissions than necessary.
516 *
517 * However we do reject *explicit* DENY's of a few bits representing
518 * permissions we could never deny:
519 */
520
521static inline int check_deny(u32 mask, int isowner)
522{
523 if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL))
524 return -EINVAL;
525 if (!isowner)
526 return 0;
527 if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL))
528 return -EINVAL;
529 return 0;
530}
531
532static struct posix_acl * 502static struct posix_acl *
533posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) 503posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
534{ 504{
535 struct posix_acl_entry *pace; 505 struct posix_acl_entry *pace;
536 struct posix_acl *pacl; 506 struct posix_acl *pacl;
537 int nace; 507 int nace;
538 int i, error = 0; 508 int i;
539 509
540 /* 510 /*
541 * ACLs with no ACEs are treated differently in the inheritable 511 * ACLs with no ACEs are treated differently in the inheritable
@@ -560,17 +530,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
560 530
561 pace = pacl->a_entries; 531 pace = pacl->a_entries;
562 pace->e_tag = ACL_USER_OBJ; 532 pace->e_tag = ACL_USER_OBJ;
563 error = check_deny(state->owner.deny, 1);
564 if (error)
565 goto out_err;
566 low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); 533 low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags);
567 534
568 for (i=0; i < state->users->n; i++) { 535 for (i=0; i < state->users->n; i++) {
569 pace++; 536 pace++;
570 pace->e_tag = ACL_USER; 537 pace->e_tag = ACL_USER;
571 error = check_deny(state->users->aces[i].perms.deny, 0);
572 if (error)
573 goto out_err;
574 low_mode_from_nfs4(state->users->aces[i].perms.allow, 538 low_mode_from_nfs4(state->users->aces[i].perms.allow,
575 &pace->e_perm, flags); 539 &pace->e_perm, flags);
576 pace->e_uid = state->users->aces[i].uid; 540 pace->e_uid = state->users->aces[i].uid;
@@ -579,18 +543,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
579 543
580 pace++; 544 pace++;
581 pace->e_tag = ACL_GROUP_OBJ; 545 pace->e_tag = ACL_GROUP_OBJ;
582 error = check_deny(state->group.deny, 0);
583 if (error)
584 goto out_err;
585 low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); 546 low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags);
586 add_to_mask(state, &state->group); 547 add_to_mask(state, &state->group);
587 548
588 for (i=0; i < state->groups->n; i++) { 549 for (i=0; i < state->groups->n; i++) {
589 pace++; 550 pace++;
590 pace->e_tag = ACL_GROUP; 551 pace->e_tag = ACL_GROUP;
591 error = check_deny(state->groups->aces[i].perms.deny, 0);
592 if (error)
593 goto out_err;
594 low_mode_from_nfs4(state->groups->aces[i].perms.allow, 552 low_mode_from_nfs4(state->groups->aces[i].perms.allow,
595 &pace->e_perm, flags); 553 &pace->e_perm, flags);
596 pace->e_gid = state->groups->aces[i].gid; 554 pace->e_gid = state->groups->aces[i].gid;
@@ -605,15 +563,9 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
605 563
606 pace++; 564 pace++;
607 pace->e_tag = ACL_OTHER; 565 pace->e_tag = ACL_OTHER;
608 error = check_deny(state->other.deny, 0);
609 if (error)
610 goto out_err;
611 low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); 566 low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags);
612 567
613 return pacl; 568 return pacl;
614out_err:
615 posix_acl_release(pacl);
616 return ERR_PTR(error);
617} 569}
618 570
619static inline void allow_bits(struct posix_ace_state *astate, u32 mask) 571static inline void allow_bits(struct posix_ace_state *astate, u32 mask)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 92b9d97aff4f..4a8314f08a0e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -470,7 +470,7 @@ out:
470 fh_put(resfh); 470 fh_put(resfh);
471 kfree(resfh); 471 kfree(resfh);
472 } 472 }
473 nfsd4_cleanup_open_state(cstate, open, status); 473 nfsd4_cleanup_open_state(cstate, open);
474 nfsd4_bump_seqid(cstate, status); 474 nfsd4_bump_seqid(cstate, status);
475 return status; 475 return status;
476} 476}
@@ -1030,6 +1030,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1030 dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); 1030 dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
1031 return status; 1031 return status;
1032 } 1032 }
1033 if (!file)
1034 return nfserr_bad_stateid;
1033 1035
1034 status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, 1036 status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file,
1035 fallocate->falloc_offset, 1037 fallocate->falloc_offset,
@@ -1069,6 +1071,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1069 dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); 1071 dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
1070 return status; 1072 return status;
1071 } 1073 }
1074 if (!file)
1075 return nfserr_bad_stateid;
1072 1076
1073 switch (seek->seek_whence) { 1077 switch (seek->seek_whence) {
1074 case NFS4_CONTENT_DATA: 1078 case NFS4_CONTENT_DATA:
@@ -1815,7 +1819,7 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
1815 bmap0 &= ~FATTR4_WORD0_FILEHANDLE; 1819 bmap0 &= ~FATTR4_WORD0_FILEHANDLE;
1816 } 1820 }
1817 if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { 1821 if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) {
1818 ret += NFSD4_MAX_SEC_LABEL_LEN + 12; 1822 ret += NFS4_MAXLABELLEN + 12;
1819 bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; 1823 bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL;
1820 } 1824 }
1821 /* 1825 /*
@@ -2282,13 +2286,13 @@ static struct nfsd4_operation nfsd4_ops[] = {
2282 .op_func = (nfsd4op_func)nfsd4_allocate, 2286 .op_func = (nfsd4op_func)nfsd4_allocate,
2283 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, 2287 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
2284 .op_name = "OP_ALLOCATE", 2288 .op_name = "OP_ALLOCATE",
2285 .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, 2289 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
2286 }, 2290 },
2287 [OP_DEALLOCATE] = { 2291 [OP_DEALLOCATE] = {
2288 .op_func = (nfsd4op_func)nfsd4_deallocate, 2292 .op_func = (nfsd4op_func)nfsd4_deallocate,
2289 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, 2293 .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
2290 .op_name = "OP_DEALLOCATE", 2294 .op_name = "OP_DEALLOCATE",
2291 .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, 2295 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
2292 }, 2296 },
2293 [OP_SEEK] = { 2297 [OP_SEEK] = {
2294 .op_func = (nfsd4op_func)nfsd4_seek, 2298 .op_func = (nfsd4op_func)nfsd4_seek,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 326a545ea7b2..d42786ee39af 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1139,7 +1139,7 @@ hash_sessionid(struct nfs4_sessionid *sessionid)
1139 return sid->sequence % SESSION_HASH_SIZE; 1139 return sid->sequence % SESSION_HASH_SIZE;
1140} 1140}
1141 1141
1142#ifdef NFSD_DEBUG 1142#ifdef CONFIG_SUNRPC_DEBUG
1143static inline void 1143static inline void
1144dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) 1144dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
1145{ 1145{
@@ -4049,7 +4049,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
4049 status = nfserr_bad_stateid; 4049 status = nfserr_bad_stateid;
4050 if (nfsd4_is_deleg_cur(open)) 4050 if (nfsd4_is_deleg_cur(open))
4051 goto out; 4051 goto out;
4052 status = nfserr_jukebox;
4053 } 4052 }
4054 4053
4055 /* 4054 /*
@@ -4118,7 +4117,7 @@ out:
4118} 4117}
4119 4118
4120void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, 4119void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
4121 struct nfsd4_open *open, __be32 status) 4120 struct nfsd4_open *open)
4122{ 4121{
4123 if (open->op_openowner) { 4122 if (open->op_openowner) {
4124 struct nfs4_stateowner *so = &open->op_openowner->oo_owner; 4123 struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5fb7e78169a6..a45032ce7b80 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -424,7 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
424 len += 4; 424 len += 4;
425 dummy32 = be32_to_cpup(p++); 425 dummy32 = be32_to_cpup(p++);
426 READ_BUF(dummy32); 426 READ_BUF(dummy32);
427 if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) 427 if (dummy32 > NFS4_MAXLABELLEN)
428 return nfserr_badlabel; 428 return nfserr_badlabel;
429 len += (XDR_QUADLEN(dummy32) << 2); 429 len += (XDR_QUADLEN(dummy32) << 2);
430 READMEM(buf, dummy32); 430 READMEM(buf, dummy32);
@@ -2020,7 +2020,7 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
2020 * dentries/path components in an array. 2020 * dentries/path components in an array.
2021 */ 2021 */
2022 for (;;) { 2022 for (;;) {
2023 if (cur.dentry == root->dentry && cur.mnt == root->mnt) 2023 if (path_equal(&cur, root))
2024 break; 2024 break;
2025 if (cur.dentry == cur.mnt->mnt_root) { 2025 if (cur.dentry == cur.mnt->mnt_root) {
2026 if (follow_up(&cur)) 2026 if (follow_up(&cur))
@@ -3422,6 +3422,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
3422 unsigned long maxcount; 3422 unsigned long maxcount;
3423 struct xdr_stream *xdr = &resp->xdr; 3423 struct xdr_stream *xdr = &resp->xdr;
3424 struct file *file = read->rd_filp; 3424 struct file *file = read->rd_filp;
3425 struct svc_fh *fhp = read->rd_fhp;
3425 int starting_len = xdr->buf->len; 3426 int starting_len = xdr->buf->len;
3426 struct raparms *ra; 3427 struct raparms *ra;
3427 __be32 *p; 3428 __be32 *p;
@@ -3445,12 +3446,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
3445 maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); 3446 maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
3446 maxcount = min_t(unsigned long, maxcount, read->rd_length); 3447 maxcount = min_t(unsigned long, maxcount, read->rd_length);
3447 3448
3448 if (!read->rd_filp) { 3449 if (read->rd_filp)
3450 err = nfsd_permission(resp->rqstp, fhp->fh_export,
3451 fhp->fh_dentry,
3452 NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
3453 else
3449 err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, 3454 err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
3450 &file, &ra); 3455 &file, &ra);
3451 if (err) 3456 if (err)
3452 goto err_truncate; 3457 goto err_truncate;
3453 }
3454 3458
3455 if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) 3459 if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
3456 err = nfsd4_encode_splice_read(resp, read, file, maxcount); 3460 err = nfsd4_encode_splice_read(resp, read, file, maxcount);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index aa47d75ddb26..9690cb4dd588 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1250,15 +1250,15 @@ static int __init init_nfsd(void)
1250 int retval; 1250 int retval;
1251 printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); 1251 printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
1252 1252
1253 retval = register_cld_notifier();
1254 if (retval)
1255 return retval;
1256 retval = register_pernet_subsys(&nfsd_net_ops); 1253 retval = register_pernet_subsys(&nfsd_net_ops);
1257 if (retval < 0) 1254 if (retval < 0)
1258 goto out_unregister_notifier; 1255 return retval;
1259 retval = nfsd4_init_slabs(); 1256 retval = register_cld_notifier();
1260 if (retval) 1257 if (retval)
1261 goto out_unregister_pernet; 1258 goto out_unregister_pernet;
1259 retval = nfsd4_init_slabs();
1260 if (retval)
1261 goto out_unregister_notifier;
1262 retval = nfsd4_init_pnfs(); 1262 retval = nfsd4_init_pnfs();
1263 if (retval) 1263 if (retval)
1264 goto out_free_slabs; 1264 goto out_free_slabs;
@@ -1290,10 +1290,10 @@ out_exit_pnfs:
1290 nfsd4_exit_pnfs(); 1290 nfsd4_exit_pnfs();
1291out_free_slabs: 1291out_free_slabs:
1292 nfsd4_free_slabs(); 1292 nfsd4_free_slabs();
1293out_unregister_pernet:
1294 unregister_pernet_subsys(&nfsd_net_ops);
1295out_unregister_notifier: 1293out_unregister_notifier:
1296 unregister_cld_notifier(); 1294 unregister_cld_notifier();
1295out_unregister_pernet:
1296 unregister_pernet_subsys(&nfsd_net_ops);
1297 return retval; 1297 return retval;
1298} 1298}
1299 1299
@@ -1308,8 +1308,8 @@ static void __exit exit_nfsd(void)
1308 nfsd4_exit_pnfs(); 1308 nfsd4_exit_pnfs();
1309 nfsd_fault_inject_cleanup(); 1309 nfsd_fault_inject_cleanup();
1310 unregister_filesystem(&nfsd_fs_type); 1310 unregister_filesystem(&nfsd_fs_type);
1311 unregister_pernet_subsys(&nfsd_net_ops);
1312 unregister_cld_notifier(); 1311 unregister_cld_notifier();
1312 unregister_pernet_subsys(&nfsd_net_ops);
1313} 1313}
1314 1314
1315MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); 1315MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
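
The nfsctl.c reordering makes registration order and teardown order strict mirrors, both in the init error path and in exit_nfsd(). That is the classic goto-unwind idiom: each setup step gets a label that undoes everything already done, in reverse. A sketch with placeholder steps, where the third step fails:

/* Sketch only: goto-based unwind in reverse registration order. */
#include <stdio.h>

static int reg_pernet(void)     { puts("+pernet");   return 0; }
static void unreg_pernet(void)  { puts("-pernet"); }
static int reg_notifier(void)   { puts("+notifier"); return 0; }
static void unreg_notifier(void){ puts("-notifier"); }
static int init_slabs(void)     { puts("+slabs");    return -1; } /* fails */

static int init_module_sketch(void)
{
	int retval;

	retval = reg_pernet();
	if (retval)
		return retval;
	retval = reg_notifier();
	if (retval)
		goto out_unregister_pernet;
	retval = init_slabs();
	if (retval)
		goto out_unregister_notifier;
	return 0;

out_unregister_notifier:
	unreg_notifier();
out_unregister_pernet:
	unreg_pernet();
	return retval;
}

int main(void)
{
	return init_module_sketch() ? 1 : 0;
}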
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 565c4da1a9eb..cf980523898b 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -24,7 +24,7 @@
24#include "export.h" 24#include "export.h"
25 25
26#undef ifdebug 26#undef ifdebug
27#ifdef NFSD_DEBUG 27#ifdef CONFIG_SUNRPC_DEBUG
28# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) 28# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag)
29#else 29#else
30# define ifdebug(flag) if (0) 30# define ifdebug(flag) if (0)
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 0bda93e58e1b..556ce2e47555 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -40,7 +40,6 @@
40#include "state.h" 40#include "state.h"
41#include "nfsd.h" 41#include "nfsd.h"
42 42
43#define NFSD4_MAX_SEC_LABEL_LEN 2048
44#define NFSD4_MAX_TAGLEN 128 43#define NFSD4_MAX_TAGLEN 128
45#define XDR_LEN(n) (((n) + 3) & ~3) 44#define XDR_LEN(n) (((n) + 3) & ~3)
46 45
@@ -683,7 +682,7 @@ extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
683 struct svc_fh *current_fh, struct nfsd4_open *open); 682 struct svc_fh *current_fh, struct nfsd4_open *open);
684extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); 683extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate);
685extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, 684extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
686 struct nfsd4_open *open, __be32 status); 685 struct nfsd4_open *open);
687extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, 686extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
688 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); 687 struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
689extern __be32 nfsd4_close(struct svc_rqst *rqstp, 688extern __be32 nfsd4_close(struct svc_rqst *rqstp,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 8e19b9d7aba8..16eff45727ee 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1312,9 +1312,7 @@ static int o2hb_debug_init(void)
1312 int ret = -ENOMEM; 1312 int ret = -ENOMEM;
1313 1313
1314 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); 1314 o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL);
1315 if (IS_ERR_OR_NULL(o2hb_debug_dir)) { 1315 if (!o2hb_debug_dir) {
1316 ret = o2hb_debug_dir ?
1317 PTR_ERR(o2hb_debug_dir) : -ENOMEM;
1318 mlog_errno(ret); 1316 mlog_errno(ret);
1319 goto bail; 1317 goto bail;
1320 } 1318 }
@@ -1327,9 +1325,7 @@ static int o2hb_debug_init(void)
1327 sizeof(o2hb_live_node_bitmap), 1325 sizeof(o2hb_live_node_bitmap),
1328 O2NM_MAX_NODES, 1326 O2NM_MAX_NODES,
1329 o2hb_live_node_bitmap); 1327 o2hb_live_node_bitmap);
1330 if (IS_ERR_OR_NULL(o2hb_debug_livenodes)) { 1328 if (!o2hb_debug_livenodes) {
1331 ret = o2hb_debug_livenodes ?
1332 PTR_ERR(o2hb_debug_livenodes) : -ENOMEM;
1333 mlog_errno(ret); 1329 mlog_errno(ret);
1334 goto bail; 1330 goto bail;
1335 } 1331 }
@@ -1342,9 +1338,7 @@ static int o2hb_debug_init(void)
1342 sizeof(o2hb_live_region_bitmap), 1338 sizeof(o2hb_live_region_bitmap),
1343 O2NM_MAX_REGIONS, 1339 O2NM_MAX_REGIONS,
1344 o2hb_live_region_bitmap); 1340 o2hb_live_region_bitmap);
1345 if (IS_ERR_OR_NULL(o2hb_debug_liveregions)) { 1341 if (!o2hb_debug_liveregions) {
1346 ret = o2hb_debug_liveregions ?
1347 PTR_ERR(o2hb_debug_liveregions) : -ENOMEM;
1348 mlog_errno(ret); 1342 mlog_errno(ret);
1349 goto bail; 1343 goto bail;
1350 } 1344 }
@@ -1358,9 +1352,7 @@ static int o2hb_debug_init(void)
1358 sizeof(o2hb_quorum_region_bitmap), 1352 sizeof(o2hb_quorum_region_bitmap),
1359 O2NM_MAX_REGIONS, 1353 O2NM_MAX_REGIONS,
1360 o2hb_quorum_region_bitmap); 1354 o2hb_quorum_region_bitmap);
1361 if (IS_ERR_OR_NULL(o2hb_debug_quorumregions)) { 1355 if (!o2hb_debug_quorumregions) {
1362 ret = o2hb_debug_quorumregions ?
1363 PTR_ERR(o2hb_debug_quorumregions) : -ENOMEM;
1364 mlog_errno(ret); 1356 mlog_errno(ret);
1365 goto bail; 1357 goto bail;
1366 } 1358 }
@@ -1374,9 +1366,7 @@ static int o2hb_debug_init(void)
1374 sizeof(o2hb_failed_region_bitmap), 1366 sizeof(o2hb_failed_region_bitmap),
1375 O2NM_MAX_REGIONS, 1367 O2NM_MAX_REGIONS,
1376 o2hb_failed_region_bitmap); 1368 o2hb_failed_region_bitmap);
1377 if (IS_ERR_OR_NULL(o2hb_debug_failedregions)) { 1369 if (!o2hb_debug_failedregions) {
1378 ret = o2hb_debug_failedregions ?
1379 PTR_ERR(o2hb_debug_failedregions) : -ENOMEM;
1380 mlog_errno(ret); 1370 mlog_errno(ret);
1381 goto bail; 1371 goto bail;
1382 } 1372 }
@@ -2010,8 +2000,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2010 2000
2011 reg->hr_debug_dir = 2001 reg->hr_debug_dir =
2012 debugfs_create_dir(config_item_name(&reg->hr_item), dir); 2002 debugfs_create_dir(config_item_name(&reg->hr_item), dir);
2013 if (IS_ERR_OR_NULL(reg->hr_debug_dir)) { 2003 if (!reg->hr_debug_dir) {
2014 ret = reg->hr_debug_dir ? PTR_ERR(reg->hr_debug_dir) : -ENOMEM;
2015 mlog_errno(ret); 2004 mlog_errno(ret);
2016 goto bail; 2005 goto bail;
2017 } 2006 }
@@ -2024,9 +2013,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2024 O2HB_DB_TYPE_REGION_LIVENODES, 2013 O2HB_DB_TYPE_REGION_LIVENODES,
2025 sizeof(reg->hr_live_node_bitmap), 2014 sizeof(reg->hr_live_node_bitmap),
2026 O2NM_MAX_NODES, reg); 2015 O2NM_MAX_NODES, reg);
2027 if (IS_ERR_OR_NULL(reg->hr_debug_livenodes)) { 2016 if (!reg->hr_debug_livenodes) {
2028 ret = reg->hr_debug_livenodes ?
2029 PTR_ERR(reg->hr_debug_livenodes) : -ENOMEM;
2030 mlog_errno(ret); 2017 mlog_errno(ret);
2031 goto bail; 2018 goto bail;
2032 } 2019 }
@@ -2038,9 +2025,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2038 sizeof(*(reg->hr_db_regnum)), 2025 sizeof(*(reg->hr_db_regnum)),
2039 O2HB_DB_TYPE_REGION_NUMBER, 2026 O2HB_DB_TYPE_REGION_NUMBER,
2040 0, O2NM_MAX_NODES, reg); 2027 0, O2NM_MAX_NODES, reg);
2041 if (IS_ERR_OR_NULL(reg->hr_debug_regnum)) { 2028 if (!reg->hr_debug_regnum) {
2042 ret = reg->hr_debug_regnum ?
2043 PTR_ERR(reg->hr_debug_regnum) : -ENOMEM;
2044 mlog_errno(ret); 2029 mlog_errno(ret);
2045 goto bail; 2030 goto bail;
2046 } 2031 }
@@ -2052,9 +2037,7 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2052 sizeof(*(reg->hr_db_elapsed_time)), 2037 sizeof(*(reg->hr_db_elapsed_time)),
2053 O2HB_DB_TYPE_REGION_ELAPSED_TIME, 2038 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
2054 0, 0, reg); 2039 0, 0, reg);
2055 if (IS_ERR_OR_NULL(reg->hr_debug_elapsed_time)) { 2040 if (!reg->hr_debug_elapsed_time) {
2056 ret = reg->hr_debug_elapsed_time ?
2057 PTR_ERR(reg->hr_debug_elapsed_time) : -ENOMEM;
2058 mlog_errno(ret); 2041 mlog_errno(ret);
2059 goto bail; 2042 goto bail;
2060 } 2043 }
@@ -2066,16 +2049,13 @@ static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir)
2066 sizeof(*(reg->hr_db_pinned)), 2049 sizeof(*(reg->hr_db_pinned)),
2067 O2HB_DB_TYPE_REGION_PINNED, 2050 O2HB_DB_TYPE_REGION_PINNED,
2068 0, 0, reg); 2051 0, 0, reg);
2069 if (IS_ERR_OR_NULL(reg->hr_debug_pinned)) { 2052 if (!reg->hr_debug_pinned) {
2070 ret = reg->hr_debug_pinned ?
2071 PTR_ERR(reg->hr_debug_pinned) : -ENOMEM;
2072 mlog_errno(ret); 2053 mlog_errno(ret);
2073 goto bail; 2054 goto bail;
2074 } 2055 }
2075 2056
2076 return 0; 2057 ret = 0;
2077bail: 2058bail:
2078 debugfs_remove_recursive(reg->hr_debug_dir);
2079 return ret; 2059 return ret;
2080} 2060}
2081 2061
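
The ocfs2 hunks revert to plain NULL checks because, with the debugfs API of this kernel, the create helpers return NULL on failure (an ERR_PTR shows up only when debugfs is compiled out), so IS_ERR_OR_NULL() was the wrong test. A sketch contrasting the two C error-return conventions, using simplified userspace stand-ins for the kernel's ERR_PTR macros:

/* Sketch only: "NULL on failure" vs "errno encoded in the pointer". */
#include <stdio.h>

#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *create_null_on_failure(void)   { return NULL; }
static void *create_errptr_on_failure(void) { return ERR_PTR(-12); }

int main(void)
{
	void *a = create_null_on_failure();
	void *b = create_errptr_on_failure();

	if (!a)			/* debugfs-style: NULL means failure */
		puts("a failed: treat as -ENOMEM");
	if (IS_ERR(b))		/* ERR_PTR-style: decode the errno */
		printf("b failed: %ld\n", PTR_ERR(b));	/* -12 == -ENOMEM */
	return 0;
}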
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 956edf67be20..8b23aa2f52dd 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2959,7 +2959,7 @@ static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2959 osb->osb_debug_root, 2959 osb->osb_debug_root,
2960 osb, 2960 osb,
2961 &ocfs2_dlm_debug_fops); 2961 &ocfs2_dlm_debug_fops);
2962 if (IS_ERR_OR_NULL(dlm_debug->d_locking_state)) { 2962 if (!dlm_debug->d_locking_state) {
2963 ret = -EINVAL; 2963 ret = -EINVAL;
2964 mlog(ML_ERROR, 2964 mlog(ML_ERROR,
2965 "Unable to create locking state debugfs file.\n"); 2965 "Unable to create locking state debugfs file.\n");
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 837ddce4b659..403c5660b306 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1112,7 +1112,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1112 1112
1113 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 1113 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
1114 ocfs2_debugfs_root); 1114 ocfs2_debugfs_root);
1115 if (IS_ERR_OR_NULL(osb->osb_debug_root)) { 1115 if (!osb->osb_debug_root) {
1116 status = -EINVAL; 1116 status = -EINVAL;
1117 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n"); 1117 mlog(ML_ERROR, "Unable to create per-mount debugfs root.\n");
1118 goto read_super_error; 1118 goto read_super_error;
@@ -1122,7 +1122,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1122 osb->osb_debug_root, 1122 osb->osb_debug_root,
1123 osb, 1123 osb,
1124 &ocfs2_osb_debug_fops); 1124 &ocfs2_osb_debug_fops);
1125 if (IS_ERR_OR_NULL(osb->osb_ctxt)) { 1125 if (!osb->osb_ctxt) {
1126 status = -EINVAL; 1126 status = -EINVAL;
1127 mlog_errno(status); 1127 mlog_errno(status);
1128 goto read_super_error; 1128 goto read_super_error;
@@ -1606,9 +1606,8 @@ static int __init ocfs2_init(void)
1606 } 1606 }
1607 1607
1608 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL); 1608 ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
1609 if (IS_ERR_OR_NULL(ocfs2_debugfs_root)) { 1609 if (!ocfs2_debugfs_root) {
1610 status = ocfs2_debugfs_root ? 1610 status = -ENOMEM;
1611 PTR_ERR(ocfs2_debugfs_root) : -ENOMEM;
1612 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1611 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1613 goto out4; 1612 goto out4;
1614 } 1613 }
diff --git a/fs/open.c b/fs/open.c
index 6796f04d6032..98e5a52dc68c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -231,8 +231,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
231 return -EINVAL; 231 return -EINVAL;
232 232
233 /* Return error if mode is not supported */ 233 /* Return error if mode is not supported */
234 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 234 if (mode & ~FALLOC_FL_SUPPORTED_MASK)
235 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
236 return -EOPNOTSUPP; 235 return -EOPNOTSUPP;
237 236
238 /* Punch hole and zero range are mutually exclusive */ 237 /* Punch hole and zero range are mutually exclusive */
@@ -250,6 +249,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
250 (mode & ~FALLOC_FL_COLLAPSE_RANGE)) 249 (mode & ~FALLOC_FL_COLLAPSE_RANGE))
251 return -EINVAL; 250 return -EINVAL;
252 251
252 /* Insert range should only be used exclusively. */
253 if ((mode & FALLOC_FL_INSERT_RANGE) &&
254 (mode & ~FALLOC_FL_INSERT_RANGE))
255 return -EINVAL;
256
253 if (!(file->f_mode & FMODE_WRITE)) 257 if (!(file->f_mode & FMODE_WRITE))
254 return -EBADF; 258 return -EBADF;
255 259
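
vfs_fallocate() now validates mode in two moves: any bit outside FALLOC_FL_SUPPORTED_MASK yields -EOPNOTSUPP, and an exclusive flag such as the new FALLOC_FL_INSERT_RANGE must appear alone, enforced with the (mode & FLAG) && (mode & ~FLAG) shape. A compact sketch; the flag values follow the uapi header of the period, but treat them here as illustrative:

/* Sketch only: mask validation plus an exclusivity check. */
#include <stdio.h>

#define FL_KEEP_SIZE		0x01
#define FL_PUNCH_HOLE		0x02
#define FL_COLLAPSE_RANGE	0x08
#define FL_INSERT_RANGE		0x20
#define FL_SUPPORTED_MASK	(FL_KEEP_SIZE | FL_PUNCH_HOLE | \
				 FL_COLLAPSE_RANGE | FL_INSERT_RANGE)

static int check_mode(int mode)
{
	if (mode & ~FL_SUPPORTED_MASK)
		return -95;			/* -EOPNOTSUPP */
	/* insert range should only be used exclusively */
	if ((mode & FL_INSERT_RANGE) && (mode & ~FL_INSERT_RANGE))
		return -22;			/* -EINVAL */
	return 0;
}

int main(void)
{
	printf("%d\n", check_mode(FL_INSERT_RANGE));			/* 0 */
	printf("%d\n", check_mode(FL_INSERT_RANGE | FL_KEEP_SIZE));	/* -22 */
	printf("%d\n", check_mode(0x100));				/* -95 */
	return 0;
}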
diff --git a/fs/pnode.c b/fs/pnode.c
index 260ac8f898a4..6367e1e435c6 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -362,6 +362,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
362} 362}
363 363
364/* 364/*
365 * Clear MNT_LOCKED when it can be shown to be safe.
366 *
367 * mount_lock lock must be held for write
368 */
369void propagate_mount_unlock(struct mount *mnt)
370{
371 struct mount *parent = mnt->mnt_parent;
372 struct mount *m, *child;
373
374 BUG_ON(parent == mnt);
375
376 for (m = propagation_next(parent, parent); m;
377 m = propagation_next(m, parent)) {
378 child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
379 if (child)
380 child->mnt.mnt_flags &= ~MNT_LOCKED;
381 }
382}
383
384/*
385 * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
386 */
387static void mark_umount_candidates(struct mount *mnt)
388{
389 struct mount *parent = mnt->mnt_parent;
390 struct mount *m;
391
392 BUG_ON(parent == mnt);
393
394 for (m = propagation_next(parent, parent); m;
395 m = propagation_next(m, parent)) {
396 struct mount *child = __lookup_mnt_last(&m->mnt,
397 mnt->mnt_mountpoint);
398 if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
399 SET_MNT_MARK(child);
400 }
401 }
402}
403
404/*
365 * NOTE: unmounting 'mnt' naturally propagates to all other mounts its 405 * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
366 * parent propagates to. 406 * parent propagates to.
367 */ 407 */
@@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt)
378 struct mount *child = __lookup_mnt_last(&m->mnt, 418 struct mount *child = __lookup_mnt_last(&m->mnt,
379 mnt->mnt_mountpoint); 419 mnt->mnt_mountpoint);
380 /* 420 /*
381 * umount the child only if the child has no 421 * umount the child only if the child has no children
382 * other children 422 * and the child is marked safe to unmount.
383 */ 423 */
384 if (child && list_empty(&child->mnt_mounts)) { 424 if (!child || !IS_MNT_MARKED(child))
425 continue;
426 CLEAR_MNT_MARK(child);
427 if (list_empty(&child->mnt_mounts)) {
385 list_del_init(&child->mnt_child); 428 list_del_init(&child->mnt_child);
386 hlist_del_init_rcu(&child->mnt_hash); 429 child->mnt.mnt_flags |= MNT_UMOUNT;
387 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); 430 list_move_tail(&child->mnt_list, &mnt->mnt_list);
388 } 431 }
389 } 432 }
390} 433}
@@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt)
396 * 439 *
397 * vfsmount lock must be held for write 440 * vfsmount lock must be held for write
398 */ 441 */
399int propagate_umount(struct hlist_head *list) 442int propagate_umount(struct list_head *list)
400{ 443{
401 struct mount *mnt; 444 struct mount *mnt;
402 445
403 hlist_for_each_entry(mnt, list, mnt_hash) 446 list_for_each_entry_reverse(mnt, list, mnt_list)
447 mark_umount_candidates(mnt);
448
449 list_for_each_entry(mnt, list, mnt_list)
404 __propagate_umount(mnt); 450 __propagate_umount(mnt);
405 return 0; 451 return 0;
406} 452}
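
propagate_umount() now walks the list twice: a reverse pass marks umount candidates without touching the tree, then a forward pass acts only on the marked children. Mark-then-sweep keeps every decision based on the pre-umount state rather than on a list mutating under the walk. The two-pass shape in miniature, over a plain array:

/* Sketch only: mark candidates first, then act on the marks. */
#include <stdio.h>

struct mnt { const char *name; int busy; int marked; };

int main(void)
{
	struct mnt list[] = {
		{ "a", 0, 0 }, { "b", 1, 0 }, { "c", 0, 0 },
	};
	int n = 3, i;

	/* pass 1: mark umount candidates (reverse, as in the diff) */
	for (i = n - 1; i >= 0; i--)
		if (!list[i].busy)
			list[i].marked = 1;

	/* pass 2: act only on the marked entries */
	for (i = 0; i < n; i++)
		if (list[i].marked)
			printf("umount %s\n", list[i].name);
	return 0;
}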
diff --git a/fs/pnode.h b/fs/pnode.h
index 4a246358b031..7114ce6e6b9e 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -19,6 +19,9 @@
19#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) 19#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
20#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) 20#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
21#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) 21#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
22#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
23#define IS_MNT_LOCKED_AND_LAZY(m) \
24 (((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED)
22 25
23#define CL_EXPIRE 0x01 26#define CL_EXPIRE 0x01
24#define CL_SLAVE 0x02 27#define CL_SLAVE 0x02
@@ -40,14 +43,14 @@ static inline void set_mnt_shared(struct mount *mnt)
40void change_mnt_propagation(struct mount *, int); 43void change_mnt_propagation(struct mount *, int);
41int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, 44int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
42 struct hlist_head *); 45 struct hlist_head *);
43int propagate_umount(struct hlist_head *); 46int propagate_umount(struct list_head *);
44int propagate_mount_busy(struct mount *, int); 47int propagate_mount_busy(struct mount *, int);
48void propagate_mount_unlock(struct mount *);
45void mnt_release_group_id(struct mount *); 49void mnt_release_group_id(struct mount *);
46int get_dominating_id(struct mount *mnt, const struct path *root); 50int get_dominating_id(struct mount *mnt, const struct path *root);
47unsigned int mnt_get_count(struct mount *mnt); 51unsigned int mnt_get_count(struct mount *mnt);
48void mnt_set_mountpoint(struct mount *, struct mountpoint *, 52void mnt_set_mountpoint(struct mount *, struct mountpoint *,
49 struct mount *); 53 struct mount *);
50void umount_tree(struct mount *, int);
51struct mount *copy_tree(struct mount *, struct dentry *, int); 54struct mount *copy_tree(struct mount *, struct dentry *, int);
52bool is_path_reachable(struct mount *, struct dentry *, 55bool is_path_reachable(struct mount *, struct dentry *,
53 const struct path *root); 56 const struct path *root);
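
IS_MNT_LOCKED_AND_LAZY tests two bits with one mask: it is true only when MNT_LOCKED is set and MNT_SYNC_UMOUNT is clear, via the ((flags & (A|B)) == A) shape. A three-line demonstration; the bit values are arbitrary here:

/* Sketch only: "A set, B clear" in a single mask comparison. */
#include <stdio.h>

#define MNT_LOCKED		0x01
#define MNT_SYNC_UMOUNT		0x02
#define LOCKED_AND_LAZY(f) \
	(((f) & (MNT_LOCKED | MNT_SYNC_UMOUNT)) == MNT_LOCKED)

int main(void)
{
	printf("%d\n", LOCKED_AND_LAZY(MNT_LOCKED));			/* 1 */
	printf("%d\n", LOCKED_AND_LAZY(MNT_LOCKED | MNT_SYNC_UMOUNT));	/* 0 */
	printf("%d\n", LOCKED_AND_LAZY(0));				/* 0 */
	return 0;
}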
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a6fbf4472017..516162be1398 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -260,6 +260,7 @@ xfs_alloc_fix_len(
260 rlen = rlen - (k - args->mod); 260 rlen = rlen - (k - args->mod);
261 else 261 else
262 rlen = rlen - args->prod + (args->mod - k); 262 rlen = rlen - args->prod + (args->mod - k);
263 /* casts to (int) catch length underflows */
263 if ((int)rlen < (int)args->minlen) 264 if ((int)rlen < (int)args->minlen)
264 return; 265 return;
265 ASSERT(rlen >= args->minlen && rlen <= args->maxlen); 266 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
@@ -286,7 +287,8 @@ xfs_alloc_fix_minleft(
286 if (diff >= 0) 287 if (diff >= 0)
287 return 1; 288 return 1;
288 args->len += diff; /* shrink the allocated space */ 289 args->len += diff; /* shrink the allocated space */
289 if (args->len >= args->minlen) 290 /* casts to (int) catch length underflows */
291 if ((int)args->len >= (int)args->minlen)
290 return 1; 292 return 1;
291 args->agbno = NULLAGBLOCK; 293 args->agbno = NULLAGBLOCK;
292 return 0; 294 return 0;
@@ -315,6 +317,9 @@ xfs_alloc_fixup_trees(
315 xfs_agblock_t nfbno2; /* second new free startblock */ 317 xfs_agblock_t nfbno2; /* second new free startblock */
316 xfs_extlen_t nflen1=0; /* first new free length */ 318 xfs_extlen_t nflen1=0; /* first new free length */
317 xfs_extlen_t nflen2=0; /* second new free length */ 319 xfs_extlen_t nflen2=0; /* second new free length */
320 struct xfs_mount *mp;
321
322 mp = cnt_cur->bc_mp;
318 323
319 /* 324 /*
320 * Look up the record in the by-size tree if necessary. 325 * Look up the record in the by-size tree if necessary.
@@ -323,13 +328,13 @@ xfs_alloc_fixup_trees(
323#ifdef DEBUG 328#ifdef DEBUG
324 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) 329 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
325 return error; 330 return error;
326 XFS_WANT_CORRUPTED_RETURN( 331 XFS_WANT_CORRUPTED_RETURN(mp,
327 i == 1 && nfbno1 == fbno && nflen1 == flen); 332 i == 1 && nfbno1 == fbno && nflen1 == flen);
328#endif 333#endif
329 } else { 334 } else {
330 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) 335 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
331 return error; 336 return error;
332 XFS_WANT_CORRUPTED_RETURN(i == 1); 337 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
333 } 338 }
334 /* 339 /*
335 * Look up the record in the by-block tree if necessary. 340 * Look up the record in the by-block tree if necessary.
@@ -338,13 +343,13 @@ xfs_alloc_fixup_trees(
338#ifdef DEBUG 343#ifdef DEBUG
339 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) 344 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
340 return error; 345 return error;
341 XFS_WANT_CORRUPTED_RETURN( 346 XFS_WANT_CORRUPTED_RETURN(mp,
342 i == 1 && nfbno1 == fbno && nflen1 == flen); 347 i == 1 && nfbno1 == fbno && nflen1 == flen);
343#endif 348#endif
344 } else { 349 } else {
345 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) 350 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
346 return error; 351 return error;
347 XFS_WANT_CORRUPTED_RETURN(i == 1); 352 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
348 } 353 }
349 354
350#ifdef DEBUG 355#ifdef DEBUG
@@ -355,7 +360,7 @@ xfs_alloc_fixup_trees(
355 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); 360 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
356 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); 361 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
357 362
358 XFS_WANT_CORRUPTED_RETURN( 363 XFS_WANT_CORRUPTED_RETURN(mp,
359 bnoblock->bb_numrecs == cntblock->bb_numrecs); 364 bnoblock->bb_numrecs == cntblock->bb_numrecs);
360 } 365 }
361#endif 366#endif
@@ -386,25 +391,25 @@ xfs_alloc_fixup_trees(
386 */ 391 */
387 if ((error = xfs_btree_delete(cnt_cur, &i))) 392 if ((error = xfs_btree_delete(cnt_cur, &i)))
388 return error; 393 return error;
389 XFS_WANT_CORRUPTED_RETURN(i == 1); 394 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
390 /* 395 /*
391 * Add new by-size btree entry(s). 396 * Add new by-size btree entry(s).
392 */ 397 */
393 if (nfbno1 != NULLAGBLOCK) { 398 if (nfbno1 != NULLAGBLOCK) {
394 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 399 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
395 return error; 400 return error;
396 XFS_WANT_CORRUPTED_RETURN(i == 0); 401 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
397 if ((error = xfs_btree_insert(cnt_cur, &i))) 402 if ((error = xfs_btree_insert(cnt_cur, &i)))
398 return error; 403 return error;
399 XFS_WANT_CORRUPTED_RETURN(i == 1); 404 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
400 } 405 }
401 if (nfbno2 != NULLAGBLOCK) { 406 if (nfbno2 != NULLAGBLOCK) {
402 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 407 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
403 return error; 408 return error;
404 XFS_WANT_CORRUPTED_RETURN(i == 0); 409 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
405 if ((error = xfs_btree_insert(cnt_cur, &i))) 410 if ((error = xfs_btree_insert(cnt_cur, &i)))
406 return error; 411 return error;
407 XFS_WANT_CORRUPTED_RETURN(i == 1); 412 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
408 } 413 }
409 /* 414 /*
410 * Fix up the by-block btree entry(s). 415 * Fix up the by-block btree entry(s).
@@ -415,7 +420,7 @@ xfs_alloc_fixup_trees(
415 */ 420 */
416 if ((error = xfs_btree_delete(bno_cur, &i))) 421 if ((error = xfs_btree_delete(bno_cur, &i)))
417 return error; 422 return error;
418 XFS_WANT_CORRUPTED_RETURN(i == 1); 423 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
419 } else { 424 } else {
420 /* 425 /*
421 * Update the by-block entry to start later|be shorter. 426 * Update the by-block entry to start later|be shorter.
@@ -429,10 +434,10 @@ xfs_alloc_fixup_trees(
429 */ 434 */
430 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 435 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
431 return error; 436 return error;
432 XFS_WANT_CORRUPTED_RETURN(i == 0); 437 XFS_WANT_CORRUPTED_RETURN(mp, i == 0);
433 if ((error = xfs_btree_insert(bno_cur, &i))) 438 if ((error = xfs_btree_insert(bno_cur, &i)))
434 return error; 439 return error;
435 XFS_WANT_CORRUPTED_RETURN(i == 1); 440 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
436 } 441 }
437 return 0; 442 return 0;
438} 443}
@@ -682,7 +687,7 @@ xfs_alloc_ag_vextent_exact(
682 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); 687 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
683 if (error) 688 if (error)
684 goto error0; 689 goto error0;
685 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 690 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
686 ASSERT(fbno <= args->agbno); 691 ASSERT(fbno <= args->agbno);
687 692
688 /* 693 /*
@@ -783,7 +788,7 @@ xfs_alloc_find_best_extent(
783 error = xfs_alloc_get_rec(*scur, sbno, slen, &i); 788 error = xfs_alloc_get_rec(*scur, sbno, slen, &i);
784 if (error) 789 if (error)
785 goto error0; 790 goto error0;
786 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 791 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
787 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); 792 xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
788 793
789 /* 794 /*
@@ -946,7 +951,7 @@ restart:
946 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, 951 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno,
947 &ltlen, &i))) 952 &ltlen, &i)))
948 goto error0; 953 goto error0;
949 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 954 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
950 if (ltlen >= args->minlen) 955 if (ltlen >= args->minlen)
951 break; 956 break;
952 if ((error = xfs_btree_increment(cnt_cur, 0, &i))) 957 if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
@@ -966,7 +971,7 @@ restart:
966 */ 971 */
967 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 972 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
968 goto error0; 973 goto error0;
969 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 974 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
970 xfs_alloc_compute_aligned(args, ltbno, ltlen, 975 xfs_alloc_compute_aligned(args, ltbno, ltlen,
971 &ltbnoa, &ltlena); 976 &ltbnoa, &ltlena);
972 if (ltlena < args->minlen) 977 if (ltlena < args->minlen)
@@ -999,7 +1004,7 @@ restart:
999 cnt_cur->bc_ptrs[0] = besti; 1004 cnt_cur->bc_ptrs[0] = besti;
1000 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 1005 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
1001 goto error0; 1006 goto error0;
1002 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1007 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1003 ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); 1008 ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
1004 args->len = blen; 1009 args->len = blen;
1005 if (!xfs_alloc_fix_minleft(args)) { 1010 if (!xfs_alloc_fix_minleft(args)) {
@@ -1088,7 +1093,7 @@ restart:
1088 if (bno_cur_lt) { 1093 if (bno_cur_lt) {
1089 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1094 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
1090 goto error0; 1095 goto error0;
1091 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1096 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1092 xfs_alloc_compute_aligned(args, ltbno, ltlen, 1097 xfs_alloc_compute_aligned(args, ltbno, ltlen,
1093 &ltbnoa, &ltlena); 1098 &ltbnoa, &ltlena);
1094 if (ltlena >= args->minlen) 1099 if (ltlena >= args->minlen)
@@ -1104,7 +1109,7 @@ restart:
1104 if (bno_cur_gt) { 1109 if (bno_cur_gt) {
1105 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1110 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1106 goto error0; 1111 goto error0;
1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1112 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1108 xfs_alloc_compute_aligned(args, gtbno, gtlen, 1113 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1109 &gtbnoa, &gtlena); 1114 &gtbnoa, &gtlena);
1110 if (gtlena >= args->minlen) 1115 if (gtlena >= args->minlen)
@@ -1303,7 +1308,7 @@ restart:
1303 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); 1308 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1304 if (error) 1309 if (error)
1305 goto error0; 1310 goto error0;
1306 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1311 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1307 1312
1308 xfs_alloc_compute_aligned(args, fbno, flen, 1313 xfs_alloc_compute_aligned(args, fbno, flen,
1309 &rbno, &rlen); 1314 &rbno, &rlen);
@@ -1342,7 +1347,7 @@ restart:
1342 * This can't happen in the second case above. 1347 * This can't happen in the second case above.
1343 */ 1348 */
1344 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1349 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1345 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1350 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
1346 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1351 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
1347 if (rlen < args->maxlen) { 1352 if (rlen < args->maxlen) {
1348 xfs_agblock_t bestfbno; 1353 xfs_agblock_t bestfbno;
@@ -1362,13 +1367,13 @@ restart:
1362 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, 1367 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1363 &i))) 1368 &i)))
1364 goto error0; 1369 goto error0;
1365 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1370 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1366 if (flen < bestrlen) 1371 if (flen < bestrlen)
1367 break; 1372 break;
1368 xfs_alloc_compute_aligned(args, fbno, flen, 1373 xfs_alloc_compute_aligned(args, fbno, flen,
1369 &rbno, &rlen); 1374 &rbno, &rlen);
1370 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1375 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1371 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1376 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
1372 (rlen <= flen && rbno + rlen <= fbno + flen), 1377 (rlen <= flen && rbno + rlen <= fbno + flen),
1373 error0); 1378 error0);
1374 if (rlen > bestrlen) { 1379 if (rlen > bestrlen) {
@@ -1383,7 +1388,7 @@ restart:
1383 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, 1388 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1384 &i))) 1389 &i)))
1385 goto error0; 1390 goto error0;
1386 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1391 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1387 rlen = bestrlen; 1392 rlen = bestrlen;
1388 rbno = bestrbno; 1393 rbno = bestrbno;
1389 flen = bestflen; 1394 flen = bestflen;
@@ -1408,7 +1413,7 @@ restart:
1408 if (!xfs_alloc_fix_minleft(args)) 1413 if (!xfs_alloc_fix_minleft(args))
1409 goto out_nominleft; 1414 goto out_nominleft;
1410 rlen = args->len; 1415 rlen = args->len;
1411 XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); 1416 XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
1412 /* 1417 /*
1413 * Allocate and initialize a cursor for the by-block tree. 1418 * Allocate and initialize a cursor for the by-block tree.
1414 */ 1419 */
@@ -1422,7 +1427,7 @@ restart:
1422 cnt_cur = bno_cur = NULL; 1427 cnt_cur = bno_cur = NULL;
1423 args->len = rlen; 1428 args->len = rlen;
1424 args->agbno = rbno; 1429 args->agbno = rbno;
1425 XFS_WANT_CORRUPTED_GOTO( 1430 XFS_WANT_CORRUPTED_GOTO(args->mp,
1426 args->agbno + args->len <= 1431 args->agbno + args->len <=
1427 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), 1432 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
1428 error0); 1433 error0);
@@ -1467,7 +1472,7 @@ xfs_alloc_ag_vextent_small(
1467 if (i) { 1472 if (i) {
1468 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) 1473 if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
1469 goto error0; 1474 goto error0;
1470 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1475 XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
1471 } 1476 }
1472 /* 1477 /*
1473 * Nothing in the btree, try the freelist. Make sure 1478 * Nothing in the btree, try the freelist. Make sure
@@ -1493,7 +1498,7 @@ xfs_alloc_ag_vextent_small(
1493 } 1498 }
1494 args->len = 1; 1499 args->len = 1;
1495 args->agbno = fbno; 1500 args->agbno = fbno;
1496 XFS_WANT_CORRUPTED_GOTO( 1501 XFS_WANT_CORRUPTED_GOTO(args->mp,
1497 args->agbno + args->len <= 1502 args->agbno + args->len <=
1498 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), 1503 be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length),
1499 error0); 1504 error0);
@@ -1579,7 +1584,7 @@ xfs_free_ag_extent(
1579 */ 1584 */
1580 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i))) 1585 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1581 goto error0; 1586 goto error0;
1582 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1587 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1583 /* 1588 /*
1584 * It's not contiguous, though. 1589 * It's not contiguous, though.
1585 */ 1590 */
@@ -1591,7 +1596,8 @@ xfs_free_ag_extent(
1591 * space was invalid, it's (partly) already free. 1596 * space was invalid, it's (partly) already free.
1592 * Very bad. 1597 * Very bad.
1593 */ 1598 */
1594 XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); 1599 XFS_WANT_CORRUPTED_GOTO(mp,
1600 ltbno + ltlen <= bno, error0);
1595 } 1601 }
1596 } 1602 }
1597 /* 1603 /*
@@ -1606,7 +1612,7 @@ xfs_free_ag_extent(
1606 */ 1612 */
1607 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i))) 1613 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1608 goto error0; 1614 goto error0;
1609 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1615 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1610 /* 1616 /*
1611 * It's not contiguous, though. 1617 * It's not contiguous, though.
1612 */ 1618 */
@@ -1618,7 +1624,7 @@ xfs_free_ag_extent(
1618 * space was invalid, it's (partly) already free. 1624 * space was invalid, it's (partly) already free.
1619 * Very bad. 1625 * Very bad.
1620 */ 1626 */
1621 XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); 1627 XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0);
1622 } 1628 }
1623 } 1629 }
1624 /* 1630 /*
@@ -1635,31 +1641,31 @@ xfs_free_ag_extent(
1635 */ 1641 */
1636 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1642 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1637 goto error0; 1643 goto error0;
1638 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1644 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1639 if ((error = xfs_btree_delete(cnt_cur, &i))) 1645 if ((error = xfs_btree_delete(cnt_cur, &i)))
1640 goto error0; 1646 goto error0;
1641 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1647 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1642 /* 1648 /*
1643 * Delete the old by-size entry on the right. 1649 * Delete the old by-size entry on the right.
1644 */ 1650 */
1645 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1651 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1646 goto error0; 1652 goto error0;
1647 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1653 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1648 if ((error = xfs_btree_delete(cnt_cur, &i))) 1654 if ((error = xfs_btree_delete(cnt_cur, &i)))
1649 goto error0; 1655 goto error0;
1650 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1656 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1651 /* 1657 /*
1652 * Delete the old by-block entry for the right block. 1658 * Delete the old by-block entry for the right block.
1653 */ 1659 */
1654 if ((error = xfs_btree_delete(bno_cur, &i))) 1660 if ((error = xfs_btree_delete(bno_cur, &i)))
1655 goto error0; 1661 goto error0;
1656 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1662 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1657 /* 1663 /*
1658 * Move the by-block cursor back to the left neighbor. 1664 * Move the by-block cursor back to the left neighbor.
1659 */ 1665 */
1660 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 1666 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1661 goto error0; 1667 goto error0;
1662 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1668 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1663#ifdef DEBUG 1669#ifdef DEBUG
1664 /* 1670 /*
1665 * Check that this is the right record: delete didn't 1671 * Check that this is the right record: delete didn't
@@ -1672,7 +1678,7 @@ xfs_free_ag_extent(
1672 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 1678 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
1673 &i))) 1679 &i)))
1674 goto error0; 1680 goto error0;
1675 XFS_WANT_CORRUPTED_GOTO( 1681 XFS_WANT_CORRUPTED_GOTO(mp,
1676 i == 1 && xxbno == ltbno && xxlen == ltlen, 1682 i == 1 && xxbno == ltbno && xxlen == ltlen,
1677 error0); 1683 error0);
1678 } 1684 }
@@ -1695,17 +1701,17 @@ xfs_free_ag_extent(
1695 */ 1701 */
1696 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1702 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1697 goto error0; 1703 goto error0;
1698 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1704 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1699 if ((error = xfs_btree_delete(cnt_cur, &i))) 1705 if ((error = xfs_btree_delete(cnt_cur, &i)))
1700 goto error0; 1706 goto error0;
1701 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1707 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1702 /* 1708 /*
1703 * Back up the by-block cursor to the left neighbor, and 1709 * Back up the by-block cursor to the left neighbor, and
1704 * update its length. 1710 * update its length.
1705 */ 1711 */
1706 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 1712 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
1707 goto error0; 1713 goto error0;
1708 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1714 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1709 nbno = ltbno; 1715 nbno = ltbno;
1710 nlen = len + ltlen; 1716 nlen = len + ltlen;
1711 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 1717 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
@@ -1721,10 +1727,10 @@ xfs_free_ag_extent(
1721 */ 1727 */
1722 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1728 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
1723 goto error0; 1729 goto error0;
1724 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1730 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1725 if ((error = xfs_btree_delete(cnt_cur, &i))) 1731 if ((error = xfs_btree_delete(cnt_cur, &i)))
1726 goto error0; 1732 goto error0;
1727 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1733 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1728 /* 1734 /*
1729 * Update the starting block and length of the right 1735 * Update the starting block and length of the right
1730 * neighbor in the by-block tree. 1736 * neighbor in the by-block tree.
@@ -1743,7 +1749,7 @@ xfs_free_ag_extent(
1743 nlen = len; 1749 nlen = len;
1744 if ((error = xfs_btree_insert(bno_cur, &i))) 1750 if ((error = xfs_btree_insert(bno_cur, &i)))
1745 goto error0; 1751 goto error0;
1746 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1752 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1747 } 1753 }
1748 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1754 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1749 bno_cur = NULL; 1755 bno_cur = NULL;
@@ -1752,10 +1758,10 @@ xfs_free_ag_extent(
1752 */ 1758 */
1753 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 1759 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
1754 goto error0; 1760 goto error0;
1755 XFS_WANT_CORRUPTED_GOTO(i == 0, error0); 1761 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0);
1756 if ((error = xfs_btree_insert(cnt_cur, &i))) 1762 if ((error = xfs_btree_insert(cnt_cur, &i)))
1757 goto error0; 1763 goto error0;
1758 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1764 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1759 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1765 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1760 cnt_cur = NULL; 1766 cnt_cur = NULL;
1761 1767
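Almost all of the churn in this file is mechanical: XFS_WANT_CORRUPTED_RETURN and XFS_WANT_CORRUPTED_GOTO grow a struct xfs_mount argument, so the failure path knows which filesystem tripped the check instead of merely jumping to an error label. A hedged sketch of the resulting macro shape; the real definition lives in the XFS error headers and may differ in detail:

/*
 * Sketch only, not the actual fs/xfs definition. With @mp threaded
 * in, the corruption report can identify the mount before bailing
 * out with -EFSCORRUPTED through the caller's error label.
 */
#define XFS_WANT_CORRUPTED_GOTO(mp, expr, label)                   \
        do {                                                        \
                if (!(expr)) {                                      \
                        XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
                                         XFS_ERRLEVEL_LOW, (mp));   \
                        error = -EFSCORRUPTED;                      \
                        goto label;                                 \
                }                                                   \
        } while (0)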
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 15105dbc9e28..04e79d57bca6 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
86 int move_count); 86 int move_count);
87STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); 87STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
88 88
89/*
90 * attr3 block 'firstused' conversion helpers.
91 *
92 * firstused refers to the offset of the first used byte of the nameval region
93 * of an attr leaf block. The region starts at the tail of the block and expands
94 * backwards towards the middle. As such, firstused is initialized to the block
95 * size for an empty leaf block and is reduced from there.
96 *
97 * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k.
98 * The in-core firstused field is 32-bit and thus supports the maximum fsb size.
99 * The on-disk field is only 16-bit, however, and overflows at 64k. Since this
100 * only occurs at exactly 64k, we use zero as a magic on-disk value to represent
101 * the attr block size. The following helpers manage the conversion between the
102 * in-core and on-disk formats.
103 */
104
105static void
106xfs_attr3_leaf_firstused_from_disk(
107 struct xfs_da_geometry *geo,
108 struct xfs_attr3_icleaf_hdr *to,
109 struct xfs_attr_leafblock *from)
110{
111 struct xfs_attr3_leaf_hdr *hdr3;
112
113 if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) {
114 hdr3 = (struct xfs_attr3_leaf_hdr *) from;
115 to->firstused = be16_to_cpu(hdr3->firstused);
116 } else {
117 to->firstused = be16_to_cpu(from->hdr.firstused);
118 }
119
120 /*
121 * Convert from the magic fsb size value to actual blocksize. This
 122 * should only occur for empty blocks when the block size overflows
 123 * 16 bits.
124 */
125 if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) {
126 ASSERT(!to->count && !to->usedbytes);
127 ASSERT(geo->blksize > USHRT_MAX);
128 to->firstused = geo->blksize;
129 }
130}
131
132static void
133xfs_attr3_leaf_firstused_to_disk(
134 struct xfs_da_geometry *geo,
135 struct xfs_attr_leafblock *to,
136 struct xfs_attr3_icleaf_hdr *from)
137{
138 struct xfs_attr3_leaf_hdr *hdr3;
139 uint32_t firstused;
140
141 /* magic value should only be seen on disk */
142 ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF);
143
144 /*
145 * Scale down the 32-bit in-core firstused value to the 16-bit on-disk
146 * value. This only overflows at the max supported value of 64k. Use the
147 * magic on-disk value to represent block size in this case.
148 */
149 firstused = from->firstused;
150 if (firstused > USHRT_MAX) {
151 ASSERT(from->firstused == geo->blksize);
152 firstused = XFS_ATTR3_LEAF_NULLOFF;
153 }
154
155 if (from->magic == XFS_ATTR3_LEAF_MAGIC) {
156 hdr3 = (struct xfs_attr3_leaf_hdr *) to;
157 hdr3->firstused = cpu_to_be16(firstused);
158 } else {
159 to->hdr.firstused = cpu_to_be16(firstused);
160 }
161}
162
89void 163void
90xfs_attr3_leaf_hdr_from_disk( 164xfs_attr3_leaf_hdr_from_disk(
165 struct xfs_da_geometry *geo,
91 struct xfs_attr3_icleaf_hdr *to, 166 struct xfs_attr3_icleaf_hdr *to,
92 struct xfs_attr_leafblock *from) 167 struct xfs_attr_leafblock *from)
93{ 168{
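The helper pair added in the hunk above is easiest to see as a round trip: only an empty leaf on a 64k-block filesystem can have firstused == 65536, which does not fit in the 16-bit on-disk field, so that single value is encoded as the XFS_ATTR3_LEAF_NULLOFF magic (zero, per the comment) and decoded back to the block size on read. A userspace sketch of the encoding, with the structures reduced to the one field that matters and the magic value assumed to be zero:

/*
 * Round-trip sketch of the firstused encoding described above.
 * Geometry fixed at the one overflowing case (64k blocks); the
 * magic on-disk value is assumed to be zero.
 */
#include <assert.h>
#include <limits.h>
#include <stdint.h>

#define BLKSIZE              65536u   /* 64k fsb: the only overflow case */
#define ATTR3_LEAF_NULLOFF   0        /* assumed magic on-disk value */

static uint16_t firstused_to_disk(uint32_t firstused)
{
        if (firstused > USHRT_MAX) {
                assert(firstused == BLKSIZE); /* only an empty 64k block */
                return ATTR3_LEAF_NULLOFF;
        }
        return (uint16_t)firstused;
}

static uint32_t firstused_from_disk(uint16_t ondisk)
{
        if (ondisk == ATTR3_LEAF_NULLOFF)
                return BLKSIZE;       /* decode magic back to blocksize */
        return ondisk;
}

int main(void)
{
        assert(firstused_from_disk(firstused_to_disk(BLKSIZE)) == BLKSIZE);
        assert(firstused_from_disk(firstused_to_disk(1024)) == 1024);
        return 0;
}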
@@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk(
104 to->magic = be16_to_cpu(hdr3->info.hdr.magic); 179 to->magic = be16_to_cpu(hdr3->info.hdr.magic);
105 to->count = be16_to_cpu(hdr3->count); 180 to->count = be16_to_cpu(hdr3->count);
106 to->usedbytes = be16_to_cpu(hdr3->usedbytes); 181 to->usedbytes = be16_to_cpu(hdr3->usedbytes);
107 to->firstused = be16_to_cpu(hdr3->firstused); 182 xfs_attr3_leaf_firstused_from_disk(geo, to, from);
108 to->holes = hdr3->holes; 183 to->holes = hdr3->holes;
109 184
110 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { 185 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk(
118 to->magic = be16_to_cpu(from->hdr.info.magic); 193 to->magic = be16_to_cpu(from->hdr.info.magic);
119 to->count = be16_to_cpu(from->hdr.count); 194 to->count = be16_to_cpu(from->hdr.count);
120 to->usedbytes = be16_to_cpu(from->hdr.usedbytes); 195 to->usedbytes = be16_to_cpu(from->hdr.usedbytes);
121 to->firstused = be16_to_cpu(from->hdr.firstused); 196 xfs_attr3_leaf_firstused_from_disk(geo, to, from);
122 to->holes = from->hdr.holes; 197 to->holes = from->hdr.holes;
123 198
124 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { 199 for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
@@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk(
129 204
130void 205void
131xfs_attr3_leaf_hdr_to_disk( 206xfs_attr3_leaf_hdr_to_disk(
207 struct xfs_da_geometry *geo,
132 struct xfs_attr_leafblock *to, 208 struct xfs_attr_leafblock *to,
133 struct xfs_attr3_icleaf_hdr *from) 209 struct xfs_attr3_icleaf_hdr *from)
134{ 210{
135 int i; 211 int i;
136 212
137 ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || 213 ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC ||
138 from->magic == XFS_ATTR3_LEAF_MAGIC); 214 from->magic == XFS_ATTR3_LEAF_MAGIC);
@@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk(
145 hdr3->info.hdr.magic = cpu_to_be16(from->magic); 221 hdr3->info.hdr.magic = cpu_to_be16(from->magic);
146 hdr3->count = cpu_to_be16(from->count); 222 hdr3->count = cpu_to_be16(from->count);
147 hdr3->usedbytes = cpu_to_be16(from->usedbytes); 223 hdr3->usedbytes = cpu_to_be16(from->usedbytes);
148 hdr3->firstused = cpu_to_be16(from->firstused); 224 xfs_attr3_leaf_firstused_to_disk(geo, to, from);
149 hdr3->holes = from->holes; 225 hdr3->holes = from->holes;
150 hdr3->pad1 = 0; 226 hdr3->pad1 = 0;
151 227
@@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk(
160 to->hdr.info.magic = cpu_to_be16(from->magic); 236 to->hdr.info.magic = cpu_to_be16(from->magic);
161 to->hdr.count = cpu_to_be16(from->count); 237 to->hdr.count = cpu_to_be16(from->count);
162 to->hdr.usedbytes = cpu_to_be16(from->usedbytes); 238 to->hdr.usedbytes = cpu_to_be16(from->usedbytes);
163 to->hdr.firstused = cpu_to_be16(from->firstused); 239 xfs_attr3_leaf_firstused_to_disk(geo, to, from);
164 to->hdr.holes = from->holes; 240 to->hdr.holes = from->holes;
165 to->hdr.pad1 = 0; 241 to->hdr.pad1 = 0;
166 242
@@ -178,7 +254,7 @@ xfs_attr3_leaf_verify(
178 struct xfs_attr_leafblock *leaf = bp->b_addr; 254 struct xfs_attr_leafblock *leaf = bp->b_addr;
179 struct xfs_attr3_icleaf_hdr ichdr; 255 struct xfs_attr3_icleaf_hdr ichdr;
180 256
181 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 257 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
182 258
183 if (xfs_sb_version_hascrc(&mp->m_sb)) { 259 if (xfs_sb_version_hascrc(&mp->m_sb)) {
184 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 260 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
@@ -757,9 +833,10 @@ xfs_attr_shortform_allfit(
757 struct xfs_attr3_icleaf_hdr leafhdr; 833 struct xfs_attr3_icleaf_hdr leafhdr;
758 int bytes; 834 int bytes;
759 int i; 835 int i;
836 struct xfs_mount *mp = bp->b_target->bt_mount;
760 837
761 leaf = bp->b_addr; 838 leaf = bp->b_addr;
762 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); 839 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
763 entry = xfs_attr3_leaf_entryp(leaf); 840 entry = xfs_attr3_leaf_entryp(leaf);
764 841
765 bytes = sizeof(struct xfs_attr_sf_hdr); 842 bytes = sizeof(struct xfs_attr_sf_hdr);
@@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform(
812 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 889 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
813 890
814 leaf = (xfs_attr_leafblock_t *)tmpbuffer; 891 leaf = (xfs_attr_leafblock_t *)tmpbuffer;
815 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 892 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
816 entry = xfs_attr3_leaf_entryp(leaf); 893 entry = xfs_attr3_leaf_entryp(leaf);
817 894
818 /* XXX (dgc): buffer is about to be marked stale - why zero it? */ 895 /* XXX (dgc): buffer is about to be marked stale - why zero it? */
@@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node(
923 btree = dp->d_ops->node_tree_p(node); 1000 btree = dp->d_ops->node_tree_p(node);
924 1001
925 leaf = bp2->b_addr; 1002 leaf = bp2->b_addr;
926 xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); 1003 xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf);
927 entries = xfs_attr3_leaf_entryp(leaf); 1004 entries = xfs_attr3_leaf_entryp(leaf);
928 1005
929 /* both on-disk, don't endian-flip twice */ 1006 /* both on-disk, don't endian-flip twice */
@@ -988,7 +1065,7 @@ xfs_attr3_leaf_create(
988 } 1065 }
989 ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; 1066 ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
990 1067
991 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); 1068 xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
992 xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); 1069 xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
993 1070
994 *bpp = bp; 1071 *bpp = bp;
@@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add(
1073 trace_xfs_attr_leaf_add(args); 1150 trace_xfs_attr_leaf_add(args);
1074 1151
1075 leaf = bp->b_addr; 1152 leaf = bp->b_addr;
1076 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 1153 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
1077 ASSERT(args->index >= 0 && args->index <= ichdr.count); 1154 ASSERT(args->index >= 0 && args->index <= ichdr.count);
1078 entsize = xfs_attr_leaf_newentsize(args, NULL); 1155 entsize = xfs_attr_leaf_newentsize(args, NULL);
1079 1156
@@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add(
1126 tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); 1203 tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0);
1127 1204
1128out_log_hdr: 1205out_log_hdr:
1129 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); 1206 xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
1130 xfs_trans_log_buf(args->trans, bp, 1207 xfs_trans_log_buf(args->trans, bp,
1131 XFS_DA_LOGRANGE(leaf, &leaf->hdr, 1208 XFS_DA_LOGRANGE(leaf, &leaf->hdr,
1132 xfs_attr3_leaf_hdr_size(leaf))); 1209 xfs_attr3_leaf_hdr_size(leaf)));
@@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact(
1294 ichdr_dst->freemap[0].base; 1371 ichdr_dst->freemap[0].base;
1295 1372
1296 /* write the header back to initialise the underlying buffer */ 1373 /* write the header back to initialise the underlying buffer */
1297 xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); 1374 xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst);
1298 1375
1299 /* 1376 /*
 1300 * Copy all entries in the same (sorted) order, 1377 * Copy all entries in the same (sorted) order,
@@ -1344,9 +1421,10 @@ xfs_attr_leaf_order(
1344{ 1421{
1345 struct xfs_attr3_icleaf_hdr ichdr1; 1422 struct xfs_attr3_icleaf_hdr ichdr1;
1346 struct xfs_attr3_icleaf_hdr ichdr2; 1423 struct xfs_attr3_icleaf_hdr ichdr2;
1424 struct xfs_mount *mp = leaf1_bp->b_target->bt_mount;
1347 1425
1348 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); 1426 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr);
1349 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); 1427 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr);
1350 return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); 1428 return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2);
1351} 1429}
1352 1430
@@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance(
1388 ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC); 1466 ASSERT(blk2->magic == XFS_ATTR_LEAF_MAGIC);
1389 leaf1 = blk1->bp->b_addr; 1467 leaf1 = blk1->bp->b_addr;
1390 leaf2 = blk2->bp->b_addr; 1468 leaf2 = blk2->bp->b_addr;
1391 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); 1469 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1);
1392 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); 1470 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2);
1393 ASSERT(ichdr2.count == 0); 1471 ASSERT(ichdr2.count == 0);
1394 args = state->args; 1472 args = state->args;
1395 1473
@@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance(
1490 ichdr1.count, count); 1568 ichdr1.count, count);
1491 } 1569 }
1492 1570
1493 xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); 1571 xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1);
1494 xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); 1572 xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2);
1495 xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); 1573 xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
1496 xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); 1574 xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
1497 1575
@@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall(
1684 */ 1762 */
1685 blk = &state->path.blk[ state->path.active-1 ]; 1763 blk = &state->path.blk[ state->path.active-1 ];
1686 leaf = blk->bp->b_addr; 1764 leaf = blk->bp->b_addr;
1687 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 1765 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf);
1688 bytes = xfs_attr3_leaf_hdr_size(leaf) + 1766 bytes = xfs_attr3_leaf_hdr_size(leaf) +
1689 ichdr.count * sizeof(xfs_attr_leaf_entry_t) + 1767 ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
1690 ichdr.usedbytes; 1768 ichdr.usedbytes;
@@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall(
1740 if (error) 1818 if (error)
1741 return error; 1819 return error;
1742 1820
1743 xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); 1821 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr);
1744 1822
1745 bytes = state->args->geo->blksize - 1823 bytes = state->args->geo->blksize -
1746 (state->args->geo->blksize >> 2) - 1824 (state->args->geo->blksize >> 2) -
@@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove(
1805 trace_xfs_attr_leaf_remove(args); 1883 trace_xfs_attr_leaf_remove(args);
1806 1884
1807 leaf = bp->b_addr; 1885 leaf = bp->b_addr;
1808 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 1886 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
1809 1887
1810 ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); 1888 ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
1811 ASSERT(args->index >= 0 && args->index < ichdr.count); 1889 ASSERT(args->index >= 0 && args->index < ichdr.count);
@@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove(
1918 tmp = be16_to_cpu(entry->nameidx); 1996 tmp = be16_to_cpu(entry->nameidx);
1919 } 1997 }
1920 ichdr.firstused = tmp; 1998 ichdr.firstused = tmp;
1921 if (!ichdr.firstused) 1999 ASSERT(ichdr.firstused != 0);
1922 ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN;
1923 } else { 2000 } else {
1924 ichdr.holes = 1; /* mark as needing compaction */ 2001 ichdr.holes = 1; /* mark as needing compaction */
1925 } 2002 }
1926 xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); 2003 xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr);
1927 xfs_trans_log_buf(args->trans, bp, 2004 xfs_trans_log_buf(args->trans, bp,
1928 XFS_DA_LOGRANGE(leaf, &leaf->hdr, 2005 XFS_DA_LOGRANGE(leaf, &leaf->hdr,
1929 xfs_attr3_leaf_hdr_size(leaf))); 2006 xfs_attr3_leaf_hdr_size(leaf)));
@@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance(
1957 2034
1958 drop_leaf = drop_blk->bp->b_addr; 2035 drop_leaf = drop_blk->bp->b_addr;
1959 save_leaf = save_blk->bp->b_addr; 2036 save_leaf = save_blk->bp->b_addr;
1960 xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); 2037 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf);
1961 xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); 2038 xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf);
1962 entry = xfs_attr3_leaf_entryp(drop_leaf); 2039 entry = xfs_attr3_leaf_entryp(drop_leaf);
1963 2040
1964 /* 2041 /*
@@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance(
2012 tmphdr.firstused = state->args->geo->blksize; 2089 tmphdr.firstused = state->args->geo->blksize;
2013 2090
2014 /* write the header to the temp buffer to initialise it */ 2091 /* write the header to the temp buffer to initialise it */
2015 xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); 2092 xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr);
2016 2093
2017 if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, 2094 if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
2018 drop_blk->bp, &drophdr)) { 2095 drop_blk->bp, &drophdr)) {
@@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance(
2039 kmem_free(tmp_leaf); 2116 kmem_free(tmp_leaf);
2040 } 2117 }
2041 2118
2042 xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); 2119 xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
2043 xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, 2120 xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
2044 state->args->geo->blksize - 1); 2121 state->args->geo->blksize - 1);
2045 2122
@@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int(
2085 trace_xfs_attr_leaf_lookup(args); 2162 trace_xfs_attr_leaf_lookup(args);
2086 2163
2087 leaf = bp->b_addr; 2164 leaf = bp->b_addr;
2088 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 2165 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
2089 entries = xfs_attr3_leaf_entryp(leaf); 2166 entries = xfs_attr3_leaf_entryp(leaf);
2090 ASSERT(ichdr.count < args->geo->blksize / 8); 2167 ASSERT(ichdr.count < args->geo->blksize / 8);
2091 2168
@@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue(
2190 int valuelen; 2267 int valuelen;
2191 2268
2192 leaf = bp->b_addr; 2269 leaf = bp->b_addr;
2193 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 2270 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
2194 ASSERT(ichdr.count < args->geo->blksize / 8); 2271 ASSERT(ichdr.count < args->geo->blksize / 8);
2195 ASSERT(args->index < ichdr.count); 2272 ASSERT(args->index < ichdr.count);
2196 2273
@@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash(
2391{ 2468{
2392 struct xfs_attr3_icleaf_hdr ichdr; 2469 struct xfs_attr3_icleaf_hdr ichdr;
2393 struct xfs_attr_leaf_entry *entries; 2470 struct xfs_attr_leaf_entry *entries;
2471 struct xfs_mount *mp = bp->b_target->bt_mount;
2394 2472
2395 xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); 2473 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr);
2396 entries = xfs_attr3_leaf_entryp(bp->b_addr); 2474 entries = xfs_attr3_leaf_entryp(bp->b_addr);
2397 if (count) 2475 if (count)
2398 *count = ichdr.count; 2476 *count = ichdr.count;
@@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag(
2486 ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); 2564 ASSERT(entry->flags & XFS_ATTR_INCOMPLETE);
2487 2565
2488#ifdef DEBUG 2566#ifdef DEBUG
2489 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 2567 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
2490 ASSERT(args->index < ichdr.count); 2568 ASSERT(args->index < ichdr.count);
2491 ASSERT(args->index >= 0); 2569 ASSERT(args->index >= 0);
2492 2570
@@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag(
2550 2628
2551 leaf = bp->b_addr; 2629 leaf = bp->b_addr;
2552#ifdef DEBUG 2630#ifdef DEBUG
2553 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 2631 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
2554 ASSERT(args->index < ichdr.count); 2632 ASSERT(args->index < ichdr.count);
2555 ASSERT(args->index >= 0); 2633 ASSERT(args->index >= 0);
2556#endif 2634#endif
@@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags(
2629 entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; 2707 entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2];
2630 2708
2631#ifdef DEBUG 2709#ifdef DEBUG
2632 xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); 2710 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1);
2633 ASSERT(args->index < ichdr1.count); 2711 ASSERT(args->index < ichdr1.count);
2634 ASSERT(args->index >= 0); 2712 ASSERT(args->index >= 0);
2635 2713
2636 xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); 2714 xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2);
2637 ASSERT(args->index2 < ichdr2.count); 2715 ASSERT(args->index2 < ichdr2.count);
2638 ASSERT(args->index2 >= 0); 2716 ASSERT(args->index2 >= 0);
2639 2717
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da7c3ba..025c4b820c03 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -100,9 +100,11 @@ int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
100int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, 100int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
101 xfs_dablk_t bno, xfs_daddr_t mappedbno, 101 xfs_dablk_t bno, xfs_daddr_t mappedbno,
102 struct xfs_buf **bpp); 102 struct xfs_buf **bpp);
103void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, 103void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo,
104 struct xfs_attr3_icleaf_hdr *to,
104 struct xfs_attr_leafblock *from); 105 struct xfs_attr_leafblock *from);
105void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, 106void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo,
107 struct xfs_attr_leafblock *to,
106 struct xfs_attr3_icleaf_hdr *from); 108 struct xfs_attr3_icleaf_hdr *from);
107 109
108#endif /* __XFS_ATTR_LEAF_H__ */ 110#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 61ec015dca16..aeffeaaac0ec 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset(
244 } 244 }
245} 245}
246 246
247/*
248 * Debug/sanity checking code
249 */
250
251STATIC int
252xfs_bmap_sanity_check(
253 struct xfs_mount *mp,
254 struct xfs_buf *bp,
255 int level)
256{
257 struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
258
259 if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) &&
260 block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC))
261 return 0;
262
263 if (be16_to_cpu(block->bb_level) != level ||
264 be16_to_cpu(block->bb_numrecs) == 0 ||
265 be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
266 return 0;
267
268 return 1;
269}
270
271#ifdef DEBUG 247#ifdef DEBUG
272STATIC struct xfs_buf * 248STATIC struct xfs_buf *
273xfs_bmap_get_bp( 249xfs_bmap_get_bp(
@@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents(
410 goto error_norelse; 386 goto error_norelse;
411 } 387 }
412 block = XFS_BUF_TO_BLOCK(bp); 388 block = XFS_BUF_TO_BLOCK(bp);
413 XFS_WANT_CORRUPTED_GOTO(
414 xfs_bmap_sanity_check(mp, bp, level),
415 error0);
416 if (level == 0) 389 if (level == 0)
417 break; 390 break;
418 391
@@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents(
424 xfs_check_block(block, mp, 0, 0); 397 xfs_check_block(block, mp, 0, 0);
425 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); 398 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
426 bno = be64_to_cpu(*pp); 399 bno = be64_to_cpu(*pp);
427 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 400 XFS_WANT_CORRUPTED_GOTO(mp,
401 XFS_FSB_SANITY_CHECK(mp, bno), error0);
428 if (bp_release) { 402 if (bp_release) {
429 bp_release = 0; 403 bp_release = 0;
430 xfs_trans_brelse(NULL, bp); 404 xfs_trans_brelse(NULL, bp);
@@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree(
1029 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) 1003 if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
1030 goto error0; 1004 goto error0;
1031 /* must be at least one entry */ 1005 /* must be at least one entry */
1032 XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); 1006 XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0);
1033 if ((error = xfs_btree_new_iroot(cur, flags, &stat))) 1007 if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
1034 goto error0; 1008 goto error0;
1035 if (stat == 0) { 1009 if (stat == 0) {
@@ -1311,14 +1285,12 @@ xfs_bmap_read_extents(
1311 if (error) 1285 if (error)
1312 return error; 1286 return error;
1313 block = XFS_BUF_TO_BLOCK(bp); 1287 block = XFS_BUF_TO_BLOCK(bp);
1314 XFS_WANT_CORRUPTED_GOTO(
1315 xfs_bmap_sanity_check(mp, bp, level),
1316 error0);
1317 if (level == 0) 1288 if (level == 0)
1318 break; 1289 break;
1319 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); 1290 pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
1320 bno = be64_to_cpu(*pp); 1291 bno = be64_to_cpu(*pp);
1321 XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); 1292 XFS_WANT_CORRUPTED_GOTO(mp,
1293 XFS_FSB_SANITY_CHECK(mp, bno), error0);
1322 xfs_trans_brelse(tp, bp); 1294 xfs_trans_brelse(tp, bp);
1323 } 1295 }
1324 /* 1296 /*
@@ -1345,9 +1317,6 @@ xfs_bmap_read_extents(
1345 XFS_ERRLEVEL_LOW, ip->i_mount, block); 1317 XFS_ERRLEVEL_LOW, ip->i_mount, block);
1346 goto error0; 1318 goto error0;
1347 } 1319 }
1348 XFS_WANT_CORRUPTED_GOTO(
1349 xfs_bmap_sanity_check(mp, bp, 0),
1350 error0);
1351 /* 1320 /*
1352 * Read-ahead the next leaf block, if any. 1321 * Read-ahead the next leaf block, if any.
1353 */ 1322 */
@@ -1755,7 +1724,9 @@ xfs_bmap_add_extent_delay_real(
1755 xfs_filblks_t temp=0; /* value for da_new calculations */ 1724 xfs_filblks_t temp=0; /* value for da_new calculations */
1756 xfs_filblks_t temp2=0;/* value for da_new calculations */ 1725 xfs_filblks_t temp2=0;/* value for da_new calculations */
1757 int tmp_rval; /* partial logging flags */ 1726 int tmp_rval; /* partial logging flags */
1727 struct xfs_mount *mp;
1758 1728
1729 mp = bma->tp ? bma->tp->t_mountp : NULL;
1759 ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); 1730 ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
1760 1731
1761 ASSERT(bma->idx >= 0); 1732 ASSERT(bma->idx >= 0);
@@ -1866,15 +1837,15 @@ xfs_bmap_add_extent_delay_real(
1866 RIGHT.br_blockcount, &i); 1837 RIGHT.br_blockcount, &i);
1867 if (error) 1838 if (error)
1868 goto done; 1839 goto done;
1869 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1840 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1870 error = xfs_btree_delete(bma->cur, &i); 1841 error = xfs_btree_delete(bma->cur, &i);
1871 if (error) 1842 if (error)
1872 goto done; 1843 goto done;
1873 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1844 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1874 error = xfs_btree_decrement(bma->cur, 0, &i); 1845 error = xfs_btree_decrement(bma->cur, 0, &i);
1875 if (error) 1846 if (error)
1876 goto done; 1847 goto done;
1877 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1848 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1878 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, 1849 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
1879 LEFT.br_startblock, 1850 LEFT.br_startblock,
1880 LEFT.br_blockcount + 1851 LEFT.br_blockcount +
@@ -1907,7 +1878,7 @@ xfs_bmap_add_extent_delay_real(
1907 &i); 1878 &i);
1908 if (error) 1879 if (error)
1909 goto done; 1880 goto done;
1910 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1881 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1911 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, 1882 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
1912 LEFT.br_startblock, 1883 LEFT.br_startblock,
1913 LEFT.br_blockcount + 1884 LEFT.br_blockcount +
@@ -1938,7 +1909,7 @@ xfs_bmap_add_extent_delay_real(
1938 RIGHT.br_blockcount, &i); 1909 RIGHT.br_blockcount, &i);
1939 if (error) 1910 if (error)
1940 goto done; 1911 goto done;
1941 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1912 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1942 error = xfs_bmbt_update(bma->cur, PREV.br_startoff, 1913 error = xfs_bmbt_update(bma->cur, PREV.br_startoff,
1943 new->br_startblock, 1914 new->br_startblock,
1944 PREV.br_blockcount + 1915 PREV.br_blockcount +
@@ -1968,12 +1939,12 @@ xfs_bmap_add_extent_delay_real(
1968 &i); 1939 &i);
1969 if (error) 1940 if (error)
1970 goto done; 1941 goto done;
1971 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 1942 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
1972 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 1943 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
1973 error = xfs_btree_insert(bma->cur, &i); 1944 error = xfs_btree_insert(bma->cur, &i);
1974 if (error) 1945 if (error)
1975 goto done; 1946 goto done;
1976 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1947 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
1977 } 1948 }
1978 break; 1949 break;
1979 1950
@@ -2001,7 +1972,7 @@ xfs_bmap_add_extent_delay_real(
2001 &i); 1972 &i);
2002 if (error) 1973 if (error)
2003 goto done; 1974 goto done;
2004 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 1975 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2005 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, 1976 error = xfs_bmbt_update(bma->cur, LEFT.br_startoff,
2006 LEFT.br_startblock, 1977 LEFT.br_startblock,
2007 LEFT.br_blockcount + 1978 LEFT.br_blockcount +
@@ -2038,12 +2009,12 @@ xfs_bmap_add_extent_delay_real(
2038 &i); 2009 &i);
2039 if (error) 2010 if (error)
2040 goto done; 2011 goto done;
2041 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2012 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2042 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 2013 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2043 error = xfs_btree_insert(bma->cur, &i); 2014 error = xfs_btree_insert(bma->cur, &i);
2044 if (error) 2015 if (error)
2045 goto done; 2016 goto done;
2046 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2017 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2047 } 2018 }
2048 2019
2049 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { 2020 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2084,7 +2055,7 @@ xfs_bmap_add_extent_delay_real(
2084 RIGHT.br_blockcount, &i); 2055 RIGHT.br_blockcount, &i);
2085 if (error) 2056 if (error)
2086 goto done; 2057 goto done;
2087 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2058 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2088 error = xfs_bmbt_update(bma->cur, new->br_startoff, 2059 error = xfs_bmbt_update(bma->cur, new->br_startoff,
2089 new->br_startblock, 2060 new->br_startblock,
2090 new->br_blockcount + 2061 new->br_blockcount +
@@ -2122,12 +2093,12 @@ xfs_bmap_add_extent_delay_real(
2122 &i); 2093 &i);
2123 if (error) 2094 if (error)
2124 goto done; 2095 goto done;
2125 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2096 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2126 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 2097 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2127 error = xfs_btree_insert(bma->cur, &i); 2098 error = xfs_btree_insert(bma->cur, &i);
2128 if (error) 2099 if (error)
2129 goto done; 2100 goto done;
2130 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2101 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2131 } 2102 }
2132 2103
2133 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { 2104 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2191,12 +2162,12 @@ xfs_bmap_add_extent_delay_real(
2191 &i); 2162 &i);
2192 if (error) 2163 if (error)
2193 goto done; 2164 goto done;
2194 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2165 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2195 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; 2166 bma->cur->bc_rec.b.br_state = XFS_EXT_NORM;
2196 error = xfs_btree_insert(bma->cur, &i); 2167 error = xfs_btree_insert(bma->cur, &i);
2197 if (error) 2168 if (error)
2198 goto done; 2169 goto done;
2199 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2170 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2200 } 2171 }
2201 2172
2202 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { 2173 if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
@@ -2212,9 +2183,8 @@ xfs_bmap_add_extent_delay_real(
2212 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - 2183 diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) -
2213 (bma->cur ? bma->cur->bc_private.b.allocated : 0)); 2184 (bma->cur ? bma->cur->bc_private.b.allocated : 0));
2214 if (diff > 0) { 2185 if (diff > 0) {
2215 error = xfs_icsb_modify_counters(bma->ip->i_mount, 2186 error = xfs_mod_fdblocks(bma->ip->i_mount,
2216 XFS_SBS_FDBLOCKS, 2187 -((int64_t)diff), false);
2217 -((int64_t)diff), 0);
2218 ASSERT(!error); 2188 ASSERT(!error);
2219 if (error) 2189 if (error)
2220 goto done; 2190 goto done;
@@ -2265,9 +2235,8 @@ xfs_bmap_add_extent_delay_real(
2265 temp += bma->cur->bc_private.b.allocated; 2235 temp += bma->cur->bc_private.b.allocated;
2266 ASSERT(temp <= da_old); 2236 ASSERT(temp <= da_old);
2267 if (temp < da_old) 2237 if (temp < da_old)
2268 xfs_icsb_modify_counters(bma->ip->i_mount, 2238 xfs_mod_fdblocks(bma->ip->i_mount,
2269 XFS_SBS_FDBLOCKS, 2239 (int64_t)(da_old - temp), false);
2270 (int64_t)(da_old - temp), 0);
2271 } 2240 }
2272 2241
2273 /* clear out the allocated field, done with it now in any case. */ 2242 /* clear out the allocated field, done with it now in any case. */
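The two counter hunks above are an interface swap, not a logic change: the generic per-counter call becomes the dedicated free-block helper, with the trailing boolean standing in for the old reserve-pool argument. Side by side, as the hunks show:

/* old: generic per-counter interface, counter selected by enum */
error = xfs_icsb_modify_counters(bma->ip->i_mount,
                                 XFS_SBS_FDBLOCKS,
                                 -((int64_t)diff), 0);

/*
 * new: dedicated free-block helper; 'false' means don't dip into
 * the reserve pool, matching the old trailing 0
 */
error = xfs_mod_fdblocks(bma->ip->i_mount, -((int64_t)diff), false);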
@@ -2309,6 +2278,7 @@ xfs_bmap_add_extent_unwritten_real(
2309 /* left is 0, right is 1, prev is 2 */ 2278 /* left is 0, right is 1, prev is 2 */
2310 int rval=0; /* return value (logging flags) */ 2279 int rval=0; /* return value (logging flags) */
2311 int state = 0;/* state bits, accessed thru macros */ 2280 int state = 0;/* state bits, accessed thru macros */
2281 struct xfs_mount *mp = tp->t_mountp;
2312 2282
2313 *logflagsp = 0; 2283 *logflagsp = 0;
2314 2284
@@ -2421,19 +2391,19 @@ xfs_bmap_add_extent_unwritten_real(
2421 RIGHT.br_startblock, 2391 RIGHT.br_startblock,
2422 RIGHT.br_blockcount, &i))) 2392 RIGHT.br_blockcount, &i)))
2423 goto done; 2393 goto done;
2424 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2394 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2425 if ((error = xfs_btree_delete(cur, &i))) 2395 if ((error = xfs_btree_delete(cur, &i)))
2426 goto done; 2396 goto done;
2427 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2397 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2428 if ((error = xfs_btree_decrement(cur, 0, &i))) 2398 if ((error = xfs_btree_decrement(cur, 0, &i)))
2429 goto done; 2399 goto done;
2430 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2400 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2431 if ((error = xfs_btree_delete(cur, &i))) 2401 if ((error = xfs_btree_delete(cur, &i)))
2432 goto done; 2402 goto done;
2433 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2403 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2434 if ((error = xfs_btree_decrement(cur, 0, &i))) 2404 if ((error = xfs_btree_decrement(cur, 0, &i)))
2435 goto done; 2405 goto done;
2436 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2406 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2437 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 2407 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
2438 LEFT.br_startblock, 2408 LEFT.br_startblock,
2439 LEFT.br_blockcount + PREV.br_blockcount + 2409 LEFT.br_blockcount + PREV.br_blockcount +
@@ -2464,13 +2434,13 @@ xfs_bmap_add_extent_unwritten_real(
2464 PREV.br_startblock, PREV.br_blockcount, 2434 PREV.br_startblock, PREV.br_blockcount,
2465 &i))) 2435 &i)))
2466 goto done; 2436 goto done;
2467 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2437 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2468 if ((error = xfs_btree_delete(cur, &i))) 2438 if ((error = xfs_btree_delete(cur, &i)))
2469 goto done; 2439 goto done;
2470 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2440 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2471 if ((error = xfs_btree_decrement(cur, 0, &i))) 2441 if ((error = xfs_btree_decrement(cur, 0, &i)))
2472 goto done; 2442 goto done;
2473 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2443 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2474 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, 2444 if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
2475 LEFT.br_startblock, 2445 LEFT.br_startblock,
2476 LEFT.br_blockcount + PREV.br_blockcount, 2446 LEFT.br_blockcount + PREV.br_blockcount,
@@ -2499,13 +2469,13 @@ xfs_bmap_add_extent_unwritten_real(
2499 RIGHT.br_startblock, 2469 RIGHT.br_startblock,
2500 RIGHT.br_blockcount, &i))) 2470 RIGHT.br_blockcount, &i)))
2501 goto done; 2471 goto done;
2502 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2472 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2503 if ((error = xfs_btree_delete(cur, &i))) 2473 if ((error = xfs_btree_delete(cur, &i)))
2504 goto done; 2474 goto done;
2505 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2475 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2506 if ((error = xfs_btree_decrement(cur, 0, &i))) 2476 if ((error = xfs_btree_decrement(cur, 0, &i)))
2507 goto done; 2477 goto done;
2508 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2478 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2509 if ((error = xfs_bmbt_update(cur, new->br_startoff, 2479 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2510 new->br_startblock, 2480 new->br_startblock,
2511 new->br_blockcount + RIGHT.br_blockcount, 2481 new->br_blockcount + RIGHT.br_blockcount,
@@ -2532,7 +2502,7 @@ xfs_bmap_add_extent_unwritten_real(
2532 new->br_startblock, new->br_blockcount, 2502 new->br_startblock, new->br_blockcount,
2533 &i))) 2503 &i)))
2534 goto done; 2504 goto done;
2535 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2505 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2536 if ((error = xfs_bmbt_update(cur, new->br_startoff, 2506 if ((error = xfs_bmbt_update(cur, new->br_startoff,
2537 new->br_startblock, new->br_blockcount, 2507 new->br_startblock, new->br_blockcount,
2538 newext))) 2508 newext)))
@@ -2569,7 +2539,7 @@ xfs_bmap_add_extent_unwritten_real(
2569 PREV.br_startblock, PREV.br_blockcount, 2539 PREV.br_startblock, PREV.br_blockcount,
2570 &i))) 2540 &i)))
2571 goto done; 2541 goto done;
2572 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2542 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2573 if ((error = xfs_bmbt_update(cur, 2543 if ((error = xfs_bmbt_update(cur,
2574 PREV.br_startoff + new->br_blockcount, 2544 PREV.br_startoff + new->br_blockcount,
2575 PREV.br_startblock + new->br_blockcount, 2545 PREV.br_startblock + new->br_blockcount,
@@ -2611,7 +2581,7 @@ xfs_bmap_add_extent_unwritten_real(
2611 PREV.br_startblock, PREV.br_blockcount, 2581 PREV.br_startblock, PREV.br_blockcount,
2612 &i))) 2582 &i)))
2613 goto done; 2583 goto done;
2614 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2584 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2615 if ((error = xfs_bmbt_update(cur, 2585 if ((error = xfs_bmbt_update(cur,
2616 PREV.br_startoff + new->br_blockcount, 2586 PREV.br_startoff + new->br_blockcount,
2617 PREV.br_startblock + new->br_blockcount, 2587 PREV.br_startblock + new->br_blockcount,
@@ -2621,7 +2591,7 @@ xfs_bmap_add_extent_unwritten_real(
2621 cur->bc_rec.b = *new; 2591 cur->bc_rec.b = *new;
2622 if ((error = xfs_btree_insert(cur, &i))) 2592 if ((error = xfs_btree_insert(cur, &i)))
2623 goto done; 2593 goto done;
2624 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2594 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2625 } 2595 }
2626 break; 2596 break;
2627 2597
@@ -2651,7 +2621,7 @@ xfs_bmap_add_extent_unwritten_real(
2651 PREV.br_startblock, 2621 PREV.br_startblock,
2652 PREV.br_blockcount, &i))) 2622 PREV.br_blockcount, &i)))
2653 goto done; 2623 goto done;
2654 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2624 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2655 if ((error = xfs_bmbt_update(cur, PREV.br_startoff, 2625 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
2656 PREV.br_startblock, 2626 PREV.br_startblock,
2657 PREV.br_blockcount - new->br_blockcount, 2627 PREV.br_blockcount - new->br_blockcount,
@@ -2689,7 +2659,7 @@ xfs_bmap_add_extent_unwritten_real(
2689 PREV.br_startblock, PREV.br_blockcount, 2659 PREV.br_startblock, PREV.br_blockcount,
2690 &i))) 2660 &i)))
2691 goto done; 2661 goto done;
2692 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2662 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2693 if ((error = xfs_bmbt_update(cur, PREV.br_startoff, 2663 if ((error = xfs_bmbt_update(cur, PREV.br_startoff,
2694 PREV.br_startblock, 2664 PREV.br_startblock,
2695 PREV.br_blockcount - new->br_blockcount, 2665 PREV.br_blockcount - new->br_blockcount,
@@ -2699,11 +2669,11 @@ xfs_bmap_add_extent_unwritten_real(
2699 new->br_startblock, new->br_blockcount, 2669 new->br_startblock, new->br_blockcount,
2700 &i))) 2670 &i)))
2701 goto done; 2671 goto done;
2702 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2672 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2703 cur->bc_rec.b.br_state = XFS_EXT_NORM; 2673 cur->bc_rec.b.br_state = XFS_EXT_NORM;
2704 if ((error = xfs_btree_insert(cur, &i))) 2674 if ((error = xfs_btree_insert(cur, &i)))
2705 goto done; 2675 goto done;
2706 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2676 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2707 } 2677 }
2708 break; 2678 break;
2709 2679
@@ -2737,7 +2707,7 @@ xfs_bmap_add_extent_unwritten_real(
2737 PREV.br_startblock, PREV.br_blockcount, 2707 PREV.br_startblock, PREV.br_blockcount,
2738 &i))) 2708 &i)))
2739 goto done; 2709 goto done;
2740 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2710 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2741 /* new right extent - oldext */ 2711 /* new right extent - oldext */
2742 if ((error = xfs_bmbt_update(cur, r[1].br_startoff, 2712 if ((error = xfs_bmbt_update(cur, r[1].br_startoff,
2743 r[1].br_startblock, r[1].br_blockcount, 2713 r[1].br_startblock, r[1].br_blockcount,
@@ -2749,7 +2719,7 @@ xfs_bmap_add_extent_unwritten_real(
2749 new->br_startoff - PREV.br_startoff; 2719 new->br_startoff - PREV.br_startoff;
2750 if ((error = xfs_btree_insert(cur, &i))) 2720 if ((error = xfs_btree_insert(cur, &i)))
2751 goto done; 2721 goto done;
2752 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2722 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2753 /* 2723 /*
2754 * Reset the cursor to the position of the new extent 2724 * Reset the cursor to the position of the new extent
2755 * we are about to insert as we can't trust it after 2725 * we are about to insert as we can't trust it after
@@ -2759,12 +2729,12 @@ xfs_bmap_add_extent_unwritten_real(
2759 new->br_startblock, new->br_blockcount, 2729 new->br_startblock, new->br_blockcount,
2760 &i))) 2730 &i)))
2761 goto done; 2731 goto done;
2762 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 2732 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
2763 /* new middle extent - newext */ 2733 /* new middle extent - newext */
2764 cur->bc_rec.b.br_state = new->br_state; 2734 cur->bc_rec.b.br_state = new->br_state;
2765 if ((error = xfs_btree_insert(cur, &i))) 2735 if ((error = xfs_btree_insert(cur, &i)))
2766 goto done; 2736 goto done;
2767 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 2737 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
2768 } 2738 }
2769 break; 2739 break;
2770 2740
@@ -2944,8 +2914,8 @@ xfs_bmap_add_extent_hole_delay(
2944 } 2914 }
2945 if (oldlen != newlen) { 2915 if (oldlen != newlen) {
2946 ASSERT(oldlen > newlen); 2916 ASSERT(oldlen > newlen);
2947 xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, 2917 xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen),
2948 (int64_t)(oldlen - newlen), 0); 2918 false);
2949 /* 2919 /*
2950 * Nothing to do for disk quota accounting here. 2920 * Nothing to do for disk quota accounting here.
2951 */ 2921 */
@@ -2968,7 +2938,9 @@ xfs_bmap_add_extent_hole_real(
2968 xfs_bmbt_irec_t right; /* right neighbor extent entry */ 2938 xfs_bmbt_irec_t right; /* right neighbor extent entry */
2969 int rval=0; /* return value (logging flags) */ 2939 int rval=0; /* return value (logging flags) */
2970 int state; /* state bits, accessed thru macros */ 2940 int state; /* state bits, accessed thru macros */
2941 struct xfs_mount *mp;
2971 2942
2943 mp = bma->tp ? bma->tp->t_mountp : NULL;
2972 ifp = XFS_IFORK_PTR(bma->ip, whichfork); 2944 ifp = XFS_IFORK_PTR(bma->ip, whichfork);
2973 2945
2974 ASSERT(bma->idx >= 0); 2946 ASSERT(bma->idx >= 0);
@@ -3056,15 +3028,15 @@ xfs_bmap_add_extent_hole_real(
3056 &i); 3028 &i);
3057 if (error) 3029 if (error)
3058 goto done; 3030 goto done;
3059 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3031 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3060 error = xfs_btree_delete(bma->cur, &i); 3032 error = xfs_btree_delete(bma->cur, &i);
3061 if (error) 3033 if (error)
3062 goto done; 3034 goto done;
3063 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3035 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3064 error = xfs_btree_decrement(bma->cur, 0, &i); 3036 error = xfs_btree_decrement(bma->cur, 0, &i);
3065 if (error) 3037 if (error)
3066 goto done; 3038 goto done;
3067 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3039 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3068 error = xfs_bmbt_update(bma->cur, left.br_startoff, 3040 error = xfs_bmbt_update(bma->cur, left.br_startoff,
3069 left.br_startblock, 3041 left.br_startblock,
3070 left.br_blockcount + 3042 left.br_blockcount +
@@ -3097,7 +3069,7 @@ xfs_bmap_add_extent_hole_real(
3097 &i); 3069 &i);
3098 if (error) 3070 if (error)
3099 goto done; 3071 goto done;
3100 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3072 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3101 error = xfs_bmbt_update(bma->cur, left.br_startoff, 3073 error = xfs_bmbt_update(bma->cur, left.br_startoff,
3102 left.br_startblock, 3074 left.br_startblock,
3103 left.br_blockcount + 3075 left.br_blockcount +
@@ -3131,7 +3103,7 @@ xfs_bmap_add_extent_hole_real(
3131 right.br_blockcount, &i); 3103 right.br_blockcount, &i);
3132 if (error) 3104 if (error)
3133 goto done; 3105 goto done;
3134 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3106 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3135 error = xfs_bmbt_update(bma->cur, new->br_startoff, 3107 error = xfs_bmbt_update(bma->cur, new->br_startoff,
3136 new->br_startblock, 3108 new->br_startblock,
3137 new->br_blockcount + 3109 new->br_blockcount +
@@ -3161,12 +3133,12 @@ xfs_bmap_add_extent_hole_real(
3161 new->br_blockcount, &i); 3133 new->br_blockcount, &i);
3162 if (error) 3134 if (error)
3163 goto done; 3135 goto done;
3164 XFS_WANT_CORRUPTED_GOTO(i == 0, done); 3136 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
3165 bma->cur->bc_rec.b.br_state = new->br_state; 3137 bma->cur->bc_rec.b.br_state = new->br_state;
3166 error = xfs_btree_insert(bma->cur, &i); 3138 error = xfs_btree_insert(bma->cur, &i);
3167 if (error) 3139 if (error)
3168 goto done; 3140 goto done;
3169 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 3141 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
3170 } 3142 }
3171 break; 3143 break;
3172 } 3144 }
@@ -4160,18 +4132,15 @@ xfs_bmapi_reserve_delalloc(
4160 ASSERT(indlen > 0); 4132 ASSERT(indlen > 0);
4161 4133
4162 if (rt) { 4134 if (rt) {
4163 error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 4135 error = xfs_mod_frextents(mp, -((int64_t)extsz));
4164 -((int64_t)extsz), 0);
4165 } else { 4136 } else {
4166 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4137 error = xfs_mod_fdblocks(mp, -((int64_t)alen), false);
4167 -((int64_t)alen), 0);
4168 } 4138 }
4169 4139
4170 if (error) 4140 if (error)
4171 goto out_unreserve_quota; 4141 goto out_unreserve_quota;
4172 4142
4173 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4143 error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false);
4174 -((int64_t)indlen), 0);
4175 if (error) 4144 if (error)
4176 goto out_unreserve_blocks; 4145 goto out_unreserve_blocks;
4177 4146
@@ -4198,9 +4167,9 @@ xfs_bmapi_reserve_delalloc(
4198 4167
4199out_unreserve_blocks: 4168out_unreserve_blocks:
4200 if (rt) 4169 if (rt)
4201 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); 4170 xfs_mod_frextents(mp, extsz);
4202 else 4171 else
4203 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); 4172 xfs_mod_fdblocks(mp, alen, false);
4204out_unreserve_quota: 4173out_unreserve_quota:
4205 if (XFS_IS_QUOTA_ON(mp)) 4174 if (XFS_IS_QUOTA_ON(mp))
4206 xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? 4175 xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ?
@@ -4801,7 +4770,7 @@ xfs_bmap_del_extent(
4801 got.br_startblock, got.br_blockcount, 4770 got.br_startblock, got.br_blockcount,
4802 &i))) 4771 &i)))
4803 goto done; 4772 goto done;
4804 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4773 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4805 } 4774 }
4806 da_old = da_new = 0; 4775 da_old = da_new = 0;
4807 } else { 4776 } else {
@@ -4835,7 +4804,7 @@ xfs_bmap_del_extent(
4835 } 4804 }
4836 if ((error = xfs_btree_delete(cur, &i))) 4805 if ((error = xfs_btree_delete(cur, &i)))
4837 goto done; 4806 goto done;
4838 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4807 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4839 break; 4808 break;
4840 4809
4841 case 2: 4810 case 2:
@@ -4935,7 +4904,8 @@ xfs_bmap_del_extent(
4935 got.br_startblock, 4904 got.br_startblock,
4936 temp, &i))) 4905 temp, &i)))
4937 goto done; 4906 goto done;
4938 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4907 XFS_WANT_CORRUPTED_GOTO(mp,
4908 i == 1, done);
4939 /* 4909 /*
4940 * Update the btree record back 4910 * Update the btree record back
4941 * to the original value. 4911 * to the original value.
@@ -4956,7 +4926,7 @@ xfs_bmap_del_extent(
4956 error = -ENOSPC; 4926 error = -ENOSPC;
4957 goto done; 4927 goto done;
4958 } 4928 }
4959 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4929 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
4960 } else 4930 } else
4961 flags |= xfs_ilog_fext(whichfork); 4931 flags |= xfs_ilog_fext(whichfork);
4962 XFS_IFORK_NEXT_SET(ip, whichfork, 4932 XFS_IFORK_NEXT_SET(ip, whichfork,
@@ -5012,10 +4982,8 @@ xfs_bmap_del_extent(
5012 * Nothing to do for disk quota accounting here. 4982 * Nothing to do for disk quota accounting here.
5013 */ 4983 */
5014 ASSERT(da_old >= da_new); 4984 ASSERT(da_old >= da_new);
5015 if (da_old > da_new) { 4985 if (da_old > da_new)
5016 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 4986 xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
5017 (int64_t)(da_old - da_new), 0);
5018 }
5019done: 4987done:
5020 *logflagsp = flags; 4988 *logflagsp = flags;
5021 return error; 4989 return error;
@@ -5284,14 +5252,13 @@ xfs_bunmapi(
5284 5252
5285 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); 5253 rtexts = XFS_FSB_TO_B(mp, del.br_blockcount);
5286 do_div(rtexts, mp->m_sb.sb_rextsize); 5254 do_div(rtexts, mp->m_sb.sb_rextsize);
5287 xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, 5255 xfs_mod_frextents(mp, (int64_t)rtexts);
5288 (int64_t)rtexts, 0);
5289 (void)xfs_trans_reserve_quota_nblks(NULL, 5256 (void)xfs_trans_reserve_quota_nblks(NULL,
5290 ip, -((long)del.br_blockcount), 0, 5257 ip, -((long)del.br_blockcount), 0,
5291 XFS_QMOPT_RES_RTBLKS); 5258 XFS_QMOPT_RES_RTBLKS);
5292 } else { 5259 } else {
5293 xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 5260 xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount,
5294 (int64_t)del.br_blockcount, 0); 5261 false);
5295 (void)xfs_trans_reserve_quota_nblks(NULL, 5262 (void)xfs_trans_reserve_quota_nblks(NULL,
5296 ip, -((long)del.br_blockcount), 0, 5263 ip, -((long)del.br_blockcount), 0,
5297 XFS_QMOPT_RES_REGBLKS); 5264 XFS_QMOPT_RES_REGBLKS);
@@ -5453,6 +5420,7 @@ xfs_bmse_merge(
5453 struct xfs_bmbt_irec left; 5420 struct xfs_bmbt_irec left;
5454 xfs_filblks_t blockcount; 5421 xfs_filblks_t blockcount;
5455 int error, i; 5422 int error, i;
5423 struct xfs_mount *mp = ip->i_mount;
5456 5424
5457 xfs_bmbt_get_all(gotp, &got); 5425 xfs_bmbt_get_all(gotp, &got);
5458 xfs_bmbt_get_all(leftp, &left); 5426 xfs_bmbt_get_all(leftp, &left);
@@ -5487,19 +5455,19 @@ xfs_bmse_merge(
5487 got.br_blockcount, &i); 5455 got.br_blockcount, &i);
5488 if (error) 5456 if (error)
5489 return error; 5457 return error;
5490 XFS_WANT_CORRUPTED_RETURN(i == 1); 5458 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5491 5459
5492 error = xfs_btree_delete(cur, &i); 5460 error = xfs_btree_delete(cur, &i);
5493 if (error) 5461 if (error)
5494 return error; 5462 return error;
5495 XFS_WANT_CORRUPTED_RETURN(i == 1); 5463 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5496 5464
5497 /* lookup and update size of the previous extent */ 5465 /* lookup and update size of the previous extent */
5498 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, 5466 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
5499 left.br_blockcount, &i); 5467 left.br_blockcount, &i);
5500 if (error) 5468 if (error)
5501 return error; 5469 return error;
5502 XFS_WANT_CORRUPTED_RETURN(i == 1); 5470 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5503 5471
5504 left.br_blockcount = blockcount; 5472 left.br_blockcount = blockcount;
5505 5473
@@ -5518,50 +5486,92 @@ xfs_bmse_shift_one(
5518 int *current_ext, 5486 int *current_ext,
5519 struct xfs_bmbt_rec_host *gotp, 5487 struct xfs_bmbt_rec_host *gotp,
5520 struct xfs_btree_cur *cur, 5488 struct xfs_btree_cur *cur,
5521 int *logflags) 5489 int *logflags,
5490 enum shift_direction direction)
5522{ 5491{
5523 struct xfs_ifork *ifp; 5492 struct xfs_ifork *ifp;
5493 struct xfs_mount *mp;
5524 xfs_fileoff_t startoff; 5494 xfs_fileoff_t startoff;
5525 struct xfs_bmbt_rec_host *leftp; 5495 struct xfs_bmbt_rec_host *adj_irecp;
5526 struct xfs_bmbt_irec got; 5496 struct xfs_bmbt_irec got;
5527 struct xfs_bmbt_irec left; 5497 struct xfs_bmbt_irec adj_irec;
5528 int error; 5498 int error;
5529 int i; 5499 int i;
5500 int total_extents;
5530 5501
5502 mp = ip->i_mount;
5531 ifp = XFS_IFORK_PTR(ip, whichfork); 5503 ifp = XFS_IFORK_PTR(ip, whichfork);
5504 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5532 5505
5533 xfs_bmbt_get_all(gotp, &got); 5506 xfs_bmbt_get_all(gotp, &got);
5534 startoff = got.br_startoff - offset_shift_fsb;
5535 5507
5536 /* delalloc extents should be prevented by caller */ 5508 /* delalloc extents should be prevented by caller */
5537 XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); 5509 XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock));
5538 5510
5539 /* 5511 if (direction == SHIFT_LEFT) {
5540 * Check for merge if we've got an extent to the left, otherwise make 5512 startoff = got.br_startoff - offset_shift_fsb;
5541 * sure there's enough room at the start of the file for the shift. 5513
5542 */ 5514 /*
5543 if (*current_ext) { 5515 * Check for merge if we've got an extent to the left,
5544 /* grab the left extent and check for a large enough hole */ 5516 * otherwise make sure there's enough room at the start
5545 leftp = xfs_iext_get_ext(ifp, *current_ext - 1); 5517 * of the file for the shift.
5546 xfs_bmbt_get_all(leftp, &left); 5518 */
5519 if (!*current_ext) {
5520 if (got.br_startoff < offset_shift_fsb)
5521 return -EINVAL;
5522 goto update_current_ext;
5523 }
5524 /*
5525 * grab the left extent and check for a large
5526 * enough hole.
5527 */
5528 adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1);
5529 xfs_bmbt_get_all(adj_irecp, &adj_irec);
5547 5530
5548 if (startoff < left.br_startoff + left.br_blockcount) 5531 if (startoff <
5532 adj_irec.br_startoff + adj_irec.br_blockcount)
5549 return -EINVAL; 5533 return -EINVAL;
5550 5534
5551 /* check whether to merge the extent or shift it down */ 5535 /* check whether to merge the extent or shift it down */
5552 if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { 5536 if (xfs_bmse_can_merge(&adj_irec, &got,
5537 offset_shift_fsb)) {
5553 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, 5538 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb,
5554 *current_ext, gotp, leftp, cur, 5539 *current_ext, gotp, adj_irecp,
5555 logflags); 5540 cur, logflags);
5556 } 5541 }
5557 } else if (got.br_startoff < offset_shift_fsb) 5542 } else {
5558 return -EINVAL; 5543 startoff = got.br_startoff + offset_shift_fsb;
5559 5544 /* nothing to move if this is the last extent */
5545 if (*current_ext >= (total_extents - 1))
5546 goto update_current_ext;
5547 /*
5548 * If this is not the last extent in the file, make sure there
5549 * is enough room between the current extent and the next one
5550 * to accommodate the shift.
5551 */
5552 adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1);
5553 xfs_bmbt_get_all(adj_irecp, &adj_irec);
5554 if (startoff + got.br_blockcount > adj_irec.br_startoff)
5555 return -EINVAL;
5556 /*
5557 * Unlike a left shift (which involves a hole punch),
5558 * a right shift does not modify extent neighbors
5559 * in any way. We should never find mergeable extents
5560 * in this scenario. Check anyway and warn if we
5561 * encounter two extents that could be one.
5562 */
5563 if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb))
5564 WARN_ON_ONCE(1);
5565 }
5560 /* 5566 /*
5561 * Increment the extent index for the next iteration, update the start 5567 * Increment the extent index for the next iteration, update the start
5562 * offset of the in-core extent and update the btree if applicable. 5568 * offset of the in-core extent and update the btree if applicable.
5563 */ 5569 */
5564 (*current_ext)++; 5570update_current_ext:
5571 if (direction == SHIFT_LEFT)
5572 (*current_ext)++;
5573 else
5574 (*current_ext)--;
5565 xfs_bmbt_set_startoff(gotp, startoff); 5575 xfs_bmbt_set_startoff(gotp, startoff);
5566 *logflags |= XFS_ILOG_CORE; 5576 *logflags |= XFS_ILOG_CORE;
5567 if (!cur) { 5577 if (!cur) {
@@ -5573,18 +5583,18 @@ xfs_bmse_shift_one(
5573 got.br_blockcount, &i); 5583 got.br_blockcount, &i);
5574 if (error) 5584 if (error)
5575 return error; 5585 return error;
5576 XFS_WANT_CORRUPTED_RETURN(i == 1); 5586 XFS_WANT_CORRUPTED_RETURN(mp, i == 1);
5577 5587
5578 got.br_startoff = startoff; 5588 got.br_startoff = startoff;
5579 return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, 5589 return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
5580 got.br_blockcount, got.br_state); 5590 got.br_blockcount, got.br_state);
5581} 5591}
5582 5592
5583/* 5593/*
5584 * Shift extent records to the left to cover a hole. 5594 * Shift extent records to the left/right to cover/create a hole.
5585 * 5595 *
5586 * The maximum number of extents to be shifted in a single operation is 5596 * The maximum number of extents to be shifted in a single operation is
5587 * @num_exts. @start_fsb specifies the file offset to start the shift and the 5597 * @num_exts. @stop_fsb specifies the file offset at which to stop the shift and the
5588 * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb 5598 * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
5589 * is the length by which each extent is shifted. If there is no hole to shift 5599 * is the length by which each extent is shifted. If there is no hole to shift
5590 * the extents into, this will be considered invalid operation and we abort 5600 * the extents into, this will be considered invalid operation and we abort
@@ -5594,12 +5604,13 @@ int
5594xfs_bmap_shift_extents( 5604xfs_bmap_shift_extents(
5595 struct xfs_trans *tp, 5605 struct xfs_trans *tp,
5596 struct xfs_inode *ip, 5606 struct xfs_inode *ip,
5597 xfs_fileoff_t start_fsb, 5607 xfs_fileoff_t *next_fsb,
5598 xfs_fileoff_t offset_shift_fsb, 5608 xfs_fileoff_t offset_shift_fsb,
5599 int *done, 5609 int *done,
5600 xfs_fileoff_t *next_fsb, 5610 xfs_fileoff_t stop_fsb,
5601 xfs_fsblock_t *firstblock, 5611 xfs_fsblock_t *firstblock,
5602 struct xfs_bmap_free *flist, 5612 struct xfs_bmap_free *flist,
5613 enum shift_direction direction,
5603 int num_exts) 5614 int num_exts)
5604{ 5615{
5605 struct xfs_btree_cur *cur = NULL; 5616 struct xfs_btree_cur *cur = NULL;
@@ -5609,10 +5620,11 @@ xfs_bmap_shift_extents(
5609 struct xfs_ifork *ifp; 5620 struct xfs_ifork *ifp;
5610 xfs_extnum_t nexts = 0; 5621 xfs_extnum_t nexts = 0;
5611 xfs_extnum_t current_ext; 5622 xfs_extnum_t current_ext;
5623 xfs_extnum_t total_extents;
5624 xfs_extnum_t stop_extent;
5612 int error = 0; 5625 int error = 0;
5613 int whichfork = XFS_DATA_FORK; 5626 int whichfork = XFS_DATA_FORK;
5614 int logflags = 0; 5627 int logflags = 0;
5615 int total_extents;
5616 5628
5617 if (unlikely(XFS_TEST_ERROR( 5629 if (unlikely(XFS_TEST_ERROR(
5618 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 5630 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5628,6 +5640,8 @@ xfs_bmap_shift_extents(
5628 5640
5629 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 5641 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
5630 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 5642 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5643 ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
5644 ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT);
5631 5645
5632 ifp = XFS_IFORK_PTR(ip, whichfork); 5646 ifp = XFS_IFORK_PTR(ip, whichfork);
5633 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 5647 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5645,43 +5659,83 @@ xfs_bmap_shift_extents(
5645 } 5659 }
5646 5660
5647 /* 5661 /*
5662 * There may be delalloc extents in the data fork before the range we
5663 * are collapsing out, so we cannot use the count of real extents here.
5664 * Instead we have to calculate it from the incore fork.
5665 */
5666 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5667 if (total_extents == 0) {
5668 *done = 1;
5669 goto del_cursor;
5670 }
5671
5672 /*
5673 * On the first right shift, we need to initialize next_fsb
5674 */
5675 if (*next_fsb == NULLFSBLOCK) {
5676 gotp = xfs_iext_get_ext(ifp, total_extents - 1);
5677 xfs_bmbt_get_all(gotp, &got);
5678 *next_fsb = got.br_startoff;
5679 if (stop_fsb > *next_fsb) {
5680 *done = 1;
5681 goto del_cursor;
5682 }
5683 }
5684
5685 /* Look up the extent index at which we have to stop */
5686 if (direction == SHIFT_RIGHT) {
5687 gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent);
5688 /* Make stop_extent exclusive of shift range */
5689 stop_extent--;
5690 } else
5691 stop_extent = total_extents;
5692
5693 /*
5648 * Look up the extent index for the fsb where we start shifting. We can 5694 * Look up the extent index for the fsb where we start shifting. We can
5649 * henceforth iterate with current_ext as extent list changes are locked 5695 * henceforth iterate with current_ext as extent list changes are locked
5650 * out via ilock. 5696 * out via ilock.
5651 * 5697 *
5652 * gotp can be null in 2 cases: 1) if there are no extents or 2) 5698 * gotp can be null in 2 cases: 1) if there are no extents or 2)
5653 * start_fsb lies in a hole beyond which there are no extents. Either 5699 * *next_fsb lies in a hole beyond which there are no extents. Either
5654 * way, we are done. 5700 * way, we are done.
5655 */ 5701 */
5656 gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext); 5702 gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext);
5657 if (!gotp) { 5703 if (!gotp) {
5658 *done = 1; 5704 *done = 1;
5659 goto del_cursor; 5705 goto del_cursor;
5660 } 5706 }
5661 5707
5662 /* 5708 /* some sanity checking before we finally start shifting extents */
5663 * There may be delalloc extents in the data fork before the range we 5709 if ((direction == SHIFT_LEFT && current_ext >= stop_extent) ||
5664 * are collapsing out, so we cannot use the count of real extents here. 5710 (direction == SHIFT_RIGHT && current_ext <= stop_extent)) {
5665 * Instead we have to calculate it from the incore fork. 5711 error = -EIO;
5666 */ 5712 goto del_cursor;
5667 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 5713 }
5668 while (nexts++ < num_exts && current_ext < total_extents) { 5714
5715 while (nexts++ < num_exts) {
5669 error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, 5716 error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
5670 &current_ext, gotp, cur, &logflags); 5717 &current_ext, gotp, cur, &logflags,
5718 direction);
5671 if (error) 5719 if (error)
5672 goto del_cursor; 5720 goto del_cursor;
5721 /*
5722 * If there was an extent merge during the shift, the extent
5723 * count can change. Update the total and grab the next record.
5724 */
5725 if (direction == SHIFT_LEFT) {
5726 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5727 stop_extent = total_extents;
5728 }
5673 5729
5674 /* update total extent count and grab the next record */ 5730 if (current_ext == stop_extent) {
5675 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 5731 *done = 1;
5676 if (current_ext >= total_extents) 5732 *next_fsb = NULLFSBLOCK;
5677 break; 5733 break;
5734 }
5678 gotp = xfs_iext_get_ext(ifp, current_ext); 5735 gotp = xfs_iext_get_ext(ifp, current_ext);
5679 } 5736 }
5680 5737
5681 /* Check if we are done */ 5738 if (!*done) {
5682 if (current_ext == total_extents) {
5683 *done = 1;
5684 } else if (next_fsb) {
5685 xfs_bmbt_get_all(gotp, &got); 5739 xfs_bmbt_get_all(gotp, &got);
5686 *next_fsb = got.br_startoff; 5740 *next_fsb = got.br_startoff;
5687 } 5741 }
@@ -5696,3 +5750,189 @@ del_cursor:
5696 5750
5697 return error; 5751 return error;
5698} 5752}
5753
5754/*
5755 * Splits an extent into two extents at the split_fsb block, such that
5756 * split_fsb becomes the first block of the new extent. @current_ext is
5757 * the target extent to be split. @split_fsb is the block where the
5758 * extent is split. If it lies in a hole or at the start of an extent, return 0.
5759 */
5760STATIC int
5761xfs_bmap_split_extent_at(
5762 struct xfs_trans *tp,
5763 struct xfs_inode *ip,
5764 xfs_fileoff_t split_fsb,
5765 xfs_fsblock_t *firstfsb,
5766 struct xfs_bmap_free *free_list)
5767{
5768 int whichfork = XFS_DATA_FORK;
5769 struct xfs_btree_cur *cur = NULL;
5770 struct xfs_bmbt_rec_host *gotp;
5771 struct xfs_bmbt_irec got;
5772 struct xfs_bmbt_irec new; /* split extent */
5773 struct xfs_mount *mp = ip->i_mount;
5774 struct xfs_ifork *ifp;
5775 xfs_fsblock_t gotblkcnt; /* new block count for got */
5776 xfs_extnum_t current_ext;
5777 int error = 0;
5778 int logflags = 0;
5779 int i = 0;
5780
5781 if (unlikely(XFS_TEST_ERROR(
5782 (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
5783 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
5784 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
5785 XFS_ERROR_REPORT("xfs_bmap_split_extent_at",
5786 XFS_ERRLEVEL_LOW, mp);
5787 return -EFSCORRUPTED;
5788 }
5789
5790 if (XFS_FORCED_SHUTDOWN(mp))
5791 return -EIO;
5792
5793 ifp = XFS_IFORK_PTR(ip, whichfork);
5794 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
5795 /* Read in all the extents */
5796 error = xfs_iread_extents(tp, ip, whichfork);
5797 if (error)
5798 return error;
5799 }
5800
5801 /*
5802 * gotp can be null in 2 cases: 1) if there are no extents
5803 * or 2) split_fsb lies in a hole beyond which there are
5804 * no extents. Either way, we are done.
5805 */
5806 gotp = xfs_iext_bno_to_ext(ifp, split_fsb, &current_ext);
5807 if (!gotp)
5808 return 0;
5809
5810 xfs_bmbt_get_all(gotp, &got);
5811
5812 /*
5813 * Check if split_fsb lies in a hole or at the start boundary offset
5814 * of the extent.
5815 */
5816 if (got.br_startoff >= split_fsb)
5817 return 0;
5818
5819 gotblkcnt = split_fsb - got.br_startoff;
5820 new.br_startoff = split_fsb;
5821 new.br_startblock = got.br_startblock + gotblkcnt;
5822 new.br_blockcount = got.br_blockcount - gotblkcnt;
5823 new.br_state = got.br_state;
5824
5825 if (ifp->if_flags & XFS_IFBROOT) {
5826 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5827 cur->bc_private.b.firstblock = *firstfsb;
5828 cur->bc_private.b.flist = free_list;
5829 cur->bc_private.b.flags = 0;
5830 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5831 got.br_startblock,
5832 got.br_blockcount,
5833 &i);
5834 if (error)
5835 goto del_cursor;
5836 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
5837 }
5838
5839 xfs_bmbt_set_blockcount(gotp, gotblkcnt);
5840 got.br_blockcount = gotblkcnt;
5841
5842 logflags = XFS_ILOG_CORE;
5843 if (cur) {
5844 error = xfs_bmbt_update(cur, got.br_startoff,
5845 got.br_startblock,
5846 got.br_blockcount,
5847 got.br_state);
5848 if (error)
5849 goto del_cursor;
5850 } else
5851 logflags |= XFS_ILOG_DEXT;
5852
5853 /* Add new extent */
5854 current_ext++;
5855 xfs_iext_insert(ip, current_ext, 1, &new, 0);
5856 XFS_IFORK_NEXT_SET(ip, whichfork,
5857 XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
5858
5859 if (cur) {
5860 error = xfs_bmbt_lookup_eq(cur, new.br_startoff,
5861 new.br_startblock, new.br_blockcount,
5862 &i);
5863 if (error)
5864 goto del_cursor;
5865 XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor);
5866 cur->bc_rec.b.br_state = new.br_state;
5867
5868 error = xfs_btree_insert(cur, &i);
5869 if (error)
5870 goto del_cursor;
5871 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor);
5872 }
5873
5874 /*
5875 * Convert to a btree if necessary.
5876 */
5877 if (xfs_bmap_needs_btree(ip, whichfork)) {
5878 int tmp_logflags; /* partial log flag return val */
5879
5880 ASSERT(cur == NULL);
5881 error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list,
5882 &cur, 0, &tmp_logflags, whichfork);
5883 logflags |= tmp_logflags;
5884 }
5885
5886del_cursor:
5887 if (cur) {
5888 cur->bc_private.b.allocated = 0;
5889 xfs_btree_del_cursor(cur,
5890 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5891 }
5892
5893 if (logflags)
5894 xfs_trans_log_inode(tp, ip, logflags);
5895 return error;
5896}
5897
5898int
5899xfs_bmap_split_extent(
5900 struct xfs_inode *ip,
5901 xfs_fileoff_t split_fsb)
5902{
5903 struct xfs_mount *mp = ip->i_mount;
5904 struct xfs_trans *tp;
5905 struct xfs_bmap_free free_list;
5906 xfs_fsblock_t firstfsb;
5907 int committed;
5908 int error;
5909
5910 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
5911 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
5912 XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
5913 if (error) {
5914 xfs_trans_cancel(tp, 0);
5915 return error;
5916 }
5917
5918 xfs_ilock(ip, XFS_ILOCK_EXCL);
5919 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
5920
5921 xfs_bmap_init(&free_list, &firstfsb);
5922
5923 error = xfs_bmap_split_extent_at(tp, ip, split_fsb,
5924 &firstfsb, &free_list);
5925 if (error)
5926 goto out;
5927
5928 error = xfs_bmap_finish(&tp, &free_list, &committed);
5929 if (error)
5930 goto out;
5931
5932 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
5933
5934
5935out:
5936 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
5937 return error;
5938}
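
The xfs_bmap.c portion of this diff does three related things: it threads the xfs_mount through the XFS_WANT_CORRUPTED_GOTO/RETURN corruption checks, it moves the in-core free-space accounting onto the new xfs_mod_fdblocks()/xfs_mod_frextents() helpers, and it generalizes the extent-shift machinery (xfs_bmse_shift_one/xfs_bmap_shift_extents) to walk right as well as left, adding xfs_bmap_split_extent() so an insert-range operation can first split the extent straddling the insertion point. A minimal standalone sketch of the direction-dependent neighbor check follows, with simplified types rather than the kernel's xfs_bmbt_irec and a hypothetical helper name:

#include <stdbool.h>
#include <stdint.h>

enum shift_direction { SHIFT_LEFT = 0, SHIFT_RIGHT };

struct extent {
	uint64_t startoff;	/* file offset of the first block */
	uint64_t blockcount;	/* length in filesystem blocks */
};

/*
 * can_shift() distills the bounds checks in xfs_bmse_shift_one(): an
 * extent may only move into the hole on the side it is shifting
 * toward. 'adj' is the neighbor on that side, or NULL when the extent
 * is the first (left shift) or last (right shift) in the file. The
 * kernel additionally merges a mergeable left neighbor on a left
 * shift, which this sketch omits.
 */
static bool can_shift(const struct extent *got, const struct extent *adj,
		      uint64_t shift, enum shift_direction dir)
{
	if (dir == SHIFT_LEFT) {
		if (got->startoff < shift)
			return false;	/* would move past offset zero */
		if (adj && got->startoff - shift <
			   adj->startoff + adj->blockcount)
			return false;	/* would land on the left neighbor */
	} else {
		if (adj && got->startoff + got->blockcount + shift >
			   adj->startoff)
			return false;	/* would land on the right neighbor */
	}
	return true;
}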
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b9d8a499d2c4..6aaa0c1c7200 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
166 */ 166 */
167#define XFS_BMAP_MAX_SHIFT_EXTENTS 1 167#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
168 168
169enum shift_direction {
170 SHIFT_LEFT = 0,
171 SHIFT_RIGHT,
172};
173
169#ifdef DEBUG 174#ifdef DEBUG
170void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, 175void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
171 int whichfork, unsigned long caller_ip); 176 int whichfork, unsigned long caller_ip);
@@ -211,8 +216,10 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
211 xfs_extnum_t num); 216 xfs_extnum_t num);
212uint xfs_default_attroffset(struct xfs_inode *ip); 217uint xfs_default_attroffset(struct xfs_inode *ip);
213int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, 218int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
214 xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, 219 xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
215 int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, 220 int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
216 struct xfs_bmap_free *flist, int num_exts); 221 struct xfs_bmap_free *flist, enum shift_direction direction,
222 int num_exts);
223int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
217 224
218#endif /* __XFS_BMAP_H__ */ 225#endif /* __XFS_BMAP_H__ */
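
Header-side view of the same rework: the shift entry point now takes the resume cursor (next_fsb) by reference, an explicit stop_fsb bound, and a shift_direction. Below is a hedged sketch of the caller loop this prototype implies; the real callers are the collapse-range/insert-range paths in fs/xfs/xfs_bmap_util.c, and transaction setup, locking, and the finish/commit steps are compressed into comments. For SHIFT_RIGHT the first call would pass *next_fsb == NULLFSBLOCK, which the new code takes as a cue to start from the last extent and walk down toward stop_fsb.

/* Sketch only -- collapse_range_sketch is a hypothetical name. */
static int collapse_range_sketch(
	struct xfs_trans	*tp,		/* assume caller set it up */
	struct xfs_inode	*ip,
	xfs_fileoff_t		start_fsb,	/* first offset to pull left */
	xfs_fileoff_t		shift_fsb)	/* distance to shift */
{
	struct xfs_bmap_free	free_list;
	xfs_fsblock_t		first_block;
	xfs_fileoff_t		next_fsb = start_fsb;	/* in/out cursor */
	xfs_fileoff_t		stop_fsb = 0;	/* ignored on a left shift */
	int			done = 0;
	int			error = 0;

	while (!done && !error) {
		xfs_bmap_init(&free_list, &first_block);
		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
					       &done, stop_fsb, &first_block,
					       &free_list, SHIFT_LEFT,
					       XFS_BMAP_MAX_SHIFT_EXTENTS);
		/* ... xfs_bmap_finish() and roll the transaction here ... */
	}
	return error;
}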
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 81cad433df85..c72283dd8d44 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -168,7 +168,7 @@ xfs_btree_check_lptr(
168 xfs_fsblock_t bno, /* btree block disk address */ 168 xfs_fsblock_t bno, /* btree block disk address */
169 int level) /* btree block level */ 169 int level) /* btree block level */
170{ 170{
171 XFS_WANT_CORRUPTED_RETURN( 171 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
172 level > 0 && 172 level > 0 &&
173 bno != NULLFSBLOCK && 173 bno != NULLFSBLOCK &&
174 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); 174 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
@@ -187,7 +187,7 @@ xfs_btree_check_sptr(
187{ 187{
188 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; 188 xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
189 189
190 XFS_WANT_CORRUPTED_RETURN( 190 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
191 level > 0 && 191 level > 0 &&
192 bno != NULLAGBLOCK && 192 bno != NULLAGBLOCK &&
193 bno != 0 && 193 bno != 0 &&
@@ -1825,7 +1825,7 @@ xfs_btree_lookup(
1825 error = xfs_btree_increment(cur, 0, &i); 1825 error = xfs_btree_increment(cur, 0, &i);
1826 if (error) 1826 if (error)
1827 goto error0; 1827 goto error0;
1828 XFS_WANT_CORRUPTED_RETURN(i == 1); 1828 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1829 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); 1829 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1830 *stat = 1; 1830 *stat = 1;
1831 return 0; 1831 return 0;
@@ -2285,7 +2285,7 @@ xfs_btree_rshift(
2285 if (error) 2285 if (error)
2286 goto error0; 2286 goto error0;
2287 i = xfs_btree_lastrec(tcur, level); 2287 i = xfs_btree_lastrec(tcur, level);
2288 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 2288 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2289 2289
2290 error = xfs_btree_increment(tcur, level, &i); 2290 error = xfs_btree_increment(tcur, level, &i);
2291 if (error) 2291 if (error)
@@ -3138,7 +3138,7 @@ xfs_btree_insert(
3138 goto error0; 3138 goto error0;
3139 } 3139 }
3140 3140
3141 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3141 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3142 level++; 3142 level++;
3143 3143
3144 /* 3144 /*
@@ -3582,15 +3582,15 @@ xfs_btree_delrec(
3582 * Actually any entry but the first would suffice. 3582 * Actually any entry but the first would suffice.
3583 */ 3583 */
3584 i = xfs_btree_lastrec(tcur, level); 3584 i = xfs_btree_lastrec(tcur, level);
3585 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3585 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3586 3586
3587 error = xfs_btree_increment(tcur, level, &i); 3587 error = xfs_btree_increment(tcur, level, &i);
3588 if (error) 3588 if (error)
3589 goto error0; 3589 goto error0;
3590 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3590 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3591 3591
3592 i = xfs_btree_lastrec(tcur, level); 3592 i = xfs_btree_lastrec(tcur, level);
3593 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3593 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3594 3594
3595 /* Grab a pointer to the block. */ 3595 /* Grab a pointer to the block. */
3596 right = xfs_btree_get_block(tcur, level, &rbp); 3596 right = xfs_btree_get_block(tcur, level, &rbp);
@@ -3634,12 +3634,12 @@ xfs_btree_delrec(
3634 rrecs = xfs_btree_get_numrecs(right); 3634 rrecs = xfs_btree_get_numrecs(right);
3635 if (!xfs_btree_ptr_is_null(cur, &lptr)) { 3635 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3636 i = xfs_btree_firstrec(tcur, level); 3636 i = xfs_btree_firstrec(tcur, level);
3637 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3637 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3638 3638
3639 error = xfs_btree_decrement(tcur, level, &i); 3639 error = xfs_btree_decrement(tcur, level, &i);
3640 if (error) 3640 if (error)
3641 goto error0; 3641 goto error0;
3642 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3642 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3643 } 3643 }
3644 } 3644 }
3645 3645
@@ -3653,13 +3653,13 @@ xfs_btree_delrec(
3653 * previous block. 3653 * previous block.
3654 */ 3654 */
3655 i = xfs_btree_firstrec(tcur, level); 3655 i = xfs_btree_firstrec(tcur, level);
3656 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3656 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3657 3657
3658 error = xfs_btree_decrement(tcur, level, &i); 3658 error = xfs_btree_decrement(tcur, level, &i);
3659 if (error) 3659 if (error)
3660 goto error0; 3660 goto error0;
3661 i = xfs_btree_firstrec(tcur, level); 3661 i = xfs_btree_firstrec(tcur, level);
3662 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 3662 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3663 3663
3664 /* Grab a pointer to the block. */ 3664 /* Grab a pointer to the block. */
3665 left = xfs_btree_get_block(tcur, level, &lbp); 3665 left = xfs_btree_get_block(tcur, level, &lbp);
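
The change running through xfs_btree.c (and the directory, inode and bmap code in this series) is mechanical: XFS_WANT_CORRUPTED_GOTO() and XFS_WANT_CORRUPTED_RETURN() now take the xfs_mount as their first argument, so a failed in-core consistency check can be reported against a specific filesystem and fail with -EFSCORRUPTED on production kernels, instead of only tripping a DEBUG assert. A plausible shape for the updated macro; the authoritative definition lives in fs/xfs/xfs_linux.h, so treat this as a sketch:

/*
 * Assert in DEBUG builds; on production kernels report the corruption
 * against the mount and bail out instead of silently continuing.
 * Assumes a local 'int error' and a 'label' in the calling function,
 * as all the call sites in this diff do.
 */
#define XFS_WANT_CORRUPTED_GOTO(mp, expr, label)		\
do {								\
	int fs_is_ok = (expr);					\
	ASSERT(fs_is_ok);					\
	if (unlikely(!fs_is_ok)) {				\
		XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO",	\
				 XFS_ERRLEVEL_LOW, (mp));	\
		error = -EFSCORRUPTED;				\
		goto label;					\
	}							\
} while (0)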
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 9cb0115c6bd1..2385f8cd08ab 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -538,12 +538,12 @@ xfs_da3_root_split(
538 oldroot = blk1->bp->b_addr; 538 oldroot = blk1->bp->b_addr;
539 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || 539 if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
540 oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { 540 oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
541 struct xfs_da3_icnode_hdr nodehdr; 541 struct xfs_da3_icnode_hdr icnodehdr;
542 542
543 dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); 543 dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot);
544 btree = dp->d_ops->node_tree_p(oldroot); 544 btree = dp->d_ops->node_tree_p(oldroot);
545 size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); 545 size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
546 level = nodehdr.level; 546 level = icnodehdr.level;
547 547
548 /* 548 /*
549 * we are about to copy oldroot to bp, so set up the type 549 * we are about to copy oldroot to bp, so set up the type
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 0a49b0286372..74bcbabfa523 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr {
725 __uint16_t magic; 725 __uint16_t magic;
726 __uint16_t count; 726 __uint16_t count;
727 __uint16_t usedbytes; 727 __uint16_t usedbytes;
728 __uint16_t firstused; 728 /*
729 * firstused is 32-bit here instead of 16-bit like the on-disk variant
730 * to support maximum fsb size of 64k without overflow issues throughout
731 * the attr code. Instead, the overflow condition is handled on
732 * conversion to/from disk.
733 */
734 __uint32_t firstused;
729 __u8 holes; 735 __u8 holes;
730 struct { 736 struct {
731 __uint16_t base; 737 __uint16_t base;
@@ -734,6 +740,12 @@ struct xfs_attr3_icleaf_hdr {
734}; 740};
735 741
736/* 742/*
743 * Special value to represent fs block size in the leaf header firstused field.
744 * Only used when block size overflows the 2-bytes available on disk.
745 */
746#define XFS_ATTR3_LEAF_NULLOFF 0
747
748/*
737 * Flags used in the leaf_entry[i].flags field. 749 * Flags used in the leaf_entry[i].flags field.
738 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified 750 * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
739 * on the system call, they are "or"ed together for various operations. 751 * on the system call, they are "or"ed together for various operations.
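
The widened firstused field exists because a 64k filesystem block makes the "entirely free" value (65536) overflow the 16-bit on-disk field; the in-core header carries 32 bits, and the overflow is folded into the disk conversion using the XFS_ATTR3_LEAF_NULLOFF sentinel defined above. A hedged sketch of that conversion; the real helpers live in fs/xfs/libxfs/xfs_attr_leaf.c, and the function names below are illustrative:

#include <stdint.h>

/* Map the 32-bit in-core firstused to the 16-bit disk field and back.
 * Only the single overflowing value -- firstused equal to a 64k block
 * size -- needs the XFS_ATTR3_LEAF_NULLOFF (0) sentinel on disk. */
static uint16_t attr3_firstused_to_disk(uint32_t firstused,
					uint32_t blocksize)
{
	if (firstused == blocksize && blocksize > UINT16_MAX)
		return XFS_ATTR3_LEAF_NULLOFF;	/* 65536 won't fit */
	return (uint16_t)firstused;
}

static uint32_t attr3_firstused_from_disk(uint16_t disk_firstused,
					  uint32_t blocksize)
{
	if (disk_firstused == XFS_ATTR3_LEAF_NULLOFF)
		return blocksize;		/* recover the 64k value */
	return disk_firstused;
}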
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 5ff31be9b1cd..de1ea16f5748 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -89,7 +89,7 @@ __xfs_dir3_data_check(
89 * so just ensure that the count falls somewhere inside the 89 * so just ensure that the count falls somewhere inside the
90 * block right now. 90 * block right now.
91 */ 91 */
92 XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < 92 XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) <
93 ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); 93 ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
94 break; 94 break;
95 case cpu_to_be32(XFS_DIR3_DATA_MAGIC): 95 case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
@@ -107,21 +107,21 @@ __xfs_dir3_data_check(
107 bf = ops->data_bestfree_p(hdr); 107 bf = ops->data_bestfree_p(hdr);
108 count = lastfree = freeseen = 0; 108 count = lastfree = freeseen = 0;
109 if (!bf[0].length) { 109 if (!bf[0].length) {
110 XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); 110 XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset);
111 freeseen |= 1 << 0; 111 freeseen |= 1 << 0;
112 } 112 }
113 if (!bf[1].length) { 113 if (!bf[1].length) {
114 XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); 114 XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset);
115 freeseen |= 1 << 1; 115 freeseen |= 1 << 1;
116 } 116 }
117 if (!bf[2].length) { 117 if (!bf[2].length) {
118 XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); 118 XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset);
119 freeseen |= 1 << 2; 119 freeseen |= 1 << 2;
120 } 120 }
121 121
122 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= 122 XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >=
123 be16_to_cpu(bf[1].length)); 123 be16_to_cpu(bf[1].length));
124 XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= 124 XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >=
125 be16_to_cpu(bf[2].length)); 125 be16_to_cpu(bf[2].length));
126 /* 126 /*
127 * Loop over the data/unused entries. 127 * Loop over the data/unused entries.
@@ -134,18 +134,18 @@ __xfs_dir3_data_check(
134 * doesn't need to be there. 134 * doesn't need to be there.
135 */ 135 */
136 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { 136 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
137 XFS_WANT_CORRUPTED_RETURN(lastfree == 0); 137 XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0);
138 XFS_WANT_CORRUPTED_RETURN( 138 XFS_WANT_CORRUPTED_RETURN(mp,
139 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == 139 be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
140 (char *)dup - (char *)hdr); 140 (char *)dup - (char *)hdr);
141 dfp = xfs_dir2_data_freefind(hdr, bf, dup); 141 dfp = xfs_dir2_data_freefind(hdr, bf, dup);
142 if (dfp) { 142 if (dfp) {
143 i = (int)(dfp - bf); 143 i = (int)(dfp - bf);
144 XFS_WANT_CORRUPTED_RETURN( 144 XFS_WANT_CORRUPTED_RETURN(mp,
145 (freeseen & (1 << i)) == 0); 145 (freeseen & (1 << i)) == 0);
146 freeseen |= 1 << i; 146 freeseen |= 1 << i;
147 } else { 147 } else {
148 XFS_WANT_CORRUPTED_RETURN( 148 XFS_WANT_CORRUPTED_RETURN(mp,
149 be16_to_cpu(dup->length) <= 149 be16_to_cpu(dup->length) <=
150 be16_to_cpu(bf[2].length)); 150 be16_to_cpu(bf[2].length));
151 } 151 }
@@ -160,13 +160,13 @@ __xfs_dir3_data_check(
160 * The linear search is crude but this is DEBUG code. 160 * The linear search is crude but this is DEBUG code.
161 */ 161 */
162 dep = (xfs_dir2_data_entry_t *)p; 162 dep = (xfs_dir2_data_entry_t *)p;
163 XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); 163 XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0);
164 XFS_WANT_CORRUPTED_RETURN( 164 XFS_WANT_CORRUPTED_RETURN(mp,
165 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); 165 !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
166 XFS_WANT_CORRUPTED_RETURN( 166 XFS_WANT_CORRUPTED_RETURN(mp,
167 be16_to_cpu(*ops->data_entry_tag_p(dep)) == 167 be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
168 (char *)dep - (char *)hdr); 168 (char *)dep - (char *)hdr);
169 XFS_WANT_CORRUPTED_RETURN( 169 XFS_WANT_CORRUPTED_RETURN(mp,
170 ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); 170 ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
171 count++; 171 count++;
172 lastfree = 0; 172 lastfree = 0;
@@ -183,14 +183,15 @@ __xfs_dir3_data_check(
183 be32_to_cpu(lep[i].hashval) == hash) 183 be32_to_cpu(lep[i].hashval) == hash)
184 break; 184 break;
185 } 185 }
186 XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); 186 XFS_WANT_CORRUPTED_RETURN(mp,
187 i < be32_to_cpu(btp->count));
187 } 188 }
188 p += ops->data_entsize(dep->namelen); 189 p += ops->data_entsize(dep->namelen);
189 } 190 }
190 /* 191 /*
191 * Need to have seen all the entries and all the bestfree slots. 192 * Need to have seen all the entries and all the bestfree slots.
192 */ 193 */
193 XFS_WANT_CORRUPTED_RETURN(freeseen == 7); 194 XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7);
194 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || 195 if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
195 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { 196 hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
196 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { 197 for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
@@ -198,13 +199,13 @@ __xfs_dir3_data_check(
198 cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) 199 cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
199 stale++; 200 stale++;
200 if (i > 0) 201 if (i > 0)
201 XFS_WANT_CORRUPTED_RETURN( 202 XFS_WANT_CORRUPTED_RETURN(mp,
202 be32_to_cpu(lep[i].hashval) >= 203 be32_to_cpu(lep[i].hashval) >=
203 be32_to_cpu(lep[i - 1].hashval)); 204 be32_to_cpu(lep[i - 1].hashval));
204 } 205 }
205 XFS_WANT_CORRUPTED_RETURN(count == 206 XFS_WANT_CORRUPTED_RETURN(mp, count ==
206 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); 207 be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
207 XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); 208 XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale));
208 } 209 }
209 return 0; 210 return 0;
210} 211}
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 8eb718979383..4daaa662337b 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -264,68 +264,6 @@ typedef struct xfs_dsb {
264 /* must be padded to 64 bit alignment */ 264 /* must be padded to 64 bit alignment */
265} xfs_dsb_t; 265} xfs_dsb_t;
266 266
267/*
268 * Sequence number values for the fields.
269 */
270typedef enum {
271 XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS,
272 XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO,
273 XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS,
274 XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS,
275 XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE,
276 XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG,
277 XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG,
278 XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT,
279 XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO,
280 XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN,
281 XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG,
282 XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT,
283 XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT,
284 XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT,
285 XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD,
286 XFS_SBS_PQUOTINO, XFS_SBS_LSN,
287 XFS_SBS_FIELDCOUNT
288} xfs_sb_field_t;
289
290/*
291 * Mask values, defined based on the xfs_sb_field_t values.
292 * Only define the ones we're using.
293 */
294#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x)
295#define XFS_SB_UUID XFS_SB_MVAL(UUID)
296#define XFS_SB_FNAME XFS_SB_MVAL(FNAME)
297#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO)
298#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO)
299#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO)
300#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM)
301#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO)
302#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO)
303#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS)
304#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN)
305#define XFS_SB_UNIT XFS_SB_MVAL(UNIT)
306#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH)
307#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT)
308#define XFS_SB_IFREE XFS_SB_MVAL(IFREE)
309#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS)
310#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \
311 XFS_SB_MVAL(BAD_FEATURES2))
312#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT)
313#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
314#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
315#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT)
316#define XFS_SB_CRC XFS_SB_MVAL(CRC)
317#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO)
318#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT)
319#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1)
320#define XFS_SB_MOD_BITS \
321 (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \
322 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
323 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
324 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
325 XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
326 XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
327 XFS_SB_PQUOTINO)
328
329 267
330/* 268/*
331 * Misc. Flags - warning - these will be cleared by xfs_repair unless 269 * Misc. Flags - warning - these will be cleared by xfs_repair unless
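
The deleted xfs_sb_field_t/XFS_SB_* block served the old xfs_mod_incore_sb()/xfs_icsb_modify_counters() interface, where a caller named a superblock field by index in order to modify it. This series moves icount, ifree and fdblocks onto generic per-cpu counters with dedicated helpers, which is why the earlier hunks replace those calls with xfs_mod_fdblocks()/xfs_mod_frextents(). A simplified, hedged sketch of the core of the new free-block helper; the real xfs_mod_fdblocks() in fs/xfs/xfs_mount.c additionally manages the reserved block pool and a batch size for large deltas:

/* Sketch: apply the delta to the per-cpu counter and undo it if a
 * subtraction would drive free blocks negative. The 'rsvd' flag
 * (permission to dip into the reserved pool) is part of the real
 * signature but ignored here. */
int xfs_mod_fdblocks_sketch(struct xfs_mount *mp, int64_t delta, bool rsvd)
{
	percpu_counter_add(&mp->m_fdblocks, delta);
	if (delta >= 0)
		return 0;

	/* percpu_counter_compare() sums the per-cpu deviations when the
	 * counter is near the comparison point, so this check is exact
	 * even though plain percpu reads are approximate. */
	if (percpu_counter_compare(&mp->m_fdblocks, 0) < 0) {
		percpu_counter_add(&mp->m_fdblocks, -delta);	/* undo */
		return -ENOSPC;
	}
	return 0;
}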
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 116ef1ddb3e3..07349a183a11 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -376,7 +376,8 @@ xfs_ialloc_ag_alloc(
376 */ 376 */
377 newlen = args.mp->m_ialloc_inos; 377 newlen = args.mp->m_ialloc_inos;
378 if (args.mp->m_maxicount && 378 if (args.mp->m_maxicount &&
379 args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) 379 percpu_counter_read(&args.mp->m_icount) + newlen >
380 args.mp->m_maxicount)
380 return -ENOSPC; 381 return -ENOSPC;
381 args.minlen = args.maxlen = args.mp->m_ialloc_blks; 382 args.minlen = args.maxlen = args.mp->m_ialloc_blks;
382 /* 383 /*
@@ -700,7 +701,7 @@ xfs_ialloc_next_rec(
700 error = xfs_inobt_get_rec(cur, rec, &i); 701 error = xfs_inobt_get_rec(cur, rec, &i);
701 if (error) 702 if (error)
702 return error; 703 return error;
703 XFS_WANT_CORRUPTED_RETURN(i == 1); 704 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
704 } 705 }
705 706
706 return 0; 707 return 0;
@@ -724,7 +725,7 @@ xfs_ialloc_get_rec(
724 error = xfs_inobt_get_rec(cur, rec, &i); 725 error = xfs_inobt_get_rec(cur, rec, &i);
725 if (error) 726 if (error)
726 return error; 727 return error;
727 XFS_WANT_CORRUPTED_RETURN(i == 1); 728 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
728 } 729 }
729 730
730 return 0; 731 return 0;
@@ -783,12 +784,12 @@ xfs_dialloc_ag_inobt(
783 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); 784 error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
784 if (error) 785 if (error)
785 goto error0; 786 goto error0;
786 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 787 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
787 788
788 error = xfs_inobt_get_rec(cur, &rec, &j); 789 error = xfs_inobt_get_rec(cur, &rec, &j);
789 if (error) 790 if (error)
790 goto error0; 791 goto error0;
791 XFS_WANT_CORRUPTED_GOTO(j == 1, error0); 792 XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
792 793
793 if (rec.ir_freecount > 0) { 794 if (rec.ir_freecount > 0) {
794 /* 795 /*
@@ -944,19 +945,19 @@ newino:
944 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); 945 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
945 if (error) 946 if (error)
946 goto error0; 947 goto error0;
947 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 948 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
948 949
949 for (;;) { 950 for (;;) {
950 error = xfs_inobt_get_rec(cur, &rec, &i); 951 error = xfs_inobt_get_rec(cur, &rec, &i);
951 if (error) 952 if (error)
952 goto error0; 953 goto error0;
953 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 954 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
954 if (rec.ir_freecount > 0) 955 if (rec.ir_freecount > 0)
955 break; 956 break;
956 error = xfs_btree_increment(cur, 0, &i); 957 error = xfs_btree_increment(cur, 0, &i);
957 if (error) 958 if (error)
958 goto error0; 959 goto error0;
959 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 960 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
960 } 961 }
961 962
962alloc_inode: 963alloc_inode:
@@ -1016,7 +1017,7 @@ xfs_dialloc_ag_finobt_near(
1016 error = xfs_inobt_get_rec(lcur, rec, &i); 1017 error = xfs_inobt_get_rec(lcur, rec, &i);
1017 if (error) 1018 if (error)
1018 return error; 1019 return error;
1019 XFS_WANT_CORRUPTED_RETURN(i == 1); 1020 XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
1020 1021
1021 /* 1022 /*
1022 * See if we've landed in the parent inode record. The finobt 1023 * See if we've landed in the parent inode record. The finobt
@@ -1039,10 +1040,10 @@ xfs_dialloc_ag_finobt_near(
1039 error = xfs_inobt_get_rec(rcur, &rrec, &j); 1040 error = xfs_inobt_get_rec(rcur, &rrec, &j);
1040 if (error) 1041 if (error)
1041 goto error_rcur; 1042 goto error_rcur;
1042 XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); 1043 XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
1043 } 1044 }
1044 1045
1045 XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); 1046 XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
1046 if (i == 1 && j == 1) { 1047 if (i == 1 && j == 1) {
1047 /* 1048 /*
1048 * Both the left and right records are valid. Choose the closer 1049 * Both the left and right records are valid. Choose the closer
@@ -1095,7 +1096,7 @@ xfs_dialloc_ag_finobt_newino(
1095 error = xfs_inobt_get_rec(cur, rec, &i); 1096 error = xfs_inobt_get_rec(cur, rec, &i);
1096 if (error) 1097 if (error)
1097 return error; 1098 return error;
1098 XFS_WANT_CORRUPTED_RETURN(i == 1); 1099 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1099 return 0; 1100 return 0;
1100 } 1101 }
1101 } 1102 }
@@ -1106,12 +1107,12 @@ xfs_dialloc_ag_finobt_newino(
1106 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); 1107 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1107 if (error) 1108 if (error)
1108 return error; 1109 return error;
1109 XFS_WANT_CORRUPTED_RETURN(i == 1); 1110 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1110 1111
1111 error = xfs_inobt_get_rec(cur, rec, &i); 1112 error = xfs_inobt_get_rec(cur, rec, &i);
1112 if (error) 1113 if (error)
1113 return error; 1114 return error;
1114 XFS_WANT_CORRUPTED_RETURN(i == 1); 1115 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1115 1116
1116 return 0; 1117 return 0;
1117} 1118}
@@ -1133,19 +1134,19 @@ xfs_dialloc_ag_update_inobt(
1133 error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); 1134 error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
1134 if (error) 1135 if (error)
1135 return error; 1136 return error;
1136 XFS_WANT_CORRUPTED_RETURN(i == 1); 1137 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1137 1138
1138 error = xfs_inobt_get_rec(cur, &rec, &i); 1139 error = xfs_inobt_get_rec(cur, &rec, &i);
1139 if (error) 1140 if (error)
1140 return error; 1141 return error;
1141 XFS_WANT_CORRUPTED_RETURN(i == 1); 1142 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1142 ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % 1143 ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
1143 XFS_INODES_PER_CHUNK) == 0); 1144 XFS_INODES_PER_CHUNK) == 0);
1144 1145
1145 rec.ir_free &= ~XFS_INOBT_MASK(offset); 1146 rec.ir_free &= ~XFS_INOBT_MASK(offset);
1146 rec.ir_freecount--; 1147 rec.ir_freecount--;
1147 1148
1148 XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && 1149 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
1149 (rec.ir_freecount == frec->ir_freecount)); 1150 (rec.ir_freecount == frec->ir_freecount));
1150 1151
1151 return xfs_inobt_update(cur, &rec); 1152 return xfs_inobt_update(cur, &rec);
@@ -1340,7 +1341,8 @@ xfs_dialloc(
1340 * inode. 1341 * inode.
1341 */ 1342 */
1342 if (mp->m_maxicount && 1343 if (mp->m_maxicount &&
1343 mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { 1344 percpu_counter_read(&mp->m_icount) + mp->m_ialloc_inos >
1345 mp->m_maxicount) {
1344 noroom = 1; 1346 noroom = 1;
1345 okalloc = 0; 1347 okalloc = 0;
1346 } 1348 }
@@ -1475,14 +1477,14 @@ xfs_difree_inobt(
1475 __func__, error); 1477 __func__, error);
1476 goto error0; 1478 goto error0;
1477 } 1479 }
1478 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1480 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1479 error = xfs_inobt_get_rec(cur, &rec, &i); 1481 error = xfs_inobt_get_rec(cur, &rec, &i);
1480 if (error) { 1482 if (error) {
1481 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", 1483 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1482 __func__, error); 1484 __func__, error);
1483 goto error0; 1485 goto error0;
1484 } 1486 }
1485 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1487 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1486 /* 1488 /*
1487 * Get the offset in the inode chunk. 1489 * Get the offset in the inode chunk.
1488 */ 1490 */
@@ -1592,7 +1594,7 @@ xfs_difree_finobt(
1592 * freed an inode in a previously fully allocated chunk. If not, 1594 * freed an inode in a previously fully allocated chunk. If not,
1593 * something is out of sync. 1595 * something is out of sync.
1594 */ 1596 */
1595 XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); 1597 XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
1596 1598
1597 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, 1599 error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
1598 ibtrec->ir_free, &i); 1600 ibtrec->ir_free, &i);
@@ -1613,12 +1615,12 @@ xfs_difree_finobt(
1613 error = xfs_inobt_get_rec(cur, &rec, &i); 1615 error = xfs_inobt_get_rec(cur, &rec, &i);
1614 if (error) 1616 if (error)
1615 goto error; 1617 goto error;
1616 XFS_WANT_CORRUPTED_GOTO(i == 1, error); 1618 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
1617 1619
1618 rec.ir_free |= XFS_INOBT_MASK(offset); 1620 rec.ir_free |= XFS_INOBT_MASK(offset);
1619 rec.ir_freecount++; 1621 rec.ir_freecount++;
1620 1622
1621 XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && 1623 XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
1622 (rec.ir_freecount == ibtrec->ir_freecount), 1624 (rec.ir_freecount == ibtrec->ir_freecount),
1623 error); 1625 error);
1624 1626
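
Note: the inode allocation hunks above replace the locked read of sb_icount with percpu_counter_read(&mp->m_icount), a fast but approximate value; a slightly stale count only makes the noroom/okalloc heuristic conservative. A minimal userspace sketch of that pattern, with all names invented for illustration:

#include <stdio.h>
#include <stdint.h>

#define NR_SHARDS 4	/* pretend 4 CPUs */

/* Toy analogue of a percpu counter: one shard per CPU. */
struct pcp_counter {
	int64_t shard[NR_SHARDS];
};

/*
 * Fast, approximate read: sum the shards without locking, so the result
 * may race with concurrent updates (like percpu_counter_read()).
 */
static int64_t pcp_read(const struct pcp_counter *c)
{
	int64_t sum = 0;

	for (int i = 0; i < NR_SHARDS; i++)
		sum += c->shard[i];
	return sum;
}

int main(void)
{
	struct pcp_counter icount = { .shard = { 1000, 2000, 500, 250 } };
	int64_t maxicount = 4000, ialloc_inos = 64;

	/* Heuristic limit check in the style of xfs_dialloc(): staleness
	 * here only makes the allocator a little conservative. */
	int noroom = pcp_read(&icount) + ialloc_inos > maxicount;

	printf("noroom = %d\n", noroom);
	return 0;
}
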
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index b0a5fe95a3e2..dc4bfc5d88fc 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -111,14 +111,6 @@ xfs_mount_validate_sb(
111 bool check_inprogress, 111 bool check_inprogress,
112 bool check_version) 112 bool check_version)
113{ 113{
114
115 /*
116 * If the log device and data device have the
117 * same device number, the log is internal.
118 * Consequently, the sb_logstart should be non-zero. If
119 * we have a zero sb_logstart in this case, we may be trying to mount
120 * a volume filesystem in a non-volume manner.
121 */
122 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 114 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
123 xfs_warn(mp, "bad magic number"); 115 xfs_warn(mp, "bad magic number");
124 return -EWRONGFS; 116 return -EWRONGFS;
@@ -743,17 +735,15 @@ xfs_initialize_perag_data(
743 btree += pag->pagf_btreeblks; 735 btree += pag->pagf_btreeblks;
744 xfs_perag_put(pag); 736 xfs_perag_put(pag);
745 } 737 }
746 /* 738
747 * Overwrite incore superblock counters with just-read data 739 /* Overwrite incore superblock counters with just-read data */
748 */
749 spin_lock(&mp->m_sb_lock); 740 spin_lock(&mp->m_sb_lock);
750 sbp->sb_ifree = ifree; 741 sbp->sb_ifree = ifree;
751 sbp->sb_icount = ialloc; 742 sbp->sb_icount = ialloc;
752 sbp->sb_fdblocks = bfree + bfreelst + btree; 743 sbp->sb_fdblocks = bfree + bfreelst + btree;
753 spin_unlock(&mp->m_sb_lock); 744 spin_unlock(&mp->m_sb_lock);
754 745
755 /* Fixup the per-cpu counters as well. */ 746 xfs_reinit_percpu_counters(mp);
756 xfs_icsb_reinit_counters(mp);
757 747
758 return 0; 748 return 0;
759} 749}
@@ -771,6 +761,10 @@ xfs_log_sb(
771 struct xfs_mount *mp = tp->t_mountp; 761 struct xfs_mount *mp = tp->t_mountp;
772 struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); 762 struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0);
773 763
764 mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
765 mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
766 mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
767
774 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); 768 xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
775 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); 769 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
776 xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); 770 xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
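
Note: xfs_log_sb() now folds the per-cpu counters back into the in-core superblock with percpu_counter_sum() before the buffer is logged, so the exact walk over every CPU's shard happens only at sync points. A rough sketch of that fold-before-persist step (types and field names below are stand-ins):

#include <stdio.h>
#include <stdint.h>

#define NR_SHARDS 4

struct shard_counter {
	int64_t shard[NR_SHARDS];
};

/* In-memory image of the superblock fields being persisted. */
struct disk_sb {
	int64_t sb_icount;
	int64_t sb_ifree;
	int64_t sb_fdblocks;
};

/* Exact fold of every shard: the expensive walk the kernel reserves for
 * sync points such as xfs_log_sb(). */
static int64_t counter_sum(const struct shard_counter *c)
{
	int64_t sum = 0;

	for (int i = 0; i < NR_SHARDS; i++)
		sum += c->shard[i];
	return sum;
}

int main(void)
{
	struct shard_counter icount = {{ 40, 20, 3, 1 }};
	struct shard_counter ifree = {{ 5, 1, 0, 2 }};
	struct shard_counter fdblocks = {{ 900, 100, 50, 25 }};
	struct disk_sb sb;

	/* Fold the lazy counters into the superblock image before writing. */
	sb.sb_icount = counter_sum(&icount);
	sb.sb_ifree = counter_sum(&ifree);
	sb.sb_fdblocks = counter_sum(&fdblocks);

	printf("icount=%lld ifree=%lld fdblocks=%lld\n",
	       (long long)sb.sb_icount, (long long)sb.sb_ifree,
	       (long long)sb.sb_fdblocks);
	return 0;
}
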
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1d8eef9cf0f5..a56960dd1684 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1232,6 +1232,117 @@ xfs_vm_releasepage(
1232 return try_to_free_buffers(page); 1232 return try_to_free_buffers(page);
1233} 1233}
1234 1234
1235/*
1236 * When we map a DIO buffer, we may need to attach an ioend that describes the
1237 * type of write IO we are doing. This passes to the completion function the
1238 * operations it needs to perform. If the mapping is for an overwrite wholly
1239 * within the EOF then we don't need an ioend and so we don't allocate one.
1240 * This avoids the unnecessary overhead of allocating and freeing ioends for
1241 * workloads that don't require transactions on IO completion.
1242 *
1243 * If we get multiple mappings in a single IO, we might be mapping different
1244 * types. But because the direct IO can only have a single private pointer, we
1245 * need to ensure that:
1246 *
1247 * a) i) the ioend spans the entire region of unwritten mappings; or
1248 * ii) the ioend spans all the mappings that cross or are beyond EOF; and
1249 * b) if it contains unwritten extents, it is *permanently* marked as such
1250 *
1251 * We could do this by chaining ioends like buffered IO does, but we only
1252 * actually get one IO completion callback from the direct IO, and that spans
1253 * the entire IO regardless of how many mappings and IOs are needed to complete
1254 * the DIO. There is only going to be one reference to the ioend and its life
 1255 * cycle is constrained by the DIO completion code. Hence we don't need
1256 * reference counting here.
1257 */
1258static void
1259xfs_map_direct(
1260 struct inode *inode,
1261 struct buffer_head *bh_result,
1262 struct xfs_bmbt_irec *imap,
1263 xfs_off_t offset)
1264{
1265 struct xfs_ioend *ioend;
1266 xfs_off_t size = bh_result->b_size;
1267 int type;
1268
1269 if (ISUNWRITTEN(imap))
1270 type = XFS_IO_UNWRITTEN;
1271 else
1272 type = XFS_IO_OVERWRITE;
1273
1274 trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
1275
1276 if (bh_result->b_private) {
1277 ioend = bh_result->b_private;
1278 ASSERT(ioend->io_size > 0);
1279 ASSERT(offset >= ioend->io_offset);
1280 if (offset + size > ioend->io_offset + ioend->io_size)
1281 ioend->io_size = offset - ioend->io_offset + size;
1282
1283 if (type == XFS_IO_UNWRITTEN && type != ioend->io_type)
1284 ioend->io_type = XFS_IO_UNWRITTEN;
1285
1286 trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset,
1287 ioend->io_size, ioend->io_type,
1288 imap);
1289 } else if (type == XFS_IO_UNWRITTEN ||
1290 offset + size > i_size_read(inode)) {
1291 ioend = xfs_alloc_ioend(inode, type);
1292 ioend->io_offset = offset;
1293 ioend->io_size = size;
1294
1295 bh_result->b_private = ioend;
1296 set_buffer_defer_completion(bh_result);
1297
1298 trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type,
1299 imap);
1300 } else {
1301 trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
1302 imap);
1303 }
1304}
1305
1306/*
 1307 * If this is O_DIRECT or the mpage code calling, tell them how large the
 1308 * mapping is, so that we can avoid repeated get_blocks calls.
1309 *
1310 * If the mapping spans EOF, then we have to break the mapping up as the mapping
1311 * for blocks beyond EOF must be marked new so that sub block regions can be
1312 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1313 * was just allocated or is unwritten, otherwise the callers would overwrite
1314 * existing data with zeros. Hence we have to split the mapping into a range up
1315 * to and including EOF, and a second mapping for beyond EOF.
1316 */
1317static void
1318xfs_map_trim_size(
1319 struct inode *inode,
1320 sector_t iblock,
1321 struct buffer_head *bh_result,
1322 struct xfs_bmbt_irec *imap,
1323 xfs_off_t offset,
1324 ssize_t size)
1325{
1326 xfs_off_t mapping_size;
1327
1328 mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1329 mapping_size <<= inode->i_blkbits;
1330
1331 ASSERT(mapping_size > 0);
1332 if (mapping_size > size)
1333 mapping_size = size;
1334 if (offset < i_size_read(inode) &&
1335 offset + mapping_size >= i_size_read(inode)) {
1336 /* limit mapping to block that spans EOF */
1337 mapping_size = roundup_64(i_size_read(inode) - offset,
1338 1 << inode->i_blkbits);
1339 }
1340 if (mapping_size > LONG_MAX)
1341 mapping_size = LONG_MAX;
1342
1343 bh_result->b_size = mapping_size;
1344}
1345
1235STATIC int 1346STATIC int
1236__xfs_get_blocks( 1347__xfs_get_blocks(
1237 struct inode *inode, 1348 struct inode *inode,
@@ -1320,31 +1431,37 @@ __xfs_get_blocks(
1320 1431
1321 xfs_iunlock(ip, lockmode); 1432 xfs_iunlock(ip, lockmode);
1322 } 1433 }
1323 1434 trace_xfs_get_blocks_alloc(ip, offset, size,
1324 trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); 1435 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1436 : XFS_IO_DELALLOC, &imap);
1325 } else if (nimaps) { 1437 } else if (nimaps) {
1326 trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); 1438 trace_xfs_get_blocks_found(ip, offset, size,
1439 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1440 : XFS_IO_OVERWRITE, &imap);
1327 xfs_iunlock(ip, lockmode); 1441 xfs_iunlock(ip, lockmode);
1328 } else { 1442 } else {
1329 trace_xfs_get_blocks_notfound(ip, offset, size); 1443 trace_xfs_get_blocks_notfound(ip, offset, size);
1330 goto out_unlock; 1444 goto out_unlock;
1331 } 1445 }
1332 1446
1447 /* trim mapping down to size requested */
1448 if (direct || size > (1 << inode->i_blkbits))
1449 xfs_map_trim_size(inode, iblock, bh_result,
1450 &imap, offset, size);
1451
1452 /*
1453 * For unwritten extents do not report a disk address in the buffered
1454 * read case (treat as if we're reading into a hole).
1455 */
1333 if (imap.br_startblock != HOLESTARTBLOCK && 1456 if (imap.br_startblock != HOLESTARTBLOCK &&
1334 imap.br_startblock != DELAYSTARTBLOCK) { 1457 imap.br_startblock != DELAYSTARTBLOCK &&
1335 /* 1458 (create || !ISUNWRITTEN(&imap))) {
1336 * For unwritten extents do not report a disk address on 1459 xfs_map_buffer(inode, bh_result, &imap, offset);
1337 * the read case (treat as if we're reading into a hole). 1460 if (ISUNWRITTEN(&imap))
1338 */
1339 if (create || !ISUNWRITTEN(&imap))
1340 xfs_map_buffer(inode, bh_result, &imap, offset);
1341 if (create && ISUNWRITTEN(&imap)) {
1342 if (direct) {
1343 bh_result->b_private = inode;
1344 set_buffer_defer_completion(bh_result);
1345 }
1346 set_buffer_unwritten(bh_result); 1461 set_buffer_unwritten(bh_result);
1347 } 1462 /* direct IO needs special help */
1463 if (create && direct)
1464 xfs_map_direct(inode, bh_result, &imap, offset);
1348 } 1465 }
1349 1466
1350 /* 1467 /*
@@ -1377,39 +1494,6 @@ __xfs_get_blocks(
1377 } 1494 }
1378 } 1495 }
1379 1496
1380 /*
1381 * If this is O_DIRECT or the mpage code calling tell them how large
1382 * the mapping is, so that we can avoid repeated get_blocks calls.
1383 *
1384 * If the mapping spans EOF, then we have to break the mapping up as the
1385 * mapping for blocks beyond EOF must be marked new so that sub block
1386 * regions can be correctly zeroed. We can't do this for mappings within
1387 * EOF unless the mapping was just allocated or is unwritten, otherwise
1388 * the callers would overwrite existing data with zeros. Hence we have
1389 * to split the mapping into a range up to and including EOF, and a
1390 * second mapping for beyond EOF.
1391 */
1392 if (direct || size > (1 << inode->i_blkbits)) {
1393 xfs_off_t mapping_size;
1394
1395 mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
1396 mapping_size <<= inode->i_blkbits;
1397
1398 ASSERT(mapping_size > 0);
1399 if (mapping_size > size)
1400 mapping_size = size;
1401 if (offset < i_size_read(inode) &&
1402 offset + mapping_size >= i_size_read(inode)) {
1403 /* limit mapping to block that spans EOF */
1404 mapping_size = roundup_64(i_size_read(inode) - offset,
1405 1 << inode->i_blkbits);
1406 }
1407 if (mapping_size > LONG_MAX)
1408 mapping_size = LONG_MAX;
1409
1410 bh_result->b_size = mapping_size;
1411 }
1412
1413 return 0; 1497 return 0;
1414 1498
1415out_unlock: 1499out_unlock:
@@ -1440,9 +1524,11 @@ xfs_get_blocks_direct(
1440/* 1524/*
1441 * Complete a direct I/O write request. 1525 * Complete a direct I/O write request.
1442 * 1526 *
1443 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1527 * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
1444 * need to issue a transaction to convert the range from unwritten to written 1528 * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
1445 * extents. 1529 * wholly within the EOF and so there is nothing for us to do. Note that in this
1530 * case the completion can be called in interrupt context, whereas if we have an
1531 * ioend we will always be called in task context (i.e. from a workqueue).
1446 */ 1532 */
1447STATIC void 1533STATIC void
1448xfs_end_io_direct_write( 1534xfs_end_io_direct_write(
@@ -1454,43 +1540,71 @@ xfs_end_io_direct_write(
1454 struct inode *inode = file_inode(iocb->ki_filp); 1540 struct inode *inode = file_inode(iocb->ki_filp);
1455 struct xfs_inode *ip = XFS_I(inode); 1541 struct xfs_inode *ip = XFS_I(inode);
1456 struct xfs_mount *mp = ip->i_mount; 1542 struct xfs_mount *mp = ip->i_mount;
1543 struct xfs_ioend *ioend = private;
1457 1544
1458 if (XFS_FORCED_SHUTDOWN(mp)) 1545 trace_xfs_gbmap_direct_endio(ip, offset, size,
1546 ioend ? ioend->io_type : 0, NULL);
1547
1548 if (!ioend) {
1549 ASSERT(offset + size <= i_size_read(inode));
1459 return; 1550 return;
1551 }
1552
1553 if (XFS_FORCED_SHUTDOWN(mp))
1554 goto out_end_io;
1460 1555
1461 /* 1556 /*
 1462 * While the generic direct I/O code updates the inode size, it does 1557 * DIO completion end_io functions are only called on writes if more
 1463 * so only after the end_io handler is called, which means our 1558 * than 0 bytes were written.
1464 * end_io handler thinks the on-disk size is outside the in-core
1465 * size. To prevent this just update it a little bit earlier here.
1466 */ 1559 */
1560 ASSERT(size > 0);
1561
1562 /*
1563 * The ioend only maps whole blocks, while the IO may be sector aligned.
1564 * Hence the ioend offset/size may not match the IO offset/size exactly.
1565 * Because we don't map overwrites within EOF into the ioend, the offset
1566 * may not match, but only if the endio spans EOF. Either way, write
1567 * the IO sizes into the ioend so that completion processing does the
1568 * right thing.
1569 */
1570 ASSERT(offset + size <= ioend->io_offset + ioend->io_size);
1571 ioend->io_size = size;
1572 ioend->io_offset = offset;
1573
1574 /*
1575 * The ioend tells us whether we are doing unwritten extent conversion
1576 * or an append transaction that updates the on-disk file size. These
1577 * cases are the only cases where we should *potentially* be needing
1578 * to update the VFS inode size.
1579 *
1580 * We need to update the in-core inode size here so that we don't end up
1581 * with the on-disk inode size being outside the in-core inode size. We
1582 * have no other method of updating EOF for AIO, so always do it here
1583 * if necessary.
1584 *
1585 * We need to lock the test/set EOF update as we can be racing with
1586 * other IO completions here to update the EOF. Failing to serialise
1587 * here can result in EOF moving backwards and Bad Things Happen when
1588 * that occurs.
1589 */
1590 spin_lock(&ip->i_flags_lock);
1467 if (offset + size > i_size_read(inode)) 1591 if (offset + size > i_size_read(inode))
1468 i_size_write(inode, offset + size); 1592 i_size_write(inode, offset + size);
1593 spin_unlock(&ip->i_flags_lock);
1469 1594
1470 /* 1595 /*
1471 * For direct I/O we do not know if we need to allocate blocks or not, 1596 * If we are doing an append IO that needs to update the EOF on disk,
1472 * so we can't preallocate an append transaction, as that results in 1597 * do the transaction reserve now so we can use common end io
1473 * nested reservations and log space deadlocks. Hence allocate the 1598 * processing. Stashing the error (if there is one) in the ioend will
1474 * transaction here. While this is sub-optimal and can block IO 1599 * result in the ioend processing passing on the error if it is
1475 * completion for some time, we're stuck with doing it this way until 1600 * possible as we can't return it from here.
1476 * we can pass the ioend to the direct IO allocation callbacks and
1477 * avoid nesting that way.
1478 */ 1601 */
1479 if (private && size > 0) { 1602 if (ioend->io_type == XFS_IO_OVERWRITE)
1480 xfs_iomap_write_unwritten(ip, offset, size); 1603 ioend->io_error = xfs_setfilesize_trans_alloc(ioend);
1481 } else if (offset + size > ip->i_d.di_size) {
1482 struct xfs_trans *tp;
1483 int error;
1484
1485 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
1486 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
1487 if (error) {
1488 xfs_trans_cancel(tp, 0);
1489 return;
1490 }
1491 1604
1492 xfs_setfilesize(ip, tp, offset, size); 1605out_end_io:
1493 } 1606 xfs_end_io(&ioend->io_work);
1607 return;
1494} 1608}
1495 1609
1496STATIC ssize_t 1610STATIC ssize_t
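
Note: xfs_map_direct() keeps a single ioend alive across all the mappings of one direct IO and simply widens it, which is why no reference counting is needed. The size update is plain span arithmetic; a self-contained sketch of the extension rule used above (struct span is an invented stand-in for struct xfs_ioend):

#include <assert.h>
#include <stdio.h>

/* Invented stand-in for struct xfs_ioend: one [offset, offset+size) span. */
struct span {
	long long offset;
	long long size;
};

/*
 * Mirror of the update in xfs_map_direct(): widen the span so it still
 * covers a new mapping at (offset, size); offsets only ever grow.
 */
static void span_extend(struct span *s, long long offset, long long size)
{
	assert(size > 0 && offset >= s->offset);
	if (offset + size > s->offset + s->size)
		s->size = offset - s->offset + size;
}

int main(void)
{
	struct span ioend = { .offset = 0, .size = 4096 };

	span_extend(&ioend, 4096, 8192);	/* second mapping of the DIO */
	printf("span: [%lld, %lld)\n", ioend.offset,
	       ioend.offset + ioend.size);
	return 0;
}
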
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 83af4c149635..f9c1c64782d3 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive(
132 int size; 132 int size;
133 int tmp; 133 int tmp;
134 int i; 134 int i;
135 struct xfs_mount *mp = bp->b_target->bt_mount;
135 136
136 leaf = bp->b_addr; 137 leaf = bp->b_addr;
137 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 138 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
138 139
139 /* 140 /*
140 * Count the number of "remote" value extents. 141 * Count the number of "remote" value extents.
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index a43d370d2c58..65fb37a18e92 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
225 int error, i; 225 int error, i;
226 struct xfs_buf *bp; 226 struct xfs_buf *bp;
227 struct xfs_inode *dp = context->dp; 227 struct xfs_inode *dp = context->dp;
228 struct xfs_mount *mp = dp->i_mount;
228 229
229 trace_xfs_attr_node_list(context); 230 trace_xfs_attr_node_list(context);
230 231
@@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
256 case XFS_ATTR_LEAF_MAGIC: 257 case XFS_ATTR_LEAF_MAGIC:
257 case XFS_ATTR3_LEAF_MAGIC: 258 case XFS_ATTR3_LEAF_MAGIC:
258 leaf = bp->b_addr; 259 leaf = bp->b_addr;
259 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); 260 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo,
261 &leafhdr, leaf);
260 entries = xfs_attr3_leaf_entryp(leaf); 262 entries = xfs_attr3_leaf_entryp(leaf);
261 if (cursor->hashval > be32_to_cpu( 263 if (cursor->hashval > be32_to_cpu(
262 entries[leafhdr.count - 1].hashval)) { 264 entries[leafhdr.count - 1].hashval)) {
@@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
340 xfs_trans_brelse(NULL, bp); 342 xfs_trans_brelse(NULL, bp);
341 return error; 343 return error;
342 } 344 }
343 xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); 345 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf);
344 if (context->seen_enough || leafhdr.forw == 0) 346 if (context->seen_enough || leafhdr.forw == 0)
345 break; 347 break;
346 cursor->blkno = leafhdr.forw; 348 cursor->blkno = leafhdr.forw;
@@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int(
368 struct xfs_attr_leaf_entry *entry; 370 struct xfs_attr_leaf_entry *entry;
369 int retval; 371 int retval;
370 int i; 372 int i;
373 struct xfs_mount *mp = context->dp->i_mount;
371 374
372 trace_xfs_attr_list_leaf(context); 375 trace_xfs_attr_list_leaf(context);
373 376
374 leaf = bp->b_addr; 377 leaf = bp->b_addr;
375 xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); 378 xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
376 entries = xfs_attr3_leaf_entryp(leaf); 379 entries = xfs_attr3_leaf_entryp(leaf);
377 380
378 cursor = context->cursor; 381 cursor = context->cursor;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 22a5dcb70b32..a52bbd3abc7d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1376,22 +1376,19 @@ out:
1376} 1376}
1377 1377
1378/* 1378/*
1379 * xfs_collapse_file_space() 1379 * @next_fsb will keep track of the extent currently undergoing shift.
1380 * This routine frees disk space and shift extent for the given file. 1380 * @stop_fsb will keep track of the extent at which we have to stop.
1381 * The first thing we do is to free data blocks in the specified range 1381 * If we are shifting left, we will start with block (offset + len) and
 1382 * by calling xfs_free_file_space(). It would also sync dirty data 1382 * shift each extent until the last extent.
 1383 * and invalidate page cache over the region on which collapse range 1383 * If we are shifting right, we will start with the last extent inside file space
1384 * is working. And Shift extent records to the left to cover a hole. 1384 * and continue until we reach the block corresponding to offset.
1385 * RETURNS:
1386 * 0 on success
1387 * errno on error
1388 *
1389 */ 1385 */
1390int 1386static int
1391xfs_collapse_file_space( 1387xfs_shift_file_space(
1392 struct xfs_inode *ip, 1388 struct xfs_inode *ip,
1393 xfs_off_t offset, 1389 xfs_off_t offset,
1394 xfs_off_t len) 1390 xfs_off_t len,
1391 enum shift_direction direction)
1395{ 1392{
1396 int done = 0; 1393 int done = 0;
1397 struct xfs_mount *mp = ip->i_mount; 1394 struct xfs_mount *mp = ip->i_mount;
@@ -1400,21 +1397,26 @@ xfs_collapse_file_space(
1400 struct xfs_bmap_free free_list; 1397 struct xfs_bmap_free free_list;
1401 xfs_fsblock_t first_block; 1398 xfs_fsblock_t first_block;
1402 int committed; 1399 int committed;
1403 xfs_fileoff_t start_fsb; 1400 xfs_fileoff_t stop_fsb;
1404 xfs_fileoff_t next_fsb; 1401 xfs_fileoff_t next_fsb;
1405 xfs_fileoff_t shift_fsb; 1402 xfs_fileoff_t shift_fsb;
1406 1403
1407 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1404 ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
1408 1405
1409 trace_xfs_collapse_file_space(ip); 1406 if (direction == SHIFT_LEFT) {
1407 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1408 stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
1409 } else {
1410 /*
1411 * If right shift, delegate the work of initialization of
1412 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
1413 */
1414 next_fsb = NULLFSBLOCK;
1415 stop_fsb = XFS_B_TO_FSB(mp, offset);
1416 }
1410 1417
1411 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1412 shift_fsb = XFS_B_TO_FSB(mp, len); 1418 shift_fsb = XFS_B_TO_FSB(mp, len);
1413 1419
1414 error = xfs_free_file_space(ip, offset, len);
1415 if (error)
1416 return error;
1417
1418 /* 1420 /*
1419 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation 1421 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1420 * into the accessible region of the file. 1422 * into the accessible region of the file.
@@ -1427,20 +1429,28 @@ xfs_collapse_file_space(
1427 1429
1428 /* 1430 /*
1429 * Writeback and invalidate cache for the remainder of the file as we're 1431 * Writeback and invalidate cache for the remainder of the file as we're
1430 * about to shift down every extent from the collapse range to EOF. The 1432 * about to shift down every extent from offset to EOF.
1431 * free of the collapse range above might have already done some of
1432 * this, but we shouldn't rely on it to do anything outside of the range
1433 * that was freed.
1434 */ 1433 */
1435 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 1434 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1436 offset + len, -1); 1435 offset, -1);
1437 if (error) 1436 if (error)
1438 return error; 1437 return error;
1439 error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 1438 error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1440 (offset + len) >> PAGE_CACHE_SHIFT, -1); 1439 offset >> PAGE_CACHE_SHIFT, -1);
1441 if (error) 1440 if (error)
1442 return error; 1441 return error;
1443 1442
1443 /*
 1444 * The extent shifting code works on extent granularity. So, if
 1445 * stop_fsb is not the starting block of an extent, we need to split
1446 * the extent at stop_fsb.
1447 */
1448 if (direction == SHIFT_RIGHT) {
1449 error = xfs_bmap_split_extent(ip, stop_fsb);
1450 if (error)
1451 return error;
1452 }
1453
1444 while (!error && !done) { 1454 while (!error && !done) {
1445 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 1455 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1446 /* 1456 /*
@@ -1464,7 +1474,7 @@ xfs_collapse_file_space(
1464 if (error) 1474 if (error)
1465 goto out; 1475 goto out;
1466 1476
1467 xfs_trans_ijoin(tp, ip, 0); 1477 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1468 1478
1469 xfs_bmap_init(&free_list, &first_block); 1479 xfs_bmap_init(&free_list, &first_block);
1470 1480
@@ -1472,10 +1482,9 @@ xfs_collapse_file_space(
1472 * We are using the write transaction in which max 2 bmbt 1482 * We are using the write transaction in which max 2 bmbt
1473 * updates are allowed 1483 * updates are allowed
1474 */ 1484 */
1475 start_fsb = next_fsb; 1485 error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
1476 error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb, 1486 &done, stop_fsb, &first_block, &free_list,
1477 &done, &next_fsb, &first_block, &free_list, 1487 direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
1478 XFS_BMAP_MAX_SHIFT_EXTENTS);
1479 if (error) 1488 if (error)
1480 goto out; 1489 goto out;
1481 1490
@@ -1484,18 +1493,70 @@ xfs_collapse_file_space(
1484 goto out; 1493 goto out;
1485 1494
1486 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1495 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1487 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1488 } 1496 }
1489 1497
1490 return error; 1498 return error;
1491 1499
1492out: 1500out:
1493 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); 1501 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1494 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1495 return error; 1502 return error;
1496} 1503}
1497 1504
1498/* 1505/*
1506 * xfs_collapse_file_space()
1507 * This routine frees disk space and shift extent for the given file.
1508 * The first thing we do is to free data blocks in the specified range
1509 * by calling xfs_free_file_space(). It would also sync dirty data
1510 * and invalidate page cache over the region on which collapse range
1511 * is working. And Shift extent records to the left to cover a hole.
1512 * RETURNS:
1513 * 0 on success
1514 * errno on error
1515 *
1516 */
1517int
1518xfs_collapse_file_space(
1519 struct xfs_inode *ip,
1520 xfs_off_t offset,
1521 xfs_off_t len)
1522{
1523 int error;
1524
1525 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1526 trace_xfs_collapse_file_space(ip);
1527
1528 error = xfs_free_file_space(ip, offset, len);
1529 if (error)
1530 return error;
1531
1532 return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
1533}
1534
1535/*
1536 * xfs_insert_file_space()
1537 * This routine create hole space by shifting extents for the given file.
1538 * The first thing we do is to sync dirty data and invalidate page cache
1539 * over the region on which insert range is working. And split an extent
1540 * to two extents at given offset by calling xfs_bmap_split_extent.
1541 * And shift all extent records which are laying between [offset,
1542 * last allocated extent] to the right to reserve hole range.
1543 * RETURNS:
1544 * 0 on success
1545 * errno on error
1546 */
1547int
1548xfs_insert_file_space(
1549 struct xfs_inode *ip,
1550 loff_t offset,
1551 loff_t len)
1552{
1553 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1554 trace_xfs_insert_file_space(ip);
1555
1556 return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
1557}
1558
1559/*
1499 * We need to check that the format of the data fork in the temporary inode is 1560 * We need to check that the format of the data fork in the temporary inode is
1500 * valid for the target inode before doing the swap. This is not a problem with 1561 * valid for the target inode before doing the swap. This is not a problem with
1501 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized 1562 * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
@@ -1599,13 +1660,6 @@ xfs_swap_extent_flush(
1599 /* Verify O_DIRECT for ftmp */ 1660 /* Verify O_DIRECT for ftmp */
1600 if (VFS_I(ip)->i_mapping->nrpages) 1661 if (VFS_I(ip)->i_mapping->nrpages)
1601 return -EINVAL; 1662 return -EINVAL;
1602
1603 /*
1604 * Don't try to swap extents on mmap()d files because we can't lock
1605 * out races against page faults safely.
1606 */
1607 if (mapping_mapped(VFS_I(ip)->i_mapping))
1608 return -EBUSY;
1609 return 0; 1663 return 0;
1610} 1664}
1611 1665
@@ -1633,13 +1687,14 @@ xfs_swap_extents(
1633 } 1687 }
1634 1688
1635 /* 1689 /*
1636 * Lock up the inodes against other IO and truncate to begin with. 1690 * Lock the inodes against other IO, page faults and truncate to
1637 * Then we can ensure the inodes are flushed and have no page cache 1691 * begin with. Then we can ensure the inodes are flushed and have no
1638 * safely. Once we have done this we can take the ilocks and do the rest 1692 * page cache safely. Once we have done this we can take the ilocks and
1639 * of the checks. 1693 * do the rest of the checks.
1640 */ 1694 */
1641 lock_flags = XFS_IOLOCK_EXCL; 1695 lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1642 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 1696 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1697 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1643 1698
1644 /* Verify that both files have the same format */ 1699 /* Verify that both files have the same format */
1645 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 1700 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
@@ -1666,8 +1721,16 @@ xfs_swap_extents(
1666 xfs_trans_cancel(tp, 0); 1721 xfs_trans_cancel(tp, 0);
1667 goto out_unlock; 1722 goto out_unlock;
1668 } 1723 }
1724
1725 /*
 1726 * Lock and join the inodes to the transaction so that transaction commit
1727 * or cancel will unlock the inodes from this point onwards.
1728 */
1669 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); 1729 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1670 lock_flags |= XFS_ILOCK_EXCL; 1730 lock_flags |= XFS_ILOCK_EXCL;
1731 xfs_trans_ijoin(tp, ip, lock_flags);
1732 xfs_trans_ijoin(tp, tip, lock_flags);
1733
1671 1734
1672 /* Verify all data are being swapped */ 1735 /* Verify all data are being swapped */
1673 if (sxp->sx_offset != 0 || 1736 if (sxp->sx_offset != 0 ||
@@ -1720,9 +1783,6 @@ xfs_swap_extents(
1720 goto out_trans_cancel; 1783 goto out_trans_cancel;
1721 } 1784 }
1722 1785
1723 xfs_trans_ijoin(tp, ip, lock_flags);
1724 xfs_trans_ijoin(tp, tip, lock_flags);
1725
1726 /* 1786 /*
1727 * Before we've swapped the forks, lets set the owners of the forks 1787 * Before we've swapped the forks, lets set the owners of the forks
1728 * appropriately. We have to do this as we are demand paging the btree 1788 * appropriately. We have to do this as we are demand paging the btree
@@ -1856,5 +1916,5 @@ out_unlock:
1856 1916
1857out_trans_cancel: 1917out_trans_cancel:
1858 xfs_trans_cancel(tp, 0); 1918 xfs_trans_cancel(tp, 0);
1859 goto out_unlock; 1919 goto out;
1860} 1920}
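
Note: xfs_shift_file_space() derives its start and stop blocks from the shift direction before entering the transaction loop: collapse walks left from (offset + len) to EOF, insert walks right from the last extent down to offset. A small sketch of just that selection, with the byte-to-block conversion simplified to an assumed 4k block size:

#include <stdio.h>

enum shift_direction { SHIFT_LEFT, SHIFT_RIGHT };

#define FSB_SHIFT	12		/* assume 4k filesystem blocks */
#define B_TO_FSB(b)	((b) >> FSB_SHIFT)
#define NULLFSBLOCK	(-1LL)

int main(void)
{
	long long offset = 1 << 20, len = 1 << 16, isize = 8 << 20;
	long long next_fsb, stop_fsb;
	enum shift_direction direction = SHIFT_LEFT;

	if (direction == SHIFT_LEFT) {
		/* Collapse: start just past the hole, stop at current EOF. */
		next_fsb = B_TO_FSB(offset + len);
		stop_fsb = B_TO_FSB(isize);
	} else {
		/* Insert: the real code leaves next_fsb for
		 * xfs_bmap_shift_extents() to initialise with the ilock
		 * held; the walk stops at the block for offset. */
		next_fsb = NULLFSBLOCK;
		stop_fsb = B_TO_FSB(offset);
	}
	printf("next_fsb=%lld stop_fsb=%lld\n", next_fsb, stop_fsb);
	return 0;
}
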
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 736429a72a12..af97d9a1dfb4 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -63,6 +63,8 @@ int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
63 xfs_off_t len); 63 xfs_off_t len);
64int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, 64int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
65 xfs_off_t len); 65 xfs_off_t len);
66int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
67 xfs_off_t len);
66 68
67/* EOF block manipulation functions */ 69/* EOF block manipulation functions */
68bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); 70bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 507d96a57ac7..092d652bc03d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -537,9 +537,9 @@ xfs_buf_item_push(
537 537
538 /* has a previous flush failed due to IO errors? */ 538 /* has a previous flush failed due to IO errors? */
539 if ((bp->b_flags & XBF_WRITE_FAIL) && 539 if ((bp->b_flags & XBF_WRITE_FAIL) &&
540 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { 540 ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) {
541 xfs_warn(bp->b_target->bt_mount, 541 xfs_warn(bp->b_target->bt_mount,
542"Detected failing async write on buffer block 0x%llx. Retrying async write.", 542"Failing async write on buffer block 0x%llx. Retrying async write.",
543 (long long)bp->b_bn); 543 (long long)bp->b_bn);
544 } 544 }
545 545
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 799e5a2d334d..e85a9519a5ae 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -84,7 +84,7 @@ xfs_trim_extents(
84 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); 84 error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
85 if (error) 85 if (error)
86 goto out_del_cursor; 86 goto out_del_cursor;
87 XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); 87 XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor);
88 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); 88 ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest));
89 89
90 /* 90 /*
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 3ee186ac1093..338e50bbfd1e 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -131,7 +131,7 @@ xfs_error_report(
131{ 131{
132 if (level <= xfs_error_level) { 132 if (level <= xfs_error_level) {
133 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, 133 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
134 "Internal error %s at line %d of file %s. Caller %pF", 134 "Internal error %s at line %d of file %s. Caller %pS",
135 tag, linenum, filename, ra); 135 tag, linenum, filename, ra);
136 136
137 xfs_stack_trace(); 137 xfs_stack_trace();
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 279a76e52791..c0394ed126fc 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
40/* 40/*
41 * Macros to set EFSCORRUPTED & return/branch. 41 * Macros to set EFSCORRUPTED & return/branch.
42 */ 42 */
43#define XFS_WANT_CORRUPTED_GOTO(x,l) \ 43#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \
44 { \ 44 { \
45 int fs_is_ok = (x); \ 45 int fs_is_ok = (x); \
46 ASSERT(fs_is_ok); \ 46 ASSERT(fs_is_ok); \
47 if (unlikely(!fs_is_ok)) { \ 47 if (unlikely(!fs_is_ok)) { \
48 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ 48 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
49 XFS_ERRLEVEL_LOW, NULL); \ 49 XFS_ERRLEVEL_LOW, mp); \
50 error = -EFSCORRUPTED; \ 50 error = -EFSCORRUPTED; \
51 goto l; \ 51 goto l; \
52 } \ 52 } \
53 } 53 }
54 54
55#define XFS_WANT_CORRUPTED_RETURN(x) \ 55#define XFS_WANT_CORRUPTED_RETURN(mp, x) \
56 { \ 56 { \
57 int fs_is_ok = (x); \ 57 int fs_is_ok = (x); \
58 ASSERT(fs_is_ok); \ 58 ASSERT(fs_is_ok); \
59 if (unlikely(!fs_is_ok)) { \ 59 if (unlikely(!fs_is_ok)) { \
60 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ 60 XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
61 XFS_ERRLEVEL_LOW, NULL); \ 61 XFS_ERRLEVEL_LOW, mp); \
62 return -EFSCORRUPTED; \ 62 return -EFSCORRUPTED; \
63 } \ 63 } \
64 } 64 }
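
Note: the reworked macros keep their original contract — the caller must have an int error variable and, for the GOTO form, a label in scope — and only gain the mount pointer for the report. A compilable sketch of the same macro shape outside the kernel (the report function and error value are stand-ins):

#include <stdio.h>

#define EFSCORRUPTED 117	/* stand-in value */

struct mount {
	const char *name;
};

static void error_report(const char *tag, const struct mount *mp)
{
	fprintf(stderr, "%s: internal error %s\n", mp ? mp->name : "?", tag);
}

/* Same shape as XFS_WANT_CORRUPTED_GOTO: evaluate once, report with the
 * mount context, set the caller's 'error' and branch to its label. */
#define WANT_CORRUPTED_GOTO(mp, x, l)			\
	do {						\
		int fs_is_ok = (x);			\
		if (!fs_is_ok) {			\
			error_report("corruption", (mp));\
			error = -EFSCORRUPTED;		\
			goto l;				\
		}					\
	} while (0)

static int check_rec(const struct mount *mp, int nrecs)
{
	int error = 0;

	WANT_CORRUPTED_GOTO(mp, nrecs == 1, out);
	return 0;
out:
	return error;
}

int main(void)
{
	struct mount mp = { "demo" };

	printf("ok=%d bad=%d\n", check_rec(&mp, 1), check_rec(&mp, 2));
	return 0;
}
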
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f12ad0a8585..8121e75352ee 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -559,7 +559,7 @@ restart:
559 if (error <= 0) 559 if (error <= 0)
560 return error; 560 return error;
561 561
562 error = xfs_break_layouts(inode, iolock); 562 error = xfs_break_layouts(inode, iolock, true);
563 if (error) 563 if (error)
564 return error; 564 return error;
565 565
@@ -569,21 +569,42 @@ restart:
569 * write. If zeroing is needed and we are currently holding the 569 * write. If zeroing is needed and we are currently holding the
570 * iolock shared, we need to update it to exclusive which implies 570 * iolock shared, we need to update it to exclusive which implies
571 * having to redo all checks before. 571 * having to redo all checks before.
572 *
573 * We need to serialise against EOF updates that occur in IO
574 * completions here. We want to make sure that nobody is changing the
575 * size while we do this check until we have placed an IO barrier (i.e.
576 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
577 * The spinlock effectively forms a memory barrier once we have the
578 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
579 * and hence be able to correctly determine if we need to run zeroing.
572 */ 580 */
581 spin_lock(&ip->i_flags_lock);
573 if (iocb->ki_pos > i_size_read(inode)) { 582 if (iocb->ki_pos > i_size_read(inode)) {
574 bool zero = false; 583 bool zero = false;
575 584
585 spin_unlock(&ip->i_flags_lock);
576 if (*iolock == XFS_IOLOCK_SHARED) { 586 if (*iolock == XFS_IOLOCK_SHARED) {
577 xfs_rw_iunlock(ip, *iolock); 587 xfs_rw_iunlock(ip, *iolock);
578 *iolock = XFS_IOLOCK_EXCL; 588 *iolock = XFS_IOLOCK_EXCL;
579 xfs_rw_ilock(ip, *iolock); 589 xfs_rw_ilock(ip, *iolock);
580 iov_iter_reexpand(from, count); 590 iov_iter_reexpand(from, count);
591
592 /*
593 * We now have an IO submission barrier in place, but
594 * AIO can do EOF updates during IO completion and hence
595 * we now need to wait for all of them to drain. Non-AIO
596 * DIO will have drained before we are given the
597 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
598 * no-op.
599 */
600 inode_dio_wait(inode);
581 goto restart; 601 goto restart;
582 } 602 }
583 error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); 603 error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
584 if (error) 604 if (error)
585 return error; 605 return error;
586 } 606 } else
607 spin_unlock(&ip->i_flags_lock);
587 608
588 /* 609 /*
589 * Updating the timestamps will grab the ilock again from 610 * Updating the timestamps will grab the ilock again from
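
Note: the hunk above samples EOF under ip->i_flags_lock and, if zeroing is needed while only the shared iolock is held, upgrades to exclusive and redoes every check. Reduced to its shape this is a check/upgrade/retry loop; a pseudocode-level sketch using pthread primitives as stand-ins for the XFS locks:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t flags_lock = PTHREAD_MUTEX_INITIALIZER;
static long long i_size = 4096;

/* Mirrors the restart pattern above: sample EOF under the spinlock; if
 * the write starts beyond it with only the shared lock held, upgrade to
 * exclusive and redo every check from the top. */
static void write_checks(long long pos, int *excl)
{
restart:
	pthread_mutex_lock(&flags_lock);
	if (pos > i_size) {
		pthread_mutex_unlock(&flags_lock);
		if (!*excl) {
			pthread_rwlock_unlock(&iolock);
			pthread_rwlock_wrlock(&iolock);
			*excl = 1;
			/* the kernel also drains in-flight AIO here */
			goto restart;
		}
		/* zero the range [i_size, pos) before allowing the write */
	} else {
		pthread_mutex_unlock(&flags_lock);
	}
}

int main(void)
{
	int excl = 0;

	pthread_rwlock_rdlock(&iolock);
	write_checks(8192, &excl);
	pthread_rwlock_unlock(&iolock);
	printf("excl=%d\n", excl);
	return 0;
}
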
@@ -645,6 +666,8 @@ xfs_file_dio_aio_write(
645 int iolock; 666 int iolock;
646 size_t count = iov_iter_count(from); 667 size_t count = iov_iter_count(from);
647 loff_t pos = iocb->ki_pos; 668 loff_t pos = iocb->ki_pos;
669 loff_t end;
670 struct iov_iter data;
648 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? 671 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
649 mp->m_rtdev_targp : mp->m_ddev_targp; 672 mp->m_rtdev_targp : mp->m_ddev_targp;
650 673
@@ -685,10 +708,11 @@ xfs_file_dio_aio_write(
685 goto out; 708 goto out;
686 count = iov_iter_count(from); 709 count = iov_iter_count(from);
687 pos = iocb->ki_pos; 710 pos = iocb->ki_pos;
711 end = pos + count - 1;
688 712
689 if (mapping->nrpages) { 713 if (mapping->nrpages) {
690 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 714 ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
691 pos, pos + count - 1); 715 pos, end);
692 if (ret) 716 if (ret)
693 goto out; 717 goto out;
694 /* 718 /*
@@ -698,7 +722,7 @@ xfs_file_dio_aio_write(
698 */ 722 */
699 ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, 723 ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
700 pos >> PAGE_CACHE_SHIFT, 724 pos >> PAGE_CACHE_SHIFT,
701 (pos + count - 1) >> PAGE_CACHE_SHIFT); 725 end >> PAGE_CACHE_SHIFT);
702 WARN_ON_ONCE(ret); 726 WARN_ON_ONCE(ret);
703 ret = 0; 727 ret = 0;
704 } 728 }
@@ -715,8 +739,22 @@ xfs_file_dio_aio_write(
715 } 739 }
716 740
717 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 741 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
718 ret = generic_file_direct_write(iocb, from, pos);
719 742
743 data = *from;
744 ret = mapping->a_ops->direct_IO(iocb, &data, pos);
745
746 /* see generic_file_direct_write() for why this is necessary */
747 if (mapping->nrpages) {
748 invalidate_inode_pages2_range(mapping,
749 pos >> PAGE_CACHE_SHIFT,
750 end >> PAGE_CACHE_SHIFT);
751 }
752
753 if (ret > 0) {
754 pos += ret;
755 iov_iter_advance(from, ret);
756 iocb->ki_pos = pos;
757 }
720out: 758out:
721 xfs_rw_iunlock(ip, iolock); 759 xfs_rw_iunlock(ip, iolock);
722 760
@@ -822,6 +860,11 @@ xfs_file_write_iter(
822 return ret; 860 return ret;
823} 861}
824 862
863#define XFS_FALLOC_FL_SUPPORTED \
864 (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
865 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
866 FALLOC_FL_INSERT_RANGE)
867
825STATIC long 868STATIC long
826xfs_file_fallocate( 869xfs_file_fallocate(
827 struct file *file, 870 struct file *file,
@@ -835,18 +878,21 @@ xfs_file_fallocate(
835 enum xfs_prealloc_flags flags = 0; 878 enum xfs_prealloc_flags flags = 0;
836 uint iolock = XFS_IOLOCK_EXCL; 879 uint iolock = XFS_IOLOCK_EXCL;
837 loff_t new_size = 0; 880 loff_t new_size = 0;
881 bool do_file_insert = 0;
838 882
839 if (!S_ISREG(inode->i_mode)) 883 if (!S_ISREG(inode->i_mode))
840 return -EINVAL; 884 return -EINVAL;
841 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 885 if (mode & ~XFS_FALLOC_FL_SUPPORTED)
842 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
843 return -EOPNOTSUPP; 886 return -EOPNOTSUPP;
844 887
845 xfs_ilock(ip, iolock); 888 xfs_ilock(ip, iolock);
846 error = xfs_break_layouts(inode, &iolock); 889 error = xfs_break_layouts(inode, &iolock, false);
847 if (error) 890 if (error)
848 goto out_unlock; 891 goto out_unlock;
849 892
893 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
894 iolock |= XFS_MMAPLOCK_EXCL;
895
850 if (mode & FALLOC_FL_PUNCH_HOLE) { 896 if (mode & FALLOC_FL_PUNCH_HOLE) {
851 error = xfs_free_file_space(ip, offset, len); 897 error = xfs_free_file_space(ip, offset, len);
852 if (error) 898 if (error)
@@ -873,6 +919,27 @@ xfs_file_fallocate(
873 error = xfs_collapse_file_space(ip, offset, len); 919 error = xfs_collapse_file_space(ip, offset, len);
874 if (error) 920 if (error)
875 goto out_unlock; 921 goto out_unlock;
922 } else if (mode & FALLOC_FL_INSERT_RANGE) {
923 unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
924
925 new_size = i_size_read(inode) + len;
926 if (offset & blksize_mask || len & blksize_mask) {
927 error = -EINVAL;
928 goto out_unlock;
929 }
930
931 /* check the new inode size does not wrap through zero */
932 if (new_size > inode->i_sb->s_maxbytes) {
933 error = -EFBIG;
934 goto out_unlock;
935 }
936
937 /* Offset should be less than i_size */
938 if (offset >= i_size_read(inode)) {
939 error = -EINVAL;
940 goto out_unlock;
941 }
942 do_file_insert = 1;
876 } else { 943 } else {
877 flags |= XFS_PREALLOC_SET; 944 flags |= XFS_PREALLOC_SET;
878 945
@@ -907,8 +974,19 @@ xfs_file_fallocate(
907 iattr.ia_valid = ATTR_SIZE; 974 iattr.ia_valid = ATTR_SIZE;
908 iattr.ia_size = new_size; 975 iattr.ia_size = new_size;
909 error = xfs_setattr_size(ip, &iattr); 976 error = xfs_setattr_size(ip, &iattr);
977 if (error)
978 goto out_unlock;
910 } 979 }
911 980
981 /*
982 * Perform hole insertion now that the file size has been
983 * updated so that if we crash during the operation we don't
 984 * leave shifted extents past EOF and hence lose access to
 985 * the data contained within them.
986 */
987 if (do_file_insert)
988 error = xfs_insert_file_space(ip, offset, len);
989
912out_unlock: 990out_unlock:
913 xfs_iunlock(ip, iolock); 991 xfs_iunlock(ip, iolock);
914 return error; 992 return error;
@@ -997,20 +1075,6 @@ xfs_file_mmap(
997} 1075}
998 1076
999/* 1077/*
1000 * mmap()d file has taken write protection fault and is being made
1001 * writable. We can set the page state up correctly for a writable
1002 * page, which means we can do correct delalloc accounting (ENOSPC
1003 * checking!) and unwritten extent mapping.
1004 */
1005STATIC int
1006xfs_vm_page_mkwrite(
1007 struct vm_area_struct *vma,
1008 struct vm_fault *vmf)
1009{
1010 return block_page_mkwrite(vma, vmf, xfs_get_blocks);
1011}
1012
1013/*
1014 * This type is designed to indicate the type of offset we would like 1078 * This type is designed to indicate the type of offset we would like
1015 * to search from page cache for xfs_seek_hole_data(). 1079 * to search from page cache for xfs_seek_hole_data().
1016 */ 1080 */
@@ -1385,6 +1449,55 @@ xfs_file_llseek(
1385 } 1449 }
1386} 1450}
1387 1451
1452/*
1453 * Locking for serialisation of IO during page faults. This results in a lock
1454 * ordering of:
1455 *
1456 * mmap_sem (MM)
1457 * i_mmap_lock (XFS - truncate serialisation)
1458 * page_lock (MM)
1459 * i_lock (XFS - extent map serialisation)
1460 */
1461STATIC int
1462xfs_filemap_fault(
1463 struct vm_area_struct *vma,
1464 struct vm_fault *vmf)
1465{
1466 struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
1467 int error;
1468
1469 trace_xfs_filemap_fault(ip);
1470
1471 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1472 error = filemap_fault(vma, vmf);
1473 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1474
1475 return error;
1476}
1477
1478/*
1479 * mmap()d file has taken write protection fault and is being made writable. We
1480 * can set the page state up correctly for a writable page, which means we can
1481 * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
1482 * mapping.
1483 */
1484STATIC int
1485xfs_filemap_page_mkwrite(
1486 struct vm_area_struct *vma,
1487 struct vm_fault *vmf)
1488{
1489 struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host);
1490 int error;
1491
1492 trace_xfs_filemap_page_mkwrite(ip);
1493
1494 xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
1495 error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
1496 xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
1497
1498 return error;
1499}
1500
1388const struct file_operations xfs_file_operations = { 1501const struct file_operations xfs_file_operations = {
1389 .llseek = xfs_file_llseek, 1502 .llseek = xfs_file_llseek,
1390 .read_iter = xfs_file_read_iter, 1503 .read_iter = xfs_file_read_iter,
@@ -1415,7 +1528,7 @@ const struct file_operations xfs_dir_file_operations = {
1415}; 1528};
1416 1529
1417static const struct vm_operations_struct xfs_file_vm_ops = { 1530static const struct vm_operations_struct xfs_file_vm_ops = {
1418 .fault = filemap_fault, 1531 .fault = xfs_filemap_fault,
1419 .map_pages = filemap_map_pages, 1532 .map_pages = filemap_map_pages,
1420 .page_mkwrite = xfs_vm_page_mkwrite, 1533 .page_mkwrite = xfs_filemap_page_mkwrite,
1421}; 1534};
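
Note: with FALLOC_FL_INSERT_RANGE accepted by xfs_file_fallocate(), userspace can open a hole inside a file without rewriting the tail. A minimal usage sketch — the offset and length must be block aligned and the offset must sit below EOF, exactly as the new checks above require:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc != 4) {
		fprintf(stderr, "usage: %s file offset len\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * Shift everything from offset to EOF right by len, leaving a
	 * block-aligned hole. Fails with EINVAL if offset/len are not
	 * aligned or offset is at or beyond EOF, matching the checks in
	 * xfs_file_fallocate().
	 */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE,
		      atoll(argv[2]), atoll(argv[3])) < 0)
		perror("fallocate(FALLOC_FL_INSERT_RANGE)");
	close(fd);
	return 0;
}
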
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index a2e86e8a0fea..8f9f854376c6 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -322,7 +322,7 @@ xfs_filestream_lookup_ag(
322 322
323 pip = xfs_filestream_get_parent(ip); 323 pip = xfs_filestream_get_parent(ip);
324 if (!pip) 324 if (!pip)
325 goto out; 325 return NULLAGNUMBER;
326 326
327 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); 327 mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
328 if (mru) { 328 if (mru) {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 74efe5b760dc..cb7e8a29dfb6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -637,12 +637,13 @@ xfs_fs_counts(
637 xfs_mount_t *mp, 637 xfs_mount_t *mp,
638 xfs_fsop_counts_t *cnt) 638 xfs_fsop_counts_t *cnt)
639{ 639{
640 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 640 cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
641 cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
642 cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
643 XFS_ALLOC_SET_ASIDE(mp);
644
641 spin_lock(&mp->m_sb_lock); 645 spin_lock(&mp->m_sb_lock);
642 cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
643 cnt->freertx = mp->m_sb.sb_frextents; 646 cnt->freertx = mp->m_sb.sb_frextents;
644 cnt->freeino = mp->m_sb.sb_ifree;
645 cnt->allocino = mp->m_sb.sb_icount;
646 spin_unlock(&mp->m_sb_lock); 647 spin_unlock(&mp->m_sb_lock);
647 return 0; 648 return 0;
648} 649}
@@ -692,14 +693,9 @@ xfs_reserve_blocks(
692 * what to do. This means that the amount of free space can 693 * what to do. This means that the amount of free space can
693 * change while we do this, so we need to retry if we end up 694 * change while we do this, so we need to retry if we end up
694 * trying to reserve more space than is available. 695 * trying to reserve more space than is available.
695 *
696 * We also use the xfs_mod_incore_sb() interface so that we
697 * don't have to care about whether per cpu counter are
698 * enabled, disabled or even compiled in....
699 */ 696 */
700retry: 697retry:
701 spin_lock(&mp->m_sb_lock); 698 spin_lock(&mp->m_sb_lock);
702 xfs_icsb_sync_counters_locked(mp, 0);
703 699
704 /* 700 /*
705 * If our previous reservation was larger than the current value, 701 * If our previous reservation was larger than the current value,
@@ -716,7 +712,8 @@ retry:
716 } else { 712 } else {
717 __int64_t free; 713 __int64_t free;
718 714
719 free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); 715 free = percpu_counter_sum(&mp->m_fdblocks) -
716 XFS_ALLOC_SET_ASIDE(mp);
720 if (!free) 717 if (!free)
721 goto out; /* ENOSPC and fdblks_delta = 0 */ 718 goto out; /* ENOSPC and fdblks_delta = 0 */
722 719
@@ -755,8 +752,7 @@ out:
755 * the extra reserve blocks from the reserve..... 752 * the extra reserve blocks from the reserve.....
756 */ 753 */
757 int error; 754 int error;
758 error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, 755 error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
759 fdblks_delta, 0);
760 if (error == -ENOSPC) 756 if (error == -ENOSPC)
761 goto retry; 757 goto retry;
762 } 758 }
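
Note: xfs_reserve_blocks() computes free space from an exact counter sum and simply retries if the subsequent modification races to ENOSPC. Stripped of locking detail, that is an optimistic reserve loop; the counter and modify helper below are stand-ins for percpu_counter_sum() and xfs_mod_fdblocks():

#include <errno.h>
#include <stdio.h>

static long long fdblocks = 1000;	/* free block counter */
static long long set_aside = 100;	/* blocks never handed out */

/* Stand-in for xfs_mod_fdblocks(): refuse to go below zero. */
static int mod_fdblocks(long long delta)
{
	if (fdblocks + delta < 0)
		return -ENOSPC;
	fdblocks += delta;
	return 0;
}

int main(void)
{
	long long request = 2000, resblks = 0;
	long long free, delta;

retry:
	/* Exact read, like percpu_counter_sum() under m_sb_lock. */
	free = fdblocks - set_aside;
	delta = request - resblks;
	if (delta > free)
		delta = free;		/* take what is available */
	if (delta > 0 && mod_fdblocks(-delta) == -ENOSPC)
		goto retry;		/* raced with another allocator */
	resblks += delta;

	printf("reserved %lld of %lld blocks\n", resblks, request);
	return 0;
}
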
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9771b7ef62ed..76a9f2783282 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -439,11 +439,11 @@ again:
439 *ipp = ip; 439 *ipp = ip;
440 440
441 /* 441 /*
 442 * If we have a real type for an on-disk inode, we can set ops(&unlock) 442 * If we have a real type for an on-disk inode, we can set up the inode
443 * now. If it's a new inode being created, xfs_ialloc will handle it. 443 * now. If it's a new inode being created, xfs_ialloc will handle it.
444 */ 444 */
445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) 445 if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
446 xfs_setup_inode(ip); 446 xfs_setup_existing_inode(ip);
447 return 0; 447 return 0;
448 448
449out_error_or_again: 449out_error_or_again:
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6163767aa856..d6ebc85192b7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared(
117} 117}
118 118
119/* 119/*
 120 * The xfs inode contains 2 locks: a multi-reader lock called the 120 * The xfs inode contains 3 multi-reader locks: the i_iolock, the i_mmap_lock and
121 * i_iolock and a multi-reader lock called the i_lock. This routine 121 * the i_lock. This routine allows various combinations of the locks to be
122 * allows either or both of the locks to be obtained. 122 * obtained.
123 * 123 *
124 * The 2 locks should always be ordered so that the IO lock is 124 * The 3 locks should always be ordered so that the IO lock is obtained first,
125 * obtained first in order to prevent deadlock. 125 * the mmap lock second and the ilock last in order to prevent deadlock.
126 * 126 *
127 * ip -- the inode being locked 127 * Basic locking order:
128 * lock_flags -- this parameter indicates the inode's locks 128 *
129 * to be locked. It can be: 129 * i_iolock -> i_mmap_lock -> page_lock -> i_ilock
130 * XFS_IOLOCK_SHARED, 130 *
131 * XFS_IOLOCK_EXCL, 131 * mmap_sem locking order:
132 * XFS_ILOCK_SHARED, 132 *
133 * XFS_ILOCK_EXCL, 133 * i_iolock -> page lock -> mmap_sem
134 * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, 134 * mmap_sem -> i_mmap_lock -> page_lock
135 * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, 135 *
 136 * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, 136 * The difference in mmap_sem locking order means that we cannot hold the
 137 * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL 137 * i_mmap_lock over syscall based read(2)/write(2) IO. These IO paths can
138 * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
139 * in get_user_pages() to map the user pages into the kernel address space for
140 * direct IO. Similarly the i_iolock cannot be taken inside a page fault because
141 * page faults already hold the mmap_sem.
142 *
143 * Hence to serialise fully against both syscall and mmap based IO, we need to
144 * take both the i_iolock and the i_mmap_lock. These locks should *only* be both
145 * taken in places where we need to invalidate the page cache in a race
146 * free manner (e.g. truncate, hole punch and other extent manipulation
147 * functions).
138 */ 148 */
139void 149void
140xfs_ilock( 150xfs_ilock(
@@ -150,6 +160,8 @@ xfs_ilock(
150 */ 160 */
151 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 161 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
152 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 162 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
163 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
164 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
153 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 165 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
154 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 166 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
155 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 167 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -159,6 +171,11 @@ xfs_ilock(
159 else if (lock_flags & XFS_IOLOCK_SHARED) 171 else if (lock_flags & XFS_IOLOCK_SHARED)
160 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); 172 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
161 173
174 if (lock_flags & XFS_MMAPLOCK_EXCL)
175 mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
176 else if (lock_flags & XFS_MMAPLOCK_SHARED)
177 mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
178
162 if (lock_flags & XFS_ILOCK_EXCL) 179 if (lock_flags & XFS_ILOCK_EXCL)
163 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); 180 mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
164 else if (lock_flags & XFS_ILOCK_SHARED) 181 else if (lock_flags & XFS_ILOCK_SHARED)
@@ -191,6 +208,8 @@ xfs_ilock_nowait(
191 */ 208 */
192 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 209 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
193 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 210 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
211 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
212 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
194 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 213 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
195 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 214 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
196 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 215 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -202,21 +221,35 @@ xfs_ilock_nowait(
202 if (!mrtryaccess(&ip->i_iolock)) 221 if (!mrtryaccess(&ip->i_iolock))
203 goto out; 222 goto out;
204 } 223 }
224
225 if (lock_flags & XFS_MMAPLOCK_EXCL) {
226 if (!mrtryupdate(&ip->i_mmaplock))
227 goto out_undo_iolock;
228 } else if (lock_flags & XFS_MMAPLOCK_SHARED) {
229 if (!mrtryaccess(&ip->i_mmaplock))
230 goto out_undo_iolock;
231 }
232
205 if (lock_flags & XFS_ILOCK_EXCL) { 233 if (lock_flags & XFS_ILOCK_EXCL) {
206 if (!mrtryupdate(&ip->i_lock)) 234 if (!mrtryupdate(&ip->i_lock))
207 goto out_undo_iolock; 235 goto out_undo_mmaplock;
208 } else if (lock_flags & XFS_ILOCK_SHARED) { 236 } else if (lock_flags & XFS_ILOCK_SHARED) {
209 if (!mrtryaccess(&ip->i_lock)) 237 if (!mrtryaccess(&ip->i_lock))
210 goto out_undo_iolock; 238 goto out_undo_mmaplock;
211 } 239 }
212 return 1; 240 return 1;
213 241
214 out_undo_iolock: 242out_undo_mmaplock:
243 if (lock_flags & XFS_MMAPLOCK_EXCL)
244 mrunlock_excl(&ip->i_mmaplock);
245 else if (lock_flags & XFS_MMAPLOCK_SHARED)
246 mrunlock_shared(&ip->i_mmaplock);
247out_undo_iolock:
215 if (lock_flags & XFS_IOLOCK_EXCL) 248 if (lock_flags & XFS_IOLOCK_EXCL)
216 mrunlock_excl(&ip->i_iolock); 249 mrunlock_excl(&ip->i_iolock);
217 else if (lock_flags & XFS_IOLOCK_SHARED) 250 else if (lock_flags & XFS_IOLOCK_SHARED)
218 mrunlock_shared(&ip->i_iolock); 251 mrunlock_shared(&ip->i_iolock);
219 out: 252out:
220 return 0; 253 return 0;
221} 254}
222 255
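
The same trylock-and-unwind shape as xfs_ilock_nowait() above, sketched with pthread rwlocks (illustrative names, not the kernel implementation): each lock is tried in order, and a failure releases only what was already taken, in reverse order.

#include <pthread.h>
#include <stdbool.h>

static bool ilock_nowait(pthread_rwlock_t *iolock,
			 pthread_rwlock_t *mmaplock,
			 pthread_rwlock_t *ilock)
{
	if (pthread_rwlock_trywrlock(iolock))
		return false;			/* nothing held yet */
	if (pthread_rwlock_trywrlock(mmaplock))
		goto out_undo_iolock;
	if (pthread_rwlock_trywrlock(ilock))
		goto out_undo_mmaplock;
	return true;				/* all three held */

out_undo_mmaplock:
	pthread_rwlock_unlock(mmaplock);
out_undo_iolock:
	pthread_rwlock_unlock(iolock);
	return false;
}

int main(void)
{
	pthread_rwlock_t io = PTHREAD_RWLOCK_INITIALIZER;
	pthread_rwlock_t mmap_lk = PTHREAD_RWLOCK_INITIALIZER;
	pthread_rwlock_t meta = PTHREAD_RWLOCK_INITIALIZER;

	return ilock_nowait(&io, &mmap_lk, &meta) ? 0 : 1;
}
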
@@ -244,6 +277,8 @@ xfs_iunlock(
244 */ 277 */
245 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != 278 ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
246 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); 279 (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
280 ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
281 (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
247 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != 282 ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
248 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 283 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
249 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); 284 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
@@ -254,6 +289,11 @@ xfs_iunlock(
254 else if (lock_flags & XFS_IOLOCK_SHARED) 289 else if (lock_flags & XFS_IOLOCK_SHARED)
255 mrunlock_shared(&ip->i_iolock); 290 mrunlock_shared(&ip->i_iolock);
256 291
292 if (lock_flags & XFS_MMAPLOCK_EXCL)
293 mrunlock_excl(&ip->i_mmaplock);
294 else if (lock_flags & XFS_MMAPLOCK_SHARED)
295 mrunlock_shared(&ip->i_mmaplock);
296
257 if (lock_flags & XFS_ILOCK_EXCL) 297 if (lock_flags & XFS_ILOCK_EXCL)
258 mrunlock_excl(&ip->i_lock); 298 mrunlock_excl(&ip->i_lock);
259 else if (lock_flags & XFS_ILOCK_SHARED) 299 else if (lock_flags & XFS_ILOCK_SHARED)
@@ -271,11 +311,14 @@ xfs_ilock_demote(
271 xfs_inode_t *ip, 311 xfs_inode_t *ip,
272 uint lock_flags) 312 uint lock_flags)
273{ 313{
274 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); 314 ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
275 ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); 315 ASSERT((lock_flags &
316 ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
276 317
277 if (lock_flags & XFS_ILOCK_EXCL) 318 if (lock_flags & XFS_ILOCK_EXCL)
278 mrdemote(&ip->i_lock); 319 mrdemote(&ip->i_lock);
320 if (lock_flags & XFS_MMAPLOCK_EXCL)
321 mrdemote(&ip->i_mmaplock);
279 if (lock_flags & XFS_IOLOCK_EXCL) 322 if (lock_flags & XFS_IOLOCK_EXCL)
280 mrdemote(&ip->i_iolock); 323 mrdemote(&ip->i_iolock);
281 324
@@ -294,6 +337,12 @@ xfs_isilocked(
294 return rwsem_is_locked(&ip->i_lock.mr_lock); 337 return rwsem_is_locked(&ip->i_lock.mr_lock);
295 } 338 }
296 339
340 if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
341 if (!(lock_flags & XFS_MMAPLOCK_SHARED))
342 return !!ip->i_mmaplock.mr_writer;
343 return rwsem_is_locked(&ip->i_mmaplock.mr_lock);
344 }
345
297 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { 346 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
298 if (!(lock_flags & XFS_IOLOCK_SHARED)) 347 if (!(lock_flags & XFS_IOLOCK_SHARED))
299 return !!ip->i_iolock.mr_writer; 348 return !!ip->i_iolock.mr_writer;
@@ -314,14 +363,27 @@ int xfs_lock_delays;
314#endif 363#endif
315 364
316/* 365/*
317 * Bump the subclass so xfs_lock_inodes() acquires each lock with 366 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
318 * a different value 367 * value. This shouldn't be called for page fault locking, but we also need to
368 * ensure we don't overrun the number of lockdep subclasses for the iolock or
369 * mmaplock as that is limited to 12 by the mmap lock lockdep annotations.
319 */ 370 */
320static inline int 371static inline int
321xfs_lock_inumorder(int lock_mode, int subclass) 372xfs_lock_inumorder(int lock_mode, int subclass)
322{ 373{
323 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 374 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
375 ASSERT(subclass + XFS_LOCK_INUMORDER <
376 (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT)));
324 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; 377 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
378 }
379
380 if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
381 ASSERT(subclass + XFS_LOCK_INUMORDER <
382 (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT)));
383 lock_mode |= (subclass + XFS_LOCK_INUMORDER) <<
384 XFS_MMAPLOCK_SHIFT;
385 }
386
325 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) 387 if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
326 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; 388 lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;
327 389
@@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass)
329} 391}
330 392
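
A stand-alone model of the subclass packing done by xfs_lock_inumorder() (shift constants copied from xfs_inode.h below; the encoder is simplified in that it stamps the subclass into all three fields regardless of which lock modes are set):

#include <assert.h>
#include <stdio.h>

#define IOLOCK_SHIFT	16
#define MMAPLOCK_SHIFT	20
#define ILOCK_SHIFT	24

static unsigned int lock_inumorder(unsigned int lock_mode, int subclass)
{
	/* the subclass must fit in the 4 bits between adjacent shifts */
	assert(subclass >= 0 &&
	       subclass < (1 << (MMAPLOCK_SHIFT - IOLOCK_SHIFT)));

	lock_mode |= (unsigned int)subclass << IOLOCK_SHIFT;
	lock_mode |= (unsigned int)subclass << MMAPLOCK_SHIFT;
	lock_mode |= (unsigned int)subclass << ILOCK_SHIFT;
	return lock_mode;
}

int main(void)
{
	/* inodes 0..2 of a multi-inode lock get distinct lockdep fields */
	for (int i = 0; i < 3; i++)
		printf("flags for inode %d: 0x%08x\n",
		       i, lock_inumorder(0, i));
	return 0;
}
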
331/* 393/*
332 * The following routine will lock n inodes in exclusive mode. 394 * The following routine will lock n inodes in exclusive mode. We assume the
333 * We assume the caller calls us with the inodes in i_ino order. 395 * caller calls us with the inodes in i_ino order.
334 * 396 *
335 * We need to detect deadlock where an inode that we lock 397 * We need to detect deadlock where an inode that we lock is in the AIL and we
336 * is in the AIL and we start waiting for another inode that is locked 398 * start waiting for another inode that is locked by a thread in a long running
337 * by a thread in a long running transaction (such as truncate). This can 399 * transaction (such as truncate). This can result in deadlock since the long
338 * result in deadlock since the long running trans might need to wait 400 * running trans might need to wait for the inode we just locked in order to
339 * for the inode we just locked in order to push the tail and free space 401 * push the tail and free space in the log.
340 * in the log.
341 */ 402 */
342void 403void
343xfs_lock_inodes( 404xfs_lock_inodes(
@@ -348,30 +409,27 @@ xfs_lock_inodes(
348 int attempts = 0, i, j, try_lock; 409 int attempts = 0, i, j, try_lock;
349 xfs_log_item_t *lp; 410 xfs_log_item_t *lp;
350 411
 351 ASSERT(ips && (inodes >= 2)); /* we need at least two */ 412 /* currently supports between 2 and 5 inodes */
 413 ASSERT(ips && inodes >= 2 && inodes <= 5);
352 414
353 try_lock = 0; 415 try_lock = 0;
354 i = 0; 416 i = 0;
355
356again: 417again:
357 for (; i < inodes; i++) { 418 for (; i < inodes; i++) {
358 ASSERT(ips[i]); 419 ASSERT(ips[i]);
359 420
360 if (i && (ips[i] == ips[i-1])) /* Already locked */ 421 if (i && (ips[i] == ips[i - 1])) /* Already locked */
361 continue; 422 continue;
362 423
363 /* 424 /*
364 * If try_lock is not set yet, make sure all locked inodes 425 * If try_lock is not set yet, make sure all locked inodes are
365 * are not in the AIL. 426 * not in the AIL. If any are, set try_lock to be used later.
366 * If any are, set try_lock to be used later.
367 */ 427 */
368
369 if (!try_lock) { 428 if (!try_lock) {
370 for (j = (i - 1); j >= 0 && !try_lock; j--) { 429 for (j = (i - 1); j >= 0 && !try_lock; j--) {
371 lp = (xfs_log_item_t *)ips[j]->i_itemp; 430 lp = (xfs_log_item_t *)ips[j]->i_itemp;
372 if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { 431 if (lp && (lp->li_flags & XFS_LI_IN_AIL))
373 try_lock++; 432 try_lock++;
374 }
375 } 433 }
376 } 434 }
377 435
@@ -381,51 +439,42 @@ again:
381 * we can't get any, we must release all we have 439 * we can't get any, we must release all we have
382 * and try again. 440 * and try again.
383 */ 441 */
442 if (!try_lock) {
443 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
444 continue;
445 }
446
447 /* try_lock means we have an inode locked that is in the AIL. */
448 ASSERT(i != 0);
449 if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
450 continue;
384 451
385 if (try_lock) { 452 /*
386 /* try_lock must be 0 if i is 0. */ 453 * Unlock all previous guys and try again. xfs_iunlock will try
454 * to push the tail if the inode is in the AIL.
455 */
456 attempts++;
457 for (j = i - 1; j >= 0; j--) {
387 /* 458 /*
388 * try_lock means we have an inode locked 459 * Check to see if we've already unlocked this one. Not
389 * that is in the AIL. 460 * the first one going back, and the inode ptr is the
461 * same.
390 */ 462 */
391 ASSERT(i != 0); 463 if (j != (i - 1) && ips[j] == ips[j + 1])
392 if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { 464 continue;
393 attempts++; 465
394 466 xfs_iunlock(ips[j], lock_mode);
395 /* 467 }
396 * Unlock all previous guys and try again.
397 * xfs_iunlock will try to push the tail
398 * if the inode is in the AIL.
399 */
400
401 for(j = i - 1; j >= 0; j--) {
402
403 /*
404 * Check to see if we've already
405 * unlocked this one.
406 * Not the first one going back,
407 * and the inode ptr is the same.
408 */
409 if ((j != (i - 1)) && ips[j] ==
410 ips[j+1])
411 continue;
412
413 xfs_iunlock(ips[j], lock_mode);
414 }
415 468
416 if ((attempts % 5) == 0) { 469 if ((attempts % 5) == 0) {
417 delay(1); /* Don't just spin the CPU */ 470 delay(1); /* Don't just spin the CPU */
418#ifdef DEBUG 471#ifdef DEBUG
419 xfs_lock_delays++; 472 xfs_lock_delays++;
420#endif 473#endif
421 }
422 i = 0;
423 try_lock = 0;
424 goto again;
425 }
426 } else {
427 xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
428 } 474 }
475 i = 0;
476 try_lock = 0;
477 goto again;
429 } 478 }
430 479
431#ifdef DEBUG 480#ifdef DEBUG
@@ -440,10 +489,10 @@ again:
440} 489}
441 490
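
The retry loop above, reduced to a user-space sketch: plain mutexes stand in for inodes (array order stands in for i_ino order), and in_ail() is a stubbed-out predicate for "pinned by a long running transaction". Illustrative only.

#include <pthread.h>
#include <unistd.h>

static int in_ail(int idx)
{
	(void)idx;
	return 0;	/* stub: pretend nothing is pinned */
}

static void lock_all(pthread_mutex_t **locks, int n)
{
	int i = 0, j, try_lock = 0, attempts = 0;
again:
	for (; i < n; i++) {
		if (i && locks[i] == locks[i - 1])	/* already locked */
			continue;

		/* once anything already locked is "in the AIL", switch
		 * to trylock so we can back out instead of deadlocking */
		if (!try_lock)
			for (j = i - 1; j >= 0 && !try_lock; j--)
				if (in_ail(j))
					try_lock = 1;

		if (!try_lock) {
			pthread_mutex_lock(locks[i]);
			continue;
		}
		if (pthread_mutex_trylock(locks[i]) == 0)
			continue;

		/* failed: drop everything (skipping duplicates) and retry */
		attempts++;
		for (j = i - 1; j >= 0; j--) {
			if (j != i - 1 && locks[j] == locks[j + 1])
				continue;
			pthread_mutex_unlock(locks[j]);
		}
		if ((attempts % 5) == 0)
			usleep(1000);	/* don't just spin the CPU */
		i = 0;
		try_lock = 0;
		goto again;
	}
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t *locks[] = { &a, &a, &b };	/* note the dup */

	lock_all(locks, 3);
	return 0;
}
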
442/* 491/*
443 * xfs_lock_two_inodes() can only be used to lock one type of lock 492 * xfs_lock_two_inodes() can only be used to lock one type of lock at a time -
444 * at a time - the iolock or the ilock, but not both at once. If 493 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
445 * we lock both at once, lockdep will report false positives saying 494 * lock more than one at a time, lockdep will report false positives saying we
446 * we have violated locking orders. 495 * have violated locking orders.
447 */ 496 */
448void 497void
449xfs_lock_two_inodes( 498xfs_lock_two_inodes(
@@ -455,8 +504,12 @@ xfs_lock_two_inodes(
455 int attempts = 0; 504 int attempts = 0;
456 xfs_log_item_t *lp; 505 xfs_log_item_t *lp;
457 506
458 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) 507 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
459 ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); 508 ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
509 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
510 } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
511 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
512
460 ASSERT(ip0->i_ino != ip1->i_ino); 513 ASSERT(ip0->i_ino != ip1->i_ino);
461 514
462 if (ip0->i_ino > ip1->i_ino) { 515 if (ip0->i_ino > ip1->i_ino) {
@@ -818,7 +871,7 @@ xfs_ialloc(
818 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 871 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
819 xfs_trans_log_inode(tp, ip, flags); 872 xfs_trans_log_inode(tp, ip, flags);
820 873
821 /* now that we have an i_mode we can setup inode ops and unlock */ 874 /* now that we have an i_mode we can setup the inode structure */
822 xfs_setup_inode(ip); 875 xfs_setup_inode(ip);
823 876
824 *ipp = ip; 877 *ipp = ip;
@@ -1235,12 +1288,14 @@ xfs_create(
1235 xfs_trans_cancel(tp, cancel_flags); 1288 xfs_trans_cancel(tp, cancel_flags);
1236 out_release_inode: 1289 out_release_inode:
1237 /* 1290 /*
1238 * Wait until after the current transaction is aborted to 1291 * Wait until after the current transaction is aborted to finish the
1239 * release the inode. This prevents recursive transactions 1292 * setup of the inode and release the inode. This prevents recursive
1240 * and deadlocks from xfs_inactive. 1293 * transactions and deadlocks from xfs_inactive.
1241 */ 1294 */
1242 if (ip) 1295 if (ip) {
1296 xfs_finish_inode_setup(ip);
1243 IRELE(ip); 1297 IRELE(ip);
1298 }
1244 1299
1245 xfs_qm_dqrele(udqp); 1300 xfs_qm_dqrele(udqp);
1246 xfs_qm_dqrele(gdqp); 1301 xfs_qm_dqrele(gdqp);
@@ -1345,12 +1400,14 @@ xfs_create_tmpfile(
1345 xfs_trans_cancel(tp, cancel_flags); 1400 xfs_trans_cancel(tp, cancel_flags);
1346 out_release_inode: 1401 out_release_inode:
1347 /* 1402 /*
1348 * Wait until after the current transaction is aborted to 1403 * Wait until after the current transaction is aborted to finish the
1349 * release the inode. This prevents recursive transactions 1404 * setup of the inode and release the inode. This prevents recursive
1350 * and deadlocks from xfs_inactive. 1405 * transactions and deadlocks from xfs_inactive.
1351 */ 1406 */
1352 if (ip) 1407 if (ip) {
1408 xfs_finish_inode_setup(ip);
1353 IRELE(ip); 1409 IRELE(ip);
1410 }
1354 1411
1355 xfs_qm_dqrele(udqp); 1412 xfs_qm_dqrele(udqp);
1356 xfs_qm_dqrele(gdqp); 1413 xfs_qm_dqrele(gdqp);
@@ -2611,19 +2668,22 @@ xfs_remove(
2611/* 2668/*
2612 * Enter all inodes for a rename transaction into a sorted array. 2669 * Enter all inodes for a rename transaction into a sorted array.
2613 */ 2670 */
2671#define __XFS_SORT_INODES 5
2614STATIC void 2672STATIC void
2615xfs_sort_for_rename( 2673xfs_sort_for_rename(
2616 xfs_inode_t *dp1, /* in: old (source) directory inode */ 2674 struct xfs_inode *dp1, /* in: old (source) directory inode */
2617 xfs_inode_t *dp2, /* in: new (target) directory inode */ 2675 struct xfs_inode *dp2, /* in: new (target) directory inode */
2618 xfs_inode_t *ip1, /* in: inode of old entry */ 2676 struct xfs_inode *ip1, /* in: inode of old entry */
2619 xfs_inode_t *ip2, /* in: inode of new entry, if it 2677 struct xfs_inode *ip2, /* in: inode of new entry */
2620 already exists, NULL otherwise. */ 2678 struct xfs_inode *wip, /* in: whiteout inode */
2621 xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ 2679 struct xfs_inode **i_tab,/* out: sorted array of inodes */
2622 int *num_inodes) /* out: number of inodes in array */ 2680 int *num_inodes) /* in/out: inodes in array */
2623{ 2681{
2624 xfs_inode_t *temp;
2625 int i, j; 2682 int i, j;
2626 2683
2684 ASSERT(*num_inodes == __XFS_SORT_INODES);
2685 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *));
2686
2627 /* 2687 /*
2628 * i_tab contains a list of pointers to inodes. We initialize 2688 * i_tab contains a list of pointers to inodes. We initialize
2629 * the table here & we'll sort it. We will then use it to 2689 * the table here & we'll sort it. We will then use it to
@@ -2631,25 +2691,24 @@ xfs_sort_for_rename(
2631 * 2691 *
2632 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2692 * Note that the table may contain duplicates. e.g., dp1 == dp2.
2633 */ 2693 */
2634 i_tab[0] = dp1; 2694 i = 0;
2635 i_tab[1] = dp2; 2695 i_tab[i++] = dp1;
2636 i_tab[2] = ip1; 2696 i_tab[i++] = dp2;
2637 if (ip2) { 2697 i_tab[i++] = ip1;
2638 *num_inodes = 4; 2698 if (ip2)
2639 i_tab[3] = ip2; 2699 i_tab[i++] = ip2;
2640 } else { 2700 if (wip)
2641 *num_inodes = 3; 2701 i_tab[i++] = wip;
2642 i_tab[3] = NULL; 2702 *num_inodes = i;
2643 }
2644 2703
2645 /* 2704 /*
2646 * Sort the elements via bubble sort. (Remember, there are at 2705 * Sort the elements via bubble sort. (Remember, there are at
2647 * most 4 elements to sort, so this is adequate.) 2706 * most 5 elements to sort, so this is adequate.)
2648 */ 2707 */
2649 for (i = 0; i < *num_inodes; i++) { 2708 for (i = 0; i < *num_inodes; i++) {
2650 for (j = 1; j < *num_inodes; j++) { 2709 for (j = 1; j < *num_inodes; j++) {
2651 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2710 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2652 temp = i_tab[j]; 2711 struct xfs_inode *temp = i_tab[j];
2653 i_tab[j] = i_tab[j-1]; 2712 i_tab[j] = i_tab[j-1];
2654 i_tab[j-1] = temp; 2713 i_tab[j-1] = temp;
2655 } 2714 }
@@ -2657,6 +2716,31 @@ xfs_sort_for_rename(
2657 } 2716 }
2658} 2717}
2659 2718
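
The sort itself, extracted into a runnable sketch (hypothetical struct inode; duplicates are allowed in the table and are skipped later at lock time, exactly as in xfs_lock_inodes()):

#include <stdio.h>

struct inode { unsigned long long i_ino; };

/* Pack up to 5 inode pointers (duplicates allowed, e.g. source dir ==
 * target dir) and bubble sort by inode number so they can be locked
 * in ascending order. */
static void sort_for_lock(struct inode **tab, int n)
{
	for (int i = 0; i < n; i++)
		for (int j = 1; j < n; j++)
			if (tab[j]->i_ino < tab[j - 1]->i_ino) {
				struct inode *tmp = tab[j];
				tab[j] = tab[j - 1];
				tab[j - 1] = tmp;
			}
}

int main(void)
{
	struct inode a = { 42 }, b = { 7 }, c = { 99 };
	struct inode *tab[] = { &a, &b, &a, &c };	/* note the dup */

	sort_for_lock(tab, 4);
	for (int i = 0; i < 4; i++)
		printf("%llu\n", tab[i]->i_ino);
	return 0;
}
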
2719static int
2720xfs_finish_rename(
2721 struct xfs_trans *tp,
2722 struct xfs_bmap_free *free_list)
2723{
2724 int committed = 0;
2725 int error;
2726
2727 /*
2728 * If this is a synchronous mount, make sure that the rename transaction
2729 * goes to disk before returning to the user.
2730 */
2731 if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2732 xfs_trans_set_sync(tp);
2733
2734 error = xfs_bmap_finish(&tp, free_list, &committed);
2735 if (error) {
2736 xfs_bmap_cancel(free_list);
2737 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
2738 return error;
2739 }
2740
2741 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2742}
2743
2660/* 2744/*
2661 * xfs_cross_rename() 2745 * xfs_cross_rename()
2662 * 2746 *
@@ -2685,14 +2769,14 @@ xfs_cross_rename(
2685 ip2->i_ino, 2769 ip2->i_ino,
2686 first_block, free_list, spaceres); 2770 first_block, free_list, spaceres);
2687 if (error) 2771 if (error)
2688 goto out; 2772 goto out_trans_abort;
2689 2773
2690 /* Swap inode number for dirent in second parent */ 2774 /* Swap inode number for dirent in second parent */
2691 error = xfs_dir_replace(tp, dp2, name2, 2775 error = xfs_dir_replace(tp, dp2, name2,
2692 ip1->i_ino, 2776 ip1->i_ino,
2693 first_block, free_list, spaceres); 2777 first_block, free_list, spaceres);
2694 if (error) 2778 if (error)
2695 goto out; 2779 goto out_trans_abort;
2696 2780
2697 /* 2781 /*
2698 * If we're renaming one or more directories across different parents, 2782 * If we're renaming one or more directories across different parents,
@@ -2707,16 +2791,16 @@ xfs_cross_rename(
2707 dp1->i_ino, first_block, 2791 dp1->i_ino, first_block,
2708 free_list, spaceres); 2792 free_list, spaceres);
2709 if (error) 2793 if (error)
2710 goto out; 2794 goto out_trans_abort;
2711 2795
2712 /* transfer ip2 ".." reference to dp1 */ 2796 /* transfer ip2 ".." reference to dp1 */
2713 if (!S_ISDIR(ip1->i_d.di_mode)) { 2797 if (!S_ISDIR(ip1->i_d.di_mode)) {
2714 error = xfs_droplink(tp, dp2); 2798 error = xfs_droplink(tp, dp2);
2715 if (error) 2799 if (error)
2716 goto out; 2800 goto out_trans_abort;
2717 error = xfs_bumplink(tp, dp1); 2801 error = xfs_bumplink(tp, dp1);
2718 if (error) 2802 if (error)
2719 goto out; 2803 goto out_trans_abort;
2720 } 2804 }
2721 2805
2722 /* 2806 /*
@@ -2734,16 +2818,16 @@ xfs_cross_rename(
2734 dp2->i_ino, first_block, 2818 dp2->i_ino, first_block,
2735 free_list, spaceres); 2819 free_list, spaceres);
2736 if (error) 2820 if (error)
2737 goto out; 2821 goto out_trans_abort;
2738 2822
2739 /* transfer ip1 ".." reference to dp2 */ 2823 /* transfer ip1 ".." reference to dp2 */
2740 if (!S_ISDIR(ip2->i_d.di_mode)) { 2824 if (!S_ISDIR(ip2->i_d.di_mode)) {
2741 error = xfs_droplink(tp, dp1); 2825 error = xfs_droplink(tp, dp1);
2742 if (error) 2826 if (error)
2743 goto out; 2827 goto out_trans_abort;
2744 error = xfs_bumplink(tp, dp2); 2828 error = xfs_bumplink(tp, dp2);
2745 if (error) 2829 if (error)
2746 goto out; 2830 goto out_trans_abort;
2747 } 2831 }
2748 2832
2749 /* 2833 /*
@@ -2771,66 +2855,108 @@ xfs_cross_rename(
2771 } 2855 }
2772 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2856 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2773 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2857 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
2774out: 2858 return xfs_finish_rename(tp, free_list);
2859
2860out_trans_abort:
2861 xfs_bmap_cancel(free_list);
2862 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
2775 return error; 2863 return error;
2776} 2864}
2777 2865
2778/* 2866/*
2867 * xfs_rename_alloc_whiteout()
2868 *
 2869 * Return a referenced, unlinked, unlocked inode that can be used as a
 2870 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
 2871 * crash between allocating the inode and linking it into the rename transaction,
2872 * recovery will free the inode and we won't leak it.
2873 */
2874static int
2875xfs_rename_alloc_whiteout(
2876 struct xfs_inode *dp,
2877 struct xfs_inode **wip)
2878{
2879 struct xfs_inode *tmpfile;
2880 int error;
2881
2882 error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile);
2883 if (error)
2884 return error;
2885
2886 /* Satisfy xfs_bumplink that this is a real tmpfile */
2887 xfs_finish_inode_setup(tmpfile);
2888 VFS_I(tmpfile)->i_state |= I_LINKABLE;
2889
2890 *wip = tmpfile;
2891 return 0;
2892}
2893
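
A user-space analogue of the same crash-safety idea (assuming Linux's O_TMPFILE and the /proc/self/fd linkat trick; the paths are purely illustrative): the inode exists with no directory entry until it is explicitly linked, so a crash in between leaks nothing.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* anonymous inode: no directory entry yet */
	int fd = open("/tmp", O_TMPFILE | O_WRONLY, 0600);
	if (fd < 0) {
		perror("O_TMPFILE");
		return 1;
	}
	/* ... a crash here leaves no visible file behind ... */

	char proc[64];
	snprintf(proc, sizeof(proc), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, proc, AT_FDCWD, "/tmp/now-visible",
		   AT_SYMLINK_FOLLOW) < 0)
		perror("linkat");
	close(fd);
	return 0;
}
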
2894/*
2779 * xfs_rename 2895 * xfs_rename
2780 */ 2896 */
2781int 2897int
2782xfs_rename( 2898xfs_rename(
2783 xfs_inode_t *src_dp, 2899 struct xfs_inode *src_dp,
2784 struct xfs_name *src_name, 2900 struct xfs_name *src_name,
2785 xfs_inode_t *src_ip, 2901 struct xfs_inode *src_ip,
2786 xfs_inode_t *target_dp, 2902 struct xfs_inode *target_dp,
2787 struct xfs_name *target_name, 2903 struct xfs_name *target_name,
2788 xfs_inode_t *target_ip, 2904 struct xfs_inode *target_ip,
2789 unsigned int flags) 2905 unsigned int flags)
2790{ 2906{
2791 xfs_trans_t *tp = NULL; 2907 struct xfs_mount *mp = src_dp->i_mount;
2792 xfs_mount_t *mp = src_dp->i_mount; 2908 struct xfs_trans *tp;
2793 int new_parent; /* moving to a new dir */ 2909 struct xfs_bmap_free free_list;
2794 int src_is_directory; /* src_name is a directory */ 2910 xfs_fsblock_t first_block;
2795 int error; 2911 struct xfs_inode *wip = NULL; /* whiteout inode */
2796 xfs_bmap_free_t free_list; 2912 struct xfs_inode *inodes[__XFS_SORT_INODES];
2797 xfs_fsblock_t first_block; 2913 int num_inodes = __XFS_SORT_INODES;
2798 int cancel_flags; 2914 bool new_parent = (src_dp != target_dp);
2799 int committed; 2915 bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
2800 xfs_inode_t *inodes[4]; 2916 int cancel_flags = 0;
2801 int spaceres; 2917 int spaceres;
2802 int num_inodes; 2918 int error;
2803 2919
2804 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 2920 trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2805 2921
2806 new_parent = (src_dp != target_dp); 2922 if ((flags & RENAME_EXCHANGE) && !target_ip)
2807 src_is_directory = S_ISDIR(src_ip->i_d.di_mode); 2923 return -EINVAL;
2808 2924
2809 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, 2925 /*
2926 * If we are doing a whiteout operation, allocate the whiteout inode
2927 * we will be placing at the target and ensure the type is set
2928 * appropriately.
2929 */
2930 if (flags & RENAME_WHITEOUT) {
2931 ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
2932 error = xfs_rename_alloc_whiteout(target_dp, &wip);
2933 if (error)
2934 return error;
2935
 2936 /* setup source dirent info as whiteout */
2937 src_name->type = XFS_DIR3_FT_CHRDEV;
2938 }
2939
2940 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip,
2810 inodes, &num_inodes); 2941 inodes, &num_inodes);
2811 2942
2812 xfs_bmap_init(&free_list, &first_block);
2813 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); 2943 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
2814 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2815 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 2944 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2816 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); 2945 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2817 if (error == -ENOSPC) { 2946 if (error == -ENOSPC) {
2818 spaceres = 0; 2947 spaceres = 0;
2819 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); 2948 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2820 } 2949 }
2821 if (error) { 2950 if (error)
2822 xfs_trans_cancel(tp, 0); 2951 goto out_trans_cancel;
2823 goto std_return; 2952 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2824 }
2825 2953
2826 /* 2954 /*
2827 * Attach the dquots to the inodes 2955 * Attach the dquots to the inodes
2828 */ 2956 */
2829 error = xfs_qm_vop_rename_dqattach(inodes); 2957 error = xfs_qm_vop_rename_dqattach(inodes);
2830 if (error) { 2958 if (error)
2831 xfs_trans_cancel(tp, cancel_flags); 2959 goto out_trans_cancel;
2832 goto std_return;
2833 }
2834 2960
2835 /* 2961 /*
2836 * Lock all the participating inodes. Depending upon whether 2962 * Lock all the participating inodes. Depending upon whether
@@ -2851,6 +2977,8 @@ xfs_rename(
2851 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 2977 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2852 if (target_ip) 2978 if (target_ip)
2853 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 2979 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2980 if (wip)
2981 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL);
2854 2982
2855 /* 2983 /*
2856 * If we are using project inheritance, we only allow renames 2984 * If we are using project inheritance, we only allow renames
@@ -2860,24 +2988,16 @@ xfs_rename(
2860 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 2988 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2861 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { 2989 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2862 error = -EXDEV; 2990 error = -EXDEV;
2863 goto error_return; 2991 goto out_trans_cancel;
2864 } 2992 }
2865 2993
2866 /* 2994 xfs_bmap_init(&free_list, &first_block);
2867 * Handle RENAME_EXCHANGE flags 2995
2868 */ 2996 /* RENAME_EXCHANGE is unique from here on. */
2869 if (flags & RENAME_EXCHANGE) { 2997 if (flags & RENAME_EXCHANGE)
2870 if (target_ip == NULL) { 2998 return xfs_cross_rename(tp, src_dp, src_name, src_ip,
2871 error = -EINVAL; 2999 target_dp, target_name, target_ip,
2872 goto error_return; 3000 &free_list, &first_block, spaceres);
2873 }
2874 error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
2875 target_dp, target_name, target_ip,
2876 &free_list, &first_block, spaceres);
2877 if (error)
2878 goto abort_return;
2879 goto finish_rename;
2880 }
2881 3001
2882 /* 3002 /*
2883 * Set up the target. 3003 * Set up the target.
@@ -2890,7 +3010,7 @@ xfs_rename(
2890 if (!spaceres) { 3010 if (!spaceres) {
2891 error = xfs_dir_canenter(tp, target_dp, target_name); 3011 error = xfs_dir_canenter(tp, target_dp, target_name);
2892 if (error) 3012 if (error)
2893 goto error_return; 3013 goto out_trans_cancel;
2894 } 3014 }
2895 /* 3015 /*
2896 * If target does not exist and the rename crosses 3016 * If target does not exist and the rename crosses
@@ -2901,9 +3021,9 @@ xfs_rename(
2901 src_ip->i_ino, &first_block, 3021 src_ip->i_ino, &first_block,
2902 &free_list, spaceres); 3022 &free_list, spaceres);
2903 if (error == -ENOSPC) 3023 if (error == -ENOSPC)
2904 goto error_return; 3024 goto out_bmap_cancel;
2905 if (error) 3025 if (error)
2906 goto abort_return; 3026 goto out_trans_abort;
2907 3027
2908 xfs_trans_ichgtime(tp, target_dp, 3028 xfs_trans_ichgtime(tp, target_dp,
2909 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3029 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2911,7 +3031,7 @@ xfs_rename(
2911 if (new_parent && src_is_directory) { 3031 if (new_parent && src_is_directory) {
2912 error = xfs_bumplink(tp, target_dp); 3032 error = xfs_bumplink(tp, target_dp);
2913 if (error) 3033 if (error)
2914 goto abort_return; 3034 goto out_trans_abort;
2915 } 3035 }
2916 } else { /* target_ip != NULL */ 3036 } else { /* target_ip != NULL */
2917 /* 3037 /*
@@ -2926,7 +3046,7 @@ xfs_rename(
2926 if (!(xfs_dir_isempty(target_ip)) || 3046 if (!(xfs_dir_isempty(target_ip)) ||
2927 (target_ip->i_d.di_nlink > 2)) { 3047 (target_ip->i_d.di_nlink > 2)) {
2928 error = -EEXIST; 3048 error = -EEXIST;
2929 goto error_return; 3049 goto out_trans_cancel;
2930 } 3050 }
2931 } 3051 }
2932 3052
@@ -2943,7 +3063,7 @@ xfs_rename(
2943 src_ip->i_ino, 3063 src_ip->i_ino,
2944 &first_block, &free_list, spaceres); 3064 &first_block, &free_list, spaceres);
2945 if (error) 3065 if (error)
2946 goto abort_return; 3066 goto out_trans_abort;
2947 3067
2948 xfs_trans_ichgtime(tp, target_dp, 3068 xfs_trans_ichgtime(tp, target_dp,
2949 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3069 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2954,7 +3074,7 @@ xfs_rename(
2954 */ 3074 */
2955 error = xfs_droplink(tp, target_ip); 3075 error = xfs_droplink(tp, target_ip);
2956 if (error) 3076 if (error)
2957 goto abort_return; 3077 goto out_trans_abort;
2958 3078
2959 if (src_is_directory) { 3079 if (src_is_directory) {
2960 /* 3080 /*
@@ -2962,7 +3082,7 @@ xfs_rename(
2962 */ 3082 */
2963 error = xfs_droplink(tp, target_ip); 3083 error = xfs_droplink(tp, target_ip);
2964 if (error) 3084 if (error)
2965 goto abort_return; 3085 goto out_trans_abort;
2966 } 3086 }
2967 } /* target_ip != NULL */ 3087 } /* target_ip != NULL */
2968 3088
@@ -2979,7 +3099,7 @@ xfs_rename(
2979 &first_block, &free_list, spaceres); 3099 &first_block, &free_list, spaceres);
2980 ASSERT(error != -EEXIST); 3100 ASSERT(error != -EEXIST);
2981 if (error) 3101 if (error)
2982 goto abort_return; 3102 goto out_trans_abort;
2983 } 3103 }
2984 3104
2985 /* 3105 /*
@@ -3005,49 +3125,67 @@ xfs_rename(
3005 */ 3125 */
3006 error = xfs_droplink(tp, src_dp); 3126 error = xfs_droplink(tp, src_dp);
3007 if (error) 3127 if (error)
3008 goto abort_return; 3128 goto out_trans_abort;
3009 } 3129 }
3010 3130
3011 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3131 /*
3132 * For whiteouts, we only need to update the source dirent with the
3133 * inode number of the whiteout inode rather than removing it
3134 * altogether.
3135 */
3136 if (wip) {
3137 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino,
3012 &first_block, &free_list, spaceres); 3138 &first_block, &free_list, spaceres);
3139 } else
3140 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
3141 &first_block, &free_list, spaceres);
3013 if (error) 3142 if (error)
3014 goto abort_return; 3143 goto out_trans_abort;
3015
3016 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3017 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3018 if (new_parent)
3019 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3020 3144
3021finish_rename:
3022 /* 3145 /*
3023 * If this is a synchronous mount, make sure that the 3146 * For whiteouts, we need to bump the link count on the whiteout inode.
3024 * rename transaction goes to disk before returning to 3147 * This means that failures all the way up to this point leave the inode
3025 * the user. 3148 * on the unlinked list and so cleanup is a simple matter of dropping
3149 * the remaining reference to it. If we fail here after bumping the link
3150 * count, we're shutting down the filesystem so we'll never see the
3151 * intermediate state on disk.
3026 */ 3152 */
3027 if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { 3153 if (wip) {
3028 xfs_trans_set_sync(tp); 3154 ASSERT(wip->i_d.di_nlink == 0);
3029 } 3155 error = xfs_bumplink(tp, wip);
3156 if (error)
3157 goto out_trans_abort;
3158 error = xfs_iunlink_remove(tp, wip);
3159 if (error)
3160 goto out_trans_abort;
3161 xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE);
3030 3162
3031 error = xfs_bmap_finish(&tp, &free_list, &committed); 3163 /*
3032 if (error) { 3164 * Now we have a real link, clear the "I'm a tmpfile" state
3033 xfs_bmap_cancel(&free_list); 3165 * flag from the inode so it doesn't accidentally get misused in
3034 xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | 3166 * future.
3035 XFS_TRANS_ABORT)); 3167 */
3036 goto std_return; 3168 VFS_I(wip)->i_state &= ~I_LINKABLE;
3037 } 3169 }
3038 3170
3039 /* 3171 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
3040 * trans_commit will unlock src_ip, target_ip & decrement 3172 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
3041 * the vnode references. 3173 if (new_parent)
3042 */ 3174 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
3043 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
3044 3175
3045 abort_return: 3176 error = xfs_finish_rename(tp, &free_list);
3177 if (wip)
3178 IRELE(wip);
3179 return error;
3180
3181out_trans_abort:
3046 cancel_flags |= XFS_TRANS_ABORT; 3182 cancel_flags |= XFS_TRANS_ABORT;
3047 error_return: 3183out_bmap_cancel:
3048 xfs_bmap_cancel(&free_list); 3184 xfs_bmap_cancel(&free_list);
3185out_trans_cancel:
3049 xfs_trans_cancel(tp, cancel_flags); 3186 xfs_trans_cancel(tp, cancel_flags);
3050 std_return: 3187 if (wip)
3188 IRELE(wip);
3051 return error; 3189 return error;
3052} 3190}
3053 3191
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a1cd55f3f351..8f22d20368d8 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,6 +56,7 @@ typedef struct xfs_inode {
56 struct xfs_inode_log_item *i_itemp; /* logging information */ 56 struct xfs_inode_log_item *i_itemp; /* logging information */
57 mrlock_t i_lock; /* inode lock */ 57 mrlock_t i_lock; /* inode lock */
58 mrlock_t i_iolock; /* inode IO lock */ 58 mrlock_t i_iolock; /* inode IO lock */
59 mrlock_t i_mmaplock; /* inode mmap IO lock */
59 atomic_t i_pincount; /* inode pin count */ 60 atomic_t i_pincount; /* inode pin count */
60 spinlock_t i_flags_lock; /* inode i_flags lock */ 61 spinlock_t i_flags_lock; /* inode i_flags lock */
61 /* Miscellaneous state. */ 62 /* Miscellaneous state. */
@@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
263#define XFS_IOLOCK_SHARED (1<<1) 264#define XFS_IOLOCK_SHARED (1<<1)
264#define XFS_ILOCK_EXCL (1<<2) 265#define XFS_ILOCK_EXCL (1<<2)
265#define XFS_ILOCK_SHARED (1<<3) 266#define XFS_ILOCK_SHARED (1<<3)
267#define XFS_MMAPLOCK_EXCL (1<<4)
268#define XFS_MMAPLOCK_SHARED (1<<5)
266 269
267#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ 270#define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \
268 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) 271 | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \
272 | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED)
269 273
270#define XFS_LOCK_FLAGS \ 274#define XFS_LOCK_FLAGS \
271 { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ 275 { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \
272 { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ 276 { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \
273 { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ 277 { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \
274 { XFS_ILOCK_SHARED, "ILOCK_SHARED" } 278 { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \
279 { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \
280 { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" }
275 281
276 282
277/* 283/*
@@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
302#define XFS_IOLOCK_SHIFT 16 308#define XFS_IOLOCK_SHIFT 16
303#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 309#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
304 310
311#define XFS_MMAPLOCK_SHIFT 20
312
305#define XFS_ILOCK_SHIFT 24 313#define XFS_ILOCK_SHIFT 24
306#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 314#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
307#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) 315#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
308#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) 316#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
309 317
310#define XFS_IOLOCK_DEP_MASK 0x00ff0000 318#define XFS_IOLOCK_DEP_MASK 0x000f0000
319#define XFS_MMAPLOCK_DEP_MASK 0x00f00000
311#define XFS_ILOCK_DEP_MASK 0xff000000 320#define XFS_ILOCK_DEP_MASK 0xff000000
312#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) 321#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \
322 XFS_MMAPLOCK_DEP_MASK | \
323 XFS_ILOCK_DEP_MASK)
313 324
314#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) 325#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \
315#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) 326 >> XFS_IOLOCK_SHIFT)
327#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \
328 >> XFS_MMAPLOCK_SHIFT)
329#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \
330 >> XFS_ILOCK_SHIFT)
316 331
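
A quick stand-alone check of the non-overlapping dep-field layout the masks above define (mask values copied from the header; the program is illustrative only):

#include <stdio.h>

#define IOLOCK_DEP_MASK		0x000f0000u	/* bits 16-19 */
#define MMAPLOCK_DEP_MASK	0x00f00000u	/* bits 20-23 */
#define ILOCK_DEP_MASK		0xff000000u	/* bits 24-31 */

int main(void)
{
	unsigned int flags = (3u << 16) | (2u << 20) | (5u << 24);

	printf("iolock dep   %u\n", (flags & IOLOCK_DEP_MASK) >> 16);
	printf("mmaplock dep %u\n", (flags & MMAPLOCK_DEP_MASK) >> 20);
	printf("ilock dep    %u\n", (flags & ILOCK_DEP_MASK) >> 24);
	/* the three fields never overlap */
	printf("overlap      0x%x\n",
	       IOLOCK_DEP_MASK & MMAPLOCK_DEP_MASK & ILOCK_DEP_MASK);
	return 0;
}
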
317/* 332/*
318 * For multiple groups support: if S_ISGID bit is set in the parent 333 * For multiple groups support: if S_ISGID bit is set in the parent
@@ -391,6 +406,28 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
391int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); 406int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
392 407
393 408
409/* from xfs_iops.c */
410/*
411 * When setting up a newly allocated inode, we need to call
412 * xfs_finish_inode_setup() once the inode is fully instantiated at
413 * the VFS level to prevent the rest of the world seeing the inode
414 * before we've completed instantiation. Otherwise we can do it
415 * the moment the inode lookup is complete.
416 */
417extern void xfs_setup_inode(struct xfs_inode *ip);
418static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
419{
420 xfs_iflags_clear(ip, XFS_INEW);
421 barrier();
422 unlock_new_inode(VFS_I(ip));
423}
424
425static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
426{
427 xfs_setup_inode(ip);
428 xfs_finish_inode_setup(ip);
429}
430
394#define IHOLD(ip) \ 431#define IHOLD(ip) \
395do { \ 432do { \
396 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ 433 ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
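
The ordering that xfs_finish_inode_setup() above relies on - the XFS_INEW clear must be visible before the VFS inode is unlocked - can be modelled in portable C11, with a release store standing in for barrier() + unlock_new_inode() (a loose sketch, not the kernel's memory model):

#include <stdatomic.h>
#include <stdbool.h>

#define INEW	0x1u

struct fake_inode {
	atomic_uint	flags;		/* stand-in for ip->i_flags */
	atomic_bool	locked;		/* stand-in for the VFS I_NEW lock */
};

static void finish_inode_setup(struct fake_inode *ip)
{
	atomic_fetch_and_explicit(&ip->flags, ~INEW, memory_order_relaxed);
	/* release: the flag clear is visible before the unlock is seen */
	atomic_store_explicit(&ip->locked, false, memory_order_release);
}

int main(void)
{
	struct fake_inode ip = { INEW, true };

	finish_inode_setup(&ip);
	return atomic_load(&ip.flags) & INEW;	/* exits 0 */
}
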
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ac4feae45eb3..5f4a396f5186 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -631,7 +631,7 @@ xfs_ioc_space(
631 631
632 if (filp->f_flags & O_DSYNC) 632 if (filp->f_flags & O_DSYNC)
633 flags |= XFS_PREALLOC_SYNC; 633 flags |= XFS_PREALLOC_SYNC;
634 if (ioflags & XFS_IO_INVIS) 634 if (ioflags & XFS_IO_INVIS)
635 flags |= XFS_PREALLOC_INVISIBLE; 635 flags |= XFS_PREALLOC_INVISIBLE;
636 636
637 error = mnt_want_write_file(filp); 637 error = mnt_want_write_file(filp);
@@ -639,10 +639,13 @@ xfs_ioc_space(
639 return error; 639 return error;
640 640
641 xfs_ilock(ip, iolock); 641 xfs_ilock(ip, iolock);
642 error = xfs_break_layouts(inode, &iolock); 642 error = xfs_break_layouts(inode, &iolock, false);
643 if (error) 643 if (error)
644 goto out_unlock; 644 goto out_unlock;
645 645
646 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
647 iolock |= XFS_MMAPLOCK_EXCL;
648
646 switch (bf->l_whence) { 649 switch (bf->l_whence) {
647 case 0: /*SEEK_SET*/ 650 case 0: /*SEEK_SET*/
648 break; 651 break;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index ccb1dd0d509e..38e633bad8c2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -460,8 +460,7 @@ xfs_iomap_prealloc_size(
460 alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), 460 alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN),
461 alloc_blocks); 461 alloc_blocks);
462 462
463 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 463 freesp = percpu_counter_read_positive(&mp->m_fdblocks);
464 freesp = mp->m_sb.sb_fdblocks;
465 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { 464 if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
466 shift = 2; 465 shift = 2;
467 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) 466 if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index e53a90331422..2f1839e4dd1b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -187,6 +187,8 @@ xfs_generic_create(
187 else 187 else
188 d_instantiate(dentry, inode); 188 d_instantiate(dentry, inode);
189 189
190 xfs_finish_inode_setup(ip);
191
190 out_free_acl: 192 out_free_acl:
191 if (default_acl) 193 if (default_acl)
192 posix_acl_release(default_acl); 194 posix_acl_release(default_acl);
@@ -195,6 +197,7 @@ xfs_generic_create(
195 return error; 197 return error;
196 198
197 out_cleanup_inode: 199 out_cleanup_inode:
200 xfs_finish_inode_setup(ip);
198 if (!tmpfile) 201 if (!tmpfile)
199 xfs_cleanup_inode(dir, inode, dentry); 202 xfs_cleanup_inode(dir, inode, dentry);
200 iput(inode); 203 iput(inode);
@@ -367,9 +370,11 @@ xfs_vn_symlink(
367 goto out_cleanup_inode; 370 goto out_cleanup_inode;
368 371
369 d_instantiate(dentry, inode); 372 d_instantiate(dentry, inode);
373 xfs_finish_inode_setup(cip);
370 return 0; 374 return 0;
371 375
372 out_cleanup_inode: 376 out_cleanup_inode:
377 xfs_finish_inode_setup(cip);
373 xfs_cleanup_inode(dir, inode, dentry); 378 xfs_cleanup_inode(dir, inode, dentry);
374 iput(inode); 379 iput(inode);
375 out: 380 out:
@@ -389,7 +394,7 @@ xfs_vn_rename(
389 struct xfs_name oname; 394 struct xfs_name oname;
390 struct xfs_name nname; 395 struct xfs_name nname;
391 396
392 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 397 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
393 return -EINVAL; 398 return -EINVAL;
394 399
395 /* if we are exchanging files, we need to set i_mode of both files */ 400 /* if we are exchanging files, we need to set i_mode of both files */
@@ -766,6 +771,7 @@ xfs_setattr_size(
766 return error; 771 return error;
767 772
768 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 773 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
774 ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
769 ASSERT(S_ISREG(ip->i_d.di_mode)); 775 ASSERT(S_ISREG(ip->i_d.di_mode));
770 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| 776 ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
771 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); 777 ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
@@ -829,55 +835,27 @@ xfs_setattr_size(
829 inode_dio_wait(inode); 835 inode_dio_wait(inode);
830 836
831 /* 837 /*
832 * Do all the page cache truncate work outside the transaction context 838 * We've already locked out new page faults, so now we can safely remove
833 * as the "lock" order is page lock->log space reservation. i.e. 839 * pages from the page cache knowing they won't get refaulted until we
834 * locking pages inside the transaction can ABBA deadlock with 840 * drop the XFS_MMAP_EXCL lock after the extent manipulations are
 835 * writeback. We have to do the VFS inode size update before we truncate 841 * drop the XFS_MMAPLOCK_EXCL lock after the extent manipulations are
836 * the pagecache, however, to avoid racing with page faults beyond the 842 * PTEs on extending truncates and hence ensures sub-page block size
837 * new EOF they are not serialised against truncate operations except by 843 * filesystems are correctly handled, too.
838 * page locks and size updates.
839 * 844 *
840 * Hence we are in a situation where a truncate can fail with ENOMEM 845 * We have to do all the page cache truncate work outside the
841 * from xfs_trans_reserve(), but having already truncated the in-memory 846 * transaction context as the "lock" order is page lock->log space
842 * version of the file (i.e. made user visible changes). There's not 847 * reservation as defined by extent allocation in the writeback path.
843 * much we can do about this, except to hope that the caller sees ENOMEM 848 * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but
844 * and retries the truncate operation. 849 * having already truncated the in-memory version of the file (i.e. made
850 * user visible changes). There's not much we can do about this, except
851 * to hope that the caller sees ENOMEM and retries the truncate
852 * operation.
845 */ 853 */
846 error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); 854 error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
847 if (error) 855 if (error)
848 return error; 856 return error;
849 truncate_setsize(inode, newsize); 857 truncate_setsize(inode, newsize);
850 858
851 /*
852 * The "we can't serialise against page faults" pain gets worse.
853 *
854 * If the file is mapped then we have to clean the page at the old EOF
855 * when extending the file. Extending the file can expose changes the
856 * underlying page mapping (e.g. from beyond EOF to a hole or
857 * unwritten), and so on the next attempt to write to that page we need
858 * to remap it for write. i.e. we need .page_mkwrite() to be called.
859 * Hence we need to clean the page to clean the pte and so a new write
860 * fault will be triggered appropriately.
861 *
862 * If we do it before we change the inode size, then we can race with a
863 * page fault that maps the page with exactly the same problem. If we do
864 * it after we change the file size, then a new page fault can come in
865 * and allocate space before we've run the rest of the truncate
866 * transaction. That's kinda grotesque, but it's better than have data
867 * over a hole, and so that's the lesser evil that has been chosen here.
868 *
869 * The real solution, however, is to have some mechanism for locking out
870 * page faults while a truncate is in progress.
871 */
872 if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
873 error = filemap_write_and_wait_range(
874 VFS_I(ip)->i_mapping,
875 round_down(oldsize, PAGE_CACHE_SIZE),
876 round_up(oldsize, PAGE_CACHE_SIZE) - 1);
877 if (error)
878 return error;
879 }
880
881 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 859 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
882 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); 860 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
883 if (error) 861 if (error)
@@ -975,9 +953,13 @@ xfs_vn_setattr(
975 uint iolock = XFS_IOLOCK_EXCL; 953 uint iolock = XFS_IOLOCK_EXCL;
976 954
977 xfs_ilock(ip, iolock); 955 xfs_ilock(ip, iolock);
978 error = xfs_break_layouts(dentry->d_inode, &iolock); 956 error = xfs_break_layouts(dentry->d_inode, &iolock, true);
979 if (!error) 957 if (!error) {
958 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
959 iolock |= XFS_MMAPLOCK_EXCL;
960
980 error = xfs_setattr_size(ip, iattr); 961 error = xfs_setattr_size(ip, iattr);
962 }
981 xfs_iunlock(ip, iolock); 963 xfs_iunlock(ip, iolock);
982 } else { 964 } else {
983 error = xfs_setattr_nonsize(ip, iattr, 0); 965 error = xfs_setattr_nonsize(ip, iattr, 0);
@@ -1228,16 +1210,12 @@ xfs_diflags_to_iflags(
1228} 1210}
1229 1211
1230/* 1212/*
1231 * Initialize the Linux inode, set up the operation vectors and 1213 * Initialize the Linux inode and set up the operation vectors.
1232 * unlock the inode.
1233 * 1214 *
 1234 * When reading existing inodes from disk this is called directly 1215 * When reading existing inodes from disk this is called directly from xfs_iget;
1235 * from xfs_iget, when creating a new inode it is called from 1216 * when creating a new inode it is called from xfs_ialloc after setting up the
1236 * xfs_ialloc after setting up the inode. 1217 * inode. These callers have different criteria for clearing XFS_INEW, so leave
1237 * 1218 * it up to the caller to deal with unlocking the inode appropriately.
1238 * We are always called with an uninitialised linux inode here.
1239 * We need to initialise the necessary fields and take a reference
1240 * on it.
1241 */ 1219 */
1242void 1220void
1243xfs_setup_inode( 1221xfs_setup_inode(
@@ -1324,9 +1302,4 @@ xfs_setup_inode(
1324 inode_has_no_xattr(inode); 1302 inode_has_no_xattr(inode);
1325 cache_no_acl(inode); 1303 cache_no_acl(inode);
1326 } 1304 }
1327
1328 xfs_iflags_clear(ip, XFS_INEW);
1329 barrier();
1330
1331 unlock_new_inode(inode);
1332} 1305}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index ea7a98e9cb70..a0f84abb0d09 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations;
25 25
26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); 26extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
27 27
28extern void xfs_setup_inode(struct xfs_inode *);
29
30/* 28/*
31 * Internal setattr interfaces. 29 * Internal setattr interfaces.
32 */ 30 */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 82e314258f73..80429891dc9b 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk(
229 error = xfs_inobt_get_rec(cur, irec, &stat); 229 error = xfs_inobt_get_rec(cur, irec, &stat);
230 if (error) 230 if (error)
231 return error; 231 return error;
232 XFS_WANT_CORRUPTED_RETURN(stat == 1); 232 XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1);
233 233
234 /* Check if the record contains the inode in request */ 234 /* Check if the record contains the inode in request */
235 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { 235 if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index c31d2c2eadc4..7c7842c85a08 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -116,15 +116,6 @@ typedef __uint64_t __psunsigned_t;
116#undef XFS_NATIVE_HOST 116#undef XFS_NATIVE_HOST
117#endif 117#endif
118 118
119/*
120 * Feature macros (disable/enable)
121 */
122#ifdef CONFIG_SMP
123#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
124#else
125#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
126#endif
127
128#define irix_sgid_inherit xfs_params.sgid_inherit.val 119#define irix_sgid_inherit xfs_params.sgid_inherit.val
129#define irix_symlink_mode xfs_params.symlink_mode.val 120#define irix_symlink_mode xfs_params.symlink_mode.val
130#define xfs_panic_mask xfs_params.panic_mask.val 121#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a5a945fc3bdc..4f5784f85a5b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -4463,10 +4463,10 @@ xlog_do_recover(
4463 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 4463 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
4464 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 4464 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
4465 ASSERT(xfs_sb_good_version(sbp)); 4465 ASSERT(xfs_sb_good_version(sbp));
4466 xfs_reinit_percpu_counters(log->l_mp);
4467
4466 xfs_buf_relse(bp); 4468 xfs_buf_relse(bp);
4467 4469
4468 /* We've re-read the superblock so re-initialize per-cpu counters */
4469 xfs_icsb_reinit_counters(log->l_mp);
4470 4470
4471 xlog_recover_check_summary(log); 4471 xlog_recover_check_summary(log);
4472 4472
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4fa80e63eea2..2ce7ee3b4ec1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,18 +43,6 @@
43#include "xfs_sysfs.h" 43#include "xfs_sysfs.h"
44 44
45 45
46#ifdef HAVE_PERCPU_SB
47STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
48 int);
49STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t,
50 int);
51STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
52#else
53
54#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
55#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0)
56#endif
57
58static DEFINE_MUTEX(xfs_uuid_table_mutex); 46static DEFINE_MUTEX(xfs_uuid_table_mutex);
59static int xfs_uuid_table_size; 47static int xfs_uuid_table_size;
60static uuid_t *xfs_uuid_table; 48static uuid_t *xfs_uuid_table;
@@ -347,8 +335,7 @@ reread:
347 goto reread; 335 goto reread;
348 } 336 }
349 337
350 /* Initialize per-cpu counters */ 338 xfs_reinit_percpu_counters(mp);
351 xfs_icsb_reinit_counters(mp);
352 339
353 /* no need to be quiet anymore, so reset the buf ops */ 340 /* no need to be quiet anymore, so reset the buf ops */
354 bp->b_ops = &xfs_sb_buf_ops; 341 bp->b_ops = &xfs_sb_buf_ops;
@@ -1087,8 +1074,6 @@ xfs_log_sbcount(xfs_mount_t *mp)
1087 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) 1074 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1088 return 0; 1075 return 0;
1089 1076
1090 xfs_icsb_sync_counters(mp, 0);
1091
1092 /* 1077 /*
1093 * we don't need to do this if we are updating the superblock 1078 * we don't need to do this if we are updating the superblock
1094 * counters on every modification. 1079 * counters on every modification.
@@ -1099,253 +1084,136 @@ xfs_log_sbcount(xfs_mount_t *mp)
1099 return xfs_sync_sb(mp, true); 1084 return xfs_sync_sb(mp, true);
1100} 1085}
1101 1086
1102/* 1087int
1103 * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply 1088xfs_mod_icount(
1104 * a delta to a specified field in the in-core superblock. Simply 1089 struct xfs_mount *mp,
1105 * switch on the field indicated and apply the delta to that field. 1090 int64_t delta)
1106 * Fields are not allowed to dip below zero, so if the delta would
1107 * do this do not apply it and return EINVAL.
1108 *
1109 * The m_sb_lock must be held when this routine is called.
1110 */
1111STATIC int
1112xfs_mod_incore_sb_unlocked(
1113 xfs_mount_t *mp,
1114 xfs_sb_field_t field,
1115 int64_t delta,
1116 int rsvd)
1117{ 1091{
1118 int scounter; /* short counter for 32 bit fields */ 1092 /* deltas are +/-64, hence the large batch size of 128. */
1119 long long lcounter; /* long counter for 64 bit fields */ 1093 __percpu_counter_add(&mp->m_icount, delta, 128);
1120 long long res_used, rem; 1094 if (percpu_counter_compare(&mp->m_icount, 0) < 0) {
1121
1122 /*
1123 * With the in-core superblock spin lock held, switch
1124 * on the indicated field. Apply the delta to the
1125 * proper field. If the fields value would dip below
1126 * 0, then do not apply the delta and return EINVAL.
1127 */
1128 switch (field) {
1129 case XFS_SBS_ICOUNT:
1130 lcounter = (long long)mp->m_sb.sb_icount;
1131 lcounter += delta;
1132 if (lcounter < 0) {
1133 ASSERT(0);
1134 return -EINVAL;
1135 }
1136 mp->m_sb.sb_icount = lcounter;
1137 return 0;
1138 case XFS_SBS_IFREE:
1139 lcounter = (long long)mp->m_sb.sb_ifree;
1140 lcounter += delta;
1141 if (lcounter < 0) {
1142 ASSERT(0);
1143 return -EINVAL;
1144 }
1145 mp->m_sb.sb_ifree = lcounter;
1146 return 0;
1147 case XFS_SBS_FDBLOCKS:
1148 lcounter = (long long)
1149 mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1150 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1151
1152 if (delta > 0) { /* Putting blocks back */
1153 if (res_used > delta) {
1154 mp->m_resblks_avail += delta;
1155 } else {
1156 rem = delta - res_used;
1157 mp->m_resblks_avail = mp->m_resblks;
1158 lcounter += rem;
1159 }
1160 } else { /* Taking blocks away */
1161 lcounter += delta;
1162 if (lcounter >= 0) {
1163 mp->m_sb.sb_fdblocks = lcounter +
1164 XFS_ALLOC_SET_ASIDE(mp);
1165 return 0;
1166 }
1167
1168 /*
1169 * We are out of blocks, use any available reserved
1170 * blocks if were allowed to.
1171 */
1172 if (!rsvd)
1173 return -ENOSPC;
1174
1175 lcounter = (long long)mp->m_resblks_avail + delta;
1176 if (lcounter >= 0) {
1177 mp->m_resblks_avail = lcounter;
1178 return 0;
1179 }
1180 printk_once(KERN_WARNING
1181 "Filesystem \"%s\": reserve blocks depleted! "
1182 "Consider increasing reserve pool size.",
1183 mp->m_fsname);
1184 return -ENOSPC;
1185 }
1186
1187 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1188 return 0;
1189 case XFS_SBS_FREXTENTS:
1190 lcounter = (long long)mp->m_sb.sb_frextents;
1191 lcounter += delta;
1192 if (lcounter < 0) {
1193 return -ENOSPC;
1194 }
1195 mp->m_sb.sb_frextents = lcounter;
1196 return 0;
1197 case XFS_SBS_DBLOCKS:
1198 lcounter = (long long)mp->m_sb.sb_dblocks;
1199 lcounter += delta;
1200 if (lcounter < 0) {
1201 ASSERT(0);
1202 return -EINVAL;
1203 }
1204 mp->m_sb.sb_dblocks = lcounter;
1205 return 0;
1206 case XFS_SBS_AGCOUNT:
1207 scounter = mp->m_sb.sb_agcount;
1208 scounter += delta;
1209 if (scounter < 0) {
1210 ASSERT(0);
1211 return -EINVAL;
1212 }
1213 mp->m_sb.sb_agcount = scounter;
1214 return 0;
1215 case XFS_SBS_IMAX_PCT:
1216 scounter = mp->m_sb.sb_imax_pct;
1217 scounter += delta;
1218 if (scounter < 0) {
1219 ASSERT(0);
1220 return -EINVAL;
1221 }
1222 mp->m_sb.sb_imax_pct = scounter;
1223 return 0;
1224 case XFS_SBS_REXTSIZE:
1225 scounter = mp->m_sb.sb_rextsize;
1226 scounter += delta;
1227 if (scounter < 0) {
1228 ASSERT(0);
1229 return -EINVAL;
1230 }
1231 mp->m_sb.sb_rextsize = scounter;
1232 return 0;
1233 case XFS_SBS_RBMBLOCKS:
1234 scounter = mp->m_sb.sb_rbmblocks;
1235 scounter += delta;
1236 if (scounter < 0) {
1237 ASSERT(0);
1238 return -EINVAL;
1239 }
1240 mp->m_sb.sb_rbmblocks = scounter;
1241 return 0;
1242 case XFS_SBS_RBLOCKS:
1243 lcounter = (long long)mp->m_sb.sb_rblocks;
1244 lcounter += delta;
1245 if (lcounter < 0) {
1246 ASSERT(0);
1247 return -EINVAL;
1248 }
1249 mp->m_sb.sb_rblocks = lcounter;
1250 return 0;
1251 case XFS_SBS_REXTENTS:
1252 lcounter = (long long)mp->m_sb.sb_rextents;
1253 lcounter += delta;
1254 if (lcounter < 0) {
1255 ASSERT(0);
1256 return -EINVAL;
1257 }
1258 mp->m_sb.sb_rextents = lcounter;
1259 return 0;
1260 case XFS_SBS_REXTSLOG:
1261 scounter = mp->m_sb.sb_rextslog;
1262 scounter += delta;
1263 if (scounter < 0) {
1264 ASSERT(0);
1265 return -EINVAL;
1266 }
1267 mp->m_sb.sb_rextslog = scounter;
1268 return 0;
1269 default:
1270 ASSERT(0); 1095 ASSERT(0);
1096 percpu_counter_add(&mp->m_icount, -delta);
1271 return -EINVAL; 1097 return -EINVAL;
1272 } 1098 }
1099 return 0;
1273} 1100}
1274 1101
1275/*
1276 * xfs_mod_incore_sb() is used to change a field in the in-core
1277 * superblock structure by the specified delta. This modification
1278 * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked()
1279 * routine to do the work.
1280 */
1281int 1102int
1282xfs_mod_incore_sb( 1103xfs_mod_ifree(
1283 struct xfs_mount *mp, 1104 struct xfs_mount *mp,
1284 xfs_sb_field_t field, 1105 int64_t delta)
1285 int64_t delta,
1286 int rsvd)
1287{ 1106{
1288 int status; 1107 percpu_counter_add(&mp->m_ifree, delta);
1289 1108 if (percpu_counter_compare(&mp->m_ifree, 0) < 0) {
1290#ifdef HAVE_PERCPU_SB 1109 ASSERT(0);
1291 ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); 1110 percpu_counter_add(&mp->m_ifree, -delta);
1292#endif 1111 return -EINVAL;
1293 spin_lock(&mp->m_sb_lock); 1112 }
1294 status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); 1113 return 0;
1295 spin_unlock(&mp->m_sb_lock);
1296
1297 return status;
1298} 1114}
1299 1115
1300/*
1301 * Change more than one field in the in-core superblock structure at a time.
1302 *
1303 * The fields and changes to those fields are specified in the array of
1304 * xfs_mod_sb structures passed in. Either all of the specified deltas
1305 * will be applied or none of them will. If any modified field dips below 0,
1306 * then all modifications will be backed out and EINVAL will be returned.
1307 *
1308 * Note that this function may not be used for the superblock values that
1309 * are tracked with the in-memory per-cpu counters - a direct call to
1310 * xfs_icsb_modify_counters is required for these.
1311 */
1312int 1116int
1313xfs_mod_incore_sb_batch( 1117xfs_mod_fdblocks(
1314 struct xfs_mount *mp, 1118 struct xfs_mount *mp,
1315 xfs_mod_sb_t *msb, 1119 int64_t delta,
1316 uint nmsb, 1120 bool rsvd)
1317 int rsvd)
1318{ 1121{
1319 xfs_mod_sb_t *msbp; 1122 int64_t lcounter;
1320 int error = 0; 1123 long long res_used;
1124 s32 batch;
1125
1126 if (delta > 0) {
1127 /*
1128 * If the reserve pool is depleted, put blocks back into it
1129 * first. Most of the time the pool is full.
1130 */
1131 if (likely(mp->m_resblks == mp->m_resblks_avail)) {
1132 percpu_counter_add(&mp->m_fdblocks, delta);
1133 return 0;
1134 }
1135
1136 spin_lock(&mp->m_sb_lock);
1137 res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
1138
1139 if (res_used > delta) {
1140 mp->m_resblks_avail += delta;
1141 } else {
1142 delta -= res_used;
1143 mp->m_resblks_avail = mp->m_resblks;
1144 percpu_counter_add(&mp->m_fdblocks, delta);
1145 }
1146 spin_unlock(&mp->m_sb_lock);
1147 return 0;
1148 }
1321 1149
1322 /* 1150 /*
1323 * Loop through the array of mod structures and apply each individually. 1151 * Taking blocks away, need to be more accurate the closer we
1324 * If any fail, then back out all those which have already been applied. 1152 * are to zero.
1325 * Do all of this within the scope of the m_sb_lock so that all of the 1153 *
1326 * changes will be atomic. 1154 * batch size is set to a maximum of 1024 blocks - if we are
1155 * allocating or freeing extents larger than this then we aren't
1156 * going to be hammering the counter lock so a lock per update
1157 * is not a problem.
1158 *
1159 * If the counter has a value of less than 2 * max batch size,
1160 * then make everything serialise as we are really close to
1161 * ENOSPC.
1162 */
1163#define __BATCH 1024
1164 if (percpu_counter_compare(&mp->m_fdblocks, 2 * __BATCH) < 0)
1165 batch = 1;
1166 else
1167 batch = __BATCH;
1168#undef __BATCH
1169
1170 __percpu_counter_add(&mp->m_fdblocks, delta, batch);
1171 if (percpu_counter_compare(&mp->m_fdblocks,
1172 XFS_ALLOC_SET_ASIDE(mp)) >= 0) {
1173 /* we had space! */
1174 return 0;
1175 }
1176
1177 /*
1178 * take the sb lock for dipping into reserves before releasing the space
1179 * that took us to ENOSPC.
1327 */ 1180 */
1328 spin_lock(&mp->m_sb_lock); 1181 spin_lock(&mp->m_sb_lock);
1329 for (msbp = msb; msbp < (msb + nmsb); msbp++) { 1182 percpu_counter_add(&mp->m_fdblocks, -delta);
1330 ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || 1183 if (!rsvd)
1331 msbp->msb_field > XFS_SBS_FDBLOCKS); 1184 goto fdblocks_enospc;
1332 1185
1333 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, 1186 lcounter = (long long)mp->m_resblks_avail + delta;
1334 msbp->msb_delta, rsvd); 1187 if (lcounter >= 0) {
1335 if (error) 1188 mp->m_resblks_avail = lcounter;
1336 goto unwind; 1189 spin_unlock(&mp->m_sb_lock);
1190 return 0;
1337 } 1191 }
1192 printk_once(KERN_WARNING
1193 "Filesystem \"%s\": reserve blocks depleted! "
1194 "Consider increasing reserve pool size.",
1195 mp->m_fsname);
1196fdblocks_enospc:
1338 spin_unlock(&mp->m_sb_lock); 1197 spin_unlock(&mp->m_sb_lock);
1339 return 0; 1198 return -ENOSPC;
1199}
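
The batch sizing above is the crux of the new scheme: far from ENOSPC a large batch keeps updates CPU-local, and near zero the batch drops to 1 so percpu_counter_compare() sees an exact value. Below is a minimal, compilable userspace sketch of the same idea; BATCH, counter_add() and the thread-local cache are illustrative stand-ins, not the kernel's percpu_counter API.

#include <stdatomic.h>
#include <stdio.h>

#define BATCH 1024

static _Atomic long long global;       /* shared total, cheap to read   */
static _Thread_local long long local;  /* this thread's unflushed delta */

static void counter_add(long long delta)
{
	long long batch = BATCH;

	/* near zero, flush every update so ENOSPC checks stay exact */
	if (atomic_load(&global) < 2 * BATCH)
		batch = 1;

	local += delta;
	if (local >= batch || local <= -batch) {
		atomic_fetch_add(&global, local);
		local = 0;
	}
}

int main(void)
{
	counter_add(4096);
	counter_add(-512);
	atomic_fetch_add(&global, local);	/* final flush */
	printf("global = %lld\n", (long long)atomic_load(&global));
	return 0;
}
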
1340 1200
1341unwind: 1201int
1342 while (--msbp >= msb) { 1202xfs_mod_frextents(
1343 error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, 1203 struct xfs_mount *mp,
1344 -msbp->msb_delta, rsvd); 1204 int64_t delta)
1345 ASSERT(error == 0); 1205{
1346 } 1206 int64_t lcounter;
1207 int ret = 0;
1208
1209 spin_lock(&mp->m_sb_lock);
1210 lcounter = mp->m_sb.sb_frextents + delta;
1211 if (lcounter < 0)
1212 ret = -ENOSPC;
1213 else
1214 mp->m_sb.sb_frextents = lcounter;
1347 spin_unlock(&mp->m_sb_lock); 1215 spin_unlock(&mp->m_sb_lock);
1348 return error; 1216 return ret;
1349} 1217}
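
Taken together, these hunks replace the switch-driven xfs_mod_incore_sb() entry points with one typed helper per counter. The call-site conversion is mechanical, as the xfs_trans.c hunk further down shows; roughly:

	/* before: a field enum selects the counter, int flag for reserves */
	error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS,
				  -((int64_t)blocks), rsvd);

	/* after: a dedicated helper, reserve-pool dip as a bool */
	error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
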
1350 1218
1351/* 1219/*
@@ -1407,573 +1275,3 @@ xfs_dev_is_read_only(
1407 } 1275 }
1408 return 0; 1276 return 0;
1409} 1277}
1410
1411#ifdef HAVE_PERCPU_SB
1412/*
1413 * Per-cpu incore superblock counters
1414 *
1415 * Simple concept, difficult implementation
1416 *
1417 * Basically, replace the incore superblock counters with a distributed per cpu
1418 * counter for contended fields (e.g. free block count).
1419 *
1420 * Difficulties arise in that the incore sb is used for ENOSPC checking, and
1421 * hence needs to be accurately read when we are running low on space. Hence
1422 * there is a method to enable and disable the per-cpu counters based on how
1423 * much "stuff" is available in them.
1424 *
1425 * Basically, a counter is enabled if there is enough free resource to justify
1426 * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
1427 * ENOSPC), then we disable the counters to synchronise all callers and
1428 * re-distribute the available resources.
1429 *
1430 * If, once we redistributed the available resources, we still get a failure,
1431 * we disable the per-cpu counter and go through the slow path.
1432 *
1433 * The slow path is the current xfs_mod_incore_sb() function. This means that
1434 * when we disable a per-cpu counter, we need to drain its resources back to
1435 * the global superblock. We do this after disabling the counter to prevent
1436 * more threads from queueing up on the counter.
1437 *
1438 * Essentially, this means that we still need a lock in the fast path to enable
1439 * synchronisation between the global counters and the per-cpu counters. This
1440 * is not a problem because the lock will be local to a CPU almost all the time
1441 * and have little contention except when we get to ENOSPC conditions.
1442 *
1443 * Basically, this lock becomes a barrier that enables us to lock out the fast
1444 * path while we do things like enabling and disabling counters and
1445 * synchronising the counters.
1446 *
1447 * Locking rules:
1448 *
1449 * 1. m_sb_lock before picking up per-cpu locks
1450 * 2. per-cpu locks always picked up via for_each_online_cpu() order
1451 * 3. accurate counter sync requires m_sb_lock + per cpu locks
1452 * 4. modifying per-cpu counters requires holding per-cpu lock
1453 * 5. modifying global counters requires holding m_sb_lock
1454 * 6. enabling or disabling a counter requires holding the m_sb_lock
1455 * and _none_ of the per-cpu locks.
1456 *
1457 * Disabled counters are only ever re-enabled by a balance operation
1458 * that results in more free resources per CPU than a given threshold.
1459 * To ensure counters don't remain disabled, they are rebalanced when
1460 * the global resource goes above a higher threshold (i.e. some hysteresis
1461 * is present to prevent thrashing).
1462 */
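
The enable/disable scheme described above boils down to a per-CPU share test with a floor. A toy, compilable model of the re-enable decision follows; the names and thresholds are illustrative, not the XFS constants:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* enable the per-cpu fast path only when each CPU's share of the
 * global count clears a threshold */
static bool counter_can_enable(uint64_t global, unsigned int ncpus,
			       uint64_t min_per_cpu)
{
	return ncpus && (global / ncpus) >= min_per_cpu;
}

int main(void)
{
	printf("%d\n", counter_can_enable(100000, 8, 512)); /* 1: enable   */
	printf("%d\n", counter_can_enable(3000, 8, 512));   /* 0: stay off */
	return 0;
}
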
1463
1464#ifdef CONFIG_HOTPLUG_CPU
1465/*
1466 * hot-plug CPU notifier support.
1467 *
1468 * We need a notifier per filesystem as we need to be able to identify
1469 * the filesystem to balance the counters out. This is achieved by
1470 * having a notifier block embedded in the xfs_mount_t and doing pointer
1471 * magic to get the mount pointer from the notifier block address.
1472 */
1473STATIC int
1474xfs_icsb_cpu_notify(
1475 struct notifier_block *nfb,
1476 unsigned long action,
1477 void *hcpu)
1478{
1479 xfs_icsb_cnts_t *cntp;
1480 xfs_mount_t *mp;
1481
1482 mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier);
1483 cntp = (xfs_icsb_cnts_t *)
1484 per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu);
1485 switch (action) {
1486 case CPU_UP_PREPARE:
1487 case CPU_UP_PREPARE_FROZEN:
1488 /* Easy Case - initialize the area and locks, and
1489 * then rebalance when online does everything else for us. */
1490 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1491 break;
1492 case CPU_ONLINE:
1493 case CPU_ONLINE_FROZEN:
1494 xfs_icsb_lock(mp);
1495 xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
1496 xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
1497 xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
1498 xfs_icsb_unlock(mp);
1499 break;
1500 case CPU_DEAD:
1501 case CPU_DEAD_FROZEN:
1502 /* Disable all the counters, then fold the dead cpu's
1503 * count into the total on the global superblock and
1504 * re-enable the counters. */
1505 xfs_icsb_lock(mp);
1506 spin_lock(&mp->m_sb_lock);
1507 xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT);
1508 xfs_icsb_disable_counter(mp, XFS_SBS_IFREE);
1509 xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS);
1510
1511 mp->m_sb.sb_icount += cntp->icsb_icount;
1512 mp->m_sb.sb_ifree += cntp->icsb_ifree;
1513 mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks;
1514
1515 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1516
1517 xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0);
1518 xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0);
1519 xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0);
1520 spin_unlock(&mp->m_sb_lock);
1521 xfs_icsb_unlock(mp);
1522 break;
1523 }
1524
1525 return NOTIFY_OK;
1526}
1527#endif /* CONFIG_HOTPLUG_CPU */
1528
1529int
1530xfs_icsb_init_counters(
1531 xfs_mount_t *mp)
1532{
1533 xfs_icsb_cnts_t *cntp;
1534 int i;
1535
1536 mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
1537 if (mp->m_sb_cnts == NULL)
1538 return -ENOMEM;
1539
1540 for_each_online_cpu(i) {
1541 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1542 memset(cntp, 0, sizeof(xfs_icsb_cnts_t));
1543 }
1544
1545 mutex_init(&mp->m_icsb_mutex);
1546
1547 /*
1548 * start with all counters disabled so that the
1549 * initial balance kicks us off correctly
1550 */
1551 mp->m_icsb_counters = -1;
1552
1553#ifdef CONFIG_HOTPLUG_CPU
1554 mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify;
1555 mp->m_icsb_notifier.priority = 0;
1556 register_hotcpu_notifier(&mp->m_icsb_notifier);
1557#endif /* CONFIG_HOTPLUG_CPU */
1558
1559 return 0;
1560}
1561
1562void
1563xfs_icsb_reinit_counters(
1564 xfs_mount_t *mp)
1565{
1566 xfs_icsb_lock(mp);
1567 /*
1568 * start with all counters disabled so that the
1569 * initial balance kicks us off correctly
1570 */
1571 mp->m_icsb_counters = -1;
1572 xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
1573 xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
1574 xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
1575 xfs_icsb_unlock(mp);
1576}
1577
1578void
1579xfs_icsb_destroy_counters(
1580 xfs_mount_t *mp)
1581{
1582 if (mp->m_sb_cnts) {
1583 unregister_hotcpu_notifier(&mp->m_icsb_notifier);
1584 free_percpu(mp->m_sb_cnts);
1585 }
1586 mutex_destroy(&mp->m_icsb_mutex);
1587}
1588
1589STATIC void
1590xfs_icsb_lock_cntr(
1591 xfs_icsb_cnts_t *icsbp)
1592{
1593 while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) {
1594 ndelay(1000);
1595 }
1596}
1597
1598STATIC void
1599xfs_icsb_unlock_cntr(
1600 xfs_icsb_cnts_t *icsbp)
1601{
1602 clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags);
1603}
1604
1605
1606STATIC void
1607xfs_icsb_lock_all_counters(
1608 xfs_mount_t *mp)
1609{
1610 xfs_icsb_cnts_t *cntp;
1611 int i;
1612
1613 for_each_online_cpu(i) {
1614 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1615 xfs_icsb_lock_cntr(cntp);
1616 }
1617}
1618
1619STATIC void
1620xfs_icsb_unlock_all_counters(
1621 xfs_mount_t *mp)
1622{
1623 xfs_icsb_cnts_t *cntp;
1624 int i;
1625
1626 for_each_online_cpu(i) {
1627 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1628 xfs_icsb_unlock_cntr(cntp);
1629 }
1630}
1631
1632STATIC void
1633xfs_icsb_count(
1634 xfs_mount_t *mp,
1635 xfs_icsb_cnts_t *cnt,
1636 int flags)
1637{
1638 xfs_icsb_cnts_t *cntp;
1639 int i;
1640
1641 memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
1642
1643 if (!(flags & XFS_ICSB_LAZY_COUNT))
1644 xfs_icsb_lock_all_counters(mp);
1645
1646 for_each_online_cpu(i) {
1647 cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
1648 cnt->icsb_icount += cntp->icsb_icount;
1649 cnt->icsb_ifree += cntp->icsb_ifree;
1650 cnt->icsb_fdblocks += cntp->icsb_fdblocks;
1651 }
1652
1653 if (!(flags & XFS_ICSB_LAZY_COUNT))
1654 xfs_icsb_unlock_all_counters(mp);
1655}
1656
1657STATIC int
1658xfs_icsb_counter_disabled(
1659 xfs_mount_t *mp,
1660 xfs_sb_field_t field)
1661{
1662 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1663 return test_bit(field, &mp->m_icsb_counters);
1664}
1665
1666STATIC void
1667xfs_icsb_disable_counter(
1668 xfs_mount_t *mp,
1669 xfs_sb_field_t field)
1670{
1671 xfs_icsb_cnts_t cnt;
1672
1673 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1674
1675 /*
1676 * If we are already disabled, then there is nothing to do
1677 * here. We check before locking all the counters to avoid
1678 * the expensive lock operation when being called in the
1679 * slow path and the counter is already disabled. This is
1680 * safe because the only time we set or clear this state is under
1681 * the m_icsb_mutex.
1682 */
1683 if (xfs_icsb_counter_disabled(mp, field))
1684 return;
1685
1686 xfs_icsb_lock_all_counters(mp);
1687 if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
1688 /* drain back to superblock */
1689
1690 xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT);
1691 switch (field) {
1692 case XFS_SBS_ICOUNT:
1693 mp->m_sb.sb_icount = cnt.icsb_icount;
1694 break;
1695 case XFS_SBS_IFREE:
1696 mp->m_sb.sb_ifree = cnt.icsb_ifree;
1697 break;
1698 case XFS_SBS_FDBLOCKS:
1699 mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
1700 break;
1701 default:
1702 BUG();
1703 }
1704 }
1705
1706 xfs_icsb_unlock_all_counters(mp);
1707}
1708
1709STATIC void
1710xfs_icsb_enable_counter(
1711 xfs_mount_t *mp,
1712 xfs_sb_field_t field,
1713 uint64_t count,
1714 uint64_t resid)
1715{
1716 xfs_icsb_cnts_t *cntp;
1717 int i;
1718
1719 ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
1720
1721 xfs_icsb_lock_all_counters(mp);
1722 for_each_online_cpu(i) {
1723 cntp = per_cpu_ptr(mp->m_sb_cnts, i);
1724 switch (field) {
1725 case XFS_SBS_ICOUNT:
1726 cntp->icsb_icount = count + resid;
1727 break;
1728 case XFS_SBS_IFREE:
1729 cntp->icsb_ifree = count + resid;
1730 break;
1731 case XFS_SBS_FDBLOCKS:
1732 cntp->icsb_fdblocks = count + resid;
1733 break;
1734 default:
1735 BUG();
1736 break;
1737 }
1738 resid = 0;
1739 }
1740 clear_bit(field, &mp->m_icsb_counters);
1741 xfs_icsb_unlock_all_counters(mp);
1742}
1743
1744void
1745xfs_icsb_sync_counters_locked(
1746 xfs_mount_t *mp,
1747 int flags)
1748{
1749 xfs_icsb_cnts_t cnt;
1750
1751 xfs_icsb_count(mp, &cnt, flags);
1752
1753 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
1754 mp->m_sb.sb_icount = cnt.icsb_icount;
1755 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
1756 mp->m_sb.sb_ifree = cnt.icsb_ifree;
1757 if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
1758 mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
1759}
1760
1761/*
1762 * Accurate update of per-cpu counters to incore superblock
1763 */
1764void
1765xfs_icsb_sync_counters(
1766 xfs_mount_t *mp,
1767 int flags)
1768{
1769 spin_lock(&mp->m_sb_lock);
1770 xfs_icsb_sync_counters_locked(mp, flags);
1771 spin_unlock(&mp->m_sb_lock);
1772}
1773
1774/*
1775 * Balance and enable/disable counters as necessary.
1776 *
1777 * Thresholds for re-enabling counters are somewhat magic. inode counts are
1778 * chosen to be the same as a single on-disk allocation chunk per CPU, and
1779 * free blocks is set far enough from zero that we aren't going to thrash when
1780 * we get near ENOSPC. We also need to supply a minimum we require per cpu to
1781 * prevent looping endlessly when xfs_alloc_space asks for more than will
1782 * be distributed to a single CPU but each CPU has enough blocks to be
1783 * reenabled.
1784 *
1785 * Note that we can be called when counters are already disabled.
1786 * xfs_icsb_disable_counter() optimises the counter locking in this case to
1787 * prevent locking every per-cpu counter needlessly.
1788 */
1789
1790#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64
1791#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
1792 (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp))
1793STATIC void
1794xfs_icsb_balance_counter_locked(
1795 xfs_mount_t *mp,
1796 xfs_sb_field_t field,
1797 int min_per_cpu)
1798{
1799 uint64_t count, resid;
1800 int weight = num_online_cpus();
1801 uint64_t min = (uint64_t)min_per_cpu;
1802
1803 /* disable counter and sync counter */
1804 xfs_icsb_disable_counter(mp, field);
1805
1806 /* update counters - first CPU gets residual */
1807 switch (field) {
1808 case XFS_SBS_ICOUNT:
1809 count = mp->m_sb.sb_icount;
1810 resid = do_div(count, weight);
1811 if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
1812 return;
1813 break;
1814 case XFS_SBS_IFREE:
1815 count = mp->m_sb.sb_ifree;
1816 resid = do_div(count, weight);
1817 if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE))
1818 return;
1819 break;
1820 case XFS_SBS_FDBLOCKS:
1821 count = mp->m_sb.sb_fdblocks;
1822 resid = do_div(count, weight);
1823 if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp)))
1824 return;
1825 break;
1826 default:
1827 BUG();
1828 count = resid = 0; /* quiet, gcc */
1829 break;
1830 }
1831
1832 xfs_icsb_enable_counter(mp, field, count, resid);
1833}
1834
1835STATIC void
1836xfs_icsb_balance_counter(
1837 xfs_mount_t *mp,
1838 xfs_sb_field_t fields,
1839 int min_per_cpu)
1840{
1841 spin_lock(&mp->m_sb_lock);
1842 xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu);
1843 spin_unlock(&mp->m_sb_lock);
1844}
1845
1846int
1847xfs_icsb_modify_counters(
1848 xfs_mount_t *mp,
1849 xfs_sb_field_t field,
1850 int64_t delta,
1851 int rsvd)
1852{
1853 xfs_icsb_cnts_t *icsbp;
1854 long long lcounter; /* long counter for 64 bit fields */
1855 int ret = 0;
1856
1857 might_sleep();
1858again:
1859 preempt_disable();
1860 icsbp = this_cpu_ptr(mp->m_sb_cnts);
1861
1862 /*
1863 * if the counter is disabled, go to slow path
1864 */
1865 if (unlikely(xfs_icsb_counter_disabled(mp, field)))
1866 goto slow_path;
1867 xfs_icsb_lock_cntr(icsbp);
1868 if (unlikely(xfs_icsb_counter_disabled(mp, field))) {
1869 xfs_icsb_unlock_cntr(icsbp);
1870 goto slow_path;
1871 }
1872
1873 switch (field) {
1874 case XFS_SBS_ICOUNT:
1875 lcounter = icsbp->icsb_icount;
1876 lcounter += delta;
1877 if (unlikely(lcounter < 0))
1878 goto balance_counter;
1879 icsbp->icsb_icount = lcounter;
1880 break;
1881
1882 case XFS_SBS_IFREE:
1883 lcounter = icsbp->icsb_ifree;
1884 lcounter += delta;
1885 if (unlikely(lcounter < 0))
1886 goto balance_counter;
1887 icsbp->icsb_ifree = lcounter;
1888 break;
1889
1890 case XFS_SBS_FDBLOCKS:
1891 BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
1892
1893 lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1894 lcounter += delta;
1895 if (unlikely(lcounter < 0))
1896 goto balance_counter;
1897 icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
1898 break;
1899 default:
1900 BUG();
1901 break;
1902 }
1903 xfs_icsb_unlock_cntr(icsbp);
1904 preempt_enable();
1905 return 0;
1906
1907slow_path:
1908 preempt_enable();
1909
1910 /*
1911 * serialise with a mutex so we don't burn lots of cpu on
1912 * the superblock lock. We still need to hold the superblock
1913 * lock, however, when we modify the global structures.
1914 */
1915 xfs_icsb_lock(mp);
1916
1917 /*
1918 * Now running atomically.
1919 *
1920 * If the counter is enabled, someone has beaten us to rebalancing.
1921 * Drop the lock and try again in the fast path....
1922 */
1923 if (!(xfs_icsb_counter_disabled(mp, field))) {
1924 xfs_icsb_unlock(mp);
1925 goto again;
1926 }
1927
1928 /*
1929 * The counter is currently disabled. Because we are
1930 * running atomically here, we know a rebalance cannot
1931 * be in progress. Hence we can go straight to operating
1932 * on the global superblock. We do not call xfs_mod_incore_sb()
1933 * here even though we need to get the m_sb_lock. Doing so
1934 * will cause us to re-enter this function and deadlock.
1935 * Hence we get the m_sb_lock ourselves and then call
1936 * xfs_mod_incore_sb_unlocked() as the unlocked path operates
1937 * directly on the global counters.
1938 */
1939 spin_lock(&mp->m_sb_lock);
1940 ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
1941 spin_unlock(&mp->m_sb_lock);
1942
1943 /*
1944 * Now that we've modified the global superblock, we
1945 * may be able to re-enable the distributed counters
1946 * (e.g. lots of space just got freed). After that
1947 * we are done.
1948 */
1949 if (ret != -ENOSPC)
1950 xfs_icsb_balance_counter(mp, field, 0);
1951 xfs_icsb_unlock(mp);
1952 return ret;
1953
1954balance_counter:
1955 xfs_icsb_unlock_cntr(icsbp);
1956 preempt_enable();
1957
1958 /*
1959 * We may have multiple threads here if multiple per-cpu
1960 * counters run dry at the same time. This will mean we can
1961 * do more balances than strictly necessary but it is not
1962 * the common slowpath case.
1963 */
1964 xfs_icsb_lock(mp);
1965
1966 /*
1967 * running atomically.
1968 *
1969 * This will leave the counter in the correct state for future
1970 * accesses. After the rebalance, we simply try again and our retry
1971 * will either succeed through the fast path or slow path without
1972 * another balance operation being required.
1973 */
1974 xfs_icsb_balance_counter(mp, field, delta);
1975 xfs_icsb_unlock(mp);
1976 goto again;
1977}
1978
1979#endif
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 0d8abd6364d9..8c995a2ccb6f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,8 +18,6 @@
18#ifndef __XFS_MOUNT_H__ 18#ifndef __XFS_MOUNT_H__
19#define __XFS_MOUNT_H__ 19#define __XFS_MOUNT_H__
20 20
21#ifdef __KERNEL__
22
23struct xlog; 21struct xlog;
24struct xfs_inode; 22struct xfs_inode;
25struct xfs_mru_cache; 23struct xfs_mru_cache;
@@ -29,44 +27,6 @@ struct xfs_quotainfo;
29struct xfs_dir_ops; 27struct xfs_dir_ops;
30struct xfs_da_geometry; 28struct xfs_da_geometry;
31 29
32#ifdef HAVE_PERCPU_SB
33
34/*
35 * Valid per-cpu incore superblock counters. Note that if you add new counters,
36 * you may need to define new counter disabled bit field descriptors as there
37 * are more possible fields in the superblock that can fit in a bitfield on a
38 * 32 bit platform. The XFS_SBS_* values for the current counters just
39 * fit.
40 */
41typedef struct xfs_icsb_cnts {
42 uint64_t icsb_fdblocks;
43 uint64_t icsb_ifree;
44 uint64_t icsb_icount;
45 unsigned long icsb_flags;
46} xfs_icsb_cnts_t;
47
48#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */
49
50#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */
51
52extern int xfs_icsb_init_counters(struct xfs_mount *);
53extern void xfs_icsb_reinit_counters(struct xfs_mount *);
54extern void xfs_icsb_destroy_counters(struct xfs_mount *);
55extern void xfs_icsb_sync_counters(struct xfs_mount *, int);
56extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
57extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
58 int64_t, int);
59
60#else
61#define xfs_icsb_init_counters(mp) (0)
62#define xfs_icsb_destroy_counters(mp) do { } while (0)
63#define xfs_icsb_reinit_counters(mp) do { } while (0)
64#define xfs_icsb_sync_counters(mp, flags) do { } while (0)
65#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
66#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \
67 xfs_mod_incore_sb(mp, field, delta, rsvd)
68#endif
69
70/* dynamic preallocation free space thresholds, 5% down to 1% */ 30/* dynamic preallocation free space thresholds, 5% down to 1% */
71enum { 31enum {
72 XFS_LOWSP_1_PCNT = 0, 32 XFS_LOWSP_1_PCNT = 0,
@@ -81,8 +41,13 @@ typedef struct xfs_mount {
81 struct super_block *m_super; 41 struct super_block *m_super;
82 xfs_tid_t m_tid; /* next unused tid for fs */ 42 xfs_tid_t m_tid; /* next unused tid for fs */
83 struct xfs_ail *m_ail; /* fs active log item list */ 43 struct xfs_ail *m_ail; /* fs active log item list */
84 xfs_sb_t m_sb; /* copy of fs superblock */ 44
45 struct xfs_sb m_sb; /* copy of fs superblock */
85 spinlock_t m_sb_lock; /* sb counter lock */ 46 spinlock_t m_sb_lock; /* sb counter lock */
47 struct percpu_counter m_icount; /* allocated inodes counter */
48 struct percpu_counter m_ifree; /* free inodes counter */
49 struct percpu_counter m_fdblocks; /* free block counter */
50
86 struct xfs_buf *m_sb_bp; /* buffer for superblock */ 51 struct xfs_buf *m_sb_bp; /* buffer for superblock */
87 char *m_fsname; /* filesystem name */ 52 char *m_fsname; /* filesystem name */
88 int m_fsname_len; /* strlen of fs name */ 53 int m_fsname_len; /* strlen of fs name */
@@ -152,12 +117,6 @@ typedef struct xfs_mount {
152 const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ 117 const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
153 uint m_chsize; /* size of next field */ 118 uint m_chsize; /* size of next field */
154 atomic_t m_active_trans; /* number trans frozen */ 119 atomic_t m_active_trans; /* number trans frozen */
155#ifdef HAVE_PERCPU_SB
156 xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */
157 unsigned long m_icsb_counters; /* disabled per-cpu counters */
158 struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */
159 struct mutex m_icsb_mutex; /* balancer sync lock */
160#endif
161 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 120 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
162 struct delayed_work m_reclaim_work; /* background inode reclaim */ 121 struct delayed_work m_reclaim_work; /* background inode reclaim */
163 struct delayed_work m_eofblocks_work; /* background eof blocks 122 struct delayed_work m_eofblocks_work; /* background eof blocks
@@ -301,35 +260,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
301} 260}
302 261
303/* 262/*
304 * Per-cpu superblock locking functions
305 */
306#ifdef HAVE_PERCPU_SB
307static inline void
308xfs_icsb_lock(xfs_mount_t *mp)
309{
310 mutex_lock(&mp->m_icsb_mutex);
311}
312
313static inline void
314xfs_icsb_unlock(xfs_mount_t *mp)
315{
316 mutex_unlock(&mp->m_icsb_mutex);
317}
318#else
319#define xfs_icsb_lock(mp)
320#define xfs_icsb_unlock(mp)
321#endif
322
323/*
324 * This structure is for use by the xfs_mod_incore_sb_batch() routine.
325 * xfs_growfs can specify a few fields which are more than int limit
326 */
327typedef struct xfs_mod_sb {
328 xfs_sb_field_t msb_field; /* Field to modify, see below */
329 int64_t msb_delta; /* Change to make to specified field */
330} xfs_mod_sb_t;
331
332/*
333 * Per-ag incore structure, copies of information in agf and agi, to improve the 263 * Per-ag incore structure, copies of information in agf and agi, to improve the
334 * performance of allocation group selection. 264 * performance of allocation group selection.
335 */ 265 */
@@ -383,11 +313,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
383extern int xfs_mountfs(xfs_mount_t *mp); 313extern int xfs_mountfs(xfs_mount_t *mp);
384extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, 314extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount,
385 xfs_agnumber_t *maxagi); 315 xfs_agnumber_t *maxagi);
386
387extern void xfs_unmountfs(xfs_mount_t *); 316extern void xfs_unmountfs(xfs_mount_t *);
388extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 317
389extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 318extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta);
390 uint, int); 319extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta);
320extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
321 bool reserved);
322extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
323
391extern int xfs_mount_log_sb(xfs_mount_t *); 324extern int xfs_mount_log_sb(xfs_mount_t *);
392extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 325extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
393extern int xfs_readsb(xfs_mount_t *, int); 326extern int xfs_readsb(xfs_mount_t *, int);
@@ -399,6 +332,4 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
399 332
400extern void xfs_set_low_space_thresholds(struct xfs_mount *); 333extern void xfs_set_low_space_thresholds(struct xfs_mount *);
401 334
402#endif /* __KERNEL__ */
403
404#endif /* __XFS_MOUNT_H__ */ 335#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 30ecca3037e3..f8a674d7f092 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -437,7 +437,7 @@ xfs_mru_cache_insert(
437 if (!mru || !mru->lists) 437 if (!mru || !mru->lists)
438 return -EINVAL; 438 return -EINVAL;
439 439
440 if (radix_tree_preload(GFP_KERNEL)) 440 if (radix_tree_preload(GFP_NOFS))
441 return -ENOMEM; 441 return -ENOMEM;
442 442
443 INIT_LIST_HEAD(&elem->list_node); 443 INIT_LIST_HEAD(&elem->list_node);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 365dd57ea760..981a657eca39 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -31,7 +31,8 @@
31int 31int
32xfs_break_layouts( 32xfs_break_layouts(
33 struct inode *inode, 33 struct inode *inode,
34 uint *iolock) 34 uint *iolock,
35 bool with_imutex)
35{ 36{
36 struct xfs_inode *ip = XFS_I(inode); 37 struct xfs_inode *ip = XFS_I(inode);
37 int error; 38 int error;
@@ -40,8 +41,12 @@ xfs_break_layouts(
40 41
41 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { 42 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
42 xfs_iunlock(ip, *iolock); 43 xfs_iunlock(ip, *iolock);
44 if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
45 mutex_unlock(&inode->i_mutex);
43 error = break_layout(inode, true); 46 error = break_layout(inode, true);
44 *iolock = XFS_IOLOCK_EXCL; 47 *iolock = XFS_IOLOCK_EXCL;
48 if (with_imutex)
49 mutex_lock(&inode->i_mutex);
45 xfs_ilock(ip, *iolock); 50 xfs_ilock(ip, *iolock);
46 } 51 }
47 52
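
The with_imutex handling encodes a standard ordering rule: never block waiting for the layout while holding i_mutex, and retake i_mutex before the iolock so the lock order is preserved. A compilable userspace sketch of that shape, with all names illustrative:

#include <pthread.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER; /* i_mutex */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER; /* iolock  */

static int layout_busy(void) { return 0; }  /* stands in for -EWOULDBLOCK     */
static void wait_for_layout(void) { }       /* stands in for break_layout(true) */

static void break_layouts_demo(void)
{
	pthread_mutex_lock(&outer);
	pthread_mutex_lock(&inner);
	while (layout_busy()) {
		/* drop both locks before blocking... */
		pthread_mutex_unlock(&inner);
		pthread_mutex_unlock(&outer);
		wait_for_layout();
		/* ...then retake them in the canonical order */
		pthread_mutex_lock(&outer);
		pthread_mutex_lock(&inner);
	}
	pthread_mutex_unlock(&inner);
	pthread_mutex_unlock(&outer);
}

int main(void)
{
	break_layouts_demo();
	return 0;
}
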
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index b7fbfce660f6..8147ac108820 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, 8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
9 struct iattr *iattr); 9 struct iattr *iattr);
10 10
11int xfs_break_layouts(struct inode *inode, uint *iolock); 11int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex);
12#else 12#else
13static inline int xfs_break_layouts(struct inode *inode, uint *iolock) 13static inline int
14xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
14{ 15{
15 return 0; 16 return 0;
16} 17}
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index fbbb9e62e274..5538468c7f63 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -719,6 +719,7 @@ xfs_qm_qino_alloc(
719 xfs_trans_t *tp; 719 xfs_trans_t *tp;
720 int error; 720 int error;
721 int committed; 721 int committed;
722 bool need_alloc = true;
722 723
723 *ip = NULL; 724 *ip = NULL;
724 /* 725 /*
@@ -747,6 +748,7 @@ xfs_qm_qino_alloc(
747 return error; 748 return error;
748 mp->m_sb.sb_gquotino = NULLFSINO; 749 mp->m_sb.sb_gquotino = NULLFSINO;
749 mp->m_sb.sb_pquotino = NULLFSINO; 750 mp->m_sb.sb_pquotino = NULLFSINO;
751 need_alloc = false;
750 } 752 }
751 } 753 }
752 754
@@ -758,7 +760,7 @@ xfs_qm_qino_alloc(
758 return error; 760 return error;
759 } 761 }
760 762
761 if (!*ip) { 763 if (need_alloc) {
762 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, 764 error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip,
763 &committed); 765 &committed);
764 if (error) { 766 if (error) {
@@ -794,11 +796,14 @@ xfs_qm_qino_alloc(
794 spin_unlock(&mp->m_sb_lock); 796 spin_unlock(&mp->m_sb_lock);
795 xfs_log_sb(tp); 797 xfs_log_sb(tp);
796 798
797 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 799 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
800 if (error) {
801 ASSERT(XFS_FORCED_SHUTDOWN(mp));
798 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 802 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
799 return error;
800 } 803 }
801 return 0; 804 if (need_alloc)
805 xfs_finish_inode_setup(*ip);
806 return error;
802} 807}
803 808
804 809
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8fcc4ccc5c79..5f357ca97e76 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -109,8 +109,6 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
109#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ 109#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
110#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ 110#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
111#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ 111#define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */
112#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */
113#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */
114#define MNTOPT_DISCARD "discard" /* Discard unused blocks */ 112#define MNTOPT_DISCARD "discard" /* Discard unused blocks */
115#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ 113#define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */
116 114
@@ -361,28 +359,10 @@ xfs_parseargs(
361 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { 359 } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
362 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); 360 mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
363 mp->m_qflags &= ~XFS_GQUOTA_ENFD; 361 mp->m_qflags &= ~XFS_GQUOTA_ENFD;
364 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
365 xfs_warn(mp,
366 "delaylog is the default now, option is deprecated.");
367 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
368 xfs_warn(mp,
369 "nodelaylog support has been removed, option is deprecated.");
370 } else if (!strcmp(this_char, MNTOPT_DISCARD)) { 362 } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
371 mp->m_flags |= XFS_MOUNT_DISCARD; 363 mp->m_flags |= XFS_MOUNT_DISCARD;
372 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { 364 } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
373 mp->m_flags &= ~XFS_MOUNT_DISCARD; 365 mp->m_flags &= ~XFS_MOUNT_DISCARD;
374 } else if (!strcmp(this_char, "ihashsize")) {
375 xfs_warn(mp,
376 "ihashsize no longer used, option is deprecated.");
377 } else if (!strcmp(this_char, "osyncisdsync")) {
378 xfs_warn(mp,
379 "osyncisdsync has no effect, option is deprecated.");
380 } else if (!strcmp(this_char, "osyncisosync")) {
381 xfs_warn(mp,
382 "osyncisosync has no effect, option is deprecated.");
383 } else if (!strcmp(this_char, "irixsgid")) {
384 xfs_warn(mp,
385 "irixsgid is now a sysctl(2) variable, option is deprecated.");
386 } else { 366 } else {
387 xfs_warn(mp, "unknown mount option [%s].", this_char); 367 xfs_warn(mp, "unknown mount option [%s].", this_char);
388 return -EINVAL; 368 return -EINVAL;
@@ -986,6 +966,8 @@ xfs_fs_inode_init_once(
986 atomic_set(&ip->i_pincount, 0); 966 atomic_set(&ip->i_pincount, 0);
987 spin_lock_init(&ip->i_flags_lock); 967 spin_lock_init(&ip->i_flags_lock);
988 968
969 mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
970 "xfsino", ip->i_ino);
989 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, 971 mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
990 "xfsino", ip->i_ino); 972 "xfsino", ip->i_ino);
991} 973}
@@ -1033,23 +1015,6 @@ xfs_free_fsname(
1033 kfree(mp->m_logname); 1015 kfree(mp->m_logname);
1034} 1016}
1035 1017
1036STATIC void
1037xfs_fs_put_super(
1038 struct super_block *sb)
1039{
1040 struct xfs_mount *mp = XFS_M(sb);
1041
1042 xfs_filestream_unmount(mp);
1043 xfs_unmountfs(mp);
1044
1045 xfs_freesb(mp);
1046 xfs_icsb_destroy_counters(mp);
1047 xfs_destroy_mount_workqueues(mp);
1048 xfs_close_devices(mp);
1049 xfs_free_fsname(mp);
1050 kfree(mp);
1051}
1052
1053STATIC int 1018STATIC int
1054xfs_fs_sync_fs( 1019xfs_fs_sync_fs(
1055 struct super_block *sb, 1020 struct super_block *sb,
@@ -1085,6 +1050,9 @@ xfs_fs_statfs(
1085 xfs_sb_t *sbp = &mp->m_sb; 1050 xfs_sb_t *sbp = &mp->m_sb;
1086 struct xfs_inode *ip = XFS_I(dentry->d_inode); 1051 struct xfs_inode *ip = XFS_I(dentry->d_inode);
1087 __uint64_t fakeinos, id; 1052 __uint64_t fakeinos, id;
1053 __uint64_t icount;
1054 __uint64_t ifree;
1055 __uint64_t fdblocks;
1088 xfs_extlen_t lsize; 1056 xfs_extlen_t lsize;
1089 __int64_t ffree; 1057 __int64_t ffree;
1090 1058
@@ -1095,17 +1063,21 @@ xfs_fs_statfs(
1095 statp->f_fsid.val[0] = (u32)id; 1063 statp->f_fsid.val[0] = (u32)id;
1096 statp->f_fsid.val[1] = (u32)(id >> 32); 1064 statp->f_fsid.val[1] = (u32)(id >> 32);
1097 1065
1098 xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); 1066 icount = percpu_counter_sum(&mp->m_icount);
1067 ifree = percpu_counter_sum(&mp->m_ifree);
1068 fdblocks = percpu_counter_sum(&mp->m_fdblocks);
1099 1069
1100 spin_lock(&mp->m_sb_lock); 1070 spin_lock(&mp->m_sb_lock);
1101 statp->f_bsize = sbp->sb_blocksize; 1071 statp->f_bsize = sbp->sb_blocksize;
1102 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0; 1072 lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
1103 statp->f_blocks = sbp->sb_dblocks - lsize; 1073 statp->f_blocks = sbp->sb_dblocks - lsize;
1104 statp->f_bfree = statp->f_bavail = 1074 spin_unlock(&mp->m_sb_lock);
1105 sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); 1075
1076 statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp);
1077 statp->f_bavail = statp->f_bfree;
1078
1106 fakeinos = statp->f_bfree << sbp->sb_inopblog; 1079 fakeinos = statp->f_bfree << sbp->sb_inopblog;
1107 statp->f_files = 1080 statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
1108 MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
1109 if (mp->m_maxicount) 1081 if (mp->m_maxicount)
1110 statp->f_files = min_t(typeof(statp->f_files), 1082 statp->f_files = min_t(typeof(statp->f_files),
1111 statp->f_files, 1083 statp->f_files,
@@ -1117,10 +1089,9 @@ xfs_fs_statfs(
1117 sbp->sb_icount); 1089 sbp->sb_icount);
1118 1090
1119 /* make sure statp->f_ffree does not underflow */ 1091 /* make sure statp->f_ffree does not underflow */
1120 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1092 ffree = statp->f_files - (icount - ifree);
1121 statp->f_ffree = max_t(__int64_t, ffree, 0); 1093 statp->f_ffree = max_t(__int64_t, ffree, 0);
1122 1094
1123 spin_unlock(&mp->m_sb_lock);
1124 1095
1125 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1096 if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1126 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == 1097 ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) ==
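
statfs now sums the percpu counters before taking m_sb_lock: percpu_counter_sum() folds in every CPU's unflushed delta for an exact value, whereas the cheap percpu_counter_read() can lag by up to the batch size per CPU. For contrast (kernel API, shown as a fragment):

	s64 approx = percpu_counter_read(&mp->m_icount); /* fast, may lag  */
	s64 exact  = percpu_counter_sum(&mp->m_icount);  /* folds all CPUs */
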
@@ -1256,6 +1227,12 @@ xfs_fs_remount(
1256 1227
1257 /* ro -> rw */ 1228 /* ro -> rw */
1258 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { 1229 if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
1230 if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
1231 xfs_warn(mp,
1232 "ro->rw transition prohibited on norecovery mount");
1233 return -EINVAL;
1234 }
1235
1259 mp->m_flags &= ~XFS_MOUNT_RDONLY; 1236 mp->m_flags &= ~XFS_MOUNT_RDONLY;
1260 1237
1261 /* 1238 /*
@@ -1401,6 +1378,51 @@ xfs_finish_flags(
1401 return 0; 1378 return 0;
1402} 1379}
1403 1380
1381static int
1382xfs_init_percpu_counters(
1383 struct xfs_mount *mp)
1384{
1385 int error;
1386
1387 error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL);
1388 if (error)
1389 return -ENOMEM;
1390
1391 error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL);
1392 if (error)
1393 goto free_icount;
1394
1395 error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL);
1396 if (error)
1397 goto free_ifree;
1398
1399 return 0;
1400
1401free_ifree:
1402 percpu_counter_destroy(&mp->m_ifree);
1403free_icount:
1404 percpu_counter_destroy(&mp->m_icount);
1405 return -ENOMEM;
1406}
1407
1408void
1409xfs_reinit_percpu_counters(
1410 struct xfs_mount *mp)
1411{
1412 percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount);
1413 percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree);
1414 percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks);
1415}
1416
1417static void
1418xfs_destroy_percpu_counters(
1419 struct xfs_mount *mp)
1420{
1421 percpu_counter_destroy(&mp->m_icount);
1422 percpu_counter_destroy(&mp->m_ifree);
1423 percpu_counter_destroy(&mp->m_fdblocks);
1424}
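
xfs_init_percpu_counters() uses the kernel's usual goto ladder so a failure at step N unwinds steps N-1 back to 1 in reverse order. The same shape in plain, compilable C, with toy resources standing in for the percpu counters:

#include <errno.h>
#include <stdlib.h>

struct counters { long *icount, *ifree, *fdblocks; };	/* toy resources */

static int counters_init(struct counters *cs)
{
	cs->icount = calloc(1, sizeof(*cs->icount));
	if (!cs->icount)
		return -ENOMEM;
	cs->ifree = calloc(1, sizeof(*cs->ifree));
	if (!cs->ifree)
		goto free_icount;
	cs->fdblocks = calloc(1, sizeof(*cs->fdblocks));
	if (!cs->fdblocks)
		goto free_ifree;
	return 0;

free_ifree:
	free(cs->ifree);
free_icount:
	free(cs->icount);
	return -ENOMEM;
}

static void counters_destroy(struct counters *cs)
{
	free(cs->fdblocks);
	free(cs->ifree);
	free(cs->icount);
}

int main(void)
{
	struct counters cs;

	if (counters_init(&cs))
		return 1;
	counters_destroy(&cs);
	return 0;
}
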
1425
1404STATIC int 1426STATIC int
1405xfs_fs_fill_super( 1427xfs_fs_fill_super(
1406 struct super_block *sb, 1428 struct super_block *sb,
@@ -1449,7 +1471,7 @@ xfs_fs_fill_super(
1449 if (error) 1471 if (error)
1450 goto out_close_devices; 1472 goto out_close_devices;
1451 1473
1452 error = xfs_icsb_init_counters(mp); 1474 error = xfs_init_percpu_counters(mp);
1453 if (error) 1475 if (error)
1454 goto out_destroy_workqueues; 1476 goto out_destroy_workqueues;
1455 1477
@@ -1507,7 +1529,7 @@ xfs_fs_fill_super(
1507 out_free_sb: 1529 out_free_sb:
1508 xfs_freesb(mp); 1530 xfs_freesb(mp);
1509 out_destroy_counters: 1531 out_destroy_counters:
1510 xfs_icsb_destroy_counters(mp); 1532 xfs_destroy_percpu_counters(mp);
1511out_destroy_workqueues: 1533out_destroy_workqueues:
1512 xfs_destroy_mount_workqueues(mp); 1534 xfs_destroy_mount_workqueues(mp);
1513 out_close_devices: 1535 out_close_devices:
@@ -1524,6 +1546,24 @@ out_destroy_workqueues:
1524 goto out_free_sb; 1546 goto out_free_sb;
1525} 1547}
1526 1548
1549STATIC void
1550xfs_fs_put_super(
1551 struct super_block *sb)
1552{
1553 struct xfs_mount *mp = XFS_M(sb);
1554
1555 xfs_notice(mp, "Unmounting Filesystem");
1556 xfs_filestream_unmount(mp);
1557 xfs_unmountfs(mp);
1558
1559 xfs_freesb(mp);
1560 xfs_destroy_percpu_counters(mp);
1561 xfs_destroy_mount_workqueues(mp);
1562 xfs_close_devices(mp);
1563 xfs_free_fsname(mp);
1564 kfree(mp);
1565}
1566
1527STATIC struct dentry * 1567STATIC struct dentry *
1528xfs_fs_mount( 1568xfs_fs_mount(
1529 struct file_system_type *fs_type, 1569 struct file_system_type *fs_type,
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2b830c2f322e..499058fea303 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations;
72extern const struct xattr_handler *xfs_xattr_handlers[]; 72extern const struct xattr_handler *xfs_xattr_handlers[];
73extern const struct quotactl_ops xfs_quotactl_operations; 73extern const struct quotactl_ops xfs_quotactl_operations;
74 74
75extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
76
75#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 77#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
76 78
77#endif /* __XFS_SUPER_H__ */ 79#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 25791df6f638..3df411eadb86 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -177,7 +177,7 @@ xfs_symlink(
177 int pathlen; 177 int pathlen;
178 struct xfs_bmap_free free_list; 178 struct xfs_bmap_free free_list;
179 xfs_fsblock_t first_block; 179 xfs_fsblock_t first_block;
180 bool unlock_dp_on_error = false; 180 bool unlock_dp_on_error = false;
181 uint cancel_flags; 181 uint cancel_flags;
182 int committed; 182 int committed;
183 xfs_fileoff_t first_fsb; 183 xfs_fileoff_t first_fsb;
@@ -221,7 +221,7 @@ xfs_symlink(
221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, 221 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
222 &udqp, &gdqp, &pdqp); 222 &udqp, &gdqp, &pdqp);
223 if (error) 223 if (error)
224 goto std_return; 224 return error;
225 225
226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); 226 tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK);
227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 227 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
@@ -241,7 +241,7 @@ xfs_symlink(
241 } 241 }
242 if (error) { 242 if (error) {
243 cancel_flags = 0; 243 cancel_flags = 0;
244 goto error_return; 244 goto out_trans_cancel;
245 } 245 }
246 246
247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); 247 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
@@ -252,7 +252,7 @@ xfs_symlink(
252 */ 252 */
253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { 253 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
254 error = -EPERM; 254 error = -EPERM;
255 goto error_return; 255 goto out_trans_cancel;
256 } 256 }
257 257
258 /* 258 /*
@@ -261,7 +261,7 @@ xfs_symlink(
261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, 261 error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
262 pdqp, resblks, 1, 0); 262 pdqp, resblks, 1, 0);
263 if (error) 263 if (error)
264 goto error_return; 264 goto out_trans_cancel;
265 265
266 /* 266 /*
267 * Check for ability to enter directory entry, if no space reserved. 267 * Check for ability to enter directory entry, if no space reserved.
@@ -269,7 +269,7 @@ xfs_symlink(
269 if (!resblks) { 269 if (!resblks) {
270 error = xfs_dir_canenter(tp, dp, link_name); 270 error = xfs_dir_canenter(tp, dp, link_name);
271 if (error) 271 if (error)
272 goto error_return; 272 goto out_trans_cancel;
273 } 273 }
274 /* 274 /*
275 * Initialize the bmap freelist prior to calling either 275 * Initialize the bmap freelist prior to calling either
@@ -282,15 +282,14 @@ xfs_symlink(
282 */ 282 */
283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, 283 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
284 prid, resblks > 0, &ip, NULL); 284 prid, resblks > 0, &ip, NULL);
285 if (error) { 285 if (error)
286 if (error == -ENOSPC) 286 goto out_trans_cancel;
287 goto error_return;
288 goto error1;
289 }
290 287
291 /* 288 /*
292 * An error after we've joined dp to the transaction will result in the 289 * Now we join the directory inode to the transaction. We do not do it
293 * transaction cancel unlocking dp so don't do it explicitly in the 290 * earlier because xfs_dir_ialloc might commit the previous transaction
291 * (and release all the locks). An error from here on will result in
292 * the transaction cancel unlocking dp so don't do it explicitly in the
294 * error path. 293 * error path.
295 */ 294 */
296 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 295 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
@@ -330,7 +329,7 @@ xfs_symlink(
330 XFS_BMAPI_METADATA, &first_block, resblks, 329 XFS_BMAPI_METADATA, &first_block, resblks,
331 mval, &nmaps, &free_list); 330 mval, &nmaps, &free_list);
332 if (error) 331 if (error)
333 goto error2; 332 goto out_bmap_cancel;
334 333
335 if (resblks) 334 if (resblks)
336 resblks -= fs_blocks; 335 resblks -= fs_blocks;
@@ -348,7 +347,7 @@ xfs_symlink(
348 BTOBB(byte_cnt), 0); 347 BTOBB(byte_cnt), 0);
349 if (!bp) { 348 if (!bp) {
350 error = -ENOMEM; 349 error = -ENOMEM;
351 goto error2; 350 goto out_bmap_cancel;
352 } 351 }
353 bp->b_ops = &xfs_symlink_buf_ops; 352 bp->b_ops = &xfs_symlink_buf_ops;
354 353
@@ -378,7 +377,7 @@ xfs_symlink(
378 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 377 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
379 &first_block, &free_list, resblks); 378 &first_block, &free_list, resblks);
380 if (error) 379 if (error)
381 goto error2; 380 goto out_bmap_cancel;
382 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 381 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
383 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 382 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
384 383
@@ -392,10 +391,13 @@ xfs_symlink(
392 } 391 }
393 392
394 error = xfs_bmap_finish(&tp, &free_list, &committed); 393 error = xfs_bmap_finish(&tp, &free_list, &committed);
395 if (error) { 394 if (error)
396 goto error2; 395 goto out_bmap_cancel;
397 } 396
398 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 397 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
398 if (error)
399 goto out_release_inode;
400
399 xfs_qm_dqrele(udqp); 401 xfs_qm_dqrele(udqp);
400 xfs_qm_dqrele(gdqp); 402 xfs_qm_dqrele(gdqp);
401 xfs_qm_dqrele(pdqp); 403 xfs_qm_dqrele(pdqp);
@@ -403,20 +405,28 @@ xfs_symlink(
403 *ipp = ip; 405 *ipp = ip;
404 return 0; 406 return 0;
405 407
406 error2: 408out_bmap_cancel:
407 IRELE(ip);
408 error1:
409 xfs_bmap_cancel(&free_list); 409 xfs_bmap_cancel(&free_list);
410 cancel_flags |= XFS_TRANS_ABORT; 410 cancel_flags |= XFS_TRANS_ABORT;
411 error_return: 411out_trans_cancel:
412 xfs_trans_cancel(tp, cancel_flags); 412 xfs_trans_cancel(tp, cancel_flags);
413out_release_inode:
414 /*
415 * Wait until after the current transaction is aborted to finish the
416 * setup of the inode and release the inode. This prevents recursive
417 * transactions and deadlocks from xfs_inactive.
418 */
419 if (ip) {
420 xfs_finish_inode_setup(ip);
421 IRELE(ip);
422 }
423
413 xfs_qm_dqrele(udqp); 424 xfs_qm_dqrele(udqp);
414 xfs_qm_dqrele(gdqp); 425 xfs_qm_dqrele(gdqp);
415 xfs_qm_dqrele(pdqp); 426 xfs_qm_dqrele(pdqp);
416 427
417 if (unlock_dp_on_error) 428 if (unlock_dp_on_error)
418 xfs_iunlock(dp, XFS_ILOCK_EXCL); 429 xfs_iunlock(dp, XFS_ILOCK_EXCL);
419 std_return:
420 return error; 430 return error;
421} 431}
422 432
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 51372e34d988..615781bf4ee5 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
115 __entry->refcount = refcount; 115 __entry->refcount = refcount;
116 __entry->caller_ip = caller_ip; 116 __entry->caller_ip = caller_ip;
117 ), 117 ),
118 TP_printk("dev %d:%d agno %u refcount %d caller %pf", 118 TP_printk("dev %d:%d agno %u refcount %d caller %ps",
119 MAJOR(__entry->dev), MINOR(__entry->dev), 119 MAJOR(__entry->dev), MINOR(__entry->dev),
120 __entry->agno, 120 __entry->agno,
121 __entry->refcount, 121 __entry->refcount,
@@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert,
239 __entry->caller_ip = caller_ip; 239 __entry->caller_ip = caller_ip;
240 ), 240 ),
241 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 241 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
242 "offset %lld block %lld count %lld flag %d caller %pf", 242 "offset %lld block %lld count %lld flag %d caller %ps",
243 MAJOR(__entry->dev), MINOR(__entry->dev), 243 MAJOR(__entry->dev), MINOR(__entry->dev),
244 __entry->ino, 244 __entry->ino,
245 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 245 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
283 __entry->caller_ip = caller_ip; 283 __entry->caller_ip = caller_ip;
284 ), 284 ),
285 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 285 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
286 "offset %lld block %lld count %lld flag %d caller %pf", 286 "offset %lld block %lld count %lld flag %d caller %ps",
287 MAJOR(__entry->dev), MINOR(__entry->dev), 287 MAJOR(__entry->dev), MINOR(__entry->dev),
288 __entry->ino, 288 __entry->ino,
289 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 289 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
@@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
329 __entry->caller_ip = caller_ip; 329 __entry->caller_ip = caller_ip;
330 ), 330 ),
331 TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " 331 TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d "
332 "lock %d flags %s caller %pf", 332 "lock %d flags %s caller %ps",
333 MAJOR(__entry->dev), MINOR(__entry->dev), 333 MAJOR(__entry->dev), MINOR(__entry->dev),
334 (unsigned long long)__entry->bno, 334 (unsigned long long)__entry->bno,
335 __entry->nblks, 335 __entry->nblks,
@@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
402 __entry->caller_ip = caller_ip; 402 __entry->caller_ip = caller_ip;
403 ), 403 ),
404 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " 404 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
405 "lock %d flags %s caller %pf", 405 "lock %d flags %s caller %ps",
406 MAJOR(__entry->dev), MINOR(__entry->dev), 406 MAJOR(__entry->dev), MINOR(__entry->dev),
407 (unsigned long long)__entry->bno, 407 (unsigned long long)__entry->bno,
408 __entry->buffer_length, 408 __entry->buffer_length,
@@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror,
447 __entry->caller_ip = caller_ip; 447 __entry->caller_ip = caller_ip;
448 ), 448 ),
449 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " 449 TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
450 "lock %d error %d flags %s caller %pf", 450 "lock %d error %d flags %s caller %ps",
451 MAJOR(__entry->dev), MINOR(__entry->dev), 451 MAJOR(__entry->dev), MINOR(__entry->dev),
452 (unsigned long long)__entry->bno, 452 (unsigned long long)__entry->bno,
453 __entry->buffer_length, 453 __entry->buffer_length,
@@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class,
613 __entry->lock_flags = lock_flags; 613 __entry->lock_flags = lock_flags;
614 __entry->caller_ip = caller_ip; 614 __entry->caller_ip = caller_ip;
615 ), 615 ),
616 TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", 616 TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps",
617 MAJOR(__entry->dev), MINOR(__entry->dev), 617 MAJOR(__entry->dev), MINOR(__entry->dev),
618 __entry->ino, 618 __entry->ino,
619 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), 619 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
@@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space);
664DEFINE_INODE_EVENT(xfs_free_file_space); 664DEFINE_INODE_EVENT(xfs_free_file_space);
665DEFINE_INODE_EVENT(xfs_zero_file_space); 665DEFINE_INODE_EVENT(xfs_zero_file_space);
666DEFINE_INODE_EVENT(xfs_collapse_file_space); 666DEFINE_INODE_EVENT(xfs_collapse_file_space);
667DEFINE_INODE_EVENT(xfs_insert_file_space);
667DEFINE_INODE_EVENT(xfs_readdir); 668DEFINE_INODE_EVENT(xfs_readdir);
668#ifdef CONFIG_XFS_POSIX_ACL 669#ifdef CONFIG_XFS_POSIX_ACL
669DEFINE_INODE_EVENT(xfs_get_acl); 670DEFINE_INODE_EVENT(xfs_get_acl);
@@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
685DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); 686DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
686DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); 687DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
687 688
689DEFINE_INODE_EVENT(xfs_filemap_fault);
690DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
691
688DECLARE_EVENT_CLASS(xfs_iref_class, 692DECLARE_EVENT_CLASS(xfs_iref_class,
689 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), 693 TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
690 TP_ARGS(ip, caller_ip), 694 TP_ARGS(ip, caller_ip),
@@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
702 __entry->pincount = atomic_read(&ip->i_pincount); 706 __entry->pincount = atomic_read(&ip->i_pincount);
703 __entry->caller_ip = caller_ip; 707 __entry->caller_ip = caller_ip;
704 ), 708 ),
705 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", 709 TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps",
706 MAJOR(__entry->dev), MINOR(__entry->dev), 710 MAJOR(__entry->dev), MINOR(__entry->dev),
707 __entry->ino, 711 __entry->ino,
708 __entry->count, 712 __entry->count,
@@ -1217,6 +1221,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
1217DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); 1221DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
1218DEFINE_IOMAP_EVENT(xfs_get_blocks_found); 1222DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
1219DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); 1223DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
1224DEFINE_IOMAP_EVENT(xfs_gbmap_direct);
1225DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new);
1226DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update);
1227DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none);
1228DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio);
1220 1229
1221DECLARE_EVENT_CLASS(xfs_simple_io_class, 1230DECLARE_EVENT_CLASS(xfs_simple_io_class,
1222 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 1231 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -1333,7 +1342,7 @@ TRACE_EVENT(xfs_bunmap,
 		__entry->flags = flags;
 	),
 	TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
-		  "flags %s caller %pf",
+		  "flags %s caller %ps",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->size,
@@ -1466,7 +1475,7 @@ TRACE_EVENT(xfs_agf,
 	),
 	TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
 		  "levels b %u c %u flfirst %u fllast %u flcount %u "
-		  "freeblks %u longest %u caller %pf",
+		  "freeblks %u longest %u caller %ps",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
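Every TP_printk change in this file is the same one-character fix: caller_ip is a plain return address saved with _RET_IP_, and vsprintf's %ps prints the symbol for a direct text address, while %pf expects a function descriptor and only dereferences correctly on descriptor ABIs such as ia64 and ppc64. A one-line illustration (an ordinary printk, not taken from the patch):

printk(KERN_DEBUG "caller %ps\n", (void *)_RET_IP_);	/* prints e.g. "caller xfs_ilock" */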
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index eb90cd59a0ec..220ef2c906b2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -173,7 +173,7 @@ xfs_trans_reserve(
 	uint			rtextents)
 {
 	int			error = 0;
-	int			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	bool			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
 	/* Mark this thread as being in a transaction */
 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
@@ -184,8 +184,7 @@ xfs_trans_reserve(
 	 * fail if the count would go below zero.
	 */
 	if (blocks > 0) {
-		error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
-						 -((int64_t)blocks), rsvd);
+		error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
 		if (error != 0) {
 			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 			return -ENOSPC;
@@ -236,8 +235,7 @@
 	 * fail if the count would go below zero.
	 */
 	if (rtextents > 0) {
-		error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
-					  -((int64_t)rtextents), rsvd);
+		error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
 		if (error) {
 			error = -ENOSPC;
 			goto undo_log;
@@ -268,8 +266,7 @@ undo_log:
 
 undo_blocks:
 	if (blocks > 0) {
-		xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS,
-					 (int64_t)blocks, rsvd);
+		xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
 		tp->t_blk_res = 0;
 	}
 
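These three hunks replace the generic xfs_icsb_modify_counters()/xfs_mod_incore_sb() calls with per-counter helpers (xfs_mod_fdblocks(), xfs_mod_frextents()). A toy model of the contract as I read it (names, values, and the reserve-pool behaviour are illustrative, not the kernel implementation): the helper refuses to drive the counter negative, the reserve path passes a negative delta, and the undo path hands the same magnitude back as a positive delta.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

static int64_t fdblocks = 1024;		/* toy stand-in for the free-block counter */

static int mod_fdblocks(int64_t delta, bool rsvd)
{
	int64_t floor = rsvd ? 0 : 64;	/* rsvd transactions may dip into the reserve pool */

	if (fdblocks + delta < floor)
		return -ENOSPC;		/* caller takes its undo path */
	fdblocks += delta;
	return 0;
}

/* reserve:  error = mod_fdblocks(-(int64_t)blocks, rsvd);
 * undo:             mod_fdblocks((int64_t)blocks, rsvd);	*/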
@@ -488,6 +485,54 @@ xfs_trans_apply_sb_deltas(
 			sizeof(sbp->sb_frextents) - 1);
 }
 
+STATIC int
+xfs_sb_mod8(
+	uint8_t			*field,
+	int8_t			delta)
+{
+	int8_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+STATIC int
+xfs_sb_mod32(
+	uint32_t		*field,
+	int32_t			delta)
+{
+	int32_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
+STATIC int
+xfs_sb_mod64(
+	uint64_t		*field,
+	int64_t			delta)
+{
+	int64_t			counter = *field;
+
+	counter += delta;
+	if (counter < 0) {
+		ASSERT(0);
+		return -EINVAL;
+	}
+	*field = counter;
+	return 0;
+}
+
 /*
  * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
  * and apply superblock counter changes to the in-core superblock. The
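The new xfs_sb_mod8/32/64 helpers come in three widths so the signed intermediate matches the width of the unsigned superblock field it guards against underflow; their callers (added below) hold m_sb_lock, which is why plain loads and stores suffice. A standalone model of the pattern, as a sketch rather than the kernel code:

#include <errno.h>
#include <stdint.h>

static int sb_mod32(uint32_t *field, int32_t delta)
{
	int32_t counter = *field;	/* signed copy at the field's own width */

	counter += delta;
	if (counter < 0)
		return -EINVAL;		/* would underflow; caller unwinds */
	*field = counter;
	return 0;
}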
@@ -495,13 +540,6 @@ xfs_trans_apply_sb_deltas(
  * applied to the in-core superblock. The idea is that that has already been
  * done.
  *
- * This is done efficiently with a single call to xfs_mod_incore_sb_batch().
- * However, we have to ensure that we only modify each superblock field only
- * once because the application of the delta values may not be atomic. That can
- * lead to ENOSPC races occurring if we have two separate modifcations of the
- * free space counter to put back the entire reservation and then take away
- * what we used.
- *
  * If we are not logging superblock counters, then the inode allocated/free and
  * used block counts are not updated in the on disk superblock. In this case,
  * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
@@ -509,21 +547,15 @@
  */
 void
 xfs_trans_unreserve_and_mod_sb(
-	xfs_trans_t	*tp)
+	struct xfs_trans	*tp)
 {
-	xfs_mod_sb_t	msb[9];	/* If you add cases, add entries */
-	xfs_mod_sb_t	*msbp;
-	xfs_mount_t	*mp = tp->t_mountp;
-	/* REFERENCED */
-	int		error;
-	int		rsvd;
-	int64_t		blkdelta = 0;
-	int64_t		rtxdelta = 0;
-	int64_t		idelta = 0;
-	int64_t		ifreedelta = 0;
-
-	msbp = msb;
-	rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	struct xfs_mount	*mp = tp->t_mountp;
+	bool			rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
+	int64_t			blkdelta = 0;
+	int64_t			rtxdelta = 0;
+	int64_t			idelta = 0;
+	int64_t			ifreedelta = 0;
+	int			error;
 
 	/* calculate deltas */
 	if (tp->t_blk_res > 0)
@@ -547,97 +579,115 @@ xfs_trans_unreserve_and_mod_sb(
 
 	/* apply the per-cpu counters */
 	if (blkdelta) {
-		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
-						 blkdelta, rsvd);
+		error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
 		if (error)
 			goto out;
 	}
 
 	if (idelta) {
-		error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT,
-						 idelta, rsvd);
+		error = xfs_mod_icount(mp, idelta);
 		if (error)
 			goto out_undo_fdblocks;
 	}
 
 	if (ifreedelta) {
-		error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE,
-						 ifreedelta, rsvd);
+		error = xfs_mod_ifree(mp, ifreedelta);
 		if (error)
 			goto out_undo_icount;
 	}
 
+	if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
+		return;
+
 	/* apply remaining deltas */
-	if (rtxdelta != 0) {
-		msbp->msb_field = XFS_SBS_FREXTENTS;
-		msbp->msb_delta = rtxdelta;
-		msbp++;
+	spin_lock(&mp->m_sb_lock);
+	if (rtxdelta) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
+		if (error)
+			goto out_undo_ifree;
 	}
 
-	if (tp->t_flags & XFS_TRANS_SB_DIRTY) {
-		if (tp->t_dblocks_delta != 0) {
-			msbp->msb_field = XFS_SBS_DBLOCKS;
-			msbp->msb_delta = tp->t_dblocks_delta;
-			msbp++;
-		}
-		if (tp->t_agcount_delta != 0) {
-			msbp->msb_field = XFS_SBS_AGCOUNT;
-			msbp->msb_delta = tp->t_agcount_delta;
-			msbp++;
-		}
-		if (tp->t_imaxpct_delta != 0) {
-			msbp->msb_field = XFS_SBS_IMAX_PCT;
-			msbp->msb_delta = tp->t_imaxpct_delta;
-			msbp++;
-		}
-		if (tp->t_rextsize_delta != 0) {
-			msbp->msb_field = XFS_SBS_REXTSIZE;
-			msbp->msb_delta = tp->t_rextsize_delta;
-			msbp++;
-		}
-		if (tp->t_rbmblocks_delta != 0) {
-			msbp->msb_field = XFS_SBS_RBMBLOCKS;
-			msbp->msb_delta = tp->t_rbmblocks_delta;
-			msbp++;
-		}
-		if (tp->t_rblocks_delta != 0) {
-			msbp->msb_field = XFS_SBS_RBLOCKS;
-			msbp->msb_delta = tp->t_rblocks_delta;
-			msbp++;
-		}
-		if (tp->t_rextents_delta != 0) {
-			msbp->msb_field = XFS_SBS_REXTENTS;
-			msbp->msb_delta = tp->t_rextents_delta;
-			msbp++;
-		}
-		if (tp->t_rextslog_delta != 0) {
-			msbp->msb_field = XFS_SBS_REXTSLOG;
-			msbp->msb_delta = tp->t_rextslog_delta;
-			msbp++;
-		}
+	if (tp->t_dblocks_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
+		if (error)
+			goto out_undo_frextents;
 	}
-
-	/*
-	 * If we need to change anything, do it.
-	 */
-	if (msbp > msb) {
-		error = xfs_mod_incore_sb_batch(tp->t_mountp, msb,
-			(uint)(msbp - msb), rsvd);
+	if (tp->t_agcount_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
 		if (error)
-			goto out_undo_ifreecount;
+			goto out_undo_dblocks;
 	}
-
+	if (tp->t_imaxpct_delta != 0) {
+		error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
+		if (error)
+			goto out_undo_agcount;
+	}
+	if (tp->t_rextsize_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
+				     tp->t_rextsize_delta);
+		if (error)
+			goto out_undo_imaxpct;
+	}
+	if (tp->t_rbmblocks_delta != 0) {
+		error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
+				     tp->t_rbmblocks_delta);
+		if (error)
+			goto out_undo_rextsize;
+	}
+	if (tp->t_rblocks_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
+		if (error)
+			goto out_undo_rbmblocks;
+	}
+	if (tp->t_rextents_delta != 0) {
+		error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
+				     tp->t_rextents_delta);
+		if (error)
+			goto out_undo_rblocks;
+	}
+	if (tp->t_rextslog_delta != 0) {
+		error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
+				    tp->t_rextslog_delta);
+		if (error)
+			goto out_undo_rextents;
+	}
+	spin_unlock(&mp->m_sb_lock);
 	return;
 
-out_undo_ifreecount:
+out_undo_rextents:
+	if (tp->t_rextents_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
+out_undo_rblocks:
+	if (tp->t_rblocks_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
+out_undo_rbmblocks:
+	if (tp->t_rbmblocks_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
+out_undo_rextsize:
+	if (tp->t_rextsize_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
+out_undo_imaxpct:
+	if (tp->t_imaxpct_delta)
+		xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
+out_undo_agcount:
+	if (tp->t_agcount_delta)
+		xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
+out_undo_dblocks:
+	if (tp->t_dblocks_delta)
+		xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
+out_undo_frextents:
+	if (rtxdelta)
+		xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
+out_undo_ifree:
+	spin_unlock(&mp->m_sb_lock);
 	if (ifreedelta)
-		xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd);
+		xfs_mod_ifree(mp, -ifreedelta);
 out_undo_icount:
 	if (idelta)
-		xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd);
+		xfs_mod_icount(mp, -idelta);
 out_undo_fdblocks:
 	if (blkdelta)
-		xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd);
+		xfs_mod_fdblocks(mp, -blkdelta, rsvd);
 out:
 	ASSERT(error == 0);
 	return;
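The rewritten xfs_trans_unreserve_and_mod_sb() is a textbook instance of the kernel's reverse-order unwind idiom: each successfully applied delta gains a label, and a failure jumps to the label that rolls back exactly the deltas applied so far, newest first. A self-contained sketch of the shape, with hypothetical step()/unstep() helpers rather than XFS code:

#include <errno.h>

static int step(int *x)    { if (*x < 0) return -EINVAL; (*x)++; return 0; }
static void unstep(int *x) { (*x)--; }

static int apply_three(int *a, int *b, int *c)
{
	int error;

	error = step(a);
	if (error)
		goto out;
	error = step(b);
	if (error)
		goto out_undo_a;
	error = step(c);
	if (error)
		goto out_undo_b;
	return 0;

out_undo_b:			/* roll back in reverse order */
	unstep(b);
out_undo_a:
	unstep(a);
out:
	return error;
}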