aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/adfs/inode.c15
-rw-r--r--fs/affs/file.c18
-rw-r--r--fs/affs/inode.c5
-rw-r--r--fs/bfs/file.c15
-rw-r--r--fs/btrfs/ctree.c14
-rw-r--r--fs/btrfs/inode.c16
-rw-r--r--fs/cachefiles/interface.c57
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/key.c2
-rw-r--r--fs/cachefiles/namei.c3
-rw-r--r--fs/cachefiles/rdwr.c114
-rw-r--r--fs/cachefiles/xattr.c2
-rw-r--r--fs/ceph/addr.c60
-rw-r--r--fs/ceph/caps.c18
-rw-r--r--fs/ceph/file.c73
-rw-r--r--fs/ceph/inode.c15
-rw-r--r--fs/ceph/mds_client.c11
-rw-r--r--fs/ceph/super.c4
-rw-r--r--fs/dcache.c35
-rw-r--r--fs/exec.c21
-rw-r--r--fs/exportfs/expfs.c4
-rw-r--r--fs/f2fs/Kconfig53
-rw-r--r--fs/f2fs/Makefile7
-rw-r--r--fs/f2fs/acl.c414
-rw-r--r--fs/f2fs/acl.h57
-rw-r--r--fs/f2fs/checkpoint.c794
-rw-r--r--fs/f2fs/data.c702
-rw-r--r--fs/f2fs/debug.c361
-rw-r--r--fs/f2fs/dir.c672
-rw-r--r--fs/f2fs/f2fs.h1083
-rw-r--r--fs/f2fs/file.c636
-rw-r--r--fs/f2fs/gc.c742
-rw-r--r--fs/f2fs/gc.h117
-rw-r--r--fs/f2fs/hash.c97
-rw-r--r--fs/f2fs/inode.c268
-rw-r--r--fs/f2fs/namei.c503
-rw-r--r--fs/f2fs/node.c1764
-rw-r--r--fs/f2fs/node.h353
-rw-r--r--fs/f2fs/recovery.c375
-rw-r--r--fs/f2fs/segment.c1791
-rw-r--r--fs/f2fs/segment.h618
-rw-r--r--fs/f2fs/super.c657
-rw-r--r--fs/f2fs/xattr.c440
-rw-r--r--fs/f2fs/xattr.h145
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file_table.c2
-rw-r--r--fs/fscache/cache.c8
-rw-r--r--fs/fscache/cookie.c78
-rw-r--r--fs/fscache/internal.h15
-rw-r--r--fs/fscache/object-list.c2
-rw-r--r--fs/fscache/object.c101
-rw-r--r--fs/fscache/operation.c140
-rw-r--r--fs/fscache/page.c195
-rw-r--r--fs/fscache/stats.c17
-rw-r--r--fs/hfs/inode.c26
-rw-r--r--fs/hfsplus/inode.c27
-rw-r--r--fs/hpfs/file.c20
-rw-r--r--fs/hpfs/hpfs_fn.h1
-rw-r--r--fs/hpfs/inode.c5
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/inode.c20
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/logfs/readwrite.c10
-rw-r--r--fs/minix/file.c6
-rw-r--r--fs/minix/inode.c17
-rw-r--r--fs/namei.c118
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/ncpfs/inode.c4
-rw-r--r--fs/nfs/fscache.c1
-rw-r--r--fs/nfs/fscache.h20
-rw-r--r--fs/nfs/inode.c20
-rw-r--r--fs/nfs/nfs4file.c2
-rw-r--r--fs/nfs/nfs4proc.c3
-rw-r--r--fs/nfs/write.c3
-rw-r--r--fs/nfsd/fault_inject.c113
-rw-r--r--fs/nfsd/fault_inject.h28
-rw-r--r--fs/nfsd/netns.h66
-rw-r--r--fs/nfsd/nfs2acl.c2
-rw-r--r--fs/nfsd/nfs3acl.c2
-rw-r--r--fs/nfsd/nfs3proc.c6
-rw-r--r--fs/nfsd/nfs3xdr.c47
-rw-r--r--fs/nfsd/nfs4callback.c69
-rw-r--r--fs/nfsd/nfs4proc.c74
-rw-r--r--fs/nfsd/nfs4recover.c561
-rw-r--r--fs/nfsd/nfs4state.c1015
-rw-r--r--fs/nfsd/nfs4xdr.c324
-rw-r--r--fs/nfsd/nfsctl.c100
-rw-r--r--fs/nfsd/nfsd.h36
-rw-r--r--fs/nfsd/nfsfh.c4
-rw-r--r--fs/nfsd/nfssvc.c203
-rw-r--r--fs/nfsd/nfsxdr.c11
-rw-r--r--fs/nfsd/state.h64
-rw-r--r--fs/nfsd/vfs.c53
-rw-r--r--fs/nfsd/xdr4.h15
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--fs/nilfs2/inode.c24
-rw-r--r--fs/nilfs2/nilfs.h1
-rw-r--r--fs/nilfs2/recovery.c3
-rw-r--r--fs/ntfs/file.c16
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ntfs/inode.h4
-rw-r--r--fs/ocfs2/file.c18
-rw-r--r--fs/omfs/file.c22
-rw-r--r--fs/open.c97
-rw-r--r--fs/proc/base.c7
-rw-r--r--fs/proc/generic.c9
-rw-r--r--fs/proc/proc_sysctl.c7
-rw-r--r--fs/reiserfs/file.c3
-rw-r--r--fs/reiserfs/inode.c15
-rw-r--r--fs/reiserfs/reiserfs.h1
-rw-r--r--fs/stat.c16
-rw-r--r--fs/statfs.c9
-rw-r--r--fs/sysv/file.c5
-rw-r--r--fs/sysv/itree.c17
-rw-r--r--fs/ufs/inode.c15
-rw-r--r--fs/utimes.c6
-rw-r--r--fs/xattr.c72
119 files changed, 15683 insertions, 1514 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index eaff24a19502..cfe512fd1caf 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -220,6 +220,7 @@ source "fs/pstore/Kconfig"
220source "fs/sysv/Kconfig" 220source "fs/sysv/Kconfig"
221source "fs/ufs/Kconfig" 221source "fs/ufs/Kconfig"
222source "fs/exofs/Kconfig" 222source "fs/exofs/Kconfig"
223source "fs/f2fs/Kconfig"
223 224
224endif # MISC_FILESYSTEMS 225endif # MISC_FILESYSTEMS
225 226
diff --git a/fs/Makefile b/fs/Makefile
index 1d7af79288a0..9d53192236fc 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
123obj-$(CONFIG_OCFS2_FS) += ocfs2/ 123obj-$(CONFIG_OCFS2_FS) += ocfs2/
124obj-$(CONFIG_BTRFS_FS) += btrfs/ 124obj-$(CONFIG_BTRFS_FS) += btrfs/
125obj-$(CONFIG_GFS2_FS) += gfs2/ 125obj-$(CONFIG_GFS2_FS) += gfs2/
126obj-$(CONFIG_F2FS_FS) += f2fs/
126obj-y += exofs/ # Multiple modules 127obj-y += exofs/ # Multiple modules
127obj-$(CONFIG_CEPH_FS) += ceph/ 128obj-$(CONFIG_CEPH_FS) += ceph/
128obj-$(CONFIG_PSTORE) += pstore/ 129obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e9bad5093a3f..5f95d1ed9c6d 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -45,6 +45,14 @@ static int adfs_readpage(struct file *file, struct page *page)
45 return block_read_full_page(page, adfs_get_block); 45 return block_read_full_page(page, adfs_get_block);
46} 46}
47 47
48static void adfs_write_failed(struct address_space *mapping, loff_t to)
49{
50 struct inode *inode = mapping->host;
51
52 if (to > inode->i_size)
53 truncate_pagecache(inode, to, inode->i_size);
54}
55
48static int adfs_write_begin(struct file *file, struct address_space *mapping, 56static int adfs_write_begin(struct file *file, struct address_space *mapping,
49 loff_t pos, unsigned len, unsigned flags, 57 loff_t pos, unsigned len, unsigned flags,
50 struct page **pagep, void **fsdata) 58 struct page **pagep, void **fsdata)
@@ -55,11 +63,8 @@ static int adfs_write_begin(struct file *file, struct address_space *mapping,
55 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 63 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
56 adfs_get_block, 64 adfs_get_block,
57 &ADFS_I(mapping->host)->mmu_private); 65 &ADFS_I(mapping->host)->mmu_private);
58 if (unlikely(ret)) { 66 if (unlikely(ret))
59 loff_t isize = mapping->host->i_size; 67 adfs_write_failed(mapping, pos + len);
60 if (pos + len > isize)
61 vmtruncate(mapping->host, isize);
62 }
63 68
64 return ret; 69 return ret;
65} 70}
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 2f4c935cb327..af3261b78102 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -39,7 +39,6 @@ const struct file_operations affs_file_operations = {
39}; 39};
40 40
41const struct inode_operations affs_file_inode_operations = { 41const struct inode_operations affs_file_inode_operations = {
42 .truncate = affs_truncate,
43 .setattr = affs_notify_change, 42 .setattr = affs_notify_change,
44}; 43};
45 44
@@ -402,6 +401,16 @@ static int affs_readpage(struct file *file, struct page *page)
402 return block_read_full_page(page, affs_get_block); 401 return block_read_full_page(page, affs_get_block);
403} 402}
404 403
404static void affs_write_failed(struct address_space *mapping, loff_t to)
405{
406 struct inode *inode = mapping->host;
407
408 if (to > inode->i_size) {
409 truncate_pagecache(inode, to, inode->i_size);
410 affs_truncate(inode);
411 }
412}
413
405static int affs_write_begin(struct file *file, struct address_space *mapping, 414static int affs_write_begin(struct file *file, struct address_space *mapping,
406 loff_t pos, unsigned len, unsigned flags, 415 loff_t pos, unsigned len, unsigned flags,
407 struct page **pagep, void **fsdata) 416 struct page **pagep, void **fsdata)
@@ -412,11 +421,8 @@ static int affs_write_begin(struct file *file, struct address_space *mapping,
412 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 421 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
413 affs_get_block, 422 affs_get_block,
414 &AFFS_I(mapping->host)->mmu_private); 423 &AFFS_I(mapping->host)->mmu_private);
415 if (unlikely(ret)) { 424 if (unlikely(ret))
416 loff_t isize = mapping->host->i_size; 425 affs_write_failed(mapping, pos + len);
417 if (pos + len > isize)
418 vmtruncate(mapping->host, isize);
419 }
420 426
421 return ret; 427 return ret;
422} 428}
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 15c484268229..0e092d08680e 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -237,9 +237,12 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
237 237
238 if ((attr->ia_valid & ATTR_SIZE) && 238 if ((attr->ia_valid & ATTR_SIZE) &&
239 attr->ia_size != i_size_read(inode)) { 239 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size); 240 error = inode_newsize_ok(inode, attr->ia_size);
241 if (error) 241 if (error)
242 return error; 242 return error;
243
244 truncate_setsize(inode, attr->ia_size);
245 affs_truncate(inode);
243 } 246 }
244 247
245 setattr_copy(inode, attr); 248 setattr_copy(inode, attr);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index f20e8a71062f..ad3ea1497cc3 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -161,6 +161,14 @@ static int bfs_readpage(struct file *file, struct page *page)
161 return block_read_full_page(page, bfs_get_block); 161 return block_read_full_page(page, bfs_get_block);
162} 162}
163 163
164static void bfs_write_failed(struct address_space *mapping, loff_t to)
165{
166 struct inode *inode = mapping->host;
167
168 if (to > inode->i_size)
169 truncate_pagecache(inode, to, inode->i_size);
170}
171
164static int bfs_write_begin(struct file *file, struct address_space *mapping, 172static int bfs_write_begin(struct file *file, struct address_space *mapping,
165 loff_t pos, unsigned len, unsigned flags, 173 loff_t pos, unsigned len, unsigned flags,
166 struct page **pagep, void **fsdata) 174 struct page **pagep, void **fsdata)
@@ -169,11 +177,8 @@ static int bfs_write_begin(struct file *file, struct address_space *mapping,
169 177
170 ret = block_write_begin(mapping, pos, len, flags, pagep, 178 ret = block_write_begin(mapping, pos, len, flags, pagep,
171 bfs_get_block); 179 bfs_get_block);
172 if (unlikely(ret)) { 180 if (unlikely(ret))
173 loff_t isize = mapping->host->i_size; 181 bfs_write_failed(mapping, pos + len);
174 if (pos + len > isize)
175 vmtruncate(mapping->host, isize);
176 }
177 182
178 return ret; 183 return ret;
179} 184}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c7b67cf24bba..eea5da7a2b9a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,13 +1138,13 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
1138 switch (tm->op) { 1138 switch (tm->op) {
1139 case MOD_LOG_KEY_REMOVE_WHILE_FREEING: 1139 case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
1140 BUG_ON(tm->slot < n); 1140 BUG_ON(tm->slot < n);
1141 case MOD_LOG_KEY_REMOVE:
1142 n++;
1143 case MOD_LOG_KEY_REMOVE_WHILE_MOVING: 1141 case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1142 case MOD_LOG_KEY_REMOVE:
1144 btrfs_set_node_key(eb, &tm->key, tm->slot); 1143 btrfs_set_node_key(eb, &tm->key, tm->slot);
1145 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); 1144 btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
1146 btrfs_set_node_ptr_generation(eb, tm->slot, 1145 btrfs_set_node_ptr_generation(eb, tm->slot,
1147 tm->generation); 1146 tm->generation);
1147 n++;
1148 break; 1148 break;
1149 case MOD_LOG_KEY_REPLACE: 1149 case MOD_LOG_KEY_REPLACE:
1150 BUG_ON(tm->slot >= n); 1150 BUG_ON(tm->slot >= n);
@@ -4611,12 +4611,6 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4611 u32 nritems; 4611 u32 nritems;
4612 int ret; 4612 int ret;
4613 4613
4614 if (level) {
4615 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4616 MOD_LOG_KEY_REMOVE);
4617 BUG_ON(ret < 0);
4618 }
4619
4620 nritems = btrfs_header_nritems(parent); 4614 nritems = btrfs_header_nritems(parent);
4621 if (slot != nritems - 1) { 4615 if (slot != nritems - 1) {
4622 if (level) 4616 if (level)
@@ -4627,6 +4621,10 @@ static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4627 btrfs_node_key_ptr_offset(slot + 1), 4621 btrfs_node_key_ptr_offset(slot + 1),
4628 sizeof(struct btrfs_key_ptr) * 4622 sizeof(struct btrfs_key_ptr) *
4629 (nritems - slot - 1)); 4623 (nritems - slot - 1));
4624 } else if (level) {
4625 ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
4626 MOD_LOG_KEY_REMOVE);
4627 BUG_ON(ret < 0);
4630 } 4628 }
4631 4629
4632 nritems--; 4630 nritems--;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 67ed24ae86bb..16d9e8e191e6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4262,16 +4262,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4262 if (dentry->d_name.len > BTRFS_NAME_LEN) 4262 if (dentry->d_name.len > BTRFS_NAME_LEN)
4263 return ERR_PTR(-ENAMETOOLONG); 4263 return ERR_PTR(-ENAMETOOLONG);
4264 4264
4265 if (unlikely(d_need_lookup(dentry))) { 4265 ret = btrfs_inode_by_name(dir, dentry, &location);
4266 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4267 kfree(dentry->d_fsdata);
4268 dentry->d_fsdata = NULL;
4269 /* This thing is hashed, drop it for now */
4270 d_drop(dentry);
4271 } else {
4272 ret = btrfs_inode_by_name(dir, dentry, &location);
4273 }
4274
4275 if (ret < 0) 4266 if (ret < 0)
4276 return ERR_PTR(ret); 4267 return ERR_PTR(ret);
4277 4268
@@ -4341,11 +4332,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4341 struct dentry *ret; 4332 struct dentry *ret;
4342 4333
4343 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry); 4334 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4344 if (unlikely(d_need_lookup(dentry))) {
4345 spin_lock(&dentry->d_lock);
4346 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
4347 spin_unlock(&dentry->d_lock);
4348 }
4349 return ret; 4335 return ret;
4350} 4336}
4351 4337
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 67bef6d01484..746ce532e130 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -41,12 +41,12 @@ static struct fscache_object *cachefiles_alloc_object(
41 41
42 _enter("{%s},%p,", cache->cache.identifier, cookie); 42 _enter("{%s},%p,", cache->cache.identifier, cookie);
43 43
44 lookup_data = kmalloc(sizeof(*lookup_data), GFP_KERNEL); 44 lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp);
45 if (!lookup_data) 45 if (!lookup_data)
46 goto nomem_lookup_data; 46 goto nomem_lookup_data;
47 47
48 /* create a new object record and a temporary leaf image */ 48 /* create a new object record and a temporary leaf image */
49 object = kmem_cache_alloc(cachefiles_object_jar, GFP_KERNEL); 49 object = kmem_cache_alloc(cachefiles_object_jar, cachefiles_gfp);
50 if (!object) 50 if (!object)
51 goto nomem_object; 51 goto nomem_object;
52 52
@@ -63,7 +63,7 @@ static struct fscache_object *cachefiles_alloc_object(
63 * - stick the length on the front and leave space on the back for the 63 * - stick the length on the front and leave space on the back for the
64 * encoder 64 * encoder
65 */ 65 */
66 buffer = kmalloc((2 + 512) + 3, GFP_KERNEL); 66 buffer = kmalloc((2 + 512) + 3, cachefiles_gfp);
67 if (!buffer) 67 if (!buffer)
68 goto nomem_buffer; 68 goto nomem_buffer;
69 69
@@ -219,7 +219,7 @@ static void cachefiles_update_object(struct fscache_object *_object)
219 return; 219 return;
220 } 220 }
221 221
222 auxdata = kmalloc(2 + 512 + 3, GFP_KERNEL); 222 auxdata = kmalloc(2 + 512 + 3, cachefiles_gfp);
223 if (!auxdata) { 223 if (!auxdata) {
224 _leave(" [nomem]"); 224 _leave(" [nomem]");
225 return; 225 return;
@@ -441,6 +441,54 @@ truncate_failed:
441} 441}
442 442
443/* 443/*
444 * Invalidate an object
445 */
446static void cachefiles_invalidate_object(struct fscache_operation *op)
447{
448 struct cachefiles_object *object;
449 struct cachefiles_cache *cache;
450 const struct cred *saved_cred;
451 struct path path;
452 uint64_t ni_size;
453 int ret;
454
455 object = container_of(op->object, struct cachefiles_object, fscache);
456 cache = container_of(object->fscache.cache,
457 struct cachefiles_cache, cache);
458
459 op->object->cookie->def->get_attr(op->object->cookie->netfs_data,
460 &ni_size);
461
462 _enter("{OBJ%x},[%llu]",
463 op->object->debug_id, (unsigned long long)ni_size);
464
465 if (object->backer) {
466 ASSERT(S_ISREG(object->backer->d_inode->i_mode));
467
468 fscache_set_store_limit(&object->fscache, ni_size);
469
470 path.dentry = object->backer;
471 path.mnt = cache->mnt;
472
473 cachefiles_begin_secure(cache, &saved_cred);
474 ret = vfs_truncate(&path, 0);
475 if (ret == 0)
476 ret = vfs_truncate(&path, ni_size);
477 cachefiles_end_secure(cache, saved_cred);
478
479 if (ret != 0) {
480 fscache_set_store_limit(&object->fscache, 0);
481 if (ret == -EIO)
482 cachefiles_io_error_obj(object,
483 "Invalidate failed");
484 }
485 }
486
487 fscache_op_complete(op, true);
488 _leave("");
489}
490
491/*
444 * dissociate a cache from all the pages it was backing 492 * dissociate a cache from all the pages it was backing
445 */ 493 */
446static void cachefiles_dissociate_pages(struct fscache_cache *cache) 494static void cachefiles_dissociate_pages(struct fscache_cache *cache)
@@ -455,6 +503,7 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
455 .lookup_complete = cachefiles_lookup_complete, 503 .lookup_complete = cachefiles_lookup_complete,
456 .grab_object = cachefiles_grab_object, 504 .grab_object = cachefiles_grab_object,
457 .update_object = cachefiles_update_object, 505 .update_object = cachefiles_update_object,
506 .invalidate_object = cachefiles_invalidate_object,
458 .drop_object = cachefiles_drop_object, 507 .drop_object = cachefiles_drop_object,
459 .put_object = cachefiles_put_object, 508 .put_object = cachefiles_put_object,
460 .sync_cache = cachefiles_sync_cache, 509 .sync_cache = cachefiles_sync_cache,
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index bd6bc1bde2d7..49382519907a 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -23,6 +23,8 @@ extern unsigned cachefiles_debug;
23#define CACHEFILES_DEBUG_KLEAVE 2 23#define CACHEFILES_DEBUG_KLEAVE 2
24#define CACHEFILES_DEBUG_KDEBUG 4 24#define CACHEFILES_DEBUG_KDEBUG 4
25 25
26#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
27
26/* 28/*
27 * node records 29 * node records
28 */ 30 */
diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c
index 81b8b2b3a674..33b58c60f2d1 100644
--- a/fs/cachefiles/key.c
+++ b/fs/cachefiles/key.c
@@ -78,7 +78,7 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type)
78 78
79 _debug("max: %d", max); 79 _debug("max: %d", max);
80 80
81 key = kmalloc(max, GFP_KERNEL); 81 key = kmalloc(max, cachefiles_gfp);
82 if (!key) 82 if (!key)
83 return NULL; 83 return NULL;
84 84
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index b0b5f7cdfffa..8c01c5fcdf75 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -40,8 +40,7 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", 40 printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
41 prefix, fscache_object_states[object->fscache.state], 41 prefix, fscache_object_states[object->fscache.state],
42 object->fscache.flags, work_busy(&object->fscache.work), 42 object->fscache.flags, work_busy(&object->fscache.work),
43 object->fscache.events, 43 object->fscache.events, object->fscache.event_mask);
44 object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
45 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", 44 printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
46 prefix, object->fscache.n_ops, object->fscache.n_in_progress, 45 prefix, object->fscache.n_ops, object->fscache.n_in_progress,
47 object->fscache.n_exclusive); 46 object->fscache.n_exclusive);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index c994691d9445..480992259707 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -77,25 +77,25 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
77 struct page *backpage = monitor->back_page, *backpage2; 77 struct page *backpage = monitor->back_page, *backpage2;
78 int ret; 78 int ret;
79 79
80 kenter("{ino=%lx},{%lx,%lx}", 80 _enter("{ino=%lx},{%lx,%lx}",
81 object->backer->d_inode->i_ino, 81 object->backer->d_inode->i_ino,
82 backpage->index, backpage->flags); 82 backpage->index, backpage->flags);
83 83
84 /* skip if the page was truncated away completely */ 84 /* skip if the page was truncated away completely */
85 if (backpage->mapping != bmapping) { 85 if (backpage->mapping != bmapping) {
86 kleave(" = -ENODATA [mapping]"); 86 _leave(" = -ENODATA [mapping]");
87 return -ENODATA; 87 return -ENODATA;
88 } 88 }
89 89
90 backpage2 = find_get_page(bmapping, backpage->index); 90 backpage2 = find_get_page(bmapping, backpage->index);
91 if (!backpage2) { 91 if (!backpage2) {
92 kleave(" = -ENODATA [gone]"); 92 _leave(" = -ENODATA [gone]");
93 return -ENODATA; 93 return -ENODATA;
94 } 94 }
95 95
96 if (backpage != backpage2) { 96 if (backpage != backpage2) {
97 put_page(backpage2); 97 put_page(backpage2);
98 kleave(" = -ENODATA [different]"); 98 _leave(" = -ENODATA [different]");
99 return -ENODATA; 99 return -ENODATA;
100 } 100 }
101 101
@@ -114,7 +114,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
114 if (PageUptodate(backpage)) 114 if (PageUptodate(backpage))
115 goto unlock_discard; 115 goto unlock_discard;
116 116
117 kdebug("reissue read"); 117 _debug("reissue read");
118 ret = bmapping->a_ops->readpage(NULL, backpage); 118 ret = bmapping->a_ops->readpage(NULL, backpage);
119 if (ret < 0) 119 if (ret < 0)
120 goto unlock_discard; 120 goto unlock_discard;
@@ -129,7 +129,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
129 } 129 }
130 130
131 /* it'll reappear on the todo list */ 131 /* it'll reappear on the todo list */
132 kleave(" = -EINPROGRESS"); 132 _leave(" = -EINPROGRESS");
133 return -EINPROGRESS; 133 return -EINPROGRESS;
134 134
135unlock_discard: 135unlock_discard:
@@ -137,7 +137,7 @@ unlock_discard:
137 spin_lock_irq(&object->work_lock); 137 spin_lock_irq(&object->work_lock);
138 list_del(&monitor->op_link); 138 list_del(&monitor->op_link);
139 spin_unlock_irq(&object->work_lock); 139 spin_unlock_irq(&object->work_lock);
140 kleave(" = %d", ret); 140 _leave(" = %d", ret);
141 return ret; 141 return ret;
142} 142}
143 143
@@ -174,11 +174,13 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
174 _debug("- copy {%lu}", monitor->back_page->index); 174 _debug("- copy {%lu}", monitor->back_page->index);
175 175
176 recheck: 176 recheck:
177 if (PageUptodate(monitor->back_page)) { 177 if (test_bit(FSCACHE_COOKIE_INVALIDATING,
178 &object->fscache.cookie->flags)) {
179 error = -ESTALE;
180 } else if (PageUptodate(monitor->back_page)) {
178 copy_highpage(monitor->netfs_page, monitor->back_page); 181 copy_highpage(monitor->netfs_page, monitor->back_page);
179 182 fscache_mark_page_cached(monitor->op,
180 pagevec_add(&pagevec, monitor->netfs_page); 183 monitor->netfs_page);
181 fscache_mark_pages_cached(monitor->op, &pagevec);
182 error = 0; 184 error = 0;
183 } else if (!PageError(monitor->back_page)) { 185 } else if (!PageError(monitor->back_page)) {
184 /* the page has probably been truncated */ 186 /* the page has probably been truncated */
@@ -198,6 +200,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
198 200
199 fscache_end_io(op, monitor->netfs_page, error); 201 fscache_end_io(op, monitor->netfs_page, error);
200 page_cache_release(monitor->netfs_page); 202 page_cache_release(monitor->netfs_page);
203 fscache_retrieval_complete(op, 1);
201 fscache_put_retrieval(op); 204 fscache_put_retrieval(op);
202 kfree(monitor); 205 kfree(monitor);
203 206
@@ -239,7 +242,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
239 _debug("read back %p{%lu,%d}", 242 _debug("read back %p{%lu,%d}",
240 netpage, netpage->index, page_count(netpage)); 243 netpage, netpage->index, page_count(netpage));
241 244
242 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); 245 monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
243 if (!monitor) 246 if (!monitor)
244 goto nomem; 247 goto nomem;
245 248
@@ -258,13 +261,14 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
258 goto backing_page_already_present; 261 goto backing_page_already_present;
259 262
260 if (!newpage) { 263 if (!newpage) {
261 newpage = page_cache_alloc_cold(bmapping); 264 newpage = __page_cache_alloc(cachefiles_gfp |
265 __GFP_COLD);
262 if (!newpage) 266 if (!newpage)
263 goto nomem_monitor; 267 goto nomem_monitor;
264 } 268 }
265 269
266 ret = add_to_page_cache(newpage, bmapping, 270 ret = add_to_page_cache(newpage, bmapping,
267 netpage->index, GFP_KERNEL); 271 netpage->index, cachefiles_gfp);
268 if (ret == 0) 272 if (ret == 0)
269 goto installed_new_backing_page; 273 goto installed_new_backing_page;
270 if (ret != -EEXIST) 274 if (ret != -EEXIST)
@@ -335,11 +339,11 @@ backing_page_already_present:
335backing_page_already_uptodate: 339backing_page_already_uptodate:
336 _debug("- uptodate"); 340 _debug("- uptodate");
337 341
338 pagevec_add(pagevec, netpage); 342 fscache_mark_page_cached(op, netpage);
339 fscache_mark_pages_cached(op, pagevec);
340 343
341 copy_highpage(netpage, backpage); 344 copy_highpage(netpage, backpage);
342 fscache_end_io(op, netpage, 0); 345 fscache_end_io(op, netpage, 0);
346 fscache_retrieval_complete(op, 1);
343 347
344success: 348success:
345 _debug("success"); 349 _debug("success");
@@ -357,10 +361,13 @@ out:
357 361
358read_error: 362read_error:
359 _debug("read error %d", ret); 363 _debug("read error %d", ret);
360 if (ret == -ENOMEM) 364 if (ret == -ENOMEM) {
365 fscache_retrieval_complete(op, 1);
361 goto out; 366 goto out;
367 }
362io_error: 368io_error:
363 cachefiles_io_error_obj(object, "Page read error on backing file"); 369 cachefiles_io_error_obj(object, "Page read error on backing file");
370 fscache_retrieval_complete(op, 1);
364 ret = -ENOBUFS; 371 ret = -ENOBUFS;
365 goto out; 372 goto out;
366 373
@@ -370,6 +377,7 @@ nomem_monitor:
370 fscache_put_retrieval(monitor->op); 377 fscache_put_retrieval(monitor->op);
371 kfree(monitor); 378 kfree(monitor);
372nomem: 379nomem:
380 fscache_retrieval_complete(op, 1);
373 _leave(" = -ENOMEM"); 381 _leave(" = -ENOMEM");
374 return -ENOMEM; 382 return -ENOMEM;
375} 383}
@@ -408,7 +416,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
408 _enter("{%p},{%lx},,,", object, page->index); 416 _enter("{%p},{%lx},,,", object, page->index);
409 417
410 if (!object->backer) 418 if (!object->backer)
411 return -ENOBUFS; 419 goto enobufs;
412 420
413 inode = object->backer->d_inode; 421 inode = object->backer->d_inode;
414 ASSERT(S_ISREG(inode->i_mode)); 422 ASSERT(S_ISREG(inode->i_mode));
@@ -417,7 +425,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
417 425
418 /* calculate the shift required to use bmap */ 426 /* calculate the shift required to use bmap */
419 if (inode->i_sb->s_blocksize > PAGE_SIZE) 427 if (inode->i_sb->s_blocksize > PAGE_SIZE)
420 return -ENOBUFS; 428 goto enobufs;
421 429
422 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; 430 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
423 431
@@ -448,15 +456,20 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
448 &pagevec); 456 &pagevec);
449 } else if (cachefiles_has_space(cache, 0, 1) == 0) { 457 } else if (cachefiles_has_space(cache, 0, 1) == 0) {
450 /* there's space in the cache we can use */ 458 /* there's space in the cache we can use */
451 pagevec_add(&pagevec, page); 459 fscache_mark_page_cached(op, page);
452 fscache_mark_pages_cached(op, &pagevec); 460 fscache_retrieval_complete(op, 1);
453 ret = -ENODATA; 461 ret = -ENODATA;
454 } else { 462 } else {
455 ret = -ENOBUFS; 463 goto enobufs;
456 } 464 }
457 465
458 _leave(" = %d", ret); 466 _leave(" = %d", ret);
459 return ret; 467 return ret;
468
469enobufs:
470 fscache_retrieval_complete(op, 1);
471 _leave(" = -ENOBUFS");
472 return -ENOBUFS;
460} 473}
461 474
462/* 475/*
@@ -465,8 +478,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
465 */ 478 */
466static int cachefiles_read_backing_file(struct cachefiles_object *object, 479static int cachefiles_read_backing_file(struct cachefiles_object *object,
467 struct fscache_retrieval *op, 480 struct fscache_retrieval *op,
468 struct list_head *list, 481 struct list_head *list)
469 struct pagevec *mark_pvec)
470{ 482{
471 struct cachefiles_one_read *monitor = NULL; 483 struct cachefiles_one_read *monitor = NULL;
472 struct address_space *bmapping = object->backer->d_inode->i_mapping; 484 struct address_space *bmapping = object->backer->d_inode->i_mapping;
@@ -485,7 +497,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
485 netpage, netpage->index, page_count(netpage)); 497 netpage, netpage->index, page_count(netpage));
486 498
487 if (!monitor) { 499 if (!monitor) {
488 monitor = kzalloc(sizeof(*monitor), GFP_KERNEL); 500 monitor = kzalloc(sizeof(*monitor), cachefiles_gfp);
489 if (!monitor) 501 if (!monitor)
490 goto nomem; 502 goto nomem;
491 503
@@ -500,13 +512,14 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
500 goto backing_page_already_present; 512 goto backing_page_already_present;
501 513
502 if (!newpage) { 514 if (!newpage) {
503 newpage = page_cache_alloc_cold(bmapping); 515 newpage = __page_cache_alloc(cachefiles_gfp |
516 __GFP_COLD);
504 if (!newpage) 517 if (!newpage)
505 goto nomem; 518 goto nomem;
506 } 519 }
507 520
508 ret = add_to_page_cache(newpage, bmapping, 521 ret = add_to_page_cache(newpage, bmapping,
509 netpage->index, GFP_KERNEL); 522 netpage->index, cachefiles_gfp);
510 if (ret == 0) 523 if (ret == 0)
511 goto installed_new_backing_page; 524 goto installed_new_backing_page;
512 if (ret != -EEXIST) 525 if (ret != -EEXIST)
@@ -536,10 +549,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
536 _debug("- monitor add"); 549 _debug("- monitor add");
537 550
538 ret = add_to_page_cache(netpage, op->mapping, netpage->index, 551 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
539 GFP_KERNEL); 552 cachefiles_gfp);
540 if (ret < 0) { 553 if (ret < 0) {
541 if (ret == -EEXIST) { 554 if (ret == -EEXIST) {
542 page_cache_release(netpage); 555 page_cache_release(netpage);
556 fscache_retrieval_complete(op, 1);
543 continue; 557 continue;
544 } 558 }
545 goto nomem; 559 goto nomem;
@@ -612,10 +626,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
612 _debug("- uptodate"); 626 _debug("- uptodate");
613 627
614 ret = add_to_page_cache(netpage, op->mapping, netpage->index, 628 ret = add_to_page_cache(netpage, op->mapping, netpage->index,
615 GFP_KERNEL); 629 cachefiles_gfp);
616 if (ret < 0) { 630 if (ret < 0) {
617 if (ret == -EEXIST) { 631 if (ret == -EEXIST) {
618 page_cache_release(netpage); 632 page_cache_release(netpage);
633 fscache_retrieval_complete(op, 1);
619 continue; 634 continue;
620 } 635 }
621 goto nomem; 636 goto nomem;
@@ -626,16 +641,17 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
626 page_cache_release(backpage); 641 page_cache_release(backpage);
627 backpage = NULL; 642 backpage = NULL;
628 643
629 if (!pagevec_add(mark_pvec, netpage)) 644 fscache_mark_page_cached(op, netpage);
630 fscache_mark_pages_cached(op, mark_pvec);
631 645
632 page_cache_get(netpage); 646 page_cache_get(netpage);
633 if (!pagevec_add(&lru_pvec, netpage)) 647 if (!pagevec_add(&lru_pvec, netpage))
634 __pagevec_lru_add_file(&lru_pvec); 648 __pagevec_lru_add_file(&lru_pvec);
635 649
650 /* the netpage is unlocked and marked up to date here */
636 fscache_end_io(op, netpage, 0); 651 fscache_end_io(op, netpage, 0);
637 page_cache_release(netpage); 652 page_cache_release(netpage);
638 netpage = NULL; 653 netpage = NULL;
654 fscache_retrieval_complete(op, 1);
639 continue; 655 continue;
640 } 656 }
641 657
@@ -661,6 +677,7 @@ out:
661 list_for_each_entry_safe(netpage, _n, list, lru) { 677 list_for_each_entry_safe(netpage, _n, list, lru) {
662 list_del(&netpage->lru); 678 list_del(&netpage->lru);
663 page_cache_release(netpage); 679 page_cache_release(netpage);
680 fscache_retrieval_complete(op, 1);
664 } 681 }
665 682
666 _leave(" = %d", ret); 683 _leave(" = %d", ret);
@@ -669,15 +686,17 @@ out:
669nomem: 686nomem:
670 _debug("nomem"); 687 _debug("nomem");
671 ret = -ENOMEM; 688 ret = -ENOMEM;
672 goto out; 689 goto record_page_complete;
673 690
674read_error: 691read_error:
675 _debug("read error %d", ret); 692 _debug("read error %d", ret);
676 if (ret == -ENOMEM) 693 if (ret == -ENOMEM)
677 goto out; 694 goto record_page_complete;
678io_error: 695io_error:
679 cachefiles_io_error_obj(object, "Page read error on backing file"); 696 cachefiles_io_error_obj(object, "Page read error on backing file");
680 ret = -ENOBUFS; 697 ret = -ENOBUFS;
698record_page_complete:
699 fscache_retrieval_complete(op, 1);
681 goto out; 700 goto out;
682} 701}
683 702
@@ -709,7 +728,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
709 *nr_pages); 728 *nr_pages);
710 729
711 if (!object->backer) 730 if (!object->backer)
712 return -ENOBUFS; 731 goto all_enobufs;
713 732
714 space = 1; 733 space = 1;
715 if (cachefiles_has_space(cache, 0, *nr_pages) < 0) 734 if (cachefiles_has_space(cache, 0, *nr_pages) < 0)
@@ -722,7 +741,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
722 741
723 /* calculate the shift required to use bmap */ 742 /* calculate the shift required to use bmap */
724 if (inode->i_sb->s_blocksize > PAGE_SIZE) 743 if (inode->i_sb->s_blocksize > PAGE_SIZE)
725 return -ENOBUFS; 744 goto all_enobufs;
726 745
727 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; 746 shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
728 747
@@ -762,7 +781,10 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
762 nrbackpages++; 781 nrbackpages++;
763 } else if (space && pagevec_add(&pagevec, page) == 0) { 782 } else if (space && pagevec_add(&pagevec, page) == 0) {
764 fscache_mark_pages_cached(op, &pagevec); 783 fscache_mark_pages_cached(op, &pagevec);
784 fscache_retrieval_complete(op, 1);
765 ret = -ENODATA; 785 ret = -ENODATA;
786 } else {
787 fscache_retrieval_complete(op, 1);
766 } 788 }
767 } 789 }
768 790
@@ -775,18 +797,18 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
775 /* submit the apparently valid pages to the backing fs to be read from 797 /* submit the apparently valid pages to the backing fs to be read from
776 * disk */ 798 * disk */
777 if (nrbackpages > 0) { 799 if (nrbackpages > 0) {
778 ret2 = cachefiles_read_backing_file(object, op, &backpages, 800 ret2 = cachefiles_read_backing_file(object, op, &backpages);
779 &pagevec);
780 if (ret2 == -ENOMEM || ret2 == -EINTR) 801 if (ret2 == -ENOMEM || ret2 == -EINTR)
781 ret = ret2; 802 ret = ret2;
782 } 803 }
783 804
784 if (pagevec_count(&pagevec) > 0)
785 fscache_mark_pages_cached(op, &pagevec);
786
787 _leave(" = %d [nr=%u%s]", 805 _leave(" = %d [nr=%u%s]",
788 ret, *nr_pages, list_empty(pages) ? " empty" : ""); 806 ret, *nr_pages, list_empty(pages) ? " empty" : "");
789 return ret; 807 return ret;
808
809all_enobufs:
810 fscache_retrieval_complete(op, *nr_pages);
811 return -ENOBUFS;
790} 812}
791 813
792/* 814/*
@@ -806,7 +828,6 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
806{ 828{
807 struct cachefiles_object *object; 829 struct cachefiles_object *object;
808 struct cachefiles_cache *cache; 830 struct cachefiles_cache *cache;
809 struct pagevec pagevec;
810 int ret; 831 int ret;
811 832
812 object = container_of(op->op.object, 833 object = container_of(op->op.object,
@@ -817,14 +838,12 @@ int cachefiles_allocate_page(struct fscache_retrieval *op,
817 _enter("%p,{%lx},", object, page->index); 838 _enter("%p,{%lx},", object, page->index);
818 839
819 ret = cachefiles_has_space(cache, 0, 1); 840 ret = cachefiles_has_space(cache, 0, 1);
820 if (ret == 0) { 841 if (ret == 0)
821 pagevec_init(&pagevec, 0); 842 fscache_mark_page_cached(op, page);
822 pagevec_add(&pagevec, page); 843 else
823 fscache_mark_pages_cached(op, &pagevec);
824 } else {
825 ret = -ENOBUFS; 844 ret = -ENOBUFS;
826 }
827 845
846 fscache_retrieval_complete(op, 1);
828 _leave(" = %d", ret); 847 _leave(" = %d", ret);
829 return ret; 848 return ret;
830} 849}
@@ -874,6 +893,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op,
874 ret = -ENOBUFS; 893 ret = -ENOBUFS;
875 } 894 }
876 895
896 fscache_retrieval_complete(op, *nr_pages);
877 _leave(" = %d", ret); 897 _leave(" = %d", ret);
878 return ret; 898 return ret;
879} 899}
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index e18b183b47e1..73b46288b54b 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -174,7 +174,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object,
174 ASSERT(dentry); 174 ASSERT(dentry);
175 ASSERT(dentry->d_inode); 175 ASSERT(dentry->d_inode);
176 176
177 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); 177 auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp);
178 if (!auxbuf) { 178 if (!auxbuf) {
179 _leave(" = -ENOMEM"); 179 _leave(" = -ENOMEM");
180 return -ENOMEM; 180 return -ENOMEM;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6690269f5dde..064d1a68d2c1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -267,6 +267,14 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
267 kfree(req->r_pages); 267 kfree(req->r_pages);
268} 268}
269 269
270static void ceph_unlock_page_vector(struct page **pages, int num_pages)
271{
272 int i;
273
274 for (i = 0; i < num_pages; i++)
275 unlock_page(pages[i]);
276}
277
270/* 278/*
271 * start an async read(ahead) operation. return nr_pages we submitted 279 * start an async read(ahead) operation. return nr_pages we submitted
272 * a read for on success, or negative error code. 280 * a read for on success, or negative error code.
@@ -347,6 +355,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
347 return nr_pages; 355 return nr_pages;
348 356
349out_pages: 357out_pages:
358 ceph_unlock_page_vector(pages, nr_pages);
350 ceph_release_page_vector(pages, nr_pages); 359 ceph_release_page_vector(pages, nr_pages);
351out: 360out:
352 ceph_osdc_put_request(req); 361 ceph_osdc_put_request(req);
@@ -1078,23 +1087,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1078 struct page **pagep, void **fsdata) 1087 struct page **pagep, void **fsdata)
1079{ 1088{
1080 struct inode *inode = file->f_dentry->d_inode; 1089 struct inode *inode = file->f_dentry->d_inode;
1090 struct ceph_inode_info *ci = ceph_inode(inode);
1091 struct ceph_file_info *fi = file->private_data;
1081 struct page *page; 1092 struct page *page;
1082 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1093 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1083 int r; 1094 int r, want, got = 0;
1095
1096 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1097 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1098 else
1099 want = CEPH_CAP_FILE_BUFFER;
1100
1101 dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
1102 inode, ceph_vinop(inode), pos, len, inode->i_size);
1103 r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
1104 if (r < 0)
1105 return r;
1106 dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
1107 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
1108 if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
1109 ceph_put_cap_refs(ci, got);
1110 return -EAGAIN;
1111 }
1084 1112
1085 do { 1113 do {
1086 /* get a page */ 1114 /* get a page */
1087 page = grab_cache_page_write_begin(mapping, index, 0); 1115 page = grab_cache_page_write_begin(mapping, index, 0);
1088 if (!page) 1116 if (!page) {
1089 return -ENOMEM; 1117 r = -ENOMEM;
1090 *pagep = page; 1118 break;
1119 }
1091 1120
1092 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1121 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1093 inode, page, (int)pos, (int)len); 1122 inode, page, (int)pos, (int)len);
1094 1123
1095 r = ceph_update_writeable_page(file, pos, len, page); 1124 r = ceph_update_writeable_page(file, pos, len, page);
1125 if (r)
1126 page_cache_release(page);
1096 } while (r == -EAGAIN); 1127 } while (r == -EAGAIN);
1097 1128
1129 if (r) {
1130 ceph_put_cap_refs(ci, got);
1131 } else {
1132 *pagep = page;
1133 *(int *)fsdata = got;
1134 }
1098 return r; 1135 return r;
1099} 1136}
1100 1137
@@ -1108,10 +1145,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1108 struct page *page, void *fsdata) 1145 struct page *page, void *fsdata)
1109{ 1146{
1110 struct inode *inode = file->f_dentry->d_inode; 1147 struct inode *inode = file->f_dentry->d_inode;
1148 struct ceph_inode_info *ci = ceph_inode(inode);
1111 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1149 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1112 struct ceph_mds_client *mdsc = fsc->mdsc; 1150 struct ceph_mds_client *mdsc = fsc->mdsc;
1113 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1151 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1114 int check_cap = 0; 1152 int check_cap = 0;
1153 int got = (unsigned long)fsdata;
1115 1154
1116 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, 1155 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1117 inode, page, (int)pos, (int)copied, (int)len); 1156 inode, page, (int)pos, (int)copied, (int)len);
@@ -1134,6 +1173,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1134 up_read(&mdsc->snap_rwsem); 1173 up_read(&mdsc->snap_rwsem);
1135 page_cache_release(page); 1174 page_cache_release(page);
1136 1175
1176 if (copied > 0) {
1177 int dirty;
1178 spin_lock(&ci->i_ceph_lock);
1179 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
1180 spin_unlock(&ci->i_ceph_lock);
1181 if (dirty)
1182 __mark_inode_dirty(inode, dirty);
1183 }
1184
1185 dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
1186 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
1187 ceph_put_cap_refs(ci, got);
1188
1137 if (check_cap) 1189 if (check_cap)
1138 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); 1190 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1139 1191
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3251e9cc6401..a1d9bb30c1bf 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -236,8 +236,10 @@ static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
236 if (!ctx) { 236 if (!ctx) {
237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 237 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
238 if (cap) { 238 if (cap) {
239 spin_lock(&mdsc->caps_list_lock);
239 mdsc->caps_use_count++; 240 mdsc->caps_use_count++;
240 mdsc->caps_total_count++; 241 mdsc->caps_total_count++;
242 spin_unlock(&mdsc->caps_list_lock);
241 } 243 }
242 return cap; 244 return cap;
243 } 245 }
@@ -1349,11 +1351,15 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1349 if (!ci->i_head_snapc) 1351 if (!ci->i_head_snapc)
1350 ci->i_head_snapc = ceph_get_snap_context( 1352 ci->i_head_snapc = ceph_get_snap_context(
1351 ci->i_snap_realm->cached_context); 1353 ci->i_snap_realm->cached_context);
1352 dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, 1354 dout(" inode %p now dirty snapc %p auth cap %p\n",
1353 ci->i_head_snapc); 1355 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
1354 BUG_ON(!list_empty(&ci->i_dirty_item)); 1356 BUG_ON(!list_empty(&ci->i_dirty_item));
1355 spin_lock(&mdsc->cap_dirty_lock); 1357 spin_lock(&mdsc->cap_dirty_lock);
1356 list_add(&ci->i_dirty_item, &mdsc->cap_dirty); 1358 if (ci->i_auth_cap)
1359 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1360 else
1361 list_add(&ci->i_dirty_item,
1362 &mdsc->cap_dirty_migrating);
1357 spin_unlock(&mdsc->cap_dirty_lock); 1363 spin_unlock(&mdsc->cap_dirty_lock);
1358 if (ci->i_flushing_caps == 0) { 1364 if (ci->i_flushing_caps == 0) {
1359 ihold(inode); 1365 ihold(inode);
@@ -2388,7 +2394,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2388 &atime); 2394 &atime);
2389 2395
2390 /* max size increase? */ 2396 /* max size increase? */
2391 if (max_size != ci->i_max_size) { 2397 if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
2392 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2398 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2393 ci->i_max_size = max_size; 2399 ci->i_max_size = max_size;
2394 if (max_size >= ci->i_wanted_max_size) { 2400 if (max_size >= ci->i_wanted_max_size) {
@@ -2745,6 +2751,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2745 2751
2746 /* make sure we re-request max_size, if necessary */ 2752 /* make sure we re-request max_size, if necessary */
2747 spin_lock(&ci->i_ceph_lock); 2753 spin_lock(&ci->i_ceph_lock);
2754 ci->i_wanted_max_size = 0; /* reset */
2748 ci->i_requested_max_size = 0; 2755 ci->i_requested_max_size = 0;
2749 spin_unlock(&ci->i_ceph_lock); 2756 spin_unlock(&ci->i_ceph_lock);
2750} 2757}
@@ -2840,8 +2847,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2840 case CEPH_CAP_OP_IMPORT: 2847 case CEPH_CAP_OP_IMPORT:
2841 handle_cap_import(mdsc, inode, h, session, 2848 handle_cap_import(mdsc, inode, h, session,
2842 snaptrace, snaptrace_len); 2849 snaptrace, snaptrace_len);
2843 ceph_check_caps(ceph_inode(inode), 0, session);
2844 goto done_unlocked;
2845 } 2850 }
2846 2851
2847 /* the rest require a cap */ 2852 /* the rest require a cap */
@@ -2858,6 +2863,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2858 switch (op) { 2863 switch (op) {
2859 case CEPH_CAP_OP_REVOKE: 2864 case CEPH_CAP_OP_REVOKE:
2860 case CEPH_CAP_OP_GRANT: 2865 case CEPH_CAP_OP_GRANT:
2866 case CEPH_CAP_OP_IMPORT:
2861 handle_cap_grant(inode, h, session, cap, msg->middle); 2867 handle_cap_grant(inode, h, session, cap, msg->middle);
2862 goto done_unlocked; 2868 goto done_unlocked;
2863 2869
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d4dfdcf76d7f..e51558fca3a3 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
712 struct ceph_osd_client *osdc = 712 struct ceph_osd_client *osdc =
713 &ceph_sb_to_client(inode->i_sb)->client->osdc; 713 &ceph_sb_to_client(inode->i_sb)->client->osdc;
714 loff_t endoff = pos + iov->iov_len; 714 loff_t endoff = pos + iov->iov_len;
715 int want, got = 0; 715 int got = 0;
716 int ret, err; 716 int ret, err, written;
717 717
718 if (ceph_snap(inode) != CEPH_NOSNAP) 718 if (ceph_snap(inode) != CEPH_NOSNAP)
719 return -EROFS; 719 return -EROFS;
720 720
721retry_snap: 721retry_snap:
722 written = 0;
722 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 723 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
723 return -ENOSPC; 724 return -ENOSPC;
724 __ceph_do_pending_vmtruncate(inode); 725 __ceph_do_pending_vmtruncate(inode);
725 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
726 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
727 inode->i_size);
728 if (fi->fmode & CEPH_FILE_MODE_LAZY)
729 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
730 else
731 want = CEPH_CAP_FILE_BUFFER;
732 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
733 if (ret < 0)
734 goto out_put;
735
736 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
737 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
738 ceph_cap_string(got));
739
740 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
741 (iocb->ki_filp->f_flags & O_DIRECT) ||
742 (inode->i_sb->s_flags & MS_SYNCHRONOUS) ||
743 (fi->flags & CEPH_F_SYNC)) {
744 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
745 &iocb->ki_pos);
746 } else {
747 /*
748 * buffered write; drop Fw early to avoid slow
749 * revocation if we get stuck on balance_dirty_pages
750 */
751 int dirty;
752
753 spin_lock(&ci->i_ceph_lock);
754 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
755 spin_unlock(&ci->i_ceph_lock);
756 ceph_put_cap_refs(ci, got);
757 726
727 /*
728 * try to do a buffered write. if we don't have sufficient
729 * caps, we'll get -EAGAIN from generic_file_aio_write, or a
730 * short write if we only get caps for some pages.
731 */
732 if (!(iocb->ki_filp->f_flags & O_DIRECT) &&
733 !(inode->i_sb->s_flags & MS_SYNCHRONOUS) &&
734 !(fi->flags & CEPH_F_SYNC)) {
758 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 735 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
736 if (ret >= 0)
737 written = ret;
738
759 if ((ret >= 0 || ret == -EIOCBQUEUED) && 739 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
760 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 740 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
761 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 741 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
762 err = vfs_fsync_range(file, pos, pos + ret - 1, 1); 742 err = vfs_fsync_range(file, pos, pos + written - 1, 1);
763 if (err < 0) 743 if (err < 0)
764 ret = err; 744 ret = err;
765 } 745 }
746 if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff)
747 goto out;
748 }
766 749
767 if (dirty) 750 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
768 __mark_inode_dirty(inode, dirty); 751 inode, ceph_vinop(inode), pos + written,
752 (unsigned)iov->iov_len - written, inode->i_size);
753 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff);
754 if (ret < 0)
769 goto out; 755 goto out;
770 }
771 756
757 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
758 inode, ceph_vinop(inode), pos + written,
759 (unsigned)iov->iov_len - written, ceph_cap_string(got));
760 ret = ceph_sync_write(file, iov->iov_base + written,
761 iov->iov_len - written, &iocb->ki_pos);
772 if (ret >= 0) { 762 if (ret >= 0) {
773 int dirty; 763 int dirty;
774 spin_lock(&ci->i_ceph_lock); 764 spin_lock(&ci->i_ceph_lock);
@@ -777,13 +767,10 @@ retry_snap:
777 if (dirty) 767 if (dirty)
778 __mark_inode_dirty(inode, dirty); 768 __mark_inode_dirty(inode, dirty);
779 } 769 }
780
781out_put:
782 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", 770 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
783 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 771 inode, ceph_vinop(inode), pos + written,
784 ceph_cap_string(got)); 772 (unsigned)iov->iov_len - written, ceph_cap_string(got));
785 ceph_put_cap_refs(ci, got); 773 ceph_put_cap_refs(ci, got);
786
787out: 774out:
788 if (ret == -EOLDSNAPC) { 775 if (ret == -EOLDSNAPC) {
789 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", 776 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ba95eea201bf..2971eaa65cdc 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1466,7 +1466,7 @@ void __ceph_do_pending_vmtruncate(struct inode *inode)
1466{ 1466{
1467 struct ceph_inode_info *ci = ceph_inode(inode); 1467 struct ceph_inode_info *ci = ceph_inode(inode);
1468 u64 to; 1468 u64 to;
1469 int wrbuffer_refs, wake = 0; 1469 int wrbuffer_refs, finish = 0;
1470 1470
1471retry: 1471retry:
1472 spin_lock(&ci->i_ceph_lock); 1472 spin_lock(&ci->i_ceph_lock);
@@ -1498,15 +1498,18 @@ retry:
1498 truncate_inode_pages(inode->i_mapping, to); 1498 truncate_inode_pages(inode->i_mapping, to);
1499 1499
1500 spin_lock(&ci->i_ceph_lock); 1500 spin_lock(&ci->i_ceph_lock);
1501 ci->i_truncate_pending--; 1501 if (to == ci->i_truncate_size) {
1502 if (ci->i_truncate_pending == 0) 1502 ci->i_truncate_pending = 0;
1503 wake = 1; 1503 finish = 1;
1504 }
1504 spin_unlock(&ci->i_ceph_lock); 1505 spin_unlock(&ci->i_ceph_lock);
1506 if (!finish)
1507 goto retry;
1505 1508
1506 if (wrbuffer_refs == 0) 1509 if (wrbuffer_refs == 0)
1507 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 1510 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1508 if (wake) 1511
1509 wake_up_all(&ci->i_cap_wq); 1512 wake_up_all(&ci->i_cap_wq);
1510} 1513}
1511 1514
1512 1515
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1bcf712655d9..9165eb8309eb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1590,7 +1590,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1590 } else if (rpath || rino) { 1590 } else if (rpath || rino) {
1591 *ino = rino; 1591 *ino = rino;
1592 *ppath = rpath; 1592 *ppath = rpath;
1593 *pathlen = strlen(rpath); 1593 *pathlen = rpath ? strlen(rpath) : 0;
1594 dout(" path %.*s\n", *pathlen, rpath); 1594 dout(" path %.*s\n", *pathlen, rpath);
1595 } 1595 }
1596 1596
@@ -1876,9 +1876,14 @@ finish:
1876static void __wake_requests(struct ceph_mds_client *mdsc, 1876static void __wake_requests(struct ceph_mds_client *mdsc,
1877 struct list_head *head) 1877 struct list_head *head)
1878{ 1878{
1879 struct ceph_mds_request *req, *nreq; 1879 struct ceph_mds_request *req;
1880 LIST_HEAD(tmp_list);
1881
1882 list_splice_init(head, &tmp_list);
1880 1883
1881 list_for_each_entry_safe(req, nreq, head, r_wait) { 1884 while (!list_empty(&tmp_list)) {
1885 req = list_entry(tmp_list.next,
1886 struct ceph_mds_request, r_wait);
1882 list_del_init(&req->r_wait); 1887 list_del_init(&req->r_wait);
1883 __do_request(mdsc, req); 1888 __do_request(mdsc, req);
1884 } 1889 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 2eb43f211325..e86aa9948124 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -403,8 +403,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
403 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); 403 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
404 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) 404 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
405 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); 405 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
406 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
407 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
408 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) 406 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
409 seq_printf(m, ",osdkeepalivetimeout=%d", 407 seq_printf(m, ",osdkeepalivetimeout=%d",
410 opt->osd_keepalive_timeout); 408 opt->osd_keepalive_timeout);
@@ -849,7 +847,7 @@ static int ceph_register_bdi(struct super_block *sb,
849 fsc->backing_dev_info.ra_pages = 847 fsc->backing_dev_info.ra_pages =
850 default_backing_dev_info.ra_pages; 848 default_backing_dev_info.ra_pages;
851 849
852 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d", 850 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
853 atomic_long_inc_return(&bdi_seq)); 851 atomic_long_inc_return(&bdi_seq));
854 if (!err) 852 if (!err)
855 sb->s_bdi = &fsc->backing_dev_info; 853 sb->s_bdi = &fsc->backing_dev_info;
diff --git a/fs/dcache.c b/fs/dcache.c
index 3a463d0c4fe8..19153a0a810c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -455,24 +455,6 @@ void d_drop(struct dentry *dentry)
455EXPORT_SYMBOL(d_drop); 455EXPORT_SYMBOL(d_drop);
456 456
457/* 457/*
458 * d_clear_need_lookup - drop a dentry from cache and clear the need lookup flag
459 * @dentry: dentry to drop
460 *
461 * This is called when we do a lookup on a placeholder dentry that needed to be
462 * looked up. The dentry should have been hashed in order for it to be found by
463 * the lookup code, but now needs to be unhashed while we do the actual lookup
464 * and clear the DCACHE_NEED_LOOKUP flag.
465 */
466void d_clear_need_lookup(struct dentry *dentry)
467{
468 spin_lock(&dentry->d_lock);
469 __d_drop(dentry);
470 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
471 spin_unlock(&dentry->d_lock);
472}
473EXPORT_SYMBOL(d_clear_need_lookup);
474
475/*
476 * Finish off a dentry we've decided to kill. 458 * Finish off a dentry we've decided to kill.
477 * dentry->d_lock must be held, returns with it unlocked. 459 * dentry->d_lock must be held, returns with it unlocked.
478 * If ref is non-zero, then decrement the refcount too. 460 * If ref is non-zero, then decrement the refcount too.
@@ -565,13 +547,7 @@ repeat:
565 if (d_unhashed(dentry)) 547 if (d_unhashed(dentry))
566 goto kill_it; 548 goto kill_it;
567 549
568 /* 550 dentry->d_flags |= DCACHE_REFERENCED;
569 * If this dentry needs lookup, don't set the referenced flag so that it
570 * is more likely to be cleaned up by the dcache shrinker in case of
571 * memory pressure.
572 */
573 if (!d_need_lookup(dentry))
574 dentry->d_flags |= DCACHE_REFERENCED;
575 dentry_lru_add(dentry); 551 dentry_lru_add(dentry);
576 552
577 dentry->d_count--; 553 dentry->d_count--;
@@ -1583,7 +1559,7 @@ EXPORT_SYMBOL(d_find_any_alias);
1583 */ 1559 */
1584struct dentry *d_obtain_alias(struct inode *inode) 1560struct dentry *d_obtain_alias(struct inode *inode)
1585{ 1561{
1586 static const struct qstr anonstring = { .name = "" }; 1562 static const struct qstr anonstring = QSTR_INIT("/", 1);
1587 struct dentry *tmp; 1563 struct dentry *tmp;
1588 struct dentry *res; 1564 struct dentry *res;
1589 1565
@@ -1737,13 +1713,6 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
1737 } 1713 }
1738 1714
1739 /* 1715 /*
1740 * We are going to instantiate this dentry, unhash it and clear the
1741 * lookup flag so we can do that.
1742 */
1743 if (unlikely(d_need_lookup(found)))
1744 d_clear_need_lookup(found);
1745
1746 /*
1747 * Negative dentry: instantiate it unless the inode is a directory and 1716 * Negative dentry: instantiate it unless the inode is a directory and
1748 * already has a dentry. 1717 * already has a dentry.
1749 */ 1718 */
diff --git a/fs/exec.c b/fs/exec.c
index 237d5342786c..18c45cac368f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1669,7 +1669,6 @@ int get_dumpable(struct mm_struct *mm)
1669 return __get_dumpable(mm->flags); 1669 return __get_dumpable(mm->flags);
1670} 1670}
1671 1671
1672#ifdef __ARCH_WANT_SYS_EXECVE
1673SYSCALL_DEFINE3(execve, 1672SYSCALL_DEFINE3(execve,
1674 const char __user *, filename, 1673 const char __user *, filename,
1675 const char __user *const __user *, argv, 1674 const char __user *const __user *, argv,
@@ -1697,23 +1696,3 @@ asmlinkage long compat_sys_execve(const char __user * filename,
1697 return error; 1696 return error;
1698} 1697}
1699#endif 1698#endif
1700#endif
1701
1702#ifdef __ARCH_WANT_KERNEL_EXECVE
1703int kernel_execve(const char *filename,
1704 const char *const argv[],
1705 const char *const envp[])
1706{
1707 int ret = do_execve(filename,
1708 (const char __user *const __user *)argv,
1709 (const char __user *const __user *)envp);
1710 if (ret < 0)
1711 return ret;
1712
1713 /*
1714 * We were successful. We won't be returning to our caller, but
1715 * instead to user space by manipulating the kernel stack.
1716 */
1717 ret_from_kernel_execve(current_pt_regs());
1718}
1719#endif
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 606bb074c501..5df4bb4aab14 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -322,10 +322,10 @@ static int export_encode_fh(struct inode *inode, struct fid *fid,
322 322
323 if (parent && (len < 4)) { 323 if (parent && (len < 4)) {
324 *max_len = 4; 324 *max_len = 4;
325 return 255; 325 return FILEID_INVALID;
326 } else if (len < 2) { 326 } else if (len < 2) {
327 *max_len = 2; 327 *max_len = 2;
328 return 255; 328 return FILEID_INVALID;
329 } 329 }
330 330
331 len = 2; 331 len = 2;
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
new file mode 100644
index 000000000000..fd27e7e6326e
--- /dev/null
+++ b/fs/f2fs/Kconfig
@@ -0,0 +1,53 @@
1config F2FS_FS
2 tristate "F2FS filesystem support (EXPERIMENTAL)"
3 depends on BLOCK
4 help
5 F2FS is based on Log-structured File System (LFS), which supports
6 versatile "flash-friendly" features. The design has been focused on
7 addressing the fundamental issues in LFS, which are snowball effect
8 of wandering tree and high cleaning overhead.
9
10 Since flash-based storages show different characteristics according to
11 the internal geometry or flash memory management schemes aka FTL, F2FS
12 and tools support various parameters not only for configuring on-disk
13 layout, but also for selecting allocation and cleaning algorithms.
14
15 If unsure, say N.
16
17config F2FS_STAT_FS
18 bool "F2FS Status Information"
19 depends on F2FS_FS && DEBUG_FS
20 default y
21 help
22 /sys/kernel/debug/f2fs/ contains information about all the partitions
23 mounted as f2fs. Each file shows the whole f2fs information.
24
25 /sys/kernel/debug/f2fs/status includes:
26 - major file system information managed by f2fs currently
27 - average SIT information about whole segments
28 - current memory footprint consumed by f2fs.
29
30config F2FS_FS_XATTR
31 bool "F2FS extended attributes"
32 depends on F2FS_FS
33 default y
34 help
35 Extended attributes are name:value pairs associated with inodes by
36 the kernel or by users (see the attr(5) manual page, or visit
37 <http://acl.bestbits.at/> for details).
38
39 If unsure, say N.
40
41config F2FS_FS_POSIX_ACL
42 bool "F2FS Access Control Lists"
43 depends on F2FS_FS_XATTR
44 select FS_POSIX_ACL
45 default y
46 help
47 POSIX Access Control Lists (ACLs) support permissions for users and
48 groups beyond the owner/group/world scheme.
49
50 To learn more about Access Control Lists, visit the POSIX ACLs for
51 Linux website <http://acl.bestbits.at/>.
52
53 If you don't know what Access Control Lists are, say N
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
new file mode 100644
index 000000000000..27a0820340b9
--- /dev/null
+++ b/fs/f2fs/Makefile
@@ -0,0 +1,7 @@
1obj-$(CONFIG_F2FS_FS) += f2fs.o
2
3f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
4f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
new file mode 100644
index 000000000000..fed74d193ffb
--- /dev/null
+++ b/fs/f2fs/acl.c
@@ -0,0 +1,414 @@
1/*
2 * fs/f2fs/acl.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext2/acl.c
8 *
9 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15#include <linux/f2fs_fs.h>
16#include "f2fs.h"
17#include "xattr.h"
18#include "acl.h"
19
/*
 * Mode used for ACL handling: the i_acl_mode kept in the f2fs inode info
 * while FI_ACL_MODE is set, otherwise the VFS inode's i_mode.
 */
#define get_inode_mode(inode) \
	(is_inode_flag_set(F2FS_I(inode), FI_ACL_MODE) ? \
		(F2FS_I(inode)->i_acl_mode) : ((inode)->i_mode))
22
23static inline size_t f2fs_acl_size(int count)
24{
25 if (count <= 4) {
26 return sizeof(struct f2fs_acl_header) +
27 count * sizeof(struct f2fs_acl_entry_short);
28 } else {
29 return sizeof(struct f2fs_acl_header) +
30 4 * sizeof(struct f2fs_acl_entry_short) +
31 (count - 4) * sizeof(struct f2fs_acl_entry);
32 }
33}
34
35static inline int f2fs_acl_count(size_t size)
36{
37 ssize_t s;
38 size -= sizeof(struct f2fs_acl_header);
39 s = size - 4 * sizeof(struct f2fs_acl_entry_short);
40 if (s < 0) {
41 if (size % sizeof(struct f2fs_acl_entry_short))
42 return -1;
43 return size / sizeof(struct f2fs_acl_entry_short);
44 } else {
45 if (s % sizeof(struct f2fs_acl_entry))
46 return -1;
47 return s / sizeof(struct f2fs_acl_entry) + 4;
48 }
49}
50
51static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
52{
53 int i, count;
54 struct posix_acl *acl;
55 struct f2fs_acl_header *hdr = (struct f2fs_acl_header *)value;
56 struct f2fs_acl_entry *entry = (struct f2fs_acl_entry *)(hdr + 1);
57 const char *end = value + size;
58
59 if (hdr->a_version != cpu_to_le32(F2FS_ACL_VERSION))
60 return ERR_PTR(-EINVAL);
61
62 count = f2fs_acl_count(size);
63 if (count < 0)
64 return ERR_PTR(-EINVAL);
65 if (count == 0)
66 return NULL;
67
68 acl = posix_acl_alloc(count, GFP_KERNEL);
69 if (!acl)
70 return ERR_PTR(-ENOMEM);
71
72 for (i = 0; i < count; i++) {
73
74 if ((char *)entry > end)
75 goto fail;
76
77 acl->a_entries[i].e_tag = le16_to_cpu(entry->e_tag);
78 acl->a_entries[i].e_perm = le16_to_cpu(entry->e_perm);
79
80 switch (acl->a_entries[i].e_tag) {
81 case ACL_USER_OBJ:
82 case ACL_GROUP_OBJ:
83 case ACL_MASK:
84 case ACL_OTHER:
85 acl->a_entries[i].e_id = ACL_UNDEFINED_ID;
86 entry = (struct f2fs_acl_entry *)((char *)entry +
87 sizeof(struct f2fs_acl_entry_short));
88 break;
89
90 case ACL_USER:
91 acl->a_entries[i].e_uid =
92 make_kuid(&init_user_ns,
93 le32_to_cpu(entry->e_id));
94 entry = (struct f2fs_acl_entry *)((char *)entry +
95 sizeof(struct f2fs_acl_entry));
96 break;
97 case ACL_GROUP:
98 acl->a_entries[i].e_gid =
99 make_kgid(&init_user_ns,
100 le32_to_cpu(entry->e_id));
101 entry = (struct f2fs_acl_entry *)((char *)entry +
102 sizeof(struct f2fs_acl_entry));
103 break;
104 default:
105 goto fail;
106 }
107 }
108 if ((char *)entry != end)
109 goto fail;
110 return acl;
111fail:
112 posix_acl_release(acl);
113 return ERR_PTR(-EINVAL);
114}
115
116static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
117{
118 struct f2fs_acl_header *f2fs_acl;
119 struct f2fs_acl_entry *entry;
120 int i;
121
122 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
123 sizeof(struct f2fs_acl_entry), GFP_KERNEL);
124 if (!f2fs_acl)
125 return ERR_PTR(-ENOMEM);
126
127 f2fs_acl->a_version = cpu_to_le32(F2FS_ACL_VERSION);
128 entry = (struct f2fs_acl_entry *)(f2fs_acl + 1);
129
130 for (i = 0; i < acl->a_count; i++) {
131
132 entry->e_tag = cpu_to_le16(acl->a_entries[i].e_tag);
133 entry->e_perm = cpu_to_le16(acl->a_entries[i].e_perm);
134
135 switch (acl->a_entries[i].e_tag) {
136 case ACL_USER:
137 entry->e_id = cpu_to_le32(
138 from_kuid(&init_user_ns,
139 acl->a_entries[i].e_uid));
140 entry = (struct f2fs_acl_entry *)((char *)entry +
141 sizeof(struct f2fs_acl_entry));
142 break;
143 case ACL_GROUP:
144 entry->e_id = cpu_to_le32(
145 from_kgid(&init_user_ns,
146 acl->a_entries[i].e_gid));
147 entry = (struct f2fs_acl_entry *)((char *)entry +
148 sizeof(struct f2fs_acl_entry));
149 break;
150 case ACL_USER_OBJ:
151 case ACL_GROUP_OBJ:
152 case ACL_MASK:
153 case ACL_OTHER:
154 entry = (struct f2fs_acl_entry *)((char *)entry +
155 sizeof(struct f2fs_acl_entry_short));
156 break;
157 default:
158 goto fail;
159 }
160 }
161 *size = f2fs_acl_size(acl->a_count);
162 return (void *)f2fs_acl;
163
164fail:
165 kfree(f2fs_acl);
166 return ERR_PTR(-EINVAL);
167}
168
169struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
170{
171 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
172 int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
173 void *value = NULL;
174 struct posix_acl *acl;
175 int retval;
176
177 if (!test_opt(sbi, POSIX_ACL))
178 return NULL;
179
180 acl = get_cached_acl(inode, type);
181 if (acl != ACL_NOT_CACHED)
182 return acl;
183
184 if (type == ACL_TYPE_ACCESS)
185 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
186
187 retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
188 if (retval > 0) {
189 value = kmalloc(retval, GFP_KERNEL);
190 if (!value)
191 return ERR_PTR(-ENOMEM);
192 retval = f2fs_getxattr(inode, name_index, "", value, retval);
193 }
194
195 if (retval < 0) {
196 if (retval == -ENODATA)
197 acl = NULL;
198 else
199 acl = ERR_PTR(retval);
200 } else {
201 acl = f2fs_acl_from_disk(value, retval);
202 }
203 kfree(value);
204 if (!IS_ERR(acl))
205 set_cached_acl(inode, type, acl);
206
207 return acl;
208}
209
210static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
211{
212 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
213 struct f2fs_inode_info *fi = F2FS_I(inode);
214 int name_index;
215 void *value = NULL;
216 size_t size = 0;
217 int error;
218
219 if (!test_opt(sbi, POSIX_ACL))
220 return 0;
221 if (S_ISLNK(inode->i_mode))
222 return -EOPNOTSUPP;
223
224 switch (type) {
225 case ACL_TYPE_ACCESS:
226 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
227 if (acl) {
228 error = posix_acl_equiv_mode(acl, &inode->i_mode);
229 if (error < 0)
230 return error;
231 set_acl_inode(fi, inode->i_mode);
232 if (error == 0)
233 acl = NULL;
234 }
235 break;
236
237 case ACL_TYPE_DEFAULT:
238 name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
239 if (!S_ISDIR(inode->i_mode))
240 return acl ? -EACCES : 0;
241 break;
242
243 default:
244 return -EINVAL;
245 }
246
247 if (acl) {
248 value = f2fs_acl_to_disk(acl, &size);
249 if (IS_ERR(value)) {
250 cond_clear_inode_flag(fi, FI_ACL_MODE);
251 return (int)PTR_ERR(value);
252 }
253 }
254
255 error = f2fs_setxattr(inode, name_index, "", value, size);
256
257 kfree(value);
258 if (!error)
259 set_cached_acl(inode, type, acl);
260
261 cond_clear_inode_flag(fi, FI_ACL_MODE);
262 return error;
263}
264
265int f2fs_init_acl(struct inode *inode, struct inode *dir)
266{
267 struct posix_acl *acl = NULL;
268 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
269 int error = 0;
270
271 if (!S_ISLNK(inode->i_mode)) {
272 if (test_opt(sbi, POSIX_ACL)) {
273 acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT);
274 if (IS_ERR(acl))
275 return PTR_ERR(acl);
276 }
277 if (!acl)
278 inode->i_mode &= ~current_umask();
279 }
280
281 if (test_opt(sbi, POSIX_ACL) && acl) {
282
283 if (S_ISDIR(inode->i_mode)) {
284 error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
285 if (error)
286 goto cleanup;
287 }
288 error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
289 if (error < 0)
290 return error;
291 if (error > 0)
292 error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
293 }
294cleanup:
295 posix_acl_release(acl);
296 return error;
297}
298
299int f2fs_acl_chmod(struct inode *inode)
300{
301 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
302 struct posix_acl *acl;
303 int error;
304 mode_t mode = get_inode_mode(inode);
305
306 if (!test_opt(sbi, POSIX_ACL))
307 return 0;
308 if (S_ISLNK(mode))
309 return -EOPNOTSUPP;
310
311 acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS);
312 if (IS_ERR(acl) || !acl)
313 return PTR_ERR(acl);
314
315 error = posix_acl_chmod(&acl, GFP_KERNEL, mode);
316 if (error)
317 return error;
318 error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
319 posix_acl_release(acl);
320 return error;
321}
322
323static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list,
324 size_t list_size, const char *name, size_t name_len, int type)
325{
326 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
327 const char *xname = POSIX_ACL_XATTR_DEFAULT;
328 size_t size;
329
330 if (!test_opt(sbi, POSIX_ACL))
331 return 0;
332
333 if (type == ACL_TYPE_ACCESS)
334 xname = POSIX_ACL_XATTR_ACCESS;
335
336 size = strlen(xname) + 1;
337 if (list && size <= list_size)
338 memcpy(list, xname, size);
339 return size;
340}
341
342static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name,
343 void *buffer, size_t size, int type)
344{
345 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
346 struct posix_acl *acl;
347 int error;
348
349 if (strcmp(name, "") != 0)
350 return -EINVAL;
351 if (!test_opt(sbi, POSIX_ACL))
352 return -EOPNOTSUPP;
353
354 acl = f2fs_get_acl(dentry->d_inode, type);
355 if (IS_ERR(acl))
356 return PTR_ERR(acl);
357 if (!acl)
358 return -ENODATA;
359 error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
360 posix_acl_release(acl);
361
362 return error;
363}
364
365static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
366 const void *value, size_t size, int flags, int type)
367{
368 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
369 struct inode *inode = dentry->d_inode;
370 struct posix_acl *acl = NULL;
371 int error;
372
373 if (strcmp(name, "") != 0)
374 return -EINVAL;
375 if (!test_opt(sbi, POSIX_ACL))
376 return -EOPNOTSUPP;
377 if (!inode_owner_or_capable(inode))
378 return -EPERM;
379
380 if (value) {
381 acl = posix_acl_from_xattr(&init_user_ns, value, size);
382 if (IS_ERR(acl))
383 return PTR_ERR(acl);
384 if (acl) {
385 error = posix_acl_valid(acl);
386 if (error)
387 goto release_and_out;
388 }
389 } else {
390 acl = NULL;
391 }
392
393 error = f2fs_set_acl(inode, type, acl);
394
395release_and_out:
396 posix_acl_release(acl);
397 return error;
398}
399
400const struct xattr_handler f2fs_xattr_acl_default_handler = {
401 .prefix = POSIX_ACL_XATTR_DEFAULT,
402 .flags = ACL_TYPE_DEFAULT,
403 .list = f2fs_xattr_list_acl,
404 .get = f2fs_xattr_get_acl,
405 .set = f2fs_xattr_set_acl,
406};
407
408const struct xattr_handler f2fs_xattr_acl_access_handler = {
409 .prefix = POSIX_ACL_XATTR_ACCESS,
410 .flags = ACL_TYPE_ACCESS,
411 .list = f2fs_xattr_list_acl,
412 .get = f2fs_xattr_get_acl,
413 .set = f2fs_xattr_set_acl,
414};
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
new file mode 100644
index 000000000000..80f430674417
--- /dev/null
+++ b/fs/f2fs/acl.h
@@ -0,0 +1,57 @@
1/*
2 * fs/f2fs/acl.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext2/acl.h
8 *
9 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15#ifndef __F2FS_ACL_H__
16#define __F2FS_ACL_H__
17
18#include <linux/posix_acl_xattr.h>
19
20#define F2FS_ACL_VERSION 0x0001
21
/* On-disk ACL entry that carries an id; all fields are little-endian. */
22struct f2fs_acl_entry {
23 __le16 e_tag;
24 __le16 e_perm;
25 __le32 e_id;
26};
27
/* On-disk ACL entry without an id field; same leading fields as above. */
28struct f2fs_acl_entry_short {
29 __le16 e_tag;
30 __le16 e_perm;
31};
32
/* Header that precedes the entries; a_version holds F2FS_ACL_VERSION. */
33struct f2fs_acl_header {
34 __le32 a_version;
35};
36
37#ifdef CONFIG_F2FS_FS_POSIX_ACL
38
39extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type);
40extern int f2fs_acl_chmod(struct inode *inode);
41extern int f2fs_init_acl(struct inode *inode, struct inode *dir);
42#else
43#define f2fs_check_acl NULL
44#define f2fs_get_acl NULL
45#define f2fs_set_acl NULL
46
/* Stub for builds without CONFIG_F2FS_FS_POSIX_ACL: always returns 0. */
47static inline int f2fs_acl_chmod(struct inode *inode)
48{
49 return 0;
50}
51
/* Stub for builds without CONFIG_F2FS_FS_POSIX_ACL: always returns 0. */
52static inline int f2fs_init_acl(struct inode *inode, struct inode *dir)
53{
54 return 0;
55}
56#endif
57#endif /* __F2FS_ACL_H__ */
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
new file mode 100644
index 000000000000..6ef36c37e2be
--- /dev/null
+++ b/fs/f2fs/checkpoint.c
@@ -0,0 +1,794 @@
1/*
2 * fs/f2fs/checkpoint.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/bio.h>
13#include <linux/mpage.h>
14#include <linux/writeback.h>
15#include <linux/blkdev.h>
16#include <linux/f2fs_fs.h>
17#include <linux/pagevec.h>
18#include <linux/swap.h>
19
20#include "f2fs.h"
21#include "node.h"
22#include "segment.h"
23
24static struct kmem_cache *orphan_entry_slab;
25static struct kmem_cache *inode_entry_slab;
26
27/*
28 * We guarantee no failure on the returned page.
29 */
30struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
31{
32 struct address_space *mapping = sbi->meta_inode->i_mapping;
33 struct page *page = NULL;
34repeat:
35 page = grab_cache_page(mapping, index);
36 if (!page) {
37 cond_resched();
38 goto repeat;
39 }
40
41 /* We wait writeback only inside grab_meta_page() */
42 wait_on_page_writeback(page);
43 SetPageUptodate(page);
44 return page;
45}
46
47/*
48 * We guarantee no failure on the returned page.
49 */
50struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
51{
52 struct address_space *mapping = sbi->meta_inode->i_mapping;
53 struct page *page;
54repeat:
55 page = grab_cache_page(mapping, index);
56 if (!page) {
57 cond_resched();
58 goto repeat;
59 }
60 if (f2fs_readpage(sbi, page, index, READ_SYNC)) {
61 f2fs_put_page(page, 1);
62 goto repeat;
63 }
64 mark_page_accessed(page);
65
66 /* We do not allow returning an errorneous page */
67 return page;
68}
69
70static int f2fs_write_meta_page(struct page *page,
71 struct writeback_control *wbc)
72{
73 struct inode *inode = page->mapping->host;
74 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
75 int err;
76
77 wait_on_page_writeback(page);
78
79 err = write_meta_page(sbi, page, wbc);
80 if (err) {
81 wbc->pages_skipped++;
82 set_page_dirty(page);
83 }
84
85 dec_page_count(sbi, F2FS_DIRTY_META);
86
87 /* In this case, we should not unlock this page */
88 if (err != AOP_WRITEPAGE_ACTIVATE)
89 unlock_page(page);
90 return err;
91}
92
93static int f2fs_write_meta_pages(struct address_space *mapping,
94 struct writeback_control *wbc)
95{
96 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
97 struct block_device *bdev = sbi->sb->s_bdev;
98 long written;
99
100 if (wbc->for_kupdate)
101 return 0;
102
103 if (get_pages(sbi, F2FS_DIRTY_META) == 0)
104 return 0;
105
106 /* if mounting is failed, skip writing node pages */
107 mutex_lock(&sbi->cp_mutex);
108 written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
109 mutex_unlock(&sbi->cp_mutex);
110 wbc->nr_to_write -= written;
111 return 0;
112}
113
114long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
115 long nr_to_write)
116{
117 struct address_space *mapping = sbi->meta_inode->i_mapping;
118 pgoff_t index = 0, end = LONG_MAX;
119 struct pagevec pvec;
120 long nwritten = 0;
121 struct writeback_control wbc = {
122 .for_reclaim = 0,
123 };
124
125 pagevec_init(&pvec, 0);
126
127 while (index <= end) {
128 int i, nr_pages;
129 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
130 PAGECACHE_TAG_DIRTY,
131 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
132 if (nr_pages == 0)
133 break;
134
135 for (i = 0; i < nr_pages; i++) {
136 struct page *page = pvec.pages[i];
137 lock_page(page);
138 BUG_ON(page->mapping != mapping);
139 BUG_ON(!PageDirty(page));
140 clear_page_dirty_for_io(page);
141 f2fs_write_meta_page(page, &wbc);
142 if (nwritten++ >= nr_to_write)
143 break;
144 }
145 pagevec_release(&pvec);
146 cond_resched();
147 }
148
149 if (nwritten)
150 f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
151
152 return nwritten;
153}
154
155static int f2fs_set_meta_page_dirty(struct page *page)
156{
157 struct address_space *mapping = page->mapping;
158 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
159
160 SetPageUptodate(page);
161 if (!PageDirty(page)) {
162 __set_page_dirty_nobuffers(page);
163 inc_page_count(sbi, F2FS_DIRTY_META);
164 F2FS_SET_SB_DIRT(sbi);
165 return 1;
166 }
167 return 0;
168}
169
170const struct address_space_operations f2fs_meta_aops = {
171 .writepage = f2fs_write_meta_page,
172 .writepages = f2fs_write_meta_pages,
173 .set_page_dirty = f2fs_set_meta_page_dirty,
174};
175
176int check_orphan_space(struct f2fs_sb_info *sbi)
177{
178 unsigned int max_orphans;
179 int err = 0;
180
181 /*
182 * considering 512 blocks in a segment 5 blocks are needed for cp
183 * and log segment summaries. Remaining blocks are used to keep
184 * orphan entries with the limitation one reserved segment
185 * for cp pack we can have max 1020*507 orphan entries
186 */
187 max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
188 mutex_lock(&sbi->orphan_inode_mutex);
189 if (sbi->n_orphans >= max_orphans)
190 err = -ENOSPC;
191 mutex_unlock(&sbi->orphan_inode_mutex);
192 return err;
193}
194
195void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
196{
197 struct list_head *head, *this;
198 struct orphan_inode_entry *new = NULL, *orphan = NULL;
199
200 mutex_lock(&sbi->orphan_inode_mutex);
201 head = &sbi->orphan_inode_list;
202 list_for_each(this, head) {
203 orphan = list_entry(this, struct orphan_inode_entry, list);
204 if (orphan->ino == ino)
205 goto out;
206 if (orphan->ino > ino)
207 break;
208 orphan = NULL;
209 }
210retry:
211 new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
212 if (!new) {
213 cond_resched();
214 goto retry;
215 }
216 new->ino = ino;
217 INIT_LIST_HEAD(&new->list);
218
219 /* add new_oentry into list which is sorted by inode number */
220 if (orphan) {
221 struct orphan_inode_entry *prev;
222
223 /* get previous entry */
224 prev = list_entry(orphan->list.prev, typeof(*prev), list);
225 if (&prev->list != head)
226 /* insert new orphan inode entry */
227 list_add(&new->list, &prev->list);
228 else
229 list_add(&new->list, head);
230 } else {
231 list_add_tail(&new->list, head);
232 }
233 sbi->n_orphans++;
234out:
235 mutex_unlock(&sbi->orphan_inode_mutex);
236}
237
238void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
239{
240 struct list_head *this, *next, *head;
241 struct orphan_inode_entry *orphan;
242
243 mutex_lock(&sbi->orphan_inode_mutex);
244 head = &sbi->orphan_inode_list;
245 list_for_each_safe(this, next, head) {
246 orphan = list_entry(this, struct orphan_inode_entry, list);
247 if (orphan->ino == ino) {
248 list_del(&orphan->list);
249 kmem_cache_free(orphan_entry_slab, orphan);
250 sbi->n_orphans--;
251 break;
252 }
253 }
254 mutex_unlock(&sbi->orphan_inode_mutex);
255}
256
257static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
258{
259 struct inode *inode = f2fs_iget(sbi->sb, ino);
260 BUG_ON(IS_ERR(inode));
261 clear_nlink(inode);
262
263 /* truncate all the data during iput */
264 iput(inode);
265}
266
267int recover_orphan_inodes(struct f2fs_sb_info *sbi)
268{
269 block_t start_blk, orphan_blkaddr, i, j;
270
271 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
272 return 0;
273
274 sbi->por_doing = 1;
275 start_blk = __start_cp_addr(sbi) + 1;
276 orphan_blkaddr = __start_sum_addr(sbi) - 1;
277
278 for (i = 0; i < orphan_blkaddr; i++) {
279 struct page *page = get_meta_page(sbi, start_blk + i);
280 struct f2fs_orphan_block *orphan_blk;
281
282 orphan_blk = (struct f2fs_orphan_block *)page_address(page);
283 for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) {
284 nid_t ino = le32_to_cpu(orphan_blk->ino[j]);
285 recover_orphan_inode(sbi, ino);
286 }
287 f2fs_put_page(page, 1);
288 }
289 /* clear Orphan Flag */
290 clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
291 sbi->por_doing = 0;
292 return 0;
293}
294
295static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
296{
297 struct list_head *head, *this, *next;
298 struct f2fs_orphan_block *orphan_blk = NULL;
299 struct page *page = NULL;
300 unsigned int nentries = 0;
301 unsigned short index = 1;
302 unsigned short orphan_blocks;
303
304 orphan_blocks = (unsigned short)((sbi->n_orphans +
305 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
306
307 mutex_lock(&sbi->orphan_inode_mutex);
308 head = &sbi->orphan_inode_list;
309
310 /* loop for each orphan inode entry and write them in Jornal block */
311 list_for_each_safe(this, next, head) {
312 struct orphan_inode_entry *orphan;
313
314 orphan = list_entry(this, struct orphan_inode_entry, list);
315
316 if (nentries == F2FS_ORPHANS_PER_BLOCK) {
317 /*
318 * an orphan block is full of 1020 entries,
319 * then we need to flush current orphan blocks
320 * and bring another one in memory
321 */
322 orphan_blk->blk_addr = cpu_to_le16(index);
323 orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
324 orphan_blk->entry_count = cpu_to_le32(nentries);
325 set_page_dirty(page);
326 f2fs_put_page(page, 1);
327 index++;
328 start_blk++;
329 nentries = 0;
330 page = NULL;
331 }
332 if (page)
333 goto page_exist;
334
335 page = grab_meta_page(sbi, start_blk);
336 orphan_blk = (struct f2fs_orphan_block *)page_address(page);
337 memset(orphan_blk, 0, sizeof(*orphan_blk));
338page_exist:
339 orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
340 }
341 if (!page)
342 goto end;
343
344 orphan_blk->blk_addr = cpu_to_le16(index);
345 orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
346 orphan_blk->entry_count = cpu_to_le32(nentries);
347 set_page_dirty(page);
348 f2fs_put_page(page, 1);
349end:
350 mutex_unlock(&sbi->orphan_inode_mutex);
351}
352
353static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
354 block_t cp_addr, unsigned long long *version)
355{
356 struct page *cp_page_1, *cp_page_2 = NULL;
357 unsigned long blk_size = sbi->blocksize;
358 struct f2fs_checkpoint *cp_block;
359 unsigned long long cur_version = 0, pre_version = 0;
360 unsigned int crc = 0;
361 size_t crc_offset;
362
363 /* Read the 1st cp block in this CP pack */
364 cp_page_1 = get_meta_page(sbi, cp_addr);
365
366 /* get the version number */
367 cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1);
368 crc_offset = le32_to_cpu(cp_block->checksum_offset);
369 if (crc_offset >= blk_size)
370 goto invalid_cp1;
371
372 crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
373 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
374 goto invalid_cp1;
375
376 pre_version = le64_to_cpu(cp_block->checkpoint_ver);
377
378 /* Read the 2nd cp block in this CP pack */
379 cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
380 cp_page_2 = get_meta_page(sbi, cp_addr);
381
382 cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2);
383 crc_offset = le32_to_cpu(cp_block->checksum_offset);
384 if (crc_offset >= blk_size)
385 goto invalid_cp2;
386
387 crc = *(unsigned int *)((unsigned char *)cp_block + crc_offset);
388 if (!f2fs_crc_valid(crc, cp_block, crc_offset))
389 goto invalid_cp2;
390
391 cur_version = le64_to_cpu(cp_block->checkpoint_ver);
392
393 if (cur_version == pre_version) {
394 *version = cur_version;
395 f2fs_put_page(cp_page_2, 1);
396 return cp_page_1;
397 }
398invalid_cp2:
399 f2fs_put_page(cp_page_2, 1);
400invalid_cp1:
401 f2fs_put_page(cp_page_1, 1);
402 return NULL;
403}
404
405int get_valid_checkpoint(struct f2fs_sb_info *sbi)
406{
407 struct f2fs_checkpoint *cp_block;
408 struct f2fs_super_block *fsb = sbi->raw_super;
409 struct page *cp1, *cp2, *cur_page;
410 unsigned long blk_size = sbi->blocksize;
411 unsigned long long cp1_version = 0, cp2_version = 0;
412 unsigned long long cp_start_blk_no;
413
414 sbi->ckpt = kzalloc(blk_size, GFP_KERNEL);
415 if (!sbi->ckpt)
416 return -ENOMEM;
417 /*
418 * Finding out valid cp block involves read both
419 * sets( cp pack1 and cp pack 2)
420 */
421 cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
422 cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
423
424 /* The second checkpoint pack should start at the next segment */
425 cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
426 cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
427
428 if (cp1 && cp2) {
429 if (ver_after(cp2_version, cp1_version))
430 cur_page = cp2;
431 else
432 cur_page = cp1;
433 } else if (cp1) {
434 cur_page = cp1;
435 } else if (cp2) {
436 cur_page = cp2;
437 } else {
438 goto fail_no_cp;
439 }
440
441 cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
442 memcpy(sbi->ckpt, cp_block, blk_size);
443
444 f2fs_put_page(cp1, 1);
445 f2fs_put_page(cp2, 1);
446 return 0;
447
448fail_no_cp:
449 kfree(sbi->ckpt);
450 return -EINVAL;
451}
452
453void set_dirty_dir_page(struct inode *inode, struct page *page)
454{
455 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
456 struct list_head *head = &sbi->dir_inode_list;
457 struct dir_inode_entry *new;
458 struct list_head *this;
459
460 if (!S_ISDIR(inode->i_mode))
461 return;
462retry:
463 new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
464 if (!new) {
465 cond_resched();
466 goto retry;
467 }
468 new->inode = inode;
469 INIT_LIST_HEAD(&new->list);
470
471 spin_lock(&sbi->dir_inode_lock);
472 list_for_each(this, head) {
473 struct dir_inode_entry *entry;
474 entry = list_entry(this, struct dir_inode_entry, list);
475 if (entry->inode == inode) {
476 kmem_cache_free(inode_entry_slab, new);
477 goto out;
478 }
479 }
480 list_add_tail(&new->list, head);
481 sbi->n_dirty_dirs++;
482
483 BUG_ON(!S_ISDIR(inode->i_mode));
484out:
485 inc_page_count(sbi, F2FS_DIRTY_DENTS);
486 inode_inc_dirty_dents(inode);
487 SetPagePrivate(page);
488
489 spin_unlock(&sbi->dir_inode_lock);
490}
491
492void remove_dirty_dir_inode(struct inode *inode)
493{
494 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
495 struct list_head *head = &sbi->dir_inode_list;
496 struct list_head *this;
497
498 if (!S_ISDIR(inode->i_mode))
499 return;
500
501 spin_lock(&sbi->dir_inode_lock);
502 if (atomic_read(&F2FS_I(inode)->dirty_dents))
503 goto out;
504
505 list_for_each(this, head) {
506 struct dir_inode_entry *entry;
507 entry = list_entry(this, struct dir_inode_entry, list);
508 if (entry->inode == inode) {
509 list_del(&entry->list);
510 kmem_cache_free(inode_entry_slab, entry);
511 sbi->n_dirty_dirs--;
512 break;
513 }
514 }
515out:
516 spin_unlock(&sbi->dir_inode_lock);
517}
518
519void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
520{
521 struct list_head *head = &sbi->dir_inode_list;
522 struct dir_inode_entry *entry;
523 struct inode *inode;
524retry:
525 spin_lock(&sbi->dir_inode_lock);
526 if (list_empty(head)) {
527 spin_unlock(&sbi->dir_inode_lock);
528 return;
529 }
530 entry = list_entry(head->next, struct dir_inode_entry, list);
531 inode = igrab(entry->inode);
532 spin_unlock(&sbi->dir_inode_lock);
533 if (inode) {
534 filemap_flush(inode->i_mapping);
535 iput(inode);
536 } else {
537 /*
538 * We should submit bio, since it exists several
539 * wribacking dentry pages in the freeing inode.
540 */
541 f2fs_submit_bio(sbi, DATA, true);
542 }
543 goto retry;
544}
545
546/*
547 * Freeze all the FS-operations for checkpoint.
548 */
549void block_operations(struct f2fs_sb_info *sbi)
550{
551 int t;
552 struct writeback_control wbc = {
553 .sync_mode = WB_SYNC_ALL,
554 .nr_to_write = LONG_MAX,
555 .for_reclaim = 0,
556 };
557
558 /* Stop renaming operation */
559 mutex_lock_op(sbi, RENAME);
560 mutex_lock_op(sbi, DENTRY_OPS);
561
562retry_dents:
563 /* write all the dirty dentry pages */
564 sync_dirty_dir_inodes(sbi);
565
566 mutex_lock_op(sbi, DATA_WRITE);
567 if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
568 mutex_unlock_op(sbi, DATA_WRITE);
569 goto retry_dents;
570 }
571
572 /* block all the operations */
573 for (t = DATA_NEW; t <= NODE_TRUNC; t++)
574 mutex_lock_op(sbi, t);
575
576 mutex_lock(&sbi->write_inode);
577
578 /*
579 * POR: we should ensure that there is no dirty node pages
580 * until finishing nat/sit flush.
581 */
582retry:
583 sync_node_pages(sbi, 0, &wbc);
584
585 mutex_lock_op(sbi, NODE_WRITE);
586
587 if (get_pages(sbi, F2FS_DIRTY_NODES)) {
588 mutex_unlock_op(sbi, NODE_WRITE);
589 goto retry;
590 }
591 mutex_unlock(&sbi->write_inode);
592}
593
594static void unblock_operations(struct f2fs_sb_info *sbi)
595{
596 int t;
597 for (t = NODE_WRITE; t >= RENAME; t--)
598 mutex_unlock_op(sbi, t);
599}
600
601static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
602{
603 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
604 nid_t last_nid = 0;
605 block_t start_blk;
606 struct page *cp_page;
607 unsigned int data_sum_blocks, orphan_blocks;
608 unsigned int crc32 = 0;
609 void *kaddr;
610 int i;
611
612 /* Flush all the NAT/SIT pages */
613 while (get_pages(sbi, F2FS_DIRTY_META))
614 sync_meta_pages(sbi, META, LONG_MAX);
615
616 next_free_nid(sbi, &last_nid);
617
618 /*
619 * modify checkpoint
620 * version number is already updated
621 */
622 ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
623 ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
624 ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
625 for (i = 0; i < 3; i++) {
626 ckpt->cur_node_segno[i] =
627 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
628 ckpt->cur_node_blkoff[i] =
629 cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_NODE));
630 ckpt->alloc_type[i + CURSEG_HOT_NODE] =
631 curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
632 }
633 for (i = 0; i < 3; i++) {
634 ckpt->cur_data_segno[i] =
635 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
636 ckpt->cur_data_blkoff[i] =
637 cpu_to_le16(curseg_blkoff(sbi, i + CURSEG_HOT_DATA));
638 ckpt->alloc_type[i + CURSEG_HOT_DATA] =
639 curseg_alloc_type(sbi, i + CURSEG_HOT_DATA);
640 }
641
642 ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi));
643 ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi));
644 ckpt->next_free_nid = cpu_to_le32(last_nid);
645
646 /* 2 cp + n data seg summary + orphan inode blocks */
647 data_sum_blocks = npages_for_summary_flush(sbi);
648 if (data_sum_blocks < 3)
649 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
650 else
651 clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
652
653 orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
654 / F2FS_ORPHANS_PER_BLOCK;
655 ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks);
656
657 if (is_umount) {
658 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
659 ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
660 data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE);
661 } else {
662 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
663 ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
664 data_sum_blocks + orphan_blocks);
665 }
666
667 if (sbi->n_orphans)
668 set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
669 else
670 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
671
672 /* update SIT/NAT bitmap */
673 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
674 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
675
676 crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset));
677 *(__le32 *)((unsigned char *)ckpt +
678 le32_to_cpu(ckpt->checksum_offset))
679 = cpu_to_le32(crc32);
680
681 start_blk = __start_cp_addr(sbi);
682
683 /* write out checkpoint buffer at block 0 */
684 cp_page = grab_meta_page(sbi, start_blk++);
685 kaddr = page_address(cp_page);
686 memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
687 set_page_dirty(cp_page);
688 f2fs_put_page(cp_page, 1);
689
690 if (sbi->n_orphans) {
691 write_orphan_inodes(sbi, start_blk);
692 start_blk += orphan_blocks;
693 }
694
695 write_data_summaries(sbi, start_blk);
696 start_blk += data_sum_blocks;
697 if (is_umount) {
698 write_node_summaries(sbi, start_blk);
699 start_blk += NR_CURSEG_NODE_TYPE;
700 }
701
702 /* writeout checkpoint block */
703 cp_page = grab_meta_page(sbi, start_blk);
704 kaddr = page_address(cp_page);
705 memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
706 set_page_dirty(cp_page);
707 f2fs_put_page(cp_page, 1);
708
709 /* wait for previous submitted node/meta pages writeback */
710 while (get_pages(sbi, F2FS_WRITEBACK))
711 congestion_wait(BLK_RW_ASYNC, HZ / 50);
712
713 filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
714 filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
715
716 /* update user_block_counts */
717 sbi->last_valid_block_count = sbi->total_valid_block_count;
718 sbi->alloc_valid_block_count = 0;
719
720 /* Here, we only have one bio having CP pack */
721 if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))
722 sbi->sb->s_flags |= MS_RDONLY;
723 else
724 sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
725
726 clear_prefree_segments(sbi);
727 F2FS_RESET_SB_DIRT(sbi);
728}
729
730/*
731 * We guarantee that this checkpoint procedure should not fail.
732 */
733void write_checkpoint(struct f2fs_sb_info *sbi, bool blocked, bool is_umount)
734{
735 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
736 unsigned long long ckpt_ver;
737
738 if (!blocked) {
739 mutex_lock(&sbi->cp_mutex);
740 block_operations(sbi);
741 }
742
743 f2fs_submit_bio(sbi, DATA, true);
744 f2fs_submit_bio(sbi, NODE, true);
745 f2fs_submit_bio(sbi, META, true);
746
747 /*
748 * update checkpoint pack index
749 * Increase the version number so that
750 * SIT entries and seg summaries are written at correct place
751 */
752 ckpt_ver = le64_to_cpu(ckpt->checkpoint_ver);
753 ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);
754
755 /* write cached NAT/SIT entries to NAT/SIT area */
756 flush_nat_entries(sbi);
757 flush_sit_entries(sbi);
758
759 reset_victim_segmap(sbi);
760
761 /* unlock all the fs_lock[] in do_checkpoint() */
762 do_checkpoint(sbi, is_umount);
763
764 unblock_operations(sbi);
765 mutex_unlock(&sbi->cp_mutex);
766}
767
768void init_orphan_info(struct f2fs_sb_info *sbi)
769{
770 mutex_init(&sbi->orphan_inode_mutex);
771 INIT_LIST_HEAD(&sbi->orphan_inode_list);
772 sbi->n_orphans = 0;
773}
774
775int create_checkpoint_caches(void)
776{
777 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
778 sizeof(struct orphan_inode_entry), NULL);
779 if (unlikely(!orphan_entry_slab))
780 return -ENOMEM;
781 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
782 sizeof(struct dir_inode_entry), NULL);
783 if (unlikely(!inode_entry_slab)) {
784 kmem_cache_destroy(orphan_entry_slab);
785 return -ENOMEM;
786 }
787 return 0;
788}
789
790void destroy_checkpoint_caches(void)
791{
792 kmem_cache_destroy(orphan_entry_slab);
793 kmem_cache_destroy(inode_entry_slab);
794}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
new file mode 100644
index 000000000000..655aeabc1dd4
--- /dev/null
+++ b/fs/f2fs/data.c
@@ -0,0 +1,702 @@
1/*
2 * fs/f2fs/data.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/buffer_head.h>
14#include <linux/mpage.h>
15#include <linux/writeback.h>
16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
18#include <linux/bio.h>
19
20#include "f2fs.h"
21#include "node.h"
22#include "segment.h"
23
24/*
25 * Lock ordering for the change of data block address:
26 * ->data_page
27 * ->node_page
28 * update block addresses in the node page
29 */
30static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
31{
32 struct f2fs_node *rn;
33 __le32 *addr_array;
34 struct page *node_page = dn->node_page;
35 unsigned int ofs_in_node = dn->ofs_in_node;
36
37 wait_on_page_writeback(node_page);
38
39 rn = (struct f2fs_node *)page_address(node_page);
40
41 /* Get physical address of data block */
42 addr_array = blkaddr_in_node(rn);
43 addr_array[ofs_in_node] = cpu_to_le32(new_addr);
44 set_page_dirty(node_page);
45}
46
47int reserve_new_block(struct dnode_of_data *dn)
48{
49 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
50
51 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
52 return -EPERM;
53 if (!inc_valid_block_count(sbi, dn->inode, 1))
54 return -ENOSPC;
55
56 __set_data_blkaddr(dn, NEW_ADDR);
57 dn->data_blkaddr = NEW_ADDR;
58 sync_inode_page(dn);
59 return 0;
60}
61
62static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
63 struct buffer_head *bh_result)
64{
65 struct f2fs_inode_info *fi = F2FS_I(inode);
66 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
67 pgoff_t start_fofs, end_fofs;
68 block_t start_blkaddr;
69
70 read_lock(&fi->ext.ext_lock);
71 if (fi->ext.len == 0) {
72 read_unlock(&fi->ext.ext_lock);
73 return 0;
74 }
75
76 sbi->total_hit_ext++;
77 start_fofs = fi->ext.fofs;
78 end_fofs = fi->ext.fofs + fi->ext.len - 1;
79 start_blkaddr = fi->ext.blk_addr;
80
81 if (pgofs >= start_fofs && pgofs <= end_fofs) {
82 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
83 size_t count;
84
85 clear_buffer_new(bh_result);
86 map_bh(bh_result, inode->i_sb,
87 start_blkaddr + pgofs - start_fofs);
88 count = end_fofs - pgofs + 1;
89 if (count < (UINT_MAX >> blkbits))
90 bh_result->b_size = (count << blkbits);
91 else
92 bh_result->b_size = UINT_MAX;
93
94 sbi->read_hit_ext++;
95 read_unlock(&fi->ext.ext_lock);
96 return 1;
97 }
98 read_unlock(&fi->ext.ext_lock);
99 return 0;
100}
101
102void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
103{
104 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
105 pgoff_t fofs, start_fofs, end_fofs;
106 block_t start_blkaddr, end_blkaddr;
107
108 BUG_ON(blk_addr == NEW_ADDR);
109 fofs = start_bidx_of_node(ofs_of_node(dn->node_page)) + dn->ofs_in_node;
110
111 /* Update the page address in the parent node */
112 __set_data_blkaddr(dn, blk_addr);
113
114 write_lock(&fi->ext.ext_lock);
115
116 start_fofs = fi->ext.fofs;
117 end_fofs = fi->ext.fofs + fi->ext.len - 1;
118 start_blkaddr = fi->ext.blk_addr;
119 end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1;
120
121 /* Drop and initialize the matched extent */
122 if (fi->ext.len == 1 && fofs == start_fofs)
123 fi->ext.len = 0;
124
125 /* Initial extent */
126 if (fi->ext.len == 0) {
127 if (blk_addr != NULL_ADDR) {
128 fi->ext.fofs = fofs;
129 fi->ext.blk_addr = blk_addr;
130 fi->ext.len = 1;
131 }
132 goto end_update;
133 }
134
135 /* Frone merge */
136 if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
137 fi->ext.fofs--;
138 fi->ext.blk_addr--;
139 fi->ext.len++;
140 goto end_update;
141 }
142
143 /* Back merge */
144 if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
145 fi->ext.len++;
146 goto end_update;
147 }
148
149 /* Split the existing extent */
150 if (fi->ext.len > 1 &&
151 fofs >= start_fofs && fofs <= end_fofs) {
152 if ((end_fofs - fofs) < (fi->ext.len >> 1)) {
153 fi->ext.len = fofs - start_fofs;
154 } else {
155 fi->ext.fofs = fofs + 1;
156 fi->ext.blk_addr = start_blkaddr +
157 fofs - start_fofs + 1;
158 fi->ext.len -= fofs - start_fofs + 1;
159 }
160 goto end_update;
161 }
162 write_unlock(&fi->ext.ext_lock);
163 return;
164
165end_update:
166 write_unlock(&fi->ext.ext_lock);
167 sync_inode_page(dn);
168 return;
169}
170
171struct page *find_data_page(struct inode *inode, pgoff_t index)
172{
173 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
174 struct address_space *mapping = inode->i_mapping;
175 struct dnode_of_data dn;
176 struct page *page;
177 int err;
178
179 page = find_get_page(mapping, index);
180 if (page && PageUptodate(page))
181 return page;
182 f2fs_put_page(page, 0);
183
184 set_new_dnode(&dn, inode, NULL, NULL, 0);
185 err = get_dnode_of_data(&dn, index, RDONLY_NODE);
186 if (err)
187 return ERR_PTR(err);
188 f2fs_put_dnode(&dn);
189
190 if (dn.data_blkaddr == NULL_ADDR)
191 return ERR_PTR(-ENOENT);
192
193 /* By fallocate(), there is no cached page, but with NEW_ADDR */
194 if (dn.data_blkaddr == NEW_ADDR)
195 return ERR_PTR(-EINVAL);
196
197 page = grab_cache_page(mapping, index);
198 if (!page)
199 return ERR_PTR(-ENOMEM);
200
201 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
202 if (err) {
203 f2fs_put_page(page, 1);
204 return ERR_PTR(err);
205 }
206 unlock_page(page);
207 return page;
208}
209
210/*
211 * If it tries to access a hole, return an error.
212 * Because, the callers, functions in dir.c and GC, should be able to know
213 * whether this page exists or not.
214 */
215struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
216{
217 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
218 struct address_space *mapping = inode->i_mapping;
219 struct dnode_of_data dn;
220 struct page *page;
221 int err;
222
223 set_new_dnode(&dn, inode, NULL, NULL, 0);
224 err = get_dnode_of_data(&dn, index, RDONLY_NODE);
225 if (err)
226 return ERR_PTR(err);
227 f2fs_put_dnode(&dn);
228
229 if (dn.data_blkaddr == NULL_ADDR)
230 return ERR_PTR(-ENOENT);
231
232 page = grab_cache_page(mapping, index);
233 if (!page)
234 return ERR_PTR(-ENOMEM);
235
236 if (PageUptodate(page))
237 return page;
238
239 BUG_ON(dn.data_blkaddr == NEW_ADDR);
240 BUG_ON(dn.data_blkaddr == NULL_ADDR);
241
242 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
243 if (err) {
244 f2fs_put_page(page, 1);
245 return ERR_PTR(err);
246 }
247 return page;
248}
249
250/*
251 * Caller ensures that this data page is never allocated.
252 * A new zero-filled data page is allocated in the page cache.
253 */
254struct page *get_new_data_page(struct inode *inode, pgoff_t index,
255 bool new_i_size)
256{
257 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
258 struct address_space *mapping = inode->i_mapping;
259 struct page *page;
260 struct dnode_of_data dn;
261 int err;
262
263 set_new_dnode(&dn, inode, NULL, NULL, 0);
264 err = get_dnode_of_data(&dn, index, 0);
265 if (err)
266 return ERR_PTR(err);
267
268 if (dn.data_blkaddr == NULL_ADDR) {
269 if (reserve_new_block(&dn)) {
270 f2fs_put_dnode(&dn);
271 return ERR_PTR(-ENOSPC);
272 }
273 }
274 f2fs_put_dnode(&dn);
275
276 page = grab_cache_page(mapping, index);
277 if (!page)
278 return ERR_PTR(-ENOMEM);
279
280 if (PageUptodate(page))
281 return page;
282
283 if (dn.data_blkaddr == NEW_ADDR) {
284 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
285 } else {
286 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
287 if (err) {
288 f2fs_put_page(page, 1);
289 return ERR_PTR(err);
290 }
291 }
292 SetPageUptodate(page);
293
294 if (new_i_size &&
295 i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
296 i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
297 mark_inode_dirty_sync(inode);
298 }
299 return page;
300}
301
302static void read_end_io(struct bio *bio, int err)
303{
304 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
305 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
306
307 do {
308 struct page *page = bvec->bv_page;
309
310 if (--bvec >= bio->bi_io_vec)
311 prefetchw(&bvec->bv_page->flags);
312
313 if (uptodate) {
314 SetPageUptodate(page);
315 } else {
316 ClearPageUptodate(page);
317 SetPageError(page);
318 }
319 unlock_page(page);
320 } while (bvec >= bio->bi_io_vec);
321 kfree(bio->bi_private);
322 bio_put(bio);
323}
324
325/*
326 * Fill the locked page with data located in the block address.
327 * Read operation is synchronous, and caller must unlock the page.
328 */
329int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
330 block_t blk_addr, int type)
331{
332 struct block_device *bdev = sbi->sb->s_bdev;
333 bool sync = (type == READ_SYNC);
334 struct bio *bio;
335
336 /* This page can be already read by other threads */
337 if (PageUptodate(page)) {
338 if (!sync)
339 unlock_page(page);
340 return 0;
341 }
342
343 down_read(&sbi->bio_sem);
344
345 /* Allocate a new bio */
346 bio = f2fs_bio_alloc(bdev, 1);
347
348 /* Initialize the bio */
349 bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
350 bio->bi_end_io = read_end_io;
351
352 if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
353 kfree(bio->bi_private);
354 bio_put(bio);
355 up_read(&sbi->bio_sem);
356 return -EFAULT;
357 }
358
359 submit_bio(type, bio);
360 up_read(&sbi->bio_sem);
361
362 /* wait for read completion if sync */
363 if (sync) {
364 lock_page(page);
365 if (PageError(page))
366 return -EIO;
367 }
368 return 0;
369}
370
371/*
372 * This function should be used by the data read flow only where it
373 * does not check the "create" flag that indicates block allocation.
374 * The reason for this special functionality is to exploit VFS readahead
375 * mechanism.
376 */
377static int get_data_block_ro(struct inode *inode, sector_t iblock,
378 struct buffer_head *bh_result, int create)
379{
380 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
381 unsigned maxblocks = bh_result->b_size >> blkbits;
382 struct dnode_of_data dn;
383 pgoff_t pgofs;
384 int err;
385
386 /* Get the page offset from the block offset(iblock) */
387 pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
388
389 if (check_extent_cache(inode, pgofs, bh_result))
390 return 0;
391
392 /* When reading holes, we need its node page */
393 set_new_dnode(&dn, inode, NULL, NULL, 0);
394 err = get_dnode_of_data(&dn, pgofs, RDONLY_NODE);
395 if (err)
396 return (err == -ENOENT) ? 0 : err;
397
398 /* It does not support data allocation */
399 BUG_ON(create);
400
401 if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
402 int i;
403 unsigned int end_offset;
404
405 end_offset = IS_INODE(dn.node_page) ?
406 ADDRS_PER_INODE :
407 ADDRS_PER_BLOCK;
408
409 clear_buffer_new(bh_result);
410
411 /* Give more consecutive addresses for the read ahead */
412 for (i = 0; i < end_offset - dn.ofs_in_node; i++)
413 if (((datablock_addr(dn.node_page,
414 dn.ofs_in_node + i))
415 != (dn.data_blkaddr + i)) || maxblocks == i)
416 break;
417 map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
418 bh_result->b_size = (i << blkbits);
419 }
420 f2fs_put_dnode(&dn);
421 return 0;
422}
423
424static int f2fs_read_data_page(struct file *file, struct page *page)
425{
426 return mpage_readpage(page, get_data_block_ro);
427}
428
429static int f2fs_read_data_pages(struct file *file,
430 struct address_space *mapping,
431 struct list_head *pages, unsigned nr_pages)
432{
433 return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
434}
435
436int do_write_data_page(struct page *page)
437{
438 struct inode *inode = page->mapping->host;
439 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
440 block_t old_blk_addr, new_blk_addr;
441 struct dnode_of_data dn;
442 int err = 0;
443
444 set_new_dnode(&dn, inode, NULL, NULL, 0);
445 err = get_dnode_of_data(&dn, page->index, RDONLY_NODE);
446 if (err)
447 return err;
448
449 old_blk_addr = dn.data_blkaddr;
450
451 /* This page is already truncated */
452 if (old_blk_addr == NULL_ADDR)
453 goto out_writepage;
454
455 set_page_writeback(page);
456
457 /*
458 * If current allocation needs SSR,
459 * it had better in-place writes for updated data.
460 */
461 if (old_blk_addr != NEW_ADDR && !is_cold_data(page) &&
462 need_inplace_update(inode)) {
463 rewrite_data_page(F2FS_SB(inode->i_sb), page,
464 old_blk_addr);
465 } else {
466 write_data_page(inode, page, &dn,
467 old_blk_addr, &new_blk_addr);
468 update_extent_cache(new_blk_addr, &dn);
469 F2FS_I(inode)->data_version =
470 le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
471 }
472out_writepage:
473 f2fs_put_dnode(&dn);
474 return err;
475}
476
477static int f2fs_write_data_page(struct page *page,
478 struct writeback_control *wbc)
479{
480 struct inode *inode = page->mapping->host;
481 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
482 loff_t i_size = i_size_read(inode);
483 const pgoff_t end_index = ((unsigned long long) i_size)
484 >> PAGE_CACHE_SHIFT;
485 unsigned offset;
486 int err = 0;
487
488 if (page->index < end_index)
489 goto out;
490
491 /*
492 * If the offset is out-of-range of file size,
493 * this page does not have to be written to disk.
494 */
495 offset = i_size & (PAGE_CACHE_SIZE - 1);
496 if ((page->index >= end_index + 1) || !offset) {
497 if (S_ISDIR(inode->i_mode)) {
498 dec_page_count(sbi, F2FS_DIRTY_DENTS);
499 inode_dec_dirty_dents(inode);
500 }
501 goto unlock_out;
502 }
503
504 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
505out:
506 if (sbi->por_doing)
507 goto redirty_out;
508
509 if (wbc->for_reclaim && !S_ISDIR(inode->i_mode) && !is_cold_data(page))
510 goto redirty_out;
511
512 mutex_lock_op(sbi, DATA_WRITE);
513 if (S_ISDIR(inode->i_mode)) {
514 dec_page_count(sbi, F2FS_DIRTY_DENTS);
515 inode_dec_dirty_dents(inode);
516 }
517 err = do_write_data_page(page);
518 if (err && err != -ENOENT) {
519 wbc->pages_skipped++;
520 set_page_dirty(page);
521 }
522 mutex_unlock_op(sbi, DATA_WRITE);
523
524 if (wbc->for_reclaim)
525 f2fs_submit_bio(sbi, DATA, true);
526
527 if (err == -ENOENT)
528 goto unlock_out;
529
530 clear_cold_data(page);
531 unlock_page(page);
532
533 if (!wbc->for_reclaim && !S_ISDIR(inode->i_mode))
534 f2fs_balance_fs(sbi);
535 return 0;
536
537unlock_out:
538 unlock_page(page);
539 return (err == -ENOENT) ? 0 : err;
540
541redirty_out:
542 wbc->pages_skipped++;
543 set_page_dirty(page);
544 return AOP_WRITEPAGE_ACTIVATE;
545}
546
547#define MAX_DESIRED_PAGES_WP 4096
548
549static int f2fs_write_data_pages(struct address_space *mapping,
550 struct writeback_control *wbc)
551{
552 struct inode *inode = mapping->host;
553 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
554 int ret;
555 long excess_nrtw = 0, desired_nrtw;
556
557 if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
558 desired_nrtw = MAX_DESIRED_PAGES_WP;
559 excess_nrtw = desired_nrtw - wbc->nr_to_write;
560 wbc->nr_to_write = desired_nrtw;
561 }
562
563 if (!S_ISDIR(inode->i_mode))
564 mutex_lock(&sbi->writepages);
565 ret = generic_writepages(mapping, wbc);
566 if (!S_ISDIR(inode->i_mode))
567 mutex_unlock(&sbi->writepages);
568 f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
569
570 remove_dirty_dir_inode(inode);
571
572 wbc->nr_to_write -= excess_nrtw;
573 return ret;
574}
575
576static int f2fs_write_begin(struct file *file, struct address_space *mapping,
577 loff_t pos, unsigned len, unsigned flags,
578 struct page **pagep, void **fsdata)
579{
580 struct inode *inode = mapping->host;
581 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
582 struct page *page;
583 pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
584 struct dnode_of_data dn;
585 int err = 0;
586
587 /* for nobh_write_end */
588 *fsdata = NULL;
589
590 f2fs_balance_fs(sbi);
591
592 page = grab_cache_page_write_begin(mapping, index, flags);
593 if (!page)
594 return -ENOMEM;
595 *pagep = page;
596
597 mutex_lock_op(sbi, DATA_NEW);
598
599 set_new_dnode(&dn, inode, NULL, NULL, 0);
600 err = get_dnode_of_data(&dn, index, 0);
601 if (err) {
602 mutex_unlock_op(sbi, DATA_NEW);
603 f2fs_put_page(page, 1);
604 return err;
605 }
606
607 if (dn.data_blkaddr == NULL_ADDR) {
608 err = reserve_new_block(&dn);
609 if (err) {
610 f2fs_put_dnode(&dn);
611 mutex_unlock_op(sbi, DATA_NEW);
612 f2fs_put_page(page, 1);
613 return err;
614 }
615 }
616 f2fs_put_dnode(&dn);
617
618 mutex_unlock_op(sbi, DATA_NEW);
619
620 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
621 return 0;
622
623 if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
624 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
625 unsigned end = start + len;
626
627 /* Reading beyond i_size is simple: memset to zero */
628 zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
629 return 0;
630 }
631
632 if (dn.data_blkaddr == NEW_ADDR) {
633 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
634 } else {
635 err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
636 if (err) {
637 f2fs_put_page(page, 1);
638 return err;
639 }
640 }
641 SetPageUptodate(page);
642 clear_cold_data(page);
643 return 0;
644}
645
646static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
647 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
648{
649 struct file *file = iocb->ki_filp;
650 struct inode *inode = file->f_mapping->host;
651
652 if (rw == WRITE)
653 return 0;
654
655 /* Needs synchronization with the cleaner */
656 return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
657 get_data_block_ro);
658}
659
660static void f2fs_invalidate_data_page(struct page *page, unsigned long offset)
661{
662 struct inode *inode = page->mapping->host;
663 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
664 if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
665 dec_page_count(sbi, F2FS_DIRTY_DENTS);
666 inode_dec_dirty_dents(inode);
667 }
668 ClearPagePrivate(page);
669}
670
671static int f2fs_release_data_page(struct page *page, gfp_t wait)
672{
673 ClearPagePrivate(page);
674 return 0;
675}
676
677static int f2fs_set_data_page_dirty(struct page *page)
678{
679 struct address_space *mapping = page->mapping;
680 struct inode *inode = mapping->host;
681
682 SetPageUptodate(page);
683 if (!PageDirty(page)) {
684 __set_page_dirty_nobuffers(page);
685 set_dirty_dir_page(inode, page);
686 return 1;
687 }
688 return 0;
689}
690
691const struct address_space_operations f2fs_dblock_aops = {
692 .readpage = f2fs_read_data_page,
693 .readpages = f2fs_read_data_pages,
694 .writepage = f2fs_write_data_page,
695 .writepages = f2fs_write_data_pages,
696 .write_begin = f2fs_write_begin,
697 .write_end = nobh_write_end,
698 .set_page_dirty = f2fs_set_data_page_dirty,
699 .invalidatepage = f2fs_invalidate_data_page,
700 .releasepage = f2fs_release_data_page,
701 .direct_IO = f2fs_direct_IO,
702};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
new file mode 100644
index 000000000000..0e0380a588ad
--- /dev/null
+++ b/fs/f2fs/debug.c
@@ -0,0 +1,361 @@
1/*
2 * f2fs debugging statistics
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 * Copyright (c) 2012 Linux Foundation
7 * Copyright (c) 2012 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 2 as
11 * published by the Free Software Foundation.
12 */
13
14#include <linux/fs.h>
15#include <linux/backing-dev.h>
16#include <linux/proc_fs.h>
17#include <linux/f2fs_fs.h>
18#include <linux/blkdev.h>
19#include <linux/debugfs.h>
20#include <linux/seq_file.h>
21
22#include "f2fs.h"
23#include "node.h"
24#include "segment.h"
25#include "gc.h"
26
/* debugfs directory created by f2fs_build_stats(); NULL until then */
static struct dentry *debugfs_root;
/* every f2fs_stat_info registered by init_stats() is linked here */
static LIST_HEAD(f2fs_stat_list);
29
30static void update_general_status(struct f2fs_sb_info *sbi)
31{
32 struct f2fs_stat_info *si = sbi->stat_info;
33 int i;
34
35 /* valid check of the segment numbers */
36 si->hit_ext = sbi->read_hit_ext;
37 si->total_ext = sbi->total_hit_ext;
38 si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
39 si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
40 si->ndirty_dirs = sbi->n_dirty_dirs;
41 si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
42 si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
43 si->rsvd_segs = reserved_segments(sbi);
44 si->overp_segs = overprovision_segments(sbi);
45 si->valid_count = valid_user_blocks(sbi);
46 si->valid_node_count = valid_node_count(sbi);
47 si->valid_inode_count = valid_inode_count(sbi);
48 si->utilization = utilization(sbi);
49
50 si->free_segs = free_segments(sbi);
51 si->free_secs = free_sections(sbi);
52 si->prefree_count = prefree_segments(sbi);
53 si->dirty_count = dirty_segments(sbi);
54 si->node_pages = sbi->node_inode->i_mapping->nrpages;
55 si->meta_pages = sbi->meta_inode->i_mapping->nrpages;
56 si->nats = NM_I(sbi)->nat_cnt;
57 si->sits = SIT_I(sbi)->dirty_sentries;
58 si->fnids = NM_I(sbi)->fcnt;
59 si->bg_gc = sbi->bg_gc;
60 si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
61 * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
62 / 2;
63 si->util_valid = (int)(written_block_count(sbi) >>
64 sbi->log_blocks_per_seg)
65 * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
66 / 2;
67 si->util_invalid = 50 - si->util_free - si->util_valid;
68 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) {
69 struct curseg_info *curseg = CURSEG_I(sbi, i);
70 si->curseg[i] = curseg->segno;
71 si->cursec[i] = curseg->segno / sbi->segs_per_sec;
72 si->curzone[i] = si->cursec[i] / sbi->secs_per_zone;
73 }
74
75 for (i = 0; i < 2; i++) {
76 si->segment_count[i] = sbi->segment_count[i];
77 si->block_count[i] = sbi->block_count[i];
78 }
79}
80
81/*
82 * This function calculates BDF of every segments
83 */
84static void update_sit_info(struct f2fs_sb_info *sbi)
85{
86 struct f2fs_stat_info *si = sbi->stat_info;
87 unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
88 struct sit_info *sit_i = SIT_I(sbi);
89 unsigned int segno, vblocks;
90 int ndirty = 0;
91
92 bimodal = 0;
93 total_vblocks = 0;
94 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
95 hblks_per_sec = blks_per_sec / 2;
96 mutex_lock(&sit_i->sentry_lock);
97 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
98 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
99 dist = abs(vblocks - hblks_per_sec);
100 bimodal += dist * dist;
101
102 if (vblocks > 0 && vblocks < blks_per_sec) {
103 total_vblocks += vblocks;
104 ndirty++;
105 }
106 }
107 mutex_unlock(&sit_i->sentry_lock);
108 dist = sbi->total_sections * hblks_per_sec * hblks_per_sec / 100;
109 si->bimodal = bimodal / dist;
110 if (si->dirty_count)
111 si->avg_vblocks = total_vblocks / ndirty;
112 else
113 si->avg_vblocks = 0;
114}
115
116/*
117 * This function calculates memory footprint.
118 */
119static void update_mem_info(struct f2fs_sb_info *sbi)
120{
121 struct f2fs_stat_info *si = sbi->stat_info;
122 unsigned npages;
123
124 if (si->base_mem)
125 goto get_cache;
126
127 si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize;
128 si->base_mem += 2 * sizeof(struct f2fs_inode_info);
129 si->base_mem += sizeof(*sbi->ckpt);
130
131 /* build sm */
132 si->base_mem += sizeof(struct f2fs_sm_info);
133
134 /* build sit */
135 si->base_mem += sizeof(struct sit_info);
136 si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
137 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
138 si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
139 if (sbi->segs_per_sec > 1)
140 si->base_mem += sbi->total_sections *
141 sizeof(struct sec_entry);
142 si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
143
144 /* build free segmap */
145 si->base_mem += sizeof(struct free_segmap_info);
146 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
147 si->base_mem += f2fs_bitmap_size(sbi->total_sections);
148
149 /* build curseg */
150 si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
151 si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE;
152
153 /* build dirty segmap */
154 si->base_mem += sizeof(struct dirty_seglist_info);
155 si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
156 si->base_mem += 2 * f2fs_bitmap_size(TOTAL_SEGS(sbi));
157
158 /* buld nm */
159 si->base_mem += sizeof(struct f2fs_nm_info);
160 si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
161
162 /* build gc */
163 si->base_mem += sizeof(struct f2fs_gc_kthread);
164
165get_cache:
166 /* free nids */
167 si->cache_mem = NM_I(sbi)->fcnt;
168 si->cache_mem += NM_I(sbi)->nat_cnt;
169 npages = sbi->node_inode->i_mapping->nrpages;
170 si->cache_mem += npages << PAGE_CACHE_SHIFT;
171 npages = sbi->meta_inode->i_mapping->nrpages;
172 si->cache_mem += npages << PAGE_CACHE_SHIFT;
173 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
174 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
175}
176
177static int stat_show(struct seq_file *s, void *v)
178{
179 struct f2fs_stat_info *si, *next;
180 int i = 0;
181 int j;
182
183 list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) {
184
185 mutex_lock(&si->stat_lock);
186 if (!si->sbi) {
187 mutex_unlock(&si->stat_lock);
188 continue;
189 }
190 update_general_status(si->sbi);
191
192 seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++);
193 seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ",
194 si->nat_area_segs, si->sit_area_segs);
195 seq_printf(s, "[SSA: %d] [MAIN: %d",
196 si->ssa_area_segs, si->main_area_segs);
197 seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
198 si->overp_segs, si->rsvd_segs);
199 seq_printf(s, "Utilization: %d%% (%d valid blocks)\n",
200 si->utilization, si->valid_count);
201 seq_printf(s, " - Node: %u (Inode: %u, ",
202 si->valid_node_count, si->valid_inode_count);
203 seq_printf(s, "Other: %u)\n - Data: %u\n",
204 si->valid_node_count - si->valid_inode_count,
205 si->valid_count - si->valid_node_count);
206 seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
207 si->main_area_segs, si->main_area_sections,
208 si->main_area_zones);
209 seq_printf(s, " - COLD data: %d, %d, %d\n",
210 si->curseg[CURSEG_COLD_DATA],
211 si->cursec[CURSEG_COLD_DATA],
212 si->curzone[CURSEG_COLD_DATA]);
213 seq_printf(s, " - WARM data: %d, %d, %d\n",
214 si->curseg[CURSEG_WARM_DATA],
215 si->cursec[CURSEG_WARM_DATA],
216 si->curzone[CURSEG_WARM_DATA]);
217 seq_printf(s, " - HOT data: %d, %d, %d\n",
218 si->curseg[CURSEG_HOT_DATA],
219 si->cursec[CURSEG_HOT_DATA],
220 si->curzone[CURSEG_HOT_DATA]);
221 seq_printf(s, " - Dir dnode: %d, %d, %d\n",
222 si->curseg[CURSEG_HOT_NODE],
223 si->cursec[CURSEG_HOT_NODE],
224 si->curzone[CURSEG_HOT_NODE]);
225 seq_printf(s, " - File dnode: %d, %d, %d\n",
226 si->curseg[CURSEG_WARM_NODE],
227 si->cursec[CURSEG_WARM_NODE],
228 si->curzone[CURSEG_WARM_NODE]);
229 seq_printf(s, " - Indir nodes: %d, %d, %d\n",
230 si->curseg[CURSEG_COLD_NODE],
231 si->cursec[CURSEG_COLD_NODE],
232 si->curzone[CURSEG_COLD_NODE]);
233 seq_printf(s, "\n - Valid: %d\n - Dirty: %d\n",
234 si->main_area_segs - si->dirty_count -
235 si->prefree_count - si->free_segs,
236 si->dirty_count);
237 seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
238 si->prefree_count, si->free_segs, si->free_secs);
239 seq_printf(s, "GC calls: %d (BG: %d)\n",
240 si->call_count, si->bg_gc);
241 seq_printf(s, " - data segments : %d\n", si->data_segs);
242 seq_printf(s, " - node segments : %d\n", si->node_segs);
243 seq_printf(s, "Try to move %d blocks\n", si->tot_blks);
244 seq_printf(s, " - data blocks : %d\n", si->data_blks);
245 seq_printf(s, " - node blocks : %d\n", si->node_blks);
246 seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
247 si->hit_ext, si->total_ext);
248 seq_printf(s, "\nBalancing F2FS Async:\n");
249 seq_printf(s, " - nodes %4d in %4d\n",
250 si->ndirty_node, si->node_pages);
251 seq_printf(s, " - dents %4d in dirs:%4d\n",
252 si->ndirty_dent, si->ndirty_dirs);
253 seq_printf(s, " - meta %4d in %4d\n",
254 si->ndirty_meta, si->meta_pages);
255 seq_printf(s, " - NATs %5d > %lu\n",
256 si->nats, NM_WOUT_THRESHOLD);
257 seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
258 si->sits, si->fnids);
259 seq_printf(s, "\nDistribution of User Blocks:");
260 seq_printf(s, " [ valid | invalid | free ]\n");
261 seq_printf(s, " [");
262
263 for (j = 0; j < si->util_valid; j++)
264 seq_printf(s, "-");
265 seq_printf(s, "|");
266
267 for (j = 0; j < si->util_invalid; j++)
268 seq_printf(s, "-");
269 seq_printf(s, "|");
270
271 for (j = 0; j < si->util_free; j++)
272 seq_printf(s, "-");
273 seq_printf(s, "]\n\n");
274 seq_printf(s, "SSR: %u blocks in %u segments\n",
275 si->block_count[SSR], si->segment_count[SSR]);
276 seq_printf(s, "LFS: %u blocks in %u segments\n",
277 si->block_count[LFS], si->segment_count[LFS]);
278
279 /* segment usage info */
280 update_sit_info(si->sbi);
281 seq_printf(s, "\nBDF: %u, avg. vblocks: %u\n",
282 si->bimodal, si->avg_vblocks);
283
284 /* memory footprint */
285 update_mem_info(si->sbi);
286 seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
287 (si->base_mem + si->cache_mem) >> 10,
288 si->base_mem >> 10, si->cache_mem >> 10);
289 mutex_unlock(&si->stat_lock);
290 }
291 return 0;
292}
293
294static int stat_open(struct inode *inode, struct file *file)
295{
296 return single_open(file, stat_show, inode->i_private);
297}
298
299static const struct file_operations stat_fops = {
300 .open = stat_open,
301 .read = seq_read,
302 .llseek = seq_lseek,
303 .release = single_release,
304};
305
306static int init_stats(struct f2fs_sb_info *sbi)
307{
308 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
309 struct f2fs_stat_info *si;
310
311 sbi->stat_info = kzalloc(sizeof(struct f2fs_stat_info), GFP_KERNEL);
312 if (!sbi->stat_info)
313 return -ENOMEM;
314
315 si = sbi->stat_info;
316 mutex_init(&si->stat_lock);
317 list_add_tail(&si->stat_list, &f2fs_stat_list);
318
319 si->all_area_segs = le32_to_cpu(raw_super->segment_count);
320 si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit);
321 si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat);
322 si->ssa_area_segs = le32_to_cpu(raw_super->segment_count_ssa);
323 si->main_area_segs = le32_to_cpu(raw_super->segment_count_main);
324 si->main_area_sections = le32_to_cpu(raw_super->section_count);
325 si->main_area_zones = si->main_area_sections /
326 le32_to_cpu(raw_super->secs_per_zone);
327 si->sbi = sbi;
328 return 0;
329}
330
331int f2fs_build_stats(struct f2fs_sb_info *sbi)
332{
333 int retval;
334
335 retval = init_stats(sbi);
336 if (retval)
337 return retval;
338
339 if (!debugfs_root)
340 debugfs_root = debugfs_create_dir("f2fs", NULL);
341
342 debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops);
343 return 0;
344}
345
346void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
347{
348 struct f2fs_stat_info *si = sbi->stat_info;
349
350 list_del(&si->stat_list);
351 mutex_lock(&si->stat_lock);
352 si->sbi = NULL;
353 mutex_unlock(&si->stat_lock);
354 kfree(sbi->stat_info);
355}
356
357void destroy_root_stats(void)
358{
359 debugfs_remove_recursive(debugfs_root);
360 debugfs_root = NULL;
361}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
new file mode 100644
index 000000000000..b4e24f32b54e
--- /dev/null
+++ b/fs/f2fs/dir.c
@@ -0,0 +1,672 @@
1/*
2 * fs/f2fs/dir.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include "f2fs.h"
14#include "acl.h"
15
16static unsigned long dir_blocks(struct inode *inode)
17{
18 return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1))
19 >> PAGE_CACHE_SHIFT;
20}
21
22static unsigned int dir_buckets(unsigned int level)
23{
24 if (level < MAX_DIR_HASH_DEPTH / 2)
25 return 1 << level;
26 else
27 return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
28}
29
30static unsigned int bucket_blocks(unsigned int level)
31{
32 if (level < MAX_DIR_HASH_DEPTH / 2)
33 return 2;
34 else
35 return 4;
36}
37
/*
 * Translation from the file type stored in a directory entry (F2FS_FT_*)
 * to the DT_* value handed to filldir() by f2fs_readdir().
 */
static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = {
	[F2FS_FT_UNKNOWN]	= DT_UNKNOWN,
	[F2FS_FT_REG_FILE]	= DT_REG,
	[F2FS_FT_DIR]		= DT_DIR,
	[F2FS_FT_CHRDEV]	= DT_CHR,
	[F2FS_FT_BLKDEV]	= DT_BLK,
	[F2FS_FT_FIFO]		= DT_FIFO,
	[F2FS_FT_SOCK]		= DT_SOCK,
	[F2FS_FT_SYMLINK]	= DT_LNK,
};
48
/*
 * Translation from the file-format bits of an inode mode to the F2FS_FT_*
 * type stored in a directory entry.  The table is indexed with
 * (mode & S_IFMT) >> S_SHIFT, see set_de_type().
 */
#define S_SHIFT 12
static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= F2FS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= F2FS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= F2FS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= F2FS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= F2FS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= F2FS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= F2FS_FT_SYMLINK,
};
59
60static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
61{
62 mode_t mode = inode->i_mode;
63 de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
64}
65
/*
 * Block index, within the directory file, of the first block of bucket
 * @idx at hash level @level: all blocks of the lower levels come first,
 * followed by the preceding buckets of this level.
 */
static unsigned long dir_block_index(unsigned int level, unsigned int idx)
{
	unsigned long first_block = 0;
	unsigned long lvl = 0;

	/* skip every block owned by levels 0 .. level - 1 */
	while (lvl < level) {
		first_block += dir_buckets(lvl) * bucket_blocks(lvl);
		lvl++;
	}

	/* then skip the buckets in front of idx on this level */
	first_block += idx * bucket_blocks(level);
	return first_block;
}
76
77static bool early_match_name(const char *name, int namelen,
78 f2fs_hash_t namehash, struct f2fs_dir_entry *de)
79{
80 if (le16_to_cpu(de->name_len) != namelen)
81 return false;
82
83 if (de->hash_code != namehash)
84 return false;
85
86 return true;
87}
88
89static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
90 const char *name, int namelen, int *max_slots,
91 f2fs_hash_t namehash, struct page **res_page)
92{
93 struct f2fs_dir_entry *de;
94 unsigned long bit_pos, end_pos, next_pos;
95 struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
96 int slots;
97
98 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
99 NR_DENTRY_IN_BLOCK, 0);
100 while (bit_pos < NR_DENTRY_IN_BLOCK) {
101 de = &dentry_blk->dentry[bit_pos];
102 slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
103
104 if (early_match_name(name, namelen, namehash, de)) {
105 if (!memcmp(dentry_blk->filename[bit_pos],
106 name, namelen)) {
107 *res_page = dentry_page;
108 goto found;
109 }
110 }
111 next_pos = bit_pos + slots;
112 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
113 NR_DENTRY_IN_BLOCK, next_pos);
114 if (bit_pos >= NR_DENTRY_IN_BLOCK)
115 end_pos = NR_DENTRY_IN_BLOCK;
116 else
117 end_pos = bit_pos;
118 if (*max_slots < end_pos - next_pos)
119 *max_slots = end_pos - next_pos;
120 }
121
122 de = NULL;
123 kunmap(dentry_page);
124found:
125 return de;
126}
127
/*
 * Search the bucket that @namehash selects at hash level @level.
 *
 * Every block of the bucket is looked up with find_data_page() and scanned
 * with find_in_block().  On a hit the entry is returned and the reference
 * on its (still mapped) page is kept for the caller via @res_page; pages
 * without a hit are released again.
 *
 * When nothing is found but the bucket has room for the name - a block
 * could not be looked up, or a scanned block reported a free run of at
 * least GET_DENTRY_SLOTS(namelen) slots - the hash and level are cached in
 * the directory's chash/clevel fields, which f2fs_add_link() consults to
 * pick its starting level.
 */
static struct f2fs_dir_entry *find_in_level(struct inode *dir,
		unsigned int level, const char *name, int namelen,
		f2fs_hash_t namehash, struct page **res_page)
{
	int s = GET_DENTRY_SLOTS(namelen);
	unsigned int nbucket, nblock;
	unsigned int bidx, end_block;
	struct page *dentry_page;
	struct f2fs_dir_entry *de = NULL;
	bool room = false;
	int max_slots = 0;

	BUG_ON(level > MAX_DIR_HASH_DEPTH);

	nbucket = dir_buckets(level);
	nblock = bucket_blocks(level);

	/* first and one-past-last block of the bucket this hash falls into */
	bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
	end_block = bidx + nblock;

	for (; bidx < end_block; bidx++) {
		/* no need to allocate new dentry pages to all the indices */
		dentry_page = find_data_page(dir, bidx);
		if (IS_ERR(dentry_page)) {
			/* any lookup failure is counted as available room */
			room = true;
			continue;
		}

		de = find_in_block(dentry_page, name, namelen,
					&max_slots, namehash, res_page);
		if (de)
			break;

		/* max_slots accumulates across the blocks of this bucket */
		if (max_slots >= s)
			room = true;
		f2fs_put_page(dentry_page, 0);
	}

	/* remember where a later insertion of this name could start */
	if (!de && room && F2FS_I(dir)->chash != namehash) {
		F2FS_I(dir)->chash = namehash;
		F2FS_I(dir)->clevel = level;
	}

	return de;
}
173
174/*
175 * Find an entry in the specified directory with the wanted name.
176 * It returns the page where the entry was found (as a parameter - res_page),
177 * and the entry itself. Page is returned mapped and unlocked.
178 * Entry is guaranteed to be valid.
179 */
180struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
181 struct qstr *child, struct page **res_page)
182{
183 const char *name = child->name;
184 int namelen = child->len;
185 unsigned long npages = dir_blocks(dir);
186 struct f2fs_dir_entry *de = NULL;
187 f2fs_hash_t name_hash;
188 unsigned int max_depth;
189 unsigned int level;
190
191 if (npages == 0)
192 return NULL;
193
194 *res_page = NULL;
195
196 name_hash = f2fs_dentry_hash(name, namelen);
197 max_depth = F2FS_I(dir)->i_current_depth;
198
199 for (level = 0; level < max_depth; level++) {
200 de = find_in_level(dir, level, name,
201 namelen, name_hash, res_page);
202 if (de)
203 break;
204 }
205 if (!de && F2FS_I(dir)->chash != name_hash) {
206 F2FS_I(dir)->chash = name_hash;
207 F2FS_I(dir)->clevel = level - 1;
208 }
209 return de;
210}
211
212struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
213{
214 struct page *page = NULL;
215 struct f2fs_dir_entry *de = NULL;
216 struct f2fs_dentry_block *dentry_blk = NULL;
217
218 page = get_lock_data_page(dir, 0);
219 if (IS_ERR(page))
220 return NULL;
221
222 dentry_blk = kmap(page);
223 de = &dentry_blk->dentry[1];
224 *p = page;
225 unlock_page(page);
226 return de;
227}
228
229ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr)
230{
231 ino_t res = 0;
232 struct f2fs_dir_entry *de;
233 struct page *page;
234
235 de = f2fs_find_entry(dir, qstr, &page);
236 if (de) {
237 res = le32_to_cpu(de->ino);
238 kunmap(page);
239 f2fs_put_page(page, 0);
240 }
241
242 return res;
243}
244
/*
 * Redirect the existing directory entry @de, located in @page of @dir, to
 * @inode.
 *
 * @page is expected to arrive mapped and unlocked: it is locked here, and
 * unmapped, dirtied and released (unlocking it) before returning.  The
 * whole update runs under the DENTRY_OPS lock.  The directory's mtime and
 * ctime are refreshed and @inode's cached parent inode number is set to
 * @dir.
 */
void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
		struct page *page, struct inode *inode)
{
	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);

	mutex_lock_op(sbi, DENTRY_OPS);
	lock_page(page);
	/* do not modify the block while it is under writeback */
	wait_on_page_writeback(page);
	de->ino = cpu_to_le32(inode->i_ino);
	set_de_type(de, inode);
	kunmap(page);
	set_page_dirty(page);
	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
	mark_inode_dirty(dir);

	/* update parent inode number before releasing dentry page */
	F2FS_I(inode)->i_pino = dir->i_ino;

	f2fs_put_page(page, 1);
	mutex_unlock_op(sbi, DENTRY_OPS);
}
266
267void init_dent_inode(struct dentry *dentry, struct page *ipage)
268{
269 struct f2fs_node *rn;
270
271 if (IS_ERR(ipage))
272 return;
273
274 wait_on_page_writeback(ipage);
275
276 /* copy dentry info. to this inode page */
277 rn = (struct f2fs_node *)page_address(ipage);
278 rn->i.i_namelen = cpu_to_le32(dentry->d_name.len);
279 memcpy(rn->i.i_name, dentry->d_name.name, dentry->d_name.len);
280 set_page_dirty(ipage);
281}
282
/*
 * Prepare the on-disk metadata of @inode before its entry is written into
 * the parent directory of @dentry.
 *
 * For an inode flagged FI_NEW_INODE a new inode page is created, a
 * directory additionally receives its initial dentry block, and the ACL is
 * initialised from the parent.  If either of the latter two steps fails
 * the inode page is removed again.  For an existing inode only the name
 * stored in its inode page is refreshed.
 *
 * If FI_INC_LINK is set the link count is incremented and the inode is
 * written out.
 *
 * Returns 0 on success or a negative error code.
 */
static int init_inode_metadata(struct inode *inode, struct dentry *dentry)
{
	struct inode *dir = dentry->d_parent->d_inode;

	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
		int err;
		err = new_inode_page(inode, dentry);
		if (err)
			return err;

		if (S_ISDIR(inode->i_mode)) {
			err = f2fs_make_empty(inode, dir);
			if (err) {
				remove_inode_page(inode);
				return err;
			}
		}

		err = f2fs_init_acl(inode, dir);
		if (err) {
			remove_inode_page(inode);
			return err;
		}
	} else {
		struct page *ipage;
		ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
		if (IS_ERR(ipage))
			return PTR_ERR(ipage);
		init_dent_inode(dentry, ipage);
		f2fs_put_page(ipage, 1);
	}
	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
		inc_nlink(inode);
		f2fs_write_inode(inode, NULL);
	}
	return 0;
}
320
/*
 * Bring the parent directory @dir up to date after an entry for @inode has
 * been added to it.
 *
 * A newly created sub-directory adds a link to @dir; the FI_NEW_INODE and
 * FI_INC_LINK flags of @inode are consumed here.  @dir's timestamps are
 * refreshed and its hash depth is set to @current_depth.  The directory
 * inode is written out immediately when its link count or depth changed,
 * otherwise it is only marked dirty.
 */
static void update_parent_metadata(struct inode *dir, struct inode *inode,
						unsigned int current_depth)
{
	bool need_dir_update = false;

	if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
		if (S_ISDIR(inode->i_mode)) {
			inc_nlink(dir);
			need_dir_update = true;
		}
		clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
	}
	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
	if (F2FS_I(dir)->i_current_depth != current_depth) {
		F2FS_I(dir)->i_current_depth = current_depth;
		need_dir_update = true;
	}

	if (need_dir_update)
		f2fs_write_inode(dir, NULL);
	else
		mark_inode_dirty(dir);

	if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
		clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
}
347
348static int room_for_filename(struct f2fs_dentry_block *dentry_blk, int slots)
349{
350 int bit_start = 0;
351 int zero_start, zero_end;
352next:
353 zero_start = find_next_zero_bit_le(&dentry_blk->dentry_bitmap,
354 NR_DENTRY_IN_BLOCK,
355 bit_start);
356 if (zero_start >= NR_DENTRY_IN_BLOCK)
357 return NR_DENTRY_IN_BLOCK;
358
359 zero_end = find_next_bit_le(&dentry_blk->dentry_bitmap,
360 NR_DENTRY_IN_BLOCK,
361 zero_start);
362 if (zero_end - zero_start >= slots)
363 return zero_start;
364
365 bit_start = zero_end + 1;
366
367 if (zero_end + 1 >= NR_DENTRY_IN_BLOCK)
368 return NR_DENTRY_IN_BLOCK;
369 goto next;
370}
371
/*
 * Add a directory entry for @inode, named after @dentry, to the parent
 * directory of @dentry.
 *
 * The search for free slots starts at level 0, or at the level cached in
 * the directory's chash/clevel hint when the hint was recorded for this
 * very hash (the hint is consumed).  Within a level only the bucket the
 * hash selects is examined; if none of its blocks has room the next level
 * is tried, growing the directory depth when necessary.
 *
 * Each candidate block is obtained and examined under the DENTRY_OPS lock.
 * When a block with room is found, the lock and the locked, mapped page
 * stay held until the entry has been written (or init_inode_metadata()
 * failed).
 *
 * Returns 0 on success, -ENOSPC when the depth has reached
 * MAX_DIR_HASH_DEPTH, or the error from obtaining a block or from
 * init_inode_metadata().
 */
int f2fs_add_link(struct dentry *dentry, struct inode *inode)
{
	unsigned int bit_pos;
	unsigned int level;
	unsigned int current_depth;
	unsigned long bidx, block;
	f2fs_hash_t dentry_hash;
	struct f2fs_dir_entry *de;
	unsigned int nbucket, nblock;
	struct inode *dir = dentry->d_parent->d_inode;
	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
	const char *name = dentry->d_name.name;
	int namelen = dentry->d_name.len;
	struct page *dentry_page = NULL;
	struct f2fs_dentry_block *dentry_blk = NULL;
	int slots = GET_DENTRY_SLOTS(namelen);
	int err = 0;
	int i;

	dentry_hash = f2fs_dentry_hash(name, dentry->d_name.len);
	level = 0;
	current_depth = F2FS_I(dir)->i_current_depth;
	/* a preceding failed lookup of this name may have left a hint */
	if (F2FS_I(dir)->chash == dentry_hash) {
		level = F2FS_I(dir)->clevel;
		F2FS_I(dir)->chash = 0;
	}

start:
	if (current_depth == MAX_DIR_HASH_DEPTH)
		return -ENOSPC;

	/* Increase the depth, if required */
	if (level == current_depth)
		++current_depth;

	nbucket = dir_buckets(level);
	nblock = bucket_blocks(level);

	bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));

	for (block = bidx; block <= (bidx + nblock - 1); block++) {
		mutex_lock_op(sbi, DENTRY_OPS);
		dentry_page = get_new_data_page(dir, block, true);
		if (IS_ERR(dentry_page)) {
			mutex_unlock_op(sbi, DENTRY_OPS);
			return PTR_ERR(dentry_page);
		}

		dentry_blk = kmap(dentry_page);
		bit_pos = room_for_filename(dentry_blk, slots);
		if (bit_pos < NR_DENTRY_IN_BLOCK)
			goto add_dentry;

		kunmap(dentry_page);
		f2fs_put_page(dentry_page, 1);
		mutex_unlock_op(sbi, DENTRY_OPS);
	}

	/* Move to next level to find the empty slot for new dentry */
	++level;
	goto start;
add_dentry:
	/* reached with DENTRY_OPS held and dentry_page locked and mapped */
	err = init_inode_metadata(inode, dentry);
	if (err)
		goto fail;

	wait_on_page_writeback(dentry_page);

	de = &dentry_blk->dentry[bit_pos];
	de->hash_code = dentry_hash;
	de->name_len = cpu_to_le16(namelen);
	memcpy(dentry_blk->filename[bit_pos], name, namelen);
	de->ino = cpu_to_le32(inode->i_ino);
	set_de_type(de, inode);
	/* claim every slot the name occupies */
	for (i = 0; i < slots; i++)
		test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
	set_page_dirty(dentry_page);

	update_parent_metadata(dir, inode, current_depth);

	/* update parent inode number before releasing dentry page */
	F2FS_I(inode)->i_pino = dir->i_ino;
fail:
	kunmap(dentry_page);
	f2fs_put_page(dentry_page, 1);
	mutex_unlock_op(sbi, DENTRY_OPS);
	return err;
}
460
/*
 * It only removes the dentry from the dentry page; the corresponding name
 * entry in the name page does not need to be touched during deletion.
 *
 * @dentry: entry to remove, located in @page
 * @page:   dentry page, expected to arrive mapped (it is unmapped here as
 *          the counterpart of f2fs_find_entry) and unlocked; it is locked
 *          here and released before returning
 * @inode:  inode the entry refers to, or NULL to leave link counts alone
 *
 * The slots of the entry are cleared in the block bitmap and the directory
 * timestamps are refreshed.  With @inode given, its link count is dropped
 * (twice for a directory, whose size is also reset to 0 and whose removal
 * takes one link from the parent), and it is queued as an orphan once no
 * link remains.  A dentry block left without any used slot is truncated
 * out of the directory.
 */
void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
						struct inode *inode)
{
	struct	f2fs_dentry_block *dentry_blk;
	unsigned int bit_pos;
	struct address_space *mapping = page->mapping;
	struct inode *dir = mapping->host;
	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
	void *kaddr = page_address(page);
	int i;

	mutex_lock_op(sbi, DENTRY_OPS);

	lock_page(page);
	wait_on_page_writeback(page);

	dentry_blk = (struct f2fs_dentry_block *)kaddr;
	/* slot index of the entry inside its block */
	bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
	for (i = 0; i < slots; i++)
		test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);

	/* Let's check and deallocate this dentry page */
	bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
			NR_DENTRY_IN_BLOCK,
			0);
	kunmap(page); /* kunmap - pair of f2fs_find_entry */
	set_page_dirty(page);

	dir->i_ctime = dir->i_mtime = CURRENT_TIME;

	if (inode && S_ISDIR(inode->i_mode)) {
		drop_nlink(dir);
		f2fs_write_inode(dir, NULL);
	} else {
		mark_inode_dirty(dir);
	}

	if (inode) {
		inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
		drop_nlink(inode);
		if (S_ISDIR(inode->i_mode)) {
			drop_nlink(inode);
			i_size_write(inode, 0);
		}
		f2fs_write_inode(inode, NULL);
		if (inode->i_nlink == 0)
			add_orphan_inode(sbi, inode->i_ino);
	}

	/* bit_pos == NR_DENTRY_IN_BLOCK: no used slot is left in the block */
	if (bit_pos == NR_DENTRY_IN_BLOCK) {
		truncate_hole(dir, page->index, page->index + 1);
		clear_page_dirty_for_io(page);
		ClearPageUptodate(page);
		dec_page_count(sbi, F2FS_DIRTY_DENTS);
		inode_dec_dirty_dents(dir);
	}
	f2fs_put_page(page, 1);

	mutex_unlock_op(sbi, DENTRY_OPS);
}
526
/*
 * Create the first dentry block of the new directory @inode and fill in
 * its two initial entries: "." in slot 0, pointing at @inode itself, and
 * ".." in slot 1, pointing at @parent.  Both are stored with hash code 0.
 *
 * Returns 0 on success or the error from get_new_data_page().
 */
int f2fs_make_empty(struct inode *inode, struct inode *parent)
{
	struct page *dentry_page;
	struct f2fs_dentry_block *dentry_blk;
	struct f2fs_dir_entry *de;
	void *kaddr;

	dentry_page = get_new_data_page(inode, 0, true);
	if (IS_ERR(dentry_page))
		return PTR_ERR(dentry_page);

	kaddr = kmap_atomic(dentry_page);
	dentry_blk = (struct f2fs_dentry_block *)kaddr;

	de = &dentry_blk->dentry[0];
	de->name_len = cpu_to_le16(1);
	de->hash_code = 0;
	de->ino = cpu_to_le32(inode->i_ino);
	memcpy(dentry_blk->filename[0], ".", 1);
	set_de_type(de, inode);

	de = &dentry_blk->dentry[1];
	de->hash_code = 0;
	de->name_len = cpu_to_le16(2);
	de->ino = cpu_to_le32(parent->i_ino);
	memcpy(dentry_blk->filename[1], "..", 2);
	/* the type is taken from @inode's mode, not from @parent's */
	set_de_type(de, inode);

	test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
	test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
	kunmap_atomic(kaddr);

	set_page_dirty(dentry_page);
	f2fs_put_page(dentry_page, 1);
	return 0;
}
563
564bool f2fs_empty_dir(struct inode *dir)
565{
566 unsigned long bidx;
567 struct page *dentry_page;
568 unsigned int bit_pos;
569 struct f2fs_dentry_block *dentry_blk;
570 unsigned long nblock = dir_blocks(dir);
571
572 for (bidx = 0; bidx < nblock; bidx++) {
573 void *kaddr;
574 dentry_page = get_lock_data_page(dir, bidx);
575 if (IS_ERR(dentry_page)) {
576 if (PTR_ERR(dentry_page) == -ENOENT)
577 continue;
578 else
579 return false;
580 }
581
582 kaddr = kmap_atomic(dentry_page);
583 dentry_blk = (struct f2fs_dentry_block *)kaddr;
584 if (bidx == 0)
585 bit_pos = 2;
586 else
587 bit_pos = 0;
588 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
589 NR_DENTRY_IN_BLOCK,
590 bit_pos);
591 kunmap_atomic(kaddr);
592
593 f2fs_put_page(dentry_page, 1);
594
595 if (bit_pos < NR_DENTRY_IN_BLOCK)
596 return false;
597 }
598 return true;
599}
600
601static int f2fs_readdir(struct file *file, void *dirent, filldir_t filldir)
602{
603 unsigned long pos = file->f_pos;
604 struct inode *inode = file->f_dentry->d_inode;
605 unsigned long npages = dir_blocks(inode);
606 unsigned char *types = NULL;
607 unsigned int bit_pos = 0, start_bit_pos = 0;
608 int over = 0;
609 struct f2fs_dentry_block *dentry_blk = NULL;
610 struct f2fs_dir_entry *de = NULL;
611 struct page *dentry_page = NULL;
612 unsigned int n = 0;
613 unsigned char d_type = DT_UNKNOWN;
614 int slots;
615
616 types = f2fs_filetype_table;
617 bit_pos = (pos % NR_DENTRY_IN_BLOCK);
618 n = (pos / NR_DENTRY_IN_BLOCK);
619
620 for ( ; n < npages; n++) {
621 dentry_page = get_lock_data_page(inode, n);
622 if (IS_ERR(dentry_page))
623 continue;
624
625 start_bit_pos = bit_pos;
626 dentry_blk = kmap(dentry_page);
627 while (bit_pos < NR_DENTRY_IN_BLOCK) {
628 d_type = DT_UNKNOWN;
629 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
630 NR_DENTRY_IN_BLOCK,
631 bit_pos);
632 if (bit_pos >= NR_DENTRY_IN_BLOCK)
633 break;
634
635 de = &dentry_blk->dentry[bit_pos];
636 if (types && de->file_type < F2FS_FT_MAX)
637 d_type = types[de->file_type];
638
639 over = filldir(dirent,
640 dentry_blk->filename[bit_pos],
641 le16_to_cpu(de->name_len),
642 (n * NR_DENTRY_IN_BLOCK) + bit_pos,
643 le32_to_cpu(de->ino), d_type);
644 if (over) {
645 file->f_pos += bit_pos - start_bit_pos;
646 goto success;
647 }
648 slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
649 bit_pos += slots;
650 }
651 bit_pos = 0;
652 file->f_pos = (n + 1) * NR_DENTRY_IN_BLOCK;
653 kunmap(dentry_page);
654 f2fs_put_page(dentry_page, 1);
655 dentry_page = NULL;
656 }
657success:
658 if (dentry_page && !IS_ERR(dentry_page)) {
659 kunmap(dentry_page);
660 f2fs_put_page(dentry_page, 1);
661 }
662
663 return 0;
664}
665
/*
 * File operations installed for directories.  Only readdir is implemented
 * in this file; the fsync and ioctl handlers are defined elsewhere.
 */
const struct file_operations f2fs_dir_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.readdir	= f2fs_readdir,
	.fsync		= f2fs_sync_file,
	.unlocked_ioctl	= f2fs_ioctl,
};
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
new file mode 100644
index 000000000000..a18d63db2fb6
--- /dev/null
+++ b/fs/f2fs/f2fs.h
@@ -0,0 +1,1083 @@
1/*
2 * fs/f2fs/f2fs.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef _LINUX_F2FS_H
12#define _LINUX_F2FS_H
13
14#include <linux/types.h>
15#include <linux/page-flags.h>
16#include <linux/buffer_head.h>
17#include <linux/slab.h>
18#include <linux/crc32.h>
19#include <linux/magic.h>
20
/*
 * For mount options
 *
 * Each option is one bit in f2fs_mount_info.opt.  The helper macros below
 * take the option name without its F2FS_MOUNT_ prefix, e.g.
 * test_opt(sbi, DISCARD).
 */
#define F2FS_MOUNT_BG_GC		0x00000001
#define F2FS_MOUNT_DISABLE_ROLL_FORWARD	0x00000002
#define F2FS_MOUNT_DISCARD		0x00000004
#define F2FS_MOUNT_NOHEAP		0x00000008
#define F2FS_MOUNT_XATTR_USER		0x00000010
#define F2FS_MOUNT_POSIX_ACL		0x00000020
#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY	0x00000040

/* clear, set and test one option bit of sbi->mount_opt.opt */
#define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
#define test_opt(sbi, option)	(sbi->mount_opt.opt & F2FS_MOUNT_##option)
35
/*
 * True when version a is later than version b.  Both arguments must be of
 * type unsigned long long (enforced with typecheck()); the comparison is
 * made on the signed difference of the two values.
 */
#define ver_after(a, b)	(typecheck(unsigned long long, a) &&		\
		typecheck(unsigned long long, b) &&			\
		((long long)((a) - (b)) > 0))
39
/* basic scalar types: a block address and a node id */
typedef u64 block_t;
typedef u32 nid_t;
42
/* mount-time settings kept in the superblock info */
struct f2fs_mount_info {
	unsigned int	opt;	/* bitmask of F2FS_MOUNT_* options */
};
46
/*
 * Checksum @len bytes at @buff with crc32_le(), using F2FS_SUPER_MAGIC as
 * the initial CRC value.
 */
static inline __u32 f2fs_crc32(void *buff, size_t len)
{
	return crc32_le(F2FS_SUPER_MAGIC, buff, len);
}
51
52static inline bool f2fs_crc_valid(__u32 blk_crc, void *buff, size_t buff_size)
53{
54 return f2fs_crc32(buff, buff_size) == blk_crc;
55}
56
/*
 * For checkpoint manager
 *
 * Selectors for the two kinds of bitmap (NAT and SIT).
 * NOTE(review): presumably used to pick a bitmap inside the checkpoint -
 * the users are not visible from this part of the header.
 */
enum {
	NAT_BITMAP,
	SIT_BITMAP
};
64
/* for the list of orphan inodes; one node per orphan inode number */
struct orphan_inode_entry {
	struct list_head list;	/* list head */
	nid_t ino;		/* inode number */
};
70
/* for the list of directory inodes; one node per directory inode */
struct dir_inode_entry {
	struct list_head list;	/* list head */
	struct inode *inode;	/* vfs inode pointer */
};
76
/* for the list of fsync inodes, used only during recovery */
struct fsync_inode_entry {
	struct list_head list;	/* list head */
	struct inode *inode;	/* vfs inode pointer */
	block_t blkaddr;	/* block address locating the last inode */
};
83
/* number of NAT / SIT entries recorded in the summary block @sum */
#define nats_in_cursum(sum)		(le16_to_cpu(sum->n_nats))
#define sits_in_cursum(sum)		(le16_to_cpu(sum->n_sits))

/* accessors for the i-th NAT / SIT journal entry of summary block @sum */
#define nat_in_journal(sum, i)		(sum->nat_j.entries[i].ne)
#define nid_in_journal(sum, i)		(sum->nat_j.entries[i].nid)
#define sit_in_journal(sum, i)		(sum->sit_j.entries[i].se)
#define segno_in_journal(sum, i)	(sum->sit_j.entries[i].segno)
91
92static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
93{
94 int before = nats_in_cursum(rs);
95 rs->n_nats = cpu_to_le16(before + i);
96 return before;
97}
98
99static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
100{
101 int before = sits_in_cursum(rs);
102 rs->n_sits = cpu_to_le16(before + i);
103 return before;
104}
105
/*
 * For INODE and NODE manager
 */
#define XATTR_NODE_OFFSET	(-1)	/*
					 * store xattrs in one node block per
					 * file, keeping -1 as its node offset
					 * to distinguish it from index node
					 * blocks.
					 */
#define RDONLY_NODE		1	/*
					 * specify a read-only mode when getting
					 * a node block. 0 is read-write mode.
					 * used by get_dnode_of_data().
					 */
#define F2FS_LINK_MAX		32000	/* maximum link count per file */
120
/* for in-memory extent cache entry */
struct extent_info {
	rwlock_t ext_lock;	/* rwlock for consistency */
	unsigned int fofs;	/* start offset in a file */
	u32 blk_addr;		/* start block address of the extent */
	unsigned int len;	/* length of the extent */
};
128
/*
 * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
 */
#define FADVISE_COLD_BIT	0x01
133
/*
 * In-memory f2fs inode.  The VFS inode is embedded; F2FS_I() converts a
 * VFS inode pointer back into this structure.
 */
struct f2fs_inode_info {
	struct inode vfs_inode;		/* serve a vfs inode */
	unsigned long i_flags;		/* keep an inode flags for ioctl */
	unsigned char i_advise;		/* use to give file attribute hints */
	unsigned int i_current_depth;	/* use only in directory structure */
	unsigned int i_pino;		/* parent inode number */
	umode_t i_acl_mode;		/* keep file acl mode temporarily */

	/* Use below internally in f2fs */
	unsigned long flags;		/* use to pass per-file flags */
	unsigned long long data_version;/* latest version of data for fsync */
	atomic_t dirty_dents;		/* # of dirty dentry pages */
	f2fs_hash_t chash;		/* hash value of given file name */
	unsigned int clevel;		/* maximum level of given file name */
	nid_t i_xattr_nid;		/* node id that contains xattrs */
	struct extent_info ext;		/* in-memory extent cache entry */
};
151
152static inline void get_extent_info(struct extent_info *ext,
153 struct f2fs_extent i_ext)
154{
155 write_lock(&ext->ext_lock);
156 ext->fofs = le32_to_cpu(i_ext.fofs);
157 ext->blk_addr = le32_to_cpu(i_ext.blk_addr);
158 ext->len = le32_to_cpu(i_ext.len);
159 write_unlock(&ext->ext_lock);
160}
161
/*
 * Store the in-memory extent @ext into the on-disk extent @i_ext,
 * converting the fields to little endian.  @ext is read under its read
 * lock.
 */
static inline void set_raw_extent(struct extent_info *ext,
					struct f2fs_extent *i_ext)
{
	read_lock(&ext->ext_lock);
	i_ext->fofs = cpu_to_le32(ext->fofs);
	i_ext->blk_addr = cpu_to_le32(ext->blk_addr);
	i_ext->len = cpu_to_le32(ext->len);
	read_unlock(&ext->ext_lock);
}
171
/* state of the node manager, reachable through NM_I() */
struct f2fs_nm_info {
	block_t nat_blkaddr;		/* base disk address of NAT */
	nid_t max_nid;			/* maximum possible node ids */
	nid_t init_scan_nid;		/* the first nid to be scanned */
	nid_t next_scan_nid;		/* the next nid to be scanned */

	/* NAT cache management */
	struct radix_tree_root nat_root;/* root of the nat entry cache */
	rwlock_t nat_tree_lock;		/*
					 * NOTE(review): presumably protects
					 * the nat entry cache above - confirm
					 */
	unsigned int nat_cnt;		/* the # of cached nat entries */
	struct list_head nat_entries;	/* cached nat entry list (clean) */
	struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */

	/* free node ids management */
	struct list_head free_nid_list;	/* a list for free nids */
	spinlock_t free_nid_list_lock;	/* protect free nid list */
	unsigned int fcnt;		/* the number of free node id */
	struct mutex build_lock;	/* lock for build free nids */

	/* for checkpoint */
	char *nat_bitmap;		/* NAT bitmap pointer */
	int bitmap_size;		/* bitmap size */
};
195
/*
 * this structure is used as one of function parameters.
 * all the information is dedicated to a given direct node block determined
 * by the data offset in a file.
 *
 * set_new_dnode() fills in the first four fields and clears
 * inode_page_locked; it leaves ofs_in_node and data_blkaddr untouched.
 */
struct dnode_of_data {
	struct inode *inode;		/* vfs inode pointer */
	struct page *inode_page;	/* its inode page, NULL is possible */
	struct page *node_page;		/* cached direct node page */
	nid_t nid;			/* node id of the direct node block */
	unsigned int ofs_in_node;	/* data offset in the node page */
	bool inode_page_locked;		/* inode page is locked or not */
	block_t	data_blkaddr;		/* block address of the node block */
};
210
211static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode,
212 struct page *ipage, struct page *npage, nid_t nid)
213{
214 dn->inode = inode;
215 dn->inode_page = ipage;
216 dn->node_page = npage;
217 dn->nid = nid;
218 dn->inode_page_locked = 0;
219}
220
/*
 * For SIT manager
 *
 * By default, there are 6 active log areas across the whole main area.
 * When considering hot and cold data separation to reduce cleaning overhead,
 * we split 3 for data logs and 3 for node logs as hot, warm, and cold types,
 * respectively.
 * In the current design, you should not change the numbers intentionally.
 * Instead, as a mount option such as active_logs=x, you can use 2, 4, and 6
 * logs individually according to the underlying devices. (default: 6)
 * Just in case, the on-disk layout covers a maximum of 16 logs that consist
 * of 8 for data and 8 for node logs.
 */
#define	NR_CURSEG_DATA_TYPE	(3)
#define NR_CURSEG_NODE_TYPE	(3)
#define NR_CURSEG_TYPE	(NR_CURSEG_DATA_TYPE + NR_CURSEG_NODE_TYPE)
237
/*
 * The six active log types, data logs first and node logs after them.
 * NO_CHECK_TYPE follows the last real log type.
 */
enum {
	CURSEG_HOT_DATA	= 0,	/* directory entry blocks */
	CURSEG_WARM_DATA,	/* data blocks */
	CURSEG_COLD_DATA,	/* multimedia or GCed data blocks */
	CURSEG_HOT_NODE,	/* direct node blocks of directory files */
	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
	CURSEG_COLD_NODE,	/* indirect node blocks */
	NO_CHECK_TYPE
};
247
/* state of the segment manager, referenced from f2fs_sb_info.sm_info */
struct f2fs_sm_info {
	struct sit_info *sit_info;		/* whole segment information */
	struct free_segmap_info *free_info;	/* free segment information */
	struct dirty_seglist_info *dirty_info;	/* dirty segment information */
	struct curseg_info *curseg_array;	/* active segment information */

	struct list_head wblist_head;	/* list of under-writeback pages */
	spinlock_t wblist_lock;		/* lock for checkpoint */

	block_t seg0_blkaddr;		/* block address of 0'th segment */
	block_t main_blkaddr;		/* start block address of main area */
	block_t ssa_blkaddr;		/* start block address of SSA area */

	unsigned int segment_count;	/* total # of segments */
	unsigned int main_segments;	/* # of segments in main area */
	unsigned int reserved_segments;	/* # of reserved segments */
	unsigned int ovp_segments;	/* # of overprovision segments */
};
266
/*
 * For directory operation
 *
 * NOTE(review): judging by the names, these look like the positions of the
 * two direct, two indirect and one double-indirect node pointers that
 * follow the ADDRS_PER_INODE in-inode addresses, rather than anything
 * directory specific - confirm and retitle this section if so.
 */
#define	NODE_DIR1_BLOCK		(ADDRS_PER_INODE + 1)
#define	NODE_DIR2_BLOCK		(ADDRS_PER_INODE + 2)
#define	NODE_IND1_BLOCK		(ADDRS_PER_INODE + 3)
#define	NODE_IND2_BLOCK		(ADDRS_PER_INODE + 4)
#define	NODE_DIND_BLOCK		(ADDRS_PER_INODE + 5)
275
/*
 * For superblock
 */
/*
 * COUNT_TYPE for monitoring
 *
 * f2fs monitors the number of several block types such as on-writeback,
 * dirty dentry blocks, dirty node blocks, and dirty meta blocks.
 * The values index f2fs_sb_info.nr_pages[].
 */
enum count_type {
	F2FS_WRITEBACK,
	F2FS_DIRTY_DENTS,
	F2FS_DIRTY_NODES,
	F2FS_DIRTY_META,
	NR_COUNT_TYPE,
};
292
/*
 * FS_LOCK nesting subclasses for the lock validator:
 *
 * The locking order between these classes is
 * RENAME -> DENTRY_OPS -> DATA_WRITE -> DATA_NEW
 *    -> DATA_TRUNC -> NODE_WRITE -> NODE_NEW -> NODE_TRUNC
 *
 * The values index f2fs_sb_info.fs_lock[].
 *
 * NOTE(review): the enumerators below list NODE_NEW and NODE_TRUNC before
 * NODE_WRITE, whereas the order stated above puts NODE_WRITE first -
 * confirm which of the two is intended.
 */
enum lock_type {
	RENAME,		/* for renaming operations */
	DENTRY_OPS,	/* for directory operations */
	DATA_WRITE,	/* for data write */
	DATA_NEW,	/* for data allocation */
	DATA_TRUNC,	/* for data truncate */
	NODE_NEW,	/* for node allocation */
	NODE_TRUNC,	/* for node truncate */
	NODE_WRITE,	/* for node write */
	NR_LOCK_TYPE,
};
311
/*
 * The below are the page types of bios used in submit_bio().
 * The available types are:
 * DATA			User data pages. It operates as async mode.
 * NODE			Node pages. It operates as async mode.
 * META			FS metadata pages such as SIT, NAT, CP.
 * NR_PAGE_TYPE		The number of page types.
 * META_FLUSH		Make sure the previous pages are written
 *			with waiting the bio's completion
 * ...			Only can be used with META.
 *
 * META_FLUSH is placed after NR_PAGE_TYPE, so arrays sized by
 * NR_PAGE_TYPE have no slot for it.
 */
enum page_type {
	DATA,
	NODE,
	META,
	NR_PAGE_TYPE,
	META_FLUSH,
};
330
/*
 * In-memory state of one mounted f2fs instance.  It is stored in the VFS
 * super block's s_fs_info and retrieved with F2FS_SB().
 */
struct f2fs_sb_info {
	struct super_block *sb;			/* pointer to VFS super block */
	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
	struct f2fs_super_block *raw_super;	/* raw super block pointer */
	int s_dirty;				/* dirty flag for checkpoint */

	/* for node-related operations */
	struct f2fs_nm_info *nm_info;		/* node manager */
	struct inode *node_inode;		/* cache node blocks */

	/* for segment-related operations */
	struct f2fs_sm_info *sm_info;		/* segment manager */
	struct bio *bio[NR_PAGE_TYPE];		/* bios to merge */
	sector_t last_block_in_bio[NR_PAGE_TYPE];	/* last block number */
	struct rw_semaphore bio_sem;		/* IO semaphore */

	/* for checkpoint */
	struct f2fs_checkpoint *ckpt;		/* raw checkpoint pointer */
	struct inode *meta_inode;		/* cache meta blocks */
	struct mutex cp_mutex;			/* for checkpoint procedure */
	struct mutex fs_lock[NR_LOCK_TYPE];	/* for blocking FS operations */
	struct mutex write_inode;		/* mutex for write inode */
	struct mutex writepages;		/* mutex for writepages() */
	int por_doing;				/* recovery is doing or not */

	/* for orphan inode management */
	struct list_head orphan_inode_list;	/* orphan inode list */
	struct mutex orphan_inode_mutex;	/* for orphan inode list */
	unsigned int n_orphans;			/* # of orphan inodes */

	/* for directory inode management */
	struct list_head dir_inode_list;	/* dir inode list */
	spinlock_t dir_inode_lock;		/* for dir inode list lock */
	unsigned int n_dirty_dirs;		/* # of dir inodes */

	/* basic file system units */
	unsigned int log_sectors_per_block;	/* log2 sectors per block */
	unsigned int log_blocksize;		/* log2 block size */
	unsigned int blocksize;			/* block size */
	unsigned int root_ino_num;		/* root inode number */
	unsigned int node_ino_num;		/* node inode number */
	unsigned int meta_ino_num;		/* meta inode number */
	unsigned int log_blocks_per_seg;	/* log2 blocks per segment */
	unsigned int blocks_per_seg;		/* blocks per segment */
	unsigned int segs_per_sec;		/* segments per section */
	unsigned int secs_per_zone;		/* sections per zone */
	unsigned int total_sections;		/* total section count */
	unsigned int total_node_count;		/* total node block count */
	unsigned int total_valid_node_count;	/* valid node block count */
	unsigned int total_valid_inode_count;	/* valid inode count */
	int active_logs;			/* # of active logs */

	block_t user_block_count;		/* # of user blocks */
	block_t total_valid_block_count;	/* # of valid blocks */
	block_t alloc_valid_block_count;	/* # of allocated blocks */
	block_t last_valid_block_count;		/* for recovery */
	u32 s_next_generation;			/* for NFS support */
	atomic_t nr_pages[NR_COUNT_TYPE];	/* # of pages, see count_type */

	struct f2fs_mount_info mount_opt;	/* mount options */

	/* for cleaning operations */
	struct mutex gc_mutex;			/* mutex for GC */
	struct f2fs_gc_kthread	*gc_thread;	/* GC thread */

	/*
	 * for stat information.
	 * In the two-element arrays, one is for the LFS mode, and the other
	 * is for the SSR mode.
	 */
	struct f2fs_stat_info *stat_info;	/* FS status information */
	unsigned int segment_count[2];		/* # of allocated segments */
	unsigned int block_count[2];		/* # of allocated blocks */
	unsigned int last_victim[2];		/* last victim segment # */
	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
	int bg_gc;				/* background gc calls */
	spinlock_t stat_lock;			/* lock for stat operations */
};
408
409/*
410 * Inline functions
411 */
412static inline struct f2fs_inode_info *F2FS_I(struct inode *inode)
413{
414 return container_of(inode, struct f2fs_inode_info, vfs_inode);
415}
416
417static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
418{
419 return sb->s_fs_info;
420}
421
422static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
423{
424 return (struct f2fs_super_block *)(sbi->raw_super);
425}
426
427static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi)
428{
429 return (struct f2fs_checkpoint *)(sbi->ckpt);
430}
431
432static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
433{
434 return (struct f2fs_nm_info *)(sbi->nm_info);
435}
436
437static inline struct f2fs_sm_info *SM_I(struct f2fs_sb_info *sbi)
438{
439 return (struct f2fs_sm_info *)(sbi->sm_info);
440}
441
442static inline struct sit_info *SIT_I(struct f2fs_sb_info *sbi)
443{
444 return (struct sit_info *)(SM_I(sbi)->sit_info);
445}
446
447static inline struct free_segmap_info *FREE_I(struct f2fs_sb_info *sbi)
448{
449 return (struct free_segmap_info *)(SM_I(sbi)->free_info);
450}
451
452static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
453{
454 return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
455}
456
457static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
458{
459 sbi->s_dirty = 1;
460}
461
462static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
463{
464 sbi->s_dirty = 0;
465}
466
467static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
468{
469 unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
470 return ckpt_flags & f;
471}
472
473static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
474{
475 unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
476 ckpt_flags |= f;
477 cp->ckpt_flags = cpu_to_le32(ckpt_flags);
478}
479
480static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
481{
482 unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags);
483 ckpt_flags &= (~f);
484 cp->ckpt_flags = cpu_to_le32(ckpt_flags);
485}
486
487static inline void mutex_lock_op(struct f2fs_sb_info *sbi, enum lock_type t)
488{
489 mutex_lock_nested(&sbi->fs_lock[t], t);
490}
491
492static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, enum lock_type t)
493{
494 mutex_unlock(&sbi->fs_lock[t]);
495}
496
497/*
498 * Check whether the given nid is within node id range.
499 */
500static inline void check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
501{
502 BUG_ON((nid >= NM_I(sbi)->max_nid));
503}
504
505#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1
506
507/*
508 * Check whether the inode has blocks or not
509 */
510static inline int F2FS_HAS_BLOCKS(struct inode *inode)
511{
512 if (F2FS_I(inode)->i_xattr_nid)
513 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1);
514 else
515 return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS);
516}
517
518static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
519 struct inode *inode, blkcnt_t count)
520{
521 block_t valid_block_count;
522
523 spin_lock(&sbi->stat_lock);
524 valid_block_count =
525 sbi->total_valid_block_count + (block_t)count;
526 if (valid_block_count > sbi->user_block_count) {
527 spin_unlock(&sbi->stat_lock);
528 return false;
529 }
530 inode->i_blocks += count;
531 sbi->total_valid_block_count = valid_block_count;
532 sbi->alloc_valid_block_count += (block_t)count;
533 spin_unlock(&sbi->stat_lock);
534 return true;
535}
536
537static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
538 struct inode *inode,
539 blkcnt_t count)
540{
541 spin_lock(&sbi->stat_lock);
542 BUG_ON(sbi->total_valid_block_count < (block_t) count);
543 BUG_ON(inode->i_blocks < count);
544 inode->i_blocks -= count;
545 sbi->total_valid_block_count -= (block_t)count;
546 spin_unlock(&sbi->stat_lock);
547 return 0;
548}
549
550static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
551{
552 atomic_inc(&sbi->nr_pages[count_type]);
553 F2FS_SET_SB_DIRT(sbi);
554}
555
556static inline void inode_inc_dirty_dents(struct inode *inode)
557{
558 atomic_inc(&F2FS_I(inode)->dirty_dents);
559}
560
561static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
562{
563 atomic_dec(&sbi->nr_pages[count_type]);
564}
565
566static inline void inode_dec_dirty_dents(struct inode *inode)
567{
568 atomic_dec(&F2FS_I(inode)->dirty_dents);
569}
570
571static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
572{
573 return atomic_read(&sbi->nr_pages[count_type]);
574}
575
576static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
577{
578 block_t ret;
579 spin_lock(&sbi->stat_lock);
580 ret = sbi->total_valid_block_count;
581 spin_unlock(&sbi->stat_lock);
582 return ret;
583}
584
585static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
586{
587 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
588
589 /* return NAT or SIT bitmap */
590 if (flag == NAT_BITMAP)
591 return le32_to_cpu(ckpt->nat_ver_bitmap_bytesize);
592 else if (flag == SIT_BITMAP)
593 return le32_to_cpu(ckpt->sit_ver_bitmap_bytesize);
594
595 return 0;
596}
597
598static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
599{
600 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
601 int offset = (flag == NAT_BITMAP) ?
602 le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
603 return &ckpt->sit_nat_version_bitmap + offset;
604}
605
606static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
607{
608 block_t start_addr;
609 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
610 unsigned long long ckpt_version = le64_to_cpu(ckpt->checkpoint_ver);
611
612 start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
613
614 /*
615 * odd numbered checkpoint should at cp segment 0
616 * and even segent must be at cp segment 1
617 */
618 if (!(ckpt_version & 1))
619 start_addr += sbi->blocks_per_seg;
620
621 return start_addr;
622}
623
624static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
625{
626 return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
627}
628
629static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
630 struct inode *inode,
631 unsigned int count)
632{
633 block_t valid_block_count;
634 unsigned int valid_node_count;
635
636 spin_lock(&sbi->stat_lock);
637
638 valid_block_count = sbi->total_valid_block_count + (block_t)count;
639 sbi->alloc_valid_block_count += (block_t)count;
640 valid_node_count = sbi->total_valid_node_count + count;
641
642 if (valid_block_count > sbi->user_block_count) {
643 spin_unlock(&sbi->stat_lock);
644 return false;
645 }
646
647 if (valid_node_count > sbi->total_node_count) {
648 spin_unlock(&sbi->stat_lock);
649 return false;
650 }
651
652 if (inode)
653 inode->i_blocks += count;
654 sbi->total_valid_node_count = valid_node_count;
655 sbi->total_valid_block_count = valid_block_count;
656 spin_unlock(&sbi->stat_lock);
657
658 return true;
659}
660
661static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
662 struct inode *inode,
663 unsigned int count)
664{
665 spin_lock(&sbi->stat_lock);
666
667 BUG_ON(sbi->total_valid_block_count < count);
668 BUG_ON(sbi->total_valid_node_count < count);
669 BUG_ON(inode->i_blocks < count);
670
671 inode->i_blocks -= count;
672 sbi->total_valid_node_count -= count;
673 sbi->total_valid_block_count -= (block_t)count;
674
675 spin_unlock(&sbi->stat_lock);
676}
677
678static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
679{
680 unsigned int ret;
681 spin_lock(&sbi->stat_lock);
682 ret = sbi->total_valid_node_count;
683 spin_unlock(&sbi->stat_lock);
684 return ret;
685}
686
687static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
688{
689 spin_lock(&sbi->stat_lock);
690 BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count);
691 sbi->total_valid_inode_count++;
692 spin_unlock(&sbi->stat_lock);
693}
694
695static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi)
696{
697 spin_lock(&sbi->stat_lock);
698 BUG_ON(!sbi->total_valid_inode_count);
699 sbi->total_valid_inode_count--;
700 spin_unlock(&sbi->stat_lock);
701 return 0;
702}
703
704static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
705{
706 unsigned int ret;
707 spin_lock(&sbi->stat_lock);
708 ret = sbi->total_valid_inode_count;
709 spin_unlock(&sbi->stat_lock);
710 return ret;
711}
712
/*
 * Drop a page reference, optionally unlocking the page first.
 *
 * NULL and ERR_PTR values are ignored.  When @unlock is non-zero the page is
 * required to be locked (BUG_ON otherwise) and is unlocked before the
 * reference is released.
 */
static inline void f2fs_put_page(struct page *page, int unlock)
{
	if (page && !IS_ERR(page)) {
		if (unlock) {
			BUG_ON(!PageLocked(page));
			unlock_page(page);
		}
		page_cache_release(page);
	}
}
724
725static inline void f2fs_put_dnode(struct dnode_of_data *dn)
726{
727 if (dn->node_page)
728 f2fs_put_page(dn->node_page, 1);
729 if (dn->inode_page && dn->node_page != dn->inode_page)
730 f2fs_put_page(dn->inode_page, 0);
731 dn->node_page = NULL;
732 dn->inode_page = NULL;
733}
734
735static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
736 size_t size, void (*ctor)(void *))
737{
738 return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
739}
740
741#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
742
743static inline bool IS_INODE(struct page *page)
744{
745 struct f2fs_node *p = (struct f2fs_node *)page_address(page);
746 return RAW_IS_INODE(p);
747}
748
749static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
750{
751 return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr;
752}
753
754static inline block_t datablock_addr(struct page *node_page,
755 unsigned int offset)
756{
757 struct f2fs_node *raw_node;
758 __le32 *addr_array;
759 raw_node = (struct f2fs_node *)page_address(node_page);
760 addr_array = blkaddr_in_node(raw_node);
761 return le32_to_cpu(addr_array[offset]);
762}
763
/*
 * Test bit @nr of the bitmap at @addr.  Bits are numbered MSB-first within
 * each byte (bit 0 is 0x80 of the first byte).
 *
 * Returns the masked byte value: non-zero if the bit is set, 0 otherwise.
 */
static inline int f2fs_test_bit(unsigned int nr, char *addr)
{
	const char *byte = addr + (nr >> 3);
	int mask = 1 << (7 - (nr & 0x07));

	return *byte & mask;
}
772
/*
 * Set bit @nr of the bitmap at @addr (MSB-first numbering within each byte).
 *
 * Returns the previous state of the bit as a masked byte value: non-zero if
 * it was already set, 0 otherwise.
 */
static inline int f2fs_set_bit(unsigned int nr, char *addr)
{
	char *byte = addr + (nr >> 3);
	int mask = 1 << (7 - (nr & 0x07));
	int was_set = *byte & mask;

	*byte = (char)(*byte | mask);
	return was_set;
}
784
/*
 * Clear bit @nr of the bitmap at @addr (MSB-first numbering within each
 * byte).
 *
 * Returns the previous state of the bit as a masked byte value: non-zero if
 * it was set, 0 otherwise.
 */
static inline int f2fs_clear_bit(unsigned int nr, char *addr)
{
	char *byte = addr + (nr >> 3);
	int mask = 1 << (7 - (nr & 0x07));
	int was_set = *byte & mask;

	*byte = (char)(*byte & ~mask);
	return was_set;
}
796
/* Bit numbers used in f2fs_inode_info->flags */
enum {
	FI_NEW_INODE	= 0,	/* inode was newly allocated */
	FI_NEED_CP	= 1,	/* fsync must perform a checkpoint */
	FI_INC_LINK	= 2,	/* i_nlink needs to be incremented */
	FI_ACL_MODE	= 3,	/* an acl mode is recorded */
	FI_NO_ALLOC	= 4,	/* no blocks should be allocated */
};
805
806static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
807{
808 set_bit(flag, &fi->flags);
809}
810
811static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
812{
813 return test_bit(flag, &fi->flags);
814}
815
816static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
817{
818 clear_bit(flag, &fi->flags);
819}
820
821static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
822{
823 fi->i_acl_mode = mode;
824 set_inode_flag(fi, FI_ACL_MODE);
825}
826
/*
 * Clear inode flag @flag if it is currently set.
 *
 * Returns 1 when the flag was set (and has now been cleared), 0 when it was
 * not set.  The flag that is tested and cleared is the one the caller passes
 * in; previously the argument was ignored and FI_ACL_MODE was always used, so
 * callers passing FI_ACL_MODE see no change.
 */
static inline int cond_clear_inode_flag(struct f2fs_inode_info *fi, int flag)
{
	if (is_inode_flag_set(fi, flag)) {
		clear_inode_flag(fi, flag);
		return 1;
	}
	return 0;
}
835
836/*
837 * file.c
838 */
839int f2fs_sync_file(struct file *, loff_t, loff_t, int);
840void truncate_data_blocks(struct dnode_of_data *);
841void f2fs_truncate(struct inode *);
842int f2fs_setattr(struct dentry *, struct iattr *);
843int truncate_hole(struct inode *, pgoff_t, pgoff_t);
844long f2fs_ioctl(struct file *, unsigned int, unsigned long);
845
846/*
847 * inode.c
848 */
849void f2fs_set_inode_flags(struct inode *);
850struct inode *f2fs_iget_nowait(struct super_block *, unsigned long);
851struct inode *f2fs_iget(struct super_block *, unsigned long);
852void update_inode(struct inode *, struct page *);
853int f2fs_write_inode(struct inode *, struct writeback_control *);
854void f2fs_evict_inode(struct inode *);
855
856/*
857 * namei.c
858 */
859struct dentry *f2fs_get_parent(struct dentry *child);
860
861/*
862 * dir.c
863 */
864struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *,
865 struct page **);
866struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **);
867ino_t f2fs_inode_by_name(struct inode *, struct qstr *);
868void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
869 struct page *, struct inode *);
870void init_dent_inode(struct dentry *, struct page *);
871int f2fs_add_link(struct dentry *, struct inode *);
872void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
873int f2fs_make_empty(struct inode *, struct inode *);
874bool f2fs_empty_dir(struct inode *);
875
876/*
877 * super.c
878 */
879int f2fs_sync_fs(struct super_block *, int);
880
881/*
882 * hash.c
883 */
884f2fs_hash_t f2fs_dentry_hash(const char *, int);
885
886/*
887 * node.c
888 */
889struct dnode_of_data;
890struct node_info;
891
892int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
893void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
894int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
895int truncate_inode_blocks(struct inode *, pgoff_t);
896int remove_inode_page(struct inode *);
897int new_inode_page(struct inode *, struct dentry *);
898struct page *new_node_page(struct dnode_of_data *, unsigned int);
899void ra_node_page(struct f2fs_sb_info *, nid_t);
900struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
901struct page *get_node_page_ra(struct page *, int);
902void sync_inode_page(struct dnode_of_data *);
903int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
904bool alloc_nid(struct f2fs_sb_info *, nid_t *);
905void alloc_nid_done(struct f2fs_sb_info *, nid_t);
906void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
907void recover_node_page(struct f2fs_sb_info *, struct page *,
908 struct f2fs_summary *, struct node_info *, block_t);
909int recover_inode_page(struct f2fs_sb_info *, struct page *);
910int restore_node_summary(struct f2fs_sb_info *, unsigned int,
911 struct f2fs_summary_block *);
912void flush_nat_entries(struct f2fs_sb_info *);
913int build_node_manager(struct f2fs_sb_info *);
914void destroy_node_manager(struct f2fs_sb_info *);
915int create_node_manager_caches(void);
916void destroy_node_manager_caches(void);
917
918/*
919 * segment.c
920 */
921void f2fs_balance_fs(struct f2fs_sb_info *);
922void invalidate_blocks(struct f2fs_sb_info *, block_t);
923void locate_dirty_segment(struct f2fs_sb_info *, unsigned int);
924void clear_prefree_segments(struct f2fs_sb_info *);
925int npages_for_summary_flush(struct f2fs_sb_info *);
926void allocate_new_segments(struct f2fs_sb_info *);
927struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
928struct bio *f2fs_bio_alloc(struct block_device *, int);
929void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool sync);
930int write_meta_page(struct f2fs_sb_info *, struct page *,
931 struct writeback_control *);
932void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
933 block_t, block_t *);
934void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
935 block_t, block_t *);
936void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t);
937void recover_data_page(struct f2fs_sb_info *, struct page *,
938 struct f2fs_summary *, block_t, block_t);
939void rewrite_node_page(struct f2fs_sb_info *, struct page *,
940 struct f2fs_summary *, block_t, block_t);
941void write_data_summaries(struct f2fs_sb_info *, block_t);
942void write_node_summaries(struct f2fs_sb_info *, block_t);
943int lookup_journal_in_cursum(struct f2fs_summary_block *,
944 int, unsigned int, int);
945void flush_sit_entries(struct f2fs_sb_info *);
946int build_segment_manager(struct f2fs_sb_info *);
947void reset_victim_segmap(struct f2fs_sb_info *);
948void destroy_segment_manager(struct f2fs_sb_info *);
949
950/*
951 * checkpoint.c
952 */
953struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
954struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
955long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
956int check_orphan_space(struct f2fs_sb_info *);
957void add_orphan_inode(struct f2fs_sb_info *, nid_t);
958void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
959int recover_orphan_inodes(struct f2fs_sb_info *);
960int get_valid_checkpoint(struct f2fs_sb_info *);
961void set_dirty_dir_page(struct inode *, struct page *);
962void remove_dirty_dir_inode(struct inode *);
963void sync_dirty_dir_inodes(struct f2fs_sb_info *);
964void block_operations(struct f2fs_sb_info *);
965void write_checkpoint(struct f2fs_sb_info *, bool, bool);
966void init_orphan_info(struct f2fs_sb_info *);
967int create_checkpoint_caches(void);
968void destroy_checkpoint_caches(void);
969
970/*
971 * data.c
972 */
973int reserve_new_block(struct dnode_of_data *);
974void update_extent_cache(block_t, struct dnode_of_data *);
975struct page *find_data_page(struct inode *, pgoff_t);
976struct page *get_lock_data_page(struct inode *, pgoff_t);
977struct page *get_new_data_page(struct inode *, pgoff_t, bool);
978int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
979int do_write_data_page(struct page *);
980
981/*
982 * gc.c
983 */
984int start_gc_thread(struct f2fs_sb_info *);
985void stop_gc_thread(struct f2fs_sb_info *);
986block_t start_bidx_of_node(unsigned int);
987int f2fs_gc(struct f2fs_sb_info *, int);
988void build_gc_manager(struct f2fs_sb_info *);
989int create_gc_caches(void);
990void destroy_gc_caches(void);
991
992/*
993 * recovery.c
994 */
995void recover_fsync_data(struct f2fs_sb_info *);
996bool space_for_roll_forward(struct f2fs_sb_info *);
997
998/*
999 * debug.c
1000 */
1001#ifdef CONFIG_F2FS_STAT_FS
1002struct f2fs_stat_info {
1003 struct list_head stat_list;
1004 struct f2fs_sb_info *sbi;
1005 struct mutex stat_lock;
1006 int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
1007 int main_area_segs, main_area_sections, main_area_zones;
1008 int hit_ext, total_ext;
1009 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
1010 int nats, sits, fnids;
1011 int total_count, utilization;
1012 int bg_gc;
1013 unsigned int valid_count, valid_node_count, valid_inode_count;
1014 unsigned int bimodal, avg_vblocks;
1015 int util_free, util_valid, util_invalid;
1016 int rsvd_segs, overp_segs;
1017 int dirty_count, node_pages, meta_pages;
1018 int prefree_count, call_count;
1019 int tot_segs, node_segs, data_segs, free_segs, free_secs;
1020 int tot_blks, data_blks, node_blks;
1021 int curseg[NR_CURSEG_TYPE];
1022 int cursec[NR_CURSEG_TYPE];
1023 int curzone[NR_CURSEG_TYPE];
1024
1025 unsigned int segment_count[2];
1026 unsigned int block_count[2];
1027 unsigned base_mem, cache_mem;
1028};
1029
/*
 * Statistics update helpers.
 *
 * Every macro argument is parenthesized so that callers may pass arbitrary
 * expressions (e.g. "base + 1" or a conditional).  The temporaries are named
 * _si so that they cannot capture a caller argument that happens to be
 * called "si".
 */

/* Bump the call counter of a struct f2fs_stat_info. */
#define stat_inc_call_count(si)	((si)->call_count++)

/* Count one more segment, classified as data or node by @type. */
#define stat_inc_seg_count(sbi, type)					\
	do {								\
		struct f2fs_stat_info *_si = (sbi)->stat_info;		\
		_si->tot_segs++;					\
		if ((type) == SUM_TYPE_DATA)				\
			_si->data_segs++;				\
		else							\
			_si->node_segs++;				\
	} while (0)

/* Add @blks to the total block counter of a struct f2fs_stat_info. */
#define stat_inc_tot_blk_count(si, blks)				\
	((si)->tot_blks += (blks))

/* Add @blks to both the total and the data block counters. */
#define stat_inc_data_blk_count(sbi, blks)				\
	do {								\
		struct f2fs_stat_info *_si = (sbi)->stat_info;		\
		stat_inc_tot_blk_count(_si, blks);			\
		_si->data_blks += (blks);				\
	} while (0)

/* Add @blks to both the total and the node block counters. */
#define stat_inc_node_blk_count(sbi, blks)				\
	do {								\
		struct f2fs_stat_info *_si = (sbi)->stat_info;		\
		stat_inc_tot_blk_count(_si, blks);			\
		_si->node_blks += (blks);				\
	} while (0)
1058
1059int f2fs_build_stats(struct f2fs_sb_info *);
1060void f2fs_destroy_stats(struct f2fs_sb_info *);
1061void destroy_root_stats(void);
1062#else
1063#define stat_inc_call_count(si)
1064#define stat_inc_seg_count(si, type)
1065#define stat_inc_tot_blk_count(si, blks)
1066#define stat_inc_data_blk_count(si, blks)
1067#define stat_inc_node_blk_count(sbi, blks)
1068
1069static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
1070static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
1071static inline void destroy_root_stats(void) { }
1072#endif
1073
1074extern const struct file_operations f2fs_dir_operations;
1075extern const struct file_operations f2fs_file_operations;
1076extern const struct inode_operations f2fs_file_inode_operations;
1077extern const struct address_space_operations f2fs_dblock_aops;
1078extern const struct address_space_operations f2fs_node_aops;
1079extern const struct address_space_operations f2fs_meta_aops;
1080extern const struct inode_operations f2fs_dir_inode_operations;
1081extern const struct inode_operations f2fs_symlink_inode_operations;
1082extern const struct inode_operations f2fs_special_inode_operations;
1083#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
new file mode 100644
index 000000000000..f9e085dfb1f0
--- /dev/null
+++ b/fs/f2fs/file.c
@@ -0,0 +1,636 @@
1/*
2 * fs/f2fs/file.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/stat.h>
14#include <linux/buffer_head.h>
15#include <linux/writeback.h>
16#include <linux/falloc.h>
17#include <linux/types.h>
18#include <linux/uaccess.h>
19#include <linux/mount.h>
20
21#include "f2fs.h"
22#include "node.h"
23#include "segment.h"
24#include "xattr.h"
25#include "acl.h"
26
27static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
28 struct vm_fault *vmf)
29{
30 struct page *page = vmf->page;
31 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
32 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
33 block_t old_blk_addr;
34 struct dnode_of_data dn;
35 int err;
36
37 f2fs_balance_fs(sbi);
38
39 sb_start_pagefault(inode->i_sb);
40
41 mutex_lock_op(sbi, DATA_NEW);
42
43 /* block allocation */
44 set_new_dnode(&dn, inode, NULL, NULL, 0);
45 err = get_dnode_of_data(&dn, page->index, 0);
46 if (err) {
47 mutex_unlock_op(sbi, DATA_NEW);
48 goto out;
49 }
50
51 old_blk_addr = dn.data_blkaddr;
52
53 if (old_blk_addr == NULL_ADDR) {
54 err = reserve_new_block(&dn);
55 if (err) {
56 f2fs_put_dnode(&dn);
57 mutex_unlock_op(sbi, DATA_NEW);
58 goto out;
59 }
60 }
61 f2fs_put_dnode(&dn);
62
63 mutex_unlock_op(sbi, DATA_NEW);
64
65 lock_page(page);
66 if (page->mapping != inode->i_mapping ||
67 page_offset(page) >= i_size_read(inode) ||
68 !PageUptodate(page)) {
69 unlock_page(page);
70 err = -EFAULT;
71 goto out;
72 }
73
74 /*
75 * check to see if the page is mapped already (no holes)
76 */
77 if (PageMappedToDisk(page))
78 goto out;
79
80 /* fill the page */
81 wait_on_page_writeback(page);
82
83 /* page is wholly or partially inside EOF */
84 if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
85 unsigned offset;
86 offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
87 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
88 }
89 set_page_dirty(page);
90 SetPageUptodate(page);
91
92 file_update_time(vma->vm_file);
93out:
94 sb_end_pagefault(inode->i_sb);
95 return block_page_mkwrite_return(err);
96}
97
98static const struct vm_operations_struct f2fs_file_vm_ops = {
99 .fault = filemap_fault,
100 .page_mkwrite = f2fs_vm_page_mkwrite,
101};
102
103static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode)
104{
105 struct dentry *dentry;
106 nid_t pino;
107
108 inode = igrab(inode);
109 dentry = d_find_any_alias(inode);
110 if (!dentry) {
111 iput(inode);
112 return 0;
113 }
114 pino = dentry->d_parent->d_inode->i_ino;
115 dput(dentry);
116 iput(inode);
117 return !is_checkpointed_node(sbi, pino);
118}
119
120int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
121{
122 struct inode *inode = file->f_mapping->host;
123 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
124 unsigned long long cur_version;
125 int ret = 0;
126 bool need_cp = false;
127 struct writeback_control wbc = {
128 .sync_mode = WB_SYNC_ALL,
129 .nr_to_write = LONG_MAX,
130 .for_reclaim = 0,
131 };
132
133 if (inode->i_sb->s_flags & MS_RDONLY)
134 return 0;
135
136 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
137 if (ret)
138 return ret;
139
140 mutex_lock(&inode->i_mutex);
141
142 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
143 goto out;
144
145 mutex_lock(&sbi->cp_mutex);
146 cur_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver);
147 mutex_unlock(&sbi->cp_mutex);
148
149 if (F2FS_I(inode)->data_version != cur_version &&
150 !(inode->i_state & I_DIRTY))
151 goto out;
152 F2FS_I(inode)->data_version--;
153
154 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
155 need_cp = true;
156 if (is_inode_flag_set(F2FS_I(inode), FI_NEED_CP))
157 need_cp = true;
158 if (!space_for_roll_forward(sbi))
159 need_cp = true;
160 if (need_to_sync_dir(sbi, inode))
161 need_cp = true;
162
163 f2fs_write_inode(inode, NULL);
164
165 if (need_cp) {
166 /* all the dirty node pages should be flushed for POR */
167 ret = f2fs_sync_fs(inode->i_sb, 1);
168 clear_inode_flag(F2FS_I(inode), FI_NEED_CP);
169 } else {
170 while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0)
171 f2fs_write_inode(inode, NULL);
172 filemap_fdatawait_range(sbi->node_inode->i_mapping,
173 0, LONG_MAX);
174 }
175out:
176 mutex_unlock(&inode->i_mutex);
177 return ret;
178}
179
180static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
181{
182 file_accessed(file);
183 vma->vm_ops = &f2fs_file_vm_ops;
184 return 0;
185}
186
187static int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
188{
189 int nr_free = 0, ofs = dn->ofs_in_node;
190 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
191 struct f2fs_node *raw_node;
192 __le32 *addr;
193
194 raw_node = page_address(dn->node_page);
195 addr = blkaddr_in_node(raw_node) + ofs;
196
197 for ( ; count > 0; count--, addr++, dn->ofs_in_node++) {
198 block_t blkaddr = le32_to_cpu(*addr);
199 if (blkaddr == NULL_ADDR)
200 continue;
201
202 update_extent_cache(NULL_ADDR, dn);
203 invalidate_blocks(sbi, blkaddr);
204 dec_valid_block_count(sbi, dn->inode, 1);
205 nr_free++;
206 }
207 if (nr_free) {
208 set_page_dirty(dn->node_page);
209 sync_inode_page(dn);
210 }
211 dn->ofs_in_node = ofs;
212 return nr_free;
213}
214
215void truncate_data_blocks(struct dnode_of_data *dn)
216{
217 truncate_data_blocks_range(dn, ADDRS_PER_BLOCK);
218}
219
220static void truncate_partial_data_page(struct inode *inode, u64 from)
221{
222 unsigned offset = from & (PAGE_CACHE_SIZE - 1);
223 struct page *page;
224
225 if (!offset)
226 return;
227
228 page = find_data_page(inode, from >> PAGE_CACHE_SHIFT);
229 if (IS_ERR(page))
230 return;
231
232 lock_page(page);
233 wait_on_page_writeback(page);
234 zero_user(page, offset, PAGE_CACHE_SIZE - offset);
235 set_page_dirty(page);
236 f2fs_put_page(page, 1);
237}
238
239static int truncate_blocks(struct inode *inode, u64 from)
240{
241 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
242 unsigned int blocksize = inode->i_sb->s_blocksize;
243 struct dnode_of_data dn;
244 pgoff_t free_from;
245 int count = 0;
246 int err;
247
248 free_from = (pgoff_t)
249 ((from + blocksize - 1) >> (sbi->log_blocksize));
250
251 mutex_lock_op(sbi, DATA_TRUNC);
252
253 set_new_dnode(&dn, inode, NULL, NULL, 0);
254 err = get_dnode_of_data(&dn, free_from, RDONLY_NODE);
255 if (err) {
256 if (err == -ENOENT)
257 goto free_next;
258 mutex_unlock_op(sbi, DATA_TRUNC);
259 return err;
260 }
261
262 if (IS_INODE(dn.node_page))
263 count = ADDRS_PER_INODE;
264 else
265 count = ADDRS_PER_BLOCK;
266
267 count -= dn.ofs_in_node;
268 BUG_ON(count < 0);
269 if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
270 truncate_data_blocks_range(&dn, count);
271 free_from += count;
272 }
273
274 f2fs_put_dnode(&dn);
275free_next:
276 err = truncate_inode_blocks(inode, free_from);
277 mutex_unlock_op(sbi, DATA_TRUNC);
278
279 /* lastly zero out the first data page */
280 truncate_partial_data_page(inode, from);
281
282 return err;
283}
284
285void f2fs_truncate(struct inode *inode)
286{
287 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
288 S_ISLNK(inode->i_mode)))
289 return;
290
291 if (!truncate_blocks(inode, i_size_read(inode))) {
292 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
293 mark_inode_dirty(inode);
294 }
295
296 f2fs_balance_fs(F2FS_SB(inode->i_sb));
297}
298
299static int f2fs_getattr(struct vfsmount *mnt,
300 struct dentry *dentry, struct kstat *stat)
301{
302 struct inode *inode = dentry->d_inode;
303 generic_fillattr(inode, stat);
304 stat->blocks <<= 3;
305 return 0;
306}
307
#ifdef CONFIG_F2FS_FS_POSIX_ACL
/*
 * Copy the attributes selected by attr->ia_valid into @inode.
 *
 * uid, gid and the three timestamps (truncated to the superblock's time
 * granularity) are written to the inode directly.  A new mode is not applied
 * here: it is stashed with set_acl_inode(), after stripping S_ISGID when the
 * caller is neither in the inode's group nor holds CAP_FSETID.
 */
static void __setattr_copy(struct inode *inode, const struct iattr *attr)
{
	struct f2fs_inode_info *fi = F2FS_I(inode);
	unsigned int valid = attr->ia_valid;
	unsigned int gran = inode->i_sb->s_time_gran;

	if (valid & ATTR_UID)
		inode->i_uid = attr->ia_uid;
	if (valid & ATTR_GID)
		inode->i_gid = attr->ia_gid;

	if (valid & ATTR_ATIME)
		inode->i_atime = timespec_trunc(attr->ia_atime, gran);
	if (valid & ATTR_MTIME)
		inode->i_mtime = timespec_trunc(attr->ia_mtime, gran);
	if (valid & ATTR_CTIME)
		inode->i_ctime = timespec_trunc(attr->ia_ctime, gran);

	if (valid & ATTR_MODE) {
		umode_t new_mode = attr->ia_mode;

		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
			new_mode &= ~S_ISGID;
		set_acl_inode(fi, new_mode);
	}
}
#else
#define __setattr_copy setattr_copy
#endif
338
339int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
340{
341 struct inode *inode = dentry->d_inode;
342 struct f2fs_inode_info *fi = F2FS_I(inode);
343 int err;
344
345 err = inode_change_ok(inode, attr);
346 if (err)
347 return err;
348
349 if ((attr->ia_valid & ATTR_SIZE) &&
350 attr->ia_size != i_size_read(inode)) {
351 truncate_setsize(inode, attr->ia_size);
352 f2fs_truncate(inode);
353 }
354
355 __setattr_copy(inode, attr);
356
357 if (attr->ia_valid & ATTR_MODE) {
358 err = f2fs_acl_chmod(inode);
359 if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
360 inode->i_mode = fi->i_acl_mode;
361 clear_inode_flag(fi, FI_ACL_MODE);
362 }
363 }
364
365 mark_inode_dirty(inode);
366 return err;
367}
368
/*
 * Inode operations table built from the getattr/setattr handlers defined in
 * this file. The xattr entry points are only wired up when
 * CONFIG_F2FS_FS_XATTR is enabled.
 */
369const struct inode_operations f2fs_file_inode_operations = {
370 .getattr = f2fs_getattr,
371 .setattr = f2fs_setattr,
372 .get_acl = f2fs_get_acl,
373#ifdef CONFIG_F2FS_FS_XATTR
374 .setxattr = generic_setxattr,
375 .getxattr = generic_getxattr,
376 .listxattr = f2fs_listxattr,
377 .removexattr = generic_removexattr,
378#endif
379};
380
381static void fill_zero(struct inode *inode, pgoff_t index,
382 loff_t start, loff_t len)
383{
384 struct page *page;
385
386 if (!len)
387 return;
388
389 page = get_new_data_page(inode, index, false);
390
391 if (!IS_ERR(page)) {
392 wait_on_page_writeback(page);
393 zero_user(page, start, len);
394 set_page_dirty(page);
395 f2fs_put_page(page, 1);
396 }
397}
398
399int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
400{
401 pgoff_t index;
402 int err;
403
404 for (index = pg_start; index < pg_end; index++) {
405 struct dnode_of_data dn;
406 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
407
408 mutex_lock_op(sbi, DATA_TRUNC);
409 set_new_dnode(&dn, inode, NULL, NULL, 0);
410 err = get_dnode_of_data(&dn, index, RDONLY_NODE);
411 if (err) {
412 mutex_unlock_op(sbi, DATA_TRUNC);
413 if (err == -ENOENT)
414 continue;
415 return err;
416 }
417
418 if (dn.data_blkaddr != NULL_ADDR)
419 truncate_data_blocks_range(&dn, 1);
420 f2fs_put_dnode(&dn);
421 mutex_unlock_op(sbi, DATA_TRUNC);
422 }
423 return 0;
424}
425
426static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
427{
428 pgoff_t pg_start, pg_end;
429 loff_t off_start, off_end;
430 int ret = 0;
431
432 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
433 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
434
435 off_start = offset & (PAGE_CACHE_SIZE - 1);
436 off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
437
438 if (pg_start == pg_end) {
439 fill_zero(inode, pg_start, off_start,
440 off_end - off_start);
441 } else {
442 if (off_start)
443 fill_zero(inode, pg_start++, off_start,
444 PAGE_CACHE_SIZE - off_start);
445 if (off_end)
446 fill_zero(inode, pg_end, 0, off_end);
447
448 if (pg_start < pg_end) {
449 struct address_space *mapping = inode->i_mapping;
450 loff_t blk_start, blk_end;
451
452 blk_start = pg_start << PAGE_CACHE_SHIFT;
453 blk_end = pg_end << PAGE_CACHE_SHIFT;
454 truncate_inode_pages_range(mapping, blk_start,
455 blk_end - 1);
456 ret = truncate_hole(inode, pg_start, pg_end);
457 }
458 }
459
460 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
461 i_size_read(inode) <= (offset + len)) {
462 i_size_write(inode, offset);
463 mark_inode_dirty(inode);
464 }
465
466 return ret;
467}
468
469static int expand_inode_data(struct inode *inode, loff_t offset,
470 loff_t len, int mode)
471{
472 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
473 pgoff_t index, pg_start, pg_end;
474 loff_t new_size = i_size_read(inode);
475 loff_t off_start, off_end;
476 int ret = 0;
477
478 ret = inode_newsize_ok(inode, (len + offset));
479 if (ret)
480 return ret;
481
482 pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
483 pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
484
485 off_start = offset & (PAGE_CACHE_SIZE - 1);
486 off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
487
488 for (index = pg_start; index <= pg_end; index++) {
489 struct dnode_of_data dn;
490
491 mutex_lock_op(sbi, DATA_NEW);
492
493 set_new_dnode(&dn, inode, NULL, NULL, 0);
494 ret = get_dnode_of_data(&dn, index, 0);
495 if (ret) {
496 mutex_unlock_op(sbi, DATA_NEW);
497 break;
498 }
499
500 if (dn.data_blkaddr == NULL_ADDR) {
501 ret = reserve_new_block(&dn);
502 if (ret) {
503 f2fs_put_dnode(&dn);
504 mutex_unlock_op(sbi, DATA_NEW);
505 break;
506 }
507 }
508 f2fs_put_dnode(&dn);
509
510 mutex_unlock_op(sbi, DATA_NEW);
511
512 if (pg_start == pg_end)
513 new_size = offset + len;
514 else if (index == pg_start && off_start)
515 new_size = (index + 1) << PAGE_CACHE_SHIFT;
516 else if (index == pg_end)
517 new_size = (index << PAGE_CACHE_SHIFT) + off_end;
518 else
519 new_size += PAGE_CACHE_SIZE;
520 }
521
522 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
523 i_size_read(inode) < new_size) {
524 i_size_write(inode, new_size);
525 mark_inode_dirty(inode);
526 }
527
528 return ret;
529}
530
531static long f2fs_fallocate(struct file *file, int mode,
532 loff_t offset, loff_t len)
533{
534 struct inode *inode = file->f_path.dentry->d_inode;
535 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
536 long ret;
537
538 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
539 return -EOPNOTSUPP;
540
541 if (mode & FALLOC_FL_PUNCH_HOLE)
542 ret = punch_hole(inode, offset, len, mode);
543 else
544 ret = expand_inode_data(inode, offset, len, mode);
545
546 f2fs_balance_fs(sbi);
547 return ret;
548}
549
550#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
551#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
552
553static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
554{
555 if (S_ISDIR(mode))
556 return flags;
557 else if (S_ISREG(mode))
558 return flags & F2FS_REG_FLMASK;
559 else
560 return flags & F2FS_OTHER_FLMASK;
561}
562
563long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
564{
565 struct inode *inode = filp->f_dentry->d_inode;
566 struct f2fs_inode_info *fi = F2FS_I(inode);
567 unsigned int flags;
568 int ret;
569
570 switch (cmd) {
571 case FS_IOC_GETFLAGS:
572 flags = fi->i_flags & FS_FL_USER_VISIBLE;
573 return put_user(flags, (int __user *) arg);
574 case FS_IOC_SETFLAGS:
575 {
576 unsigned int oldflags;
577
578 ret = mnt_want_write(filp->f_path.mnt);
579 if (ret)
580 return ret;
581
582 if (!inode_owner_or_capable(inode)) {
583 ret = -EACCES;
584 goto out;
585 }
586
587 if (get_user(flags, (int __user *) arg)) {
588 ret = -EFAULT;
589 goto out;
590 }
591
592 flags = f2fs_mask_flags(inode->i_mode, flags);
593
594 mutex_lock(&inode->i_mutex);
595
596 oldflags = fi->i_flags;
597
598 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
599 if (!capable(CAP_LINUX_IMMUTABLE)) {
600 mutex_unlock(&inode->i_mutex);
601 ret = -EPERM;
602 goto out;
603 }
604 }
605
606 flags = flags & FS_FL_USER_MODIFIABLE;
607 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
608 fi->i_flags = flags;
609 mutex_unlock(&inode->i_mutex);
610
611 f2fs_set_inode_flags(inode);
612 inode->i_ctime = CURRENT_TIME;
613 mark_inode_dirty(inode);
614out:
615 mnt_drop_write(filp->f_path.mnt);
616 return ret;
617 }
618 default:
619 return -ENOTTY;
620 }
621}
622
/*
 * File operations table. fallocate and the ioctl entry point are
 * implemented in this file; read, write, open, llseek and splice use the
 * generic VFS helpers named below.
 */
623const struct file_operations f2fs_file_operations = {
624 .llseek = generic_file_llseek,
625 .read = do_sync_read,
626 .write = do_sync_write,
627 .aio_read = generic_file_aio_read,
628 .aio_write = generic_file_aio_write,
629 .open = generic_file_open,
630 .mmap = f2fs_file_mmap,
631 .fsync = f2fs_sync_file,
632 .fallocate = f2fs_fallocate,
633 .unlocked_ioctl = f2fs_ioctl,
634 .splice_read = generic_file_splice_read,
635 .splice_write = generic_file_splice_write,
636};
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
new file mode 100644
index 000000000000..644aa3808273
--- /dev/null
+++ b/fs/f2fs/gc.c
@@ -0,0 +1,742 @@
1/*
2 * fs/f2fs/gc.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/module.h>
13#include <linux/backing-dev.h>
14#include <linux/proc_fs.h>
15#include <linux/init.h>
16#include <linux/f2fs_fs.h>
17#include <linux/kthread.h>
18#include <linux/delay.h>
19#include <linux/freezer.h>
20#include <linux/blkdev.h>
21
22#include "f2fs.h"
23#include "node.h"
24#include "segment.h"
25#include "gc.h"
26
27static struct kmem_cache *winode_slab;
28
29static int gc_thread_func(void *data)
30{
31 struct f2fs_sb_info *sbi = data;
32 wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
33 long wait_ms;
34
35 wait_ms = GC_THREAD_MIN_SLEEP_TIME;
36
37 do {
38 if (try_to_freeze())
39 continue;
40 else
41 wait_event_interruptible_timeout(*wq,
42 kthread_should_stop(),
43 msecs_to_jiffies(wait_ms));
44 if (kthread_should_stop())
45 break;
46
47 f2fs_balance_fs(sbi);
48
49 if (!test_opt(sbi, BG_GC))
50 continue;
51
52 /*
53 * [GC triggering condition]
54 * 0. GC is not conducted currently.
55 * 1. There are enough dirty segments.
56 * 2. IO subsystem is idle by checking the # of writeback pages.
57 * 3. IO subsystem is idle by checking the # of requests in
58 * bdev's request list.
59 *
60 * Note) We have to avoid triggering GCs too much frequently.
61 * Because it is possible that some segments can be
62 * invalidated soon after by user update or deletion.
63 * So, I'd like to wait some time to collect dirty segments.
64 */
65 if (!mutex_trylock(&sbi->gc_mutex))
66 continue;
67
68 if (!is_idle(sbi)) {
69 wait_ms = increase_sleep_time(wait_ms);
70 mutex_unlock(&sbi->gc_mutex);
71 continue;
72 }
73
74 if (has_enough_invalid_blocks(sbi))
75 wait_ms = decrease_sleep_time(wait_ms);
76 else
77 wait_ms = increase_sleep_time(wait_ms);
78
79 sbi->bg_gc++;
80
81 if (f2fs_gc(sbi, 1) == GC_NONE)
82 wait_ms = GC_THREAD_NOGC_SLEEP_TIME;
83 else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME)
84 wait_ms = GC_THREAD_MAX_SLEEP_TIME;
85
86 } while (!kthread_should_stop());
87 return 0;
88}
89
90int start_gc_thread(struct f2fs_sb_info *sbi)
91{
92 struct f2fs_gc_kthread *gc_th;
93
94 gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
95 if (!gc_th)
96 return -ENOMEM;
97
98 sbi->gc_thread = gc_th;
99 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
100 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
101 GC_THREAD_NAME);
102 if (IS_ERR(gc_th->f2fs_gc_task)) {
103 kfree(gc_th);
104 return -ENOMEM;
105 }
106 return 0;
107}
108
109void stop_gc_thread(struct f2fs_sb_info *sbi)
110{
111 struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
112 if (!gc_th)
113 return;
114 kthread_stop(gc_th->f2fs_gc_task);
115 kfree(gc_th);
116 sbi->gc_thread = NULL;
117}
118
119static int select_gc_type(int gc_type)
120{
121 return (gc_type == BG_GC) ? GC_CB : GC_GREEDY;
122}
123
124static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
125 int type, struct victim_sel_policy *p)
126{
127 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
128
129 if (p->alloc_mode) {
130 p->gc_mode = GC_GREEDY;
131 p->dirty_segmap = dirty_i->dirty_segmap[type];
132 p->ofs_unit = 1;
133 } else {
134 p->gc_mode = select_gc_type(gc_type);
135 p->dirty_segmap = dirty_i->dirty_segmap[DIRTY];
136 p->ofs_unit = sbi->segs_per_sec;
137 }
138 p->offset = sbi->last_victim[p->gc_mode];
139}
140
141static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
142 struct victim_sel_policy *p)
143{
144 if (p->gc_mode == GC_GREEDY)
145 return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
146 else if (p->gc_mode == GC_CB)
147 return UINT_MAX;
148 else /* No other gc_mode */
149 return 0;
150}
151
152static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
153{
154 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
155 unsigned int segno;
156
157 /*
158 * If the gc_type is FG_GC, we can select victim segments
159 * selected by background GC before.
160 * Those segments guarantee they have small valid blocks.
161 */
162 segno = find_next_bit(dirty_i->victim_segmap[BG_GC],
163 TOTAL_SEGS(sbi), 0);
164 if (segno < TOTAL_SEGS(sbi)) {
165 clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
166 return segno;
167 }
168 return NULL_SEGNO;
169}
170
171static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
172{
173 struct sit_info *sit_i = SIT_I(sbi);
174 unsigned int secno = GET_SECNO(sbi, segno);
175 unsigned int start = secno * sbi->segs_per_sec;
176 unsigned long long mtime = 0;
177 unsigned int vblocks;
178 unsigned char age = 0;
179 unsigned char u;
180 unsigned int i;
181
182 for (i = 0; i < sbi->segs_per_sec; i++)
183 mtime += get_seg_entry(sbi, start + i)->mtime;
184 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
185
186 mtime = div_u64(mtime, sbi->segs_per_sec);
187 vblocks = div_u64(vblocks, sbi->segs_per_sec);
188
189 u = (vblocks * 100) >> sbi->log_blocks_per_seg;
190
191 /* Handle if the system time is changed by user */
192 if (mtime < sit_i->min_mtime)
193 sit_i->min_mtime = mtime;
194 if (mtime > sit_i->max_mtime)
195 sit_i->max_mtime = mtime;
196 if (sit_i->max_mtime != sit_i->min_mtime)
197 age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
198 sit_i->max_mtime - sit_i->min_mtime);
199
200 return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
201}
202
203static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
204 struct victim_sel_policy *p)
205{
206 if (p->alloc_mode == SSR)
207 return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
208
209 /* alloc_mode == LFS */
210 if (p->gc_mode == GC_GREEDY)
211 return get_valid_blocks(sbi, segno, sbi->segs_per_sec);
212 else
213 return get_cb_cost(sbi, segno);
214}
215
216/*
217 * This function is called from two paths.
218 * One is garbage collection and the other is SSR segment selection.
219 * When it is called during GC, it just gets a victim segment
220 * and it does not remove it from dirty seglist.
221 * When it is called from SSR segment selection, it finds a segment
222 * which has minimum valid blocks and removes it from dirty seglist.
 *
 * On success the first segment of the chosen unit is stored in *result and
 * 1 is returned; 0 is returned when no victim was found.
223 */
224static int get_victim_by_default(struct f2fs_sb_info *sbi,
225 unsigned int *result, int gc_type, int type, char alloc_mode)
226{
227 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
228 struct victim_sel_policy p;
229 unsigned int segno;
230 int nsearched = 0;
231
232 p.alloc_mode = alloc_mode;
233 select_policy(sbi, gc_type, type, &p);
234
/* Start with "nothing found" and the worst possible cost. */
235 p.min_segno = NULL_SEGNO;
236 p.min_cost = get_max_cost(sbi, &p);
237
238 mutex_lock(&dirty_i->seglist_lock);
239
/* Foreground GC first reuses a victim already picked by background GC. */
240 if (p.alloc_mode == LFS && gc_type == FG_GC) {
241 p.min_segno = check_bg_victims(sbi);
242 if (p.min_segno != NULL_SEGNO)
243 goto got_it;
244 }
245
246 while (1) {
247 unsigned long cost;
248
249 segno = find_next_bit(p.dirty_segmap,
250 TOTAL_SEGS(sbi), p.offset);
251 if (segno >= TOTAL_SEGS(sbi)) {
/* Hit the end: wrap around once if the scan did not start at 0. */
252 if (sbi->last_victim[p.gc_mode]) {
253 sbi->last_victim[p.gc_mode] = 0;
254 p.offset = 0;
255 continue;
256 }
257 break;
258 }
/* The next search starts at the following ofs_unit-aligned unit. */
259 p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
260
/* Skip segments that are already victims or in a current section. */
261 if (test_bit(segno, dirty_i->victim_segmap[FG_GC]))
262 continue;
263 if (gc_type == BG_GC &&
264 test_bit(segno, dirty_i->victim_segmap[BG_GC]))
265 continue;
266 if (IS_CURSEC(sbi, GET_SECNO(sbi, segno)))
267 continue;
268
269 cost = get_gc_cost(sbi, segno, &p);
270
271 if (p.min_cost > cost) {
272 p.min_segno = segno;
273 p.min_cost = cost;
274 }
275
/* Candidates at maximum cost do not count against the search limit. */
276 if (cost == get_max_cost(sbi, &p))
277 continue;
278
/* Bound the search and remember where to resume next time. */
279 if (nsearched++ >= MAX_VICTIM_SEARCH) {
280 sbi->last_victim[p.gc_mode] = segno;
281 break;
282 }
283 }
284got_it:
285 if (p.min_segno != NULL_SEGNO) {
/* Report the first segment of the unit that contains the victim. */
286 *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
287 if (p.alloc_mode == LFS) {
288 int i;
/* Mark every segment of the unit as a victim of this GC type. */
289 for (i = 0; i < p.ofs_unit; i++)
290 set_bit(*result + i,
291 dirty_i->victim_segmap[gc_type]);
292 }
293 }
294 mutex_unlock(&dirty_i->seglist_lock);
295
296 return (p.min_segno == NULL_SEGNO) ? 0 : 1;
297}
298
/* Default victim selection operations, installed by build_gc_manager(). */
299static const struct victim_selection default_v_ops = {
300 .get_victim = get_victim_by_default,
301};
302
303static struct inode *find_gc_inode(nid_t ino, struct list_head *ilist)
304{
305 struct list_head *this;
306 struct inode_entry *ie;
307
308 list_for_each(this, ilist) {
309 ie = list_entry(this, struct inode_entry, list);
310 if (ie->inode->i_ino == ino)
311 return ie->inode;
312 }
313 return NULL;
314}
315
316static void add_gc_inode(struct inode *inode, struct list_head *ilist)
317{
318 struct list_head *this;
319 struct inode_entry *new_ie, *ie;
320
321 list_for_each(this, ilist) {
322 ie = list_entry(this, struct inode_entry, list);
323 if (ie->inode == inode) {
324 iput(inode);
325 return;
326 }
327 }
328repeat:
329 new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
330 if (!new_ie) {
331 cond_resched();
332 goto repeat;
333 }
334 new_ie->inode = inode;
335 list_add_tail(&new_ie->list, ilist);
336}
337
338static void put_gc_inode(struct list_head *ilist)
339{
340 struct inode_entry *ie, *next_ie;
341 list_for_each_entry_safe(ie, next_ie, ilist, list) {
342 iput(ie->inode);
343 list_del(&ie->list);
344 kmem_cache_free(winode_slab, ie);
345 }
346}
347
348static int check_valid_map(struct f2fs_sb_info *sbi,
349 unsigned int segno, int offset)
350{
351 struct sit_info *sit_i = SIT_I(sbi);
352 struct seg_entry *sentry;
353 int ret;
354
355 mutex_lock(&sit_i->sentry_lock);
356 sentry = get_seg_entry(sbi, segno);
357 ret = f2fs_test_bit(offset, sentry->cur_valid_map);
358 mutex_unlock(&sit_i->sentry_lock);
359 return ret ? GC_OK : GC_NEXT;
360}
361
362/*
363 * This function compares node address got in summary with that in NAT.
364 * On validity, copy that node with cold status, otherwise (invalid node)
365 * ignore that.
366 */
367static int gc_node_segment(struct f2fs_sb_info *sbi,
368 struct f2fs_summary *sum, unsigned int segno, int gc_type)
369{
370 bool initial = true;
371 struct f2fs_summary *entry;
372 int off;
373
374next_step:
375 entry = sum;
376 for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
377 nid_t nid = le32_to_cpu(entry->nid);
378 struct page *node_page;
379 int err;
380
381 /*
382 * It makes sure that free segments are able to write
383 * all the dirty node pages before CP after this CP.
384 * So let's check the space of dirty node pages.
385 */
386 if (should_do_checkpoint(sbi)) {
387 mutex_lock(&sbi->cp_mutex);
388 block_operations(sbi);
389 return GC_BLOCKED;
390 }
391
392 err = check_valid_map(sbi, segno, off);
393 if (err == GC_ERROR)
394 return err;
395 else if (err == GC_NEXT)
396 continue;
397
398 if (initial) {
399 ra_node_page(sbi, nid);
400 continue;
401 }
402 node_page = get_node_page(sbi, nid);
403 if (IS_ERR(node_page))
404 continue;
405
406 /* set page dirty and write it */
407 if (!PageWriteback(node_page))
408 set_page_dirty(node_page);
409 f2fs_put_page(node_page, 1);
410 stat_inc_node_blk_count(sbi, 1);
411 }
412 if (initial) {
413 initial = false;
414 goto next_step;
415 }
416
417 if (gc_type == FG_GC) {
418 struct writeback_control wbc = {
419 .sync_mode = WB_SYNC_ALL,
420 .nr_to_write = LONG_MAX,
421 .for_reclaim = 0,
422 };
423 sync_node_pages(sbi, 0, &wbc);
424 }
425 return GC_DONE;
426}
427
428/*
429 * Calculate start block index that this node page contains
430 */
431block_t start_bidx_of_node(unsigned int node_ofs)
432{
433 block_t start_bidx;
434 unsigned int bidx, indirect_blks;
435 int dec;
436
437 indirect_blks = 2 * NIDS_PER_BLOCK + 4;
438
439 start_bidx = 1;
440 if (node_ofs == 0) {
441 start_bidx = 0;
442 } else if (node_ofs <= 2) {
443 bidx = node_ofs - 1;
444 } else if (node_ofs <= indirect_blks) {
445 dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
446 bidx = node_ofs - 2 - dec;
447 } else {
448 dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
449 bidx = node_ofs - 5 - dec;
450 }
451
452 if (start_bidx)
453 start_bidx = bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE;
454 return start_bidx;
455}
456
457static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
458 struct node_info *dni, block_t blkaddr, unsigned int *nofs)
459{
460 struct page *node_page;
461 nid_t nid;
462 unsigned int ofs_in_node;
463 block_t source_blkaddr;
464
465 nid = le32_to_cpu(sum->nid);
466 ofs_in_node = le16_to_cpu(sum->ofs_in_node);
467
468 node_page = get_node_page(sbi, nid);
469 if (IS_ERR(node_page))
470 return GC_NEXT;
471
472 get_node_info(sbi, nid, dni);
473
474 if (sum->version != dni->version) {
475 f2fs_put_page(node_page, 1);
476 return GC_NEXT;
477 }
478
479 *nofs = ofs_of_node(node_page);
480 source_blkaddr = datablock_addr(node_page, ofs_in_node);
481 f2fs_put_page(node_page, 1);
482
483 if (source_blkaddr != blkaddr)
484 return GC_NEXT;
485 return GC_OK;
486}
487
488static void move_data_page(struct inode *inode, struct page *page, int gc_type)
489{
490 if (page->mapping != inode->i_mapping)
491 goto out;
492
493 if (inode != page->mapping->host)
494 goto out;
495
496 if (PageWriteback(page))
497 goto out;
498
499 if (gc_type == BG_GC) {
500 set_page_dirty(page);
501 set_cold_data(page);
502 } else {
503 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
504 mutex_lock_op(sbi, DATA_WRITE);
505 if (clear_page_dirty_for_io(page) &&
506 S_ISDIR(inode->i_mode)) {
507 dec_page_count(sbi, F2FS_DIRTY_DENTS);
508 inode_dec_dirty_dents(inode);
509 }
510 set_cold_data(page);
511 do_write_data_page(page);
512 mutex_unlock_op(sbi, DATA_WRITE);
513 clear_cold_data(page);
514 }
515out:
516 f2fs_put_page(page, 1);
517}
518
519/*
520 * This function tries to get parent node of victim data block, and identifies
521 * data block validity. If the block is valid, copy that with cold status and
522 * modify parent node.
523 * If the parent node is not valid or the data block address is different,
524 * the victim data block is ignored.
 *
 * The segment is scanned four times (phase 0..3):
 *   0 - readahead the node page named by each valid summary entry
 *   1 - validate the entry with check_dnode(), readahead the inode's node page
 *   2 - get the inode, look up the data page, add the inode to @ilist
 *   3 - lock the data page of each inode found on @ilist and move it
 *
 * Returns GC_DONE after all phases. Returns GC_BLOCKED, with cp_mutex held
 * and block_operations() already called, once should_do_checkpoint() reports
 * that a checkpoint is needed.
525 */
526static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
527 struct list_head *ilist, unsigned int segno, int gc_type)
528{
529 struct super_block *sb = sbi->sb;
530 struct f2fs_summary *entry;
531 block_t start_addr;
532 int err, off;
533 int phase = 0;
534
535 start_addr = START_BLOCK(sbi, segno);
536
537next_step:
538 entry = sum;
539 for (off = 0; off < sbi->blocks_per_seg; off++, entry++) {
540 struct page *data_page;
541 struct inode *inode;
542 struct node_info dni; /* dnode info for the data */
543 unsigned int ofs_in_node, nofs;
544 block_t start_bidx;
545
546 /*
547 * It makes sure that free segments are able to write
548 * all the dirty node pages before CP after this CP.
549 * So let's check the space of dirty node pages.
550 */
551 if (should_do_checkpoint(sbi)) {
552 mutex_lock(&sbi->cp_mutex);
553 block_operations(sbi);
554 err = GC_BLOCKED;
555 goto stop;
556 }
557
558 err = check_valid_map(sbi, segno, off);
559 if (err == GC_ERROR)
560 goto stop;
561 else if (err == GC_NEXT)
562 continue;
563
564 if (phase == 0) {
565 ra_node_page(sbi, le32_to_cpu(entry->nid));
566 continue;
567 }
568
569 /* Get an inode by ino with checking validity */
570 err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs);
571 if (err == GC_ERROR)
572 goto stop;
573 else if (err == GC_NEXT)
574 continue;
575
576 if (phase == 1) {
577 ra_node_page(sbi, dni.ino);
578 continue;
579 }
580
/* File-relative index of the block: node start index + offset in node. */
581 start_bidx = start_bidx_of_node(nofs);
582 ofs_in_node = le16_to_cpu(entry->ofs_in_node);
583
584 if (phase == 2) {
585 inode = f2fs_iget_nowait(sb, dni.ino);
586 if (IS_ERR(inode))
587 continue;
588
589 data_page = find_data_page(inode,
590 start_bidx + ofs_in_node);
591 if (IS_ERR(data_page))
592 goto next_iput;
593
594 f2fs_put_page(data_page, 0);
/* The inode reference is handed over to the list. */
595 add_gc_inode(inode, ilist);
596 } else {
597 inode = find_gc_inode(dni.ino, ilist);
598 if (inode) {
599 data_page = get_lock_data_page(inode,
600 start_bidx + ofs_in_node);
601 if (IS_ERR(data_page))
602 continue;
603 move_data_page(inode, data_page, gc_type);
604 stat_inc_data_blk_count(sbi, 1);
605 }
606 }
607 continue;
608next_iput:
609 iput(inode);
610 }
611 if (++phase < 4)
612 goto next_step;
613 err = GC_DONE;
614stop:
/* Foreground GC submits the pending DATA bio before returning. */
615 if (gc_type == FG_GC)
616 f2fs_submit_bio(sbi, DATA, true);
617 return err;
618}
619
620static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
621 int gc_type, int type)
622{
623 struct sit_info *sit_i = SIT_I(sbi);
624 int ret;
625 mutex_lock(&sit_i->sentry_lock);
626 ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type, type, LFS);
627 mutex_unlock(&sit_i->sentry_lock);
628 return ret;
629}
630
631static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
632 struct list_head *ilist, int gc_type)
633{
634 struct page *sum_page;
635 struct f2fs_summary_block *sum;
636 int ret = GC_DONE;
637
638 /* read segment summary of victim */
639 sum_page = get_sum_page(sbi, segno);
640 if (IS_ERR(sum_page))
641 return GC_ERROR;
642
643 /*
644 * CP needs to lock sum_page. In this time, we don't need
645 * to lock this page, because this summary page is not gone anywhere.
646 * Also, this page is not gonna be updated before GC is done.
647 */
648 unlock_page(sum_page);
649 sum = page_address(sum_page);
650
651 switch (GET_SUM_TYPE((&sum->footer))) {
652 case SUM_TYPE_NODE:
653 ret = gc_node_segment(sbi, sum->entries, segno, gc_type);
654 break;
655 case SUM_TYPE_DATA:
656 ret = gc_data_segment(sbi, sum->entries, ilist, segno, gc_type);
657 break;
658 }
659 stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)));
660 stat_inc_call_count(sbi->stat_info);
661
662 f2fs_put_page(sum_page, 0);
663 return ret;
664}
665
/*
 * Run garbage collection until more than @nGC sections have been gained,
 * no victim can be found, or a segment could not be collected cleanly.
 *
 * The function releases sbi->gc_mutex before returning but never takes it,
 * so the caller must hold it on entry (gc_thread_func() trylocks it first).
 * GC starts as background GC and switches to foreground GC as soon as
 * has_not_enough_free_secs() reports a shortage.
 *
 * Returns the last GC status: GC_NONE if nothing was collected, otherwise
 * the result of the last do_garbage_collect() call.
 */
666int f2fs_gc(struct f2fs_sb_info *sbi, int nGC)
667{
668 unsigned int segno;
669 int old_free_secs, cur_free_secs;
670 int gc_status, nfree;
671 struct list_head ilist;
672 int gc_type = BG_GC;
673
674 INIT_LIST_HEAD(&ilist);
675gc_more:
676 nfree = 0;
677 gc_status = GC_NONE;
678
/* Baseline against which the progress of this round is measured. */
679 if (has_not_enough_free_secs(sbi))
680 old_free_secs = reserved_sections(sbi);
681 else
682 old_free_secs = free_sections(sbi);
683
684 while (sbi->sb->s_flags & MS_ACTIVE) {
685 int i;
686 if (has_not_enough_free_secs(sbi))
687 gc_type = FG_GC;
688
/* nfree counts the segments collected so far in this round. */
689 cur_free_secs = free_sections(sbi) + nfree;
690
691 /* We got free space successfully. */
692 if (nGC < cur_free_secs - old_free_secs)
693 break;
694
695 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
696 break;
697
698 for (i = 0; i < sbi->segs_per_sec; i++) {
699 /*
700 * do_garbage_collect will give us three gc_status:
701 * GC_ERROR, GC_DONE, and GC_BLOCKED.
702 * If GC is finished uncleanly, we have to return
703 * the victim to dirty segment list.
704 */
705 gc_status = do_garbage_collect(sbi, segno + i,
706 &ilist, gc_type);
707 if (gc_status != GC_DONE)
708 goto stop;
709 nfree++;
710 }
711 }
712stop:
/*
 * Write a checkpoint when space is still short or GC was blocked; the
 * second argument tells write_checkpoint() whether GC_BLOCKED was hit.
 * Retry if this round collected at least one segment.
 */
713 if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) {
714 write_checkpoint(sbi, (gc_status == GC_BLOCKED), false);
715 if (nfree)
716 goto gc_more;
717 }
718 mutex_unlock(&sbi->gc_mutex);
719
/* Drop the inode references collected by gc_data_segment(). */
720 put_gc_inode(&ilist);
721 BUG_ON(!list_empty(&ilist));
722 return gc_status;
723}
724
/* Install the default victim selection operations for @sbi. */
725void build_gc_manager(struct f2fs_sb_info *sbi)
726{
727 DIRTY_I(sbi)->v_ops = &default_v_ops;
728}
729
730int create_gc_caches(void)
731{
732 winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
733 sizeof(struct inode_entry), NULL);
734 if (!winode_slab)
735 return -ENOMEM;
736 return 0;
737}
738
/* Destroy the slab cache created by create_gc_caches(). */
739void destroy_gc_caches(void)
740{
741 kmem_cache_destroy(winode_slab);
742}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
new file mode 100644
index 000000000000..b026d9354ccd
--- /dev/null
+++ b/fs/f2fs/gc.h
@@ -0,0 +1,117 @@
1/*
2 * fs/f2fs/gc.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
/* Name given to the background GC kthread. */
11#define GC_THREAD_NAME "f2fs_gc_task"
12#define GC_THREAD_MIN_WB_PAGES 1 /*
13 * a threshold to determine
14 * whether IO subsystem is idle
15 * or not
16 */
/*
 * Sleep times of the GC thread, all in milliseconds. The minimum is also
 * the step by which the sleep time is increased or decreased.
 */
17#define GC_THREAD_MIN_SLEEP_TIME 10000 /* milliseconds */
18#define GC_THREAD_MAX_SLEEP_TIME 30000
/* Sleep time used after a GC pass that collected nothing (GC_NONE). */
19#define GC_THREAD_NOGC_SLEEP_TIME 10000
20#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
21#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
22
23/* Search max. number of dirty segments to select a victim segment */
24#define MAX_VICTIM_SEARCH 20
25
/* Status codes passed around inside the garbage collector. */
26enum {
27 GC_NONE = 0, /* initial status of a GC round: nothing collected yet */
28 GC_ERROR, /* the summary page of the victim could not be read */
29 GC_OK, /* the examined block is valid and must be handled */
30 GC_NEXT, /* the examined block can be skipped */
31 GC_BLOCKED, /* GC stopped because a checkpoint is required */
32 GC_DONE, /* the segment was processed completely */
33};
34
/* Per-filesystem state of the background GC thread. */
35struct f2fs_gc_kthread {
36 struct task_struct *f2fs_gc_task; /* task returned by kthread_run() */
37 wait_queue_head_t gc_wait_queue_head; /* the GC thread sleeps here */
38};
39
/* Entry of the list of inodes whose data pages are being moved by GC. */
40struct inode_entry {
41 struct list_head list; /* link in the GC inode list */
42 struct inode *inode; /* inode pinned until the list is emptied */
43};
44
45/*
46 * inline functions
47 */
48static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
49{
50 if (free_segments(sbi) < overprovision_segments(sbi))
51 return 0;
52 else
53 return (free_segments(sbi) - overprovision_segments(sbi))
54 << sbi->log_blocks_per_seg;
55}
56
57static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi)
58{
59 return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100;
60}
61
62static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
63{
64 block_t reclaimable_user_blocks = sbi->user_block_count -
65 written_block_count(sbi);
66 return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
67}
68
69static inline long increase_sleep_time(long wait)
70{
71 wait += GC_THREAD_MIN_SLEEP_TIME;
72 if (wait > GC_THREAD_MAX_SLEEP_TIME)
73 wait = GC_THREAD_MAX_SLEEP_TIME;
74 return wait;
75}
76
77static inline long decrease_sleep_time(long wait)
78{
79 wait -= GC_THREAD_MIN_SLEEP_TIME;
80 if (wait <= GC_THREAD_MIN_SLEEP_TIME)
81 wait = GC_THREAD_MIN_SLEEP_TIME;
82 return wait;
83}
84
85static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
86{
87 block_t invalid_user_blocks = sbi->user_block_count -
88 written_block_count(sbi);
89 /*
90 * Background GC is triggered with the following condition.
91 * 1. There are a number of invalid blocks.
92 * 2. There is not enough free space.
93 */
94 if (invalid_user_blocks > limit_invalid_user_blocks(sbi) &&
95 free_user_blocks(sbi) < limit_free_user_blocks(sbi))
96 return true;
97 return false;
98}
99
100static inline int is_idle(struct f2fs_sb_info *sbi)
101{
102 struct block_device *bdev = sbi->sb->s_bdev;
103 struct request_queue *q = bdev_get_queue(bdev);
104 struct request_list *rl = &q->root_rl;
105 return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
106}
107
/*
 * Decide whether GC has to stop for a checkpoint.
 *
 * The dirty node pages and dirty dentry pages are each converted into a
 * section count, rounded up by adding pages_per_sec - 1 before the shift
 * and division. A checkpoint is due once the free sections no longer
 * exceed node_secs + 2 * dent_secs + 2.
 */
108static inline bool should_do_checkpoint(struct f2fs_sb_info *sbi)
109{
110 unsigned int pages_per_sec = sbi->segs_per_sec *
111 (1 << sbi->log_blocks_per_seg);
112 int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
113 >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
114 int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
115 >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
116 return free_sections(sbi) <= (node_secs + 2 * dent_secs + 2);
117}
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
new file mode 100644
index 000000000000..a60f04200f8b
--- /dev/null
+++ b/fs/f2fs/hash.c
@@ -0,0 +1,97 @@
1/*
2 * fs/f2fs/hash.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext3/hash.c
8 *
9 * Copyright (C) 2002 by Theodore Ts'o
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 2 as
13 * published by the Free Software Foundation.
14 */
15#include <linux/types.h>
16#include <linux/fs.h>
17#include <linux/f2fs_fs.h>
18#include <linux/cryptohash.h>
19#include <linux/pagemap.h>
20
21#include "f2fs.h"
22
/*
 * Hashing code copied from ext3
 */
#define DELTA 0x9E3779B9

/*
 * Mix the four key words in[0..3] into the hash state with 16 TEA rounds.
 * Only buf[0] and buf[1] are read and updated.
 */
static void TEA_transform(unsigned int buf[4], unsigned int const in[])
{
	unsigned int k0 = in[0], k1 = in[1], k2 = in[2], k3 = in[3];
	unsigned int x = buf[0], y = buf[1];
	unsigned int sum = 0;
	int round;

	for (round = 0; round < 16; round++) {
		sum += DELTA;
		x += ((y << 4) + k0) ^ (y + sum) ^ ((y >> 5) + k1);
		y += ((x << 4) + k2) ^ (x + sum) ^ ((x >> 5) + k3);
	}

	buf[0] += x;
	buf[1] += y;
}
44
/*
 * Pack up to @num * 4 bytes of @msg into @num 32-bit words in @buf.
 *
 * Bytes are shifted into each word one at a time, most significant first,
 * on top of a padding word built from @len replicated into all four bytes.
 * A trailing partial word keeps the padding in its upper bytes, and words
 * past the end of the message are filled with the padding word.
 */
static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num)
{
	unsigned int pad, val;
	int i, out = 0;

	pad = (unsigned int)len | ((unsigned int)len << 8);
	pad |= pad << 16;

	val = pad;
	if (len > num * 4)
		len = num * 4;

	for (i = 0; i < len; i++) {
		if ((i & 3) == 0)
			val = pad;
		val = msg[i] + (val << 8);
		if ((i & 3) == 3) {
			buf[out++] = val;
			val = pad;
			num--;
		}
	}

	/* Flush the partial (or padding) word, then pad out the rest. */
	num--;
	if (num >= 0)
		buf[out++] = val;
	for (num--; num >= 0; num--)
		buf[out++] = pad;
}
71
72f2fs_hash_t f2fs_dentry_hash(const char *name, int len)
73{
74 __u32 hash, minor_hash;
75 f2fs_hash_t f2fs_hash;
76 const char *p;
77 __u32 in[8], buf[4];
78
79 /* Initialize the default seed for the hash checksum functions */
80 buf[0] = 0x67452301;
81 buf[1] = 0xefcdab89;
82 buf[2] = 0x98badcfe;
83 buf[3] = 0x10325476;
84
85 p = name;
86 while (len > 0) {
87 str2hashbuf(p, len, in, 4);
88 TEA_transform(buf, in);
89 len -= 16;
90 p += 16;
91 }
92 hash = buf[0];
93 minor_hash = buf[1];
94
95 f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT);
96 return f2fs_hash;
97}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
new file mode 100644
index 000000000000..df5fb381ebf1
--- /dev/null
+++ b/fs/f2fs/inode.c
@@ -0,0 +1,268 @@
1/*
2 * fs/f2fs/inode.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/buffer_head.h>
14#include <linux/writeback.h>
15
16#include "f2fs.h"
17#include "node.h"
18
19struct f2fs_iget_args {
20 u64 ino;
21 int on_free;
22};
23
24void f2fs_set_inode_flags(struct inode *inode)
25{
26 unsigned int flags = F2FS_I(inode)->i_flags;
27
28 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE |
29 S_NOATIME | S_DIRSYNC);
30
31 if (flags & FS_SYNC_FL)
32 inode->i_flags |= S_SYNC;
33 if (flags & FS_APPEND_FL)
34 inode->i_flags |= S_APPEND;
35 if (flags & FS_IMMUTABLE_FL)
36 inode->i_flags |= S_IMMUTABLE;
37 if (flags & FS_NOATIME_FL)
38 inode->i_flags |= S_NOATIME;
39 if (flags & FS_DIRSYNC_FL)
40 inode->i_flags |= S_DIRSYNC;
41}
42
43static int f2fs_iget_test(struct inode *inode, void *data)
44{
45 struct f2fs_iget_args *args = data;
46
47 if (inode->i_ino != args->ino)
48 return 0;
49 if (inode->i_state & (I_FREEING | I_WILL_FREE)) {
50 args->on_free = 1;
51 return 0;
52 }
53 return 1;
54}
55
56struct inode *f2fs_iget_nowait(struct super_block *sb, unsigned long ino)
57{
58 struct f2fs_iget_args args = {
59 .ino = ino,
60 .on_free = 0
61 };
62 struct inode *inode = ilookup5(sb, ino, f2fs_iget_test, &args);
63
64 if (inode)
65 return inode;
66 if (!args.on_free)
67 return f2fs_iget(sb, ino);
68 return ERR_PTR(-ENOENT);
69}
70
71static int do_read_inode(struct inode *inode)
72{
73 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
74 struct f2fs_inode_info *fi = F2FS_I(inode);
75 struct page *node_page;
76 struct f2fs_node *rn;
77 struct f2fs_inode *ri;
78
79 /* Check if ino is within scope */
80 check_nid_range(sbi, inode->i_ino);
81
82 node_page = get_node_page(sbi, inode->i_ino);
83 if (IS_ERR(node_page))
84 return PTR_ERR(node_page);
85
86 rn = page_address(node_page);
87 ri = &(rn->i);
88
89 inode->i_mode = le16_to_cpu(ri->i_mode);
90 i_uid_write(inode, le32_to_cpu(ri->i_uid));
91 i_gid_write(inode, le32_to_cpu(ri->i_gid));
92 set_nlink(inode, le32_to_cpu(ri->i_links));
93 inode->i_size = le64_to_cpu(ri->i_size);
94 inode->i_blocks = le64_to_cpu(ri->i_blocks);
95
96 inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime);
97 inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime);
98 inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime);
99 inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
100 inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
101 inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
102 inode->i_generation = le32_to_cpu(ri->i_generation);
103
104 fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
105 fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
106 fi->i_flags = le32_to_cpu(ri->i_flags);
107 fi->flags = 0;
108 fi->data_version = le64_to_cpu(F2FS_CKPT(sbi)->checkpoint_ver) - 1;
109 fi->i_advise = ri->i_advise;
110 fi->i_pino = le32_to_cpu(ri->i_pino);
111 get_extent_info(&fi->ext, ri->i_ext);
112 f2fs_put_page(node_page, 1);
113 return 0;
114}
115
116struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
117{
118 struct f2fs_sb_info *sbi = F2FS_SB(sb);
119 struct inode *inode;
120 int ret;
121
122 inode = iget_locked(sb, ino);
123 if (!inode)
124 return ERR_PTR(-ENOMEM);
125 if (!(inode->i_state & I_NEW))
126 return inode;
127 if (ino == F2FS_NODE_INO(sbi) || ino == F2FS_META_INO(sbi))
128 goto make_now;
129
130 ret = do_read_inode(inode);
131 if (ret)
132 goto bad_inode;
133
134 if (!sbi->por_doing && inode->i_nlink == 0) {
135 ret = -ENOENT;
136 goto bad_inode;
137 }
138
139make_now:
140 if (ino == F2FS_NODE_INO(sbi)) {
141 inode->i_mapping->a_ops = &f2fs_node_aops;
142 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
143 } else if (ino == F2FS_META_INO(sbi)) {
144 inode->i_mapping->a_ops = &f2fs_meta_aops;
145 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
146 } else if (S_ISREG(inode->i_mode)) {
147 inode->i_op = &f2fs_file_inode_operations;
148 inode->i_fop = &f2fs_file_operations;
149 inode->i_mapping->a_ops = &f2fs_dblock_aops;
150 } else if (S_ISDIR(inode->i_mode)) {
151 inode->i_op = &f2fs_dir_inode_operations;
152 inode->i_fop = &f2fs_dir_operations;
153 inode->i_mapping->a_ops = &f2fs_dblock_aops;
154 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER_MOVABLE |
155 __GFP_ZERO);
156 } else if (S_ISLNK(inode->i_mode)) {
157 inode->i_op = &f2fs_symlink_inode_operations;
158 inode->i_mapping->a_ops = &f2fs_dblock_aops;
159 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
160 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
161 inode->i_op = &f2fs_special_inode_operations;
162 init_special_inode(inode, inode->i_mode, inode->i_rdev);
163 } else {
164 ret = -EIO;
165 goto bad_inode;
166 }
167 unlock_new_inode(inode);
168
169 return inode;
170
171bad_inode:
172 iget_failed(inode);
173 return ERR_PTR(ret);
174}
175
176void update_inode(struct inode *inode, struct page *node_page)
177{
178 struct f2fs_node *rn;
179 struct f2fs_inode *ri;
180
181 wait_on_page_writeback(node_page);
182
183 rn = page_address(node_page);
184 ri = &(rn->i);
185
186 ri->i_mode = cpu_to_le16(inode->i_mode);
187 ri->i_advise = F2FS_I(inode)->i_advise;
188 ri->i_uid = cpu_to_le32(i_uid_read(inode));
189 ri->i_gid = cpu_to_le32(i_gid_read(inode));
190 ri->i_links = cpu_to_le32(inode->i_nlink);
191 ri->i_size = cpu_to_le64(i_size_read(inode));
192 ri->i_blocks = cpu_to_le64(inode->i_blocks);
193 set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext);
194
195 ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
196 ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
197 ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
198 ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
199 ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
200 ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
201 ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth);
202 ri->i_xattr_nid = cpu_to_le32(F2FS_I(inode)->i_xattr_nid);
203 ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
204 ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
205 ri->i_generation = cpu_to_le32(inode->i_generation);
206 set_page_dirty(node_page);
207}
208
209int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
210{
211 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
212 struct page *node_page;
213 bool need_lock = false;
214
215 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
216 inode->i_ino == F2FS_META_INO(sbi))
217 return 0;
218
219 node_page = get_node_page(sbi, inode->i_ino);
220 if (IS_ERR(node_page))
221 return PTR_ERR(node_page);
222
223 if (!PageDirty(node_page)) {
224 need_lock = true;
225 f2fs_put_page(node_page, 1);
226 mutex_lock(&sbi->write_inode);
227 node_page = get_node_page(sbi, inode->i_ino);
228 if (IS_ERR(node_page)) {
229 mutex_unlock(&sbi->write_inode);
230 return PTR_ERR(node_page);
231 }
232 }
233 update_inode(inode, node_page);
234 f2fs_put_page(node_page, 1);
235 if (need_lock)
236 mutex_unlock(&sbi->write_inode);
237 return 0;
238}
239
240/*
241 * Called at the last iput() if i_nlink is zero
242 */
243void f2fs_evict_inode(struct inode *inode)
244{
245 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
246
247 truncate_inode_pages(&inode->i_data, 0);
248
249 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
250 inode->i_ino == F2FS_META_INO(sbi))
251 goto no_delete;
252
253 BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents));
254 remove_dirty_dir_inode(inode);
255
256 if (inode->i_nlink || is_bad_inode(inode))
257 goto no_delete;
258
259 set_inode_flag(F2FS_I(inode), FI_NO_ALLOC);
260 i_size_write(inode, 0);
261
262 if (F2FS_HAS_BLOCKS(inode))
263 f2fs_truncate(inode);
264
265 remove_inode_page(inode);
266no_delete:
267 clear_inode(inode);
268}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
new file mode 100644
index 000000000000..89b7675dc377
--- /dev/null
+++ b/fs/f2fs/namei.c
@@ -0,0 +1,503 @@
1/*
2 * fs/f2fs/namei.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/pagemap.h>
14#include <linux/sched.h>
15#include <linux/ctype.h>
16
17#include "f2fs.h"
18#include "xattr.h"
19#include "acl.h"
20
21static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
22{
23 struct super_block *sb = dir->i_sb;
24 struct f2fs_sb_info *sbi = F2FS_SB(sb);
25 nid_t ino;
26 struct inode *inode;
27 bool nid_free = false;
28 int err;
29
30 inode = new_inode(sb);
31 if (!inode)
32 return ERR_PTR(-ENOMEM);
33
34 mutex_lock_op(sbi, NODE_NEW);
35 if (!alloc_nid(sbi, &ino)) {
36 mutex_unlock_op(sbi, NODE_NEW);
37 err = -ENOSPC;
38 goto fail;
39 }
40 mutex_unlock_op(sbi, NODE_NEW);
41
42 inode->i_uid = current_fsuid();
43
44 if (dir->i_mode & S_ISGID) {
45 inode->i_gid = dir->i_gid;
46 if (S_ISDIR(mode))
47 mode |= S_ISGID;
48 } else {
49 inode->i_gid = current_fsgid();
50 }
51
52 inode->i_ino = ino;
53 inode->i_mode = mode;
54 inode->i_blocks = 0;
55 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
56 inode->i_generation = sbi->s_next_generation++;
57
58 err = insert_inode_locked(inode);
59 if (err) {
60 err = -EINVAL;
61 nid_free = true;
62 goto out;
63 }
64
65 mark_inode_dirty(inode);
66 return inode;
67
68out:
69 clear_nlink(inode);
70 unlock_new_inode(inode);
71fail:
72 iput(inode);
73 if (nid_free)
74 alloc_nid_failed(sbi, ino);
75 return ERR_PTR(err);
76}
77
/*
 * Test whether the name @s ends with the extension @sub.
 *
 * The tail of @s is compared against @sub exactly as given and, if that
 * fails, against @sub with every character upper-cased.  Mixed-case
 * tails therefore do not match.
 *
 * Returns 0 on a match and non-zero otherwise (memcmp()-style), including
 * when @sub is longer than @s.
 *
 * The upper-case comparison is done byte by byte rather than through a
 * fixed-size scratch buffer, so an extension of any length is handled
 * without reading beyond initialised memory.
 */
static int is_multimedia_file(const unsigned char *s, const char *sub)
{
	size_t slen = strlen((const char *)s);
	size_t sublen = strlen(sub);
	const unsigned char *tail;
	size_t i;

	if (sublen > slen)
		return 1;

	tail = s + slen - sublen;
	if (!memcmp(tail, sub, sublen))
		return 0;

	/* compare upper case */
	for (i = 0; i < sublen; i++) {
		if (tail[i] != (unsigned char)toupper((unsigned char)sub[i]))
			return 1;
	}
	return 0;
}
98
99/*
100 * Set multimedia files as cold files for hot/cold data separation
101 */
102static inline void set_cold_file(struct f2fs_sb_info *sbi, struct inode *inode,
103 const unsigned char *name)
104{
105 int i;
106 __u8 (*extlist)[8] = sbi->raw_super->extension_list;
107
108 int count = le32_to_cpu(sbi->raw_super->extension_count);
109 for (i = 0; i < count; i++) {
110 if (!is_multimedia_file(name, extlist[i])) {
111 F2FS_I(inode)->i_advise |= FADVISE_COLD_BIT;
112 break;
113 }
114 }
115}
116
117static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
118 bool excl)
119{
120 struct super_block *sb = dir->i_sb;
121 struct f2fs_sb_info *sbi = F2FS_SB(sb);
122 struct inode *inode;
123 nid_t ino = 0;
124 int err;
125
126 inode = f2fs_new_inode(dir, mode);
127 if (IS_ERR(inode))
128 return PTR_ERR(inode);
129
130 if (!test_opt(sbi, DISABLE_EXT_IDENTIFY))
131 set_cold_file(sbi, inode, dentry->d_name.name);
132
133 inode->i_op = &f2fs_file_inode_operations;
134 inode->i_fop = &f2fs_file_operations;
135 inode->i_mapping->a_ops = &f2fs_dblock_aops;
136 ino = inode->i_ino;
137
138 err = f2fs_add_link(dentry, inode);
139 if (err)
140 goto out;
141
142 alloc_nid_done(sbi, ino);
143
144 if (!sbi->por_doing)
145 d_instantiate(dentry, inode);
146 unlock_new_inode(inode);
147
148 f2fs_balance_fs(sbi);
149 return 0;
150out:
151 clear_nlink(inode);
152 unlock_new_inode(inode);
153 iput(inode);
154 alloc_nid_failed(sbi, ino);
155 return err;
156}
157
158static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
159 struct dentry *dentry)
160{
161 struct inode *inode = old_dentry->d_inode;
162 struct super_block *sb = dir->i_sb;
163 struct f2fs_sb_info *sbi = F2FS_SB(sb);
164 int err;
165
166 inode->i_ctime = CURRENT_TIME;
167 atomic_inc(&inode->i_count);
168
169 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
170 err = f2fs_add_link(dentry, inode);
171 if (err)
172 goto out;
173
174 d_instantiate(dentry, inode);
175
176 f2fs_balance_fs(sbi);
177 return 0;
178out:
179 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
180 iput(inode);
181 return err;
182}
183
184struct dentry *f2fs_get_parent(struct dentry *child)
185{
186 struct qstr dotdot = QSTR_INIT("..", 2);
187 unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot);
188 if (!ino)
189 return ERR_PTR(-ENOENT);
190 return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino));
191}
192
193static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
194 unsigned int flags)
195{
196 struct inode *inode = NULL;
197 struct f2fs_dir_entry *de;
198 struct page *page;
199
200 if (dentry->d_name.len > F2FS_MAX_NAME_LEN)
201 return ERR_PTR(-ENAMETOOLONG);
202
203 de = f2fs_find_entry(dir, &dentry->d_name, &page);
204 if (de) {
205 nid_t ino = le32_to_cpu(de->ino);
206 kunmap(page);
207 f2fs_put_page(page, 0);
208
209 inode = f2fs_iget(dir->i_sb, ino);
210 if (IS_ERR(inode))
211 return ERR_CAST(inode);
212 }
213
214 return d_splice_alias(inode, dentry);
215}
216
217static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
218{
219 struct super_block *sb = dir->i_sb;
220 struct f2fs_sb_info *sbi = F2FS_SB(sb);
221 struct inode *inode = dentry->d_inode;
222 struct f2fs_dir_entry *de;
223 struct page *page;
224 int err = -ENOENT;
225
226 de = f2fs_find_entry(dir, &dentry->d_name, &page);
227 if (!de)
228 goto fail;
229
230 err = check_orphan_space(sbi);
231 if (err) {
232 kunmap(page);
233 f2fs_put_page(page, 0);
234 goto fail;
235 }
236
237 f2fs_delete_entry(de, page, inode);
238
239 /* In order to evict this inode, we set it dirty */
240 mark_inode_dirty(inode);
241 f2fs_balance_fs(sbi);
242fail:
243 return err;
244}
245
246static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
247 const char *symname)
248{
249 struct super_block *sb = dir->i_sb;
250 struct f2fs_sb_info *sbi = F2FS_SB(sb);
251 struct inode *inode;
252 unsigned symlen = strlen(symname) + 1;
253 int err;
254
255 inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
256 if (IS_ERR(inode))
257 return PTR_ERR(inode);
258
259 inode->i_op = &f2fs_symlink_inode_operations;
260 inode->i_mapping->a_ops = &f2fs_dblock_aops;
261
262 err = f2fs_add_link(dentry, inode);
263 if (err)
264 goto out;
265
266 err = page_symlink(inode, symname, symlen);
267 alloc_nid_done(sbi, inode->i_ino);
268
269 d_instantiate(dentry, inode);
270 unlock_new_inode(inode);
271
272 f2fs_balance_fs(sbi);
273
274 return err;
275out:
276 clear_nlink(inode);
277 unlock_new_inode(inode);
278 iput(inode);
279 alloc_nid_failed(sbi, inode->i_ino);
280 return err;
281}
282
283static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
284{
285 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
286 struct inode *inode;
287 int err;
288
289 inode = f2fs_new_inode(dir, S_IFDIR | mode);
290 if (IS_ERR(inode))
291 return PTR_ERR(inode);
292
293 inode->i_op = &f2fs_dir_inode_operations;
294 inode->i_fop = &f2fs_dir_operations;
295 inode->i_mapping->a_ops = &f2fs_dblock_aops;
296 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
297
298 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
299 err = f2fs_add_link(dentry, inode);
300 if (err)
301 goto out_fail;
302
303 alloc_nid_done(sbi, inode->i_ino);
304
305 d_instantiate(dentry, inode);
306 unlock_new_inode(inode);
307
308 f2fs_balance_fs(sbi);
309 return 0;
310
311out_fail:
312 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
313 clear_nlink(inode);
314 unlock_new_inode(inode);
315 iput(inode);
316 alloc_nid_failed(sbi, inode->i_ino);
317 return err;
318}
319
320static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
321{
322 struct inode *inode = dentry->d_inode;
323 if (f2fs_empty_dir(inode))
324 return f2fs_unlink(dir, dentry);
325 return -ENOTEMPTY;
326}
327
328static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
329 umode_t mode, dev_t rdev)
330{
331 struct super_block *sb = dir->i_sb;
332 struct f2fs_sb_info *sbi = F2FS_SB(sb);
333 struct inode *inode;
334 int err = 0;
335
336 if (!new_valid_dev(rdev))
337 return -EINVAL;
338
339 inode = f2fs_new_inode(dir, mode);
340 if (IS_ERR(inode))
341 return PTR_ERR(inode);
342
343 init_special_inode(inode, inode->i_mode, rdev);
344 inode->i_op = &f2fs_special_inode_operations;
345
346 err = f2fs_add_link(dentry, inode);
347 if (err)
348 goto out;
349
350 alloc_nid_done(sbi, inode->i_ino);
351 d_instantiate(dentry, inode);
352 unlock_new_inode(inode);
353
354 f2fs_balance_fs(sbi);
355
356 return 0;
357out:
358 clear_nlink(inode);
359 unlock_new_inode(inode);
360 iput(inode);
361 alloc_nid_failed(sbi, inode->i_ino);
362 return err;
363}
364
365static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
366 struct inode *new_dir, struct dentry *new_dentry)
367{
368 struct super_block *sb = old_dir->i_sb;
369 struct f2fs_sb_info *sbi = F2FS_SB(sb);
370 struct inode *old_inode = old_dentry->d_inode;
371 struct inode *new_inode = new_dentry->d_inode;
372 struct page *old_dir_page;
373 struct page *old_page;
374 struct f2fs_dir_entry *old_dir_entry = NULL;
375 struct f2fs_dir_entry *old_entry;
376 struct f2fs_dir_entry *new_entry;
377 int err = -ENOENT;
378
379 old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
380 if (!old_entry)
381 goto out;
382
383 if (S_ISDIR(old_inode->i_mode)) {
384 err = -EIO;
385 old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
386 if (!old_dir_entry)
387 goto out_old;
388 }
389
390 mutex_lock_op(sbi, RENAME);
391
392 if (new_inode) {
393 struct page *new_page;
394
395 err = -ENOTEMPTY;
396 if (old_dir_entry && !f2fs_empty_dir(new_inode))
397 goto out_dir;
398
399 err = -ENOENT;
400 new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
401 &new_page);
402 if (!new_entry)
403 goto out_dir;
404
405 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
406
407 new_inode->i_ctime = CURRENT_TIME;
408 if (old_dir_entry)
409 drop_nlink(new_inode);
410 drop_nlink(new_inode);
411 if (!new_inode->i_nlink)
412 add_orphan_inode(sbi, new_inode->i_ino);
413 f2fs_write_inode(new_inode, NULL);
414 } else {
415 err = f2fs_add_link(new_dentry, old_inode);
416 if (err)
417 goto out_dir;
418
419 if (old_dir_entry) {
420 inc_nlink(new_dir);
421 f2fs_write_inode(new_dir, NULL);
422 }
423 }
424
425 old_inode->i_ctime = CURRENT_TIME;
426 set_inode_flag(F2FS_I(old_inode), FI_NEED_CP);
427 mark_inode_dirty(old_inode);
428
429 f2fs_delete_entry(old_entry, old_page, NULL);
430
431 if (old_dir_entry) {
432 if (old_dir != new_dir) {
433 f2fs_set_link(old_inode, old_dir_entry,
434 old_dir_page, new_dir);
435 } else {
436 kunmap(old_dir_page);
437 f2fs_put_page(old_dir_page, 0);
438 }
439 drop_nlink(old_dir);
440 f2fs_write_inode(old_dir, NULL);
441 }
442
443 mutex_unlock_op(sbi, RENAME);
444
445 f2fs_balance_fs(sbi);
446 return 0;
447
448out_dir:
449 if (old_dir_entry) {
450 kunmap(old_dir_page);
451 f2fs_put_page(old_dir_page, 0);
452 }
453 mutex_unlock_op(sbi, RENAME);
454out_old:
455 kunmap(old_page);
456 f2fs_put_page(old_page, 0);
457out:
458 return err;
459}
460
461const struct inode_operations f2fs_dir_inode_operations = {
462 .create = f2fs_create,
463 .lookup = f2fs_lookup,
464 .link = f2fs_link,
465 .unlink = f2fs_unlink,
466 .symlink = f2fs_symlink,
467 .mkdir = f2fs_mkdir,
468 .rmdir = f2fs_rmdir,
469 .mknod = f2fs_mknod,
470 .rename = f2fs_rename,
471 .setattr = f2fs_setattr,
472 .get_acl = f2fs_get_acl,
473#ifdef CONFIG_F2FS_FS_XATTR
474 .setxattr = generic_setxattr,
475 .getxattr = generic_getxattr,
476 .listxattr = f2fs_listxattr,
477 .removexattr = generic_removexattr,
478#endif
479};
480
481const struct inode_operations f2fs_symlink_inode_operations = {
482 .readlink = generic_readlink,
483 .follow_link = page_follow_link_light,
484 .put_link = page_put_link,
485 .setattr = f2fs_setattr,
486#ifdef CONFIG_F2FS_FS_XATTR
487 .setxattr = generic_setxattr,
488 .getxattr = generic_getxattr,
489 .listxattr = f2fs_listxattr,
490 .removexattr = generic_removexattr,
491#endif
492};
493
494const struct inode_operations f2fs_special_inode_operations = {
495 .setattr = f2fs_setattr,
496 .get_acl = f2fs_get_acl,
497#ifdef CONFIG_F2FS_FS_XATTR
498 .setxattr = generic_setxattr,
499 .getxattr = generic_getxattr,
500 .listxattr = f2fs_listxattr,
501 .removexattr = generic_removexattr,
502#endif
503};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
new file mode 100644
index 000000000000..19870361497e
--- /dev/null
+++ b/fs/f2fs/node.c
@@ -0,0 +1,1764 @@
1/*
2 * fs/f2fs/node.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/mpage.h>
14#include <linux/backing-dev.h>
15#include <linux/blkdev.h>
16#include <linux/pagevec.h>
17#include <linux/swap.h>
18
19#include "f2fs.h"
20#include "node.h"
21#include "segment.h"
22
23static struct kmem_cache *nat_entry_slab;
24static struct kmem_cache *free_nid_slab;
25
26static void clear_node_page_dirty(struct page *page)
27{
28 struct address_space *mapping = page->mapping;
29 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
30 unsigned int long flags;
31
32 if (PageDirty(page)) {
33 spin_lock_irqsave(&mapping->tree_lock, flags);
34 radix_tree_tag_clear(&mapping->page_tree,
35 page_index(page),
36 PAGECACHE_TAG_DIRTY);
37 spin_unlock_irqrestore(&mapping->tree_lock, flags);
38
39 clear_page_dirty_for_io(page);
40 dec_page_count(sbi, F2FS_DIRTY_NODES);
41 }
42 ClearPageUptodate(page);
43}
44
45static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
46{
47 pgoff_t index = current_nat_addr(sbi, nid);
48 return get_meta_page(sbi, index);
49}
50
51static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
52{
53 struct page *src_page;
54 struct page *dst_page;
55 pgoff_t src_off;
56 pgoff_t dst_off;
57 void *src_addr;
58 void *dst_addr;
59 struct f2fs_nm_info *nm_i = NM_I(sbi);
60
61 src_off = current_nat_addr(sbi, nid);
62 dst_off = next_nat_addr(sbi, src_off);
63
64 /* get current nat block page with lock */
65 src_page = get_meta_page(sbi, src_off);
66
67 /* Dirty src_page means that it is already the new target NAT page. */
68 if (PageDirty(src_page))
69 return src_page;
70
71 dst_page = grab_meta_page(sbi, dst_off);
72
73 src_addr = page_address(src_page);
74 dst_addr = page_address(dst_page);
75 memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
76 set_page_dirty(dst_page);
77 f2fs_put_page(src_page, 1);
78
79 set_to_next_nat(nm_i, nid);
80
81 return dst_page;
82}
83
84/*
85 * Readahead NAT pages
86 */
87static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
88{
89 struct address_space *mapping = sbi->meta_inode->i_mapping;
90 struct f2fs_nm_info *nm_i = NM_I(sbi);
91 struct page *page;
92 pgoff_t index;
93 int i;
94
95 for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
96 if (nid >= nm_i->max_nid)
97 nid = 0;
98 index = current_nat_addr(sbi, nid);
99
100 page = grab_cache_page(mapping, index);
101 if (!page)
102 continue;
103 if (f2fs_readpage(sbi, page, index, READ)) {
104 f2fs_put_page(page, 1);
105 continue;
106 }
107 page_cache_release(page);
108 }
109}
110
111static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
112{
113 return radix_tree_lookup(&nm_i->nat_root, n);
114}
115
116static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
117 nid_t start, unsigned int nr, struct nat_entry **ep)
118{
119 return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
120}
121
122static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
123{
124 list_del(&e->list);
125 radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
126 nm_i->nat_cnt--;
127 kmem_cache_free(nat_entry_slab, e);
128}
129
130int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
131{
132 struct f2fs_nm_info *nm_i = NM_I(sbi);
133 struct nat_entry *e;
134 int is_cp = 1;
135
136 read_lock(&nm_i->nat_tree_lock);
137 e = __lookup_nat_cache(nm_i, nid);
138 if (e && !e->checkpointed)
139 is_cp = 0;
140 read_unlock(&nm_i->nat_tree_lock);
141 return is_cp;
142}
143
144static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
145{
146 struct nat_entry *new;
147
148 new = kmem_cache_alloc(nat_entry_slab, GFP_ATOMIC);
149 if (!new)
150 return NULL;
151 if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
152 kmem_cache_free(nat_entry_slab, new);
153 return NULL;
154 }
155 memset(new, 0, sizeof(struct nat_entry));
156 nat_set_nid(new, nid);
157 list_add_tail(&new->list, &nm_i->nat_entries);
158 nm_i->nat_cnt++;
159 return new;
160}
161
162static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
163 struct f2fs_nat_entry *ne)
164{
165 struct nat_entry *e;
166retry:
167 write_lock(&nm_i->nat_tree_lock);
168 e = __lookup_nat_cache(nm_i, nid);
169 if (!e) {
170 e = grab_nat_entry(nm_i, nid);
171 if (!e) {
172 write_unlock(&nm_i->nat_tree_lock);
173 goto retry;
174 }
175 nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
176 nat_set_ino(e, le32_to_cpu(ne->ino));
177 nat_set_version(e, ne->version);
178 e->checkpointed = true;
179 }
180 write_unlock(&nm_i->nat_tree_lock);
181}
182
183static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
184 block_t new_blkaddr)
185{
186 struct f2fs_nm_info *nm_i = NM_I(sbi);
187 struct nat_entry *e;
188retry:
189 write_lock(&nm_i->nat_tree_lock);
190 e = __lookup_nat_cache(nm_i, ni->nid);
191 if (!e) {
192 e = grab_nat_entry(nm_i, ni->nid);
193 if (!e) {
194 write_unlock(&nm_i->nat_tree_lock);
195 goto retry;
196 }
197 e->ni = *ni;
198 e->checkpointed = true;
199 BUG_ON(ni->blk_addr == NEW_ADDR);
200 } else if (new_blkaddr == NEW_ADDR) {
201 /*
202 * when nid is reallocated,
203 * previous nat entry can be remained in nat cache.
204 * So, reinitialize it with new information.
205 */
206 e->ni = *ni;
207 BUG_ON(ni->blk_addr != NULL_ADDR);
208 }
209
210 if (new_blkaddr == NEW_ADDR)
211 e->checkpointed = false;
212
213 /* sanity check */
214 BUG_ON(nat_get_blkaddr(e) != ni->blk_addr);
215 BUG_ON(nat_get_blkaddr(e) == NULL_ADDR &&
216 new_blkaddr == NULL_ADDR);
217 BUG_ON(nat_get_blkaddr(e) == NEW_ADDR &&
218 new_blkaddr == NEW_ADDR);
219 BUG_ON(nat_get_blkaddr(e) != NEW_ADDR &&
220 nat_get_blkaddr(e) != NULL_ADDR &&
221 new_blkaddr == NEW_ADDR);
222
223 /* increament version no as node is removed */
224 if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
225 unsigned char version = nat_get_version(e);
226 nat_set_version(e, inc_node_version(version));
227 }
228
229 /* change address */
230 nat_set_blkaddr(e, new_blkaddr);
231 __set_nat_cache_dirty(nm_i, e);
232 write_unlock(&nm_i->nat_tree_lock);
233}
234
235static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
236{
237 struct f2fs_nm_info *nm_i = NM_I(sbi);
238
239 if (nm_i->nat_cnt < 2 * NM_WOUT_THRESHOLD)
240 return 0;
241
242 write_lock(&nm_i->nat_tree_lock);
243 while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
244 struct nat_entry *ne;
245 ne = list_first_entry(&nm_i->nat_entries,
246 struct nat_entry, list);
247 __del_from_nat_cache(nm_i, ne);
248 nr_shrink--;
249 }
250 write_unlock(&nm_i->nat_tree_lock);
251 return nr_shrink;
252}
253
254/*
255 * This function returns always success
256 */
257void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
258{
259 struct f2fs_nm_info *nm_i = NM_I(sbi);
260 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
261 struct f2fs_summary_block *sum = curseg->sum_blk;
262 nid_t start_nid = START_NID(nid);
263 struct f2fs_nat_block *nat_blk;
264 struct page *page = NULL;
265 struct f2fs_nat_entry ne;
266 struct nat_entry *e;
267 int i;
268
269 memset(&ne, 0, sizeof(struct f2fs_nat_entry));
270 ni->nid = nid;
271
272 /* Check nat cache */
273 read_lock(&nm_i->nat_tree_lock);
274 e = __lookup_nat_cache(nm_i, nid);
275 if (e) {
276 ni->ino = nat_get_ino(e);
277 ni->blk_addr = nat_get_blkaddr(e);
278 ni->version = nat_get_version(e);
279 }
280 read_unlock(&nm_i->nat_tree_lock);
281 if (e)
282 return;
283
284 /* Check current segment summary */
285 mutex_lock(&curseg->curseg_mutex);
286 i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
287 if (i >= 0) {
288 ne = nat_in_journal(sum, i);
289 node_info_from_raw_nat(ni, &ne);
290 }
291 mutex_unlock(&curseg->curseg_mutex);
292 if (i >= 0)
293 goto cache;
294
295 /* Fill node_info from nat page */
296 page = get_current_nat_page(sbi, start_nid);
297 nat_blk = (struct f2fs_nat_block *)page_address(page);
298 ne = nat_blk->entries[nid - start_nid];
299 node_info_from_raw_nat(ni, &ne);
300 f2fs_put_page(page, 1);
301cache:
302 /* cache nat entry */
303 cache_nat_entry(NM_I(sbi), nid, &ne);
304}
305
306/*
307 * The maximum depth is four.
308 * Offset[0] will have raw inode offset.
309 */
310static int get_node_path(long block, int offset[4], unsigned int noffset[4])
311{
312 const long direct_index = ADDRS_PER_INODE;
313 const long direct_blks = ADDRS_PER_BLOCK;
314 const long dptrs_per_blk = NIDS_PER_BLOCK;
315 const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
316 const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
317 int n = 0;
318 int level = 0;
319
320 noffset[0] = 0;
321
322 if (block < direct_index) {
323 offset[n++] = block;
324 level = 0;
325 goto got;
326 }
327 block -= direct_index;
328 if (block < direct_blks) {
329 offset[n++] = NODE_DIR1_BLOCK;
330 noffset[n] = 1;
331 offset[n++] = block;
332 level = 1;
333 goto got;
334 }
335 block -= direct_blks;
336 if (block < direct_blks) {
337 offset[n++] = NODE_DIR2_BLOCK;
338 noffset[n] = 2;
339 offset[n++] = block;
340 level = 1;
341 goto got;
342 }
343 block -= direct_blks;
344 if (block < indirect_blks) {
345 offset[n++] = NODE_IND1_BLOCK;
346 noffset[n] = 3;
347 offset[n++] = block / direct_blks;
348 noffset[n] = 4 + offset[n - 1];
349 offset[n++] = block % direct_blks;
350 level = 2;
351 goto got;
352 }
353 block -= indirect_blks;
354 if (block < indirect_blks) {
355 offset[n++] = NODE_IND2_BLOCK;
356 noffset[n] = 4 + dptrs_per_blk;
357 offset[n++] = block / direct_blks;
358 noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
359 offset[n++] = block % direct_blks;
360 level = 2;
361 goto got;
362 }
363 block -= indirect_blks;
364 if (block < dindirect_blks) {
365 offset[n++] = NODE_DIND_BLOCK;
366 noffset[n] = 5 + (dptrs_per_blk * 2);
367 offset[n++] = block / indirect_blks;
368 noffset[n] = 6 + (dptrs_per_blk * 2) +
369 offset[n - 1] * (dptrs_per_blk + 1);
370 offset[n++] = (block / direct_blks) % dptrs_per_blk;
371 noffset[n] = 7 + (dptrs_per_blk * 2) +
372 offset[n - 2] * (dptrs_per_blk + 1) +
373 offset[n - 1];
374 offset[n++] = block % direct_blks;
375 level = 3;
376 goto got;
377 } else {
378 BUG();
379 }
380got:
381 return level;
382}
383
384/*
385 * Caller should call f2fs_put_dnode(dn).
386 */
387int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int ro)
388{
389 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
390 struct page *npage[4];
391 struct page *parent;
392 int offset[4];
393 unsigned int noffset[4];
394 nid_t nids[4];
395 int level, i;
396 int err = 0;
397
398 level = get_node_path(index, offset, noffset);
399
400 nids[0] = dn->inode->i_ino;
401 npage[0] = get_node_page(sbi, nids[0]);
402 if (IS_ERR(npage[0]))
403 return PTR_ERR(npage[0]);
404
405 parent = npage[0];
406 nids[1] = get_nid(parent, offset[0], true);
407 dn->inode_page = npage[0];
408 dn->inode_page_locked = true;
409
410 /* get indirect or direct nodes */
411 for (i = 1; i <= level; i++) {
412 bool done = false;
413
414 if (!nids[i] && !ro) {
415 mutex_lock_op(sbi, NODE_NEW);
416
417 /* alloc new node */
418 if (!alloc_nid(sbi, &(nids[i]))) {
419 mutex_unlock_op(sbi, NODE_NEW);
420 err = -ENOSPC;
421 goto release_pages;
422 }
423
424 dn->nid = nids[i];
425 npage[i] = new_node_page(dn, noffset[i]);
426 if (IS_ERR(npage[i])) {
427 alloc_nid_failed(sbi, nids[i]);
428 mutex_unlock_op(sbi, NODE_NEW);
429 err = PTR_ERR(npage[i]);
430 goto release_pages;
431 }
432
433 set_nid(parent, offset[i - 1], nids[i], i == 1);
434 alloc_nid_done(sbi, nids[i]);
435 mutex_unlock_op(sbi, NODE_NEW);
436 done = true;
437 } else if (ro && i == level && level > 1) {
438 npage[i] = get_node_page_ra(parent, offset[i - 1]);
439 if (IS_ERR(npage[i])) {
440 err = PTR_ERR(npage[i]);
441 goto release_pages;
442 }
443 done = true;
444 }
445 if (i == 1) {
446 dn->inode_page_locked = false;
447 unlock_page(parent);
448 } else {
449 f2fs_put_page(parent, 1);
450 }
451
452 if (!done) {
453 npage[i] = get_node_page(sbi, nids[i]);
454 if (IS_ERR(npage[i])) {
455 err = PTR_ERR(npage[i]);
456 f2fs_put_page(npage[0], 0);
457 goto release_out;
458 }
459 }
460 if (i < level) {
461 parent = npage[i];
462 nids[i + 1] = get_nid(parent, offset[i], false);
463 }
464 }
465 dn->nid = nids[level];
466 dn->ofs_in_node = offset[level];
467 dn->node_page = npage[level];
468 dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node);
469 return 0;
470
471release_pages:
472 f2fs_put_page(parent, 1);
473 if (i > 1)
474 f2fs_put_page(npage[0], 0);
475release_out:
476 dn->inode_page = NULL;
477 dn->node_page = NULL;
478 return err;
479}
480
481static void truncate_node(struct dnode_of_data *dn)
482{
483 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
484 struct node_info ni;
485
486 get_node_info(sbi, dn->nid, &ni);
487 BUG_ON(ni.blk_addr == NULL_ADDR);
488
489 if (ni.blk_addr != NULL_ADDR)
490 invalidate_blocks(sbi, ni.blk_addr);
491
492 /* Deallocate node address */
493 dec_valid_node_count(sbi, dn->inode, 1);
494 set_node_addr(sbi, &ni, NULL_ADDR);
495
496 if (dn->nid == dn->inode->i_ino) {
497 remove_orphan_inode(sbi, dn->nid);
498 dec_valid_inode_count(sbi);
499 } else {
500 sync_inode_page(dn);
501 }
502
503 clear_node_page_dirty(dn->node_page);
504 F2FS_SET_SB_DIRT(sbi);
505
506 f2fs_put_page(dn->node_page, 1);
507 dn->node_page = NULL;
508}
509
510static int truncate_dnode(struct dnode_of_data *dn)
511{
512 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
513 struct page *page;
514
515 if (dn->nid == 0)
516 return 1;
517
518 /* get direct node */
519 page = get_node_page(sbi, dn->nid);
520 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
521 return 1;
522 else if (IS_ERR(page))
523 return PTR_ERR(page);
524
525 /* Make dnode_of_data for parameter */
526 dn->node_page = page;
527 dn->ofs_in_node = 0;
528 truncate_data_blocks(dn);
529 truncate_node(dn);
530 return 1;
531}
532
533static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
534 int ofs, int depth)
535{
536 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
537 struct dnode_of_data rdn = *dn;
538 struct page *page;
539 struct f2fs_node *rn;
540 nid_t child_nid;
541 unsigned int child_nofs;
542 int freed = 0;
543 int i, ret;
544
545 if (dn->nid == 0)
546 return NIDS_PER_BLOCK + 1;
547
548 page = get_node_page(sbi, dn->nid);
549 if (IS_ERR(page))
550 return PTR_ERR(page);
551
552 rn = (struct f2fs_node *)page_address(page);
553 if (depth < 3) {
554 for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
555 child_nid = le32_to_cpu(rn->in.nid[i]);
556 if (child_nid == 0)
557 continue;
558 rdn.nid = child_nid;
559 ret = truncate_dnode(&rdn);
560 if (ret < 0)
561 goto out_err;
562 set_nid(page, i, 0, false);
563 }
564 } else {
565 child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
566 for (i = ofs; i < NIDS_PER_BLOCK; i++) {
567 child_nid = le32_to_cpu(rn->in.nid[i]);
568 if (child_nid == 0) {
569 child_nofs += NIDS_PER_BLOCK + 1;
570 continue;
571 }
572 rdn.nid = child_nid;
573 ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
574 if (ret == (NIDS_PER_BLOCK + 1)) {
575 set_nid(page, i, 0, false);
576 child_nofs += ret;
577 } else if (ret < 0 && ret != -ENOENT) {
578 goto out_err;
579 }
580 }
581 freed = child_nofs;
582 }
583
584 if (!ofs) {
585 /* remove current indirect node */
586 dn->node_page = page;
587 truncate_node(dn);
588 freed++;
589 } else {
590 f2fs_put_page(page, 1);
591 }
592 return freed;
593
594out_err:
595 f2fs_put_page(page, 1);
596 return ret;
597}
598
599static int truncate_partial_nodes(struct dnode_of_data *dn,
600 struct f2fs_inode *ri, int *offset, int depth)
601{
602 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
603 struct page *pages[2];
604 nid_t nid[3];
605 nid_t child_nid;
606 int err = 0;
607 int i;
608 int idx = depth - 2;
609
610 nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
611 if (!nid[0])
612 return 0;
613
614 /* get indirect nodes in the path */
615 for (i = 0; i < depth - 1; i++) {
616 /* refernece count'll be increased */
617 pages[i] = get_node_page(sbi, nid[i]);
618 if (IS_ERR(pages[i])) {
619 depth = i + 1;
620 err = PTR_ERR(pages[i]);
621 goto fail;
622 }
623 nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
624 }
625
626 /* free direct nodes linked to a partial indirect node */
627 for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) {
628 child_nid = get_nid(pages[idx], i, false);
629 if (!child_nid)
630 continue;
631 dn->nid = child_nid;
632 err = truncate_dnode(dn);
633 if (err < 0)
634 goto fail;
635 set_nid(pages[idx], i, 0, false);
636 }
637
638 if (offset[depth - 1] == 0) {
639 dn->node_page = pages[idx];
640 dn->nid = nid[idx];
641 truncate_node(dn);
642 } else {
643 f2fs_put_page(pages[idx], 1);
644 }
645 offset[idx]++;
646 offset[depth - 1] = 0;
647fail:
648 for (i = depth - 3; i >= 0; i--)
649 f2fs_put_page(pages[i], 1);
650 return err;
651}
652
653/*
654 * All the block addresses of data and nodes should be nullified.
655 */
656int truncate_inode_blocks(struct inode *inode, pgoff_t from)
657{
658 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
659 int err = 0, cont = 1;
660 int level, offset[4], noffset[4];
661 unsigned int nofs;
662 struct f2fs_node *rn;
663 struct dnode_of_data dn;
664 struct page *page;
665
666 level = get_node_path(from, offset, noffset);
667
668 page = get_node_page(sbi, inode->i_ino);
669 if (IS_ERR(page))
670 return PTR_ERR(page);
671
672 set_new_dnode(&dn, inode, page, NULL, 0);
673 unlock_page(page);
674
675 rn = page_address(page);
676 switch (level) {
677 case 0:
678 case 1:
679 nofs = noffset[1];
680 break;
681 case 2:
682 nofs = noffset[1];
683 if (!offset[level - 1])
684 goto skip_partial;
685 err = truncate_partial_nodes(&dn, &rn->i, offset, level);
686 if (err < 0 && err != -ENOENT)
687 goto fail;
688 nofs += 1 + NIDS_PER_BLOCK;
689 break;
690 case 3:
691 nofs = 5 + 2 * NIDS_PER_BLOCK;
692 if (!offset[level - 1])
693 goto skip_partial;
694 err = truncate_partial_nodes(&dn, &rn->i, offset, level);
695 if (err < 0 && err != -ENOENT)
696 goto fail;
697 break;
698 default:
699 BUG();
700 }
701
702skip_partial:
703 while (cont) {
704 dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]);
705 switch (offset[0]) {
706 case NODE_DIR1_BLOCK:
707 case NODE_DIR2_BLOCK:
708 err = truncate_dnode(&dn);
709 break;
710
711 case NODE_IND1_BLOCK:
712 case NODE_IND2_BLOCK:
713 err = truncate_nodes(&dn, nofs, offset[1], 2);
714 break;
715
716 case NODE_DIND_BLOCK:
717 err = truncate_nodes(&dn, nofs, offset[1], 3);
718 cont = 0;
719 break;
720
721 default:
722 BUG();
723 }
724 if (err < 0 && err != -ENOENT)
725 goto fail;
726 if (offset[1] == 0 &&
727 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
728 lock_page(page);
729 wait_on_page_writeback(page);
730 rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
731 set_page_dirty(page);
732 unlock_page(page);
733 }
734 offset[1] = 0;
735 offset[0]++;
736 nofs += err;
737 }
738fail:
739 f2fs_put_page(page, 0);
740 return err > 0 ? 0 : err;
741}
742
743int remove_inode_page(struct inode *inode)
744{
745 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
746 struct page *page;
747 nid_t ino = inode->i_ino;
748 struct dnode_of_data dn;
749
750 mutex_lock_op(sbi, NODE_TRUNC);
751 page = get_node_page(sbi, ino);
752 if (IS_ERR(page)) {
753 mutex_unlock_op(sbi, NODE_TRUNC);
754 return PTR_ERR(page);
755 }
756
757 if (F2FS_I(inode)->i_xattr_nid) {
758 nid_t nid = F2FS_I(inode)->i_xattr_nid;
759 struct page *npage = get_node_page(sbi, nid);
760
761 if (IS_ERR(npage)) {
762 mutex_unlock_op(sbi, NODE_TRUNC);
763 return PTR_ERR(npage);
764 }
765
766 F2FS_I(inode)->i_xattr_nid = 0;
767 set_new_dnode(&dn, inode, page, npage, nid);
768 dn.inode_page_locked = 1;
769 truncate_node(&dn);
770 }
771 if (inode->i_blocks == 1) {
772 /* inernally call f2fs_put_page() */
773 set_new_dnode(&dn, inode, page, page, ino);
774 truncate_node(&dn);
775 } else if (inode->i_blocks == 0) {
776 struct node_info ni;
777 get_node_info(sbi, inode->i_ino, &ni);
778
779 /* called after f2fs_new_inode() is failed */
780 BUG_ON(ni.blk_addr != NULL_ADDR);
781 f2fs_put_page(page, 1);
782 } else {
783 BUG();
784 }
785 mutex_unlock_op(sbi, NODE_TRUNC);
786 return 0;
787}
788
789int new_inode_page(struct inode *inode, struct dentry *dentry)
790{
791 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
792 struct page *page;
793 struct dnode_of_data dn;
794
795 /* allocate inode page for new inode */
796 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
797 mutex_lock_op(sbi, NODE_NEW);
798 page = new_node_page(&dn, 0);
799 init_dent_inode(dentry, page);
800 mutex_unlock_op(sbi, NODE_NEW);
801 if (IS_ERR(page))
802 return PTR_ERR(page);
803 f2fs_put_page(page, 1);
804 return 0;
805}
806
807struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
808{
809 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
810 struct address_space *mapping = sbi->node_inode->i_mapping;
811 struct node_info old_ni, new_ni;
812 struct page *page;
813 int err;
814
815 if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
816 return ERR_PTR(-EPERM);
817
818 page = grab_cache_page(mapping, dn->nid);
819 if (!page)
820 return ERR_PTR(-ENOMEM);
821
822 get_node_info(sbi, dn->nid, &old_ni);
823
824 SetPageUptodate(page);
825 fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
826
827 /* Reinitialize old_ni with new node page */
828 BUG_ON(old_ni.blk_addr != NULL_ADDR);
829 new_ni = old_ni;
830 new_ni.ino = dn->inode->i_ino;
831
832 if (!inc_valid_node_count(sbi, dn->inode, 1)) {
833 err = -ENOSPC;
834 goto fail;
835 }
836 set_node_addr(sbi, &new_ni, NEW_ADDR);
837
838 dn->node_page = page;
839 sync_inode_page(dn);
840 set_page_dirty(page);
841 set_cold_node(dn->inode, page);
842 if (ofs == 0)
843 inc_valid_inode_count(sbi);
844
845 return page;
846
847fail:
848 f2fs_put_page(page, 1);
849 return ERR_PTR(err);
850}
851
852static int read_node_page(struct page *page, int type)
853{
854 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
855 struct node_info ni;
856
857 get_node_info(sbi, page->index, &ni);
858
859 if (ni.blk_addr == NULL_ADDR)
860 return -ENOENT;
861 return f2fs_readpage(sbi, page, ni.blk_addr, type);
862}
863
864/*
865 * Readahead a node page
866 */
867void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
868{
869 struct address_space *mapping = sbi->node_inode->i_mapping;
870 struct page *apage;
871
872 apage = find_get_page(mapping, nid);
873 if (apage && PageUptodate(apage))
874 goto release_out;
875 f2fs_put_page(apage, 0);
876
877 apage = grab_cache_page(mapping, nid);
878 if (!apage)
879 return;
880
881 if (read_node_page(apage, READA))
882 goto unlock_out;
883
884 page_cache_release(apage);
885 return;
886
887unlock_out:
888 unlock_page(apage);
889release_out:
890 page_cache_release(apage);
891}
892
893struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
894{
895 int err;
896 struct page *page;
897 struct address_space *mapping = sbi->node_inode->i_mapping;
898
899 page = grab_cache_page(mapping, nid);
900 if (!page)
901 return ERR_PTR(-ENOMEM);
902
903 err = read_node_page(page, READ_SYNC);
904 if (err) {
905 f2fs_put_page(page, 1);
906 return ERR_PTR(err);
907 }
908
909 BUG_ON(nid != nid_of_node(page));
910 mark_page_accessed(page);
911 return page;
912}
913
914/*
915 * Return a locked page for the desired node page.
916 * And, readahead MAX_RA_NODE number of node pages.
917 */
918struct page *get_node_page_ra(struct page *parent, int start)
919{
920 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
921 struct address_space *mapping = sbi->node_inode->i_mapping;
922 int i, end;
923 int err = 0;
924 nid_t nid;
925 struct page *page;
926
927 /* First, try getting the desired direct node. */
928 nid = get_nid(parent, start, false);
929 if (!nid)
930 return ERR_PTR(-ENOENT);
931
932 page = find_get_page(mapping, nid);
933 if (page && PageUptodate(page))
934 goto page_hit;
935 f2fs_put_page(page, 0);
936
937repeat:
938 page = grab_cache_page(mapping, nid);
939 if (!page)
940 return ERR_PTR(-ENOMEM);
941
942 err = read_node_page(page, READA);
943 if (err) {
944 f2fs_put_page(page, 1);
945 return ERR_PTR(err);
946 }
947
948 /* Then, try readahead for siblings of the desired node */
949 end = start + MAX_RA_NODE;
950 end = min(end, NIDS_PER_BLOCK);
951 for (i = start + 1; i < end; i++) {
952 nid = get_nid(parent, i, false);
953 if (!nid)
954 continue;
955 ra_node_page(sbi, nid);
956 }
957
958page_hit:
959 lock_page(page);
960 if (PageError(page)) {
961 f2fs_put_page(page, 1);
962 return ERR_PTR(-EIO);
963 }
964
965 /* Has the page been truncated? */
966 if (page->mapping != mapping) {
967 f2fs_put_page(page, 1);
968 goto repeat;
969 }
970 return page;
971}
972
973void sync_inode_page(struct dnode_of_data *dn)
974{
975 if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
976 update_inode(dn->inode, dn->node_page);
977 } else if (dn->inode_page) {
978 if (!dn->inode_page_locked)
979 lock_page(dn->inode_page);
980 update_inode(dn->inode, dn->inode_page);
981 if (!dn->inode_page_locked)
982 unlock_page(dn->inode_page);
983 } else {
984 f2fs_write_inode(dn->inode, NULL);
985 }
986}
987
988int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
989 struct writeback_control *wbc)
990{
991 struct address_space *mapping = sbi->node_inode->i_mapping;
992 pgoff_t index, end;
993 struct pagevec pvec;
994 int step = ino ? 2 : 0;
995 int nwritten = 0, wrote = 0;
996
997 pagevec_init(&pvec, 0);
998
999next_step:
1000 index = 0;
1001 end = LONG_MAX;
1002
1003 while (index <= end) {
1004 int i, nr_pages;
1005 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
1006 PAGECACHE_TAG_DIRTY,
1007 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1008 if (nr_pages == 0)
1009 break;
1010
1011 for (i = 0; i < nr_pages; i++) {
1012 struct page *page = pvec.pages[i];
1013
1014 /*
1015 * flushing sequence with step:
1016 * 0. indirect nodes
1017 * 1. dentry dnodes
1018 * 2. file dnodes
1019 */
1020 if (step == 0 && IS_DNODE(page))
1021 continue;
1022 if (step == 1 && (!IS_DNODE(page) ||
1023 is_cold_node(page)))
1024 continue;
1025 if (step == 2 && (!IS_DNODE(page) ||
1026 !is_cold_node(page)))
1027 continue;
1028
1029 /*
1030 * If an fsync mode,
1031 * we should not skip writing node pages.
1032 */
1033 if (ino && ino_of_node(page) == ino)
1034 lock_page(page);
1035 else if (!trylock_page(page))
1036 continue;
1037
1038 if (unlikely(page->mapping != mapping)) {
1039continue_unlock:
1040 unlock_page(page);
1041 continue;
1042 }
1043 if (ino && ino_of_node(page) != ino)
1044 goto continue_unlock;
1045
1046 if (!PageDirty(page)) {
1047 /* someone wrote it for us */
1048 goto continue_unlock;
1049 }
1050
1051 if (!clear_page_dirty_for_io(page))
1052 goto continue_unlock;
1053
1054 /* called by fsync() */
1055 if (ino && IS_DNODE(page)) {
1056 int mark = !is_checkpointed_node(sbi, ino);
1057 set_fsync_mark(page, 1);
1058 if (IS_INODE(page))
1059 set_dentry_mark(page, mark);
1060 nwritten++;
1061 } else {
1062 set_fsync_mark(page, 0);
1063 set_dentry_mark(page, 0);
1064 }
1065 mapping->a_ops->writepage(page, wbc);
1066 wrote++;
1067
1068 if (--wbc->nr_to_write == 0)
1069 break;
1070 }
1071 pagevec_release(&pvec);
1072 cond_resched();
1073
1074 if (wbc->nr_to_write == 0) {
1075 step = 2;
1076 break;
1077 }
1078 }
1079
1080 if (step < 2) {
1081 step++;
1082 goto next_step;
1083 }
1084
1085 if (wrote)
1086 f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL);
1087
1088 return nwritten;
1089}
1090
1091static int f2fs_write_node_page(struct page *page,
1092 struct writeback_control *wbc)
1093{
1094 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1095 nid_t nid;
1096 unsigned int nofs;
1097 block_t new_addr;
1098 struct node_info ni;
1099
1100 if (wbc->for_reclaim) {
1101 dec_page_count(sbi, F2FS_DIRTY_NODES);
1102 wbc->pages_skipped++;
1103 set_page_dirty(page);
1104 return AOP_WRITEPAGE_ACTIVATE;
1105 }
1106
1107 wait_on_page_writeback(page);
1108
1109 mutex_lock_op(sbi, NODE_WRITE);
1110
1111 /* get old block addr of this node page */
1112 nid = nid_of_node(page);
1113 nofs = ofs_of_node(page);
1114 BUG_ON(page->index != nid);
1115
1116 get_node_info(sbi, nid, &ni);
1117
1118 /* This page is already truncated */
1119 if (ni.blk_addr == NULL_ADDR)
1120 return 0;
1121
1122 set_page_writeback(page);
1123
1124 /* insert node offset */
1125 write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
1126 set_node_addr(sbi, &ni, new_addr);
1127 dec_page_count(sbi, F2FS_DIRTY_NODES);
1128
1129 mutex_unlock_op(sbi, NODE_WRITE);
1130 unlock_page(page);
1131 return 0;
1132}
1133
1134static int f2fs_write_node_pages(struct address_space *mapping,
1135 struct writeback_control *wbc)
1136{
1137 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1138 struct block_device *bdev = sbi->sb->s_bdev;
1139 long nr_to_write = wbc->nr_to_write;
1140
1141 if (wbc->for_kupdate)
1142 return 0;
1143
1144 if (get_pages(sbi, F2FS_DIRTY_NODES) == 0)
1145 return 0;
1146
1147 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
1148 write_checkpoint(sbi, false, false);
1149 return 0;
1150 }
1151
1152 /* if mounting is failed, skip writing node pages */
1153 wbc->nr_to_write = bio_get_nr_vecs(bdev);
1154 sync_node_pages(sbi, 0, wbc);
1155 wbc->nr_to_write = nr_to_write -
1156 (bio_get_nr_vecs(bdev) - wbc->nr_to_write);
1157 return 0;
1158}
1159
1160static int f2fs_set_node_page_dirty(struct page *page)
1161{
1162 struct address_space *mapping = page->mapping;
1163 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1164
1165 SetPageUptodate(page);
1166 if (!PageDirty(page)) {
1167 __set_page_dirty_nobuffers(page);
1168 inc_page_count(sbi, F2FS_DIRTY_NODES);
1169 SetPagePrivate(page);
1170 return 1;
1171 }
1172 return 0;
1173}
1174
1175static void f2fs_invalidate_node_page(struct page *page, unsigned long offset)
1176{
1177 struct inode *inode = page->mapping->host;
1178 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1179 if (PageDirty(page))
1180 dec_page_count(sbi, F2FS_DIRTY_NODES);
1181 ClearPagePrivate(page);
1182}
1183
1184static int f2fs_release_node_page(struct page *page, gfp_t wait)
1185{
1186 ClearPagePrivate(page);
1187 return 0;
1188}
1189
1190/*
1191 * Structure of the f2fs node operations
1192 */
1193const struct address_space_operations f2fs_node_aops = {
1194 .writepage = f2fs_write_node_page,
1195 .writepages = f2fs_write_node_pages,
1196 .set_page_dirty = f2fs_set_node_page_dirty,
1197 .invalidatepage = f2fs_invalidate_node_page,
1198 .releasepage = f2fs_release_node_page,
1199};
1200
1201static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
1202{
1203 struct list_head *this;
1204 struct free_nid *i = NULL;
1205 list_for_each(this, head) {
1206 i = list_entry(this, struct free_nid, list);
1207 if (i->nid == n)
1208 break;
1209 i = NULL;
1210 }
1211 return i;
1212}
1213
1214static void __del_from_free_nid_list(struct free_nid *i)
1215{
1216 list_del(&i->list);
1217 kmem_cache_free(free_nid_slab, i);
1218}
1219
1220static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
1221{
1222 struct free_nid *i;
1223
1224 if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
1225 return 0;
1226retry:
1227 i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
1228 if (!i) {
1229 cond_resched();
1230 goto retry;
1231 }
1232 i->nid = nid;
1233 i->state = NID_NEW;
1234
1235 spin_lock(&nm_i->free_nid_list_lock);
1236 if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
1237 spin_unlock(&nm_i->free_nid_list_lock);
1238 kmem_cache_free(free_nid_slab, i);
1239 return 0;
1240 }
1241 list_add_tail(&i->list, &nm_i->free_nid_list);
1242 nm_i->fcnt++;
1243 spin_unlock(&nm_i->free_nid_list_lock);
1244 return 1;
1245}
1246
1247static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
1248{
1249 struct free_nid *i;
1250 spin_lock(&nm_i->free_nid_list_lock);
1251 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
1252 if (i && i->state == NID_NEW) {
1253 __del_from_free_nid_list(i);
1254 nm_i->fcnt--;
1255 }
1256 spin_unlock(&nm_i->free_nid_list_lock);
1257}
1258
1259static int scan_nat_page(struct f2fs_nm_info *nm_i,
1260 struct page *nat_page, nid_t start_nid)
1261{
1262 struct f2fs_nat_block *nat_blk = page_address(nat_page);
1263 block_t blk_addr;
1264 int fcnt = 0;
1265 int i;
1266
1267 /* 0 nid should not be used */
1268 if (start_nid == 0)
1269 ++start_nid;
1270
1271 i = start_nid % NAT_ENTRY_PER_BLOCK;
1272
1273 for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1274 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1275 BUG_ON(blk_addr == NEW_ADDR);
1276 if (blk_addr == NULL_ADDR)
1277 fcnt += add_free_nid(nm_i, start_nid);
1278 }
1279 return fcnt;
1280}
1281
1282static void build_free_nids(struct f2fs_sb_info *sbi)
1283{
1284 struct free_nid *fnid, *next_fnid;
1285 struct f2fs_nm_info *nm_i = NM_I(sbi);
1286 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1287 struct f2fs_summary_block *sum = curseg->sum_blk;
1288 nid_t nid = 0;
1289 bool is_cycled = false;
1290 int fcnt = 0;
1291 int i;
1292
1293 nid = nm_i->next_scan_nid;
1294 nm_i->init_scan_nid = nid;
1295
1296 ra_nat_pages(sbi, nid);
1297
1298 while (1) {
1299 struct page *page = get_current_nat_page(sbi, nid);
1300
1301 fcnt += scan_nat_page(nm_i, page, nid);
1302 f2fs_put_page(page, 1);
1303
1304 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
1305
1306 if (nid >= nm_i->max_nid) {
1307 nid = 0;
1308 is_cycled = true;
1309 }
1310 if (fcnt > MAX_FREE_NIDS)
1311 break;
1312 if (is_cycled && nm_i->init_scan_nid <= nid)
1313 break;
1314 }
1315
1316 nm_i->next_scan_nid = nid;
1317
1318 /* find free nids from current sum_pages */
1319 mutex_lock(&curseg->curseg_mutex);
1320 for (i = 0; i < nats_in_cursum(sum); i++) {
1321 block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
1322 nid = le32_to_cpu(nid_in_journal(sum, i));
1323 if (addr == NULL_ADDR)
1324 add_free_nid(nm_i, nid);
1325 else
1326 remove_free_nid(nm_i, nid);
1327 }
1328 mutex_unlock(&curseg->curseg_mutex);
1329
1330 /* remove the free nids from current allocated nids */
1331 list_for_each_entry_safe(fnid, next_fnid, &nm_i->free_nid_list, list) {
1332 struct nat_entry *ne;
1333
1334 read_lock(&nm_i->nat_tree_lock);
1335 ne = __lookup_nat_cache(nm_i, fnid->nid);
1336 if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
1337 remove_free_nid(nm_i, fnid->nid);
1338 read_unlock(&nm_i->nat_tree_lock);
1339 }
1340}
1341
1342/*
1343 * If this function returns success, caller can obtain a new nid
1344 * from second parameter of this function.
1345 * The returned nid could be used ino as well as nid when inode is created.
1346 */
1347bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
1348{
1349 struct f2fs_nm_info *nm_i = NM_I(sbi);
1350 struct free_nid *i = NULL;
1351 struct list_head *this;
1352retry:
1353 mutex_lock(&nm_i->build_lock);
1354 if (!nm_i->fcnt) {
1355 /* scan NAT in order to build free nid list */
1356 build_free_nids(sbi);
1357 if (!nm_i->fcnt) {
1358 mutex_unlock(&nm_i->build_lock);
1359 return false;
1360 }
1361 }
1362 mutex_unlock(&nm_i->build_lock);
1363
1364 /*
1365 * We check fcnt again since previous check is racy as
1366 * we didn't hold free_nid_list_lock. So other thread
1367 * could consume all of free nids.
1368 */
1369 spin_lock(&nm_i->free_nid_list_lock);
1370 if (!nm_i->fcnt) {
1371 spin_unlock(&nm_i->free_nid_list_lock);
1372 goto retry;
1373 }
1374
1375 BUG_ON(list_empty(&nm_i->free_nid_list));
1376 list_for_each(this, &nm_i->free_nid_list) {
1377 i = list_entry(this, struct free_nid, list);
1378 if (i->state == NID_NEW)
1379 break;
1380 }
1381
1382 BUG_ON(i->state != NID_NEW);
1383 *nid = i->nid;
1384 i->state = NID_ALLOC;
1385 nm_i->fcnt--;
1386 spin_unlock(&nm_i->free_nid_list_lock);
1387 return true;
1388}
1389
1390/*
1391 * alloc_nid() should be called prior to this function.
1392 */
1393void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1394{
1395 struct f2fs_nm_info *nm_i = NM_I(sbi);
1396 struct free_nid *i;
1397
1398 spin_lock(&nm_i->free_nid_list_lock);
1399 i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
1400 if (i) {
1401 BUG_ON(i->state != NID_ALLOC);
1402 __del_from_free_nid_list(i);
1403 }
1404 spin_unlock(&nm_i->free_nid_list_lock);
1405}
1406
1407/*
1408 * alloc_nid() should be called prior to this function.
1409 */
1410void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1411{
1412 alloc_nid_done(sbi, nid);
1413 add_free_nid(NM_I(sbi), nid);
1414}
1415
1416void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1417 struct f2fs_summary *sum, struct node_info *ni,
1418 block_t new_blkaddr)
1419{
1420 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
1421 set_node_addr(sbi, ni, new_blkaddr);
1422 clear_node_page_dirty(page);
1423}
1424
1425int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1426{
1427 struct address_space *mapping = sbi->node_inode->i_mapping;
1428 struct f2fs_node *src, *dst;
1429 nid_t ino = ino_of_node(page);
1430 struct node_info old_ni, new_ni;
1431 struct page *ipage;
1432
1433 ipage = grab_cache_page(mapping, ino);
1434 if (!ipage)
1435 return -ENOMEM;
1436
1437 /* Should not use this inode from free nid list */
1438 remove_free_nid(NM_I(sbi), ino);
1439
1440 get_node_info(sbi, ino, &old_ni);
1441 SetPageUptodate(ipage);
1442 fill_node_footer(ipage, ino, ino, 0, true);
1443
1444 src = (struct f2fs_node *)page_address(page);
1445 dst = (struct f2fs_node *)page_address(ipage);
1446
1447 memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
1448 dst->i.i_size = 0;
1449 dst->i.i_blocks = cpu_to_le64(1);
1450 dst->i.i_links = cpu_to_le32(1);
1451 dst->i.i_xattr_nid = 0;
1452
1453 new_ni = old_ni;
1454 new_ni.ino = ino;
1455
1456 set_node_addr(sbi, &new_ni, NEW_ADDR);
1457 inc_valid_inode_count(sbi);
1458
1459 f2fs_put_page(ipage, 1);
1460 return 0;
1461}
1462
1463int restore_node_summary(struct f2fs_sb_info *sbi,
1464 unsigned int segno, struct f2fs_summary_block *sum)
1465{
1466 struct f2fs_node *rn;
1467 struct f2fs_summary *sum_entry;
1468 struct page *page;
1469 block_t addr;
1470 int i, last_offset;
1471
1472 /* alloc temporal page for read node */
1473 page = alloc_page(GFP_NOFS | __GFP_ZERO);
1474 if (IS_ERR(page))
1475 return PTR_ERR(page);
1476 lock_page(page);
1477
1478 /* scan the node segment */
1479 last_offset = sbi->blocks_per_seg;
1480 addr = START_BLOCK(sbi, segno);
1481 sum_entry = &sum->entries[0];
1482
1483 for (i = 0; i < last_offset; i++, sum_entry++) {
1484 if (f2fs_readpage(sbi, page, addr, READ_SYNC))
1485 goto out;
1486
1487 rn = (struct f2fs_node *)page_address(page);
1488 sum_entry->nid = rn->footer.nid;
1489 sum_entry->version = 0;
1490 sum_entry->ofs_in_node = 0;
1491 addr++;
1492
1493 /*
1494 * In order to read next node page,
1495 * we must clear PageUptodate flag.
1496 */
1497 ClearPageUptodate(page);
1498 }
1499out:
1500 unlock_page(page);
1501 __free_pages(page, 0);
1502 return 0;
1503}
1504
1505static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
1506{
1507 struct f2fs_nm_info *nm_i = NM_I(sbi);
1508 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1509 struct f2fs_summary_block *sum = curseg->sum_blk;
1510 int i;
1511
1512 mutex_lock(&curseg->curseg_mutex);
1513
1514 if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
1515 mutex_unlock(&curseg->curseg_mutex);
1516 return false;
1517 }
1518
1519 for (i = 0; i < nats_in_cursum(sum); i++) {
1520 struct nat_entry *ne;
1521 struct f2fs_nat_entry raw_ne;
1522 nid_t nid = le32_to_cpu(nid_in_journal(sum, i));
1523
1524 raw_ne = nat_in_journal(sum, i);
1525retry:
1526 write_lock(&nm_i->nat_tree_lock);
1527 ne = __lookup_nat_cache(nm_i, nid);
1528 if (ne) {
1529 __set_nat_cache_dirty(nm_i, ne);
1530 write_unlock(&nm_i->nat_tree_lock);
1531 continue;
1532 }
1533 ne = grab_nat_entry(nm_i, nid);
1534 if (!ne) {
1535 write_unlock(&nm_i->nat_tree_lock);
1536 goto retry;
1537 }
1538 nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr));
1539 nat_set_ino(ne, le32_to_cpu(raw_ne.ino));
1540 nat_set_version(ne, raw_ne.version);
1541 __set_nat_cache_dirty(nm_i, ne);
1542 write_unlock(&nm_i->nat_tree_lock);
1543 }
1544 update_nats_in_cursum(sum, -i);
1545 mutex_unlock(&curseg->curseg_mutex);
1546 return true;
1547}
1548
1549/*
1550 * This function is called during the checkpointing process.
1551 */
1552void flush_nat_entries(struct f2fs_sb_info *sbi)
1553{
1554 struct f2fs_nm_info *nm_i = NM_I(sbi);
1555 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1556 struct f2fs_summary_block *sum = curseg->sum_blk;
1557 struct list_head *cur, *n;
1558 struct page *page = NULL;
1559 struct f2fs_nat_block *nat_blk = NULL;
1560 nid_t start_nid = 0, end_nid = 0;
1561 bool flushed;
1562
1563 flushed = flush_nats_in_journal(sbi);
1564
1565 if (!flushed)
1566 mutex_lock(&curseg->curseg_mutex);
1567
1568 /* 1) flush dirty nat caches */
1569 list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
1570 struct nat_entry *ne;
1571 nid_t nid;
1572 struct f2fs_nat_entry raw_ne;
1573 int offset = -1;
1574 block_t old_blkaddr, new_blkaddr;
1575
1576 ne = list_entry(cur, struct nat_entry, list);
1577 nid = nat_get_nid(ne);
1578
1579 if (nat_get_blkaddr(ne) == NEW_ADDR)
1580 continue;
1581 if (flushed)
1582 goto to_nat_page;
1583
1584 /* if there is room for nat enries in curseg->sumpage */
1585 offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1);
1586 if (offset >= 0) {
1587 raw_ne = nat_in_journal(sum, offset);
1588 old_blkaddr = le32_to_cpu(raw_ne.block_addr);
1589 goto flush_now;
1590 }
1591to_nat_page:
1592 if (!page || (start_nid > nid || nid > end_nid)) {
1593 if (page) {
1594 f2fs_put_page(page, 1);
1595 page = NULL;
1596 }
1597 start_nid = START_NID(nid);
1598 end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
1599
1600 /*
1601 * get nat block with dirty flag, increased reference
1602 * count, mapped and lock
1603 */
1604 page = get_next_nat_page(sbi, start_nid);
1605 nat_blk = page_address(page);
1606 }
1607
1608 BUG_ON(!nat_blk);
1609 raw_ne = nat_blk->entries[nid - start_nid];
1610 old_blkaddr = le32_to_cpu(raw_ne.block_addr);
1611flush_now:
1612 new_blkaddr = nat_get_blkaddr(ne);
1613
1614 raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
1615 raw_ne.block_addr = cpu_to_le32(new_blkaddr);
1616 raw_ne.version = nat_get_version(ne);
1617
1618 if (offset < 0) {
1619 nat_blk->entries[nid - start_nid] = raw_ne;
1620 } else {
1621 nat_in_journal(sum, offset) = raw_ne;
1622 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1623 }
1624
1625 if (nat_get_blkaddr(ne) == NULL_ADDR) {
1626 write_lock(&nm_i->nat_tree_lock);
1627 __del_from_nat_cache(nm_i, ne);
1628 write_unlock(&nm_i->nat_tree_lock);
1629
1630 /* We can reuse this freed nid at this point */
1631 add_free_nid(NM_I(sbi), nid);
1632 } else {
1633 write_lock(&nm_i->nat_tree_lock);
1634 __clear_nat_cache_dirty(nm_i, ne);
1635 ne->checkpointed = true;
1636 write_unlock(&nm_i->nat_tree_lock);
1637 }
1638 }
1639 if (!flushed)
1640 mutex_unlock(&curseg->curseg_mutex);
1641 f2fs_put_page(page, 1);
1642
1643 /* 2) shrink nat caches if necessary */
1644 try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
1645}
1646
1647static int init_node_manager(struct f2fs_sb_info *sbi)
1648{
1649 struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
1650 struct f2fs_nm_info *nm_i = NM_I(sbi);
1651 unsigned char *version_bitmap;
1652 unsigned int nat_segs, nat_blocks;
1653
1654 nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
1655
1656 /* segment_count_nat includes pair segment so divide to 2. */
1657 nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
1658 nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
1659 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
1660 nm_i->fcnt = 0;
1661 nm_i->nat_cnt = 0;
1662
1663 INIT_LIST_HEAD(&nm_i->free_nid_list);
1664 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1665 INIT_LIST_HEAD(&nm_i->nat_entries);
1666 INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
1667
1668 mutex_init(&nm_i->build_lock);
1669 spin_lock_init(&nm_i->free_nid_list_lock);
1670 rwlock_init(&nm_i->nat_tree_lock);
1671
1672 nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
1673 nm_i->init_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
1674 nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
1675
1676 nm_i->nat_bitmap = kzalloc(nm_i->bitmap_size, GFP_KERNEL);
1677 if (!nm_i->nat_bitmap)
1678 return -ENOMEM;
1679 version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
1680 if (!version_bitmap)
1681 return -EFAULT;
1682
1683 /* copy version bitmap */
1684 memcpy(nm_i->nat_bitmap, version_bitmap, nm_i->bitmap_size);
1685 return 0;
1686}
1687
1688int build_node_manager(struct f2fs_sb_info *sbi)
1689{
1690 int err;
1691
1692 sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
1693 if (!sbi->nm_info)
1694 return -ENOMEM;
1695
1696 err = init_node_manager(sbi);
1697 if (err)
1698 return err;
1699
1700 build_free_nids(sbi);
1701 return 0;
1702}
1703
1704void destroy_node_manager(struct f2fs_sb_info *sbi)
1705{
1706 struct f2fs_nm_info *nm_i = NM_I(sbi);
1707 struct free_nid *i, *next_i;
1708 struct nat_entry *natvec[NATVEC_SIZE];
1709 nid_t nid = 0;
1710 unsigned int found;
1711
1712 if (!nm_i)
1713 return;
1714
1715 /* destroy free nid list */
1716 spin_lock(&nm_i->free_nid_list_lock);
1717 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
1718 BUG_ON(i->state == NID_ALLOC);
1719 __del_from_free_nid_list(i);
1720 nm_i->fcnt--;
1721 }
1722 BUG_ON(nm_i->fcnt);
1723 spin_unlock(&nm_i->free_nid_list_lock);
1724
1725 /* destroy nat cache */
1726 write_lock(&nm_i->nat_tree_lock);
1727 while ((found = __gang_lookup_nat_cache(nm_i,
1728 nid, NATVEC_SIZE, natvec))) {
1729 unsigned idx;
1730 for (idx = 0; idx < found; idx++) {
1731 struct nat_entry *e = natvec[idx];
1732 nid = nat_get_nid(e) + 1;
1733 __del_from_nat_cache(nm_i, e);
1734 }
1735 }
1736 BUG_ON(nm_i->nat_cnt);
1737 write_unlock(&nm_i->nat_tree_lock);
1738
1739 kfree(nm_i->nat_bitmap);
1740 sbi->nm_info = NULL;
1741 kfree(nm_i);
1742}
1743
1744int create_node_manager_caches(void)
1745{
1746 nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1747 sizeof(struct nat_entry), NULL);
1748 if (!nat_entry_slab)
1749 return -ENOMEM;
1750
1751 free_nid_slab = f2fs_kmem_cache_create("free_nid",
1752 sizeof(struct free_nid), NULL);
1753 if (!free_nid_slab) {
1754 kmem_cache_destroy(nat_entry_slab);
1755 return -ENOMEM;
1756 }
1757 return 0;
1758}
1759
1760void destroy_node_manager_caches(void)
1761{
1762 kmem_cache_destroy(free_nid_slab);
1763 kmem_cache_destroy(nat_entry_slab);
1764}
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
new file mode 100644
index 000000000000..afdb130f782e
--- /dev/null
+++ b/fs/f2fs/node.h
@@ -0,0 +1,353 @@
1/*
2 * fs/f2fs/node.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
/*
 * start node id of a node block dedicated to the given node id
 * (the argument is parenthesized so that expressions such as a + b work)
 */
#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK)

/* node block offset on the NAT area dedicated to the given start node id */
#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK)

/* # of pages to perform readahead before building free nids */
#define FREE_NID_PAGES 4

/* maximum # of free node ids to produce during build_free_nids */
#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)

/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128

/* maximum cached nat entries to manage memory footprint */
#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK)

/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
31
32/*
33 * For node information
34 */
35struct node_info {
36 nid_t nid; /* node id */
37 nid_t ino; /* inode number of the node's owner */
38 block_t blk_addr; /* block address of the node */
39 unsigned char version; /* version of the node */
40};
41
42struct nat_entry {
43 struct list_head list; /* for clean or dirty nat list */
44 bool checkpointed; /* whether it is checkpointed or not */
45 struct node_info ni; /* in-memory node information */
46};
47
/*
 * Accessors for the node_info embedded in a nat_entry.
 * Every argument is parenthesized so that any pointer expression may be
 * passed (e.g. nat_get_nid(vec + i)).
 */
#define nat_get_nid(nat)		((nat)->ni.nid)
#define nat_set_nid(nat, n)		((nat)->ni.nid = (n))
#define nat_get_blkaddr(nat)		((nat)->ni.blk_addr)
#define nat_set_blkaddr(nat, b)		((nat)->ni.blk_addr = (b))
#define nat_get_ino(nat)		((nat)->ni.ino)
#define nat_set_ino(nat, i)		((nat)->ni.ino = (i))
#define nat_get_version(nat)		((nat)->ni.version)
#define nat_set_version(nat, v)		((nat)->ni.version = (v))

/*
 * Move a nat entry onto the dirty / clean list of the node manager.
 * No trailing semicolon: the caller supplies it, which keeps these usable
 * as the sole statement of an unbraced if/else.
 */
#define __set_nat_cache_dirty(nm_i, ne)					\
	list_move_tail(&(ne)->list, &(nm_i)->dirty_nat_entries)
#define __clear_nat_cache_dirty(nm_i, ne)				\
	list_move_tail(&(ne)->list, &(nm_i)->nat_entries)

/* pre-increment the given version lvalue and yield the new value */
#define inc_node_version(version)	(++(version))
62
63static inline void node_info_from_raw_nat(struct node_info *ni,
64 struct f2fs_nat_entry *raw_ne)
65{
66 ni->ino = le32_to_cpu(raw_ne->ino);
67 ni->blk_addr = le32_to_cpu(raw_ne->block_addr);
68 ni->version = raw_ne->version;
69}
70
/*
 * For free nid management
 */
enum nid_state {
	NID_NEW,	/* freshly placed on the free nid list */
	NID_ALLOC	/* handed out to an allocation */
};
78
79struct free_nid {
80 struct list_head list; /* for free node id list */
81 nid_t nid; /* node id */
82 int state; /* in use or not: NID_NEW or NID_ALLOC */
83};
84
85static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
86{
87 struct f2fs_nm_info *nm_i = NM_I(sbi);
88 struct free_nid *fnid;
89
90 if (nm_i->fcnt <= 0)
91 return -1;
92 spin_lock(&nm_i->free_nid_list_lock);
93 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
94 *nid = fnid->nid;
95 spin_unlock(&nm_i->free_nid_list_lock);
96 return 0;
97}
98
99/*
100 * inline functions
101 */
102static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr)
103{
104 struct f2fs_nm_info *nm_i = NM_I(sbi);
105 memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size);
106}
107
108static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
109{
110 struct f2fs_nm_info *nm_i = NM_I(sbi);
111 pgoff_t block_off;
112 pgoff_t block_addr;
113 int seg_off;
114
115 block_off = NAT_BLOCK_OFFSET(start);
116 seg_off = block_off >> sbi->log_blocks_per_seg;
117
118 block_addr = (pgoff_t)(nm_i->nat_blkaddr +
119 (seg_off << sbi->log_blocks_per_seg << 1) +
120 (block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
121
122 if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
123 block_addr += sbi->blocks_per_seg;
124
125 return block_addr;
126}
127
128static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
129 pgoff_t block_addr)
130{
131 struct f2fs_nm_info *nm_i = NM_I(sbi);
132
133 block_addr -= nm_i->nat_blkaddr;
134 if ((block_addr >> sbi->log_blocks_per_seg) % 2)
135 block_addr -= sbi->blocks_per_seg;
136 else
137 block_addr += sbi->blocks_per_seg;
138
139 return block_addr + nm_i->nat_blkaddr;
140}
141
142static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid)
143{
144 unsigned int block_off = NAT_BLOCK_OFFSET(start_nid);
145
146 if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
147 f2fs_clear_bit(block_off, nm_i->nat_bitmap);
148 else
149 f2fs_set_bit(block_off, nm_i->nat_bitmap);
150}
151
152static inline void fill_node_footer(struct page *page, nid_t nid,
153 nid_t ino, unsigned int ofs, bool reset)
154{
155 void *kaddr = page_address(page);
156 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
157 if (reset)
158 memset(rn, 0, sizeof(*rn));
159 rn->footer.nid = cpu_to_le32(nid);
160 rn->footer.ino = cpu_to_le32(ino);
161 rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
162}
163
164static inline void copy_node_footer(struct page *dst, struct page *src)
165{
166 void *src_addr = page_address(src);
167 void *dst_addr = page_address(dst);
168 struct f2fs_node *src_rn = (struct f2fs_node *)src_addr;
169 struct f2fs_node *dst_rn = (struct f2fs_node *)dst_addr;
170 memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer));
171}
172
173static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
174{
175 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
176 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
177 void *kaddr = page_address(page);
178 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
179 rn->footer.cp_ver = ckpt->checkpoint_ver;
180 rn->footer.next_blkaddr = cpu_to_le32(blkaddr);
181}
182
183static inline nid_t ino_of_node(struct page *node_page)
184{
185 void *kaddr = page_address(node_page);
186 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
187 return le32_to_cpu(rn->footer.ino);
188}
189
190static inline nid_t nid_of_node(struct page *node_page)
191{
192 void *kaddr = page_address(node_page);
193 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
194 return le32_to_cpu(rn->footer.nid);
195}
196
197static inline unsigned int ofs_of_node(struct page *node_page)
198{
199 void *kaddr = page_address(node_page);
200 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
201 unsigned flag = le32_to_cpu(rn->footer.flag);
202 return flag >> OFFSET_BIT_SHIFT;
203}
204
205static inline unsigned long long cpver_of_node(struct page *node_page)
206{
207 void *kaddr = page_address(node_page);
208 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
209 return le64_to_cpu(rn->footer.cp_ver);
210}
211
212static inline block_t next_blkaddr_of_node(struct page *node_page)
213{
214 void *kaddr = page_address(node_page);
215 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
216 return le32_to_cpu(rn->footer.next_blkaddr);
217}
218
219/*
220 * f2fs assigns the following node offsets described as (num).
221 * N = NIDS_PER_BLOCK
222 *
223 * Inode block (0)
224 * |- direct node (1)
225 * |- direct node (2)
226 * |- indirect node (3)
227 * | `- direct node (4 => 4 + N - 1)
228 * |- indirect node (4 + N)
229 * | `- direct node (5 + N => 5 + 2N - 1)
230 * `- double indirect node (5 + 2N)
231 * `- indirect node (6 + 2N)
232 * `- direct node (x(N + 1))
233 */
234static inline bool IS_DNODE(struct page *node_page)
235{
236 unsigned int ofs = ofs_of_node(node_page);
237 if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
238 ofs == 5 + 2 * NIDS_PER_BLOCK)
239 return false;
240 if (ofs >= 6 + 2 * NIDS_PER_BLOCK) {
241 ofs -= 6 + 2 * NIDS_PER_BLOCK;
242 if ((long int)ofs % (NIDS_PER_BLOCK + 1))
243 return false;
244 }
245 return true;
246}
247
248static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
249{
250 struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
251
252 wait_on_page_writeback(p);
253
254 if (i)
255 rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
256 else
257 rn->in.nid[off] = cpu_to_le32(nid);
258 set_page_dirty(p);
259}
260
261static inline nid_t get_nid(struct page *p, int off, bool i)
262{
263 struct f2fs_node *rn = (struct f2fs_node *)page_address(p);
264 if (i)
265 return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]);
266 return le32_to_cpu(rn->in.nid[off]);
267}
268
269/*
270 * Coldness identification:
271 * - Mark cold files in f2fs_inode_info
272 * - Mark cold node blocks in their node footer
273 * - Mark cold data pages in page cache
274 */
275static inline int is_cold_file(struct inode *inode)
276{
277 return F2FS_I(inode)->i_advise & FADVISE_COLD_BIT;
278}
279
/* A data page is cold when its Checked page flag is set. */
static inline int is_cold_data(struct page *page)
{
	int cold = PageChecked(page);

	return cold;
}
284
/* Mark a data page cold by setting its Checked page flag. */
static inline void set_cold_data(struct page *page)
{
	SetPageChecked(page);
}
289
/* Drop the cold mark of a data page by clearing its Checked page flag. */
static inline void clear_cold_data(struct page *page)
{
	ClearPageChecked(page);
}
294
295static inline int is_cold_node(struct page *page)
296{
297 void *kaddr = page_address(page);
298 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
299 unsigned int flag = le32_to_cpu(rn->footer.flag);
300 return flag & (0x1 << COLD_BIT_SHIFT);
301}
302
303static inline unsigned char is_fsync_dnode(struct page *page)
304{
305 void *kaddr = page_address(page);
306 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
307 unsigned int flag = le32_to_cpu(rn->footer.flag);
308 return flag & (0x1 << FSYNC_BIT_SHIFT);
309}
310
311static inline unsigned char is_dent_dnode(struct page *page)
312{
313 void *kaddr = page_address(page);
314 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
315 unsigned int flag = le32_to_cpu(rn->footer.flag);
316 return flag & (0x1 << DENT_BIT_SHIFT);
317}
318
319static inline void set_cold_node(struct inode *inode, struct page *page)
320{
321 struct f2fs_node *rn = (struct f2fs_node *)page_address(page);
322 unsigned int flag = le32_to_cpu(rn->footer.flag);
323
324 if (S_ISDIR(inode->i_mode))
325 flag &= ~(0x1 << COLD_BIT_SHIFT);
326 else
327 flag |= (0x1 << COLD_BIT_SHIFT);
328 rn->footer.flag = cpu_to_le32(flag);
329}
330
331static inline void set_fsync_mark(struct page *page, int mark)
332{
333 void *kaddr = page_address(page);
334 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
335 unsigned int flag = le32_to_cpu(rn->footer.flag);
336 if (mark)
337 flag |= (0x1 << FSYNC_BIT_SHIFT);
338 else
339 flag &= ~(0x1 << FSYNC_BIT_SHIFT);
340 rn->footer.flag = cpu_to_le32(flag);
341}
342
343static inline void set_dentry_mark(struct page *page, int mark)
344{
345 void *kaddr = page_address(page);
346 struct f2fs_node *rn = (struct f2fs_node *)kaddr;
347 unsigned int flag = le32_to_cpu(rn->footer.flag);
348 if (mark)
349 flag |= (0x1 << DENT_BIT_SHIFT);
350 else
351 flag &= ~(0x1 << DENT_BIT_SHIFT);
352 rn->footer.flag = cpu_to_le32(flag);
353}
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
new file mode 100644
index 000000000000..b07e9b6ef376
--- /dev/null
+++ b/fs/f2fs/recovery.c
@@ -0,0 +1,375 @@
1/*
2 * fs/f2fs/recovery.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include "f2fs.h"
14#include "node.h"
15#include "segment.h"
16
17static struct kmem_cache *fsync_entry_slab;
18
19bool space_for_roll_forward(struct f2fs_sb_info *sbi)
20{
21 if (sbi->last_valid_block_count + sbi->alloc_valid_block_count
22 > sbi->user_block_count)
23 return false;
24 return true;
25}
26
27static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
28 nid_t ino)
29{
30 struct list_head *this;
31 struct fsync_inode_entry *entry;
32
33 list_for_each(this, head) {
34 entry = list_entry(this, struct fsync_inode_entry, list);
35 if (entry->inode->i_ino == ino)
36 return entry;
37 }
38 return NULL;
39}
40
41static int recover_dentry(struct page *ipage, struct inode *inode)
42{
43 struct f2fs_node *raw_node = (struct f2fs_node *)kmap(ipage);
44 struct f2fs_inode *raw_inode = &(raw_node->i);
45 struct dentry dent, parent;
46 struct f2fs_dir_entry *de;
47 struct page *page;
48 struct inode *dir;
49 int err = 0;
50
51 if (!is_dent_dnode(ipage))
52 goto out;
53
54 dir = f2fs_iget(inode->i_sb, le32_to_cpu(raw_inode->i_pino));
55 if (IS_ERR(dir)) {
56 err = -EINVAL;
57 goto out;
58 }
59
60 parent.d_inode = dir;
61 dent.d_parent = &parent;
62 dent.d_name.len = le32_to_cpu(raw_inode->i_namelen);
63 dent.d_name.name = raw_inode->i_name;
64
65 de = f2fs_find_entry(dir, &dent.d_name, &page);
66 if (de) {
67 kunmap(page);
68 f2fs_put_page(page, 0);
69 } else {
70 f2fs_add_link(&dent, inode);
71 }
72 iput(dir);
73out:
74 kunmap(ipage);
75 return err;
76}
77
78static int recover_inode(struct inode *inode, struct page *node_page)
79{
80 void *kaddr = page_address(node_page);
81 struct f2fs_node *raw_node = (struct f2fs_node *)kaddr;
82 struct f2fs_inode *raw_inode = &(raw_node->i);
83
84 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
85 i_size_write(inode, le64_to_cpu(raw_inode->i_size));
86 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
87 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
88 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
89 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
90 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
91 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
92
93 return recover_dentry(node_page, inode);
94}
95
96static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
97{
98 unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
99 struct curseg_info *curseg;
100 struct page *page;
101 block_t blkaddr;
102 int err = 0;
103
104 /* get node pages in the current segment */
105 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
106 blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff;
107
108 /* read node page */
109 page = alloc_page(GFP_F2FS_ZERO);
110 if (IS_ERR(page))
111 return PTR_ERR(page);
112 lock_page(page);
113
114 while (1) {
115 struct fsync_inode_entry *entry;
116
117 if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
118 goto out;
119
120 if (cp_ver != cpver_of_node(page))
121 goto out;
122
123 if (!is_fsync_dnode(page))
124 goto next;
125
126 entry = get_fsync_inode(head, ino_of_node(page));
127 if (entry) {
128 entry->blkaddr = blkaddr;
129 if (IS_INODE(page) && is_dent_dnode(page))
130 set_inode_flag(F2FS_I(entry->inode),
131 FI_INC_LINK);
132 } else {
133 if (IS_INODE(page) && is_dent_dnode(page)) {
134 if (recover_inode_page(sbi, page)) {
135 err = -ENOMEM;
136 goto out;
137 }
138 }
139
140 /* add this fsync inode to the list */
141 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS);
142 if (!entry) {
143 err = -ENOMEM;
144 goto out;
145 }
146
147 INIT_LIST_HEAD(&entry->list);
148 list_add_tail(&entry->list, head);
149
150 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
151 if (IS_ERR(entry->inode)) {
152 err = PTR_ERR(entry->inode);
153 goto out;
154 }
155 entry->blkaddr = blkaddr;
156 }
157 if (IS_INODE(page)) {
158 err = recover_inode(entry->inode, page);
159 if (err)
160 goto out;
161 }
162next:
163 /* check next segment */
164 blkaddr = next_blkaddr_of_node(page);
165 ClearPageUptodate(page);
166 }
167out:
168 unlock_page(page);
169 __free_pages(page, 0);
170 return err;
171}
172
173static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi,
174 struct list_head *head)
175{
176 struct list_head *this;
177 struct fsync_inode_entry *entry;
178 list_for_each(this, head) {
179 entry = list_entry(this, struct fsync_inode_entry, list);
180 iput(entry->inode);
181 list_del(&entry->list);
182 kmem_cache_free(fsync_entry_slab, entry);
183 }
184}
185
186static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
187 block_t blkaddr)
188{
189 struct seg_entry *sentry;
190 unsigned int segno = GET_SEGNO(sbi, blkaddr);
191 unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
192 (sbi->blocks_per_seg - 1);
193 struct f2fs_summary sum;
194 nid_t ino;
195 void *kaddr;
196 struct inode *inode;
197 struct page *node_page;
198 block_t bidx;
199 int i;
200
201 sentry = get_seg_entry(sbi, segno);
202 if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
203 return;
204
205 /* Get the previous summary */
206 for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
207 struct curseg_info *curseg = CURSEG_I(sbi, i);
208 if (curseg->segno == segno) {
209 sum = curseg->sum_blk->entries[blkoff];
210 break;
211 }
212 }
213 if (i > CURSEG_COLD_DATA) {
214 struct page *sum_page = get_sum_page(sbi, segno);
215 struct f2fs_summary_block *sum_node;
216 kaddr = page_address(sum_page);
217 sum_node = (struct f2fs_summary_block *)kaddr;
218 sum = sum_node->entries[blkoff];
219 f2fs_put_page(sum_page, 1);
220 }
221
222 /* Get the node page */
223 node_page = get_node_page(sbi, le32_to_cpu(sum.nid));
224 bidx = start_bidx_of_node(ofs_of_node(node_page)) +
225 le16_to_cpu(sum.ofs_in_node);
226 ino = ino_of_node(node_page);
227 f2fs_put_page(node_page, 1);
228
229 /* Deallocate previous index in the node page */
230 inode = f2fs_iget_nowait(sbi->sb, ino);
231 truncate_hole(inode, bidx, bidx + 1);
232 iput(inode);
233}
234
235static void do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
236 struct page *page, block_t blkaddr)
237{
238 unsigned int start, end;
239 struct dnode_of_data dn;
240 struct f2fs_summary sum;
241 struct node_info ni;
242
243 start = start_bidx_of_node(ofs_of_node(page));
244 if (IS_INODE(page))
245 end = start + ADDRS_PER_INODE;
246 else
247 end = start + ADDRS_PER_BLOCK;
248
249 set_new_dnode(&dn, inode, NULL, NULL, 0);
250 if (get_dnode_of_data(&dn, start, 0))
251 return;
252
253 wait_on_page_writeback(dn.node_page);
254
255 get_node_info(sbi, dn.nid, &ni);
256 BUG_ON(ni.ino != ino_of_node(page));
257 BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page));
258
259 for (; start < end; start++) {
260 block_t src, dest;
261
262 src = datablock_addr(dn.node_page, dn.ofs_in_node);
263 dest = datablock_addr(page, dn.ofs_in_node);
264
265 if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
266 if (src == NULL_ADDR) {
267 int err = reserve_new_block(&dn);
268 /* We should not get -ENOSPC */
269 BUG_ON(err);
270 }
271
272 /* Check the previous node page having this index */
273 check_index_in_prev_nodes(sbi, dest);
274
275 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
276
277 /* write dummy data page */
278 recover_data_page(sbi, NULL, &sum, src, dest);
279 update_extent_cache(dest, &dn);
280 }
281 dn.ofs_in_node++;
282 }
283
284 /* write node page in place */
285 set_summary(&sum, dn.nid, 0, 0);
286 if (IS_INODE(dn.node_page))
287 sync_inode_page(&dn);
288
289 copy_node_footer(dn.node_page, page);
290 fill_node_footer(dn.node_page, dn.nid, ni.ino,
291 ofs_of_node(page), false);
292 set_page_dirty(dn.node_page);
293
294 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
295 f2fs_put_dnode(&dn);
296}
297
298static void recover_data(struct f2fs_sb_info *sbi,
299 struct list_head *head, int type)
300{
301 unsigned long long cp_ver = le64_to_cpu(sbi->ckpt->checkpoint_ver);
302 struct curseg_info *curseg;
303 struct page *page;
304 block_t blkaddr;
305
306 /* get node pages in the current segment */
307 curseg = CURSEG_I(sbi, type);
308 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
309
310 /* read node page */
311 page = alloc_page(GFP_NOFS | __GFP_ZERO);
312 if (IS_ERR(page))
313 return;
314 lock_page(page);
315
316 while (1) {
317 struct fsync_inode_entry *entry;
318
319 if (f2fs_readpage(sbi, page, blkaddr, READ_SYNC))
320 goto out;
321
322 if (cp_ver != cpver_of_node(page))
323 goto out;
324
325 entry = get_fsync_inode(head, ino_of_node(page));
326 if (!entry)
327 goto next;
328
329 do_recover_data(sbi, entry->inode, page, blkaddr);
330
331 if (entry->blkaddr == blkaddr) {
332 iput(entry->inode);
333 list_del(&entry->list);
334 kmem_cache_free(fsync_entry_slab, entry);
335 }
336next:
337 /* check next segment */
338 blkaddr = next_blkaddr_of_node(page);
339 ClearPageUptodate(page);
340 }
341out:
342 unlock_page(page);
343 __free_pages(page, 0);
344
345 allocate_new_segments(sbi);
346}
347
348void recover_fsync_data(struct f2fs_sb_info *sbi)
349{
350 struct list_head inode_list;
351
352 fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
353 sizeof(struct fsync_inode_entry), NULL);
354 if (unlikely(!fsync_entry_slab))
355 return;
356
357 INIT_LIST_HEAD(&inode_list);
358
359 /* step #1: find fsynced inode numbers */
360 if (find_fsync_dnodes(sbi, &inode_list))
361 goto out;
362
363 if (list_empty(&inode_list))
364 goto out;
365
366 /* step #2: recover data */
367 sbi->por_doing = 1;
368 recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
369 sbi->por_doing = 0;
370 BUG_ON(!list_empty(&inode_list));
371out:
372 destroy_fsync_dnodes(sbi, &inode_list);
373 kmem_cache_destroy(fsync_entry_slab);
374 write_checkpoint(sbi, false, false);
375}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
new file mode 100644
index 000000000000..1b26e4ea1016
--- /dev/null
+++ b/fs/f2fs/segment.c
@@ -0,0 +1,1791 @@
1/*
2 * fs/f2fs/segment.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/bio.h>
14#include <linux/blkdev.h>
15#include <linux/vmalloc.h>
16
17#include "f2fs.h"
18#include "segment.h"
19#include "node.h"
20
21static int need_to_flush(struct f2fs_sb_info *sbi)
22{
23 unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) *
24 sbi->segs_per_sec;
25 int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1)
26 >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
27 int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1)
28 >> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
29
30 if (sbi->por_doing)
31 return 0;
32
33 if (free_sections(sbi) <= (node_secs + 2 * dent_secs +
34 reserved_sections(sbi)))
35 return 1;
36 return 0;
37}
38
39/*
40 * This function balances dirty node and dentry pages.
41 * In addition, it controls garbage collection.
42 */
43void f2fs_balance_fs(struct f2fs_sb_info *sbi)
44{
45 struct writeback_control wbc = {
46 .sync_mode = WB_SYNC_ALL,
47 .nr_to_write = LONG_MAX,
48 .for_reclaim = 0,
49 };
50
51 if (sbi->por_doing)
52 return;
53
54 /*
55 * We should do checkpoint when there are so many dirty node pages
56 * with enough free segments. After then, we should do GC.
57 */
58 if (need_to_flush(sbi)) {
59 sync_dirty_dir_inodes(sbi);
60 sync_node_pages(sbi, 0, &wbc);
61 }
62
63 if (has_not_enough_free_secs(sbi)) {
64 mutex_lock(&sbi->gc_mutex);
65 f2fs_gc(sbi, 1);
66 }
67}
68
69static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
70 enum dirty_type dirty_type)
71{
72 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
73
74 /* need not be added */
75 if (IS_CURSEG(sbi, segno))
76 return;
77
78 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
79 dirty_i->nr_dirty[dirty_type]++;
80
81 if (dirty_type == DIRTY) {
82 struct seg_entry *sentry = get_seg_entry(sbi, segno);
83 dirty_type = sentry->type;
84 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
85 dirty_i->nr_dirty[dirty_type]++;
86 }
87}
88
89static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
90 enum dirty_type dirty_type)
91{
92 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
93
94 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type]))
95 dirty_i->nr_dirty[dirty_type]--;
96
97 if (dirty_type == DIRTY) {
98 struct seg_entry *sentry = get_seg_entry(sbi, segno);
99 dirty_type = sentry->type;
100 if (test_and_clear_bit(segno,
101 dirty_i->dirty_segmap[dirty_type]))
102 dirty_i->nr_dirty[dirty_type]--;
103 clear_bit(segno, dirty_i->victim_segmap[FG_GC]);
104 clear_bit(segno, dirty_i->victim_segmap[BG_GC]);
105 }
106}
107
108/*
109 * Should not occur error such as -ENOMEM.
110 * Adding dirty entry into seglist is not critical operation.
111 * If a given segment is one of current working segments, it won't be added.
112 */
113void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
114{
115 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
116 unsigned short valid_blocks;
117
118 if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno))
119 return;
120
121 mutex_lock(&dirty_i->seglist_lock);
122
123 valid_blocks = get_valid_blocks(sbi, segno, 0);
124
125 if (valid_blocks == 0) {
126 __locate_dirty_segment(sbi, segno, PRE);
127 __remove_dirty_segment(sbi, segno, DIRTY);
128 } else if (valid_blocks < sbi->blocks_per_seg) {
129 __locate_dirty_segment(sbi, segno, DIRTY);
130 } else {
131 /* Recovery routine with SSR needs this */
132 __remove_dirty_segment(sbi, segno, DIRTY);
133 }
134
135 mutex_unlock(&dirty_i->seglist_lock);
136 return;
137}
138
139/*
140 * Should call clear_prefree_segments after checkpoint is done.
141 */
142static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
143{
144 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
145 unsigned int segno, offset = 0;
146 unsigned int total_segs = TOTAL_SEGS(sbi);
147
148 mutex_lock(&dirty_i->seglist_lock);
149 while (1) {
150 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
151 offset);
152 if (segno >= total_segs)
153 break;
154 __set_test_and_free(sbi, segno);
155 offset = segno + 1;
156 }
157 mutex_unlock(&dirty_i->seglist_lock);
158}
159
160void clear_prefree_segments(struct f2fs_sb_info *sbi)
161{
162 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
163 unsigned int segno, offset = 0;
164 unsigned int total_segs = TOTAL_SEGS(sbi);
165
166 mutex_lock(&dirty_i->seglist_lock);
167 while (1) {
168 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
169 offset);
170 if (segno >= total_segs)
171 break;
172
173 offset = segno + 1;
174 if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
175 dirty_i->nr_dirty[PRE]--;
176
177 /* Let's use trim */
178 if (test_opt(sbi, DISCARD))
179 blkdev_issue_discard(sbi->sb->s_bdev,
180 START_BLOCK(sbi, segno) <<
181 sbi->log_sectors_per_block,
182 1 << (sbi->log_sectors_per_block +
183 sbi->log_blocks_per_seg),
184 GFP_NOFS, 0);
185 }
186 mutex_unlock(&dirty_i->seglist_lock);
187}
188
189static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
190{
191 struct sit_info *sit_i = SIT_I(sbi);
192 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap))
193 sit_i->dirty_sentries++;
194}
195
196static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
197 unsigned int segno, int modified)
198{
199 struct seg_entry *se = get_seg_entry(sbi, segno);
200 se->type = type;
201 if (modified)
202 __mark_sit_entry_dirty(sbi, segno);
203}
204
205static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
206{
207 struct seg_entry *se;
208 unsigned int segno, offset;
209 long int new_vblocks;
210
211 segno = GET_SEGNO(sbi, blkaddr);
212
213 se = get_seg_entry(sbi, segno);
214 new_vblocks = se->valid_blocks + del;
215 offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
216
217 BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) ||
218 (new_vblocks > sbi->blocks_per_seg)));
219
220 se->valid_blocks = new_vblocks;
221 se->mtime = get_mtime(sbi);
222 SIT_I(sbi)->max_mtime = se->mtime;
223
224 /* Update valid block bitmap */
225 if (del > 0) {
226 if (f2fs_set_bit(offset, se->cur_valid_map))
227 BUG();
228 } else {
229 if (!f2fs_clear_bit(offset, se->cur_valid_map))
230 BUG();
231 }
232 if (!f2fs_test_bit(offset, se->ckpt_valid_map))
233 se->ckpt_valid_blocks += del;
234
235 __mark_sit_entry_dirty(sbi, segno);
236
237 /* update total number of valid blocks to be written in ckpt area */
238 SIT_I(sbi)->written_valid_blocks += del;
239
240 if (sbi->segs_per_sec > 1)
241 get_sec_entry(sbi, segno)->valid_blocks += del;
242}
243
244static void refresh_sit_entry(struct f2fs_sb_info *sbi,
245 block_t old_blkaddr, block_t new_blkaddr)
246{
247 update_sit_entry(sbi, new_blkaddr, 1);
248 if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
249 update_sit_entry(sbi, old_blkaddr, -1);
250}
251
252void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
253{
254 unsigned int segno = GET_SEGNO(sbi, addr);
255 struct sit_info *sit_i = SIT_I(sbi);
256
257 BUG_ON(addr == NULL_ADDR);
258 if (addr == NEW_ADDR)
259 return;
260
261 /* add it into sit main buffer */
262 mutex_lock(&sit_i->sentry_lock);
263
264 update_sit_entry(sbi, addr, -1);
265
266 /* add it into dirty seglist */
267 locate_dirty_segment(sbi, segno);
268
269 mutex_unlock(&sit_i->sentry_lock);
270}
271
272/*
273 * This function should be resided under the curseg_mutex lock
274 */
275static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
276 struct f2fs_summary *sum, unsigned short offset)
277{
278 struct curseg_info *curseg = CURSEG_I(sbi, type);
279 void *addr = curseg->sum_blk;
280 addr += offset * sizeof(struct f2fs_summary);
281 memcpy(addr, sum, sizeof(struct f2fs_summary));
282 return;
283}
284
285/*
286 * Calculate the number of current summary pages for writing
287 */
288int npages_for_summary_flush(struct f2fs_sb_info *sbi)
289{
290 int total_size_bytes = 0;
291 int valid_sum_count = 0;
292 int i, sum_space;
293
294 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
295 if (sbi->ckpt->alloc_type[i] == SSR)
296 valid_sum_count += sbi->blocks_per_seg;
297 else
298 valid_sum_count += curseg_blkoff(sbi, i);
299 }
300
301 total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1)
302 + sizeof(struct nat_journal) + 2
303 + sizeof(struct sit_journal) + 2;
304 sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
305 if (total_size_bytes < sum_space)
306 return 1;
307 else if (total_size_bytes < 2 * sum_space)
308 return 2;
309 return 3;
310}
311
/*
 * Fetch the meta page holding the summary block of segment @segno.
 * The caller is responsible for putting the returned page.
 */
struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
{
	struct page *sum_page = get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno));

	return sum_page;
}
319
320static void write_sum_page(struct f2fs_sb_info *sbi,
321 struct f2fs_summary_block *sum_blk, block_t blk_addr)
322{
323 struct page *page = grab_meta_page(sbi, blk_addr);
324 void *kaddr = page_address(page);
325 memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE);
326 set_page_dirty(page);
327 f2fs_put_page(page, 1);
328}
329
330static unsigned int check_prefree_segments(struct f2fs_sb_info *sbi,
331 int ofs_unit, int type)
332{
333 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
334 unsigned long *prefree_segmap = dirty_i->dirty_segmap[PRE];
335 unsigned int segno, next_segno, i;
336 int ofs = 0;
337
338 /*
339 * If there is not enough reserved sections,
340 * we should not reuse prefree segments.
341 */
342 if (has_not_enough_free_secs(sbi))
343 return NULL_SEGNO;
344
345 /*
346 * NODE page should not reuse prefree segment,
347 * since those information is used for SPOR.
348 */
349 if (IS_NODESEG(type))
350 return NULL_SEGNO;
351next:
352 segno = find_next_bit(prefree_segmap, TOTAL_SEGS(sbi), ofs++);
353 ofs = ((segno / ofs_unit) * ofs_unit) + ofs_unit;
354 if (segno < TOTAL_SEGS(sbi)) {
355 /* skip intermediate segments in a section */
356 if (segno % ofs_unit)
357 goto next;
358
359 /* skip if whole section is not prefree */
360 next_segno = find_next_zero_bit(prefree_segmap,
361 TOTAL_SEGS(sbi), segno + 1);
362 if (next_segno - segno < ofs_unit)
363 goto next;
364
365 /* skip if whole section was not free at the last checkpoint */
366 for (i = 0; i < ofs_unit; i++)
367 if (get_seg_entry(sbi, segno)->ckpt_valid_blocks)
368 goto next;
369 return segno;
370 }
371 return NULL_SEGNO;
372}
373
374/*
375 * Find a new segment from the free segments bitmap to right order
376 * This function should be returned with success, otherwise BUG
377 */
378static void get_new_segment(struct f2fs_sb_info *sbi,
379 unsigned int *newseg, bool new_sec, int dir)
380{
381 struct free_segmap_info *free_i = FREE_I(sbi);
382 unsigned int total_secs = sbi->total_sections;
383 unsigned int segno, secno, zoneno;
384 unsigned int total_zones = sbi->total_sections / sbi->secs_per_zone;
385 unsigned int hint = *newseg / sbi->segs_per_sec;
386 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
387 unsigned int left_start = hint;
388 bool init = true;
389 int go_left = 0;
390 int i;
391
392 write_lock(&free_i->segmap_lock);
393
394 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
395 segno = find_next_zero_bit(free_i->free_segmap,
396 TOTAL_SEGS(sbi), *newseg + 1);
397 if (segno < TOTAL_SEGS(sbi))
398 goto got_it;
399 }
400find_other_zone:
401 secno = find_next_zero_bit(free_i->free_secmap, total_secs, hint);
402 if (secno >= total_secs) {
403 if (dir == ALLOC_RIGHT) {
404 secno = find_next_zero_bit(free_i->free_secmap,
405 total_secs, 0);
406 BUG_ON(secno >= total_secs);
407 } else {
408 go_left = 1;
409 left_start = hint - 1;
410 }
411 }
412 if (go_left == 0)
413 goto skip_left;
414
415 while (test_bit(left_start, free_i->free_secmap)) {
416 if (left_start > 0) {
417 left_start--;
418 continue;
419 }
420 left_start = find_next_zero_bit(free_i->free_secmap,
421 total_secs, 0);
422 BUG_ON(left_start >= total_secs);
423 break;
424 }
425 secno = left_start;
426skip_left:
427 hint = secno;
428 segno = secno * sbi->segs_per_sec;
429 zoneno = secno / sbi->secs_per_zone;
430
431 /* give up on finding another zone */
432 if (!init)
433 goto got_it;
434 if (sbi->secs_per_zone == 1)
435 goto got_it;
436 if (zoneno == old_zoneno)
437 goto got_it;
438 if (dir == ALLOC_LEFT) {
439 if (!go_left && zoneno + 1 >= total_zones)
440 goto got_it;
441 if (go_left && zoneno == 0)
442 goto got_it;
443 }
444 for (i = 0; i < NR_CURSEG_TYPE; i++)
445 if (CURSEG_I(sbi, i)->zone == zoneno)
446 break;
447
448 if (i < NR_CURSEG_TYPE) {
449 /* zone is in user, try another */
450 if (go_left)
451 hint = zoneno * sbi->secs_per_zone - 1;
452 else if (zoneno + 1 >= total_zones)
453 hint = 0;
454 else
455 hint = (zoneno + 1) * sbi->secs_per_zone;
456 init = false;
457 goto find_other_zone;
458 }
459got_it:
460 /* set it as dirty segment in free segmap */
461 BUG_ON(test_bit(segno, free_i->free_segmap));
462 __set_inuse(sbi, segno);
463 *newseg = segno;
464 write_unlock(&free_i->segmap_lock);
465}
466
467static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
468{
469 struct curseg_info *curseg = CURSEG_I(sbi, type);
470 struct summary_footer *sum_footer;
471
472 curseg->segno = curseg->next_segno;
473 curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno);
474 curseg->next_blkoff = 0;
475 curseg->next_segno = NULL_SEGNO;
476
477 sum_footer = &(curseg->sum_blk->footer);
478 memset(sum_footer, 0, sizeof(struct summary_footer));
479 if (IS_DATASEG(type))
480 SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA);
481 if (IS_NODESEG(type))
482 SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE);
483 __set_sit_entry_type(sbi, type, curseg->segno, modified);
484}
485
486/*
487 * Allocate a current working segment.
488 * This function always allocates a free segment in LFS manner.
489 */
490static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
491{
492 struct curseg_info *curseg = CURSEG_I(sbi, type);
493 unsigned int segno = curseg->segno;
494 int dir = ALLOC_LEFT;
495
496 write_sum_page(sbi, curseg->sum_blk,
497 GET_SUM_BLOCK(sbi, curseg->segno));
498 if (type == CURSEG_WARM_DATA || type == CURSEG_COLD_DATA)
499 dir = ALLOC_RIGHT;
500
501 if (test_opt(sbi, NOHEAP))
502 dir = ALLOC_RIGHT;
503
504 get_new_segment(sbi, &segno, new_sec, dir);
505 curseg->next_segno = segno;
506 reset_curseg(sbi, type, 1);
507 curseg->alloc_type = LFS;
508}
509
510static void __next_free_blkoff(struct f2fs_sb_info *sbi,
511 struct curseg_info *seg, block_t start)
512{
513 struct seg_entry *se = get_seg_entry(sbi, seg->segno);
514 block_t ofs;
515 for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) {
516 if (!f2fs_test_bit(ofs, se->ckpt_valid_map)
517 && !f2fs_test_bit(ofs, se->cur_valid_map))
518 break;
519 }
520 seg->next_blkoff = ofs;
521}
522
523/*
524 * If a segment is written by LFS manner, next block offset is just obtained
525 * by increasing the current block offset. However, if a segment is written by
526 * SSR manner, next block offset obtained by calling __next_free_blkoff
527 */
528static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
529 struct curseg_info *seg)
530{
531 if (seg->alloc_type == SSR)
532 __next_free_blkoff(sbi, seg, seg->next_blkoff + 1);
533 else
534 seg->next_blkoff++;
535}
536
537/*
538 * This function always allocates a used segment (from dirty seglist) by SSR
539 * manner, so it should recover the existing segment information of valid blocks
540 */
541static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
542{
543 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
544 struct curseg_info *curseg = CURSEG_I(sbi, type);
545 unsigned int new_segno = curseg->next_segno;
546 struct f2fs_summary_block *sum_node;
547 struct page *sum_page;
548
549 write_sum_page(sbi, curseg->sum_blk,
550 GET_SUM_BLOCK(sbi, curseg->segno));
551 __set_test_and_inuse(sbi, new_segno);
552
553 mutex_lock(&dirty_i->seglist_lock);
554 __remove_dirty_segment(sbi, new_segno, PRE);
555 __remove_dirty_segment(sbi, new_segno, DIRTY);
556 mutex_unlock(&dirty_i->seglist_lock);
557
558 reset_curseg(sbi, type, 1);
559 curseg->alloc_type = SSR;
560 __next_free_blkoff(sbi, curseg, 0);
561
562 if (reuse) {
563 sum_page = get_sum_page(sbi, new_segno);
564 sum_node = (struct f2fs_summary_block *)page_address(sum_page);
565 memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
566 f2fs_put_page(sum_page, 1);
567 }
568}
569
570/*
571 * flush out current segment and replace it with new segment
572 * This function should be returned with success, otherwise BUG
573 */
574static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
575 int type, bool force)
576{
577 struct curseg_info *curseg = CURSEG_I(sbi, type);
578 unsigned int ofs_unit;
579
580 if (force) {
581 new_curseg(sbi, type, true);
582 goto out;
583 }
584
585 ofs_unit = need_SSR(sbi) ? 1 : sbi->segs_per_sec;
586 curseg->next_segno = check_prefree_segments(sbi, ofs_unit, type);
587
588 if (curseg->next_segno != NULL_SEGNO)
589 change_curseg(sbi, type, false);
590 else if (type == CURSEG_WARM_NODE)
591 new_curseg(sbi, type, false);
592 else if (need_SSR(sbi) && get_ssr_segment(sbi, type))
593 change_curseg(sbi, type, true);
594 else
595 new_curseg(sbi, type, false);
596out:
597 sbi->segment_count[curseg->alloc_type]++;
598}
599
600void allocate_new_segments(struct f2fs_sb_info *sbi)
601{
602 struct curseg_info *curseg;
603 unsigned int old_curseg;
604 int i;
605
606 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
607 curseg = CURSEG_I(sbi, i);
608 old_curseg = curseg->segno;
609 SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
610 locate_dirty_segment(sbi, old_curseg);
611 }
612}
613
614static const struct segment_allocation default_salloc_ops = {
615 .allocate_segment = allocate_segment_by_default,
616};
617
618static void f2fs_end_io_write(struct bio *bio, int err)
619{
620 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
621 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
622 struct bio_private *p = bio->bi_private;
623
624 do {
625 struct page *page = bvec->bv_page;
626
627 if (--bvec >= bio->bi_io_vec)
628 prefetchw(&bvec->bv_page->flags);
629 if (!uptodate) {
630 SetPageError(page);
631 if (page->mapping)
632 set_bit(AS_EIO, &page->mapping->flags);
633 set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
634 set_page_dirty(page);
635 }
636 end_page_writeback(page);
637 dec_page_count(p->sbi, F2FS_WRITEBACK);
638 } while (bvec >= bio->bi_io_vec);
639
640 if (p->is_sync)
641 complete(p->wait);
642 kfree(p);
643 bio_put(bio);
644}
645
646struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
647{
648 struct bio *bio;
649 struct bio_private *priv;
650retry:
651 priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
652 if (!priv) {
653 cond_resched();
654 goto retry;
655 }
656
657 /* No failure on bio allocation */
658 bio = bio_alloc(GFP_NOIO, npages);
659 bio->bi_bdev = bdev;
660 bio->bi_private = priv;
661 return bio;
662}
663
664static void do_submit_bio(struct f2fs_sb_info *sbi,
665 enum page_type type, bool sync)
666{
667 int rw = sync ? WRITE_SYNC : WRITE;
668 enum page_type btype = type > META ? META : type;
669
670 if (type >= META_FLUSH)
671 rw = WRITE_FLUSH_FUA;
672
673 if (sbi->bio[btype]) {
674 struct bio_private *p = sbi->bio[btype]->bi_private;
675 p->sbi = sbi;
676 sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
677 if (type == META_FLUSH) {
678 DECLARE_COMPLETION_ONSTACK(wait);
679 p->is_sync = true;
680 p->wait = &wait;
681 submit_bio(rw, sbi->bio[btype]);
682 wait_for_completion(&wait);
683 } else {
684 p->is_sync = false;
685 submit_bio(rw, sbi->bio[btype]);
686 }
687 sbi->bio[btype] = NULL;
688 }
689}
690
691void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
692{
693 down_write(&sbi->bio_sem);
694 do_submit_bio(sbi, type, sync);
695 up_write(&sbi->bio_sem);
696}
697
698static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
699 block_t blk_addr, enum page_type type)
700{
701 struct block_device *bdev = sbi->sb->s_bdev;
702
703 verify_block_addr(sbi, blk_addr);
704
705 down_write(&sbi->bio_sem);
706
707 inc_page_count(sbi, F2FS_WRITEBACK);
708
709 if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
710 do_submit_bio(sbi, type, false);
711alloc_new:
712 if (sbi->bio[type] == NULL) {
713 sbi->bio[type] = f2fs_bio_alloc(bdev, bio_get_nr_vecs(bdev));
714 sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
715 /*
716 * The end_io will be assigned at the sumbission phase.
717 * Until then, let bio_add_page() merge consecutive IOs as much
718 * as possible.
719 */
720 }
721
722 if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
723 PAGE_CACHE_SIZE) {
724 do_submit_bio(sbi, type, false);
725 goto alloc_new;
726 }
727
728 sbi->last_block_in_bio[type] = blk_addr;
729
730 up_write(&sbi->bio_sem);
731}
732
733static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
734{
735 struct curseg_info *curseg = CURSEG_I(sbi, type);
736 if (curseg->next_blkoff < sbi->blocks_per_seg)
737 return true;
738 return false;
739}
740
741static int __get_segment_type_2(struct page *page, enum page_type p_type)
742{
743 if (p_type == DATA)
744 return CURSEG_HOT_DATA;
745 else
746 return CURSEG_HOT_NODE;
747}
748
749static int __get_segment_type_4(struct page *page, enum page_type p_type)
750{
751 if (p_type == DATA) {
752 struct inode *inode = page->mapping->host;
753
754 if (S_ISDIR(inode->i_mode))
755 return CURSEG_HOT_DATA;
756 else
757 return CURSEG_COLD_DATA;
758 } else {
759 if (IS_DNODE(page) && !is_cold_node(page))
760 return CURSEG_HOT_NODE;
761 else
762 return CURSEG_COLD_NODE;
763 }
764}
765
766static int __get_segment_type_6(struct page *page, enum page_type p_type)
767{
768 if (p_type == DATA) {
769 struct inode *inode = page->mapping->host;
770
771 if (S_ISDIR(inode->i_mode))
772 return CURSEG_HOT_DATA;
773 else if (is_cold_data(page) || is_cold_file(inode))
774 return CURSEG_COLD_DATA;
775 else
776 return CURSEG_WARM_DATA;
777 } else {
778 if (IS_DNODE(page))
779 return is_cold_node(page) ? CURSEG_WARM_NODE :
780 CURSEG_HOT_NODE;
781 else
782 return CURSEG_COLD_NODE;
783 }
784}
785
786static int __get_segment_type(struct page *page, enum page_type p_type)
787{
788 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
789 switch (sbi->active_logs) {
790 case 2:
791 return __get_segment_type_2(page, p_type);
792 case 4:
793 return __get_segment_type_4(page, p_type);
794 case 6:
795 return __get_segment_type_6(page, p_type);
796 default:
797 BUG();
798 }
799}
800
801static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
802 block_t old_blkaddr, block_t *new_blkaddr,
803 struct f2fs_summary *sum, enum page_type p_type)
804{
805 struct sit_info *sit_i = SIT_I(sbi);
806 struct curseg_info *curseg;
807 unsigned int old_cursegno;
808 int type;
809
810 type = __get_segment_type(page, p_type);
811 curseg = CURSEG_I(sbi, type);
812
813 mutex_lock(&curseg->curseg_mutex);
814
815 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
816 old_cursegno = curseg->segno;
817
818 /*
819 * __add_sum_entry should be resided under the curseg_mutex
820 * because, this function updates a summary entry in the
821 * current summary block.
822 */
823 __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
824
825 mutex_lock(&sit_i->sentry_lock);
826 __refresh_next_blkoff(sbi, curseg);
827 sbi->block_count[curseg->alloc_type]++;
828
829 /*
830 * SIT information should be updated before segment allocation,
831 * since SSR needs latest valid block information.
832 */
833 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
834
835 if (!__has_curseg_space(sbi, type))
836 sit_i->s_ops->allocate_segment(sbi, type, false);
837
838 locate_dirty_segment(sbi, old_cursegno);
839 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
840 mutex_unlock(&sit_i->sentry_lock);
841
842 if (p_type == NODE)
843 fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
844
845 /* writeout dirty page into bdev */
846 submit_write_page(sbi, page, *new_blkaddr, p_type);
847
848 mutex_unlock(&curseg->curseg_mutex);
849}
850
851int write_meta_page(struct f2fs_sb_info *sbi, struct page *page,
852 struct writeback_control *wbc)
853{
854 if (wbc->for_reclaim)
855 return AOP_WRITEPAGE_ACTIVATE;
856
857 set_page_writeback(page);
858 submit_write_page(sbi, page, page->index, META);
859 return 0;
860}
861
862void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
863 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
864{
865 struct f2fs_summary sum;
866 set_summary(&sum, nid, 0, 0);
867 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE);
868}
869
870void write_data_page(struct inode *inode, struct page *page,
871 struct dnode_of_data *dn, block_t old_blkaddr,
872 block_t *new_blkaddr)
873{
874 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
875 struct f2fs_summary sum;
876 struct node_info ni;
877
878 BUG_ON(old_blkaddr == NULL_ADDR);
879 get_node_info(sbi, dn->nid, &ni);
880 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
881
882 do_write_page(sbi, page, old_blkaddr,
883 new_blkaddr, &sum, DATA);
884}
885
886void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page,
887 block_t old_blk_addr)
888{
889 submit_write_page(sbi, page, old_blk_addr, DATA);
890}
891
892void recover_data_page(struct f2fs_sb_info *sbi,
893 struct page *page, struct f2fs_summary *sum,
894 block_t old_blkaddr, block_t new_blkaddr)
895{
896 struct sit_info *sit_i = SIT_I(sbi);
897 struct curseg_info *curseg;
898 unsigned int segno, old_cursegno;
899 struct seg_entry *se;
900 int type;
901
902 segno = GET_SEGNO(sbi, new_blkaddr);
903 se = get_seg_entry(sbi, segno);
904 type = se->type;
905
906 if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
907 if (old_blkaddr == NULL_ADDR)
908 type = CURSEG_COLD_DATA;
909 else
910 type = CURSEG_WARM_DATA;
911 }
912 curseg = CURSEG_I(sbi, type);
913
914 mutex_lock(&curseg->curseg_mutex);
915 mutex_lock(&sit_i->sentry_lock);
916
917 old_cursegno = curseg->segno;
918
919 /* change the current segment */
920 if (segno != curseg->segno) {
921 curseg->next_segno = segno;
922 change_curseg(sbi, type, true);
923 }
924
925 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
926 (sbi->blocks_per_seg - 1);
927 __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
928
929 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
930
931 locate_dirty_segment(sbi, old_cursegno);
932 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
933
934 mutex_unlock(&sit_i->sentry_lock);
935 mutex_unlock(&curseg->curseg_mutex);
936}
937
938void rewrite_node_page(struct f2fs_sb_info *sbi,
939 struct page *page, struct f2fs_summary *sum,
940 block_t old_blkaddr, block_t new_blkaddr)
941{
942 struct sit_info *sit_i = SIT_I(sbi);
943 int type = CURSEG_WARM_NODE;
944 struct curseg_info *curseg;
945 unsigned int segno, old_cursegno;
946 block_t next_blkaddr = next_blkaddr_of_node(page);
947 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
948
949 curseg = CURSEG_I(sbi, type);
950
951 mutex_lock(&curseg->curseg_mutex);
952 mutex_lock(&sit_i->sentry_lock);
953
954 segno = GET_SEGNO(sbi, new_blkaddr);
955 old_cursegno = curseg->segno;
956
957 /* change the current segment */
958 if (segno != curseg->segno) {
959 curseg->next_segno = segno;
960 change_curseg(sbi, type, true);
961 }
962 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
963 (sbi->blocks_per_seg - 1);
964 __add_sum_entry(sbi, type, sum, curseg->next_blkoff);
965
966 /* change the current log to the next block addr in advance */
967 if (next_segno != segno) {
968 curseg->next_segno = next_segno;
969 change_curseg(sbi, type, true);
970 }
971 curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) &
972 (sbi->blocks_per_seg - 1);
973
974 /* rewrite node page */
975 set_page_writeback(page);
976 submit_write_page(sbi, page, new_blkaddr, NODE);
977 f2fs_submit_bio(sbi, NODE, true);
978 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
979
980 locate_dirty_segment(sbi, old_cursegno);
981 locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
982
983 mutex_unlock(&sit_i->sentry_lock);
984 mutex_unlock(&curseg->curseg_mutex);
985}
986
987static int read_compacted_summaries(struct f2fs_sb_info *sbi)
988{
989 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
990 struct curseg_info *seg_i;
991 unsigned char *kaddr;
992 struct page *page;
993 block_t start;
994 int i, j, offset;
995
996 start = start_sum_block(sbi);
997
998 page = get_meta_page(sbi, start++);
999 kaddr = (unsigned char *)page_address(page);
1000
1001 /* Step 1: restore nat cache */
1002 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
1003 memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE);
1004
1005 /* Step 2: restore sit cache */
1006 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
1007 memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE,
1008 SUM_JOURNAL_SIZE);
1009 offset = 2 * SUM_JOURNAL_SIZE;
1010
1011 /* Step 3: restore summary entries */
1012 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
1013 unsigned short blk_off;
1014 unsigned int segno;
1015
1016 seg_i = CURSEG_I(sbi, i);
1017 segno = le32_to_cpu(ckpt->cur_data_segno[i]);
1018 blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]);
1019 seg_i->next_segno = segno;
1020 reset_curseg(sbi, i, 0);
1021 seg_i->alloc_type = ckpt->alloc_type[i];
1022 seg_i->next_blkoff = blk_off;
1023
1024 if (seg_i->alloc_type == SSR)
1025 blk_off = sbi->blocks_per_seg;
1026
1027 for (j = 0; j < blk_off; j++) {
1028 struct f2fs_summary *s;
1029 s = (struct f2fs_summary *)(kaddr + offset);
1030 seg_i->sum_blk->entries[j] = *s;
1031 offset += SUMMARY_SIZE;
1032 if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
1033 SUM_FOOTER_SIZE)
1034 continue;
1035
1036 f2fs_put_page(page, 1);
1037 page = NULL;
1038
1039 page = get_meta_page(sbi, start++);
1040 kaddr = (unsigned char *)page_address(page);
1041 offset = 0;
1042 }
1043 }
1044 f2fs_put_page(page, 1);
1045 return 0;
1046}
1047
1048static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1049{
1050 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1051 struct f2fs_summary_block *sum;
1052 struct curseg_info *curseg;
1053 struct page *new;
1054 unsigned short blk_off;
1055 unsigned int segno = 0;
1056 block_t blk_addr = 0;
1057
1058 /* get segment number and block addr */
1059 if (IS_DATASEG(type)) {
1060 segno = le32_to_cpu(ckpt->cur_data_segno[type]);
1061 blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
1062 CURSEG_HOT_DATA]);
1063 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
1064 blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
1065 else
1066 blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
1067 } else {
1068 segno = le32_to_cpu(ckpt->cur_node_segno[type -
1069 CURSEG_HOT_NODE]);
1070 blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
1071 CURSEG_HOT_NODE]);
1072 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
1073 blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
1074 type - CURSEG_HOT_NODE);
1075 else
1076 blk_addr = GET_SUM_BLOCK(sbi, segno);
1077 }
1078
1079 new = get_meta_page(sbi, blk_addr);
1080 sum = (struct f2fs_summary_block *)page_address(new);
1081
1082 if (IS_NODESEG(type)) {
1083 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
1084 struct f2fs_summary *ns = &sum->entries[0];
1085 int i;
1086 for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
1087 ns->version = 0;
1088 ns->ofs_in_node = 0;
1089 }
1090 } else {
1091 if (restore_node_summary(sbi, segno, sum)) {
1092 f2fs_put_page(new, 1);
1093 return -EINVAL;
1094 }
1095 }
1096 }
1097
1098 /* set uncompleted segment to curseg */
1099 curseg = CURSEG_I(sbi, type);
1100 mutex_lock(&curseg->curseg_mutex);
1101 memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE);
1102 curseg->next_segno = segno;
1103 reset_curseg(sbi, type, 0);
1104 curseg->alloc_type = ckpt->alloc_type[type];
1105 curseg->next_blkoff = blk_off;
1106 mutex_unlock(&curseg->curseg_mutex);
1107 f2fs_put_page(new, 1);
1108 return 0;
1109}
1110
1111static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1112{
1113 int type = CURSEG_HOT_DATA;
1114
1115 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
1116 /* restore for compacted data summary */
1117 if (read_compacted_summaries(sbi))
1118 return -EINVAL;
1119 type = CURSEG_HOT_NODE;
1120 }
1121
1122 for (; type <= CURSEG_COLD_NODE; type++)
1123 if (read_normal_summaries(sbi, type))
1124 return -EINVAL;
1125 return 0;
1126}
1127
1128static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
1129{
1130 struct page *page;
1131 unsigned char *kaddr;
1132 struct f2fs_summary *summary;
1133 struct curseg_info *seg_i;
1134 int written_size = 0;
1135 int i, j;
1136
1137 page = grab_meta_page(sbi, blkaddr++);
1138 kaddr = (unsigned char *)page_address(page);
1139
1140 /* Step 1: write nat cache */
1141 seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA);
1142 memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE);
1143 written_size += SUM_JOURNAL_SIZE;
1144
1145 /* Step 2: write sit cache */
1146 seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA);
1147 memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits,
1148 SUM_JOURNAL_SIZE);
1149 written_size += SUM_JOURNAL_SIZE;
1150
1151 set_page_dirty(page);
1152
1153 /* Step 3: write summary entries */
1154 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
1155 unsigned short blkoff;
1156 seg_i = CURSEG_I(sbi, i);
1157 if (sbi->ckpt->alloc_type[i] == SSR)
1158 blkoff = sbi->blocks_per_seg;
1159 else
1160 blkoff = curseg_blkoff(sbi, i);
1161
1162 for (j = 0; j < blkoff; j++) {
1163 if (!page) {
1164 page = grab_meta_page(sbi, blkaddr++);
1165 kaddr = (unsigned char *)page_address(page);
1166 written_size = 0;
1167 }
1168 summary = (struct f2fs_summary *)(kaddr + written_size);
1169 *summary = seg_i->sum_blk->entries[j];
1170 written_size += SUMMARY_SIZE;
1171 set_page_dirty(page);
1172
1173 if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
1174 SUM_FOOTER_SIZE)
1175 continue;
1176
1177 f2fs_put_page(page, 1);
1178 page = NULL;
1179 }
1180 }
1181 if (page)
1182 f2fs_put_page(page, 1);
1183}
1184
1185static void write_normal_summaries(struct f2fs_sb_info *sbi,
1186 block_t blkaddr, int type)
1187{
1188 int i, end;
1189 if (IS_DATASEG(type))
1190 end = type + NR_CURSEG_DATA_TYPE;
1191 else
1192 end = type + NR_CURSEG_NODE_TYPE;
1193
1194 for (i = type; i < end; i++) {
1195 struct curseg_info *sum = CURSEG_I(sbi, i);
1196 mutex_lock(&sum->curseg_mutex);
1197 write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type));
1198 mutex_unlock(&sum->curseg_mutex);
1199 }
1200}
1201
1202void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1203{
1204 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG))
1205 write_compacted_summaries(sbi, start_blk);
1206 else
1207 write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA);
1208}
1209
1210void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1211{
1212 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
1213 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1214 return;
1215}
1216
1217int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
1218 unsigned int val, int alloc)
1219{
1220 int i;
1221
1222 if (type == NAT_JOURNAL) {
1223 for (i = 0; i < nats_in_cursum(sum); i++) {
1224 if (le32_to_cpu(nid_in_journal(sum, i)) == val)
1225 return i;
1226 }
1227 if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
1228 return update_nats_in_cursum(sum, 1);
1229 } else if (type == SIT_JOURNAL) {
1230 for (i = 0; i < sits_in_cursum(sum); i++)
1231 if (le32_to_cpu(segno_in_journal(sum, i)) == val)
1232 return i;
1233 if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
1234 return update_sits_in_cursum(sum, 1);
1235 }
1236 return -1;
1237}
1238
1239static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
1240 unsigned int segno)
1241{
1242 struct sit_info *sit_i = SIT_I(sbi);
1243 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno);
1244 block_t blk_addr = sit_i->sit_base_addr + offset;
1245
1246 check_seg_range(sbi, segno);
1247
1248 /* calculate sit block address */
1249 if (f2fs_test_bit(offset, sit_i->sit_bitmap))
1250 blk_addr += sit_i->sit_blocks;
1251
1252 return get_meta_page(sbi, blk_addr);
1253}
1254
1255static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1256 unsigned int start)
1257{
1258 struct sit_info *sit_i = SIT_I(sbi);
1259 struct page *src_page, *dst_page;
1260 pgoff_t src_off, dst_off;
1261 void *src_addr, *dst_addr;
1262
1263 src_off = current_sit_addr(sbi, start);
1264 dst_off = next_sit_addr(sbi, src_off);
1265
1266 /* get current sit block page without lock */
1267 src_page = get_meta_page(sbi, src_off);
1268 dst_page = grab_meta_page(sbi, dst_off);
1269 BUG_ON(PageDirty(src_page));
1270
1271 src_addr = page_address(src_page);
1272 dst_addr = page_address(dst_page);
1273 memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE);
1274
1275 set_page_dirty(dst_page);
1276 f2fs_put_page(src_page, 1);
1277
1278 set_to_next_sit(sit_i, start);
1279
1280 return dst_page;
1281}
1282
1283static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
1284{
1285 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1286 struct f2fs_summary_block *sum = curseg->sum_blk;
1287 int i;
1288
1289 /*
1290 * If the journal area in the current summary is full of sit entries,
1291 * all the sit entries will be flushed. Otherwise the sit entries
1292 * are not able to replace with newly hot sit entries.
1293 */
1294 if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) {
1295 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
1296 unsigned int segno;
1297 segno = le32_to_cpu(segno_in_journal(sum, i));
1298 __mark_sit_entry_dirty(sbi, segno);
1299 }
1300 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1301 return 1;
1302 }
1303 return 0;
1304}
1305
1306/*
1307 * CP calls this function, which flushes SIT entries including sit_journal,
1308 * and moves prefree segs to free segs.
1309 */
1310void flush_sit_entries(struct f2fs_sb_info *sbi)
1311{
1312 struct sit_info *sit_i = SIT_I(sbi);
1313 unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
1314 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1315 struct f2fs_summary_block *sum = curseg->sum_blk;
1316 unsigned long nsegs = TOTAL_SEGS(sbi);
1317 struct page *page = NULL;
1318 struct f2fs_sit_block *raw_sit = NULL;
1319 unsigned int start = 0, end = 0;
1320 unsigned int segno = -1;
1321 bool flushed;
1322
1323 mutex_lock(&curseg->curseg_mutex);
1324 mutex_lock(&sit_i->sentry_lock);
1325
1326 /*
1327 * "flushed" indicates whether sit entries in journal are flushed
1328 * to the SIT area or not.
1329 */
1330 flushed = flush_sits_in_journal(sbi);
1331
1332 while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) {
1333 struct seg_entry *se = get_seg_entry(sbi, segno);
1334 int sit_offset, offset;
1335
1336 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1337
1338 if (flushed)
1339 goto to_sit_page;
1340
1341 offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1);
1342 if (offset >= 0) {
1343 segno_in_journal(sum, offset) = cpu_to_le32(segno);
1344 seg_info_to_raw_sit(se, &sit_in_journal(sum, offset));
1345 goto flush_done;
1346 }
1347to_sit_page:
1348 if (!page || (start > segno) || (segno > end)) {
1349 if (page) {
1350 f2fs_put_page(page, 1);
1351 page = NULL;
1352 }
1353
1354 start = START_SEGNO(sit_i, segno);
1355 end = start + SIT_ENTRY_PER_BLOCK - 1;
1356
1357 /* read sit block that will be updated */
1358 page = get_next_sit_page(sbi, start);
1359 raw_sit = page_address(page);
1360 }
1361
1362 /* udpate entry in SIT block */
1363 seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]);
1364flush_done:
1365 __clear_bit(segno, bitmap);
1366 sit_i->dirty_sentries--;
1367 }
1368 mutex_unlock(&sit_i->sentry_lock);
1369 mutex_unlock(&curseg->curseg_mutex);
1370
1371 /* writeout last modified SIT block */
1372 f2fs_put_page(page, 1);
1373
1374 set_prefree_as_free_segments(sbi);
1375}
1376
1377static int build_sit_info(struct f2fs_sb_info *sbi)
1378{
1379 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
1380 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1381 struct sit_info *sit_i;
1382 unsigned int sit_segs, start;
1383 char *src_bitmap, *dst_bitmap;
1384 unsigned int bitmap_size;
1385
1386 /* allocate memory for SIT information */
1387 sit_i = kzalloc(sizeof(struct sit_info), GFP_KERNEL);
1388 if (!sit_i)
1389 return -ENOMEM;
1390
1391 SM_I(sbi)->sit_info = sit_i;
1392
1393 sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry));
1394 if (!sit_i->sentries)
1395 return -ENOMEM;
1396
1397 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1398 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
1399 if (!sit_i->dirty_sentries_bitmap)
1400 return -ENOMEM;
1401
1402 for (start = 0; start < TOTAL_SEGS(sbi); start++) {
1403 sit_i->sentries[start].cur_valid_map
1404 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1405 sit_i->sentries[start].ckpt_valid_map
1406 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1407 if (!sit_i->sentries[start].cur_valid_map
1408 || !sit_i->sentries[start].ckpt_valid_map)
1409 return -ENOMEM;
1410 }
1411
1412 if (sbi->segs_per_sec > 1) {
1413 sit_i->sec_entries = vzalloc(sbi->total_sections *
1414 sizeof(struct sec_entry));
1415 if (!sit_i->sec_entries)
1416 return -ENOMEM;
1417 }
1418
1419 /* get information related with SIT */
1420 sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1;
1421
1422 /* setup SIT bitmap from ckeckpoint pack */
1423 bitmap_size = __bitmap_size(sbi, SIT_BITMAP);
1424 src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP);
1425
1426 dst_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
1427 if (!dst_bitmap)
1428 return -ENOMEM;
1429 memcpy(dst_bitmap, src_bitmap, bitmap_size);
1430
1431 /* init SIT information */
1432 sit_i->s_ops = &default_salloc_ops;
1433
1434 sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
1435 sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
1436 sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count);
1437 sit_i->sit_bitmap = dst_bitmap;
1438 sit_i->bitmap_size = bitmap_size;
1439 sit_i->dirty_sentries = 0;
1440 sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
1441 sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
1442 sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec;
1443 mutex_init(&sit_i->sentry_lock);
1444 return 0;
1445}
1446
1447static int build_free_segmap(struct f2fs_sb_info *sbi)
1448{
1449 struct f2fs_sm_info *sm_info = SM_I(sbi);
1450 struct free_segmap_info *free_i;
1451 unsigned int bitmap_size, sec_bitmap_size;
1452
1453 /* allocate memory for free segmap information */
1454 free_i = kzalloc(sizeof(struct free_segmap_info), GFP_KERNEL);
1455 if (!free_i)
1456 return -ENOMEM;
1457
1458 SM_I(sbi)->free_info = free_i;
1459
1460 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1461 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
1462 if (!free_i->free_segmap)
1463 return -ENOMEM;
1464
1465 sec_bitmap_size = f2fs_bitmap_size(sbi->total_sections);
1466 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
1467 if (!free_i->free_secmap)
1468 return -ENOMEM;
1469
1470 /* set all segments as dirty temporarily */
1471 memset(free_i->free_segmap, 0xff, bitmap_size);
1472 memset(free_i->free_secmap, 0xff, sec_bitmap_size);
1473
1474 /* init free segmap information */
1475 free_i->start_segno =
1476 (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
1477 free_i->free_segments = 0;
1478 free_i->free_sections = 0;
1479 rwlock_init(&free_i->segmap_lock);
1480 return 0;
1481}
1482
1483static int build_curseg(struct f2fs_sb_info *sbi)
1484{
1485 struct curseg_info *array;
1486 int i;
1487
1488 array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL);
1489 if (!array)
1490 return -ENOMEM;
1491
1492 SM_I(sbi)->curseg_array = array;
1493
1494 for (i = 0; i < NR_CURSEG_TYPE; i++) {
1495 mutex_init(&array[i].curseg_mutex);
1496 array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
1497 if (!array[i].sum_blk)
1498 return -ENOMEM;
1499 array[i].segno = NULL_SEGNO;
1500 array[i].next_blkoff = 0;
1501 }
1502 return restore_curseg_summaries(sbi);
1503}
1504
1505static void build_sit_entries(struct f2fs_sb_info *sbi)
1506{
1507 struct sit_info *sit_i = SIT_I(sbi);
1508 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1509 struct f2fs_summary_block *sum = curseg->sum_blk;
1510 unsigned int start;
1511
1512 for (start = 0; start < TOTAL_SEGS(sbi); start++) {
1513 struct seg_entry *se = &sit_i->sentries[start];
1514 struct f2fs_sit_block *sit_blk;
1515 struct f2fs_sit_entry sit;
1516 struct page *page;
1517 int i;
1518
1519 mutex_lock(&curseg->curseg_mutex);
1520 for (i = 0; i < sits_in_cursum(sum); i++) {
1521 if (le32_to_cpu(segno_in_journal(sum, i)) == start) {
1522 sit = sit_in_journal(sum, i);
1523 mutex_unlock(&curseg->curseg_mutex);
1524 goto got_it;
1525 }
1526 }
1527 mutex_unlock(&curseg->curseg_mutex);
1528 page = get_current_sit_page(sbi, start);
1529 sit_blk = (struct f2fs_sit_block *)page_address(page);
1530 sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
1531 f2fs_put_page(page, 1);
1532got_it:
1533 check_block_count(sbi, start, &sit);
1534 seg_info_from_raw_sit(se, &sit);
1535 if (sbi->segs_per_sec > 1) {
1536 struct sec_entry *e = get_sec_entry(sbi, start);
1537 e->valid_blocks += se->valid_blocks;
1538 }
1539 }
1540}
1541
1542static void init_free_segmap(struct f2fs_sb_info *sbi)
1543{
1544 unsigned int start;
1545 int type;
1546
1547 for (start = 0; start < TOTAL_SEGS(sbi); start++) {
1548 struct seg_entry *sentry = get_seg_entry(sbi, start);
1549 if (!sentry->valid_blocks)
1550 __set_free(sbi, start);
1551 }
1552
1553 /* set use the current segments */
1554 for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) {
1555 struct curseg_info *curseg_t = CURSEG_I(sbi, type);
1556 __set_test_and_inuse(sbi, curseg_t->segno);
1557 }
1558}
1559
1560static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1561{
1562 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1563 struct free_segmap_info *free_i = FREE_I(sbi);
1564 unsigned int segno = 0, offset = 0;
1565 unsigned short valid_blocks;
1566
1567 while (segno < TOTAL_SEGS(sbi)) {
1568 /* find dirty segment based on free segmap */
1569 segno = find_next_inuse(free_i, TOTAL_SEGS(sbi), offset);
1570 if (segno >= TOTAL_SEGS(sbi))
1571 break;
1572 offset = segno + 1;
1573 valid_blocks = get_valid_blocks(sbi, segno, 0);
1574 if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks)
1575 continue;
1576 mutex_lock(&dirty_i->seglist_lock);
1577 __locate_dirty_segment(sbi, segno, DIRTY);
1578 mutex_unlock(&dirty_i->seglist_lock);
1579 }
1580}
1581
1582static int init_victim_segmap(struct f2fs_sb_info *sbi)
1583{
1584 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1585 unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1586
1587 dirty_i->victim_segmap[FG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
1588 dirty_i->victim_segmap[BG_GC] = kzalloc(bitmap_size, GFP_KERNEL);
1589 if (!dirty_i->victim_segmap[FG_GC] || !dirty_i->victim_segmap[BG_GC])
1590 return -ENOMEM;
1591 return 0;
1592}
1593
1594static int build_dirty_segmap(struct f2fs_sb_info *sbi)
1595{
1596 struct dirty_seglist_info *dirty_i;
1597 unsigned int bitmap_size, i;
1598
1599 /* allocate memory for dirty segments list information */
1600 dirty_i = kzalloc(sizeof(struct dirty_seglist_info), GFP_KERNEL);
1601 if (!dirty_i)
1602 return -ENOMEM;
1603
1604 SM_I(sbi)->dirty_info = dirty_i;
1605 mutex_init(&dirty_i->seglist_lock);
1606
1607 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1608
1609 for (i = 0; i < NR_DIRTY_TYPE; i++) {
1610 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
1611 dirty_i->nr_dirty[i] = 0;
1612 if (!dirty_i->dirty_segmap[i])
1613 return -ENOMEM;
1614 }
1615
1616 init_dirty_segmap(sbi);
1617 return init_victim_segmap(sbi);
1618}
1619
1620/*
1621 * Update min, max modified time for cost-benefit GC algorithm
1622 */
1623static void init_min_max_mtime(struct f2fs_sb_info *sbi)
1624{
1625 struct sit_info *sit_i = SIT_I(sbi);
1626 unsigned int segno;
1627
1628 mutex_lock(&sit_i->sentry_lock);
1629
1630 sit_i->min_mtime = LLONG_MAX;
1631
1632 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
1633 unsigned int i;
1634 unsigned long long mtime = 0;
1635
1636 for (i = 0; i < sbi->segs_per_sec; i++)
1637 mtime += get_seg_entry(sbi, segno + i)->mtime;
1638
1639 mtime = div_u64(mtime, sbi->segs_per_sec);
1640
1641 if (sit_i->min_mtime > mtime)
1642 sit_i->min_mtime = mtime;
1643 }
1644 sit_i->max_mtime = get_mtime(sbi);
1645 mutex_unlock(&sit_i->sentry_lock);
1646}
1647
1648int build_segment_manager(struct f2fs_sb_info *sbi)
1649{
1650 struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
1651 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
1652 struct f2fs_sm_info *sm_info;
1653 int err;
1654
1655 sm_info = kzalloc(sizeof(struct f2fs_sm_info), GFP_KERNEL);
1656 if (!sm_info)
1657 return -ENOMEM;
1658
1659 /* init sm info */
1660 sbi->sm_info = sm_info;
1661 INIT_LIST_HEAD(&sm_info->wblist_head);
1662 spin_lock_init(&sm_info->wblist_lock);
1663 sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
1664 sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
1665 sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
1666 sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count);
1667 sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
1668 sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
1669 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1670
1671 err = build_sit_info(sbi);
1672 if (err)
1673 return err;
1674 err = build_free_segmap(sbi);
1675 if (err)
1676 return err;
1677 err = build_curseg(sbi);
1678 if (err)
1679 return err;
1680
1681 /* reinit free segmap based on SIT */
1682 build_sit_entries(sbi);
1683
1684 init_free_segmap(sbi);
1685 err = build_dirty_segmap(sbi);
1686 if (err)
1687 return err;
1688
1689 init_min_max_mtime(sbi);
1690 return 0;
1691}
1692
1693static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
1694 enum dirty_type dirty_type)
1695{
1696 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1697
1698 mutex_lock(&dirty_i->seglist_lock);
1699 kfree(dirty_i->dirty_segmap[dirty_type]);
1700 dirty_i->nr_dirty[dirty_type] = 0;
1701 mutex_unlock(&dirty_i->seglist_lock);
1702}
1703
1704void reset_victim_segmap(struct f2fs_sb_info *sbi)
1705{
1706 unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi));
1707 memset(DIRTY_I(sbi)->victim_segmap[FG_GC], 0, bitmap_size);
1708}
1709
1710static void destroy_victim_segmap(struct f2fs_sb_info *sbi)
1711{
1712 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1713
1714 kfree(dirty_i->victim_segmap[FG_GC]);
1715 kfree(dirty_i->victim_segmap[BG_GC]);
1716}
1717
1718static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
1719{
1720 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1721 int i;
1722
1723 if (!dirty_i)
1724 return;
1725
1726 /* discard pre-free/dirty segments list */
1727 for (i = 0; i < NR_DIRTY_TYPE; i++)
1728 discard_dirty_segmap(sbi, i);
1729
1730 destroy_victim_segmap(sbi);
1731 SM_I(sbi)->dirty_info = NULL;
1732 kfree(dirty_i);
1733}
1734
1735static void destroy_curseg(struct f2fs_sb_info *sbi)
1736{
1737 struct curseg_info *array = SM_I(sbi)->curseg_array;
1738 int i;
1739
1740 if (!array)
1741 return;
1742 SM_I(sbi)->curseg_array = NULL;
1743 for (i = 0; i < NR_CURSEG_TYPE; i++)
1744 kfree(array[i].sum_blk);
1745 kfree(array);
1746}
1747
1748static void destroy_free_segmap(struct f2fs_sb_info *sbi)
1749{
1750 struct free_segmap_info *free_i = SM_I(sbi)->free_info;
1751 if (!free_i)
1752 return;
1753 SM_I(sbi)->free_info = NULL;
1754 kfree(free_i->free_segmap);
1755 kfree(free_i->free_secmap);
1756 kfree(free_i);
1757}
1758
1759static void destroy_sit_info(struct f2fs_sb_info *sbi)
1760{
1761 struct sit_info *sit_i = SIT_I(sbi);
1762 unsigned int start;
1763
1764 if (!sit_i)
1765 return;
1766
1767 if (sit_i->sentries) {
1768 for (start = 0; start < TOTAL_SEGS(sbi); start++) {
1769 kfree(sit_i->sentries[start].cur_valid_map);
1770 kfree(sit_i->sentries[start].ckpt_valid_map);
1771 }
1772 }
1773 vfree(sit_i->sentries);
1774 vfree(sit_i->sec_entries);
1775 kfree(sit_i->dirty_sentries_bitmap);
1776
1777 SM_I(sbi)->sit_info = NULL;
1778 kfree(sit_i->sit_bitmap);
1779 kfree(sit_i);
1780}
1781
1782void destroy_segment_manager(struct f2fs_sb_info *sbi)
1783{
1784 struct f2fs_sm_info *sm_info = SM_I(sbi);
1785 destroy_dirty_segmap(sbi);
1786 destroy_curseg(sbi);
1787 destroy_free_segmap(sbi);
1788 destroy_sit_info(sbi);
1789 sbi->sm_info = NULL;
1790 kfree(sm_info);
1791}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
new file mode 100644
index 000000000000..0948405af6f5
--- /dev/null
+++ b/fs/f2fs/segment.h
@@ -0,0 +1,618 @@
1/*
2 * fs/f2fs/segment.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11/* constant macro */
/*
 * Segment address helpers.
 *
 * Every macro argument is parenthesized so that arbitrary expressions
 * (e.g. "a | b", "a + b", "&obj") can be passed safely.  Several macros
 * still evaluate their arguments more than once, so do not pass
 * expressions with side effects.
 */
#define NULL_SEGNO ((unsigned int)(~0))

/* V: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno)
#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno)

#define IS_DATASEG(t) \
	(((t) == CURSEG_HOT_DATA) || ((t) == CURSEG_COLD_DATA) || \
	((t) == CURSEG_WARM_DATA))

#define IS_NODESEG(t) \
	(((t) == CURSEG_HOT_NODE) || ((t) == CURSEG_COLD_NODE) || \
	((t) == CURSEG_WARM_NODE))

/* true if @segno is one of the six current segments */
#define IS_CURSEG(sbi, segno) \
	(((segno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
	((segno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \
	((segno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \
	((segno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \
	((segno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \
	((segno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno))

/* true if @secno is the section of one of the six current segments */
#define IS_CURSEC(sbi, secno) \
	(((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \
		(sbi)->segs_per_sec) || \
	((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \
		(sbi)->segs_per_sec) || \
	((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \
		(sbi)->segs_per_sec) || \
	((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \
		(sbi)->segs_per_sec) || \
	((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \
		(sbi)->segs_per_sec) || \
	((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
		(sbi)->segs_per_sec))

#define START_BLOCK(sbi, segno) \
	(SM_I(sbi)->seg0_blkaddr + \
	(GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
#define NEXT_FREE_BLKADDR(sbi, curseg) \
	(START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)

#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr)

#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
	((blk_addr) - SM_I(sbi)->seg0_blkaddr)
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
/* NULL_ADDR and NEW_ADDR have no segment: they map to NULL_SEGNO */
#define GET_SEGNO(sbi, blk_addr) \
	((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \
	NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
		GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
#define GET_SECNO(sbi, segno) \
	((segno) / (sbi)->segs_per_sec)
#define GET_ZONENO_FROM_SEGNO(sbi, segno) \
	(((segno) / (sbi)->segs_per_sec) / (sbi)->secs_per_zone)

#define GET_SUM_BLOCK(sbi, segno) \
	(((sbi)->sm_info->ssa_blkaddr) + (segno))

#define GET_SUM_TYPE(footer) ((footer)->entry_type)
#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type))

#define SIT_ENTRY_OFFSET(sit_i, segno) \
	((segno) % (sit_i)->sents_per_block)
#define SIT_BLOCK_OFFSET(sit_i, segno) \
	((segno) / SIT_ENTRY_PER_BLOCK)
#define START_SEGNO(sit_i, segno) \
	(SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
/* bytes needed for a bitmap of @nr bits, rounded up to whole longs */
#define f2fs_bitmap_size(nr) \
	(BITS_TO_LONGS(nr) * sizeof(unsigned long))
#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)

#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
	((blk_addr) << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
87
88/* during checkpoint, bio_private is used to synchronize the last bio */
89struct bio_private {
90 struct f2fs_sb_info *sbi;
91 bool is_sync;
92 void *wait;
93};
94
95/*
96 * indicate a block allocation direction: RIGHT and LEFT.
97 * RIGHT means allocating new sections towards the end of volume.
98 * LEFT means the opposite direction.
99 */
100enum {
101 ALLOC_RIGHT = 0,
102 ALLOC_LEFT
103};
104
105/*
106 * In the victim_sel_policy->alloc_mode, there are two block allocation modes.
107 * LFS writes data sequentially with cleaning operations.
108 * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
109 */
110enum {
111 LFS = 0,
112 SSR
113};
114
115/*
116 * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes.
117 * GC_CB is based on cost-benefit algorithm.
118 * GC_GREEDY is based on greedy algorithm.
119 */
120enum {
121 GC_CB = 0,
122 GC_GREEDY
123};
124
125/*
126 * BG_GC means the background cleaning job.
127 * FG_GC means the on-demand cleaning job.
128 */
129enum {
130 BG_GC = 0,
131 FG_GC
132};
133
134/* for a function parameter to select a victim segment */
135struct victim_sel_policy {
136 int alloc_mode; /* LFS or SSR */
137 int gc_mode; /* GC_CB or GC_GREEDY */
138 unsigned long *dirty_segmap; /* dirty segment bitmap */
139 unsigned int offset; /* last scanned bitmap offset */
140 unsigned int ofs_unit; /* bitmap search unit */
141 unsigned int min_cost; /* minimum cost */
142 unsigned int min_segno; /* segment # having min. cost */
143};
144
145struct seg_entry {
146 unsigned short valid_blocks; /* # of valid blocks */
147 unsigned char *cur_valid_map; /* validity bitmap of blocks */
148 /*
149 * # of valid blocks and the validity bitmap stored in the last
150 * checkpoint pack. This information is used by the SSR mode.
151 */
152 unsigned short ckpt_valid_blocks;
153 unsigned char *ckpt_valid_map;
154 unsigned char type; /* segment type like CURSEG_XXX_TYPE */
155 unsigned long long mtime; /* modification time of the segment */
156};
157
158struct sec_entry {
159 unsigned int valid_blocks; /* # of valid blocks in a section */
160};
161
162struct segment_allocation {
163 void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
164};
165
166struct sit_info {
167 const struct segment_allocation *s_ops;
168
169 block_t sit_base_addr; /* start block address of SIT area */
170 block_t sit_blocks; /* # of blocks used by SIT area */
171 block_t written_valid_blocks; /* # of valid blocks in main area */
172 char *sit_bitmap; /* SIT bitmap pointer */
173 unsigned int bitmap_size; /* SIT bitmap size */
174
175 unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
176 unsigned int dirty_sentries; /* # of dirty sentries */
177 unsigned int sents_per_block; /* # of SIT entries per block */
178 struct mutex sentry_lock; /* to protect SIT cache */
179 struct seg_entry *sentries; /* SIT segment-level cache */
180 struct sec_entry *sec_entries; /* SIT section-level cache */
181
182 /* for cost-benefit algorithm in cleaning procedure */
183 unsigned long long elapsed_time; /* elapsed time after mount */
184 unsigned long long mounted_time; /* mount time */
185 unsigned long long min_mtime; /* min. modification time */
186 unsigned long long max_mtime; /* max. modification time */
187};
188
189struct free_segmap_info {
190 unsigned int start_segno; /* start segment number logically */
191 unsigned int free_segments; /* # of free segments */
192 unsigned int free_sections; /* # of free sections */
193 rwlock_t segmap_lock; /* free segmap lock */
194 unsigned long *free_segmap; /* free segment bitmap */
195 unsigned long *free_secmap; /* free section bitmap */
196};
197
198/* Notice: The order of dirty types is the same as CURSEG_XXX in f2fs.h */
199enum dirty_type {
200 DIRTY_HOT_DATA, /* dirty segments assigned as hot data logs */
201 DIRTY_WARM_DATA, /* dirty segments assigned as warm data logs */
202 DIRTY_COLD_DATA, /* dirty segments assigned as cold data logs */
203 DIRTY_HOT_NODE, /* dirty segments assigned as hot node logs */
204 DIRTY_WARM_NODE, /* dirty segments assigned as warm node logs */
205 DIRTY_COLD_NODE, /* dirty segments assigned as cold node logs */
206 DIRTY, /* to count # of dirty segments */
207 PRE, /* to count # of entirely obsolete segments */
208 NR_DIRTY_TYPE
209};
210
211struct dirty_seglist_info {
212 const struct victim_selection *v_ops; /* victim selction operation */
213 unsigned long *dirty_segmap[NR_DIRTY_TYPE];
214 struct mutex seglist_lock; /* lock for segment bitmaps */
215 int nr_dirty[NR_DIRTY_TYPE]; /* # of dirty segments */
216 unsigned long *victim_segmap[2]; /* BG_GC, FG_GC */
217};
218
219/* victim selection function for cleaning and SSR */
220struct victim_selection {
221 int (*get_victim)(struct f2fs_sb_info *, unsigned int *,
222 int, int, char);
223};
224
225/* for active log information */
226struct curseg_info {
227 struct mutex curseg_mutex; /* lock for consistency */
228 struct f2fs_summary_block *sum_blk; /* cached summary block */
229 unsigned char alloc_type; /* current allocation type */
230 unsigned int segno; /* current segment number */
231 unsigned short next_blkoff; /* next block offset to write */
232 unsigned int zone; /* current zone number */
233 unsigned int next_segno; /* preallocated segment */
234};
235
236/*
237 * inline functions
238 */
239static inline struct curseg_info *CURSEG_I(struct f2fs_sb_info *sbi, int type)
240{
241 return (struct curseg_info *)(SM_I(sbi)->curseg_array + type);
242}
243
244static inline struct seg_entry *get_seg_entry(struct f2fs_sb_info *sbi,
245 unsigned int segno)
246{
247 struct sit_info *sit_i = SIT_I(sbi);
248 return &sit_i->sentries[segno];
249}
250
251static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi,
252 unsigned int segno)
253{
254 struct sit_info *sit_i = SIT_I(sbi);
255 return &sit_i->sec_entries[GET_SECNO(sbi, segno)];
256}
257
258static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi,
259 unsigned int segno, int section)
260{
261 /*
262 * In order to get # of valid blocks in a section instantly from many
263 * segments, f2fs manages two counting structures separately.
264 */
265 if (section > 1)
266 return get_sec_entry(sbi, segno)->valid_blocks;
267 else
268 return get_seg_entry(sbi, segno)->valid_blocks;
269}
270
271static inline void seg_info_from_raw_sit(struct seg_entry *se,
272 struct f2fs_sit_entry *rs)
273{
274 se->valid_blocks = GET_SIT_VBLOCKS(rs);
275 se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs);
276 memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
277 memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
278 se->type = GET_SIT_TYPE(rs);
279 se->mtime = le64_to_cpu(rs->mtime);
280}
281
282static inline void seg_info_to_raw_sit(struct seg_entry *se,
283 struct f2fs_sit_entry *rs)
284{
285 unsigned short raw_vblocks = (se->type << SIT_VBLOCKS_SHIFT) |
286 se->valid_blocks;
287 rs->vblocks = cpu_to_le16(raw_vblocks);
288 memcpy(rs->valid_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE);
289 memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE);
290 se->ckpt_valid_blocks = se->valid_blocks;
291 rs->mtime = cpu_to_le64(se->mtime);
292}
293
294static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
295 unsigned int max, unsigned int segno)
296{
297 unsigned int ret;
298 read_lock(&free_i->segmap_lock);
299 ret = find_next_bit(free_i->free_segmap, max, segno);
300 read_unlock(&free_i->segmap_lock);
301 return ret;
302}
303
304static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
305{
306 struct free_segmap_info *free_i = FREE_I(sbi);
307 unsigned int secno = segno / sbi->segs_per_sec;
308 unsigned int start_segno = secno * sbi->segs_per_sec;
309 unsigned int next;
310
311 write_lock(&free_i->segmap_lock);
312 clear_bit(segno, free_i->free_segmap);
313 free_i->free_segments++;
314
315 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno);
316 if (next >= start_segno + sbi->segs_per_sec) {
317 clear_bit(secno, free_i->free_secmap);
318 free_i->free_sections++;
319 }
320 write_unlock(&free_i->segmap_lock);
321}
322
323static inline void __set_inuse(struct f2fs_sb_info *sbi,
324 unsigned int segno)
325{
326 struct free_segmap_info *free_i = FREE_I(sbi);
327 unsigned int secno = segno / sbi->segs_per_sec;
328 set_bit(segno, free_i->free_segmap);
329 free_i->free_segments--;
330 if (!test_and_set_bit(secno, free_i->free_secmap))
331 free_i->free_sections--;
332}
333
334static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
335 unsigned int segno)
336{
337 struct free_segmap_info *free_i = FREE_I(sbi);
338 unsigned int secno = segno / sbi->segs_per_sec;
339 unsigned int start_segno = secno * sbi->segs_per_sec;
340 unsigned int next;
341
342 write_lock(&free_i->segmap_lock);
343 if (test_and_clear_bit(segno, free_i->free_segmap)) {
344 free_i->free_segments++;
345
346 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi),
347 start_segno);
348 if (next >= start_segno + sbi->segs_per_sec) {
349 if (test_and_clear_bit(secno, free_i->free_secmap))
350 free_i->free_sections++;
351 }
352 }
353 write_unlock(&free_i->segmap_lock);
354}
355
356static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
357 unsigned int segno)
358{
359 struct free_segmap_info *free_i = FREE_I(sbi);
360 unsigned int secno = segno / sbi->segs_per_sec;
361 write_lock(&free_i->segmap_lock);
362 if (!test_and_set_bit(segno, free_i->free_segmap)) {
363 free_i->free_segments--;
364 if (!test_and_set_bit(secno, free_i->free_secmap))
365 free_i->free_sections--;
366 }
367 write_unlock(&free_i->segmap_lock);
368}
369
370static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
371 void *dst_addr)
372{
373 struct sit_info *sit_i = SIT_I(sbi);
374 memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size);
375}
376
377static inline block_t written_block_count(struct f2fs_sb_info *sbi)
378{
379 struct sit_info *sit_i = SIT_I(sbi);
380 block_t vblocks;
381
382 mutex_lock(&sit_i->sentry_lock);
383 vblocks = sit_i->written_valid_blocks;
384 mutex_unlock(&sit_i->sentry_lock);
385
386 return vblocks;
387}
388
389static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
390{
391 struct free_segmap_info *free_i = FREE_I(sbi);
392 unsigned int free_segs;
393
394 read_lock(&free_i->segmap_lock);
395 free_segs = free_i->free_segments;
396 read_unlock(&free_i->segmap_lock);
397
398 return free_segs;
399}
400
401static inline int reserved_segments(struct f2fs_sb_info *sbi)
402{
403 return SM_I(sbi)->reserved_segments;
404}
405
406static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
407{
408 struct free_segmap_info *free_i = FREE_I(sbi);
409 unsigned int free_secs;
410
411 read_lock(&free_i->segmap_lock);
412 free_secs = free_i->free_sections;
413 read_unlock(&free_i->segmap_lock);
414
415 return free_secs;
416}
417
418static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
419{
420 return DIRTY_I(sbi)->nr_dirty[PRE];
421}
422
423static inline unsigned int dirty_segments(struct f2fs_sb_info *sbi)
424{
425 return DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_DATA] +
426 DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_DATA] +
427 DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_DATA] +
428 DIRTY_I(sbi)->nr_dirty[DIRTY_HOT_NODE] +
429 DIRTY_I(sbi)->nr_dirty[DIRTY_WARM_NODE] +
430 DIRTY_I(sbi)->nr_dirty[DIRTY_COLD_NODE];
431}
432
433static inline int overprovision_segments(struct f2fs_sb_info *sbi)
434{
435 return SM_I(sbi)->ovp_segments;
436}
437
438static inline int overprovision_sections(struct f2fs_sb_info *sbi)
439{
440 return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec;
441}
442
443static inline int reserved_sections(struct f2fs_sb_info *sbi)
444{
445 return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec;
446}
447
448static inline bool need_SSR(struct f2fs_sb_info *sbi)
449{
450 return (free_sections(sbi) < overprovision_sections(sbi));
451}
452
453static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type)
454{
455 struct curseg_info *curseg = CURSEG_I(sbi, type);
456 return DIRTY_I(sbi)->v_ops->get_victim(sbi,
457 &(curseg)->next_segno, BG_GC, type, SSR);
458}
459
460static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi)
461{
462 return free_sections(sbi) <= reserved_sections(sbi);
463}
464
465static inline int utilization(struct f2fs_sb_info *sbi)
466{
467 return (long int)valid_user_blocks(sbi) * 100 /
468 (long int)sbi->user_block_count;
469}
470
471/*
472 * Sometimes f2fs may be better to drop out-of-place update policy.
473 * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write
474 * data in the original place likewise other traditional file systems.
475 * But, currently set 100 in percentage, which means it is disabled.
476 * See below need_inplace_update().
477 */
478#define MIN_IPU_UTIL 100
479static inline bool need_inplace_update(struct inode *inode)
480{
481 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
482 if (S_ISDIR(inode->i_mode))
483 return false;
484 if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL)
485 return true;
486 return false;
487}
488
489static inline unsigned int curseg_segno(struct f2fs_sb_info *sbi,
490 int type)
491{
492 struct curseg_info *curseg = CURSEG_I(sbi, type);
493 return curseg->segno;
494}
495
496static inline unsigned char curseg_alloc_type(struct f2fs_sb_info *sbi,
497 int type)
498{
499 struct curseg_info *curseg = CURSEG_I(sbi, type);
500 return curseg->alloc_type;
501}
502
503static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
504{
505 struct curseg_info *curseg = CURSEG_I(sbi, type);
506 return curseg->next_blkoff;
507}
508
509static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
510{
511 unsigned int end_segno = SM_I(sbi)->segment_count - 1;
512 BUG_ON(segno > end_segno);
513}
514
515/*
516 * This function is used for only debugging.
517 * NOTE: In future, we have to remove this function.
518 */
519static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
520{
521 struct f2fs_sm_info *sm_info = SM_I(sbi);
522 block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
523 block_t start_addr = sm_info->seg0_blkaddr;
524 block_t end_addr = start_addr + total_blks - 1;
525 BUG_ON(blk_addr < start_addr);
526 BUG_ON(blk_addr > end_addr);
527}
528
529/*
530 * Summary block is always treated as invalid block
531 */
532static inline void check_block_count(struct f2fs_sb_info *sbi,
533 int segno, struct f2fs_sit_entry *raw_sit)
534{
535 struct f2fs_sm_info *sm_info = SM_I(sbi);
536 unsigned int end_segno = sm_info->segment_count - 1;
537 int valid_blocks = 0;
538 int i;
539
540 /* check segment usage */
541 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
542
543 /* check boundary of a given segment number */
544 BUG_ON(segno > end_segno);
545
546 /* check bitmap with valid block count */
547 for (i = 0; i < sbi->blocks_per_seg; i++)
548 if (f2fs_test_bit(i, raw_sit->valid_map))
549 valid_blocks++;
550 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
551}
552
553static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
554 unsigned int start)
555{
556 struct sit_info *sit_i = SIT_I(sbi);
557 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start);
558 block_t blk_addr = sit_i->sit_base_addr + offset;
559
560 check_seg_range(sbi, start);
561
562 /* calculate sit block address */
563 if (f2fs_test_bit(offset, sit_i->sit_bitmap))
564 blk_addr += sit_i->sit_blocks;
565
566 return blk_addr;
567}
568
569static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
570 pgoff_t block_addr)
571{
572 struct sit_info *sit_i = SIT_I(sbi);
573 block_addr -= sit_i->sit_base_addr;
574 if (block_addr < sit_i->sit_blocks)
575 block_addr += sit_i->sit_blocks;
576 else
577 block_addr -= sit_i->sit_blocks;
578
579 return block_addr + sit_i->sit_base_addr;
580}
581
582static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
583{
584 unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start);
585
586 if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
587 f2fs_clear_bit(block_off, sit_i->sit_bitmap);
588 else
589 f2fs_set_bit(block_off, sit_i->sit_bitmap);
590}
591
592static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi)
593{
594 struct sit_info *sit_i = SIT_I(sbi);
595 return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec -
596 sit_i->mounted_time;
597}
598
599static inline void set_summary(struct f2fs_summary *sum, nid_t nid,
600 unsigned int ofs_in_node, unsigned char version)
601{
602 sum->nid = cpu_to_le32(nid);
603 sum->ofs_in_node = cpu_to_le16(ofs_in_node);
604 sum->version = version;
605}
606
607static inline block_t start_sum_block(struct f2fs_sb_info *sbi)
608{
609 return __start_cp_addr(sbi) +
610 le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum);
611}
612
613static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
614{
615 return __start_cp_addr(sbi) +
616 le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_total_block_count)
617 - (base + 1) + type;
618}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
new file mode 100644
index 000000000000..13867322cf5a
--- /dev/null
+++ b/fs/f2fs/super.c
@@ -0,0 +1,657 @@
1/*
2 * fs/f2fs/super.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/module.h>
12#include <linux/init.h>
13#include <linux/fs.h>
14#include <linux/statfs.h>
15#include <linux/proc_fs.h>
16#include <linux/buffer_head.h>
17#include <linux/backing-dev.h>
18#include <linux/kthread.h>
19#include <linux/parser.h>
20#include <linux/mount.h>
21#include <linux/seq_file.h>
22#include <linux/random.h>
23#include <linux/exportfs.h>
24#include <linux/f2fs_fs.h>
25
26#include "f2fs.h"
27#include "node.h"
28#include "xattr.h"
29
30static struct kmem_cache *f2fs_inode_cachep;
31
/*
 * Identifiers for the recognised mount options.  Each one is paired with its
 * option string in f2fs_tokens; Opt_err marks the end of that table.
 */
32enum {
33 Opt_gc_background_off,
34 Opt_disable_roll_forward,
35 Opt_discard,
36 Opt_noheap,
37 Opt_nouser_xattr,
38 Opt_noacl,
39 Opt_active_logs,
40 Opt_disable_ext_identify,
41 Opt_err,
42};
43
/*
 * Mount option strings handed to match_token() by parse_options().
 * "active_logs" is the only option that takes a value (%u).
 */
44static match_table_t f2fs_tokens = {
45 {Opt_gc_background_off, "background_gc_off"},
46 {Opt_disable_roll_forward, "disable_roll_forward"},
47 {Opt_discard, "discard"},
48 {Opt_noheap, "no_heap"},
49 {Opt_nouser_xattr, "nouser_xattr"},
50 {Opt_noacl, "noacl"},
51 {Opt_active_logs, "active_logs=%u"},
52 {Opt_disable_ext_identify, "disable_ext_identify"},
53 {Opt_err, NULL},
54};
55
56static void init_once(void *foo)
57{
58 struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
59
60 inode_init_once(&fi->vfs_inode);
61}
62
63static struct inode *f2fs_alloc_inode(struct super_block *sb)
64{
65 struct f2fs_inode_info *fi;
66
67 fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO);
68 if (!fi)
69 return NULL;
70
71 init_once((void *) fi);
72
73 /* Initilize f2fs-specific inode info */
74 fi->vfs_inode.i_version = 1;
75 atomic_set(&fi->dirty_dents, 0);
76 fi->i_current_depth = 1;
77 fi->i_advise = 0;
78 rwlock_init(&fi->ext.ext_lock);
79
80 set_inode_flag(fi, FI_NEW_INODE);
81
82 return &fi->vfs_inode;
83}
84
85static void f2fs_i_callback(struct rcu_head *head)
86{
87 struct inode *inode = container_of(head, struct inode, i_rcu);
88 kmem_cache_free(f2fs_inode_cachep, F2FS_I(inode));
89}
90
91static void f2fs_destroy_inode(struct inode *inode)
92{
93 call_rcu(&inode->i_rcu, f2fs_i_callback);
94}
95
/*
 * ->put_super: tear down the per-mount state built by f2fs_fill_super().
 * The statistics and GC thread are stopped first, a checkpoint is written,
 * and only then are the internal inodes, managers, checkpoint copy, super
 * block buffer and sbi itself released.
 */
96static void f2fs_put_super(struct super_block *sb)
97{
98 struct f2fs_sb_info *sbi = F2FS_SB(sb);
99
100 f2fs_destroy_stats(sbi);
101 stop_gc_thread(sbi);
102
/* write a checkpoint while node/meta inodes and managers still exist */
103 write_checkpoint(sbi, false, true);
104
105 iput(sbi->node_inode);
106 iput(sbi->meta_inode);
107
108 /* destroy f2fs internal modules */
109 destroy_node_manager(sbi);
110 destroy_segment_manager(sbi);
111
112 kfree(sbi->ckpt);
113
/* sbi is freed below, so drop the super block's pointer to it first */
114 sb->s_fs_info = NULL;
115 brelse(sbi->raw_super_buf);
116 kfree(sbi);
117}
118
119int f2fs_sync_fs(struct super_block *sb, int sync)
120{
121 struct f2fs_sb_info *sbi = F2FS_SB(sb);
122 int ret = 0;
123
124 if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
125 return 0;
126
127 if (sync)
128 write_checkpoint(sbi, false, false);
129
130 return ret;
131}
132
133static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
134{
135 struct super_block *sb = dentry->d_sb;
136 struct f2fs_sb_info *sbi = F2FS_SB(sb);
137 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
138 block_t total_count, user_block_count, start_count, ovp_count;
139
140 total_count = le64_to_cpu(sbi->raw_super->block_count);
141 user_block_count = sbi->user_block_count;
142 start_count = le32_to_cpu(sbi->raw_super->segment0_blkaddr);
143 ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg;
144 buf->f_type = F2FS_SUPER_MAGIC;
145 buf->f_bsize = sbi->blocksize;
146
147 buf->f_blocks = total_count - start_count;
148 buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
149 buf->f_bavail = user_block_count - valid_user_blocks(sbi);
150
151 buf->f_files = valid_inode_count(sbi);
152 buf->f_ffree = sbi->total_node_count - valid_node_count(sbi);
153
154 buf->f_namelen = F2FS_MAX_NAME_LEN;
155 buf->f_fsid.val[0] = (u32)id;
156 buf->f_fsid.val[1] = (u32)(id >> 32);
157
158 return 0;
159}
160
161static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
162{
163 struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
164
165 if (test_opt(sbi, BG_GC))
166 seq_puts(seq, ",background_gc_on");
167 else
168 seq_puts(seq, ",background_gc_off");
169 if (test_opt(sbi, DISABLE_ROLL_FORWARD))
170 seq_puts(seq, ",disable_roll_forward");
171 if (test_opt(sbi, DISCARD))
172 seq_puts(seq, ",discard");
173 if (test_opt(sbi, NOHEAP))
174 seq_puts(seq, ",no_heap_alloc");
175#ifdef CONFIG_F2FS_FS_XATTR
176 if (test_opt(sbi, XATTR_USER))
177 seq_puts(seq, ",user_xattr");
178 else
179 seq_puts(seq, ",nouser_xattr");
180#endif
181#ifdef CONFIG_F2FS_FS_POSIX_ACL
182 if (test_opt(sbi, POSIX_ACL))
183 seq_puts(seq, ",acl");
184 else
185 seq_puts(seq, ",noacl");
186#endif
187 if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
188 seq_puts(seq, ",disable_ext_indentify");
189
190 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
191
192 return 0;
193}
194
/* Super block operations installed on sb->s_op by f2fs_fill_super(). */
195static struct super_operations f2fs_sops = {
196 .alloc_inode = f2fs_alloc_inode,
197 .destroy_inode = f2fs_destroy_inode,
198 .write_inode = f2fs_write_inode,
199 .show_options = f2fs_show_options,
200 .evict_inode = f2fs_evict_inode,
201 .put_super = f2fs_put_super,
202 .sync_fs = f2fs_sync_fs,
203 .statfs = f2fs_statfs,
204};
205
206static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
207 u64 ino, u32 generation)
208{
209 struct f2fs_sb_info *sbi = F2FS_SB(sb);
210 struct inode *inode;
211
212 if (ino < F2FS_ROOT_INO(sbi))
213 return ERR_PTR(-ESTALE);
214
215 /*
216 * f2fs_iget isn't quite right if the inode is currently unallocated!
217 * However f2fs_iget currently does appropriate checks to handle stale
218 * inodes so everything is OK.
219 */
220 inode = f2fs_iget(sb, ino);
221 if (IS_ERR(inode))
222 return ERR_CAST(inode);
223 if (generation && inode->i_generation != generation) {
224 /* we didn't find the right inode.. */
225 iput(inode);
226 return ERR_PTR(-ESTALE);
227 }
228 return inode;
229}
230
231static struct dentry *f2fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
232 int fh_len, int fh_type)
233{
234 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
235 f2fs_nfs_get_inode);
236}
237
238static struct dentry *f2fs_fh_to_parent(struct super_block *sb, struct fid *fid,
239 int fh_len, int fh_type)
240{
241 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
242 f2fs_nfs_get_inode);
243}
244
/* Export operations installed on sb->s_export_op by f2fs_fill_super(). */
245static const struct export_operations f2fs_export_ops = {
246 .fh_to_dentry = f2fs_fh_to_dentry,
247 .fh_to_parent = f2fs_fh_to_parent,
248 .get_parent = f2fs_get_parent,
249};
250
251static int parse_options(struct f2fs_sb_info *sbi, char *options)
252{
253 substring_t args[MAX_OPT_ARGS];
254 char *p;
255 int arg = 0;
256
257 if (!options)
258 return 0;
259
260 while ((p = strsep(&options, ",")) != NULL) {
261 int token;
262 if (!*p)
263 continue;
264 /*
265 * Initialize args struct so we know whether arg was
266 * found; some options take optional arguments.
267 */
268 args[0].to = args[0].from = NULL;
269 token = match_token(p, f2fs_tokens, args);
270
271 switch (token) {
272 case Opt_gc_background_off:
273 clear_opt(sbi, BG_GC);
274 break;
275 case Opt_disable_roll_forward:
276 set_opt(sbi, DISABLE_ROLL_FORWARD);
277 break;
278 case Opt_discard:
279 set_opt(sbi, DISCARD);
280 break;
281 case Opt_noheap:
282 set_opt(sbi, NOHEAP);
283 break;
284#ifdef CONFIG_F2FS_FS_XATTR
285 case Opt_nouser_xattr:
286 clear_opt(sbi, XATTR_USER);
287 break;
288#else
289 case Opt_nouser_xattr:
290 pr_info("nouser_xattr options not supported\n");
291 break;
292#endif
293#ifdef CONFIG_F2FS_FS_POSIX_ACL
294 case Opt_noacl:
295 clear_opt(sbi, POSIX_ACL);
296 break;
297#else
298 case Opt_noacl:
299 pr_info("noacl options not supported\n");
300 break;
301#endif
302 case Opt_active_logs:
303 if (args->from && match_int(args, &arg))
304 return -EINVAL;
305 if (arg != 2 && arg != 4 && arg != 6)
306 return -EINVAL;
307 sbi->active_logs = arg;
308 break;
309 case Opt_disable_ext_identify:
310 set_opt(sbi, DISABLE_EXT_IDENTIFY);
311 break;
312 default:
313 pr_err("Unrecognized mount option \"%s\" or missing value\n",
314 p);
315 return -EINVAL;
316 }
317 }
318 return 0;
319}
320
321static loff_t max_file_size(unsigned bits)
322{
323 loff_t result = ADDRS_PER_INODE;
324 loff_t leaf_count = ADDRS_PER_BLOCK;
325
326 /* two direct node blocks */
327 result += (leaf_count * 2);
328
329 /* two indirect node blocks */
330 leaf_count *= NIDS_PER_BLOCK;
331 result += (leaf_count * 2);
332
333 /* one double indirect node block */
334 leaf_count *= NIDS_PER_BLOCK;
335 result += leaf_count;
336
337 result <<= bits;
338 return result;
339}
340
341static int sanity_check_raw_super(struct f2fs_super_block *raw_super)
342{
343 unsigned int blocksize;
344
345 if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic))
346 return 1;
347
348 /* Currently, support only 4KB block size */
349 blocksize = 1 << le32_to_cpu(raw_super->log_blocksize);
350 if (blocksize != PAGE_CACHE_SIZE)
351 return 1;
352 if (le32_to_cpu(raw_super->log_sectorsize) !=
353 F2FS_LOG_SECTOR_SIZE)
354 return 1;
355 if (le32_to_cpu(raw_super->log_sectors_per_block) !=
356 F2FS_LOG_SECTORS_PER_BLOCK)
357 return 1;
358 return 0;
359}
360
361static int sanity_check_ckpt(struct f2fs_super_block *raw_super,
362 struct f2fs_checkpoint *ckpt)
363{
364 unsigned int total, fsmeta;
365
366 total = le32_to_cpu(raw_super->segment_count);
367 fsmeta = le32_to_cpu(raw_super->segment_count_ckpt);
368 fsmeta += le32_to_cpu(raw_super->segment_count_sit);
369 fsmeta += le32_to_cpu(raw_super->segment_count_nat);
370 fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
371 fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
372
373 if (fsmeta >= total)
374 return 1;
375 return 0;
376}
377
378static void init_sb_info(struct f2fs_sb_info *sbi)
379{
380 struct f2fs_super_block *raw_super = sbi->raw_super;
381 int i;
382
383 sbi->log_sectors_per_block =
384 le32_to_cpu(raw_super->log_sectors_per_block);
385 sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
386 sbi->blocksize = 1 << sbi->log_blocksize;
387 sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
388 sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
389 sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
390 sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
391 sbi->total_sections = le32_to_cpu(raw_super->section_count);
392 sbi->total_node_count =
393 (le32_to_cpu(raw_super->segment_count_nat) / 2)
394 * sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
395 sbi->root_ino_num = le32_to_cpu(raw_super->root_ino);
396 sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
397 sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
398
399 for (i = 0; i < NR_COUNT_TYPE; i++)
400 atomic_set(&sbi->nr_pages[i], 0);
401}
402
403static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
404{
405 struct f2fs_sb_info *sbi;
406 struct f2fs_super_block *raw_super;
407 struct buffer_head *raw_super_buf;
408 struct inode *root;
409 long err = -EINVAL;
410 int i;
411
412 /* allocate memory for f2fs-specific super block info */
413 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
414 if (!sbi)
415 return -ENOMEM;
416
417 /* set a temporary block size */
418 if (!sb_set_blocksize(sb, F2FS_BLKSIZE))
419 goto free_sbi;
420
421 /* read f2fs raw super block */
422 raw_super_buf = sb_bread(sb, 0);
423 if (!raw_super_buf) {
424 err = -EIO;
425 goto free_sbi;
426 }
427 raw_super = (struct f2fs_super_block *)
428 ((char *)raw_super_buf->b_data + F2FS_SUPER_OFFSET);
429
430 /* init some FS parameters */
431 sbi->active_logs = NR_CURSEG_TYPE;
432
433 set_opt(sbi, BG_GC);
434
435#ifdef CONFIG_F2FS_FS_XATTR
436 set_opt(sbi, XATTR_USER);
437#endif
438#ifdef CONFIG_F2FS_FS_POSIX_ACL
439 set_opt(sbi, POSIX_ACL);
440#endif
441 /* parse mount options */
442 if (parse_options(sbi, (char *)data))
443 goto free_sb_buf;
444
445 /* sanity checking of raw super */
446 if (sanity_check_raw_super(raw_super))
447 goto free_sb_buf;
448
449 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
450 sb->s_max_links = F2FS_LINK_MAX;
451 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
452
453 sb->s_op = &f2fs_sops;
454 sb->s_xattr = f2fs_xattr_handlers;
455 sb->s_export_op = &f2fs_export_ops;
456 sb->s_magic = F2FS_SUPER_MAGIC;
457 sb->s_fs_info = sbi;
458 sb->s_time_gran = 1;
459 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
460 (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
461 memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
462
463 /* init f2fs-specific super block info */
464 sbi->sb = sb;
465 sbi->raw_super = raw_super;
466 sbi->raw_super_buf = raw_super_buf;
467 mutex_init(&sbi->gc_mutex);
468 mutex_init(&sbi->write_inode);
469 mutex_init(&sbi->writepages);
470 mutex_init(&sbi->cp_mutex);
471 for (i = 0; i < NR_LOCK_TYPE; i++)
472 mutex_init(&sbi->fs_lock[i]);
473 sbi->por_doing = 0;
474 spin_lock_init(&sbi->stat_lock);
475 init_rwsem(&sbi->bio_sem);
476 init_sb_info(sbi);
477
478 /* get an inode for meta space */
479 sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
480 if (IS_ERR(sbi->meta_inode)) {
481 err = PTR_ERR(sbi->meta_inode);
482 goto free_sb_buf;
483 }
484
485 err = get_valid_checkpoint(sbi);
486 if (err)
487 goto free_meta_inode;
488
489 /* sanity checking of checkpoint */
490 err = -EINVAL;
491 if (sanity_check_ckpt(raw_super, sbi->ckpt))
492 goto free_cp;
493
494 sbi->total_valid_node_count =
495 le32_to_cpu(sbi->ckpt->valid_node_count);
496 sbi->total_valid_inode_count =
497 le32_to_cpu(sbi->ckpt->valid_inode_count);
498 sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count);
499 sbi->total_valid_block_count =
500 le64_to_cpu(sbi->ckpt->valid_block_count);
501 sbi->last_valid_block_count = sbi->total_valid_block_count;
502 sbi->alloc_valid_block_count = 0;
503 INIT_LIST_HEAD(&sbi->dir_inode_list);
504 spin_lock_init(&sbi->dir_inode_lock);
505
506 /* init super block */
507 if (!sb_set_blocksize(sb, sbi->blocksize))
508 goto free_cp;
509
510 init_orphan_info(sbi);
511
512 /* setup f2fs internal modules */
513 err = build_segment_manager(sbi);
514 if (err)
515 goto free_sm;
516 err = build_node_manager(sbi);
517 if (err)
518 goto free_nm;
519
520 build_gc_manager(sbi);
521
522 /* get an inode for node space */
523 sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi));
524 if (IS_ERR(sbi->node_inode)) {
525 err = PTR_ERR(sbi->node_inode);
526 goto free_nm;
527 }
528
529 /* if there are nt orphan nodes free them */
530 err = -EINVAL;
531 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) &&
532 recover_orphan_inodes(sbi))
533 goto free_node_inode;
534
535 /* read root inode and dentry */
536 root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
537 if (IS_ERR(root)) {
538 err = PTR_ERR(root);
539 goto free_node_inode;
540 }
541 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size)
542 goto free_root_inode;
543
544 sb->s_root = d_make_root(root); /* allocate root dentry */
545 if (!sb->s_root) {
546 err = -ENOMEM;
547 goto free_root_inode;
548 }
549
550 /* recover fsynced data */
551 if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) &&
552 !test_opt(sbi, DISABLE_ROLL_FORWARD))
553 recover_fsync_data(sbi);
554
555 /* After POR, we can run background GC thread */
556 err = start_gc_thread(sbi);
557 if (err)
558 goto fail;
559
560 err = f2fs_build_stats(sbi);
561 if (err)
562 goto fail;
563
564 return 0;
565fail:
566 stop_gc_thread(sbi);
567free_root_inode:
568 dput(sb->s_root);
569 sb->s_root = NULL;
570free_node_inode:
571 iput(sbi->node_inode);
572free_nm:
573 destroy_node_manager(sbi);
574free_sm:
575 destroy_segment_manager(sbi);
576free_cp:
577 kfree(sbi->ckpt);
578free_meta_inode:
579 make_bad_inode(sbi->meta_inode);
580 iput(sbi->meta_inode);
581free_sb_buf:
582 brelse(raw_super_buf);
583free_sbi:
584 kfree(sbi);
585 return err;
586}
587
588static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
589 const char *dev_name, void *data)
590{
591 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
592}
593
/* File system type registered by init_f2fs_fs() under the name "f2fs". */
594static struct file_system_type f2fs_fs_type = {
595 .owner = THIS_MODULE,
596 .name = "f2fs",
597 .mount = f2fs_mount,
598 .kill_sb = kill_block_super,
599 .fs_flags = FS_REQUIRES_DEV,
600};
601
602static int init_inodecache(void)
603{
604 f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
605 sizeof(struct f2fs_inode_info), NULL);
606 if (f2fs_inode_cachep == NULL)
607 return -ENOMEM;
608 return 0;
609}
610
611static void destroy_inodecache(void)
612{
613 /*
614 * Make sure all delayed rcu free inodes are flushed before we
615 * destroy cache.
616 */
617 rcu_barrier();
618 kmem_cache_destroy(f2fs_inode_cachep);
619}
620
621static int __init init_f2fs_fs(void)
622{
623 int err;
624
625 err = init_inodecache();
626 if (err)
627 goto fail;
628 err = create_node_manager_caches();
629 if (err)
630 goto fail;
631 err = create_gc_caches();
632 if (err)
633 goto fail;
634 err = create_checkpoint_caches();
635 if (err)
636 goto fail;
637 return register_filesystem(&f2fs_fs_type);
638fail:
639 return err;
640}
641
/*
 * Module exit: unregister the file system type and destroy the caches set
 * up by init_f2fs_fs(), in the reverse order of their creation.
 */
642static void __exit exit_f2fs_fs(void)
643{
644 destroy_root_stats();
645 unregister_filesystem(&f2fs_fs_type);
646 destroy_checkpoint_caches();
647 destroy_gc_caches();
648 destroy_node_manager_caches();
649 destroy_inodecache();
650}
651
652module_init(init_f2fs_fs)
653module_exit(exit_f2fs_fs)
654
655MODULE_AUTHOR("Samsung Electronics's Praesto Team");
656MODULE_DESCRIPTION("Flash Friendly File System");
657MODULE_LICENSE("GPL");
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
new file mode 100644
index 000000000000..7d52e8dc0c59
--- /dev/null
+++ b/fs/f2fs/xattr.c
@@ -0,0 +1,440 @@
1/*
2 * fs/f2fs/xattr.c
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext2/xattr.c
8 *
9 * Copyright (C) 2001-2003 Andreas Gruenbacher <agruen@suse.de>
10 *
11 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
12 * Extended attributes for symlinks and special files added per
13 * suggestion of Luka Renko <luka.renko@hermes.si>.
14 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
15 * Red Hat Inc.
16 *
17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License version 2 as
19 * published by the Free Software Foundation.
20 */
21#include <linux/rwsem.h>
22#include <linux/f2fs_fs.h>
23#include "f2fs.h"
24#include "xattr.h"
25
26static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
27 size_t list_size, const char *name, size_t name_len, int type)
28{
29 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
30 int total_len, prefix_len = 0;
31 const char *prefix = NULL;
32
33 switch (type) {
34 case F2FS_XATTR_INDEX_USER:
35 if (!test_opt(sbi, XATTR_USER))
36 return -EOPNOTSUPP;
37 prefix = XATTR_USER_PREFIX;
38 prefix_len = XATTR_USER_PREFIX_LEN;
39 break;
40 case F2FS_XATTR_INDEX_TRUSTED:
41 if (!capable(CAP_SYS_ADMIN))
42 return -EPERM;
43 prefix = XATTR_TRUSTED_PREFIX;
44 prefix_len = XATTR_TRUSTED_PREFIX_LEN;
45 break;
46 default:
47 return -EINVAL;
48 }
49
50 total_len = prefix_len + name_len + 1;
51 if (list && total_len <= list_size) {
52 memcpy(list, prefix, prefix_len);
53 memcpy(list+prefix_len, name, name_len);
54 list[prefix_len + name_len] = '\0';
55 }
56 return total_len;
57}
58
59static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
60 void *buffer, size_t size, int type)
61{
62 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
63
64 switch (type) {
65 case F2FS_XATTR_INDEX_USER:
66 if (!test_opt(sbi, XATTR_USER))
67 return -EOPNOTSUPP;
68 break;
69 case F2FS_XATTR_INDEX_TRUSTED:
70 if (!capable(CAP_SYS_ADMIN))
71 return -EPERM;
72 break;
73 default:
74 return -EINVAL;
75 }
76 if (strcmp(name, "") == 0)
77 return -EINVAL;
78 return f2fs_getxattr(dentry->d_inode, type, name,
79 buffer, size);
80}
81
82static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
83 const void *value, size_t size, int flags, int type)
84{
85 struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
86
87 switch (type) {
88 case F2FS_XATTR_INDEX_USER:
89 if (!test_opt(sbi, XATTR_USER))
90 return -EOPNOTSUPP;
91 break;
92 case F2FS_XATTR_INDEX_TRUSTED:
93 if (!capable(CAP_SYS_ADMIN))
94 return -EPERM;
95 break;
96 default:
97 return -EINVAL;
98 }
99 if (strcmp(name, "") == 0)
100 return -EINVAL;
101
102 return f2fs_setxattr(dentry->d_inode, type, name, value, size);
103}
104
105static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
106 size_t list_size, const char *name, size_t name_len, int type)
107{
108 const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
109 size_t size;
110
111 if (type != F2FS_XATTR_INDEX_ADVISE)
112 return 0;
113
114 size = strlen(xname) + 1;
115 if (list && size <= list_size)
116 memcpy(list, xname, size);
117 return size;
118}
119
120static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
121 void *buffer, size_t size, int type)
122{
123 struct inode *inode = dentry->d_inode;
124
125 if (strcmp(name, "") != 0)
126 return -EINVAL;
127
128 *((char *)buffer) = F2FS_I(inode)->i_advise;
129 return sizeof(char);
130}
131
132static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
133 const void *value, size_t size, int flags, int type)
134{
135 struct inode *inode = dentry->d_inode;
136
137 if (strcmp(name, "") != 0)
138 return -EINVAL;
139 if (!inode_owner_or_capable(inode))
140 return -EPERM;
141 if (value == NULL)
142 return -EINVAL;
143
144 F2FS_I(inode)->i_advise |= *(char *)value;
145 return 0;
146}
147
/* Handler for the "user." namespace; .flags is passed to the callbacks as 'type'. */
148const struct xattr_handler f2fs_xattr_user_handler = {
149 .prefix = XATTR_USER_PREFIX,
150 .flags = F2FS_XATTR_INDEX_USER,
151 .list = f2fs_xattr_generic_list,
152 .get = f2fs_xattr_generic_get,
153 .set = f2fs_xattr_generic_set,
154};
155
/* Handler for the "trusted." namespace; shares the generic callbacks with "user.". */
156const struct xattr_handler f2fs_xattr_trusted_handler = {
157 .prefix = XATTR_TRUSTED_PREFIX,
158 .flags = F2FS_XATTR_INDEX_TRUSTED,
159 .list = f2fs_xattr_generic_list,
160 .get = f2fs_xattr_generic_get,
161 .set = f2fs_xattr_generic_set,
162};
163
/* Handler for the advise attribute, which is backed by the inode's i_advise byte. */
164const struct xattr_handler f2fs_xattr_advise_handler = {
165 .prefix = F2FS_SYSTEM_ADVISE_PREFIX,
166 .flags = F2FS_XATTR_INDEX_ADVISE,
167 .list = f2fs_xattr_advise_list,
168 .get = f2fs_xattr_advise_get,
169 .set = f2fs_xattr_advise_set,
170};
171
/*
 * Handlers indexed by the e_name_index stored in an xattr entry; indexes
 * without an initializer are NULL.  Looked up through f2fs_xattr_handler().
 */
172static const struct xattr_handler *f2fs_xattr_handler_map[] = {
173 [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
174#ifdef CONFIG_F2FS_FS_POSIX_ACL
175 [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler,
176 [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
177#endif
178 [F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
179 [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler,
180};
181
/* NULL-terminated handler list installed on sb->s_xattr by f2fs_fill_super(). */
182const struct xattr_handler *f2fs_xattr_handlers[] = {
183 &f2fs_xattr_user_handler,
184#ifdef CONFIG_F2FS_FS_POSIX_ACL
185 &f2fs_xattr_acl_access_handler,
186 &f2fs_xattr_acl_default_handler,
187#endif
188 &f2fs_xattr_trusted_handler,
189 &f2fs_xattr_advise_handler,
190 NULL,
191};
192
193static inline const struct xattr_handler *f2fs_xattr_handler(int name_index)
194{
195 const struct xattr_handler *handler = NULL;
196
197 if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map))
198 handler = f2fs_xattr_handler_map[name_index];
199 return handler;
200}
201
202int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
203 void *buffer, size_t buffer_size)
204{
205 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
206 struct f2fs_inode_info *fi = F2FS_I(inode);
207 struct f2fs_xattr_entry *entry;
208 struct page *page;
209 void *base_addr;
210 int error = 0, found = 0;
211 int value_len, name_len;
212
213 if (name == NULL)
214 return -EINVAL;
215 name_len = strlen(name);
216
217 if (!fi->i_xattr_nid)
218 return -ENODATA;
219
220 page = get_node_page(sbi, fi->i_xattr_nid);
221 base_addr = page_address(page);
222
223 list_for_each_xattr(entry, base_addr) {
224 if (entry->e_name_index != name_index)
225 continue;
226 if (entry->e_name_len != name_len)
227 continue;
228 if (!memcmp(entry->e_name, name, name_len)) {
229 found = 1;
230 break;
231 }
232 }
233 if (!found) {
234 error = -ENODATA;
235 goto cleanup;
236 }
237
238 value_len = le16_to_cpu(entry->e_value_size);
239
240 if (buffer && value_len > buffer_size) {
241 error = -ERANGE;
242 goto cleanup;
243 }
244
245 if (buffer) {
246 char *pval = entry->e_name + entry->e_name_len;
247 memcpy(buffer, pval, value_len);
248 }
249 error = value_len;
250
251cleanup:
252 f2fs_put_page(page, 1);
253 return error;
254}
255
256ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
257{
258 struct inode *inode = dentry->d_inode;
259 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
260 struct f2fs_inode_info *fi = F2FS_I(inode);
261 struct f2fs_xattr_entry *entry;
262 struct page *page;
263 void *base_addr;
264 int error = 0;
265 size_t rest = buffer_size;
266
267 if (!fi->i_xattr_nid)
268 return 0;
269
270 page = get_node_page(sbi, fi->i_xattr_nid);
271 base_addr = page_address(page);
272
273 list_for_each_xattr(entry, base_addr) {
274 const struct xattr_handler *handler =
275 f2fs_xattr_handler(entry->e_name_index);
276 size_t size;
277
278 if (!handler)
279 continue;
280
281 size = handler->list(dentry, buffer, rest, entry->e_name,
282 entry->e_name_len, handler->flags);
283 if (buffer && size > rest) {
284 error = -ERANGE;
285 goto cleanup;
286 }
287
288 if (buffer)
289 buffer += size;
290 rest -= size;
291 }
292 error = buffer_size - rest;
293cleanup:
294 f2fs_put_page(page, 1);
295 return error;
296}
297
/*
 * Set, replace or remove the extended attribute (@name_index, @name) of
 * @inode.  A NULL @value removes the attribute; otherwise any existing
 * entry with the same index and name is dropped and a new entry is appended
 * behind the last one.  The xattr node block is allocated on first use.
 *
 * Returns 0 on success (removing a name that does not exist also returns
 * 0), -EINVAL if @name is NULL, -ERANGE if the name is longer than 255
 * bytes or @value_len exceeds MAX_VALUE_LEN, -ENOSPC if no node id can be
 * allocated or the block has no room for the new entry, -EIO if the block
 * header magic is wrong, or the error reported by new_node_page() /
 * get_node_page().
 *
 * The whole operation runs under mutex_lock_op(sbi, NODE_NEW).
 */
298int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
299 const void *value, size_t value_len)
300{
301 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
302 struct f2fs_inode_info *fi = F2FS_I(inode);
303 struct f2fs_xattr_header *header = NULL;
304 struct f2fs_xattr_entry *here, *last;
305 struct page *page;
306 void *base_addr;
307 int error, found, free, name_len, newsize;
308 char *pval;
309
310 if (name == NULL)
311 return -EINVAL;
312 name_len = strlen(name);
313
314 if (value == NULL)
315 value_len = 0;
316
317 if (name_len > 255 || value_len > MAX_VALUE_LEN)
318 return -ERANGE;
319
320 mutex_lock_op(sbi, NODE_NEW);
321 if (!fi->i_xattr_nid) {
322 /* Allocate new attribute block */
323 struct dnode_of_data dn;
324
325 if (!alloc_nid(sbi, &fi->i_xattr_nid)) {
326 mutex_unlock_op(sbi, NODE_NEW);
327 return -ENOSPC;
328 }
329 set_new_dnode(&dn, inode, NULL, NULL, fi->i_xattr_nid);
330 mark_inode_dirty(inode);
331
332 page = new_node_page(&dn, XATTR_NODE_OFFSET);
333 if (IS_ERR(page)) {
/* give the nid back and forget it again */
334 alloc_nid_failed(sbi, fi->i_xattr_nid);
335 fi->i_xattr_nid = 0;
336 mutex_unlock_op(sbi, NODE_NEW);
337 return PTR_ERR(page);
338 }
339
340 alloc_nid_done(sbi, fi->i_xattr_nid);
341 base_addr = page_address(page);
342 header = XATTR_HDR(base_addr);
343 header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC);
344 header->h_refcount = cpu_to_le32(1);
345 } else {
346 /* The inode already has an extended attribute block. */
347 page = get_node_page(sbi, fi->i_xattr_nid);
348 if (IS_ERR(page)) {
349 mutex_unlock_op(sbi, NODE_NEW);
350 return PTR_ERR(page);
351 }
352
353 base_addr = page_address(page);
354 header = XATTR_HDR(base_addr);
355 }
356
357 if (le32_to_cpu(header->h_magic) != F2FS_XATTR_MAGIC) {
358 error = -EIO;
359 goto cleanup;
360 }
361
362 /* find entry with wanted name. */
363 found = 0;
364 list_for_each_xattr(here, base_addr) {
365 if (here->e_name_index != name_index)
366 continue;
367 if (here->e_name_len != name_len)
368 continue;
369 if (!memcmp(here->e_name, name, name_len)) {
370 found = 1;
371 break;
372 }
373 }
374
/* continue from 'here' until 'last' points at the terminating entry */
375 last = here;
376
377 while (!IS_XATTR_LAST_ENTRY(last))
378 last = XATTR_NEXT_ENTRY(last);
379
380 newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) +
381 name_len + value_len);
382
383 /* 1. Check space */
384 if (value) {
385 /* If value is NULL, it is remove operation.
386 * In case of update operation, we calculate free.
387 */
388 free = MIN_OFFSET - ((char *)last - (char *)header);
389 if (found)
390 free = free - ENTRY_SIZE(here);
391
392 if (free < newsize) {
393 error = -ENOSPC;
394 goto cleanup;
395 }
396 }
397
398 /* 2. Remove old entry */
399 if (found) {
400 /* If entry is found, remove old entry.
401 * If not found, remove operation is not needed.
402 */
403 struct f2fs_xattr_entry *next = XATTR_NEXT_ENTRY(here);
404 int oldsize = ENTRY_SIZE(here);
405
/* close the gap left by 'here', then zero the freed tail */
406 memmove(here, next, (char *)last - (char *)next);
407 last = (struct f2fs_xattr_entry *)((char *)last - oldsize);
408 memset(last, 0, oldsize);
409 }
410
411 /* 3. Write new entry */
412 if (value) {
413 /* Before we come here, old entry is removed.
414 * We just write new entry. */
415 memset(last, 0, newsize);
416 last->e_name_index = name_index;
417 last->e_name_len = name_len;
418 memcpy(last->e_name, name, name_len);
/* the value is stored directly behind the name */
419 pval = last->e_name + name_len;
420 memcpy(pval, value, value_len);
421 last->e_value_size = cpu_to_le16(value_len);
422 }
423
424 set_page_dirty(page);
425 f2fs_put_page(page, 1);
426
/* apply a mode change left pending in i_acl_mode (FI_ACL_MODE flag) */
427 if (is_inode_flag_set(fi, FI_ACL_MODE)) {
428 inode->i_mode = fi->i_acl_mode;
429 inode->i_ctime = CURRENT_TIME;
430 clear_inode_flag(fi, FI_ACL_MODE);
431 }
432 f2fs_write_inode(inode, NULL);
433 mutex_unlock_op(sbi, NODE_NEW);
434
435 return 0;
436cleanup:
437 f2fs_put_page(page, 1);
438 mutex_unlock_op(sbi, NODE_NEW);
439 return error;
440}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
new file mode 100644
index 000000000000..49c9558305e3
--- /dev/null
+++ b/fs/f2fs/xattr.h
@@ -0,0 +1,145 @@
1/*
2 * fs/f2fs/xattr.h
3 *
4 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5 * http://www.samsung.com/
6 *
7 * Portions of this code from linux/fs/ext2/xattr.h
8 *
9 * On-disk format of extended attributes for the ext2 filesystem.
10 *
11 * (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License version 2 as
15 * published by the Free Software Foundation.
16 */
17#ifndef __F2FS_XATTR_H__
18#define __F2FS_XATTR_H__
19
20#include <linux/init.h>
21#include <linux/xattr.h>
22
23/* Magic value in attribute blocks */
24#define F2FS_XATTR_MAGIC 0xF2F52011
25
26/* Maximum number of references to one attribute block */
27#define F2FS_XATTR_REFCOUNT_MAX 1024
28
29/* Name indexes */
30#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise"
31#define F2FS_XATTR_INDEX_USER 1
32#define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2
33#define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
34#define F2FS_XATTR_INDEX_TRUSTED 4
35#define F2FS_XATTR_INDEX_LUSTRE 5
36#define F2FS_XATTR_INDEX_SECURITY 6
37#define F2FS_XATTR_INDEX_ADVISE 7
38
39struct f2fs_xattr_header {
40 __le32 h_magic; /* magic number for identification */
41 __le32 h_refcount; /* reference count */
42 __u32 h_reserved[4]; /* zero right now */
43};
44
45struct f2fs_xattr_entry {
46 __u8 e_name_index;
47 __u8 e_name_len;
48 __le16 e_value_size; /* size of attribute value */
49 char e_name[0]; /* attribute name */
50};
51
/* Reinterpret a raw address as the xattr header / as an xattr entry. */
#define XATTR_HDR(ptr)		((struct f2fs_xattr_header *)(ptr))
#define XATTR_ENTRY(ptr)	((struct f2fs_xattr_entry *)(ptr))
/* The first entry starts immediately after the header. */
#define XATTR_FIRST_ENTRY(ptr)	(XATTR_ENTRY(XATTR_HDR(ptr)+1))
/* Alignment mask: sizes are rounded up to a multiple of XATTR_ROUND + 1. */
#define XATTR_ROUND		(3)

/*
 * Round @size up to the next multiple of 4.  The argument is parenthesised
 * so that expressions using low-precedence operators (|, ?:, ...) are
 * rounded as a whole instead of being split by the "+".
 */
#define XATTR_ALIGN(size)	(((size) + XATTR_ROUND) & ~XATTR_ROUND)

/* Aligned size in bytes of @entry: fixed fields + name + value. */
#define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \
			(entry)->e_name_len + \
			le16_to_cpu((entry)->e_value_size)))

/* Address of the entry that follows @entry. */
#define XATTR_NEXT_ENTRY(entry)	((struct f2fs_xattr_entry *)((char *)(entry) +\
			ENTRY_SIZE(entry)))

/* The entry list is terminated by a 32-bit word of zero. */
#define IS_XATTR_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)

/* Walk every entry in the xattr area at @addr; @entry is the cursor. */
#define list_for_each_xattr(entry, addr) \
		for ((entry) = XATTR_FIRST_ENTRY(addr);\
				!IS_XATTR_LAST_ENTRY(entry);\
				(entry) = XATTR_NEXT_ENTRY(entry))


/*
 * Offset at which the xattr area ends and the node footer begins (see the
 * layout diagram in this header).  One extra __u32 is held back; presumably
 * this keeps room for the zero terminator tested by IS_XATTR_LAST_ENTRY()
 * -- TODO confirm.
 */
#define MIN_OFFSET	XATTR_ALIGN(PAGE_SIZE - \
			sizeof(struct node_footer) - \
			sizeof(__u32))

/* Space left for name + value once the header and one entry are counted. */
#define MAX_VALUE_LEN	(MIN_OFFSET - sizeof(struct f2fs_xattr_header) - \
			sizeof(struct f2fs_xattr_entry))
79
80/*
81 * On-disk structure of f2fs_xattr
82 * We use only 1 block for xattr.
83 *
84 * +--------------------+
85 * | f2fs_xattr_header |
86 * | |
87 * +--------------------+
88 * | f2fs_xattr_entry |
89 * | .e_name_index = 1 |
90 * | .e_name_len = 3 |
91 * | .e_value_size = 14 |
92 * | .e_name = "foo" |
93 * | "value_of_xattr" |<- value_offs = e_name + e_name_len
94 * +--------------------+
95 * | f2fs_xattr_entry |
96 * | .e_name_index = 4 |
97 * | .e_name = "bar" |
98 * +--------------------+
99 * | |
100 * | Free |
101 * | |
102 * +--------------------+<- MIN_OFFSET
103 * | node_footer |
104 * | (nid, ino, offset) |
105 * +--------------------+
106 *
107 **/
108
#ifdef CONFIG_F2FS_FS_XATTR
/* xattr handlers, one per attribute namespace */
extern const struct xattr_handler f2fs_xattr_user_handler;
extern const struct xattr_handler f2fs_xattr_trusted_handler;
extern const struct xattr_handler f2fs_xattr_acl_access_handler;
extern const struct xattr_handler f2fs_xattr_acl_default_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;

/* table of the handlers above */
extern const struct xattr_handler *f2fs_xattr_handlers[];

/* set/get/list entry points */
extern int f2fs_setxattr(struct inode *inode, int name_index,
		const char *name, const void *value, size_t value_len);
extern int f2fs_getxattr(struct inode *inode, int name_index,
		const char *name, void *buffer, size_t buffer_size);
extern ssize_t f2fs_listxattr(struct dentry *dentry,
		char *buffer, size_t buffer_size);

#else /* !CONFIG_F2FS_FS_XATTR */

/* Without xattr support there is no handler table ... */
#define f2fs_xattr_handlers	NULL

/* ... and every entry point reports -EOPNOTSUPP. */
static inline int f2fs_setxattr(struct inode *inode, int name_index,
		const char *name, const void *value, size_t value_len)
{
	return -EOPNOTSUPP;
}

static inline int f2fs_getxattr(struct inode *inode, int name_index,
		const char *name, void *buffer, size_t buffer_size)
{
	return -EOPNOTSUPP;
}

static inline ssize_t f2fs_listxattr(struct dentry *dentry,
		char *buffer, size_t buffer_size)
{
	return -EOPNOTSUPP;
}
#endif /* CONFIG_F2FS_FS_XATTR */
144
145#endif /* __F2FS_XATTR_H__ */
diff --git a/fs/fhandle.c b/fs/fhandle.c
index cccdc874bb55..999ff5c3cab0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -52,7 +52,7 @@ static long do_sys_name_to_handle(struct path *path,
52 handle_bytes = handle_dwords * sizeof(u32); 52 handle_bytes = handle_dwords * sizeof(u32);
53 handle->handle_bytes = handle_bytes; 53 handle->handle_bytes = handle_bytes;
54 if ((handle->handle_bytes > f_handle.handle_bytes) || 54 if ((handle->handle_bytes > f_handle.handle_bytes) ||
55 (retval == 255) || (retval == -ENOSPC)) { 55 (retval == FILEID_INVALID) || (retval == -ENOSPC)) {
56 /* As per old exportfs_encode_fh documentation 56 /* As per old exportfs_encode_fh documentation
57 * we could return ENOSPC to indicate overflow 57 * we could return ENOSPC to indicate overflow
58 * But file system returned 255 always. So handle 58 * But file system returned 255 always. So handle
diff --git a/fs/file_table.c b/fs/file_table.c
index a72bf9ddd0d2..de9e9653d611 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -458,8 +458,8 @@ void mark_files_ro(struct super_block *sb)
458 spin_unlock(&f->f_lock); 458 spin_unlock(&f->f_lock);
459 if (file_check_writeable(f) != 0) 459 if (file_check_writeable(f) != 0)
460 continue; 460 continue;
461 __mnt_drop_write(f->f_path.mnt);
461 file_release_write(f); 462 file_release_write(f);
462 mnt_drop_write_file(f);
463 } while_file_list_for_each_entry; 463 } while_file_list_for_each_entry;
464 lg_global_unlock(&files_lglock); 464 lg_global_unlock(&files_lglock);
465} 465}
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index 6a3c48abd677..b52aed1dca97 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -314,10 +314,10 @@ EXPORT_SYMBOL(fscache_add_cache);
314 */ 314 */
315void fscache_io_error(struct fscache_cache *cache) 315void fscache_io_error(struct fscache_cache *cache)
316{ 316{
317 set_bit(FSCACHE_IOERROR, &cache->flags); 317 if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags))
318 318 printk(KERN_ERR "FS-Cache:"
319 printk(KERN_ERR "FS-Cache: Cache %s stopped due to I/O error\n", 319 " Cache '%s' stopped due to I/O error\n",
320 cache->ops->name); 320 cache->ops->name);
321} 321}
322EXPORT_SYMBOL(fscache_io_error); 322EXPORT_SYMBOL(fscache_io_error);
323 323
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 990535071a8a..8dcb114758e3 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -370,6 +370,66 @@ cant_attach_object:
370} 370}
371 371
372/* 372/*
373 * Invalidate an object. Callable with spinlocks held.
374 */
375void __fscache_invalidate(struct fscache_cookie *cookie)
376{
377 struct fscache_object *object;
378
379 _enter("{%s}", cookie->def->name);
380
381 fscache_stat(&fscache_n_invalidates);
382
383 /* Only permit invalidation of data files. Invalidating an index will
384 * require the caller to release all its attachments to the tree rooted
385 * there, and if it's doing that, it may as well just retire the
386 * cookie.
387 */
388 ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE);
389
390 /* We will be updating the cookie too. */
391 BUG_ON(!cookie->def->get_aux);
392
393 /* If there's an object, we tell the object state machine to handle the
394 * invalidation on our behalf, otherwise there's nothing to do.
395 */
396 if (!hlist_empty(&cookie->backing_objects)) {
397 spin_lock(&cookie->lock);
398
399 if (!hlist_empty(&cookie->backing_objects) &&
400 !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
401 &cookie->flags)) {
402 object = hlist_entry(cookie->backing_objects.first,
403 struct fscache_object,
404 cookie_link);
405 if (object->state < FSCACHE_OBJECT_DYING)
406 fscache_raise_event(
407 object, FSCACHE_OBJECT_EV_INVALIDATE);
408 }
409
410 spin_unlock(&cookie->lock);
411 }
412
413 _leave("");
414}
415EXPORT_SYMBOL(__fscache_invalidate);
416
417/*
418 * Wait for object invalidation to complete.
419 */
420void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
421{
422 _enter("%p", cookie);
423
424 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
425 fscache_wait_bit_interruptible,
426 TASK_UNINTERRUPTIBLE);
427
428 _leave("");
429}
430EXPORT_SYMBOL(__fscache_wait_on_invalidate);
431
432/*
373 * update the index entries backing a cookie 433 * update the index entries backing a cookie
374 */ 434 */
375void __fscache_update_cookie(struct fscache_cookie *cookie) 435void __fscache_update_cookie(struct fscache_cookie *cookie)
@@ -442,16 +502,34 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
442 502
443 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE; 503 event = retire ? FSCACHE_OBJECT_EV_RETIRE : FSCACHE_OBJECT_EV_RELEASE;
444 504
505try_again:
445 spin_lock(&cookie->lock); 506 spin_lock(&cookie->lock);
446 507
447 /* break links with all the active objects */ 508 /* break links with all the active objects */
448 while (!hlist_empty(&cookie->backing_objects)) { 509 while (!hlist_empty(&cookie->backing_objects)) {
510 int n_reads;
449 object = hlist_entry(cookie->backing_objects.first, 511 object = hlist_entry(cookie->backing_objects.first,
450 struct fscache_object, 512 struct fscache_object,
451 cookie_link); 513 cookie_link);
452 514
453 _debug("RELEASE OBJ%x", object->debug_id); 515 _debug("RELEASE OBJ%x", object->debug_id);
454 516
517 set_bit(FSCACHE_COOKIE_WAITING_ON_READS, &cookie->flags);
518 n_reads = atomic_read(&object->n_reads);
519 if (n_reads) {
520 int n_ops = object->n_ops;
521 int n_in_progress = object->n_in_progress;
522 spin_unlock(&cookie->lock);
523 printk(KERN_ERR "FS-Cache:"
524 " Cookie '%s' still has %d outstanding reads (%d,%d)\n",
525 cookie->def->name,
526 n_reads, n_ops, n_in_progress);
527 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_WAITING_ON_READS,
528 fscache_wait_bit, TASK_UNINTERRUPTIBLE);
529 printk("Wait finished\n");
530 goto try_again;
531 }
532
455 /* detach each cache object from the object cookie */ 533 /* detach each cache object from the object cookie */
456 spin_lock(&object->lock); 534 spin_lock(&object->lock);
457 hlist_del_init(&object->cookie_link); 535 hlist_del_init(&object->cookie_link);
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index f6aad48d38a8..ee38fef4be51 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -121,12 +121,19 @@ extern int fscache_submit_exclusive_op(struct fscache_object *,
121 struct fscache_operation *); 121 struct fscache_operation *);
122extern int fscache_submit_op(struct fscache_object *, 122extern int fscache_submit_op(struct fscache_object *,
123 struct fscache_operation *); 123 struct fscache_operation *);
124extern int fscache_cancel_op(struct fscache_operation *); 124extern int fscache_cancel_op(struct fscache_operation *,
125 void (*)(struct fscache_operation *));
126extern void fscache_cancel_all_ops(struct fscache_object *);
125extern void fscache_abort_object(struct fscache_object *); 127extern void fscache_abort_object(struct fscache_object *);
126extern void fscache_start_operations(struct fscache_object *); 128extern void fscache_start_operations(struct fscache_object *);
127extern void fscache_operation_gc(struct work_struct *); 129extern void fscache_operation_gc(struct work_struct *);
128 130
129/* 131/*
132 * page.c
133 */
134extern void fscache_invalidate_writes(struct fscache_cookie *);
135
136/*
130 * proc.c 137 * proc.c
131 */ 138 */
132#ifdef CONFIG_PROC_FS 139#ifdef CONFIG_PROC_FS
@@ -194,6 +201,7 @@ extern atomic_t fscache_n_store_vmscan_not_storing;
194extern atomic_t fscache_n_store_vmscan_gone; 201extern atomic_t fscache_n_store_vmscan_gone;
195extern atomic_t fscache_n_store_vmscan_busy; 202extern atomic_t fscache_n_store_vmscan_busy;
196extern atomic_t fscache_n_store_vmscan_cancelled; 203extern atomic_t fscache_n_store_vmscan_cancelled;
204extern atomic_t fscache_n_store_vmscan_wait;
197 205
198extern atomic_t fscache_n_marks; 206extern atomic_t fscache_n_marks;
199extern atomic_t fscache_n_uncaches; 207extern atomic_t fscache_n_uncaches;
@@ -205,6 +213,9 @@ extern atomic_t fscache_n_acquires_ok;
205extern atomic_t fscache_n_acquires_nobufs; 213extern atomic_t fscache_n_acquires_nobufs;
206extern atomic_t fscache_n_acquires_oom; 214extern atomic_t fscache_n_acquires_oom;
207 215
216extern atomic_t fscache_n_invalidates;
217extern atomic_t fscache_n_invalidates_run;
218
208extern atomic_t fscache_n_updates; 219extern atomic_t fscache_n_updates;
209extern atomic_t fscache_n_updates_null; 220extern atomic_t fscache_n_updates_null;
210extern atomic_t fscache_n_updates_run; 221extern atomic_t fscache_n_updates_run;
@@ -237,6 +248,7 @@ extern atomic_t fscache_n_cop_alloc_object;
237extern atomic_t fscache_n_cop_lookup_object; 248extern atomic_t fscache_n_cop_lookup_object;
238extern atomic_t fscache_n_cop_lookup_complete; 249extern atomic_t fscache_n_cop_lookup_complete;
239extern atomic_t fscache_n_cop_grab_object; 250extern atomic_t fscache_n_cop_grab_object;
251extern atomic_t fscache_n_cop_invalidate_object;
240extern atomic_t fscache_n_cop_update_object; 252extern atomic_t fscache_n_cop_update_object;
241extern atomic_t fscache_n_cop_drop_object; 253extern atomic_t fscache_n_cop_drop_object;
242extern atomic_t fscache_n_cop_put_object; 254extern atomic_t fscache_n_cop_put_object;
@@ -278,6 +290,7 @@ extern const struct file_operations fscache_stats_fops;
278static inline void fscache_raise_event(struct fscache_object *object, 290static inline void fscache_raise_event(struct fscache_object *object,
279 unsigned event) 291 unsigned event)
280{ 292{
293 BUG_ON(event >= NR_FSCACHE_OBJECT_EVENTS);
281 if (!test_and_set_bit(event, &object->events) && 294 if (!test_and_set_bit(event, &object->events) &&
282 test_bit(event, &object->event_mask)) 295 test_bit(event, &object->event_mask))
283 fscache_enqueue_object(object); 296 fscache_enqueue_object(object);
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index ebe29c581380..f27c89d17885 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -245,7 +245,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
245 obj->n_in_progress, 245 obj->n_in_progress,
246 obj->n_exclusive, 246 obj->n_exclusive,
247 atomic_read(&obj->n_reads), 247 atomic_read(&obj->n_reads),
248 obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, 248 obj->event_mask,
249 obj->events, 249 obj->events,
250 obj->flags, 250 obj->flags,
251 work_busy(&obj->work)); 251 work_busy(&obj->work));
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index b6b897c550ac..50d41c180211 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -14,6 +14,7 @@
14 14
15#define FSCACHE_DEBUG_LEVEL COOKIE 15#define FSCACHE_DEBUG_LEVEL COOKIE
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/slab.h>
17#include "internal.h" 18#include "internal.h"
18 19
19const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { 20const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
@@ -22,6 +23,7 @@ const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = {
22 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING", 23 [FSCACHE_OBJECT_CREATING] = "OBJECT_CREATING",
23 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE", 24 [FSCACHE_OBJECT_AVAILABLE] = "OBJECT_AVAILABLE",
24 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE", 25 [FSCACHE_OBJECT_ACTIVE] = "OBJECT_ACTIVE",
26 [FSCACHE_OBJECT_INVALIDATING] = "OBJECT_INVALIDATING",
25 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING", 27 [FSCACHE_OBJECT_UPDATING] = "OBJECT_UPDATING",
26 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING", 28 [FSCACHE_OBJECT_DYING] = "OBJECT_DYING",
27 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING", 29 [FSCACHE_OBJECT_LC_DYING] = "OBJECT_LC_DYING",
@@ -39,6 +41,7 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = {
39 [FSCACHE_OBJECT_CREATING] = "CRTN", 41 [FSCACHE_OBJECT_CREATING] = "CRTN",
40 [FSCACHE_OBJECT_AVAILABLE] = "AVBL", 42 [FSCACHE_OBJECT_AVAILABLE] = "AVBL",
41 [FSCACHE_OBJECT_ACTIVE] = "ACTV", 43 [FSCACHE_OBJECT_ACTIVE] = "ACTV",
44 [FSCACHE_OBJECT_INVALIDATING] = "INVL",
42 [FSCACHE_OBJECT_UPDATING] = "UPDT", 45 [FSCACHE_OBJECT_UPDATING] = "UPDT",
43 [FSCACHE_OBJECT_DYING] = "DYNG", 46 [FSCACHE_OBJECT_DYING] = "DYNG",
44 [FSCACHE_OBJECT_LC_DYING] = "LCDY", 47 [FSCACHE_OBJECT_LC_DYING] = "LCDY",
@@ -54,6 +57,7 @@ static void fscache_put_object(struct fscache_object *);
54static void fscache_initialise_object(struct fscache_object *); 57static void fscache_initialise_object(struct fscache_object *);
55static void fscache_lookup_object(struct fscache_object *); 58static void fscache_lookup_object(struct fscache_object *);
56static void fscache_object_available(struct fscache_object *); 59static void fscache_object_available(struct fscache_object *);
60static void fscache_invalidate_object(struct fscache_object *);
57static void fscache_release_object(struct fscache_object *); 61static void fscache_release_object(struct fscache_object *);
58static void fscache_withdraw_object(struct fscache_object *); 62static void fscache_withdraw_object(struct fscache_object *);
59static void fscache_enqueue_dependents(struct fscache_object *); 63static void fscache_enqueue_dependents(struct fscache_object *);
@@ -79,6 +83,15 @@ static inline void fscache_done_parent_op(struct fscache_object *object)
79} 83}
80 84
81/* 85/*
86 * Notify netfs of invalidation completion.
87 */
88static inline void fscache_invalidation_complete(struct fscache_cookie *cookie)
89{
90 if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
91 wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
92}
93
94/*
82 * process events that have been sent to an object's state machine 95 * process events that have been sent to an object's state machine
83 * - initiates parent lookup 96 * - initiates parent lookup
84 * - does object lookup 97 * - does object lookup
@@ -90,6 +103,7 @@ static void fscache_object_state_machine(struct fscache_object *object)
90{ 103{
91 enum fscache_object_state new_state; 104 enum fscache_object_state new_state;
92 struct fscache_cookie *cookie; 105 struct fscache_cookie *cookie;
106 int event;
93 107
94 ASSERT(object != NULL); 108 ASSERT(object != NULL);
95 109
@@ -101,7 +115,8 @@ static void fscache_object_state_machine(struct fscache_object *object)
101 /* wait for the parent object to become ready */ 115 /* wait for the parent object to become ready */
102 case FSCACHE_OBJECT_INIT: 116 case FSCACHE_OBJECT_INIT:
103 object->event_mask = 117 object->event_mask =
104 ULONG_MAX & ~(1 << FSCACHE_OBJECT_EV_CLEARED); 118 FSCACHE_OBJECT_EVENTS_MASK &
119 ~(1 << FSCACHE_OBJECT_EV_CLEARED);
105 fscache_initialise_object(object); 120 fscache_initialise_object(object);
106 goto done; 121 goto done;
107 122
@@ -125,6 +140,16 @@ static void fscache_object_state_machine(struct fscache_object *object)
125 case FSCACHE_OBJECT_ACTIVE: 140 case FSCACHE_OBJECT_ACTIVE:
126 goto active_transit; 141 goto active_transit;
127 142
143 /* Invalidate an object on disk */
144 case FSCACHE_OBJECT_INVALIDATING:
145 clear_bit(FSCACHE_OBJECT_EV_INVALIDATE, &object->events);
146 fscache_stat(&fscache_n_invalidates_run);
147 fscache_stat(&fscache_n_cop_invalidate_object);
148 fscache_invalidate_object(object);
149 fscache_stat_d(&fscache_n_cop_invalidate_object);
150 fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
151 goto active_transit;
152
128 /* update the object metadata on disk */ 153 /* update the object metadata on disk */
129 case FSCACHE_OBJECT_UPDATING: 154 case FSCACHE_OBJECT_UPDATING:
130 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events); 155 clear_bit(FSCACHE_OBJECT_EV_UPDATE, &object->events);
@@ -251,13 +276,17 @@ static void fscache_object_state_machine(struct fscache_object *object)
251 276
252 /* determine the transition from a lookup state */ 277 /* determine the transition from a lookup state */
253lookup_transit: 278lookup_transit:
254 switch (fls(object->events & object->event_mask) - 1) { 279 event = fls(object->events & object->event_mask) - 1;
280 switch (event) {
255 case FSCACHE_OBJECT_EV_WITHDRAW: 281 case FSCACHE_OBJECT_EV_WITHDRAW:
256 case FSCACHE_OBJECT_EV_RETIRE: 282 case FSCACHE_OBJECT_EV_RETIRE:
257 case FSCACHE_OBJECT_EV_RELEASE: 283 case FSCACHE_OBJECT_EV_RELEASE:
258 case FSCACHE_OBJECT_EV_ERROR: 284 case FSCACHE_OBJECT_EV_ERROR:
259 new_state = FSCACHE_OBJECT_LC_DYING; 285 new_state = FSCACHE_OBJECT_LC_DYING;
260 goto change_state; 286 goto change_state;
287 case FSCACHE_OBJECT_EV_INVALIDATE:
288 new_state = FSCACHE_OBJECT_INVALIDATING;
289 goto change_state;
261 case FSCACHE_OBJECT_EV_REQUEUE: 290 case FSCACHE_OBJECT_EV_REQUEUE:
262 goto done; 291 goto done;
263 case -1: 292 case -1:
@@ -268,13 +297,17 @@ lookup_transit:
268 297
269 /* determine the transition from an active state */ 298 /* determine the transition from an active state */
270active_transit: 299active_transit:
271 switch (fls(object->events & object->event_mask) - 1) { 300 event = fls(object->events & object->event_mask) - 1;
301 switch (event) {
272 case FSCACHE_OBJECT_EV_WITHDRAW: 302 case FSCACHE_OBJECT_EV_WITHDRAW:
273 case FSCACHE_OBJECT_EV_RETIRE: 303 case FSCACHE_OBJECT_EV_RETIRE:
274 case FSCACHE_OBJECT_EV_RELEASE: 304 case FSCACHE_OBJECT_EV_RELEASE:
275 case FSCACHE_OBJECT_EV_ERROR: 305 case FSCACHE_OBJECT_EV_ERROR:
276 new_state = FSCACHE_OBJECT_DYING; 306 new_state = FSCACHE_OBJECT_DYING;
277 goto change_state; 307 goto change_state;
308 case FSCACHE_OBJECT_EV_INVALIDATE:
309 new_state = FSCACHE_OBJECT_INVALIDATING;
310 goto change_state;
278 case FSCACHE_OBJECT_EV_UPDATE: 311 case FSCACHE_OBJECT_EV_UPDATE:
279 new_state = FSCACHE_OBJECT_UPDATING; 312 new_state = FSCACHE_OBJECT_UPDATING;
280 goto change_state; 313 goto change_state;
@@ -287,7 +320,8 @@ active_transit:
287 320
288 /* determine the transition from a terminal state */ 321 /* determine the transition from a terminal state */
289terminal_transit: 322terminal_transit:
290 switch (fls(object->events & object->event_mask) - 1) { 323 event = fls(object->events & object->event_mask) - 1;
324 switch (event) {
291 case FSCACHE_OBJECT_EV_WITHDRAW: 325 case FSCACHE_OBJECT_EV_WITHDRAW:
292 new_state = FSCACHE_OBJECT_WITHDRAWING; 326 new_state = FSCACHE_OBJECT_WITHDRAWING;
293 goto change_state; 327 goto change_state;
@@ -320,8 +354,8 @@ done:
320 354
321unsupported_event: 355unsupported_event:
322 printk(KERN_ERR "FS-Cache:" 356 printk(KERN_ERR "FS-Cache:"
323 " Unsupported event %lx [mask %lx] in state %s\n", 357 " Unsupported event %d [%lx/%lx] in state %s\n",
324 object->events, object->event_mask, 358 event, object->events, object->event_mask,
325 fscache_object_states[object->state]); 359 fscache_object_states[object->state]);
326 BUG(); 360 BUG();
327} 361}
@@ -587,8 +621,6 @@ static void fscache_object_available(struct fscache_object *object)
587 if (object->n_in_progress == 0) { 621 if (object->n_in_progress == 0) {
588 if (object->n_ops > 0) { 622 if (object->n_ops > 0) {
589 ASSERTCMP(object->n_ops, >=, object->n_obj_ops); 623 ASSERTCMP(object->n_ops, >=, object->n_obj_ops);
590 ASSERTIF(object->n_ops > object->n_obj_ops,
591 !list_empty(&object->pending_ops));
592 fscache_start_operations(object); 624 fscache_start_operations(object);
593 } else { 625 } else {
594 ASSERT(list_empty(&object->pending_ops)); 626 ASSERT(list_empty(&object->pending_ops));
@@ -681,6 +713,7 @@ static void fscache_withdraw_object(struct fscache_object *object)
681 if (object->cookie == cookie) { 713 if (object->cookie == cookie) {
682 hlist_del_init(&object->cookie_link); 714 hlist_del_init(&object->cookie_link);
683 object->cookie = NULL; 715 object->cookie = NULL;
716 fscache_invalidation_complete(cookie);
684 detached = true; 717 detached = true;
685 } 718 }
686 spin_unlock(&cookie->lock); 719 spin_unlock(&cookie->lock);
@@ -890,3 +923,55 @@ enum fscache_checkaux fscache_check_aux(struct fscache_object *object,
890 return result; 923 return result;
891} 924}
892EXPORT_SYMBOL(fscache_check_aux); 925EXPORT_SYMBOL(fscache_check_aux);
926
927/*
928 * Asynchronously invalidate an object.
929 */
930static void fscache_invalidate_object(struct fscache_object *object)
931{
932 struct fscache_operation *op;
933 struct fscache_cookie *cookie = object->cookie;
934
935 _enter("{OBJ%x}", object->debug_id);
936
937 /* Reject any new read/write ops and abort any that are pending. */
938 fscache_invalidate_writes(cookie);
939 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
940 fscache_cancel_all_ops(object);
941
942 /* Now we have to wait for in-progress reads and writes */
943 op = kzalloc(sizeof(*op), GFP_KERNEL);
944 if (!op) {
945 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
946 _leave(" [ENOMEM]");
947 return;
948 }
949
950 fscache_operation_init(op, object->cache->ops->invalidate_object, NULL);
951 op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
952
953 spin_lock(&cookie->lock);
954 if (fscache_submit_exclusive_op(object, op) < 0)
955 goto submit_op_failed;
956 spin_unlock(&cookie->lock);
957 fscache_put_operation(op);
958
959 /* Once we've completed the invalidation, we know there will be no data
960 * stored in the cache and thus we can reinstate the data-check-skip
961 * optimisation.
962 */
963 set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
964
965 /* We can allow read and write requests to come in once again. They'll
966 * queue up behind our exclusive invalidation operation.
967 */
968 fscache_invalidation_complete(cookie);
969 _leave("");
970 return;
971
972submit_op_failed:
973 spin_unlock(&cookie->lock);
974 kfree(op);
975 fscache_raise_event(object, FSCACHE_OBJECT_EV_ERROR);
976 _leave(" [EIO]");
977}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 30afdfa7aec7..762a9ec4ffa4 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -37,6 +37,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
37 ASSERT(op->processor != NULL); 37 ASSERT(op->processor != NULL);
38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); 38 ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE);
39 ASSERTCMP(atomic_read(&op->usage), >, 0); 39 ASSERTCMP(atomic_read(&op->usage), >, 0);
40 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
40 41
41 fscache_stat(&fscache_n_op_enqueue); 42 fscache_stat(&fscache_n_op_enqueue);
42 switch (op->flags & FSCACHE_OP_TYPE) { 43 switch (op->flags & FSCACHE_OP_TYPE) {
@@ -64,6 +65,9 @@ EXPORT_SYMBOL(fscache_enqueue_operation);
64static void fscache_run_op(struct fscache_object *object, 65static void fscache_run_op(struct fscache_object *object,
65 struct fscache_operation *op) 66 struct fscache_operation *op)
66{ 67{
68 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
69
70 op->state = FSCACHE_OP_ST_IN_PROGRESS;
67 object->n_in_progress++; 71 object->n_in_progress++;
68 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) 72 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
69 wake_up_bit(&op->flags, FSCACHE_OP_WAITING); 73 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
@@ -84,18 +88,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
84 88
85 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); 89 _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id);
86 90
91 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
92 ASSERTCMP(atomic_read(&op->usage), >, 0);
93
87 spin_lock(&object->lock); 94 spin_lock(&object->lock);
88 ASSERTCMP(object->n_ops, >=, object->n_in_progress); 95 ASSERTCMP(object->n_ops, >=, object->n_in_progress);
89 ASSERTCMP(object->n_ops, >=, object->n_exclusive); 96 ASSERTCMP(object->n_ops, >=, object->n_exclusive);
90 ASSERT(list_empty(&op->pend_link)); 97 ASSERT(list_empty(&op->pend_link));
91 98
92 ret = -ENOBUFS; 99 op->state = FSCACHE_OP_ST_PENDING;
93 if (fscache_object_is_active(object)) { 100 if (fscache_object_is_active(object)) {
94 op->object = object; 101 op->object = object;
95 object->n_ops++; 102 object->n_ops++;
96 object->n_exclusive++; /* reads and writes must wait */ 103 object->n_exclusive++; /* reads and writes must wait */
97 104
98 if (object->n_ops > 1) { 105 if (object->n_in_progress > 0) {
99 atomic_inc(&op->usage); 106 atomic_inc(&op->usage);
100 list_add_tail(&op->pend_link, &object->pending_ops); 107 list_add_tail(&op->pend_link, &object->pending_ops);
101 fscache_stat(&fscache_n_op_pend); 108 fscache_stat(&fscache_n_op_pend);
@@ -121,8 +128,11 @@ int fscache_submit_exclusive_op(struct fscache_object *object,
121 fscache_stat(&fscache_n_op_pend); 128 fscache_stat(&fscache_n_op_pend);
122 ret = 0; 129 ret = 0;
123 } else { 130 } else {
124 /* not allowed to submit ops in any other state */ 131 /* If we're in any other state, there must have been an I/O
125 BUG(); 132 * error of some nature.
133 */
134 ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags));
135 ret = -EIO;
126 } 136 }
127 137
128 spin_unlock(&object->lock); 138 spin_unlock(&object->lock);
@@ -186,6 +196,7 @@ int fscache_submit_op(struct fscache_object *object,
186 _enter("{OBJ%x OP%x},{%u}", 196 _enter("{OBJ%x OP%x},{%u}",
187 object->debug_id, op->debug_id, atomic_read(&op->usage)); 197 object->debug_id, op->debug_id, atomic_read(&op->usage));
188 198
199 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_INITIALISED);
189 ASSERTCMP(atomic_read(&op->usage), >, 0); 200 ASSERTCMP(atomic_read(&op->usage), >, 0);
190 201
191 spin_lock(&object->lock); 202 spin_lock(&object->lock);
@@ -196,6 +207,7 @@ int fscache_submit_op(struct fscache_object *object,
196 ostate = object->state; 207 ostate = object->state;
197 smp_rmb(); 208 smp_rmb();
198 209
210 op->state = FSCACHE_OP_ST_PENDING;
199 if (fscache_object_is_active(object)) { 211 if (fscache_object_is_active(object)) {
200 op->object = object; 212 op->object = object;
201 object->n_ops++; 213 object->n_ops++;
@@ -225,12 +237,15 @@ int fscache_submit_op(struct fscache_object *object,
225 object->state == FSCACHE_OBJECT_LC_DYING || 237 object->state == FSCACHE_OBJECT_LC_DYING ||
226 object->state == FSCACHE_OBJECT_WITHDRAWING) { 238 object->state == FSCACHE_OBJECT_WITHDRAWING) {
227 fscache_stat(&fscache_n_op_rejected); 239 fscache_stat(&fscache_n_op_rejected);
240 op->state = FSCACHE_OP_ST_CANCELLED;
228 ret = -ENOBUFS; 241 ret = -ENOBUFS;
229 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { 242 } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) {
230 fscache_report_unexpected_submission(object, op, ostate); 243 fscache_report_unexpected_submission(object, op, ostate);
231 ASSERT(!fscache_object_is_active(object)); 244 ASSERT(!fscache_object_is_active(object));
245 op->state = FSCACHE_OP_ST_CANCELLED;
232 ret = -ENOBUFS; 246 ret = -ENOBUFS;
233 } else { 247 } else {
248 op->state = FSCACHE_OP_ST_CANCELLED;
234 ret = -ENOBUFS; 249 ret = -ENOBUFS;
235 } 250 }
236 251
@@ -283,20 +298,28 @@ void fscache_start_operations(struct fscache_object *object)
283/* 298/*
284 * cancel an operation that's pending on an object 299 * cancel an operation that's pending on an object
285 */ 300 */
286int fscache_cancel_op(struct fscache_operation *op) 301int fscache_cancel_op(struct fscache_operation *op,
302 void (*do_cancel)(struct fscache_operation *))
287{ 303{
288 struct fscache_object *object = op->object; 304 struct fscache_object *object = op->object;
289 int ret; 305 int ret;
290 306
291 _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); 307 _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id);
292 308
309 ASSERTCMP(op->state, >=, FSCACHE_OP_ST_PENDING);
310 ASSERTCMP(op->state, !=, FSCACHE_OP_ST_CANCELLED);
311 ASSERTCMP(atomic_read(&op->usage), >, 0);
312
293 spin_lock(&object->lock); 313 spin_lock(&object->lock);
294 314
295 ret = -EBUSY; 315 ret = -EBUSY;
296 if (!list_empty(&op->pend_link)) { 316 if (op->state == FSCACHE_OP_ST_PENDING) {
317 ASSERT(!list_empty(&op->pend_link));
297 fscache_stat(&fscache_n_op_cancelled); 318 fscache_stat(&fscache_n_op_cancelled);
298 list_del_init(&op->pend_link); 319 list_del_init(&op->pend_link);
299 object->n_ops--; 320 if (do_cancel)
321 do_cancel(op);
322 op->state = FSCACHE_OP_ST_CANCELLED;
300 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) 323 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
301 object->n_exclusive--; 324 object->n_exclusive--;
302 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) 325 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
@@ -311,6 +334,70 @@ int fscache_cancel_op(struct fscache_operation *op)
311} 334}
312 335
313/* 336/*
337 * Cancel all pending operations on an object
338 */
339void fscache_cancel_all_ops(struct fscache_object *object)
340{
341 struct fscache_operation *op;
342
343 _enter("OBJ%x", object->debug_id);
344
345 spin_lock(&object->lock);
346
347 while (!list_empty(&object->pending_ops)) {
348 op = list_entry(object->pending_ops.next,
349 struct fscache_operation, pend_link);
350 fscache_stat(&fscache_n_op_cancelled);
351 list_del_init(&op->pend_link);
352
353 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING);
354 op->state = FSCACHE_OP_ST_CANCELLED;
355
356 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
357 object->n_exclusive--;
358 if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags))
359 wake_up_bit(&op->flags, FSCACHE_OP_WAITING);
360 fscache_put_operation(op);
361 cond_resched_lock(&object->lock);
362 }
363
364 spin_unlock(&object->lock);
365 _leave("");
366}
367
368/*
369 * Record the completion or cancellation of an in-progress operation.
370 */
371void fscache_op_complete(struct fscache_operation *op, bool cancelled)
372{
373 struct fscache_object *object = op->object;
374
375 _enter("OBJ%x", object->debug_id);
376
377 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_IN_PROGRESS);
378 ASSERTCMP(object->n_in_progress, >, 0);
379 ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
380 object->n_exclusive, >, 0);
381 ASSERTIFCMP(test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags),
382 object->n_in_progress, ==, 1);
383
384 spin_lock(&object->lock);
385
386 op->state = cancelled ?
387 FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE;
388
389 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags))
390 object->n_exclusive--;
391 object->n_in_progress--;
392 if (object->n_in_progress == 0)
393 fscache_start_operations(object);
394
395 spin_unlock(&object->lock);
396 _leave("");
397}
398EXPORT_SYMBOL(fscache_op_complete);
399
400/*
314 * release an operation 401 * release an operation
315 * - queues pending ops if this is the last in-progress op 402 * - queues pending ops if this is the last in-progress op
316 */ 403 */
@@ -328,8 +415,9 @@ void fscache_put_operation(struct fscache_operation *op)
328 return; 415 return;
329 416
330 _debug("PUT OP"); 417 _debug("PUT OP");
331 if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) 418 ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE,
332 BUG(); 419 op->state, ==, FSCACHE_OP_ST_CANCELLED);
420 op->state = FSCACHE_OP_ST_DEAD;
333 421
334 fscache_stat(&fscache_n_op_release); 422 fscache_stat(&fscache_n_op_release);
335 423
@@ -340,8 +428,14 @@ void fscache_put_operation(struct fscache_operation *op)
340 428
341 object = op->object; 429 object = op->object;
342 430
343 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) 431 if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) {
344 atomic_dec(&object->n_reads); 432 if (atomic_dec_and_test(&object->n_reads)) {
433 clear_bit(FSCACHE_COOKIE_WAITING_ON_READS,
434 &object->cookie->flags);
435 wake_up_bit(&object->cookie->flags,
436 FSCACHE_COOKIE_WAITING_ON_READS);
437 }
438 }
345 439
346 /* now... we may get called with the object spinlock held, so we 440 /* now... we may get called with the object spinlock held, so we
347 * complete the cleanup here only if we can immediately acquire the 441 * complete the cleanup here only if we can immediately acquire the
@@ -359,16 +453,6 @@ void fscache_put_operation(struct fscache_operation *op)
359 return; 453 return;
360 } 454 }
361 455
362 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
363 ASSERTCMP(object->n_exclusive, >, 0);
364 object->n_exclusive--;
365 }
366
367 ASSERTCMP(object->n_in_progress, >, 0);
368 object->n_in_progress--;
369 if (object->n_in_progress == 0)
370 fscache_start_operations(object);
371
372 ASSERTCMP(object->n_ops, >, 0); 456 ASSERTCMP(object->n_ops, >, 0);
373 object->n_ops--; 457 object->n_ops--;
374 if (object->n_ops == 0) 458 if (object->n_ops == 0)
@@ -407,23 +491,14 @@ void fscache_operation_gc(struct work_struct *work)
407 spin_unlock(&cache->op_gc_list_lock); 491 spin_unlock(&cache->op_gc_list_lock);
408 492
409 object = op->object; 493 object = op->object;
494 spin_lock(&object->lock);
410 495
411 _debug("GC DEFERRED REL OBJ%x OP%x", 496 _debug("GC DEFERRED REL OBJ%x OP%x",
412 object->debug_id, op->debug_id); 497 object->debug_id, op->debug_id);
413 fscache_stat(&fscache_n_op_gc); 498 fscache_stat(&fscache_n_op_gc);
414 499
415 ASSERTCMP(atomic_read(&op->usage), ==, 0); 500 ASSERTCMP(atomic_read(&op->usage), ==, 0);
416 501 ASSERTCMP(op->state, ==, FSCACHE_OP_ST_DEAD);
417 spin_lock(&object->lock);
418 if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) {
419 ASSERTCMP(object->n_exclusive, >, 0);
420 object->n_exclusive--;
421 }
422
423 ASSERTCMP(object->n_in_progress, >, 0);
424 object->n_in_progress--;
425 if (object->n_in_progress == 0)
426 fscache_start_operations(object);
427 502
428 ASSERTCMP(object->n_ops, >, 0); 503 ASSERTCMP(object->n_ops, >, 0);
429 object->n_ops--; 504 object->n_ops--;
@@ -431,6 +506,7 @@ void fscache_operation_gc(struct work_struct *work)
431 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); 506 fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED);
432 507
433 spin_unlock(&object->lock); 508 spin_unlock(&object->lock);
509 kfree(op);
434 510
435 } while (count++ < 20); 511 } while (count++ < 20);
436 512
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 3f7a59bfa7ad..ff000e52072d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -56,6 +56,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
56 56
57 _enter("%p,%p,%x", cookie, page, gfp); 57 _enter("%p,%p,%x", cookie, page, gfp);
58 58
59try_again:
59 rcu_read_lock(); 60 rcu_read_lock();
60 val = radix_tree_lookup(&cookie->stores, page->index); 61 val = radix_tree_lookup(&cookie->stores, page->index);
61 if (!val) { 62 if (!val) {
@@ -104,11 +105,19 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
104 return true; 105 return true;
105 106
106page_busy: 107page_busy:
107 /* we might want to wait here, but that could deadlock the allocator as 108 /* We will wait here if we're allowed to, but that could deadlock the
108 * the work threads writing to the cache may all end up sleeping 109 * allocator as the work threads writing to the cache may all end up
109 * on memory allocation */ 110 * sleeping on memory allocation, so we may need to impose a timeout
110 fscache_stat(&fscache_n_store_vmscan_busy); 111 * too. */
111 return false; 112 if (!(gfp & __GFP_WAIT)) {
113 fscache_stat(&fscache_n_store_vmscan_busy);
114 return false;
115 }
116
117 fscache_stat(&fscache_n_store_vmscan_wait);
118 __fscache_wait_on_page_write(cookie, page);
119 gfp &= ~__GFP_WAIT;
120 goto try_again;
112} 121}
113EXPORT_SYMBOL(__fscache_maybe_release_page); 122EXPORT_SYMBOL(__fscache_maybe_release_page);
114 123
@@ -162,6 +171,7 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
162 fscache_abort_object(object); 171 fscache_abort_object(object);
163 } 172 }
164 173
174 fscache_op_complete(op, true);
165 _leave(""); 175 _leave("");
166} 176}
167 177
@@ -223,6 +233,8 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op)
223 233
224 _enter("{OP%x}", op->op.debug_id); 234 _enter("{OP%x}", op->op.debug_id);
225 235
236 ASSERTCMP(op->n_pages, ==, 0);
237
226 fscache_hist(fscache_retrieval_histogram, op->start_time); 238 fscache_hist(fscache_retrieval_histogram, op->start_time);
227 if (op->context) 239 if (op->context)
228 fscache_put_context(op->op.object->cookie, op->context); 240 fscache_put_context(op->op.object->cookie, op->context);
@@ -291,6 +303,17 @@ static int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
291} 303}
292 304
293/* 305/*
306 * Handle cancellation of a pending retrieval op
307 */
308static void fscache_do_cancel_retrieval(struct fscache_operation *_op)
309{
310 struct fscache_retrieval *op =
311 container_of(_op, struct fscache_retrieval, op);
312
313 op->n_pages = 0;
314}
315
316/*
294 * wait for an object to become active (or dead) 317 * wait for an object to become active (or dead)
295 */ 318 */
296static int fscache_wait_for_retrieval_activation(struct fscache_object *object, 319static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
@@ -307,8 +330,8 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
307 fscache_stat(stat_op_waits); 330 fscache_stat(stat_op_waits);
308 if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING, 331 if (wait_on_bit(&op->op.flags, FSCACHE_OP_WAITING,
309 fscache_wait_bit_interruptible, 332 fscache_wait_bit_interruptible,
310 TASK_INTERRUPTIBLE) < 0) { 333 TASK_INTERRUPTIBLE) != 0) {
311 ret = fscache_cancel_op(&op->op); 334 ret = fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
312 if (ret == 0) 335 if (ret == 0)
313 return -ERESTARTSYS; 336 return -ERESTARTSYS;
314 337
@@ -320,7 +343,14 @@ static int fscache_wait_for_retrieval_activation(struct fscache_object *object,
320 _debug("<<< GO"); 343 _debug("<<< GO");
321 344
322check_if_dead: 345check_if_dead:
346 if (op->op.state == FSCACHE_OP_ST_CANCELLED) {
347 fscache_stat(stat_object_dead);
348 _leave(" = -ENOBUFS [cancelled]");
349 return -ENOBUFS;
350 }
323 if (unlikely(fscache_object_is_dead(object))) { 351 if (unlikely(fscache_object_is_dead(object))) {
352 pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->op.state);
353 fscache_cancel_op(&op->op, fscache_do_cancel_retrieval);
324 fscache_stat(stat_object_dead); 354 fscache_stat(stat_object_dead);
325 return -ENOBUFS; 355 return -ENOBUFS;
326 } 356 }
@@ -353,6 +383,11 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
353 if (hlist_empty(&cookie->backing_objects)) 383 if (hlist_empty(&cookie->backing_objects))
354 goto nobufs; 384 goto nobufs;
355 385
386 if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
387 _leave(" = -ENOBUFS [invalidating]");
388 return -ENOBUFS;
389 }
390
356 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); 391 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
357 ASSERTCMP(page, !=, NULL); 392 ASSERTCMP(page, !=, NULL);
358 393
@@ -364,6 +399,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
364 _leave(" = -ENOMEM"); 399 _leave(" = -ENOMEM");
365 return -ENOMEM; 400 return -ENOMEM;
366 } 401 }
402 op->n_pages = 1;
367 403
368 spin_lock(&cookie->lock); 404 spin_lock(&cookie->lock);
369 405
@@ -375,10 +411,10 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
375 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP); 411 ASSERTCMP(object->state, >, FSCACHE_OBJECT_LOOKING_UP);
376 412
377 atomic_inc(&object->n_reads); 413 atomic_inc(&object->n_reads);
378 set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); 414 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
379 415
380 if (fscache_submit_op(object, &op->op) < 0) 416 if (fscache_submit_op(object, &op->op) < 0)
381 goto nobufs_unlock; 417 goto nobufs_unlock_dec;
382 spin_unlock(&cookie->lock); 418 spin_unlock(&cookie->lock);
383 419
384 fscache_stat(&fscache_n_retrieval_ops); 420 fscache_stat(&fscache_n_retrieval_ops);
@@ -425,6 +461,8 @@ error:
425 _leave(" = %d", ret); 461 _leave(" = %d", ret);
426 return ret; 462 return ret;
427 463
464nobufs_unlock_dec:
465 atomic_dec(&object->n_reads);
428nobufs_unlock: 466nobufs_unlock:
429 spin_unlock(&cookie->lock); 467 spin_unlock(&cookie->lock);
430 kfree(op); 468 kfree(op);
@@ -472,6 +510,11 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
472 if (hlist_empty(&cookie->backing_objects)) 510 if (hlist_empty(&cookie->backing_objects))
473 goto nobufs; 511 goto nobufs;
474 512
513 if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
514 _leave(" = -ENOBUFS [invalidating]");
515 return -ENOBUFS;
516 }
517
475 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); 518 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
476 ASSERTCMP(*nr_pages, >, 0); 519 ASSERTCMP(*nr_pages, >, 0);
477 ASSERT(!list_empty(pages)); 520 ASSERT(!list_empty(pages));
@@ -482,6 +525,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
482 op = fscache_alloc_retrieval(mapping, end_io_func, context); 525 op = fscache_alloc_retrieval(mapping, end_io_func, context);
483 if (!op) 526 if (!op)
484 return -ENOMEM; 527 return -ENOMEM;
528 op->n_pages = *nr_pages;
485 529
486 spin_lock(&cookie->lock); 530 spin_lock(&cookie->lock);
487 531
@@ -491,10 +535,10 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
491 struct fscache_object, cookie_link); 535 struct fscache_object, cookie_link);
492 536
493 atomic_inc(&object->n_reads); 537 atomic_inc(&object->n_reads);
494 set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags); 538 __set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
495 539
496 if (fscache_submit_op(object, &op->op) < 0) 540 if (fscache_submit_op(object, &op->op) < 0)
497 goto nobufs_unlock; 541 goto nobufs_unlock_dec;
498 spin_unlock(&cookie->lock); 542 spin_unlock(&cookie->lock);
499 543
500 fscache_stat(&fscache_n_retrieval_ops); 544 fscache_stat(&fscache_n_retrieval_ops);
@@ -541,6 +585,8 @@ error:
541 _leave(" = %d", ret); 585 _leave(" = %d", ret);
542 return ret; 586 return ret;
543 587
588nobufs_unlock_dec:
589 atomic_dec(&object->n_reads);
544nobufs_unlock: 590nobufs_unlock:
545 spin_unlock(&cookie->lock); 591 spin_unlock(&cookie->lock);
546 kfree(op); 592 kfree(op);
@@ -577,12 +623,18 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
577 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX); 623 ASSERTCMP(cookie->def->type, !=, FSCACHE_COOKIE_TYPE_INDEX);
578 ASSERTCMP(page, !=, NULL); 624 ASSERTCMP(page, !=, NULL);
579 625
626 if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
627 _leave(" = -ENOBUFS [invalidating]");
628 return -ENOBUFS;
629 }
630
580 if (fscache_wait_for_deferred_lookup(cookie) < 0) 631 if (fscache_wait_for_deferred_lookup(cookie) < 0)
581 return -ERESTARTSYS; 632 return -ERESTARTSYS;
582 633
583 op = fscache_alloc_retrieval(page->mapping, NULL, NULL); 634 op = fscache_alloc_retrieval(page->mapping, NULL, NULL);
584 if (!op) 635 if (!op)
585 return -ENOMEM; 636 return -ENOMEM;
637 op->n_pages = 1;
586 638
587 spin_lock(&cookie->lock); 639 spin_lock(&cookie->lock);
588 640
@@ -658,9 +710,27 @@ static void fscache_write_op(struct fscache_operation *_op)
658 spin_lock(&object->lock); 710 spin_lock(&object->lock);
659 cookie = object->cookie; 711 cookie = object->cookie;
660 712
661 if (!fscache_object_is_active(object) || !cookie) { 713 if (!fscache_object_is_active(object)) {
714 /* If we get here, then the on-disk cache object likely no longer
715 * exists, so we should just cancel this write operation.
716 */
717 spin_unlock(&object->lock);
718 fscache_op_complete(&op->op, false);
719 _leave(" [inactive]");
720 return;
721 }
722
723 if (!cookie) {
724 /* If we get here, then the cookie belonging to the object was
725 * detached, probably by the cookie being withdrawn due to
726 * memory pressure, which means that the pages we might write
727 * to the cache from no longer exist - therefore, we can just
728 * cancel this write operation.
729 */
662 spin_unlock(&object->lock); 730 spin_unlock(&object->lock);
663 _leave(""); 731 fscache_op_complete(&op->op, false);
732 _leave(" [cancel] op{f=%lx s=%u} obj{s=%u f=%lx}",
733 _op->flags, _op->state, object->state, object->flags);
664 return; 734 return;
665 } 735 }
666 736
@@ -696,6 +766,7 @@ static void fscache_write_op(struct fscache_operation *_op)
696 fscache_end_page_write(object, page); 766 fscache_end_page_write(object, page);
697 if (ret < 0) { 767 if (ret < 0) {
698 fscache_abort_object(object); 768 fscache_abort_object(object);
769 fscache_op_complete(&op->op, true);
699 } else { 770 } else {
700 fscache_enqueue_operation(&op->op); 771 fscache_enqueue_operation(&op->op);
701 } 772 }
@@ -710,6 +781,38 @@ superseded:
710 spin_unlock(&cookie->stores_lock); 781 spin_unlock(&cookie->stores_lock);
711 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); 782 clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
712 spin_unlock(&object->lock); 783 spin_unlock(&object->lock);
784 fscache_op_complete(&op->op, true);
785 _leave("");
786}
787
788/*
789 * Clear the pages pending writing for invalidation
790 */
791void fscache_invalidate_writes(struct fscache_cookie *cookie)
792{
793 struct page *page;
794 void *results[16];
795 int n, i;
796
797 _enter("");
798
799 while (spin_lock(&cookie->stores_lock),
800 n = radix_tree_gang_lookup_tag(&cookie->stores, results, 0,
801 ARRAY_SIZE(results),
802 FSCACHE_COOKIE_PENDING_TAG),
803 n > 0) {
804 for (i = n - 1; i >= 0; i--) {
805 page = results[i];
806 radix_tree_delete(&cookie->stores, page->index);
807 }
808
809 spin_unlock(&cookie->stores_lock);
810
811 for (i = n - 1; i >= 0; i--)
812 page_cache_release(results[i]);
813 }
814
815 spin_unlock(&cookie->stores_lock);
713 _leave(""); 816 _leave("");
714} 817}
715 818
@@ -759,7 +862,12 @@ int __fscache_write_page(struct fscache_cookie *cookie,
759 862
760 fscache_stat(&fscache_n_stores); 863 fscache_stat(&fscache_n_stores);
761 864
762 op = kzalloc(sizeof(*op), GFP_NOIO); 865 if (test_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) {
866 _leave(" = -ENOBUFS [invalidating]");
867 return -ENOBUFS;
868 }
869
870 op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY);
763 if (!op) 871 if (!op)
764 goto nomem; 872 goto nomem;
765 873
@@ -915,6 +1023,40 @@ done:
915EXPORT_SYMBOL(__fscache_uncache_page); 1023EXPORT_SYMBOL(__fscache_uncache_page);
916 1024
917/** 1025/**
1026 * fscache_mark_page_cached - Mark a page as being cached
1027 * @op: The retrieval op pages are being marked for
1028 * @page: The page to be marked
1029 *
1030 * Mark a netfs page as being cached. After this is called, the netfs
1031 * must call fscache_uncache_page() to remove the mark.
1032 */
1033void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
1034{
1035 struct fscache_cookie *cookie = op->op.object->cookie;
1036
1037#ifdef CONFIG_FSCACHE_STATS
1038 atomic_inc(&fscache_n_marks);
1039#endif
1040
1041 _debug("- mark %p{%lx}", page, page->index);
1042 if (TestSetPageFsCache(page)) {
1043 static bool once_only;
1044 if (!once_only) {
1045 once_only = true;
1046 printk(KERN_WARNING "FS-Cache:"
1047 " Cookie type %s marked page %lx"
1048 " multiple times\n",
1049 cookie->def->name, page->index);
1050 }
1051 }
1052
1053 if (cookie->def->mark_page_cached)
1054 cookie->def->mark_page_cached(cookie->netfs_data,
1055 op->mapping, page);
1056}
1057EXPORT_SYMBOL(fscache_mark_page_cached);
1058
1059/**
918 * fscache_mark_pages_cached - Mark pages as being cached 1060 * fscache_mark_pages_cached - Mark pages as being cached
919 * @op: The retrieval op pages are being marked for 1061 * @op: The retrieval op pages are being marked for
920 * @pagevec: The pages to be marked 1062 * @pagevec: The pages to be marked
@@ -925,32 +1067,11 @@ EXPORT_SYMBOL(__fscache_uncache_page);
925void fscache_mark_pages_cached(struct fscache_retrieval *op, 1067void fscache_mark_pages_cached(struct fscache_retrieval *op,
926 struct pagevec *pagevec) 1068 struct pagevec *pagevec)
927{ 1069{
928 struct fscache_cookie *cookie = op->op.object->cookie;
929 unsigned long loop; 1070 unsigned long loop;
930 1071
931#ifdef CONFIG_FSCACHE_STATS 1072 for (loop = 0; loop < pagevec->nr; loop++)
932 atomic_add(pagevec->nr, &fscache_n_marks); 1073 fscache_mark_page_cached(op, pagevec->pages[loop]);
933#endif
934
935 for (loop = 0; loop < pagevec->nr; loop++) {
936 struct page *page = pagevec->pages[loop];
937
938 _debug("- mark %p{%lx}", page, page->index);
939 if (TestSetPageFsCache(page)) {
940 static bool once_only;
941 if (!once_only) {
942 once_only = true;
943 printk(KERN_WARNING "FS-Cache:"
944 " Cookie type %s marked page %lx"
945 " multiple times\n",
946 cookie->def->name, page->index);
947 }
948 }
949 }
950 1074
951 if (cookie->def->mark_pages_cached)
952 cookie->def->mark_pages_cached(cookie->netfs_data,
953 op->mapping, pagevec);
954 pagevec_reinit(pagevec); 1075 pagevec_reinit(pagevec);
955} 1076}
956EXPORT_SYMBOL(fscache_mark_pages_cached); 1077EXPORT_SYMBOL(fscache_mark_pages_cached);
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 4765190d537f..8179e8bc4a3d 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -69,6 +69,7 @@ atomic_t fscache_n_store_vmscan_not_storing;
69atomic_t fscache_n_store_vmscan_gone; 69atomic_t fscache_n_store_vmscan_gone;
70atomic_t fscache_n_store_vmscan_busy; 70atomic_t fscache_n_store_vmscan_busy;
71atomic_t fscache_n_store_vmscan_cancelled; 71atomic_t fscache_n_store_vmscan_cancelled;
72atomic_t fscache_n_store_vmscan_wait;
72 73
73atomic_t fscache_n_marks; 74atomic_t fscache_n_marks;
74atomic_t fscache_n_uncaches; 75atomic_t fscache_n_uncaches;
@@ -80,6 +81,9 @@ atomic_t fscache_n_acquires_ok;
80atomic_t fscache_n_acquires_nobufs; 81atomic_t fscache_n_acquires_nobufs;
81atomic_t fscache_n_acquires_oom; 82atomic_t fscache_n_acquires_oom;
82 83
84atomic_t fscache_n_invalidates;
85atomic_t fscache_n_invalidates_run;
86
83atomic_t fscache_n_updates; 87atomic_t fscache_n_updates;
84atomic_t fscache_n_updates_null; 88atomic_t fscache_n_updates_null;
85atomic_t fscache_n_updates_run; 89atomic_t fscache_n_updates_run;
@@ -112,6 +116,7 @@ atomic_t fscache_n_cop_alloc_object;
112atomic_t fscache_n_cop_lookup_object; 116atomic_t fscache_n_cop_lookup_object;
113atomic_t fscache_n_cop_lookup_complete; 117atomic_t fscache_n_cop_lookup_complete;
114atomic_t fscache_n_cop_grab_object; 118atomic_t fscache_n_cop_grab_object;
119atomic_t fscache_n_cop_invalidate_object;
115atomic_t fscache_n_cop_update_object; 120atomic_t fscache_n_cop_update_object;
116atomic_t fscache_n_cop_drop_object; 121atomic_t fscache_n_cop_drop_object;
117atomic_t fscache_n_cop_put_object; 122atomic_t fscache_n_cop_put_object;
@@ -168,6 +173,10 @@ static int fscache_stats_show(struct seq_file *m, void *v)
168 atomic_read(&fscache_n_object_created), 173 atomic_read(&fscache_n_object_created),
169 atomic_read(&fscache_n_object_lookups_timed_out)); 174 atomic_read(&fscache_n_object_lookups_timed_out));
170 175
176 seq_printf(m, "Invals : n=%u run=%u\n",
177 atomic_read(&fscache_n_invalidates),
178 atomic_read(&fscache_n_invalidates_run));
179
171 seq_printf(m, "Updates: n=%u nul=%u run=%u\n", 180 seq_printf(m, "Updates: n=%u nul=%u run=%u\n",
172 atomic_read(&fscache_n_updates), 181 atomic_read(&fscache_n_updates),
173 atomic_read(&fscache_n_updates_null), 182 atomic_read(&fscache_n_updates_null),
@@ -224,11 +233,12 @@ static int fscache_stats_show(struct seq_file *m, void *v)
224 atomic_read(&fscache_n_store_radix_deletes), 233 atomic_read(&fscache_n_store_radix_deletes),
225 atomic_read(&fscache_n_store_pages_over_limit)); 234 atomic_read(&fscache_n_store_pages_over_limit));
226 235
227 seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u\n", 236 seq_printf(m, "VmScan : nos=%u gon=%u bsy=%u can=%u wt=%u\n",
228 atomic_read(&fscache_n_store_vmscan_not_storing), 237 atomic_read(&fscache_n_store_vmscan_not_storing),
229 atomic_read(&fscache_n_store_vmscan_gone), 238 atomic_read(&fscache_n_store_vmscan_gone),
230 atomic_read(&fscache_n_store_vmscan_busy), 239 atomic_read(&fscache_n_store_vmscan_busy),
231 atomic_read(&fscache_n_store_vmscan_cancelled)); 240 atomic_read(&fscache_n_store_vmscan_cancelled),
241 atomic_read(&fscache_n_store_vmscan_wait));
232 242
233 seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n", 243 seq_printf(m, "Ops : pend=%u run=%u enq=%u can=%u rej=%u\n",
234 atomic_read(&fscache_n_op_pend), 244 atomic_read(&fscache_n_op_pend),
@@ -246,7 +256,8 @@ static int fscache_stats_show(struct seq_file *m, void *v)
246 atomic_read(&fscache_n_cop_lookup_object), 256 atomic_read(&fscache_n_cop_lookup_object),
247 atomic_read(&fscache_n_cop_lookup_complete), 257 atomic_read(&fscache_n_cop_lookup_complete),
248 atomic_read(&fscache_n_cop_grab_object)); 258 atomic_read(&fscache_n_cop_grab_object));
249 seq_printf(m, "CacheOp: upo=%d dro=%d pto=%d atc=%d syn=%d\n", 259 seq_printf(m, "CacheOp: inv=%d upo=%d dro=%d pto=%d atc=%d syn=%d\n",
260 atomic_read(&fscache_n_cop_invalidate_object),
250 atomic_read(&fscache_n_cop_update_object), 261 atomic_read(&fscache_n_cop_update_object),
251 atomic_read(&fscache_n_cop_drop_object), 262 atomic_read(&fscache_n_cop_drop_object),
252 atomic_read(&fscache_n_cop_put_object), 263 atomic_read(&fscache_n_cop_put_object),
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 0b35903219bc..d47f11658c17 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -35,6 +35,16 @@ static int hfs_readpage(struct file *file, struct page *page)
35 return block_read_full_page(page, hfs_get_block); 35 return block_read_full_page(page, hfs_get_block);
36} 36}
37 37
38static void hfs_write_failed(struct address_space *mapping, loff_t to)
39{
40 struct inode *inode = mapping->host;
41
42 if (to > inode->i_size) {
43 truncate_pagecache(inode, to, inode->i_size);
44 hfs_file_truncate(inode);
45 }
46}
47
38static int hfs_write_begin(struct file *file, struct address_space *mapping, 48static int hfs_write_begin(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned flags, 49 loff_t pos, unsigned len, unsigned flags,
40 struct page **pagep, void **fsdata) 50 struct page **pagep, void **fsdata)
@@ -45,11 +55,8 @@ static int hfs_write_begin(struct file *file, struct address_space *mapping,
45 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 55 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
46 hfs_get_block, 56 hfs_get_block,
47 &HFS_I(mapping->host)->phys_size); 57 &HFS_I(mapping->host)->phys_size);
48 if (unlikely(ret)) { 58 if (unlikely(ret))
49 loff_t isize = mapping->host->i_size; 59 hfs_write_failed(mapping, pos + len);
50 if (pos + len > isize)
51 vmtruncate(mapping->host, isize);
52 }
53 60
54 return ret; 61 return ret;
55} 62}
@@ -120,6 +127,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
120 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 127 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
121{ 128{
122 struct file *file = iocb->ki_filp; 129 struct file *file = iocb->ki_filp;
130 struct address_space *mapping = file->f_mapping;
123 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 131 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
124 ssize_t ret; 132 ssize_t ret;
125 133
@@ -135,7 +143,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
135 loff_t end = offset + iov_length(iov, nr_segs); 143 loff_t end = offset + iov_length(iov, nr_segs);
136 144
137 if (end > isize) 145 if (end > isize)
138 vmtruncate(inode, isize); 146 hfs_write_failed(mapping, end);
139 } 147 }
140 148
141 return ret; 149 return ret;
@@ -617,9 +625,12 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr)
617 attr->ia_size != i_size_read(inode)) { 625 attr->ia_size != i_size_read(inode)) {
618 inode_dio_wait(inode); 626 inode_dio_wait(inode);
619 627
620 error = vmtruncate(inode, attr->ia_size); 628 error = inode_newsize_ok(inode, attr->ia_size);
621 if (error) 629 if (error)
622 return error; 630 return error;
631
632 truncate_setsize(inode, attr->ia_size);
633 hfs_file_truncate(inode);
623 } 634 }
624 635
625 setattr_copy(inode, attr); 636 setattr_copy(inode, attr);
@@ -668,7 +679,6 @@ static const struct file_operations hfs_file_operations = {
668 679
669static const struct inode_operations hfs_file_inode_operations = { 680static const struct inode_operations hfs_file_inode_operations = {
670 .lookup = hfs_file_lookup, 681 .lookup = hfs_file_lookup,
671 .truncate = hfs_file_truncate,
672 .setattr = hfs_inode_setattr, 682 .setattr = hfs_inode_setattr,
673 .setxattr = hfs_setxattr, 683 .setxattr = hfs_setxattr,
674 .getxattr = hfs_getxattr, 684 .getxattr = hfs_getxattr,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 2172aa5976f5..799b336b59f9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -28,6 +28,16 @@ static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
28 return block_write_full_page(page, hfsplus_get_block, wbc); 28 return block_write_full_page(page, hfsplus_get_block, wbc);
29} 29}
30 30
31static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
32{
33 struct inode *inode = mapping->host;
34
35 if (to > inode->i_size) {
36 truncate_pagecache(inode, to, inode->i_size);
37 hfsplus_file_truncate(inode);
38 }
39}
40
31static int hfsplus_write_begin(struct file *file, struct address_space *mapping, 41static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
32 loff_t pos, unsigned len, unsigned flags, 42 loff_t pos, unsigned len, unsigned flags,
33 struct page **pagep, void **fsdata) 43 struct page **pagep, void **fsdata)
@@ -38,11 +48,8 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
38 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 48 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
39 hfsplus_get_block, 49 hfsplus_get_block,
40 &HFSPLUS_I(mapping->host)->phys_size); 50 &HFSPLUS_I(mapping->host)->phys_size);
41 if (unlikely(ret)) { 51 if (unlikely(ret))
42 loff_t isize = mapping->host->i_size; 52 hfsplus_write_failed(mapping, pos + len);
43 if (pos + len > isize)
44 vmtruncate(mapping->host, isize);
45 }
46 53
47 return ret; 54 return ret;
48} 55}
@@ -116,6 +123,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
116 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 123 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
117{ 124{
118 struct file *file = iocb->ki_filp; 125 struct file *file = iocb->ki_filp;
126 struct address_space *mapping = file->f_mapping;
119 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 127 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
120 ssize_t ret; 128 ssize_t ret;
121 129
@@ -131,7 +139,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
131 loff_t end = offset + iov_length(iov, nr_segs); 139 loff_t end = offset + iov_length(iov, nr_segs);
132 140
133 if (end > isize) 141 if (end > isize)
134 vmtruncate(inode, isize); 142 hfsplus_write_failed(mapping, end);
135 } 143 }
136 144
137 return ret; 145 return ret;
@@ -300,10 +308,8 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
300 if ((attr->ia_valid & ATTR_SIZE) && 308 if ((attr->ia_valid & ATTR_SIZE) &&
301 attr->ia_size != i_size_read(inode)) { 309 attr->ia_size != i_size_read(inode)) {
302 inode_dio_wait(inode); 310 inode_dio_wait(inode);
303 311 truncate_setsize(inode, attr->ia_size);
304 error = vmtruncate(inode, attr->ia_size); 312 hfsplus_file_truncate(inode);
305 if (error)
306 return error;
307 } 313 }
308 314
309 setattr_copy(inode, attr); 315 setattr_copy(inode, attr);
@@ -358,7 +364,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
358 364
359static const struct inode_operations hfsplus_file_inode_operations = { 365static const struct inode_operations hfsplus_file_inode_operations = {
360 .lookup = hfsplus_file_lookup, 366 .lookup = hfsplus_file_lookup,
361 .truncate = hfsplus_file_truncate,
362 .setattr = hfsplus_setattr, 367 .setattr = hfsplus_setattr,
363 .setxattr = hfsplus_setxattr, 368 .setxattr = hfsplus_setxattr,
364 .getxattr = hfsplus_getxattr, 369 .getxattr = hfsplus_getxattr,
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 89d2a5803ae3..fbfe2df5624b 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -50,7 +50,7 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
50 return disk_secno; 50 return disk_secno;
51} 51}
52 52
53static void hpfs_truncate(struct inode *i) 53void hpfs_truncate(struct inode *i)
54{ 54{
55 if (IS_IMMUTABLE(i)) return /*-EPERM*/; 55 if (IS_IMMUTABLE(i)) return /*-EPERM*/;
56 hpfs_lock_assert(i->i_sb); 56 hpfs_lock_assert(i->i_sb);
@@ -105,6 +105,16 @@ static int hpfs_readpage(struct file *file, struct page *page)
105 return block_read_full_page(page,hpfs_get_block); 105 return block_read_full_page(page,hpfs_get_block);
106} 106}
107 107
108static void hpfs_write_failed(struct address_space *mapping, loff_t to)
109{
110 struct inode *inode = mapping->host;
111
112 if (to > inode->i_size) {
113 truncate_pagecache(inode, to, inode->i_size);
114 hpfs_truncate(inode);
115 }
116}
117
108static int hpfs_write_begin(struct file *file, struct address_space *mapping, 118static int hpfs_write_begin(struct file *file, struct address_space *mapping,
109 loff_t pos, unsigned len, unsigned flags, 119 loff_t pos, unsigned len, unsigned flags,
110 struct page **pagep, void **fsdata) 120 struct page **pagep, void **fsdata)
@@ -115,11 +125,8 @@ static int hpfs_write_begin(struct file *file, struct address_space *mapping,
115 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 125 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
116 hpfs_get_block, 126 hpfs_get_block,
117 &hpfs_i(mapping->host)->mmu_private); 127 &hpfs_i(mapping->host)->mmu_private);
118 if (unlikely(ret)) { 128 if (unlikely(ret))
119 loff_t isize = mapping->host->i_size; 129 hpfs_write_failed(mapping, pos + len);
120 if (pos + len > isize)
121 vmtruncate(mapping->host, isize);
122 }
123 130
124 return ret; 131 return ret;
125} 132}
@@ -166,6 +173,5 @@ const struct file_operations hpfs_file_ops =
166 173
167const struct inode_operations hpfs_file_iops = 174const struct inode_operations hpfs_file_iops =
168{ 175{
169 .truncate = hpfs_truncate,
170 .setattr = hpfs_setattr, 176 .setattr = hpfs_setattr,
171}; 177};
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 7102aaecc244..b7ae286646b5 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -252,6 +252,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
252/* file.c */ 252/* file.c */
253 253
254int hpfs_file_fsync(struct file *, loff_t, loff_t, int); 254int hpfs_file_fsync(struct file *, loff_t, loff_t, int);
255void hpfs_truncate(struct inode *);
255extern const struct file_operations hpfs_file_ops; 256extern const struct file_operations hpfs_file_ops;
256extern const struct inode_operations hpfs_file_iops; 257extern const struct inode_operations hpfs_file_iops;
257extern const struct address_space_operations hpfs_aops; 258extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 804a9a842cbc..5dc06c837105 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -277,9 +277,12 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
277 277
278 if ((attr->ia_valid & ATTR_SIZE) && 278 if ((attr->ia_valid & ATTR_SIZE) &&
279 attr->ia_size != i_size_read(inode)) { 279 attr->ia_size != i_size_read(inode)) {
280 error = vmtruncate(inode, attr->ia_size); 280 error = inode_newsize_ok(inode, attr->ia_size);
281 if (error) 281 if (error)
282 goto out_unlock; 282 goto out_unlock;
283
284 truncate_setsize(inode, attr->ia_size);
285 hpfs_truncate(inode);
283 } 286 }
284 287
285 setattr_copy(inode, attr); 288 setattr_copy(inode, attr);
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 9d3afd157f99..dd7442c58358 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -119,9 +119,12 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
119 iattr->ia_size != i_size_read(inode)) { 119 iattr->ia_size != i_size_read(inode)) {
120 inode_dio_wait(inode); 120 inode_dio_wait(inode);
121 121
122 rc = vmtruncate(inode, iattr->ia_size); 122 rc = inode_newsize_ok(inode, iattr->ia_size);
123 if (rc) 123 if (rc)
124 return rc; 124 return rc;
125
126 truncate_setsize(inode, iattr->ia_size);
127 jfs_truncate(inode);
125 } 128 }
126 129
127 setattr_copy(inode, iattr); 130 setattr_copy(inode, iattr);
@@ -133,7 +136,6 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
133} 136}
134 137
135const struct inode_operations jfs_file_inode_operations = { 138const struct inode_operations jfs_file_inode_operations = {
136 .truncate = jfs_truncate,
137 .setxattr = jfs_setxattr, 139 .setxattr = jfs_setxattr,
138 .getxattr = jfs_getxattr, 140 .getxattr = jfs_getxattr,
139 .listxattr = jfs_listxattr, 141 .listxattr = jfs_listxattr,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 4692bf3ca8cb..b7dc47ba675e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -300,6 +300,16 @@ static int jfs_readpages(struct file *file, struct address_space *mapping,
300 return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); 300 return mpage_readpages(mapping, pages, nr_pages, jfs_get_block);
301} 301}
302 302
303static void jfs_write_failed(struct address_space *mapping, loff_t to)
304{
305 struct inode *inode = mapping->host;
306
307 if (to > inode->i_size) {
308 truncate_pagecache(inode, to, inode->i_size);
309 jfs_truncate(inode);
310 }
311}
312
303static int jfs_write_begin(struct file *file, struct address_space *mapping, 313static int jfs_write_begin(struct file *file, struct address_space *mapping,
304 loff_t pos, unsigned len, unsigned flags, 314 loff_t pos, unsigned len, unsigned flags,
305 struct page **pagep, void **fsdata) 315 struct page **pagep, void **fsdata)
@@ -308,11 +318,8 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping,
308 318
309 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, 319 ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
310 jfs_get_block); 320 jfs_get_block);
311 if (unlikely(ret)) { 321 if (unlikely(ret))
312 loff_t isize = mapping->host->i_size; 322 jfs_write_failed(mapping, pos + len);
313 if (pos + len > isize)
314 vmtruncate(mapping->host, isize);
315 }
316 323
317 return ret; 324 return ret;
318} 325}
@@ -326,6 +333,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
326 const struct iovec *iov, loff_t offset, unsigned long nr_segs) 333 const struct iovec *iov, loff_t offset, unsigned long nr_segs)
327{ 334{
328 struct file *file = iocb->ki_filp; 335 struct file *file = iocb->ki_filp;
336 struct address_space *mapping = file->f_mapping;
329 struct inode *inode = file->f_mapping->host; 337 struct inode *inode = file->f_mapping->host;
330 ssize_t ret; 338 ssize_t ret;
331 339
@@ -341,7 +349,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
341 loff_t end = offset + iov_length(iov, nr_segs); 349 loff_t end = offset + iov_length(iov, nr_segs);
342 350
343 if (end > isize) 351 if (end > isize)
344 vmtruncate(inode, isize); 352 jfs_write_failed(mapping, end);
345 } 353 }
346 354
347 return ret; 355 return ret;
diff --git a/fs/libfs.c b/fs/libfs.c
index 35fc6e74cd88..916da8c4158b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -369,8 +369,6 @@ int simple_setattr(struct dentry *dentry, struct iattr *iattr)
369 struct inode *inode = dentry->d_inode; 369 struct inode *inode = dentry->d_inode;
370 int error; 370 int error;
371 371
372 WARN_ON_ONCE(inode->i_op->truncate);
373
374 error = inode_change_ok(inode, iattr); 372 error = inode_change_ok(inode, iattr);
375 if (error) 373 if (error)
376 return error; 374 return error;
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index e1a3b6bf6324..9a59cbade2fb 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1887,9 +1887,15 @@ int logfs_truncate(struct inode *inode, u64 target)
1887 logfs_put_wblocks(sb, NULL, 1); 1887 logfs_put_wblocks(sb, NULL, 1);
1888 } 1888 }
1889 1889
1890 if (!err) 1890 if (!err) {
1891 err = vmtruncate(inode, target); 1891 err = inode_newsize_ok(inode, target);
1892 if (err)
1893 goto out;
1894
1895 truncate_setsize(inode, target);
1896 }
1892 1897
1898 out:
1893 /* I don't trust error recovery yet. */ 1899 /* I don't trust error recovery yet. */
1894 WARN_ON(err); 1900 WARN_ON(err);
1895 return err; 1901 return err;
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 4493ce695ab8..adc6f5494231 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -34,9 +34,12 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
34 34
35 if ((attr->ia_valid & ATTR_SIZE) && 35 if ((attr->ia_valid & ATTR_SIZE) &&
36 attr->ia_size != i_size_read(inode)) { 36 attr->ia_size != i_size_read(inode)) {
37 error = vmtruncate(inode, attr->ia_size); 37 error = inode_newsize_ok(inode, attr->ia_size);
38 if (error) 38 if (error)
39 return error; 39 return error;
40
41 truncate_setsize(inode, attr->ia_size);
42 minix_truncate(inode);
40 } 43 }
41 44
42 setattr_copy(inode, attr); 45 setattr_copy(inode, attr);
@@ -45,7 +48,6 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr)
45} 48}
46 49
47const struct inode_operations minix_file_inode_operations = { 50const struct inode_operations minix_file_inode_operations = {
48 .truncate = minix_truncate,
49 .setattr = minix_setattr, 51 .setattr = minix_setattr,
50 .getattr = minix_getattr, 52 .getattr = minix_getattr,
51}; 53};
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 4fc5f8ab1c44..99541cceb584 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -395,6 +395,16 @@ int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len)
395 return __block_write_begin(page, pos, len, minix_get_block); 395 return __block_write_begin(page, pos, len, minix_get_block);
396} 396}
397 397
398static void minix_write_failed(struct address_space *mapping, loff_t to)
399{
400 struct inode *inode = mapping->host;
401
402 if (to > inode->i_size) {
403 truncate_pagecache(inode, to, inode->i_size);
404 minix_truncate(inode);
405 }
406}
407
398static int minix_write_begin(struct file *file, struct address_space *mapping, 408static int minix_write_begin(struct file *file, struct address_space *mapping,
399 loff_t pos, unsigned len, unsigned flags, 409 loff_t pos, unsigned len, unsigned flags,
400 struct page **pagep, void **fsdata) 410 struct page **pagep, void **fsdata)
@@ -403,11 +413,8 @@ static int minix_write_begin(struct file *file, struct address_space *mapping,
403 413
404 ret = block_write_begin(mapping, pos, len, flags, pagep, 414 ret = block_write_begin(mapping, pos, len, flags, pagep,
405 minix_get_block); 415 minix_get_block);
406 if (unlikely(ret)) { 416 if (unlikely(ret))
407 loff_t isize = mapping->host->i_size; 417 minix_write_failed(mapping, pos + len);
408 if (pos + len > isize)
409 vmtruncate(mapping->host, isize);
410 }
411 418
412 return ret; 419 return ret;
413} 420}
diff --git a/fs/namei.c b/fs/namei.c
index 5f4cdf3ad913..43a97ee1d4c8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1275,9 +1275,7 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1275 *need_lookup = false; 1275 *need_lookup = false;
1276 dentry = d_lookup(dir, name); 1276 dentry = d_lookup(dir, name);
1277 if (dentry) { 1277 if (dentry) {
1278 if (d_need_lookup(dentry)) { 1278 if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1279 *need_lookup = true;
1280 } else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
1281 error = d_revalidate(dentry, flags); 1279 error = d_revalidate(dentry, flags);
1282 if (unlikely(error <= 0)) { 1280 if (unlikely(error <= 0)) {
1283 if (error < 0) { 1281 if (error < 0) {
@@ -1383,8 +1381,6 @@ static int lookup_fast(struct nameidata *nd, struct qstr *name,
1383 return -ECHILD; 1381 return -ECHILD;
1384 nd->seq = seq; 1382 nd->seq = seq;
1385 1383
1386 if (unlikely(d_need_lookup(dentry)))
1387 goto unlazy;
1388 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { 1384 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1389 status = d_revalidate(dentry, nd->flags); 1385 status = d_revalidate(dentry, nd->flags);
1390 if (unlikely(status <= 0)) { 1386 if (unlikely(status <= 0)) {
@@ -1410,11 +1406,6 @@ unlazy:
1410 if (unlikely(!dentry)) 1406 if (unlikely(!dentry))
1411 goto need_lookup; 1407 goto need_lookup;
1412 1408
1413 if (unlikely(d_need_lookup(dentry))) {
1414 dput(dentry);
1415 goto need_lookup;
1416 }
1417
1418 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval) 1409 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1419 status = d_revalidate(dentry, nd->flags); 1410 status = d_revalidate(dentry, nd->flags);
1420 if (unlikely(status <= 0)) { 1411 if (unlikely(status <= 0)) {
@@ -1859,7 +1850,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1859 if (flags & LOOKUP_ROOT) { 1850 if (flags & LOOKUP_ROOT) {
1860 struct inode *inode = nd->root.dentry->d_inode; 1851 struct inode *inode = nd->root.dentry->d_inode;
1861 if (*name) { 1852 if (*name) {
1862 if (!inode->i_op->lookup) 1853 if (!can_lookup(inode))
1863 return -ENOTDIR; 1854 return -ENOTDIR;
1864 retval = inode_permission(inode, MAY_EXEC); 1855 retval = inode_permission(inode, MAY_EXEC);
1865 if (retval) 1856 if (retval)
@@ -1903,6 +1894,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1903 get_fs_pwd(current->fs, &nd->path); 1894 get_fs_pwd(current->fs, &nd->path);
1904 } 1895 }
1905 } else { 1896 } else {
1897 /* Caller must check execute permissions on the starting path component */
1906 struct fd f = fdget_raw(dfd); 1898 struct fd f = fdget_raw(dfd);
1907 struct dentry *dentry; 1899 struct dentry *dentry;
1908 1900
@@ -1912,16 +1904,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1912 dentry = f.file->f_path.dentry; 1904 dentry = f.file->f_path.dentry;
1913 1905
1914 if (*name) { 1906 if (*name) {
1915 if (!S_ISDIR(dentry->d_inode->i_mode)) { 1907 if (!can_lookup(dentry->d_inode)) {
1916 fdput(f); 1908 fdput(f);
1917 return -ENOTDIR; 1909 return -ENOTDIR;
1918 } 1910 }
1919
1920 retval = inode_permission(dentry->d_inode, MAY_EXEC);
1921 if (retval) {
1922 fdput(f);
1923 return retval;
1924 }
1925 } 1911 }
1926 1912
1927 nd->path = f.file->f_path; 1913 nd->path = f.file->f_path;
@@ -2189,15 +2175,19 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
2189 * path-walking is complete. 2175 * path-walking is complete.
2190 */ 2176 */
2191static struct filename * 2177static struct filename *
2192user_path_parent(int dfd, const char __user *path, struct nameidata *nd) 2178user_path_parent(int dfd, const char __user *path, struct nameidata *nd,
2179 unsigned int flags)
2193{ 2180{
2194 struct filename *s = getname(path); 2181 struct filename *s = getname(path);
2195 int error; 2182 int error;
2196 2183
2184 /* only LOOKUP_REVAL is allowed in extra flags */
2185 flags &= LOOKUP_REVAL;
2186
2197 if (IS_ERR(s)) 2187 if (IS_ERR(s))
2198 return s; 2188 return s;
2199 2189
2200 error = filename_lookup(dfd, s, LOOKUP_PARENT, nd); 2190 error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd);
2201 if (error) { 2191 if (error) {
2202 putname(s); 2192 putname(s);
2203 return ERR_PTR(error); 2193 return ERR_PTR(error);
@@ -3044,12 +3034,22 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3044 return file; 3034 return file;
3045} 3035}
3046 3036
3047struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, int is_dir) 3037struct dentry *kern_path_create(int dfd, const char *pathname,
3038 struct path *path, unsigned int lookup_flags)
3048{ 3039{
3049 struct dentry *dentry = ERR_PTR(-EEXIST); 3040 struct dentry *dentry = ERR_PTR(-EEXIST);
3050 struct nameidata nd; 3041 struct nameidata nd;
3051 int err2; 3042 int err2;
3052 int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd); 3043 int error;
3044 bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
3045
3046 /*
3047 * Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
3048 * other flags passed in are ignored!
3049 */
3050 lookup_flags &= LOOKUP_REVAL;
3051
3052 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
3053 if (error) 3053 if (error)
3054 return ERR_PTR(error); 3054 return ERR_PTR(error);
3055 3055
@@ -3113,13 +3113,14 @@ void done_path_create(struct path *path, struct dentry *dentry)
3113} 3113}
3114EXPORT_SYMBOL(done_path_create); 3114EXPORT_SYMBOL(done_path_create);
3115 3115
3116struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, int is_dir) 3116struct dentry *user_path_create(int dfd, const char __user *pathname,
3117 struct path *path, unsigned int lookup_flags)
3117{ 3118{
3118 struct filename *tmp = getname(pathname); 3119 struct filename *tmp = getname(pathname);
3119 struct dentry *res; 3120 struct dentry *res;
3120 if (IS_ERR(tmp)) 3121 if (IS_ERR(tmp))
3121 return ERR_CAST(tmp); 3122 return ERR_CAST(tmp);
3122 res = kern_path_create(dfd, tmp->name, path, is_dir); 3123 res = kern_path_create(dfd, tmp->name, path, lookup_flags);
3123 putname(tmp); 3124 putname(tmp);
3124 return res; 3125 return res;
3125} 3126}
@@ -3175,12 +3176,13 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3175 struct dentry *dentry; 3176 struct dentry *dentry;
3176 struct path path; 3177 struct path path;
3177 int error; 3178 int error;
3179 unsigned int lookup_flags = 0;
3178 3180
3179 error = may_mknod(mode); 3181 error = may_mknod(mode);
3180 if (error) 3182 if (error)
3181 return error; 3183 return error;
3182 3184retry:
3183 dentry = user_path_create(dfd, filename, &path, 0); 3185 dentry = user_path_create(dfd, filename, &path, lookup_flags);
3184 if (IS_ERR(dentry)) 3186 if (IS_ERR(dentry))
3185 return PTR_ERR(dentry); 3187 return PTR_ERR(dentry);
3186 3188
@@ -3203,6 +3205,10 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
3203 } 3205 }
3204out: 3206out:
3205 done_path_create(&path, dentry); 3207 done_path_create(&path, dentry);
3208 if (retry_estale(error, lookup_flags)) {
3209 lookup_flags |= LOOKUP_REVAL;
3210 goto retry;
3211 }
3206 return error; 3212 return error;
3207} 3213}
3208 3214
@@ -3241,8 +3247,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3241 struct dentry *dentry; 3247 struct dentry *dentry;
3242 struct path path; 3248 struct path path;
3243 int error; 3249 int error;
3250 unsigned int lookup_flags = LOOKUP_DIRECTORY;
3244 3251
3245 dentry = user_path_create(dfd, pathname, &path, 1); 3252retry:
3253 dentry = user_path_create(dfd, pathname, &path, lookup_flags);
3246 if (IS_ERR(dentry)) 3254 if (IS_ERR(dentry))
3247 return PTR_ERR(dentry); 3255 return PTR_ERR(dentry);
3248 3256
@@ -3252,6 +3260,10 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
3252 if (!error) 3260 if (!error)
3253 error = vfs_mkdir(path.dentry->d_inode, dentry, mode); 3261 error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
3254 done_path_create(&path, dentry); 3262 done_path_create(&path, dentry);
3263 if (retry_estale(error, lookup_flags)) {
3264 lookup_flags |= LOOKUP_REVAL;
3265 goto retry;
3266 }
3255 return error; 3267 return error;
3256} 3268}
3257 3269
@@ -3327,8 +3339,9 @@ static long do_rmdir(int dfd, const char __user *pathname)
3327 struct filename *name; 3339 struct filename *name;
3328 struct dentry *dentry; 3340 struct dentry *dentry;
3329 struct nameidata nd; 3341 struct nameidata nd;
3330 3342 unsigned int lookup_flags = 0;
3331 name = user_path_parent(dfd, pathname, &nd); 3343retry:
3344 name = user_path_parent(dfd, pathname, &nd, lookup_flags);
3332 if (IS_ERR(name)) 3345 if (IS_ERR(name))
3333 return PTR_ERR(name); 3346 return PTR_ERR(name);
3334 3347
@@ -3370,6 +3383,10 @@ exit2:
3370exit1: 3383exit1:
3371 path_put(&nd.path); 3384 path_put(&nd.path);
3372 putname(name); 3385 putname(name);
3386 if (retry_estale(error, lookup_flags)) {
3387 lookup_flags |= LOOKUP_REVAL;
3388 goto retry;
3389 }
3373 return error; 3390 return error;
3374} 3391}
3375 3392
@@ -3423,8 +3440,9 @@ static long do_unlinkat(int dfd, const char __user *pathname)
3423 struct dentry *dentry; 3440 struct dentry *dentry;
3424 struct nameidata nd; 3441 struct nameidata nd;
3425 struct inode *inode = NULL; 3442 struct inode *inode = NULL;
3426 3443 unsigned int lookup_flags = 0;
3427 name = user_path_parent(dfd, pathname, &nd); 3444retry:
3445 name = user_path_parent(dfd, pathname, &nd, lookup_flags);
3428 if (IS_ERR(name)) 3446 if (IS_ERR(name))
3429 return PTR_ERR(name); 3447 return PTR_ERR(name);
3430 3448
@@ -3462,6 +3480,11 @@ exit2:
3462exit1: 3480exit1:
3463 path_put(&nd.path); 3481 path_put(&nd.path);
3464 putname(name); 3482 putname(name);
3483 if (retry_estale(error, lookup_flags)) {
3484 lookup_flags |= LOOKUP_REVAL;
3485 inode = NULL;
3486 goto retry;
3487 }
3465 return error; 3488 return error;
3466 3489
3467slashes: 3490slashes:
@@ -3513,12 +3536,13 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3513 struct filename *from; 3536 struct filename *from;
3514 struct dentry *dentry; 3537 struct dentry *dentry;
3515 struct path path; 3538 struct path path;
3539 unsigned int lookup_flags = 0;
3516 3540
3517 from = getname(oldname); 3541 from = getname(oldname);
3518 if (IS_ERR(from)) 3542 if (IS_ERR(from))
3519 return PTR_ERR(from); 3543 return PTR_ERR(from);
3520 3544retry:
3521 dentry = user_path_create(newdfd, newname, &path, 0); 3545 dentry = user_path_create(newdfd, newname, &path, lookup_flags);
3522 error = PTR_ERR(dentry); 3546 error = PTR_ERR(dentry);
3523 if (IS_ERR(dentry)) 3547 if (IS_ERR(dentry))
3524 goto out_putname; 3548 goto out_putname;
@@ -3527,6 +3551,10 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
3527 if (!error) 3551 if (!error)
3528 error = vfs_symlink(path.dentry->d_inode, dentry, from->name); 3552 error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
3529 done_path_create(&path, dentry); 3553 done_path_create(&path, dentry);
3554 if (retry_estale(error, lookup_flags)) {
3555 lookup_flags |= LOOKUP_REVAL;
3556 goto retry;
3557 }
3530out_putname: 3558out_putname:
3531 putname(from); 3559 putname(from);
3532 return error; 3560 return error;
@@ -3613,12 +3641,13 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3613 3641
3614 if (flags & AT_SYMLINK_FOLLOW) 3642 if (flags & AT_SYMLINK_FOLLOW)
3615 how |= LOOKUP_FOLLOW; 3643 how |= LOOKUP_FOLLOW;
3616 3644retry:
3617 error = user_path_at(olddfd, oldname, how, &old_path); 3645 error = user_path_at(olddfd, oldname, how, &old_path);
3618 if (error) 3646 if (error)
3619 return error; 3647 return error;
3620 3648
3621 new_dentry = user_path_create(newdfd, newname, &new_path, 0); 3649 new_dentry = user_path_create(newdfd, newname, &new_path,
3650 (how & LOOKUP_REVAL));
3622 error = PTR_ERR(new_dentry); 3651 error = PTR_ERR(new_dentry);
3623 if (IS_ERR(new_dentry)) 3652 if (IS_ERR(new_dentry))
3624 goto out; 3653 goto out;
@@ -3635,6 +3664,10 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3635 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry); 3664 error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
3636out_dput: 3665out_dput:
3637 done_path_create(&new_path, new_dentry); 3666 done_path_create(&new_path, new_dentry);
3667 if (retry_estale(error, how)) {
3668 how |= LOOKUP_REVAL;
3669 goto retry;
3670 }
3638out: 3671out:
3639 path_put(&old_path); 3672 path_put(&old_path);
3640 3673
@@ -3807,15 +3840,17 @@ SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
3807 struct nameidata oldnd, newnd; 3840 struct nameidata oldnd, newnd;
3808 struct filename *from; 3841 struct filename *from;
3809 struct filename *to; 3842 struct filename *to;
3843 unsigned int lookup_flags = 0;
3844 bool should_retry = false;
3810 int error; 3845 int error;
3811 3846retry:
3812 from = user_path_parent(olddfd, oldname, &oldnd); 3847 from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
3813 if (IS_ERR(from)) { 3848 if (IS_ERR(from)) {
3814 error = PTR_ERR(from); 3849 error = PTR_ERR(from);
3815 goto exit; 3850 goto exit;
3816 } 3851 }
3817 3852
3818 to = user_path_parent(newdfd, newname, &newnd); 3853 to = user_path_parent(newdfd, newname, &newnd, lookup_flags);
3819 if (IS_ERR(to)) { 3854 if (IS_ERR(to)) {
3820 error = PTR_ERR(to); 3855 error = PTR_ERR(to);
3821 goto exit1; 3856 goto exit1;
@@ -3887,11 +3922,18 @@ exit3:
3887 unlock_rename(new_dir, old_dir); 3922 unlock_rename(new_dir, old_dir);
3888 mnt_drop_write(oldnd.path.mnt); 3923 mnt_drop_write(oldnd.path.mnt);
3889exit2: 3924exit2:
3925 if (retry_estale(error, lookup_flags))
3926 should_retry = true;
3890 path_put(&newnd.path); 3927 path_put(&newnd.path);
3891 putname(to); 3928 putname(to);
3892exit1: 3929exit1:
3893 path_put(&oldnd.path); 3930 path_put(&oldnd.path);
3894 putname(from); 3931 putname(from);
3932 if (should_retry) {
3933 should_retry = false;
3934 lookup_flags |= LOOKUP_REVAL;
3935 goto retry;
3936 }
3895exit: 3937exit:
3896 return error; 3938 return error;
3897} 3939}
diff --git a/fs/namespace.c b/fs/namespace.c
index 398a50ff2438..55605c552787 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -313,7 +313,7 @@ int __mnt_want_write(struct vfsmount *m)
313 * incremented count after it has set MNT_WRITE_HOLD. 313 * incremented count after it has set MNT_WRITE_HOLD.
314 */ 314 */
315 smp_mb(); 315 smp_mb();
316 while (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) 316 while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
317 cpu_relax(); 317 cpu_relax();
318 /* 318 /*
319 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will 319 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d7e9fe77188a..1acdad7fcec7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -976,9 +976,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
976 goto out; 976 goto out;
977 977
978 if (attr->ia_size != i_size_read(inode)) { 978 if (attr->ia_size != i_size_read(inode)) {
979 result = vmtruncate(inode, attr->ia_size); 979 truncate_setsize(inode, attr->ia_size);
980 if (result)
981 goto out;
982 mark_inode_dirty(inode); 980 mark_inode_dirty(inode);
983 } 981 }
984 } 982 }
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index c817787fbdb4..24d1d1c5fcaf 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -307,6 +307,7 @@ void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
307 nfs_fscache_inode_unlock(inode); 307 nfs_fscache_inode_unlock(inode);
308 } 308 }
309} 309}
310EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
310 311
311/* 312/*
312 * Replace a per-inode cookie due to revalidation detecting a file having 313 * Replace a per-inode cookie due to revalidation detecting a file having
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index c5b11b53ff33..277b02782897 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -153,6 +153,22 @@ static inline void nfs_readpage_to_fscache(struct inode *inode,
153} 153}
154 154
155/* 155/*
156 * Invalidate the contents of fscache for this inode. This will not sleep.
157 */
158static inline void nfs_fscache_invalidate(struct inode *inode)
159{
160 fscache_invalidate(NFS_I(inode)->fscache);
161}
162
163/*
164 * Wait for an object to finish being invalidated.
165 */
166static inline void nfs_fscache_wait_on_invalidate(struct inode *inode)
167{
168 fscache_wait_on_invalidate(NFS_I(inode)->fscache);
169}
170
171/*
156 * indicate the client caching state as readable text 172 * indicate the client caching state as readable text
157 */ 173 */
158static inline const char *nfs_server_fscache_state(struct nfs_server *server) 174static inline const char *nfs_server_fscache_state(struct nfs_server *server)
@@ -162,7 +178,6 @@ static inline const char *nfs_server_fscache_state(struct nfs_server *server)
162 return "no "; 178 return "no ";
163} 179}
164 180
165
166#else /* CONFIG_NFS_FSCACHE */ 181#else /* CONFIG_NFS_FSCACHE */
167static inline int nfs_fscache_register(void) { return 0; } 182static inline int nfs_fscache_register(void) { return 0; }
168static inline void nfs_fscache_unregister(void) {} 183static inline void nfs_fscache_unregister(void) {}
@@ -205,6 +220,9 @@ static inline int nfs_readpages_from_fscache(struct nfs_open_context *ctx,
205static inline void nfs_readpage_to_fscache(struct inode *inode, 220static inline void nfs_readpage_to_fscache(struct inode *inode,
206 struct page *page, int sync) {} 221 struct page *page, int sync) {}
207 222
223
224static inline void nfs_fscache_invalidate(struct inode *inode) {}
225
208static inline const char *nfs_server_fscache_state(struct nfs_server *server) 226static inline const char *nfs_server_fscache_state(struct nfs_server *server)
209{ 227{
210 return "no "; 228 return "no ";
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2faae14d89f4..ebeb94ce1b0b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -161,10 +161,12 @@ static void nfs_zap_caches_locked(struct inode *inode)
161 nfsi->attrtimeo_timestamp = jiffies; 161 nfsi->attrtimeo_timestamp = jiffies;
162 162
163 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); 163 memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
164 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) 164 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 165 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
166 else 166 nfs_fscache_invalidate(inode);
167 } else {
167 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE; 168 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
169 }
168} 170}
169 171
170void nfs_zap_caches(struct inode *inode) 172void nfs_zap_caches(struct inode *inode)
@@ -179,6 +181,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
179 if (mapping->nrpages != 0) { 181 if (mapping->nrpages != 0) {
180 spin_lock(&inode->i_lock); 182 spin_lock(&inode->i_lock);
181 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA; 183 NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
184 nfs_fscache_invalidate(inode);
182 spin_unlock(&inode->i_lock); 185 spin_unlock(&inode->i_lock);
183 } 186 }
184} 187}
@@ -881,7 +884,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
881 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); 884 memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
882 spin_unlock(&inode->i_lock); 885 spin_unlock(&inode->i_lock);
883 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); 886 nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
884 nfs_fscache_reset_inode_cookie(inode); 887 nfs_fscache_wait_on_invalidate(inode);
885 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n", 888 dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
886 inode->i_sb->s_id, (long long)NFS_FILEID(inode)); 889 inode->i_sb->s_id, (long long)NFS_FILEID(inode));
887 return 0; 890 return 0;
@@ -957,6 +960,10 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
957 i_size_write(inode, nfs_size_to_loff_t(fattr->size)); 960 i_size_write(inode, nfs_size_to_loff_t(fattr->size));
958 ret |= NFS_INO_INVALID_ATTR; 961 ret |= NFS_INO_INVALID_ATTR;
959 } 962 }
963
964 if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
965 nfs_fscache_invalidate(inode);
966
960 return ret; 967 return ret;
961} 968}
962 969
@@ -1205,8 +1212,10 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr
1205 struct nfs_inode *nfsi = NFS_I(inode); 1212 struct nfs_inode *nfsi = NFS_I(inode);
1206 1213
1207 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; 1214 nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
1208 if (S_ISDIR(inode->i_mode)) 1215 if (S_ISDIR(inode->i_mode)) {
1209 nfsi->cache_validity |= NFS_INO_INVALID_DATA; 1216 nfsi->cache_validity |= NFS_INO_INVALID_DATA;
1217 nfs_fscache_invalidate(inode);
1218 }
1210 if ((fattr->valid & NFS_ATTR_FATTR) == 0) 1219 if ((fattr->valid & NFS_ATTR_FATTR) == 0)
1211 return 0; 1220 return 0;
1212 return nfs_refresh_inode_locked(inode, fattr); 1221 return nfs_refresh_inode_locked(inode, fattr);
@@ -1494,6 +1503,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
1494 (save_cache_validity & NFS_INO_REVAL_FORCED)) 1503 (save_cache_validity & NFS_INO_REVAL_FORCED))
1495 nfsi->cache_validity |= invalid; 1504 nfsi->cache_validity |= invalid;
1496 1505
1506 if (invalid & NFS_INO_INVALID_DATA)
1507 nfs_fscache_invalidate(inode);
1508
1497 return 0; 1509 return 0;
1498 out_err: 1510 out_err:
1499 /* 1511 /*
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e7699308364a..08ddcccb8887 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -5,6 +5,7 @@
5 */ 5 */
6#include <linux/nfs_fs.h> 6#include <linux/nfs_fs.h>
7#include "internal.h" 7#include "internal.h"
8#include "fscache.h"
8#include "pnfs.h" 9#include "pnfs.h"
9 10
10#define NFSDBG_FACILITY NFSDBG_FILE 11#define NFSDBG_FACILITY NFSDBG_FILE
@@ -74,6 +75,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
74 75
75 nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); 76 nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
76 nfs_file_set_open_context(filp, ctx); 77 nfs_file_set_open_context(filp, ctx);
78 nfs_fscache_set_inode_cookie(inode, filp);
77 err = 0; 79 err = 0;
78 80
79out_put_ctx: 81out_put_ctx:
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 493f0f41c554..5d864fb36578 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -64,7 +64,7 @@
64#include "pnfs.h" 64#include "pnfs.h"
65#include "netns.h" 65#include "netns.h"
66#include "nfs4session.h" 66#include "nfs4session.h"
67 67#include "fscache.h"
68 68
69#define NFSDBG_FACILITY NFSDBG_PROC 69#define NFSDBG_FACILITY NFSDBG_PROC
70 70
@@ -734,6 +734,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
734 if (!cinfo->atomic || cinfo->before != dir->i_version) 734 if (!cinfo->atomic || cinfo->before != dir->i_version)
735 nfs_force_lookup_revalidate(dir); 735 nfs_force_lookup_revalidate(dir);
736 dir->i_version = cinfo->after; 736 dir->i_version = cinfo->after;
737 nfs_fscache_invalidate(dir);
737 spin_unlock(&dir->i_lock); 738 spin_unlock(&dir->i_lock);
738} 739}
739 740
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5209916e1222..b673be31590e 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1794,7 +1794,8 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1794 if (PagePrivate(page)) 1794 if (PagePrivate(page))
1795 return -EBUSY; 1795 return -EBUSY;
1796 1796
1797 nfs_fscache_release_page(page, GFP_KERNEL); 1797 if (!nfs_fscache_release_page(page, GFP_KERNEL))
1798 return -EBUSY;
1798 1799
1799 return migrate_page(mapping, newpage, page, mode); 1800 return migrate_page(mapping, newpage, page, mode);
1800} 1801}
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index e6c38159622f..e761ee95617f 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -8,61 +8,144 @@
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/nsproxy.h>
12#include <linux/sunrpc/clnt.h>
13#include <asm/uaccess.h>
11 14
12#include "state.h" 15#include "state.h"
13#include "fault_inject.h" 16#include "netns.h"
14 17
15struct nfsd_fault_inject_op { 18struct nfsd_fault_inject_op {
16 char *file; 19 char *file;
17 void (*func)(u64); 20 u64 (*forget)(struct nfs4_client *, u64);
21 u64 (*print)(struct nfs4_client *, u64);
18}; 22};
19 23
20static struct nfsd_fault_inject_op inject_ops[] = { 24static struct nfsd_fault_inject_op inject_ops[] = {
21 { 25 {
22 .file = "forget_clients", 26 .file = "forget_clients",
23 .func = nfsd_forget_clients, 27 .forget = nfsd_forget_client,
28 .print = nfsd_print_client,
24 }, 29 },
25 { 30 {
26 .file = "forget_locks", 31 .file = "forget_locks",
27 .func = nfsd_forget_locks, 32 .forget = nfsd_forget_client_locks,
33 .print = nfsd_print_client_locks,
28 }, 34 },
29 { 35 {
30 .file = "forget_openowners", 36 .file = "forget_openowners",
31 .func = nfsd_forget_openowners, 37 .forget = nfsd_forget_client_openowners,
38 .print = nfsd_print_client_openowners,
32 }, 39 },
33 { 40 {
34 .file = "forget_delegations", 41 .file = "forget_delegations",
35 .func = nfsd_forget_delegations, 42 .forget = nfsd_forget_client_delegations,
43 .print = nfsd_print_client_delegations,
36 }, 44 },
37 { 45 {
38 .file = "recall_delegations", 46 .file = "recall_delegations",
39 .func = nfsd_recall_delegations, 47 .forget = nfsd_recall_client_delegations,
48 .print = nfsd_print_client_delegations,
40 }, 49 },
41}; 50};
42 51
43static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op); 52static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
44static struct dentry *debug_dir; 53static struct dentry *debug_dir;
45 54
46static int nfsd_inject_set(void *op_ptr, u64 val) 55static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)
47{ 56{
48 struct nfsd_fault_inject_op *op = op_ptr; 57 u64 count = 0;
49 58
50 if (val == 0) 59 if (val == 0)
51 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file); 60 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
52 else 61 else
53 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val); 62 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
54 63
55 op->func(val); 64 nfs4_lock_state();
56 return 0; 65 count = nfsd_for_n_state(val, op->forget);
66 nfs4_unlock_state();
67 printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);
57} 68}
58 69
59static int nfsd_inject_get(void *data, u64 *val) 70static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op,
71 struct sockaddr_storage *addr,
72 size_t addr_size)
60{ 73{
61 *val = 0; 74 char buf[INET6_ADDRSTRLEN];
62 return 0; 75 struct nfs4_client *clp;
76 u64 count;
77
78 nfs4_lock_state();
79 clp = nfsd_find_client(addr, addr_size);
80 if (clp) {
81 count = op->forget(clp, 0);
82 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
83 printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count);
84 }
85 nfs4_unlock_state();
86}
87
88static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val)
89{
90 nfs4_lock_state();
91 *val = nfsd_for_n_state(0, op->print);
92 nfs4_unlock_state();
63} 93}
64 94
65DEFINE_SIMPLE_ATTRIBUTE(fops_nfsd, nfsd_inject_get, nfsd_inject_set, "%llu\n"); 95static ssize_t fault_inject_read(struct file *file, char __user *buf,
96 size_t len, loff_t *ppos)
97{
98 static u64 val;
99 char read_buf[25];
100 size_t size, ret;
101 loff_t pos = *ppos;
102
103 if (!pos)
104 nfsd_inject_get(file->f_dentry->d_inode->i_private, &val);
105 size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
106
107 if (pos < 0)
108 return -EINVAL;
109 if (pos >= size || !len)
110 return 0;
111 if (len > size - pos)
112 len = size - pos;
113 ret = copy_to_user(buf, read_buf + pos, len);
114 if (ret == len)
115 return -EFAULT;
116 len -= ret;
117 *ppos = pos + len;
118 return len;
119}
120
121static ssize_t fault_inject_write(struct file *file, const char __user *buf,
122 size_t len, loff_t *ppos)
123{
124 char write_buf[INET6_ADDRSTRLEN];
125 size_t size = min(sizeof(write_buf) - 1, len);
126 struct net *net = current->nsproxy->net_ns;
127 struct sockaddr_storage sa;
128 u64 val;
129
130 if (copy_from_user(write_buf, buf, size))
131 return -EFAULT;
132 write_buf[size] = '\0';
133
134 size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
135 if (size > 0)
136 nfsd_inject_set_client(file->f_dentry->d_inode->i_private, &sa, size);
137 else {
138 val = simple_strtoll(write_buf, NULL, 0);
139 nfsd_inject_set(file->f_dentry->d_inode->i_private, val);
140 }
141 return len; /* on success, claim we got the whole input */
142}
143
144static const struct file_operations fops_nfsd = {
145 .owner = THIS_MODULE,
146 .read = fault_inject_read,
147 .write = fault_inject_write,
148};
66 149
67void nfsd_fault_inject_cleanup(void) 150void nfsd_fault_inject_cleanup(void)
68{ 151{
diff --git a/fs/nfsd/fault_inject.h b/fs/nfsd/fault_inject.h
deleted file mode 100644
index 90bd0570956c..000000000000
--- a/fs/nfsd/fault_inject.h
+++ /dev/null
@@ -1,28 +0,0 @@
1/*
2 * Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
3 *
4 * Function definitions for fault injection
5 */
6
7#ifndef LINUX_NFSD_FAULT_INJECT_H
8#define LINUX_NFSD_FAULT_INJECT_H
9
10#ifdef CONFIG_NFSD_FAULT_INJECTION
11int nfsd_fault_inject_init(void);
12void nfsd_fault_inject_cleanup(void);
13void nfsd_forget_clients(u64);
14void nfsd_forget_locks(u64);
15void nfsd_forget_openowners(u64);
16void nfsd_forget_delegations(u64);
17void nfsd_recall_delegations(u64);
18#else /* CONFIG_NFSD_FAULT_INJECTION */
19static inline int nfsd_fault_inject_init(void) { return 0; }
20static inline void nfsd_fault_inject_cleanup(void) {}
21static inline void nfsd_forget_clients(u64 num) {}
22static inline void nfsd_forget_locks(u64 num) {}
23static inline void nfsd_forget_openowners(u64 num) {}
24static inline void nfsd_forget_delegations(u64 num) {}
25static inline void nfsd_recall_delegations(u64 num) {}
26#endif /* CONFIG_NFSD_FAULT_INJECTION */
27
28#endif /* LINUX_NFSD_FAULT_INJECT_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 65c2431ea32f..1051bebff1b0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -24,7 +24,18 @@
24#include <net/net_namespace.h> 24#include <net/net_namespace.h>
25#include <net/netns/generic.h> 25#include <net/netns/generic.h>
26 26
27/* Hash tables for nfs4_clientid state */
28#define CLIENT_HASH_BITS 4
29#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
30#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
31
32#define LOCKOWNER_INO_HASH_BITS 8
33#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
34
35#define SESSION_HASH_SIZE 512
36
27struct cld_net; 37struct cld_net;
38struct nfsd4_client_tracking_ops;
28 39
29struct nfsd_net { 40struct nfsd_net {
30 struct cld_net *cld_net; 41 struct cld_net *cld_net;
@@ -38,7 +49,62 @@ struct nfsd_net {
38 struct lock_manager nfsd4_manager; 49 struct lock_manager nfsd4_manager;
39 bool grace_ended; 50 bool grace_ended;
40 time_t boot_time; 51 time_t boot_time;
52
53 /*
54 * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
55 * used in reboot/reset lease grace period processing
56 *
57 * conf_id_hashtbl[], and conf_name_tree hold confirmed
58 * setclientid_confirmed info.
59 *
60 * unconf_str_hastbl[] and unconf_name_tree hold unconfirmed
61 * setclientid info.
62 */
63 struct list_head *reclaim_str_hashtbl;
64 int reclaim_str_hashtbl_size;
65 struct list_head *conf_id_hashtbl;
66 struct rb_root conf_name_tree;
67 struct list_head *unconf_id_hashtbl;
68 struct rb_root unconf_name_tree;
69 struct list_head *ownerstr_hashtbl;
70 struct list_head *lockowner_ino_hashtbl;
71 struct list_head *sessionid_hashtbl;
72 /*
73 * client_lru holds client queue ordered by nfs4_client.cl_time
74 * for lease renewal.
75 *
76 * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
77 * for last close replay.
78 *
79 * All of the above fields are protected by the client_mutex.
80 */
81 struct list_head client_lru;
82 struct list_head close_lru;
83
84 struct delayed_work laundromat_work;
85
86 /* client_lock protects the client lru list and session hash table */
87 spinlock_t client_lock;
88
89 struct file *rec_file;
90 bool in_grace;
91 struct nfsd4_client_tracking_ops *client_tracking_ops;
92
93 time_t nfsd4_lease;
94 time_t nfsd4_grace;
95
96 bool nfsd_net_up;
97
98 /*
99 * Time of server startup
100 */
101 struct timeval nfssvc_boot;
102
103 struct svc_serv *nfsd_serv;
41}; 104};
42 105
106/* Simple check to find out if a given net was properly initialized */
107#define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl)
108
43extern int nfsd_net_id; 109extern int nfsd_net_id;
44#endif /* __NFSD_NETNS_H__ */ 110#endif /* __NFSD_NETNS_H__ */
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index b314888825d5..9170861c804a 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -253,7 +253,7 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
253 (resp->mask & NFS_ACL) ? resp->acl_access : NULL, 253 (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
254 (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); 254 (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
255 while (w > 0) { 255 while (w > 0) {
256 if (!rqstp->rq_respages[rqstp->rq_resused++]) 256 if (!*(rqstp->rq_next_page++))
257 return 0; 257 return 0;
258 w -= PAGE_SIZE; 258 w -= PAGE_SIZE;
259 } 259 }
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index a596e9d987e4..9cbc1a841f87 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -184,7 +184,7 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
184 (resp->mask & NFS_ACL) ? resp->acl_access : NULL, 184 (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
185 (resp->mask & NFS_DFACL) ? resp->acl_default : NULL); 185 (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
186 while (w > 0) { 186 while (w > 0) {
187 if (!rqstp->rq_respages[rqstp->rq_resused++]) 187 if (!*(rqstp->rq_next_page++))
188 return 0; 188 return 0;
189 w -= PAGE_SIZE; 189 w -= PAGE_SIZE;
190 } 190 }
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 97d90d1c8608..1fc02dfdc5c4 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -460,7 +460,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
460 __be32 nfserr; 460 __be32 nfserr;
461 int count = 0; 461 int count = 0;
462 loff_t offset; 462 loff_t offset;
463 int i; 463 struct page **p;
464 caddr_t page_addr = NULL; 464 caddr_t page_addr = NULL;
465 465
466 dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", 466 dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
@@ -484,8 +484,8 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
484 &resp->common, 484 &resp->common,
485 nfs3svc_encode_entry_plus); 485 nfs3svc_encode_entry_plus);
486 memcpy(resp->verf, argp->verf, 8); 486 memcpy(resp->verf, argp->verf, 8);
487 for (i=1; i<rqstp->rq_resused ; i++) { 487 for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
488 page_addr = page_address(rqstp->rq_respages[i]); 488 page_addr = page_address(*p);
489 489
490 if (((caddr_t)resp->buffer >= page_addr) && 490 if (((caddr_t)resp->buffer >= page_addr) &&
491 ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) { 491 ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 43f46cd9edea..324c0baf7cda 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -7,8 +7,10 @@
7 */ 7 */
8 8
9#include <linux/namei.h> 9#include <linux/namei.h>
10#include <linux/sunrpc/svc_xprt.h>
10#include "xdr3.h" 11#include "xdr3.h"
11#include "auth.h" 12#include "auth.h"
13#include "netns.h"
12 14
13#define NFSDDBG_FACILITY NFSDDBG_XDR 15#define NFSDDBG_FACILITY NFSDDBG_XDR
14 16
@@ -323,7 +325,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
323 struct nfsd3_readargs *args) 325 struct nfsd3_readargs *args)
324{ 326{
325 unsigned int len; 327 unsigned int len;
326 int v,pn; 328 int v;
327 u32 max_blocksize = svc_max_payload(rqstp); 329 u32 max_blocksize = svc_max_payload(rqstp);
328 330
329 if (!(p = decode_fh(p, &args->fh))) 331 if (!(p = decode_fh(p, &args->fh)))
@@ -338,8 +340,9 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
338 /* set up the kvec */ 340 /* set up the kvec */
339 v=0; 341 v=0;
340 while (len > 0) { 342 while (len > 0) {
341 pn = rqstp->rq_resused++; 343 struct page *p = *(rqstp->rq_next_page++);
342 rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); 344
345 rqstp->rq_vec[v].iov_base = page_address(p);
343 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; 346 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE;
344 len -= rqstp->rq_vec[v].iov_len; 347 len -= rqstp->rq_vec[v].iov_len;
345 v++; 348 v++;
@@ -461,8 +464,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
461 len = ntohl(*p++); 464 len = ntohl(*p++);
462 if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE) 465 if (len == 0 || len > NFS3_MAXPATHLEN || len >= PAGE_SIZE)
463 return 0; 466 return 0;
464 args->tname = new = 467 args->tname = new = page_address(*(rqstp->rq_next_page++));
465 page_address(rqstp->rq_respages[rqstp->rq_resused++]);
466 args->tlen = len; 468 args->tlen = len;
467 /* first copy and check from the first page */ 469 /* first copy and check from the first page */
468 old = (char*)p; 470 old = (char*)p;
@@ -533,8 +535,7 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
533{ 535{
534 if (!(p = decode_fh(p, &args->fh))) 536 if (!(p = decode_fh(p, &args->fh)))
535 return 0; 537 return 0;
536 args->buffer = 538 args->buffer = page_address(*(rqstp->rq_next_page++));
537 page_address(rqstp->rq_respages[rqstp->rq_resused++]);
538 539
539 return xdr_argsize_check(rqstp, p); 540 return xdr_argsize_check(rqstp, p);
540} 541}
@@ -565,8 +566,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
565 if (args->count > PAGE_SIZE) 566 if (args->count > PAGE_SIZE)
566 args->count = PAGE_SIZE; 567 args->count = PAGE_SIZE;
567 568
568 args->buffer = 569 args->buffer = page_address(*(rqstp->rq_next_page++));
569 page_address(rqstp->rq_respages[rqstp->rq_resused++]);
570 570
571 return xdr_argsize_check(rqstp, p); 571 return xdr_argsize_check(rqstp, p);
572} 572}
@@ -575,7 +575,7 @@ int
575nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, 575nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
576 struct nfsd3_readdirargs *args) 576 struct nfsd3_readdirargs *args)
577{ 577{
578 int len, pn; 578 int len;
579 u32 max_blocksize = svc_max_payload(rqstp); 579 u32 max_blocksize = svc_max_payload(rqstp);
580 580
581 if (!(p = decode_fh(p, &args->fh))) 581 if (!(p = decode_fh(p, &args->fh)))
@@ -590,9 +590,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
590 args->count = len; 590 args->count = len;
591 591
592 while (len > 0) { 592 while (len > 0) {
593 pn = rqstp->rq_resused++; 593 struct page *p = *(rqstp->rq_next_page++);
594 if (!args->buffer) 594 if (!args->buffer)
595 args->buffer = page_address(rqstp->rq_respages[pn]); 595 args->buffer = page_address(p);
596 len -= PAGE_SIZE; 596 len -= PAGE_SIZE;
597 } 597 }
598 598
@@ -720,12 +720,14 @@ int
720nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p, 720nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p,
721 struct nfsd3_writeres *resp) 721 struct nfsd3_writeres *resp)
722{ 722{
723 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
724
723 p = encode_wcc_data(rqstp, p, &resp->fh); 725 p = encode_wcc_data(rqstp, p, &resp->fh);
724 if (resp->status == 0) { 726 if (resp->status == 0) {
725 *p++ = htonl(resp->count); 727 *p++ = htonl(resp->count);
726 *p++ = htonl(resp->committed); 728 *p++ = htonl(resp->committed);
727 *p++ = htonl(nfssvc_boot.tv_sec); 729 *p++ = htonl(nn->nfssvc_boot.tv_sec);
728 *p++ = htonl(nfssvc_boot.tv_usec); 730 *p++ = htonl(nn->nfssvc_boot.tv_usec);
729 } 731 }
730 return xdr_ressize_check(rqstp, p); 732 return xdr_ressize_check(rqstp, p);
731} 733}
@@ -876,7 +878,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
876 common); 878 common);
877 __be32 *p = cd->buffer; 879 __be32 *p = cd->buffer;
878 caddr_t curr_page_addr = NULL; 880 caddr_t curr_page_addr = NULL;
879 int pn; /* current page number */ 881 struct page ** page;
880 int slen; /* string (name) length */ 882 int slen; /* string (name) length */
881 int elen; /* estimated entry length in words */ 883 int elen; /* estimated entry length in words */
882 int num_entry_words = 0; /* actual number of words */ 884 int num_entry_words = 0; /* actual number of words */
@@ -913,8 +915,9 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
913 } 915 }
914 916
915 /* determine which page in rq_respages[] we are currently filling */ 917 /* determine which page in rq_respages[] we are currently filling */
916 for (pn=1; pn < cd->rqstp->rq_resused; pn++) { 918 for (page = cd->rqstp->rq_respages + 1;
917 curr_page_addr = page_address(cd->rqstp->rq_respages[pn]); 919 page < cd->rqstp->rq_next_page; page++) {
920 curr_page_addr = page_address(*page);
918 921
919 if (((caddr_t)cd->buffer >= curr_page_addr) && 922 if (((caddr_t)cd->buffer >= curr_page_addr) &&
920 ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE)) 923 ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE))
@@ -929,14 +932,14 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
929 if (plus) 932 if (plus)
930 p = encode_entryplus_baggage(cd, p, name, namlen); 933 p = encode_entryplus_baggage(cd, p, name, namlen);
931 num_entry_words = p - cd->buffer; 934 num_entry_words = p - cd->buffer;
932 } else if (cd->rqstp->rq_respages[pn+1] != NULL) { 935 } else if (*(page+1) != NULL) {
933 /* temporarily encode entry into next page, then move back to 936 /* temporarily encode entry into next page, then move back to
934 * current and next page in rq_respages[] */ 937 * current and next page in rq_respages[] */
935 __be32 *p1, *tmp; 938 __be32 *p1, *tmp;
936 int len1, len2; 939 int len1, len2;
937 940
938 /* grab next page for temporary storage of entry */ 941 /* grab next page for temporary storage of entry */
939 p1 = tmp = page_address(cd->rqstp->rq_respages[pn+1]); 942 p1 = tmp = page_address(*(page+1));
940 943
941 p1 = encode_entry_baggage(cd, p1, name, namlen, ino); 944 p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
942 945
@@ -1082,11 +1085,13 @@ int
1082nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p, 1085nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p,
1083 struct nfsd3_commitres *resp) 1086 struct nfsd3_commitres *resp)
1084{ 1087{
1088 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
1089
1085 p = encode_wcc_data(rqstp, p, &resp->fh); 1090 p = encode_wcc_data(rqstp, p, &resp->fh);
1086 /* Write verifier */ 1091 /* Write verifier */
1087 if (resp->status == 0) { 1092 if (resp->status == 0) {
1088 *p++ = htonl(nfssvc_boot.tv_sec); 1093 *p++ = htonl(nn->nfssvc_boot.tv_sec);
1089 *p++ = htonl(nfssvc_boot.tv_usec); 1094 *p++ = htonl(nn->nfssvc_boot.tv_usec);
1090 } 1095 }
1091 return xdr_ressize_check(rqstp, p); 1096 return xdr_ressize_check(rqstp, p);
1092} 1097}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index bdf29c96e4cd..99bc85ff0217 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -36,6 +36,7 @@
36#include <linux/slab.h> 36#include <linux/slab.h>
37#include "nfsd.h" 37#include "nfsd.h"
38#include "state.h" 38#include "state.h"
39#include "netns.h"
39 40
40#define NFSDDBG_FACILITY NFSDDBG_PROC 41#define NFSDDBG_FACILITY NFSDDBG_PROC
41 42
@@ -625,20 +626,46 @@ static const struct rpc_program cb_program = {
625 .pipe_dir_name = "nfsd4_cb", 626 .pipe_dir_name = "nfsd4_cb",
626}; 627};
627 628
628static int max_cb_time(void) 629static int max_cb_time(struct net *net)
629{ 630{
630 return max(nfsd4_lease/10, (time_t)1) * HZ; 631 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
632 return max(nn->nfsd4_lease/10, (time_t)1) * HZ;
631} 633}
632 634
635static struct rpc_cred *callback_cred;
636
637int set_callback_cred(void)
638{
639 if (callback_cred)
640 return 0;
641 callback_cred = rpc_lookup_machine_cred("nfs");
642 if (!callback_cred)
643 return -ENOMEM;
644 return 0;
645}
646
647static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc_clnt *client, struct nfsd4_session *ses)
648{
649 if (clp->cl_minorversion == 0) {
650 return get_rpccred(callback_cred);
651 } else {
652 struct rpc_auth *auth = client->cl_auth;
653 struct auth_cred acred = {};
654
655 acred.uid = ses->se_cb_sec.uid;
656 acred.gid = ses->se_cb_sec.gid;
657 return auth->au_ops->lookup_cred(client->cl_auth, &acred, 0);
658 }
659}
633 660
634static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses) 661static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
635{ 662{
636 struct rpc_timeout timeparms = { 663 struct rpc_timeout timeparms = {
637 .to_initval = max_cb_time(), 664 .to_initval = max_cb_time(clp->net),
638 .to_retries = 0, 665 .to_retries = 0,
639 }; 666 };
640 struct rpc_create_args args = { 667 struct rpc_create_args args = {
641 .net = &init_net, 668 .net = clp->net,
642 .address = (struct sockaddr *) &conn->cb_addr, 669 .address = (struct sockaddr *) &conn->cb_addr,
643 .addrsize = conn->cb_addrlen, 670 .addrsize = conn->cb_addrlen,
644 .saddress = (struct sockaddr *) &conn->cb_saddr, 671 .saddress = (struct sockaddr *) &conn->cb_saddr,
@@ -648,6 +675,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
648 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), 675 .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
649 }; 676 };
650 struct rpc_clnt *client; 677 struct rpc_clnt *client;
678 struct rpc_cred *cred;
651 679
652 if (clp->cl_minorversion == 0) { 680 if (clp->cl_minorversion == 0) {
653 if (!clp->cl_cred.cr_principal && 681 if (!clp->cl_cred.cr_principal &&
@@ -666,7 +694,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
666 args.bc_xprt = conn->cb_xprt; 694 args.bc_xprt = conn->cb_xprt;
667 args.prognumber = clp->cl_cb_session->se_cb_prog; 695 args.prognumber = clp->cl_cb_session->se_cb_prog;
668 args.protocol = XPRT_TRANSPORT_BC_TCP; 696 args.protocol = XPRT_TRANSPORT_BC_TCP;
669 args.authflavor = RPC_AUTH_UNIX; 697 args.authflavor = ses->se_cb_sec.flavor;
670 } 698 }
671 /* Create RPC client */ 699 /* Create RPC client */
672 client = rpc_create(&args); 700 client = rpc_create(&args);
@@ -675,9 +703,14 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
675 PTR_ERR(client)); 703 PTR_ERR(client));
676 return PTR_ERR(client); 704 return PTR_ERR(client);
677 } 705 }
706 cred = get_backchannel_cred(clp, client, ses);
707 if (IS_ERR(cred)) {
708 rpc_shutdown_client(client);
709 return PTR_ERR(cred);
710 }
678 clp->cl_cb_client = client; 711 clp->cl_cb_client = client;
712 clp->cl_cb_cred = cred;
679 return 0; 713 return 0;
680
681} 714}
682 715
683static void warn_no_callback_path(struct nfs4_client *clp, int reason) 716static void warn_no_callback_path(struct nfs4_client *clp, int reason)
@@ -714,18 +747,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
714 .rpc_call_done = nfsd4_cb_probe_done, 747 .rpc_call_done = nfsd4_cb_probe_done,
715}; 748};
716 749
717static struct rpc_cred *callback_cred;
718
719int set_callback_cred(void)
720{
721 if (callback_cred)
722 return 0;
723 callback_cred = rpc_lookup_machine_cred("nfs");
724 if (!callback_cred)
725 return -ENOMEM;
726 return 0;
727}
728
729static struct workqueue_struct *callback_wq; 750static struct workqueue_struct *callback_wq;
730 751
731static void run_nfsd4_cb(struct nfsd4_callback *cb) 752static void run_nfsd4_cb(struct nfsd4_callback *cb)
@@ -743,7 +764,6 @@ static void do_probe_callback(struct nfs4_client *clp)
743 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL]; 764 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
744 cb->cb_msg.rpc_argp = NULL; 765 cb->cb_msg.rpc_argp = NULL;
745 cb->cb_msg.rpc_resp = NULL; 766 cb->cb_msg.rpc_resp = NULL;
746 cb->cb_msg.rpc_cred = callback_cred;
747 767
748 cb->cb_ops = &nfsd4_cb_probe_ops; 768 cb->cb_ops = &nfsd4_cb_probe_ops;
749 769
@@ -962,6 +982,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
962 if (clp->cl_cb_client) { 982 if (clp->cl_cb_client) {
963 rpc_shutdown_client(clp->cl_cb_client); 983 rpc_shutdown_client(clp->cl_cb_client);
964 clp->cl_cb_client = NULL; 984 clp->cl_cb_client = NULL;
985 put_rpccred(clp->cl_cb_cred);
986 clp->cl_cb_cred = NULL;
965 } 987 }
966 if (clp->cl_cb_conn.cb_xprt) { 988 if (clp->cl_cb_conn.cb_xprt) {
967 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 989 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -995,7 +1017,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
995 run_nfsd4_cb(cb); 1017 run_nfsd4_cb(cb);
996} 1018}
997 1019
998void nfsd4_do_callback_rpc(struct work_struct *w) 1020static void nfsd4_do_callback_rpc(struct work_struct *w)
999{ 1021{
1000 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); 1022 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work);
1001 struct nfs4_client *clp = cb->cb_clp; 1023 struct nfs4_client *clp = cb->cb_clp;
@@ -1010,10 +1032,16 @@ void nfsd4_do_callback_rpc(struct work_struct *w)
1010 nfsd4_release_cb(cb); 1032 nfsd4_release_cb(cb);
1011 return; 1033 return;
1012 } 1034 }
1035 cb->cb_msg.rpc_cred = clp->cl_cb_cred;
1013 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 1036 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
1014 cb->cb_ops, cb); 1037 cb->cb_ops, cb);
1015} 1038}
1016 1039
1040void nfsd4_init_callback(struct nfsd4_callback *cb)
1041{
1042 INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
1043}
1044
1017void nfsd4_cb_recall(struct nfs4_delegation *dp) 1045void nfsd4_cb_recall(struct nfs4_delegation *dp)
1018{ 1046{
1019 struct nfsd4_callback *cb = &dp->dl_recall; 1047 struct nfsd4_callback *cb = &dp->dl_recall;
@@ -1025,7 +1053,6 @@ void nfsd4_cb_recall(struct nfs4_delegation *dp)
1025 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 1053 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL];
1026 cb->cb_msg.rpc_argp = cb; 1054 cb->cb_msg.rpc_argp = cb;
1027 cb->cb_msg.rpc_resp = cb; 1055 cb->cb_msg.rpc_resp = cb;
1028 cb->cb_msg.rpc_cred = callback_cred;
1029 1056
1030 cb->cb_ops = &nfsd4_cb_recall_ops; 1057 cb->cb_ops = &nfsd4_cb_recall_ops;
1031 1058
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6c9a4b291dba..9d1c5dba2bbb 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -40,6 +40,7 @@
40#include "xdr4.h" 40#include "xdr4.h"
41#include "vfs.h" 41#include "vfs.h"
42#include "current_stateid.h" 42#include "current_stateid.h"
43#include "netns.h"
43 44
44#define NFSDDBG_FACILITY NFSDDBG_PROC 45#define NFSDDBG_FACILITY NFSDDBG_PROC
45 46
@@ -194,6 +195,7 @@ static __be32
194do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 195do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
195{ 196{
196 struct svc_fh *resfh; 197 struct svc_fh *resfh;
198 int accmode;
197 __be32 status; 199 __be32 status;
198 200
199 resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); 201 resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
@@ -253,9 +255,10 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
253 /* set reply cache */ 255 /* set reply cache */
254 fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh, 256 fh_copy_shallow(&open->op_openowner->oo_owner.so_replay.rp_openfh,
255 &resfh->fh_handle); 257 &resfh->fh_handle);
256 if (!open->op_created) 258 accmode = NFSD_MAY_NOP;
257 status = do_open_permission(rqstp, resfh, open, 259 if (open->op_created)
258 NFSD_MAY_NOP); 260 accmode |= NFSD_MAY_OWNER_OVERRIDE;
261 status = do_open_permission(rqstp, resfh, open, accmode);
259 set_change_info(&open->op_cinfo, current_fh); 262 set_change_info(&open->op_cinfo, current_fh);
260 fh_dup2(current_fh, resfh); 263 fh_dup2(current_fh, resfh);
261out: 264out:
@@ -304,6 +307,8 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
304{ 307{
305 __be32 status; 308 __be32 status;
306 struct nfsd4_compoundres *resp; 309 struct nfsd4_compoundres *resp;
310 struct net *net = SVC_NET(rqstp);
311 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
307 312
308 dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n", 313 dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
309 (int)open->op_fname.len, open->op_fname.data, 314 (int)open->op_fname.len, open->op_fname.data,
@@ -331,7 +336,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
331 336
332 /* check seqid for replay. set nfs4_owner */ 337 /* check seqid for replay. set nfs4_owner */
333 resp = rqstp->rq_resp; 338 resp = rqstp->rq_resp;
334 status = nfsd4_process_open1(&resp->cstate, open); 339 status = nfsd4_process_open1(&resp->cstate, open, nn);
335 if (status == nfserr_replay_me) { 340 if (status == nfserr_replay_me) {
336 struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay; 341 struct nfs4_replay *rp = &open->op_openowner->oo_owner.so_replay;
337 fh_put(&cstate->current_fh); 342 fh_put(&cstate->current_fh);
@@ -354,10 +359,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
354 /* Openowner is now set, so sequence id will get bumped. Now we need 359 /* Openowner is now set, so sequence id will get bumped. Now we need
355 * these checks before we do any creates: */ 360 * these checks before we do any creates: */
356 status = nfserr_grace; 361 status = nfserr_grace;
357 if (locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) 362 if (locks_in_grace(net) && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
358 goto out; 363 goto out;
359 status = nfserr_no_grace; 364 status = nfserr_no_grace;
360 if (!locks_in_grace(SVC_NET(rqstp)) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) 365 if (!locks_in_grace(net) && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS)
361 goto out; 366 goto out;
362 367
363 switch (open->op_claim_type) { 368 switch (open->op_claim_type) {
@@ -370,7 +375,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
370 break; 375 break;
371 case NFS4_OPEN_CLAIM_PREVIOUS: 376 case NFS4_OPEN_CLAIM_PREVIOUS:
372 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 377 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
373 status = nfs4_check_open_reclaim(&open->op_clientid, cstate->minorversion); 378 status = nfs4_check_open_reclaim(&open->op_clientid,
379 cstate->minorversion,
380 nn);
374 if (status) 381 if (status)
375 goto out; 382 goto out;
376 case NFS4_OPEN_CLAIM_FH: 383 case NFS4_OPEN_CLAIM_FH:
@@ -490,12 +497,13 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
490 &access->ac_supported); 497 &access->ac_supported);
491} 498}
492 499
493static void gen_boot_verifier(nfs4_verifier *verifier) 500static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
494{ 501{
495 __be32 verf[2]; 502 __be32 verf[2];
503 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
496 504
497 verf[0] = (__be32)nfssvc_boot.tv_sec; 505 verf[0] = (__be32)nn->nfssvc_boot.tv_sec;
498 verf[1] = (__be32)nfssvc_boot.tv_usec; 506 verf[1] = (__be32)nn->nfssvc_boot.tv_usec;
499 memcpy(verifier->data, verf, sizeof(verifier->data)); 507 memcpy(verifier->data, verf, sizeof(verifier->data));
500} 508}
501 509
@@ -503,7 +511,7 @@ static __be32
503nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 511nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
504 struct nfsd4_commit *commit) 512 struct nfsd4_commit *commit)
505{ 513{
506 gen_boot_verifier(&commit->co_verf); 514 gen_boot_verifier(&commit->co_verf, SVC_NET(rqstp));
507 return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset, 515 return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
508 commit->co_count); 516 commit->co_count);
509} 517}
@@ -684,6 +692,17 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
684 if (read->rd_offset >= OFFSET_MAX) 692 if (read->rd_offset >= OFFSET_MAX)
685 return nfserr_inval; 693 return nfserr_inval;
686 694
695 /*
696 * If we do a zero copy read, then a client will see read data
697 * that reflects the state of the file *after* performing the
698 * following compound.
699 *
700 * To ensure proper ordering, we therefore turn off zero copy if
701 * the client wants us to do more in this compound:
702 */
703 if (!nfsd4_last_compound_op(rqstp))
704 rqstp->rq_splice_ok = false;
705
687 nfs4_lock_state(); 706 nfs4_lock_state();
688 /* check stateid */ 707 /* check stateid */
689 if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), 708 if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
@@ -876,6 +895,24 @@ out:
876 return status; 895 return status;
877} 896}
878 897
898static int fill_in_write_vector(struct kvec *vec, struct nfsd4_write *write)
899{
900 int i = 1;
901 int buflen = write->wr_buflen;
902
903 vec[0].iov_base = write->wr_head.iov_base;
904 vec[0].iov_len = min_t(int, buflen, write->wr_head.iov_len);
905 buflen -= vec[0].iov_len;
906
907 while (buflen) {
908 vec[i].iov_base = page_address(write->wr_pagelist[i - 1]);
909 vec[i].iov_len = min_t(int, PAGE_SIZE, buflen);
910 buflen -= vec[i].iov_len;
911 i++;
912 }
913 return i;
914}
915
879static __be32 916static __be32
880nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 917nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
881 struct nfsd4_write *write) 918 struct nfsd4_write *write)
@@ -884,6 +921,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
884 struct file *filp = NULL; 921 struct file *filp = NULL;
885 __be32 status = nfs_ok; 922 __be32 status = nfs_ok;
886 unsigned long cnt; 923 unsigned long cnt;
924 int nvecs;
887 925
888 /* no need to check permission - this will be done in nfsd_write() */ 926 /* no need to check permission - this will be done in nfsd_write() */
889 927
@@ -904,10 +942,13 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
904 942
905 cnt = write->wr_buflen; 943 cnt = write->wr_buflen;
906 write->wr_how_written = write->wr_stable_how; 944 write->wr_how_written = write->wr_stable_how;
907 gen_boot_verifier(&write->wr_verifier); 945 gen_boot_verifier(&write->wr_verifier, SVC_NET(rqstp));
946
947 nvecs = fill_in_write_vector(rqstp->rq_vec, write);
948 WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
908 949
909 status = nfsd_write(rqstp, &cstate->current_fh, filp, 950 status = nfsd_write(rqstp, &cstate->current_fh, filp,
910 write->wr_offset, rqstp->rq_vec, write->wr_vlen, 951 write->wr_offset, rqstp->rq_vec, nvecs,
911 &cnt, &write->wr_how_written); 952 &cnt, &write->wr_how_written);
912 if (filp) 953 if (filp)
913 fput(filp); 954 fput(filp);
@@ -1666,6 +1707,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
1666 .op_name = "OP_EXCHANGE_ID", 1707 .op_name = "OP_EXCHANGE_ID",
1667 .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize, 1708 .op_rsize_bop = (nfsd4op_rsize)nfsd4_exchange_id_rsize,
1668 }, 1709 },
1710 [OP_BACKCHANNEL_CTL] = {
1711 .op_func = (nfsd4op_func)nfsd4_backchannel_ctl,
1712 .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
1713 .op_name = "OP_BACKCHANNEL_CTL",
1714 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1715 },
1669 [OP_BIND_CONN_TO_SESSION] = { 1716 [OP_BIND_CONN_TO_SESSION] = {
1670 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session, 1717 .op_func = (nfsd4op_func)nfsd4_bind_conn_to_session,
1671 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP 1718 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP
@@ -1719,6 +1766,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
1719 .op_func = (nfsd4op_func)nfsd4_free_stateid, 1766 .op_func = (nfsd4op_func)nfsd4_free_stateid,
1720 .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING, 1767 .op_flags = ALLOWED_WITHOUT_FH | OP_MODIFIES_SOMETHING,
1721 .op_name = "OP_FREE_STATEID", 1768 .op_name = "OP_FREE_STATEID",
1769 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1722 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 1770 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1723 }, 1771 },
1724}; 1772};
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 43295d45cc2b..ba6fdd4a0455 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,13 +58,11 @@ struct nfsd4_client_tracking_ops {
58 void (*create)(struct nfs4_client *); 58 void (*create)(struct nfs4_client *);
59 void (*remove)(struct nfs4_client *); 59 void (*remove)(struct nfs4_client *);
60 int (*check)(struct nfs4_client *); 60 int (*check)(struct nfs4_client *);
61 void (*grace_done)(struct net *, time_t); 61 void (*grace_done)(struct nfsd_net *, time_t);
62}; 62};
63 63
64/* Globals */ 64/* Globals */
65static struct file *rec_file;
66static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; 65static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
67static struct nfsd4_client_tracking_ops *client_tracking_ops;
68 66
69static int 67static int
70nfs4_save_creds(const struct cred **original_creds) 68nfs4_save_creds(const struct cred **original_creds)
@@ -102,33 +100,39 @@ md5_to_hex(char *out, char *md5)
102 *out = '\0'; 100 *out = '\0';
103} 101}
104 102
105__be32 103static int
106nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) 104nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname)
107{ 105{
108 struct xdr_netobj cksum; 106 struct xdr_netobj cksum;
109 struct hash_desc desc; 107 struct hash_desc desc;
110 struct scatterlist sg; 108 struct scatterlist sg;
111 __be32 status = nfserr_jukebox; 109 int status;
112 110
113 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n", 111 dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
114 clname->len, clname->data); 112 clname->len, clname->data);
115 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 113 desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
116 desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); 114 desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
117 if (IS_ERR(desc.tfm)) 115 if (IS_ERR(desc.tfm)) {
116 status = PTR_ERR(desc.tfm);
118 goto out_no_tfm; 117 goto out_no_tfm;
118 }
119
119 cksum.len = crypto_hash_digestsize(desc.tfm); 120 cksum.len = crypto_hash_digestsize(desc.tfm);
120 cksum.data = kmalloc(cksum.len, GFP_KERNEL); 121 cksum.data = kmalloc(cksum.len, GFP_KERNEL);
121 if (cksum.data == NULL) 122 if (cksum.data == NULL) {
123 status = -ENOMEM;
122 goto out; 124 goto out;
125 }
123 126
124 sg_init_one(&sg, clname->data, clname->len); 127 sg_init_one(&sg, clname->data, clname->len);
125 128
126 if (crypto_hash_digest(&desc, &sg, sg.length, cksum.data)) 129 status = crypto_hash_digest(&desc, &sg, sg.length, cksum.data);
130 if (status)
127 goto out; 131 goto out;
128 132
129 md5_to_hex(dname, cksum.data); 133 md5_to_hex(dname, cksum.data);
130 134
131 status = nfs_ok; 135 status = 0;
132out: 136out:
133 kfree(cksum.data); 137 kfree(cksum.data);
134 crypto_free_hash(desc.tfm); 138 crypto_free_hash(desc.tfm);
@@ -136,29 +140,61 @@ out_no_tfm:
136 return status; 140 return status;
137} 141}
138 142
143/*
144 * If we had an error generating the recdir name for the legacy tracker
145 * then warn the admin. If the error doesn't appear to be transient,
146 * then disable recovery tracking.
147 */
148static void
149legacy_recdir_name_error(int error)
150{
151 printk(KERN_ERR "NFSD: unable to generate recoverydir "
152 "name (%d).\n", error);
153
154 /*
155 * if the algorithm just doesn't exist, then disable the recovery
156 * tracker altogether. The crypto libs will generally return this if
157 * FIPS is enabled as well.
158 */
159 if (error == -ENOENT) {
160 printk(KERN_ERR "NFSD: disabling legacy clientid tracking. "
161 "Reboot recovery will not function correctly!\n");
162
163 /* the argument is ignored by the legacy exit function */
164 nfsd4_client_tracking_exit(NULL);
165 }
166}
167
139static void 168static void
140nfsd4_create_clid_dir(struct nfs4_client *clp) 169nfsd4_create_clid_dir(struct nfs4_client *clp)
141{ 170{
142 const struct cred *original_cred; 171 const struct cred *original_cred;
143 char *dname = clp->cl_recdir; 172 char dname[HEXDIR_LEN];
144 struct dentry *dir, *dentry; 173 struct dentry *dir, *dentry;
174 struct nfs4_client_reclaim *crp;
145 int status; 175 int status;
176 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
146 177
147 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 178 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
148 179
149 if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) 180 if (test_and_set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
150 return; 181 return;
151 if (!rec_file) 182 if (!nn->rec_file)
152 return; 183 return;
184
185 status = nfs4_make_rec_clidname(dname, &clp->cl_name);
186 if (status)
187 return legacy_recdir_name_error(status);
188
153 status = nfs4_save_creds(&original_cred); 189 status = nfs4_save_creds(&original_cred);
154 if (status < 0) 190 if (status < 0)
155 return; 191 return;
156 192
157 status = mnt_want_write_file(rec_file); 193 status = mnt_want_write_file(nn->rec_file);
158 if (status) 194 if (status)
159 return; 195 return;
160 196
161 dir = rec_file->f_path.dentry; 197 dir = nn->rec_file->f_path.dentry;
162 /* lock the parent */ 198 /* lock the parent */
163 mutex_lock(&dir->d_inode->i_mutex); 199 mutex_lock(&dir->d_inode->i_mutex);
164 200
@@ -182,18 +218,24 @@ out_put:
182 dput(dentry); 218 dput(dentry);
183out_unlock: 219out_unlock:
184 mutex_unlock(&dir->d_inode->i_mutex); 220 mutex_unlock(&dir->d_inode->i_mutex);
185 if (status == 0) 221 if (status == 0) {
186 vfs_fsync(rec_file, 0); 222 if (nn->in_grace) {
187 else 223 crp = nfs4_client_to_reclaim(dname, nn);
224 if (crp)
225 crp->cr_clp = clp;
226 }
227 vfs_fsync(nn->rec_file, 0);
228 } else {
188 printk(KERN_ERR "NFSD: failed to write recovery record" 229 printk(KERN_ERR "NFSD: failed to write recovery record"
189 " (err %d); please check that %s exists" 230 " (err %d); please check that %s exists"
190 " and is writeable", status, 231 " and is writeable", status,
191 user_recovery_dirname); 232 user_recovery_dirname);
192 mnt_drop_write_file(rec_file); 233 }
234 mnt_drop_write_file(nn->rec_file);
193 nfs4_reset_creds(original_cred); 235 nfs4_reset_creds(original_cred);
194} 236}
195 237
196typedef int (recdir_func)(struct dentry *, struct dentry *); 238typedef int (recdir_func)(struct dentry *, struct dentry *, struct nfsd_net *);
197 239
198struct name_list { 240struct name_list {
199 char name[HEXDIR_LEN]; 241 char name[HEXDIR_LEN];
@@ -219,10 +261,10 @@ nfsd4_build_namelist(void *arg, const char *name, int namlen,
219} 261}
220 262
221static int 263static int
222nfsd4_list_rec_dir(recdir_func *f) 264nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
223{ 265{
224 const struct cred *original_cred; 266 const struct cred *original_cred;
225 struct dentry *dir = rec_file->f_path.dentry; 267 struct dentry *dir = nn->rec_file->f_path.dentry;
226 LIST_HEAD(names); 268 LIST_HEAD(names);
227 int status; 269 int status;
228 270
@@ -230,13 +272,13 @@ nfsd4_list_rec_dir(recdir_func *f)
230 if (status < 0) 272 if (status < 0)
231 return status; 273 return status;
232 274
233 status = vfs_llseek(rec_file, 0, SEEK_SET); 275 status = vfs_llseek(nn->rec_file, 0, SEEK_SET);
234 if (status < 0) { 276 if (status < 0) {
235 nfs4_reset_creds(original_cred); 277 nfs4_reset_creds(original_cred);
236 return status; 278 return status;
237 } 279 }
238 280
239 status = vfs_readdir(rec_file, nfsd4_build_namelist, &names); 281 status = vfs_readdir(nn->rec_file, nfsd4_build_namelist, &names);
240 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 282 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
241 while (!list_empty(&names)) { 283 while (!list_empty(&names)) {
242 struct name_list *entry; 284 struct name_list *entry;
@@ -248,7 +290,7 @@ nfsd4_list_rec_dir(recdir_func *f)
248 status = PTR_ERR(dentry); 290 status = PTR_ERR(dentry);
249 break; 291 break;
250 } 292 }
251 status = f(dir, dentry); 293 status = f(dir, dentry, nn);
252 dput(dentry); 294 dput(dentry);
253 } 295 }
254 list_del(&entry->list); 296 list_del(&entry->list);
@@ -260,14 +302,14 @@ nfsd4_list_rec_dir(recdir_func *f)
260} 302}
261 303
262static int 304static int
263nfsd4_unlink_clid_dir(char *name, int namlen) 305nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
264{ 306{
265 struct dentry *dir, *dentry; 307 struct dentry *dir, *dentry;
266 int status; 308 int status;
267 309
268 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 310 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
269 311
270 dir = rec_file->f_path.dentry; 312 dir = nn->rec_file->f_path.dentry;
271 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); 313 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
272 dentry = lookup_one_len(name, dir, namlen); 314 dentry = lookup_one_len(name, dir, namlen);
273 if (IS_ERR(dentry)) { 315 if (IS_ERR(dentry)) {
@@ -289,37 +331,52 @@ static void
289nfsd4_remove_clid_dir(struct nfs4_client *clp) 331nfsd4_remove_clid_dir(struct nfs4_client *clp)
290{ 332{
291 const struct cred *original_cred; 333 const struct cred *original_cred;
334 struct nfs4_client_reclaim *crp;
335 char dname[HEXDIR_LEN];
292 int status; 336 int status;
337 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
293 338
294 if (!rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) 339 if (!nn->rec_file || !test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
295 return; 340 return;
296 341
297 status = mnt_want_write_file(rec_file); 342 status = nfs4_make_rec_clidname(dname, &clp->cl_name);
343 if (status)
344 return legacy_recdir_name_error(status);
345
346 status = mnt_want_write_file(nn->rec_file);
298 if (status) 347 if (status)
299 goto out; 348 goto out;
300 clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); 349 clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
301 350
302 status = nfs4_save_creds(&original_cred); 351 status = nfs4_save_creds(&original_cred);
303 if (status < 0) 352 if (status < 0)
304 goto out; 353 goto out_drop_write;
305 354
306 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); 355 status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn);
307 nfs4_reset_creds(original_cred); 356 nfs4_reset_creds(original_cred);
308 if (status == 0) 357 if (status == 0) {
309 vfs_fsync(rec_file, 0); 358 vfs_fsync(nn->rec_file, 0);
310 mnt_drop_write_file(rec_file); 359 if (nn->in_grace) {
360 /* remove reclaim record */
361 crp = nfsd4_find_reclaim_client(dname, nn);
362 if (crp)
363 nfs4_remove_reclaim_record(crp, nn);
364 }
365 }
366out_drop_write:
367 mnt_drop_write_file(nn->rec_file);
311out: 368out:
312 if (status) 369 if (status)
313 printk("NFSD: Failed to remove expired client state directory" 370 printk("NFSD: Failed to remove expired client state directory"
314 " %.*s\n", HEXDIR_LEN, clp->cl_recdir); 371 " %.*s\n", HEXDIR_LEN, dname);
315} 372}
316 373
317static int 374static int
318purge_old(struct dentry *parent, struct dentry *child) 375purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
319{ 376{
320 int status; 377 int status;
321 378
322 if (nfs4_has_reclaimed_state(child->d_name.name, false)) 379 if (nfs4_has_reclaimed_state(child->d_name.name, nn))
323 return 0; 380 return 0;
324 381
325 status = vfs_rmdir(parent->d_inode, child); 382 status = vfs_rmdir(parent->d_inode, child);
@@ -331,27 +388,29 @@ purge_old(struct dentry *parent, struct dentry *child)
331} 388}
332 389
333static void 390static void
334nfsd4_recdir_purge_old(struct net *net, time_t boot_time) 391nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time)
335{ 392{
336 int status; 393 int status;
337 394
338 if (!rec_file) 395 nn->in_grace = false;
396 if (!nn->rec_file)
339 return; 397 return;
340 status = mnt_want_write_file(rec_file); 398 status = mnt_want_write_file(nn->rec_file);
341 if (status) 399 if (status)
342 goto out; 400 goto out;
343 status = nfsd4_list_rec_dir(purge_old); 401 status = nfsd4_list_rec_dir(purge_old, nn);
344 if (status == 0) 402 if (status == 0)
345 vfs_fsync(rec_file, 0); 403 vfs_fsync(nn->rec_file, 0);
346 mnt_drop_write_file(rec_file); 404 mnt_drop_write_file(nn->rec_file);
347out: 405out:
406 nfs4_release_reclaim(nn);
348 if (status) 407 if (status)
349 printk("nfsd4: failed to purge old clients from recovery" 408 printk("nfsd4: failed to purge old clients from recovery"
350 " directory %s\n", rec_file->f_path.dentry->d_name.name); 409 " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
351} 410}
352 411
353static int 412static int
354load_recdir(struct dentry *parent, struct dentry *child) 413load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
355{ 414{
356 if (child->d_name.len != HEXDIR_LEN - 1) { 415 if (child->d_name.len != HEXDIR_LEN - 1) {
357 printk("nfsd4: illegal name %s in recovery directory\n", 416 printk("nfsd4: illegal name %s in recovery directory\n",
@@ -359,21 +418,22 @@ load_recdir(struct dentry *parent, struct dentry *child)
359 /* Keep trying; maybe the others are OK: */ 418 /* Keep trying; maybe the others are OK: */
360 return 0; 419 return 0;
361 } 420 }
362 nfs4_client_to_reclaim(child->d_name.name); 421 nfs4_client_to_reclaim(child->d_name.name, nn);
363 return 0; 422 return 0;
364} 423}
365 424
366static int 425static int
367nfsd4_recdir_load(void) { 426nfsd4_recdir_load(struct net *net) {
368 int status; 427 int status;
428 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
369 429
370 if (!rec_file) 430 if (!nn->rec_file)
371 return 0; 431 return 0;
372 432
373 status = nfsd4_list_rec_dir(load_recdir); 433 status = nfsd4_list_rec_dir(load_recdir, nn);
374 if (status) 434 if (status)
375 printk("nfsd4: failed loading clients from recovery" 435 printk("nfsd4: failed loading clients from recovery"
376 " directory %s\n", rec_file->f_path.dentry->d_name.name); 436 " directory %s\n", nn->rec_file->f_path.dentry->d_name.name);
377 return status; 437 return status;
378} 438}
379 439
@@ -382,15 +442,16 @@ nfsd4_recdir_load(void) {
382 */ 442 */
383 443
384static int 444static int
385nfsd4_init_recdir(void) 445nfsd4_init_recdir(struct net *net)
386{ 446{
447 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
387 const struct cred *original_cred; 448 const struct cred *original_cred;
388 int status; 449 int status;
389 450
390 printk("NFSD: Using %s as the NFSv4 state recovery directory\n", 451 printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
391 user_recovery_dirname); 452 user_recovery_dirname);
392 453
393 BUG_ON(rec_file); 454 BUG_ON(nn->rec_file);
394 455
395 status = nfs4_save_creds(&original_cred); 456 status = nfs4_save_creds(&original_cred);
396 if (status < 0) { 457 if (status < 0) {
@@ -400,23 +461,65 @@ nfsd4_init_recdir(void)
400 return status; 461 return status;
401 } 462 }
402 463
403 rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0); 464 nn->rec_file = filp_open(user_recovery_dirname, O_RDONLY | O_DIRECTORY, 0);
404 if (IS_ERR(rec_file)) { 465 if (IS_ERR(nn->rec_file)) {
405 printk("NFSD: unable to find recovery directory %s\n", 466 printk("NFSD: unable to find recovery directory %s\n",
406 user_recovery_dirname); 467 user_recovery_dirname);
407 status = PTR_ERR(rec_file); 468 status = PTR_ERR(nn->rec_file);
408 rec_file = NULL; 469 nn->rec_file = NULL;
409 } 470 }
410 471
411 nfs4_reset_creds(original_cred); 472 nfs4_reset_creds(original_cred);
473 if (!status)
474 nn->in_grace = true;
412 return status; 475 return status;
413} 476}
414 477
478
479static int
480nfs4_legacy_state_init(struct net *net)
481{
482 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
483 int i;
484
485 nn->reclaim_str_hashtbl = kmalloc(sizeof(struct list_head) *
486 CLIENT_HASH_SIZE, GFP_KERNEL);
487 if (!nn->reclaim_str_hashtbl)
488 return -ENOMEM;
489
490 for (i = 0; i < CLIENT_HASH_SIZE; i++)
491 INIT_LIST_HEAD(&nn->reclaim_str_hashtbl[i]);
492 nn->reclaim_str_hashtbl_size = 0;
493
494 return 0;
495}
496
497static void
498nfs4_legacy_state_shutdown(struct net *net)
499{
500 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
501
502 kfree(nn->reclaim_str_hashtbl);
503}
504
415static int 505static int
416nfsd4_load_reboot_recovery_data(struct net *net) 506nfsd4_load_reboot_recovery_data(struct net *net)
417{ 507{
418 int status; 508 int status;
419 509
510 status = nfsd4_init_recdir(net);
511 if (!status)
512 status = nfsd4_recdir_load(net);
513 if (status)
514 printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n");
515 return status;
516}
517
518static int
519nfsd4_legacy_tracking_init(struct net *net)
520{
521 int status;
522
420 /* XXX: The legacy code won't work in a container */ 523 /* XXX: The legacy code won't work in a container */
421 if (net != &init_net) { 524 if (net != &init_net) {
422 WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client " 525 WARN(1, KERN_ERR "NFSD: attempt to initialize legacy client "
@@ -424,30 +527,37 @@ nfsd4_load_reboot_recovery_data(struct net *net)
424 return -EINVAL; 527 return -EINVAL;
425 } 528 }
426 529
427 nfs4_lock_state(); 530 status = nfs4_legacy_state_init(net);
428 status = nfsd4_init_recdir();
429 if (!status)
430 status = nfsd4_recdir_load();
431 nfs4_unlock_state();
432 if (status) 531 if (status)
433 printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); 532 return status;
533
534 status = nfsd4_load_reboot_recovery_data(net);
535 if (status)
536 goto err;
537 return 0;
538
539err:
540 nfs4_legacy_state_shutdown(net);
434 return status; 541 return status;
435} 542}
436 543
437static void 544static void
438nfsd4_shutdown_recdir(void) 545nfsd4_shutdown_recdir(struct nfsd_net *nn)
439{ 546{
440 if (!rec_file) 547 if (!nn->rec_file)
441 return; 548 return;
442 fput(rec_file); 549 fput(nn->rec_file);
443 rec_file = NULL; 550 nn->rec_file = NULL;
444} 551}
445 552
446static void 553static void
447nfsd4_legacy_tracking_exit(struct net *net) 554nfsd4_legacy_tracking_exit(struct net *net)
448{ 555{
449 nfs4_release_reclaim(); 556 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
450 nfsd4_shutdown_recdir(); 557
558 nfs4_release_reclaim(nn);
559 nfsd4_shutdown_recdir(nn);
560 nfs4_legacy_state_shutdown(net);
451} 561}
452 562
453/* 563/*
@@ -480,13 +590,26 @@ nfs4_recoverydir(void)
480static int 590static int
481nfsd4_check_legacy_client(struct nfs4_client *clp) 591nfsd4_check_legacy_client(struct nfs4_client *clp)
482{ 592{
593 int status;
594 char dname[HEXDIR_LEN];
595 struct nfs4_client_reclaim *crp;
596 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
597
483 /* did we already find that this client is stable? */ 598 /* did we already find that this client is stable? */
484 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) 599 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
485 return 0; 600 return 0;
486 601
602 status = nfs4_make_rec_clidname(dname, &clp->cl_name);
603 if (status) {
604 legacy_recdir_name_error(status);
605 return status;
606 }
607
487 /* look for it in the reclaim hashtable otherwise */ 608 /* look for it in the reclaim hashtable otherwise */
488 if (nfsd4_find_reclaim_client(clp)) { 609 crp = nfsd4_find_reclaim_client(dname, nn);
610 if (crp) {
489 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); 611 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
612 crp->cr_clp = clp;
490 return 0; 613 return 0;
491 } 614 }
492 615
@@ -494,7 +617,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
494} 617}
495 618
496static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { 619static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
497 .init = nfsd4_load_reboot_recovery_data, 620 .init = nfsd4_legacy_tracking_init,
498 .exit = nfsd4_legacy_tracking_exit, 621 .exit = nfsd4_legacy_tracking_exit,
499 .create = nfsd4_create_clid_dir, 622 .create = nfsd4_create_clid_dir,
500 .remove = nfsd4_remove_clid_dir, 623 .remove = nfsd4_remove_clid_dir,
@@ -785,8 +908,7 @@ nfsd4_cld_create(struct nfs4_client *clp)
785{ 908{
786 int ret; 909 int ret;
787 struct cld_upcall *cup; 910 struct cld_upcall *cup;
788 /* FIXME: determine net from clp */ 911 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
789 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
790 struct cld_net *cn = nn->cld_net; 912 struct cld_net *cn = nn->cld_net;
791 913
792 /* Don't upcall if it's already stored */ 914 /* Don't upcall if it's already stored */
@@ -823,8 +945,7 @@ nfsd4_cld_remove(struct nfs4_client *clp)
823{ 945{
824 int ret; 946 int ret;
825 struct cld_upcall *cup; 947 struct cld_upcall *cup;
826 /* FIXME: determine net from clp */ 948 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
827 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
828 struct cld_net *cn = nn->cld_net; 949 struct cld_net *cn = nn->cld_net;
829 950
830 /* Don't upcall if it's already removed */ 951 /* Don't upcall if it's already removed */
@@ -861,8 +982,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
861{ 982{
862 int ret; 983 int ret;
863 struct cld_upcall *cup; 984 struct cld_upcall *cup;
864 /* FIXME: determine net from clp */ 985 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
865 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
866 struct cld_net *cn = nn->cld_net; 986 struct cld_net *cn = nn->cld_net;
867 987
868 /* Don't upcall if one was already stored during this grace pd */ 988 /* Don't upcall if one was already stored during this grace pd */
@@ -892,11 +1012,10 @@ nfsd4_cld_check(struct nfs4_client *clp)
892} 1012}
893 1013
894static void 1014static void
895nfsd4_cld_grace_done(struct net *net, time_t boot_time) 1015nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
896{ 1016{
897 int ret; 1017 int ret;
898 struct cld_upcall *cup; 1018 struct cld_upcall *cup;
899 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
900 struct cld_net *cn = nn->cld_net; 1019 struct cld_net *cn = nn->cld_net;
901 1020
902 cup = alloc_cld_upcall(cn); 1021 cup = alloc_cld_upcall(cn);
@@ -926,28 +1045,261 @@ static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
926 .grace_done = nfsd4_cld_grace_done, 1045 .grace_done = nfsd4_cld_grace_done,
927}; 1046};
928 1047
1048/* upcall via usermodehelper */
1049static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
1050module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
1051 S_IRUGO|S_IWUSR);
1052MODULE_PARM_DESC(cltrack_prog, "Path to the nfsdcltrack upcall program");
1053
1054static bool cltrack_legacy_disable;
1055module_param(cltrack_legacy_disable, bool, S_IRUGO|S_IWUSR);
1056MODULE_PARM_DESC(cltrack_legacy_disable,
1057 "Disable legacy recoverydir conversion. Default: false");
1058
1059#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
1060#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
1061
1062static char *
1063nfsd4_cltrack_legacy_topdir(void)
1064{
1065 int copied;
1066 size_t len;
1067 char *result;
1068
1069 if (cltrack_legacy_disable)
1070 return NULL;
1071
1072 len = strlen(LEGACY_TOPDIR_ENV_PREFIX) +
1073 strlen(nfs4_recoverydir()) + 1;
1074
1075 result = kmalloc(len, GFP_KERNEL);
1076 if (!result)
1077 return result;
1078
1079 copied = snprintf(result, len, LEGACY_TOPDIR_ENV_PREFIX "%s",
1080 nfs4_recoverydir());
1081 if (copied >= len) {
1082 /* just return nothing if output was truncated */
1083 kfree(result);
1084 return NULL;
1085 }
1086
1087 return result;
1088}
1089
1090static char *
1091nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
1092{
1093 int copied;
1094 size_t len;
1095 char *result;
1096
1097 if (cltrack_legacy_disable)
1098 return NULL;
1099
1100 /* +1 is for '/' between "topdir" and "recdir" */
1101 len = strlen(LEGACY_RECDIR_ENV_PREFIX) +
1102 strlen(nfs4_recoverydir()) + 1 + HEXDIR_LEN;
1103
1104 result = kmalloc(len, GFP_KERNEL);
1105 if (!result)
1106 return result;
1107
1108 copied = snprintf(result, len, LEGACY_RECDIR_ENV_PREFIX "%s/",
1109 nfs4_recoverydir());
1110 if (copied > (len - HEXDIR_LEN)) {
1111 /* just return nothing if output will be truncated */
1112 kfree(result);
1113 return NULL;
1114 }
1115
1116 copied = nfs4_make_rec_clidname(result + copied, name);
1117 if (copied) {
1118 kfree(result);
1119 return NULL;
1120 }
1121
1122 return result;
1123}
1124
1125static int
1126nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
1127{
1128 char *envp[2];
1129 char *argv[4];
1130 int ret;
1131
1132 if (unlikely(!cltrack_prog[0])) {
1133 dprintk("%s: cltrack_prog is disabled\n", __func__);
1134 return -EACCES;
1135 }
1136
1137 dprintk("%s: cmd: %s\n", __func__, cmd);
1138 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
1139 dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)");
1140
1141 envp[0] = legacy;
1142 envp[1] = NULL;
1143
1144 argv[0] = (char *)cltrack_prog;
1145 argv[1] = cmd;
1146 argv[2] = arg;
1147 argv[3] = NULL;
1148
1149 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
1150 /*
1151 * Disable the upcall mechanism if we're getting an ENOENT or EACCES
1152 * error. The admin can re-enable it on the fly by using sysfs
1153 * once the problem has been fixed.
1154 */
1155 if (ret == -ENOENT || ret == -EACCES) {
1156 dprintk("NFSD: %s was not found or isn't executable (%d). "
1157 "Setting cltrack_prog to blank string!",
1158 cltrack_prog, ret);
1159 cltrack_prog[0] = '\0';
1160 }
1161 dprintk("%s: %s return value: %d\n", __func__, cltrack_prog, ret);
1162
1163 return ret;
1164}
1165
1166static char *
1167bin_to_hex_dup(const unsigned char *src, int srclen)
1168{
1169 int i;
1170 char *buf, *hex;
1171
1172 /* +1 for terminating NULL */
1173 buf = kmalloc((srclen * 2) + 1, GFP_KERNEL);
1174 if (!buf)
1175 return buf;
1176
1177 hex = buf;
1178 for (i = 0; i < srclen; i++) {
1179 sprintf(hex, "%2.2x", *src++);
1180 hex += 2;
1181 }
1182 return buf;
1183}
1184
1185static int
1186nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net)
1187{
1188 return nfsd4_umh_cltrack_upcall("init", NULL, NULL);
1189}
1190
1191static void
1192nfsd4_umh_cltrack_create(struct nfs4_client *clp)
1193{
1194 char *hexid;
1195
1196 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1197 if (!hexid) {
1198 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1199 return;
1200 }
1201 nfsd4_umh_cltrack_upcall("create", hexid, NULL);
1202 kfree(hexid);
1203}
1204
1205static void
1206nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
1207{
1208 char *hexid;
1209
1210 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1211 if (!hexid) {
1212 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1213 return;
1214 }
1215 nfsd4_umh_cltrack_upcall("remove", hexid, NULL);
1216 kfree(hexid);
1217}
1218
1219static int
1220nfsd4_umh_cltrack_check(struct nfs4_client *clp)
1221{
1222 int ret;
1223 char *hexid, *legacy;
1224
1225 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1226 if (!hexid) {
1227 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1228 return -ENOMEM;
1229 }
1230 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
1231 ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy);
1232 kfree(legacy);
1233 kfree(hexid);
1234 return ret;
1235}
1236
1237static void
1238nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn,
1239 time_t boot_time)
1240{
1241 char *legacy;
1242 char timestr[22]; /* FIXME: better way to determine max size? */
1243
1244 sprintf(timestr, "%ld", boot_time);
1245 legacy = nfsd4_cltrack_legacy_topdir();
1246 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy);
1247 kfree(legacy);
1248}
1249
1250static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
1251 .init = nfsd4_umh_cltrack_init,
1252 .exit = NULL,
1253 .create = nfsd4_umh_cltrack_create,
1254 .remove = nfsd4_umh_cltrack_remove,
1255 .check = nfsd4_umh_cltrack_check,
1256 .grace_done = nfsd4_umh_cltrack_grace_done,
1257};
1258
929int 1259int
930nfsd4_client_tracking_init(struct net *net) 1260nfsd4_client_tracking_init(struct net *net)
931{ 1261{
932 int status; 1262 int status;
933 struct path path; 1263 struct path path;
1264 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
934 1265
935 if (!client_tracking_ops) { 1266 /* just run the init if it the method is already decided */
936 client_tracking_ops = &nfsd4_cld_tracking_ops; 1267 if (nn->client_tracking_ops)
937 status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path); 1268 goto do_init;
938 if (!status) { 1269
939 if (S_ISDIR(path.dentry->d_inode->i_mode)) 1270 /*
940 client_tracking_ops = 1271 * First, try a UMH upcall. It should succeed or fail quickly, so
941 &nfsd4_legacy_tracking_ops; 1272 * there's little harm in trying that first.
942 path_put(&path); 1273 */
943 } 1274 nn->client_tracking_ops = &nfsd4_umh_tracking_ops;
1275 status = nn->client_tracking_ops->init(net);
1276 if (!status)
1277 return status;
1278
1279 /*
1280 * See if the recoverydir exists and is a directory. If it is,
1281 * then use the legacy ops.
1282 */
1283 nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
1284 status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
1285 if (!status) {
1286 status = S_ISDIR(path.dentry->d_inode->i_mode);
1287 path_put(&path);
1288 if (status)
1289 goto do_init;
944 } 1290 }
945 1291
946 status = client_tracking_ops->init(net); 1292 /* Finally, try to use nfsdcld */
1293 nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
1294 printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
1295 "removed in 3.10. Please transition to using "
1296 "nfsdcltrack.\n");
1297do_init:
1298 status = nn->client_tracking_ops->init(net);
947 if (status) { 1299 if (status) {
948 printk(KERN_WARNING "NFSD: Unable to initialize client " 1300 printk(KERN_WARNING "NFSD: Unable to initialize client "
949 "recovery tracking! (%d)\n", status); 1301 "recovery tracking! (%d)\n", status);
950 client_tracking_ops = NULL; 1302 nn->client_tracking_ops = NULL;
951 } 1303 }
952 return status; 1304 return status;
953} 1305}
@@ -955,40 +1307,49 @@ nfsd4_client_tracking_init(struct net *net)
955void 1307void
956nfsd4_client_tracking_exit(struct net *net) 1308nfsd4_client_tracking_exit(struct net *net)
957{ 1309{
958 if (client_tracking_ops) { 1310 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
959 client_tracking_ops->exit(net); 1311
960 client_tracking_ops = NULL; 1312 if (nn->client_tracking_ops) {
1313 if (nn->client_tracking_ops->exit)
1314 nn->client_tracking_ops->exit(net);
1315 nn->client_tracking_ops = NULL;
961 } 1316 }
962} 1317}
963 1318
964void 1319void
965nfsd4_client_record_create(struct nfs4_client *clp) 1320nfsd4_client_record_create(struct nfs4_client *clp)
966{ 1321{
967 if (client_tracking_ops) 1322 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
968 client_tracking_ops->create(clp); 1323
1324 if (nn->client_tracking_ops)
1325 nn->client_tracking_ops->create(clp);
969} 1326}
970 1327
971void 1328void
972nfsd4_client_record_remove(struct nfs4_client *clp) 1329nfsd4_client_record_remove(struct nfs4_client *clp)
973{ 1330{
974 if (client_tracking_ops) 1331 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
975 client_tracking_ops->remove(clp); 1332
1333 if (nn->client_tracking_ops)
1334 nn->client_tracking_ops->remove(clp);
976} 1335}
977 1336
978int 1337int
979nfsd4_client_record_check(struct nfs4_client *clp) 1338nfsd4_client_record_check(struct nfs4_client *clp)
980{ 1339{
981 if (client_tracking_ops) 1340 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
982 return client_tracking_ops->check(clp); 1341
1342 if (nn->client_tracking_ops)
1343 return nn->client_tracking_ops->check(clp);
983 1344
984 return -EOPNOTSUPP; 1345 return -EOPNOTSUPP;
985} 1346}
986 1347
987void 1348void
988nfsd4_record_grace_done(struct net *net, time_t boot_time) 1349nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time)
989{ 1350{
990 if (client_tracking_ops) 1351 if (nn->client_tracking_ops)
991 client_tracking_ops->grace_done(net, boot_time); 1352 nn->client_tracking_ops->grace_done(nn, boot_time);
992} 1353}
993 1354
994static int 1355static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d0237f872cc4..ac8ed96c4199 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -44,16 +44,11 @@
44#include "xdr4.h" 44#include "xdr4.h"
45#include "vfs.h" 45#include "vfs.h"
46#include "current_stateid.h" 46#include "current_stateid.h"
47#include "fault_inject.h"
48 47
49#include "netns.h" 48#include "netns.h"
50 49
51#define NFSDDBG_FACILITY NFSDDBG_PROC 50#define NFSDDBG_FACILITY NFSDDBG_PROC
52 51
53/* Globals */
54time_t nfsd4_lease = 90; /* default lease time */
55time_t nfsd4_grace = 90;
56
57#define all_ones {{~0,~0},~0} 52#define all_ones {{~0,~0},~0}
58static const stateid_t one_stateid = { 53static const stateid_t one_stateid = {
59 .si_generation = ~0, 54 .si_generation = ~0,
@@ -176,8 +171,6 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
176 return ret & OWNER_HASH_MASK; 171 return ret & OWNER_HASH_MASK;
177} 172}
178 173
179static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
180
181/* hash table for nfs4_file */ 174/* hash table for nfs4_file */
182#define FILE_HASH_BITS 8 175#define FILE_HASH_BITS 8
183#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) 176#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
@@ -192,7 +185,7 @@ static struct list_head file_hashtbl[FILE_HASH_SIZE];
192 185
193static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) 186static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag)
194{ 187{
195 BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); 188 WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR]));
196 atomic_inc(&fp->fi_access[oflag]); 189 atomic_inc(&fp->fi_access[oflag]);
197} 190}
198 191
@@ -251,7 +244,7 @@ static inline int get_new_stid(struct nfs4_stid *stid)
251 * preallocations that can exist at a time, but the state lock 244 * preallocations that can exist at a time, but the state lock
252 * prevents anyone from using ours before we get here: 245 * prevents anyone from using ours before we get here:
253 */ 246 */
254 BUG_ON(error); 247 WARN_ON_ONCE(error);
255 /* 248 /*
256 * It shouldn't be a problem to reuse an opaque stateid value. 249 * It shouldn't be a problem to reuse an opaque stateid value.
257 * I don't think it is for 4.1. But with 4.0 I worry that, for 250 * I don't think it is for 4.1. But with 4.0 I worry that, for
@@ -340,7 +333,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
340 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 333 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
341 dp->dl_time = 0; 334 dp->dl_time = 0;
342 atomic_set(&dp->dl_count, 1); 335 atomic_set(&dp->dl_count, 1);
343 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); 336 nfsd4_init_callback(&dp->dl_recall);
344 return dp; 337 return dp;
345} 338}
346 339
@@ -390,14 +383,6 @@ unhash_delegation(struct nfs4_delegation *dp)
390 * SETCLIENTID state 383 * SETCLIENTID state
391 */ 384 */
392 385
393/* client_lock protects the client lru list and session hash table */
394static DEFINE_SPINLOCK(client_lock);
395
396/* Hash tables for nfs4_clientid state */
397#define CLIENT_HASH_BITS 4
398#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
399#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
400
401static unsigned int clientid_hashval(u32 id) 386static unsigned int clientid_hashval(u32 id)
402{ 387{
403 return id & CLIENT_HASH_MASK; 388 return id & CLIENT_HASH_MASK;
@@ -409,31 +394,6 @@ static unsigned int clientstr_hashval(const char *name)
409} 394}
410 395
411/* 396/*
412 * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
413 * used in reboot/reset lease grace period processing
414 *
415 * conf_id_hashtbl[], and conf_str_hashtbl[] hold confirmed
416 * setclientid_confirmed info.
417 *
418 * unconf_str_hastbl[] and unconf_id_hashtbl[] hold unconfirmed
419 * setclientid info.
420 *
421 * client_lru holds client queue ordered by nfs4_client.cl_time
422 * for lease renewal.
423 *
424 * close_lru holds (open) stateowner queue ordered by nfs4_stateowner.so_time
425 * for last close replay.
426 */
427static struct list_head reclaim_str_hashtbl[CLIENT_HASH_SIZE];
428static int reclaim_str_hashtbl_size = 0;
429static struct list_head conf_id_hashtbl[CLIENT_HASH_SIZE];
430static struct list_head conf_str_hashtbl[CLIENT_HASH_SIZE];
431static struct list_head unconf_str_hashtbl[CLIENT_HASH_SIZE];
432static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
433static struct list_head client_lru;
434static struct list_head close_lru;
435
436/*
437 * We store the NONE, READ, WRITE, and BOTH bits separately in the 397 * We store the NONE, READ, WRITE, and BOTH bits separately in the
438 * st_{access,deny}_bmap field of the stateid, in order to track not 398 * st_{access,deny}_bmap field of the stateid, in order to track not
439 * only what share bits are currently in force, but also what 399 * only what share bits are currently in force, but also what
@@ -526,7 +486,8 @@ static int nfs4_access_to_omode(u32 access)
526 case NFS4_SHARE_ACCESS_BOTH: 486 case NFS4_SHARE_ACCESS_BOTH:
527 return O_RDWR; 487 return O_RDWR;
528 } 488 }
529 BUG(); 489 WARN_ON_ONCE(1);
490 return O_RDONLY;
530} 491}
531 492
532/* release all access and file references for a given stateid */ 493/* release all access and file references for a given stateid */
@@ -652,9 +613,6 @@ static void release_openowner(struct nfs4_openowner *oo)
652 nfs4_free_openowner(oo); 613 nfs4_free_openowner(oo);
653} 614}
654 615
655#define SESSION_HASH_SIZE 512
656static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
657
658static inline int 616static inline int
659hash_sessionid(struct nfs4_sessionid *sessionid) 617hash_sessionid(struct nfs4_sessionid *sessionid)
660{ 618{
@@ -785,9 +743,12 @@ out_free:
785 return NULL; 743 return NULL;
786} 744}
787 745
788static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4_channel_attrs *req, int numslots, int slotsize) 746static void init_forechannel_attrs(struct nfsd4_channel_attrs *new,
747 struct nfsd4_channel_attrs *req,
748 int numslots, int slotsize,
749 struct nfsd_net *nn)
789{ 750{
790 u32 maxrpc = nfsd_serv->sv_max_mesg; 751 u32 maxrpc = nn->nfsd_serv->sv_max_mesg;
791 752
792 new->maxreqs = numslots; 753 new->maxreqs = numslots;
793 new->maxresp_cached = min_t(u32, req->maxresp_cached, 754 new->maxresp_cached = min_t(u32, req->maxresp_cached,
@@ -906,21 +867,27 @@ static void __free_session(struct nfsd4_session *ses)
906static void free_session(struct kref *kref) 867static void free_session(struct kref *kref)
907{ 868{
908 struct nfsd4_session *ses; 869 struct nfsd4_session *ses;
870 struct nfsd_net *nn;
909 871
910 lockdep_assert_held(&client_lock);
911 ses = container_of(kref, struct nfsd4_session, se_ref); 872 ses = container_of(kref, struct nfsd4_session, se_ref);
873 nn = net_generic(ses->se_client->net, nfsd_net_id);
874
875 lockdep_assert_held(&nn->client_lock);
912 nfsd4_del_conns(ses); 876 nfsd4_del_conns(ses);
913 __free_session(ses); 877 __free_session(ses);
914} 878}
915 879
916void nfsd4_put_session(struct nfsd4_session *ses) 880void nfsd4_put_session(struct nfsd4_session *ses)
917{ 881{
918 spin_lock(&client_lock); 882 struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
883
884 spin_lock(&nn->client_lock);
919 nfsd4_put_session_locked(ses); 885 nfsd4_put_session_locked(ses);
920 spin_unlock(&client_lock); 886 spin_unlock(&nn->client_lock);
921} 887}
922 888
923static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan) 889static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan,
890 struct nfsd_net *nn)
924{ 891{
925 struct nfsd4_session *new; 892 struct nfsd4_session *new;
926 int numslots, slotsize; 893 int numslots, slotsize;
@@ -941,13 +908,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fchan)
941 nfsd4_put_drc_mem(slotsize, fchan->maxreqs); 908 nfsd4_put_drc_mem(slotsize, fchan->maxreqs);
942 return NULL; 909 return NULL;
943 } 910 }
944 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize); 911 init_forechannel_attrs(&new->se_fchannel, fchan, numslots, slotsize, nn);
945 return new; 912 return new;
946} 913}
947 914
948static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses) 915static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, struct nfs4_client *clp, struct nfsd4_create_session *cses)
949{ 916{
950 int idx; 917 int idx;
918 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
951 919
952 new->se_client = clp; 920 new->se_client = clp;
953 gen_sessionid(new); 921 gen_sessionid(new);
@@ -957,14 +925,15 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
957 new->se_cb_seq_nr = 1; 925 new->se_cb_seq_nr = 1;
958 new->se_flags = cses->flags; 926 new->se_flags = cses->flags;
959 new->se_cb_prog = cses->callback_prog; 927 new->se_cb_prog = cses->callback_prog;
928 new->se_cb_sec = cses->cb_sec;
960 kref_init(&new->se_ref); 929 kref_init(&new->se_ref);
961 idx = hash_sessionid(&new->se_sessionid); 930 idx = hash_sessionid(&new->se_sessionid);
962 spin_lock(&client_lock); 931 spin_lock(&nn->client_lock);
963 list_add(&new->se_hash, &sessionid_hashtbl[idx]); 932 list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
964 spin_lock(&clp->cl_lock); 933 spin_lock(&clp->cl_lock);
965 list_add(&new->se_perclnt, &clp->cl_sessions); 934 list_add(&new->se_perclnt, &clp->cl_sessions);
966 spin_unlock(&clp->cl_lock); 935 spin_unlock(&clp->cl_lock);
967 spin_unlock(&client_lock); 936 spin_unlock(&nn->client_lock);
968 937
969 if (cses->flags & SESSION4_BACK_CHAN) { 938 if (cses->flags & SESSION4_BACK_CHAN) {
970 struct sockaddr *sa = svc_addr(rqstp); 939 struct sockaddr *sa = svc_addr(rqstp);
@@ -978,20 +947,20 @@ static struct nfsd4_session *init_session(struct svc_rqst *rqstp, struct nfsd4_s
978 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa); 947 rpc_copy_addr((struct sockaddr *)&clp->cl_cb_conn.cb_addr, sa);
979 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa); 948 clp->cl_cb_conn.cb_addrlen = svc_addr_len(sa);
980 } 949 }
981 return new;
982} 950}
983 951
984/* caller must hold client_lock */ 952/* caller must hold client_lock */
985static struct nfsd4_session * 953static struct nfsd4_session *
986find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) 954find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
987{ 955{
988 struct nfsd4_session *elem; 956 struct nfsd4_session *elem;
989 int idx; 957 int idx;
958 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
990 959
991 dump_sessionid(__func__, sessionid); 960 dump_sessionid(__func__, sessionid);
992 idx = hash_sessionid(sessionid); 961 idx = hash_sessionid(sessionid);
993 /* Search in the appropriate list */ 962 /* Search in the appropriate list */
994 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { 963 list_for_each_entry(elem, &nn->sessionid_hashtbl[idx], se_hash) {
995 if (!memcmp(elem->se_sessionid.data, sessionid->data, 964 if (!memcmp(elem->se_sessionid.data, sessionid->data,
996 NFS4_MAX_SESSIONID_LEN)) { 965 NFS4_MAX_SESSIONID_LEN)) {
997 return elem; 966 return elem;
@@ -1016,6 +985,8 @@ unhash_session(struct nfsd4_session *ses)
1016static inline void 985static inline void
1017renew_client_locked(struct nfs4_client *clp) 986renew_client_locked(struct nfs4_client *clp)
1018{ 987{
988 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
989
1019 if (is_client_expired(clp)) { 990 if (is_client_expired(clp)) {
1020 WARN_ON(1); 991 WARN_ON(1);
1021 printk("%s: client (clientid %08x/%08x) already expired\n", 992 printk("%s: client (clientid %08x/%08x) already expired\n",
@@ -1028,16 +999,18 @@ renew_client_locked(struct nfs4_client *clp)
1028 dprintk("renewing client (clientid %08x/%08x)\n", 999 dprintk("renewing client (clientid %08x/%08x)\n",
1029 clp->cl_clientid.cl_boot, 1000 clp->cl_clientid.cl_boot,
1030 clp->cl_clientid.cl_id); 1001 clp->cl_clientid.cl_id);
1031 list_move_tail(&clp->cl_lru, &client_lru); 1002 list_move_tail(&clp->cl_lru, &nn->client_lru);
1032 clp->cl_time = get_seconds(); 1003 clp->cl_time = get_seconds();
1033} 1004}
1034 1005
1035static inline void 1006static inline void
1036renew_client(struct nfs4_client *clp) 1007renew_client(struct nfs4_client *clp)
1037{ 1008{
1038 spin_lock(&client_lock); 1009 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1010
1011 spin_lock(&nn->client_lock);
1039 renew_client_locked(clp); 1012 renew_client_locked(clp);
1040 spin_unlock(&client_lock); 1013 spin_unlock(&nn->client_lock);
1041} 1014}
1042 1015
1043/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */ 1016/* SETCLIENTID and SETCLIENTID_CONFIRM Helper functions */
@@ -1075,7 +1048,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1075static inline void 1048static inline void
1076free_client(struct nfs4_client *clp) 1049free_client(struct nfs4_client *clp)
1077{ 1050{
1078 lockdep_assert_held(&client_lock); 1051 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1052
1053 lockdep_assert_held(&nn->client_lock);
1079 while (!list_empty(&clp->cl_sessions)) { 1054 while (!list_empty(&clp->cl_sessions)) {
1080 struct nfsd4_session *ses; 1055 struct nfsd4_session *ses;
1081 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, 1056 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1092,15 +1067,16 @@ void
1092release_session_client(struct nfsd4_session *session) 1067release_session_client(struct nfsd4_session *session)
1093{ 1068{
1094 struct nfs4_client *clp = session->se_client; 1069 struct nfs4_client *clp = session->se_client;
1070 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1095 1071
1096 if (!atomic_dec_and_lock(&clp->cl_refcount, &client_lock)) 1072 if (!atomic_dec_and_lock(&clp->cl_refcount, &nn->client_lock))
1097 return; 1073 return;
1098 if (is_client_expired(clp)) { 1074 if (is_client_expired(clp)) {
1099 free_client(clp); 1075 free_client(clp);
1100 session->se_client = NULL; 1076 session->se_client = NULL;
1101 } else 1077 } else
1102 renew_client_locked(clp); 1078 renew_client_locked(clp);
1103 spin_unlock(&client_lock); 1079 spin_unlock(&nn->client_lock);
1104} 1080}
1105 1081
1106/* must be called under the client_lock */ 1082/* must be called under the client_lock */
@@ -1123,6 +1099,7 @@ destroy_client(struct nfs4_client *clp)
1123 struct nfs4_openowner *oo; 1099 struct nfs4_openowner *oo;
1124 struct nfs4_delegation *dp; 1100 struct nfs4_delegation *dp;
1125 struct list_head reaplist; 1101 struct list_head reaplist;
1102 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1126 1103
1127 INIT_LIST_HEAD(&reaplist); 1104 INIT_LIST_HEAD(&reaplist);
1128 spin_lock(&recall_lock); 1105 spin_lock(&recall_lock);
@@ -1144,12 +1121,15 @@ destroy_client(struct nfs4_client *clp)
1144 if (clp->cl_cb_conn.cb_xprt) 1121 if (clp->cl_cb_conn.cb_xprt)
1145 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 1122 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
1146 list_del(&clp->cl_idhash); 1123 list_del(&clp->cl_idhash);
1147 list_del(&clp->cl_strhash); 1124 if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
1148 spin_lock(&client_lock); 1125 rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
1126 else
1127 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
1128 spin_lock(&nn->client_lock);
1149 unhash_client_locked(clp); 1129 unhash_client_locked(clp);
1150 if (atomic_read(&clp->cl_refcount) == 0) 1130 if (atomic_read(&clp->cl_refcount) == 0)
1151 free_client(clp); 1131 free_client(clp);
1152 spin_unlock(&client_lock); 1132 spin_unlock(&nn->client_lock);
1153} 1133}
1154 1134
1155static void expire_client(struct nfs4_client *clp) 1135static void expire_client(struct nfs4_client *clp)
@@ -1187,6 +1167,17 @@ static int copy_cred(struct svc_cred *target, struct svc_cred *source)
1187 return 0; 1167 return 0;
1188} 1168}
1189 1169
1170static long long
1171compare_blob(const struct xdr_netobj *o1, const struct xdr_netobj *o2)
1172{
1173 long long res;
1174
1175 res = o1->len - o2->len;
1176 if (res)
1177 return res;
1178 return (long long)memcmp(o1->data, o2->data, o1->len);
1179}
1180
1190static int same_name(const char *n1, const char *n2) 1181static int same_name(const char *n1, const char *n2)
1191{ 1182{
1192 return 0 == memcmp(n1, n2, HEXDIR_LEN); 1183 return 0 == memcmp(n1, n2, HEXDIR_LEN);
@@ -1247,10 +1238,9 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
1247 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal); 1238 return 0 == strcmp(cr1->cr_principal, cr2->cr_principal);
1248} 1239}
1249 1240
1250static void gen_clid(struct nfs4_client *clp) 1241static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
1251{ 1242{
1252 static u32 current_clientid = 1; 1243 static u32 current_clientid = 1;
1253 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
1254 1244
1255 clp->cl_clientid.cl_boot = nn->boot_time; 1245 clp->cl_clientid.cl_boot = nn->boot_time;
1256 clp->cl_clientid.cl_id = current_clientid++; 1246 clp->cl_clientid.cl_id = current_clientid++;
@@ -1283,12 +1273,14 @@ static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t
1283 return NULL; 1273 return NULL;
1284} 1274}
1285 1275
1286static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir, 1276static struct nfs4_client *create_client(struct xdr_netobj name,
1287 struct svc_rqst *rqstp, nfs4_verifier *verf) 1277 struct svc_rqst *rqstp, nfs4_verifier *verf)
1288{ 1278{
1289 struct nfs4_client *clp; 1279 struct nfs4_client *clp;
1290 struct sockaddr *sa = svc_addr(rqstp); 1280 struct sockaddr *sa = svc_addr(rqstp);
1291 int ret; 1281 int ret;
1282 struct net *net = SVC_NET(rqstp);
1283 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1292 1284
1293 clp = alloc_client(name); 1285 clp = alloc_client(name);
1294 if (clp == NULL) 1286 if (clp == NULL)
@@ -1297,23 +1289,21 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1297 INIT_LIST_HEAD(&clp->cl_sessions); 1289 INIT_LIST_HEAD(&clp->cl_sessions);
1298 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1290 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
1299 if (ret) { 1291 if (ret) {
1300 spin_lock(&client_lock); 1292 spin_lock(&nn->client_lock);
1301 free_client(clp); 1293 free_client(clp);
1302 spin_unlock(&client_lock); 1294 spin_unlock(&nn->client_lock);
1303 return NULL; 1295 return NULL;
1304 } 1296 }
1305 idr_init(&clp->cl_stateids); 1297 idr_init(&clp->cl_stateids);
1306 memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
1307 atomic_set(&clp->cl_refcount, 0); 1298 atomic_set(&clp->cl_refcount, 0);
1308 clp->cl_cb_state = NFSD4_CB_UNKNOWN; 1299 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
1309 INIT_LIST_HEAD(&clp->cl_idhash); 1300 INIT_LIST_HEAD(&clp->cl_idhash);
1310 INIT_LIST_HEAD(&clp->cl_strhash);
1311 INIT_LIST_HEAD(&clp->cl_openowners); 1301 INIT_LIST_HEAD(&clp->cl_openowners);
1312 INIT_LIST_HEAD(&clp->cl_delegations); 1302 INIT_LIST_HEAD(&clp->cl_delegations);
1313 INIT_LIST_HEAD(&clp->cl_lru); 1303 INIT_LIST_HEAD(&clp->cl_lru);
1314 INIT_LIST_HEAD(&clp->cl_callbacks); 1304 INIT_LIST_HEAD(&clp->cl_callbacks);
1315 spin_lock_init(&clp->cl_lock); 1305 spin_lock_init(&clp->cl_lock);
1316 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_do_callback_rpc); 1306 nfsd4_init_callback(&clp->cl_cb_null);
1317 clp->cl_time = get_seconds(); 1307 clp->cl_time = get_seconds();
1318 clear_bit(0, &clp->cl_cb_slot_busy); 1308 clear_bit(0, &clp->cl_cb_slot_busy);
1319 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1309 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
@@ -1321,17 +1311,60 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
1321 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); 1311 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
1322 gen_confirm(clp); 1312 gen_confirm(clp);
1323 clp->cl_cb_session = NULL; 1313 clp->cl_cb_session = NULL;
1314 clp->net = net;
1324 return clp; 1315 return clp;
1325} 1316}
1326 1317
1327static void 1318static void
1328add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval) 1319add_clp_to_name_tree(struct nfs4_client *new_clp, struct rb_root *root)
1320{
1321 struct rb_node **new = &(root->rb_node), *parent = NULL;
1322 struct nfs4_client *clp;
1323
1324 while (*new) {
1325 clp = rb_entry(*new, struct nfs4_client, cl_namenode);
1326 parent = *new;
1327
1328 if (compare_blob(&clp->cl_name, &new_clp->cl_name) > 0)
1329 new = &((*new)->rb_left);
1330 else
1331 new = &((*new)->rb_right);
1332 }
1333
1334 rb_link_node(&new_clp->cl_namenode, parent, new);
1335 rb_insert_color(&new_clp->cl_namenode, root);
1336}
1337
1338static struct nfs4_client *
1339find_clp_in_name_tree(struct xdr_netobj *name, struct rb_root *root)
1340{
1341 long long cmp;
1342 struct rb_node *node = root->rb_node;
1343 struct nfs4_client *clp;
1344
1345 while (node) {
1346 clp = rb_entry(node, struct nfs4_client, cl_namenode);
1347 cmp = compare_blob(&clp->cl_name, name);
1348 if (cmp > 0)
1349 node = node->rb_left;
1350 else if (cmp < 0)
1351 node = node->rb_right;
1352 else
1353 return clp;
1354 }
1355 return NULL;
1356}
1357
1358static void
1359add_to_unconfirmed(struct nfs4_client *clp)
1329{ 1360{
1330 unsigned int idhashval; 1361 unsigned int idhashval;
1362 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1331 1363
1332 list_add(&clp->cl_strhash, &unconf_str_hashtbl[strhashval]); 1364 clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
1365 add_clp_to_name_tree(clp, &nn->unconf_name_tree);
1333 idhashval = clientid_hashval(clp->cl_clientid.cl_id); 1366 idhashval = clientid_hashval(clp->cl_clientid.cl_id);
1334 list_add(&clp->cl_idhash, &unconf_id_hashtbl[idhashval]); 1367 list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
1335 renew_client(clp); 1368 renew_client(clp);
1336} 1369}
1337 1370
@@ -1339,22 +1372,23 @@ static void
1339move_to_confirmed(struct nfs4_client *clp) 1372move_to_confirmed(struct nfs4_client *clp)
1340{ 1373{
1341 unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); 1374 unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
1342 unsigned int strhashval; 1375 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1343 1376
1344 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); 1377 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
1345 list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); 1378 list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
1346 strhashval = clientstr_hashval(clp->cl_recdir); 1379 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
1347 list_move(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); 1380 add_clp_to_name_tree(clp, &nn->conf_name_tree);
1381 set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
1348 renew_client(clp); 1382 renew_client(clp);
1349} 1383}
1350 1384
1351static struct nfs4_client * 1385static struct nfs4_client *
1352find_confirmed_client(clientid_t *clid, bool sessions) 1386find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
1353{ 1387{
1354 struct nfs4_client *clp; 1388 struct nfs4_client *clp;
1355 unsigned int idhashval = clientid_hashval(clid->cl_id); 1389 unsigned int idhashval = clientid_hashval(clid->cl_id);
1356 1390
1357 list_for_each_entry(clp, &conf_id_hashtbl[idhashval], cl_idhash) { 1391 list_for_each_entry(clp, &nn->conf_id_hashtbl[idhashval], cl_idhash) {
1358 if (same_clid(&clp->cl_clientid, clid)) { 1392 if (same_clid(&clp->cl_clientid, clid)) {
1359 if ((bool)clp->cl_minorversion != sessions) 1393 if ((bool)clp->cl_minorversion != sessions)
1360 return NULL; 1394 return NULL;
@@ -1366,12 +1400,12 @@ find_confirmed_client(clientid_t *clid, bool sessions)
1366} 1400}
1367 1401
1368static struct nfs4_client * 1402static struct nfs4_client *
1369find_unconfirmed_client(clientid_t *clid, bool sessions) 1403find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
1370{ 1404{
1371 struct nfs4_client *clp; 1405 struct nfs4_client *clp;
1372 unsigned int idhashval = clientid_hashval(clid->cl_id); 1406 unsigned int idhashval = clientid_hashval(clid->cl_id);
1373 1407
1374 list_for_each_entry(clp, &unconf_id_hashtbl[idhashval], cl_idhash) { 1408 list_for_each_entry(clp, &nn->unconf_id_hashtbl[idhashval], cl_idhash) {
1375 if (same_clid(&clp->cl_clientid, clid)) { 1409 if (same_clid(&clp->cl_clientid, clid)) {
1376 if ((bool)clp->cl_minorversion != sessions) 1410 if ((bool)clp->cl_minorversion != sessions)
1377 return NULL; 1411 return NULL;
@@ -1387,27 +1421,15 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
1387} 1421}
1388 1422
1389static struct nfs4_client * 1423static struct nfs4_client *
1390find_confirmed_client_by_str(const char *dname, unsigned int hashval) 1424find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
1391{ 1425{
1392 struct nfs4_client *clp; 1426 return find_clp_in_name_tree(name, &nn->conf_name_tree);
1393
1394 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
1395 if (same_name(clp->cl_recdir, dname))
1396 return clp;
1397 }
1398 return NULL;
1399} 1427}
1400 1428
1401static struct nfs4_client * 1429static struct nfs4_client *
1402find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) 1430find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
1403{ 1431{
1404 struct nfs4_client *clp; 1432 return find_clp_in_name_tree(name, &nn->unconf_name_tree);
1405
1406 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
1407 if (same_name(clp->cl_recdir, dname))
1408 return clp;
1409 }
1410 return NULL;
1411} 1433}
1412 1434
1413static void 1435static void
@@ -1428,7 +1450,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
1428 else 1450 else
1429 goto out_err; 1451 goto out_err;
1430 1452
1431 conn->cb_addrlen = rpc_uaddr2sockaddr(&init_net, se->se_callback_addr_val, 1453 conn->cb_addrlen = rpc_uaddr2sockaddr(clp->net, se->se_callback_addr_val,
1432 se->se_callback_addr_len, 1454 se->se_callback_addr_len,
1433 (struct sockaddr *)&conn->cb_addr, 1455 (struct sockaddr *)&conn->cb_addr,
1434 sizeof(conn->cb_addr)); 1456 sizeof(conn->cb_addr));
@@ -1572,12 +1594,11 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1572{ 1594{
1573 struct nfs4_client *unconf, *conf, *new; 1595 struct nfs4_client *unconf, *conf, *new;
1574 __be32 status; 1596 __be32 status;
1575 unsigned int strhashval;
1576 char dname[HEXDIR_LEN];
1577 char addr_str[INET6_ADDRSTRLEN]; 1597 char addr_str[INET6_ADDRSTRLEN];
1578 nfs4_verifier verf = exid->verifier; 1598 nfs4_verifier verf = exid->verifier;
1579 struct sockaddr *sa = svc_addr(rqstp); 1599 struct sockaddr *sa = svc_addr(rqstp);
1580 bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A; 1600 bool update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
1601 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
1581 1602
1582 rpc_ntop(sa, addr_str, sizeof(addr_str)); 1603 rpc_ntop(sa, addr_str, sizeof(addr_str));
1583 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p " 1604 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1592,24 +1613,16 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1592 switch (exid->spa_how) { 1613 switch (exid->spa_how) {
1593 case SP4_NONE: 1614 case SP4_NONE:
1594 break; 1615 break;
1616 default: /* checked by xdr code */
1617 WARN_ON_ONCE(1);
1595 case SP4_SSV: 1618 case SP4_SSV:
1596 return nfserr_serverfault;
1597 default:
1598 BUG(); /* checked by xdr code */
1599 case SP4_MACH_CRED: 1619 case SP4_MACH_CRED:
1600 return nfserr_serverfault; /* no excuse :-/ */ 1620 return nfserr_serverfault; /* no excuse :-/ */
1601 } 1621 }
1602 1622
1603 status = nfs4_make_rec_clidname(dname, &exid->clname);
1604
1605 if (status)
1606 return status;
1607
1608 strhashval = clientstr_hashval(dname);
1609
1610 /* Cases below refer to rfc 5661 section 18.35.4: */ 1623 /* Cases below refer to rfc 5661 section 18.35.4: */
1611 nfs4_lock_state(); 1624 nfs4_lock_state();
1612 conf = find_confirmed_client_by_str(dname, strhashval); 1625 conf = find_confirmed_client_by_name(&exid->clname, nn);
1613 if (conf) { 1626 if (conf) {
1614 bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); 1627 bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
1615 bool verfs_match = same_verf(&verf, &conf->cl_verifier); 1628 bool verfs_match = same_verf(&verf, &conf->cl_verifier);
@@ -1654,21 +1667,21 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1654 goto out; 1667 goto out;
1655 } 1668 }
1656 1669
1657 unconf = find_unconfirmed_client_by_str(dname, strhashval); 1670 unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
1658 if (unconf) /* case 4, possible retry or client restart */ 1671 if (unconf) /* case 4, possible retry or client restart */
1659 expire_client(unconf); 1672 expire_client(unconf);
1660 1673
1661 /* case 1 (normal case) */ 1674 /* case 1 (normal case) */
1662out_new: 1675out_new:
1663 new = create_client(exid->clname, dname, rqstp, &verf); 1676 new = create_client(exid->clname, rqstp, &verf);
1664 if (new == NULL) { 1677 if (new == NULL) {
1665 status = nfserr_jukebox; 1678 status = nfserr_jukebox;
1666 goto out; 1679 goto out;
1667 } 1680 }
1668 new->cl_minorversion = 1; 1681 new->cl_minorversion = 1;
1669 1682
1670 gen_clid(new); 1683 gen_clid(new, nn);
1671 add_to_unconfirmed(new, strhashval); 1684 add_to_unconfirmed(new);
1672out_copy: 1685out_copy:
1673 exid->clientid.cl_boot = new->cl_clientid.cl_boot; 1686 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1674 exid->clientid.cl_id = new->cl_clientid.cl_id; 1687 exid->clientid.cl_id = new->cl_clientid.cl_id;
@@ -1761,12 +1774,13 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1761 struct nfsd4_conn *conn; 1774 struct nfsd4_conn *conn;
1762 struct nfsd4_clid_slot *cs_slot = NULL; 1775 struct nfsd4_clid_slot *cs_slot = NULL;
1763 __be32 status = 0; 1776 __be32 status = 0;
1777 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
1764 1778
1765 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A) 1779 if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
1766 return nfserr_inval; 1780 return nfserr_inval;
1767 if (check_forechannel_attrs(cr_ses->fore_channel)) 1781 if (check_forechannel_attrs(cr_ses->fore_channel))
1768 return nfserr_toosmall; 1782 return nfserr_toosmall;
1769 new = alloc_session(&cr_ses->fore_channel); 1783 new = alloc_session(&cr_ses->fore_channel, nn);
1770 if (!new) 1784 if (!new)
1771 return nfserr_jukebox; 1785 return nfserr_jukebox;
1772 status = nfserr_jukebox; 1786 status = nfserr_jukebox;
@@ -1775,8 +1789,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1775 goto out_free_session; 1789 goto out_free_session;
1776 1790
1777 nfs4_lock_state(); 1791 nfs4_lock_state();
1778 unconf = find_unconfirmed_client(&cr_ses->clientid, true); 1792 unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
1779 conf = find_confirmed_client(&cr_ses->clientid, true); 1793 conf = find_confirmed_client(&cr_ses->clientid, true, nn);
1780 1794
1781 if (conf) { 1795 if (conf) {
1782 cs_slot = &conf->cl_cs_slot; 1796 cs_slot = &conf->cl_cs_slot;
@@ -1789,7 +1803,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1789 goto out_free_conn; 1803 goto out_free_conn;
1790 } 1804 }
1791 } else if (unconf) { 1805 } else if (unconf) {
1792 unsigned int hash;
1793 struct nfs4_client *old; 1806 struct nfs4_client *old;
1794 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 1807 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1795 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 1808 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
@@ -1803,8 +1816,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
1803 status = nfserr_seq_misordered; 1816 status = nfserr_seq_misordered;
1804 goto out_free_conn; 1817 goto out_free_conn;
1805 } 1818 }
1806 hash = clientstr_hashval(unconf->cl_recdir); 1819 old = find_confirmed_client_by_name(&unconf->cl_name, nn);
1807 old = find_confirmed_client_by_str(unconf->cl_recdir, hash);
1808 if (old) 1820 if (old)
1809 expire_client(old); 1821 expire_client(old);
1810 move_to_confirmed(unconf); 1822 move_to_confirmed(unconf);
@@ -1843,14 +1855,6 @@ out_free_session:
1843 goto out; 1855 goto out;
1844} 1856}
1845 1857
1846static bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
1847{
1848 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1849 struct nfsd4_compoundargs *argp = rqstp->rq_argp;
1850
1851 return argp->opcnt == resp->opcnt;
1852}
1853
1854static __be32 nfsd4_map_bcts_dir(u32 *dir) 1858static __be32 nfsd4_map_bcts_dir(u32 *dir)
1855{ 1859{
1856 switch (*dir) { 1860 switch (*dir) {
@@ -1865,24 +1869,40 @@ static __be32 nfsd4_map_bcts_dir(u32 *dir)
1865 return nfserr_inval; 1869 return nfserr_inval;
1866} 1870}
1867 1871
1872__be32 nfsd4_backchannel_ctl(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_backchannel_ctl *bc)
1873{
1874 struct nfsd4_session *session = cstate->session;
1875 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
1876
1877 spin_lock(&nn->client_lock);
1878 session->se_cb_prog = bc->bc_cb_program;
1879 session->se_cb_sec = bc->bc_cb_sec;
1880 spin_unlock(&nn->client_lock);
1881
1882 nfsd4_probe_callback(session->se_client);
1883
1884 return nfs_ok;
1885}
1886
1868__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, 1887__be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
1869 struct nfsd4_compound_state *cstate, 1888 struct nfsd4_compound_state *cstate,
1870 struct nfsd4_bind_conn_to_session *bcts) 1889 struct nfsd4_bind_conn_to_session *bcts)
1871{ 1890{
1872 __be32 status; 1891 __be32 status;
1873 struct nfsd4_conn *conn; 1892 struct nfsd4_conn *conn;
1893 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
1874 1894
1875 if (!nfsd4_last_compound_op(rqstp)) 1895 if (!nfsd4_last_compound_op(rqstp))
1876 return nfserr_not_only_op; 1896 return nfserr_not_only_op;
1877 spin_lock(&client_lock); 1897 spin_lock(&nn->client_lock);
1878 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid); 1898 cstate->session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp));
1879 /* Sorta weird: we only need the refcnt'ing because new_conn acquires 1899 /* Sorta weird: we only need the refcnt'ing because new_conn acquires
1880 * client_lock iself: */ 1900 * client_lock iself: */
1881 if (cstate->session) { 1901 if (cstate->session) {
1882 nfsd4_get_session(cstate->session); 1902 nfsd4_get_session(cstate->session);
1883 atomic_inc(&cstate->session->se_client->cl_refcount); 1903 atomic_inc(&cstate->session->se_client->cl_refcount);
1884 } 1904 }
1885 spin_unlock(&client_lock); 1905 spin_unlock(&nn->client_lock);
1886 if (!cstate->session) 1906 if (!cstate->session)
1887 return nfserr_badsession; 1907 return nfserr_badsession;
1888 1908
@@ -1910,6 +1930,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
1910{ 1930{
1911 struct nfsd4_session *ses; 1931 struct nfsd4_session *ses;
1912 __be32 status = nfserr_badsession; 1932 __be32 status = nfserr_badsession;
1933 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id);
1913 1934
1914 /* Notes: 1935 /* Notes:
1915 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid 1936 * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1923,24 +1944,24 @@ nfsd4_destroy_session(struct svc_rqst *r,
1923 return nfserr_not_only_op; 1944 return nfserr_not_only_op;
1924 } 1945 }
1925 dump_sessionid(__func__, &sessionid->sessionid); 1946 dump_sessionid(__func__, &sessionid->sessionid);
1926 spin_lock(&client_lock); 1947 spin_lock(&nn->client_lock);
1927 ses = find_in_sessionid_hashtbl(&sessionid->sessionid); 1948 ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r));
1928 if (!ses) { 1949 if (!ses) {
1929 spin_unlock(&client_lock); 1950 spin_unlock(&nn->client_lock);
1930 goto out; 1951 goto out;
1931 } 1952 }
1932 1953
1933 unhash_session(ses); 1954 unhash_session(ses);
1934 spin_unlock(&client_lock); 1955 spin_unlock(&nn->client_lock);
1935 1956
1936 nfs4_lock_state(); 1957 nfs4_lock_state();
1937 nfsd4_probe_callback_sync(ses->se_client); 1958 nfsd4_probe_callback_sync(ses->se_client);
1938 nfs4_unlock_state(); 1959 nfs4_unlock_state();
1939 1960
1940 spin_lock(&client_lock); 1961 spin_lock(&nn->client_lock);
1941 nfsd4_del_conns(ses); 1962 nfsd4_del_conns(ses);
1942 nfsd4_put_session_locked(ses); 1963 nfsd4_put_session_locked(ses);
1943 spin_unlock(&client_lock); 1964 spin_unlock(&nn->client_lock);
1944 status = nfs_ok; 1965 status = nfs_ok;
1945out: 1966out:
1946 dprintk("%s returns %d\n", __func__, ntohl(status)); 1967 dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -2006,6 +2027,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2006 struct nfsd4_slot *slot; 2027 struct nfsd4_slot *slot;
2007 struct nfsd4_conn *conn; 2028 struct nfsd4_conn *conn;
2008 __be32 status; 2029 __be32 status;
2030 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2009 2031
2010 if (resp->opcnt != 1) 2032 if (resp->opcnt != 1)
2011 return nfserr_sequence_pos; 2033 return nfserr_sequence_pos;
@@ -2018,9 +2040,9 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2018 if (!conn) 2040 if (!conn)
2019 return nfserr_jukebox; 2041 return nfserr_jukebox;
2020 2042
2021 spin_lock(&client_lock); 2043 spin_lock(&nn->client_lock);
2022 status = nfserr_badsession; 2044 status = nfserr_badsession;
2023 session = find_in_sessionid_hashtbl(&seq->sessionid); 2045 session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
2024 if (!session) 2046 if (!session)
2025 goto out; 2047 goto out;
2026 2048
@@ -2094,7 +2116,7 @@ out:
2094 } 2116 }
2095 } 2117 }
2096 kfree(conn); 2118 kfree(conn);
2097 spin_unlock(&client_lock); 2119 spin_unlock(&nn->client_lock);
2098 dprintk("%s: return %d\n", __func__, ntohl(status)); 2120 dprintk("%s: return %d\n", __func__, ntohl(status));
2099 return status; 2121 return status;
2100} 2122}
@@ -2104,10 +2126,11 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2104{ 2126{
2105 struct nfs4_client *conf, *unconf, *clp; 2127 struct nfs4_client *conf, *unconf, *clp;
2106 __be32 status = 0; 2128 __be32 status = 0;
2129 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2107 2130
2108 nfs4_lock_state(); 2131 nfs4_lock_state();
2109 unconf = find_unconfirmed_client(&dc->clientid, true); 2132 unconf = find_unconfirmed_client(&dc->clientid, true, nn);
2110 conf = find_confirmed_client(&dc->clientid, true); 2133 conf = find_confirmed_client(&dc->clientid, true, nn);
2111 2134
2112 if (conf) { 2135 if (conf) {
2113 clp = conf; 2136 clp = conf;
@@ -2181,20 +2204,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2181{ 2204{
2182 struct xdr_netobj clname = setclid->se_name; 2205 struct xdr_netobj clname = setclid->se_name;
2183 nfs4_verifier clverifier = setclid->se_verf; 2206 nfs4_verifier clverifier = setclid->se_verf;
2184 unsigned int strhashval;
2185 struct nfs4_client *conf, *unconf, *new; 2207 struct nfs4_client *conf, *unconf, *new;
2186 __be32 status; 2208 __be32 status;
2187 char dname[HEXDIR_LEN]; 2209 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2188
2189 status = nfs4_make_rec_clidname(dname, &clname);
2190 if (status)
2191 return status;
2192
2193 strhashval = clientstr_hashval(dname);
2194 2210
2195 /* Cases below refer to rfc 3530 section 14.2.33: */ 2211 /* Cases below refer to rfc 3530 section 14.2.33: */
2196 nfs4_lock_state(); 2212 nfs4_lock_state();
2197 conf = find_confirmed_client_by_str(dname, strhashval); 2213 conf = find_confirmed_client_by_name(&clname, nn);
2198 if (conf) { 2214 if (conf) {
2199 /* case 0: */ 2215 /* case 0: */
2200 status = nfserr_clid_inuse; 2216 status = nfserr_clid_inuse;
@@ -2209,21 +2225,21 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2209 goto out; 2225 goto out;
2210 } 2226 }
2211 } 2227 }
2212 unconf = find_unconfirmed_client_by_str(dname, strhashval); 2228 unconf = find_unconfirmed_client_by_name(&clname, nn);
2213 if (unconf) 2229 if (unconf)
2214 expire_client(unconf); 2230 expire_client(unconf);
2215 status = nfserr_jukebox; 2231 status = nfserr_jukebox;
2216 new = create_client(clname, dname, rqstp, &clverifier); 2232 new = create_client(clname, rqstp, &clverifier);
2217 if (new == NULL) 2233 if (new == NULL)
2218 goto out; 2234 goto out;
2219 if (conf && same_verf(&conf->cl_verifier, &clverifier)) 2235 if (conf && same_verf(&conf->cl_verifier, &clverifier))
2220 /* case 1: probable callback update */ 2236 /* case 1: probable callback update */
2221 copy_clid(new, conf); 2237 copy_clid(new, conf);
2222 else /* case 4 (new client) or cases 2, 3 (client reboot): */ 2238 else /* case 4 (new client) or cases 2, 3 (client reboot): */
2223 gen_clid(new); 2239 gen_clid(new, nn);
2224 new->cl_minorversion = 0; 2240 new->cl_minorversion = 0;
2225 gen_callback(new, setclid, rqstp); 2241 gen_callback(new, setclid, rqstp);
2226 add_to_unconfirmed(new, strhashval); 2242 add_to_unconfirmed(new);
2227 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 2243 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
2228 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 2244 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
2229 memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); 2245 memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
@@ -2243,14 +2259,14 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2243 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 2259 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
2244 clientid_t * clid = &setclientid_confirm->sc_clientid; 2260 clientid_t * clid = &setclientid_confirm->sc_clientid;
2245 __be32 status; 2261 __be32 status;
2246 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 2262 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2247 2263
2248 if (STALE_CLIENTID(clid, nn)) 2264 if (STALE_CLIENTID(clid, nn))
2249 return nfserr_stale_clientid; 2265 return nfserr_stale_clientid;
2250 nfs4_lock_state(); 2266 nfs4_lock_state();
2251 2267
2252 conf = find_confirmed_client(clid, false); 2268 conf = find_confirmed_client(clid, false, nn);
2253 unconf = find_unconfirmed_client(clid, false); 2269 unconf = find_unconfirmed_client(clid, false, nn);
2254 /* 2270 /*
2255 * We try hard to give out unique clientid's, so if we get an 2271 * We try hard to give out unique clientid's, so if we get an
2256 * attempt to confirm the same clientid with a different cred, 2272 * attempt to confirm the same clientid with a different cred,
@@ -2276,9 +2292,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2276 nfsd4_probe_callback(conf); 2292 nfsd4_probe_callback(conf);
2277 expire_client(unconf); 2293 expire_client(unconf);
2278 } else { /* case 3: normal case; new or rebooted client */ 2294 } else { /* case 3: normal case; new or rebooted client */
2279 unsigned int hash = clientstr_hashval(unconf->cl_recdir); 2295 conf = find_confirmed_client_by_name(&unconf->cl_name, nn);
2280
2281 conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
2282 if (conf) 2296 if (conf)
2283 expire_client(conf); 2297 expire_client(conf);
2284 move_to_confirmed(unconf); 2298 move_to_confirmed(unconf);
@@ -2340,7 +2354,7 @@ nfsd4_init_slabs(void)
2340 if (openowner_slab == NULL) 2354 if (openowner_slab == NULL)
2341 goto out_nomem; 2355 goto out_nomem;
2342 lockowner_slab = kmem_cache_create("nfsd4_lockowners", 2356 lockowner_slab = kmem_cache_create("nfsd4_lockowners",
2343 sizeof(struct nfs4_openowner), 0, 0, NULL); 2357 sizeof(struct nfs4_lockowner), 0, 0, NULL);
2344 if (lockowner_slab == NULL) 2358 if (lockowner_slab == NULL)
2345 goto out_nomem; 2359 goto out_nomem;
2346 file_slab = kmem_cache_create("nfsd4_files", 2360 file_slab = kmem_cache_create("nfsd4_files",
@@ -2404,7 +2418,9 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
2404 2418
2405static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) 2419static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
2406{ 2420{
2407 list_add(&oo->oo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); 2421 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
2422
2423 list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
2408 list_add(&oo->oo_perclient, &clp->cl_openowners); 2424 list_add(&oo->oo_perclient, &clp->cl_openowners);
2409} 2425}
2410 2426
@@ -2444,11 +2460,13 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
2444} 2460}
2445 2461
2446static void 2462static void
2447move_to_close_lru(struct nfs4_openowner *oo) 2463move_to_close_lru(struct nfs4_openowner *oo, struct net *net)
2448{ 2464{
2465 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
2466
2449 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); 2467 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
2450 2468
2451 list_move_tail(&oo->oo_close_lru, &close_lru); 2469 list_move_tail(&oo->oo_close_lru, &nn->close_lru);
2452 oo->oo_time = get_seconds(); 2470 oo->oo_time = get_seconds();
2453} 2471}
2454 2472
@@ -2462,13 +2480,14 @@ same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner,
2462} 2480}
2463 2481
2464static struct nfs4_openowner * 2482static struct nfs4_openowner *
2465find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open, bool sessions) 2483find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
2484 bool sessions, struct nfsd_net *nn)
2466{ 2485{
2467 struct nfs4_stateowner *so; 2486 struct nfs4_stateowner *so;
2468 struct nfs4_openowner *oo; 2487 struct nfs4_openowner *oo;
2469 struct nfs4_client *clp; 2488 struct nfs4_client *clp;
2470 2489
2471 list_for_each_entry(so, &ownerstr_hashtbl[hashval], so_strhash) { 2490 list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) {
2472 if (!so->so_is_open_owner) 2491 if (!so->so_is_open_owner)
2473 continue; 2492 continue;
2474 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 2493 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) {
@@ -2555,9 +2574,14 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
2555 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; 2574 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
2556 struct nfs4_delegation *dp; 2575 struct nfs4_delegation *dp;
2557 2576
2558 BUG_ON(!fp); 2577 if (!fp) {
2559 /* We assume break_lease is only called once per lease: */ 2578 WARN(1, "(%p)->fl_owner NULL\n", fl);
2560 BUG_ON(fp->fi_had_conflict); 2579 return;
2580 }
2581 if (fp->fi_had_conflict) {
2582 WARN(1, "duplicate break on %p\n", fp);
2583 return;
2584 }
2561 /* 2585 /*
2562 * We don't want the locks code to timeout the lease for us; 2586 * We don't want the locks code to timeout the lease for us;
2563 * we'll remove it ourself if a delegation isn't returned 2587 * we'll remove it ourself if a delegation isn't returned
@@ -2599,14 +2623,13 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
2599 2623
2600__be32 2624__be32
2601nfsd4_process_open1(struct nfsd4_compound_state *cstate, 2625nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2602 struct nfsd4_open *open) 2626 struct nfsd4_open *open, struct nfsd_net *nn)
2603{ 2627{
2604 clientid_t *clientid = &open->op_clientid; 2628 clientid_t *clientid = &open->op_clientid;
2605 struct nfs4_client *clp = NULL; 2629 struct nfs4_client *clp = NULL;
2606 unsigned int strhashval; 2630 unsigned int strhashval;
2607 struct nfs4_openowner *oo = NULL; 2631 struct nfs4_openowner *oo = NULL;
2608 __be32 status; 2632 __be32 status;
2609 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
2610 2633
2611 if (STALE_CLIENTID(&open->op_clientid, nn)) 2634 if (STALE_CLIENTID(&open->op_clientid, nn))
2612 return nfserr_stale_clientid; 2635 return nfserr_stale_clientid;
@@ -2619,10 +2642,11 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2619 return nfserr_jukebox; 2642 return nfserr_jukebox;
2620 2643
2621 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); 2644 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner);
2622 oo = find_openstateowner_str(strhashval, open, cstate->minorversion); 2645 oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn);
2623 open->op_openowner = oo; 2646 open->op_openowner = oo;
2624 if (!oo) { 2647 if (!oo) {
2625 clp = find_confirmed_client(clientid, cstate->minorversion); 2648 clp = find_confirmed_client(clientid, cstate->minorversion,
2649 nn);
2626 if (clp == NULL) 2650 if (clp == NULL)
2627 return nfserr_expired; 2651 return nfserr_expired;
2628 goto new_owner; 2652 goto new_owner;
@@ -2891,7 +2915,7 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
2891 open->op_why_no_deleg = WND4_CANCELLED; 2915 open->op_why_no_deleg = WND4_CANCELLED;
2892 break; 2916 break;
2893 case NFS4_SHARE_WANT_NO_DELEG: 2917 case NFS4_SHARE_WANT_NO_DELEG:
2894 BUG(); /* not supposed to get here */ 2918 WARN_ON_ONCE(1);
2895 } 2919 }
2896 } 2920 }
2897} 2921}
@@ -2959,6 +2983,7 @@ out:
2959 } 2983 }
2960 return; 2984 return;
2961out_free: 2985out_free:
2986 unhash_stid(&dp->dl_stid);
2962 nfs4_put_delegation(dp); 2987 nfs4_put_delegation(dp);
2963out_no_deleg: 2988out_no_deleg:
2964 flag = NFS4_OPEN_DELEGATE_NONE; 2989 flag = NFS4_OPEN_DELEGATE_NONE;
@@ -3104,27 +3129,32 @@ void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status)
3104 free_generic_stateid(open->op_stp); 3129 free_generic_stateid(open->op_stp);
3105} 3130}
3106 3131
3132static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp)
3133{
3134 struct nfs4_client *found;
3135
3136 if (STALE_CLIENTID(clid, nn))
3137 return nfserr_stale_clientid;
3138 found = find_confirmed_client(clid, session, nn);
3139 if (clp)
3140 *clp = found;
3141 return found ? nfs_ok : nfserr_expired;
3142}
3143
3107__be32 3144__be32
3108nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3145nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3109 clientid_t *clid) 3146 clientid_t *clid)
3110{ 3147{
3111 struct nfs4_client *clp; 3148 struct nfs4_client *clp;
3112 __be32 status; 3149 __be32 status;
3113 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 3150 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3114 3151
3115 nfs4_lock_state(); 3152 nfs4_lock_state();
3116 dprintk("process_renew(%08x/%08x): starting\n", 3153 dprintk("process_renew(%08x/%08x): starting\n",
3117 clid->cl_boot, clid->cl_id); 3154 clid->cl_boot, clid->cl_id);
3118 status = nfserr_stale_clientid; 3155 status = lookup_clientid(clid, cstate->minorversion, nn, &clp);
3119 if (STALE_CLIENTID(clid, nn)) 3156 if (status)
3120 goto out;
3121 clp = find_confirmed_client(clid, cstate->minorversion);
3122 status = nfserr_expired;
3123 if (clp == NULL) {
3124 /* We assume the client took too long to RENEW. */
3125 dprintk("nfsd4_renew: clientid not found!\n");
3126 goto out; 3157 goto out;
3127 }
3128 status = nfserr_cb_path_down; 3158 status = nfserr_cb_path_down;
3129 if (!list_empty(&clp->cl_delegations) 3159 if (!list_empty(&clp->cl_delegations)
3130 && clp->cl_cb_state != NFSD4_CB_UP) 3160 && clp->cl_cb_state != NFSD4_CB_UP)
@@ -3136,44 +3166,42 @@ out:
3136} 3166}
3137 3167
3138static void 3168static void
3139nfsd4_end_grace(struct net *net) 3169nfsd4_end_grace(struct nfsd_net *nn)
3140{ 3170{
3141 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
3142
3143 /* do nothing if grace period already ended */ 3171 /* do nothing if grace period already ended */
3144 if (nn->grace_ended) 3172 if (nn->grace_ended)
3145 return; 3173 return;
3146 3174
3147 dprintk("NFSD: end of grace period\n"); 3175 dprintk("NFSD: end of grace period\n");
3148 nn->grace_ended = true; 3176 nn->grace_ended = true;
3149 nfsd4_record_grace_done(net, nn->boot_time); 3177 nfsd4_record_grace_done(nn, nn->boot_time);
3150 locks_end_grace(&nn->nfsd4_manager); 3178 locks_end_grace(&nn->nfsd4_manager);
3151 /* 3179 /*
3152 * Now that every NFSv4 client has had the chance to recover and 3180 * Now that every NFSv4 client has had the chance to recover and
3153 * to see the (possibly new, possibly shorter) lease time, we 3181 * to see the (possibly new, possibly shorter) lease time, we
3154 * can safely set the next grace time to the current lease time: 3182 * can safely set the next grace time to the current lease time:
3155 */ 3183 */
3156 nfsd4_grace = nfsd4_lease; 3184 nn->nfsd4_grace = nn->nfsd4_lease;
3157} 3185}
3158 3186
3159static time_t 3187static time_t
3160nfs4_laundromat(void) 3188nfs4_laundromat(struct nfsd_net *nn)
3161{ 3189{
3162 struct nfs4_client *clp; 3190 struct nfs4_client *clp;
3163 struct nfs4_openowner *oo; 3191 struct nfs4_openowner *oo;
3164 struct nfs4_delegation *dp; 3192 struct nfs4_delegation *dp;
3165 struct list_head *pos, *next, reaplist; 3193 struct list_head *pos, *next, reaplist;
3166 time_t cutoff = get_seconds() - nfsd4_lease; 3194 time_t cutoff = get_seconds() - nn->nfsd4_lease;
3167 time_t t, clientid_val = nfsd4_lease; 3195 time_t t, clientid_val = nn->nfsd4_lease;
3168 time_t u, test_val = nfsd4_lease; 3196 time_t u, test_val = nn->nfsd4_lease;
3169 3197
3170 nfs4_lock_state(); 3198 nfs4_lock_state();
3171 3199
3172 dprintk("NFSD: laundromat service - starting\n"); 3200 dprintk("NFSD: laundromat service - starting\n");
3173 nfsd4_end_grace(&init_net); 3201 nfsd4_end_grace(nn);
3174 INIT_LIST_HEAD(&reaplist); 3202 INIT_LIST_HEAD(&reaplist);
3175 spin_lock(&client_lock); 3203 spin_lock(&nn->client_lock);
3176 list_for_each_safe(pos, next, &client_lru) { 3204 list_for_each_safe(pos, next, &nn->client_lru) {
3177 clp = list_entry(pos, struct nfs4_client, cl_lru); 3205 clp = list_entry(pos, struct nfs4_client, cl_lru);
3178 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { 3206 if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
3179 t = clp->cl_time - cutoff; 3207 t = clp->cl_time - cutoff;
@@ -3189,7 +3217,7 @@ nfs4_laundromat(void)
3189 unhash_client_locked(clp); 3217 unhash_client_locked(clp);
3190 list_add(&clp->cl_lru, &reaplist); 3218 list_add(&clp->cl_lru, &reaplist);
3191 } 3219 }
3192 spin_unlock(&client_lock); 3220 spin_unlock(&nn->client_lock);
3193 list_for_each_safe(pos, next, &reaplist) { 3221 list_for_each_safe(pos, next, &reaplist) {
3194 clp = list_entry(pos, struct nfs4_client, cl_lru); 3222 clp = list_entry(pos, struct nfs4_client, cl_lru);
3195 dprintk("NFSD: purging unused client (clientid %08x)\n", 3223 dprintk("NFSD: purging unused client (clientid %08x)\n",
@@ -3199,6 +3227,8 @@ nfs4_laundromat(void)
3199 spin_lock(&recall_lock); 3227 spin_lock(&recall_lock);
3200 list_for_each_safe(pos, next, &del_recall_lru) { 3228 list_for_each_safe(pos, next, &del_recall_lru) {
3201 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 3229 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
3230 if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
3231 continue;
3202 if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) { 3232 if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
3203 u = dp->dl_time - cutoff; 3233 u = dp->dl_time - cutoff;
3204 if (test_val > u) 3234 if (test_val > u)
@@ -3212,8 +3242,8 @@ nfs4_laundromat(void)
3212 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 3242 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
3213 unhash_delegation(dp); 3243 unhash_delegation(dp);
3214 } 3244 }
3215 test_val = nfsd4_lease; 3245 test_val = nn->nfsd4_lease;
3216 list_for_each_safe(pos, next, &close_lru) { 3246 list_for_each_safe(pos, next, &nn->close_lru) {
3217 oo = container_of(pos, struct nfs4_openowner, oo_close_lru); 3247 oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
3218 if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { 3248 if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
3219 u = oo->oo_time - cutoff; 3249 u = oo->oo_time - cutoff;
@@ -3231,16 +3261,19 @@ nfs4_laundromat(void)
3231 3261
3232static struct workqueue_struct *laundry_wq; 3262static struct workqueue_struct *laundry_wq;
3233static void laundromat_main(struct work_struct *); 3263static void laundromat_main(struct work_struct *);
3234static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main);
3235 3264
3236static void 3265static void
3237laundromat_main(struct work_struct *not_used) 3266laundromat_main(struct work_struct *laundry)
3238{ 3267{
3239 time_t t; 3268 time_t t;
3269 struct delayed_work *dwork = container_of(laundry, struct delayed_work,
3270 work);
3271 struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
3272 laundromat_work);
3240 3273
3241 t = nfs4_laundromat(); 3274 t = nfs4_laundromat(nn);
3242 dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t); 3275 dprintk("NFSD: laundromat_main - sleeping for %ld seconds\n", t);
3243 queue_delayed_work(laundry_wq, &laundromat_work, t*HZ); 3276 queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
3244} 3277}
3245 3278
3246static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 3279static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
@@ -3385,16 +3418,17 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3385 return nfs_ok; 3418 return nfs_ok;
3386} 3419}
3387 3420
3388static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, bool sessions) 3421static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
3422 struct nfs4_stid **s, bool sessions,
3423 struct nfsd_net *nn)
3389{ 3424{
3390 struct nfs4_client *cl; 3425 struct nfs4_client *cl;
3391 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
3392 3426
3393 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3427 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3394 return nfserr_bad_stateid; 3428 return nfserr_bad_stateid;
3395 if (STALE_STATEID(stateid, nn)) 3429 if (STALE_STATEID(stateid, nn))
3396 return nfserr_stale_stateid; 3430 return nfserr_stale_stateid;
3397 cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions); 3431 cl = find_confirmed_client(&stateid->si_opaque.so_clid, sessions, nn);
3398 if (!cl) 3432 if (!cl)
3399 return nfserr_expired; 3433 return nfserr_expired;
3400 *s = find_stateid_by_type(cl, stateid, typemask); 3434 *s = find_stateid_by_type(cl, stateid, typemask);
@@ -3416,6 +3450,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3416 struct nfs4_delegation *dp = NULL; 3450 struct nfs4_delegation *dp = NULL;
3417 struct svc_fh *current_fh = &cstate->current_fh; 3451 struct svc_fh *current_fh = &cstate->current_fh;
3418 struct inode *ino = current_fh->fh_dentry->d_inode; 3452 struct inode *ino = current_fh->fh_dentry->d_inode;
3453 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
3419 __be32 status; 3454 __be32 status;
3420 3455
3421 if (filpp) 3456 if (filpp)
@@ -3427,7 +3462,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3427 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 3462 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3428 return check_special_stateids(net, current_fh, stateid, flags); 3463 return check_special_stateids(net, current_fh, stateid, flags);
3429 3464
3430 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, cstate->minorversion); 3465 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
3466 &s, cstate->minorversion, nn);
3431 if (status) 3467 if (status)
3432 return status; 3468 return status;
3433 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); 3469 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
@@ -3441,7 +3477,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3441 goto out; 3477 goto out;
3442 if (filpp) { 3478 if (filpp) {
3443 *filpp = dp->dl_file->fi_deleg_file; 3479 *filpp = dp->dl_file->fi_deleg_file;
3444 BUG_ON(!*filpp); 3480 if (!*filpp) {
3481 WARN_ON_ONCE(1);
3482 status = nfserr_serverfault;
3483 goto out;
3484 }
3445 } 3485 }
3446 break; 3486 break;
3447 case NFS4_OPEN_STID: 3487 case NFS4_OPEN_STID:
@@ -3568,7 +3608,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
3568static __be32 3608static __be32
3569nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, 3609nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3570 stateid_t *stateid, char typemask, 3610 stateid_t *stateid, char typemask,
3571 struct nfs4_ol_stateid **stpp) 3611 struct nfs4_ol_stateid **stpp,
3612 struct nfsd_net *nn)
3572{ 3613{
3573 __be32 status; 3614 __be32 status;
3574 struct nfs4_stid *s; 3615 struct nfs4_stid *s;
@@ -3577,7 +3618,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3577 seqid, STATEID_VAL(stateid)); 3618 seqid, STATEID_VAL(stateid));
3578 3619
3579 *stpp = NULL; 3620 *stpp = NULL;
3580 status = nfsd4_lookup_stateid(stateid, typemask, &s, cstate->minorversion); 3621 status = nfsd4_lookup_stateid(stateid, typemask, &s,
3622 cstate->minorversion, nn);
3581 if (status) 3623 if (status)
3582 return status; 3624 return status;
3583 *stpp = openlockstateid(s); 3625 *stpp = openlockstateid(s);
@@ -3586,13 +3628,14 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3586 return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); 3628 return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp);
3587} 3629}
3588 3630
3589static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, stateid_t *stateid, struct nfs4_ol_stateid **stpp) 3631static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3632 stateid_t *stateid, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn)
3590{ 3633{
3591 __be32 status; 3634 __be32 status;
3592 struct nfs4_openowner *oo; 3635 struct nfs4_openowner *oo;
3593 3636
3594 status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, 3637 status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
3595 NFS4_OPEN_STID, stpp); 3638 NFS4_OPEN_STID, stpp, nn);
3596 if (status) 3639 if (status)
3597 return status; 3640 return status;
3598 oo = openowner((*stpp)->st_stateowner); 3641 oo = openowner((*stpp)->st_stateowner);
@@ -3608,6 +3651,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3608 __be32 status; 3651 __be32 status;
3609 struct nfs4_openowner *oo; 3652 struct nfs4_openowner *oo;
3610 struct nfs4_ol_stateid *stp; 3653 struct nfs4_ol_stateid *stp;
3654 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3611 3655
3612 dprintk("NFSD: nfsd4_open_confirm on file %.*s\n", 3656 dprintk("NFSD: nfsd4_open_confirm on file %.*s\n",
3613 (int)cstate->current_fh.fh_dentry->d_name.len, 3657 (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3621,7 +3665,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3621 3665
3622 status = nfs4_preprocess_seqid_op(cstate, 3666 status = nfs4_preprocess_seqid_op(cstate,
3623 oc->oc_seqid, &oc->oc_req_stateid, 3667 oc->oc_seqid, &oc->oc_req_stateid,
3624 NFS4_OPEN_STID, &stp); 3668 NFS4_OPEN_STID, &stp, nn);
3625 if (status) 3669 if (status)
3626 goto out; 3670 goto out;
3627 oo = openowner(stp->st_stateowner); 3671 oo = openowner(stp->st_stateowner);
@@ -3664,7 +3708,7 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
3664 case NFS4_SHARE_ACCESS_BOTH: 3708 case NFS4_SHARE_ACCESS_BOTH:
3665 break; 3709 break;
3666 default: 3710 default:
3667 BUG(); 3711 WARN_ON_ONCE(1);
3668 } 3712 }
3669} 3713}
3670 3714
@@ -3685,6 +3729,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
3685{ 3729{
3686 __be32 status; 3730 __be32 status;
3687 struct nfs4_ol_stateid *stp; 3731 struct nfs4_ol_stateid *stp;
3732 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3688 3733
3689 dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n", 3734 dprintk("NFSD: nfsd4_open_downgrade on file %.*s\n",
3690 (int)cstate->current_fh.fh_dentry->d_name.len, 3735 (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3697,7 +3742,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
3697 3742
3698 nfs4_lock_state(); 3743 nfs4_lock_state();
3699 status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid, 3744 status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
3700 &od->od_stateid, &stp); 3745 &od->od_stateid, &stp, nn);
3701 if (status) 3746 if (status)
3702 goto out; 3747 goto out;
3703 status = nfserr_inval; 3748 status = nfserr_inval;
@@ -3760,6 +3805,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3760 __be32 status; 3805 __be32 status;
3761 struct nfs4_openowner *oo; 3806 struct nfs4_openowner *oo;
3762 struct nfs4_ol_stateid *stp; 3807 struct nfs4_ol_stateid *stp;
3808 struct net *net = SVC_NET(rqstp);
3809 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
3763 3810
3764 dprintk("NFSD: nfsd4_close on file %.*s\n", 3811 dprintk("NFSD: nfsd4_close on file %.*s\n",
3765 (int)cstate->current_fh.fh_dentry->d_name.len, 3812 (int)cstate->current_fh.fh_dentry->d_name.len,
@@ -3769,7 +3816,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3769 status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, 3816 status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
3770 &close->cl_stateid, 3817 &close->cl_stateid,
3771 NFS4_OPEN_STID|NFS4_CLOSED_STID, 3818 NFS4_OPEN_STID|NFS4_CLOSED_STID,
3772 &stp); 3819 &stp, nn);
3773 if (status) 3820 if (status)
3774 goto out; 3821 goto out;
3775 oo = openowner(stp->st_stateowner); 3822 oo = openowner(stp->st_stateowner);
@@ -3791,7 +3838,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3791 * little while to handle CLOSE replay. 3838 * little while to handle CLOSE replay.
3792 */ 3839 */
3793 if (list_empty(&oo->oo_owner.so_stateids)) 3840 if (list_empty(&oo->oo_owner.so_stateids))
3794 move_to_close_lru(oo); 3841 move_to_close_lru(oo, SVC_NET(rqstp));
3795 } 3842 }
3796 } 3843 }
3797out: 3844out:
@@ -3807,15 +3854,15 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3807 struct nfs4_delegation *dp; 3854 struct nfs4_delegation *dp;
3808 stateid_t *stateid = &dr->dr_stateid; 3855 stateid_t *stateid = &dr->dr_stateid;
3809 struct nfs4_stid *s; 3856 struct nfs4_stid *s;
3810 struct inode *inode;
3811 __be32 status; 3857 __be32 status;
3858 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3812 3859
3813 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3860 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
3814 return status; 3861 return status;
3815 inode = cstate->current_fh.fh_dentry->d_inode;
3816 3862
3817 nfs4_lock_state(); 3863 nfs4_lock_state();
3818 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s, cstate->minorversion); 3864 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s,
3865 cstate->minorversion, nn);
3819 if (status) 3866 if (status)
3820 goto out; 3867 goto out;
3821 dp = delegstateid(s); 3868 dp = delegstateid(s);
@@ -3833,8 +3880,6 @@ out:
3833 3880
3834#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start)) 3881#define LOFF_OVERFLOW(start, len) ((u64)(len) > ~(u64)(start))
3835 3882
3836#define LOCKOWNER_INO_HASH_BITS 8
3837#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
3838#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1) 3883#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
3839 3884
3840static inline u64 3885static inline u64
@@ -3852,7 +3897,7 @@ last_byte_offset(u64 start, u64 len)
3852{ 3897{
3853 u64 end; 3898 u64 end;
3854 3899
3855 BUG_ON(!len); 3900 WARN_ON_ONCE(!len);
3856 end = start + len; 3901 end = start + len;
3857 return end > start ? end - 1: NFS4_MAX_UINT64; 3902 return end > start ? end - 1: NFS4_MAX_UINT64;
3858} 3903}
@@ -3864,8 +3909,6 @@ static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct
3864 & LOCKOWNER_INO_HASH_MASK; 3909 & LOCKOWNER_INO_HASH_MASK;
3865} 3910}
3866 3911
3867static struct list_head lockowner_ino_hashtbl[LOCKOWNER_INO_HASH_SIZE];
3868
3869/* 3912/*
3870 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that 3913 * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
3871 * we can't properly handle lock requests that go beyond the (2^63 - 1)-th 3914 * we can't properly handle lock requests that go beyond the (2^63 - 1)-th
@@ -3931,12 +3974,12 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
3931 3974
3932static struct nfs4_lockowner * 3975static struct nfs4_lockowner *
3933find_lockowner_str(struct inode *inode, clientid_t *clid, 3976find_lockowner_str(struct inode *inode, clientid_t *clid,
3934 struct xdr_netobj *owner) 3977 struct xdr_netobj *owner, struct nfsd_net *nn)
3935{ 3978{
3936 unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner); 3979 unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
3937 struct nfs4_lockowner *lo; 3980 struct nfs4_lockowner *lo;
3938 3981
3939 list_for_each_entry(lo, &lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) { 3982 list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
3940 if (same_lockowner_ino(lo, inode, clid, owner)) 3983 if (same_lockowner_ino(lo, inode, clid, owner))
3941 return lo; 3984 return lo;
3942 } 3985 }
@@ -3948,9 +3991,10 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, s
3948 struct inode *inode = open_stp->st_file->fi_inode; 3991 struct inode *inode = open_stp->st_file->fi_inode;
3949 unsigned int inohash = lockowner_ino_hashval(inode, 3992 unsigned int inohash = lockowner_ino_hashval(inode,
3950 clp->cl_clientid.cl_id, &lo->lo_owner.so_owner); 3993 clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
3994 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
3951 3995
3952 list_add(&lo->lo_owner.so_strhash, &ownerstr_hashtbl[strhashval]); 3996 list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
3953 list_add(&lo->lo_owner_ino_hash, &lockowner_ino_hashtbl[inohash]); 3997 list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);
3954 list_add(&lo->lo_perstateid, &open_stp->st_lockowners); 3998 list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
3955} 3999}
3956 4000
@@ -4024,8 +4068,10 @@ static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, s
4024 struct nfs4_client *cl = oo->oo_owner.so_client; 4068 struct nfs4_client *cl = oo->oo_owner.so_client;
4025 struct nfs4_lockowner *lo; 4069 struct nfs4_lockowner *lo;
4026 unsigned int strhashval; 4070 unsigned int strhashval;
4071 struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id);
4027 4072
4028 lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid, &lock->v.new.owner); 4073 lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid,
4074 &lock->v.new.owner, nn);
4029 if (lo) { 4075 if (lo) {
4030 if (!cstate->minorversion) 4076 if (!cstate->minorversion)
4031 return nfserr_bad_seqid; 4077 return nfserr_bad_seqid;
@@ -4065,7 +4111,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4065 bool new_state = false; 4111 bool new_state = false;
4066 int lkflg; 4112 int lkflg;
4067 int err; 4113 int err;
4068 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4114 struct net *net = SVC_NET(rqstp);
4115 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4069 4116
4070 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 4117 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
4071 (long long) lock->lk_offset, 4118 (long long) lock->lk_offset,
@@ -4099,7 +4146,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4099 status = nfs4_preprocess_confirmed_seqid_op(cstate, 4146 status = nfs4_preprocess_confirmed_seqid_op(cstate,
4100 lock->lk_new_open_seqid, 4147 lock->lk_new_open_seqid,
4101 &lock->lk_new_open_stateid, 4148 &lock->lk_new_open_stateid,
4102 &open_stp); 4149 &open_stp, nn);
4103 if (status) 4150 if (status)
4104 goto out; 4151 goto out;
4105 open_sop = openowner(open_stp->st_stateowner); 4152 open_sop = openowner(open_stp->st_stateowner);
@@ -4113,7 +4160,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4113 status = nfs4_preprocess_seqid_op(cstate, 4160 status = nfs4_preprocess_seqid_op(cstate,
4114 lock->lk_old_lock_seqid, 4161 lock->lk_old_lock_seqid,
4115 &lock->lk_old_lock_stateid, 4162 &lock->lk_old_lock_stateid,
4116 NFS4_LOCK_STID, &lock_stp); 4163 NFS4_LOCK_STID, &lock_stp, nn);
4117 if (status) 4164 if (status)
4118 goto out; 4165 goto out;
4119 lock_sop = lockowner(lock_stp->st_stateowner); 4166 lock_sop = lockowner(lock_stp->st_stateowner);
@@ -4124,10 +4171,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4124 goto out; 4171 goto out;
4125 4172
4126 status = nfserr_grace; 4173 status = nfserr_grace;
4127 if (locks_in_grace(SVC_NET(rqstp)) && !lock->lk_reclaim) 4174 if (locks_in_grace(net) && !lock->lk_reclaim)
4128 goto out; 4175 goto out;
4129 status = nfserr_no_grace; 4176 status = nfserr_no_grace;
4130 if (!locks_in_grace(SVC_NET(rqstp)) && lock->lk_reclaim) 4177 if (!locks_in_grace(net) && lock->lk_reclaim)
4131 goto out; 4178 goto out;
4132 4179
4133 file_lock = locks_alloc_lock(); 4180 file_lock = locks_alloc_lock();
@@ -4238,7 +4285,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4238 struct file_lock *file_lock = NULL; 4285 struct file_lock *file_lock = NULL;
4239 struct nfs4_lockowner *lo; 4286 struct nfs4_lockowner *lo;
4240 __be32 status; 4287 __be32 status;
4241 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4288 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4242 4289
4243 if (locks_in_grace(SVC_NET(rqstp))) 4290 if (locks_in_grace(SVC_NET(rqstp)))
4244 return nfserr_grace; 4291 return nfserr_grace;
@@ -4248,9 +4295,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4248 4295
4249 nfs4_lock_state(); 4296 nfs4_lock_state();
4250 4297
4251 status = nfserr_stale_clientid; 4298 if (!nfsd4_has_session(cstate)) {
4252 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid, nn)) 4299 status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL);
4253 goto out; 4300 if (status)
4301 goto out;
4302 }
4254 4303
4255 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 4304 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
4256 goto out; 4305 goto out;
@@ -4278,7 +4327,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4278 goto out; 4327 goto out;
4279 } 4328 }
4280 4329
4281 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner); 4330 lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn);
4282 if (lo) 4331 if (lo)
4283 file_lock->fl_owner = (fl_owner_t)lo; 4332 file_lock->fl_owner = (fl_owner_t)lo;
4284 file_lock->fl_pid = current->tgid; 4333 file_lock->fl_pid = current->tgid;
@@ -4313,7 +4362,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4313 struct file_lock *file_lock = NULL; 4362 struct file_lock *file_lock = NULL;
4314 __be32 status; 4363 __be32 status;
4315 int err; 4364 int err;
4316 4365 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4366
4317 dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n", 4367 dprintk("NFSD: nfsd4_locku: start=%Ld length=%Ld\n",
4318 (long long) locku->lu_offset, 4368 (long long) locku->lu_offset,
4319 (long long) locku->lu_length); 4369 (long long) locku->lu_length);
@@ -4324,7 +4374,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4324 nfs4_lock_state(); 4374 nfs4_lock_state();
4325 4375
4326 status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, 4376 status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
4327 &locku->lu_stateid, NFS4_LOCK_STID, &stp); 4377 &locku->lu_stateid, NFS4_LOCK_STID,
4378 &stp, nn);
4328 if (status) 4379 if (status)
4329 goto out; 4380 goto out;
4330 filp = find_any_file(stp->st_file); 4381 filp = find_any_file(stp->st_file);
@@ -4414,23 +4465,21 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
4414 struct list_head matches; 4465 struct list_head matches;
4415 unsigned int hashval = ownerstr_hashval(clid->cl_id, owner); 4466 unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
4416 __be32 status; 4467 __be32 status;
4417 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id); 4468 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4418 4469
4419 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n", 4470 dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
4420 clid->cl_boot, clid->cl_id); 4471 clid->cl_boot, clid->cl_id);
4421 4472
4422 /* XXX check for lease expiration */
4423
4424 status = nfserr_stale_clientid;
4425 if (STALE_CLIENTID(clid, nn))
4426 return status;
4427
4428 nfs4_lock_state(); 4473 nfs4_lock_state();
4429 4474
4475 status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
4476 if (status)
4477 goto out;
4478
4430 status = nfserr_locks_held; 4479 status = nfserr_locks_held;
4431 INIT_LIST_HEAD(&matches); 4480 INIT_LIST_HEAD(&matches);
4432 4481
4433 list_for_each_entry(sop, &ownerstr_hashtbl[hashval], so_strhash) { 4482 list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) {
4434 if (sop->so_is_open_owner) 4483 if (sop->so_is_open_owner)
4435 continue; 4484 continue;
4436 if (!same_owner_str(sop, owner, clid)) 4485 if (!same_owner_str(sop, owner, clid))
@@ -4466,73 +4515,74 @@ alloc_reclaim(void)
4466 return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL); 4515 return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
4467} 4516}
4468 4517
4469int 4518bool
4470nfs4_has_reclaimed_state(const char *name, bool use_exchange_id) 4519nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn)
4471{ 4520{
4472 unsigned int strhashval = clientstr_hashval(name); 4521 struct nfs4_client_reclaim *crp;
4473 struct nfs4_client *clp;
4474 4522
4475 clp = find_confirmed_client_by_str(name, strhashval); 4523 crp = nfsd4_find_reclaim_client(name, nn);
4476 if (!clp) 4524 return (crp && crp->cr_clp);
4477 return 0;
4478 return test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
4479} 4525}
4480 4526
4481/* 4527/*
4482 * failure => all reset bets are off, nfserr_no_grace... 4528 * failure => all reset bets are off, nfserr_no_grace...
4483 */ 4529 */
4484int 4530struct nfs4_client_reclaim *
4485nfs4_client_to_reclaim(const char *name) 4531nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn)
4486{ 4532{
4487 unsigned int strhashval; 4533 unsigned int strhashval;
4488 struct nfs4_client_reclaim *crp = NULL; 4534 struct nfs4_client_reclaim *crp;
4489 4535
4490 dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name); 4536 dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
4491 crp = alloc_reclaim(); 4537 crp = alloc_reclaim();
4492 if (!crp) 4538 if (crp) {
4493 return 0; 4539 strhashval = clientstr_hashval(name);
4494 strhashval = clientstr_hashval(name); 4540 INIT_LIST_HEAD(&crp->cr_strhash);
4495 INIT_LIST_HEAD(&crp->cr_strhash); 4541 list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
4496 list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]); 4542 memcpy(crp->cr_recdir, name, HEXDIR_LEN);
4497 memcpy(crp->cr_recdir, name, HEXDIR_LEN); 4543 crp->cr_clp = NULL;
4498 reclaim_str_hashtbl_size++; 4544 nn->reclaim_str_hashtbl_size++;
4499 return 1; 4545 }
4546 return crp;
4547}
4548
4549void
4550nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
4551{
4552 list_del(&crp->cr_strhash);
4553 kfree(crp);
4554 nn->reclaim_str_hashtbl_size--;
4500} 4555}
4501 4556
4502void 4557void
4503nfs4_release_reclaim(void) 4558nfs4_release_reclaim(struct nfsd_net *nn)
4504{ 4559{
4505 struct nfs4_client_reclaim *crp = NULL; 4560 struct nfs4_client_reclaim *crp = NULL;
4506 int i; 4561 int i;
4507 4562
4508 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 4563 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4509 while (!list_empty(&reclaim_str_hashtbl[i])) { 4564 while (!list_empty(&nn->reclaim_str_hashtbl[i])) {
4510 crp = list_entry(reclaim_str_hashtbl[i].next, 4565 crp = list_entry(nn->reclaim_str_hashtbl[i].next,
4511 struct nfs4_client_reclaim, cr_strhash); 4566 struct nfs4_client_reclaim, cr_strhash);
4512 list_del(&crp->cr_strhash); 4567 nfs4_remove_reclaim_record(crp, nn);
4513 kfree(crp);
4514 reclaim_str_hashtbl_size--;
4515 } 4568 }
4516 } 4569 }
4517 BUG_ON(reclaim_str_hashtbl_size); 4570 WARN_ON_ONCE(nn->reclaim_str_hashtbl_size);
4518} 4571}
4519 4572
4520/* 4573/*
4521 * called from OPEN, CLAIM_PREVIOUS with a new clientid. */ 4574 * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
4522struct nfs4_client_reclaim * 4575struct nfs4_client_reclaim *
4523nfsd4_find_reclaim_client(struct nfs4_client *clp) 4576nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
4524{ 4577{
4525 unsigned int strhashval; 4578 unsigned int strhashval;
4526 struct nfs4_client_reclaim *crp = NULL; 4579 struct nfs4_client_reclaim *crp = NULL;
4527 4580
4528 dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n", 4581 dprintk("NFSD: nfs4_find_reclaim_client for recdir %s\n", recdir);
4529 clp->cl_name.len, clp->cl_name.data,
4530 clp->cl_recdir);
4531 4582
4532 /* find clp->cl_name in reclaim_str_hashtbl */ 4583 strhashval = clientstr_hashval(recdir);
4533 strhashval = clientstr_hashval(clp->cl_recdir); 4584 list_for_each_entry(crp, &nn->reclaim_str_hashtbl[strhashval], cr_strhash) {
4534 list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) { 4585 if (same_name(crp->cr_recdir, recdir)) {
4535 if (same_name(crp->cr_recdir, clp->cl_recdir)) {
4536 return crp; 4586 return crp;
4537 } 4587 }
4538 } 4588 }
@@ -4543,12 +4593,12 @@ nfsd4_find_reclaim_client(struct nfs4_client *clp)
4543* Called from OPEN. Look for clientid in reclaim list. 4593* Called from OPEN. Look for clientid in reclaim list.
4544*/ 4594*/
4545__be32 4595__be32
4546nfs4_check_open_reclaim(clientid_t *clid, bool sessions) 4596nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
4547{ 4597{
4548 struct nfs4_client *clp; 4598 struct nfs4_client *clp;
4549 4599
4550 /* find clientid in conf_id_hashtbl */ 4600 /* find clientid in conf_id_hashtbl */
4551 clp = find_confirmed_client(clid, sessions); 4601 clp = find_confirmed_client(clid, sessions, nn);
4552 if (clp == NULL) 4602 if (clp == NULL)
4553 return nfserr_reclaim_bad; 4603 return nfserr_reclaim_bad;
4554 4604
@@ -4557,124 +4607,177 @@ nfs4_check_open_reclaim(clientid_t *clid, bool sessions)
4557 4607
4558#ifdef CONFIG_NFSD_FAULT_INJECTION 4608#ifdef CONFIG_NFSD_FAULT_INJECTION
4559 4609
4560void nfsd_forget_clients(u64 num) 4610u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
4561{ 4611{
4562 struct nfs4_client *clp, *next; 4612 expire_client(clp);
4563 int count = 0; 4613 return 1;
4564
4565 nfs4_lock_state();
4566 list_for_each_entry_safe(clp, next, &client_lru, cl_lru) {
4567 expire_client(clp);
4568 if (++count == num)
4569 break;
4570 }
4571 nfs4_unlock_state();
4572
4573 printk(KERN_INFO "NFSD: Forgot %d clients", count);
4574} 4614}
4575 4615
4576static void release_lockowner_sop(struct nfs4_stateowner *sop) 4616u64 nfsd_print_client(struct nfs4_client *clp, u64 num)
4577{ 4617{
4578 release_lockowner(lockowner(sop)); 4618 char buf[INET6_ADDRSTRLEN];
4619 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
4620 printk(KERN_INFO "NFS Client: %s\n", buf);
4621 return 1;
4579} 4622}
4580 4623
4581static void release_openowner_sop(struct nfs4_stateowner *sop) 4624static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
4625 const char *type)
4582{ 4626{
4583 release_openowner(openowner(sop)); 4627 char buf[INET6_ADDRSTRLEN];
4628 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
4629 printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
4584} 4630}
4585 4631
4586static int nfsd_release_n_owners(u64 num, bool is_open_owner, 4632static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *))
4587 void (*release_sop)(struct nfs4_stateowner *))
4588{ 4633{
4589 int i, count = 0; 4634 struct nfs4_openowner *oop;
4590 struct nfs4_stateowner *sop, *next; 4635 struct nfs4_lockowner *lop, *lo_next;
4636 struct nfs4_ol_stateid *stp, *st_next;
4637 u64 count = 0;
4591 4638
4592 for (i = 0; i < OWNER_HASH_SIZE; i++) { 4639 list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
4593 list_for_each_entry_safe(sop, next, &ownerstr_hashtbl[i], so_strhash) { 4640 list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) {
4594 if (sop->so_is_open_owner != is_open_owner) 4641 list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) {
4595 continue; 4642 if (func)
4596 release_sop(sop); 4643 func(lop);
4597 if (++count == num) 4644 if (++count == max)
4598 return count; 4645 return count;
4646 }
4599 } 4647 }
4600 } 4648 }
4649
4601 return count; 4650 return count;
4602} 4651}
4603 4652
4604void nfsd_forget_locks(u64 num) 4653u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max)
4605{ 4654{
4606 int count; 4655 return nfsd_foreach_client_lock(clp, max, release_lockowner);
4607 4656}
4608 nfs4_lock_state();
4609 count = nfsd_release_n_owners(num, false, release_lockowner_sop);
4610 nfs4_unlock_state();
4611 4657
4612 printk(KERN_INFO "NFSD: Forgot %d locks", count); 4658u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max)
4659{
4660 u64 count = nfsd_foreach_client_lock(clp, max, NULL);
4661 nfsd_print_count(clp, count, "locked files");
4662 return count;
4613} 4663}
4614 4664
4615void nfsd_forget_openowners(u64 num) 4665static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *))
4616{ 4666{
4617 int count; 4667 struct nfs4_openowner *oop, *next;
4668 u64 count = 0;
4618 4669
4619 nfs4_lock_state(); 4670 list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
4620 count = nfsd_release_n_owners(num, true, release_openowner_sop); 4671 if (func)
4621 nfs4_unlock_state(); 4672 func(oop);
4673 if (++count == max)
4674 break;
4675 }
4622 4676
4623 printk(KERN_INFO "NFSD: Forgot %d open owners", count); 4677 return count;
4624} 4678}
4625 4679
4626static int nfsd_process_n_delegations(u64 num, struct list_head *list) 4680u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max)
4627{ 4681{
4628 int i, count = 0; 4682 return nfsd_foreach_client_open(clp, max, release_openowner);
4629 struct nfs4_file *fp, *fnext; 4683}
4630 struct nfs4_delegation *dp, *dnext;
4631 4684
4632 for (i = 0; i < FILE_HASH_SIZE; i++) { 4685u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max)
4633 list_for_each_entry_safe(fp, fnext, &file_hashtbl[i], fi_hash) { 4686{
4634 list_for_each_entry_safe(dp, dnext, &fp->fi_delegations, dl_perfile) { 4687 u64 count = nfsd_foreach_client_open(clp, max, NULL);
4635 list_move(&dp->dl_recall_lru, list); 4688 nfsd_print_count(clp, count, "open files");
4636 if (++count == num) 4689 return count;
4637 return count; 4690}
4638 } 4691
4639 } 4692static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
4640 } 4693 struct list_head *victims)
4694{
4695 struct nfs4_delegation *dp, *next;
4696 u64 count = 0;
4641 4697
4698 list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
4699 if (victims)
4700 list_move(&dp->dl_recall_lru, victims);
4701 if (++count == max)
4702 break;
4703 }
4642 return count; 4704 return count;
4643} 4705}
4644 4706
4645void nfsd_forget_delegations(u64 num) 4707u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
4646{ 4708{
4647 unsigned int count; 4709 struct nfs4_delegation *dp, *next;
4648 LIST_HEAD(victims); 4710 LIST_HEAD(victims);
4649 struct nfs4_delegation *dp, *dnext; 4711 u64 count;
4650 4712
4651 spin_lock(&recall_lock); 4713 spin_lock(&recall_lock);
4652 count = nfsd_process_n_delegations(num, &victims); 4714 count = nfsd_find_all_delegations(clp, max, &victims);
4653 spin_unlock(&recall_lock); 4715 spin_unlock(&recall_lock);
4654 4716
4655 nfs4_lock_state(); 4717 list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
4656 list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru)
4657 unhash_delegation(dp); 4718 unhash_delegation(dp);
4658 nfs4_unlock_state();
4659 4719
4660 printk(KERN_INFO "NFSD: Forgot %d delegations", count); 4720 return count;
4661} 4721}
4662 4722
4663void nfsd_recall_delegations(u64 num) 4723u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)
4664{ 4724{
4665 unsigned int count; 4725 struct nfs4_delegation *dp, *next;
4666 LIST_HEAD(victims); 4726 LIST_HEAD(victims);
4667 struct nfs4_delegation *dp, *dnext; 4727 u64 count;
4668 4728
4669 spin_lock(&recall_lock); 4729 spin_lock(&recall_lock);
4670 count = nfsd_process_n_delegations(num, &victims); 4730 count = nfsd_find_all_delegations(clp, max, &victims);
4671 list_for_each_entry_safe(dp, dnext, &victims, dl_recall_lru) { 4731 list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
4672 list_del(&dp->dl_recall_lru);
4673 nfsd_break_one_deleg(dp); 4732 nfsd_break_one_deleg(dp);
4674 }
4675 spin_unlock(&recall_lock); 4733 spin_unlock(&recall_lock);
4676 4734
4677 printk(KERN_INFO "NFSD: Recalled %d delegations", count); 4735 return count;
4736}
4737
4738u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max)
4739{
4740 u64 count = 0;
4741
4742 spin_lock(&recall_lock);
4743 count = nfsd_find_all_delegations(clp, max, NULL);
4744 spin_unlock(&recall_lock);
4745
4746 nfsd_print_count(clp, count, "delegations");
4747 return count;
4748}
4749
4750u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64))
4751{
4752 struct nfs4_client *clp, *next;
4753 u64 count = 0;
4754 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
4755
4756 if (!nfsd_netns_ready(nn))
4757 return 0;
4758
4759 list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
4760 count += func(clp, max - count);
4761 if ((max != 0) && (count >= max))
4762 break;
4763 }
4764
4765 return count;
4766}
4767
4768struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
4769{
4770 struct nfs4_client *clp;
4771 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id);
4772
4773 if (!nfsd_netns_ready(nn))
4774 return NULL;
4775
4776 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
4777 if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
4778 return clp;
4779 }
4780 return NULL;
4678} 4781}
4679 4782
4680#endif /* CONFIG_NFSD_FAULT_INJECTION */ 4783#endif /* CONFIG_NFSD_FAULT_INJECTION */
@@ -4686,27 +4789,10 @@ nfs4_state_init(void)
4686{ 4789{
4687 int i; 4790 int i;
4688 4791
4689 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4690 INIT_LIST_HEAD(&conf_id_hashtbl[i]);
4691 INIT_LIST_HEAD(&conf_str_hashtbl[i]);
4692 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
4693 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
4694 INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
4695 }
4696 for (i = 0; i < SESSION_HASH_SIZE; i++)
4697 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
4698 for (i = 0; i < FILE_HASH_SIZE; i++) { 4792 for (i = 0; i < FILE_HASH_SIZE; i++) {
4699 INIT_LIST_HEAD(&file_hashtbl[i]); 4793 INIT_LIST_HEAD(&file_hashtbl[i]);
4700 } 4794 }
4701 for (i = 0; i < OWNER_HASH_SIZE; i++) {
4702 INIT_LIST_HEAD(&ownerstr_hashtbl[i]);
4703 }
4704 for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
4705 INIT_LIST_HEAD(&lockowner_ino_hashtbl[i]);
4706 INIT_LIST_HEAD(&close_lru);
4707 INIT_LIST_HEAD(&client_lru);
4708 INIT_LIST_HEAD(&del_recall_lru); 4795 INIT_LIST_HEAD(&del_recall_lru);
4709 reclaim_str_hashtbl_size = 0;
4710} 4796}
4711 4797
4712/* 4798/*
@@ -4730,12 +4816,100 @@ set_max_delegations(void)
4730 max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT); 4816 max_delegations = nr_free_buffer_pages() >> (20 - 2 - PAGE_SHIFT);
4731} 4817}
4732 4818
4733/* initialization to perform when the nfsd service is started: */ 4819static int nfs4_state_create_net(struct net *net)
4820{
4821 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4822 int i;
4823
4824 nn->conf_id_hashtbl = kmalloc(sizeof(struct list_head) *
4825 CLIENT_HASH_SIZE, GFP_KERNEL);
4826 if (!nn->conf_id_hashtbl)
4827 goto err;
4828 nn->unconf_id_hashtbl = kmalloc(sizeof(struct list_head) *
4829 CLIENT_HASH_SIZE, GFP_KERNEL);
4830 if (!nn->unconf_id_hashtbl)
4831 goto err_unconf_id;
4832 nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
4833 OWNER_HASH_SIZE, GFP_KERNEL);
4834 if (!nn->ownerstr_hashtbl)
4835 goto err_ownerstr;
4836 nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) *
4837 LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL);
4838 if (!nn->lockowner_ino_hashtbl)
4839 goto err_lockowner_ino;
4840 nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) *
4841 SESSION_HASH_SIZE, GFP_KERNEL);
4842 if (!nn->sessionid_hashtbl)
4843 goto err_sessionid;
4844
4845 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4846 INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
4847 INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
4848 }
4849 for (i = 0; i < OWNER_HASH_SIZE; i++)
4850 INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]);
4851 for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
4852 INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]);
4853 for (i = 0; i < SESSION_HASH_SIZE; i++)
4854 INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
4855 nn->conf_name_tree = RB_ROOT;
4856 nn->unconf_name_tree = RB_ROOT;
4857 INIT_LIST_HEAD(&nn->client_lru);
4858 INIT_LIST_HEAD(&nn->close_lru);
4859 spin_lock_init(&nn->client_lock);
4860
4861 INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
4862 get_net(net);
4863
4864 return 0;
4865
4866err_sessionid:
4867 kfree(nn->lockowner_ino_hashtbl);
4868err_lockowner_ino:
4869 kfree(nn->ownerstr_hashtbl);
4870err_ownerstr:
4871 kfree(nn->unconf_id_hashtbl);
4872err_unconf_id:
4873 kfree(nn->conf_id_hashtbl);
4874err:
4875 return -ENOMEM;
4876}
4877
4878static void
4879nfs4_state_destroy_net(struct net *net)
4880{
4881 int i;
4882 struct nfs4_client *clp = NULL;
4883 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4884 struct rb_node *node, *tmp;
4885
4886 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4887 while (!list_empty(&nn->conf_id_hashtbl[i])) {
4888 clp = list_entry(nn->conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
4889 destroy_client(clp);
4890 }
4891 }
4892
4893 node = rb_first(&nn->unconf_name_tree);
4894 while (node != NULL) {
4895 tmp = node;
4896 node = rb_next(tmp);
4897 clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
4898 rb_erase(tmp, &nn->unconf_name_tree);
4899 destroy_client(clp);
4900 }
4901
4902 kfree(nn->sessionid_hashtbl);
4903 kfree(nn->lockowner_ino_hashtbl);
4904 kfree(nn->ownerstr_hashtbl);
4905 kfree(nn->unconf_id_hashtbl);
4906 kfree(nn->conf_id_hashtbl);
4907 put_net(net);
4908}
4734 4909
4735int 4910int
4736nfs4_state_start(void) 4911nfs4_state_start_net(struct net *net)
4737{ 4912{
4738 struct net *net = &init_net;
4739 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 4913 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4740 int ret; 4914 int ret;
4741 4915
@@ -4746,18 +4920,32 @@ nfs4_state_start(void)
4746 * to that instead and then do most of the rest of this on a per-net 4920 * to that instead and then do most of the rest of this on a per-net
4747 * basis. 4921 * basis.
4748 */ 4922 */
4749 get_net(net); 4923 if (net != &init_net)
4924 return -EINVAL;
4925
4926 ret = nfs4_state_create_net(net);
4927 if (ret)
4928 return ret;
4750 nfsd4_client_tracking_init(net); 4929 nfsd4_client_tracking_init(net);
4751 nn->boot_time = get_seconds(); 4930 nn->boot_time = get_seconds();
4752 locks_start_grace(net, &nn->nfsd4_manager); 4931 locks_start_grace(net, &nn->nfsd4_manager);
4753 nn->grace_ended = false; 4932 nn->grace_ended = false;
4754 printk(KERN_INFO "NFSD: starting %ld-second grace period\n", 4933 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
4755 nfsd4_grace); 4934 nn->nfsd4_grace, net);
4935 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
4936 return 0;
4937}
4938
4939/* initialization to perform when the nfsd service is started: */
4940
4941int
4942nfs4_state_start(void)
4943{
4944 int ret;
4945
4756 ret = set_callback_cred(); 4946 ret = set_callback_cred();
4757 if (ret) { 4947 if (ret)
4758 ret = -ENOMEM; 4948 return -ENOMEM;
4759 goto out_recovery;
4760 }
4761 laundry_wq = create_singlethread_workqueue("nfsd4"); 4949 laundry_wq = create_singlethread_workqueue("nfsd4");
4762 if (laundry_wq == NULL) { 4950 if (laundry_wq == NULL) {
4763 ret = -ENOMEM; 4951 ret = -ENOMEM;
@@ -4766,39 +4954,34 @@ nfs4_state_start(void)
4766 ret = nfsd4_create_callback_queue(); 4954 ret = nfsd4_create_callback_queue();
4767 if (ret) 4955 if (ret)
4768 goto out_free_laundry; 4956 goto out_free_laundry;
4769 queue_delayed_work(laundry_wq, &laundromat_work, nfsd4_grace * HZ); 4957
4770 set_max_delegations(); 4958 set_max_delegations();
4959
4771 return 0; 4960 return 0;
4961
4772out_free_laundry: 4962out_free_laundry:
4773 destroy_workqueue(laundry_wq); 4963 destroy_workqueue(laundry_wq);
4774out_recovery: 4964out_recovery:
4775 nfsd4_client_tracking_exit(net);
4776 put_net(net);
4777 return ret; 4965 return ret;
4778} 4966}
4779 4967
4780static void 4968/* should be called with the state lock held */
4781__nfs4_state_shutdown(void) 4969void
4970nfs4_state_shutdown_net(struct net *net)
4782{ 4971{
4783 int i;
4784 struct nfs4_client *clp = NULL;
4785 struct nfs4_delegation *dp = NULL; 4972 struct nfs4_delegation *dp = NULL;
4786 struct list_head *pos, *next, reaplist; 4973 struct list_head *pos, *next, reaplist;
4974 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4975
4976 cancel_delayed_work_sync(&nn->laundromat_work);
4977 locks_end_grace(&nn->nfsd4_manager);
4787 4978
4788 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
4789 while (!list_empty(&conf_id_hashtbl[i])) {
4790 clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
4791 destroy_client(clp);
4792 }
4793 while (!list_empty(&unconf_str_hashtbl[i])) {
4794 clp = list_entry(unconf_str_hashtbl[i].next, struct nfs4_client, cl_strhash);
4795 destroy_client(clp);
4796 }
4797 }
4798 INIT_LIST_HEAD(&reaplist); 4979 INIT_LIST_HEAD(&reaplist);
4799 spin_lock(&recall_lock); 4980 spin_lock(&recall_lock);
4800 list_for_each_safe(pos, next, &del_recall_lru) { 4981 list_for_each_safe(pos, next, &del_recall_lru) {
4801 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 4982 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
4983 if (dp->dl_stid.sc_client->net != net)
4984 continue;
4802 list_move(&dp->dl_recall_lru, &reaplist); 4985 list_move(&dp->dl_recall_lru, &reaplist);
4803 } 4986 }
4804 spin_unlock(&recall_lock); 4987 spin_unlock(&recall_lock);
@@ -4807,22 +4990,14 @@ __nfs4_state_shutdown(void)
4807 unhash_delegation(dp); 4990 unhash_delegation(dp);
4808 } 4991 }
4809 4992
4810 nfsd4_client_tracking_exit(&init_net); 4993 nfsd4_client_tracking_exit(net);
4811 put_net(&init_net); 4994 nfs4_state_destroy_net(net);
4812} 4995}
4813 4996
4814void 4997void
4815nfs4_state_shutdown(void) 4998nfs4_state_shutdown(void)
4816{ 4999{
4817 struct net *net = &init_net;
4818 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
4819
4820 cancel_delayed_work_sync(&laundromat_work);
4821 destroy_workqueue(laundry_wq); 5000 destroy_workqueue(laundry_wq);
4822 locks_end_grace(&nn->nfsd4_manager);
4823 nfs4_lock_state();
4824 __nfs4_state_shutdown();
4825 nfs4_unlock_state();
4826 nfsd4_destroy_callback_queue(); 5001 nfsd4_destroy_callback_queue();
4827} 5002}
4828 5003
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index fd548d155088..0dc11586682f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -53,6 +53,7 @@
53#include "vfs.h" 53#include "vfs.h"
54#include "state.h" 54#include "state.h"
55#include "cache.h" 55#include "cache.h"
56#include "netns.h"
56 57
57#define NFSDDBG_FACILITY NFSDDBG_XDR 58#define NFSDDBG_FACILITY NFSDDBG_XDR
58 59
@@ -65,17 +66,17 @@
65#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL 66#define NFS4_REFERRAL_FSID_MINOR 0x8000000ULL
66 67
67static __be32 68static __be32
68check_filename(char *str, int len, __be32 err) 69check_filename(char *str, int len)
69{ 70{
70 int i; 71 int i;
71 72
72 if (len == 0) 73 if (len == 0)
73 return nfserr_inval; 74 return nfserr_inval;
74 if (isdotent(str, len)) 75 if (isdotent(str, len))
75 return err; 76 return nfserr_badname;
76 for (i = 0; i < len; i++) 77 for (i = 0; i < len; i++)
77 if (str[i] == '/') 78 if (str[i] == '/')
78 return err; 79 return nfserr_badname;
79 return 0; 80 return 0;
80} 81}
81 82
@@ -422,6 +423,86 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
422 DECODE_TAIL; 423 DECODE_TAIL;
423} 424}
424 425
426static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
427{
428 DECODE_HEAD;
429 u32 dummy, uid, gid;
430 char *machine_name;
431 int i;
432 int nr_secflavs;
433
434 /* callback_sec_params4 */
435 READ_BUF(4);
436 READ32(nr_secflavs);
437 cbs->flavor = (u32)(-1);
438 for (i = 0; i < nr_secflavs; ++i) {
439 READ_BUF(4);
440 READ32(dummy);
441 switch (dummy) {
442 case RPC_AUTH_NULL:
443 /* Nothing to read */
444 if (cbs->flavor == (u32)(-1))
445 cbs->flavor = RPC_AUTH_NULL;
446 break;
447 case RPC_AUTH_UNIX:
448 READ_BUF(8);
449 /* stamp */
450 READ32(dummy);
451
452 /* machine name */
453 READ32(dummy);
454 READ_BUF(dummy);
455 SAVEMEM(machine_name, dummy);
456
457 /* uid, gid */
458 READ_BUF(8);
459 READ32(uid);
460 READ32(gid);
461
462 /* more gids */
463 READ_BUF(4);
464 READ32(dummy);
465 READ_BUF(dummy * 4);
466 if (cbs->flavor == (u32)(-1)) {
467 cbs->uid = uid;
468 cbs->gid = gid;
469 cbs->flavor = RPC_AUTH_UNIX;
470 }
471 break;
472 case RPC_AUTH_GSS:
473 dprintk("RPC_AUTH_GSS callback secflavor "
474 "not supported!\n");
475 READ_BUF(8);
476 /* gcbp_service */
477 READ32(dummy);
478 /* gcbp_handle_from_server */
479 READ32(dummy);
480 READ_BUF(dummy);
481 p += XDR_QUADLEN(dummy);
482 /* gcbp_handle_from_client */
483 READ_BUF(4);
484 READ32(dummy);
485 READ_BUF(dummy);
486 break;
487 default:
488 dprintk("Illegal callback secflavor\n");
489 return nfserr_inval;
490 }
491 }
492 DECODE_TAIL;
493}
494
495static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
496{
497 DECODE_HEAD;
498
499 READ_BUF(4);
500 READ32(bc->bc_cb_program);
501 nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
502
503 DECODE_TAIL;
504}
505
425static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts) 506static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
426{ 507{
427 DECODE_HEAD; 508 DECODE_HEAD;
@@ -490,7 +571,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
490 READ32(create->cr_namelen); 571 READ32(create->cr_namelen);
491 READ_BUF(create->cr_namelen); 572 READ_BUF(create->cr_namelen);
492 SAVEMEM(create->cr_name, create->cr_namelen); 573 SAVEMEM(create->cr_name, create->cr_namelen);
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 574 if ((status = check_filename(create->cr_name, create->cr_namelen)))
494 return status; 575 return status;
495 576
496 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, 577 status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
@@ -522,7 +603,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
522 READ32(link->li_namelen); 603 READ32(link->li_namelen);
523 READ_BUF(link->li_namelen); 604 READ_BUF(link->li_namelen);
524 SAVEMEM(link->li_name, link->li_namelen); 605 SAVEMEM(link->li_name, link->li_namelen);
525 if ((status = check_filename(link->li_name, link->li_namelen, nfserr_inval))) 606 if ((status = check_filename(link->li_name, link->li_namelen)))
526 return status; 607 return status;
527 608
528 DECODE_TAIL; 609 DECODE_TAIL;
@@ -616,7 +697,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
616 READ32(lookup->lo_len); 697 READ32(lookup->lo_len);
617 READ_BUF(lookup->lo_len); 698 READ_BUF(lookup->lo_len);
618 SAVEMEM(lookup->lo_name, lookup->lo_len); 699 SAVEMEM(lookup->lo_name, lookup->lo_len);
619 if ((status = check_filename(lookup->lo_name, lookup->lo_len, nfserr_noent))) 700 if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
620 return status; 701 return status;
621 702
622 DECODE_TAIL; 703 DECODE_TAIL;
@@ -780,7 +861,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
780 READ32(open->op_fname.len); 861 READ32(open->op_fname.len);
781 READ_BUF(open->op_fname.len); 862 READ_BUF(open->op_fname.len);
782 SAVEMEM(open->op_fname.data, open->op_fname.len); 863 SAVEMEM(open->op_fname.data, open->op_fname.len);
783 if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) 864 if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
784 return status; 865 return status;
785 break; 866 break;
786 case NFS4_OPEN_CLAIM_PREVIOUS: 867 case NFS4_OPEN_CLAIM_PREVIOUS:
@@ -795,7 +876,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
795 READ32(open->op_fname.len); 876 READ32(open->op_fname.len);
796 READ_BUF(open->op_fname.len); 877 READ_BUF(open->op_fname.len);
797 SAVEMEM(open->op_fname.data, open->op_fname.len); 878 SAVEMEM(open->op_fname.data, open->op_fname.len);
798 if ((status = check_filename(open->op_fname.data, open->op_fname.len, nfserr_inval))) 879 if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
799 return status; 880 return status;
800 break; 881 break;
801 case NFS4_OPEN_CLAIM_FH: 882 case NFS4_OPEN_CLAIM_FH:
@@ -907,7 +988,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
907 READ32(remove->rm_namelen); 988 READ32(remove->rm_namelen);
908 READ_BUF(remove->rm_namelen); 989 READ_BUF(remove->rm_namelen);
909 SAVEMEM(remove->rm_name, remove->rm_namelen); 990 SAVEMEM(remove->rm_name, remove->rm_namelen);
910 if ((status = check_filename(remove->rm_name, remove->rm_namelen, nfserr_noent))) 991 if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
911 return status; 992 return status;
912 993
913 DECODE_TAIL; 994 DECODE_TAIL;
@@ -925,9 +1006,9 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
925 READ32(rename->rn_tnamelen); 1006 READ32(rename->rn_tnamelen);
926 READ_BUF(rename->rn_tnamelen); 1007 READ_BUF(rename->rn_tnamelen);
927 SAVEMEM(rename->rn_tname, rename->rn_tnamelen); 1008 SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
928 if ((status = check_filename(rename->rn_sname, rename->rn_snamelen, nfserr_noent))) 1009 if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
929 return status; 1010 return status;
930 if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen, nfserr_inval))) 1011 if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))
931 return status; 1012 return status;
932 1013
933 DECODE_TAIL; 1014 DECODE_TAIL;
@@ -954,8 +1035,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
954 READ32(secinfo->si_namelen); 1035 READ32(secinfo->si_namelen);
955 READ_BUF(secinfo->si_namelen); 1036 READ_BUF(secinfo->si_namelen);
956 SAVEMEM(secinfo->si_name, secinfo->si_namelen); 1037 SAVEMEM(secinfo->si_name, secinfo->si_namelen);
957 status = check_filename(secinfo->si_name, secinfo->si_namelen, 1038 status = check_filename(secinfo->si_name, secinfo->si_namelen);
958 nfserr_noent);
959 if (status) 1039 if (status)
960 return status; 1040 return status;
961 DECODE_TAIL; 1041 DECODE_TAIL;
@@ -1026,31 +1106,14 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
1026static __be32 1106static __be32
1027nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify) 1107nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
1028{ 1108{
1029#if 0
1030 struct nfsd4_compoundargs save = {
1031 .p = argp->p,
1032 .end = argp->end,
1033 .rqstp = argp->rqstp,
1034 };
1035 u32 ve_bmval[2];
1036 struct iattr ve_iattr; /* request */
1037 struct nfs4_acl *ve_acl; /* request */
1038#endif
1039 DECODE_HEAD; 1109 DECODE_HEAD;
1040 1110
1041 if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval))) 1111 if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
1042 goto out; 1112 goto out;
1043 1113
1044 /* For convenience's sake, we compare raw xdr'd attributes in 1114 /* For convenience's sake, we compare raw xdr'd attributes in
1045 * nfsd4_proc_verify; however we still decode here just to return 1115 * nfsd4_proc_verify */
1046 * correct error in case of bad xdr. */ 1116
1047#if 0
1048 status = nfsd4_decode_fattr(ve_bmval, &ve_iattr, &ve_acl);
1049 if (status == nfserr_inval) {
1050 status = nfserrno(status);
1051 goto out;
1052 }
1053#endif
1054 READ_BUF(4); 1117 READ_BUF(4);
1055 READ32(verify->ve_attrlen); 1118 READ32(verify->ve_attrlen);
1056 READ_BUF(verify->ve_attrlen); 1119 READ_BUF(verify->ve_attrlen);
@@ -1063,7 +1126,6 @@ static __be32
1063nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) 1126nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
1064{ 1127{
1065 int avail; 1128 int avail;
1066 int v;
1067 int len; 1129 int len;
1068 DECODE_HEAD; 1130 DECODE_HEAD;
1069 1131
@@ -1087,27 +1149,26 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
1087 __FILE__, __LINE__); 1149 __FILE__, __LINE__);
1088 goto xdr_error; 1150 goto xdr_error;
1089 } 1151 }
1090 argp->rqstp->rq_vec[0].iov_base = p; 1152 write->wr_head.iov_base = p;
1091 argp->rqstp->rq_vec[0].iov_len = avail; 1153 write->wr_head.iov_len = avail;
1092 v = 0; 1154 WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
1093 len = write->wr_buflen; 1155 write->wr_pagelist = argp->pagelist;
1094 while (len > argp->rqstp->rq_vec[v].iov_len) { 1156
1095 len -= argp->rqstp->rq_vec[v].iov_len; 1157 len = XDR_QUADLEN(write->wr_buflen) << 2;
1096 v++; 1158 if (len >= avail) {
1097 argp->rqstp->rq_vec[v].iov_base = page_address(argp->pagelist[0]); 1159 int pages;
1098 argp->pagelist++; 1160
1099 if (argp->pagelen >= PAGE_SIZE) { 1161 len -= avail;
1100 argp->rqstp->rq_vec[v].iov_len = PAGE_SIZE; 1162
1101 argp->pagelen -= PAGE_SIZE; 1163 pages = len >> PAGE_SHIFT;
1102 } else { 1164 argp->pagelist += pages;
1103 argp->rqstp->rq_vec[v].iov_len = argp->pagelen; 1165 argp->pagelen -= pages * PAGE_SIZE;
1104 argp->pagelen -= len; 1166 len -= pages * PAGE_SIZE;
1105 } 1167
1168 argp->p = (__be32 *)page_address(argp->pagelist[0]);
1169 argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
1106 } 1170 }
1107 argp->end = (__be32*) (argp->rqstp->rq_vec[v].iov_base + argp->rqstp->rq_vec[v].iov_len); 1171 argp->p += XDR_QUADLEN(len);
1108 argp->p = (__be32*) (argp->rqstp->rq_vec[v].iov_base + (XDR_QUADLEN(len) << 2));
1109 argp->rqstp->rq_vec[v].iov_len = len;
1110 write->wr_vlen = v+1;
1111 1172
1112 DECODE_TAIL; 1173 DECODE_TAIL;
1113} 1174}
@@ -1237,11 +1298,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1237 struct nfsd4_create_session *sess) 1298 struct nfsd4_create_session *sess)
1238{ 1299{
1239 DECODE_HEAD; 1300 DECODE_HEAD;
1240
1241 u32 dummy; 1301 u32 dummy;
1242 char *machine_name;
1243 int i;
1244 int nr_secflavs;
1245 1302
1246 READ_BUF(16); 1303 READ_BUF(16);
1247 COPYMEM(&sess->clientid, 8); 1304 COPYMEM(&sess->clientid, 8);
@@ -1282,58 +1339,9 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1282 goto xdr_error; 1339 goto xdr_error;
1283 } 1340 }
1284 1341
1285 READ_BUF(8); 1342 READ_BUF(4);
1286 READ32(sess->callback_prog); 1343 READ32(sess->callback_prog);
1287 1344 nfsd4_decode_cb_sec(argp, &sess->cb_sec);
1288 /* callback_sec_params4 */
1289 READ32(nr_secflavs);
1290 for (i = 0; i < nr_secflavs; ++i) {
1291 READ_BUF(4);
1292 READ32(dummy);
1293 switch (dummy) {
1294 case RPC_AUTH_NULL:
1295 /* Nothing to read */
1296 break;
1297 case RPC_AUTH_UNIX:
1298 READ_BUF(8);
1299 /* stamp */
1300 READ32(dummy);
1301
1302 /* machine name */
1303 READ32(dummy);
1304 READ_BUF(dummy);
1305 SAVEMEM(machine_name, dummy);
1306
1307 /* uid, gid */
1308 READ_BUF(8);
1309 READ32(sess->uid);
1310 READ32(sess->gid);
1311
1312 /* more gids */
1313 READ_BUF(4);
1314 READ32(dummy);
1315 READ_BUF(dummy * 4);
1316 break;
1317 case RPC_AUTH_GSS:
1318 dprintk("RPC_AUTH_GSS callback secflavor "
1319 "not supported!\n");
1320 READ_BUF(8);
1321 /* gcbp_service */
1322 READ32(dummy);
1323 /* gcbp_handle_from_server */
1324 READ32(dummy);
1325 READ_BUF(dummy);
1326 p += XDR_QUADLEN(dummy);
1327 /* gcbp_handle_from_client */
1328 READ_BUF(4);
1329 READ32(dummy);
1330 READ_BUF(dummy);
1331 break;
1332 default:
1333 dprintk("Illegal callback secflavor\n");
1334 return nfserr_inval;
1335 }
1336 }
1337 DECODE_TAIL; 1345 DECODE_TAIL;
1338} 1346}
1339 1347
@@ -1528,7 +1536,7 @@ static nfsd4_dec nfsd41_dec_ops[] = {
1528 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp, 1536 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
1529 1537
1530 /* new operations for NFSv4.1 */ 1538 /* new operations for NFSv4.1 */
1531 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp, 1539 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
1532 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session, 1540 [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
1533 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id, 1541 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
1534 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session, 1542 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
@@ -1568,12 +1576,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1568 bool cachethis = false; 1576 bool cachethis = false;
1569 int i; 1577 int i;
1570 1578
1571 /*
1572 * XXX: According to spec, we should check the tag
1573 * for UTF-8 compliance. I'm postponing this for
1574 * now because it seems that some clients do use
1575 * binary tags.
1576 */
1577 READ_BUF(4); 1579 READ_BUF(4);
1578 READ32(argp->taglen); 1580 READ32(argp->taglen);
1579 READ_BUF(argp->taglen + 8); 1581 READ_BUF(argp->taglen + 8);
@@ -1603,38 +1605,8 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1603 op = &argp->ops[i]; 1605 op = &argp->ops[i];
1604 op->replay = NULL; 1606 op->replay = NULL;
1605 1607
1606 /* 1608 READ_BUF(4);
1607 * We can't use READ_BUF() here because we need to handle 1609 READ32(op->opnum);
1608 * a missing opcode as an OP_WRITE + 1. So we need to check
1609 * to see if we're truly at the end of our buffer or if there
1610 * is another page we need to flip to.
1611 */
1612
1613 if (argp->p == argp->end) {
1614 if (argp->pagelen < 4) {
1615 /* There isn't an opcode still on the wire */
1616 op->opnum = OP_WRITE + 1;
1617 op->status = nfserr_bad_xdr;
1618 argp->opcnt = i+1;
1619 break;
1620 }
1621
1622 /*
1623 * False alarm. We just hit a page boundary, but there
1624 * is still data available. Move pointer across page
1625 * boundary. *snip from READ_BUF*
1626 */
1627 argp->p = page_address(argp->pagelist[0]);
1628 argp->pagelist++;
1629 if (argp->pagelen < PAGE_SIZE) {
1630 argp->end = argp->p + (argp->pagelen>>2);
1631 argp->pagelen = 0;
1632 } else {
1633 argp->end = argp->p + (PAGE_SIZE>>2);
1634 argp->pagelen -= PAGE_SIZE;
1635 }
1636 }
1637 op->opnum = ntohl(*argp->p++);
1638 1610
1639 if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP) 1611 if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
1640 op->status = ops->decoders[op->opnum](argp, &op->u); 1612 op->status = ops->decoders[op->opnum](argp, &op->u);
@@ -2014,6 +1986,22 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err)
2014 return 0; 1986 return 0;
2015} 1987}
2016 1988
1989
1990static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
1991{
1992 struct path path = exp->ex_path;
1993 int err;
1994
1995 path_get(&path);
1996 while (follow_up(&path)) {
1997 if (path.dentry != path.mnt->mnt_root)
1998 break;
1999 }
2000 err = vfs_getattr(path.mnt, path.dentry, stat);
2001 path_put(&path);
2002 return err;
2003}
2004
2017/* 2005/*
2018 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle 2006 * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
2019 * ourselves. 2007 * ourselves.
@@ -2048,6 +2036,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2048 .mnt = exp->ex_path.mnt, 2036 .mnt = exp->ex_path.mnt,
2049 .dentry = dentry, 2037 .dentry = dentry,
2050 }; 2038 };
2039 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2051 2040
2052 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 2041 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
2053 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); 2042 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
@@ -2208,7 +2197,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
2208 if (bmval0 & FATTR4_WORD0_LEASE_TIME) { 2197 if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
2209 if ((buflen -= 4) < 0) 2198 if ((buflen -= 4) < 0)
2210 goto out_resource; 2199 goto out_resource;
2211 WRITE32(nfsd4_lease); 2200 WRITE32(nn->nfsd4_lease);
2212 } 2201 }
2213 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { 2202 if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
2214 if ((buflen -= 4) < 0) 2203 if ((buflen -= 4) < 0)
@@ -2430,18 +2419,8 @@ out_acl:
2430 * and this is the root of a cross-mounted filesystem. 2419 * and this is the root of a cross-mounted filesystem.
2431 */ 2420 */
2432 if (ignore_crossmnt == 0 && 2421 if (ignore_crossmnt == 0 &&
2433 dentry == exp->ex_path.mnt->mnt_root) { 2422 dentry == exp->ex_path.mnt->mnt_root)
2434 struct path path = exp->ex_path; 2423 get_parent_attributes(exp, &stat);
2435 path_get(&path);
2436 while (follow_up(&path)) {
2437 if (path.dentry != path.mnt->mnt_root)
2438 break;
2439 }
2440 err = vfs_getattr(path.mnt, path.dentry, &stat);
2441 path_put(&path);
2442 if (err)
2443 goto out_nfserr;
2444 }
2445 WRITE64(stat.ino); 2424 WRITE64(stat.ino);
2446 } 2425 }
2447 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { 2426 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
@@ -2927,7 +2906,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2927 struct nfsd4_read *read) 2906 struct nfsd4_read *read)
2928{ 2907{
2929 u32 eof; 2908 u32 eof;
2930 int v, pn; 2909 int v;
2910 struct page *page;
2931 unsigned long maxcount; 2911 unsigned long maxcount;
2932 long len; 2912 long len;
2933 __be32 *p; 2913 __be32 *p;
@@ -2946,11 +2926,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
2946 len = maxcount; 2926 len = maxcount;
2947 v = 0; 2927 v = 0;
2948 while (len > 0) { 2928 while (len > 0) {
2949 pn = resp->rqstp->rq_resused++; 2929 page = *(resp->rqstp->rq_next_page);
2950 resp->rqstp->rq_vec[v].iov_base = 2930 if (!page) { /* ran out of pages */
2951 page_address(resp->rqstp->rq_respages[pn]); 2931 maxcount -= len;
2932 break;
2933 }
2934 resp->rqstp->rq_vec[v].iov_base = page_address(page);
2952 resp->rqstp->rq_vec[v].iov_len = 2935 resp->rqstp->rq_vec[v].iov_len =
2953 len < PAGE_SIZE ? len : PAGE_SIZE; 2936 len < PAGE_SIZE ? len : PAGE_SIZE;
2937 resp->rqstp->rq_next_page++;
2954 v++; 2938 v++;
2955 len -= PAGE_SIZE; 2939 len -= PAGE_SIZE;
2956 } 2940 }
@@ -2996,8 +2980,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
2996 return nfserr; 2980 return nfserr;
2997 if (resp->xbuf->page_len) 2981 if (resp->xbuf->page_len)
2998 return nfserr_resource; 2982 return nfserr_resource;
2983 if (!*resp->rqstp->rq_next_page)
2984 return nfserr_resource;
2999 2985
3000 page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); 2986 page = page_address(*(resp->rqstp->rq_next_page++));
3001 2987
3002 maxcount = PAGE_SIZE; 2988 maxcount = PAGE_SIZE;
3003 RESERVE_SPACE(4); 2989 RESERVE_SPACE(4);
@@ -3045,6 +3031,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3045 return nfserr; 3031 return nfserr;
3046 if (resp->xbuf->page_len) 3032 if (resp->xbuf->page_len)
3047 return nfserr_resource; 3033 return nfserr_resource;
3034 if (!*resp->rqstp->rq_next_page)
3035 return nfserr_resource;
3048 3036
3049 RESERVE_SPACE(NFS4_VERIFIER_SIZE); 3037 RESERVE_SPACE(NFS4_VERIFIER_SIZE);
3050 savep = p; 3038 savep = p;
@@ -3071,7 +3059,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3071 goto err_no_verf; 3059 goto err_no_verf;
3072 } 3060 }
3073 3061
3074 page = page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused++]); 3062 page = page_address(*(resp->rqstp->rq_next_page++));
3075 readdir->common.err = 0; 3063 readdir->common.err = 0;
3076 readdir->buflen = maxcount; 3064 readdir->buflen = maxcount;
3077 readdir->buffer = page; 3065 readdir->buffer = page;
@@ -3094,8 +3082,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3094 p = readdir->buffer; 3082 p = readdir->buffer;
3095 *p++ = 0; /* no more entries */ 3083 *p++ = 0; /* no more entries */
3096 *p++ = htonl(readdir->common.err == nfserr_eof); 3084 *p++ = htonl(readdir->common.err == nfserr_eof);
3097 resp->xbuf->page_len = ((char*)p) - (char*)page_address( 3085 resp->xbuf->page_len = ((char*)p) -
3098 resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); 3086 (char*)page_address(*(resp->rqstp->rq_next_page-1));
3099 3087
3100 /* Use rest of head for padding and remaining ops: */ 3088 /* Use rest of head for padding and remaining ops: */
3101 resp->xbuf->tail[0].iov_base = tailbase; 3089 resp->xbuf->tail[0].iov_base = tailbase;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index dab350dfc376..74934284d9a7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -19,7 +19,7 @@
19#include "idmap.h" 19#include "idmap.h"
20#include "nfsd.h" 20#include "nfsd.h"
21#include "cache.h" 21#include "cache.h"
22#include "fault_inject.h" 22#include "state.h"
23#include "netns.h" 23#include "netns.h"
24 24
25/* 25/*
@@ -186,9 +186,6 @@ static struct file_operations supported_enctypes_ops = {
186}; 186};
187#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ 187#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
188 188
189extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
190extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
191
192static const struct file_operations pool_stats_operations = { 189static const struct file_operations pool_stats_operations = {
193 .open = nfsd_pool_stats_open, 190 .open = nfsd_pool_stats_open,
194 .read = seq_read, 191 .read = seq_read,
@@ -399,6 +396,8 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
399{ 396{
400 char *mesg = buf; 397 char *mesg = buf;
401 int rv; 398 int rv;
399 struct net *net = &init_net;
400
402 if (size > 0) { 401 if (size > 0) {
403 int newthreads; 402 int newthreads;
404 rv = get_int(&mesg, &newthreads); 403 rv = get_int(&mesg, &newthreads);
@@ -406,11 +405,11 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size)
406 return rv; 405 return rv;
407 if (newthreads < 0) 406 if (newthreads < 0)
408 return -EINVAL; 407 return -EINVAL;
409 rv = nfsd_svc(newthreads); 408 rv = nfsd_svc(newthreads, net);
410 if (rv < 0) 409 if (rv < 0)
411 return rv; 410 return rv;
412 } else 411 } else
413 rv = nfsd_nrthreads(); 412 rv = nfsd_nrthreads(net);
414 413
415 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv); 414 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%d\n", rv);
416} 415}
@@ -448,9 +447,10 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
448 int len; 447 int len;
449 int npools; 448 int npools;
450 int *nthreads; 449 int *nthreads;
450 struct net *net = &init_net;
451 451
452 mutex_lock(&nfsd_mutex); 452 mutex_lock(&nfsd_mutex);
453 npools = nfsd_nrpools(); 453 npools = nfsd_nrpools(net);
454 if (npools == 0) { 454 if (npools == 0) {
455 /* 455 /*
456 * NFS is shut down. The admin can start it by 456 * NFS is shut down. The admin can start it by
@@ -478,12 +478,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size)
478 if (nthreads[i] < 0) 478 if (nthreads[i] < 0)
479 goto out_free; 479 goto out_free;
480 } 480 }
481 rv = nfsd_set_nrthreads(i, nthreads); 481 rv = nfsd_set_nrthreads(i, nthreads, net);
482 if (rv) 482 if (rv)
483 goto out_free; 483 goto out_free;
484 } 484 }
485 485
486 rv = nfsd_get_nrthreads(npools, nthreads); 486 rv = nfsd_get_nrthreads(npools, nthreads, net);
487 if (rv) 487 if (rv)
488 goto out_free; 488 goto out_free;
489 489
@@ -510,11 +510,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
510 unsigned minor; 510 unsigned minor;
511 ssize_t tlen = 0; 511 ssize_t tlen = 0;
512 char *sep; 512 char *sep;
513 struct net *net = &init_net;
514 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
513 515
514 if (size>0) { 516 if (size>0) {
515 if (nfsd_serv) 517 if (nn->nfsd_serv)
516 /* Cannot change versions without updating 518 /* Cannot change versions without updating
517 * nfsd_serv->sv_xdrsize, and reallocing 519 * nn->nfsd_serv->sv_xdrsize, and reallocing
518 * rq_argp and rq_resp 520 * rq_argp and rq_resp
519 */ 521 */
520 return -EBUSY; 522 return -EBUSY;
@@ -645,11 +647,13 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size)
645 * Zero-length write. Return a list of NFSD's current listener 647 * Zero-length write. Return a list of NFSD's current listener
646 * transports. 648 * transports.
647 */ 649 */
648static ssize_t __write_ports_names(char *buf) 650static ssize_t __write_ports_names(char *buf, struct net *net)
649{ 651{
650 if (nfsd_serv == NULL) 652 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
653
654 if (nn->nfsd_serv == NULL)
651 return 0; 655 return 0;
652 return svc_xprt_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT); 656 return svc_xprt_names(nn->nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT);
653} 657}
654 658
655/* 659/*
@@ -657,28 +661,28 @@ static ssize_t __write_ports_names(char *buf)
657 * a socket of a supported family/protocol, and we use it as an 661 * a socket of a supported family/protocol, and we use it as an
658 * nfsd listener. 662 * nfsd listener.
659 */ 663 */
660static ssize_t __write_ports_addfd(char *buf) 664static ssize_t __write_ports_addfd(char *buf, struct net *net)
661{ 665{
662 char *mesg = buf; 666 char *mesg = buf;
663 int fd, err; 667 int fd, err;
664 struct net *net = &init_net; 668 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
665 669
666 err = get_int(&mesg, &fd); 670 err = get_int(&mesg, &fd);
667 if (err != 0 || fd < 0) 671 if (err != 0 || fd < 0)
668 return -EINVAL; 672 return -EINVAL;
669 673
670 err = nfsd_create_serv(); 674 err = nfsd_create_serv(net);
671 if (err != 0) 675 if (err != 0)
672 return err; 676 return err;
673 677
674 err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); 678 err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
675 if (err < 0) { 679 if (err < 0) {
676 nfsd_destroy(net); 680 nfsd_destroy(net);
677 return err; 681 return err;
678 } 682 }
679 683
680 /* Decrease the count, but don't shut down the service */ 684 /* Decrease the count, but don't shut down the service */
681 nfsd_serv->sv_nrthreads--; 685 nn->nfsd_serv->sv_nrthreads--;
682 return err; 686 return err;
683} 687}
684 688
@@ -686,12 +690,12 @@ static ssize_t __write_ports_addfd(char *buf)
686 * A transport listener is added by writing it's transport name and 690 * A transport listener is added by writing it's transport name and
687 * a port number. 691 * a port number.
688 */ 692 */
689static ssize_t __write_ports_addxprt(char *buf) 693static ssize_t __write_ports_addxprt(char *buf, struct net *net)
690{ 694{
691 char transport[16]; 695 char transport[16];
692 struct svc_xprt *xprt; 696 struct svc_xprt *xprt;
693 int port, err; 697 int port, err;
694 struct net *net = &init_net; 698 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
695 699
696 if (sscanf(buf, "%15s %5u", transport, &port) != 2) 700 if (sscanf(buf, "%15s %5u", transport, &port) != 2)
697 return -EINVAL; 701 return -EINVAL;
@@ -699,25 +703,25 @@ static ssize_t __write_ports_addxprt(char *buf)
699 if (port < 1 || port > USHRT_MAX) 703 if (port < 1 || port > USHRT_MAX)
700 return -EINVAL; 704 return -EINVAL;
701 705
702 err = nfsd_create_serv(); 706 err = nfsd_create_serv(net);
703 if (err != 0) 707 if (err != 0)
704 return err; 708 return err;
705 709
706 err = svc_create_xprt(nfsd_serv, transport, net, 710 err = svc_create_xprt(nn->nfsd_serv, transport, net,
707 PF_INET, port, SVC_SOCK_ANONYMOUS); 711 PF_INET, port, SVC_SOCK_ANONYMOUS);
708 if (err < 0) 712 if (err < 0)
709 goto out_err; 713 goto out_err;
710 714
711 err = svc_create_xprt(nfsd_serv, transport, net, 715 err = svc_create_xprt(nn->nfsd_serv, transport, net,
712 PF_INET6, port, SVC_SOCK_ANONYMOUS); 716 PF_INET6, port, SVC_SOCK_ANONYMOUS);
713 if (err < 0 && err != -EAFNOSUPPORT) 717 if (err < 0 && err != -EAFNOSUPPORT)
714 goto out_close; 718 goto out_close;
715 719
716 /* Decrease the count, but don't shut down the service */ 720 /* Decrease the count, but don't shut down the service */
717 nfsd_serv->sv_nrthreads--; 721 nn->nfsd_serv->sv_nrthreads--;
718 return 0; 722 return 0;
719out_close: 723out_close:
720 xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port); 724 xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
721 if (xprt != NULL) { 725 if (xprt != NULL) {
722 svc_close_xprt(xprt); 726 svc_close_xprt(xprt);
723 svc_xprt_put(xprt); 727 svc_xprt_put(xprt);
@@ -727,16 +731,17 @@ out_err:
727 return err; 731 return err;
728} 732}
729 733
730static ssize_t __write_ports(struct file *file, char *buf, size_t size) 734static ssize_t __write_ports(struct file *file, char *buf, size_t size,
735 struct net *net)
731{ 736{
732 if (size == 0) 737 if (size == 0)
733 return __write_ports_names(buf); 738 return __write_ports_names(buf, net);
734 739
735 if (isdigit(buf[0])) 740 if (isdigit(buf[0]))
736 return __write_ports_addfd(buf); 741 return __write_ports_addfd(buf, net);
737 742
738 if (isalpha(buf[0])) 743 if (isalpha(buf[0]))
739 return __write_ports_addxprt(buf); 744 return __write_ports_addxprt(buf, net);
740 745
741 return -EINVAL; 746 return -EINVAL;
742} 747}
@@ -787,9 +792,10 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
787static ssize_t write_ports(struct file *file, char *buf, size_t size) 792static ssize_t write_ports(struct file *file, char *buf, size_t size)
788{ 793{
789 ssize_t rv; 794 ssize_t rv;
795 struct net *net = &init_net;
790 796
791 mutex_lock(&nfsd_mutex); 797 mutex_lock(&nfsd_mutex);
792 rv = __write_ports(file, buf, size); 798 rv = __write_ports(file, buf, size, net);
793 mutex_unlock(&nfsd_mutex); 799 mutex_unlock(&nfsd_mutex);
794 return rv; 800 return rv;
795} 801}
@@ -821,6 +827,9 @@ int nfsd_max_blksize;
821static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) 827static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
822{ 828{
823 char *mesg = buf; 829 char *mesg = buf;
830 struct net *net = &init_net;
831 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
832
824 if (size > 0) { 833 if (size > 0) {
825 int bsize; 834 int bsize;
826 int rv = get_int(&mesg, &bsize); 835 int rv = get_int(&mesg, &bsize);
@@ -835,7 +844,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
835 bsize = NFSSVC_MAXBLKSIZE; 844 bsize = NFSSVC_MAXBLKSIZE;
836 bsize &= ~(1024-1); 845 bsize &= ~(1024-1);
837 mutex_lock(&nfsd_mutex); 846 mutex_lock(&nfsd_mutex);
838 if (nfsd_serv) { 847 if (nn->nfsd_serv) {
839 mutex_unlock(&nfsd_mutex); 848 mutex_unlock(&nfsd_mutex);
840 return -EBUSY; 849 return -EBUSY;
841 } 850 }
@@ -848,13 +857,14 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
848} 857}
849 858
850#ifdef CONFIG_NFSD_V4 859#ifdef CONFIG_NFSD_V4
851static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) 860static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
861 time_t *time, struct nfsd_net *nn)
852{ 862{
853 char *mesg = buf; 863 char *mesg = buf;
854 int rv, i; 864 int rv, i;
855 865
856 if (size > 0) { 866 if (size > 0) {
857 if (nfsd_serv) 867 if (nn->nfsd_serv)
858 return -EBUSY; 868 return -EBUSY;
859 rv = get_int(&mesg, &i); 869 rv = get_int(&mesg, &i);
860 if (rv) 870 if (rv)
@@ -879,12 +889,13 @@ static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, tim
879 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time); 889 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%ld\n", *time);
880} 890}
881 891
882static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_t *time) 892static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size,
893 time_t *time, struct nfsd_net *nn)
883{ 894{
884 ssize_t rv; 895 ssize_t rv;
885 896
886 mutex_lock(&nfsd_mutex); 897 mutex_lock(&nfsd_mutex);
887 rv = __nfsd4_write_time(file, buf, size, time); 898 rv = __nfsd4_write_time(file, buf, size, time, nn);
888 mutex_unlock(&nfsd_mutex); 899 mutex_unlock(&nfsd_mutex);
889 return rv; 900 return rv;
890} 901}
@@ -912,7 +923,8 @@ static ssize_t nfsd4_write_time(struct file *file, char *buf, size_t size, time_
912 */ 923 */
913static ssize_t write_leasetime(struct file *file, char *buf, size_t size) 924static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
914{ 925{
915 return nfsd4_write_time(file, buf, size, &nfsd4_lease); 926 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
927 return nfsd4_write_time(file, buf, size, &nn->nfsd4_lease, nn);
916} 928}
917 929
918/** 930/**
@@ -927,17 +939,19 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
927 */ 939 */
928static ssize_t write_gracetime(struct file *file, char *buf, size_t size) 940static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
929{ 941{
930 return nfsd4_write_time(file, buf, size, &nfsd4_grace); 942 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
943 return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
931} 944}
932 945
933static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) 946static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
947 struct nfsd_net *nn)
934{ 948{
935 char *mesg = buf; 949 char *mesg = buf;
936 char *recdir; 950 char *recdir;
937 int len, status; 951 int len, status;
938 952
939 if (size > 0) { 953 if (size > 0) {
940 if (nfsd_serv) 954 if (nn->nfsd_serv)
941 return -EBUSY; 955 return -EBUSY;
942 if (size > PATH_MAX || buf[size-1] != '\n') 956 if (size > PATH_MAX || buf[size-1] != '\n')
943 return -EINVAL; 957 return -EINVAL;
@@ -981,9 +995,10 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size)
981static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) 995static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
982{ 996{
983 ssize_t rv; 997 ssize_t rv;
998 struct nfsd_net *nn = net_generic(&init_net, nfsd_net_id);
984 999
985 mutex_lock(&nfsd_mutex); 1000 mutex_lock(&nfsd_mutex);
986 rv = __write_recoverydir(file, buf, size); 1001 rv = __write_recoverydir(file, buf, size, nn);
987 mutex_unlock(&nfsd_mutex); 1002 mutex_unlock(&nfsd_mutex);
988 return rv; 1003 return rv;
989} 1004}
@@ -1063,6 +1078,7 @@ int nfsd_net_id;
1063static __net_init int nfsd_init_net(struct net *net) 1078static __net_init int nfsd_init_net(struct net *net)
1064{ 1079{
1065 int retval; 1080 int retval;
1081 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1066 1082
1067 retval = nfsd_export_init(net); 1083 retval = nfsd_export_init(net);
1068 if (retval) 1084 if (retval)
@@ -1070,6 +1086,8 @@ static __net_init int nfsd_init_net(struct net *net)
1070 retval = nfsd_idmap_init(net); 1086 retval = nfsd_idmap_init(net);
1071 if (retval) 1087 if (retval)
1072 goto out_idmap_error; 1088 goto out_idmap_error;
1089 nn->nfsd4_lease = 90; /* default lease time */
1090 nn->nfsd4_grace = 90;
1073 return 0; 1091 return 0;
1074 1092
1075out_idmap_error: 1093out_idmap_error:
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 80d5ce40aadb..de23db255c69 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -55,7 +55,6 @@ extern struct svc_version nfsd_version2, nfsd_version3,
55 nfsd_version4; 55 nfsd_version4;
56extern u32 nfsd_supported_minorversion; 56extern u32 nfsd_supported_minorversion;
57extern struct mutex nfsd_mutex; 57extern struct mutex nfsd_mutex;
58extern struct svc_serv *nfsd_serv;
59extern spinlock_t nfsd_drc_lock; 58extern spinlock_t nfsd_drc_lock;
60extern unsigned int nfsd_drc_max_mem; 59extern unsigned int nfsd_drc_max_mem;
61extern unsigned int nfsd_drc_mem_used; 60extern unsigned int nfsd_drc_mem_used;
@@ -65,26 +64,17 @@ extern const struct seq_operations nfs_exports_op;
65/* 64/*
66 * Function prototypes. 65 * Function prototypes.
67 */ 66 */
68int nfsd_svc(int nrservs); 67int nfsd_svc(int nrservs, struct net *net);
69int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp); 68int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
70 69
71int nfsd_nrthreads(void); 70int nfsd_nrthreads(struct net *);
72int nfsd_nrpools(void); 71int nfsd_nrpools(struct net *);
73int nfsd_get_nrthreads(int n, int *); 72int nfsd_get_nrthreads(int n, int *, struct net *);
74int nfsd_set_nrthreads(int n, int *); 73int nfsd_set_nrthreads(int n, int *, struct net *);
75int nfsd_pool_stats_open(struct inode *, struct file *); 74int nfsd_pool_stats_open(struct inode *, struct file *);
76int nfsd_pool_stats_release(struct inode *, struct file *); 75int nfsd_pool_stats_release(struct inode *, struct file *);
77 76
78static inline void nfsd_destroy(struct net *net) 77void nfsd_destroy(struct net *net);
79{
80 int destroy = (nfsd_serv->sv_nrthreads == 1);
81
82 if (destroy)
83 svc_shutdown_net(nfsd_serv, net);
84 svc_destroy(nfsd_serv);
85 if (destroy)
86 nfsd_serv = NULL;
87}
88 78
89#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) 79#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
90#ifdef CONFIG_NFSD_V2_ACL 80#ifdef CONFIG_NFSD_V2_ACL
@@ -103,7 +93,7 @@ enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
103int nfsd_vers(int vers, enum vers_op change); 93int nfsd_vers(int vers, enum vers_op change);
104int nfsd_minorversion(u32 minorversion, enum vers_op change); 94int nfsd_minorversion(u32 minorversion, enum vers_op change);
105void nfsd_reset_versions(void); 95void nfsd_reset_versions(void);
106int nfsd_create_serv(void); 96int nfsd_create_serv(struct net *net);
107 97
108extern int nfsd_max_blksize; 98extern int nfsd_max_blksize;
109 99
@@ -121,7 +111,9 @@ void nfs4_state_init(void);
121int nfsd4_init_slabs(void); 111int nfsd4_init_slabs(void);
122void nfsd4_free_slabs(void); 112void nfsd4_free_slabs(void);
123int nfs4_state_start(void); 113int nfs4_state_start(void);
114int nfs4_state_start_net(struct net *net);
124void nfs4_state_shutdown(void); 115void nfs4_state_shutdown(void);
116void nfs4_state_shutdown_net(struct net *net);
125void nfs4_reset_lease(time_t leasetime); 117void nfs4_reset_lease(time_t leasetime);
126int nfs4_reset_recoverydir(char *recdir); 118int nfs4_reset_recoverydir(char *recdir);
127char * nfs4_recoverydir(void); 119char * nfs4_recoverydir(void);
@@ -130,7 +122,9 @@ static inline void nfs4_state_init(void) { }
130static inline int nfsd4_init_slabs(void) { return 0; } 122static inline int nfsd4_init_slabs(void) { return 0; }
131static inline void nfsd4_free_slabs(void) { } 123static inline void nfsd4_free_slabs(void) { }
132static inline int nfs4_state_start(void) { return 0; } 124static inline int nfs4_state_start(void) { return 0; }
125static inline int nfs4_state_start_net(struct net *net) { return 0; }
133static inline void nfs4_state_shutdown(void) { } 126static inline void nfs4_state_shutdown(void) { }
127static inline void nfs4_state_shutdown_net(struct net *net) { }
134static inline void nfs4_reset_lease(time_t leasetime) { } 128static inline void nfs4_reset_lease(time_t leasetime) { }
135static inline int nfs4_reset_recoverydir(char *recdir) { return 0; } 129static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
136static inline char * nfs4_recoverydir(void) {return NULL; } 130static inline char * nfs4_recoverydir(void) {return NULL; }
@@ -265,16 +259,8 @@ void nfsd_lockd_shutdown(void);
265/* Check for dir entries '.' and '..' */ 259/* Check for dir entries '.' and '..' */
266#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.')) 260#define isdotent(n, l) (l < 3 && n[0] == '.' && (l == 1 || n[1] == '.'))
267 261
268/*
269 * Time of server startup
270 */
271extern struct timeval nfssvc_boot;
272
273#ifdef CONFIG_NFSD_V4 262#ifdef CONFIG_NFSD_V4
274 263
275extern time_t nfsd4_lease;
276extern time_t nfsd4_grace;
277
278/* before processing a COMPOUND operation, we have to check that there 264/* before processing a COMPOUND operation, we have to check that there
279 * is enough space in the buffer for XDR encode to succeed. otherwise, 265 * is enough space in the buffer for XDR encode to succeed. otherwise,
280 * we might process an operation with side effects, and be unable to 266 * we might process an operation with side effects, and be unable to
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 032af381b3aa..814afaa4458a 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -572,7 +572,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
572 572
573 if (inode) 573 if (inode)
574 _fh_update(fhp, exp, dentry); 574 _fh_update(fhp, exp, dentry);
575 if (fhp->fh_handle.fh_fileid_type == 255) { 575 if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
576 fh_put(fhp); 576 fh_put(fhp);
577 return nfserr_opnotsupp; 577 return nfserr_opnotsupp;
578 } 578 }
@@ -603,7 +603,7 @@ fh_update(struct svc_fh *fhp)
603 goto out; 603 goto out;
604 604
605 _fh_update(fhp, fhp->fh_export, dentry); 605 _fh_update(fhp, fhp->fh_export, dentry);
606 if (fhp->fh_handle.fh_fileid_type == 255) 606 if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
607 return nfserr_opnotsupp; 607 return nfserr_opnotsupp;
608 } 608 }
609out: 609out:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 2013aa001dab..cee62ab9d4a3 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -11,7 +11,6 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/fs_struct.h> 12#include <linux/fs_struct.h>
13#include <linux/swap.h> 13#include <linux/swap.h>
14#include <linux/nsproxy.h>
15 14
16#include <linux/sunrpc/stats.h> 15#include <linux/sunrpc/stats.h>
17#include <linux/sunrpc/svcsock.h> 16#include <linux/sunrpc/svcsock.h>
@@ -22,19 +21,19 @@
22#include "nfsd.h" 21#include "nfsd.h"
23#include "cache.h" 22#include "cache.h"
24#include "vfs.h" 23#include "vfs.h"
24#include "netns.h"
25 25
26#define NFSDDBG_FACILITY NFSDDBG_SVC 26#define NFSDDBG_FACILITY NFSDDBG_SVC
27 27
28extern struct svc_program nfsd_program; 28extern struct svc_program nfsd_program;
29static int nfsd(void *vrqstp); 29static int nfsd(void *vrqstp);
30struct timeval nfssvc_boot;
31 30
32/* 31/*
33 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 32 * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
34 * of the svc_serv struct. In particular, ->sv_nrthreads but also to some 33 * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
35 * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt 34 * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
36 * 35 *
37 * If (out side the lock) nfsd_serv is non-NULL, then it must point to a 36 * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
38 * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number 37 * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
39 * of nfsd threads must exist and each must listed in ->sp_all_threads in each 38 * of nfsd threads must exist and each must listed in ->sp_all_threads in each
40 * entry of ->sv_pools[]. 39 * entry of ->sv_pools[].
@@ -52,7 +51,6 @@ struct timeval nfssvc_boot;
52 * nfsd_versions 51 * nfsd_versions
53 */ 52 */
54DEFINE_MUTEX(nfsd_mutex); 53DEFINE_MUTEX(nfsd_mutex);
55struct svc_serv *nfsd_serv;
56 54
57/* 55/*
58 * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used. 56 * nfsd_drc_lock protects nfsd_drc_max_pages and nfsd_drc_pages_used.
@@ -173,28 +171,32 @@ int nfsd_minorversion(u32 minorversion, enum vers_op change)
173 */ 171 */
174#define NFSD_MAXSERVS 8192 172#define NFSD_MAXSERVS 8192
175 173
176int nfsd_nrthreads(void) 174int nfsd_nrthreads(struct net *net)
177{ 175{
178 int rv = 0; 176 int rv = 0;
177 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
178
179 mutex_lock(&nfsd_mutex); 179 mutex_lock(&nfsd_mutex);
180 if (nfsd_serv) 180 if (nn->nfsd_serv)
181 rv = nfsd_serv->sv_nrthreads; 181 rv = nn->nfsd_serv->sv_nrthreads;
182 mutex_unlock(&nfsd_mutex); 182 mutex_unlock(&nfsd_mutex);
183 return rv; 183 return rv;
184} 184}
185 185
186static int nfsd_init_socks(void) 186static int nfsd_init_socks(struct net *net)
187{ 187{
188 int error; 188 int error;
189 if (!list_empty(&nfsd_serv->sv_permsocks)) 189 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
190
191 if (!list_empty(&nn->nfsd_serv->sv_permsocks))
190 return 0; 192 return 0;
191 193
192 error = svc_create_xprt(nfsd_serv, "udp", &init_net, PF_INET, NFS_PORT, 194 error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
193 SVC_SOCK_DEFAULTS); 195 SVC_SOCK_DEFAULTS);
194 if (error < 0) 196 if (error < 0)
195 return error; 197 return error;
196 198
197 error = svc_create_xprt(nfsd_serv, "tcp", &init_net, PF_INET, NFS_PORT, 199 error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
198 SVC_SOCK_DEFAULTS); 200 SVC_SOCK_DEFAULTS);
199 if (error < 0) 201 if (error < 0)
200 return error; 202 return error;
@@ -202,14 +204,15 @@ static int nfsd_init_socks(void)
202 return 0; 204 return 0;
203} 205}
204 206
205static bool nfsd_up = false; 207static int nfsd_users = 0;
206 208
207static int nfsd_startup(int nrservs) 209static int nfsd_startup_generic(int nrservs)
208{ 210{
209 int ret; 211 int ret;
210 212
211 if (nfsd_up) 213 if (nfsd_users++)
212 return 0; 214 return 0;
215
213 /* 216 /*
214 * Readahead param cache - will no-op if it already exists. 217 * Readahead param cache - will no-op if it already exists.
215 * (Note therefore results will be suboptimal if number of 218 * (Note therefore results will be suboptimal if number of
@@ -218,43 +221,79 @@ static int nfsd_startup(int nrservs)
218 ret = nfsd_racache_init(2*nrservs); 221 ret = nfsd_racache_init(2*nrservs);
219 if (ret) 222 if (ret)
220 return ret; 223 return ret;
221 ret = nfsd_init_socks(); 224 ret = nfs4_state_start();
222 if (ret) 225 if (ret)
223 goto out_racache; 226 goto out_racache;
224 ret = lockd_up(&init_net); 227 return 0;
228
229out_racache:
230 nfsd_racache_shutdown();
231 return ret;
232}
233
234static void nfsd_shutdown_generic(void)
235{
236 if (--nfsd_users)
237 return;
238
239 nfs4_state_shutdown();
240 nfsd_racache_shutdown();
241}
242
243static int nfsd_startup_net(int nrservs, struct net *net)
244{
245 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
246 int ret;
247
248 if (nn->nfsd_net_up)
249 return 0;
250
251 ret = nfsd_startup_generic(nrservs);
225 if (ret) 252 if (ret)
226 goto out_racache; 253 return ret;
227 ret = nfs4_state_start(); 254 ret = nfsd_init_socks(net);
255 if (ret)
256 goto out_socks;
257 ret = lockd_up(net);
258 if (ret)
259 goto out_socks;
260 ret = nfs4_state_start_net(net);
228 if (ret) 261 if (ret)
229 goto out_lockd; 262 goto out_lockd;
230 nfsd_up = true; 263
264 nn->nfsd_net_up = true;
231 return 0; 265 return 0;
266
232out_lockd: 267out_lockd:
233 lockd_down(&init_net); 268 lockd_down(net);
234out_racache: 269out_socks:
235 nfsd_racache_shutdown(); 270 nfsd_shutdown_generic();
236 return ret; 271 return ret;
237} 272}
238 273
239static void nfsd_shutdown(void) 274static void nfsd_shutdown_net(struct net *net)
240{ 275{
276 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
277
278 nfs4_state_shutdown_net(net);
279 lockd_down(net);
280 nn->nfsd_net_up = false;
281 nfsd_shutdown_generic();
282}
283
284static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
285{
286 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
287
241 /* 288 /*
242 * write_ports can create the server without actually starting 289 * write_ports can create the server without actually starting
243 * any threads--if we get shut down before any threads are 290 * any threads--if we get shut down before any threads are
244 * started, then nfsd_last_thread will be run before any of this 291 * started, then nfsd_last_thread will be run before any of this
245 * other initialization has been done. 292 * other initialization has been done.
246 */ 293 */
247 if (!nfsd_up) 294 if (!nn->nfsd_net_up)
248 return; 295 return;
249 nfs4_state_shutdown(); 296 nfsd_shutdown_net(net);
250 lockd_down(&init_net);
251 nfsd_racache_shutdown();
252 nfsd_up = false;
253}
254
255static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
256{
257 nfsd_shutdown();
258 297
259 svc_rpcb_cleanup(serv, net); 298 svc_rpcb_cleanup(serv, net);
260 299
@@ -327,69 +366,84 @@ static int nfsd_get_default_max_blksize(void)
327 return ret; 366 return ret;
328} 367}
329 368
330int nfsd_create_serv(void) 369int nfsd_create_serv(struct net *net)
331{ 370{
332 int error; 371 int error;
333 struct net *net = current->nsproxy->net_ns; 372 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
334 373
335 WARN_ON(!mutex_is_locked(&nfsd_mutex)); 374 WARN_ON(!mutex_is_locked(&nfsd_mutex));
336 if (nfsd_serv) { 375 if (nn->nfsd_serv) {
337 svc_get(nfsd_serv); 376 svc_get(nn->nfsd_serv);
338 return 0; 377 return 0;
339 } 378 }
340 if (nfsd_max_blksize == 0) 379 if (nfsd_max_blksize == 0)
341 nfsd_max_blksize = nfsd_get_default_max_blksize(); 380 nfsd_max_blksize = nfsd_get_default_max_blksize();
342 nfsd_reset_versions(); 381 nfsd_reset_versions();
343 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 382 nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
344 nfsd_last_thread, nfsd, THIS_MODULE); 383 nfsd_last_thread, nfsd, THIS_MODULE);
345 if (nfsd_serv == NULL) 384 if (nn->nfsd_serv == NULL)
346 return -ENOMEM; 385 return -ENOMEM;
347 386
348 error = svc_bind(nfsd_serv, net); 387 error = svc_bind(nn->nfsd_serv, net);
349 if (error < 0) { 388 if (error < 0) {
350 svc_destroy(nfsd_serv); 389 svc_destroy(nn->nfsd_serv);
351 return error; 390 return error;
352 } 391 }
353 392
354 set_max_drc(); 393 set_max_drc();
355 do_gettimeofday(&nfssvc_boot); /* record boot time */ 394 do_gettimeofday(&nn->nfssvc_boot); /* record boot time */
356 return 0; 395 return 0;
357} 396}
358 397
359int nfsd_nrpools(void) 398int nfsd_nrpools(struct net *net)
360{ 399{
361 if (nfsd_serv == NULL) 400 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
401
402 if (nn->nfsd_serv == NULL)
362 return 0; 403 return 0;
363 else 404 else
364 return nfsd_serv->sv_nrpools; 405 return nn->nfsd_serv->sv_nrpools;
365} 406}
366 407
367int nfsd_get_nrthreads(int n, int *nthreads) 408int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
368{ 409{
369 int i = 0; 410 int i = 0;
411 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
370 412
371 if (nfsd_serv != NULL) { 413 if (nn->nfsd_serv != NULL) {
372 for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++) 414 for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++)
373 nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads; 415 nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads;
374 } 416 }
375 417
376 return 0; 418 return 0;
377} 419}
378 420
379int nfsd_set_nrthreads(int n, int *nthreads) 421void nfsd_destroy(struct net *net)
422{
423 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
424 int destroy = (nn->nfsd_serv->sv_nrthreads == 1);
425
426 if (destroy)
427 svc_shutdown_net(nn->nfsd_serv, net);
428 svc_destroy(nn->nfsd_serv);
429 if (destroy)
430 nn->nfsd_serv = NULL;
431}
432
433int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
380{ 434{
381 int i = 0; 435 int i = 0;
382 int tot = 0; 436 int tot = 0;
383 int err = 0; 437 int err = 0;
384 struct net *net = &init_net; 438 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
385 439
386 WARN_ON(!mutex_is_locked(&nfsd_mutex)); 440 WARN_ON(!mutex_is_locked(&nfsd_mutex));
387 441
388 if (nfsd_serv == NULL || n <= 0) 442 if (nn->nfsd_serv == NULL || n <= 0)
389 return 0; 443 return 0;
390 444
391 if (n > nfsd_serv->sv_nrpools) 445 if (n > nn->nfsd_serv->sv_nrpools)
392 n = nfsd_serv->sv_nrpools; 446 n = nn->nfsd_serv->sv_nrpools;
393 447
394 /* enforce a global maximum number of threads */ 448 /* enforce a global maximum number of threads */
395 tot = 0; 449 tot = 0;
@@ -419,9 +473,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
419 nthreads[0] = 1; 473 nthreads[0] = 1;
420 474
421 /* apply the new numbers */ 475 /* apply the new numbers */
422 svc_get(nfsd_serv); 476 svc_get(nn->nfsd_serv);
423 for (i = 0; i < n; i++) { 477 for (i = 0; i < n; i++) {
424 err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], 478 err = svc_set_num_threads(nn->nfsd_serv, &nn->nfsd_serv->sv_pools[i],
425 nthreads[i]); 479 nthreads[i]);
426 if (err) 480 if (err)
427 break; 481 break;
@@ -436,11 +490,11 @@ int nfsd_set_nrthreads(int n, int *nthreads)
436 * this is the first time nrservs is nonzero. 490 * this is the first time nrservs is nonzero.
437 */ 491 */
438int 492int
439nfsd_svc(int nrservs) 493nfsd_svc(int nrservs, struct net *net)
440{ 494{
441 int error; 495 int error;
442 bool nfsd_up_before; 496 bool nfsd_up_before;
443 struct net *net = &init_net; 497 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
444 498
445 mutex_lock(&nfsd_mutex); 499 mutex_lock(&nfsd_mutex);
446 dprintk("nfsd: creating service\n"); 500 dprintk("nfsd: creating service\n");
@@ -449,29 +503,29 @@ nfsd_svc(int nrservs)
449 if (nrservs > NFSD_MAXSERVS) 503 if (nrservs > NFSD_MAXSERVS)
450 nrservs = NFSD_MAXSERVS; 504 nrservs = NFSD_MAXSERVS;
451 error = 0; 505 error = 0;
452 if (nrservs == 0 && nfsd_serv == NULL) 506 if (nrservs == 0 && nn->nfsd_serv == NULL)
453 goto out; 507 goto out;
454 508
455 error = nfsd_create_serv(); 509 error = nfsd_create_serv(net);
456 if (error) 510 if (error)
457 goto out; 511 goto out;
458 512
459 nfsd_up_before = nfsd_up; 513 nfsd_up_before = nn->nfsd_net_up;
460 514
461 error = nfsd_startup(nrservs); 515 error = nfsd_startup_net(nrservs, net);
462 if (error) 516 if (error)
463 goto out_destroy; 517 goto out_destroy;
464 error = svc_set_num_threads(nfsd_serv, NULL, nrservs); 518 error = svc_set_num_threads(nn->nfsd_serv, NULL, nrservs);
465 if (error) 519 if (error)
466 goto out_shutdown; 520 goto out_shutdown;
467 /* We are holding a reference to nfsd_serv which 521 /* We are holding a reference to nn->nfsd_serv which
468 * we don't want to count in the return value, 522 * we don't want to count in the return value,
469 * so subtract 1 523 * so subtract 1
470 */ 524 */
471 error = nfsd_serv->sv_nrthreads - 1; 525 error = nn->nfsd_serv->sv_nrthreads - 1;
472out_shutdown: 526out_shutdown:
473 if (error < 0 && !nfsd_up_before) 527 if (error < 0 && !nfsd_up_before)
474 nfsd_shutdown(); 528 nfsd_shutdown_net(net);
475out_destroy: 529out_destroy:
476 nfsd_destroy(net); /* Release server */ 530 nfsd_destroy(net); /* Release server */
477out: 531out:
@@ -487,6 +541,8 @@ static int
487nfsd(void *vrqstp) 541nfsd(void *vrqstp)
488{ 542{
489 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; 543 struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
544 struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
545 struct net *net = perm_sock->xpt_net;
490 int err; 546 int err;
491 547
492 /* Lock module and set up kernel thread */ 548 /* Lock module and set up kernel thread */
@@ -551,7 +607,7 @@ out:
551 /* Release the thread */ 607 /* Release the thread */
552 svc_exit_thread(rqstp); 608 svc_exit_thread(rqstp);
553 609
554 nfsd_destroy(&init_net); 610 nfsd_destroy(net);
555 611
556 /* Release module */ 612 /* Release module */
557 mutex_unlock(&nfsd_mutex); 613 mutex_unlock(&nfsd_mutex);
@@ -640,21 +696,24 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
640 } 696 }
641 697
642 /* Store reply in cache. */ 698 /* Store reply in cache. */
643 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 699 nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
644 return 1; 700 return 1;
645} 701}
646 702
647int nfsd_pool_stats_open(struct inode *inode, struct file *file) 703int nfsd_pool_stats_open(struct inode *inode, struct file *file)
648{ 704{
649 int ret; 705 int ret;
706 struct net *net = &init_net;
707 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
708
650 mutex_lock(&nfsd_mutex); 709 mutex_lock(&nfsd_mutex);
651 if (nfsd_serv == NULL) { 710 if (nn->nfsd_serv == NULL) {
652 mutex_unlock(&nfsd_mutex); 711 mutex_unlock(&nfsd_mutex);
653 return -ENODEV; 712 return -ENODEV;
654 } 713 }
655 /* bump up the psudo refcount while traversing */ 714 /* bump up the psudo refcount while traversing */
656 svc_get(nfsd_serv); 715 svc_get(nn->nfsd_serv);
657 ret = svc_pool_stats_open(nfsd_serv, file); 716 ret = svc_pool_stats_open(nn->nfsd_serv, file);
658 mutex_unlock(&nfsd_mutex); 717 mutex_unlock(&nfsd_mutex);
659 return ret; 718 return ret;
660} 719}
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 65ec595e2226..979b42106979 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -246,7 +246,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
246 struct nfsd_readargs *args) 246 struct nfsd_readargs *args)
247{ 247{
248 unsigned int len; 248 unsigned int len;
249 int v,pn; 249 int v;
250 if (!(p = decode_fh(p, &args->fh))) 250 if (!(p = decode_fh(p, &args->fh)))
251 return 0; 251 return 0;
252 252
@@ -262,8 +262,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
262 */ 262 */
263 v=0; 263 v=0;
264 while (len > 0) { 264 while (len > 0) {
265 pn = rqstp->rq_resused++; 265 struct page *p = *(rqstp->rq_next_page++);
266 rqstp->rq_vec[v].iov_base = page_address(rqstp->rq_respages[pn]); 266
267 rqstp->rq_vec[v].iov_base = page_address(p);
267 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; 268 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
268 len -= rqstp->rq_vec[v].iov_len; 269 len -= rqstp->rq_vec[v].iov_len;
269 v++; 270 v++;
@@ -355,7 +356,7 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli
355{ 356{
356 if (!(p = decode_fh(p, &args->fh))) 357 if (!(p = decode_fh(p, &args->fh)))
357 return 0; 358 return 0;
358 args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); 359 args->buffer = page_address(*(rqstp->rq_next_page++));
359 360
360 return xdr_argsize_check(rqstp, p); 361 return xdr_argsize_check(rqstp, p);
361} 362}
@@ -396,7 +397,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
396 if (args->count > PAGE_SIZE) 397 if (args->count > PAGE_SIZE)
397 args->count = PAGE_SIZE; 398 args->count = PAGE_SIZE;
398 399
399 args->buffer = page_address(rqstp->rq_respages[rqstp->rq_resused++]); 400 args->buffer = page_address(*(rqstp->rq_next_page++));
400 401
401 return xdr_argsize_check(rqstp, p); 402 return xdr_argsize_check(rqstp, p);
402} 403}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index e036894bce57..d1c229feed52 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -150,6 +150,12 @@ struct nfsd4_channel_attrs {
150 u32 rdma_attrs; 150 u32 rdma_attrs;
151}; 151};
152 152
153struct nfsd4_cb_sec {
154 u32 flavor; /* (u32)(-1) used to mean "no valid flavor" */
155 u32 uid;
156 u32 gid;
157};
158
153struct nfsd4_create_session { 159struct nfsd4_create_session {
154 clientid_t clientid; 160 clientid_t clientid;
155 struct nfs4_sessionid sessionid; 161 struct nfs4_sessionid sessionid;
@@ -158,8 +164,12 @@ struct nfsd4_create_session {
158 struct nfsd4_channel_attrs fore_channel; 164 struct nfsd4_channel_attrs fore_channel;
159 struct nfsd4_channel_attrs back_channel; 165 struct nfsd4_channel_attrs back_channel;
160 u32 callback_prog; 166 u32 callback_prog;
161 u32 uid; 167 struct nfsd4_cb_sec cb_sec;
162 u32 gid; 168};
169
170struct nfsd4_backchannel_ctl {
171 u32 bc_cb_program;
172 struct nfsd4_cb_sec bc_cb_sec;
163}; 173};
164 174
165struct nfsd4_bind_conn_to_session { 175struct nfsd4_bind_conn_to_session {
@@ -192,6 +202,7 @@ struct nfsd4_session {
192 struct nfs4_sessionid se_sessionid; 202 struct nfs4_sessionid se_sessionid;
193 struct nfsd4_channel_attrs se_fchannel; 203 struct nfsd4_channel_attrs se_fchannel;
194 struct nfsd4_channel_attrs se_bchannel; 204 struct nfsd4_channel_attrs se_bchannel;
205 struct nfsd4_cb_sec se_cb_sec;
195 struct list_head se_conns; 206 struct list_head se_conns;
196 u32 se_cb_prog; 207 u32 se_cb_prog;
197 u32 se_cb_seq_nr; 208 u32 se_cb_seq_nr;
@@ -221,13 +232,12 @@ struct nfsd4_sessionid {
221 */ 232 */
222struct nfs4_client { 233struct nfs4_client {
223 struct list_head cl_idhash; /* hash by cl_clientid.id */ 234 struct list_head cl_idhash; /* hash by cl_clientid.id */
224 struct list_head cl_strhash; /* hash by cl_name */ 235 struct rb_node cl_namenode; /* link into by-name trees */
225 struct list_head cl_openowners; 236 struct list_head cl_openowners;
226 struct idr cl_stateids; /* stateid lookup */ 237 struct idr cl_stateids; /* stateid lookup */
227 struct list_head cl_delegations; 238 struct list_head cl_delegations;
228 struct list_head cl_lru; /* tail queue */ 239 struct list_head cl_lru; /* tail queue */
229 struct xdr_netobj cl_name; /* id generated by client */ 240 struct xdr_netobj cl_name; /* id generated by client */
230 char cl_recdir[HEXDIR_LEN]; /* recovery dir */
231 nfs4_verifier cl_verifier; /* generated by client */ 241 nfs4_verifier cl_verifier; /* generated by client */
232 time_t cl_time; /* time of last lease renewal */ 242 time_t cl_time; /* time of last lease renewal */
233 struct sockaddr_storage cl_addr; /* client ipaddress */ 243 struct sockaddr_storage cl_addr; /* client ipaddress */
@@ -242,9 +252,11 @@ struct nfs4_client {
242#define NFSD4_CLIENT_CB_KILL (1) 252#define NFSD4_CLIENT_CB_KILL (1)
243#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ 253#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
244#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ 254#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
255#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
245#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 256#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
246 1 << NFSD4_CLIENT_CB_KILL) 257 1 << NFSD4_CLIENT_CB_KILL)
247 unsigned long cl_flags; 258 unsigned long cl_flags;
259 struct rpc_cred *cl_cb_cred;
248 struct rpc_clnt *cl_cb_client; 260 struct rpc_clnt *cl_cb_client;
249 u32 cl_cb_ident; 261 u32 cl_cb_ident;
250#define NFSD4_CB_UP 0 262#define NFSD4_CB_UP 0
@@ -271,6 +283,7 @@ struct nfs4_client {
271 unsigned long cl_cb_slot_busy; 283 unsigned long cl_cb_slot_busy;
272 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */ 284 struct rpc_wait_queue cl_cb_waitq; /* backchannel callers may */
273 /* wait here for slots */ 285 /* wait here for slots */
286 struct net *net;
274}; 287};
275 288
276static inline void 289static inline void
@@ -292,6 +305,7 @@ is_client_expired(struct nfs4_client *clp)
292 */ 305 */
293struct nfs4_client_reclaim { 306struct nfs4_client_reclaim {
294 struct list_head cr_strhash; /* hash by cr_name */ 307 struct list_head cr_strhash; /* hash by cr_name */
308 struct nfs4_client *cr_clp; /* pointer to associated clp */
295 char cr_recdir[HEXDIR_LEN]; /* recover dir */ 309 char cr_recdir[HEXDIR_LEN]; /* recover dir */
296}; 310};
297 311
@@ -452,25 +466,26 @@ extern __be32 nfs4_preprocess_stateid_op(struct net *net,
452 stateid_t *stateid, int flags, struct file **filp); 466 stateid_t *stateid, int flags, struct file **filp);
453extern void nfs4_lock_state(void); 467extern void nfs4_lock_state(void);
454extern void nfs4_unlock_state(void); 468extern void nfs4_unlock_state(void);
455extern int nfs4_in_grace(void); 469void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
456extern void nfs4_release_reclaim(void); 470extern void nfs4_release_reclaim(struct nfsd_net *);
457extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct nfs4_client *crp); 471extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
458extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions); 472 struct nfsd_net *nn);
473extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
459extern void nfs4_free_openowner(struct nfs4_openowner *); 474extern void nfs4_free_openowner(struct nfs4_openowner *);
460extern void nfs4_free_lockowner(struct nfs4_lockowner *); 475extern void nfs4_free_lockowner(struct nfs4_lockowner *);
461extern int set_callback_cred(void); 476extern int set_callback_cred(void);
477extern void nfsd4_init_callback(struct nfsd4_callback *);
462extern void nfsd4_probe_callback(struct nfs4_client *clp); 478extern void nfsd4_probe_callback(struct nfs4_client *clp);
463extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); 479extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
464extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 480extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
465extern void nfsd4_do_callback_rpc(struct work_struct *);
466extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 481extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
467extern int nfsd4_create_callback_queue(void); 482extern int nfsd4_create_callback_queue(void);
468extern void nfsd4_destroy_callback_queue(void); 483extern void nfsd4_destroy_callback_queue(void);
469extern void nfsd4_shutdown_callback(struct nfs4_client *); 484extern void nfsd4_shutdown_callback(struct nfs4_client *);
470extern void nfs4_put_delegation(struct nfs4_delegation *dp); 485extern void nfs4_put_delegation(struct nfs4_delegation *dp);
471extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname); 486extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
472extern int nfs4_client_to_reclaim(const char *name); 487 struct nfsd_net *nn);
473extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id); 488extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
474extern void release_session_client(struct nfsd4_session *); 489extern void release_session_client(struct nfsd4_session *);
475extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *); 490extern void nfsd4_purge_closed_stateid(struct nfs4_stateowner *);
476 491
@@ -480,5 +495,28 @@ extern void nfsd4_client_tracking_exit(struct net *net);
480extern void nfsd4_client_record_create(struct nfs4_client *clp); 495extern void nfsd4_client_record_create(struct nfs4_client *clp);
481extern void nfsd4_client_record_remove(struct nfs4_client *clp); 496extern void nfsd4_client_record_remove(struct nfs4_client *clp);
482extern int nfsd4_client_record_check(struct nfs4_client *clp); 497extern int nfsd4_client_record_check(struct nfs4_client *clp);
483extern void nfsd4_record_grace_done(struct net *net, time_t boot_time); 498extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
499
500/* nfs fault injection functions */
501#ifdef CONFIG_NFSD_FAULT_INJECTION
502int nfsd_fault_inject_init(void);
503void nfsd_fault_inject_cleanup(void);
504u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64));
505struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t);
506
507u64 nfsd_forget_client(struct nfs4_client *, u64);
508u64 nfsd_forget_client_locks(struct nfs4_client*, u64);
509u64 nfsd_forget_client_openowners(struct nfs4_client *, u64);
510u64 nfsd_forget_client_delegations(struct nfs4_client *, u64);
511u64 nfsd_recall_client_delegations(struct nfs4_client *, u64);
512
513u64 nfsd_print_client(struct nfs4_client *, u64);
514u64 nfsd_print_client_locks(struct nfs4_client *, u64);
515u64 nfsd_print_client_openowners(struct nfs4_client *, u64);
516u64 nfsd_print_client_delegations(struct nfs4_client *, u64);
517#else /* CONFIG_NFSD_FAULT_INJECTION */
518static inline int nfsd_fault_inject_init(void) { return 0; }
519static inline void nfsd_fault_inject_cleanup(void) {}
520#endif /* CONFIG_NFSD_FAULT_INJECTION */
521
484#endif /* NFSD4_STATE_H */ 522#endif /* NFSD4_STATE_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index c120b48ec305..f0a6d88d7fff 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -886,7 +886,7 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
886 struct splice_desc *sd) 886 struct splice_desc *sd)
887{ 887{
888 struct svc_rqst *rqstp = sd->u.data; 888 struct svc_rqst *rqstp = sd->u.data;
889 struct page **pp = rqstp->rq_respages + rqstp->rq_resused; 889 struct page **pp = rqstp->rq_next_page;
890 struct page *page = buf->page; 890 struct page *page = buf->page;
891 size_t size; 891 size_t size;
892 892
@@ -894,17 +894,15 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
894 894
895 if (rqstp->rq_res.page_len == 0) { 895 if (rqstp->rq_res.page_len == 0) {
896 get_page(page); 896 get_page(page);
897 put_page(*pp); 897 put_page(*rqstp->rq_next_page);
898 *pp = page; 898 *(rqstp->rq_next_page++) = page;
899 rqstp->rq_resused++;
900 rqstp->rq_res.page_base = buf->offset; 899 rqstp->rq_res.page_base = buf->offset;
901 rqstp->rq_res.page_len = size; 900 rqstp->rq_res.page_len = size;
902 } else if (page != pp[-1]) { 901 } else if (page != pp[-1]) {
903 get_page(page); 902 get_page(page);
904 if (*pp) 903 if (*rqstp->rq_next_page)
905 put_page(*pp); 904 put_page(*rqstp->rq_next_page);
906 *pp = page; 905 *(rqstp->rq_next_page++) = page;
907 rqstp->rq_resused++;
908 rqstp->rq_res.page_len += size; 906 rqstp->rq_res.page_len += size;
909 } else 907 } else
910 rqstp->rq_res.page_len += size; 908 rqstp->rq_res.page_len += size;
@@ -936,7 +934,8 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
936 .u.data = rqstp, 934 .u.data = rqstp,
937 }; 935 };
938 936
939 rqstp->rq_resused = 1; 937 WARN_ON_ONCE(rqstp->rq_next_page != rqstp->rq_respages + 1);
938 rqstp->rq_next_page = rqstp->rq_respages + 1;
940 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); 939 host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
941 } else { 940 } else {
942 oldfs = get_fs(); 941 oldfs = get_fs();
@@ -1020,28 +1019,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1020 inode = dentry->d_inode; 1019 inode = dentry->d_inode;
1021 exp = fhp->fh_export; 1020 exp = fhp->fh_export;
1022 1021
1023 /*
1024 * Request sync writes if
1025 * - the sync export option has been set, or
1026 * - the client requested O_SYNC behavior (NFSv3 feature).
1027 * - The file system doesn't support fsync().
1028 * When NFSv2 gathered writes have been configured for this volume,
1029 * flushing the data to disk is handled separately below.
1030 */
1031 use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp); 1022 use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
1032 1023
1033 if (!file->f_op->fsync) {/* COMMIT3 cannot work */
1034 stable = 2;
1035 *stablep = 2; /* FILE_SYNC */
1036 }
1037
1038 if (!EX_ISSYNC(exp)) 1024 if (!EX_ISSYNC(exp))
1039 stable = 0; 1025 stable = 0;
1040 if (stable && !use_wgather) {
1041 spin_lock(&file->f_lock);
1042 file->f_flags |= O_SYNC;
1043 spin_unlock(&file->f_lock);
1044 }
1045 1026
1046 /* Write the data. */ 1027 /* Write the data. */
1047 oldfs = get_fs(); set_fs(KERNEL_DS); 1028 oldfs = get_fs(); set_fs(KERNEL_DS);
@@ -1057,8 +1038,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1057 if (inode->i_mode & (S_ISUID | S_ISGID)) 1038 if (inode->i_mode & (S_ISUID | S_ISGID))
1058 kill_suid(dentry); 1039 kill_suid(dentry);
1059 1040
1060 if (stable && use_wgather) 1041 if (stable) {
1061 host_err = wait_for_concurrent_writes(file); 1042 if (use_wgather)
1043 host_err = wait_for_concurrent_writes(file);
1044 else
1045 host_err = vfs_fsync_range(file, offset, offset+*cnt, 0);
1046 }
1062 1047
1063out_nfserr: 1048out_nfserr:
1064 dprintk("nfsd: write complete host_err=%d\n", host_err); 1049 dprintk("nfsd: write complete host_err=%d\n", host_err);
@@ -1485,13 +1470,19 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1485 case NFS3_CREATE_EXCLUSIVE: 1470 case NFS3_CREATE_EXCLUSIVE:
1486 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime 1471 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
1487 && dchild->d_inode->i_atime.tv_sec == v_atime 1472 && dchild->d_inode->i_atime.tv_sec == v_atime
1488 && dchild->d_inode->i_size == 0 ) 1473 && dchild->d_inode->i_size == 0 ) {
1474 if (created)
1475 *created = 1;
1489 break; 1476 break;
1477 }
1490 case NFS4_CREATE_EXCLUSIVE4_1: 1478 case NFS4_CREATE_EXCLUSIVE4_1:
1491 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime 1479 if ( dchild->d_inode->i_mtime.tv_sec == v_mtime
1492 && dchild->d_inode->i_atime.tv_sec == v_atime 1480 && dchild->d_inode->i_atime.tv_sec == v_atime
1493 && dchild->d_inode->i_size == 0 ) 1481 && dchild->d_inode->i_size == 0 ) {
1482 if (created)
1483 *created = 1;
1494 goto set_attr; 1484 goto set_attr;
1485 }
1495 /* fallthru */ 1486 /* fallthru */
1496 case NFS3_CREATE_GUARDED: 1487 case NFS3_CREATE_GUARDED:
1497 err = nfserr_exist; 1488 err = nfserr_exist;
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index acd127d4ee82..0889bfb43dc9 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -385,7 +385,8 @@ struct nfsd4_write {
385 u64 wr_offset; /* request */ 385 u64 wr_offset; /* request */
386 u32 wr_stable_how; /* request */ 386 u32 wr_stable_how; /* request */
387 u32 wr_buflen; /* request */ 387 u32 wr_buflen; /* request */
388 int wr_vlen; 388 struct kvec wr_head;
389 struct page ** wr_pagelist; /* request */
389 390
390 u32 wr_bytes_written; /* response */ 391 u32 wr_bytes_written; /* response */
391 u32 wr_how_written; /* response */ 392 u32 wr_how_written; /* response */
@@ -462,6 +463,7 @@ struct nfsd4_op {
462 463
463 /* NFSv4.1 */ 464 /* NFSv4.1 */
464 struct nfsd4_exchange_id exchange_id; 465 struct nfsd4_exchange_id exchange_id;
466 struct nfsd4_backchannel_ctl backchannel_ctl;
465 struct nfsd4_bind_conn_to_session bind_conn_to_session; 467 struct nfsd4_bind_conn_to_session bind_conn_to_session;
466 struct nfsd4_create_session create_session; 468 struct nfsd4_create_session create_session;
467 struct nfsd4_destroy_session destroy_session; 469 struct nfsd4_destroy_session destroy_session;
@@ -526,6 +528,14 @@ static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
526 || nfsd4_is_solo_sequence(resp); 528 || nfsd4_is_solo_sequence(resp);
527} 529}
528 530
531static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
532{
533 struct nfsd4_compoundres *resp = rqstp->rq_resp;
534 struct nfsd4_compoundargs *argp = rqstp->rq_argp;
535
536 return argp->opcnt == resp->opcnt;
537}
538
529#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs) 539#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
530 540
531static inline void 541static inline void
@@ -566,6 +576,7 @@ extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
566 struct nfsd4_sequence *seq); 576 struct nfsd4_sequence *seq);
567extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp, 577extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
568 struct nfsd4_compound_state *, struct nfsd4_exchange_id *); 578 struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
579extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
569extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *); 580extern __be32 nfsd4_bind_conn_to_session(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_bind_conn_to_session *);
570extern __be32 nfsd4_create_session(struct svc_rqst *, 581extern __be32 nfsd4_create_session(struct svc_rqst *,
571 struct nfsd4_compound_state *, 582 struct nfsd4_compound_state *,
@@ -579,7 +590,7 @@ extern __be32 nfsd4_destroy_session(struct svc_rqst *,
579extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *); 590extern __be32 nfsd4_destroy_clientid(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_destroy_clientid *);
580__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *); 591__be32 nfsd4_reclaim_complete(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_reclaim_complete *);
581extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *, 592extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
582 struct nfsd4_open *open); 593 struct nfsd4_open *open, struct nfsd_net *nn);
583extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, 594extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
584 struct svc_fh *current_fh, struct nfsd4_open *open); 595 struct svc_fh *current_fh, struct nfsd4_open *open);
585extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status); 596extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 16f35f7423c5..61946883025c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -167,7 +167,6 @@ const struct file_operations nilfs_file_operations = {
167}; 167};
168 168
169const struct inode_operations nilfs_file_inode_operations = { 169const struct inode_operations nilfs_file_inode_operations = {
170 .truncate = nilfs_truncate,
171 .setattr = nilfs_setattr, 170 .setattr = nilfs_setattr,
172 .permission = nilfs_permission, 171 .permission = nilfs_permission,
173 .fiemap = nilfs_fiemap, 172 .fiemap = nilfs_fiemap,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4d31d2cca7fd..6b49f14eac8c 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -213,6 +213,16 @@ static int nilfs_set_page_dirty(struct page *page)
213 return ret; 213 return ret;
214} 214}
215 215
216void nilfs_write_failed(struct address_space *mapping, loff_t to)
217{
218 struct inode *inode = mapping->host;
219
220 if (to > inode->i_size) {
221 truncate_pagecache(inode, to, inode->i_size);
222 nilfs_truncate(inode);
223 }
224}
225
216static int nilfs_write_begin(struct file *file, struct address_space *mapping, 226static int nilfs_write_begin(struct file *file, struct address_space *mapping,
217 loff_t pos, unsigned len, unsigned flags, 227 loff_t pos, unsigned len, unsigned flags,
218 struct page **pagep, void **fsdata) 228 struct page **pagep, void **fsdata)
@@ -227,10 +237,7 @@ static int nilfs_write_begin(struct file *file, struct address_space *mapping,
227 err = block_write_begin(mapping, pos, len, flags, pagep, 237 err = block_write_begin(mapping, pos, len, flags, pagep,
228 nilfs_get_block); 238 nilfs_get_block);
229 if (unlikely(err)) { 239 if (unlikely(err)) {
230 loff_t isize = mapping->host->i_size; 240 nilfs_write_failed(mapping, pos + len);
231 if (pos + len > isize)
232 vmtruncate(mapping->host, isize);
233
234 nilfs_transaction_abort(inode->i_sb); 241 nilfs_transaction_abort(inode->i_sb);
235 } 242 }
236 return err; 243 return err;
@@ -259,6 +266,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
259 loff_t offset, unsigned long nr_segs) 266 loff_t offset, unsigned long nr_segs)
260{ 267{
261 struct file *file = iocb->ki_filp; 268 struct file *file = iocb->ki_filp;
269 struct address_space *mapping = file->f_mapping;
262 struct inode *inode = file->f_mapping->host; 270 struct inode *inode = file->f_mapping->host;
263 ssize_t size; 271 ssize_t size;
264 272
@@ -278,7 +286,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
278 loff_t end = offset + iov_length(iov, nr_segs); 286 loff_t end = offset + iov_length(iov, nr_segs);
279 287
280 if (end > isize) 288 if (end > isize)
281 vmtruncate(inode, isize); 289 nilfs_write_failed(mapping, end);
282 } 290 }
283 291
284 return size; 292 return size;
@@ -786,10 +794,8 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
786 if ((iattr->ia_valid & ATTR_SIZE) && 794 if ((iattr->ia_valid & ATTR_SIZE) &&
787 iattr->ia_size != i_size_read(inode)) { 795 iattr->ia_size != i_size_read(inode)) {
788 inode_dio_wait(inode); 796 inode_dio_wait(inode);
789 797 truncate_setsize(inode, iattr->ia_size);
790 err = vmtruncate(inode, iattr->ia_size); 798 nilfs_truncate(inode);
791 if (unlikely(err))
792 goto out_err;
793 } 799 }
794 800
795 setattr_copy(inode, iattr); 801 setattr_copy(inode, iattr);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 74cece80e9a3..9bc72dec3fa6 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -277,6 +277,7 @@ extern void nilfs_update_inode(struct inode *, struct buffer_head *);
277extern void nilfs_truncate(struct inode *); 277extern void nilfs_truncate(struct inode *);
278extern void nilfs_evict_inode(struct inode *); 278extern void nilfs_evict_inode(struct inode *);
279extern int nilfs_setattr(struct dentry *, struct iattr *); 279extern int nilfs_setattr(struct dentry *, struct iattr *);
280extern void nilfs_write_failed(struct address_space *mapping, loff_t to);
280int nilfs_permission(struct inode *inode, int mask); 281int nilfs_permission(struct inode *inode, int mask);
281int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); 282int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
282extern int nilfs_inode_dirty(struct inode *); 283extern int nilfs_inode_dirty(struct inode *);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index f1626f5011c5..ff00a0b7acb9 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -527,7 +527,8 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
527 if (unlikely(err)) { 527 if (unlikely(err)) {
528 loff_t isize = inode->i_size; 528 loff_t isize = inode->i_size;
529 if (pos + blocksize > isize) 529 if (pos + blocksize > isize)
530 vmtruncate(inode, isize); 530 nilfs_write_failed(inode->i_mapping,
531 pos + blocksize);
531 goto failed_inode; 532 goto failed_inode;
532 } 533 }
533 534
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1ecf46448f85..5b2d4f0853ac 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1762,6 +1762,16 @@ err_out:
1762 return err; 1762 return err;
1763} 1763}
1764 1764
1765static void ntfs_write_failed(struct address_space *mapping, loff_t to)
1766{
1767 struct inode *inode = mapping->host;
1768
1769 if (to > inode->i_size) {
1770 truncate_pagecache(inode, to, inode->i_size);
1771 ntfs_truncate_vfs(inode);
1772 }
1773}
1774
1765/** 1775/**
1766 * ntfs_file_buffered_write - 1776 * ntfs_file_buffered_write -
1767 * 1777 *
@@ -2022,8 +2032,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2022 * allocated space, which is not a disaster. 2032 * allocated space, which is not a disaster.
2023 */ 2033 */
2024 i_size = i_size_read(vi); 2034 i_size = i_size_read(vi);
2025 if (pos + bytes > i_size) 2035 if (pos + bytes > i_size) {
2026 vmtruncate(vi, i_size); 2036 ntfs_write_failed(mapping, pos + bytes);
2037 }
2027 break; 2038 break;
2028 } 2039 }
2029 } 2040 }
@@ -2227,7 +2238,6 @@ const struct file_operations ntfs_file_ops = {
2227 2238
2228const struct inode_operations ntfs_file_inode_ops = { 2239const struct inode_operations ntfs_file_inode_ops = {
2229#ifdef NTFS_RW 2240#ifdef NTFS_RW
2230 .truncate = ntfs_truncate_vfs,
2231 .setattr = ntfs_setattr, 2241 .setattr = ntfs_setattr,
2232#endif /* NTFS_RW */ 2242#endif /* NTFS_RW */
2233}; 2243};
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 1d27331e6fc9..d3e118cc6ffa 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2866,9 +2866,11 @@ conv_err_out:
2866 * 2866 *
2867 * See ntfs_truncate() description above for details. 2867 * See ntfs_truncate() description above for details.
2868 */ 2868 */
2869#ifdef NTFS_RW
2869void ntfs_truncate_vfs(struct inode *vi) { 2870void ntfs_truncate_vfs(struct inode *vi) {
2870 ntfs_truncate(vi); 2871 ntfs_truncate(vi);
2871} 2872}
2873#endif
2872 2874
2873/** 2875/**
2874 * ntfs_setattr - called from notify_change() when an attribute is being changed 2876 * ntfs_setattr - called from notify_change() when an attribute is being changed
@@ -2914,8 +2916,10 @@ int ntfs_setattr(struct dentry *dentry, struct iattr *attr)
2914 NInoCompressed(ni) ? 2916 NInoCompressed(ni) ?
2915 "compressed" : "encrypted"); 2917 "compressed" : "encrypted");
2916 err = -EOPNOTSUPP; 2918 err = -EOPNOTSUPP;
2917 } else 2919 } else {
2918 err = vmtruncate(vi, attr->ia_size); 2920 truncate_setsize(vi, attr->ia_size);
2921 ntfs_truncate_vfs(vi);
2922 }
2919 if (err || ia_valid == ATTR_SIZE) 2923 if (err || ia_valid == ATTR_SIZE)
2920 goto out; 2924 goto out;
2921 } else { 2925 } else {
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
index db29695f845c..76b6cfb579d7 100644
--- a/fs/ntfs/inode.h
+++ b/fs/ntfs/inode.h
@@ -316,6 +316,10 @@ static inline void ntfs_commit_inode(struct inode *vi)
316 return; 316 return;
317} 317}
318 318
319#else
320
321static inline void ntfs_truncate_vfs(struct inode *vi) {}
322
319#endif /* NTFS_RW */ 323#endif /* NTFS_RW */
320 324
321#endif /* _LINUX_NTFS_INODE_H */ 325#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index fe492e1a3cfc..37d313ede159 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1218,24 +1218,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1218 } 1218 }
1219 } 1219 }
1220 1220
1221 /*
1222 * This will intentionally not wind up calling truncate_setsize(),
1223 * since all the work for a size change has been done above.
1224 * Otherwise, we could get into problems with truncate as
1225 * ip_alloc_sem is used there to protect against i_size
1226 * changes.
1227 *
1228 * XXX: this means the conditional below can probably be removed.
1229 */
1230 if ((attr->ia_valid & ATTR_SIZE) &&
1231 attr->ia_size != i_size_read(inode)) {
1232 status = vmtruncate(inode, attr->ia_size);
1233 if (status) {
1234 mlog_errno(status);
1235 goto bail_commit;
1236 }
1237 }
1238
1239 setattr_copy(inode, attr); 1221 setattr_copy(inode, attr);
1240 mark_inode_dirty(inode); 1222 mark_inode_dirty(inode);
1241 1223
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 77e3cb2962b4..e0d9b3e722bd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -306,6 +306,16 @@ omfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
306 return mpage_writepages(mapping, wbc, omfs_get_block); 306 return mpage_writepages(mapping, wbc, omfs_get_block);
307} 307}
308 308
309static void omfs_write_failed(struct address_space *mapping, loff_t to)
310{
311 struct inode *inode = mapping->host;
312
313 if (to > inode->i_size) {
314 truncate_pagecache(inode, to, inode->i_size);
315 omfs_truncate(inode);
316 }
317}
318
309static int omfs_write_begin(struct file *file, struct address_space *mapping, 319static int omfs_write_begin(struct file *file, struct address_space *mapping,
310 loff_t pos, unsigned len, unsigned flags, 320 loff_t pos, unsigned len, unsigned flags,
311 struct page **pagep, void **fsdata) 321 struct page **pagep, void **fsdata)
@@ -314,11 +324,8 @@ static int omfs_write_begin(struct file *file, struct address_space *mapping,
314 324
315 ret = block_write_begin(mapping, pos, len, flags, pagep, 325 ret = block_write_begin(mapping, pos, len, flags, pagep,
316 omfs_get_block); 326 omfs_get_block);
317 if (unlikely(ret)) { 327 if (unlikely(ret))
318 loff_t isize = mapping->host->i_size; 328 omfs_write_failed(mapping, pos + len);
319 if (pos + len > isize)
320 vmtruncate(mapping->host, isize);
321 }
322 329
323 return ret; 330 return ret;
324} 331}
@@ -350,9 +357,11 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
350 357
351 if ((attr->ia_valid & ATTR_SIZE) && 358 if ((attr->ia_valid & ATTR_SIZE) &&
352 attr->ia_size != i_size_read(inode)) { 359 attr->ia_size != i_size_read(inode)) {
353 error = vmtruncate(inode, attr->ia_size); 360 error = inode_newsize_ok(inode, attr->ia_size);
354 if (error) 361 if (error)
355 return error; 362 return error;
363 truncate_setsize(inode, attr->ia_size);
364 omfs_truncate(inode);
356 } 365 }
357 366
358 setattr_copy(inode, attr); 367 setattr_copy(inode, attr);
@@ -362,7 +371,6 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr)
362 371
363const struct inode_operations omfs_file_inops = { 372const struct inode_operations omfs_file_inops = {
364 .setattr = omfs_setattr, 373 .setattr = omfs_setattr,
365 .truncate = omfs_truncate
366}; 374};
367 375
368const struct address_space_operations omfs_aops = { 376const struct address_space_operations omfs_aops = {
diff --git a/fs/open.c b/fs/open.c
index 182d8667b7bd..9b33c0cbfacf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -61,33 +61,22 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
61 return ret; 61 return ret;
62} 62}
63 63
64static long do_sys_truncate(const char __user *pathname, loff_t length) 64long vfs_truncate(struct path *path, loff_t length)
65{ 65{
66 struct path path;
67 struct inode *inode; 66 struct inode *inode;
68 int error; 67 long error;
69
70 error = -EINVAL;
71 if (length < 0) /* sorry, but loff_t says... */
72 goto out;
73 68
74 error = user_path(pathname, &path); 69 inode = path->dentry->d_inode;
75 if (error)
76 goto out;
77 inode = path.dentry->d_inode;
78 70
79 /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ 71 /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
80 error = -EISDIR;
81 if (S_ISDIR(inode->i_mode)) 72 if (S_ISDIR(inode->i_mode))
82 goto dput_and_out; 73 return -EISDIR;
83
84 error = -EINVAL;
85 if (!S_ISREG(inode->i_mode)) 74 if (!S_ISREG(inode->i_mode))
86 goto dput_and_out; 75 return -EINVAL;
87 76
88 error = mnt_want_write(path.mnt); 77 error = mnt_want_write(path->mnt);
89 if (error) 78 if (error)
90 goto dput_and_out; 79 goto out;
91 80
92 error = inode_permission(inode, MAY_WRITE); 81 error = inode_permission(inode, MAY_WRITE);
93 if (error) 82 if (error)
@@ -111,19 +100,40 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
111 100
112 error = locks_verify_truncate(inode, NULL, length); 101 error = locks_verify_truncate(inode, NULL, length);
113 if (!error) 102 if (!error)
114 error = security_path_truncate(&path); 103 error = security_path_truncate(path);
115 if (!error) 104 if (!error)
116 error = do_truncate(path.dentry, length, 0, NULL); 105 error = do_truncate(path->dentry, length, 0, NULL);
117 106
118put_write_and_out: 107put_write_and_out:
119 put_write_access(inode); 108 put_write_access(inode);
120mnt_drop_write_and_out: 109mnt_drop_write_and_out:
121 mnt_drop_write(path.mnt); 110 mnt_drop_write(path->mnt);
122dput_and_out:
123 path_put(&path);
124out: 111out:
125 return error; 112 return error;
126} 113}
114EXPORT_SYMBOL_GPL(vfs_truncate);
115
116static long do_sys_truncate(const char __user *pathname, loff_t length)
117{
118 unsigned int lookup_flags = LOOKUP_FOLLOW;
119 struct path path;
120 int error;
121
122 if (length < 0) /* sorry, but loff_t says... */
123 return -EINVAL;
124
125retry:
126 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
127 if (!error) {
128 error = vfs_truncate(&path, length);
129 path_put(&path);
130 }
131 if (retry_estale(error, lookup_flags)) {
132 lookup_flags |= LOOKUP_REVAL;
133 goto retry;
134 }
135 return error;
136}
127 137
128SYSCALL_DEFINE2(truncate, const char __user *, path, long, length) 138SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
129{ 139{
@@ -306,6 +316,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
306 struct path path; 316 struct path path;
307 struct inode *inode; 317 struct inode *inode;
308 int res; 318 int res;
319 unsigned int lookup_flags = LOOKUP_FOLLOW;
309 320
310 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */ 321 if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
311 return -EINVAL; 322 return -EINVAL;
@@ -328,8 +339,8 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
328 } 339 }
329 340
330 old_cred = override_creds(override_cred); 341 old_cred = override_creds(override_cred);
331 342retry:
332 res = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); 343 res = user_path_at(dfd, filename, lookup_flags, &path);
333 if (res) 344 if (res)
334 goto out; 345 goto out;
335 346
@@ -364,6 +375,10 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
364 375
365out_path_release: 376out_path_release:
366 path_put(&path); 377 path_put(&path);
378 if (retry_estale(res, lookup_flags)) {
379 lookup_flags |= LOOKUP_REVAL;
380 goto retry;
381 }
367out: 382out:
368 revert_creds(old_cred); 383 revert_creds(old_cred);
369 put_cred(override_cred); 384 put_cred(override_cred);
@@ -379,8 +394,9 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
379{ 394{
380 struct path path; 395 struct path path;
381 int error; 396 int error;
382 397 unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
383 error = user_path_dir(filename, &path); 398retry:
399 error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
384 if (error) 400 if (error)
385 goto out; 401 goto out;
386 402
@@ -392,6 +408,10 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
392 408
393dput_and_out: 409dput_and_out:
394 path_put(&path); 410 path_put(&path);
411 if (retry_estale(error, lookup_flags)) {
412 lookup_flags |= LOOKUP_REVAL;
413 goto retry;
414 }
395out: 415out:
396 return error; 416 return error;
397} 417}
@@ -425,8 +445,9 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
425{ 445{
426 struct path path; 446 struct path path;
427 int error; 447 int error;
428 448 unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
429 error = user_path_dir(filename, &path); 449retry:
450 error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
430 if (error) 451 if (error)
431 goto out; 452 goto out;
432 453
@@ -445,6 +466,10 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
445 error = 0; 466 error = 0;
446dput_and_out: 467dput_and_out:
447 path_put(&path); 468 path_put(&path);
469 if (retry_estale(error, lookup_flags)) {
470 lookup_flags |= LOOKUP_REVAL;
471 goto retry;
472 }
448out: 473out:
449 return error; 474 return error;
450} 475}
@@ -489,11 +514,16 @@ SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode
489{ 514{
490 struct path path; 515 struct path path;
491 int error; 516 int error;
492 517 unsigned int lookup_flags = LOOKUP_FOLLOW;
493 error = user_path_at(dfd, filename, LOOKUP_FOLLOW, &path); 518retry:
519 error = user_path_at(dfd, filename, lookup_flags, &path);
494 if (!error) { 520 if (!error) {
495 error = chmod_common(&path, mode); 521 error = chmod_common(&path, mode);
496 path_put(&path); 522 path_put(&path);
523 if (retry_estale(error, lookup_flags)) {
524 lookup_flags |= LOOKUP_REVAL;
525 goto retry;
526 }
497 } 527 }
498 return error; 528 return error;
499} 529}
@@ -552,6 +582,7 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
552 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 582 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
553 if (flag & AT_EMPTY_PATH) 583 if (flag & AT_EMPTY_PATH)
554 lookup_flags |= LOOKUP_EMPTY; 584 lookup_flags |= LOOKUP_EMPTY;
585retry:
555 error = user_path_at(dfd, filename, lookup_flags, &path); 586 error = user_path_at(dfd, filename, lookup_flags, &path);
556 if (error) 587 if (error)
557 goto out; 588 goto out;
@@ -562,6 +593,10 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
562 mnt_drop_write(path.mnt); 593 mnt_drop_write(path.mnt);
563out_release: 594out_release:
564 path_put(&path); 595 path_put(&path);
596 if (retry_estale(error, lookup_flags)) {
597 lookup_flags |= LOOKUP_REVAL;
598 goto retry;
599 }
565out: 600out:
566 return error; 601 return error;
567} 602}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5a5a0be40e40..9b43ff77a51e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -542,13 +542,6 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
542 if (error) 542 if (error)
543 return error; 543 return error;
544 544
545 if ((attr->ia_valid & ATTR_SIZE) &&
546 attr->ia_size != i_size_read(inode)) {
547 error = vmtruncate(inode, attr->ia_size);
548 if (error)
549 return error;
550 }
551
552 setattr_copy(inode, attr); 545 setattr_copy(inode, attr);
553 mark_inode_dirty(inode); 546 mark_inode_dirty(inode);
554 return 0; 547 return 0;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e659a0ff1da7..e064f562b1f7 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -261,16 +261,9 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
261 if (error) 261 if (error)
262 return error; 262 return error;
263 263
264 if ((iattr->ia_valid & ATTR_SIZE) &&
265 iattr->ia_size != i_size_read(inode)) {
266 error = vmtruncate(inode, iattr->ia_size);
267 if (error)
268 return error;
269 }
270
271 setattr_copy(inode, iattr); 264 setattr_copy(inode, iattr);
272 mark_inode_dirty(inode); 265 mark_inode_dirty(inode);
273 266
274 de->uid = inode->i_uid; 267 de->uid = inode->i_uid;
275 de->gid = inode->i_gid; 268 de->gid = inode->i_gid;
276 de->mode = inode->i_mode; 269 de->mode = inode->i_mode;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 701580ddfcc3..1827d88ad58b 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -736,13 +736,6 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
736 if (error) 736 if (error)
737 return error; 737 return error;
738 738
739 if ((attr->ia_valid & ATTR_SIZE) &&
740 attr->ia_size != i_size_read(inode)) {
741 error = vmtruncate(inode, attr->ia_size);
742 if (error)
743 return error;
744 }
745
746 setattr_copy(inode, attr); 739 setattr_copy(inode, attr);
747 mark_inode_dirty(inode); 740 mark_inode_dirty(inode);
748 return 0; 741 return 0;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 8375c922c0d5..50302d6f8895 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -126,7 +126,7 @@ static int reiserfs_file_open(struct inode *inode, struct file *file)
126 return err; 126 return err;
127} 127}
128 128
129static void reiserfs_vfs_truncate_file(struct inode *inode) 129void reiserfs_vfs_truncate_file(struct inode *inode)
130{ 130{
131 mutex_lock(&(REISERFS_I(inode)->tailpack)); 131 mutex_lock(&(REISERFS_I(inode)->tailpack));
132 reiserfs_truncate_file(inode, 1); 132 reiserfs_truncate_file(inode, 1);
@@ -312,7 +312,6 @@ const struct file_operations reiserfs_file_operations = {
312}; 312};
313 313
314const struct inode_operations reiserfs_file_inode_operations = { 314const struct inode_operations reiserfs_file_inode_operations = {
315 .truncate = reiserfs_vfs_truncate_file,
316 .setattr = reiserfs_setattr, 315 .setattr = reiserfs_setattr,
317 .setxattr = reiserfs_setxattr, 316 .setxattr = reiserfs_setxattr,
318 .getxattr = reiserfs_getxattr, 317 .getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d83736fbc26c..95d7680ead47 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3085,8 +3085,10 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
3085 loff_t isize = i_size_read(inode); 3085 loff_t isize = i_size_read(inode);
3086 loff_t end = offset + iov_length(iov, nr_segs); 3086 loff_t end = offset + iov_length(iov, nr_segs);
3087 3087
3088 if (end > isize) 3088 if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
3089 vmtruncate(inode, isize); 3089 truncate_setsize(inode, isize);
3090 reiserfs_vfs_truncate_file(inode);
3091 }
3090 } 3092 }
3091 3093
3092 return ret; 3094 return ret;
@@ -3200,8 +3202,13 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
3200 */ 3202 */
3201 reiserfs_write_unlock_once(inode->i_sb, depth); 3203 reiserfs_write_unlock_once(inode->i_sb, depth);
3202 if ((attr->ia_valid & ATTR_SIZE) && 3204 if ((attr->ia_valid & ATTR_SIZE) &&
3203 attr->ia_size != i_size_read(inode)) 3205 attr->ia_size != i_size_read(inode)) {
3204 error = vmtruncate(inode, attr->ia_size); 3206 error = inode_newsize_ok(inode, attr->ia_size);
3207 if (!error) {
3208 truncate_setsize(inode, attr->ia_size);
3209 reiserfs_vfs_truncate_file(inode);
3210 }
3211 }
3205 3212
3206 if (!error) { 3213 if (!error) {
3207 setattr_copy(inode, attr); 3214 setattr_copy(inode, attr);
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 33215f57ea06..157e474ab303 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -2455,6 +2455,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
2455 *, 2455 *,
2456 int count); 2456 int count);
2457int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *); 2457int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
2458void reiserfs_vfs_truncate_file(struct inode *inode);
2458int reiserfs_commit_page(struct inode *inode, struct page *page, 2459int reiserfs_commit_page(struct inode *inode, struct page *page,
2459 unsigned from, unsigned to); 2460 unsigned from, unsigned to);
2460void reiserfs_flush_old_commits(struct super_block *); 2461void reiserfs_flush_old_commits(struct super_block *);
diff --git a/fs/stat.c b/fs/stat.c
index eae494630a36..14f45459c83d 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -74,7 +74,7 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
74{ 74{
75 struct path path; 75 struct path path;
76 int error = -EINVAL; 76 int error = -EINVAL;
77 int lookup_flags = 0; 77 unsigned int lookup_flags = 0;
78 78
79 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | 79 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
80 AT_EMPTY_PATH)) != 0) 80 AT_EMPTY_PATH)) != 0)
@@ -84,13 +84,17 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
84 lookup_flags |= LOOKUP_FOLLOW; 84 lookup_flags |= LOOKUP_FOLLOW;
85 if (flag & AT_EMPTY_PATH) 85 if (flag & AT_EMPTY_PATH)
86 lookup_flags |= LOOKUP_EMPTY; 86 lookup_flags |= LOOKUP_EMPTY;
87 87retry:
88 error = user_path_at(dfd, filename, lookup_flags, &path); 88 error = user_path_at(dfd, filename, lookup_flags, &path);
89 if (error) 89 if (error)
90 goto out; 90 goto out;
91 91
92 error = vfs_getattr(path.mnt, path.dentry, stat); 92 error = vfs_getattr(path.mnt, path.dentry, stat);
93 path_put(&path); 93 path_put(&path);
94 if (retry_estale(error, lookup_flags)) {
95 lookup_flags |= LOOKUP_REVAL;
96 goto retry;
97 }
94out: 98out:
95 return error; 99 return error;
96} 100}
@@ -296,11 +300,13 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
296 struct path path; 300 struct path path;
297 int error; 301 int error;
298 int empty = 0; 302 int empty = 0;
303 unsigned int lookup_flags = LOOKUP_EMPTY;
299 304
300 if (bufsiz <= 0) 305 if (bufsiz <= 0)
301 return -EINVAL; 306 return -EINVAL;
302 307
303 error = user_path_at_empty(dfd, pathname, LOOKUP_EMPTY, &path, &empty); 308retry:
309 error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty);
304 if (!error) { 310 if (!error) {
305 struct inode *inode = path.dentry->d_inode; 311 struct inode *inode = path.dentry->d_inode;
306 312
@@ -314,6 +320,10 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
314 } 320 }
315 } 321 }
316 path_put(&path); 322 path_put(&path);
323 if (retry_estale(error, lookup_flags)) {
324 lookup_flags |= LOOKUP_REVAL;
325 goto retry;
326 }
317 } 327 }
318 return error; 328 return error;
319} 329}
diff --git a/fs/statfs.c b/fs/statfs.c
index f8e832e6f0a2..c219e733f553 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -77,10 +77,17 @@ EXPORT_SYMBOL(vfs_statfs);
77int user_statfs(const char __user *pathname, struct kstatfs *st) 77int user_statfs(const char __user *pathname, struct kstatfs *st)
78{ 78{
79 struct path path; 79 struct path path;
80 int error = user_path_at(AT_FDCWD, pathname, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 80 int error;
81 unsigned int lookup_flags = LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT;
82retry:
83 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
81 if (!error) { 84 if (!error) {
82 error = vfs_statfs(&path, st); 85 error = vfs_statfs(&path, st);
83 path_put(&path); 86 path_put(&path);
87 if (retry_estale(error, lookup_flags)) {
88 lookup_flags |= LOOKUP_REVAL;
89 goto retry;
90 }
84 } 91 }
85 return error; 92 return error;
86} 93}
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 0a65939508e9..9d4dc6831792 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -41,9 +41,11 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
41 41
42 if ((attr->ia_valid & ATTR_SIZE) && 42 if ((attr->ia_valid & ATTR_SIZE) &&
43 attr->ia_size != i_size_read(inode)) { 43 attr->ia_size != i_size_read(inode)) {
44 error = vmtruncate(inode, attr->ia_size); 44 error = inode_newsize_ok(inode, attr->ia_size);
45 if (error) 45 if (error)
46 return error; 46 return error;
47 truncate_setsize(inode, attr->ia_size);
48 sysv_truncate(inode);
47 } 49 }
48 50
49 setattr_copy(inode, attr); 51 setattr_copy(inode, attr);
@@ -52,7 +54,6 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr)
52} 54}
53 55
54const struct inode_operations sysv_file_inode_operations = { 56const struct inode_operations sysv_file_inode_operations = {
55 .truncate = sysv_truncate,
56 .setattr = sysv_setattr, 57 .setattr = sysv_setattr,
57 .getattr = sysv_getattr, 58 .getattr = sysv_getattr,
58}; 59};
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 90b54b438789..c1a591a4725b 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -464,6 +464,16 @@ int sysv_prepare_chunk(struct page *page, loff_t pos, unsigned len)
464 return __block_write_begin(page, pos, len, get_block); 464 return __block_write_begin(page, pos, len, get_block);
465} 465}
466 466
467static void sysv_write_failed(struct address_space *mapping, loff_t to)
468{
469 struct inode *inode = mapping->host;
470
471 if (to > inode->i_size) {
472 truncate_pagecache(inode, to, inode->i_size);
473 sysv_truncate(inode);
474 }
475}
476
467static int sysv_write_begin(struct file *file, struct address_space *mapping, 477static int sysv_write_begin(struct file *file, struct address_space *mapping,
468 loff_t pos, unsigned len, unsigned flags, 478 loff_t pos, unsigned len, unsigned flags,
469 struct page **pagep, void **fsdata) 479 struct page **pagep, void **fsdata)
@@ -471,11 +481,8 @@ static int sysv_write_begin(struct file *file, struct address_space *mapping,
471 int ret; 481 int ret;
472 482
473 ret = block_write_begin(mapping, pos, len, flags, pagep, get_block); 483 ret = block_write_begin(mapping, pos, len, flags, pagep, get_block);
474 if (unlikely(ret)) { 484 if (unlikely(ret))
475 loff_t isize = mapping->host->i_size; 485 sysv_write_failed(mapping, pos + len);
476 if (pos + len > isize)
477 vmtruncate(mapping->host, isize);
478 }
479 486
480 return ret; 487 return ret;
481} 488}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index eb6d0b7dc879..ff24e4449ece 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -526,6 +526,14 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len)
526 return __block_write_begin(page, pos, len, ufs_getfrag_block); 526 return __block_write_begin(page, pos, len, ufs_getfrag_block);
527} 527}
528 528
529static void ufs_write_failed(struct address_space *mapping, loff_t to)
530{
531 struct inode *inode = mapping->host;
532
533 if (to > inode->i_size)
534 truncate_pagecache(inode, to, inode->i_size);
535}
536
529static int ufs_write_begin(struct file *file, struct address_space *mapping, 537static int ufs_write_begin(struct file *file, struct address_space *mapping,
530 loff_t pos, unsigned len, unsigned flags, 538 loff_t pos, unsigned len, unsigned flags,
531 struct page **pagep, void **fsdata) 539 struct page **pagep, void **fsdata)
@@ -534,11 +542,8 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping,
534 542
535 ret = block_write_begin(mapping, pos, len, flags, pagep, 543 ret = block_write_begin(mapping, pos, len, flags, pagep,
536 ufs_getfrag_block); 544 ufs_getfrag_block);
537 if (unlikely(ret)) { 545 if (unlikely(ret))
538 loff_t isize = mapping->host->i_size; 546 ufs_write_failed(mapping, pos + len);
539 if (pos + len > isize)
540 vmtruncate(mapping->host, isize);
541 }
542 547
543 return ret; 548 return ret;
544} 549}
diff --git a/fs/utimes.c b/fs/utimes.c
index bb0696a41735..f4fb7eca10e8 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -158,13 +158,17 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
158 158
159 if (!(flags & AT_SYMLINK_NOFOLLOW)) 159 if (!(flags & AT_SYMLINK_NOFOLLOW))
160 lookup_flags |= LOOKUP_FOLLOW; 160 lookup_flags |= LOOKUP_FOLLOW;
161 161retry:
162 error = user_path_at(dfd, filename, lookup_flags, &path); 162 error = user_path_at(dfd, filename, lookup_flags, &path);
163 if (error) 163 if (error)
164 goto out; 164 goto out;
165 165
166 error = utimes_common(&path, times); 166 error = utimes_common(&path, times);
167 path_put(&path); 167 path_put(&path);
168 if (retry_estale(error, lookup_flags)) {
169 lookup_flags |= LOOKUP_REVAL;
170 goto retry;
171 }
168 } 172 }
169 173
170out: 174out:
diff --git a/fs/xattr.c b/fs/xattr.c
index e21c119f4f99..3377dff18404 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -370,8 +370,9 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
370{ 370{
371 struct path path; 371 struct path path;
372 int error; 372 int error;
373 373 unsigned int lookup_flags = LOOKUP_FOLLOW;
374 error = user_path(pathname, &path); 374retry:
375 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
375 if (error) 376 if (error)
376 return error; 377 return error;
377 error = mnt_want_write(path.mnt); 378 error = mnt_want_write(path.mnt);
@@ -380,6 +381,10 @@ SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
380 mnt_drop_write(path.mnt); 381 mnt_drop_write(path.mnt);
381 } 382 }
382 path_put(&path); 383 path_put(&path);
384 if (retry_estale(error, lookup_flags)) {
385 lookup_flags |= LOOKUP_REVAL;
386 goto retry;
387 }
383 return error; 388 return error;
384} 389}
385 390
@@ -389,8 +394,9 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
389{ 394{
390 struct path path; 395 struct path path;
391 int error; 396 int error;
392 397 unsigned int lookup_flags = 0;
393 error = user_lpath(pathname, &path); 398retry:
399 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
394 if (error) 400 if (error)
395 return error; 401 return error;
396 error = mnt_want_write(path.mnt); 402 error = mnt_want_write(path.mnt);
@@ -399,6 +405,10 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
399 mnt_drop_write(path.mnt); 405 mnt_drop_write(path.mnt);
400 } 406 }
401 path_put(&path); 407 path_put(&path);
408 if (retry_estale(error, lookup_flags)) {
409 lookup_flags |= LOOKUP_REVAL;
410 goto retry;
411 }
402 return error; 412 return error;
403} 413}
404 414
@@ -476,12 +486,17 @@ SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
476{ 486{
477 struct path path; 487 struct path path;
478 ssize_t error; 488 ssize_t error;
479 489 unsigned int lookup_flags = LOOKUP_FOLLOW;
480 error = user_path(pathname, &path); 490retry:
491 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
481 if (error) 492 if (error)
482 return error; 493 return error;
483 error = getxattr(path.dentry, name, value, size); 494 error = getxattr(path.dentry, name, value, size);
484 path_put(&path); 495 path_put(&path);
496 if (retry_estale(error, lookup_flags)) {
497 lookup_flags |= LOOKUP_REVAL;
498 goto retry;
499 }
485 return error; 500 return error;
486} 501}
487 502
@@ -490,12 +505,17 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
490{ 505{
491 struct path path; 506 struct path path;
492 ssize_t error; 507 ssize_t error;
493 508 unsigned int lookup_flags = 0;
494 error = user_lpath(pathname, &path); 509retry:
510 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
495 if (error) 511 if (error)
496 return error; 512 return error;
497 error = getxattr(path.dentry, name, value, size); 513 error = getxattr(path.dentry, name, value, size);
498 path_put(&path); 514 path_put(&path);
515 if (retry_estale(error, lookup_flags)) {
516 lookup_flags |= LOOKUP_REVAL;
517 goto retry;
518 }
499 return error; 519 return error;
500} 520}
501 521
@@ -556,12 +576,17 @@ SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
556{ 576{
557 struct path path; 577 struct path path;
558 ssize_t error; 578 ssize_t error;
559 579 unsigned int lookup_flags = LOOKUP_FOLLOW;
560 error = user_path(pathname, &path); 580retry:
581 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
561 if (error) 582 if (error)
562 return error; 583 return error;
563 error = listxattr(path.dentry, list, size); 584 error = listxattr(path.dentry, list, size);
564 path_put(&path); 585 path_put(&path);
586 if (retry_estale(error, lookup_flags)) {
587 lookup_flags |= LOOKUP_REVAL;
588 goto retry;
589 }
565 return error; 590 return error;
566} 591}
567 592
@@ -570,12 +595,17 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
570{ 595{
571 struct path path; 596 struct path path;
572 ssize_t error; 597 ssize_t error;
573 598 unsigned int lookup_flags = 0;
574 error = user_lpath(pathname, &path); 599retry:
600 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
575 if (error) 601 if (error)
576 return error; 602 return error;
577 error = listxattr(path.dentry, list, size); 603 error = listxattr(path.dentry, list, size);
578 path_put(&path); 604 path_put(&path);
605 if (retry_estale(error, lookup_flags)) {
606 lookup_flags |= LOOKUP_REVAL;
607 goto retry;
608 }
579 return error; 609 return error;
580} 610}
581 611
@@ -615,8 +645,9 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
615{ 645{
616 struct path path; 646 struct path path;
617 int error; 647 int error;
618 648 unsigned int lookup_flags = LOOKUP_FOLLOW;
619 error = user_path(pathname, &path); 649retry:
650 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
620 if (error) 651 if (error)
621 return error; 652 return error;
622 error = mnt_want_write(path.mnt); 653 error = mnt_want_write(path.mnt);
@@ -625,6 +656,10 @@ SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
625 mnt_drop_write(path.mnt); 656 mnt_drop_write(path.mnt);
626 } 657 }
627 path_put(&path); 658 path_put(&path);
659 if (retry_estale(error, lookup_flags)) {
660 lookup_flags |= LOOKUP_REVAL;
661 goto retry;
662 }
628 return error; 663 return error;
629} 664}
630 665
@@ -633,8 +668,9 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
633{ 668{
634 struct path path; 669 struct path path;
635 int error; 670 int error;
636 671 unsigned int lookup_flags = 0;
637 error = user_lpath(pathname, &path); 672retry:
673 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
638 if (error) 674 if (error)
639 return error; 675 return error;
640 error = mnt_want_write(path.mnt); 676 error = mnt_want_write(path.mnt);
@@ -643,6 +679,10 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
643 mnt_drop_write(path.mnt); 679 mnt_drop_write(path.mnt);
644 } 680 }
645 path_put(&path); 681 path_put(&path);
682 if (retry_estale(error, lookup_flags)) {
683 lookup_flags |= LOOKUP_REVAL;
684 goto retry;
685 }
646 return error; 686 return error;
647} 687}
648 688