Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig                                   |    6
-rw-r--r--  fs/btrfs/tree-log.c                          |    3
-rw-r--r--  fs/f2fs/checkpoint.c                         |   97
-rw-r--r--  fs/f2fs/data.c                               |   69
-rw-r--r--  fs/f2fs/debug.c                              |   20
-rw-r--r--  fs/f2fs/dir.c                                |   19
-rw-r--r--  fs/f2fs/f2fs.h                               |  163
-rw-r--r--  fs/f2fs/file.c                               |  257
-rw-r--r--  fs/f2fs/gc.c                                 |   26
-rw-r--r--  fs/f2fs/inline.c                             |   20
-rw-r--r--  fs/f2fs/inode.c                              |   37
-rw-r--r--  fs/f2fs/namei.c                              |   53
-rw-r--r--  fs/f2fs/node.c                               |  460
-rw-r--r--  fs/f2fs/node.h                               |   60
-rw-r--r--  fs/f2fs/recovery.c                           |  191
-rw-r--r--  fs/f2fs/segment.c                            |  520
-rw-r--r--  fs/f2fs/segment.h                            |  160
-rw-r--r--  fs/f2fs/super.c                              |   47
-rw-r--r--  fs/f2fs/xattr.c                              |    8
-rw-r--r--  fs/lockd/Makefile                            |    3
-rw-r--r--  fs/lockd/mon.c                               |    6
-rw-r--r--  fs/lockd/netns.h                             |    1
-rw-r--r--  fs/lockd/procfs.c                            |   92
-rw-r--r--  fs/lockd/procfs.h                            |   28
-rw-r--r--  fs/lockd/svc.c                               |   16
-rw-r--r--  fs/nfs/blocklayout/Makefile                  |    3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c             | 1386
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h             |  213
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c          |  384
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c           |  108
-rw-r--r--  fs/nfs/blocklayout/dev.c                     |  363
-rw-r--r--  fs/nfs/blocklayout/extent_tree.c             |  602
-rw-r--r--  fs/nfs/blocklayout/extents.c                 |  908
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c              |  285
-rw-r--r--  fs/nfs/callback.c                            |    4
-rw-r--r--  fs/nfs/callback_proc.c                       |   23
-rw-r--r--  fs/nfs/client.c                              |    6
-rw-r--r--  fs/nfs/direct.c                              |   14
-rw-r--r--  fs/nfs/file.c                                |   52
-rw-r--r--  fs/nfs/filelayout/filelayout.c               |   34
-rw-r--r--  fs/nfs/filelayout/filelayout.h               |    7
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c            |  108
-rw-r--r--  fs/nfs/fscache-index.c                       |    3
-rw-r--r--  fs/nfs/inode.c                               |    4
-rw-r--r--  fs/nfs/internal.h                            |    7
-rw-r--r--  fs/nfs/nfs3_fs.h                             |   34
-rw-r--r--  fs/nfs/nfs3acl.c                             |    1
-rw-r--r--  fs/nfs/nfs3client.c                          |    1
-rw-r--r--  fs/nfs/nfs3proc.c                            |    1
-rw-r--r--  fs/nfs/nfs3super.c                           |    1
-rw-r--r--  fs/nfs/nfs4proc.c                            |  138
-rw-r--r--  fs/nfs/nfs4renewd.c                          |   12
-rw-r--r--  fs/nfs/nfs4state.c                           |   18
-rw-r--r--  fs/nfs/nfs4xdr.c                             |  179
-rw-r--r--  fs/nfs/objlayout/objio_osd.c                 |  113
-rw-r--r--  fs/nfs/objlayout/objlayout.c                 |   70
-rw-r--r--  fs/nfs/objlayout/objlayout.h                 |    5
-rw-r--r--  fs/nfs/pagelist.c                            |    8
-rw-r--r--  fs/nfs/pnfs.c                                |  105
-rw-r--r--  fs/nfs/pnfs.h                                |   50
-rw-r--r--  fs/nfs/pnfs_dev.c                            |  150
-rw-r--r--  fs/nfs/super.c                               |   11
-rw-r--r--  fs/nfs/write.c                               |  150
-rw-r--r--  fs/nfs_common/Makefile                       |    3
-rw-r--r--  fs/nfs_common/grace.c (renamed from fs/lockd/grace.c) | 68
-rw-r--r--  fs/nfsd/Kconfig                              |    4
-rw-r--r--  fs/nfsd/cache.h                              |    1
-rw-r--r--  fs/nfsd/export.c                             |    1
-rw-r--r--  fs/nfsd/nfs3proc.c                           |   13
-rw-r--r--  fs/nfsd/nfs4callback.c                       |  144
-rw-r--r--  fs/nfsd/nfs4idmap.c                          |   20
-rw-r--r--  fs/nfsd/nfs4proc.c                           |   49
-rw-r--r--  fs/nfsd/nfs4recover.c                        |  205
-rw-r--r--  fs/nfsd/nfs4state.c                          |  115
-rw-r--r--  fs/nfsd/nfs4xdr.c                            |   75
-rw-r--r--  fs/nfsd/nfscache.c                           |  214
-rw-r--r--  fs/nfsd/nfsctl.c                             |   45
-rw-r--r--  fs/nfsd/nfsd.h                               |    2
-rw-r--r--  fs/nfsd/nfsfh.c                              |    6
-rw-r--r--  fs/nfsd/state.h                              |   31
-rw-r--r--  fs/nfsd/vfs.c                                |   37
-rw-r--r--  fs/nfsd/xdr4.h                               |   14
-rw-r--r--  fs/stack.c                                   |    2
-rw-r--r--  fs/timerfd.c                                 |    3
84 files changed, 4664 insertions(+), 4311 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..db5dc1598716 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS
 source "fs/nfs/Kconfig"
 source "fs/nfsd/Kconfig"
 
+config GRACE_PERIOD
+	tristate
+
 config LOCKD
 	tristate
 	depends on FILE_LOCKING
+	select GRACE_PERIOD
 
 config LOCKD_V4
 	bool
@@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT
 
 config NFS_COMMON
 	bool
-	depends on NFSD || NFS_FS
+	depends on NFSD || NFS_FS || LOCKD
 	default y
 
 source "net/sunrpc/Kconfig"
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1d1ba083ca6e..d0262ceb85e1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3994,7 +3994,8 @@ again:
 		if (ret < 0) {
 			err = ret;
 			goto out_unlock;
-		} if (ret) {
+		}
+		if (ret) {
 			ins_nr = 0;
 			btrfs_release_path(path);
 			continue;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index ec3b7a5381fa..dd10a031c052 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,7 +72,22 @@ out:
 	return page;
 }
 
-static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+	bool readahead = false;
+	struct page *page;
+
+	page = find_get_page(META_MAPPING(sbi), index);
+	if (!page || (page && !PageUptodate(page)))
+		readahead = true;
+	f2fs_put_page(page, 0);
+
+	if (readahead)
+		ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+	return get_meta_page(sbi, index);
+}
+
+static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
 {
 	switch (type) {
 	case META_NAT:
@@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
 	case META_SSA:
 	case META_CP:
 		return 0;
+	case META_POR:
+		return MAX_BLKADDR(sbi);
 	default:
 		BUG();
 	}
@@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
 /*
  * Readahead CP/NAT/SIT/SSA pages
  */
-int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
 {
 	block_t prev_blk_addr = 0;
 	struct page *page;
-	int blkno = start;
-	int max_blks = get_max_meta_blks(sbi, type);
+	block_t blkno = start;
+	block_t max_blks = get_max_meta_blks(sbi, type);
 
 	struct f2fs_io_info fio = {
 		.type = META,
@@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
 			break;
 		case META_SSA:
 		case META_CP:
-			/* get ssa/cp block addr */
+		case META_POR:
+			if (unlikely(blkno >= max_blks))
+				goto out;
+			if (unlikely(blkno < SEG0_BLKADDR(sbi)))
+				goto out;
 			blk_addr = blkno;
 			break;
 		default:
@@ -151,8 +172,7 @@ out:
 static int f2fs_write_meta_page(struct page *page,
 				struct writeback_control *wbc)
 {
-	struct inode *inode = page->mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_P_SB(page);
 
 	trace_f2fs_writepage(page, META);
 
@@ -177,7 +197,7 @@ redirty_out:
 static int f2fs_write_meta_pages(struct address_space *mapping,
 				struct writeback_control *wbc)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
 	long diff, written;
 
 	trace_f2fs_writepages(mapping->host, wbc, META);
@@ -259,15 +279,12 @@ continue_unlock:
 
 static int f2fs_set_meta_page_dirty(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
-	struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
-
 	trace_f2fs_set_page_dirty(page, META);
 
 	SetPageUptodate(page);
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
-		inc_page_count(sbi, F2FS_DIRTY_META);
+		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
 		return 1;
 	}
 	return 0;
@@ -378,7 +395,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 void release_orphan_inode(struct f2fs_sb_info *sbi)
 {
 	spin_lock(&sbi->ino_lock[ORPHAN_INO]);
-	f2fs_bug_on(sbi->n_orphans == 0);
+	f2fs_bug_on(sbi, sbi->n_orphans == 0);
 	sbi->n_orphans--;
 	spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
 }
@@ -398,7 +415,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
 {
 	struct inode *inode = f2fs_iget(sbi->sb, ino);
-	f2fs_bug_on(IS_ERR(inode));
+	f2fs_bug_on(sbi, IS_ERR(inode));
 	clear_nlink(inode);
 
 	/* truncate all the data during iput */
@@ -459,7 +476,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
 	list_for_each_entry(orphan, head, list) {
 		if (!page) {
 			page = find_get_page(META_MAPPING(sbi), start_blk++);
-			f2fs_bug_on(!page);
+			f2fs_bug_on(sbi, !page);
 			orphan_blk =
 				(struct f2fs_orphan_block *)page_address(page);
 			memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -619,7 +636,7 @@ fail_no_cp:
 
 static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
 	if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
 		return -EEXIST;
@@ -631,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 	return 0;
 }
 
-void set_dirty_dir_page(struct inode *inode, struct page *page)
+void update_dirty_page(struct inode *inode, struct page *page)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dir_inode_entry *new;
 	int ret = 0;
 
-	if (!S_ISDIR(inode->i_mode))
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
 		return;
 
+	if (!S_ISDIR(inode->i_mode)) {
+		inode_inc_dirty_pages(inode);
+		goto out;
+	}
+
 	new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	new->inode = inode;
 	INIT_LIST_HEAD(&new->list);
 
 	spin_lock(&sbi->dir_inode_lock);
 	ret = __add_dirty_inode(inode, new);
-	inode_inc_dirty_dents(inode);
-	SetPagePrivate(page);
+	inode_inc_dirty_pages(inode);
 	spin_unlock(&sbi->dir_inode_lock);
 
 	if (ret)
 		kmem_cache_free(inode_entry_slab, new);
+out:
+	SetPagePrivate(page);
 }
 
 void add_dirty_dir_inode(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dir_inode_entry *new =
 			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	int ret = 0;
@@ -674,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode)
 
 void remove_dirty_dir_inode(struct inode *inode)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct dir_inode_entry *entry;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
 	spin_lock(&sbi->dir_inode_lock);
-	if (get_dirty_dents(inode) ||
+	if (get_dirty_pages(inode) ||
 			!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
 		spin_unlock(&sbi->dir_inode_lock);
 		return;
@@ -802,11 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
 	finish_wait(&sbi->cp_wait, &wait);
 }
 
-static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
-	nid_t last_nid = 0;
+	struct f2fs_nm_info *nm_i = NM_I(sbi);
+	nid_t last_nid = nm_i->next_scan_nid;
 	block_t start_blk;
 	struct page *cp_page;
 	unsigned int data_sum_blocks, orphan_blocks;
@@ -869,7 +893,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
 			orphan_blocks);
 
-	if (is_umount) {
+	if (cpc->reason == CP_UMOUNT) {
 		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
 		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
 				cp_payload_blks + data_sum_blocks +
@@ -886,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 	else
 		clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
 
+	if (sbi->need_fsck)
+		set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+
 	/* update SIT/NAT bitmap */
 	get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
 	get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
@@ -920,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 
 	write_data_summaries(sbi, start_blk);
 	start_blk += data_sum_blocks;
-	if (is_umount) {
+	if (cpc->reason == CP_UMOUNT) {
 		write_node_summaries(sbi, start_blk);
 		start_blk += NR_CURSEG_NODE_TYPE;
 	}
@@ -960,23 +987,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 /*
  * We guarantee that this checkpoint procedure will not fail.
  */
-void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
+void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	unsigned long long ckpt_ver;
 
-	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
 
 	mutex_lock(&sbi->cp_mutex);
 
-	if (!sbi->s_dirty)
+	if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto out;
 	if (block_operations(sbi))
 		goto out;
 
-	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
 
 	f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	f2fs_submit_merged_bio(sbi, NODE, WRITE);
@@ -992,16 +1019,16 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
 
 	/* write cached NAT/SIT entries to NAT/SIT area */
 	flush_nat_entries(sbi);
-	flush_sit_entries(sbi);
+	flush_sit_entries(sbi, cpc);
 
 	/* unlock all the fs_lock[] in do_checkpoint() */
-	do_checkpoint(sbi, is_umount);
+	do_checkpoint(sbi, cpc);
 
 	unblock_operations(sbi);
 	stat_inc_cp_count(sbi->stat_info);
 out:
 	mutex_unlock(&sbi->cp_mutex);
-	trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
 }
 
 void init_ino_entry_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 76de83e25a89..8e58c4cc2cb9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 	bio = bio_alloc(GFP_NOIO, npages);
 
 	bio->bi_bdev = sbi->sb->s_bdev;
-	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+	bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
 	bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
 	bio->bi_private = sbi;
 
@@ -193,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
 		__submit_merged_bio(io);
 alloc_new:
 	if (io->bio == NULL) {
-		int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+		int bio_blocks = MAX_BIO_BLOCKS(sbi);
 
 		io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
 		io->fio = *fio;
@@ -236,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
 
 int reserve_new_block(struct dnode_of_data *dn)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return -EPERM;
@@ -258,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
 	int err;
 
 	/* if inode_page exists, index should be zero */
-	f2fs_bug_on(!need_put && index);
+	f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
 
 	err = get_dnode_of_data(dn, index, ALLOC_NODE);
 	if (err)
@@ -321,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
 	block_t start_blkaddr, end_blkaddr;
 	int need_update = true;
 
-	f2fs_bug_on(blk_addr == NEW_ADDR);
+	f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
 							dn->ofs_in_node;
 
@@ -396,7 +396,6 @@ end_update:
 
 struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
 	struct page *page;
@@ -429,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 		return page;
 	}
 
-	err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
 					sync ? READ_SYNC : READA);
 	if (err)
 		return ERR_PTR(err);
@@ -451,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
  */
 struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	struct dnode_of_data dn;
 	struct page *page;
@@ -490,7 +488,7 @@ repeat:
 		return page;
 	}
 
-	err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+					dn.data_blkaddr, READ_SYNC);
 	if (err)
 		return ERR_PTR(err);
 
@@ -517,7 +516,6 @@ repeat:
 struct page *get_new_data_page(struct inode *inode,
 		struct page *ipage, pgoff_t index, bool new_i_size)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	struct dnode_of_data dn;
@@ -541,8 +539,8 @@ repeat:
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 		SetPageUptodate(page);
 	} else {
-		err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
-							READ_SYNC);
+		err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
+						dn.data_blkaddr, READ_SYNC);
 		if (err)
 			goto put_err;
 
@@ -573,10 +571,12 @@ put_err:
 
 static int __allocate_data_block(struct dnode_of_data *dn)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
+	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	struct f2fs_summary sum;
 	block_t new_blkaddr;
 	struct node_info ni;
+	pgoff_t fofs;
 	int type;
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
@@ -599,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	update_extent_cache(new_blkaddr, dn);
 	clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
 
+	/* update i_size */
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
+		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
+
 	dn->data_blkaddr = new_blkaddr;
 	return 0;
 }
@@ -614,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 static int __get_data_block(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create, bool fiemap)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	unsigned int blkbits = inode->i_sb->s_blocksize_bits;
 	unsigned maxblocks = bh_result->b_size >> blkbits;
 	struct dnode_of_data dn;
@@ -630,8 +635,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto out;
 
 	if (create) {
-		f2fs_balance_fs(sbi);
-		f2fs_lock_op(sbi);
+		f2fs_balance_fs(F2FS_I_SB(inode));
+		f2fs_lock_op(F2FS_I_SB(inode));
 	}
 
 	/* When reading holes, we need its node page */
@@ -707,7 +712,7 @@ put_out:
 	f2fs_put_dnode(&dn);
 unlock_out:
 	if (create)
-		f2fs_unlock_op(sbi);
+		f2fs_unlock_op(F2FS_I_SB(inode));
 out:
 	trace_f2fs_get_data_block(inode, iblock, bh_result, err);
 	return err;
@@ -804,7 +809,7 @@ static int f2fs_write_data_page(struct page *page,
 		struct writeback_control *wbc)
 {
 	struct inode *inode = page->mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	loff_t i_size = i_size_read(inode);
 	const pgoff_t end_index = ((unsigned long long) i_size)
 							>> PAGE_CACHE_SHIFT;
@@ -846,7 +851,7 @@ write:
 	if (unlikely(f2fs_cp_error(sbi))) {
 		SetPageError(page);
 		unlock_page(page);
-		return 0;
+		goto out;
 	}
 
 	if (!wbc->for_reclaim)
@@ -866,7 +871,7 @@ done:
 
 	clear_cold_data(page);
 out:
-	inode_dec_dirty_dents(inode);
+	inode_dec_dirty_pages(inode);
 	unlock_page(page);
 	if (need_balance_fs)
 		f2fs_balance_fs(sbi);
@@ -892,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 		struct writeback_control *wbc)
 {
 	struct inode *inode = mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	bool locked = false;
 	int ret;
 	long diff;
@@ -904,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 		return 0;
 
 	if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
-			get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) &&
+			get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
 			available_free_memory(sbi, DIRTY_DENTS))
 		goto skip_write;
 
@@ -926,7 +931,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 	return ret;
 
 skip_write:
-	wbc->pages_skipped += get_dirty_dents(inode);
+	wbc->pages_skipped += get_dirty_pages(inode);
 	return 0;
 }
 
@@ -945,7 +950,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 		struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *page;
 	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
 	struct dnode_of_data dn;
@@ -1047,7 +1052,10 @@ static int f2fs_write_end(struct file *file,
 
 	trace_f2fs_write_end(inode, pos, len, copied);
 
-	set_page_dirty(page);
+	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+		register_inmem_page(inode, page);
+	else
+		set_page_dirty(page);
 
 	if (pos + copied > i_size_read(inode)) {
 		i_size_write(inode, pos + copied);
@@ -1092,9 +1100,6 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 	if (check_direct_IO(inode, rw, iter, offset))
 		return 0;
 
-	/* clear fsync mark to recover these blocks */
-	fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
-
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
 	err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
@@ -1110,8 +1115,12 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
 		unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
+
+	if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+		return;
+
 	if (PageDirty(page))
-		inode_dec_dirty_dents(inode);
+		inode_dec_dirty_pages(inode);
 	ClearPagePrivate(page);
 }
 
@@ -1133,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
 
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
-		set_dirty_dir_page(inode, page);
+		update_dirty_page(inode, page);
 		return 1;
 	}
 	return 0;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fecebdbfd781..0a91ab813a9e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 	total_vblocks = 0;
 	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
 	hblks_per_sec = blks_per_sec / 2;
-	for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
+	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
 		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
 		dist = abs(vblocks - hblks_per_sec);
 		bimodal += dist * dist;
@@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 			ndirty++;
 		}
 	}
-	dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
+	dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
 	si->bimodal = bimodal / dist;
 	if (si->dirty_count)
 		si->avg_vblocks = total_vblocks / ndirty;
@@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 
 	/* build sit */
 	si->base_mem += sizeof(struct sit_info);
-	si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry);
-	si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi);
+	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
+	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
 	if (sbi->segs_per_sec > 1)
-		si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry);
+		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
 	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
 
 	/* build free segmap */
 	si->base_mem += sizeof(struct free_segmap_info);
-	si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
 
 	/* build curseg */
 	si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -149,8 +149,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 
 	/* build dirty segmap */
 	si->base_mem += sizeof(struct dirty_seglist_info);
-	si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi));
-	si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi));
+	si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
+	si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
 
 	/* build nm */
 	si->base_mem += sizeof(struct f2fs_nm_info);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 155fb056b7f1..b54f87149c09 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -126,7 +126,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 		 * For the most part, it should be a bug when name_len is zero.
 		 * We stop here for figuring out where the bugs has occurred.
 		 */
-		f2fs_bug_on(!de->name_len);
+		f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
 
 		bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
 	}
@@ -151,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 	bool room = false;
 	int max_slots = 0;
 
-	f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
+	f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
 
 	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
@@ -284,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
 
 int update_dent_inode(struct inode *inode, const struct qstr *name)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 	struct page *page;
 
-	page = get_node_page(sbi, inode->i_ino);
+	page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
@@ -337,7 +336,6 @@ static int make_empty_dir(struct inode *inode,
 static struct page *init_inode_metadata(struct inode *inode,
 		struct inode *dir, const struct qstr *name)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	struct page *page;
 	int err;
 
@@ -360,7 +358,7 @@ static struct page *init_inode_metadata(struct inode *inode,
 		if (err)
 			goto put_error;
 	} else {
-		page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
+		page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
 		if (IS_ERR(page))
 			return page;
 
@@ -381,7 +379,7 @@ static struct page *init_inode_metadata(struct inode *inode,
 		 * we should remove this inode from orphan list.
 		 */
 		if (inode->i_nlink == 0)
-			remove_orphan_inode(sbi, inode->i_ino);
+			remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
 		inc_nlink(inode);
 	}
 	return page;
@@ -571,8 +569,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 {
 	struct f2fs_dentry_block *dentry_blk;
 	unsigned int bit_pos;
-	struct address_space *mapping = page->mapping;
-	struct inode *dir = mapping->host;
+	struct inode *dir = page->mapping->host;
 	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
 	int i;
 
@@ -594,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
 
 	if (inode) {
-		struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+		struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 
 		down_write(&F2FS_I(inode)->i_sem);
 
@@ -621,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 		truncate_hole(dir, page->index, page->index + 1);
 		clear_page_dirty_for_io(page);
 		ClearPageUptodate(page);
-		inode_dec_dirty_dents(dir);
+		inode_dec_dirty_pages(dir);
 	}
 	f2fs_put_page(page, 1);
 }
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e921242186f6..8171e80b2ee9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,16 @@
 #include <linux/sched.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
-#define f2fs_bug_on(condition) BUG_ON(condition)
+#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
 #define f2fs_down_write(x, y) down_write_nest_lock(x, y)
 #else
-#define f2fs_bug_on(condition) WARN_ON(condition)
+#define f2fs_bug_on(sbi, condition) \
+	do { \
+		if (unlikely(condition)) { \
+			WARN_ON(1); \
+			sbi->need_fsck = true; \
+		} \
+	} while (0)
 #define f2fs_down_write(x, y) down_write(x)
 #endif
 
@@ -90,6 +96,20 @@ enum {
 	SIT_BITMAP
 };
 
+enum {
+	CP_UMOUNT,
+	CP_SYNC,
+	CP_DISCARD,
+};
+
+struct cp_control {
+	int reason;
+	__u64 trim_start;
+	__u64 trim_end;
+	__u64 trim_minlen;
+	__u64 trimmed;
+};
+
 /*
  * For CP/NAT/SIT/SSA readahead
  */
@@ -97,7 +117,8 @@ enum {
 	META_CP,
 	META_NAT,
 	META_SIT,
-	META_SSA
+	META_SSA,
+	META_POR,
 };
 
 /* for the list of ino */
@@ -130,7 +151,9 @@ struct discard_entry {
 struct fsync_inode_entry {
 	struct list_head list;	/* list head */
 	struct inode *inode;	/* vfs inode pointer */
-	block_t blkaddr;	/* block address locating the last inode */
+	block_t blkaddr;	/* block address locating the last fsync */
+	block_t last_dentry;	/* block address locating the last dentry */
+	block_t last_inode;	/* block address locating the last inode */
 };
 
 #define nats_in_cursum(sum)		(le16_to_cpu(sum->n_nats))
@@ -141,6 +164,9 @@ struct fsync_inode_entry {
 #define sit_in_journal(sum, i)		(sum->sit_j.entries[i].se)
 #define segno_in_journal(sum, i)	(sum->sit_j.entries[i].segno)
 
+#define MAX_NAT_JENTRIES(sum)	(NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
+#define MAX_SIT_JENTRIES(sum)	(SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
+
 static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
 {
 	int before = nats_in_cursum(rs);
@@ -155,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
 	return before;
 }
 
+static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
+								int type)
+{
+	if (type == NAT_JOURNAL)
+		return size <= MAX_NAT_JENTRIES(sum);
+	return size <= MAX_SIT_JENTRIES(sum);
+}
+
 /*
  * ioctl commands
  */
 #define F2FS_IOC_GETFLAGS		FS_IOC_GETFLAGS
 #define F2FS_IOC_SETFLAGS		FS_IOC_SETFLAGS
+
+#define F2FS_IOCTL_MAGIC		0xf5
+#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
+#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
+#define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -222,13 +261,16 @@ struct f2fs_inode_info {
 	/* Use below internally in f2fs*/
 	unsigned long flags;		/* use to pass per-file flags */
 	struct rw_semaphore i_sem;	/* protect fi info */
-	atomic_t dirty_dents;		/* # of dirty dentry pages */
+	atomic_t dirty_pages;		/* # of dirty pages */
 	f2fs_hash_t chash;		/* hash value of given file name */
 	unsigned int clevel;		/* maximum level of given file name */
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct extent_info ext;		/* in-memory extent cache entry */
 	struct dir_inode_entry *dirty_dir;	/* the pointer of dirty dir */
+
+	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
+	struct mutex inmem_lock;	/* lock for inmemory pages */
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -260,11 +302,10 @@ struct f2fs_nm_info {
 
 	/* NAT cache management */
 	struct radix_tree_root nat_root;/* root of the nat entry cache */
+	struct radix_tree_root nat_set_root;/* root of the nat set cache */
 	rwlock_t nat_tree_lock;		/* protect nat_tree_lock */
-	unsigned int nat_cnt;		/* the # of cached nat entries */
 	struct list_head nat_entries;	/* cached nat entry list (clean) */
-	struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
-	struct list_head nat_entry_set;	/* nat entry set list */
+	unsigned int nat_cnt;		/* the # of cached nat entries */
 	unsigned int dirty_nat_cnt;	/* total num of nat entries in set */
 
 	/* free node ids management */
@@ -332,18 +373,16 @@ enum {
 };
 
 struct flush_cmd {
-	struct flush_cmd *next;
 	struct completion wait;
+	struct llist_node llnode;
 	int ret;
 };
 
 struct flush_cmd_control {
 	struct task_struct *f2fs_issue_flush;	/* flush thread */
 	wait_queue_head_t flush_wait_queue;	/* waiting queue for wake-up */
-	struct flush_cmd *issue_list;		/* list for command issue */
-	struct flush_cmd *dispatch_list;	/* list for command dispatch */
-	spinlock_t issue_lock;			/* for issue list lock */
-	struct flush_cmd *issue_tail;		/* list tail of issue list */
+	struct llist_head issue_list;		/* list for command issue */
+	struct llist_node *dispatch_list;	/* list for command dispatch */
 };
 
 struct f2fs_sm_info {
@@ -369,8 +408,11 @@ struct f2fs_sm_info {
 	int nr_discards;			/* # of discards in the list */
 	int max_discards;			/* max. discards to be issued */
 
+	struct list_head sit_entry_set;	/* sit entry set list */
+
 	unsigned int ipu_policy;	/* in-place-update policy */
 	unsigned int min_ipu_util;	/* in-place-update threshold */
+	unsigned int min_fsync_blocks;	/* threshold for fsync */
 
 	/* for flush command control */
 	struct flush_cmd_control *cmd_control_info;
@@ -434,6 +476,7 @@ struct f2fs_sb_info {
 	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
 	struct f2fs_super_block *raw_super;	/* raw super block pointer */
 	int s_dirty;				/* dirty flag for checkpoint */
+	bool need_fsck;				/* need fsck.f2fs to fix */
 
 	/* for node-related operations */
 	struct f2fs_nm_info *nm_info;		/* node manager */
@@ -539,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
 	return sb->s_fs_info;
 }
 
+static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
+{
+	return F2FS_SB(inode->i_sb);
+}
+
+static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
+{
+	return F2FS_I_SB(mapping->host);
+}
+
+static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
+{
+	return F2FS_M_SB(page->mapping);
+}
+
 static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
 {
 	return (struct f2fs_super_block *)(sbi->raw_super);
@@ -703,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 						blkcnt_t count)
 {
 	spin_lock(&sbi->stat_lock);
-	f2fs_bug_on(sbi->total_valid_block_count < (block_t) count);
-	f2fs_bug_on(inode->i_blocks < count);
+	f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
+	f2fs_bug_on(sbi, inode->i_blocks < count);
 	inode->i_blocks -= count;
 	sbi->total_valid_block_count -= (block_t)count;
 	spin_unlock(&sbi->stat_lock);
@@ -716,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 	F2FS_SET_SB_DIRT(sbi);
 }
 
-static inline void inode_inc_dirty_dents(struct inode *inode)
+static inline void inode_inc_dirty_pages(struct inode *inode)
 {
-	inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
-	atomic_inc(&F2FS_I(inode)->dirty_dents);
+	atomic_inc(&F2FS_I(inode)->dirty_pages);
+	if (S_ISDIR(inode->i_mode))
+		inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
 }
 
 static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -727,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
 	atomic_dec(&sbi->nr_pages[count_type]);
 }
 
-static inline void inode_dec_dirty_dents(struct inode *inode)
+static inline void inode_dec_dirty_pages(struct inode *inode)
 {
-	if (!S_ISDIR(inode->i_mode))
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
 		return;
 
-	dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
-	atomic_dec(&F2FS_I(inode)->dirty_dents);
+	atomic_dec(&F2FS_I(inode)->dirty_pages);
+
+	if (S_ISDIR(inode->i_mode))
+		dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
 }
 
 static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -741,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
 	return atomic_read(&sbi->nr_pages[count_type]);
 }
 
-static inline int get_dirty_dents(struct inode *inode)
+static inline int get_dirty_pages(struct inode *inode)
 {
-	return atomic_read(&F2FS_I(inode)->dirty_dents);
+	return atomic_read(&F2FS_I(inode)->dirty_pages);
 }
 
 static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -848,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
 {
 	spin_lock(&sbi->stat_lock);
 
-	f2fs_bug_on(!sbi->total_valid_block_count);
-	f2fs_bug_on(!sbi->total_valid_node_count);
-	f2fs_bug_on(!inode->i_blocks);
+	f2fs_bug_on(sbi, !sbi->total_valid_block_count);
+	f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+	f2fs_bug_on(sbi, !inode->i_blocks);
 
 	inode->i_blocks--;
 	sbi->total_valid_node_count--;
@@ -867,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
 static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
 {
 	spin_lock(&sbi->stat_lock);
-	f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count);
+	f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
 	sbi->total_valid_inode_count++;
 	spin_unlock(&sbi->stat_lock);
 }
@@ -875,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
 static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
 {
 	spin_lock(&sbi->stat_lock);
-	f2fs_bug_on(!sbi->total_valid_inode_count);
+	f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
 	sbi->total_valid_inode_count--;
 	spin_unlock(&sbi->stat_lock);
 }
@@ -891,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
 		return;
 
 	if (unlock) {
-		f2fs_bug_on(!PageLocked(page));
+		f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
 		unlock_page(page);
 	}
 	page_cache_release(page);
@@ -998,7 +1059,9 @@ enum {
 	FI_INLINE_DATA,		/* used for inline data*/
 	FI_APPEND_WRITE,	/* inode has appended data */
 	FI_UPDATE_WRITE,	/* inode has in-place-update data */
-	FI_NEED_IPU,		/* used fo ipu for fdatasync */
+	FI_NEED_IPU,		/* used for ipu per file */
+	FI_ATOMIC_FILE,		/* indicate atomic file */
+	FI_VOLATILE_FILE,	/* indicate volatile file */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1085,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
 }
 
+static inline bool f2fs_is_atomic_file(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
+}
+
+static inline bool f2fs_is_volatile_file(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
+}
+
 static inline void *inline_data_addr(struct page *page)
 {
 	struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1141,6 +1214,7 @@ void update_inode(struct inode *, struct page *);
 void update_inode_page(struct inode *);
 int f2fs_write_inode(struct inode *, struct writeback_control *);
 void f2fs_evict_inode(struct inode *);
+void handle_failed_inode(struct inode *);
 
 /*
  * namei.c
@@ -1188,9 +1262,9 @@ struct dnode_of_data;
 struct node_info;
 
 bool available_free_memory(struct f2fs_sb_info *, int);
-int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
-bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
-void fsync_mark_clear(struct f2fs_sb_info *, nid_t);
+bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
+bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
 void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
 int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
 int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1221,6 +1295,8 @@ void destroy_node_manager_caches(void);
 /*
  * segment.c
 */
+void register_inmem_page(struct inode *, struct page *);
+void commit_inmem_pages(struct inode *, bool);
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
 int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1229,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
 void invalidate_blocks(struct f2fs_sb_info *, block_t);
 void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
+void release_discard_addrs(struct f2fs_sb_info *);
 void discard_next_dnode(struct f2fs_sb_info *, block_t);
 int npages_for_summary_flush(struct f2fs_sb_info *);
 void allocate_new_segments(struct f2fs_sb_info *);
+int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
 struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
 void write_meta_page(struct f2fs_sb_info *, struct page *);
 void write_node_page(struct f2fs_sb_info *, struct page *,
@@ -1248,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t);
 void write_node_summaries(struct f2fs_sb_info *, block_t);
 int lookup_journal_in_cursum(struct f2fs_summary_block *,
 					int, unsigned int, int);
-void flush_sit_entries(struct f2fs_sb_info *);
+void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
 int build_segment_manager(struct f2fs_sb_info *);
 void destroy_segment_manager(struct f2fs_sb_info *);
 int __init create_segment_manager_caches(void);
@@ -1259,7 +1337,8 @@ void destroy_segment_manager_caches(void);
 */
 struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
 struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
-int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
+struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
@@ -1271,11 +1350,11 @@ void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1271void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1350void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1272void recover_orphan_inodes(struct f2fs_sb_info *); 1351void recover_orphan_inodes(struct f2fs_sb_info *);
1273int get_valid_checkpoint(struct f2fs_sb_info *); 1352int get_valid_checkpoint(struct f2fs_sb_info *);
1274void set_dirty_dir_page(struct inode *, struct page *); 1353void update_dirty_page(struct inode *, struct page *);
1275void add_dirty_dir_inode(struct inode *); 1354void add_dirty_dir_inode(struct inode *);
1276void remove_dirty_dir_inode(struct inode *); 1355void remove_dirty_dir_inode(struct inode *);
1277void sync_dirty_dir_inodes(struct f2fs_sb_info *); 1356void sync_dirty_dir_inodes(struct f2fs_sb_info *);
1278void write_checkpoint(struct f2fs_sb_info *, bool); 1357void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
1279void init_ino_entry_info(struct f2fs_sb_info *); 1358void init_ino_entry_info(struct f2fs_sb_info *);
1280int __init create_checkpoint_caches(void); 1359int __init create_checkpoint_caches(void);
1281void destroy_checkpoint_caches(void); 1360void destroy_checkpoint_caches(void);
@@ -1359,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1359#define stat_inc_inline_inode(inode) \ 1438#define stat_inc_inline_inode(inode) \
1360 do { \ 1439 do { \
1361 if (f2fs_has_inline_data(inode)) \ 1440 if (f2fs_has_inline_data(inode)) \
1362 ((F2FS_SB(inode->i_sb))->inline_inode++); \ 1441 ((F2FS_I_SB(inode))->inline_inode++); \
1363 } while (0) 1442 } while (0)
1364#define stat_dec_inline_inode(inode) \ 1443#define stat_dec_inline_inode(inode) \
1365 do { \ 1444 do { \
1366 if (f2fs_has_inline_data(inode)) \ 1445 if (f2fs_has_inline_data(inode)) \
1367 ((F2FS_SB(inode->i_sb))->inline_inode--); \ 1446 ((F2FS_I_SB(inode))->inline_inode--); \
1368 } while (0) 1447 } while (0)
1369 1448
1370#define stat_inc_seg_type(sbi, curseg) \ 1449#define stat_inc_seg_type(sbi, curseg) \
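
The stat macros above, and the many F2FS_SB(inode->i_sb) conversions in the hunks below, rely on a trio of accessors this series adds to f2fs.h: F2FS_I_SB() for an inode, F2FS_M_SB() for an address_space, and F2FS_P_SB() for a page. A sketch consistent with their call sites (the definitions themselves are in a part of f2fs.h not shown here):

static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
{
	return F2FS_SB(inode->i_sb);
}

static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
{
	return F2FS_I_SB(mapping->host);
}

static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
{
	return F2FS_M_SB(page->mapping);
}

Each helper chains to the next, so callers can start from whatever object they hold instead of dereferencing through i_sb by hand.
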
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 060aee65aee8..8e68bb64f835 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
33{ 33{
34 struct page *page = vmf->page; 34 struct page *page = vmf->page;
35 struct inode *inode = file_inode(vma->vm_file); 35 struct inode *inode = file_inode(vma->vm_file);
36 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 36 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
37 struct dnode_of_data dn; 37 struct dnode_of_data dn;
38 int err; 38 int err;
39 39
@@ -117,7 +117,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
117 117
118static inline bool need_do_checkpoint(struct inode *inode) 118static inline bool need_do_checkpoint(struct inode *inode)
119{ 119{
120 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 120 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
121 bool need_cp = false; 121 bool need_cp = false;
122 122
123 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) 123 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
@@ -138,7 +138,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
138{ 138{
139 struct inode *inode = file->f_mapping->host; 139 struct inode *inode = file->f_mapping->host;
140 struct f2fs_inode_info *fi = F2FS_I(inode); 140 struct f2fs_inode_info *fi = F2FS_I(inode);
141 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 141 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
142 nid_t ino = inode->i_ino;
142 int ret = 0; 143 int ret = 0;
143 bool need_cp = false; 144 bool need_cp = false;
144 struct writeback_control wbc = { 145 struct writeback_control wbc = {
@@ -153,12 +154,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
153 trace_f2fs_sync_file_enter(inode); 154 trace_f2fs_sync_file_enter(inode);
154 155
155 /* if fdatasync is triggered, let's do in-place-update */ 156 /* if fdatasync is triggered, let's do in-place-update */
156 if (datasync) 157 if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
157 set_inode_flag(fi, FI_NEED_IPU); 158 set_inode_flag(fi, FI_NEED_IPU);
158
159 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 159 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
160 if (datasync) 160 clear_inode_flag(fi, FI_NEED_IPU);
161 clear_inode_flag(fi, FI_NEED_IPU); 161
162 if (ret) { 162 if (ret) {
163 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 163 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
164 return ret; 164 return ret;
@@ -168,13 +168,22 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
168 * if there is no written data, don't waste time to write recovery info. 168 * if there is no written data, don't waste time to write recovery info.
169 */ 169 */
170 if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && 170 if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
171 !exist_written_data(sbi, inode->i_ino, APPEND_INO)) { 171 !exist_written_data(sbi, ino, APPEND_INO)) {
172 struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
173
174 /* But we still have to write out any pending inode updates */
175 if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
176 f2fs_put_page(i, 0);
177 goto go_write;
178 }
179 f2fs_put_page(i, 0);
180
172 if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || 181 if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
173 exist_written_data(sbi, inode->i_ino, UPDATE_INO)) 182 exist_written_data(sbi, ino, UPDATE_INO))
174 goto flush_out; 183 goto flush_out;
175 goto out; 184 goto out;
176 } 185 }
177 186go_write:
178 /* guarantee free sections for fsync */ 187 /* guarantee free sections for fsync */
179 f2fs_balance_fs(sbi); 188 f2fs_balance_fs(sbi);
180 189
@@ -207,26 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
207 up_write(&fi->i_sem); 216 up_write(&fi->i_sem);
208 } 217 }
209 } else { 218 } else {
210 /* if there is no written node page, write its inode page */ 219sync_nodes:
211 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { 220 sync_node_pages(sbi, ino, &wbc);
212 if (fsync_mark_done(sbi, inode->i_ino)) 221
213 goto out; 222 if (need_inode_block_update(sbi, ino)) {
214 mark_inode_dirty_sync(inode); 223 mark_inode_dirty_sync(inode);
215 ret = f2fs_write_inode(inode, NULL); 224 ret = f2fs_write_inode(inode, NULL);
216 if (ret) 225 if (ret)
217 goto out; 226 goto out;
227 goto sync_nodes;
218 } 228 }
219 ret = wait_on_node_pages_writeback(sbi, inode->i_ino); 229
230 ret = wait_on_node_pages_writeback(sbi, ino);
220 if (ret) 231 if (ret)
221 goto out; 232 goto out;
222 233
223 /* once recovery info is written, don't need to track this */ 234
224 remove_dirty_inode(sbi, inode->i_ino, APPEND_INO); 235 remove_dirty_inode(sbi, ino, APPEND_INO);
225 clear_inode_flag(fi, FI_APPEND_WRITE); 236 clear_inode_flag(fi, FI_APPEND_WRITE);
226flush_out: 237flush_out:
227 remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO); 238 remove_dirty_inode(sbi, ino, UPDATE_INO);
228 clear_inode_flag(fi, FI_UPDATE_WRITE); 239 clear_inode_flag(fi, FI_UPDATE_WRITE);
229 ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); 240 ret = f2fs_issue_flush(F2FS_I_SB(inode));
230 } 241 }
231out: 242out:
232 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 243 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -353,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
353 maxbytes, i_size_read(inode)); 364 maxbytes, i_size_read(inode));
354 case SEEK_DATA: 365 case SEEK_DATA:
355 case SEEK_HOLE: 366 case SEEK_HOLE:
367 if (offset < 0)
368 return -ENXIO;
356 return f2fs_seek_block(file, offset, whence); 369 return f2fs_seek_block(file, offset, whence);
357 } 370 }
358 371
@@ -369,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
369int truncate_data_blocks_range(struct dnode_of_data *dn, int count) 382int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
370{ 383{
371 int nr_free = 0, ofs = dn->ofs_in_node; 384 int nr_free = 0, ofs = dn->ofs_in_node;
372 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 385 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
373 struct f2fs_node *raw_node; 386 struct f2fs_node *raw_node;
374 __le32 *addr; 387 __le32 *addr;
375 388
@@ -432,7 +445,7 @@ out:
432 445
433int truncate_blocks(struct inode *inode, u64 from, bool lock) 446int truncate_blocks(struct inode *inode, u64 from, bool lock)
434{ 447{
435 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 448 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
436 unsigned int blocksize = inode->i_sb->s_blocksize; 449 unsigned int blocksize = inode->i_sb->s_blocksize;
437 struct dnode_of_data dn; 450 struct dnode_of_data dn;
438 pgoff_t free_from; 451 pgoff_t free_from;
@@ -463,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
463 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 476 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
464 477
465 count -= dn.ofs_in_node; 478 count -= dn.ofs_in_node;
466 f2fs_bug_on(count < 0); 479 f2fs_bug_on(sbi, count < 0);
467 480
468 if (dn.ofs_in_node || IS_INODE(dn.node_page)) { 481 if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
469 truncate_data_blocks_range(&dn, count); 482 truncate_data_blocks_range(&dn, count);
@@ -547,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
547 if (err) 560 if (err)
548 return err; 561 return err;
549 562
550 if ((attr->ia_valid & ATTR_SIZE) && 563 if (attr->ia_valid & ATTR_SIZE) {
551 attr->ia_size != i_size_read(inode)) {
552 err = f2fs_convert_inline_data(inode, attr->ia_size, NULL); 564 err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
553 if (err) 565 if (err)
554 return err; 566 return err;
555 567
556 truncate_setsize(inode, attr->ia_size); 568 if (attr->ia_size != i_size_read(inode)) {
557 f2fs_truncate(inode); 569 truncate_setsize(inode, attr->ia_size);
558 f2fs_balance_fs(F2FS_SB(inode->i_sb)); 570 f2fs_truncate(inode);
571 f2fs_balance_fs(F2FS_I_SB(inode));
572 } else {
573 /*
574 * giving a chance to truncate blocks past EOF which
575 * are fallocated with FALLOC_FL_KEEP_SIZE.
576 */
577 f2fs_truncate(inode);
578 }
559 } 579 }
560 580
561 __setattr_copy(inode, attr); 581 __setattr_copy(inode, attr);
@@ -589,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = {
589static void fill_zero(struct inode *inode, pgoff_t index, 609static void fill_zero(struct inode *inode, pgoff_t index,
590 loff_t start, loff_t len) 610 loff_t start, loff_t len)
591{ 611{
592 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 612 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
593 struct page *page; 613 struct page *page;
594 614
595 if (!len) 615 if (!len)
@@ -638,6 +658,13 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
638 loff_t off_start, off_end; 658 loff_t off_start, off_end;
639 int ret = 0; 659 int ret = 0;
640 660
661 if (!S_ISREG(inode->i_mode))
662 return -EOPNOTSUPP;
663
664 /* skip punching hole beyond i_size */
665 if (offset >= inode->i_size)
666 return ret;
667
641 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); 668 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
642 if (ret) 669 if (ret)
643 return ret; 670 return ret;
@@ -661,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
661 if (pg_start < pg_end) { 688 if (pg_start < pg_end) {
662 struct address_space *mapping = inode->i_mapping; 689 struct address_space *mapping = inode->i_mapping;
663 loff_t blk_start, blk_end; 690 loff_t blk_start, blk_end;
664 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 691 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
665 692
666 f2fs_balance_fs(sbi); 693 f2fs_balance_fs(sbi);
667 694
@@ -682,7 +709,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
682static int expand_inode_data(struct inode *inode, loff_t offset, 709static int expand_inode_data(struct inode *inode, loff_t offset,
683 loff_t len, int mode) 710 loff_t len, int mode)
684{ 711{
685 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 712 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
686 pgoff_t index, pg_start, pg_end; 713 pgoff_t index, pg_start, pg_end;
687 loff_t new_size = i_size_read(inode); 714 loff_t new_size = i_size_read(inode);
688 loff_t off_start, off_end; 715 loff_t off_start, off_end;
@@ -778,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
778 return flags & F2FS_OTHER_FLMASK; 805 return flags & F2FS_OTHER_FLMASK;
779} 806}
780 807
781long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 808static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
809{
810 struct inode *inode = file_inode(filp);
811 struct f2fs_inode_info *fi = F2FS_I(inode);
812 unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
813 return put_user(flags, (int __user *)arg);
814}
815
816static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
782{ 817{
783 struct inode *inode = file_inode(filp); 818 struct inode *inode = file_inode(filp);
784 struct f2fs_inode_info *fi = F2FS_I(inode); 819 struct f2fs_inode_info *fi = F2FS_I(inode);
785 unsigned int flags; 820 unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
821 unsigned int oldflags;
786 int ret; 822 int ret;
787 823
788 switch (cmd) { 824 ret = mnt_want_write_file(filp);
789 case F2FS_IOC_GETFLAGS: 825 if (ret)
790 flags = fi->i_flags & FS_FL_USER_VISIBLE; 826 return ret;
791 return put_user(flags, (int __user *) arg);
792 case F2FS_IOC_SETFLAGS:
793 {
794 unsigned int oldflags;
795 827
796 ret = mnt_want_write_file(filp); 828 if (!inode_owner_or_capable(inode)) {
797 if (ret) 829 ret = -EACCES;
798 return ret; 830 goto out;
831 }
799 832
800 if (!inode_owner_or_capable(inode)) { 833 if (get_user(flags, (int __user *)arg)) {
801 ret = -EACCES; 834 ret = -EFAULT;
802 goto out; 835 goto out;
803 } 836 }
837
838 flags = f2fs_mask_flags(inode->i_mode, flags);
839
840 mutex_lock(&inode->i_mutex);
804 841
805 if (get_user(flags, (int __user *) arg)) { 842 oldflags = fi->i_flags;
806 ret = -EFAULT; 843
844 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
845 if (!capable(CAP_LINUX_IMMUTABLE)) {
846 mutex_unlock(&inode->i_mutex);
847 ret = -EPERM;
807 goto out; 848 goto out;
808 } 849 }
850 }
809 851
810 flags = f2fs_mask_flags(inode->i_mode, flags); 852 flags = flags & FS_FL_USER_MODIFIABLE;
853 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
854 fi->i_flags = flags;
855 mutex_unlock(&inode->i_mutex);
811 856
812 mutex_lock(&inode->i_mutex); 857 f2fs_set_inode_flags(inode);
858 inode->i_ctime = CURRENT_TIME;
859 mark_inode_dirty(inode);
860out:
861 mnt_drop_write_file(filp);
862 return ret;
863}
813 864
814 oldflags = fi->i_flags; 865static int f2fs_ioc_start_atomic_write(struct file *filp)
866{
867 struct inode *inode = file_inode(filp);
868 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
815 869
816 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 870 if (!inode_owner_or_capable(inode))
817 if (!capable(CAP_LINUX_IMMUTABLE)) { 871 return -EACCES;
818 mutex_unlock(&inode->i_mutex);
819 ret = -EPERM;
820 goto out;
821 }
822 }
823 872
824 flags = flags & FS_FL_USER_MODIFIABLE; 873 f2fs_balance_fs(sbi);
825 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
826 fi->i_flags = flags;
827 mutex_unlock(&inode->i_mutex);
828 874
829 f2fs_set_inode_flags(inode); 875 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
830 inode->i_ctime = CURRENT_TIME; 876
831 mark_inode_dirty(inode); 877 return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
832out: 878}
833 mnt_drop_write_file(filp); 879
880static int f2fs_ioc_commit_atomic_write(struct file *filp)
881{
882 struct inode *inode = file_inode(filp);
883 int ret;
884
885 if (!inode_owner_or_capable(inode))
886 return -EACCES;
887
888 if (f2fs_is_volatile_file(inode))
889 return 0;
890
891 ret = mnt_want_write_file(filp);
892 if (ret)
834 return ret; 893 return ret;
835 } 894
895 if (f2fs_is_atomic_file(inode))
896 commit_inmem_pages(inode, false);
897
898 ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
899 mnt_drop_write_file(filp);
900 return ret;
901}
902
903static int f2fs_ioc_start_volatile_write(struct file *filp)
904{
905 struct inode *inode = file_inode(filp);
906
907 if (!inode_owner_or_capable(inode))
908 return -EACCES;
909
910 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
911 return 0;
912}
913
914static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
915{
916 struct inode *inode = file_inode(filp);
917 struct super_block *sb = inode->i_sb;
918 struct request_queue *q = bdev_get_queue(sb->s_bdev);
919 struct fstrim_range range;
920 int ret;
921
922 if (!capable(CAP_SYS_ADMIN))
923 return -EPERM;
924
925 if (!blk_queue_discard(q))
926 return -EOPNOTSUPP;
927
928 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
929 sizeof(range)))
930 return -EFAULT;
931
932 range.minlen = max((unsigned int)range.minlen,
933 q->limits.discard_granularity);
934 ret = f2fs_trim_fs(F2FS_SB(sb), &range);
935 if (ret < 0)
936 return ret;
937
938 if (copy_to_user((struct fstrim_range __user *)arg, &range,
939 sizeof(range)))
940 return -EFAULT;
941 return 0;
942}
943
944long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
945{
946 switch (cmd) {
947 case F2FS_IOC_GETFLAGS:
948 return f2fs_ioc_getflags(filp, arg);
949 case F2FS_IOC_SETFLAGS:
950 return f2fs_ioc_setflags(filp, arg);
951 case F2FS_IOC_START_ATOMIC_WRITE:
952 return f2fs_ioc_start_atomic_write(filp);
953 case F2FS_IOC_COMMIT_ATOMIC_WRITE:
954 return f2fs_ioc_commit_atomic_write(filp);
955 case F2FS_IOC_START_VOLATILE_WRITE:
956 return f2fs_ioc_start_volatile_write(filp);
957 case FITRIM:
958 return f2fs_ioc_fitrim(filp, arg);
836 default: 959 default:
837 return -ENOTTY; 960 return -ENOTTY;
838 } 961 }
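
The rewritten dispatch above adds the atomic/volatile write commands and FITRIM. A hedged userspace sketch of the atomic-write flow follows; the F2FS_IOC_* values are mirrored by hand on the assumption that they match fs/f2fs/f2fs.h, since they are not exported through a uapi header at this point, and the target file is assumed to live on an f2fs mount:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>			/* FITRIM, struct fstrim_range */

/* Assumed to match fs/f2fs/f2fs.h -- not in uapi yet. */
#define F2FS_IOCTL_MAGIC		0xf5
#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)

int main(int argc, char **argv)
{
	const char msg[] = "all-or-nothing update\n";
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file-on-f2fs>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_WRONLY | O_CREAT, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Stage writes in memory: register_inmem_page() queues them. */
	if (ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE) < 0) {
		perror("start atomic write");
		close(fd);
		return 1;
	}

	if (write(fd, msg, sizeof(msg) - 1) < 0)
		perror("write");

	/* Drains the queued pages and fsyncs inside the kernel. */
	if (ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) < 0)
		perror("commit atomic write");

	close(fd);
	return 0;
}

FITRIM follows the standard shape instead: fill a struct fstrim_range from <linux/fs.h>, issue the ioctl on any fd on the filesystem, and read the range back — the copy_to_user() in f2fs_ioc_fitrim() above returns the possibly-updated range to the caller.
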
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 943a31db7cc3..2a8f4acdb86b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -193,7 +193,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
193 * selected by background GC before. 193 * selected by background GC before.
194 * Those segments guarantee they have small valid blocks. 194 * Those segments guarantee they have small valid blocks.
195 */ 195 */
196 for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) { 196 for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
197 if (sec_usage_check(sbi, secno)) 197 if (sec_usage_check(sbi, secno))
198 continue; 198 continue;
199 clear_bit(secno, dirty_i->victim_secmap); 199 clear_bit(secno, dirty_i->victim_secmap);
@@ -263,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
263 unsigned int secno, max_cost; 263 unsigned int secno, max_cost;
264 int nsearched = 0; 264 int nsearched = 0;
265 265
266 mutex_lock(&dirty_i->seglist_lock);
267
266 p.alloc_mode = alloc_mode; 268 p.alloc_mode = alloc_mode;
267 select_policy(sbi, gc_type, type, &p); 269 select_policy(sbi, gc_type, type, &p);
268 270
269 p.min_segno = NULL_SEGNO; 271 p.min_segno = NULL_SEGNO;
270 p.min_cost = max_cost = get_max_cost(sbi, &p); 272 p.min_cost = max_cost = get_max_cost(sbi, &p);
271 273
272 mutex_lock(&dirty_i->seglist_lock);
273
274 if (p.alloc_mode == LFS && gc_type == FG_GC) { 274 if (p.alloc_mode == LFS && gc_type == FG_GC) {
275 p.min_segno = check_bg_victims(sbi); 275 p.min_segno = check_bg_victims(sbi);
276 if (p.min_segno != NULL_SEGNO) 276 if (p.min_segno != NULL_SEGNO)
@@ -281,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
281 unsigned long cost; 281 unsigned long cost;
282 unsigned int segno; 282 unsigned int segno;
283 283
284 segno = find_next_bit(p.dirty_segmap, 284 segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
285 TOTAL_SEGS(sbi), p.offset); 285 if (segno >= MAIN_SEGS(sbi)) {
286 if (segno >= TOTAL_SEGS(sbi)) {
287 if (sbi->last_victim[p.gc_mode]) { 286 if (sbi->last_victim[p.gc_mode]) {
288 sbi->last_victim[p.gc_mode] = 0; 287 sbi->last_victim[p.gc_mode] = 0;
289 p.offset = 0; 288 p.offset = 0;
@@ -423,6 +422,12 @@ next_step:
423 if (IS_ERR(node_page)) 422 if (IS_ERR(node_page))
424 continue; 423 continue;
425 424
425 /* block may become invalid during get_node_page */
426 if (check_valid_map(sbi, segno, off) == 0) {
427 f2fs_put_page(node_page, 1);
428 continue;
429 }
430
426 /* set page dirty and write it */ 431 /* set page dirty and write it */
427 if (gc_type == FG_GC) { 432 if (gc_type == FG_GC) {
428 f2fs_wait_on_page_writeback(node_page, NODE); 433 f2fs_wait_on_page_writeback(node_page, NODE);
@@ -531,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
531 f2fs_wait_on_page_writeback(page, DATA); 536 f2fs_wait_on_page_writeback(page, DATA);
532 537
533 if (clear_page_dirty_for_io(page)) 538 if (clear_page_dirty_for_io(page))
534 inode_dec_dirty_dents(inode); 539 inode_dec_dirty_pages(inode);
535 set_cold_data(page); 540 set_cold_data(page);
536 do_write_data_page(page, &fio); 541 do_write_data_page(page, &fio);
537 clear_cold_data(page); 542 clear_cold_data(page);
@@ -688,6 +693,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
688 int gc_type = BG_GC; 693 int gc_type = BG_GC;
689 int nfree = 0; 694 int nfree = 0;
690 int ret = -1; 695 int ret = -1;
696 struct cp_control cpc = {
697 .reason = CP_SYNC,
698 };
691 699
692 INIT_LIST_HEAD(&ilist); 700 INIT_LIST_HEAD(&ilist);
693gc_more: 701gc_more:
@@ -698,7 +706,7 @@ gc_more:
698 706
699 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 707 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
700 gc_type = FG_GC; 708 gc_type = FG_GC;
701 write_checkpoint(sbi, false); 709 write_checkpoint(sbi, &cpc);
702 } 710 }
703 711
704 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) 712 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
@@ -723,7 +731,7 @@ gc_more:
723 goto gc_more; 731 goto gc_more;
724 732
725 if (gc_type == FG_GC) 733 if (gc_type == FG_GC)
726 write_checkpoint(sbi, false); 734 write_checkpoint(sbi, &cpc);
727stop: 735stop:
728 mutex_unlock(&sbi->gc_mutex); 736 mutex_unlock(&sbi->gc_mutex);
729 737
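
write_checkpoint() now takes a struct cp_control rather than a bool, and the GC path above builds one with reason = CP_SYNC. The structure is defined in f2fs.h (changed elsewhere in this series, not shown in this excerpt); a plausible layout consistent with its use here and in the FITRIM path, with the trim fields treated as assumptions:

enum {
	CP_UMOUNT,	/* checkpoint at unmount */
	CP_SYNC,	/* checkpoint for sync, as in f2fs_gc() above */
	CP_DISCARD,	/* checkpoint issued by FITRIM */
};

struct cp_control {
	int reason;
	__u64 trim_start;	/* FITRIM window, in blocks */
	__u64 trim_end;
	__u64 trim_minlen;
	__u64 trimmed;		/* result reported back to userspace */
};

Packing the reason and the trim window into one argument lets flush_sit_entries() and f2fs_trim_fs() share the checkpoint path without growing its parameter list further.
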
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3e8ecdf3742b..88036fd75797 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,11 +15,13 @@
15 15
16bool f2fs_may_inline(struct inode *inode) 16bool f2fs_may_inline(struct inode *inode)
17{ 17{
18 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
19 block_t nr_blocks; 18 block_t nr_blocks;
20 loff_t i_size; 19 loff_t i_size;
21 20
22 if (!test_opt(sbi, INLINE_DATA)) 21 if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
22 return false;
23
24 if (f2fs_is_atomic_file(inode))
23 return false; 25 return false;
24 26
25 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; 27 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
@@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode)
35 37
36int f2fs_read_inline_data(struct inode *inode, struct page *page) 38int f2fs_read_inline_data(struct inode *inode, struct page *page)
37{ 39{
38 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
39 struct page *ipage; 40 struct page *ipage;
40 void *src_addr, *dst_addr; 41 void *src_addr, *dst_addr;
41 42
@@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
44 goto out; 45 goto out;
45 } 46 }
46 47
47 ipage = get_node_page(sbi, inode->i_ino); 48 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
48 if (IS_ERR(ipage)) { 49 if (IS_ERR(ipage)) {
49 unlock_page(page); 50 unlock_page(page);
50 return PTR_ERR(ipage); 51 return PTR_ERR(ipage);
@@ -73,7 +74,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
73 struct dnode_of_data dn; 74 struct dnode_of_data dn;
74 void *src_addr, *dst_addr; 75 void *src_addr, *dst_addr;
75 block_t new_blk_addr; 76 block_t new_blk_addr;
76 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 77 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
77 struct f2fs_io_info fio = { 78 struct f2fs_io_info fio = {
78 .type = DATA, 79 .type = DATA,
79 .rw = WRITE_SYNC | REQ_PRIO, 80 .rw = WRITE_SYNC | REQ_PRIO,
@@ -189,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode,
189 190
190void truncate_inline_data(struct inode *inode, u64 from) 191void truncate_inline_data(struct inode *inode, u64 from)
191{ 192{
192 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
193 struct page *ipage; 193 struct page *ipage;
194 194
195 if (from >= MAX_INLINE_DATA) 195 if (from >= MAX_INLINE_DATA)
196 return; 196 return;
197 197
198 ipage = get_node_page(sbi, inode->i_ino); 198 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
199 if (IS_ERR(ipage)) 199 if (IS_ERR(ipage))
200 return; 200 return;
201 201
@@ -209,7 +209,7 @@ void truncate_inline_data(struct inode *inode, u64 from)
209 209
210bool recover_inline_data(struct inode *inode, struct page *npage) 210bool recover_inline_data(struct inode *inode, struct page *npage)
211{ 211{
212 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 212 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
213 struct f2fs_inode *ri = NULL; 213 struct f2fs_inode *ri = NULL;
214 void *src_addr, *dst_addr; 214 void *src_addr, *dst_addr;
215 struct page *ipage; 215 struct page *ipage;
@@ -229,7 +229,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage)
229 ri && (ri->i_inline & F2FS_INLINE_DATA)) { 229 ri && (ri->i_inline & F2FS_INLINE_DATA)) {
230process_inline: 230process_inline:
231 ipage = get_node_page(sbi, inode->i_ino); 231 ipage = get_node_page(sbi, inode->i_ino);
232 f2fs_bug_on(IS_ERR(ipage)); 232 f2fs_bug_on(sbi, IS_ERR(ipage));
233 233
234 f2fs_wait_on_page_writeback(ipage, NODE); 234 f2fs_wait_on_page_writeback(ipage, NODE);
235 235
@@ -243,7 +243,7 @@ process_inline:
243 243
244 if (f2fs_has_inline_data(inode)) { 244 if (f2fs_has_inline_data(inode)) {
245 ipage = get_node_page(sbi, inode->i_ino); 245 ipage = get_node_page(sbi, inode->i_ino);
246 f2fs_bug_on(IS_ERR(ipage)); 246 f2fs_bug_on(sbi, IS_ERR(ipage));
247 f2fs_wait_on_page_writeback(ipage, NODE); 247 f2fs_wait_on_page_writeback(ipage, NODE);
248 zero_user_segment(ipage, INLINE_DATA_OFFSET, 248 zero_user_segment(ipage, INLINE_DATA_OFFSET,
249 INLINE_DATA_OFFSET + MAX_INLINE_DATA); 249 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2c39999f3868..0deead4505e7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
69 69
70static int do_read_inode(struct inode *inode) 70static int do_read_inode(struct inode *inode)
71{ 71{
72 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 72 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
73 struct f2fs_inode_info *fi = F2FS_I(inode); 73 struct f2fs_inode_info *fi = F2FS_I(inode);
74 struct page *node_page; 74 struct page *node_page;
75 struct f2fs_inode *ri; 75 struct f2fs_inode *ri;
@@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page)
218 218
219void update_inode_page(struct inode *inode) 219void update_inode_page(struct inode *inode)
220{ 220{
221 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 221 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
222 struct page *node_page; 222 struct page *node_page;
223retry: 223retry:
224 node_page = get_node_page(sbi, inode->i_ino); 224 node_page = get_node_page(sbi, inode->i_ino);
@@ -238,7 +238,7 @@ retry:
238 238
239int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) 239int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
240{ 240{
241 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 241 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
242 242
243 if (inode->i_ino == F2FS_NODE_INO(sbi) || 243 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
244 inode->i_ino == F2FS_META_INO(sbi)) 244 inode->i_ino == F2FS_META_INO(sbi))
@@ -266,9 +266,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
266 */ 266 */
267void f2fs_evict_inode(struct inode *inode) 267void f2fs_evict_inode(struct inode *inode)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 nid_t xnid = F2FS_I(inode)->i_xattr_nid; 270 nid_t xnid = F2FS_I(inode)->i_xattr_nid;
271 271
272 /* some remaining atomic pages should be discarded */
273 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
274 commit_inmem_pages(inode, true);
275
272 trace_f2fs_evict_inode(inode); 276 trace_f2fs_evict_inode(inode);
273 truncate_inode_pages_final(&inode->i_data); 277 truncate_inode_pages_final(&inode->i_data);
274 278
@@ -276,7 +280,7 @@ void f2fs_evict_inode(struct inode *inode)
276 inode->i_ino == F2FS_META_INO(sbi)) 280 inode->i_ino == F2FS_META_INO(sbi))
277 goto out_clear; 281 goto out_clear;
278 282
279 f2fs_bug_on(get_dirty_dents(inode)); 283 f2fs_bug_on(sbi, get_dirty_pages(inode));
280 remove_dirty_dir_inode(inode); 284 remove_dirty_dir_inode(inode);
281 285
282 if (inode->i_nlink || is_bad_inode(inode)) 286 if (inode->i_nlink || is_bad_inode(inode))
@@ -306,3 +310,26 @@ no_delete:
306out_clear: 310out_clear:
307 clear_inode(inode); 311 clear_inode(inode);
308} 312}
313
314/* caller should call f2fs_lock_op() */
315void handle_failed_inode(struct inode *inode)
316{
317 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
318
319 clear_nlink(inode);
320 make_bad_inode(inode);
321 unlock_new_inode(inode);
322
323 i_size_write(inode, 0);
324 if (F2FS_HAS_BLOCKS(inode))
325 f2fs_truncate(inode);
326
327 remove_inode_page(inode);
328 stat_dec_inline_inode(inode);
329
330 alloc_nid_failed(sbi, inode->i_ino);
331 f2fs_unlock_op(sbi);
332
333 /* iput will drop the inode object */
334 iput(inode);
335}
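
handle_failed_inode() ends with f2fs_unlock_op(), which explains the reordering in every namei.c error path below: the caller must still hold the op lock when the error path runs. The intended pairing, in outline (illustrative, condensed from the f2fs_create() hunk that follows):

	f2fs_lock_op(sbi);
	err = f2fs_add_link(dentry, inode);
	if (err)
		goto out;		/* keep the op lock held for cleanup */
	f2fs_unlock_op(sbi);
	/* ... success path ... */
out:
	handle_failed_inode(inode);	/* drops the lock via f2fs_unlock_op() */
	return err;

Centralizing the cleanup also replaces the repeated clear_nlink()/iget_failed()/alloc_nid_failed() triple, and lets the failed inode be truncated and its node page removed in one place.
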
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index ee103fd7283c..0d2526e5aa11 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -23,7 +23,7 @@
23 23
24static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) 24static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
25{ 25{
26 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 26 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
27 nid_t ino; 27 nid_t ino;
28 struct inode *inode; 28 struct inode *inode;
29 bool nid_free = false; 29 bool nid_free = false;
@@ -102,7 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
103 bool excl) 103 bool excl)
104{ 104{
105 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 105 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
106 struct inode *inode; 106 struct inode *inode;
107 nid_t ino = 0; 107 nid_t ino = 0;
108 int err; 108 int err;
@@ -123,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
123 123
124 f2fs_lock_op(sbi); 124 f2fs_lock_op(sbi);
125 err = f2fs_add_link(dentry, inode); 125 err = f2fs_add_link(dentry, inode);
126 f2fs_unlock_op(sbi);
127 if (err) 126 if (err)
128 goto out; 127 goto out;
128 f2fs_unlock_op(sbi);
129 129
130 alloc_nid_done(sbi, ino); 130 alloc_nid_done(sbi, ino);
131 131
@@ -133,9 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
133 unlock_new_inode(inode); 133 unlock_new_inode(inode);
134 return 0; 134 return 0;
135out: 135out:
136 clear_nlink(inode); 136 handle_failed_inode(inode);
137 iget_failed(inode);
138 alloc_nid_failed(sbi, ino);
139 return err; 137 return err;
140} 138}
141 139
@@ -143,7 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
143 struct dentry *dentry) 141 struct dentry *dentry)
144{ 142{
145 struct inode *inode = old_dentry->d_inode; 143 struct inode *inode = old_dentry->d_inode;
146 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 144 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
147 int err; 145 int err;
148 146
149 f2fs_balance_fs(sbi); 147 f2fs_balance_fs(sbi);
@@ -154,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
154 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 152 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
155 f2fs_lock_op(sbi); 153 f2fs_lock_op(sbi);
156 err = f2fs_add_link(dentry, inode); 154 err = f2fs_add_link(dentry, inode);
157 f2fs_unlock_op(sbi);
158 if (err) 155 if (err)
159 goto out; 156 goto out;
157 f2fs_unlock_op(sbi);
160 158
161 d_instantiate(dentry, inode); 159 d_instantiate(dentry, inode);
162 return 0; 160 return 0;
163out: 161out:
164 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 162 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
165 iput(inode); 163 iput(inode);
164 f2fs_unlock_op(sbi);
166 return err; 165 return err;
167} 166}
168 167
@@ -203,7 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
203 202
204static int f2fs_unlink(struct inode *dir, struct dentry *dentry) 203static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
205{ 204{
206 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 205 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
207 struct inode *inode = dentry->d_inode; 206 struct inode *inode = dentry->d_inode;
208 struct f2fs_dir_entry *de; 207 struct f2fs_dir_entry *de;
209 struct page *page; 208 struct page *page;
@@ -237,7 +236,7 @@ fail:
237static int f2fs_symlink(struct inode *dir, struct dentry *dentry, 236static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
238 const char *symname) 237 const char *symname)
239{ 238{
240 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 239 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
241 struct inode *inode; 240 struct inode *inode;
242 size_t symlen = strlen(symname) + 1; 241 size_t symlen = strlen(symname) + 1;
243 int err; 242 int err;
@@ -253,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
253 252
254 f2fs_lock_op(sbi); 253 f2fs_lock_op(sbi);
255 err = f2fs_add_link(dentry, inode); 254 err = f2fs_add_link(dentry, inode);
256 f2fs_unlock_op(sbi);
257 if (err) 255 if (err)
258 goto out; 256 goto out;
257 f2fs_unlock_op(sbi);
259 258
260 err = page_symlink(inode, symname, symlen); 259 err = page_symlink(inode, symname, symlen);
261 alloc_nid_done(sbi, inode->i_ino); 260 alloc_nid_done(sbi, inode->i_ino);
@@ -264,15 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
264 unlock_new_inode(inode); 263 unlock_new_inode(inode);
265 return err; 264 return err;
266out: 265out:
267 clear_nlink(inode); 266 handle_failed_inode(inode);
268 iget_failed(inode);
269 alloc_nid_failed(sbi, inode->i_ino);
270 return err; 267 return err;
271} 268}
272 269
273static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 270static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
274{ 271{
275 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 272 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
276 struct inode *inode; 273 struct inode *inode;
277 int err; 274 int err;
278 275
@@ -290,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
290 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 287 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
291 f2fs_lock_op(sbi); 288 f2fs_lock_op(sbi);
292 err = f2fs_add_link(dentry, inode); 289 err = f2fs_add_link(dentry, inode);
293 f2fs_unlock_op(sbi);
294 if (err) 290 if (err)
295 goto out_fail; 291 goto out_fail;
292 f2fs_unlock_op(sbi);
296 293
297 alloc_nid_done(sbi, inode->i_ino); 294 alloc_nid_done(sbi, inode->i_ino);
298 295
@@ -303,9 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
303 300
304out_fail: 301out_fail:
305 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 302 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
306 clear_nlink(inode); 303 handle_failed_inode(inode);
307 iget_failed(inode);
308 alloc_nid_failed(sbi, inode->i_ino);
309 return err; 304 return err;
310} 305}
311 306
@@ -320,7 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
320static int f2fs_mknod(struct inode *dir, struct dentry *dentry, 315static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
321 umode_t mode, dev_t rdev) 316 umode_t mode, dev_t rdev)
322{ 317{
323 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 318 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
324 struct inode *inode; 319 struct inode *inode;
325 int err = 0; 320 int err = 0;
326 321
@@ -338,25 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
338 333
339 f2fs_lock_op(sbi); 334 f2fs_lock_op(sbi);
340 err = f2fs_add_link(dentry, inode); 335 err = f2fs_add_link(dentry, inode);
341 f2fs_unlock_op(sbi);
342 if (err) 336 if (err)
343 goto out; 337 goto out;
338 f2fs_unlock_op(sbi);
344 339
345 alloc_nid_done(sbi, inode->i_ino); 340 alloc_nid_done(sbi, inode->i_ino);
346 d_instantiate(dentry, inode); 341 d_instantiate(dentry, inode);
347 unlock_new_inode(inode); 342 unlock_new_inode(inode);
348 return 0; 343 return 0;
349out: 344out:
350 clear_nlink(inode); 345 handle_failed_inode(inode);
351 iget_failed(inode);
352 alloc_nid_failed(sbi, inode->i_ino);
353 return err; 346 return err;
354} 347}
355 348
356static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, 349static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
357 struct inode *new_dir, struct dentry *new_dentry) 350 struct inode *new_dir, struct dentry *new_dentry)
358{ 351{
359 struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb); 352 struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
360 struct inode *old_inode = old_dentry->d_inode; 353 struct inode *old_inode = old_dentry->d_inode;
361 struct inode *new_inode = new_dentry->d_inode; 354 struct inode *new_inode = new_dentry->d_inode;
362 struct page *old_dir_page; 355 struct page *old_dir_page;
@@ -480,8 +473,7 @@ out:
480static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, 473static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
481 struct inode *new_dir, struct dentry *new_dentry) 474 struct inode *new_dir, struct dentry *new_dentry)
482{ 475{
483 struct super_block *sb = old_dir->i_sb; 476 struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
484 struct f2fs_sb_info *sbi = F2FS_SB(sb);
485 struct inode *old_inode = old_dentry->d_inode; 477 struct inode *old_inode = old_dentry->d_inode;
486 struct inode *new_inode = new_dentry->d_inode; 478 struct inode *new_inode = new_dentry->d_inode;
487 struct page *old_dir_page, *new_dir_page; 479 struct page *old_dir_page, *new_dir_page;
@@ -642,7 +634,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
642 634
643static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 635static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
644{ 636{
645 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 637 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
646 struct inode *inode; 638 struct inode *inode;
647 int err; 639 int err;
648 640
@@ -678,10 +670,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
678release_out: 670release_out:
679 release_orphan_inode(sbi); 671 release_orphan_inode(sbi);
680out: 672out:
681 f2fs_unlock_op(sbi); 673 handle_failed_inode(inode);
682 clear_nlink(inode);
683 iget_failed(inode);
684 alloc_nid_failed(sbi, inode->i_ino);
685 return err; 674 return err;
686} 675}
687 676
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 45378196e19a..44b8afef43d9 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -54,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
54static void clear_node_page_dirty(struct page *page) 54static void clear_node_page_dirty(struct page *page)
55{ 55{
56 struct address_space *mapping = page->mapping; 56 struct address_space *mapping = page->mapping;
57 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
58 unsigned int long flags; 57 unsigned int long flags;
59 58
60 if (PageDirty(page)) { 59 if (PageDirty(page)) {
@@ -65,7 +64,7 @@ static void clear_node_page_dirty(struct page *page)
65 spin_unlock_irqrestore(&mapping->tree_lock, flags); 64 spin_unlock_irqrestore(&mapping->tree_lock, flags);
66 65
67 clear_page_dirty_for_io(page); 66 clear_page_dirty_for_io(page);
68 dec_page_count(sbi, F2FS_DIRTY_NODES); 67 dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
69 } 68 }
70 ClearPageUptodate(page); 69 ClearPageUptodate(page);
71} 70}
@@ -92,7 +91,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
92 /* get current nat block page with lock */ 91 /* get current nat block page with lock */
93 src_page = get_meta_page(sbi, src_off); 92 src_page = get_meta_page(sbi, src_off);
94 dst_page = grab_meta_page(sbi, dst_off); 93 dst_page = grab_meta_page(sbi, dst_off);
95 f2fs_bug_on(PageDirty(src_page)); 94 f2fs_bug_on(sbi, PageDirty(src_page));
96 95
97 src_addr = page_address(src_page); 96 src_addr = page_address(src_page);
98 dst_addr = page_address(dst_page); 97 dst_addr = page_address(dst_page);
@@ -124,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
124 kmem_cache_free(nat_entry_slab, e); 123 kmem_cache_free(nat_entry_slab, e);
125} 124}
126 125
127int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) 126static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
127 struct nat_entry *ne)
128{
129 nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
130 struct nat_entry_set *head;
131
132 if (get_nat_flag(ne, IS_DIRTY))
133 return;
134retry:
135 head = radix_tree_lookup(&nm_i->nat_set_root, set);
136 if (!head) {
137 head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
138
139 INIT_LIST_HEAD(&head->entry_list);
140 INIT_LIST_HEAD(&head->set_list);
141 head->set = set;
142 head->entry_cnt = 0;
143
144 if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
145 cond_resched();
146 goto retry;
147 }
148 }
149 list_move_tail(&ne->list, &head->entry_list);
150 nm_i->dirty_nat_cnt++;
151 head->entry_cnt++;
152 set_nat_flag(ne, IS_DIRTY, true);
153}
154
155static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
156 struct nat_entry *ne)
157{
158 nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
159 struct nat_entry_set *head;
160
161 head = radix_tree_lookup(&nm_i->nat_set_root, set);
162 if (head) {
163 list_move_tail(&ne->list, &nm_i->nat_entries);
164 set_nat_flag(ne, IS_DIRTY, false);
165 head->entry_cnt--;
166 nm_i->dirty_nat_cnt--;
167 }
168}
169
170static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
171 nid_t start, unsigned int nr, struct nat_entry_set **ep)
172{
173 return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
174 start, nr);
175}
176
177bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
128{ 178{
129 struct f2fs_nm_info *nm_i = NM_I(sbi); 179 struct f2fs_nm_info *nm_i = NM_I(sbi);
130 struct nat_entry *e; 180 struct nat_entry *e;
131 int is_cp = 1; 181 bool is_cp = true;
132 182
133 read_lock(&nm_i->nat_tree_lock); 183 read_lock(&nm_i->nat_tree_lock);
134 e = __lookup_nat_cache(nm_i, nid); 184 e = __lookup_nat_cache(nm_i, nid);
135 if (e && !e->checkpointed) 185 if (e && !get_nat_flag(e, IS_CHECKPOINTED))
136 is_cp = 0; 186 is_cp = false;
137 read_unlock(&nm_i->nat_tree_lock); 187 read_unlock(&nm_i->nat_tree_lock);
138 return is_cp; 188 return is_cp;
139} 189}
140 190
141bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) 191bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
142{ 192{
143 struct f2fs_nm_info *nm_i = NM_I(sbi); 193 struct f2fs_nm_info *nm_i = NM_I(sbi);
144 struct nat_entry *e; 194 struct nat_entry *e;
145 bool fsync_done = false; 195 bool fsynced = false;
146 196
147 read_lock(&nm_i->nat_tree_lock); 197 read_lock(&nm_i->nat_tree_lock);
148 e = __lookup_nat_cache(nm_i, nid); 198 e = __lookup_nat_cache(nm_i, ino);
149 if (e) 199 if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
150 fsync_done = e->fsync_done; 200 fsynced = true;
151 read_unlock(&nm_i->nat_tree_lock); 201 read_unlock(&nm_i->nat_tree_lock);
152 return fsync_done; 202 return fsynced;
153} 203}
154 204
155void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) 205bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
156{ 206{
157 struct f2fs_nm_info *nm_i = NM_I(sbi); 207 struct f2fs_nm_info *nm_i = NM_I(sbi);
158 struct nat_entry *e; 208 struct nat_entry *e;
209 bool need_update = true;
159 210
160 write_lock(&nm_i->nat_tree_lock); 211 read_lock(&nm_i->nat_tree_lock);
161 e = __lookup_nat_cache(nm_i, nid); 212 e = __lookup_nat_cache(nm_i, ino);
162 if (e) 213 if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
163 e->fsync_done = false; 214 (get_nat_flag(e, IS_CHECKPOINTED) ||
164 write_unlock(&nm_i->nat_tree_lock); 215 get_nat_flag(e, HAS_FSYNCED_INODE)))
216 need_update = false;
217 read_unlock(&nm_i->nat_tree_lock);
218 return need_update;
165} 219}
166 220
167static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) 221static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
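
The rewritten cache helpers above juggle four per-entry flags (IS_CHECKPOINTED, HAS_FSYNCED_INODE, HAS_LAST_FSYNC, IS_DIRTY) through get_nat_flag()/set_nat_flag()/nat_reset_flag(), whose definitions land in fs/f2fs/node.h elsewhere in this series. A sketch consistent with the call sites here, with the packed-byte representation assumed:

enum {
	IS_CHECKPOINTED,	/* entry is unchanged since the last checkpoint */
	HAS_FSYNCED_INODE,	/* the inode itself was written by fsync */
	HAS_LAST_FSYNC,		/* the latest node block carries an fsync mark */
	IS_DIRTY,		/* entry sits on a dirty nat_entry_set */
};

static inline void set_nat_flag(struct nat_entry *ne,
				unsigned int type, bool set)
{
	unsigned char mask = 0x01 << type;	/* assumed: flags packed in one byte */
	if (set)
		ne->flag |= mask;
	else
		ne->flag &= ~mask;
}

static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
{
	unsigned char mask = 0x01 << type;
	return ne->flag & mask;
}

static inline void nat_reset_flag(struct nat_entry *ne)
{
	/* a fresh entry is presumed checkpointed and fully fsynced */
	set_nat_flag(ne, IS_CHECKPOINTED, true);
	set_nat_flag(ne, HAS_FSYNCED_INODE, false);
	set_nat_flag(ne, HAS_LAST_FSYNC, true);
}

Collapsing the old checkpointed/fsync_done booleans into one flag byte is what allows need_inode_block_update() above to answer from three bits under a single read_lock.
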
@@ -177,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
177 } 231 }
178 memset(new, 0, sizeof(struct nat_entry)); 232 memset(new, 0, sizeof(struct nat_entry));
179 nat_set_nid(new, nid); 233 nat_set_nid(new, nid);
180 new->checkpointed = true; 234 nat_reset_flag(new);
181 list_add_tail(&new->list, &nm_i->nat_entries); 235 list_add_tail(&new->list, &nm_i->nat_entries);
182 nm_i->nat_cnt++; 236 nm_i->nat_cnt++;
183 return new; 237 return new;
@@ -216,7 +270,7 @@ retry:
216 goto retry; 270 goto retry;
217 } 271 }
218 e->ni = *ni; 272 e->ni = *ni;
219 f2fs_bug_on(ni->blk_addr == NEW_ADDR); 273 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
220 } else if (new_blkaddr == NEW_ADDR) { 274 } else if (new_blkaddr == NEW_ADDR) {
221 /* 275 /*
222 * when nid is reallocated, 276 * when nid is reallocated,
@@ -224,16 +278,16 @@ retry:
224 * So, reinitialize it with new information. 278 * So, reinitialize it with new information.
225 */ 279 */
226 e->ni = *ni; 280 e->ni = *ni;
227 f2fs_bug_on(ni->blk_addr != NULL_ADDR); 281 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
228 } 282 }
229 283
230 /* sanity check */ 284 /* sanity check */
231 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); 285 f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
232 f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && 286 f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
233 new_blkaddr == NULL_ADDR); 287 new_blkaddr == NULL_ADDR);
234 f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && 288 f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
235 new_blkaddr == NEW_ADDR); 289 new_blkaddr == NEW_ADDR);
236 f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && 290 f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
237 nat_get_blkaddr(e) != NULL_ADDR && 291 nat_get_blkaddr(e) != NULL_ADDR &&
238 new_blkaddr == NEW_ADDR); 292 new_blkaddr == NEW_ADDR);
239 293
@@ -245,12 +299,17 @@ retry:
245 299
246 /* change address */ 300 /* change address */
247 nat_set_blkaddr(e, new_blkaddr); 301 nat_set_blkaddr(e, new_blkaddr);
302 if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
303 set_nat_flag(e, IS_CHECKPOINTED, false);
248 __set_nat_cache_dirty(nm_i, e); 304 __set_nat_cache_dirty(nm_i, e);
249 305
250 /* update fsync_mark if its inode nat entry is still alive */ 306 /* update fsync_mark if its inode nat entry is still alive */
251 e = __lookup_nat_cache(nm_i, ni->ino); 307 e = __lookup_nat_cache(nm_i, ni->ino);
252 if (e) 308 if (e) {
253 e->fsync_done = fsync_done; 309 if (fsync_done && ni->nid == ni->ino)
310 set_nat_flag(e, HAS_FSYNCED_INODE, true);
311 set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
312 }
254 write_unlock(&nm_i->nat_tree_lock); 313 write_unlock(&nm_i->nat_tree_lock);
255} 314}
256 315
@@ -411,7 +470,7 @@ got:
411 */ 470 */
412int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) 471int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
413{ 472{
414 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 473 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
415 struct page *npage[4]; 474 struct page *npage[4];
416 struct page *parent; 475 struct page *parent;
417 int offset[4]; 476 int offset[4];
@@ -504,15 +563,15 @@ release_out:
504 563
505static void truncate_node(struct dnode_of_data *dn) 564static void truncate_node(struct dnode_of_data *dn)
506{ 565{
507 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 566 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
508 struct node_info ni; 567 struct node_info ni;
509 568
510 get_node_info(sbi, dn->nid, &ni); 569 get_node_info(sbi, dn->nid, &ni);
511 if (dn->inode->i_blocks == 0) { 570 if (dn->inode->i_blocks == 0) {
512 f2fs_bug_on(ni.blk_addr != NULL_ADDR); 571 f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
513 goto invalidate; 572 goto invalidate;
514 } 573 }
515 f2fs_bug_on(ni.blk_addr == NULL_ADDR); 574 f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
516 575
517 /* Deallocate node address */ 576 /* Deallocate node address */
518 invalidate_blocks(sbi, ni.blk_addr); 577 invalidate_blocks(sbi, ni.blk_addr);
@@ -540,14 +599,13 @@ invalidate:
540 599
541static int truncate_dnode(struct dnode_of_data *dn) 600static int truncate_dnode(struct dnode_of_data *dn)
542{ 601{
543 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
544 struct page *page; 602 struct page *page;
545 603
546 if (dn->nid == 0) 604 if (dn->nid == 0)
547 return 1; 605 return 1;
548 606
549 /* get direct node */ 607 /* get direct node */
550 page = get_node_page(sbi, dn->nid); 608 page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
551 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) 609 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
552 return 1; 610 return 1;
553 else if (IS_ERR(page)) 611 else if (IS_ERR(page))
@@ -564,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn)
564static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, 622static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
565 int ofs, int depth) 623 int ofs, int depth)
566{ 624{
567 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
568 struct dnode_of_data rdn = *dn; 625 struct dnode_of_data rdn = *dn;
569 struct page *page; 626 struct page *page;
570 struct f2fs_node *rn; 627 struct f2fs_node *rn;
@@ -578,7 +635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
578 635
579 trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); 636 trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
580 637
581 page = get_node_page(sbi, dn->nid); 638 page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
582 if (IS_ERR(page)) { 639 if (IS_ERR(page)) {
583 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); 640 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
584 return PTR_ERR(page); 641 return PTR_ERR(page);
@@ -636,7 +693,6 @@ out_err:
636static int truncate_partial_nodes(struct dnode_of_data *dn, 693static int truncate_partial_nodes(struct dnode_of_data *dn,
637 struct f2fs_inode *ri, int *offset, int depth) 694 struct f2fs_inode *ri, int *offset, int depth)
638{ 695{
639 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
640 struct page *pages[2]; 696 struct page *pages[2];
641 nid_t nid[3]; 697 nid_t nid[3];
642 nid_t child_nid; 698 nid_t child_nid;
@@ -651,7 +707,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
651 /* get indirect nodes in the path */ 707 /* get indirect nodes in the path */
652 for (i = 0; i < idx + 1; i++) { 708 for (i = 0; i < idx + 1; i++) {
653 /* reference count'll be increased */ 709 /* reference count'll be increased */
654 pages[i] = get_node_page(sbi, nid[i]); 710 pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
655 if (IS_ERR(pages[i])) { 711 if (IS_ERR(pages[i])) {
656 err = PTR_ERR(pages[i]); 712 err = PTR_ERR(pages[i]);
657 idx = i - 1; 713 idx = i - 1;
@@ -696,7 +752,7 @@ fail:
696 */ 752 */
697int truncate_inode_blocks(struct inode *inode, pgoff_t from) 753int truncate_inode_blocks(struct inode *inode, pgoff_t from)
698{ 754{
699 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 755 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
700 int err = 0, cont = 1; 756 int err = 0, cont = 1;
701 int level, offset[4], noffset[4]; 757 int level, offset[4], noffset[4];
702 unsigned int nofs = 0; 758 unsigned int nofs = 0;
@@ -792,7 +848,7 @@ fail:
792 848
793int truncate_xattr_node(struct inode *inode, struct page *page) 849int truncate_xattr_node(struct inode *inode, struct page *page)
794{ 850{
795 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 851 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
796 nid_t nid = F2FS_I(inode)->i_xattr_nid; 852 nid_t nid = F2FS_I(inode)->i_xattr_nid;
797 struct dnode_of_data dn; 853 struct dnode_of_data dn;
798 struct page *npage; 854 struct page *npage;
@@ -840,7 +896,8 @@ void remove_inode_page(struct inode *inode)
840 truncate_data_blocks_range(&dn, 1); 896 truncate_data_blocks_range(&dn, 1);
841 897
842 /* 0 is possible, after f2fs_new_inode() has failed */ 898 /* 0 is possible, after f2fs_new_inode() has failed */
843 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); 899 f2fs_bug_on(F2FS_I_SB(inode),
900 inode->i_blocks != 0 && inode->i_blocks != 1);
844 901
845 /* will put inode & node pages */ 902 /* will put inode & node pages */
846 truncate_node(&dn); 903 truncate_node(&dn);
@@ -860,7 +917,7 @@ struct page *new_inode_page(struct inode *inode)
860struct page *new_node_page(struct dnode_of_data *dn, 917struct page *new_node_page(struct dnode_of_data *dn,
861 unsigned int ofs, struct page *ipage) 918 unsigned int ofs, struct page *ipage)
862{ 919{
863 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 920 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
864 struct node_info old_ni, new_ni; 921 struct node_info old_ni, new_ni;
865 struct page *page; 922 struct page *page;
866 int err; 923 int err;
@@ -880,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
880 get_node_info(sbi, dn->nid, &old_ni); 937 get_node_info(sbi, dn->nid, &old_ni);
881 938
882 /* Reinitialize old_ni with new node page */ 939 /* Reinitialize old_ni with new node page */
883 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); 940 f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
884 new_ni = old_ni; 941 new_ni = old_ni;
885 new_ni.ino = dn->inode->i_ino; 942 new_ni.ino = dn->inode->i_ino;
886 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 943 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
@@ -918,7 +975,7 @@ fail:
918 */ 975 */
919static int read_node_page(struct page *page, int rw) 976static int read_node_page(struct page *page, int rw)
920{ 977{
921 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 978 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
922 struct node_info ni; 979 struct node_info ni;
923 980
924 get_node_info(sbi, page->index, &ni); 981 get_node_info(sbi, page->index, &ni);
@@ -994,7 +1051,7 @@ got_it:
994 */ 1051 */
995struct page *get_node_page_ra(struct page *parent, int start) 1052struct page *get_node_page_ra(struct page *parent, int start)
996{ 1053{
997 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); 1054 struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
998 struct blk_plug plug; 1055 struct blk_plug plug;
999 struct page *page; 1056 struct page *page;
1000 int err, i, end; 1057 int err, i, end;
@@ -1124,10 +1181,14 @@ continue_unlock:
1124 1181
1125 /* called by fsync() */ 1182 /* called by fsync() */
1126 if (ino && IS_DNODE(page)) { 1183 if (ino && IS_DNODE(page)) {
1127 int mark = !is_checkpointed_node(sbi, ino);
1128 set_fsync_mark(page, 1); 1184 set_fsync_mark(page, 1);
1129 if (IS_INODE(page)) 1185 if (IS_INODE(page)) {
1130 set_dentry_mark(page, mark); 1186 if (!is_checkpointed_node(sbi, ino) &&
1187 !has_fsynced_inode(sbi, ino))
1188 set_dentry_mark(page, 1);
1189 else
1190 set_dentry_mark(page, 0);
1191 }
1131 nwritten++; 1192 nwritten++;
1132 } else { 1193 } else {
1133 set_fsync_mark(page, 0); 1194 set_fsync_mark(page, 0);
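The rewritten branch above feeds roll-forward recovery: during fsync writeback every dnode gets the fsync mark, but an inode page gets the dentry mark only when the inode is neither checkpointed nor already covered by an earlier fsync. A minimal sketch of that predicate, assuming only the two helpers named in the hunk (the kernel keeps the test inline):

/* sketch: equivalent shape of the inline dentry-mark test */
static inline bool need_dentry_mark_sketch(struct f2fs_sb_info *sbi, nid_t ino)
{
	return !is_checkpointed_node(sbi, ino) &&
		!has_fsynced_inode(sbi, ino);
}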
@@ -1206,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1206static int f2fs_write_node_page(struct page *page, 1267static int f2fs_write_node_page(struct page *page,
1207 struct writeback_control *wbc) 1268 struct writeback_control *wbc)
1208{ 1269{
1209 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1270 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1210 nid_t nid; 1271 nid_t nid;
1211 block_t new_addr; 1272 block_t new_addr;
1212 struct node_info ni; 1273 struct node_info ni;
@@ -1226,7 +1287,7 @@ static int f2fs_write_node_page(struct page *page,
1226 1287
1227 /* get old block addr of this node page */ 1288 /* get old block addr of this node page */
1228 nid = nid_of_node(page); 1289 nid = nid_of_node(page);
1229 f2fs_bug_on(page->index != nid); 1290 f2fs_bug_on(sbi, page->index != nid);
1230 1291
1231 get_node_info(sbi, nid, &ni); 1292 get_node_info(sbi, nid, &ni);
1232 1293
@@ -1257,7 +1318,7 @@ redirty_out:
1257static int f2fs_write_node_pages(struct address_space *mapping, 1318static int f2fs_write_node_pages(struct address_space *mapping,
1258 struct writeback_control *wbc) 1319 struct writeback_control *wbc)
1259{ 1320{
1260 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 1321 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1261 long diff; 1322 long diff;
1262 1323
1263 trace_f2fs_writepages(mapping->host, wbc, NODE); 1324 trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -1282,15 +1343,12 @@ skip_write:
1282 1343
1283static int f2fs_set_node_page_dirty(struct page *page) 1344static int f2fs_set_node_page_dirty(struct page *page)
1284{ 1345{
1285 struct address_space *mapping = page->mapping;
1286 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1287
1288 trace_f2fs_set_page_dirty(page, NODE); 1346 trace_f2fs_set_page_dirty(page, NODE);
1289 1347
1290 SetPageUptodate(page); 1348 SetPageUptodate(page);
1291 if (!PageDirty(page)) { 1349 if (!PageDirty(page)) {
1292 __set_page_dirty_nobuffers(page); 1350 __set_page_dirty_nobuffers(page);
1293 inc_page_count(sbi, F2FS_DIRTY_NODES); 1351 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1294 SetPagePrivate(page); 1352 SetPagePrivate(page);
1295 return 1; 1353 return 1;
1296 } 1354 }
@@ -1301,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1301 unsigned int length) 1359 unsigned int length)
1302{ 1360{
1303 struct inode *inode = page->mapping->host; 1361 struct inode *inode = page->mapping->host;
1304 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1305 if (PageDirty(page)) 1362 if (PageDirty(page))
1306 dec_page_count(sbi, F2FS_DIRTY_NODES); 1363 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
1307 ClearPagePrivate(page); 1364 ClearPagePrivate(page);
1308} 1365}
1309 1366
@@ -1356,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
1356 read_lock(&nm_i->nat_tree_lock); 1413 read_lock(&nm_i->nat_tree_lock);
1357 ne = __lookup_nat_cache(nm_i, nid); 1414 ne = __lookup_nat_cache(nm_i, nid);
1358 if (ne && 1415 if (ne &&
1359 (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) 1416 (!get_nat_flag(ne, IS_CHECKPOINTED) ||
1417 nat_get_blkaddr(ne) != NULL_ADDR))
1360 allocated = true; 1418 allocated = true;
1361 read_unlock(&nm_i->nat_tree_lock); 1419 read_unlock(&nm_i->nat_tree_lock);
1362 if (allocated) 1420 if (allocated)
@@ -1413,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
1413 break; 1471 break;
1414 1472
1415 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); 1473 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1416 f2fs_bug_on(blk_addr == NEW_ADDR); 1474 f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
1417 if (blk_addr == NULL_ADDR) { 1475 if (blk_addr == NULL_ADDR) {
1418 if (add_free_nid(sbi, start_nid, true) < 0) 1476 if (add_free_nid(sbi, start_nid, true) < 0)
1419 break; 1477 break;
@@ -1483,12 +1541,12 @@ retry:
1483 1541
1484 /* We should not use stale free nids created by build_free_nids */ 1542 /* We should not use stale free nids created by build_free_nids */
1485 if (nm_i->fcnt && !on_build_free_nids(nm_i)) { 1543 if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
1486 f2fs_bug_on(list_empty(&nm_i->free_nid_list)); 1544 f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
1487 list_for_each_entry(i, &nm_i->free_nid_list, list) 1545 list_for_each_entry(i, &nm_i->free_nid_list, list)
1488 if (i->state == NID_NEW) 1546 if (i->state == NID_NEW)
1489 break; 1547 break;
1490 1548
1491 f2fs_bug_on(i->state != NID_NEW); 1549 f2fs_bug_on(sbi, i->state != NID_NEW);
1492 *nid = i->nid; 1550 *nid = i->nid;
1493 i->state = NID_ALLOC; 1551 i->state = NID_ALLOC;
1494 nm_i->fcnt--; 1552 nm_i->fcnt--;
@@ -1514,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1514 1572
1515 spin_lock(&nm_i->free_nid_list_lock); 1573 spin_lock(&nm_i->free_nid_list_lock);
1516 i = __lookup_free_nid_list(nm_i, nid); 1574 i = __lookup_free_nid_list(nm_i, nid);
1517 f2fs_bug_on(!i || i->state != NID_ALLOC); 1575 f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1518 __del_from_free_nid_list(nm_i, i); 1576 __del_from_free_nid_list(nm_i, i);
1519 spin_unlock(&nm_i->free_nid_list_lock); 1577 spin_unlock(&nm_i->free_nid_list_lock);
1520 1578
@@ -1535,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1535 1593
1536 spin_lock(&nm_i->free_nid_list_lock); 1594 spin_lock(&nm_i->free_nid_list_lock);
1537 i = __lookup_free_nid_list(nm_i, nid); 1595 i = __lookup_free_nid_list(nm_i, nid);
1538 f2fs_bug_on(!i || i->state != NID_ALLOC); 1596 f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1539 if (!available_free_memory(sbi, FREE_NIDS)) { 1597 if (!available_free_memory(sbi, FREE_NIDS)) {
1540 __del_from_free_nid_list(nm_i, i); 1598 __del_from_free_nid_list(nm_i, i);
1541 need_free = true; 1599 need_free = true;
@@ -1551,14 +1609,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1551 1609
1552void recover_inline_xattr(struct inode *inode, struct page *page) 1610void recover_inline_xattr(struct inode *inode, struct page *page)
1553{ 1611{
1554 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1555 void *src_addr, *dst_addr; 1612 void *src_addr, *dst_addr;
1556 size_t inline_size; 1613 size_t inline_size;
1557 struct page *ipage; 1614 struct page *ipage;
1558 struct f2fs_inode *ri; 1615 struct f2fs_inode *ri;
1559 1616
1560 ipage = get_node_page(sbi, inode->i_ino); 1617 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
1561 f2fs_bug_on(IS_ERR(ipage)); 1618 f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
1562 1619
1563 ri = F2FS_INODE(page); 1620 ri = F2FS_INODE(page);
1564 if (!(ri->i_inline & F2FS_INLINE_XATTR)) { 1621 if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
@@ -1579,7 +1636,7 @@ update_inode:
1579 1636
1580void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) 1637void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1581{ 1638{
1582 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1639 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1583 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; 1640 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
1584 nid_t new_xnid = nid_of_node(page); 1641 nid_t new_xnid = nid_of_node(page);
1585 struct node_info ni; 1642 struct node_info ni;
@@ -1590,7 +1647,7 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1590 1647
1591 /* Deallocate node address */ 1648 /* Deallocate node address */
1592 get_node_info(sbi, prev_xnid, &ni); 1649 get_node_info(sbi, prev_xnid, &ni);
1593 f2fs_bug_on(ni.blk_addr == NULL_ADDR); 1650 f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
1594 invalidate_blocks(sbi, ni.blk_addr); 1651 invalidate_blocks(sbi, ni.blk_addr);
1595 dec_valid_node_count(sbi, inode); 1652 dec_valid_node_count(sbi, inode);
1596 set_node_addr(sbi, &ni, NULL_ADDR, false); 1653 set_node_addr(sbi, &ni, NULL_ADDR, false);
@@ -1598,7 +1655,7 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1598recover_xnid: 1655recover_xnid:
1599 /* 2: allocate new xattr nid */ 1656 /* 2: allocate new xattr nid */
1600 if (unlikely(!inc_valid_node_count(sbi, inode))) 1657 if (unlikely(!inc_valid_node_count(sbi, inode)))
1601 f2fs_bug_on(1); 1658 f2fs_bug_on(sbi, 1);
1602 1659
1603 remove_free_nid(NM_I(sbi), new_xnid); 1660 remove_free_nid(NM_I(sbi), new_xnid);
1604 get_node_info(sbi, new_xnid, &ni); 1661 get_node_info(sbi, new_xnid, &ni);
@@ -1691,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1691 struct f2fs_summary *sum_entry; 1748 struct f2fs_summary *sum_entry;
1692 struct inode *inode = sbi->sb->s_bdev->bd_inode; 1749 struct inode *inode = sbi->sb->s_bdev->bd_inode;
1693 block_t addr; 1750 block_t addr;
1694 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1751 int bio_blocks = MAX_BIO_BLOCKS(sbi);
1695 struct page *pages[bio_blocks]; 1752 struct page *pages[bio_blocks];
1696 int i, idx, last_offset, nrpages, err = 0; 1753 int i, idx, last_offset, nrpages, err = 0;
1697 1754
@@ -1733,89 +1790,6 @@ skip:
1733 return err; 1790 return err;
1734} 1791}
1735 1792
1736static struct nat_entry_set *grab_nat_entry_set(void)
1737{
1738 struct nat_entry_set *nes =
1739 f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
1740
1741 nes->entry_cnt = 0;
1742 INIT_LIST_HEAD(&nes->set_list);
1743 INIT_LIST_HEAD(&nes->entry_list);
1744 return nes;
1745}
1746
1747static void release_nat_entry_set(struct nat_entry_set *nes,
1748 struct f2fs_nm_info *nm_i)
1749{
1750 f2fs_bug_on(!list_empty(&nes->entry_list));
1751
1752 nm_i->dirty_nat_cnt -= nes->entry_cnt;
1753 list_del(&nes->set_list);
1754 kmem_cache_free(nat_entry_set_slab, nes);
1755}
1756
1757static void adjust_nat_entry_set(struct nat_entry_set *nes,
1758 struct list_head *head)
1759{
1760 struct nat_entry_set *next = nes;
1761
1762 if (list_is_last(&nes->set_list, head))
1763 return;
1764
1765 list_for_each_entry_continue(next, head, set_list)
1766 if (nes->entry_cnt <= next->entry_cnt)
1767 break;
1768
1769 list_move_tail(&nes->set_list, &next->set_list);
1770}
1771
1772static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
1773{
1774 struct nat_entry_set *nes;
1775 nid_t start_nid = START_NID(ne->ni.nid);
1776
1777 list_for_each_entry(nes, head, set_list) {
1778 if (nes->start_nid == start_nid) {
1779 list_move_tail(&ne->list, &nes->entry_list);
1780 nes->entry_cnt++;
1781 adjust_nat_entry_set(nes, head);
1782 return;
1783 }
1784 }
1785
1786 nes = grab_nat_entry_set();
1787
1788 nes->start_nid = start_nid;
1789 list_move_tail(&ne->list, &nes->entry_list);
1790 nes->entry_cnt++;
1791 list_add(&nes->set_list, head);
1792}
1793
1794static void merge_nats_in_set(struct f2fs_sb_info *sbi)
1795{
1796 struct f2fs_nm_info *nm_i = NM_I(sbi);
1797 struct list_head *dirty_list = &nm_i->dirty_nat_entries;
1798 struct list_head *set_list = &nm_i->nat_entry_set;
1799 struct nat_entry *ne, *tmp;
1800
1801 write_lock(&nm_i->nat_tree_lock);
1802 list_for_each_entry_safe(ne, tmp, dirty_list, list) {
1803 if (nat_get_blkaddr(ne) == NEW_ADDR)
1804 continue;
1805 add_nat_entry(ne, set_list);
1806 nm_i->dirty_nat_cnt++;
1807 }
1808 write_unlock(&nm_i->nat_tree_lock);
1809}
1810
1811static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
1812{
1813 if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES)
1814 return true;
1815 else
1816 return false;
1817}
1818
1819static void remove_nats_in_journal(struct f2fs_sb_info *sbi) 1793static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1820{ 1794{
1821 struct f2fs_nm_info *nm_i = NM_I(sbi); 1795 struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1850,99 +1824,130 @@ found:
1850 mutex_unlock(&curseg->curseg_mutex); 1824 mutex_unlock(&curseg->curseg_mutex);
1851} 1825}
1852 1826
1853/* 1827static void __adjust_nat_entry_set(struct nat_entry_set *nes,
1854 * This function is called during the checkpointing process. 1828 struct list_head *head, int max)
1855 */
1856void flush_nat_entries(struct f2fs_sb_info *sbi)
1857{ 1829{
1858 struct f2fs_nm_info *nm_i = NM_I(sbi); 1830 struct nat_entry_set *cur;
1859 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1860 struct f2fs_summary_block *sum = curseg->sum_blk;
1861 struct nat_entry_set *nes, *tmp;
1862 struct list_head *head = &nm_i->nat_entry_set;
1863 bool to_journal = true;
1864 1831
1865 /* merge nat entries of dirty list to nat entry set temporarily */ 1832 if (nes->entry_cnt >= max)
1866 merge_nats_in_set(sbi); 1833 goto add_out;
1867 1834
1868 /* 1835 list_for_each_entry(cur, head, set_list) {
1869 * if there are no enough space in journal to store dirty nat 1836 if (cur->entry_cnt >= nes->entry_cnt) {
1870 * entries, remove all entries from journal and merge them 1837 list_add(&nes->set_list, cur->set_list.prev);
1871 * into nat entry set. 1838 return;
1872 */ 1839 }
1873 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
1874 remove_nats_in_journal(sbi);
1875
1876 /*
1877 * merge nat entries of dirty list to nat entry set temporarily
1878 */
1879 merge_nats_in_set(sbi);
1880 } 1840 }
1841add_out:
1842 list_add_tail(&nes->set_list, head);
1843}
1881 1844
1882 if (!nm_i->dirty_nat_cnt) 1845static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1883 return; 1846 struct nat_entry_set *set)
1847{
1848 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1849 struct f2fs_summary_block *sum = curseg->sum_blk;
1850 nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
1851 bool to_journal = true;
1852 struct f2fs_nat_block *nat_blk;
1853 struct nat_entry *ne, *cur;
1854 struct page *page = NULL;
1884 1855
1885 /* 1856 /*
1886 * there are two steps to flush nat entries: 1857 * there are two steps to flush nat entries:
1887 * #1, flush nat entries to journal in current hot data summary block. 1858 * #1, flush nat entries to journal in current hot data summary block.
1888 * #2, flush nat entries to nat page. 1859 * #2, flush nat entries to nat page.
1889 */ 1860 */
1890 list_for_each_entry_safe(nes, tmp, head, set_list) { 1861 if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
1891 struct f2fs_nat_block *nat_blk; 1862 to_journal = false;
1892 struct nat_entry *ne, *cur;
1893 struct page *page;
1894 nid_t start_nid = nes->start_nid;
1895 1863
1896 if (to_journal && !__has_cursum_space(sum, nes->entry_cnt)) 1864 if (to_journal) {
1897 to_journal = false; 1865 mutex_lock(&curseg->curseg_mutex);
1866 } else {
1867 page = get_next_nat_page(sbi, start_nid);
1868 nat_blk = page_address(page);
1869 f2fs_bug_on(sbi, !nat_blk);
1870 }
1871
1872 /* flush dirty nats in nat entry set */
1873 list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
1874 struct f2fs_nat_entry *raw_ne;
1875 nid_t nid = nat_get_nid(ne);
1876 int offset;
1877
1878 if (nat_get_blkaddr(ne) == NEW_ADDR)
1879 continue;
1898 1880
1899 if (to_journal) { 1881 if (to_journal) {
1900 mutex_lock(&curseg->curseg_mutex); 1882 offset = lookup_journal_in_cursum(sum,
1883 NAT_JOURNAL, nid, 1);
1884 f2fs_bug_on(sbi, offset < 0);
1885 raw_ne = &nat_in_journal(sum, offset);
1886 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1901 } else { 1887 } else {
1902 page = get_next_nat_page(sbi, start_nid); 1888 raw_ne = &nat_blk->entries[nid - start_nid];
1903 nat_blk = page_address(page);
1904 f2fs_bug_on(!nat_blk);
1905 } 1889 }
1890 raw_nat_from_node_info(raw_ne, &ne->ni);
1906 1891
1907 /* flush dirty nats in nat entry set */ 1892 write_lock(&NM_I(sbi)->nat_tree_lock);
1908 list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { 1893 nat_reset_flag(ne);
1909 struct f2fs_nat_entry *raw_ne; 1894 __clear_nat_cache_dirty(NM_I(sbi), ne);
1910 nid_t nid = nat_get_nid(ne); 1895 write_unlock(&NM_I(sbi)->nat_tree_lock);
1911 int offset;
1912 1896
1913 if (to_journal) { 1897 if (nat_get_blkaddr(ne) == NULL_ADDR)
1914 offset = lookup_journal_in_cursum(sum, 1898 add_free_nid(sbi, nid, false);
1915 NAT_JOURNAL, nid, 1); 1899 }
1916 f2fs_bug_on(offset < 0);
1917 raw_ne = &nat_in_journal(sum, offset);
1918 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1919 } else {
1920 raw_ne = &nat_blk->entries[nid - start_nid];
1921 }
1922 raw_nat_from_node_info(raw_ne, &ne->ni);
1923 1900
1924 if (nat_get_blkaddr(ne) == NULL_ADDR && 1901 if (to_journal)
1925 add_free_nid(sbi, nid, false) <= 0) { 1902 mutex_unlock(&curseg->curseg_mutex);
1926 write_lock(&nm_i->nat_tree_lock); 1903 else
1927 __del_from_nat_cache(nm_i, ne); 1904 f2fs_put_page(page, 1);
1928 write_unlock(&nm_i->nat_tree_lock);
1929 } else {
1930 write_lock(&nm_i->nat_tree_lock);
1931 __clear_nat_cache_dirty(nm_i, ne);
1932 write_unlock(&nm_i->nat_tree_lock);
1933 }
1934 }
1935 1905
1936 if (to_journal) 1906 if (!set->entry_cnt) {
1937 mutex_unlock(&curseg->curseg_mutex); 1907 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
1938 else 1908 kmem_cache_free(nat_entry_set_slab, set);
1939 f2fs_put_page(page, 1); 1909 }
1910}
1911
1912/*
1913 * This function is called during the checkpointing process.
1914 */
1915void flush_nat_entries(struct f2fs_sb_info *sbi)
1916{
1917 struct f2fs_nm_info *nm_i = NM_I(sbi);
1918 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1919 struct f2fs_summary_block *sum = curseg->sum_blk;
1920 struct nat_entry_set *setvec[NATVEC_SIZE];
1921 struct nat_entry_set *set, *tmp;
1922 unsigned int found;
1923 nid_t set_idx = 0;
1924 LIST_HEAD(sets);
1925
1926 /*
 1927 * if there is not enough space in the journal to store dirty nat
 1928 * entries, remove all entries from the journal and merge them
 1929 * into the nat entry set.
1930 */
1931 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
1932 remove_nats_in_journal(sbi);
1940 1933
1941 release_nat_entry_set(nes, nm_i); 1934 if (!nm_i->dirty_nat_cnt)
1935 return;
1936
1937 while ((found = __gang_lookup_nat_set(nm_i,
1938 set_idx, NATVEC_SIZE, setvec))) {
1939 unsigned idx;
1940 set_idx = setvec[found - 1]->set + 1;
1941 for (idx = 0; idx < found; idx++)
1942 __adjust_nat_entry_set(setvec[idx], &sets,
1943 MAX_NAT_JENTRIES(sum));
1942 } 1944 }
1943 1945
1944 f2fs_bug_on(!list_empty(head)); 1946 /* flush dirty nats in nat entry set */
1945 f2fs_bug_on(nm_i->dirty_nat_cnt); 1947 list_for_each_entry_safe(set, tmp, &sets, set_list)
1948 __flush_nat_entry_set(sbi, set);
1949
1950 f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
1946} 1951}
1947 1952
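Two details of the rewritten flush path above are worth tracing. __adjust_nat_entry_set() keeps the local list ordered by entry_cnt, with any set at or beyond the journal limit appended straight to the tail, so journal slots go to the smallest sets first; and the gang lookup resumes each pass at setvec[found - 1]->set + 1, so the radix-tree walk never revisits a set. A worked example under assumed counts:

/* worked example (sketch): pending sets with entry_cnt {7, 3, 9, 5}
 * and MAX_NAT_JENTRIES(sum) == 8
 *
 *   9 >= max -> add_out: appended to the tail unsorted
 *   3, 5, 7  -> insertion-sorted in ascending order
 *
 *   resulting flush order: 3 -> 5 -> 7 -> 9
 *
 * __flush_nat_entry_set() therefore journals whole small sets while
 * space lasts; the oversized set falls through to a NAT page. */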
1948static int init_node_manager(struct f2fs_sb_info *sbi) 1953static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1969,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1969 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); 1974 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
1970 INIT_LIST_HEAD(&nm_i->free_nid_list); 1975 INIT_LIST_HEAD(&nm_i->free_nid_list);
1971 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1976 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1977 INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
1972 INIT_LIST_HEAD(&nm_i->nat_entries); 1978 INIT_LIST_HEAD(&nm_i->nat_entries);
1973 INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
1974 INIT_LIST_HEAD(&nm_i->nat_entry_set);
1975 1979
1976 mutex_init(&nm_i->build_lock); 1980 mutex_init(&nm_i->build_lock);
1977 spin_lock_init(&nm_i->free_nid_list_lock); 1981 spin_lock_init(&nm_i->free_nid_list_lock);
@@ -2020,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2020 /* destroy free nid list */ 2024 /* destroy free nid list */
2021 spin_lock(&nm_i->free_nid_list_lock); 2025 spin_lock(&nm_i->free_nid_list_lock);
2022 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { 2026 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
2023 f2fs_bug_on(i->state == NID_ALLOC); 2027 f2fs_bug_on(sbi, i->state == NID_ALLOC);
2024 __del_from_free_nid_list(nm_i, i); 2028 __del_from_free_nid_list(nm_i, i);
2025 nm_i->fcnt--; 2029 nm_i->fcnt--;
2026 spin_unlock(&nm_i->free_nid_list_lock); 2030 spin_unlock(&nm_i->free_nid_list_lock);
2027 kmem_cache_free(free_nid_slab, i); 2031 kmem_cache_free(free_nid_slab, i);
2028 spin_lock(&nm_i->free_nid_list_lock); 2032 spin_lock(&nm_i->free_nid_list_lock);
2029 } 2033 }
2030 f2fs_bug_on(nm_i->fcnt); 2034 f2fs_bug_on(sbi, nm_i->fcnt);
2031 spin_unlock(&nm_i->free_nid_list_lock); 2035 spin_unlock(&nm_i->free_nid_list_lock);
2032 2036
2033 /* destroy nat cache */ 2037 /* destroy nat cache */
@@ -2039,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2039 for (idx = 0; idx < found; idx++) 2043 for (idx = 0; idx < found; idx++)
2040 __del_from_nat_cache(nm_i, natvec[idx]); 2044 __del_from_nat_cache(nm_i, natvec[idx]);
2041 } 2045 }
2042 f2fs_bug_on(nm_i->nat_cnt); 2046 f2fs_bug_on(sbi, nm_i->nat_cnt);
2043 write_unlock(&nm_i->nat_tree_lock); 2047 write_unlock(&nm_i->nat_tree_lock);
2044 2048
2045 kfree(nm_i->nat_bitmap); 2049 kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 8a116a407599..8d5e6e0dd840 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -39,10 +39,16 @@ struct node_info {
39 unsigned char version; /* version of the node */ 39 unsigned char version; /* version of the node */
40}; 40};
41 41
42enum {
43 IS_CHECKPOINTED, /* is it checkpointed before? */
44 HAS_FSYNCED_INODE, /* is the inode fsynced before? */
45 HAS_LAST_FSYNC, /* has the latest node fsync mark? */
 46 IS_DIRTY, /* is this nat entry dirty? */
47};
48
42struct nat_entry { 49struct nat_entry {
43 struct list_head list; /* for clean or dirty nat list */ 50 struct list_head list; /* for clean or dirty nat list */
44 bool checkpointed; /* whether it is checkpointed or not */ 51 unsigned char flag; /* for node information bits */
45 bool fsync_done; /* whether the latest node has fsync mark */
46 struct node_info ni; /* in-memory node information */ 52 struct node_info ni; /* in-memory node information */
47}; 53};
48 54
@@ -55,18 +61,32 @@ struct nat_entry {
55#define nat_get_version(nat) (nat->ni.version) 61#define nat_get_version(nat) (nat->ni.version)
56#define nat_set_version(nat, v) (nat->ni.version = v) 62#define nat_set_version(nat, v) (nat->ni.version = v)
57 63
58#define __set_nat_cache_dirty(nm_i, ne) \
59 do { \
60 ne->checkpointed = false; \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
62 } while (0)
63#define __clear_nat_cache_dirty(nm_i, ne) \
64 do { \
65 ne->checkpointed = true; \
66 list_move_tail(&ne->list, &nm_i->nat_entries); \
67 } while (0)
68#define inc_node_version(version) (++version) 64#define inc_node_version(version) (++version)
69 65
66static inline void set_nat_flag(struct nat_entry *ne,
67 unsigned int type, bool set)
68{
69 unsigned char mask = 0x01 << type;
70 if (set)
71 ne->flag |= mask;
72 else
73 ne->flag &= ~mask;
74}
75
76static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
77{
78 unsigned char mask = 0x01 << type;
79 return ne->flag & mask;
80}
81
82static inline void nat_reset_flag(struct nat_entry *ne)
83{
 84 /* these states can be set only after a checkpoint is done */
85 set_nat_flag(ne, IS_CHECKPOINTED, true);
86 set_nat_flag(ne, HAS_FSYNCED_INODE, false);
87 set_nat_flag(ne, HAS_LAST_FSYNC, true);
88}
89
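The single flag byte replaces the two bools; each enum value above selects one bit. A sketch of the helpers in action (the mask values follow from the enum order, not from a running kernel):

/* sketch: IS_CHECKPOINTED = bit 0, HAS_FSYNCED_INODE = bit 1,
 * HAS_LAST_FSYNC = bit 2, IS_DIRTY = bit 3 */
struct nat_entry ne = { .flag = 0 };

set_nat_flag(&ne, IS_CHECKPOINTED, true);	/* flag == 0x01 */
set_nat_flag(&ne, IS_DIRTY, true);		/* flag == 0x09 */
nat_reset_flag(&ne);				/* flag == 0x0d: bits 0 and 2 set,
						 * bit 1 cleared, IS_DIRTY untouched */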
70static inline void node_info_from_raw_nat(struct node_info *ni, 90static inline void node_info_from_raw_nat(struct node_info *ni,
71 struct f2fs_nat_entry *raw_ne) 91 struct f2fs_nat_entry *raw_ne)
72{ 92{
@@ -90,9 +110,9 @@ enum mem_type {
90}; 110};
91 111
92struct nat_entry_set { 112struct nat_entry_set {
93 struct list_head set_list; /* link with all nat sets */ 113 struct list_head set_list; /* link with other nat sets */
94 struct list_head entry_list; /* link with dirty nat entries */ 114 struct list_head entry_list; /* link with dirty nat entries */
 95 nid_t start_nid; /* start nid of nats in set */ 115 nid_t set; /* set number */
96 unsigned int entry_cnt; /* the # of nat entries in set */ 116 unsigned int entry_cnt; /* the # of nat entries in set */
97}; 117};
98 118
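Keying a set by NAT block number instead of its start nid gives a dense radix-tree index; the start nid is recomputed at flush time in __flush_nat_entry_set() in node.c above. The arithmetic, as a sketch (START_NID() is assumed to reduce to the same division):

/* sketch: nid <-> set mapping */
nid_t nid = 12345;
nid_t set = nid / NAT_ENTRY_PER_BLOCK;		/* radix-tree key */
nid_t start_nid = set * NAT_ENTRY_PER_BLOCK;	/* first nid in the NAT block */
unsigned int off = nid - start_nid;		/* slot in nat_blk->entries[] */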
@@ -110,18 +130,19 @@ struct free_nid {
110 int state; /* in use or not: NID_NEW or NID_ALLOC */ 130 int state; /* in use or not: NID_NEW or NID_ALLOC */
111}; 131};
112 132
113static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) 133static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
114{ 134{
115 struct f2fs_nm_info *nm_i = NM_I(sbi); 135 struct f2fs_nm_info *nm_i = NM_I(sbi);
116 struct free_nid *fnid; 136 struct free_nid *fnid;
117 137
118 if (nm_i->fcnt <= 0)
119 return -1;
120 spin_lock(&nm_i->free_nid_list_lock); 138 spin_lock(&nm_i->free_nid_list_lock);
139 if (nm_i->fcnt <= 0) {
140 spin_unlock(&nm_i->free_nid_list_lock);
141 return;
142 }
121 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); 143 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
122 *nid = fnid->nid; 144 *nid = fnid->nid;
123 spin_unlock(&nm_i->free_nid_list_lock); 145 spin_unlock(&nm_i->free_nid_list_lock);
124 return 0;
125} 146}
126 147
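Since the emptiness check now happens under free_nid_list_lock, the helper can no longer report failure and returns void; a caller must pre-initialize *nid and treat an unchanged value as "no free nid cached". A hypothetical caller-side sketch:

/* sketch: hypothetical caller after the signature change */
nid_t nid = 0;

next_free_nid(sbi, &nid);	/* leaves nid untouched if the list is empty */
if (nid)
	pr_info("next free nid: %u\n", nid);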
127/* 148/*
@@ -197,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src)
197 218
198static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) 219static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
199{ 220{
200 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 221 struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
201 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
202 struct f2fs_node *rn = F2FS_NODE(page); 222 struct f2fs_node *rn = F2FS_NODE(page);
203 223
204 rn->footer.cp_ver = ckpt->checkpoint_ver; 224 rn->footer.cp_ver = ckpt->checkpoint_ver;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 756c41cd2582..ebd013225788 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -14,6 +14,37 @@
14#include "node.h" 14#include "node.h"
15#include "segment.h" 15#include "segment.h"
16 16
17/*
18 * Roll forward recovery scenarios.
19 *
20 * [Term] F: fsync_mark, D: dentry_mark
21 *
22 * 1. inode(x) | CP | inode(x) | dnode(F)
23 * -> Update the latest inode(x).
24 *
25 * 2. inode(x) | CP | inode(F) | dnode(F)
26 * -> No problem.
27 *
28 * 3. inode(x) | CP | dnode(F) | inode(x)
29 * -> Recover to the latest dnode(F), and drop the last inode(x)
30 *
31 * 4. inode(x) | CP | dnode(F) | inode(F)
32 * -> No problem.
33 *
34 * 5. CP | inode(x) | dnode(F)
35 * -> The inode(DF) was missing. Should drop this dnode(F).
36 *
37 * 6. CP | inode(DF) | dnode(F)
38 * -> No problem.
39 *
40 * 7. CP | dnode(F) | inode(DF)
41 * -> If f2fs_iget fails, then goto next to find inode(DF).
42 *
43 * 8. CP | dnode(F) | inode(x)
44 * -> If f2fs_iget fails, then goto next to find inode(DF).
45 * But it will fail due to no inode(DF).
46 */
47
17static struct kmem_cache *fsync_entry_slab; 48static struct kmem_cache *fsync_entry_slab;
18 49
19bool space_for_roll_forward(struct f2fs_sb_info *sbi) 50bool space_for_roll_forward(struct f2fs_sb_info *sbi)
@@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
36 return NULL; 67 return NULL;
37} 68}
38 69
39static int recover_dentry(struct page *ipage, struct inode *inode) 70static int recover_dentry(struct inode *inode, struct page *ipage)
40{ 71{
41 struct f2fs_inode *raw_inode = F2FS_INODE(ipage); 72 struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
42 nid_t pino = le32_to_cpu(raw_inode->i_pino); 73 nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -75,7 +106,7 @@ retry:
75 err = -EEXIST; 106 err = -EEXIST;
76 goto out_unmap_put; 107 goto out_unmap_put;
77 } 108 }
78 err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); 109 err = acquire_orphan_inode(F2FS_I_SB(inode));
79 if (err) { 110 if (err) {
80 iput(einode); 111 iput(einode);
81 goto out_unmap_put; 112 goto out_unmap_put;
@@ -110,35 +141,28 @@ out:
110 return err; 141 return err;
111} 142}
112 143
113static int recover_inode(struct inode *inode, struct page *node_page) 144static void recover_inode(struct inode *inode, struct page *page)
114{ 145{
115 struct f2fs_inode *raw_inode = F2FS_INODE(node_page); 146 struct f2fs_inode *raw = F2FS_INODE(page);
116
117 if (!IS_INODE(node_page))
118 return 0;
119 147
120 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 148 inode->i_mode = le16_to_cpu(raw->i_mode);
121 i_size_write(inode, le64_to_cpu(raw_inode->i_size)); 149 i_size_write(inode, le64_to_cpu(raw->i_size));
122 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 150 inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
123 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 151 inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
124 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 152 inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
125 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 153 inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
126 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); 154 inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
127 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); 155 inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
128
129 if (is_dent_dnode(node_page))
130 return recover_dentry(node_page, inode);
131 156
132 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", 157 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
133 ino_of_node(node_page), raw_inode->i_name); 158 ino_of_node(page), F2FS_INODE(page)->i_name);
134 return 0;
135} 159}
136 160
137static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) 161static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
138{ 162{
139 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); 163 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
140 struct curseg_info *curseg; 164 struct curseg_info *curseg;
141 struct page *page; 165 struct page *page = NULL;
142 block_t blkaddr; 166 block_t blkaddr;
143 int err = 0; 167 int err = 0;
144 168
@@ -146,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
146 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 170 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
147 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 171 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
148 172
149 /* read node page */
150 page = alloc_page(GFP_F2FS_ZERO);
151 if (!page)
152 return -ENOMEM;
153 lock_page(page);
154
155 while (1) { 173 while (1) {
156 struct fsync_inode_entry *entry; 174 struct fsync_inode_entry *entry;
157 175
158 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); 176 if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
159 if (err) 177 return 0;
160 return err;
161 178
162 lock_page(page); 179 page = get_meta_page_ra(sbi, blkaddr);
163 180
164 if (cp_ver != cpver_of_node(page)) 181 if (cp_ver != cpver_of_node(page))
165 break; 182 break;
@@ -180,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
180 } 197 }
181 198
182 /* add this fsync inode to the list */ 199 /* add this fsync inode to the list */
183 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); 200 entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
184 if (!entry) { 201 if (!entry) {
185 err = -ENOMEM; 202 err = -ENOMEM;
186 break; 203 break;
187 } 204 }
188 205 /*
206 * CP | dnode(F) | inode(DF)
207 * For this case, we should not give up now.
208 */
189 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); 209 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
190 if (IS_ERR(entry->inode)) { 210 if (IS_ERR(entry->inode)) {
191 err = PTR_ERR(entry->inode); 211 err = PTR_ERR(entry->inode);
192 kmem_cache_free(fsync_entry_slab, entry); 212 kmem_cache_free(fsync_entry_slab, entry);
213 if (err == -ENOENT)
214 goto next;
193 break; 215 break;
194 } 216 }
195 list_add_tail(&entry->list, head); 217 list_add_tail(&entry->list, head);
196 } 218 }
197 entry->blkaddr = blkaddr; 219 entry->blkaddr = blkaddr;
198 220
199 err = recover_inode(entry->inode, page); 221 if (IS_INODE(page)) {
200 if (err && err != -ENOENT) 222 entry->last_inode = blkaddr;
201 break; 223 if (is_dent_dnode(page))
224 entry->last_dentry = blkaddr;
225 }
202next: 226next:
203 /* check next segment */ 227 /* check next segment */
204 blkaddr = next_blkaddr_of_node(page); 228 blkaddr = next_blkaddr_of_node(page);
229 f2fs_put_page(page, 1);
205 } 230 }
206 231 f2fs_put_page(page, 1);
207 unlock_page(page);
208 __free_pages(page, 0);
209
210 return err; 232 return err;
211} 233}
212 234
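Both recovery passes now share this walk: start at the warm-node curseg, read each log block through the meta readahead cache, and stop once the block address leaves the main area or the stamped checkpoint version no longer matches. Reduced to a skeleton (a sketch built only from helpers visible in this hunk):

/* sketch: the log-walk loop common to both recovery passes */
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
while (blkaddr >= MAIN_BLKADDR(sbi) && blkaddr < MAX_BLKADDR(sbi)) {
	page = get_meta_page_ra(sbi, blkaddr);
	if (cp_ver != cpver_of_node(page)) {
		f2fs_put_page(page, 1);
		break;
	}
	/* per-pass work on this node page goes here */
	blkaddr = next_blkaddr_of_node(page);
	f2fs_put_page(page, 1);
}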
@@ -279,16 +301,30 @@ got_it:
279 ino = ino_of_node(node_page); 301 ino = ino_of_node(node_page);
280 f2fs_put_page(node_page, 1); 302 f2fs_put_page(node_page, 1);
281 303
282 /* Deallocate previous index in the node page */ 304 if (ino != dn->inode->i_ino) {
283 inode = f2fs_iget(sbi->sb, ino); 305 /* Deallocate previous index in the node page */
284 if (IS_ERR(inode)) 306 inode = f2fs_iget(sbi->sb, ino);
285 return PTR_ERR(inode); 307 if (IS_ERR(inode))
308 return PTR_ERR(inode);
309 } else {
310 inode = dn->inode;
311 }
286 312
287 bidx = start_bidx_of_node(offset, F2FS_I(inode)) + 313 bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
288 le16_to_cpu(sum.ofs_in_node); 314 le16_to_cpu(sum.ofs_in_node);
289 315
290 truncate_hole(inode, bidx, bidx + 1); 316 if (ino != dn->inode->i_ino) {
291 iput(inode); 317 truncate_hole(inode, bidx, bidx + 1);
318 iput(inode);
319 } else {
320 struct dnode_of_data tdn;
321 set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
322 if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
323 return 0;
324 if (tdn.data_blkaddr != NULL_ADDR)
325 truncate_data_blocks_range(&tdn, 1);
326 f2fs_put_page(tdn.node_page, 1);
327 }
292 return 0; 328 return 0;
293} 329}
294 330
@@ -331,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
331 f2fs_wait_on_page_writeback(dn.node_page, NODE); 367 f2fs_wait_on_page_writeback(dn.node_page, NODE);
332 368
333 get_node_info(sbi, dn.nid, &ni); 369 get_node_info(sbi, dn.nid, &ni);
334 f2fs_bug_on(ni.ino != ino_of_node(page)); 370 f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
335 f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); 371 f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
336 372
337 for (; start < end; start++) { 373 for (; start < end; start++) {
338 block_t src, dest; 374 block_t src, dest;
@@ -344,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
344 if (src == NULL_ADDR) { 380 if (src == NULL_ADDR) {
345 err = reserve_new_block(&dn); 381 err = reserve_new_block(&dn);
346 /* We should not get -ENOSPC */ 382 /* We should not get -ENOSPC */
347 f2fs_bug_on(err); 383 f2fs_bug_on(sbi, err);
348 } 384 }
349 385
350 /* Check the previous node page having this index */ 386 /* Check the previous node page having this index */
@@ -386,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
386{ 422{
387 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); 423 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
388 struct curseg_info *curseg; 424 struct curseg_info *curseg;
389 struct page *page; 425 struct page *page = NULL;
390 int err = 0; 426 int err = 0;
391 block_t blkaddr; 427 block_t blkaddr;
392 428
@@ -394,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi,
394 curseg = CURSEG_I(sbi, type); 430 curseg = CURSEG_I(sbi, type);
395 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 431 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
396 432
397 /* read node page */
398 page = alloc_page(GFP_F2FS_ZERO);
399 if (!page)
400 return -ENOMEM;
401
402 lock_page(page);
403
404 while (1) { 433 while (1) {
405 struct fsync_inode_entry *entry; 434 struct fsync_inode_entry *entry;
406 435
407 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); 436 if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
408 if (err) 437 break;
409 return err;
410 438
411 lock_page(page); 439 page = get_meta_page_ra(sbi, blkaddr);
412 440
413 if (cp_ver != cpver_of_node(page)) 441 if (cp_ver != cpver_of_node(page)) {
442 f2fs_put_page(page, 1);
414 break; 443 break;
444 }
415 445
416 entry = get_fsync_inode(head, ino_of_node(page)); 446 entry = get_fsync_inode(head, ino_of_node(page));
417 if (!entry) 447 if (!entry)
418 goto next; 448 goto next;
419 449 /*
450 * inode(x) | CP | inode(x) | dnode(F)
451 * In this case, we can lose the latest inode(x).
452 * So, call recover_inode for the inode update.
453 */
454 if (entry->last_inode == blkaddr)
455 recover_inode(entry->inode, page);
456 if (entry->last_dentry == blkaddr) {
457 err = recover_dentry(entry->inode, page);
458 if (err) {
459 f2fs_put_page(page, 1);
460 break;
461 }
462 }
420 err = do_recover_data(sbi, entry->inode, page, blkaddr); 463 err = do_recover_data(sbi, entry->inode, page, blkaddr);
421 if (err) 464 if (err) {
465 f2fs_put_page(page, 1);
422 break; 466 break;
467 }
423 468
424 if (entry->blkaddr == blkaddr) { 469 if (entry->blkaddr == blkaddr) {
425 iput(entry->inode); 470 iput(entry->inode);
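Read against the scenario table at the top of this file, the two new checks above are the replay dispatch: reaching the block recorded as entry->last_inode refreshes the in-memory inode via recover_inode(), and reaching entry->last_dentry re-links it via recover_dentry(). A worked trace for one case, as a sketch:

/* worked trace (sketch), case 2: inode(x) | CP | inode(F) | dnode(F)
 *
 *   pass 1 (find_fsync_dnodes): entry->last_inode = blkaddr of inode(F)
 *   pass 2 (recover_data):      at that blkaddr -> recover_inode()
 *                               restores i_size and timestamps, then
 *                               do_recover_data() replays the dnode(F) */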
@@ -429,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
429next: 474next:
430 /* check next segment */ 475 /* check next segment */
431 blkaddr = next_blkaddr_of_node(page); 476 blkaddr = next_blkaddr_of_node(page);
477 f2fs_put_page(page, 1);
432 } 478 }
433
434 unlock_page(page);
435 __free_pages(page, 0);
436
437 if (!err) 479 if (!err)
438 allocate_new_segments(sbi); 480 allocate_new_segments(sbi);
439 return err; 481 return err;
@@ -474,11 +516,15 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
474 /* step #2: recover data */ 516 /* step #2: recover data */
475 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); 517 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
476 if (!err) 518 if (!err)
477 f2fs_bug_on(!list_empty(&inode_list)); 519 f2fs_bug_on(sbi, !list_empty(&inode_list));
478out: 520out:
479 destroy_fsync_dnodes(&inode_list); 521 destroy_fsync_dnodes(&inode_list);
480 kmem_cache_destroy(fsync_entry_slab); 522 kmem_cache_destroy(fsync_entry_slab);
481 523
524 /* truncate meta pages to be used by the recovery */
525 truncate_inode_pages_range(META_MAPPING(sbi),
526 MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
527
482 if (err) { 528 if (err) {
483 truncate_inode_pages_final(NODE_MAPPING(sbi)); 529 truncate_inode_pages_final(NODE_MAPPING(sbi));
484 truncate_inode_pages_final(META_MAPPING(sbi)); 530 truncate_inode_pages_final(META_MAPPING(sbi));
@@ -494,8 +540,11 @@ out:
494 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); 540 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
495 mutex_unlock(&sbi->cp_mutex); 541 mutex_unlock(&sbi->cp_mutex);
496 } else if (need_writecp) { 542 } else if (need_writecp) {
543 struct cp_control cpc = {
544 .reason = CP_SYNC,
545 };
497 mutex_unlock(&sbi->cp_mutex); 546 mutex_unlock(&sbi->cp_mutex);
498 write_checkpoint(sbi, false); 547 write_checkpoint(sbi, &cpc);
499 } else { 548 } else {
500 mutex_unlock(&sbi->cp_mutex); 549 mutex_unlock(&sbi->cp_mutex);
501 } 550 }
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0aa337cd5bba..923cb76fdc46 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -25,6 +25,8 @@
25#define __reverse_ffz(x) __reverse_ffs(~(x)) 25#define __reverse_ffz(x) __reverse_ffs(~(x))
26 26
27static struct kmem_cache *discard_entry_slab; 27static struct kmem_cache *discard_entry_slab;
28static struct kmem_cache *sit_entry_set_slab;
29static struct kmem_cache *inmem_entry_slab;
28 30
29/* 31/*
30 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since 32 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -172,6 +174,60 @@ found_middle:
172 return result + __reverse_ffz(tmp); 174 return result + __reverse_ffz(tmp);
173} 175}
174 176
177void register_inmem_page(struct inode *inode, struct page *page)
178{
179 struct f2fs_inode_info *fi = F2FS_I(inode);
180 struct inmem_pages *new;
181
182 new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
183
184 /* add atomic page indices to the list */
185 new->page = page;
186 INIT_LIST_HEAD(&new->list);
187
188 /* increase reference count with clean state */
189 mutex_lock(&fi->inmem_lock);
190 get_page(page);
191 list_add_tail(&new->list, &fi->inmem_pages);
192 mutex_unlock(&fi->inmem_lock);
193}
194
195void commit_inmem_pages(struct inode *inode, bool abort)
196{
197 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
198 struct f2fs_inode_info *fi = F2FS_I(inode);
199 struct inmem_pages *cur, *tmp;
200 bool submit_bio = false;
201 struct f2fs_io_info fio = {
202 .type = DATA,
203 .rw = WRITE_SYNC,
204 };
205
206 f2fs_balance_fs(sbi);
207 f2fs_lock_op(sbi);
208
209 mutex_lock(&fi->inmem_lock);
210 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
211 lock_page(cur->page);
212 if (!abort && cur->page->mapping == inode->i_mapping) {
213 f2fs_wait_on_page_writeback(cur->page, DATA);
214 if (clear_page_dirty_for_io(cur->page))
215 inode_dec_dirty_pages(inode);
216 do_write_data_page(cur->page, &fio);
217 submit_bio = true;
218 }
219 f2fs_put_page(cur->page, 1);
220 list_del(&cur->list);
221 kmem_cache_free(inmem_entry_slab, cur);
222 }
223 if (submit_bio)
224 f2fs_submit_merged_bio(sbi, DATA, WRITE);
225 mutex_unlock(&fi->inmem_lock);
226
227 filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
228 f2fs_unlock_op(sbi);
229}
230
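register_inmem_page() pins a dirty page of an atomic-write file with an extra reference; commit_inmem_pages() later either writes all pinned pages back under f2fs_lock_op() (abort == false) or simply drops them (abort == true). A hedged sketch of how an atomic-write ioctl path might drive the pair; these wrappers are illustrative, not the in-tree ioctl handlers:

/* sketch: illustrative callers only */
static int sketch_commit_atomic_write(struct inode *inode)
{
	commit_inmem_pages(inode, false);	/* write pinned pages back */
	return 0;
}

static void sketch_abort_atomic_write(struct inode *inode)
{
	commit_inmem_pages(inode, true);	/* drop pinned pages */
}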
175/* 231/*
176 * This function balances dirty node and dentry pages. 232 * This function balances dirty node and dentry pages.
177 * In addition, it controls garbage collection. 233 * In addition, it controls garbage collection.
@@ -205,24 +261,20 @@ repeat:
205 if (kthread_should_stop()) 261 if (kthread_should_stop())
206 return 0; 262 return 0;
207 263
208 spin_lock(&fcc->issue_lock); 264 if (!llist_empty(&fcc->issue_list)) {
209 if (fcc->issue_list) {
210 fcc->dispatch_list = fcc->issue_list;
211 fcc->issue_list = fcc->issue_tail = NULL;
212 }
213 spin_unlock(&fcc->issue_lock);
214
215 if (fcc->dispatch_list) {
216 struct bio *bio = bio_alloc(GFP_NOIO, 0); 265 struct bio *bio = bio_alloc(GFP_NOIO, 0);
217 struct flush_cmd *cmd, *next; 266 struct flush_cmd *cmd, *next;
218 int ret; 267 int ret;
219 268
269 fcc->dispatch_list = llist_del_all(&fcc->issue_list);
270 fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
271
220 bio->bi_bdev = sbi->sb->s_bdev; 272 bio->bi_bdev = sbi->sb->s_bdev;
221 ret = submit_bio_wait(WRITE_FLUSH, bio); 273 ret = submit_bio_wait(WRITE_FLUSH, bio);
222 274
223 for (cmd = fcc->dispatch_list; cmd; cmd = next) { 275 llist_for_each_entry_safe(cmd, next,
276 fcc->dispatch_list, llnode) {
224 cmd->ret = ret; 277 cmd->ret = ret;
225 next = cmd->next;
226 complete(&cmd->wait); 278 complete(&cmd->wait);
227 } 279 }
228 bio_put(bio); 280 bio_put(bio);
@@ -230,7 +282,7 @@ repeat:
230 } 282 }
231 283
232 wait_event_interruptible(*q, 284 wait_event_interruptible(*q,
233 kthread_should_stop() || fcc->issue_list); 285 kthread_should_stop() || !llist_empty(&fcc->issue_list));
234 goto repeat; 286 goto repeat;
235} 287}
236 288
@@ -249,15 +301,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
249 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); 301 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
250 302
251 init_completion(&cmd.wait); 303 init_completion(&cmd.wait);
252 cmd.next = NULL;
253 304
254 spin_lock(&fcc->issue_lock); 305 llist_add(&cmd.llnode, &fcc->issue_list);
255 if (fcc->issue_list)
256 fcc->issue_tail->next = &cmd;
257 else
258 fcc->issue_list = &cmd;
259 fcc->issue_tail = &cmd;
260 spin_unlock(&fcc->issue_lock);
261 306
262 if (!fcc->dispatch_list) 307 if (!fcc->dispatch_list)
263 wake_up(&fcc->flush_wait_queue); 308 wake_up(&fcc->flush_wait_queue);
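The spinlocked singly linked issue list becomes a lock-free llist: any number of producers push with llist_add(), and the single flush thread drains everything in one atomic llist_del_all(). Since that returns entries in LIFO order, llist_reverse_order() restores FIFO submission order before completions are signalled. The handoff, reduced to a sketch:

/* sketch: producer side (f2fs_issue_flush), no lock taken */
init_completion(&cmd.wait);
llist_add(&cmd.llnode, &fcc->issue_list);
wake_up(&fcc->flush_wait_queue);
wait_for_completion(&cmd.wait);

/* sketch: consumer side (issue_flush_thread), single thread */
fcc->dispatch_list = llist_del_all(&fcc->issue_list);	/* atomic grab, LIFO */
fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);	/* back to FIFO */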
@@ -276,8 +321,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
276 fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); 321 fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
277 if (!fcc) 322 if (!fcc)
278 return -ENOMEM; 323 return -ENOMEM;
279 spin_lock_init(&fcc->issue_lock);
280 init_waitqueue_head(&fcc->flush_wait_queue); 324 init_waitqueue_head(&fcc->flush_wait_queue);
325 init_llist_head(&fcc->issue_list);
281 SM_I(sbi)->cmd_control_info = fcc; 326 SM_I(sbi)->cmd_control_info = fcc;
282 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, 327 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
283 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); 328 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
@@ -317,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
317 struct seg_entry *sentry = get_seg_entry(sbi, segno); 362 struct seg_entry *sentry = get_seg_entry(sbi, segno);
318 enum dirty_type t = sentry->type; 363 enum dirty_type t = sentry->type;
319 364
365 if (unlikely(t >= DIRTY)) {
366 f2fs_bug_on(sbi, 1);
367 return;
368 }
320 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) 369 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
321 dirty_i->nr_dirty[t]++; 370 dirty_i->nr_dirty[t]++;
322 } 371 }
@@ -376,8 +425,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
376static int f2fs_issue_discard(struct f2fs_sb_info *sbi, 425static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
377 block_t blkstart, block_t blklen) 426 block_t blkstart, block_t blklen)
378{ 427{
379 sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); 428 sector_t start = SECTOR_FROM_BLOCK(blkstart);
380 sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); 429 sector_t len = SECTOR_FROM_BLOCK(blklen);
381 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); 430 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
382 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); 431 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
383} 432}
@@ -392,22 +441,48 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
392 } 441 }
393} 442}
394 443
395static void add_discard_addrs(struct f2fs_sb_info *sbi, 444static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
396 unsigned int segno, struct seg_entry *se)
397{ 445{
398 struct list_head *head = &SM_I(sbi)->discard_list; 446 struct list_head *head = &SM_I(sbi)->discard_list;
399 struct discard_entry *new; 447 struct discard_entry *new;
400 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); 448 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
401 int max_blocks = sbi->blocks_per_seg; 449 int max_blocks = sbi->blocks_per_seg;
450 struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
402 unsigned long *cur_map = (unsigned long *)se->cur_valid_map; 451 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
403 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; 452 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
404 unsigned long dmap[entries]; 453 unsigned long dmap[entries];
405 unsigned int start = 0, end = -1; 454 unsigned int start = 0, end = -1;
455 bool force = (cpc->reason == CP_DISCARD);
406 int i; 456 int i;
407 457
408 if (!test_opt(sbi, DISCARD)) 458 if (!force && !test_opt(sbi, DISCARD))
409 return; 459 return;
410 460
461 if (force && !se->valid_blocks) {
462 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
463 /*
464 * if this segment is registered in the prefree list, then
465 * we should skip adding a discard candidate, and let the
466 * checkpoint do that later.
467 */
468 mutex_lock(&dirty_i->seglist_lock);
469 if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
470 mutex_unlock(&dirty_i->seglist_lock);
471 cpc->trimmed += sbi->blocks_per_seg;
472 return;
473 }
474 mutex_unlock(&dirty_i->seglist_lock);
475
476 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
477 INIT_LIST_HEAD(&new->list);
478 new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
479 new->len = sbi->blocks_per_seg;
480 list_add_tail(&new->list, head);
481 SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
482 cpc->trimmed += sbi->blocks_per_seg;
483 return;
484 }
485
411 /* zero block will be discarded through the prefree list */ 486 /* zero block will be discarded through the prefree list */
412 if (!se->valid_blocks || se->valid_blocks == max_blocks) 487 if (!se->valid_blocks || se->valid_blocks == max_blocks)
413 return; 488 return;
@@ -416,23 +491,39 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi,
416 for (i = 0; i < entries; i++) 491 for (i = 0; i < entries; i++)
417 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; 492 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
418 493
419 while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { 494 while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
420 start = __find_rev_next_bit(dmap, max_blocks, end + 1); 495 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
421 if (start >= max_blocks) 496 if (start >= max_blocks)
422 break; 497 break;
423 498
424 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); 499 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
425 500
501 if (end - start < cpc->trim_minlen)
502 continue;
503
426 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); 504 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
427 INIT_LIST_HEAD(&new->list); 505 INIT_LIST_HEAD(&new->list);
428 new->blkaddr = START_BLOCK(sbi, segno) + start; 506 new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
429 new->len = end - start; 507 new->len = end - start;
508 cpc->trimmed += end - start;
430 509
431 list_add_tail(&new->list, head); 510 list_add_tail(&new->list, head);
432 SM_I(sbi)->nr_discards += end - start; 511 SM_I(sbi)->nr_discards += end - start;
433 } 512 }
434} 513}
435 514
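The candidate bitmap above selects blocks that were valid at the last checkpoint but have since been freed: dmap = (cur ^ ckpt) & ckpt. One word as a worked example (masks chosen for illustration only):

/* sketch: one bitmap word */
unsigned long cur  = 0xA;	/* 1010: valid now                */
unsigned long ckpt = 0x6;	/* 0110: valid at last checkpoint */
unsigned long dmap = (cur ^ ckpt) & ckpt;	/* 0100 */

/* bit 2: ckpt=1, cur=0 -> freed since the checkpoint, discardable
 * bit 1: ckpt=1, cur=1 -> still valid, kept
 * bit 3: ckpt=0, cur=1 -> newly allocated, nothing on disk to trim */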
515void release_discard_addrs(struct f2fs_sb_info *sbi)
516{
517 struct list_head *head = &(SM_I(sbi)->discard_list);
518 struct discard_entry *entry, *this;
519
520 /* drop caches */
521 list_for_each_entry_safe(entry, this, head, list) {
522 list_del(&entry->list);
523 kmem_cache_free(discard_entry_slab, entry);
524 }
525}
526
436/* 527/*
437 * Should call clear_prefree_segments after checkpoint is done. 528 * Should call clear_prefree_segments after checkpoint is done.
438 */ 529 */
@@ -440,10 +531,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
440{ 531{
441 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 532 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
442 unsigned int segno; 533 unsigned int segno;
443 unsigned int total_segs = TOTAL_SEGS(sbi);
444 534
445 mutex_lock(&dirty_i->seglist_lock); 535 mutex_lock(&dirty_i->seglist_lock);
446 for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs) 536 for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
447 __set_test_and_free(sbi, segno); 537 __set_test_and_free(sbi, segno);
448 mutex_unlock(&dirty_i->seglist_lock); 538 mutex_unlock(&dirty_i->seglist_lock);
449} 539}
@@ -454,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
454 struct discard_entry *entry, *this; 544 struct discard_entry *entry, *this;
455 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 545 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
456 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 546 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
457 unsigned int total_segs = TOTAL_SEGS(sbi);
458 unsigned int start = 0, end = -1; 547 unsigned int start = 0, end = -1;
459 548
460 mutex_lock(&dirty_i->seglist_lock); 549 mutex_lock(&dirty_i->seglist_lock);
461 550
462 while (1) { 551 while (1) {
463 int i; 552 int i;
464 start = find_next_bit(prefree_map, total_segs, end + 1); 553 start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
465 if (start >= total_segs) 554 if (start >= MAIN_SEGS(sbi))
466 break; 555 break;
467 end = find_next_zero_bit(prefree_map, total_segs, start + 1); 556 end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
557 start + 1);
468 558
469 for (i = start; i < end; i++) 559 for (i = start; i < end; i++)
470 clear_bit(i, prefree_map); 560 clear_bit(i, prefree_map);
@@ -488,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
488 } 578 }
489} 579}
490 580
491static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) 581static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
492{ 582{
493 struct sit_info *sit_i = SIT_I(sbi); 583 struct sit_info *sit_i = SIT_I(sbi);
494 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) 584
585 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
495 sit_i->dirty_sentries++; 586 sit_i->dirty_sentries++;
587 return false;
588 }
589
590 return true;
496} 591}
497 592
498static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, 593static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
@@ -516,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
516 new_vblocks = se->valid_blocks + del; 611 new_vblocks = se->valid_blocks + del;
517 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); 612 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
518 613
519 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || 614 f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
520 (new_vblocks > sbi->blocks_per_seg))); 615 (new_vblocks > sbi->blocks_per_seg)));
521 616
522 se->valid_blocks = new_vblocks; 617 se->valid_blocks = new_vblocks;
@@ -526,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
526 /* Update valid block bitmap */ 621 /* Update valid block bitmap */
527 if (del > 0) { 622 if (del > 0) {
528 if (f2fs_set_bit(offset, se->cur_valid_map)) 623 if (f2fs_set_bit(offset, se->cur_valid_map))
529 BUG(); 624 f2fs_bug_on(sbi, 1);
530 } else { 625 } else {
531 if (!f2fs_clear_bit(offset, se->cur_valid_map)) 626 if (!f2fs_clear_bit(offset, se->cur_valid_map))
532 BUG(); 627 f2fs_bug_on(sbi, 1);
533 } 628 }
534 if (!f2fs_test_bit(offset, se->ckpt_valid_map)) 629 if (!f2fs_test_bit(offset, se->ckpt_valid_map))
535 se->ckpt_valid_blocks += del; 630 se->ckpt_valid_blocks += del;
@@ -558,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
558 unsigned int segno = GET_SEGNO(sbi, addr); 653 unsigned int segno = GET_SEGNO(sbi, addr);
559 struct sit_info *sit_i = SIT_I(sbi); 654 struct sit_info *sit_i = SIT_I(sbi);
560 655
561 f2fs_bug_on(addr == NULL_ADDR); 656 f2fs_bug_on(sbi, addr == NULL_ADDR);
562 if (addr == NEW_ADDR) 657 if (addr == NEW_ADDR)
563 return; 658 return;
564 659
@@ -634,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
634 unsigned int segno = curseg->segno + 1; 729 unsigned int segno = curseg->segno + 1;
635 struct free_segmap_info *free_i = FREE_I(sbi); 730 struct free_segmap_info *free_i = FREE_I(sbi);
636 731
637 if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) 732 if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
638 return !test_bit(segno, free_i->free_segmap); 733 return !test_bit(segno, free_i->free_segmap);
639 return 0; 734 return 0;
640} 735}
@@ -648,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
648{ 743{
649 struct free_segmap_info *free_i = FREE_I(sbi); 744 struct free_segmap_info *free_i = FREE_I(sbi);
650 unsigned int segno, secno, zoneno; 745 unsigned int segno, secno, zoneno;
651 unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; 746 unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
652 unsigned int hint = *newseg / sbi->segs_per_sec; 747 unsigned int hint = *newseg / sbi->segs_per_sec;
653 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); 748 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
654 unsigned int left_start = hint; 749 unsigned int left_start = hint;
@@ -660,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
660 755
661 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { 756 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
662 segno = find_next_zero_bit(free_i->free_segmap, 757 segno = find_next_zero_bit(free_i->free_segmap,
663 TOTAL_SEGS(sbi), *newseg + 1); 758 MAIN_SEGS(sbi), *newseg + 1);
664 if (segno - *newseg < sbi->segs_per_sec - 759 if (segno - *newseg < sbi->segs_per_sec -
665 (*newseg % sbi->segs_per_sec)) 760 (*newseg % sbi->segs_per_sec))
666 goto got_it; 761 goto got_it;
667 } 762 }
668find_other_zone: 763find_other_zone:
669 secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); 764 secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
670 if (secno >= TOTAL_SECS(sbi)) { 765 if (secno >= MAIN_SECS(sbi)) {
671 if (dir == ALLOC_RIGHT) { 766 if (dir == ALLOC_RIGHT) {
672 secno = find_next_zero_bit(free_i->free_secmap, 767 secno = find_next_zero_bit(free_i->free_secmap,
673 TOTAL_SECS(sbi), 0); 768 MAIN_SECS(sbi), 0);
674 f2fs_bug_on(secno >= TOTAL_SECS(sbi)); 769 f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
675 } else { 770 } else {
676 go_left = 1; 771 go_left = 1;
677 left_start = hint - 1; 772 left_start = hint - 1;
@@ -686,8 +781,8 @@ find_other_zone:
686 continue; 781 continue;
687 } 782 }
688 left_start = find_next_zero_bit(free_i->free_secmap, 783 left_start = find_next_zero_bit(free_i->free_secmap,
689 TOTAL_SECS(sbi), 0); 784 MAIN_SECS(sbi), 0);
690 f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); 785 f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
691 break; 786 break;
692 } 787 }
693 secno = left_start; 788 secno = left_start;
@@ -726,7 +821,7 @@ skip_left:
726 } 821 }
727got_it: 822got_it:
728 /* set it as dirty segment in free segmap */ 823 /* set it as dirty segment in free segmap */
729 f2fs_bug_on(test_bit(segno, free_i->free_segmap)); 824 f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
730 __set_inuse(sbi, segno); 825 __set_inuse(sbi, segno);
731 *newseg = segno; 826 *newseg = segno;
732 write_unlock(&free_i->segmap_lock); 827 write_unlock(&free_i->segmap_lock);
@@ -898,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = {
898 .allocate_segment = allocate_segment_by_default, 993 .allocate_segment = allocate_segment_by_default,
899}; 994};
900 995
996int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
997{
998 __u64 start = range->start >> sbi->log_blocksize;
999 __u64 end = start + (range->len >> sbi->log_blocksize) - 1;
1000 unsigned int start_segno, end_segno;
1001 struct cp_control cpc = { .trimmed = 0 };
1002
1003 if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
1004 range->len < sbi->blocksize)
1005 return -EINVAL;
1006
1007 if (end <= MAIN_BLKADDR(sbi))
1008 goto out;
1009
1010 /* start/end segment number in main_area */
1011 start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
1012 end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
1013 GET_SEGNO(sbi, end);
1014 cpc.reason = CP_DISCARD;
1015 cpc.trim_start = start_segno;
1016 cpc.trim_end = end_segno;
1017 cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
1018 cpc.trimmed = 0;
1019
1020 /* do checkpoint to issue discard commands safely */
1021 write_checkpoint(sbi, &cpc);
1022out:
1023 range->len = cpc.trimmed << sbi->log_blocksize;
1024 return 0;
1025}
1026
901static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 1027static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
902{ 1028{
903 struct curseg_info *curseg = CURSEG_I(sbi, type); 1029 struct curseg_info *curseg = CURSEG_I(sbi, type);
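
f2fs_trim_fs() backs the generic FITRIM ioctl, presumably wired up through f2fs_ioctl() in file.c as part of this series. From userspace the interface is the standard struct fstrim_range; a minimal caller, assuming an f2fs filesystem mounted at the given path:

#include <fcntl.h>
#include <linux/fs.h>   /* FITRIM, struct fstrim_range */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    struct fstrim_range range = {
        .start  = 0,
        .len    = UINT64_MAX,   /* whole filesystem */
        .minlen = 0,            /* let the fs clamp to its minimum */
    };
    int fd = open(argc > 1 ? argv[1] : "/mnt/f2fs", O_RDONLY);

    if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
        perror("FITRIM");
        return 1;
    }
    /* On return the kernel writes back how many bytes were trimmed. */
    printf("trimmed %llu bytes\n", (unsigned long long)range.len);
    close(fd);
    return 0;
}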
@@ -953,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
953 1079
954static int __get_segment_type(struct page *page, enum page_type p_type) 1080static int __get_segment_type(struct page *page, enum page_type p_type)
955{ 1081{
956 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1082 switch (F2FS_P_SB(page)->active_logs) {
957 switch (sbi->active_logs) {
958 case 2: 1083 case 2:
959 return __get_segment_type_2(page, p_type); 1084 return __get_segment_type_2(page, p_type);
960 case 4: 1085 case 4:
961 return __get_segment_type_4(page, p_type); 1086 return __get_segment_type_4(page, p_type);
962 } 1087 }
963 /* NR_CURSEG_TYPE(6) logs by default */ 1088 /* NR_CURSEG_TYPE(6) logs by default */
964 f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); 1089 f2fs_bug_on(F2FS_P_SB(page),
1090 F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
965 return __get_segment_type_6(page, p_type); 1091 return __get_segment_type_6(page, p_type);
966} 1092}
967 1093
@@ -1041,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
1041void write_data_page(struct page *page, struct dnode_of_data *dn, 1167void write_data_page(struct page *page, struct dnode_of_data *dn,
1042 block_t *new_blkaddr, struct f2fs_io_info *fio) 1168 block_t *new_blkaddr, struct f2fs_io_info *fio)
1043{ 1169{
1044 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 1170 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1045 struct f2fs_summary sum; 1171 struct f2fs_summary sum;
1046 struct node_info ni; 1172 struct node_info ni;
1047 1173
1048 f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); 1174 f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
1049 get_node_info(sbi, dn->nid, &ni); 1175 get_node_info(sbi, dn->nid, &ni);
1050 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 1176 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
1051 1177
@@ -1055,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
1055void rewrite_data_page(struct page *page, block_t old_blkaddr, 1181void rewrite_data_page(struct page *page, block_t old_blkaddr,
1056 struct f2fs_io_info *fio) 1182 struct f2fs_io_info *fio)
1057{ 1183{
1058 struct inode *inode = page->mapping->host; 1184 f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
1059 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1060 f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
1061} 1185}
1062 1186
1063void recover_data_page(struct f2fs_sb_info *sbi, 1187void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1130,8 +1254,9 @@ out:
1130void f2fs_wait_on_page_writeback(struct page *page, 1254void f2fs_wait_on_page_writeback(struct page *page,
1131 enum page_type type) 1255 enum page_type type)
1132{ 1256{
1133 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1134 if (PageWriteback(page)) { 1257 if (PageWriteback(page)) {
1258 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1259
1135 if (is_merged_page(sbi, page, type)) 1260 if (is_merged_page(sbi, page, type))
1136 f2fs_submit_merged_bio(sbi, type, WRITE); 1261 f2fs_submit_merged_bio(sbi, type, WRITE);
1137 wait_on_page_writeback(page); 1262 wait_on_page_writeback(page);
@@ -1400,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
1400 unsigned int segno) 1525 unsigned int segno)
1401{ 1526{
1402 struct sit_info *sit_i = SIT_I(sbi); 1527 struct sit_info *sit_i = SIT_I(sbi);
1403 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); 1528 unsigned int offset = SIT_BLOCK_OFFSET(segno);
1404 block_t blk_addr = sit_i->sit_base_addr + offset; 1529 block_t blk_addr = sit_i->sit_base_addr + offset;
1405 1530
1406 check_seg_range(sbi, segno); 1531 check_seg_range(sbi, segno);
@@ -1426,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1426 /* get current sit block page without lock */ 1551 /* get current sit block page without lock */
1427 src_page = get_meta_page(sbi, src_off); 1552 src_page = get_meta_page(sbi, src_off);
1428 dst_page = grab_meta_page(sbi, dst_off); 1553 dst_page = grab_meta_page(sbi, dst_off);
1429 f2fs_bug_on(PageDirty(src_page)); 1554 f2fs_bug_on(sbi, PageDirty(src_page));
1430 1555
1431 src_addr = page_address(src_page); 1556 src_addr = page_address(src_page);
1432 dst_addr = page_address(dst_page); 1557 dst_addr = page_address(dst_page);
@@ -1440,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1440 return dst_page; 1565 return dst_page;
1441} 1566}
1442 1567
1443static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) 1568static struct sit_entry_set *grab_sit_entry_set(void)
1569{
1570 struct sit_entry_set *ses =
1571 f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
1572
1573 ses->entry_cnt = 0;
1574 INIT_LIST_HEAD(&ses->set_list);
1575 return ses;
1576}
1577
1578static void release_sit_entry_set(struct sit_entry_set *ses)
1579{
1580 list_del(&ses->set_list);
1581 kmem_cache_free(sit_entry_set_slab, ses);
1582}
1583
1584static void adjust_sit_entry_set(struct sit_entry_set *ses,
1585 struct list_head *head)
1586{
1587 struct sit_entry_set *next = ses;
1588
1589 if (list_is_last(&ses->set_list, head))
1590 return;
1591
1592 list_for_each_entry_continue(next, head, set_list)
1593 if (ses->entry_cnt <= next->entry_cnt)
1594 break;
1595
1596 list_move_tail(&ses->set_list, &next->set_list);
1597}
1598
1599static void add_sit_entry(unsigned int segno, struct list_head *head)
1600{
1601 struct sit_entry_set *ses;
1602 unsigned int start_segno = START_SEGNO(segno);
1603
1604 list_for_each_entry(ses, head, set_list) {
1605 if (ses->start_segno == start_segno) {
1606 ses->entry_cnt++;
1607 adjust_sit_entry_set(ses, head);
1608 return;
1609 }
1610 }
1611
1612 ses = grab_sit_entry_set();
1613
1614 ses->start_segno = start_segno;
1615 ses->entry_cnt++;
1616 list_add(&ses->set_list, head);
1617}
1618
1619static void add_sits_in_set(struct f2fs_sb_info *sbi)
1620{
1621 struct f2fs_sm_info *sm_info = SM_I(sbi);
1622 struct list_head *set_list = &sm_info->sit_entry_set;
1623 unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
1624 unsigned int segno;
1625
1626 for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
1627 add_sit_entry(segno, set_list);
1628}
1629
1630static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
1444{ 1631{
1445 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1632 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1446 struct f2fs_summary_block *sum = curseg->sum_blk; 1633 struct f2fs_summary_block *sum = curseg->sum_blk;
1447 int i; 1634 int i;
1448 1635
1449 /* 1636 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
1450 * If the journal area in the current summary is full of sit entries, 1637 unsigned int segno;
1451 * all the sit entries will be flushed. Otherwise the sit entries 1638 bool dirtied;
1452 * are not able to replace with newly hot sit entries. 1639
1453 */ 1640 segno = le32_to_cpu(segno_in_journal(sum, i));
1454 if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { 1641 dirtied = __mark_sit_entry_dirty(sbi, segno);
1455 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { 1642
1456 unsigned int segno; 1643 if (!dirtied)
1457 segno = le32_to_cpu(segno_in_journal(sum, i)); 1644 add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
1458 __mark_sit_entry_dirty(sbi, segno);
1459 }
1460 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1461 return true;
1462 } 1645 }
1463 return false; 1646 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1464} 1647}
1465 1648
1466/* 1649/*
1467 * CP calls this function, which flushes SIT entries including sit_journal, 1650 * CP calls this function, which flushes SIT entries including sit_journal,
1468 * and moves prefree segs to free segs. 1651 * and moves prefree segs to free segs.
1469 */ 1652 */
1470void flush_sit_entries(struct f2fs_sb_info *sbi) 1653void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1471{ 1654{
1472 struct sit_info *sit_i = SIT_I(sbi); 1655 struct sit_info *sit_i = SIT_I(sbi);
1473 unsigned long *bitmap = sit_i->dirty_sentries_bitmap; 1656 unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
1474 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1657 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1475 struct f2fs_summary_block *sum = curseg->sum_blk; 1658 struct f2fs_summary_block *sum = curseg->sum_blk;
1476 unsigned long nsegs = TOTAL_SEGS(sbi); 1659 struct sit_entry_set *ses, *tmp;
1477 struct page *page = NULL; 1660 struct list_head *head = &SM_I(sbi)->sit_entry_set;
1478 struct f2fs_sit_block *raw_sit = NULL; 1661 bool to_journal = true;
1479 unsigned int start = 0, end = 0; 1662 struct seg_entry *se;
1480 unsigned int segno;
1481 bool flushed;
1482 1663
1483 mutex_lock(&curseg->curseg_mutex); 1664 mutex_lock(&curseg->curseg_mutex);
1484 mutex_lock(&sit_i->sentry_lock); 1665 mutex_lock(&sit_i->sentry_lock);
1485 1666
1486 /* 1667 /*
1487 * "flushed" indicates whether sit entries in journal are flushed 1668 * add and account sit entries of dirty bitmap in sit entry
1488 * to the SIT area or not. 1669 * set temporarily
1489 */ 1670 */
1490 flushed = flush_sits_in_journal(sbi); 1671 add_sits_in_set(sbi);
1491 1672
1492 for_each_set_bit(segno, bitmap, nsegs) { 1673 /*
1493 struct seg_entry *se = get_seg_entry(sbi, segno); 1674 * if there is not enough space in the journal to store dirty sit
1494 int sit_offset, offset; 1675 * entries, remove all entries from the journal and add and account
1676 * them in the sit entry set.
1677 */
1678 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
1679 remove_sits_in_journal(sbi);
1495 1680
1496 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 1681 if (!sit_i->dirty_sentries)
1682 goto out;
1497 1683
1498 /* add discard candidates */ 1684 /*
1499 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) 1685 * there are two steps to flush sit entries:
1500 add_discard_addrs(sbi, segno, se); 1686 * #1, flush sit entries to journal in current cold data summary block.
1687 * #2, flush sit entries to sit page.
1688 */
1689 list_for_each_entry_safe(ses, tmp, head, set_list) {
1690 struct page *page;
1691 struct f2fs_sit_block *raw_sit = NULL;
1692 unsigned int start_segno = ses->start_segno;
1693 unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
1694 (unsigned long)MAIN_SEGS(sbi));
1695 unsigned int segno = start_segno;
1696
1697 if (to_journal &&
1698 !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
1699 to_journal = false;
1700
1701 if (!to_journal) {
1702 page = get_next_sit_page(sbi, start_segno);
1703 raw_sit = page_address(page);
1704 }
1501 1705
1502 if (flushed) 1706 /* flush dirty sit entries in region of current sit set */
1503 goto to_sit_page; 1707 for_each_set_bit_from(segno, bitmap, end) {
1708 int offset, sit_offset;
1504 1709
1505 offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); 1710 se = get_seg_entry(sbi, segno);
1506 if (offset >= 0) { 1711
1507 segno_in_journal(sum, offset) = cpu_to_le32(segno); 1712 /* add discard candidates */
1508 seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); 1713 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
1509 goto flush_done; 1714 cpc->trim_start = segno;
1510 } 1715 add_discard_addrs(sbi, cpc);
1511to_sit_page:
1512 if (!page || (start > segno) || (segno > end)) {
1513 if (page) {
1514 f2fs_put_page(page, 1);
1515 page = NULL;
1516 } 1716 }
1517 1717
1518 start = START_SEGNO(sit_i, segno); 1718 if (to_journal) {
1519 end = start + SIT_ENTRY_PER_BLOCK - 1; 1719 offset = lookup_journal_in_cursum(sum,
1720 SIT_JOURNAL, segno, 1);
1721 f2fs_bug_on(sbi, offset < 0);
1722 segno_in_journal(sum, offset) =
1723 cpu_to_le32(segno);
1724 seg_info_to_raw_sit(se,
1725 &sit_in_journal(sum, offset));
1726 } else {
1727 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1728 seg_info_to_raw_sit(se,
1729 &raw_sit->entries[sit_offset]);
1730 }
1520 1731
1521 /* read sit block that will be updated */ 1732 __clear_bit(segno, bitmap);
1522 page = get_next_sit_page(sbi, start); 1733 sit_i->dirty_sentries--;
1523 raw_sit = page_address(page); 1734 ses->entry_cnt--;
1524 } 1735 }
1525 1736
1526 /* update entry in SIT block */ 1737 if (!to_journal)
1527 seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); 1738 f2fs_put_page(page, 1);
1528flush_done: 1739
1529 __clear_bit(segno, bitmap); 1740 f2fs_bug_on(sbi, ses->entry_cnt);
1530 sit_i->dirty_sentries--; 1741 release_sit_entry_set(ses);
1742 }
1743
1744 f2fs_bug_on(sbi, !list_empty(head));
1745 f2fs_bug_on(sbi, sit_i->dirty_sentries);
1746out:
1747 if (cpc->reason == CP_DISCARD) {
1748 for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
1749 add_discard_addrs(sbi, cpc);
1531 } 1750 }
1532 mutex_unlock(&sit_i->sentry_lock); 1751 mutex_unlock(&sit_i->sentry_lock);
1533 mutex_unlock(&curseg->curseg_mutex); 1752 mutex_unlock(&curseg->curseg_mutex);
1534 1753
1535 /* writeout last modified SIT block */
1536 f2fs_put_page(page, 1);
1537
1538 set_prefree_as_free_segments(sbi); 1754 set_prefree_as_free_segments(sbi);
1539} 1755}
1540 1756
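
The rewritten flush_sit_entries() keeps sit entry sets sorted by ascending entry count, so the smallest sets are considered first: each set goes to the in-summary journal while __has_cursum_space() still holds, and once it fails every remaining (larger) set is written through SIT pages. A simplified sketch of that decision, with made-up counts in place of the kernel structures:

#include <stdbool.h>
#include <stdio.h>

#define JOURNAL_SLOTS 6

int main(void)
{
    /* Entry counts of the sets, pre-sorted ascending as in the kernel list. */
    int sets[] = { 1, 2, 4, 9 };
    int used = 0;
    bool to_journal = true;

    for (unsigned i = 0; i < sizeof(sets) / sizeof(sets[0]); i++) {
        /* Mirrors __has_cursum_space(): do all entries of this set fit? */
        if (to_journal && used + sets[i] > JOURNAL_SLOTS)
            to_journal = false;     /* never flips back */

        if (to_journal)
            used += sets[i];
        printf("set of %d entries -> %s\n", sets[i],
               to_journal ? "journal" : "SIT page");
    }
    return 0;
}

Flushing the small sets through the journal first maximizes how many sets avoid a metadata page write entirely, which is the point of keeping the list sorted.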
@@ -1554,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1554 1770
1555 SM_I(sbi)->sit_info = sit_i; 1771 SM_I(sbi)->sit_info = sit_i;
1556 1772
1557 sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); 1773 sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
1558 if (!sit_i->sentries) 1774 if (!sit_i->sentries)
1559 return -ENOMEM; 1775 return -ENOMEM;
1560 1776
1561 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1777 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1562 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); 1778 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
1563 if (!sit_i->dirty_sentries_bitmap) 1779 if (!sit_i->dirty_sentries_bitmap)
1564 return -ENOMEM; 1780 return -ENOMEM;
1565 1781
1566 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1782 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1567 sit_i->sentries[start].cur_valid_map 1783 sit_i->sentries[start].cur_valid_map
1568 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); 1784 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1569 sit_i->sentries[start].ckpt_valid_map 1785 sit_i->sentries[start].ckpt_valid_map
@@ -1574,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1574 } 1790 }
1575 1791
1576 if (sbi->segs_per_sec > 1) { 1792 if (sbi->segs_per_sec > 1) {
1577 sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * 1793 sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
1578 sizeof(struct sec_entry)); 1794 sizeof(struct sec_entry));
1579 if (!sit_i->sec_entries) 1795 if (!sit_i->sec_entries)
1580 return -ENOMEM; 1796 return -ENOMEM;
@@ -1609,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1609 1825
1610static int build_free_segmap(struct f2fs_sb_info *sbi) 1826static int build_free_segmap(struct f2fs_sb_info *sbi)
1611{ 1827{
1612 struct f2fs_sm_info *sm_info = SM_I(sbi);
1613 struct free_segmap_info *free_i; 1828 struct free_segmap_info *free_i;
1614 unsigned int bitmap_size, sec_bitmap_size; 1829 unsigned int bitmap_size, sec_bitmap_size;
1615 1830
@@ -1620,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1620 1835
1621 SM_I(sbi)->free_info = free_i; 1836 SM_I(sbi)->free_info = free_i;
1622 1837
1623 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1838 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1624 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); 1839 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
1625 if (!free_i->free_segmap) 1840 if (!free_i->free_segmap)
1626 return -ENOMEM; 1841 return -ENOMEM;
1627 1842
1628 sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); 1843 sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
1629 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); 1844 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
1630 if (!free_i->free_secmap) 1845 if (!free_i->free_secmap)
1631 return -ENOMEM; 1846 return -ENOMEM;
@@ -1635,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1635 memset(free_i->free_secmap, 0xff, sec_bitmap_size); 1850 memset(free_i->free_secmap, 0xff, sec_bitmap_size);
1636 1851
1637 /* init free segmap information */ 1852 /* init free segmap information */
1638 free_i->start_segno = 1853 free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
1639 (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
1640 free_i->free_segments = 0; 1854 free_i->free_segments = 0;
1641 free_i->free_sections = 0; 1855 free_i->free_sections = 0;
1642 rwlock_init(&free_i->segmap_lock); 1856 rwlock_init(&free_i->segmap_lock);
@@ -1673,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1673 int sit_blk_cnt = SIT_BLK_CNT(sbi); 1887 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1674 unsigned int i, start, end; 1888 unsigned int i, start, end;
1675 unsigned int readed, start_blk = 0; 1889 unsigned int readed, start_blk = 0;
1676 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1890 int nrpages = MAX_BIO_BLOCKS(sbi);
1677 1891
1678 do { 1892 do {
1679 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); 1893 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
@@ -1681,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1681 start = start_blk * sit_i->sents_per_block; 1895 start = start_blk * sit_i->sents_per_block;
1682 end = (start_blk + readed) * sit_i->sents_per_block; 1896 end = (start_blk + readed) * sit_i->sents_per_block;
1683 1897
1684 for (; start < end && start < TOTAL_SEGS(sbi); start++) { 1898 for (; start < end && start < MAIN_SEGS(sbi); start++) {
1685 struct seg_entry *se = &sit_i->sentries[start]; 1899 struct seg_entry *se = &sit_i->sentries[start];
1686 struct f2fs_sit_block *sit_blk; 1900 struct f2fs_sit_block *sit_blk;
1687 struct f2fs_sit_entry sit; 1901 struct f2fs_sit_entry sit;
@@ -1719,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
1719 unsigned int start; 1933 unsigned int start;
1720 int type; 1934 int type;
1721 1935
1722 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1936 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1723 struct seg_entry *sentry = get_seg_entry(sbi, start); 1937 struct seg_entry *sentry = get_seg_entry(sbi, start);
1724 if (!sentry->valid_blocks) 1938 if (!sentry->valid_blocks)
1725 __set_free(sbi, start); 1939 __set_free(sbi, start);
@@ -1736,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1736{ 1950{
1737 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1951 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1738 struct free_segmap_info *free_i = FREE_I(sbi); 1952 struct free_segmap_info *free_i = FREE_I(sbi);
1739 unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); 1953 unsigned int segno = 0, offset = 0;
1740 unsigned short valid_blocks; 1954 unsigned short valid_blocks;
1741 1955
1742 while (1) { 1956 while (1) {
1743 /* find dirty segment based on free segmap */ 1957 /* find dirty segment based on free segmap */
1744 segno = find_next_inuse(free_i, total_segs, offset); 1958 segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
1745 if (segno >= total_segs) 1959 if (segno >= MAIN_SEGS(sbi))
1746 break; 1960 break;
1747 offset = segno + 1; 1961 offset = segno + 1;
1748 valid_blocks = get_valid_blocks(sbi, segno, 0); 1962 valid_blocks = get_valid_blocks(sbi, segno, 0);
1749 if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) 1963 if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
1964 continue;
1965 if (valid_blocks > sbi->blocks_per_seg) {
1966 f2fs_bug_on(sbi, 1);
1750 continue; 1967 continue;
1968 }
1751 mutex_lock(&dirty_i->seglist_lock); 1969 mutex_lock(&dirty_i->seglist_lock);
1752 __locate_dirty_segment(sbi, segno, DIRTY); 1970 __locate_dirty_segment(sbi, segno, DIRTY);
1753 mutex_unlock(&dirty_i->seglist_lock); 1971 mutex_unlock(&dirty_i->seglist_lock);
@@ -1757,7 +1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1757static int init_victim_secmap(struct f2fs_sb_info *sbi) 1975static int init_victim_secmap(struct f2fs_sb_info *sbi)
1758{ 1976{
1759 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1977 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1760 unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); 1978 unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
1761 1979
1762 dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); 1980 dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
1763 if (!dirty_i->victim_secmap) 1981 if (!dirty_i->victim_secmap)
@@ -1778,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
1778 SM_I(sbi)->dirty_info = dirty_i; 1996 SM_I(sbi)->dirty_info = dirty_i;
1779 mutex_init(&dirty_i->seglist_lock); 1997 mutex_init(&dirty_i->seglist_lock);
1780 1998
1781 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1999 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1782 2000
1783 for (i = 0; i < NR_DIRTY_TYPE; i++) { 2001 for (i = 0; i < NR_DIRTY_TYPE; i++) {
1784 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); 2002 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
@@ -1802,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
1802 2020
1803 sit_i->min_mtime = LLONG_MAX; 2021 sit_i->min_mtime = LLONG_MAX;
1804 2022
1805 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { 2023 for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
1806 unsigned int i; 2024 unsigned int i;
1807 unsigned long long mtime = 0; 2025 unsigned long long mtime = 0;
1808 2026
@@ -1840,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1840 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 2058 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1841 sm_info->rec_prefree_segments = sm_info->main_segments * 2059 sm_info->rec_prefree_segments = sm_info->main_segments *
1842 DEF_RECLAIM_PREFREE_SEGMENTS / 100; 2060 DEF_RECLAIM_PREFREE_SEGMENTS / 100;
1843 sm_info->ipu_policy = F2FS_IPU_DISABLE; 2061 sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
1844 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; 2062 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
2063 sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
1845 2064
1846 INIT_LIST_HEAD(&sm_info->discard_list); 2065 INIT_LIST_HEAD(&sm_info->discard_list);
1847 sm_info->nr_discards = 0; 2066 sm_info->nr_discards = 0;
1848 sm_info->max_discards = 0; 2067 sm_info->max_discards = 0;
1849 2068
2069 INIT_LIST_HEAD(&sm_info->sit_entry_set);
2070
1850 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { 2071 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
1851 err = create_flush_cmd_control(sbi); 2072 err = create_flush_cmd_control(sbi);
1852 if (err) 2073 if (err)
@@ -1942,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
1942 return; 2163 return;
1943 2164
1944 if (sit_i->sentries) { 2165 if (sit_i->sentries) {
1945 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 2166 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1946 kfree(sit_i->sentries[start].cur_valid_map); 2167 kfree(sit_i->sentries[start].cur_valid_map);
1947 kfree(sit_i->sentries[start].ckpt_valid_map); 2168 kfree(sit_i->sentries[start].ckpt_valid_map);
1948 } 2169 }
@@ -1976,11 +2197,30 @@ int __init create_segment_manager_caches(void)
1976 discard_entry_slab = f2fs_kmem_cache_create("discard_entry", 2197 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
1977 sizeof(struct discard_entry)); 2198 sizeof(struct discard_entry));
1978 if (!discard_entry_slab) 2199 if (!discard_entry_slab)
1979 return -ENOMEM; 2200 goto fail;
2201
2202 sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
2203 sizeof(struct sit_entry_set));
2204 if (!sit_entry_set_slab)
2205 goto destroy_discard_entry;
2206
2207 inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
2208 sizeof(struct inmem_pages));
2209 if (!inmem_entry_slab)
2210 goto destroy_sit_entry_set;
1980 return 0; 2211 return 0;
2212
2213destroy_sit_entry_set:
2214 kmem_cache_destroy(sit_entry_set_slab);
2215destroy_discard_entry:
2216 kmem_cache_destroy(discard_entry_slab);
2217fail:
2218 return -ENOMEM;
1981} 2219}
1982 2220
1983void destroy_segment_manager_caches(void) 2221void destroy_segment_manager_caches(void)
1984{ 2222{
2223 kmem_cache_destroy(sit_entry_set_slab);
1985 kmem_cache_destroy(discard_entry_slab); 2224 kmem_cache_destroy(discard_entry_slab);
2225 kmem_cache_destroy(inmem_entry_slab);
1986} 2226}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ff483257283b..2495bec1c621 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -45,16 +45,26 @@
45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ 45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
46 sbi->segs_per_sec)) \ 46 sbi->segs_per_sec)) \
47 47
48#define START_BLOCK(sbi, segno) \ 48#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
49 (SM_I(sbi)->seg0_blkaddr + \ 49#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
50
51#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
52#define MAIN_SECS(sbi) (sbi->total_sections)
53
54#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
55#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
56
57#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
58#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
59 sbi->log_blocks_per_seg))
60
61#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
50 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) 62 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
63
51#define NEXT_FREE_BLKADDR(sbi, curseg) \ 64#define NEXT_FREE_BLKADDR(sbi, curseg) \
52 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) 65 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
53 66
54#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr) 67#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
55
56#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ 68#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) 69 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
60#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ 70#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
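
These macros are pure shift arithmetic over the on-disk layout: with 4 KB blocks and the default 512 blocks per segment (log_blocks_per_seg = 9, so 2 MB segments), a block address maps to a segment number by subtracting the segment-0 base and shifting. A small sketch with those assumed geometry values:

#include <stdio.h>

#define LOG_BLOCKS_PER_SEG 9    /* 512 blocks, assuming the default geometry */
#define SEG0_BLKADDR 512        /* example base address of segment 0 */

int main(void)
{
    unsigned int blk_addr = 1537;
    unsigned int segoff = blk_addr - SEG0_BLKADDR;       /* GET_SEGOFF_FROM_SEG0 */
    unsigned int segno  = segoff >> LOG_BLOCKS_PER_SEG;  /* GET_SEGNO_FROM_SEG0 */
    unsigned int blkoff = segoff & ((1 << LOG_BLOCKS_PER_SEG) - 1);

    printf("block %u -> segment %u, offset %u\n", blk_addr, segno, blkoff);
    printf("START_BLOCK(segno=%u) = %u\n", segno,
           SEG0_BLKADDR + (segno << LOG_BLOCKS_PER_SEG));
    return 0;
}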
@@ -77,23 +87,21 @@
77 87
78#define SIT_ENTRY_OFFSET(sit_i, segno) \ 88#define SIT_ENTRY_OFFSET(sit_i, segno) \
79 (segno % sit_i->sents_per_block) 89 (segno % sit_i->sents_per_block)
80#define SIT_BLOCK_OFFSET(sit_i, segno) \ 90#define SIT_BLOCK_OFFSET(segno) \
81 (segno / SIT_ENTRY_PER_BLOCK) 91 (segno / SIT_ENTRY_PER_BLOCK)
82#define START_SEGNO(sit_i, segno) \ 92#define START_SEGNO(segno) \
83 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) 93 (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
84#define SIT_BLK_CNT(sbi) \ 94#define SIT_BLK_CNT(sbi) \
85 ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) 95 ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
86#define f2fs_bitmap_size(nr) \ 96#define f2fs_bitmap_size(nr) \
87 (BITS_TO_LONGS(nr) * sizeof(unsigned long)) 97 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
88#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
89#define TOTAL_SECS(sbi) (sbi->total_sections)
90 98
91#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ 99#define SECTOR_FROM_BLOCK(blk_addr) \
92 (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) 100 (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
93#define SECTOR_TO_BLOCK(sbi, sectors) \ 101#define SECTOR_TO_BLOCK(sectors) \
94 (sectors >> (sbi)->log_sectors_per_block) 102 (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
95#define MAX_BIO_BLOCKS(max_hw_blocks) \ 103#define MAX_BIO_BLOCKS(sbi) \
96 (min((int)max_hw_blocks, BIO_MAX_PAGES)) 104 ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
97 105
98/* 106/*
99 * indicate a block allocation direction: RIGHT and LEFT. 107 * indicate a block allocation direction: RIGHT and LEFT.
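
SECTOR_FROM_BLOCK/SECTOR_TO_BLOCK no longer need an sbi argument because block-layer sectors are fixed 512-byte units, making the ratio the compile-time constant F2FS_LOG_SECTORS_PER_BLOCK (3 for 4 KB blocks); MAX_BIO_BLOCKS then just clamps the device's max transfer to BIO_MAX_PAGES. A sketch of the arithmetic, with the clamp value assumed:

#include <stdio.h>

#define LOG_SECTORS_PER_BLOCK 3 /* 4096-byte block / 512-byte sector */
#define BIO_MAX_PAGES 256       /* assumed clamp; matches common kernel configs */

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
    unsigned long long blocks = 100;
    unsigned long long sectors = blocks << LOG_SECTORS_PER_BLOCK;
    unsigned int queue_max_sectors = 2560;  /* example device limit */
    int max_hw_blocks = queue_max_sectors >> LOG_SECTORS_PER_BLOCK;

    printf("%llu blocks = %llu sectors\n", blocks, sectors);
    printf("MAX_BIO_BLOCKS = %d\n", min_int(max_hw_blocks, BIO_MAX_PAGES));
    return 0;
}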
@@ -167,6 +175,11 @@ struct segment_allocation {
167 void (*allocate_segment)(struct f2fs_sb_info *, int, bool); 175 void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
168}; 176};
169 177
178struct inmem_pages {
179 struct list_head list;
180 struct page *page;
181};
182
170struct sit_info { 183struct sit_info {
171 const struct segment_allocation *s_ops; 184 const struct segment_allocation *s_ops;
172 185
@@ -237,6 +250,12 @@ struct curseg_info {
237 unsigned int next_segno; /* preallocated segment */ 250 unsigned int next_segno; /* preallocated segment */
238}; 251};
239 252
253struct sit_entry_set {
254 struct list_head set_list; /* link with all sit sets */
255 unsigned int start_segno; /* start segno of sits in set */
256 unsigned int entry_cnt; /* the # of sit entries in set */
257};
258
240/* 259/*
241 * inline functions 260 * inline functions
242 */ 261 */
@@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
316 clear_bit(segno, free_i->free_segmap); 335 clear_bit(segno, free_i->free_segmap);
317 free_i->free_segments++; 336 free_i->free_segments++;
318 337
319 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno); 338 next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
320 if (next >= start_segno + sbi->segs_per_sec) { 339 if (next >= start_segno + sbi->segs_per_sec) {
321 clear_bit(secno, free_i->free_secmap); 340 clear_bit(secno, free_i->free_secmap);
322 free_i->free_sections++; 341 free_i->free_sections++;
@@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
430 449
431static inline bool need_SSR(struct f2fs_sb_info *sbi) 450static inline bool need_SSR(struct f2fs_sb_info *sbi)
432{ 451{
433 return (prefree_segments(sbi) / sbi->segs_per_sec) 452 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
434 + free_sections(sbi) < overprovision_sections(sbi); 453 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
454 return free_sections(sbi) <= (node_secs + 2 * dent_secs +
455 reserved_sections(sbi) + 1);
435} 456}
436 457
437static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 458static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
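
need_SSR() now decides based on how many sections the dirty node and dentry pages would consume (dentry sections weighted double) rather than on prefree counts. A worked instance of the new inequality, with assumed counts:

#include <stdbool.h>
#include <stdio.h>

/* free_sections(sbi) <= node_secs + 2 * dent_secs + reserved_sections(sbi) + 1 */
static bool need_ssr(int free_secs, int node_secs, int dent_secs, int reserved)
{
    return free_secs <= node_secs + 2 * dent_secs + reserved + 1;
}

int main(void)
{
    /* 12 free sections, 2 sections of dirty nodes, 3 of dirty dentries,
     * 1 reserved: 12 <= 2 + 6 + 1 + 1 is false, so normal LFS allocation. */
    printf("need_SSR = %d\n", need_ssr(12, 2, 3, 1));
    /* Tighter: 9 free sections -> 9 <= 10 is true, switch to SSR. */
    printf("need_SSR = %d\n", need_ssr(9, 2, 3, 1));
    return 0;
}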
@@ -466,48 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi)
466 * F2FS_IPU_UTIL - if FS utilization is over threshold, 487 * F2FS_IPU_UTIL - if FS utilization is over threshold,
467 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over 488 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
468 * threshold, 489 * threshold,
490 * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
491 * storages. IPU will be triggered only if the # of dirty
492 * pages over min_fsync_blocks.
469 * F2FS_IPU_DISABLE - disable IPU. (=default option) 493 * F2FS_IPU_DISABLE - disable IPU. (=default option)
470 */ 494 */
471#define DEF_MIN_IPU_UTIL 70 495#define DEF_MIN_IPU_UTIL 70
496#define DEF_MIN_FSYNC_BLOCKS 8
472 497
473enum { 498enum {
474 F2FS_IPU_FORCE, 499 F2FS_IPU_FORCE,
475 F2FS_IPU_SSR, 500 F2FS_IPU_SSR,
476 F2FS_IPU_UTIL, 501 F2FS_IPU_UTIL,
477 F2FS_IPU_SSR_UTIL, 502 F2FS_IPU_SSR_UTIL,
478 F2FS_IPU_DISABLE, 503 F2FS_IPU_FSYNC,
479}; 504};
480 505
481static inline bool need_inplace_update(struct inode *inode) 506static inline bool need_inplace_update(struct inode *inode)
482{ 507{
483 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 508 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
509 unsigned int policy = SM_I(sbi)->ipu_policy;
484 510
485 /* IPU can be done only for the user data */ 511 /* IPU can be done only for the user data */
486 if (S_ISDIR(inode->i_mode)) 512 if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
487 return false; 513 return false;
488 514
489 /* this is only set during fdatasync */ 515 if (policy & (0x1 << F2FS_IPU_FORCE))
490 if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) 516 return true;
517 if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
518 return true;
519 if (policy & (0x1 << F2FS_IPU_UTIL) &&
520 utilization(sbi) > SM_I(sbi)->min_ipu_util)
521 return true;
522 if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
523 utilization(sbi) > SM_I(sbi)->min_ipu_util)
491 return true; 524 return true;
492 525
493 switch (SM_I(sbi)->ipu_policy) { 526 /* this is only set during fdatasync */
494 case F2FS_IPU_FORCE: 527 if (policy & (0x1 << F2FS_IPU_FSYNC) &&
528 is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
495 return true; 529 return true;
496 case F2FS_IPU_SSR: 530
497 if (need_SSR(sbi))
498 return true;
499 break;
500 case F2FS_IPU_UTIL:
501 if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
502 return true;
503 break;
504 case F2FS_IPU_SSR_UTIL:
505 if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
506 return true;
507 break;
508 case F2FS_IPU_DISABLE:
509 break;
510 }
511 return false; 531 return false;
512} 532}
513 533
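
ipu_policy is now a bitmask rather than a single enum value, so several in-place-update triggers (force, SSR, utilization, fsync) can be enabled at once through the sysfs knob added in super.c below. A hedged sketch of the same dispatch, with each argument standing in for the corresponding kernel-side condition:

#include <stdbool.h>
#include <stdio.h>

enum { IPU_FORCE, IPU_SSR, IPU_UTIL, IPU_SSR_UTIL, IPU_FSYNC };

static bool need_ipu(unsigned policy, bool need_ssr, bool util_high, bool fsync_path)
{
    if (policy & (1u << IPU_FORCE))
        return true;
    if ((policy & (1u << IPU_SSR)) && need_ssr)
        return true;
    if ((policy & (1u << IPU_UTIL)) && util_high)
        return true;
    if ((policy & (1u << IPU_SSR_UTIL)) && need_ssr && util_high)
        return true;
    if ((policy & (1u << IPU_FSYNC)) && fsync_path)
        return true;
    return false;
}

int main(void)
{
    unsigned policy = (1u << IPU_SSR) | (1u << IPU_FSYNC);  /* two triggers on */

    printf("%d\n", need_ipu(policy, false, true, false));   /* 0: util bit off */
    printf("%d\n", need_ipu(policy, false, false, true));   /* 1: fsync path */
    return 0;
}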
@@ -534,18 +554,13 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
534#ifdef CONFIG_F2FS_CHECK_FS 554#ifdef CONFIG_F2FS_CHECK_FS
535static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) 555static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
536{ 556{
537 unsigned int end_segno = SM_I(sbi)->segment_count - 1; 557 BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
538 BUG_ON(segno > end_segno);
539} 558}
540 559
541static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) 560static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
542{ 561{
543 struct f2fs_sm_info *sm_info = SM_I(sbi); 562 BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
544 block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg; 563 BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
545 block_t start_addr = sm_info->seg0_blkaddr;
546 block_t end_addr = start_addr + total_blks - 1;
547 BUG_ON(blk_addr < start_addr);
548 BUG_ON(blk_addr > end_addr);
549} 564}
550 565
551/* 566/*
@@ -554,8 +569,6 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
554static inline void check_block_count(struct f2fs_sb_info *sbi, 569static inline void check_block_count(struct f2fs_sb_info *sbi,
555 int segno, struct f2fs_sit_entry *raw_sit) 570 int segno, struct f2fs_sit_entry *raw_sit)
556{ 571{
557 struct f2fs_sm_info *sm_info = SM_I(sbi);
558 unsigned int end_segno = sm_info->segment_count - 1;
559 bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; 572 bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
560 int valid_blocks = 0; 573 int valid_blocks = 0;
561 int cur_pos = 0, next_pos; 574 int cur_pos = 0, next_pos;
@@ -564,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
564 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); 577 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
565 578
566 /* check boundary of a given segment number */ 579 /* check boundary of a given segment number */
567 BUG_ON(segno > end_segno); 580 BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
568 581
569 /* check bitmap with valid block count */ 582 /* check bitmap with valid block count */
570 do { 583 do {
@@ -583,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
583 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); 596 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
584} 597}
585#else 598#else
586#define check_seg_range(sbi, segno) 599static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
587#define verify_block_addr(sbi, blk_addr) 600{
588#define check_block_count(sbi, segno, raw_sit) 601 if (segno > TOTAL_SEGS(sbi) - 1)
602 sbi->need_fsck = true;
603}
604
605static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
606{
607 if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
608 sbi->need_fsck = true;
609}
610
611/*
612 * Summary block is always treated as an invalid block
613 */
614static inline void check_block_count(struct f2fs_sb_info *sbi,
615 int segno, struct f2fs_sit_entry *raw_sit)
616{
617 /* check segment usage */
618 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
619 sbi->need_fsck = true;
620
621 /* check boundary of a given segment number */
622 if (segno > TOTAL_SEGS(sbi) - 1)
623 sbi->need_fsck = true;
624}
589#endif 625#endif
590 626
591static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, 627static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
592 unsigned int start) 628 unsigned int start)
593{ 629{
594 struct sit_info *sit_i = SIT_I(sbi); 630 struct sit_info *sit_i = SIT_I(sbi);
595 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start); 631 unsigned int offset = SIT_BLOCK_OFFSET(start);
596 block_t blk_addr = sit_i->sit_base_addr + offset; 632 block_t blk_addr = sit_i->sit_base_addr + offset;
597 633
598 check_seg_range(sbi, start); 634 check_seg_range(sbi, start);
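
Without CONFIG_F2FS_CHECK_FS these helpers were previously empty macros; now an out-of-range segment or block address is recorded in sbi->need_fsck instead of either crashing (BUG_ON) or being silently ignored, so a later mount can request fsck. A tiny sketch of the soft-assert pattern:

#include <stdbool.h>
#include <stdio.h>

struct sb_info {
    unsigned int total_segs;
    bool need_fsck;     /* sticky: once set, stays set until fsck runs */
};

static void check_seg_range(struct sb_info *sbi, unsigned int segno)
{
    if (segno > sbi->total_segs - 1)
        sbi->need_fsck = true;  /* flag corruption, keep running */
}

int main(void)
{
    struct sb_info sbi = { .total_segs = 100, .need_fsck = false };

    check_seg_range(&sbi, 42);   /* fine */
    check_seg_range(&sbi, 500);  /* out of range */
    printf("need_fsck = %d\n", sbi.need_fsck);  /* 1 */
    return 0;
}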
@@ -619,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
619 655
620static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) 656static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
621{ 657{
622 unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start); 658 unsigned int block_off = SIT_BLOCK_OFFSET(start);
623 659
624 if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) 660 if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
625 f2fs_clear_bit(block_off, sit_i->sit_bitmap); 661 f2fs_clear_bit(block_off, sit_i->sit_bitmap);
@@ -666,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
666{ 702{
667 struct block_device *bdev = sbi->sb->s_bdev; 703 struct block_device *bdev = sbi->sb->s_bdev;
668 struct request_queue *q = bdev_get_queue(bdev); 704 struct request_queue *q = bdev_get_queue(bdev);
669 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); 705 return SECTOR_TO_BLOCK(queue_max_sectors(q));
670} 706}
671 707
672/* 708/*
@@ -683,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
683 else if (type == NODE) 719 else if (type == NODE)
684 return 3 * sbi->blocks_per_seg; 720 return 3 * sbi->blocks_per_seg;
685 else if (type == META) 721 else if (type == META)
686 return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 722 return MAX_BIO_BLOCKS(sbi);
687 else 723 else
688 return 0; 724 return 0;
689} 725}
@@ -706,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
706 else if (type == NODE) 742 else if (type == NODE)
707 desired = 3 * max_hw_blocks(sbi); 743 desired = 3 * max_hw_blocks(sbi);
708 else 744 else
709 desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 745 desired = MAX_BIO_BLOCKS(sbi);
710 746
711 wbc->nr_to_write = desired; 747 wbc->nr_to_write = desired;
712 return desired - nr_to_write; 748 return desired - nr_to_write;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 41bdf511003d..41d6f700f4ee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -190,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
191F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 191F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
192F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 192F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
193F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
193F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); 194F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
194F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 195F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
195F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); 196F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -204,6 +205,7 @@ static struct attribute *f2fs_attrs[] = {
204 ATTR_LIST(max_small_discards), 205 ATTR_LIST(max_small_discards),
205 ATTR_LIST(ipu_policy), 206 ATTR_LIST(ipu_policy),
206 ATTR_LIST(min_ipu_util), 207 ATTR_LIST(min_ipu_util),
208 ATTR_LIST(min_fsync_blocks),
207 ATTR_LIST(max_victim_search), 209 ATTR_LIST(max_victim_search),
208 ATTR_LIST(dir_level), 210 ATTR_LIST(dir_level),
209 ATTR_LIST(ram_thresh), 211 ATTR_LIST(ram_thresh),
@@ -366,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
366 368
367 /* Initialize f2fs-specific inode info */ 369 /* Initialize f2fs-specific inode info */
368 fi->vfs_inode.i_version = 1; 370 fi->vfs_inode.i_version = 1;
369 atomic_set(&fi->dirty_dents, 0); 371 atomic_set(&fi->dirty_pages, 0);
370 fi->i_current_depth = 1; 372 fi->i_current_depth = 1;
371 fi->i_advise = 0; 373 fi->i_advise = 0;
372 rwlock_init(&fi->ext.ext_lock); 374 rwlock_init(&fi->ext.ext_lock);
373 init_rwsem(&fi->i_sem); 375 init_rwsem(&fi->i_sem);
376 INIT_LIST_HEAD(&fi->inmem_pages);
377 mutex_init(&fi->inmem_lock);
374 378
375 set_inode_flag(fi, FI_NEW_INODE); 379 set_inode_flag(fi, FI_NEW_INODE);
376 380
@@ -432,14 +436,19 @@ static void f2fs_put_super(struct super_block *sb)
432 stop_gc_thread(sbi); 436 stop_gc_thread(sbi);
433 437
434 /* We don't need to do checkpoint when it's clean */ 438 /* We don't need to do checkpoint when it's clean */
435 if (sbi->s_dirty) 439 if (sbi->s_dirty) {
436 write_checkpoint(sbi, true); 440 struct cp_control cpc = {
441 .reason = CP_UMOUNT,
442 };
443 write_checkpoint(sbi, &cpc);
444 }
437 445
438 /* 446 /*
439 * normally superblock is clean, so we need to release this. 447 * normally superblock is clean, so we need to release this.
440 * In addition, EIO will skip do checkpoint, we need this as well. 448 * In addition, EIO will skip do checkpoint, we need this as well.
441 */ 449 */
442 release_dirty_inode(sbi); 450 release_dirty_inode(sbi);
451 release_discard_addrs(sbi);
443 452
444 iput(sbi->node_inode); 453 iput(sbi->node_inode);
445 iput(sbi->meta_inode); 454 iput(sbi->meta_inode);
@@ -464,8 +473,11 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
464 trace_f2fs_sync_fs(sb, sync); 473 trace_f2fs_sync_fs(sb, sync);
465 474
466 if (sync) { 475 if (sync) {
476 struct cp_control cpc = {
477 .reason = CP_SYNC,
478 };
467 mutex_lock(&sbi->gc_mutex); 479 mutex_lock(&sbi->gc_mutex);
468 write_checkpoint(sbi, false); 480 write_checkpoint(sbi, &cpc);
469 mutex_unlock(&sbi->gc_mutex); 481 mutex_unlock(&sbi->gc_mutex);
470 } else { 482 } else {
471 f2fs_balance_fs(sbi); 483 f2fs_balance_fs(sbi);
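
write_checkpoint() now takes a struct cp_control instead of a bare bool, so callers can name why the checkpoint runs (CP_UMOUNT, CP_SYNC, CP_DISCARD) and, for discard, pass a trim window in the same object. A sketch of the pattern using the fields visible in this series, with a stub in place of the real checkpoint:

#include <stdio.h>

enum cp_reason { CP_UMOUNT, CP_SYNC, CP_DISCARD };

struct cp_control {
    enum cp_reason reason;
    unsigned long long trim_start, trim_end, trim_minlen, trimmed;
};

/* Stand-in for write_checkpoint(sbi, &cpc). */
static void write_checkpoint_stub(struct cp_control *cpc)
{
    if (cpc->reason == CP_DISCARD)
        cpc->trimmed = cpc->trim_end - cpc->trim_start + 1;  /* pretend */
}

int main(void)
{
    struct cp_control cpc = {
        .reason = CP_DISCARD,   /* one designated initializer per caller */
        .trim_start = 0,
        .trim_end = 9,
        .trim_minlen = 1,
    };

    write_checkpoint_stub(&cpc);
    printf("trimmed %llu segments\n", cpc.trimmed);
    return 0;
}

Passing a control struct instead of a bool keeps the signature stable as new checkpoint reasons are added, avoiding a growing list of flag parameters.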
@@ -616,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
616 org_mount_opt = sbi->mount_opt; 628 org_mount_opt = sbi->mount_opt;
617 active_logs = sbi->active_logs; 629 active_logs = sbi->active_logs;
618 630
631 sbi->mount_opt.opt = 0;
632 sbi->active_logs = NR_CURSEG_TYPE;
633
619 /* parse mount options */ 634 /* parse mount options */
620 err = parse_options(sb, data); 635 err = parse_options(sb, data);
621 if (err) 636 if (err)
@@ -786,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb,
786 return 1; 801 return 1;
787 } 802 }
788 803
789 if (le32_to_cpu(raw_super->log_sectorsize) != 804 /* Currently, sector sizes of 512/1024/2048/4096 bytes are supported */
790 F2FS_LOG_SECTOR_SIZE) { 805 if (le32_to_cpu(raw_super->log_sectorsize) >
791 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); 806 F2FS_MAX_LOG_SECTOR_SIZE ||
807 le32_to_cpu(raw_super->log_sectorsize) <
808 F2FS_MIN_LOG_SECTOR_SIZE) {
809 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
810 le32_to_cpu(raw_super->log_sectorsize));
792 return 1; 811 return 1;
793 } 812 }
794 if (le32_to_cpu(raw_super->log_sectors_per_block) != 813 if (le32_to_cpu(raw_super->log_sectors_per_block) +
795 F2FS_LOG_SECTORS_PER_BLOCK) { 814 le32_to_cpu(raw_super->log_sectorsize) !=
796 f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); 815 F2FS_MAX_LOG_SECTOR_SIZE) {
816 f2fs_msg(sb, KERN_INFO,
817 "Invalid log sectors per block(%u) log sectorsize(%u)",
818 le32_to_cpu(raw_super->log_sectors_per_block),
819 le32_to_cpu(raw_super->log_sectorsize));
797 return 1; 820 return 1;
798 } 821 }
799 return 0; 822 return 0;
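
The check accepts any sector size from 512 to 4096 bytes by validating log_sectorsize against [F2FS_MIN_LOG_SECTOR_SIZE, F2FS_MAX_LOG_SECTOR_SIZE] and then requiring that sectors-per-block multiplies back to one 4 KB f2fs block (the two logs must sum to 12). A sketch of the invariant, with the bounds assumed from f2fs_fs.h:

#include <stdbool.h>
#include <stdio.h>

#define MIN_LOG_SECTOR_SIZE 9   /* 512 bytes; assumed to match f2fs_fs.h */
#define MAX_LOG_SECTOR_SIZE 12  /* 4096 bytes */

static bool sector_geometry_ok(unsigned log_sectorsize, unsigned log_sectors_per_block)
{
    if (log_sectorsize < MIN_LOG_SECTOR_SIZE ||
        log_sectorsize > MAX_LOG_SECTOR_SIZE)
        return false;
    /* sectors per block * sector size must equal the 4 KB f2fs block */
    return log_sectors_per_block + log_sectorsize == MAX_LOG_SECTOR_SIZE;
}

int main(void)
{
    printf("%d\n", sector_geometry_ok(9, 3));   /* 512 B * 8 = 4 KB: ok */
    printf("%d\n", sector_geometry_ok(12, 0));  /* 4 KB * 1  = 4 KB: ok */
    printf("%d\n", sector_geometry_ok(10, 3));  /* 1 KB * 8  = 8 KB: bad */
    return 0;
}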
@@ -849,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
849 atomic_set(&sbi->nr_pages[i], 0); 872 atomic_set(&sbi->nr_pages[i], 0);
850 873
851 sbi->dir_level = DEF_DIR_LEVEL; 874 sbi->dir_level = DEF_DIR_LEVEL;
875 sbi->need_fsck = false;
852} 876}
853 877
854/* 878/*
@@ -1082,6 +1106,9 @@ try_onemore:
1082 if (err) 1106 if (err)
1083 goto free_proc; 1107 goto free_proc;
1084 1108
1109 if (!retry)
1110 sbi->need_fsck = true;
1111
1085 /* recover fsynced data */ 1112 /* recover fsynced data */
1086 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1113 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1087 err = recover_fsync_data(sbi); 1114 err = recover_fsync_data(sbi);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 728a5dc3dc16..deca8728117b 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
266 266
267static void *read_all_xattrs(struct inode *inode, struct page *ipage) 267static void *read_all_xattrs(struct inode *inode, struct page *ipage)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 struct f2fs_xattr_header *header; 270 struct f2fs_xattr_header *header;
271 size_t size = PAGE_SIZE, inline_size = 0; 271 size_t size = PAGE_SIZE, inline_size = 0;
272 void *txattr_addr; 272 void *txattr_addr;
@@ -325,7 +325,7 @@ fail:
325static inline int write_all_xattrs(struct inode *inode, __u32 hsize, 325static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
326 void *txattr_addr, struct page *ipage) 326 void *txattr_addr, struct page *ipage)
327{ 327{
328 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 328 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
329 size_t inline_size = 0; 329 size_t inline_size = 0;
330 void *xattr_addr; 330 void *xattr_addr;
331 struct page *xpage; 331 struct page *xpage;
@@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
373 alloc_nid_failed(sbi, new_nid); 373 alloc_nid_failed(sbi, new_nid);
374 return PTR_ERR(xpage); 374 return PTR_ERR(xpage);
375 } 375 }
376 f2fs_bug_on(new_nid); 376 f2fs_bug_on(sbi, new_nid);
377 f2fs_wait_on_page_writeback(xpage, NODE); 377 f2fs_wait_on_page_writeback(xpage, NODE);
378 } else { 378 } else {
379 struct dnode_of_data dn; 379 struct dnode_of_data dn;
@@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
596 const void *value, size_t size, 596 const void *value, size_t size,
597 struct page *ipage, int flags) 597 struct page *ipage, int flags)
598{ 598{
599 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 599 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
600 int err; 600 int err;
601 601
602 /* this case is only from init_inode_metadata */ 602 /* this case is only from init_inode_metadata */
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ca58d64374ca..9b320cc2a8cf 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,7 @@
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o
9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs-$(CONFIG_PROC_FS) += procfs.o
10lockd-objs := $(lockd-objs-y) 11lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index daa8e7514eae..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
159 159
160 msg.rpc_proc = &clnt->cl_procinfo[proc]; 160 msg.rpc_proc = &clnt->cl_procinfo[proc];
161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); 161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
162 if (status == -ECONNREFUSED) {
163 dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n",
164 status);
165 rpc_force_rebind(clnt);
166 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
167 }
162 if (status < 0) 168 if (status < 0)
163 dprintk("lockd: NSM upcall RPC failed, status=%d\n", 169 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
164 status); 170 status);
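
The NSM upcall now retries exactly once on ECONNREFUSED after forcing the RPC client to rebind, covering the case where statd restarted on a new port. A generic sketch of the retry-once-after-rebind shape (the rpc_* names above are the kernel's; everything here is illustrative):

#include <errno.h>
#include <stdio.h>

/* Illustrative stand-ins: a transient failure that clears after "rebinding". */
static int bound_ok;
static int do_call(void)       { return bound_ok ? 0 : -ECONNREFUSED; }
static void force_rebind(void) { bound_ok = 1; }

int main(void)
{
    int status = do_call();

    if (status == -ECONNREFUSED) {
        /* Peer may have moved ports; rebind and retry exactly once. */
        force_rebind();
        status = do_call();
    }
    printf("status = %d\n", status);    /* 0 after the single retry */
    return 0;
}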
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5010b55628b4..097bfa3adb1c 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -11,7 +11,6 @@ struct lockd_net {
11 11
12 struct delayed_work grace_period_end; 12 struct delayed_work grace_period_end;
13 struct lock_manager lockd_manager; 13 struct lock_manager lockd_manager;
14 struct list_head grace_list;
15 14
16 spinlock_t nsm_clnt_lock; 15 spinlock_t nsm_clnt_lock;
17 unsigned int nsm_users; 16 unsigned int nsm_users;
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
new file mode 100644
index 000000000000..2a0a98480e39
--- /dev/null
+++ b/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
1/*
2 * Procfs support for lockd
3 *
4 * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
5 */
6
7#include <linux/fs.h>
8#include <linux/proc_fs.h>
9#include <linux/module.h>
10#include <linux/nsproxy.h>
11#include <net/net_namespace.h>
12
13#include "netns.h"
14#include "procfs.h"
15
16/*
17 * We only allow strings that start with 'Y', 'y', or '1'.
18 */
19static ssize_t
20nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
21 loff_t *pos)
22{
23 char *data;
24 struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
25 lockd_net_id);
26
27 if (size < 1)
28 return -EINVAL;
29
30 data = simple_transaction_get(file, buf, size);
31 if (IS_ERR(data))
32 return PTR_ERR(data);
33
34 switch (data[0]) {
35 case 'Y':
36 case 'y':
37 case '1':
38 locks_end_grace(&ln->lockd_manager);
39 break;
40 default:
41 return -EINVAL;
42 }
43
44 return size;
45}
46
47static ssize_t
48nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
49 loff_t *pos)
50{
51 struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
52 lockd_net_id);
53 char resp[3];
54
55 resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
56 resp[1] = '\n';
57 resp[2] = '\0';
58
59 return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
60}
61
62static const struct file_operations lockd_end_grace_operations = {
63 .write = nlm_end_grace_write,
64 .read = nlm_end_grace_read,
65 .llseek = default_llseek,
66 .release = simple_transaction_release,
67 .owner = THIS_MODULE,
68};
69
70int __init
71lockd_create_procfs(void)
72{
73 struct proc_dir_entry *entry;
74
75 entry = proc_mkdir("fs/lockd", NULL);
76 if (!entry)
77 return -ENOMEM;
78 entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
79 &lockd_end_grace_operations);
80 if (!entry) {
81 remove_proc_entry("fs/lockd", NULL);
82 return -ENOMEM;
83 }
84 return 0;
85}
86
87void __exit
88lockd_remove_procfs(void)
89{
90 remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
91 remove_proc_entry("fs/lockd", NULL);
92}
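
The new /proc/fs/lockd/nlm_end_grace file lets an HA agent both query the grace state ('Y' once the grace period has ended) and end the NLM grace period early. A minimal C user, assuming lockd is running and the caller has sufficient privileges:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define GRACE_FILE "/proc/fs/lockd/nlm_end_grace"

int main(void)
{
    char state[3] = "";
    int fd = open(GRACE_FILE, O_WRONLY);

    if (fd < 0) {
        perror(GRACE_FILE);
        return 1;
    }
    /* Strings starting with 'Y', 'y', or '1' end the grace period;
     * anything else is rejected with -EINVAL. */
    if (write(fd, "Y\n", 2) < 0)
        perror("write");
    close(fd);

    fd = open(GRACE_FILE, O_RDONLY);
    if (fd >= 0 && read(fd, state, 2) > 0)
        printf("grace period over: %c\n", state[0]);    /* 'Y' expected now */
    close(fd);
    return 0;
}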
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
new file mode 100644
index 000000000000..2257a1311027
--- /dev/null
+++ b/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
1/*
2 * Procfs support for lockd
3 *
4 * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
5 */
6#ifndef _LOCKD_PROCFS_H
7#define _LOCKD_PROCFS_H
8
9#include <linux/kconfig.h>
10
11#if IS_ENABLED(CONFIG_PROC_FS)
12int lockd_create_procfs(void);
13void lockd_remove_procfs(void);
14#else
15static inline int
16lockd_create_procfs(void)
17{
18 return 0;
19}
20
21static inline void
22lockd_remove_procfs(void)
23{
24 return;
25}
26#endif /* IS_ENABLED(CONFIG_PROC_FS) */
27
28#endif /* _LOCKD_PROCFS_H */
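
procfs.h follows the standard kernel compile-out idiom: real prototypes under IS_ENABLED(CONFIG_PROC_FS), inline no-op stubs otherwise, so init_nlm()/exit_nlm() can call the functions unconditionally. The same shape, sketched for a hypothetical CONFIG_FOO feature:

	#if IS_ENABLED(CONFIG_FOO)
	int foo_create_procfs(void);
	void foo_remove_procfs(void);
	#else
	static inline int foo_create_procfs(void) { return 0; }
	static inline void foo_remove_procfs(void) { }
	#endif
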
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index ec9e082f9ecd..d1bb7ecfd201 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -36,6 +36,7 @@
36#include <linux/nfs.h> 36#include <linux/nfs.h>
37 37
38#include "netns.h" 38#include "netns.h"
39#include "procfs.h"
39 40
40#define NLMDBG_FACILITY NLMDBG_SVC 41#define NLMDBG_FACILITY NLMDBG_SVC
41#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) 42#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
@@ -304,13 +305,16 @@ static int lockd_start_svc(struct svc_serv *serv)
304 svc_sock_update_bufs(serv); 305 svc_sock_update_bufs(serv);
305 serv->sv_maxconn = nlm_max_connections; 306 serv->sv_maxconn = nlm_max_connections;
306 307
307 nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name); 308 nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name);
308 if (IS_ERR(nlmsvc_task)) { 309 if (IS_ERR(nlmsvc_task)) {
309 error = PTR_ERR(nlmsvc_task); 310 error = PTR_ERR(nlmsvc_task);
310 printk(KERN_WARNING 311 printk(KERN_WARNING
311 "lockd_up: kthread_run failed, error=%d\n", error); 312 "lockd_up: kthread_run failed, error=%d\n", error);
312 goto out_task; 313 goto out_task;
313 } 314 }
315 nlmsvc_rqst->rq_task = nlmsvc_task;
316 wake_up_process(nlmsvc_task);
317
314 dprintk("lockd_up: service started\n"); 318 dprintk("lockd_up: service started\n");
315 return 0; 319 return 0;
316 320
@@ -581,7 +585,7 @@ static int lockd_init_net(struct net *net)
581 struct lockd_net *ln = net_generic(net, lockd_net_id); 585 struct lockd_net *ln = net_generic(net, lockd_net_id);
582 586
583 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); 587 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
584 INIT_LIST_HEAD(&ln->grace_list); 588 INIT_LIST_HEAD(&ln->lockd_manager.list);
585 spin_lock_init(&ln->nsm_clnt_lock); 589 spin_lock_init(&ln->nsm_clnt_lock);
586 return 0; 590 return 0;
587} 591}
@@ -615,8 +619,15 @@ static int __init init_nlm(void)
615 err = register_pernet_subsys(&lockd_net_ops); 619 err = register_pernet_subsys(&lockd_net_ops);
616 if (err) 620 if (err)
617 goto err_pernet; 621 goto err_pernet;
622
623 err = lockd_create_procfs();
624 if (err)
625 goto err_procfs;
626
618 return 0; 627 return 0;
619 628
629err_procfs:
630 unregister_pernet_subsys(&lockd_net_ops);
620err_pernet: 631err_pernet:
621#ifdef CONFIG_SYSCTL 632#ifdef CONFIG_SYSCTL
622 unregister_sysctl_table(nlm_sysctl_table); 633 unregister_sysctl_table(nlm_sysctl_table);
@@ -629,6 +640,7 @@ static void __exit exit_nlm(void)
629{ 640{
630 /* FIXME: delete all NLM clients */ 641 /* FIXME: delete all NLM clients */
631 nlm_shutdown_hosts(); 642 nlm_shutdown_hosts();
643 lockd_remove_procfs();
632 unregister_pernet_subsys(&lockd_net_ops); 644 unregister_pernet_subsys(&lockd_net_ops);
633#ifdef CONFIG_SYSCTL 645#ifdef CONFIG_SYSCTL
634 unregister_sysctl_table(nlm_sysctl_table); 646 unregister_sysctl_table(nlm_sysctl_table);
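
Note the lockd_start_svc() change above: kthread_run() is split into kthread_create() plus an explicit wake_up_process() so that nlmsvc_rqst->rq_task can be published before the thread ever runs; with kthread_run() the new thread could start before the field was set. Condensed, the pattern is:

	task = kthread_create(lockd, rqst, "%s", serv->sv_name);
	if (IS_ERR(task))
		return PTR_ERR(task);
	rqst->rq_task = task;	/* must be visible before the thread runs */
	wake_up_process(task);	/* kthread_run() == kthread_create() + this */
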
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
2# Makefile for the pNFS block layout driver kernel module 2# Makefile for the pNFS block layout driver kernel module
3# 3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o 4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o 5
6blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cbb1797149d5..5228f201d3d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/namei.h> 36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */ 37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h> 38#include <linux/prefetch.h>
40#include <linux/pagevec.h> 39#include <linux/pagevec.h>
41 40
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); 49MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
51MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); 50MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
52 51
53static void print_page(struct page *page) 52static bool is_hole(struct pnfs_block_extent *be)
54{ 53{
55 dprintk("PRINTPAGE page %p\n", page); 54 switch (be->be_state) {
56 dprintk(" PagePrivate %d\n", PagePrivate(page)); 55 case PNFS_BLOCK_NONE_DATA:
57 dprintk(" PageUptodate %d\n", PageUptodate(page)); 56 return true;
58 dprintk(" PageError %d\n", PageError(page)); 57 case PNFS_BLOCK_INVALID_DATA:
59 dprintk(" PageDirty %d\n", PageDirty(page)); 58 return be->be_tag ? false : true;
60 dprintk(" PageReferenced %d\n", PageReferenced(page)); 59 default:
61 dprintk(" PageLocked %d\n", PageLocked(page)); 60 return false;
62 dprintk(" PageWriteback %d\n", PageWriteback(page)); 61 }
63 dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
64 dprintk("\n");
65}
66
67/* Given the be associated with isect, determine if page data needs to be
68 * initialized.
69 */
70static int is_hole(struct pnfs_block_extent *be, sector_t isect)
71{
72 if (be->be_state == PNFS_BLOCK_NONE_DATA)
73 return 1;
74 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
75 return 0;
76 else
77 return !bl_is_sector_init(be->be_inval, isect);
78}
79
80/* Given the be associated with isect, determine if page data can be
81 * written to disk.
82 */
83static int is_writable(struct pnfs_block_extent *be, sector_t isect)
84{
85 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
86 be->be_state == PNFS_BLOCK_INVALID_DATA);
87} 62}
88 63
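
The rewritten is_hole() replaces the old per-sector bl_is_sector_init() bitmap check: PNFS_BLOCK_NONE_DATA extents are always holes, and PNFS_BLOCK_INVALID_DATA extents count as holes only until the extent tree tags them written. Illustrative expectations (field values hypothetical):

	/* Illustrative only -- expected is_hole() results:
	 *   { .be_state = PNFS_BLOCK_NONE_DATA }                 -> true
	 *   { .be_state = PNFS_BLOCK_INVALID_DATA }              -> true
	 *   { .be_state = PNFS_BLOCK_INVALID_DATA, .be_tag = 1 } -> false
	 *   { .be_state = PNFS_BLOCK_READ_DATA }                 -> false
	 */
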
89/* The data we are handed might be spread across several bios. We need 64/* The data we are handed might be spread across several bios. We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
91 */ 66 */
92struct parallel_io { 67struct parallel_io {
93 struct kref refcnt; 68 struct kref refcnt;
94 void (*pnfs_callback) (void *data, int num_se); 69 void (*pnfs_callback) (void *data);
95 void *data; 70 void *data;
96 int bse_count;
97}; 71};
98 72
99static inline struct parallel_io *alloc_parallel(void *data) 73static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
104 if (rv) { 78 if (rv) {
105 rv->data = data; 79 rv->data = data;
106 kref_init(&rv->refcnt); 80 kref_init(&rv->refcnt);
107 rv->bse_count = 0;
108 } 81 }
109 return rv; 82 return rv;
110} 83}
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
119 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 92 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
120 93
121 dprintk("%s enter\n", __func__); 94 dprintk("%s enter\n", __func__);
122 p->pnfs_callback(p->data, p->bse_count); 95 p->pnfs_callback(p->data);
123 kfree(p); 96 kfree(p);
124} 97}
125 98
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
141 return NULL; 114 return NULL;
142} 115}
143 116
144static struct bio *bl_alloc_init_bio(int npg, sector_t isect, 117static struct bio *
145 struct pnfs_block_extent *be, 118bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
146 void (*end_io)(struct bio *, int err), 119 void (*end_io)(struct bio *, int err), struct parallel_io *par)
147 struct parallel_io *par)
148{ 120{
149 struct bio *bio; 121 struct bio *bio;
150 122
@@ -156,58 +128,64 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
156 } 128 }
157 129
158 if (bio) { 130 if (bio) {
159 bio->bi_iter.bi_sector = isect - be->be_f_offset + 131 bio->bi_iter.bi_sector = disk_sector;
160 be->be_v_offset; 132 bio->bi_bdev = bdev;
161 bio->bi_bdev = be->be_mdev;
162 bio->bi_end_io = end_io; 133 bio->bi_end_io = end_io;
163 bio->bi_private = par; 134 bio->bi_private = par;
164 } 135 }
165 return bio; 136 return bio;
166} 137}
167 138
168static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, 139static struct bio *
169 sector_t isect, struct page *page, 140do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
170 struct pnfs_block_extent *be, 141 struct page *page, struct pnfs_block_dev_map *map,
171 void (*end_io)(struct bio *, int err), 142 struct pnfs_block_extent *be,
172 struct parallel_io *par, 143 void (*end_io)(struct bio *, int err),
173 unsigned int offset, int len) 144 struct parallel_io *par, unsigned int offset, int *len)
174{ 145{
175 isect = isect + (offset >> SECTOR_SHIFT); 146 struct pnfs_block_dev *dev =
147 container_of(be->be_device, struct pnfs_block_dev, node);
148 u64 disk_addr, end;
149
176 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, 150 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
177 npg, rw, (unsigned long long)isect, offset, len); 151 npg, rw, (unsigned long long)isect, offset, *len);
152
153 /* translate to device offset */
154 isect += be->be_v_offset;
155 isect -= be->be_f_offset;
156
157 /* translate to physical disk offset */
158 disk_addr = (u64)isect << SECTOR_SHIFT;
159 if (disk_addr < map->start || disk_addr >= map->start + map->len) {
160 if (!dev->map(dev, disk_addr, map))
161 return ERR_PTR(-EIO);
162 bio = bl_submit_bio(rw, bio);
163 }
164 disk_addr += map->disk_offset;
165 disk_addr -= map->start;
166
167 /* limit length to what the device mapping allows */
168 end = disk_addr + *len;
169 if (end >= map->start + map->len)
170 *len = map->start + map->len - disk_addr;
171
178retry: 172retry:
179 if (!bio) { 173 if (!bio) {
180 bio = bl_alloc_init_bio(npg, isect, be, end_io, par); 174 bio = bl_alloc_init_bio(npg, map->bdev,
175 disk_addr >> SECTOR_SHIFT, end_io, par);
181 if (!bio) 176 if (!bio)
182 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
183 } 178 }
184 if (bio_add_page(bio, page, len, offset) < len) { 179 if (bio_add_page(bio, page, *len, offset) < *len) {
185 bio = bl_submit_bio(rw, bio); 180 bio = bl_submit_bio(rw, bio);
186 goto retry; 181 goto retry;
187 } 182 }
188 return bio; 183 return bio;
189} 184}
190 185
191static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
192 sector_t isect, struct page *page,
193 struct pnfs_block_extent *be,
194 void (*end_io)(struct bio *, int err),
195 struct parallel_io *par)
196{
197 return do_add_page_to_bio(bio, npg, rw, isect, page, be,
198 end_io, par, 0, PAGE_CACHE_SIZE);
199}
200
201/* This is basically copied from mpage_end_io_read */
202static void bl_end_io_read(struct bio *bio, int err) 186static void bl_end_io_read(struct bio *bio, int err)
203{ 187{
204 struct parallel_io *par = bio->bi_private; 188 struct parallel_io *par = bio->bi_private;
205 struct bio_vec *bvec;
206 int i;
207
208 if (!err)
209 bio_for_each_segment_all(bvec, bio, i)
210 SetPageUptodate(bvec->bv_page);
211 189
212 if (err) { 190 if (err) {
213 struct nfs_pgio_header *header = par->data; 191 struct nfs_pgio_header *header = par->data;
@@ -216,6 +194,7 @@ static void bl_end_io_read(struct bio *bio, int err)
216 header->pnfs_error = -EIO; 194 header->pnfs_error = -EIO;
217 pnfs_set_lo_fail(header->lseg); 195 pnfs_set_lo_fail(header->lseg);
218 } 196 }
197
219 bio_put(bio); 198 bio_put(bio);
220 put_parallel(par); 199 put_parallel(par);
221} 200}
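
The reworked do_add_page_to_bio() above translates addresses in two stages: extent mapping (file sector to volume offset via be_v_offset/be_f_offset) and device mapping (volume offset to a physical bdev offset via pnfs_block_dev_map, refreshed through dev->map() with any bio for the old mapping submitted first). A worked example with made-up numbers:

	/* Hypothetical values:
	 *   isect = 4096 (file sector), be_f_offset = 4000, be_v_offset = 10000
	 *     volume sector = 4096 + 10000 - 4000 = 10096
	 *     disk_addr     = 10096 << 9 = 5169152 bytes
	 *   map = { .start = 4 MiB, .len = 8 MiB, .disk_offset = 1 MiB }
	 *     5169152 lies inside [start, start + len), so no re-map:
	 *     disk_addr = 5169152 + 1048576 - 4194304 = 2023424 bytes
	 *   and *len is clipped so the bio never reaches map.start + map.len.
	 */
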
@@ -231,7 +210,7 @@ static void bl_read_cleanup(struct work_struct *work)
231} 210}
232 211
233static void 212static void
234bl_end_par_io_read(void *data, int unused) 213bl_end_par_io_read(void *data)
235{ 214{
236 struct nfs_pgio_header *hdr = data; 215 struct nfs_pgio_header *hdr = data;
237 216
@@ -241,88 +220,78 @@ bl_end_par_io_read(void *data, int unused)
241} 220}
242 221
243static enum pnfs_try_status 222static enum pnfs_try_status
244bl_read_pagelist(struct nfs_pgio_header *hdr) 223bl_read_pagelist(struct nfs_pgio_header *header)
245{ 224{
246 struct nfs_pgio_header *header = hdr; 225 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
247 int i, hole; 226 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
248 struct bio *bio = NULL; 227 struct bio *bio = NULL;
249 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 228 struct pnfs_block_extent be;
250 sector_t isect, extent_length = 0; 229 sector_t isect, extent_length = 0;
251 struct parallel_io *par; 230 struct parallel_io *par;
252 loff_t f_offset = hdr->args.offset; 231 loff_t f_offset = header->args.offset;
253 size_t bytes_left = hdr->args.count; 232 size_t bytes_left = header->args.count;
254 unsigned int pg_offset, pg_len; 233 unsigned int pg_offset, pg_len;
255 struct page **pages = hdr->args.pages; 234 struct page **pages = header->args.pages;
256 int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT; 235 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
257 const bool is_dio = (header->dreq != NULL); 236 const bool is_dio = (header->dreq != NULL);
237 struct blk_plug plug;
238 int i;
258 239
259 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, 240 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
260 hdr->page_array.npages, f_offset, 241 header->page_array.npages, f_offset,
261 (unsigned int)hdr->args.count); 242 (unsigned int)header->args.count);
262 243
263 par = alloc_parallel(hdr); 244 par = alloc_parallel(header);
264 if (!par) 245 if (!par)
265 goto use_mds; 246 return PNFS_NOT_ATTEMPTED;
266 par->pnfs_callback = bl_end_par_io_read; 247 par->pnfs_callback = bl_end_par_io_read;
267 /* At this point, we can no longer jump to use_mds */ 248
249 blk_start_plug(&plug);
268 250
269 isect = (sector_t) (f_offset >> SECTOR_SHIFT); 251 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
270 /* Code assumes extents are page-aligned */ 252 /* Code assumes extents are page-aligned */
271 for (i = pg_index; i < hdr->page_array.npages; i++) { 253 for (i = pg_index; i < header->page_array.npages; i++) {
272 if (!extent_length) { 254 if (extent_length <= 0) {
273 /* We've used up the previous extent */ 255 /* We've used up the previous extent */
274 bl_put_extent(be);
275 bl_put_extent(cow_read);
276 bio = bl_submit_bio(READ, bio); 256 bio = bl_submit_bio(READ, bio);
257
277 /* Get the next one */ 258 /* Get the next one */
278 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 259 if (!ext_tree_lookup(bl, isect, &be, false)) {
279 isect, &cow_read);
280 if (!be) {
281 header->pnfs_error = -EIO; 260 header->pnfs_error = -EIO;
282 goto out; 261 goto out;
283 } 262 }
284 extent_length = be->be_length - 263 extent_length = be.be_length - (isect - be.be_f_offset);
285 (isect - be->be_f_offset);
286 if (cow_read) {
287 sector_t cow_length = cow_read->be_length -
288 (isect - cow_read->be_f_offset);
289 extent_length = min(extent_length, cow_length);
290 }
291 } 264 }
292 265
266 pg_offset = f_offset & ~PAGE_CACHE_MASK;
293 if (is_dio) { 267 if (is_dio) {
294 pg_offset = f_offset & ~PAGE_CACHE_MASK;
295 if (pg_offset + bytes_left > PAGE_CACHE_SIZE) 268 if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
296 pg_len = PAGE_CACHE_SIZE - pg_offset; 269 pg_len = PAGE_CACHE_SIZE - pg_offset;
297 else 270 else
298 pg_len = bytes_left; 271 pg_len = bytes_left;
299
300 f_offset += pg_len;
301 bytes_left -= pg_len;
302 isect += (pg_offset >> SECTOR_SHIFT);
303 } else { 272 } else {
304 pg_offset = 0; 273 BUG_ON(pg_offset != 0);
305 pg_len = PAGE_CACHE_SIZE; 274 pg_len = PAGE_CACHE_SIZE;
306 } 275 }
307 276
308 hole = is_hole(be, isect); 277 isect += (pg_offset >> SECTOR_SHIFT);
309 if (hole && !cow_read) { 278 extent_length -= (pg_offset >> SECTOR_SHIFT);
279
280 if (is_hole(&be)) {
310 bio = bl_submit_bio(READ, bio); 281 bio = bl_submit_bio(READ, bio);
311 /* Fill hole w/ zeroes w/o accessing device */ 282 /* Fill hole w/ zeroes w/o accessing device */
312 dprintk("%s Zeroing page for hole\n", __func__); 283 dprintk("%s Zeroing page for hole\n", __func__);
313 zero_user_segment(pages[i], pg_offset, pg_len); 284 zero_user_segment(pages[i], pg_offset, pg_len);
314 print_page(pages[i]);
315 SetPageUptodate(pages[i]);
316 } else {
317 struct pnfs_block_extent *be_read;
318 285
319 be_read = (hole && cow_read) ? cow_read : be; 286 /* invalidate map */
287 map.start = NFS4_MAX_UINT64;
288 } else {
320 bio = do_add_page_to_bio(bio, 289 bio = do_add_page_to_bio(bio,
321 hdr->page_array.npages - i, 290 header->page_array.npages - i,
322 READ, 291 READ,
323 isect, pages[i], be_read, 292 isect, pages[i], &map, &be,
324 bl_end_io_read, par, 293 bl_end_io_read, par,
325 pg_offset, pg_len); 294 pg_offset, &pg_len);
326 if (IS_ERR(bio)) { 295 if (IS_ERR(bio)) {
327 header->pnfs_error = PTR_ERR(bio); 296 header->pnfs_error = PTR_ERR(bio);
328 bio = NULL; 297 bio = NULL;
@@ -330,75 +299,21 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
330 } 299 }
331 } 300 }
332 isect += (pg_len >> SECTOR_SHIFT); 301 isect += (pg_len >> SECTOR_SHIFT);
333 extent_length -= PAGE_CACHE_SECTORS; 302 extent_length -= (pg_len >> SECTOR_SHIFT);
303 f_offset += pg_len;
304 bytes_left -= pg_len;
334 } 305 }
335 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { 306 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
336 hdr->res.eof = 1; 307 header->res.eof = 1;
337 hdr->res.count = header->inode->i_size - hdr->args.offset; 308 header->res.count = header->inode->i_size - header->args.offset;
338 } else { 309 } else {
339 hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset; 310 header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
340 } 311 }
341out: 312out:
342 bl_put_extent(be);
343 bl_put_extent(cow_read);
344 bl_submit_bio(READ, bio); 313 bl_submit_bio(READ, bio);
314 blk_finish_plug(&plug);
345 put_parallel(par); 315 put_parallel(par);
346 return PNFS_ATTEMPTED; 316 return PNFS_ATTEMPTED;
347
348 use_mds:
349 dprintk("Giving up and using normal NFS\n");
350 return PNFS_NOT_ATTEMPTED;
351}
352
353static void mark_extents_written(struct pnfs_block_layout *bl,
354 __u64 offset, __u32 count)
355{
356 sector_t isect, end;
357 struct pnfs_block_extent *be;
358 struct pnfs_block_short_extent *se;
359
360 dprintk("%s(%llu, %u)\n", __func__, offset, count);
361 if (count == 0)
362 return;
363 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
364 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
365 end >>= SECTOR_SHIFT;
366 while (isect < end) {
367 sector_t len;
368 be = bl_find_get_extent(bl, isect, NULL);
369 BUG_ON(!be); /* FIXME */
370 len = min(end, be->be_f_offset + be->be_length) - isect;
371 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
372 se = bl_pop_one_short_extent(be->be_inval);
373 BUG_ON(!se);
374 bl_mark_for_commit(be, isect, len, se);
375 }
376 isect += len;
377 bl_put_extent(be);
378 }
379}
380
381static void bl_end_io_write_zero(struct bio *bio, int err)
382{
383 struct parallel_io *par = bio->bi_private;
384 struct bio_vec *bvec;
385 int i;
386
387 bio_for_each_segment_all(bvec, bio, i) {
388 /* This is the zeroing page we added */
389 end_page_writeback(bvec->bv_page);
390 page_cache_release(bvec->bv_page);
391 }
392
393 if (unlikely(err)) {
394 struct nfs_pgio_header *header = par->data;
395
396 if (!header->pnfs_error)
397 header->pnfs_error = -EIO;
398 pnfs_set_lo_fail(header->lseg);
399 }
400 bio_put(bio);
401 put_parallel(par);
402} 317}
403 318
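
The read path now also brackets bio construction in a block-layer plug so consecutive bios can be merged before dispatch. The general shape of the pattern:

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* build bios with do_add_page_to_bio(), submit via bl_submit_bio() */
	blk_finish_plug(&plug);	/* flush the plugged requests to the driver */
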
404static void bl_end_io_write(struct bio *bio, int err) 319static void bl_end_io_write(struct bio *bio, int err)
@@ -421,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
421 */ 336 */
422static void bl_write_cleanup(struct work_struct *work) 337static void bl_write_cleanup(struct work_struct *work)
423{ 338{
424 struct rpc_task *task; 339 struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
425 struct nfs_pgio_header *hdr; 340 struct nfs_pgio_header *hdr =
341 container_of(task, struct nfs_pgio_header, task);
342
426 dprintk("%s enter\n", __func__); 343 dprintk("%s enter\n", __func__);
427 task = container_of(work, struct rpc_task, u.tk_work); 344
428 hdr = container_of(task, struct nfs_pgio_header, task);
429 if (likely(!hdr->pnfs_error)) { 345 if (likely(!hdr->pnfs_error)) {
430 /* Marks for LAYOUTCOMMIT */ 346 struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
431 mark_extents_written(BLK_LSEG2EXT(hdr->lseg), 347 u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
432 hdr->args.offset, hdr->args.count); 348 u64 end = (hdr->args.offset + hdr->args.count +
349 PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
350
351 ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
352 (end - start) >> SECTOR_SHIFT);
433 } 353 }
354
434 pnfs_ld_write_done(hdr); 355 pnfs_ld_write_done(hdr);
435} 356}
436 357
437/* Called when last of bios associated with a bl_write_pagelist call finishes */ 358/* Called when last of bios associated with a bl_write_pagelist call finishes */
438static void bl_end_par_io_write(void *data, int num_se) 359static void bl_end_par_io_write(void *data)
439{ 360{
440 struct nfs_pgio_header *hdr = data; 361 struct nfs_pgio_header *hdr = data;
441 362
442 if (unlikely(hdr->pnfs_error)) {
443 bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
444 num_se);
445 }
446
447 hdr->task.tk_status = hdr->pnfs_error; 363 hdr->task.tk_status = hdr->pnfs_error;
448 hdr->verf.committed = NFS_FILE_SYNC; 364 hdr->verf.committed = NFS_FILE_SYNC;
449 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup); 365 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
450 schedule_work(&hdr->task.u.tk_work); 366 schedule_work(&hdr->task.u.tk_work);
451} 367}
452 368
453/* FIXME STUB - mark intersection of layout and page as bad, so is not
454 * used again.
455 */
456static void mark_bad_read(void)
457{
458 return;
459}
460
461/*
462 * map_block: map a requested I/O block (isect) into an offset in the LVM
463 * block_device
464 */
465static void
466map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
467{
468 dprintk("%s enter be=%p\n", __func__, be);
469
470 set_buffer_mapped(bh);
471 bh->b_bdev = be->be_mdev;
472 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
473 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
474
475 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
476 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
477 bh->b_size);
478 return;
479}
480
481static void
482bl_read_single_end_io(struct bio *bio, int error)
483{
484 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
485 struct page *page = bvec->bv_page;
486
487 /* Only one page in bvec */
488 unlock_page(page);
489}
490
491static int
492bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
493 unsigned int offset, unsigned int len)
494{
495 struct bio *bio;
496 struct page *shadow_page;
497 sector_t isect;
498 char *kaddr, *kshadow_addr;
499 int ret = 0;
500
501 dprintk("%s: offset %u len %u\n", __func__, offset, len);
502
503 shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
504 if (shadow_page == NULL)
505 return -ENOMEM;
506
507 bio = bio_alloc(GFP_NOIO, 1);
508 if (bio == NULL)
509 return -ENOMEM;
510
511 isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
512 (offset / SECTOR_SIZE);
513
514 bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
515 bio->bi_bdev = be->be_mdev;
516 bio->bi_end_io = bl_read_single_end_io;
517
518 lock_page(shadow_page);
519 if (bio_add_page(bio, shadow_page,
520 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
521 unlock_page(shadow_page);
522 bio_put(bio);
523 return -EIO;
524 }
525
526 submit_bio(READ, bio);
527 wait_on_page_locked(shadow_page);
528 if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
529 ret = -EIO;
530 } else {
531 kaddr = kmap_atomic(page);
532 kshadow_addr = kmap_atomic(shadow_page);
533 memcpy(kaddr + offset, kshadow_addr + offset, len);
534 kunmap_atomic(kshadow_addr);
535 kunmap_atomic(kaddr);
536 }
537 __free_page(shadow_page);
538 bio_put(bio);
539
540 return ret;
541}
542
543static int
544bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
545 unsigned int dirty_offset, unsigned int dirty_len,
546 bool full_page)
547{
548 int ret = 0;
549 unsigned int start, end;
550
551 if (full_page) {
552 start = 0;
553 end = PAGE_CACHE_SIZE;
554 } else {
555 start = round_down(dirty_offset, SECTOR_SIZE);
556 end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
557 }
558
559 dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
560 if (!be) {
561 zero_user_segments(page, start, dirty_offset,
562 dirty_offset + dirty_len, end);
563 if (start == 0 && end == PAGE_CACHE_SIZE &&
564 trylock_page(page)) {
565 SetPageUptodate(page);
566 unlock_page(page);
567 }
568 return ret;
569 }
570
571 if (start != dirty_offset)
572 ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
573
574 if (!ret && (dirty_offset + dirty_len < end))
575 ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
576 end - dirty_offset - dirty_len);
577
578 return ret;
579}
580
581/* Given an unmapped page, zero it or read in the page for COW; the page is
582 * locked by the caller.
583 */
584static int
585init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
586{
587 struct buffer_head *bh = NULL;
588 int ret = 0;
589 sector_t isect;
590
591 dprintk("%s enter, %p\n", __func__, page);
592 BUG_ON(PageUptodate(page));
593 if (!cow_read) {
594 zero_user_segment(page, 0, PAGE_SIZE);
595 SetPageUptodate(page);
596 goto cleanup;
597 }
598
599 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
600 if (!bh) {
601 ret = -ENOMEM;
602 goto cleanup;
603 }
604
605 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
606 map_block(bh, isect, cow_read);
607 if (!bh_uptodate_or_lock(bh))
608 ret = bh_submit_read(bh);
609 if (ret)
610 goto cleanup;
611 SetPageUptodate(page);
612
613cleanup:
614 if (bh)
615 free_buffer_head(bh);
616 if (ret) {
617 /* Need to mark layout with bad read...should now
618 * just use nfs4 for reads and writes.
619 */
620 mark_bad_read();
621 }
622 return ret;
623}
624
625/* Find or create a zeroing page marked being writeback.
626 * Return ERR_PTR on error, NULL to indicate skip this page and page itself
627 * to indicate write out.
628 */
629static struct page *
630bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
631 struct pnfs_block_extent *cow_read)
632{
633 struct page *page;
634 int locked = 0;
635 page = find_get_page(inode->i_mapping, index);
636 if (page)
637 goto check_page;
638
639 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
640 if (unlikely(!page)) {
641 dprintk("%s oom\n", __func__);
642 return ERR_PTR(-ENOMEM);
643 }
644 locked = 1;
645
646check_page:
647 /* PageDirty: Other will write this out
648 * PageWriteback: Other is writing this out
649 * PageUptodate: It was read before
650 */
651 if (PageDirty(page) || PageWriteback(page)) {
652 print_page(page);
653 if (locked)
654 unlock_page(page);
655 page_cache_release(page);
656 return NULL;
657 }
658
659 if (!locked) {
660 lock_page(page);
661 locked = 1;
662 goto check_page;
663 }
664 if (!PageUptodate(page)) {
665			/* New page, read it in or zero it */
666 init_page_for_write(page, cow_read);
667 }
668 set_page_writeback(page);
669 unlock_page(page);
670
671 return page;
672}
673
674static enum pnfs_try_status 369static enum pnfs_try_status
675bl_write_pagelist(struct nfs_pgio_header *header, int sync) 370bl_write_pagelist(struct nfs_pgio_header *header, int sync)
676{ 371{
677 int i, ret, npg_zero, pg_index, last = 0; 372 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
373 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
678 struct bio *bio = NULL; 374 struct bio *bio = NULL;
679 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 375 struct pnfs_block_extent be;
680 sector_t isect, last_isect = 0, extent_length = 0; 376 sector_t isect, extent_length = 0;
681 struct parallel_io *par = NULL; 377 struct parallel_io *par = NULL;
682 loff_t offset = header->args.offset; 378 loff_t offset = header->args.offset;
683 size_t count = header->args.count; 379 size_t count = header->args.count;
684 unsigned int pg_offset, pg_len, saved_len;
685 struct page **pages = header->args.pages; 380 struct page **pages = header->args.pages;
686	struct page *page;	381	int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
687 pgoff_t index; 382 unsigned int pg_len;
688 u64 temp; 383 struct blk_plug plug;
689 int npg_per_block = 384 int i;
690 NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
691 385
692 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); 386 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
693 387
694 if (header->dreq != NULL &&
695 (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
696 !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
697 dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
698 goto out_mds;
699 }
700	/* At this point, header->page_array is a (sequential) list of nfs_pages. 388	/* At this point, header->page_array is a (sequential) list of nfs_pages.
701 * We want to write each, and if there is an error set pnfs_error 389 * We want to write each, and if there is an error set pnfs_error
702 * to have it redone using nfs. 390 * to have it redone using nfs.
703 */ 391 */
704 par = alloc_parallel(header); 392 par = alloc_parallel(header);
705 if (!par) 393 if (!par)
706 goto out_mds; 394 return PNFS_NOT_ATTEMPTED;
707 par->pnfs_callback = bl_end_par_io_write; 395 par->pnfs_callback = bl_end_par_io_write;
708 /* At this point, have to be more careful with error handling */
709 396
710 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); 397 blk_start_plug(&plug);
711 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
712 if (!be || !is_writable(be, isect)) {
713 dprintk("%s no matching extents!\n", __func__);
714 goto out_mds;
715 }
716 398
717 /* First page inside INVALID extent */ 399 /* we always write out the whole page */
718 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 400 offset = offset & (loff_t)PAGE_CACHE_MASK;
719 if (likely(!bl_push_one_short_extent(be->be_inval))) 401 isect = offset >> SECTOR_SHIFT;
720 par->bse_count++;
721 else
722 goto out_mds;
723 temp = offset >> PAGE_CACHE_SHIFT;
724 npg_zero = do_div(temp, npg_per_block);
725 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
726 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
727 extent_length = be->be_length - (isect - be->be_f_offset);
728
729fill_invalid_ext:
730 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
731 for (;npg_zero > 0; npg_zero--) {
732 if (bl_is_sector_init(be->be_inval, isect)) {
733 dprintk("isect %llu already init\n",
734 (unsigned long long)isect);
735 goto next_page;
736 }
737 /* page ref released in bl_end_io_write_zero */
738 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
739 dprintk("%s zero %dth page: index %lu isect %llu\n",
740 __func__, npg_zero, index,
741 (unsigned long long)isect);
742 page = bl_find_get_zeroing_page(header->inode, index,
743 cow_read);
744 if (unlikely(IS_ERR(page))) {
745 header->pnfs_error = PTR_ERR(page);
746 goto out;
747 } else if (page == NULL)
748 goto next_page;
749
750 ret = bl_mark_sectors_init(be->be_inval, isect,
751 PAGE_CACHE_SECTORS);
752 if (unlikely(ret)) {
753 dprintk("%s bl_mark_sectors_init fail %d\n",
754 __func__, ret);
755 end_page_writeback(page);
756 page_cache_release(page);
757 header->pnfs_error = ret;
758 goto out;
759 }
760 if (likely(!bl_push_one_short_extent(be->be_inval)))
761 par->bse_count++;
762 else {
763 end_page_writeback(page);
764 page_cache_release(page);
765 header->pnfs_error = -ENOMEM;
766 goto out;
767 }
768 /* FIXME: This should be done in bi_end_io */
769 mark_extents_written(BLK_LSEG2EXT(header->lseg),
770 page->index << PAGE_CACHE_SHIFT,
771 PAGE_CACHE_SIZE);
772
773 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
774 isect, page, be,
775 bl_end_io_write_zero, par);
776 if (IS_ERR(bio)) {
777 header->pnfs_error = PTR_ERR(bio);
778 bio = NULL;
779 goto out;
780 }
781next_page:
782 isect += PAGE_CACHE_SECTORS;
783 extent_length -= PAGE_CACHE_SECTORS;
784 }
785 if (last)
786 goto write_done;
787 }
788 bio = bl_submit_bio(WRITE, bio);
789 402
790 /* Middle pages */
791 pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
792 for (i = pg_index; i < header->page_array.npages; i++) { 403 for (i = pg_index; i < header->page_array.npages; i++) {
793 if (!extent_length) { 404 if (extent_length <= 0) {
794 /* We've used up the previous extent */ 405 /* We've used up the previous extent */
795 bl_put_extent(be);
796 bl_put_extent(cow_read);
797 bio = bl_submit_bio(WRITE, bio); 406 bio = bl_submit_bio(WRITE, bio);
798 /* Get the next one */ 407 /* Get the next one */
799 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 408 if (!ext_tree_lookup(bl, isect, &be, true)) {
800 isect, &cow_read);
801 if (!be || !is_writable(be, isect)) {
802 header->pnfs_error = -EINVAL; 409 header->pnfs_error = -EINVAL;
803 goto out; 410 goto out;
804 } 411 }
805 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
806 if (likely(!bl_push_one_short_extent(
807 be->be_inval)))
808 par->bse_count++;
809 else {
810 header->pnfs_error = -ENOMEM;
811 goto out;
812 }
813 }
814 extent_length = be->be_length -
815 (isect - be->be_f_offset);
816 }
817
818 dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
819 pg_offset = offset & ~PAGE_CACHE_MASK;
820 if (pg_offset + count > PAGE_CACHE_SIZE)
821 pg_len = PAGE_CACHE_SIZE - pg_offset;
822 else
823 pg_len = count;
824
825 saved_len = pg_len;
826 if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
827 !bl_is_sector_init(be->be_inval, isect)) {
828 ret = bl_read_partial_page_sync(pages[i], cow_read,
829 pg_offset, pg_len, true);
830 if (ret) {
831 dprintk("%s bl_read_partial_page_sync fail %d\n",
832 __func__, ret);
833 header->pnfs_error = ret;
834 goto out;
835 }
836
837 ret = bl_mark_sectors_init(be->be_inval, isect,
838 PAGE_CACHE_SECTORS);
839 if (unlikely(ret)) {
840 dprintk("%s bl_mark_sectors_init fail %d\n",
841 __func__, ret);
842 header->pnfs_error = ret;
843 goto out;
844 }
845 412
846 /* Expand to full page write */ 413 extent_length = be.be_length - (isect - be.be_f_offset);
847 pg_offset = 0;
848 pg_len = PAGE_CACHE_SIZE;
849 } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
850 (pg_len & (SECTOR_SIZE - 1))){
851 /* ahh, nasty case. We have to do sync full sector
852 * read-modify-write cycles.
853 */
854 unsigned int saved_offset = pg_offset;
855 ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
856 pg_len, false);
857 pg_offset = round_down(pg_offset, SECTOR_SIZE);
858 pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
859 - pg_offset;
860 } 414 }
861 415
862 416 pg_len = PAGE_CACHE_SIZE;
863 bio = do_add_page_to_bio(bio, header->page_array.npages - i, 417 bio = do_add_page_to_bio(bio, header->page_array.npages - i,
864 WRITE, 418 WRITE, isect, pages[i], &map, &be,
865 isect, pages[i], be,
866 bl_end_io_write, par, 419 bl_end_io_write, par,
867 pg_offset, pg_len); 420 0, &pg_len);
868 if (IS_ERR(bio)) { 421 if (IS_ERR(bio)) {
869 header->pnfs_error = PTR_ERR(bio); 422 header->pnfs_error = PTR_ERR(bio);
870 bio = NULL; 423 bio = NULL;
871 goto out; 424 goto out;
872 } 425 }
873 offset += saved_len;
874 count -= saved_len;
875 isect += PAGE_CACHE_SECTORS;
876 last_isect = isect;
877 extent_length -= PAGE_CACHE_SECTORS;
878 }
879 426
880 /* Last page inside INVALID extent */ 427 offset += pg_len;
881 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 428 count -= pg_len;
882 bio = bl_submit_bio(WRITE, bio); 429 isect += (pg_len >> SECTOR_SHIFT);
883 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; 430 extent_length -= (pg_len >> SECTOR_SHIFT);
884 npg_zero = npg_per_block - do_div(temp, npg_per_block);
885 if (npg_zero < npg_per_block) {
886 last = 1;
887 goto fill_invalid_ext;
888 }
889 } 431 }
890 432
891write_done:
892 header->res.count = header->args.count; 433 header->res.count = header->args.count;
893out: 434out:
894 bl_put_extent(be);
895 bl_put_extent(cow_read);
896 bl_submit_bio(WRITE, bio); 435 bl_submit_bio(WRITE, bio);
436 blk_finish_plug(&plug);
897 put_parallel(par); 437 put_parallel(par);
898 return PNFS_ATTEMPTED; 438 return PNFS_ATTEMPTED;
899out_mds:
900 bl_put_extent(be);
901 bl_put_extent(cow_read);
902 kfree(par);
903 return PNFS_NOT_ATTEMPTED;
904}
905
906/* FIXME - range ignored */
907static void
908release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
909{
910 int i;
911 struct pnfs_block_extent *be;
912
913 spin_lock(&bl->bl_ext_lock);
914 for (i = 0; i < EXTENT_LISTS; i++) {
915 while (!list_empty(&bl->bl_extents[i])) {
916 be = list_first_entry(&bl->bl_extents[i],
917 struct pnfs_block_extent,
918 be_node);
919 list_del(&be->be_node);
920 bl_put_extent(be);
921 }
922 }
923 spin_unlock(&bl->bl_ext_lock);
924}
925
926static void
927release_inval_marks(struct pnfs_inval_markings *marks)
928{
929 struct pnfs_inval_tracking *pos, *temp;
930 struct pnfs_block_short_extent *se, *stemp;
931
932 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
933 list_del(&pos->it_link);
934 kfree(pos);
935 }
936
937 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
938 list_del(&se->bse_node);
939 kfree(se);
940 }
941 return;
942} 439}
943 440
944static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) 441static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
945{ 442{
946 struct pnfs_block_layout *bl = BLK_LO2EXT(lo); 443 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
444 int err;
947 445
948 dprintk("%s enter\n", __func__); 446 dprintk("%s enter\n", __func__);
949 release_extents(bl, NULL); 447
950 release_inval_marks(&bl->bl_inval); 448 err = ext_tree_remove(bl, true, 0, LLONG_MAX);
449 WARN_ON(err);
450
951 kfree(bl); 451 kfree(bl);
952} 452}
953 453
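
Since the write path above always writes whole pages ("we always write out the whole page"), bl_write_cleanup() rounds the LAYOUTCOMMIT range outward to page boundaries before calling ext_tree_mark_written(). Worked through for a hypothetical 512-byte write at byte offset 4200 with 4 KiB pages:

	/* start = 4200 & ~4095                 = 4096
	 * end   = (4200 + 512 + 4095) & ~4095  = 8192
	 * ext_tree_mark_written(bl, 4096 >> 9 = sector 8,
	 *                       (8192 - 4096) >> 9 = 8 sectors)
	 */
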
@@ -960,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
960 bl = kzalloc(sizeof(*bl), gfp_flags); 460 bl = kzalloc(sizeof(*bl), gfp_flags);
961 if (!bl) 461 if (!bl)
962 return NULL; 462 return NULL;
463
464 bl->bl_ext_rw = RB_ROOT;
465 bl->bl_ext_ro = RB_ROOT;
963 spin_lock_init(&bl->bl_ext_lock); 466 spin_lock_init(&bl->bl_ext_lock);
964 INIT_LIST_HEAD(&bl->bl_extents[0]); 467
965 INIT_LIST_HEAD(&bl->bl_extents[1]);
966 INIT_LIST_HEAD(&bl->bl_commit);
967 INIT_LIST_HEAD(&bl->bl_committing);
968 bl->bl_count = 0;
969 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
970 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
971 return &bl->bl_layout; 468 return &bl->bl_layout;
972} 469}
973 470
@@ -977,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
977 kfree(lseg); 474 kfree(lseg);
978} 475}
979 476
980/* We pretty much ignore lseg, and store all data layout wide, so we 477/* Tracks info needed to ensure extents in layout obey constraints of spec */
981 * can correctly merge. 478struct layout_verification {
982 */ 479 u32 mode; /* R or RW */
983static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, 480 u64 start; /* Expected start of next non-COW extent */
984 struct nfs4_layoutget_res *lgr, 481 u64 inval; /* Start of INVAL coverage */
985 gfp_t gfp_flags) 482 u64 cowread; /* End of COW read coverage */
986{ 483};
987 struct pnfs_layout_segment *lseg;
988 int status;
989 484
990 dprintk("%s enter\n", __func__); 485/* Verify the extent meets the layout requirements of the pnfs-block draft,
991 lseg = kzalloc(sizeof(*lseg), gfp_flags); 486 * section 2.3.1.
992 if (!lseg) 487 */
993 return ERR_PTR(-ENOMEM); 488static int verify_extent(struct pnfs_block_extent *be,
994 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); 489 struct layout_verification *lv)
995 if (status) { 490{
996 /* We don't want to call the full-blown bl_free_lseg, 491 if (lv->mode == IOMODE_READ) {
997 * since on error extents were not touched. 492 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
998 */ 493 be->be_state == PNFS_BLOCK_INVALID_DATA)
999 kfree(lseg); 494 return -EIO;
1000 return ERR_PTR(status); 495 if (be->be_f_offset != lv->start)
496 return -EIO;
497 lv->start += be->be_length;
498 return 0;
1001 } 499 }
1002 return lseg; 500 /* lv->mode == IOMODE_RW */
501 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
502 if (be->be_f_offset != lv->start)
503 return -EIO;
504 if (lv->cowread > lv->start)
505 return -EIO;
506 lv->start += be->be_length;
507 lv->inval = lv->start;
508 return 0;
509 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
510 if (be->be_f_offset != lv->start)
511 return -EIO;
512 lv->start += be->be_length;
513 return 0;
514 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
515 if (be->be_f_offset > lv->start)
516 return -EIO;
517 if (be->be_f_offset < lv->inval)
518 return -EIO;
519 if (be->be_f_offset < lv->cowread)
520 return -EIO;
521 /* It looks like you might want to min this with lv->start,
522 * but you really don't.
523 */
524 lv->inval = lv->inval + be->be_length;
525 lv->cowread = be->be_f_offset + be->be_length;
526 return 0;
527 } else
528 return -EIO;
1003} 529}
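
verify_extent() enforces the draft's layout invariants: RW and INVALID extents must tile the requested range contiguously (lv->start tracks the expected next file offset), while READ_DATA extents may only supply COW source data for INVALID ranges already seen (tracked via lv->inval and lv->cowread). A hypothetical RW layout over a 300-sector range, in the order received, that verifies cleanly:

	/* INVALID_DATA   f_offset=0,   length=100 -> lv.start = 100
	 * READ_DATA      f_offset=0,   length=100 -> lv.inval = 100, lv.cowread = 100
	 * READWRITE_DATA f_offset=100, length=200 -> lv.start = 300, lv.inval = 300
	 * Placing the READ_DATA extent after the READWRITE one instead would
	 * fail the be_f_offset >= lv->inval check and return -EIO.
	 */
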
1004 530
1005static void 531static int decode_sector_number(__be32 **rp, sector_t *sp)
1006bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
1007 const struct nfs4_layoutcommit_args *arg)
1008{ 532{
1009 dprintk("%s enter\n", __func__); 533 uint64_t s;
1010 encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); 534
535 *rp = xdr_decode_hyper(*rp, &s);
536 if (s & 0x1ff) {
537 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
538 return -1;
539 }
540 *sp = s >> SECTOR_SHIFT;
541 return 0;
1011} 542}
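
decode_sector_number() converts an on-the-wire byte value to 512-byte sectors and rejects anything not sector-aligned; the 0x1ff mask tests the low nine bits. For example:

	/* s = 1048576 (1 MiB): s & 0x1ff == 0 -> *sp = 1048576 >> 9 = 2048
	 * s = 1048577:         s & 0x1ff != 0 -> -1 (sector not aligned)   */
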
1012 543
1013static void 544static int
1014bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) 545bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
546 struct layout_verification *lv, struct list_head *extents,
547 gfp_t gfp_mask)
1015{ 548{
1016 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; 549 struct pnfs_block_extent *be;
550 struct nfs4_deviceid id;
551 int error;
552 __be32 *p;
1017 553
1018 dprintk("%s enter\n", __func__); 554 p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
1019 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); 555 if (!p)
1020} 556 return -EIO;
1021 557
1022static void free_blk_mountid(struct block_mount_id *mid) 558 be = kzalloc(sizeof(*be), GFP_NOFS);
1023{ 559 if (!be)
1024 if (mid) { 560 return -ENOMEM;
1025 struct pnfs_block_dev *dev, *tmp;
1026 561
1027 /* No need to take bm_lock as we are last user freeing bm_devlist */ 562 memcpy(&id, p, NFS4_DEVICEID4_SIZE);
1028 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { 563 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
1029 list_del(&dev->bm_node); 564
1030 bl_free_block_dev(dev); 565 error = -EIO;
1031 } 566 be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
1032 kfree(mid); 567 lo->plh_lc_cred, gfp_mask);
568 if (!be->be_device)
569 goto out_free_be;
570
571 /*
572 * The next three values are read in as bytes, but stored in the
573 * extent structure in 512-byte granularity.
574 */
575 if (decode_sector_number(&p, &be->be_f_offset) < 0)
576 goto out_put_deviceid;
577 if (decode_sector_number(&p, &be->be_length) < 0)
578 goto out_put_deviceid;
579 if (decode_sector_number(&p, &be->be_v_offset) < 0)
580 goto out_put_deviceid;
581 be->be_state = be32_to_cpup(p++);
582
583 error = verify_extent(be, lv);
584 if (error) {
585 dprintk("%s: extent verification failed\n", __func__);
586 goto out_put_deviceid;
1033 } 587 }
588
589 list_add_tail(&be->be_list, extents);
590 return 0;
591
592out_put_deviceid:
593 nfs4_put_deviceid_node(be->be_device);
594out_free_be:
595 kfree(be);
596 return error;
1034} 597}
1035 598
1036/* This is mostly copied from the filelayout_get_device_info function. 599static struct pnfs_layout_segment *
1037 * It seems much of this should be at the generic pnfs level. 600bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
1038 */ 601 gfp_t gfp_mask)
1039static struct pnfs_block_dev *
1040nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1041 struct nfs4_deviceid *d_id)
1042{ 602{
1043 struct pnfs_device *dev; 603 struct layout_verification lv = {
1044 struct pnfs_block_dev *rv; 604 .mode = lgr->range.iomode,
1045 u32 max_resp_sz; 605 .start = lgr->range.offset >> SECTOR_SHIFT,
1046 int max_pages; 606 .inval = lgr->range.offset >> SECTOR_SHIFT,
1047 struct page **pages = NULL; 607 .cowread = lgr->range.offset >> SECTOR_SHIFT,
1048 int i, rc; 608 };
609 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
610 struct pnfs_layout_segment *lseg;
611 struct xdr_buf buf;
612 struct xdr_stream xdr;
613 struct page *scratch;
614 int status, i;
615 uint32_t count;
616 __be32 *p;
617 LIST_HEAD(extents);
618
619 dprintk("---> %s\n", __func__);
620
621 lseg = kzalloc(sizeof(*lseg), gfp_mask);
622 if (!lseg)
623 return ERR_PTR(-ENOMEM);
624
625 status = -ENOMEM;
626 scratch = alloc_page(gfp_mask);
627 if (!scratch)
628 goto out;
629
630 xdr_init_decode_pages(&xdr, &buf,
631 lgr->layoutp->pages, lgr->layoutp->len);
632 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
633
634 status = -EIO;
635 p = xdr_inline_decode(&xdr, 4);
636 if (unlikely(!p))
637 goto out_free_scratch;
638
639 count = be32_to_cpup(p++);
640 dprintk("%s: number of extents %d\n", __func__, count);
1049 641
1050 /* 642 /*
1051 * Use the session max response size as the basis for setting 643 * Decode individual extents, putting them in temporary staging area
1052 * GETDEVICEINFO's maxcount 644 * until whole layout is decoded to make error recovery easier.
1053 */ 645 */
1054 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 646 for (i = 0; i < count; i++) {
1055 max_pages = nfs_page_array_len(0, max_resp_sz); 647 status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
1056 dprintk("%s max_resp_sz %u max_pages %d\n", 648 if (status)
1057 __func__, max_resp_sz, max_pages); 649 goto process_extents;
1058
1059 dev = kmalloc(sizeof(*dev), GFP_NOFS);
1060 if (!dev) {
1061 dprintk("%s kmalloc failed\n", __func__);
1062 return ERR_PTR(-ENOMEM);
1063 } 650 }
1064 651
1065 pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS); 652 if (lgr->range.offset + lgr->range.length !=
1066 if (pages == NULL) { 653 lv.start << SECTOR_SHIFT) {
1067 kfree(dev); 654 dprintk("%s Final length mismatch\n", __func__);
1068 return ERR_PTR(-ENOMEM); 655 status = -EIO;
656 goto process_extents;
1069 } 657 }
1070 for (i = 0; i < max_pages; i++) { 658
1071 pages[i] = alloc_page(GFP_NOFS); 659 if (lv.start < lv.cowread) {
1072 if (!pages[i]) { 660 dprintk("%s Final uncovered COW extent\n", __func__);
1073 rv = ERR_PTR(-ENOMEM); 661 status = -EIO;
1074 goto out_free;
1075 }
1076 } 662 }
1077 663
1078 memcpy(&dev->dev_id, d_id, sizeof(*d_id)); 664process_extents:
1079 dev->layout_type = LAYOUT_BLOCK_VOLUME; 665 while (!list_empty(&extents)) {
1080 dev->pages = pages; 666 struct pnfs_block_extent *be =
1081 dev->pgbase = 0; 667 list_first_entry(&extents, struct pnfs_block_extent,
1082 dev->pglen = PAGE_SIZE * max_pages; 668 be_list);
1083 dev->mincount = 0; 669 list_del(&be->be_list);
1084 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; 670
1085 671 if (!status)
1086 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 672 status = ext_tree_insert(bl, be);
1087 rc = nfs4_proc_getdeviceinfo(server, dev, NULL); 673
1088 dprintk("%s getdevice info returns %d\n", __func__, rc); 674 if (status) {
1089 if (rc) { 675 nfs4_put_deviceid_node(be->be_device);
1090 rv = ERR_PTR(rc); 676 kfree(be);
1091 goto out_free; 677 }
1092 } 678 }
1093 679
1094 rv = nfs4_blk_decode_device(server, dev); 680out_free_scratch:
1095 out_free: 681 __free_page(scratch);
1096 for (i = 0; i < max_pages; i++) 682out:
1097 __free_page(pages[i]); 683 dprintk("%s returns %d\n", __func__, status);
1098 kfree(pages); 684 if (status) {
1099 kfree(dev); 685 kfree(lseg);
1100 return rv; 686 return ERR_PTR(status);
687 }
688 return lseg;
1101} 689}
1102 690
1103static int 691static void
1104bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) 692bl_return_range(struct pnfs_layout_hdr *lo,
693 struct pnfs_layout_range *range)
1105{ 694{
1106 struct block_mount_id *b_mt_id = NULL; 695 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
1107 struct pnfs_devicelist *dlist = NULL; 696 sector_t offset = range->offset >> SECTOR_SHIFT, end;
1108 struct pnfs_block_dev *bdev;
1109 LIST_HEAD(block_disklist);
1110 int status, i;
1111
1112 dprintk("%s enter\n", __func__);
1113 697
1114 if (server->pnfs_blksize == 0) { 698 if (range->offset % 8) {
1115 dprintk("%s Server did not return blksize\n", __func__); 699 dprintk("%s: offset %lld not block size aligned\n",
1116 return -EINVAL; 700 __func__, range->offset);
1117 } 701 return;
1118 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
1119 if (!b_mt_id) {
1120 status = -ENOMEM;
1121 goto out_error;
1122 }
1123 /* Initialize nfs4 block layout mount id */
1124 spin_lock_init(&b_mt_id->bm_lock);
1125 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
1126
1127 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
1128 if (!dlist) {
1129 status = -ENOMEM;
1130 goto out_error;
1131 } 702 }
1132 dlist->eof = 0; 703
1133 while (!dlist->eof) { 704 if (range->length != NFS4_MAX_UINT64) {
1134 status = nfs4_proc_getdevicelist(server, fh, dlist); 705 if (range->length % 8) {
1135 if (status) 706 dprintk("%s: length %lld not block size aligned\n",
1136 goto out_error; 707 __func__, range->length);
1137 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", 708 return;
1138 __func__, dlist->num_devs, dlist->eof);
1139 for (i = 0; i < dlist->num_devs; i++) {
1140 bdev = nfs4_blk_get_deviceinfo(server, fh,
1141 &dlist->dev_id[i]);
1142 if (IS_ERR(bdev)) {
1143 status = PTR_ERR(bdev);
1144 goto out_error;
1145 }
1146 spin_lock(&b_mt_id->bm_lock);
1147 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
1148 spin_unlock(&b_mt_id->bm_lock);
1149 } 709 }
1150 }
1151 dprintk("%s SUCCESS\n", __func__);
1152 server->pnfs_ld_data = b_mt_id;
1153 710
1154 out_return: 711 end = offset + (range->length >> SECTOR_SHIFT);
1155 kfree(dlist); 712 } else {
1156 return status; 713 end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
714 }
1157 715
1158 out_error: 716 ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
1159 free_blk_mountid(b_mt_id);
1160 goto out_return;
1161} 717}
1162 718
1163static int 719static int
1164bl_clear_layoutdriver(struct nfs_server *server) 720bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
721{
722 return ext_tree_prepare_commit(arg);
723}
724
725static void
726bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
1165{ 727{
1166 struct block_mount_id *b_mt_id = server->pnfs_ld_data; 728 ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
729}
1167 730
731static int
732bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
733{
1168 dprintk("%s enter\n", __func__); 734 dprintk("%s enter\n", __func__);
1169 free_blk_mountid(b_mt_id); 735
1170 dprintk("%s RETURNS\n", __func__); 736 if (server->pnfs_blksize == 0) {
737 dprintk("%s Server did not return blksize\n", __func__);
738 return -EINVAL;
739 }
740 if (server->pnfs_blksize > PAGE_SIZE) {
741 printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
742 __func__, server->pnfs_blksize);
743 return -EINVAL;
744 }
745
1171 return 0; 746 return 0;
1172} 747}
1173 748
1174static bool 749static bool
1175is_aligned_req(struct nfs_page *req, unsigned int alignment) 750is_aligned_req(struct nfs_pageio_descriptor *pgio,
751 struct nfs_page *req, unsigned int alignment)
1176{ 752{
1177 return IS_ALIGNED(req->wb_offset, alignment) && 753 /*
1178 IS_ALIGNED(req->wb_bytes, alignment); 754 * Always accept buffered writes, higher layers take care of the
755 * right alignment.
756 */
757 if (pgio->pg_dreq == NULL)
758 return true;
759
760 if (!IS_ALIGNED(req->wb_offset, alignment))
761 return false;
762
763 if (IS_ALIGNED(req->wb_bytes, alignment))
764 return true;
765
766 if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
767 /*
768 * If the write goes up to the inode size, just write
769 * the full page. Data past the inode size is
770 * guaranteed to be zeroed by the higher level client
771 * code, and this behaviour is mandated by RFC 5663
772 * section 2.3.2.
773 */
774 return true;
775 }
776
777 return false;
1179} 778}
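
is_aligned_req() now takes the pageio descriptor so it can accept all buffered I/O outright and, for direct I/O, tolerate an unaligned length when the request ends exactly at i_size, relying on RFC 5663 section 2.3.2's guarantee that data past EOF reads as zero. A numeric illustration (values hypothetical):

	/* Direct write, 4 KiB pages, i_size = 10240:
	 *   req_offset = 8192, wb_bytes = 2048
	 *   8192 + 2048 == i_size -> accepted; the full final page is written.
	 * The same request with i_size = 16384 fails the test and falls
	 * back to the MDS via nfs_pageio_reset_write_mds().
	 */
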
1180 779
1181static void 780static void
1182bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 781bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1183{ 782{
1184 if (pgio->pg_dreq != NULL && 783 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
1185 !is_aligned_req(req, SECTOR_SIZE))
1186 nfs_pageio_reset_read_mds(pgio); 784 nfs_pageio_reset_read_mds(pgio);
1187 else 785 return;
1188 pnfs_generic_pg_init_read(pgio, req); 786 }
787
788 pnfs_generic_pg_init_read(pgio, req);
1189} 789}
1190 790
1191/* 791/*
@@ -1196,10 +796,8 @@ static size_t
1196bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 796bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1197 struct nfs_page *req) 797 struct nfs_page *req)
1198{ 798{
1199 if (pgio->pg_dreq != NULL && 799 if (!is_aligned_req(pgio, req, SECTOR_SIZE))
1200 !is_aligned_req(req, SECTOR_SIZE))
1201 return 0; 800 return 0;
1202
1203 return pnfs_generic_pg_test(pgio, prev, req); 801 return pnfs_generic_pg_test(pgio, prev, req);
1204} 802}
1205 803
@@ -1229,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
1229static void 827static void
1230bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 828bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1231{ 829{
1232 if (pgio->pg_dreq != NULL && 830 u64 wb_size;
1233 !is_aligned_req(req, PAGE_CACHE_SIZE)) { 831
832 if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
1234 nfs_pageio_reset_write_mds(pgio); 833 nfs_pageio_reset_write_mds(pgio);
1235 } else { 834 return;
1236 u64 wb_size;
1237 if (pgio->pg_dreq == NULL)
1238 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
1239 req->wb_index);
1240 else
1241 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1242
1243 pnfs_generic_pg_init_write(pgio, req, wb_size);
1244 } 835 }
836
837 if (pgio->pg_dreq == NULL)
838 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
839 req->wb_index);
840 else
841 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
842
843 pnfs_generic_pg_init_write(pgio, req, wb_size);
1245} 844}
1246 845
1247/* 846/*
@@ -1252,10 +851,8 @@ static size_t
1252bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 851bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1253 struct nfs_page *req) 852 struct nfs_page *req)
1254{ 853{
1255 if (pgio->pg_dreq != NULL && 854 if (!is_aligned_req(pgio, req, PAGE_SIZE))
1256 !is_aligned_req(req, PAGE_CACHE_SIZE))
1257 return 0; 855 return 0;
1258
1259 return pnfs_generic_pg_test(pgio, prev, req); 856 return pnfs_generic_pg_test(pgio, prev, req);
1260} 857}
1261 858
@@ -1275,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
1275 .id = LAYOUT_BLOCK_VOLUME, 872 .id = LAYOUT_BLOCK_VOLUME,
1276 .name = "LAYOUT_BLOCK_VOLUME", 873 .name = "LAYOUT_BLOCK_VOLUME",
1277 .owner = THIS_MODULE, 874 .owner = THIS_MODULE,
875 .flags = PNFS_LAYOUTRET_ON_SETATTR |
876 PNFS_READ_WHOLE_PAGE,
1278 .read_pagelist = bl_read_pagelist, 877 .read_pagelist = bl_read_pagelist,
1279 .write_pagelist = bl_write_pagelist, 878 .write_pagelist = bl_write_pagelist,
1280 .alloc_layout_hdr = bl_alloc_layout_hdr, 879 .alloc_layout_hdr = bl_alloc_layout_hdr,
1281 .free_layout_hdr = bl_free_layout_hdr, 880 .free_layout_hdr = bl_free_layout_hdr,
1282 .alloc_lseg = bl_alloc_lseg, 881 .alloc_lseg = bl_alloc_lseg,
1283 .free_lseg = bl_free_lseg, 882 .free_lseg = bl_free_lseg,
1284 .encode_layoutcommit = bl_encode_layoutcommit, 883 .return_range = bl_return_range,
884 .prepare_layoutcommit = bl_prepare_layoutcommit,
1285 .cleanup_layoutcommit = bl_cleanup_layoutcommit, 885 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
1286 .set_layoutdriver = bl_set_layoutdriver, 886 .set_layoutdriver = bl_set_layoutdriver,
1287 .clear_layoutdriver = bl_clear_layoutdriver, 887 .alloc_deviceid_node = bl_alloc_deviceid_node,
888 .free_deviceid_node = bl_free_deviceid_node,
1288 .pg_read_ops = &bl_pg_read_ops, 889 .pg_read_ops = &bl_pg_read_ops,
1289 .pg_write_ops = &bl_pg_write_ops, 890 .pg_write_ops = &bl_pg_write_ops,
1290}; 891};
1291 892
1292static const struct rpc_pipe_ops bl_upcall_ops = {
1293 .upcall = rpc_pipe_generic_upcall,
1294 .downcall = bl_pipe_downcall,
1295 .destroy_msg = bl_pipe_destroy_msg,
1296};
1297
1298static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
1299 struct rpc_pipe *pipe)
1300{
1301 struct dentry *dir, *dentry;
1302
1303 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
1304 if (dir == NULL)
1305 return ERR_PTR(-ENOENT);
1306 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
1307 dput(dir);
1308 return dentry;
1309}
1310
1311static void nfs4blocklayout_unregister_sb(struct super_block *sb,
1312 struct rpc_pipe *pipe)
1313{
1314 if (pipe->dentry)
1315 rpc_unlink(pipe->dentry);
1316}
1317
1318static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
1319 void *ptr)
1320{
1321 struct super_block *sb = ptr;
1322 struct net *net = sb->s_fs_info;
1323 struct nfs_net *nn = net_generic(net, nfs_net_id);
1324 struct dentry *dentry;
1325 int ret = 0;
1326
1327 if (!try_module_get(THIS_MODULE))
1328 return 0;
1329
1330 if (nn->bl_device_pipe == NULL) {
1331 module_put(THIS_MODULE);
1332 return 0;
1333 }
1334
1335 switch (event) {
1336 case RPC_PIPEFS_MOUNT:
1337 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
1338 if (IS_ERR(dentry)) {
1339 ret = PTR_ERR(dentry);
1340 break;
1341 }
1342 nn->bl_device_pipe->dentry = dentry;
1343 break;
1344 case RPC_PIPEFS_UMOUNT:
1345 if (nn->bl_device_pipe->dentry)
1346 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
1347 break;
1348 default:
1349 ret = -ENOTSUPP;
1350 break;
1351 }
1352 module_put(THIS_MODULE);
1353 return ret;
1354}
1355
1356static struct notifier_block nfs4blocklayout_block = {
1357 .notifier_call = rpc_pipefs_event,
1358};
1359
1360static struct dentry *nfs4blocklayout_register_net(struct net *net,
1361 struct rpc_pipe *pipe)
1362{
1363 struct super_block *pipefs_sb;
1364 struct dentry *dentry;
1365
1366 pipefs_sb = rpc_get_sb_net(net);
1367 if (!pipefs_sb)
1368 return NULL;
1369 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1370 rpc_put_sb_net(net);
1371 return dentry;
1372}
1373
1374static void nfs4blocklayout_unregister_net(struct net *net,
1375 struct rpc_pipe *pipe)
1376{
1377 struct super_block *pipefs_sb;
1378
1379 pipefs_sb = rpc_get_sb_net(net);
1380 if (pipefs_sb) {
1381 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
1382 rpc_put_sb_net(net);
1383 }
1384}
1385
1386static int nfs4blocklayout_net_init(struct net *net)
1387{
1388 struct nfs_net *nn = net_generic(net, nfs_net_id);
1389 struct dentry *dentry;
1390
1391 init_waitqueue_head(&nn->bl_wq);
1392 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1393 if (IS_ERR(nn->bl_device_pipe))
1394 return PTR_ERR(nn->bl_device_pipe);
1395 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1396 if (IS_ERR(dentry)) {
1397 rpc_destroy_pipe_data(nn->bl_device_pipe);
1398 return PTR_ERR(dentry);
1399 }
1400 nn->bl_device_pipe->dentry = dentry;
1401 return 0;
1402}
1403
1404static void nfs4blocklayout_net_exit(struct net *net)
1405{
1406 struct nfs_net *nn = net_generic(net, nfs_net_id);
1407
1408 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1409 rpc_destroy_pipe_data(nn->bl_device_pipe);
1410 nn->bl_device_pipe = NULL;
1411}
1412
1413static struct pernet_operations nfs4blocklayout_net_ops = {
1414 .init = nfs4blocklayout_net_init,
1415 .exit = nfs4blocklayout_net_exit,
1416};
1417
1418static int __init nfs4blocklayout_init(void) 893static int __init nfs4blocklayout_init(void)
1419{ 894{
1420 int ret; 895 int ret;
@@ -1424,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
1424 ret = pnfs_register_layoutdriver(&blocklayout_type); 899 ret = pnfs_register_layoutdriver(&blocklayout_type);
1425 if (ret) 900 if (ret)
1426 goto out; 901 goto out;
1427 902 ret = bl_init_pipefs();
1428 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
1429 if (ret) 903 if (ret)
1430 goto out_remove; 904 goto out_unregister;
1431 ret = register_pernet_subsys(&nfs4blocklayout_net_ops); 905 return 0;
1432 if (ret)
1433 goto out_notifier;
1434out:
1435 return ret;
1436 906
1437out_notifier: 907out_unregister:
1438 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1439out_remove:
1440 pnfs_unregister_layoutdriver(&blocklayout_type); 908 pnfs_unregister_layoutdriver(&blocklayout_type);
909out:
1441 return ret; 910 return ret;
1442} 911}
1443 912
@@ -1446,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
1446 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 915 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1447 __func__); 916 __func__);
1448 917
1449 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); 918 bl_cleanup_pipefs();
1450 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
1451 pnfs_unregister_layoutdriver(&blocklayout_type); 919 pnfs_unregister_layoutdriver(&blocklayout_type);
1452} 920}
1453 921
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
45#define SECTOR_SIZE (1 << SECTOR_SHIFT) 45#define SECTOR_SIZE (1 << SECTOR_SHIFT)
46 46
47struct block_mount_id { 47struct pnfs_block_dev;
48 spinlock_t bm_lock; /* protects list */
49 struct list_head bm_devlist; /* holds pnfs_block_dev */
50};
51 48
52struct pnfs_block_dev { 49enum pnfs_block_volume_type {
53 struct list_head bm_node; 50 PNFS_BLOCK_VOLUME_SIMPLE = 0,
54 struct nfs4_deviceid bm_mdevid; /* associated devid */ 51 PNFS_BLOCK_VOLUME_SLICE = 1,
55 struct block_device *bm_mdev; /* meta device itself */ 52 PNFS_BLOCK_VOLUME_CONCAT = 2,
56 struct net *net; 53 PNFS_BLOCK_VOLUME_STRIPE = 3,
57}; 54};
58 55
59enum exstate4 { 56#define PNFS_BLOCK_MAX_UUIDS 4
60 PNFS_BLOCK_READWRITE_DATA = 0, 57#define PNFS_BLOCK_MAX_DEVICES 64
61 PNFS_BLOCK_READ_DATA = 1, 58
62 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ 59/*
63 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ 60 * Random upper cap for the uuid length to avoid unbounded allocation.
61 * Not actually limited by the protocol.
62 */
63#define PNFS_BLOCK_UUID_LEN 128
64
65
66struct pnfs_block_volume {
67 enum pnfs_block_volume_type type;
68 union {
69 struct {
70 int len;
71 int nr_sigs;
72 struct {
73 u64 offset;
74 u32 sig_len;
75 u8 sig[PNFS_BLOCK_UUID_LEN];
76 } sigs[PNFS_BLOCK_MAX_UUIDS];
77 } simple;
78 struct {
79 u64 start;
80 u64 len;
81 u32 volume;
82 } slice;
83 struct {
84 u32 volumes_count;
85 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
86 } concat;
87 struct {
88 u64 chunk_size;
89 u32 volumes_count;
90 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
91 } stripe;
92 };
64}; 93};
65 94
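
A device description is therefore a flat array of pnfs_block_volume entries
in which the composite types (slice, concat, stripe) refer back to earlier
entries by index; bl_alloc_deviceid_node() in dev.c parses it starting from
the last entry, which acts as the root. A hypothetical two-entry
description, for illustration only:

	/* Hypothetical example: a 1 GiB slice of a simple volume. The
	 * composite entry names the simple one by array index, and the
	 * last entry (volumes[1]) is the root the parser starts from. */
	struct pnfs_block_volume volumes[2] = {
		[0] = { .type = PNFS_BLOCK_VOLUME_SIMPLE },
		[1] = {
			.type = PNFS_BLOCK_VOLUME_SLICE,
			.slice = {
				.start  = 0,
				.len    = 1024ULL * 1024 * 1024,
				.volume = 0,	/* index of entry [0] */
			},
		},
	};
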
66#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ 95struct pnfs_block_dev_map {
96 sector_t start;
97 sector_t len;
67 98
68struct my_tree { 99 sector_t disk_offset;
69 sector_t mtt_step_size; /* Internal sector alignment */ 100 struct block_device *bdev;
70 struct list_head mtt_stub; /* Should be a radix tree */
71}; 101};
72 102
73struct pnfs_inval_markings { 103struct pnfs_block_dev {
74 spinlock_t im_lock; 104 struct nfs4_deviceid_node node;
75 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 105
76 sector_t im_block_size; /* Server blocksize in sectors */ 106 u64 start;
77 struct list_head im_extents; /* Short extents for INVAL->RW conversion */ 107 u64 len;
108
109 u32 nr_children;
110 struct pnfs_block_dev *children;
111 u64 chunk_size;
112
113 struct block_device *bdev;
114 u64 disk_offset;
115
116 bool (*map)(struct pnfs_block_dev *dev, u64 offset,
117 struct pnfs_block_dev_map *map);
78}; 118};
79 119
80struct pnfs_inval_tracking { 120enum exstate4 {
81 struct list_head it_link; 121 PNFS_BLOCK_READWRITE_DATA = 0,
82 int it_sector; 122 PNFS_BLOCK_READ_DATA = 1,
83 int it_tags; 123 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
124 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
84}; 125};
85 126
86/* sector_t fields are all in 512-byte sectors */ 127/* sector_t fields are all in 512-byte sectors */
87struct pnfs_block_extent { 128struct pnfs_block_extent {
88 struct kref be_refcnt; 129 union {
89 struct list_head be_node; /* link into lseg list */ 130 struct rb_node be_node;
90 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ 131 struct list_head be_list;
91 struct block_device *be_mdev; 132 };
133 struct nfs4_deviceid_node *be_device;
92 sector_t be_f_offset; /* the starting offset in the file */ 134 sector_t be_f_offset; /* the starting offset in the file */
93 sector_t be_length; /* the size of the extent */ 135 sector_t be_length; /* the size of the extent */
94 sector_t be_v_offset; /* the starting offset in the volume */ 136 sector_t be_v_offset; /* the starting offset in the volume */
95 enum exstate4 be_state; /* the state of this extent */ 137 enum exstate4 be_state; /* the state of this extent */
96 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ 138#define EXTENT_WRITTEN 1
139#define EXTENT_COMMITTING 2
140 unsigned int be_tag;
97}; 141};
98 142
99/* Shortened extent used by LAYOUTCOMMIT */ 143/* on the wire size of the extent */
100struct pnfs_block_short_extent { 144#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
101 struct list_head bse_node;
102 struct nfs4_deviceid bse_devid;
103 struct block_device *bse_mdev;
104 sector_t bse_f_offset; /* the starting offset in the file */
105 sector_t bse_length; /* the size of the extent */
106};
107
108static inline void
109BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
110{
111 spin_lock_init(&marks->im_lock);
112 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
113 INIT_LIST_HEAD(&marks->im_extents);
114 marks->im_block_size = blocksize;
115 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
116 blocksize);
117}
118
119enum extentclass4 {
120 	RW_EXTENT = 0, /* READWRITE and INVAL */
121 RO_EXTENT = 1, /* READ and NONE */
122 EXTENT_LISTS = 2,
123};
124
125static inline int bl_choose_list(enum exstate4 state)
126{
127 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
128 return RO_EXTENT;
129 else
130 return RW_EXTENT;
131}
132 145
133struct pnfs_block_layout { 146struct pnfs_block_layout {
134 struct pnfs_layout_hdr bl_layout; 147 struct pnfs_layout_hdr bl_layout;
135 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ 148 struct rb_root bl_ext_rw;
149 struct rb_root bl_ext_ro;
136 spinlock_t bl_ext_lock; /* Protects list manipulation */ 150 spinlock_t bl_ext_lock; /* Protects list manipulation */
137 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
138 struct list_head bl_commit; /* Needs layout commit */
139 struct list_head bl_committing; /* Layout committing */
140 unsigned int bl_count; /* entries in bl_commit */
141 sector_t bl_blocksize; /* Server blocksize in sectors */
142}; 151};
143 152
144#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
145
146static inline struct pnfs_block_layout * 153static inline struct pnfs_block_layout *
147BLK_LO2EXT(struct pnfs_layout_hdr *lo) 154BLK_LO2EXT(struct pnfs_layout_hdr *lo)
148{ 155{
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
171#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ 178#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
172#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ 179#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
173 180
174/* blocklayoutdev.c */ 181/* dev.c */
175ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); 182struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
176void bl_pipe_destroy_msg(struct rpc_pipe_msg *); 183 struct pnfs_device *pdev, gfp_t gfp_mask);
177void nfs4_blkdev_put(struct block_device *bdev); 184void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
178struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, 185
179 struct pnfs_device *dev); 186/* extent_tree.c */
180int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, 187int ext_tree_insert(struct pnfs_block_layout *bl,
181 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 188 struct pnfs_block_extent *new);
182 189int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
183/* blocklayoutdm.c */ 190 sector_t end);
184void bl_free_block_dev(struct pnfs_block_dev *bdev); 191int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
185 192 sector_t len);
186/* extents.c */ 193bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
187struct pnfs_block_extent * 194 struct pnfs_block_extent *ret, bool rw);
188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 195int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
189 struct pnfs_block_extent **cow_read); 196void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
190int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 197
191 sector_t offset, sector_t length); 198/* rpc_pipefs.c */
192void bl_put_extent(struct pnfs_block_extent *be); 199dev_t bl_resolve_deviceid(struct nfs_server *server,
193struct pnfs_block_extent *bl_alloc_extent(void); 200 struct pnfs_block_volume *b, gfp_t gfp_mask);
194int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 201int __init bl_init_pipefs(void);
195int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 202void __exit bl_cleanup_pipefs(void);
196 struct xdr_stream *xdr,
197 const struct nfs4_layoutcommit_args *arg);
198void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
199 const struct nfs4_layoutcommit_args *arg,
200 int status);
201int bl_add_merge_extent(struct pnfs_block_layout *bl,
202 struct pnfs_block_extent *new);
203int bl_mark_for_commit(struct pnfs_block_extent *be,
204 sector_t offset, sector_t length,
205 struct pnfs_block_short_extent *new);
206int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
207struct pnfs_block_short_extent *
208bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
209void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
210 203
211#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 204#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c9361..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 file layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/*
57 * Release the block device
58 */
59void nfs4_blkdev_put(struct block_device *bdev)
60{
61 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
62 MINOR(bdev->bd_dev));
63 blkdev_put(bdev, FMODE_READ);
64}
65
66ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
67 size_t mlen)
68{
69 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
70 nfs_net_id);
71
72 if (mlen != sizeof (struct bl_dev_msg))
73 return -EINVAL;
74
75 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
76 return -EFAULT;
77
78 wake_up(&nn->bl_wq);
79
80 return mlen;
81}
82
83void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
84{
85 struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
86
87 if (msg->errno >= 0)
88 return;
89 wake_up(bl_pipe_msg->bl_wq);
90}
91
92/*
93 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
94 */
95struct pnfs_block_dev *
96nfs4_blk_decode_device(struct nfs_server *server,
97 struct pnfs_device *dev)
98{
99 struct pnfs_block_dev *rv;
100 struct block_device *bd = NULL;
101 struct bl_pipe_msg bl_pipe_msg;
102 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
103 struct bl_msg_hdr bl_msg = {
104 .type = BL_DEVICE_MOUNT,
105 .totallen = dev->mincount,
106 };
107 uint8_t *dataptr;
108 DECLARE_WAITQUEUE(wq, current);
109 int offset, len, i, rc;
110 struct net *net = server->nfs_client->cl_net;
111 struct nfs_net *nn = net_generic(net, nfs_net_id);
112 struct bl_dev_msg *reply = &nn->bl_mount_reply;
113
114 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
115 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
116 dev->mincount);
117
118 bl_pipe_msg.bl_wq = &nn->bl_wq;
119 memset(msg, 0, sizeof(*msg));
120 msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
121 if (!msg->data) {
122 rv = ERR_PTR(-ENOMEM);
123 goto out;
124 }
125
126 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
127 dataptr = (uint8_t *) msg->data;
128 len = dev->mincount;
129 offset = sizeof(bl_msg);
130 for (i = 0; len > 0; i++) {
131 memcpy(&dataptr[offset], page_address(dev->pages[i]),
132 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
133 len -= PAGE_CACHE_SIZE;
134 offset += PAGE_CACHE_SIZE;
135 }
136 msg->len = sizeof(bl_msg) + dev->mincount;
137
138 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
139 add_wait_queue(&nn->bl_wq, &wq);
140 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
141 if (rc < 0) {
142 remove_wait_queue(&nn->bl_wq, &wq);
143 rv = ERR_PTR(rc);
144 goto out;
145 }
146
147 set_current_state(TASK_UNINTERRUPTIBLE);
148 schedule();
149 __set_current_state(TASK_RUNNING);
150 remove_wait_queue(&nn->bl_wq, &wq);
151
152 if (reply->status != BL_DEVICE_REQUEST_PROC) {
153 dprintk("%s failed to open device: %d\n",
154 __func__, reply->status);
155 rv = ERR_PTR(-EINVAL);
156 goto out;
157 }
158
159 bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
160 FMODE_READ, NULL);
161 if (IS_ERR(bd)) {
162 dprintk("%s failed to open device : %ld\n", __func__,
163 PTR_ERR(bd));
164 rv = ERR_CAST(bd);
165 goto out;
166 }
167
168 rv = kzalloc(sizeof(*rv), GFP_NOFS);
169 if (!rv) {
170 rv = ERR_PTR(-ENOMEM);
171 goto out;
172 }
173
174 rv->bm_mdev = bd;
175 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
176 rv->net = net;
177 dprintk("%s Created device %s with bd_block_size %u\n",
178 __func__,
179 bd->bd_disk->disk_name,
180 bd->bd_block_size);
181
182out:
183 kfree(msg->data);
184 return rv;
185}
186
187/* Map deviceid returned by the server to constructed block_device */
188static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
189 struct nfs4_deviceid *id)
190{
191 struct block_device *rv = NULL;
192 struct block_mount_id *mid;
193 struct pnfs_block_dev *dev;
194
195 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
196 mid = BLK_ID(lo);
197 spin_lock(&mid->bm_lock);
198 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
199 if (memcmp(id->data, dev->bm_mdevid.data,
200 NFS4_DEVICEID4_SIZE) == 0) {
201 rv = dev->bm_mdev;
202 goto out;
203 }
204 }
205 out:
206 spin_unlock(&mid->bm_lock);
207 dprintk("%s returning %p\n", __func__, rv);
208 return rv;
209}
210
211/* Tracks info needed to ensure extents in layout obey constraints of spec */
212struct layout_verification {
213 u32 mode; /* R or RW */
214 u64 start; /* Expected start of next non-COW extent */
215 u64 inval; /* Start of INVAL coverage */
216 u64 cowread; /* End of COW read coverage */
217};
218
219/* Verify the extent meets the layout requirements of the pnfs-block draft,
220 * section 2.3.1.
221 */
222static int verify_extent(struct pnfs_block_extent *be,
223 struct layout_verification *lv)
224{
225 if (lv->mode == IOMODE_READ) {
226 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
227 be->be_state == PNFS_BLOCK_INVALID_DATA)
228 return -EIO;
229 if (be->be_f_offset != lv->start)
230 return -EIO;
231 lv->start += be->be_length;
232 return 0;
233 }
234 /* lv->mode == IOMODE_RW */
235 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
236 if (be->be_f_offset != lv->start)
237 return -EIO;
238 if (lv->cowread > lv->start)
239 return -EIO;
240 lv->start += be->be_length;
241 lv->inval = lv->start;
242 return 0;
243 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
244 if (be->be_f_offset != lv->start)
245 return -EIO;
246 lv->start += be->be_length;
247 return 0;
248 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
249 if (be->be_f_offset > lv->start)
250 return -EIO;
251 if (be->be_f_offset < lv->inval)
252 return -EIO;
253 if (be->be_f_offset < lv->cowread)
254 return -EIO;
255 /* It looks like you might want to min this with lv->start,
256 * but you really don't.
257 */
258 lv->inval = lv->inval + be->be_length;
259 lv->cowread = be->be_f_offset + be->be_length;
260 return 0;
261 } else
262 return -EIO;
263}
264
265/* XDR decode pnfs_block_layout4 structure */
266int
267nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
268 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
269{
270 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
271 int i, status = -EIO;
272 uint32_t count;
273 struct pnfs_block_extent *be = NULL, *save;
274 struct xdr_stream stream;
275 struct xdr_buf buf;
276 struct page *scratch;
277 __be32 *p;
278 struct layout_verification lv = {
279 .mode = lgr->range.iomode,
280 .start = lgr->range.offset >> SECTOR_SHIFT,
281 .inval = lgr->range.offset >> SECTOR_SHIFT,
282 .cowread = lgr->range.offset >> SECTOR_SHIFT,
283 };
284 LIST_HEAD(extents);
285
286 dprintk("---> %s\n", __func__);
287
288 scratch = alloc_page(gfp_flags);
289 if (!scratch)
290 return -ENOMEM;
291
292 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
293 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
294
295 p = xdr_inline_decode(&stream, 4);
296 if (unlikely(!p))
297 goto out_err;
298
299 count = be32_to_cpup(p++);
300
301 dprintk("%s enter, number of extents %i\n", __func__, count);
302 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
303 if (unlikely(!p))
304 goto out_err;
305
306 /* Decode individual extents, putting them in temporary
307 * staging area until whole layout is decoded to make error
308 * recovery easier.
309 */
310 for (i = 0; i < count; i++) {
311 be = bl_alloc_extent();
312 if (!be) {
313 status = -ENOMEM;
314 goto out_err;
315 }
316 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
317 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
318 be->be_mdev = translate_devid(lo, &be->be_devid);
319 if (!be->be_mdev)
320 goto out_err;
321
322 /* The next three values are read in as bytes,
323 * but stored as 512-byte sector lengths
324 */
325 if (decode_sector_number(&p, &be->be_f_offset) < 0)
326 goto out_err;
327 if (decode_sector_number(&p, &be->be_length) < 0)
328 goto out_err;
329 if (decode_sector_number(&p, &be->be_v_offset) < 0)
330 goto out_err;
331 be->be_state = be32_to_cpup(p++);
332 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
333 be->be_inval = &bl->bl_inval;
334 if (verify_extent(be, &lv)) {
335 dprintk("%s verify failed\n", __func__);
336 goto out_err;
337 }
338 list_add_tail(&be->be_node, &extents);
339 }
340 if (lgr->range.offset + lgr->range.length !=
341 lv.start << SECTOR_SHIFT) {
342 dprintk("%s Final length mismatch\n", __func__);
343 be = NULL;
344 goto out_err;
345 }
346 if (lv.start < lv.cowread) {
347 dprintk("%s Final uncovered COW extent\n", __func__);
348 be = NULL;
349 goto out_err;
350 }
351 /* Extents decoded properly, now try to merge them in to
352 * existing layout extents.
353 */
354 spin_lock(&bl->bl_ext_lock);
355 list_for_each_entry_safe(be, save, &extents, be_node) {
356 list_del(&be->be_node);
357 status = bl_add_merge_extent(bl, be);
358 if (status) {
359 spin_unlock(&bl->bl_ext_lock);
360 /* This is a fairly catastrophic error, as the
361 * entire layout extent lists are now corrupted.
362 * We should have some way to distinguish this.
363 */
364 be = NULL;
365 goto out_err;
366 }
367 }
368 spin_unlock(&bl->bl_ext_lock);
369 status = 0;
370 out:
371 __free_page(scratch);
372 dprintk("%s returns %i\n", __func__, status);
373 return status;
374
375 out_err:
376 bl_put_extent(be);
377 while (!list_empty(&extents)) {
378 be = list_first_entry(&extents, struct pnfs_block_extent,
379 be_node);
380 list_del(&be->be_node);
381 bl_put_extent(be);
382 }
383 goto out;
384}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd866..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(struct net *net, dev_t dev)
42{
43 struct bl_pipe_msg bl_pipe_msg;
44 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
45 struct bl_dev_msg bl_umount_request;
46 struct bl_msg_hdr bl_msg = {
47 .type = BL_DEVICE_UMOUNT,
48 .totallen = sizeof(bl_umount_request),
49 };
50 uint8_t *dataptr;
51 DECLARE_WAITQUEUE(wq, current);
52 struct nfs_net *nn = net_generic(net, nfs_net_id);
53
54 dprintk("Entering %s\n", __func__);
55
56 bl_pipe_msg.bl_wq = &nn->bl_wq;
57 memset(msg, 0, sizeof(*msg));
58 msg->len = sizeof(bl_msg) + bl_msg.totallen;
59 msg->data = kzalloc(msg->len, GFP_NOFS);
60 if (!msg->data)
61 goto out;
62
63 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
64 bl_umount_request.major = MAJOR(dev);
65 bl_umount_request.minor = MINOR(dev);
66
67 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
68 dataptr = (uint8_t *) msg->data;
69 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
70
71 add_wait_queue(&nn->bl_wq, &wq);
72 if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
73 remove_wait_queue(&nn->bl_wq, &wq);
74 goto out;
75 }
76
77 set_current_state(TASK_UNINTERRUPTIBLE);
78 schedule();
79 __set_current_state(TASK_RUNNING);
80 remove_wait_queue(&nn->bl_wq, &wq);
81
82out:
83 kfree(msg->data);
84}
85
86/*
87 * Release meta device
88 */
89static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
90{
91 dprintk("%s Releasing\n", __func__);
92 nfs4_blkdev_put(bdev->bm_mdev);
93 dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
94}
95
96void bl_free_block_dev(struct pnfs_block_dev *bdev)
97{
98 if (bdev) {
99 if (bdev->bm_mdev) {
100 dprintk("%s Removing DM device: %d:%d\n",
101 __func__,
102 MAJOR(bdev->bm_mdev->bd_dev),
103 MINOR(bdev->bm_mdev->bd_dev));
104 nfs4_blk_metadev_release(bdev);
105 }
106 kfree(bdev);
107 }
108}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..5aed4f98df41
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/blkdev.h>
6#include <linux/nfs4.h>
7#include <linux/nfs_fs.h>
8#include <linux/nfs_xdr.h>
9
10#include "blocklayout.h"
11
12#define NFSDBG_FACILITY NFSDBG_PNFS_LD
13
14static void
15bl_free_device(struct pnfs_block_dev *dev)
16{
17 if (dev->nr_children) {
18 int i;
19
20 for (i = 0; i < dev->nr_children; i++)
21 bl_free_device(&dev->children[i]);
22 kfree(dev->children);
23 } else {
24 if (dev->bdev)
25 blkdev_put(dev->bdev, FMODE_READ);
26 }
27}
28
29void
30bl_free_deviceid_node(struct nfs4_deviceid_node *d)
31{
32 struct pnfs_block_dev *dev =
33 container_of(d, struct pnfs_block_dev, node);
34
35 bl_free_device(dev);
36 kfree(dev);
37}
38
39static int
40nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
41{
42 __be32 *p;
43 int i;
44
45 p = xdr_inline_decode(xdr, 4);
46 if (!p)
47 return -EIO;
48 b->type = be32_to_cpup(p++);
49
50 switch (b->type) {
51 case PNFS_BLOCK_VOLUME_SIMPLE:
52 p = xdr_inline_decode(xdr, 4);
53 if (!p)
54 return -EIO;
55 b->simple.nr_sigs = be32_to_cpup(p++);
56 if (!b->simple.nr_sigs) {
57 dprintk("no signature\n");
58 return -EIO;
59 }
60
61 b->simple.len = 4 + 4;
62 for (i = 0; i < b->simple.nr_sigs; i++) {
63 p = xdr_inline_decode(xdr, 8 + 4);
64 if (!p)
65 return -EIO;
66 p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
67 b->simple.sigs[i].sig_len = be32_to_cpup(p++);
68
69 p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
70 if (!p)
71 return -EIO;
72 memcpy(&b->simple.sigs[i].sig, p,
73 b->simple.sigs[i].sig_len);
74
75 b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
76 }
77 break;
78 case PNFS_BLOCK_VOLUME_SLICE:
79 p = xdr_inline_decode(xdr, 8 + 8 + 4);
80 if (!p)
81 return -EIO;
82 p = xdr_decode_hyper(p, &b->slice.start);
83 p = xdr_decode_hyper(p, &b->slice.len);
84 b->slice.volume = be32_to_cpup(p++);
85 break;
86 case PNFS_BLOCK_VOLUME_CONCAT:
87 p = xdr_inline_decode(xdr, 4);
88 if (!p)
89 return -EIO;
90 b->concat.volumes_count = be32_to_cpup(p++);
91
92 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
93 if (!p)
94 return -EIO;
95 for (i = 0; i < b->concat.volumes_count; i++)
96 b->concat.volumes[i] = be32_to_cpup(p++);
97 break;
98 case PNFS_BLOCK_VOLUME_STRIPE:
99 p = xdr_inline_decode(xdr, 8 + 4);
100 if (!p)
101 return -EIO;
102 p = xdr_decode_hyper(p, &b->stripe.chunk_size);
103 b->stripe.volumes_count = be32_to_cpup(p++);
104
105 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
106 if (!p)
107 return -EIO;
108 for (i = 0; i < b->stripe.volumes_count; i++)
109 b->stripe.volumes[i] = be32_to_cpup(p++);
110 break;
111 default:
112 dprintk("unknown volume type!\n");
113 return -EIO;
114 }
115
116 return 0;
117}
118
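
For reference, the on-the-wire layout the decoder above implies (a sketch of
the XDR, not the authoritative spec text; counts and indices are 32-bit,
offsets and lengths 64-bit, all big-endian):

	/*
	 *	uint32 type;
	 *	SIMPLE: uint32 nr_sigs;
	 *		nr_sigs * { hyper offset; opaque sig<>; }
	 *	SLICE:  hyper start; hyper len; uint32 volume;
	 *	CONCAT: uint32 count; uint32 volumes[count];
	 *	STRIPE: hyper chunk_size; uint32 count; uint32 volumes[count];
	 */
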
119static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
120 struct pnfs_block_dev_map *map)
121{
122 map->start = dev->start;
123 map->len = dev->len;
124 map->disk_offset = dev->disk_offset;
125 map->bdev = dev->bdev;
126 return true;
127}
128
129static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
130 struct pnfs_block_dev_map *map)
131{
132 int i;
133
134 for (i = 0; i < dev->nr_children; i++) {
135 struct pnfs_block_dev *child = &dev->children[i];
136
137 if (child->start > offset ||
138 child->start + child->len <= offset)
139 continue;
140
141 child->map(child, offset - child->start, map);
142 return true;
143 }
144
145 dprintk("%s: ran off loop!\n", __func__);
146 return false;
147}
148
149static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
150 struct pnfs_block_dev_map *map)
151{
152 struct pnfs_block_dev *child;
153 u64 chunk;
154 u32 chunk_idx;
155 u64 disk_offset;
156
157 chunk = div_u64(offset, dev->chunk_size);
158 div_u64_rem(chunk, dev->nr_children, &chunk_idx);
159
 160	if (chunk_idx >= dev->nr_children) {
161 dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
162 __func__, chunk_idx, offset, dev->chunk_size);
163 /* error, should not happen */
164 return false;
165 }
166
167 /* truncate offset to the beginning of the stripe */
168 offset = chunk * dev->chunk_size;
169
170 /* disk offset of the stripe */
171 disk_offset = div_u64(offset, dev->nr_children);
172
173 child = &dev->children[chunk_idx];
174 child->map(child, disk_offset, map);
175
176 map->start += offset;
177 map->disk_offset += disk_offset;
178 map->len = dev->chunk_size;
179 return true;
180}
181
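
Worked example of the stripe arithmetic above, restated as a standalone
userspace program (illustration only, everything in 512-byte sectors): with
an 8-sector chunk size across 3 children, file sector 35 falls in chunk 4,
chunk 4 lands on child 1, the chunk starts at sector 32, and that stripe
sits 32 / 3 = 10 sectors into the child's share:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t offset = 35, chunk_size = 8;
		uint32_t nr_children = 3;

		uint64_t chunk = offset / chunk_size;		/* 4 */
		uint32_t chunk_idx = chunk % nr_children;	/* child 1 */
		uint64_t stripe_start = chunk * chunk_size;	/* 32 */
		uint64_t disk_offset = stripe_start / nr_children; /* 10 */

		printf("child %u, disk offset %llu\n", chunk_idx,
		       (unsigned long long)disk_offset);
		return 0;
	}
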
182static int
183bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
184 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
185
186
187static int
188bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
189 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
190{
191 struct pnfs_block_volume *v = &volumes[idx];
192 dev_t dev;
193
194 dev = bl_resolve_deviceid(server, v, gfp_mask);
195 if (!dev)
196 return -EIO;
197
198 d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
199 if (IS_ERR(d->bdev)) {
200 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
201 MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
202 return PTR_ERR(d->bdev);
203 }
204
205
206 d->len = i_size_read(d->bdev->bd_inode);
207 d->map = bl_map_simple;
208
209 printk(KERN_INFO "pNFS: using block device %s\n",
210 d->bdev->bd_disk->disk_name);
211 return 0;
212}
213
214static int
215bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
216 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
217{
218 struct pnfs_block_volume *v = &volumes[idx];
219 int ret;
220
221 ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
222 if (ret)
223 return ret;
224
225 d->disk_offset = v->slice.start;
226 d->len = v->slice.len;
227 return 0;
228}
229
230static int
231bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
232 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
233{
234 struct pnfs_block_volume *v = &volumes[idx];
235 u64 len = 0;
236 int ret, i;
237
238 d->children = kcalloc(v->concat.volumes_count,
 239			sizeof(struct pnfs_block_dev), gfp_mask);
240 if (!d->children)
241 return -ENOMEM;
242
243 for (i = 0; i < v->concat.volumes_count; i++) {
244 ret = bl_parse_deviceid(server, &d->children[i],
245 volumes, v->concat.volumes[i], gfp_mask);
246 if (ret)
247 return ret;
248
249 d->nr_children++;
250 d->children[i].start += len;
251 len += d->children[i].len;
252 }
253
254 d->len = len;
255 d->map = bl_map_concat;
256 return 0;
257}
258
259static int
260bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
261 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
262{
263 struct pnfs_block_volume *v = &volumes[idx];
264 u64 len = 0;
265 int ret, i;
266
267 d->children = kcalloc(v->stripe.volumes_count,
 268			sizeof(struct pnfs_block_dev), gfp_mask);
269 if (!d->children)
270 return -ENOMEM;
271
272 for (i = 0; i < v->stripe.volumes_count; i++) {
273 ret = bl_parse_deviceid(server, &d->children[i],
274 volumes, v->stripe.volumes[i], gfp_mask);
275 if (ret)
276 return ret;
277
278 d->nr_children++;
279 len += d->children[i].len;
280 }
281
282 d->len = len;
283 d->chunk_size = v->stripe.chunk_size;
284 d->map = bl_map_stripe;
285 return 0;
286}
287
288static int
289bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
290 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
291{
292 switch (volumes[idx].type) {
293 case PNFS_BLOCK_VOLUME_SIMPLE:
294 return bl_parse_simple(server, d, volumes, idx, gfp_mask);
295 case PNFS_BLOCK_VOLUME_SLICE:
296 return bl_parse_slice(server, d, volumes, idx, gfp_mask);
297 case PNFS_BLOCK_VOLUME_CONCAT:
298 return bl_parse_concat(server, d, volumes, idx, gfp_mask);
299 case PNFS_BLOCK_VOLUME_STRIPE:
300 return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
301 default:
302 dprintk("unsupported volume type: %d\n", volumes[idx].type);
303 return -EIO;
304 }
305}
306
307struct nfs4_deviceid_node *
308bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
309 gfp_t gfp_mask)
310{
311 struct nfs4_deviceid_node *node = NULL;
312 struct pnfs_block_volume *volumes;
313 struct pnfs_block_dev *top;
314 struct xdr_stream xdr;
315 struct xdr_buf buf;
316 struct page *scratch;
317 int nr_volumes, ret, i;
318 __be32 *p;
319
320 scratch = alloc_page(gfp_mask);
321 if (!scratch)
322 goto out;
323
324 xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
325 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
326
327 p = xdr_inline_decode(&xdr, sizeof(__be32));
328 if (!p)
329 goto out_free_scratch;
330 nr_volumes = be32_to_cpup(p++);
331
332 volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
333 gfp_mask);
334 if (!volumes)
335 goto out_free_scratch;
336
337 for (i = 0; i < nr_volumes; i++) {
338 ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
339 if (ret < 0)
340 goto out_free_volumes;
341 }
342
343 top = kzalloc(sizeof(*top), gfp_mask);
344 if (!top)
345 goto out_free_volumes;
346
347 ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
348 if (ret) {
349 bl_free_device(top);
350 kfree(top);
351 goto out_free_volumes;
352 }
353
354 node = &top->node;
355 nfs4_init_deviceid_node(node, server, &pdev->dev_id);
356
357out_free_volumes:
358 kfree(volumes);
359out_free_scratch:
360 __free_page(scratch);
361out:
362 return node;
363}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..31d0b5e53dfd
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4
5#include <linux/vmalloc.h>
6
7#include "blocklayout.h"
8
9#define NFSDBG_FACILITY NFSDBG_PNFS_LD
10
11static inline struct pnfs_block_extent *
12ext_node(struct rb_node *node)
13{
14 return rb_entry(node, struct pnfs_block_extent, be_node);
15}
16
17static struct pnfs_block_extent *
18ext_tree_first(struct rb_root *root)
19{
20 struct rb_node *node = rb_first(root);
21 return node ? ext_node(node) : NULL;
22}
23
24static struct pnfs_block_extent *
25ext_tree_prev(struct pnfs_block_extent *be)
26{
27 struct rb_node *node = rb_prev(&be->be_node);
28 return node ? ext_node(node) : NULL;
29}
30
31static struct pnfs_block_extent *
32ext_tree_next(struct pnfs_block_extent *be)
33{
34 struct rb_node *node = rb_next(&be->be_node);
35 return node ? ext_node(node) : NULL;
36}
37
38static inline sector_t
39ext_f_end(struct pnfs_block_extent *be)
40{
41 return be->be_f_offset + be->be_length;
42}
43
44static struct pnfs_block_extent *
45__ext_tree_search(struct rb_root *root, sector_t start)
46{
47 struct rb_node *node = root->rb_node;
48 struct pnfs_block_extent *be = NULL;
49
50 while (node) {
51 be = ext_node(node);
52 if (start < be->be_f_offset)
53 node = node->rb_left;
54 else if (start >= ext_f_end(be))
55 node = node->rb_right;
56 else
57 return be;
58 }
59
60 if (be) {
61 if (start < be->be_f_offset)
62 return be;
63
64 if (start >= ext_f_end(be))
65 return ext_tree_next(be);
66 }
67
68 return NULL;
69}
70
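
__ext_tree_search() thus returns the leftmost extent that ends beyond
start: either the node containing start or, failing that, its successor;
NULL means every extent in the tree ends at or before start. For example
(hypothetical tree contents, in sectors):

	/*
	 * Tree holds [0, 8) and [16, 24):
	 *   __ext_tree_search(root, 4)  -> [0, 8)   (contains 4)
	 *   __ext_tree_search(root, 10) -> [16, 24) (first ending beyond 10)
	 *   __ext_tree_search(root, 30) -> NULL
	 */
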
71static bool
72ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
73{
74 if (be1->be_state != be2->be_state)
75 return false;
76 if (be1->be_device != be2->be_device)
77 return false;
78
79 if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
80 return false;
81
82 if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
83 (be1->be_v_offset + be1->be_length != be2->be_v_offset))
84 return false;
85
86 if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
87 be1->be_tag != be2->be_tag)
88 return false;
89
90 return true;
91}
92
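
In other words, two extents merge only when they are contiguous in both
file and volume space, sit on the same device, share the same state, and
(for invalid extents) carry the same commit tag. A hypothetical pair that
qualifies, for illustration:

	struct pnfs_block_extent a = {
		.be_f_offset = 0, .be_length = 8, .be_v_offset = 100,
		.be_state = PNFS_BLOCK_READWRITE_DATA,
	};
	struct pnfs_block_extent b = {
		.be_f_offset = 8, .be_length = 8, .be_v_offset = 108,
		.be_state = PNFS_BLOCK_READWRITE_DATA,
	};
	/* ext_can_merge(&a, &b) is true: merging yields one extent
	 * covering file sectors [0, 16) at volume offset 100. */
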
93static struct pnfs_block_extent *
94ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
95{
96 struct pnfs_block_extent *left = ext_tree_prev(be);
97
98 if (left && ext_can_merge(left, be)) {
99 left->be_length += be->be_length;
100 rb_erase(&be->be_node, root);
101 nfs4_put_deviceid_node(be->be_device);
102 kfree(be);
103 return left;
104 }
105
106 return be;
107}
108
109static struct pnfs_block_extent *
110ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
111{
112 struct pnfs_block_extent *right = ext_tree_next(be);
113
114 if (right && ext_can_merge(be, right)) {
115 be->be_length += right->be_length;
116 rb_erase(&right->be_node, root);
117 nfs4_put_deviceid_node(right->be_device);
118 kfree(right);
119 }
120
121 return be;
122}
123
124static void
125__ext_tree_insert(struct rb_root *root,
126 struct pnfs_block_extent *new, bool merge_ok)
127{
128 struct rb_node **p = &root->rb_node, *parent = NULL;
129 struct pnfs_block_extent *be;
130
131 while (*p) {
132 parent = *p;
133 be = ext_node(parent);
134
135 if (new->be_f_offset < be->be_f_offset) {
136 if (merge_ok && ext_can_merge(new, be)) {
137 be->be_f_offset = new->be_f_offset;
138 if (be->be_state != PNFS_BLOCK_NONE_DATA)
139 be->be_v_offset = new->be_v_offset;
140 be->be_length += new->be_length;
141 be = ext_try_to_merge_left(root, be);
142 goto free_new;
143 }
144 p = &(*p)->rb_left;
145 } else if (new->be_f_offset >= ext_f_end(be)) {
146 if (merge_ok && ext_can_merge(be, new)) {
147 be->be_length += new->be_length;
148 be = ext_try_to_merge_right(root, be);
149 goto free_new;
150 }
151 p = &(*p)->rb_right;
152 } else {
153 BUG();
154 }
155 }
156
157 rb_link_node(&new->be_node, parent, p);
158 rb_insert_color(&new->be_node, root);
159 return;
160free_new:
161 nfs4_put_deviceid_node(new->be_device);
162 kfree(new);
163}
164
165static int
166__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
167{
168 struct pnfs_block_extent *be;
169 sector_t len1 = 0, len2 = 0;
170 sector_t orig_v_offset;
171 sector_t orig_len;
172
173 be = __ext_tree_search(root, start);
174 if (!be)
175 return 0;
176 if (be->be_f_offset >= end)
177 return 0;
178
179 orig_v_offset = be->be_v_offset;
180 orig_len = be->be_length;
181
182 if (start > be->be_f_offset)
183 len1 = start - be->be_f_offset;
184 if (ext_f_end(be) > end)
185 len2 = ext_f_end(be) - end;
186
187 if (len2 > 0) {
188 if (len1 > 0) {
189 struct pnfs_block_extent *new;
190
191 new = kzalloc(sizeof(*new), GFP_ATOMIC);
192 if (!new)
193 return -ENOMEM;
194
195 be->be_length = len1;
196
197 new->be_f_offset = end;
198 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
199 new->be_v_offset =
200 orig_v_offset + orig_len - len2;
201 }
202 new->be_length = len2;
203 new->be_state = be->be_state;
204 new->be_tag = be->be_tag;
205 new->be_device = nfs4_get_deviceid(be->be_device);
206
207 __ext_tree_insert(root, new, true);
208 } else {
209 be->be_f_offset = end;
210 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
211 be->be_v_offset =
212 orig_v_offset + orig_len - len2;
213 }
214 be->be_length = len2;
215 }
216 } else {
217 if (len1 > 0) {
218 be->be_length = len1;
219 be = ext_tree_next(be);
220 }
221
222 while (be && ext_f_end(be) <= end) {
223 struct pnfs_block_extent *next = ext_tree_next(be);
224
225 rb_erase(&be->be_node, root);
226 nfs4_put_deviceid_node(be->be_device);
227 kfree(be);
228 be = next;
229 }
230
231 if (be && be->be_f_offset < end) {
232 len1 = ext_f_end(be) - end;
233 be->be_f_offset = end;
234 if (be->be_state != PNFS_BLOCK_NONE_DATA)
235 be->be_v_offset += be->be_length - len1;
236 be->be_length = len1;
237 }
238 }
239
240 return 0;
241}
242
243int
244ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
245{
246 struct pnfs_block_extent *be;
247 struct rb_root *root;
248 int err = 0;
249
250 switch (new->be_state) {
251 case PNFS_BLOCK_READWRITE_DATA:
252 case PNFS_BLOCK_INVALID_DATA:
253 root = &bl->bl_ext_rw;
254 break;
255 case PNFS_BLOCK_READ_DATA:
256 case PNFS_BLOCK_NONE_DATA:
257 root = &bl->bl_ext_ro;
258 break;
259 default:
260 dprintk("invalid extent type\n");
261 return -EINVAL;
262 }
263
264 spin_lock(&bl->bl_ext_lock);
265retry:
266 be = __ext_tree_search(root, new->be_f_offset);
267 if (!be || be->be_f_offset >= ext_f_end(new)) {
268 __ext_tree_insert(root, new, true);
269 } else if (new->be_f_offset >= be->be_f_offset) {
270 if (ext_f_end(new) <= ext_f_end(be)) {
271 nfs4_put_deviceid_node(new->be_device);
272 kfree(new);
273 } else {
274 sector_t new_len = ext_f_end(new) - ext_f_end(be);
275 sector_t diff = new->be_length - new_len;
276
277 new->be_f_offset += diff;
278 new->be_v_offset += diff;
279 new->be_length = new_len;
280 goto retry;
281 }
282 } else if (ext_f_end(new) <= ext_f_end(be)) {
283 new->be_length = be->be_f_offset - new->be_f_offset;
284 __ext_tree_insert(root, new, true);
285 } else {
286 struct pnfs_block_extent *split;
287 sector_t new_len = ext_f_end(new) - ext_f_end(be);
288 sector_t diff = new->be_length - new_len;
289
290 split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
291 if (!split) {
 292			err = -ENOMEM;
293 goto out;
294 }
295
296 split->be_length = be->be_f_offset - split->be_f_offset;
297 split->be_device = nfs4_get_deviceid(new->be_device);
298 __ext_tree_insert(root, split, true);
299
300 new->be_f_offset += diff;
301 new->be_v_offset += diff;
302 new->be_length = new_len;
303 goto retry;
304 }
305out:
306 spin_unlock(&bl->bl_ext_lock);
307 return err;
308}
309
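
The retry loop above never replaces what is already in the tree; it trims
or splits the incoming extent around the existing nodes instead. A
hypothetical trace, in sectors:

	/*
	 *   existing in tree:          [16, 32)
	 *   new extent from layout: [8,          40)
	 *
	 * 1. new spans the existing node: kmemdup() a head piece, clip it
	 *    to [8, 16) and insert it, then advance new's be_f_offset and
	 *    be_v_offset by 24 sectors so it covers only [32, 40).
	 * 2. retry: [32, 40) no longer overlaps anything and is inserted
	 *    directly (merging with its neighbours where ext_can_merge()
	 *    allows). A new extent fully inside an existing one is simply
	 *    dropped.
	 */
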
310static bool
311__ext_tree_lookup(struct rb_root *root, sector_t isect,
312 struct pnfs_block_extent *ret)
313{
314 struct rb_node *node;
315 struct pnfs_block_extent *be;
316
317 node = root->rb_node;
318 while (node) {
319 be = ext_node(node);
320 if (isect < be->be_f_offset)
321 node = node->rb_left;
322 else if (isect >= ext_f_end(be))
323 node = node->rb_right;
324 else {
325 *ret = *be;
326 return true;
327 }
328 }
329
330 return false;
331}
332
333bool
334ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
335 struct pnfs_block_extent *ret, bool rw)
336{
337 bool found = false;
338
339 spin_lock(&bl->bl_ext_lock);
340 if (!rw)
341 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
342 if (!found)
343 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
344 spin_unlock(&bl->bl_ext_lock);
345
346 return found;
347}
348
349int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
350 sector_t start, sector_t end)
351{
352 int err, err2;
353
354 spin_lock(&bl->bl_ext_lock);
355 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
356 if (rw) {
357 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
358 if (!err)
359 err = err2;
360 }
361 spin_unlock(&bl->bl_ext_lock);
362
363 return err;
364}
365
366static int
367ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
368 sector_t split)
369{
370 struct pnfs_block_extent *new;
371 sector_t orig_len = be->be_length;
372
373 new = kzalloc(sizeof(*new), GFP_ATOMIC);
374 if (!new)
375 return -ENOMEM;
376
377 be->be_length = split - be->be_f_offset;
378
379 new->be_f_offset = split;
380 if (be->be_state != PNFS_BLOCK_NONE_DATA)
381 new->be_v_offset = be->be_v_offset + be->be_length;
382 new->be_length = orig_len - be->be_length;
383 new->be_state = be->be_state;
384 new->be_tag = be->be_tag;
385 new->be_device = nfs4_get_deviceid(be->be_device);
386
387 __ext_tree_insert(root, new, false);
388 return 0;
389}
390
391int
392ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
393 sector_t len)
394{
395 struct rb_root *root = &bl->bl_ext_rw;
396 sector_t end = start + len;
397 struct pnfs_block_extent *be;
398 int err = 0;
399
400 spin_lock(&bl->bl_ext_lock);
401 /*
 402	 * First remove all COW extents or holes from the written-to range.
403 */
404 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
405 if (err)
406 goto out;
407
408 /*
409 * Then mark all invalid extents in the range as written to.
410 */
411 for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
412 if (be->be_f_offset >= end)
413 break;
414
415 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
416 continue;
417
418 if (be->be_f_offset < start) {
419 struct pnfs_block_extent *left = ext_tree_prev(be);
420
421 if (left && ext_can_merge(left, be)) {
422 sector_t diff = start - be->be_f_offset;
423
424 left->be_length += diff;
425
426 be->be_f_offset += diff;
427 be->be_v_offset += diff;
428 be->be_length -= diff;
429 } else {
430 err = ext_tree_split(root, be, start);
431 if (err)
432 goto out;
433 }
434 }
435
436 if (ext_f_end(be) > end) {
437 struct pnfs_block_extent *right = ext_tree_next(be);
438
439 if (right && ext_can_merge(be, right)) {
440 sector_t diff = end - be->be_f_offset;
441
442 be->be_length -= diff;
443
444 right->be_f_offset -= diff;
445 right->be_v_offset -= diff;
446 right->be_length += diff;
447 } else {
448 err = ext_tree_split(root, be, end);
449 if (err)
450 goto out;
451 }
452 }
453
454 if (be->be_f_offset >= start && ext_f_end(be) <= end) {
455 be->be_tag = EXTENT_WRITTEN;
456 be = ext_try_to_merge_left(root, be);
457 be = ext_try_to_merge_right(root, be);
458 }
459 }
460out:
461 spin_unlock(&bl->bl_ext_lock);
462 return err;
463}
464
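
Boundary handling in ext_tree_mark_written() above: an invalid extent
straddling start or end is first cut at the boundary, either by donating
the out-of-range sectors to a mergeable neighbour or via ext_tree_split();
only extents that then lie wholly inside [start, end) get tagged
EXTENT_WRITTEN. A hypothetical pass, in sectors:

	/*
	 * r/w tree: invalid, untagged [0, 64); written range [16, 48).
	 *
	 * 1. be = [0, 64) starts before 16: no mergeable neighbour, so
	 *    split at 16 -> [0, 16) + [16, 64).
	 * 2. next iteration, be = [16, 64) ends past 48: split at 48
	 *    -> [16, 48) + [48, 64).
	 * 3. be = [16, 48) now lies inside the range and is tagged
	 *    EXTENT_WRITTEN; [0, 16) and [48, 64) stay untagged.
	 */
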
465static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
466 size_t buffer_size)
467{
468 if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
469 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
470
471 for (i = 0; i < nr_pages; i++)
472 put_page(arg->layoutupdate_pages[i]);
473 kfree(arg->layoutupdate_pages);
474 } else {
475 put_page(arg->layoutupdate_page);
476 }
477}
478
479static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
480 size_t buffer_size, size_t *count)
481{
482 struct pnfs_block_extent *be;
483 int ret = 0;
484
485 spin_lock(&bl->bl_ext_lock);
486 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
487 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
488 be->be_tag != EXTENT_WRITTEN)
489 continue;
490
491 (*count)++;
492 if (*count * BL_EXTENT_SIZE > buffer_size) {
493 /* keep counting.. */
494 ret = -ENOSPC;
495 continue;
496 }
497
498 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
499 NFS4_DEVICEID4_SIZE);
500 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
501 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
502 p = xdr_encode_hyper(p, 0LL);
503 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
504
505 be->be_tag = EXTENT_COMMITTING;
506 }
507 spin_unlock(&bl->bl_ext_lock);
508
509 return ret;
510}
511
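
Each extent encoded above costs a fixed 44 bytes on the wire: a 16-byte
deviceid, three 8-byte hypers (file offset, length, storage offset) and a
4-byte state word, which is exactly what BL_EXTENT_SIZE in blocklayout.h
expands to (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE = 28 + 16). A
compile-time cross-check, as a sketch:

	static inline void bl_extent_size_sanity_check(void)
	{
		/* deviceid + 3 hypers + state, as encoded above */
		BUILD_BUG_ON(BL_EXTENT_SIZE !=
			     NFS4_DEVICEID4_SIZE + 3 * 8 + 4);
	}
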
512int
513ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
514{
515 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
516 size_t count = 0, buffer_size = PAGE_SIZE;
517 __be32 *start_p;
518 int ret;
519
520 dprintk("%s enter\n", __func__);
521
522 arg->layoutupdate_page = alloc_page(GFP_NOFS);
523 if (!arg->layoutupdate_page)
524 return -ENOMEM;
525 start_p = page_address(arg->layoutupdate_page);
526 arg->layoutupdate_pages = &arg->layoutupdate_page;
527
528retry:
529 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
530 if (unlikely(ret)) {
531 ext_tree_free_commitdata(arg, buffer_size);
532
533 buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
534 count = 0;
535
536 arg->layoutupdate_pages =
537 kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
538 sizeof(struct page *), GFP_NOFS);
539 if (!arg->layoutupdate_pages)
540 return -ENOMEM;
541
542 start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
543 if (!start_p) {
544 kfree(arg->layoutupdate_pages);
545 return -ENOMEM;
546 }
547
548 goto retry;
549 }
550
551 *start_p = cpu_to_be32(count);
552 arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
553
554 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 555		void *p = start_p, *end = p + arg->layoutupdate_len;
 556		int i = 0;
 557
 558		/* layoutupdate_len counts bytes: walk the vmalloc buffer one
 559		 * page at a time (__be32 pointer arithmetic would skip pages) */
 560		for ( ; p < end; p += PAGE_SIZE)
 561			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
563 }
564
565 dprintk("%s found %zu ranges\n", __func__, count);
566 return 0;
567}
568
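
The retry path above sizes the second buffer exactly from the first pass:
one 32-bit extent count followed by BL_EXTENT_SIZE (44) bytes per extent,
rounded up to whole pages for the layoutupdate_pages array. On 4 KiB pages
the initial inline page therefore holds at most (4096 - 4) / 44 = 93
extents before the vmalloc fallback kicks in:

	/* Capacity of the initial single page (sketch): */
	size_t max_inline = (PAGE_SIZE - sizeof(__be32)) / BL_EXTENT_SIZE;
	/* 93 on 4 KiB pages; beyond that the retry allocates exactly
	 * sizeof(__be32) + BL_EXTENT_SIZE * count bytes via __vmalloc() */
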
569void
570ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
571{
572 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
573 struct rb_root *root = &bl->bl_ext_rw;
574 struct pnfs_block_extent *be;
575
576 dprintk("%s status %d\n", __func__, status);
577
578 ext_tree_free_commitdata(arg, arg->layoutupdate_len);
579
580 spin_lock(&bl->bl_ext_lock);
581 for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
582 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
583 be->be_tag != EXTENT_COMMITTING)
584 continue;
585
586 if (status) {
587 /*
588 * Mark as written and try again.
589 *
590 * XXX: some real error handling here wouldn't hurt..
591 */
592 be->be_tag = EXTENT_WRITTEN;
593 } else {
594 be->be_state = PNFS_BLOCK_READWRITE_DATA;
595 be->be_tag = 0;
596 }
597
598 be = ext_try_to_merge_left(root, be);
599 be = ext_try_to_merge_right(root, be);
600 }
601 spin_unlock(&bl->bl_ext_lock);
602}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/extents.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - sector_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
54
55/* Complete stub using a list while determining the desired API */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates entry with tag, or if entry already exists, unions tag to it.
89 * If storage is not NULL, newly created entry will use it.
90 * Returns number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 new = storage;
114 new->it_sector = s;
115 new->it_tags = (1 << tag);
116 list_add(&new->it_link, &pos->it_link);
117 return 1;
118 }
119}
120
121/* XXXX Really want option to not create */
122/* Over range, unions tag with existing entries, else creates entry with tag */
123static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
124{
125 u64 i;
126
127 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
128 for (i = normalize(s, tree->mtt_step_size); i < s + length;
129 i += tree->mtt_step_size)
130 if (_add_entry(tree, i, tag, NULL))
131 return -ENOMEM;
132 return 0;
133}
134
135/* Ensure that future operations on given range of tree will not malloc */
136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
138{
139 u64 start, end, s;
140 int count, i, used = 0, status = -ENOMEM;
141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
143
144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
145 start = normalize(offset, tree->mtt_step_size);
146 end = normalize_up(offset + length, tree->mtt_step_size);
147 count = (int)(end - start) / (int)tree->mtt_step_size;
148
149 /* Pre-malloc what memory we might need */
150 storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
151 if (!storage)
152 return -ENOMEM;
153 for (i = 0; i < count; i++) {
154 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
155 GFP_NOFS);
156 if (!storage[i])
157 goto out_cleanup;
158 }
159
160 spin_lock_bh(&marks->im_lock);
161 for (s = start; s < end; s += tree->mtt_step_size)
162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
164
165 status = 0;
166
167 out_cleanup:
168 for (i = used; i < count; i++) {
169 if (!storage[i])
170 break;
171 kfree(storage[i]);
172 }
173 kfree(storage);
174 return status;
175}
176
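_preload_range() above is the standard "preallocate outside the lock" pattern: the worst-case number of entries is allocated with GFP_NOFS before taking the spinlock, entries are consumed under spin_lock_bh(), and the unused tail is freed afterwards. A stripped-down sketch of the same shape (ENTRY_SIZE and consume() are hypothetical stand-ins):

	#include <linux/slab.h>
	#include <linux/spinlock.h>

	static int preload(spinlock_t *lock, int count)
	{
		void **slots;
		int i, used = 0, status = -ENOMEM;

		slots = kcalloc(count, sizeof(*slots), GFP_NOFS);
		if (!slots)
			return -ENOMEM;
		for (i = 0; i < count; i++) {
			slots[i] = kmalloc(ENTRY_SIZE, GFP_NOFS);
			if (!slots[i])
				goto out_cleanup;	/* unused tail freed below */
		}

		spin_lock_bh(lock);
		used = consume(slots, count);	/* takes ownership of 'used' entries */
		spin_unlock_bh(lock);
		status = 0;

	out_cleanup:
		for (i = used; i < count && slots[i]; i++)
			kfree(slots[i]);
		kfree(slots);
		return status;
	}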
177/* We are relying on page lock to serialize this */
178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
179{
180 int rv;
181
182 spin_lock_bh(&marks->im_lock);
183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
184 spin_unlock_bh(&marks->im_lock);
185 return rv;
186}
187
188/* Assume start, end already sector aligned */
189static int
190_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
191{
192 struct pnfs_inval_tracking *pos;
193 u64 expect = 0;
194
195 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
196 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
197 if (pos->it_sector >= end)
198 continue;
199 if (!expect) {
200 if ((pos->it_sector == end - tree->mtt_step_size) &&
201 (pos->it_tags & (1 << tag))) {
202 expect = pos->it_sector - tree->mtt_step_size;
203 if (pos->it_sector < tree->mtt_step_size || expect < start)
204 return 1;
205 continue;
206 } else {
207 return 0;
208 }
209 }
210 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
211 return 0;
212 expect -= tree->mtt_step_size;
213 if (expect < start)
214 return 1;
215 }
216 return 0;
217}
218
219static int is_range_written(struct pnfs_inval_markings *marks,
220 sector_t start, sector_t end)
221{
222 int rv;
223
224 spin_lock_bh(&marks->im_lock);
225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
226 spin_unlock_bh(&marks->im_lock);
227 return rv;
228}
229
 230/* Marks sectors in [offset, offset+length) as having been initialized.
231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
232 * Currently assumes offset is page-aligned
233 */
234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
235 sector_t offset, sector_t length)
236{
237 sector_t start, end;
238
239 dprintk("%s(offset=%llu,len=%llu) enter\n",
240 __func__, (u64)offset, (u64)length);
241
242 start = normalize(offset, marks->im_block_size);
243 end = normalize_up(offset + length, marks->im_block_size);
244 if (_preload_range(marks, start, end - start))
245 goto outerr;
246
247 spin_lock_bh(&marks->im_lock);
248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
249 goto out_unlock;
250 spin_unlock_bh(&marks->im_lock);
251
252 return 0;
253
254out_unlock:
255 spin_unlock_bh(&marks->im_lock);
256outerr:
257 return -ENOMEM;
258}
259
 260/* Marks sectors in [offset, offset+length) as having been written to disk.
261 * All lengths should be block aligned.
262 */
263static int mark_written_sectors(struct pnfs_inval_markings *marks,
264 sector_t offset, sector_t length)
265{
266 int status;
267
268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
269 (u64)offset, (u64)length);
270 spin_lock_bh(&marks->im_lock);
271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
272 spin_unlock_bh(&marks->im_lock);
273 return status;
274}
275
276static void print_short_extent(struct pnfs_block_short_extent *be)
277{
278 dprintk("PRINT SHORT EXTENT extent %p\n", be);
279 if (be) {
280 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
281 dprintk(" be_length %llu\n", (u64)be->bse_length);
282 }
283}
284
285static void print_clist(struct list_head *list, unsigned int count)
286{
287 struct pnfs_block_short_extent *be;
288 unsigned int i = 0;
289
290 ifdebug(FACILITY) {
291 printk(KERN_DEBUG "****************\n");
292 printk(KERN_DEBUG "Extent list looks like:\n");
293 list_for_each_entry(be, list, bse_node) {
294 i++;
295 print_short_extent(be);
296 }
297 if (i != count)
298 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
299 printk(KERN_DEBUG "****************\n");
300 }
301}
302
 303/* Note: In theory, we should do more checking that devids match between
304 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
305 */
306/* Note this is very similar to bl_add_merge_extent */
307static void add_to_commitlist(struct pnfs_block_layout *bl,
308 struct pnfs_block_short_extent *new)
309{
310 struct list_head *clist = &bl->bl_commit;
311 struct pnfs_block_short_extent *old, *save;
312 sector_t end = new->bse_f_offset + new->bse_length;
313
314 dprintk("%s enter\n", __func__);
315 print_short_extent(new);
316 print_clist(clist, bl->bl_count);
317 bl->bl_count++;
318 /* Scan for proper place to insert, extending new to the left
319 * as much as possible.
320 */
321 list_for_each_entry_safe(old, save, clist, bse_node) {
322 if (new->bse_f_offset < old->bse_f_offset)
323 break;
324 if (end <= old->bse_f_offset + old->bse_length) {
325 /* Range is already in list */
326 bl->bl_count--;
327 kfree(new);
328 return;
329 } else if (new->bse_f_offset <=
330 old->bse_f_offset + old->bse_length) {
331 /* new overlaps or abuts existing be */
332 if (new->bse_mdev == old->bse_mdev) {
333 /* extend new to fully replace old */
334 new->bse_length += new->bse_f_offset -
335 old->bse_f_offset;
336 new->bse_f_offset = old->bse_f_offset;
337 list_del(&old->bse_node);
338 bl->bl_count--;
339 kfree(old);
340 }
341 }
342 }
343 /* Note that if we never hit the above break, old will not point to a
344 * valid extent. However, in that case &old->bse_node==list.
345 */
346 list_add_tail(&new->bse_node, &old->bse_node);
347 /* Scan forward for overlaps. If we find any, extend new and
348 * remove the overlapped extent.
349 */
350 old = list_prepare_entry(new, clist, bse_node);
351 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
352 if (end < old->bse_f_offset)
353 break;
354 /* new overlaps or abuts old */
355 if (new->bse_mdev == old->bse_mdev) {
356 if (end < old->bse_f_offset + old->bse_length) {
357 /* extend new to fully cover old */
358 end = old->bse_f_offset + old->bse_length;
359 new->bse_length = end - new->bse_f_offset;
360 }
361 list_del(&old->bse_node);
362 bl->bl_count--;
363 kfree(old);
364 }
365 }
366 dprintk("%s: after merging\n", __func__);
367 print_clist(clist, bl->bl_count);
368}
369
370/* Note the range described by offset, length is guaranteed to be contained
371 * within be.
 372 * new will be freed, either by this function or by add_to_commitlist if it
 373 * decides not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
374 */
375int bl_mark_for_commit(struct pnfs_block_extent *be,
376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
378{
379 sector_t new_end, end = offset + length;
380 struct pnfs_block_layout *bl = container_of(be->be_inval,
381 struct pnfs_block_layout,
382 bl_inval);
383
384 mark_written_sectors(be->be_inval, offset, length);
385 /* We want to add the range to commit list, but it must be
386 * block-normalized, and verified that the normalized range has
387 * been entirely written to disk.
388 */
389 new->bse_f_offset = offset;
390 offset = normalize(offset, bl->bl_blocksize);
391 if (offset < new->bse_f_offset) {
392 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
393 new->bse_f_offset = offset;
394 else
395 new->bse_f_offset = offset + bl->bl_blocksize;
396 }
397 new_end = normalize_up(end, bl->bl_blocksize);
398 if (end < new_end) {
399 if (is_range_written(be->be_inval, end, new_end))
400 end = new_end;
401 else
402 end = new_end - bl->bl_blocksize;
403 }
404 if (end <= new->bse_f_offset) {
405 kfree(new);
406 return 0;
407 }
408 new->bse_length = end - new->bse_f_offset;
409 new->bse_devid = be->be_devid;
410 new->bse_mdev = be->be_mdev;
411
412 spin_lock(&bl->bl_ext_lock);
413 add_to_commitlist(bl, new);
414 spin_unlock(&bl->bl_ext_lock);
415 return 0;
416}
417
418static void print_bl_extent(struct pnfs_block_extent *be)
419{
420 dprintk("PRINT EXTENT extent %p\n", be);
421 if (be) {
422 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
423 dprintk(" be_length %llu\n", (u64)be->be_length);
424 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
425 dprintk(" be_state %d\n", be->be_state);
426 }
427}
428
429static void
430destroy_extent(struct kref *kref)
431{
432 struct pnfs_block_extent *be;
433
434 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
435 dprintk("%s be=%p\n", __func__, be);
436 kfree(be);
437}
438
439void
440bl_put_extent(struct pnfs_block_extent *be)
441{
442 if (be) {
443 dprintk("%s enter %p (%i)\n", __func__, be,
444 atomic_read(&be->be_refcnt.refcount));
445 kref_put(&be->be_refcnt, destroy_extent);
446 }
447}
448
449struct pnfs_block_extent *bl_alloc_extent(void)
450{
451 struct pnfs_block_extent *be;
452
453 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
454 if (!be)
455 return NULL;
456 INIT_LIST_HEAD(&be->be_node);
457 kref_init(&be->be_refcnt);
458 be->be_inval = NULL;
459 return be;
460}
461
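bl_alloc_extent() and bl_put_extent() above follow the usual kref lifecycle: kref_init() establishes the base reference (held by the extent list), lookups add references with kref_get(), and the release callback runs only when the final kref_put() drops the count to zero. The same idiom in a self-contained sketch:

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct obj {
		struct kref ref;
	};

	static void obj_release(struct kref *kref)
	{
		kfree(container_of(kref, struct obj, ref));
	}

	static struct obj *obj_alloc(void)
	{
		struct obj *o = kmalloc(sizeof(*o), GFP_NOFS);

		if (o)
			kref_init(&o->ref);	/* refcount starts at 1 */
		return o;
	}

	static void obj_put(struct obj *o)
	{
		if (o)
			kref_put(&o->ref, obj_release);	/* frees at zero */
	}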
462static void print_elist(struct list_head *list)
463{
464 struct pnfs_block_extent *be;
465 dprintk("****************\n");
466 dprintk("Extent list looks like:\n");
467 list_for_each_entry(be, list, be_node) {
468 print_bl_extent(be);
469 }
470 dprintk("****************\n");
471}
472
473static inline int
474extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
475{
476 /* Note this assumes new->be_f_offset >= old->be_f_offset */
477 return (new->be_state == old->be_state) &&
478 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
479 ((new->be_v_offset - old->be_v_offset ==
480 new->be_f_offset - old->be_f_offset) &&
481 new->be_mdev == old->be_mdev));
482}
483
484/* Adds new to appropriate list in bl, modifying new and removing existing
485 * extents as appropriate to deal with overlaps.
486 *
487 * See bl_find_get_extent for list constraints.
488 *
 489 * Refcount on new is already set. If we end up not using it, or error out,
 490 * we need to put the reference.
491 *
492 * bl->bl_ext_lock is held by caller.
493 */
494int
495bl_add_merge_extent(struct pnfs_block_layout *bl,
496 struct pnfs_block_extent *new)
497{
498 struct pnfs_block_extent *be, *tmp;
499 sector_t end = new->be_f_offset + new->be_length;
500 struct list_head *list;
501
502 dprintk("%s enter with be=%p\n", __func__, new);
503 print_bl_extent(new);
504 list = &bl->bl_extents[bl_choose_list(new->be_state)];
505 print_elist(list);
506
507 /* Scan for proper place to insert, extending new to the left
508 * as much as possible.
509 */
510 list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
511 if (new->be_f_offset >= be->be_f_offset + be->be_length)
512 break;
513 if (new->be_f_offset >= be->be_f_offset) {
514 if (end <= be->be_f_offset + be->be_length) {
515 /* new is a subset of existing be*/
516 if (extents_consistent(be, new)) {
517 dprintk("%s: new is subset, ignoring\n",
518 __func__);
519 bl_put_extent(new);
520 return 0;
521 } else {
522 goto out_err;
523 }
524 } else {
525 /* |<-- be -->|
526 * |<-- new -->| */
527 if (extents_consistent(be, new)) {
528 /* extend new to fully replace be */
529 new->be_length += new->be_f_offset -
530 be->be_f_offset;
531 new->be_f_offset = be->be_f_offset;
532 new->be_v_offset = be->be_v_offset;
533 dprintk("%s: removing %p\n", __func__, be);
534 list_del(&be->be_node);
535 bl_put_extent(be);
536 } else {
537 goto out_err;
538 }
539 }
540 } else if (end >= be->be_f_offset + be->be_length) {
 541 /* new extent overlaps existing be */
542 if (extents_consistent(be, new)) {
543 /* extend new to fully replace be */
544 dprintk("%s: removing %p\n", __func__, be);
545 list_del(&be->be_node);
546 bl_put_extent(be);
547 } else {
548 goto out_err;
549 }
550 } else if (end > be->be_f_offset) {
551 /* |<-- be -->|
552 *|<-- new -->| */
553 if (extents_consistent(new, be)) {
554 /* extend new to fully replace be */
555 new->be_length += be->be_f_offset + be->be_length -
556 new->be_f_offset - new->be_length;
557 dprintk("%s: removing %p\n", __func__, be);
558 list_del(&be->be_node);
559 bl_put_extent(be);
560 } else {
561 goto out_err;
562 }
563 }
564 }
565 /* Note that if we never hit the above break, be will not point to a
566 * valid extent. However, in that case &be->be_node==list.
567 */
568 list_add(&new->be_node, &be->be_node);
569 dprintk("%s: inserting new\n", __func__);
570 print_elist(list);
571 /* FIXME - The per-list consistency checks have all been done,
572 * should now check cross-list consistency.
573 */
574 return 0;
575
576 out_err:
577 bl_put_extent(new);
578 return -EIO;
579}
580
581/* Returns extent, or NULL. If a second READ extent exists, it is returned
582 * in cow_read, if given.
583 *
 584 * The extents are kept in two separate ordered lists, one for READ and NONE,
585 * one for READWRITE and INVALID. Within each list, we assume:
586 * 1. Extents are ordered by file offset.
 587 * 2. For any given isect, there is at most one extent that matches.
588 */
589struct pnfs_block_extent *
590bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
591 struct pnfs_block_extent **cow_read)
592{
593 struct pnfs_block_extent *be, *cow, *ret;
594 int i;
595
596 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
597 cow = ret = NULL;
598 spin_lock(&bl->bl_ext_lock);
599 for (i = 0; i < EXTENT_LISTS; i++) {
600 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
601 if (isect >= be->be_f_offset + be->be_length)
602 break;
603 if (isect >= be->be_f_offset) {
604 /* We have found an extent */
605 dprintk("%s Get %p (%i)\n", __func__, be,
606 atomic_read(&be->be_refcnt.refcount));
607 kref_get(&be->be_refcnt);
608 if (!ret)
609 ret = be;
610 else if (be->be_state != PNFS_BLOCK_READ_DATA)
611 bl_put_extent(be);
612 else
613 cow = be;
614 break;
615 }
616 }
617 if (ret &&
618 (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
619 break;
620 }
621 spin_unlock(&bl->bl_ext_lock);
622 if (cow_read)
623 *cow_read = cow;
624 print_bl_extent(ret);
625 return ret;
626}
627
628/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
629static struct pnfs_block_extent *
630bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
631{
632 struct pnfs_block_extent *be, *ret = NULL;
633 int i;
634
635 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
636 for (i = 0; i < EXTENT_LISTS; i++) {
637 if (ret)
638 break;
639 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
640 if (isect >= be->be_f_offset + be->be_length)
641 break;
642 if (isect >= be->be_f_offset) {
643 /* We have found an extent */
644 dprintk("%s Get %p (%i)\n", __func__, be,
645 atomic_read(&be->be_refcnt.refcount));
646 kref_get(&be->be_refcnt);
647 ret = be;
648 break;
649 }
650 }
651 }
652 print_bl_extent(ret);
653 return ret;
654}
655
656int
657encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
658 struct xdr_stream *xdr,
659 const struct nfs4_layoutcommit_args *arg)
660{
661 struct pnfs_block_short_extent *lce, *save;
662 unsigned int count = 0;
663 __be32 *p, *xdr_start;
664
665 dprintk("%s enter\n", __func__);
666 /* BUG - creation of bl_commit is buggy - need to wait for
667 * entire block to be marked WRITTEN before it can be added.
668 */
669 spin_lock(&bl->bl_ext_lock);
670 /* Want to adjust for possible truncate */
671 /* We now want to adjust argument range */
672
673 /* XDR encode the ranges found */
674 xdr_start = xdr_reserve_space(xdr, 8);
675 if (!xdr_start)
676 goto out;
677 list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
678 p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
679 if (!p)
680 break;
681 p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
682 p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
683 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
684 p = xdr_encode_hyper(p, 0LL);
685 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
686 list_move_tail(&lce->bse_node, &bl->bl_committing);
687 bl->bl_count--;
688 count++;
689 }
690 xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
691 xdr_start[1] = cpu_to_be32(count);
692out:
693 spin_unlock(&bl->bl_ext_lock);
694 dprintk("%s found %i ranges\n", __func__, count);
695 return 0;
696}
697
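The encoder above reserves two XDR words (8 bytes) before it knows how much it will emit, then back-patches them once the loop is done. Worked example, assuming NFS4_DEVICEID4_SIZE is 16 bytes: each extent costs 7 words plus 4 words of device ID, i.e. 11 words, so after one extent xdr->p == xdr_start + 2 + 11, and xdr_start[0] = (13 - 1) * 4 = 48 bytes. The "- 1" excludes the length word itself, so the patched-in length covers the count word plus the extent bodies, and xdr_start[1] records the extent count.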
 698/* Helper function to set_to_rw that initializes a new extent */
699static void
700_prep_new_extent(struct pnfs_block_extent *new,
701 struct pnfs_block_extent *orig,
702 sector_t offset, sector_t length, int state)
703{
704 kref_init(&new->be_refcnt);
705 /* don't need to INIT_LIST_HEAD(&new->be_node) */
706 memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
707 new->be_mdev = orig->be_mdev;
708 new->be_f_offset = offset;
709 new->be_length = length;
710 new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
711 new->be_state = state;
712 new->be_inval = orig->be_inval;
713}
714
715/* Tries to merge be with extent in front of it in list.
716 * Frees storage if not used.
717 */
718static struct pnfs_block_extent *
719_front_merge(struct pnfs_block_extent *be, struct list_head *head,
720 struct pnfs_block_extent *storage)
721{
722 struct pnfs_block_extent *prev;
723
724 if (!storage)
725 goto no_merge;
726 if (&be->be_node == head || be->be_node.prev == head)
727 goto no_merge;
728 prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
729 if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
730 !extents_consistent(prev, be))
731 goto no_merge;
732 _prep_new_extent(storage, prev, prev->be_f_offset,
733 prev->be_length + be->be_length, prev->be_state);
734 list_replace(&prev->be_node, &storage->be_node);
735 bl_put_extent(prev);
736 list_del(&be->be_node);
737 bl_put_extent(be);
738 return storage;
739
740 no_merge:
741 kfree(storage);
742 return be;
743}
744
745static u64
746set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
747{
748 u64 rv = offset + length;
749 struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
750 struct pnfs_block_extent *children[3];
751 struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
752 int i = 0, j;
753
754 dprintk("%s(%llu, %llu)\n", __func__, offset, length);
755 /* Create storage for up to three new extents e1, e2, e3 */
756 e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
757 e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
758 e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
759 /* BUG - we are ignoring any failure */
760 if (!e1 || !e2 || !e3)
761 goto out_nosplit;
762
763 spin_lock(&bl->bl_ext_lock);
764 be = bl_find_get_extent_locked(bl, offset);
765 rv = be->be_f_offset + be->be_length;
766 if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
767 spin_unlock(&bl->bl_ext_lock);
768 goto out_nosplit;
769 }
770 /* Add e* to children, bumping e*'s krefs */
771 if (be->be_f_offset != offset) {
772 _prep_new_extent(e1, be, be->be_f_offset,
773 offset - be->be_f_offset,
774 PNFS_BLOCK_INVALID_DATA);
775 children[i++] = e1;
776 print_bl_extent(e1);
777 } else
778 merge1 = e1;
779 _prep_new_extent(e2, be, offset,
780 min(length, be->be_f_offset + be->be_length - offset),
781 PNFS_BLOCK_READWRITE_DATA);
782 children[i++] = e2;
783 print_bl_extent(e2);
784 if (offset + length < be->be_f_offset + be->be_length) {
785 _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
786 be->be_f_offset + be->be_length -
787 offset - length,
788 PNFS_BLOCK_INVALID_DATA);
789 children[i++] = e3;
790 print_bl_extent(e3);
791 } else
792 merge2 = e3;
793
794 /* Remove be from list, and insert the e* */
795 /* We don't get refs on e*, since this list is the base reference
796 * set when init'ed.
797 */
798 if (i < 3)
799 children[i] = NULL;
800 new = children[0];
801 list_replace(&be->be_node, &new->be_node);
802 bl_put_extent(be);
803 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
804 for (j = 1; j < i; j++) {
805 old = new;
806 new = children[j];
807 list_add(&new->be_node, &old->be_node);
808 }
809 if (merge2) {
810 /* This is a HACK, should just create a _back_merge function */
811 new = list_entry(new->be_node.next,
812 struct pnfs_block_extent, be_node);
813 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
814 }
815 spin_unlock(&bl->bl_ext_lock);
816
817 /* Since we removed the base reference above, be is now scheduled for
818 * destruction.
819 */
820 bl_put_extent(be);
821 dprintk("%s returns %llu after split\n", __func__, rv);
822 return rv;
823
824 out_nosplit:
825 kfree(e1);
826 kfree(e2);
827 kfree(e3);
828 dprintk("%s returns %llu without splitting\n", __func__, rv);
829 return rv;
830}
831
832void
833clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
834 const struct nfs4_layoutcommit_args *arg,
835 int status)
836{
837 struct pnfs_block_short_extent *lce, *save;
838
839 dprintk("%s status %d\n", __func__, status);
840 list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
841 if (likely(!status)) {
842 u64 offset = lce->bse_f_offset;
843 u64 end = offset + lce->bse_length;
844
845 do {
846 offset = set_to_rw(bl, offset, end - offset);
847 } while (offset < end);
848 list_del(&lce->bse_node);
849
850 kfree(lce);
851 } else {
852 list_del(&lce->bse_node);
853 spin_lock(&bl->bl_ext_lock);
854 add_to_commitlist(bl, lce);
855 spin_unlock(&bl->bl_ext_lock);
856 }
857 }
858}
859
860int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
861{
862 struct pnfs_block_short_extent *new;
863
864 new = kmalloc(sizeof(*new), GFP_NOFS);
865 if (unlikely(!new))
866 return -ENOMEM;
867
868 spin_lock_bh(&marks->im_lock);
869 list_add(&new->bse_node, &marks->im_extents);
870 spin_unlock_bh(&marks->im_lock);
871
872 return 0;
873}
874
875struct pnfs_block_short_extent *
876bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
877{
878 struct pnfs_block_short_extent *rv = NULL;
879
880 spin_lock_bh(&marks->im_lock);
881 if (!list_empty(&marks->im_extents)) {
882 rv = list_entry((&marks->im_extents)->next,
883 struct pnfs_block_short_extent, bse_node);
884 list_del_init(&rv->bse_node);
885 }
886 spin_unlock_bh(&marks->im_lock);
887
888 return rv;
889}
890
891void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
892{
893 struct pnfs_block_short_extent *se = NULL, *tmp;
894
895 if (num_to_free <= 0)
896 return;
897
898 spin_lock(&marks->im_lock);
899 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
900 list_del(&se->bse_node);
901 kfree(se);
902 if (--num_to_free == 0)
903 break;
904 }
905 spin_unlock(&marks->im_lock);
906
907 BUG_ON(num_to_free > 0);
908}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..8d04bda2bd2e
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,285 @@
1/*
2 * Copyright (c) 2006,2007 The Regents of the University of Michigan.
3 * All rights reserved.
4 *
5 * Andy Adamson <andros@citi.umich.edu>
6 * Fred Isaman <iisaman@umich.edu>
7 *
8 * permission is granted to use, copy, create derivative works and
9 * redistribute this software and such derivative works for any purpose,
10 * so long as the name of the university of michigan is not used in
11 * any advertising or publicity pertaining to the use or distribution
12 * of this software without specific, written prior authorization. if
13 * the above copyright notice or any other identification of the
14 * university of michigan is included in any copy of any portion of
15 * this software, then the disclaimer below must also be included.
16 *
17 * this software is provided as is, without representation from the
18 * university of michigan as to its fitness for any purpose, and without
19 * warranty by the university of michigan of any kind, either express
20 * or implied, including without limitation the implied warranties of
21 * merchantability and fitness for a particular purpose. the regents
22 * of the university of michigan shall not be liable for any damages,
23 * including special, indirect, incidental, or consequential damages,
24 * with respect to any claim arising out or in connection with the use
25 * of the software, even if it has been or is hereafter advised of the
26 * possibility of such damages.
27 */
28
29#include <linux/module.h>
30#include <linux/genhd.h>
31#include <linux/blkdev.h>
32
33#include "blocklayout.h"
34
35#define NFSDBG_FACILITY NFSDBG_PNFS_LD
36
37static void
38nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
39{
40 int i;
41
42 *p++ = cpu_to_be32(1);
43 *p++ = cpu_to_be32(b->type);
44 *p++ = cpu_to_be32(b->simple.nr_sigs);
45 for (i = 0; i < b->simple.nr_sigs; i++) {
46 p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
47 p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
48 b->simple.sigs[i].sig_len);
49 }
50}
51
52dev_t
53bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
54 gfp_t gfp_mask)
55{
56 struct net *net = server->nfs_client->cl_net;
57 struct nfs_net *nn = net_generic(net, nfs_net_id);
58 struct bl_dev_msg *reply = &nn->bl_mount_reply;
59 struct bl_pipe_msg bl_pipe_msg;
60 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
61 struct bl_msg_hdr *bl_msg;
62 DECLARE_WAITQUEUE(wq, current);
63 dev_t dev = 0;
64 int rc;
65
66 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
67
68 bl_pipe_msg.bl_wq = &nn->bl_wq;
69
70 b->simple.len += 4; /* single volume */
71 if (b->simple.len > PAGE_SIZE)
72 return -EIO;
73
74 memset(msg, 0, sizeof(*msg));
75 msg->len = sizeof(*bl_msg) + b->simple.len;
76 msg->data = kzalloc(msg->len, gfp_mask);
77 if (!msg->data)
78 goto out;
79
80 bl_msg = msg->data;
 81	bl_msg->type = BL_DEVICE_MOUNT;
82 bl_msg->totallen = b->simple.len;
83 nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
84
85 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
86 add_wait_queue(&nn->bl_wq, &wq);
87 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
88 if (rc < 0) {
89 remove_wait_queue(&nn->bl_wq, &wq);
90 goto out;
91 }
92
93 set_current_state(TASK_UNINTERRUPTIBLE);
94 schedule();
95 __set_current_state(TASK_RUNNING);
96 remove_wait_queue(&nn->bl_wq, &wq);
97
98 if (reply->status != BL_DEVICE_REQUEST_PROC) {
99 printk(KERN_WARNING "%s failed to decode device: %d\n",
100 __func__, reply->status);
101 goto out;
102 }
103
104 dev = MKDEV(reply->major, reply->minor);
105out:
106 kfree(msg->data);
107 return dev;
108}
109
110static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
111 size_t mlen)
112{
113 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
114 nfs_net_id);
115
116 if (mlen != sizeof (struct bl_dev_msg))
117 return -EINVAL;
118
119 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
120 return -EFAULT;
121
122 wake_up(&nn->bl_wq);
123
124 return mlen;
125}
126
127static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
128{
129 struct bl_pipe_msg *bl_pipe_msg =
130 container_of(msg, struct bl_pipe_msg, msg);
131
132 if (msg->errno >= 0)
133 return;
134 wake_up(bl_pipe_msg->bl_wq);
135}
136
137static const struct rpc_pipe_ops bl_upcall_ops = {
138 .upcall = rpc_pipe_generic_upcall,
139 .downcall = bl_pipe_downcall,
140 .destroy_msg = bl_pipe_destroy_msg,
141};
142
143static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
144 struct rpc_pipe *pipe)
145{
146 struct dentry *dir, *dentry;
147
148 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
149 if (dir == NULL)
150 return ERR_PTR(-ENOENT);
151 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
152 dput(dir);
153 return dentry;
154}
155
156static void nfs4blocklayout_unregister_sb(struct super_block *sb,
157 struct rpc_pipe *pipe)
158{
159 if (pipe->dentry)
160 rpc_unlink(pipe->dentry);
161}
162
163static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
164 void *ptr)
165{
166 struct super_block *sb = ptr;
167 struct net *net = sb->s_fs_info;
168 struct nfs_net *nn = net_generic(net, nfs_net_id);
169 struct dentry *dentry;
170 int ret = 0;
171
172 if (!try_module_get(THIS_MODULE))
173 return 0;
174
175 if (nn->bl_device_pipe == NULL) {
176 module_put(THIS_MODULE);
177 return 0;
178 }
179
180 switch (event) {
181 case RPC_PIPEFS_MOUNT:
182 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
183 if (IS_ERR(dentry)) {
184 ret = PTR_ERR(dentry);
185 break;
186 }
187 nn->bl_device_pipe->dentry = dentry;
188 break;
189 case RPC_PIPEFS_UMOUNT:
190 if (nn->bl_device_pipe->dentry)
191 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
192 break;
193 default:
194 ret = -ENOTSUPP;
195 break;
196 }
197 module_put(THIS_MODULE);
198 return ret;
199}
200
201static struct notifier_block nfs4blocklayout_block = {
202 .notifier_call = rpc_pipefs_event,
203};
204
205static struct dentry *nfs4blocklayout_register_net(struct net *net,
206 struct rpc_pipe *pipe)
207{
208 struct super_block *pipefs_sb;
209 struct dentry *dentry;
210
211 pipefs_sb = rpc_get_sb_net(net);
212 if (!pipefs_sb)
213 return NULL;
214 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
215 rpc_put_sb_net(net);
216 return dentry;
217}
218
219static void nfs4blocklayout_unregister_net(struct net *net,
220 struct rpc_pipe *pipe)
221{
222 struct super_block *pipefs_sb;
223
224 pipefs_sb = rpc_get_sb_net(net);
225 if (pipefs_sb) {
226 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
227 rpc_put_sb_net(net);
228 }
229}
230
231static int nfs4blocklayout_net_init(struct net *net)
232{
233 struct nfs_net *nn = net_generic(net, nfs_net_id);
234 struct dentry *dentry;
235
236 init_waitqueue_head(&nn->bl_wq);
237 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
238 if (IS_ERR(nn->bl_device_pipe))
239 return PTR_ERR(nn->bl_device_pipe);
240 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
241 if (IS_ERR(dentry)) {
242 rpc_destroy_pipe_data(nn->bl_device_pipe);
243 return PTR_ERR(dentry);
244 }
245 nn->bl_device_pipe->dentry = dentry;
246 return 0;
247}
248
249static void nfs4blocklayout_net_exit(struct net *net)
250{
251 struct nfs_net *nn = net_generic(net, nfs_net_id);
252
253 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
254 rpc_destroy_pipe_data(nn->bl_device_pipe);
255 nn->bl_device_pipe = NULL;
256}
257
258static struct pernet_operations nfs4blocklayout_net_ops = {
259 .init = nfs4blocklayout_net_init,
260 .exit = nfs4blocklayout_net_exit,
261};
262
263int __init bl_init_pipefs(void)
264{
265 int ret;
266
267 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
268 if (ret)
269 goto out;
270 ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
271 if (ret)
272 goto out_unregister_notifier;
273 return 0;
274
275out_unregister_notifier:
276 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
277out:
278 return ret;
279}
280
281void __exit bl_cleanup_pipefs(void)
282{
283 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
284 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
285}
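bl_init_pipefs() and bl_cleanup_pipefs() above use the paired-registration idiom common to module init paths: each successful step gains an unwind label, and a failure unwinds the steps already completed. A minimal sketch with hypothetical register_a()/register_b():

	static int init_many(void)
	{
		int ret;

		ret = register_a();
		if (ret)
			goto out;
		ret = register_b();
		if (ret)
			goto out_unregister_a;
		return 0;

	out_unregister_a:
		unregister_a();
	out:
		return ret;
	}

	static void exit_many(void)
	{
		unregister_b();
		unregister_a();
	}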
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 54de482143cc..b8fb3a4ef649 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -235,7 +235,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
235 235
236 cb_info->serv = serv; 236 cb_info->serv = serv;
237 cb_info->rqst = rqstp; 237 cb_info->rqst = rqstp;
238 cb_info->task = kthread_run(callback_svc, cb_info->rqst, 238 cb_info->task = kthread_create(callback_svc, cb_info->rqst,
239 "nfsv4.%u-svc", minorversion); 239 "nfsv4.%u-svc", minorversion);
240 if (IS_ERR(cb_info->task)) { 240 if (IS_ERR(cb_info->task)) {
241 ret = PTR_ERR(cb_info->task); 241 ret = PTR_ERR(cb_info->task);
@@ -244,6 +244,8 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
244 cb_info->task = NULL; 244 cb_info->task = NULL;
245 return ret; 245 return ret;
246 } 246 }
247 rqstp->rq_task = cb_info->task;
248 wake_up_process(cb_info->task);
247 dprintk("nfs_callback_up: service started\n"); 249 dprintk("nfs_callback_up: service started\n");
248 return 0; 250 return 0;
249} 251}
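The callback.c hunk above replaces kthread_run() with kthread_create() plus an explicit wake_up_process(), so that rqstp->rq_task is published before the service thread executes for the first time. The general shape of the pattern (a sketch, not the exact kernel context):

	#include <linux/err.h>
	#include <linux/kthread.h>

	struct worker {
		struct task_struct *task;
	};

	static int start_worker(struct worker *w, int (*fn)(void *), void *data)
	{
		struct task_struct *t;

		t = kthread_create(fn, data, "worker");
		if (IS_ERR(t))
			return PTR_ERR(t);
		w->task = t;		/* publish before the thread first runs */
		wake_up_process(t);	/* only now can fn() observe w->task */
		return 0;
	}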
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
171 goto out; 171 goto out;
172 172
173 ino = lo->plh_inode; 173 ino = lo->plh_inode;
174
175 spin_lock(&ino->i_lock);
176 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
177 spin_unlock(&ino->i_lock);
178
179 pnfs_layoutcommit_inode(ino, false);
180
174 spin_lock(&ino->i_lock); 181 spin_lock(&ino->i_lock);
175 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 182 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
176 pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, 183 pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
177 &args->cbl_range)) 184 &args->cbl_range)) {
178 rv = NFS4ERR_DELAY; 185 rv = NFS4ERR_DELAY;
179 else 186 goto unlock;
180 rv = NFS4ERR_NOMATCHING_LAYOUT; 187 }
181 pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); 188
189 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
190 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
191 &args->cbl_range);
192 }
193unlock:
182 spin_unlock(&ino->i_lock); 194 spin_unlock(&ino->i_lock);
183 pnfs_free_lseg_list(&free_me_list); 195 pnfs_free_lseg_list(&free_me_list);
184 pnfs_put_layout_hdr(lo); 196 pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
277 } 289 }
278 290
279 found: 291 found:
280 if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
281 dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
282 "deleting instead\n", __func__);
283 nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); 292 nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
284 } 293 }
285 294
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 6a4f3666e273..f9f4845db989 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
1252 * set up the iterator to start reading from the server list and return the first item 1252 * set up the iterator to start reading from the server list and return the first item
1253 */ 1253 */
1254static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) 1254static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
1255 __acquires(&nn->nfs_client_lock)
1255{ 1256{
1256 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 1257 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
1257 1258
@@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
1274 * clean up after reading from the transports list 1275 * clean up after reading from the transports list
1275 */ 1276 */
1276static void nfs_server_list_stop(struct seq_file *p, void *v) 1277static void nfs_server_list_stop(struct seq_file *p, void *v)
1278 __releases(&nn->nfs_client_lock)
1277{ 1279{
1278 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); 1280 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
1279 1281
@@ -1318,7 +1320,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
1318 */ 1320 */
1319static int nfs_volume_list_open(struct inode *inode, struct file *file) 1321static int nfs_volume_list_open(struct inode *inode, struct file *file)
1320{ 1322{
1321 return seq_open_net(inode, file, &nfs_server_list_ops, 1323 return seq_open_net(inode, file, &nfs_volume_list_ops,
1322 sizeof(struct seq_net_private)); 1324 sizeof(struct seq_net_private));
1323} 1325}
1324 1326
@@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
1326 * set up the iterator to start reading from the volume list and return the first item 1328 * set up the iterator to start reading from the volume list and return the first item
1327 */ 1329 */
1328static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) 1330static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
1331 __acquires(&nn->nfs_client_lock)
1329{ 1332{
1330 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 1333 struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
1331 1334
@@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
1348 * clean up after reading from the transports list 1351 * clean up after reading from the transports list
1349 */ 1352 */
1350static void nfs_volume_list_stop(struct seq_file *p, void *v) 1353static void nfs_volume_list_stop(struct seq_file *p, void *v)
1354 __releases(&nn->nfs_client_lock)
1351{ 1355{
1352 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); 1356 struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
1353 1357
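The __acquires()/__releases() markers added above are sparse annotations: they expand to nothing in a normal build, but let sparse verify lock balance across the seq_file ->start()/->stop() pair, where the lock is deliberately taken in one function and released in another. A minimal sketch of the pairing:

	#include <linux/seq_file.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(demo_lock);

	static void *demo_start(struct seq_file *m, loff_t *pos)
		__acquires(&demo_lock)
	{
		spin_lock(&demo_lock);
		return NULL;		/* would return the first record */
	}

	static void demo_stop(struct seq_file *m, void *v)
		__releases(&demo_lock)
	{
		spin_unlock(&demo_lock);
	}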
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 65ef6e00deee..dda4b8667c02 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
178 return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 178 return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
179} 179}
180 180
181#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
182/* 181/*
183 * nfs_direct_cmp_commit_data_verf - compare verifier for commit data 182 * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
184 * @dreq - direct request possibly spanning multiple servers 183 * @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
197 WARN_ON_ONCE(verfp->committed < 0); 196 WARN_ON_ONCE(verfp->committed < 0);
198 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 197 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
199} 198}
200#endif
201 199
202/** 200/**
203 * nfs_direct_IO - NFS address space operation for direct I/O 201 * nfs_direct_IO - NFS address space operation for direct I/O
@@ -576,7 +574,6 @@ out:
576 return result; 574 return result;
577} 575}
578 576
579#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
580static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 577static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
581{ 578{
582 struct nfs_pageio_descriptor desc; 579 struct nfs_pageio_descriptor desc;
@@ -700,17 +697,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
700 schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ 697 schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
701} 698}
702 699
703#else
704static void nfs_direct_write_schedule_work(struct work_struct *work)
705{
706}
707
708static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
709{
710 nfs_direct_complete(dreq, true);
711}
712#endif
713
714static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) 700static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
715{ 701{
716 struct nfs_direct_req *dreq = hdr->dreq; 702 struct nfs_direct_req *dreq = hdr->dreq;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 524dd80d1898..6920127c5eb7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#include "nfstrace.h" 41#include "nfstrace.h"
41 42
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
327 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); 328 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
328 unsigned int end = offset + len; 329 unsigned int end = offset + len;
329 330
331 if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
332 if (!PageUptodate(page))
333 return 1;
334 return 0;
335 }
336
330 if ((file->f_mode & FMODE_READ) && /* open for read? */ 337 if ((file->f_mode & FMODE_READ) && /* open for read? */
331 !PageUptodate(page) && /* Uptodate? */ 338 !PageUptodate(page) && /* Uptodate? */
332 !PagePrivate(page) && /* i/o request already? */ 339 !PagePrivate(page) && /* i/o request already? */
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
468 475
469 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
470 477
471 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not 478 /* Always try to initiate a 'commit' if relevant, but only
472 * doing this memory reclaim for a fs-related allocation. 479 * wait for it if __GFP_WAIT is set. Even then, only wait 1
480 * second and only if the 'bdi' is not congested.
481 * Waiting indefinitely can cause deadlocks when the NFS
482 * server is on this machine, when a new TCP connection is
483 * needed and in other rare cases. There is no particular
484 * need to wait extensively here. A short wait has the
485 * benefit that someone else can worry about the freezer.
473 */ 486 */
474 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && 487 if (mapping) {
475 !(current->flags & PF_FSTRANS)) { 488 struct nfs_server *nfss = NFS_SERVER(mapping->host);
476 int how = FLUSH_SYNC; 489 nfs_commit_inode(mapping->host, 0);
477 490 if ((gfp & __GFP_WAIT) &&
478 /* Don't let kswapd deadlock waiting for OOM RPC calls */ 491 !bdi_write_congested(&nfss->backing_dev_info)) {
479 if (current_is_kswapd()) 492 wait_on_page_bit_killable_timeout(page, PG_private,
480 how = 0; 493 HZ);
481 nfs_commit_inode(mapping->host, how); 494 if (PagePrivate(page))
495 set_bdi_congested(&nfss->backing_dev_info,
496 BLK_RW_ASYNC);
497 }
482 } 498 }
483 /* If PagePrivate() is set, then the page is not freeable */ 499 /* If PagePrivate() is set, then the page is not freeable */
484 if (PagePrivate(page)) 500 if (PagePrivate(page))
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
539static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, 555static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
540 sector_t *span) 556 sector_t *span)
541{ 557{
558 int ret;
559 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
560
542 *span = sis->pages; 561 *span = sis->pages;
543 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); 562
563 rcu_read_lock();
564 ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
565 rcu_read_unlock();
566
567 return ret;
544} 568}
545 569
546static void nfs_swap_deactivate(struct file *file) 570static void nfs_swap_deactivate(struct file *file)
547{ 571{
548 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); 572 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
573
574 rcu_read_lock();
575 xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
576 rcu_read_unlock();
549} 577}
550#endif 578#endif
551 579
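The swap hooks above stop dereferencing clnt->cl_xprt directly: the transport pointer is RCU-managed, so it has to be sampled with rcu_dereference() inside an rcu_read_lock() section and must not be used once that section ends. The access pattern in isolation (xprt_op() is a hypothetical stand-in):

	rcu_read_lock();
	ret = xprt_op(rcu_dereference(clnt->cl_xprt));	/* pointer valid only here */
	rcu_read_unlock();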
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 90978075f730..abc5056999d6 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
265{ 265{
266 266
267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || 267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
268 hdr->res.verf->committed == NFS_FILE_SYNC) 268 hdr->res.verf->committed != NFS_DATA_SYNC)
269 return; 269 return;
270 270
271 pnfs_set_layoutcommit(hdr); 271 pnfs_set_layoutcommit(hdr);
@@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
403 return -EAGAIN; 403 return -EAGAIN;
404 } 404 }
405 405
406 if (data->verf.committed == NFS_UNSTABLE)
407 pnfs_commit_set_layoutcommit(data);
408
406 return 0; 409 return 0;
407} 410}
408 411
@@ -646,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
646 } 649 }
647 650
648 /* find and reference the deviceid */ 651 /* find and reference the deviceid */
649 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 652 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
650 NFS_SERVER(lo->plh_inode)->nfs_client, id); 653 lo->plh_lc_cred, gfp_flags);
651 if (d == NULL) { 654 if (d == NULL)
652 dsaddr = filelayout_get_device_info(lo->plh_inode, id, 655 goto out;
653 lo->plh_lc_cred, gfp_flags); 656
654 if (dsaddr == NULL) 657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
655 goto out;
656 } else
657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
658 /* Found deviceid is unavailable */ 658 /* Found deviceid is unavailable */
659 if (filelayout_test_devid_unavailable(&dsaddr->id_node)) 659 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
660 goto out_put; 660 goto out_put;
661 661
662 fl->dsaddr = dsaddr; 662 fl->dsaddr = dsaddr;
663 663
@@ -1368,6 +1368,17 @@ out:
1368 cinfo->ds->ncommitting = 0; 1368 cinfo->ds->ncommitting = 0;
1369 return PNFS_ATTEMPTED; 1369 return PNFS_ATTEMPTED;
1370} 1370}
1371static struct nfs4_deviceid_node *
1372filelayout_alloc_deviceid_node(struct nfs_server *server,
1373 struct pnfs_device *pdev, gfp_t gfp_flags)
1374{
1375 struct nfs4_file_layout_dsaddr *dsaddr;
1376
1377 dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
1378 if (!dsaddr)
1379 return NULL;
1380 return &dsaddr->id_node;
1381}
1371 1382
1372static void 1383static void
1373filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) 1384filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1420,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1420 .commit_pagelist = filelayout_commit_pagelist, 1431 .commit_pagelist = filelayout_commit_pagelist,
1421 .read_pagelist = filelayout_read_pagelist, 1432 .read_pagelist = filelayout_read_pagelist,
1422 .write_pagelist = filelayout_write_pagelist, 1433 .write_pagelist = filelayout_write_pagelist,
1434 .alloc_deviceid_node = filelayout_alloc_deviceid_node,
1423 .free_deviceid_node = filelayout_free_deveiceid_node, 1435 .free_deviceid_node = filelayout_free_deveiceid_node,
1424}; 1436};
1425 1437
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
149 u32 ds_idx); 149 u32 ds_idx);
150
151extern struct nfs4_file_layout_dsaddr *
152nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
153 struct pnfs_device *pdev, gfp_t gfp_flags);
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 154extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 155extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
155 156
156#endif /* FS_NFS_NFS4FILELAYOUT_H */ 157#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 8540516f4d71..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
484} 484}
485 485
486/* Decode opaque device data and return the result */ 486/* Decode opaque device data and return the result */
487static struct nfs4_file_layout_dsaddr* 487struct nfs4_file_layout_dsaddr *
488decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) 488nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
489 gfp_t gfp_flags)
489{ 490{
490 int i; 491 int i;
491 u32 cnt, num; 492 u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
570 dsaddr->stripe_indices = stripe_indices; 571 dsaddr->stripe_indices = stripe_indices;
571 stripe_indices = NULL; 572 stripe_indices = NULL;
572 dsaddr->ds_num = num; 573 dsaddr->ds_num = num;
573 nfs4_init_deviceid_node(&dsaddr->id_node, 574 nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
574 NFS_SERVER(ino)->pnfs_curr_ld,
575 NFS_SERVER(ino)->nfs_client,
576 &pdev->dev_id);
577 575
578 INIT_LIST_HEAD(&dsaddrs); 576 INIT_LIST_HEAD(&dsaddrs);
579 577
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
587 585
588 mp_count = be32_to_cpup(p); /* multipath count */ 586 mp_count = be32_to_cpup(p); /* multipath count */
589 for (j = 0; j < mp_count; j++) { 587 for (j = 0; j < mp_count; j++) {
590 da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, 588 da = decode_ds_addr(server->nfs_client->cl_net,
591 &stream, gfp_flags); 589 &stream, gfp_flags);
592 if (da) 590 if (da)
593 list_add_tail(&da->da_node, &dsaddrs); 591 list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
637 return NULL; 635 return NULL;
638} 636}
639 637
640/*
641 * Decode the opaque device specified in 'dev' and add it to the cache of
642 * available devices.
643 */
644static struct nfs4_file_layout_dsaddr *
645decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
646{
647 struct nfs4_deviceid_node *d;
648 struct nfs4_file_layout_dsaddr *n, *new;
649
650 new = decode_device(inode, dev, gfp_flags);
651 if (!new) {
652 printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
653 __func__);
654 return NULL;
655 }
656
657 d = nfs4_insert_deviceid_node(&new->id_node);
658 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
659 if (n != new) {
660 nfs4_fl_free_deviceid(new);
661 return n;
662 }
663
664 return new;
665}
666
667/*
668 * Retrieve the information for dev_id, add it to the list
669 * of available devices, and return it.
670 */
671struct nfs4_file_layout_dsaddr *
672filelayout_get_device_info(struct inode *inode,
673 struct nfs4_deviceid *dev_id,
674 struct rpc_cred *cred,
675 gfp_t gfp_flags)
676{
677 struct pnfs_device *pdev = NULL;
678 u32 max_resp_sz;
679 int max_pages;
680 struct page **pages = NULL;
681 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
682 int rc, i;
683 struct nfs_server *server = NFS_SERVER(inode);
684
685 /*
686 * Use the session max response size as the basis for setting
687 * GETDEVICEINFO's maxcount
688 */
689 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
690 max_pages = nfs_page_array_len(0, max_resp_sz);
691 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
692 __func__, inode, max_resp_sz, max_pages);
693
694 pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
695 if (pdev == NULL)
696 return NULL;
697
698 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
699 if (pages == NULL) {
700 kfree(pdev);
701 return NULL;
702 }
703 for (i = 0; i < max_pages; i++) {
704 pages[i] = alloc_page(gfp_flags);
705 if (!pages[i])
706 goto out_free;
707 }
708
709 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
710 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
711 pdev->pages = pages;
712 pdev->pgbase = 0;
713 pdev->pglen = max_resp_sz;
714 pdev->mincount = 0;
715 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
716
717 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
718 dprintk("%s getdevice info returns %d\n", __func__, rc);
719 if (rc)
720 goto out_free;
721
722 /*
723 * Found new device, need to decode it and then add it to the
724 * list of known devices for this mountpoint.
725 */
726 dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
727out_free:
728 for (i = 0; i < max_pages; i++)
729 __free_page(pages[i]);
730 kfree(pages);
731 kfree(pdev);
732 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
733 return dsaddr;
734}
735
736void 638void
737nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 639nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
738{ 640{
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b08..777b055063f6 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
74 struct nfs_server_key *key = buffer; 74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key); 75 uint16_t len = sizeof(struct nfs_server_key);
76 76
77 memset(key, 0, len);
77 key->nfsversion = clp->rpc_ops->version; 78 key->nfsversion = clp->rpc_ops->version;
78 key->family = clp->cl_addr.ss_family; 79 key->family = clp->cl_addr.ss_family;
79 80
80 memset(key, 0, len);
81
82 switch (clp->cl_addr.ss_family) { 81 switch (clp->cl_addr.ss_family) {
83 case AF_INET: 82 case AF_INET:
84 key->port = sin->sin_port; 83 key->port = sin->sin_port;
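The fscache-index.c hunk above is a pure ordering fix: the key buffer was zeroed after its first two fields had been filled in, silently wiping them. A tiny userspace-compilable sketch of the bug and the fix (demo_key is hypothetical):

	#include <string.h>

	struct demo_key { int version; int family; };

	static void fill_bad(struct demo_key *k)
	{
		k->version = 4;
		memset(k, 0, sizeof(*k));	/* oops: version is zeroed again */
	}

	static void fill_good(struct demo_key *k)
	{
		memset(k, 0, sizeof(*k));	/* zero first ... */
		k->version = 4;			/* ... then fill in fields */
	}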
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 577a36f0a510..141c9f4a40de 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
505 attr->ia_valid &= ~ATTR_MODE; 505 attr->ia_valid &= ~ATTR_MODE;
506 506
507 if (attr->ia_valid & ATTR_SIZE) { 507 if (attr->ia_valid & ATTR_SIZE) {
508 if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) 508 BUG_ON(!S_ISREG(inode->i_mode));
509
510 if (attr->ia_size == i_size_read(inode))
509 attr->ia_valid &= ~ATTR_SIZE; 511 attr->ia_valid &= ~ATTR_SIZE;
510 } 512 }
511 513
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9056622d2230..14ae6f20a172 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
 int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
 #endif
 
-/* nfs3client.c */
-#if IS_ENABLED(CONFIG_NFS_V3)
-struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
-struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
-				     struct nfs_fattr *, rpc_authflavor_t);
-#endif
-
 /* callback_xdr.c */
 extern struct svc_version nfs4_callback_version1;
 extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..333ae4068506
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2014 Anna Schumaker.
+ *
+ * NFSv3-specific filesystem definitions and declarations
+ */
+#ifndef __LINUX_FS_NFS_NFS3_FS_H
+#define __LINUX_FS_NFS_NFS3_FS_H
+
+/*
+ * nfs3acl.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
+extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl);
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern const struct xattr_handler *nfs3_xattr_handlers[];
+#else
+static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		struct posix_acl *dfacl)
+{
+	return 0;
+}
+#define nfs3_listxattr NULL
+#endif /* CONFIG_NFS_V3_ACL */
+
+/* nfs3client.c */
+struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
+struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
+				     struct nfs_fattr *, rpc_authflavor_t);
+
+
+#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 24c6898159cc..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
 #include <linux/nfsacl.h>
 
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #define NFSDBG_FACILITY	NFSDBG_PROC
 
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #ifdef CONFIG_NFS_V3_ACL
 static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 809670eba52a..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
 
 #include "iostat.h"
 #include "internal.h"
+#include "nfs3_fs.h"
 
 #define NFSDBG_FACILITY	NFSDBG_PROC
 
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
 #include <linux/module.h>
 #include <linux/nfs_fs.h>
 #include "internal.h"
+#include "nfs3_fs.h"
 #include "nfs.h"
 
 static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6ca0c8e7a945..5aa55c132aa2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
 static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
 static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
 static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
 static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start);
 }
 
+static long nfs4_update_delay(long *timeout)
+{
+	long ret;
+	if (!timeout)
+		return NFS4_POLL_RETRY_MAX;
+	if (*timeout <= 0)
+		*timeout = NFS4_POLL_RETRY_MIN;
+	if (*timeout > NFS4_POLL_RETRY_MAX)
+		*timeout = NFS4_POLL_RETRY_MAX;
+	ret = *timeout;
+	*timeout <<= 1;
+	return ret;
+}
+
 static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 {
 	int res = 0;
 
 	might_sleep();
 
-	if (*timeout <= 0)
-		*timeout = NFS4_POLL_RETRY_MIN;
-	if (*timeout > NFS4_POLL_RETRY_MAX)
-		*timeout = NFS4_POLL_RETRY_MAX;
-	freezable_schedule_timeout_killable_unsafe(*timeout);
+	freezable_schedule_timeout_killable_unsafe(
+		nfs4_update_delay(timeout));
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
-	*timeout <<= 1;
 	return res;
 }
 
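nfs4_update_delay() factors the clamped exponential backoff out of nfs4_delay() so the asynchronous error path can reuse it. A userspace sketch of the same helper, with illustrative millisecond constants in place of the kernel's jiffies values:

/* sketch: clamped exponential backoff, mirroring nfs4_update_delay() */
#define POLL_RETRY_MIN	100	/* ms, illustrative */
#define POLL_RETRY_MAX	15000	/* ms, illustrative */

static long update_delay(long *timeout)
{
	long ret;

	if (!timeout)
		return POLL_RETRY_MAX;	/* no state to carry: wait the max */
	if (*timeout <= 0)
		*timeout = POLL_RETRY_MIN;
	if (*timeout > POLL_RETRY_MAX)
		*timeout = POLL_RETRY_MAX;
	ret = *timeout;
	*timeout <<= 1;			/* the next retry waits twice as long */
	return ret;
}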
@@ -1307,15 +1317,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
 	int ret = -EAGAIN;
 
 	for (;;) {
+		spin_lock(&state->owner->so_lock);
 		if (can_open_cached(state, fmode, open_mode)) {
-			spin_lock(&state->owner->so_lock);
-			if (can_open_cached(state, fmode, open_mode)) {
-				update_open_stateflags(state, fmode);
-				spin_unlock(&state->owner->so_lock);
-				goto out_return_state;
-			}
+			update_open_stateflags(state, fmode);
 			spin_unlock(&state->owner->so_lock);
+			goto out_return_state;
 		}
+		spin_unlock(&state->owner->so_lock);
 		rcu_read_lock();
 		delegation = rcu_dereference(nfsi->delegation);
 		if (!can_open_delegated(delegation, fmode)) {
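The hunk above drops a check/lock/re-check dance in favor of simply taking the owner lock before the first test, so there is no window in which the unlocked test goes stale. A generic pthread sketch of the simplified shape, with illustrative names:

/* sketch: test once, under the lock */
#include <pthread.h>
#include <stdbool.h>

struct cached_state {
	pthread_mutex_t lock;
	bool usable;
	int users;
};

/* returns true if the cached state was claimed */
static bool try_use_cached(struct cached_state *st)
{
	bool ok = false;

	pthread_mutex_lock(&st->lock);
	if (st->usable) {		/* single test, done under the lock */
		st->users++;
		ok = true;
	}
	pthread_mutex_unlock(&st->lock);
	return ok;
}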
@@ -2589,7 +2597,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 		if (calldata->arg.fmode == 0)
 			break;
 	default:
-		if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
+		if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
 			rpc_restart_call_prepare(task);
 			goto out_release;
 		}
@@ -3217,7 +3225,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	struct nfs4_label *label = NULL;
 	int status;
 
-	if (pnfs_ld_layoutret_on_setattr(inode))
+	if (pnfs_ld_layoutret_on_setattr(inode) &&
+	    sattr->ia_valid & ATTR_SIZE &&
+	    sattr->ia_size < i_size_read(inode))
 		pnfs_commit_and_return_layout(inode);
 
 	nfs_fattr_init(fattr);
@@ -3576,7 +3586,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 
 	if (!nfs4_sequence_done(task, &res->seq_res))
 		return 0;
-	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+	if (nfs4_async_handle_error(task, res->server, NULL,
+				    &data->timeout) == -EAGAIN)
 		return 0;
 	update_changeattr(dir, &res->cinfo);
 	return 1;
@@ -3609,7 +3620,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 
 	if (!nfs4_sequence_done(task, &res->seq_res))
 		return 0;
-	if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
+	if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
 		return 0;
 
 	update_changeattr(old_dir, &res->old_cinfo);
@@ -4113,7 +4124,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 
 	trace_nfs4_read(hdr, task->tk_status);
 	if (nfs4_async_handle_error(task, server,
-				    hdr->args.context->state) == -EAGAIN) {
+				    hdr->args.context->state,
+				    NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4181,10 +4193,11 @@ static int nfs4_write_done_cb(struct rpc_task *task,
 			 struct nfs_pgio_header *hdr)
 {
 	struct inode *inode = hdr->inode;
 
 	trace_nfs4_write(hdr, task->tk_status);
 	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
-				    hdr->args.context->state) == -EAGAIN) {
+				    hdr->args.context->state,
+				    NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4264,7 +4277,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
 	struct inode *inode = data->inode;
 
 	trace_nfs4_commit(data, task->tk_status);
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
+	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
+				    NULL, NULL) == -EAGAIN) {
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
@@ -4817,7 +4831,8 @@ out:
 
 
 static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
+			struct nfs4_state *state, long *timeout)
 {
 	struct nfs_client *clp = server->nfs_client;
 
@@ -4867,6 +4882,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
+			rpc_delay(task, nfs4_update_delay(timeout));
+			goto restart_call;
 		case -NFS4ERR_GRACE:
 			rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -5107,8 +5124,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 		pnfs_roc_set_barrier(data->inode, data->roc_barrier);
 		break;
 	default:
-		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
-		    -EAGAIN) {
+		if (nfs4_async_handle_error(task, data->res.server,
+					    NULL, NULL) == -EAGAIN) {
 			rpc_restart_call_prepare(task);
 			return;
 		}
@@ -5372,7 +5389,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 	case -NFS4ERR_EXPIRED:
 		break;
 	default:
-		if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
+		if (nfs4_async_handle_error(task, calldata->server,
+					    NULL, NULL) == -EAGAIN)
 			rpc_restart_call_prepare(task);
 	}
 	nfs_release_seqid(calldata->arg.seqid);
@@ -5978,7 +5996,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
 		break;
 	case -NFS4ERR_LEASE_MOVED:
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+		if (nfs4_async_handle_error(task, server,
+					    NULL, NULL) == -EAGAIN)
 			rpc_restart_call_prepare(task);
 	}
 }
@@ -7353,7 +7372,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
 	int ret = 0;
 
 	if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
-		return 0;
+		return -EAGAIN;
 	task = _nfs41_proc_sequence(clp, cred, false);
 	if (IS_ERR(task))
 		ret = PTR_ERR(task);
@@ -7583,14 +7602,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 		} else {
 			LIST_HEAD(head);
 
+			/*
+			 * Mark the bad layout state as invalid, then retry
+			 * with the current stateid.
+			 */
 			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
 			spin_unlock(&inode->i_lock);
-			/* Mark the bad layout state as invalid, then
-			 * retry using the open stateid. */
 			pnfs_free_lseg_list(&head);
+
+			task->tk_status = 0;
+			rpc_restart_call_prepare(task);
 		}
 	}
-	if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+	if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
 		rpc_restart_call_prepare(task);
 out:
 	dprintk("<-- %s\n", __func__);
@@ -7750,7 +7774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 	case 0:
 		break;
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
+		if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
 			break;
 		rpc_restart_call_prepare(task);
 		return;
@@ -7809,54 +7833,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
 	return status;
 }
 
-/*
- * Retrieve the list of Data Server devices from the MDS.
- */
-static int _nfs4_getdevicelist(struct nfs_server *server,
-				    const struct nfs_fh *fh,
-				    struct pnfs_devicelist *devlist)
-{
-	struct nfs4_getdevicelist_args args = {
-		.fh = fh,
-		.layoutclass = server->pnfs_curr_ld->id,
-	};
-	struct nfs4_getdevicelist_res res = {
-		.devlist = devlist,
-	};
-	struct rpc_message msg = {
-		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
-	};
-	int status;
-
-	dprintk("--> %s\n", __func__);
-	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
-				&res.seq_res, 0);
-	dprintk("<-- %s status=%d\n", __func__, status);
-	return status;
-}
-
-int nfs4_proc_getdevicelist(struct nfs_server *server,
-		const struct nfs_fh *fh,
-		struct pnfs_devicelist *devlist)
-{
-	struct nfs4_exception exception = { };
-	int err;
-
-	do {
-		err = nfs4_handle_exception(server,
-				_nfs4_getdevicelist(server, fh, devlist),
-				&exception);
-	} while (exception.retry);
-
-	dprintk("%s: err=%d, num_devs=%u\n", __func__,
-		err, devlist->num_devs);
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
-
 static int
 _nfs4_proc_getdeviceinfo(struct nfs_server *server,
 		struct pnfs_device *pdev,
@@ -7929,7 +7905,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
 	case 0:
 		break;
 	default:
-		if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+		if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
 			rpc_restart_call_prepare(task);
 			return;
 		}
@@ -8225,7 +8201,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
 
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
-		if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN)
+		if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
 			rpc_restart_call_prepare(task);
 	}
 }
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
 		}
 		nfs_expire_all_delegations(clp);
 	} else {
+		int ret;
+
 		/* Queue an asynchronous RENEW. */
-		ops->sched_state_renewal(clp, cred, renew_flags);
+		ret = ops->sched_state_renewal(clp, cred, renew_flags);
 		put_rpccred(cred);
-		goto out_exp;
+		switch (ret) {
+		default:
+			goto out_exp;
+		case -EAGAIN:
+		case -ENOMEM:
+			break;
+		}
 	}
 } else {
 	dprintk("%s: failed to call renewd. Reason: lease not expired \n",
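The renewd change above makes the scheduling hook report failure instead of silently succeeding. A minimal sketch of the resulting dispatch, assuming the same -EAGAIN/-ENOMEM convention:

/* sketch: treat transient scheduling failures differently from success */
#include <errno.h>

static int handle_sched_result(int ret)
{
	switch (ret) {
	default:
		return 0;		/* RENEW queued; nothing else to do */
	case -EAGAIN:
	case -ENOMEM:
		return ret;		/* transient; retry on the next tick */
	}
}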
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 22fe35104c0c..5194933ed419 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1705,7 +1705,8 @@ restart:
 		if (status < 0) {
 			set_bit(ops->owner_flag_bit, &sp->so_flags);
 			nfs4_put_state_owner(sp);
-			return nfs4_recovery_handle_error(clp, status);
+			status = nfs4_recovery_handle_error(clp, status);
+			return (status != 0) ? status : -EAGAIN;
 		}
 
 		nfs4_put_state_owner(sp);
@@ -1714,7 +1715,7 @@ restart:
 		spin_unlock(&clp->cl_lock);
 	}
 	rcu_read_unlock();
-	return status;
+	return 0;
 }
 
 static int nfs4_check_lease(struct nfs_client *clp)
@@ -1761,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
 		break;
 	case -NFS4ERR_STALE_CLIENTID:
 		clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
-		nfs4_state_clear_reclaim_reboot(clp);
 		nfs4_state_start_reclaim_reboot(clp);
 		break;
 	case -NFS4ERR_CLID_INUSE:
@@ -2345,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			status = nfs4_check_lease(clp);
 			if (status < 0)
 				goto out_error;
+			continue;
 		}
 
 		if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2366,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			section = "reclaim reboot";
 			status = nfs4_do_reclaim(clp,
 				clp->cl_mvops->reboot_recovery_ops);
-			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
-				continue;
-			nfs4_state_end_reclaim_reboot(clp);
-			if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
+			if (status == -EAGAIN)
 				continue;
 			if (status < 0)
 				goto out_error;
+			nfs4_state_end_reclaim_reboot(clp);
 		}
 
 		/* Now recover expired state... */
@@ -2381,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			section = "reclaim nograce";
 			status = nfs4_do_reclaim(clp,
 				clp->cl_mvops->nograce_recovery_ops);
-			if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
-			    test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+			if (status == -EAGAIN)
 				continue;
 			if (status < 0)
 				goto out_error;
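The state-manager hunks above replace per-flag re-checks with a single convention: a recovery pass returns -EAGAIN whenever another pass is required and 0 once it has finished. A sketch of that loop shape, with a stub standing in for nfs4_do_reclaim():

/* sketch: -EAGAIN-driven recovery loop */
#include <errno.h>

/* stub: one recovery pass; 0 = done, -EAGAIN = state changed, rerun */
static int do_reclaim(void)
{
	return 0;
}

static int state_manager(void)
{
	for (;;) {
		int status = do_reclaim();

		if (status == -EAGAIN)
			continue;	/* another pass is required */
		if (status < 0)
			return status;	/* fatal: surface the error */
		return 0;		/* reclaim finished cleanly */
	}
}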
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e13b59d8d9aa..005d03c5d274 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
 				     XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
 #define encode_reclaim_complete_maxsz	(op_encode_hdr_maxsz + 4)
 #define decode_reclaim_complete_maxsz	(op_decode_hdr_maxsz + 4)
-#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \
-				encode_verifier_maxsz)
-#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \
-				2 /* nfs_cookie4 gdlr_cookie */ + \
-				decode_verifier_maxsz \
-				  /* verifier4 gdlr_verifier */ + \
-				1 /* gdlr_deviceid_list count */ + \
-				XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
-					    NFS4_DEVICEID4_SIZE) \
-				  /* gdlr_deviceid_list */ + \
-				1 /* bool gdlr_eof */)
-#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
-				XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
+#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
+				XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
+				1 /* layout type */ + \
+				1 /* maxcount */ + \
+				1 /* bitmap size */ + \
+				1 /* notification bitmap length */ + \
+				1 /* notification bitmap, word 0 */)
 #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
 				1 /* layout type */ + \
 				1 /* opaque devaddr4 length */ + \
 				  /* devaddr4 payload is read into page */ \
 				1 /* notification bitmap length */ + \
-				1 /* notification bitmap */)
+				1 /* notification bitmap, word 0 */)
 #define encode_layoutget_maxsz	(op_encode_hdr_maxsz + 10 + \
 				encode_stateid_maxsz)
 #define decode_layoutget_maxsz	(op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
 				2 /* last byte written */ + \
 				1 /* nt_timechanged (false) */ + \
 				1 /* layoutupdate4 layout type */ + \
-				1 /* NULL filelayout layoutupdate4 payload */)
+				1 /* layoutupdate4 opaqueue len */)
+				/* the actual content of layoutupdate4 should
+				   be allocated by drivers and spliced in
+				   using xdr_write_pages */
 #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
 #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
 				encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
 #define NFS4_dec_reclaim_complete_sz	(compound_decode_hdr_maxsz + \
 					decode_sequence_maxsz + \
 					decode_reclaim_complete_maxsz)
-#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
-				encode_sequence_maxsz + \
-				encode_putfh_maxsz + \
-				encode_getdevicelist_maxsz)
-#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
-				decode_sequence_maxsz + \
-				decode_putfh_maxsz + \
-				decode_getdevicelist_maxsz)
 #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
 				encode_sequence_maxsz +\
 				encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
 
 #ifdef CONFIG_NFS_V4_1
 static void
-encode_getdevicelist(struct xdr_stream *xdr,
-		     const struct nfs4_getdevicelist_args *args,
-		     struct compound_hdr *hdr)
-{
-	__be32 *p;
-	nfs4_verifier dummy = {
-		.data = "dummmmmy",
-	};
-
-	encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
-	p = reserve_space(xdr, 16);
-	*p++ = cpu_to_be32(args->layoutclass);
-	*p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
-	xdr_encode_hyper(p, 0ULL);			/* cookie */
-	encode_nfs4_verifier(xdr, &dummy);
-}
-
-static void
 encode_getdeviceinfo(struct xdr_stream *xdr,
 		     const struct nfs4_getdeviceinfo_args *args,
 		     struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
 	__be32 *p;
 
 	encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
-	p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE);
+	p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
 	p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
 				    NFS4_DEVICEID4_SIZE);
 	*p++ = cpu_to_be32(args->pdev->layout_type);
 	*p++ = cpu_to_be32(args->pdev->maxcount);	/* gdia_maxcount */
-	*p++ = cpu_to_be32(0);				/* bitmap length 0 */
+
+	p = reserve_space(xdr, 4 + 4);
+	*p++ = cpu_to_be32(1);			/* bitmap length */
+	*p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
 }
 
 static void
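The new encoder above asks the server for device-id change/delete notifications by sending a one-word bitmap after gdia_maxcount. A standalone C sketch of the resulting wire layout follows; it assumes the RFC 5661 bit values for the two notification types and uses memcpy instead of the kernel's xdr_stream reservation.

/* sketch: GETDEVICEINFO argument body, big-endian on the wire */
#include <arpa/inet.h>	/* htonl */
#include <stdint.h>
#include <string.h>

#define DEVICEID4_SIZE		16
#define NOTIFY_DEVICEID4_CHANGE	(1U << 1)	/* assumed bit values */
#define NOTIFY_DEVICEID4_DELETE	(1U << 2)

static size_t encode_getdeviceinfo_body(uint8_t *buf, const uint8_t *dev_id,
					uint32_t layout_type, uint32_t maxcount)
{
	uint32_t words[4];

	words[0] = htonl(layout_type);
	words[1] = htonl(maxcount);		/* gdia_maxcount */
	words[2] = htonl(1);			/* notification bitmap length */
	words[3] = htonl(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);

	memcpy(buf, dev_id, DEVICEID4_SIZE);	/* opaque deviceid4 first */
	memcpy(buf + DEVICEID4_SIZE, words, sizeof(words));
	return DEVICEID4_SIZE + sizeof(words);
}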
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
 static int
 encode_layoutcommit(struct xdr_stream *xdr,
 		    struct inode *inode,
-		    const struct nfs4_layoutcommit_args *args,
+		    struct nfs4_layoutcommit_args *args,
 		    struct compound_hdr *hdr)
 {
 	__be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
 	*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
 
-	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit)
+	if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
 		NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
 			NFS_I(inode)->layout, xdr, args);
-	else
-		encode_uint32(xdr, 0); /* no layout-type payload */
+	} else {
+		encode_uint32(xdr, args->layoutupdate_len);
+		if (args->layoutupdate_pages) {
+			xdr_write_pages(xdr, args->layoutupdate_pages, 0,
+					args->layoutupdate_len);
+		}
+	}
 
 	return 0;
 }
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
 }
 
 /*
- * Encode GETDEVICELIST request
- */
-static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
-				       struct xdr_stream *xdr,
-				       struct nfs4_getdevicelist_args *args)
-{
-	struct compound_hdr hdr = {
-		.minorversion = nfs4_xdr_minorversion(&args->seq_args),
-	};
-
-	encode_compound_hdr(xdr, req, &hdr);
-	encode_sequence(xdr, &args->seq_args, &hdr);
-	encode_putfh(xdr, args->fh, &hdr);
-	encode_getdevicelist(xdr, args, &hdr);
-	encode_nops(&hdr);
-}
-
-/*
  * Encode GETDEVICEINFO request
  */
 static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/*
- * TODO: Need to handle case when EOF != true;
- */
-static int decode_getdevicelist(struct xdr_stream *xdr,
-				struct pnfs_devicelist *res)
-{
-	__be32 *p;
-	int status, i;
-	nfs4_verifier verftemp;
-
-	status = decode_op_hdr(xdr, OP_GETDEVICELIST);
-	if (status)
-		return status;
-
-	p = xdr_inline_decode(xdr, 8 + 8 + 4);
-	if (unlikely(!p))
-		goto out_overflow;
-
-	/* TODO: Skip cookie for now */
-	p += 2;
-
-	/* Read verifier */
-	p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
-
-	res->num_devs = be32_to_cpup(p);
-
-	dprintk("%s: num_dev %d\n", __func__, res->num_devs);
-
-	if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
-		printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
-		       __func__, res->num_devs);
-		return -EIO;
-	}
-
-	p = xdr_inline_decode(xdr,
-			      res->num_devs * NFS4_DEVICEID4_SIZE + 4);
-	if (unlikely(!p))
-		goto out_overflow;
-	for (i = 0; i < res->num_devs; i++)
-		p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
-					    NFS4_DEVICEID4_SIZE);
-	res->eof = be32_to_cpup(p);
-	return 0;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
-}
-
 static int decode_getdeviceinfo(struct xdr_stream *xdr,
 				struct pnfs_device *pdev)
 {
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 4 * len);
 	if (unlikely(!p))
 		goto out_overflow;
-	for (i = 0; i < len; i++, p++) {
-		if (be32_to_cpup(p)) {
-			dprintk("%s: notifications not supported\n",
+
+	if (be32_to_cpup(p++) &
+	    ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
+		dprintk("%s: unsupported notification\n",
+			__func__);
+	}
+
+	for (i = 1; i < len; i++) {
+		if (be32_to_cpup(p++)) {
+			dprintk("%s: unsupported notification\n",
 				__func__);
 			return -EIO;
 		}
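The decode side above accepts only the notification bits it asked for: unexpected bits in word 0 are merely logged, while any set bit in the remaining words is a hard error. A sketch of that mask check, using the same assumed bit values as the encoder sketch earlier and words already converted to host order:

/* sketch: validate a decoded notification bitmap */
#include <stdint.h>
#include <stdio.h>

#define NOTIFY_DEVICEID4_CHANGE	(1U << 1)	/* assumed bit values */
#define NOTIFY_DEVICEID4_DELETE	(1U << 2)
#define SUPPORTED_NOTIFY_MASK \
	(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)

static int check_notify_bitmap(const uint32_t *words, unsigned int len)
{
	unsigned int i;

	if (len && (words[0] & ~SUPPORTED_NOTIFY_MASK))
		fprintf(stderr, "unsupported notification bits\n");
	for (i = 1; i < len; i++)
		if (words[i])
			return -1;	/* the kernel returns -EIO here */
	return 0;
}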
@@ -7097,32 +7017,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
 }
 
 /*
- * Decode GETDEVICELIST response
- */
-static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
-				      struct xdr_stream *xdr,
-				      struct nfs4_getdevicelist_res *res)
-{
-	struct compound_hdr hdr;
-	int status;
-
-	dprintk("encoding getdevicelist!\n");
-
-	status = decode_compound_hdr(xdr, &hdr);
-	if (status != 0)
-		goto out;
-	status = decode_sequence(xdr, &res->seq_res, rqstp);
-	if (status != 0)
-		goto out;
-	status = decode_putfh(xdr);
-	if (status != 0)
-		goto out;
-	status = decode_getdevicelist(xdr, res->devlist);
-out:
-	return status;
-}
-
-/*
  * Decode GETDEVINFO response
  */
 static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -7490,7 +7384,6 @@ struct rpc_procinfo nfs4_procedures[] = {
 	PROC(SECINFO_NO_NAME,	enc_secinfo_no_name,	dec_secinfo_no_name),
 	PROC(TEST_STATEID,	enc_test_stateid,	dec_test_stateid),
 	PROC(FREE_STATEID,	enc_free_stateid,	dec_free_stateid),
-	PROC(GETDEVICELIST,	enc_getdevicelist,	dec_getdevicelist),
 	PROC(BIND_CONN_TO_SESSION,
 			enc_bind_conn_to_session, dec_bind_conn_to_session),
 	PROC(DESTROY_CLIENTID,	enc_destroy_clientid,	dec_destroy_clientid),
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278b3761..c6e4bda63000 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
 	kfree(de);
 }
 
-static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
-	const struct nfs4_deviceid *d_id)
-{
-	struct nfs4_deviceid_node *d;
-	struct objio_dev_ent *de;
-
-	d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
-	if (!d)
-		return NULL;
-
-	de = container_of(d, struct objio_dev_ent, id_node);
-	return de;
-}
-
-static struct objio_dev_ent *
-_dev_list_add(const struct nfs_server *nfss,
-	const struct nfs4_deviceid *d_id, struct osd_dev *od,
-	gfp_t gfp_flags)
-{
-	struct nfs4_deviceid_node *d;
-	struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
-	struct objio_dev_ent *n;
-
-	if (!de) {
-		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
-		return NULL;
-	}
-
-	dprintk("%s: Adding od=%p\n", __func__, od);
-	nfs4_init_deviceid_node(&de->id_node,
-				nfss->pnfs_curr_ld,
-				nfss->nfs_client,
-				d_id);
-	de->od.od = od;
-
-	d = nfs4_insert_deviceid_node(&de->id_node);
-	n = container_of(d, struct objio_dev_ent, id_node);
-	if (n != de) {
-		dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
-		objio_free_deviceid_node(&de->id_node);
-		de = n;
-	}
-
-	return de;
-}
-
 struct objio_segment {
 	struct pnfs_layout_segment lseg;
 
110 struct pnfs_layout_segment lseg; 64 struct pnfs_layout_segment lseg;
111 65
@@ -130,29 +84,24 @@ struct objio_state {
 
 /* Send and wait for a get_device_info of devices in the layout,
    then look them up with the osd_initiator library */
-static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay,
-	struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id,
-	gfp_t gfp_flags)
+struct nfs4_deviceid_node *
+objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+			gfp_t gfp_flags)
 {
 	struct pnfs_osd_deviceaddr *deviceaddr;
-	struct objio_dev_ent *ode;
+	struct objio_dev_ent *ode = NULL;
 	struct osd_dev *od;
 	struct osd_dev_info odi;
 	bool retry_flag = true;
+	__be32 *p;
 	int err;
 
-	ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id);
-	if (ode) {
-		objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
-		return 0;
-	}
+	deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
+	if (!deviceaddr)
+		return NULL;
 
-	err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags);
-	if (unlikely(err)) {
-		dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
-			__func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
-		return err;
-	}
+	p = page_address(pdev->pages[0]);
+	pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
 
 	odi.systemid_len = deviceaddr->oda_systemid.len;
 	if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
 		goto out;
 	}
 
-	ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
-			gfp_flags);
-	objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
 	dprintk("Adding new dev_id(%llx:%llx)\n",
-		_DEVID_LO(d_id), _DEVID_HI(d_id));
+		_DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
+
+	ode = kzalloc(sizeof(*ode), gfp_flags);
+	if (!ode) {
+		dprintk("%s: -ENOMEM od=%p\n", __func__, od);
+		goto out;
+	}
+
+	nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
+	kfree(deviceaddr);
+
+	ode->od.od = od;
+	return &ode->id_node;
+
 out:
-	objlayout_put_deviceinfo(deviceaddr);
-	return err;
+	kfree(deviceaddr);
+	return NULL;
 }
 
 static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 		struct xdr_stream *xdr,
 		gfp_t gfp_flags)
 {
+	struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
 	struct objio_segment *objio_seg;
 	struct pnfs_osd_xdr_decode_layout_iter iter;
 	struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
 	objio_seg->oc.first_dev = layout.olo_comps_index;
 	cur_comp = 0;
 	while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
+		struct nfs4_deviceid_node *d;
+		struct objio_dev_ent *ode;
+
 		copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
-		err = objio_devices_lookup(pnfslay, objio_seg, cur_comp,
-					   &src_comp.oc_object_id.oid_device_id,
-					   gfp_flags);
-		if (err)
+
+		d = nfs4_find_get_deviceid(server,
+				&src_comp.oc_object_id.oid_device_id,
+				pnfslay->plh_lc_cred, gfp_flags);
+		if (!d) {
+			err = -ENXIO;
 			goto err;
-		++cur_comp;
+		}
+
+		ode = container_of(d, struct objio_dev_ent, id_node);
+		objio_seg->oc.ods[cur_comp++] = &ode->od;
 	}
 	/* pnfs_osd_xdr_decode_layout_comp returns false on error */
 	if (unlikely(err))
@@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
 	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
 				   PNFS_LAYOUTRET_ON_ERROR,
 
+	.max_deviceinfo_size	 = PAGE_SIZE,
 	.owner		         = THIS_MODULE,
 	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
 	.free_layout_hdr         = objlayout_free_layout_hdr,
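With the generic device-id cache doing the lookups, the object layout now recovers its private entry from the embedded generic node via container_of(). A minimal userspace rendition of the embedding pattern, with an illustrative container_of macro:

/* sketch: driver-private data wrapping a generic cached node */
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct deviceid_node {		/* generic entry, owned by the core cache */
	int ref;
};

struct objio_dev {		/* driver-private wrapper */
	struct deviceid_node id_node;
	void *osd_handle;
};

static struct objio_dev *to_objio_dev(struct deviceid_node *d)
{
	/* recover the wrapper from the embedded generic node */
	return container_of(d, struct objio_dev, id_node);
}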
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d11fac..c89357c7a914 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -574,76 +574,6 @@ loop_done:
 	dprintk("%s: Return\n", __func__);
 }
 
-
-/*
- * Get Device Info API for io engines
- */
-struct objlayout_deviceinfo {
-	struct page *page;
-	struct pnfs_osd_deviceaddr da; /* This must be last */
-};
-
-/* Initialize and call nfs_getdeviceinfo, then decode and return a
- * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
- * should be called.
- */
-int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
-	gfp_t gfp_flags)
-{
-	struct objlayout_deviceinfo *odi;
-	struct pnfs_device pd;
-	struct page *page, **pages;
-	u32 *p;
-	int err;
-
-	page = alloc_page(gfp_flags);
-	if (!page)
-		return -ENOMEM;
-
-	pages = &page;
-	pd.pages = pages;
-
-	memcpy(&pd.dev_id, d_id, sizeof(*d_id));
-	pd.layout_type = LAYOUT_OSD2_OBJECTS;
-	pd.pages = &page;
-	pd.pgbase = 0;
-	pd.pglen = PAGE_SIZE;
-	pd.mincount = 0;
-	pd.maxcount = PAGE_SIZE;
-
-	err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
-			pnfslay->plh_lc_cred);
-	dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
-	if (err)
-		goto err_out;
-
-	p = page_address(page);
-	odi = kzalloc(sizeof(*odi), gfp_flags);
-	if (!odi) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
-	odi->page = page;
-	*deviceaddr = &odi->da;
-	return 0;
-
-err_out:
-	__free_page(page);
-	return err;
-}
-
-void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
-{
-	struct objlayout_deviceinfo *odi = container_of(deviceaddr,
-						struct objlayout_deviceinfo,
-						da);
-
-	__free_page(odi->page);
-	kfree(odi);
-}
-
 enum {
 	OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
 	OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d2f136..3a0828d57339 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
 extern void objlayout_write_done(struct objlayout_io_res *oir,
 				 ssize_t status, bool sync);
 
-extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
-	struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
-	gfp_t gfp_flags);
-extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
-
 /*
  * exported generic objects function vectors
  */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index be7cbce6e4c7..94e16ec88312 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -481,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 		return 0;
 	}
 
+	/*
+	 * Limit the request size so that we can still allocate a page array
+	 * for it without upsetting the slab allocator.
+	 */
+	if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+			sizeof(struct page) > PAGE_SIZE)
+		return 0;
+
 	return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
 }
 EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
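The cap above bounds a coalesced request so the per-page bookkeeping array it implies can still come from a single kmalloc'd page. A sketch of the arithmetic, with illustrative constants and a generic entry size parameter standing in for the kernel's sizeof(struct page) multiplier:

/* sketch: refuse to grow a request past a one-page bookkeeping array */
#include <stddef.h>

#define PG_SHIFT	12
#define PG_SIZE		(1UL << PG_SHIFT)

static size_t coalesce_limit(size_t cur_bytes, size_t req_bytes,
			     size_t entry_size)
{
	/* bytes >> PG_SHIFT approximates the number of array entries needed */
	if (((cur_bytes + req_bytes) >> PG_SHIFT) * entry_size > PG_SIZE)
		return 0;	/* would need a multi-page array: stop here */
	return req_bytes;
}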
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3851debf8a2..76de7f568119 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -594,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
 		dprintk("%s freeing layout for inode %lu\n", __func__,
 			lo->plh_inode->i_ino);
 		inode = lo->plh_inode;
+
+		pnfs_layoutcommit_inode(inode, false);
+
 		spin_lock(&inode->i_lock);
 		list_del_init(&lo->plh_bulk_destroy);
 		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -682,17 +685,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
 	return (s32)(s1 - s2) > 0;
 }
 
-static void
-pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
-			const nfs4_stateid *new,
-			struct list_head *free_me_list)
-{
-	if (nfs4_stateid_match_other(&lo->plh_stateid, new))
-		return;
-	/* Layout is new! Kill existing layout segments */
-	pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
-}
-
 /* update lo->plh_stateid with new if is more recent */
 void
 pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -749,7 +741,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 		status = -EAGAIN;
 	} else if (!nfs4_valid_open_stateid(open_state)) {
 		status = -EBADF;
-	} else if (list_empty(&lo->plh_segs)) {
+	} else if (list_empty(&lo->plh_segs) ||
+		   test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
 		int seq;
 
 		do {
@@ -864,6 +857,16 @@ _pnfs_return_layout(struct inode *ino)
 	empty = list_empty(&lo->plh_segs);
 	pnfs_clear_layoutcommit(ino, &tmp_list);
 	pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		struct pnfs_layout_range range = {
+			.iomode		= IOMODE_ANY,
+			.offset		= 0,
+			.length		= NFS4_MAX_UINT64,
+		};
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
+	}
+
 	/* Don't send a LAYOUTRETURN if list was initially empty */
 	if (empty) {
 		spin_unlock(&ino->i_lock);
@@ -871,6 +874,8 @@ _pnfs_return_layout(struct inode *ino)
 		dprintk("NFS: %s no layout segments to return\n", __func__);
 		goto out;
 	}
+
+	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 	lo->plh_block_lgets++;
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
@@ -1358,25 +1363,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
 		goto out;
 	}
 
+	init_lseg(lo, lseg);
+	lseg->pls_range = res->range;
+
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
 		dprintk("%s forget reply due to recall\n", __func__);
 		goto out_forget_reply;
 	}
 
-	if (pnfs_layoutgets_blocked(lo, 1) ||
-	    pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+	if (pnfs_layoutgets_blocked(lo, 1)) {
 		dprintk("%s forget reply due to state\n", __func__);
 		goto out_forget_reply;
 	}
 
-	/* Check that the new stateid matches the old stateid */
-	pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
-	/* Done processing layoutget. Set the layout stateid */
-	pnfs_set_layout_stateid(lo, &res->stateid, false);
+	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
+		/* existing state ID, make sure the sequence number matches. */
+		if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+			dprintk("%s forget reply due to sequence\n", __func__);
+			goto out_forget_reply;
+		}
+		pnfs_set_layout_stateid(lo, &res->stateid, false);
+	} else {
+		/*
+		 * We got an entirely new state ID.  Mark all segments for the
+		 * inode invalid, and don't bother validating the stateid
+		 * sequence number.
+		 */
+		pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
+
+		nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
+		lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+	}
+
+	clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
 
-	init_lseg(lo, lseg);
-	lseg->pls_range = res->range;
 	pnfs_get_lseg(lseg);
 	pnfs_layout_insert_lseg(lo, lseg);
 
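pnfs_layout_process() now distinguishes a bumped sequence number on the same stateid from an entirely new stateid. A simplified sketch of that decision, with plain integers standing in for the two parts of a layout stateid:

/* sketch: same "other" -> validate sequence; new "other" -> adopt wholesale */
#include <stdbool.h>
#include <stdint.h>

struct layout_state {
	uint64_t other;		/* identity part of the stateid */
	uint32_t seqid;		/* sequence part */
};

/* returns false if the reply must be forgotten (stale sequence) */
static bool process_reply_stateid(struct layout_state *lo,
				  uint64_t other, uint32_t seqid,
				  void (*invalidate_segs)(void))
{
	if (lo->other == other) {
		/* wrap-safe ordering test, as in pnfs_seqid_is_newer() */
		if ((int32_t)(seqid - lo->seqid) <= 0)
			return false;	/* old reply, forget it */
		lo->seqid = seqid;
	} else {
		invalidate_segs();	/* entirely new state: drop segments */
		lo->other = other;
		lo->seqid = seqid;
	}
	return true;
}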
@@ -1797,6 +1818,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
+{
+	struct inode *inode = data->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	bool mark_as_dirty = false;
+
+	spin_lock(&inode->i_lock);
+	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+		mark_as_dirty = true;
+		dprintk("%s: Set layoutcommit for inode %lu ",
+			__func__, inode->i_ino);
+	}
+	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+		/* references matched in nfs4_layoutcommit_release */
+		pnfs_get_lseg(data->lseg);
+	}
+	if (data->lwb > nfsi->layout->plh_lwb)
+		nfsi->layout->plh_lwb = data->lwb;
+	spin_unlock(&inode->i_lock);
+	dprintk("%s: lseg %p end_pos %llu\n",
+		__func__, data->lseg, nfsi->layout->plh_lwb);
+
+	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+	if (mark_as_dirty)
+		mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 {
 	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1817,6 +1867,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 int
 pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct nfs4_layoutcommit_data *data;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t end_pos;
@@ -1867,6 +1918,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	data->args.lastbytewritten = end_pos - 1;
 	data->res.server = NFS_SERVER(inode);
 
+	if (ld->prepare_layoutcommit) {
+		status = ld->prepare_layoutcommit(&data->args);
+		if (status) {
+			spin_lock(&inode->i_lock);
+			if (end_pos < nfsi->layout->plh_lwb)
+				nfsi->layout->plh_lwb = end_pos;
+			spin_unlock(&inode->i_lock);
+			put_rpccred(data->cred);
+			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+			goto clear_layoutcommitting;
+		}
+	}
+
+
 	status = nfs4_proc_layoutcommit(data, sync);
 out:
 	if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index aca3dff5dae6..693ce42ec683 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,12 +65,15 @@ enum {
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
 	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
+	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
 };
 
 enum layoutdriver_policy_flags {
-	/* Should the pNFS client commit and return the layout upon a setattr */
+	/* Should the pNFS client commit and return the layout upon truncate to
+	 * a smaller size */
 	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
 	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,
+	PNFS_READ_WHOLE_PAGE		= 1 << 2,
 };
 
 struct nfs4_deviceid_node;
@@ -82,6 +85,7 @@ struct pnfs_layoutdriver_type {
 	const char *name;
 	struct module *owner;
 	unsigned flags;
+	unsigned max_deviceinfo_size;
 
 	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
 	int (*clear_layoutdriver) (struct nfs_server *);
@@ -92,6 +96,9 @@ struct pnfs_layoutdriver_type {
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 
+	void (*return_range) (struct pnfs_layout_hdr *lo,
+			      struct pnfs_layout_range *range);
+
 	/* test for nfs page cache coalescing */
 	const struct nfs_pageio_ops *pg_read_ops;
 	const struct nfs_pageio_ops *pg_write_ops;
@@ -121,14 +128,17 @@ struct pnfs_layoutdriver_type {
 	enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
 
 	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+	struct nfs4_deviceid_node * (*alloc_deviceid_node)
+			(struct nfs_server *server, struct pnfs_device *pdev,
+			gfp_t gfp_flags);
 
 	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
 				struct xdr_stream *xdr,
 				const struct nfs4_layoutreturn_args *args);
 
 	void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
-
+	int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
 	void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
 				struct xdr_stream *xdr,
 				const struct nfs4_layoutcommit_args *args);
 };
@@ -171,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
171extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 181extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
172 182
173/* nfs4proc.c */ 183/* nfs4proc.c */
174extern int nfs4_proc_getdevicelist(struct nfs_server *server,
175 const struct nfs_fh *fh,
176 struct pnfs_devicelist *devlist);
177extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 184extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
178 struct pnfs_device *dev, 185 struct pnfs_device *dev,
179 struct rpc_cred *cred); 186 struct rpc_cred *cred);
@@ -219,6 +226,7 @@ void pnfs_roc_release(struct inode *ino);
219void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 226void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
220bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); 227bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
221void pnfs_set_layoutcommit(struct nfs_pgio_header *); 228void pnfs_set_layoutcommit(struct nfs_pgio_header *);
229void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
222void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 230void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
223int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 231int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
224int _pnfs_return_layout(struct inode *); 232int _pnfs_return_layout(struct inode *);
@@ -255,11 +263,12 @@ struct nfs4_deviceid_node {
255 atomic_t ref; 263 atomic_t ref;
256}; 264};
257 265
258struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 266struct nfs4_deviceid_node *
267nfs4_find_get_deviceid(struct nfs_server *server,
268 const struct nfs4_deviceid *id, struct rpc_cred *cred,
269 gfp_t gfp_mask);
259void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 270void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
260void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, 271void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
261 const struct pnfs_layoutdriver_type *,
262 const struct nfs_client *,
263 const struct nfs4_deviceid *); 272 const struct nfs4_deviceid *);
264struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); 273struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
265bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); 274bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
@@ -267,6 +276,13 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
267bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 276bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
268void nfs4_deviceid_purge_client(const struct nfs_client *); 277void nfs4_deviceid_purge_client(const struct nfs_client *);
269 278
279static inline struct nfs4_deviceid_node *
280nfs4_get_deviceid(struct nfs4_deviceid_node *d)
281{
282 atomic_inc(&d->ref);
283 return d;
284}
285
270static inline struct pnfs_layout_segment * 286static inline struct pnfs_layout_segment *
271pnfs_get_lseg(struct pnfs_layout_segment *lseg) 287pnfs_get_lseg(struct pnfs_layout_segment *lseg)
272{ 288{
@@ -368,6 +384,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
368} 384}
369 385
370static inline bool 386static inline bool
387pnfs_ld_read_whole_page(struct inode *inode)
388{
389 if (!pnfs_enabled_sb(NFS_SERVER(inode)))
390 return false;
391 return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
392}
393
394static inline bool
371pnfs_layoutcommit_outstanding(struct inode *inode) 395pnfs_layoutcommit_outstanding(struct inode *inode)
372{ 396{
373 struct nfs_inode *nfsi = NFS_I(inode); 397 struct nfs_inode *nfsi = NFS_I(inode);
@@ -443,6 +467,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
443} 467}
444 468
445static inline bool 469static inline bool
470pnfs_ld_read_whole_page(struct inode *inode)
471{
472 return false;
473}
474
475static inline bool
446pnfs_roc(struct inode *ino) 476pnfs_roc(struct inode *ino)
447{ 477{
448 return false; 478 return false;
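
The pnfs.h hunks above add an inline nfs4_get_deviceid() to pair with the
existing nfs4_put_deviceid_node(). A minimal sketch of how a layout driver
might use that pair; the my_lseg type and helper names are hypothetical,
and only the two deviceid helpers come from the header:

/* Sketch only; assumes the usual pnfs.h and <linux/slab.h> includes. */
struct my_lseg {
	struct pnfs_layout_segment lseg;
	struct nfs4_deviceid_node *devid;	/* counted reference */
};

static void my_lseg_share_devid(struct my_lseg *dst, struct my_lseg *src)
{
	/* take an extra reference before publishing a second pointer */
	dst->devid = nfs4_get_deviceid(src->devid);
}

static void my_free_lseg(struct pnfs_layout_segment *lseg)
{
	struct my_lseg *ml = container_of(lseg, struct my_lseg, lseg);

	/* drop our reference; the node is freed when the count hits zero */
	nfs4_put_deviceid_node(ml->devid);
	kfree(ml);
}
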
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
29 */ 29 */
30 30
31#include <linux/export.h> 31#include <linux/export.h>
32#include <linux/nfs_fs.h>
33#include "nfs4session.h"
34#include "internal.h"
32#include "pnfs.h" 35#include "pnfs.h"
33 36
34#define NFSDBG_FACILITY NFSDBG_PNFS 37#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
89 return NULL; 92 return NULL;
90} 93}
91 94
95static struct nfs4_deviceid_node *
96nfs4_get_device_info(struct nfs_server *server,
97 const struct nfs4_deviceid *dev_id,
98 struct rpc_cred *cred, gfp_t gfp_flags)
99{
100 struct nfs4_deviceid_node *d = NULL;
101 struct pnfs_device *pdev = NULL;
102 struct page **pages = NULL;
103 u32 max_resp_sz;
104 int max_pages;
105 int rc, i;
106
107 /*
108 * Use the session max response size as the basis for setting
109 * GETDEVICEINFO's maxcount
110 */
111 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
112 if (server->pnfs_curr_ld->max_deviceinfo_size &&
113 server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
114 max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
115 max_pages = nfs_page_array_len(0, max_resp_sz);
116 dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
117 __func__, server, max_resp_sz, max_pages);
118
119 pdev = kzalloc(sizeof(*pdev), gfp_flags);
120 if (!pdev)
121 return NULL;
122
123 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
124 if (!pages)
125 goto out_free_pdev;
126
127 for (i = 0; i < max_pages; i++) {
128 pages[i] = alloc_page(gfp_flags);
129 if (!pages[i])
130 goto out_free_pages;
131 }
132
133 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
134 pdev->layout_type = server->pnfs_curr_ld->id;
135 pdev->pages = pages;
136 pdev->pgbase = 0;
137 pdev->pglen = max_resp_sz;
138 pdev->mincount = 0;
139 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
140
141 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
142 dprintk("%s getdevice info returns %d\n", __func__, rc);
143 if (rc)
144 goto out_free_pages;
145
146 /*
147 * Found new device, need to decode it and then add it to the
148 * list of known devices for this mountpoint.
149 */
150 d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
151 gfp_flags);
152
153out_free_pages:
154 for (i = 0; i < max_pages; i++)
155 __free_page(pages[i]);
156 kfree(pages);
157out_free_pdev:
158 kfree(pdev);
159 dprintk("<-- %s d %p\n", __func__, d);
160 return d;
161}
162
92/* 163/*
93 * Lookup a deviceid in cache and get a reference count on it if found 164 * Lookup a deviceid in cache and get a reference count on it if found
94 * 165 *
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
96 * @id deviceid to look up 167 * @id deviceid to look up
97 */ 168 */
98static struct nfs4_deviceid_node * 169static struct nfs4_deviceid_node *
99_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, 170__nfs4_find_get_deviceid(struct nfs_server *server,
100 const struct nfs_client *clp, const struct nfs4_deviceid *id, 171 const struct nfs4_deviceid *id, long hash)
101 long hash)
102{ 172{
103 struct nfs4_deviceid_node *d; 173 struct nfs4_deviceid_node *d;
104 174
105 rcu_read_lock(); 175 rcu_read_lock();
106 d = _lookup_deviceid(ld, clp, id, hash); 176 d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
177 hash);
107 if (d != NULL) 178 if (d != NULL)
108 atomic_inc(&d->ref); 179 atomic_inc(&d->ref);
109 rcu_read_unlock(); 180 rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
111} 182}
112 183
113struct nfs4_deviceid_node * 184struct nfs4_deviceid_node *
114nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, 185nfs4_find_get_deviceid(struct nfs_server *server,
115 const struct nfs_client *clp, const struct nfs4_deviceid *id) 186 const struct nfs4_deviceid *id, struct rpc_cred *cred,
187 gfp_t gfp_mask)
116{ 188{
117 return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); 189 long hash = nfs4_deviceid_hash(id);
190 struct nfs4_deviceid_node *d, *new;
191
192 d = __nfs4_find_get_deviceid(server, id, hash);
193 if (d)
194 return d;
195
196 new = nfs4_get_device_info(server, id, cred, gfp_mask);
197 if (!new)
198 return new;
199
200 spin_lock(&nfs4_deviceid_lock);
201 d = __nfs4_find_get_deviceid(server, id, hash);
202 if (d) {
203 spin_unlock(&nfs4_deviceid_lock);
204 server->pnfs_curr_ld->free_deviceid_node(new);
205 return d;
206 }
207 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
208 atomic_inc(&new->ref);
209 spin_unlock(&nfs4_deviceid_lock);
210
211 return new;
118} 212}
119EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); 213EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
120 214
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
151EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); 245EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
152 246
153void 247void
154nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, 248nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
155 const struct pnfs_layoutdriver_type *ld,
156 const struct nfs_client *nfs_client,
157 const struct nfs4_deviceid *id) 249 const struct nfs4_deviceid *id)
158{ 250{
159 INIT_HLIST_NODE(&d->node); 251 INIT_HLIST_NODE(&d->node);
160 INIT_HLIST_NODE(&d->tmpnode); 252 INIT_HLIST_NODE(&d->tmpnode);
161 d->ld = ld; 253 d->ld = server->pnfs_curr_ld;
162 d->nfs_client = nfs_client; 254 d->nfs_client = server->nfs_client;
163 d->flags = 0; 255 d->flags = 0;
164 d->deviceid = *id; 256 d->deviceid = *id;
165 atomic_set(&d->ref, 1); 257 atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
167EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); 259EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
168 260
169/* 261/*
170 * Uniquely initialize and insert a deviceid node into cache
171 *
172 * @new new deviceid node
173 * Note that the caller must set up the following members:
174 * new->ld
175 * new->nfs_client
176 * new->deviceid
177 *
178 * @ret the inserted node, if none found, otherwise, the found entry.
179 */
180struct nfs4_deviceid_node *
181nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
182{
183 struct nfs4_deviceid_node *d;
184 long hash;
185
186 spin_lock(&nfs4_deviceid_lock);
187 hash = nfs4_deviceid_hash(&new->deviceid);
188 d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
189 if (d) {
190 spin_unlock(&nfs4_deviceid_lock);
191 return d;
192 }
193
194 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
195 spin_unlock(&nfs4_deviceid_lock);
196 atomic_inc(&new->ref);
197
198 return new;
199}
200EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
201
202/*
203 * Dereference a deviceid node and delete it when its reference count drops 262 * Dereference a deviceid node and delete it when its reference count drops
204 * to zero. 263 * to zero.
205 * 264 *
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
299 } 358 }
300 rcu_read_unlock(); 359 rcu_read_unlock();
301} 360}
302
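
The rewritten nfs4_find_get_deviceid() above is a lookup-or-insert with
double-checked locking: an RCU fast-path lookup, an unlocked (and possibly
slow) GETDEVICEINFO on a miss, then a second lookup under
nfs4_deviceid_lock so that a racing inserter wins and the loser frees its
duplicate node. Reduced to a skeleton, with illustrative cache_*/node
helpers standing in for the real deviceid code:

#include <linux/spinlock.h>

struct key;
struct node;
struct cache { spinlock_t lock; };

/* illustrative stand-ins, not kernel API */
struct node *cache_lookup_get(struct cache *c, const struct key *k);
struct node *fetch_from_server(const struct key *k);
void cache_insert(struct cache *c, struct node *n);
void ref_get(struct node *n);
void free_node(struct node *n);

struct node *lookup_or_insert(struct cache *c, const struct key *k)
{
	struct node *d, *new;

	d = cache_lookup_get(c, k);	/* RCU fast path, takes a reference */
	if (d)
		return d;

	new = fetch_from_server(k);	/* slow path, no locks held */
	if (!new)
		return NULL;

	spin_lock(&c->lock);
	d = cache_lookup_get(c, k);	/* re-check: did someone beat us? */
	if (d) {
		spin_unlock(&c->lock);
		free_node(new);		/* the loser discards its copy */
		return d;
	}
	cache_insert(c, new);
	ref_get(new);	/* one reference for the cache, one for the caller */
	spin_unlock(&c->lock);
	return new;
}
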
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e4499d5b51e8..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
2065 return NFS_TEXT_DATA; 2065 return NFS_TEXT_DATA;
2066 } 2066 }
2067 2067
2068#if !IS_ENABLED(CONFIG_NFS_V3)
2069 if (args->version == 3)
2070 goto out_v3_not_compiled;
2071#endif /* !CONFIG_NFS_V3 */
2072
2073 return 0; 2068 return 0;
2074 2069
2075out_no_data: 2070out_no_data:
@@ -2085,12 +2080,6 @@ out_no_sec:
2085 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); 2080 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
2086 return -EINVAL; 2081 return -EINVAL;
2087 2082
2088#if !IS_ENABLED(CONFIG_NFS_V3)
2089out_v3_not_compiled:
2090 dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
2091 return -EPROTONOSUPPORT;
2092#endif /* !CONFIG_NFS_V3 */
2093
2094out_nomem: 2083out_nomem:
2095 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); 2084 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
2096 return -ENOMEM; 2085 return -ENOMEM;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 175d5d073ccf..12493846a2d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops;
49static void nfs_clear_request_commit(struct nfs_page *req); 49static void nfs_clear_request_commit(struct nfs_page *req);
50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, 50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
51 struct inode *inode); 51 struct inode *inode);
52static struct nfs_page *
53nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
54 struct page *page);
52 55
53static struct kmem_cache *nfs_wdata_cachep; 56static struct kmem_cache *nfs_wdata_cachep;
54static mempool_t *nfs_wdata_mempool; 57static mempool_t *nfs_wdata_mempool;
@@ -95,38 +98,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
95} 98}
96 99
97/* 100/*
98 * nfs_page_search_commits_for_head_request_locked
99 *
100 * Search through commit lists on @inode for the head request for @page.
101 * Must be called while holding the inode (which is cinfo) lock.
102 *
103 * Returns the head request if found, or NULL if not found.
104 */
105static struct nfs_page *
106nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
107 struct page *page)
108{
109 struct nfs_page *freq, *t;
110 struct nfs_commit_info cinfo;
111 struct inode *inode = &nfsi->vfs_inode;
112
113 nfs_init_cinfo_from_inode(&cinfo, inode);
114
115 /* search through pnfs commit lists */
116 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
117 if (freq)
118 return freq->wb_head;
119
120 /* Linearly search the commit list for the correct request */
121 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
122 if (freq->wb_page == page)
123 return freq->wb_head;
124 }
125
126 return NULL;
127}
128
129/*
130 * nfs_page_find_head_request_locked - find head request associated with @page 101 * nfs_page_find_head_request_locked - find head request associated with @page
131 * 102 *
132 * must be called while holding the inode lock. 103 * must be called while holding the inode lock.
@@ -271,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
271 242
272static int wb_priority(struct writeback_control *wbc) 243static int wb_priority(struct writeback_control *wbc)
273{ 244{
245 int ret = 0;
274 if (wbc->for_reclaim) 246 if (wbc->for_reclaim)
275 return FLUSH_HIGHPRI | FLUSH_STABLE; 247 return FLUSH_HIGHPRI | FLUSH_STABLE;
248 if (wbc->sync_mode == WB_SYNC_ALL)
249 ret = FLUSH_COND_STABLE;
276 if (wbc->for_kupdate || wbc->for_background) 250 if (wbc->for_kupdate || wbc->for_background)
277 return FLUSH_LOWPRI | FLUSH_COND_STABLE; 251 ret |= FLUSH_LOWPRI;
278 return FLUSH_COND_STABLE; 252 return ret;
279} 253}
280 254
281/* 255/*
@@ -731,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
731 if (likely(!PageSwapCache(head->wb_page))) { 705 if (likely(!PageSwapCache(head->wb_page))) {
732 set_page_private(head->wb_page, 0); 706 set_page_private(head->wb_page, 0);
733 ClearPagePrivate(head->wb_page); 707 ClearPagePrivate(head->wb_page);
708 smp_mb__after_atomic();
709 wake_up_page(head->wb_page, PG_private);
734 clear_bit(PG_MAPPED, &head->wb_flags); 710 clear_bit(PG_MAPPED, &head->wb_flags);
735 } 711 }
736 nfsi->npages--; 712 nfsi->npages--;
@@ -749,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
749 __set_page_dirty_nobuffers(req->wb_page); 725 __set_page_dirty_nobuffers(req->wb_page);
750} 726}
751 727
752#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) 728/*
729 * nfs_page_search_commits_for_head_request_locked
730 *
731 * Search through commit lists on @inode for the head request for @page.
732 * Must be called while holding the inode (which is cinfo) lock.
733 *
734 * Returns the head request if found, or NULL if not found.
735 */
736static struct nfs_page *
737nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
738 struct page *page)
739{
740 struct nfs_page *freq, *t;
741 struct nfs_commit_info cinfo;
742 struct inode *inode = &nfsi->vfs_inode;
743
744 nfs_init_cinfo_from_inode(&cinfo, inode);
745
746 /* search through pnfs commit lists */
747 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
748 if (freq)
749 return freq->wb_head;
750
751 /* Linearly search the commit list for the correct request */
752 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
753 if (freq->wb_page == page)
754 return freq->wb_head;
755 }
756
757 return NULL;
758}
759
753/** 760/**
754 * nfs_request_add_commit_list - add request to a commit list 761 * nfs_request_add_commit_list - add request to a commit list
755 * @req: pointer to a struct nfs_page 762 * @req: pointer to a struct nfs_page
@@ -867,36 +874,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr)
867 return hdr->verf.committed != NFS_FILE_SYNC; 874 return hdr->verf.committed != NFS_FILE_SYNC;
868} 875}
869 876
870#else
871static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
872 struct inode *inode)
873{
874}
875
876void nfs_init_cinfo(struct nfs_commit_info *cinfo,
877 struct inode *inode,
878 struct nfs_direct_req *dreq)
879{
880}
881
882void
883nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
884 struct nfs_commit_info *cinfo)
885{
886}
887
888static void
889nfs_clear_request_commit(struct nfs_page *req)
890{
891}
892
893int nfs_write_need_commit(struct nfs_pgio_header *hdr)
894{
895 return 0;
896}
897
898#endif
899
900static void nfs_write_completion(struct nfs_pgio_header *hdr) 877static void nfs_write_completion(struct nfs_pgio_header *hdr)
901{ 878{
902 struct nfs_commit_info cinfo; 879 struct nfs_commit_info cinfo;
@@ -932,7 +909,6 @@ out:
932 hdr->release(hdr); 909 hdr->release(hdr);
933} 910}
934 911
935#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
936unsigned long 912unsigned long
937nfs_reqs_to_commit(struct nfs_commit_info *cinfo) 913nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
938{ 914{
@@ -989,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
989 return ret; 965 return ret;
990} 966}
991 967
992#else
993unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
994{
995 return 0;
996}
997
998int nfs_scan_commit(struct inode *inode, struct list_head *dst,
999 struct nfs_commit_info *cinfo)
1000{
1001 return 0;
1002}
1003#endif
1004
1005/* 968/*
1006 * Search for an existing write request, and attempt to update 969 * Search for an existing write request, and attempt to update
1007 * it to reflect a new dirty region on a given page. 970 * it to reflect a new dirty region on a given page.
@@ -1394,7 +1357,6 @@ static int nfs_writeback_done(struct rpc_task *task,
1394 return status; 1357 return status;
1395 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); 1358 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
1396 1359
1397#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1398 if (hdr->res.verf->committed < hdr->args.stable && 1360 if (hdr->res.verf->committed < hdr->args.stable &&
1399 task->tk_status >= 0) { 1361 task->tk_status >= 0) {
1400 /* We tried a write call, but the server did not 1362 /* We tried a write call, but the server did not
@@ -1416,7 +1378,6 @@ static int nfs_writeback_done(struct rpc_task *task,
1416 complain = jiffies + 300 * HZ; 1378 complain = jiffies + 300 * HZ;
1417 } 1379 }
1418 } 1380 }
1419#endif
1420 1381
1421 /* Deal with the suid/sgid bit corner case */ 1382 /* Deal with the suid/sgid bit corner case */
1422 if (nfs_should_remove_suid(inode)) 1383 if (nfs_should_remove_suid(inode))
@@ -1469,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task,
1469} 1430}
1470 1431
1471 1432
1472#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1473static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1433static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1474{ 1434{
1475 int ret; 1435 int ret;
@@ -1538,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1538} 1498}
1539EXPORT_SYMBOL_GPL(nfs_initiate_commit); 1499EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1540 1500
1501static loff_t nfs_get_lwb(struct list_head *head)
1502{
1503 loff_t lwb = 0;
1504 struct nfs_page *req;
1505
1506 list_for_each_entry(req, head, wb_list)
1507 if (lwb < (req_offset(req) + req->wb_bytes))
1508 lwb = req_offset(req) + req->wb_bytes;
1509
1510 return lwb;
1511}
1512
1541/* 1513/*
1542 * Set up the argument/result storage required for the RPC call. 1514 * Set up the argument/result storage required for the RPC call.
1543 */ 1515 */
@@ -1557,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
1557 data->inode = inode; 1529 data->inode = inode;
1558 data->cred = first->wb_context->cred; 1530 data->cred = first->wb_context->cred;
1559 data->lseg = lseg; /* reference transferred */ 1531 data->lseg = lseg; /* reference transferred */
1532 /* only set lwb for pnfs commit */
1533 if (lseg)
1534 data->lwb = nfs_get_lwb(&data->pages);
1560 data->mds_ops = &nfs_commit_ops; 1535 data->mds_ops = &nfs_commit_ops;
1561 data->completion_ops = cinfo->completion_ops; 1536 data->completion_ops = cinfo->completion_ops;
1562 data->dreq = cinfo->dreq; 1537 data->dreq = cinfo->dreq;
@@ -1636,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1636 struct nfs_page *req; 1611 struct nfs_page *req;
1637 int status = data->task.tk_status; 1612 int status = data->task.tk_status;
1638 struct nfs_commit_info cinfo; 1613 struct nfs_commit_info cinfo;
1614 struct nfs_server *nfss;
1639 1615
1640 while (!list_empty(&data->pages)) { 1616 while (!list_empty(&data->pages)) {
1641 req = nfs_list_entry(data->pages.next); 1617 req = nfs_list_entry(data->pages.next);
@@ -1669,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1669 next: 1645 next:
1670 nfs_unlock_and_release_request(req); 1646 nfs_unlock_and_release_request(req);
1671 } 1647 }
1648 nfss = NFS_SERVER(data->inode);
1649 if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
1650 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
1651
1672 nfs_init_cinfo(&cinfo, data->inode, data->dreq); 1652 nfs_init_cinfo(&cinfo, data->inode, data->dreq);
1673 if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) 1653 if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
1674 nfs_commit_clear_lock(NFS_I(data->inode)); 1654 nfs_commit_clear_lock(NFS_I(data->inode));
@@ -1778,12 +1758,6 @@ out_mark_dirty:
1778 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1758 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1779 return ret; 1759 return ret;
1780} 1760}
1781#else
1782static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1783{
1784 return 0;
1785}
1786#endif
1787 1761
1788int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1762int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1789{ 1763{
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index f689ed82af3a..d153ca3ea577 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -3,5 +3,6 @@
3# 3#
4 4
5obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o 5obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
6
7nfs_acl-objs := nfsacl.o 6nfs_acl-objs := nfsacl.o
7
8obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c
index 6d1ee7204c88..ae6e58ea4de5 100644
--- a/fs/lockd/grace.c
+++ b/fs/nfs_common/grace.c
@@ -1,17 +1,20 @@
1/* 1/*
2 * Common code for control of lockd and nfsv4 grace periods. 2 * Common code for control of lockd and nfsv4 grace periods.
3 *
4 * Transplanted from lockd code
3 */ 5 */
4 6
5#include <linux/module.h> 7#include <linux/module.h>
6#include <linux/lockd/bind.h>
7#include <net/net_namespace.h> 8#include <net/net_namespace.h>
9#include <net/netns/generic.h>
10#include <linux/fs.h>
8 11
9#include "netns.h" 12static int grace_net_id;
10
11static DEFINE_SPINLOCK(grace_lock); 13static DEFINE_SPINLOCK(grace_lock);
12 14
13/** 15/**
14 * locks_start_grace 16 * locks_start_grace
17 * @net: net namespace that this lock manager belongs to
15 * @lm: who this grace period is for 18 * @lm: who this grace period is for
16 * 19 *
17 * A grace period is a period during which locks should not be given 20 * A grace period is a period during which locks should not be given
@@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock);
21 * 24 *
22 * This function is called to start a grace period. 25 * This function is called to start a grace period.
23 */ 26 */
24void locks_start_grace(struct net *net, struct lock_manager *lm) 27void
28locks_start_grace(struct net *net, struct lock_manager *lm)
25{ 29{
26 struct lockd_net *ln = net_generic(net, lockd_net_id); 30 struct list_head *grace_list = net_generic(net, grace_net_id);
27 31
28 spin_lock(&grace_lock); 32 spin_lock(&grace_lock);
29 list_add(&lm->list, &ln->grace_list); 33 list_add(&lm->list, grace_list);
30 spin_unlock(&grace_lock); 34 spin_unlock(&grace_lock);
31} 35}
32EXPORT_SYMBOL_GPL(locks_start_grace); 36EXPORT_SYMBOL_GPL(locks_start_grace);
33 37
34/** 38/**
35 * locks_end_grace 39 * locks_end_grace
40 * @net: net namespace that this lock manager belongs to
36 * @lm: who this grace period is for 41 * @lm: who this grace period is for
37 * 42 *
38 * Call this function to state that the given lock manager is ready to 43 * Call this function to state that the given lock manager is ready to
@@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
41 * Note that callers count on it being safe to call this more than once, 46 * Note that callers count on it being safe to call this more than once,
42 * and the second call should be a no-op. 47 * and the second call should be a no-op.
43 */ 48 */
44void locks_end_grace(struct lock_manager *lm) 49void
50locks_end_grace(struct lock_manager *lm)
45{ 51{
46 spin_lock(&grace_lock); 52 spin_lock(&grace_lock);
47 list_del_init(&lm->list); 53 list_del_init(&lm->list);
@@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
56 * to answer ordinary lock requests, and when they should accept only 62 * to answer ordinary lock requests, and when they should accept only
57 * lock reclaims. 63 * lock reclaims.
58 */ 64 */
59int locks_in_grace(struct net *net) 65int
66locks_in_grace(struct net *net)
60{ 67{
61 struct lockd_net *ln = net_generic(net, lockd_net_id); 68 struct list_head *grace_list = net_generic(net, grace_net_id);
62 69
63 return !list_empty(&ln->grace_list); 70 return !list_empty(grace_list);
64} 71}
65EXPORT_SYMBOL_GPL(locks_in_grace); 72EXPORT_SYMBOL_GPL(locks_in_grace);
73
74static int __net_init
75grace_init_net(struct net *net)
76{
77 struct list_head *grace_list = net_generic(net, grace_net_id);
78
79 INIT_LIST_HEAD(grace_list);
80 return 0;
81}
82
83static void __net_exit
84grace_exit_net(struct net *net)
85{
86 struct list_head *grace_list = net_generic(net, grace_net_id);
87
88 BUG_ON(!list_empty(grace_list));
89}
90
91static struct pernet_operations grace_net_ops = {
92 .init = grace_init_net,
93 .exit = grace_exit_net,
94 .id = &grace_net_id,
95 .size = sizeof(struct list_head),
96};
97
98static int __init
99init_grace(void)
100{
101 return register_pernet_subsys(&grace_net_ops);
102}
103
104static void __exit
105exit_grace(void)
106{
107 unregister_pernet_subsys(&grace_net_ops);
108}
109
110MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
111MODULE_LICENSE("GPL");
112module_init(init_grace)
113module_exit(exit_grace)
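
grace.c now keeps the per-namespace grace list in pernet storage;
net_generic() returns the list head directly because grace_net_ops.size is
sizeof(struct list_head), so lockd and nfsd can share the code without
lockd's private netns struct. A hedged sketch of a consumer of the API;
my_lock_manager and its callers are hypothetical:

#include <linux/fs.h>
#include <net/net_namespace.h>

static struct lock_manager my_lock_manager;	/* hypothetical manager */

static void my_server_startup(struct net *net)
{
	locks_start_grace(net, &my_lock_manager);	/* enter grace */
}

static void my_server_grace_over(struct net *net)
{
	locks_end_grace(&my_lock_manager);	/* safe to call twice */
	if (!locks_in_grace(net))
		pr_info("grace period over for this namespace\n");
}
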
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f994e750e0d1..73395156bdb4 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -71,6 +71,7 @@ config NFSD_V4
71 select FS_POSIX_ACL 71 select FS_POSIX_ACL
72 select SUNRPC_GSS 72 select SUNRPC_GSS
73 select CRYPTO 73 select CRYPTO
74 select GRACE_PERIOD
74 help 75 help
75 This option enables support in your system's NFS server for 76 This option enables support in your system's NFS server for
76 version 4 of the NFS protocol (RFC 3530). 77 version 4 of the NFS protocol (RFC 3530).
@@ -94,9 +95,6 @@ config NFSD_V4_SECURITY_LABEL
94 If you do not wish to enable fine-grained security labels SELinux or 95 If you do not wish to enable fine-grained security labels SELinux or
95 Smack policies on NFSv4 files, say N. 96 Smack policies on NFSv4 files, say N.
96 97
97 WARNING: there is still a chance of backwards-incompatible protocol changes.
98 For now we recommend "Y" only for developers and testers.
99
100config NFSD_FAULT_INJECTION 98config NFSD_FAULT_INJECTION
101 bool "NFS server manual fault injection" 99 bool "NFS server manual fault injection"
102 depends on NFSD_V4 && DEBUG_KERNEL 100 depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b582f9ab6b2a..dd96a3830004 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -18,7 +18,6 @@
18 * is much larger than a sockaddr_in6. 18 * is much larger than a sockaddr_in6.
19 */ 19 */
20struct svc_cacherep { 20struct svc_cacherep {
21 struct hlist_node c_hash;
22 struct list_head c_lru; 21 struct list_head c_lru;
23 22
24 unsigned char c_state, /* unused, inprog, done */ 23 unsigned char c_state, /* unused, inprog, done */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 72ffd7cce3c3..30a739d896ff 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1145,6 +1145,7 @@ static struct flags {
1145 { NFSEXP_ALLSQUASH, {"all_squash", ""}}, 1145 { NFSEXP_ALLSQUASH, {"all_squash", ""}},
1146 { NFSEXP_ASYNC, {"async", "sync"}}, 1146 { NFSEXP_ASYNC, {"async", "sync"}},
1147 { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, 1147 { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}},
1148 { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}},
1148 { NFSEXP_NOHIDE, {"nohide", ""}}, 1149 { NFSEXP_NOHIDE, {"nohide", ""}},
1149 { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, 1150 { NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
1150 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1151 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index fa2525b2e9d7..12f2aab4f614 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -223,11 +223,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
223 newfhp = fh_init(&resp->fh, NFS3_FHSIZE); 223 newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
224 attr = &argp->attrs; 224 attr = &argp->attrs;
225 225
226 /* Get the directory inode */
227 nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE);
228 if (nfserr)
229 RETURN_STATUS(nfserr);
230
231 /* Unfudge the mode bits */ 226 /* Unfudge the mode bits */
232 attr->ia_mode &= ~S_IFMT; 227 attr->ia_mode &= ~S_IFMT;
233 if (!(attr->ia_valid & ATTR_MODE)) { 228 if (!(attr->ia_valid & ATTR_MODE)) {
@@ -471,6 +466,14 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
471 resp->buflen = resp->count; 466 resp->buflen = resp->count;
472 resp->rqstp = rqstp; 467 resp->rqstp = rqstp;
473 offset = argp->cookie; 468 offset = argp->cookie;
469
470 nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
471 if (nfserr)
472 RETURN_STATUS(nfserr);
473
474 if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS)
475 RETURN_STATUS(nfserr_notsupp);
476
474 nfserr = nfsd_readdir(rqstp, &resp->fh, 477 nfserr = nfsd_readdir(rqstp, &resp->fh,
475 &offset, 478 &offset,
476 &resp->common, 479 &resp->common,
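
Taken together, the export.c and nfs3proc.c hunks implement the new
nordirplus export option: export.c teaches the flag table the name, and
the v3 READDIRPLUS handler, after the fh_verify() it now performs itself,
returns nfserr_notsupp when the export sets NFSEXP_NOREADDIRPLUS, pushing
clients back to plain READDIR. A hypothetical /etc/exports line using it:

# illustrative export: allow READDIR but refuse READDIRPLUS
/export/bigdir    *.example.com(ro,sync,nordirplus)
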
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e0be57b0f79b..ed2b1151b171 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
49 49
50/* Index of predefined Linux callback client operations */ 50/* Index of predefined Linux callback client operations */
51 51
52enum {
53 NFSPROC4_CLNT_CB_NULL = 0,
54 NFSPROC4_CLNT_CB_RECALL,
55 NFSPROC4_CLNT_CB_SEQUENCE,
56};
57
58struct nfs4_cb_compound_hdr { 52struct nfs4_cb_compound_hdr {
59 /* args */ 53 /* args */
60 u32 ident; /* minorversion 0 only */ 54 u32 ident; /* minorversion 0 only */
@@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
494static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, 488static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
495 const struct nfsd4_callback *cb) 489 const struct nfsd4_callback *cb)
496{ 490{
497 const struct nfs4_delegation *args = cb->cb_op; 491 const struct nfs4_delegation *dp = cb_to_delegation(cb);
498 struct nfs4_cb_compound_hdr hdr = { 492 struct nfs4_cb_compound_hdr hdr = {
499 .ident = cb->cb_clp->cl_cb_ident, 493 .ident = cb->cb_clp->cl_cb_ident,
500 .minorversion = cb->cb_minorversion, 494 .minorversion = cb->cb_minorversion,
@@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
502 496
503 encode_cb_compound4args(xdr, &hdr); 497 encode_cb_compound4args(xdr, &hdr);
504 encode_cb_sequence4args(xdr, cb, &hdr); 498 encode_cb_sequence4args(xdr, cb, &hdr);
505 encode_cb_recall4args(xdr, args, &hdr); 499 encode_cb_recall4args(xdr, dp, &hdr);
506 encode_cb_nops(&hdr); 500 encode_cb_nops(&hdr);
507} 501}
508 502
@@ -746,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
746 740
747static struct workqueue_struct *callback_wq; 741static struct workqueue_struct *callback_wq;
748 742
749static void run_nfsd4_cb(struct nfsd4_callback *cb)
750{
751 queue_work(callback_wq, &cb->cb_work);
752}
753
754static void do_probe_callback(struct nfs4_client *clp)
755{
756 struct nfsd4_callback *cb = &clp->cl_cb_null;
757
758 cb->cb_op = NULL;
759 cb->cb_clp = clp;
760
761 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
762 cb->cb_msg.rpc_argp = NULL;
763 cb->cb_msg.rpc_resp = NULL;
764
765 cb->cb_ops = &nfsd4_cb_probe_ops;
766
767 run_nfsd4_cb(cb);
768}
769
770/* 743/*
771 * Poke the callback thread to process any updates to the callback 744 * Poke the callback thread to process any updates to the callback
772 * parameters, and send a null probe. 745 * parameters, and send a null probe.
@@ -775,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
775{ 748{
776 clp->cl_cb_state = NFSD4_CB_UNKNOWN; 749 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
777 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); 750 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
778 do_probe_callback(clp); 751 nfsd4_run_cb(&clp->cl_cb_null);
779} 752}
780 753
781void nfsd4_probe_callback_sync(struct nfs4_client *clp) 754void nfsd4_probe_callback_sync(struct nfs4_client *clp)
@@ -847,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
847 rpc_wake_up_next(&clp->cl_cb_waitq); 820 rpc_wake_up_next(&clp->cl_cb_waitq);
848 dprintk("%s: freed slot, new seqid=%d\n", __func__, 821 dprintk("%s: freed slot, new seqid=%d\n", __func__,
849 clp->cl_cb_session->se_cb_seq_nr); 822 clp->cl_cb_session->se_cb_seq_nr);
850
851 /* We're done looking into the sequence information */
852 task->tk_msg.rpc_resp = NULL;
853 } 823 }
854}
855
856
857static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
858{
859 struct nfsd4_callback *cb = calldata;
860 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
861 struct nfs4_client *clp = cb->cb_clp;
862 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
863
864 nfsd4_cb_done(task, calldata);
865 824
866 if (current_rpc_client != task->tk_client) { 825 if (clp->cl_cb_client != task->tk_client) {
867 /* We're shutting down or changing cl_cb_client; leave 826 /* We're shutting down or changing cl_cb_client; leave
868 * it to nfsd4_process_cb_update to restart the call if 827 * it to nfsd4_process_cb_update to restart the call if
869 * necessary. */ 828 * necessary. */
@@ -872,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
872 831
873 if (cb->cb_done) 832 if (cb->cb_done)
874 return; 833 return;
875 switch (task->tk_status) { 834
835 switch (cb->cb_ops->done(cb, task)) {
876 case 0: 836 case 0:
877 cb->cb_done = true; 837 task->tk_status = 0;
838 rpc_restart_call_prepare(task);
878 return; 839 return;
879 case -EBADHANDLE: 840 case 1:
880 case -NFS4ERR_BAD_STATEID:
881 /* Race: client probably got cb_recall
882 * before open reply granting delegation */
883 break; 841 break;
884 default: 842 case -1:
885 /* Network partition? */ 843 /* Network partition? */
886 nfsd4_mark_cb_down(clp, task->tk_status); 844 nfsd4_mark_cb_down(clp, task->tk_status);
845 break;
846 default:
847 BUG();
887 } 848 }
888 if (dp->dl_retries--) {
889 rpc_delay(task, 2*HZ);
890 task->tk_status = 0;
891 rpc_restart_call_prepare(task);
892 return;
893 }
894 nfsd4_mark_cb_down(clp, task->tk_status);
895 cb->cb_done = true; 849 cb->cb_done = true;
896} 850}
897 851
898static void nfsd4_cb_recall_release(void *calldata) 852static void nfsd4_cb_release(void *calldata)
899{ 853{
900 struct nfsd4_callback *cb = calldata; 854 struct nfsd4_callback *cb = calldata;
901 struct nfs4_client *clp = cb->cb_clp; 855 struct nfs4_client *clp = cb->cb_clp;
902 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
903 856
904 if (cb->cb_done) { 857 if (cb->cb_done) {
905 spin_lock(&clp->cl_lock); 858 spin_lock(&clp->cl_lock);
906 list_del(&cb->cb_per_client); 859 list_del(&cb->cb_per_client);
907 spin_unlock(&clp->cl_lock); 860 spin_unlock(&clp->cl_lock);
908 nfs4_put_stid(&dp->dl_stid); 861
862 cb->cb_ops->release(cb);
909 } 863 }
910} 864}
911 865
912static const struct rpc_call_ops nfsd4_cb_recall_ops = { 866static const struct rpc_call_ops nfsd4_cb_ops = {
913 .rpc_call_prepare = nfsd4_cb_prepare, 867 .rpc_call_prepare = nfsd4_cb_prepare,
914 .rpc_call_done = nfsd4_cb_recall_done, 868 .rpc_call_done = nfsd4_cb_done,
915 .rpc_release = nfsd4_cb_recall_release, 869 .rpc_release = nfsd4_cb_release,
916}; 870};
917 871
918int nfsd4_create_callback_queue(void) 872int nfsd4_create_callback_queue(void)
@@ -937,16 +891,10 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
937 * instead, nfsd4_run_cb_null() will detect the killed 891 * instead, nfsd4_run_cb_null() will detect the killed
938 * client, destroy the rpc client, and stop: 892 * client, destroy the rpc client, and stop:
939 */ 893 */
940 do_probe_callback(clp); 894 nfsd4_run_cb(&clp->cl_cb_null);
941 flush_workqueue(callback_wq); 895 flush_workqueue(callback_wq);
942} 896}
943 897
944static void nfsd4_release_cb(struct nfsd4_callback *cb)
945{
946 if (cb->cb_ops->rpc_release)
947 cb->cb_ops->rpc_release(cb);
948}
949
950/* requires cl_lock: */ 898/* requires cl_lock: */
951static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) 899static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
952{ 900{
@@ -1009,63 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
1009 } 957 }
1010 /* Yay, the callback channel's back! Restart any callbacks: */ 958 /* Yay, the callback channel's back! Restart any callbacks: */
1011 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) 959 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
1012 run_nfsd4_cb(cb); 960 queue_work(callback_wq, &cb->cb_work);
1013} 961}
1014 962
1015static void 963static void
1016nfsd4_run_callback_rpc(struct nfsd4_callback *cb) 964nfsd4_run_cb_work(struct work_struct *work)
1017{ 965{
966 struct nfsd4_callback *cb =
967 container_of(work, struct nfsd4_callback, cb_work);
1018 struct nfs4_client *clp = cb->cb_clp; 968 struct nfs4_client *clp = cb->cb_clp;
1019 struct rpc_clnt *clnt; 969 struct rpc_clnt *clnt;
1020 970
971 if (cb->cb_ops && cb->cb_ops->prepare)
972 cb->cb_ops->prepare(cb);
973
1021 if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) 974 if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
1022 nfsd4_process_cb_update(cb); 975 nfsd4_process_cb_update(cb);
1023 976
1024 clnt = clp->cl_cb_client; 977 clnt = clp->cl_cb_client;
1025 if (!clnt) { 978 if (!clnt) {
1026 /* Callback channel broken, or client killed; give up: */ 979 /* Callback channel broken, or client killed; give up: */
1027 nfsd4_release_cb(cb); 980 if (cb->cb_ops && cb->cb_ops->release)
981 cb->cb_ops->release(cb);
1028 return; 982 return;
1029 } 983 }
1030 cb->cb_msg.rpc_cred = clp->cl_cb_cred; 984 cb->cb_msg.rpc_cred = clp->cl_cb_cred;
1031 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 985 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
1032 cb->cb_ops, cb); 986 cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
1033} 987}
1034 988
1035void 989void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
1036nfsd4_run_cb_null(struct work_struct *w) 990 struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
1037{ 991{
1038 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
1039 cb_work);
1040 nfsd4_run_callback_rpc(cb);
1041}
1042
1043void
1044nfsd4_run_cb_recall(struct work_struct *w)
1045{
1046 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
1047 cb_work);
1048
1049 nfsd4_prepare_cb_recall(cb->cb_op);
1050 nfsd4_run_callback_rpc(cb);
1051}
1052
1053void nfsd4_cb_recall(struct nfs4_delegation *dp)
1054{
1055 struct nfsd4_callback *cb = &dp->dl_recall;
1056 struct nfs4_client *clp = dp->dl_stid.sc_client;
1057
1058 dp->dl_retries = 1;
1059 cb->cb_op = dp;
1060 cb->cb_clp = clp; 992 cb->cb_clp = clp;
1061 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 993 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
1062 cb->cb_msg.rpc_argp = cb; 994 cb->cb_msg.rpc_argp = cb;
1063 cb->cb_msg.rpc_resp = cb; 995 cb->cb_msg.rpc_resp = cb;
1064 996 cb->cb_ops = ops;
1065 cb->cb_ops = &nfsd4_cb_recall_ops; 997 INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
1066
1067 INIT_LIST_HEAD(&cb->cb_per_client); 998 INIT_LIST_HEAD(&cb->cb_per_client);
1068 cb->cb_done = true; 999 cb->cb_done = true;
1000}
1069 1001
1070 run_nfsd4_cb(&dp->dl_recall); 1002void nfsd4_run_cb(struct nfsd4_callback *cb)
1003{
1004 queue_work(callback_wq, &cb->cb_work);
1071} 1005}
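
The callback rework above collapses the recall-specific rpc_call_ops into
one generic nfsd4_cb_ops table that dispatches into a per-callback
nfsd4_callback_ops (prepare/done/release). A sketch of how a user such as
the delegation-recall code might now be wired; the ops bodies are
illustrative, the real wiring lives in nfs4state.c (not shown here), and
the NFSPROC4_CLNT_CB_* enum is assumed to have moved to a shared header,
per the new nfsd4_init_cb() signature:

static void my_recall_prepare(struct nfsd4_callback *cb)
{
	/* per-callback setup, e.g. what nfsd4_prepare_cb_recall() did */
}

static int my_recall_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
	switch (task->tk_status) {
	case 0:
		return 1;	/* finished */
	case -EBADHANDLE:
	case -NFS4ERR_BAD_STATEID:
		return 1;	/* client raced the open reply; give up */
	case -EAGAIN:
		return 0;	/* 0 asks nfsd4_cb_done() to restart the rpc */
	default:
		return -1;	/* -1 marks the callback channel down */
	}
}

static void my_recall_release(struct nfsd4_callback *cb)
{
	/* drop the reference that pins the containing delegation */
}

static struct nfsd4_callback_ops my_recall_ops = {
	.prepare	= my_recall_prepare,
	.done		= my_recall_done,
	.release	= my_recall_release,
};

/* init once when the delegation is set up, then fire on each recall: */
static void my_recall_delegation(struct nfs4_delegation *dp)
{
	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
		      &my_recall_ops, NFSPROC4_CLNT_CB_RECALL);
	nfsd4_run_cb(&dp->dl_recall);
}
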
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a0ab0a847d69..e1b3d3d472da 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
215 memset(&ent, 0, sizeof(ent)); 215 memset(&ent, 0, sizeof(ent));
216 216
217 /* Authentication name */ 217 /* Authentication name */
218 if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) 218 len = qword_get(&buf, buf1, PAGE_SIZE);
219 if (len <= 0 || len >= IDMAP_NAMESZ)
219 goto out; 220 goto out;
220 memcpy(ent.authname, buf1, sizeof(ent.authname)); 221 memcpy(ent.authname, buf1, sizeof(ent.authname));
221 222
@@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
245 /* Name */ 246 /* Name */
246 error = -EINVAL; 247 error = -EINVAL;
247 len = qword_get(&buf, buf1, PAGE_SIZE); 248 len = qword_get(&buf, buf1, PAGE_SIZE);
248 if (len < 0) 249 if (len < 0 || len >= IDMAP_NAMESZ)
249 goto out; 250 goto out;
250 if (len == 0) 251 if (len == 0)
251 set_bit(CACHE_NEGATIVE, &ent.h.flags); 252 set_bit(CACHE_NEGATIVE, &ent.h.flags);
252 else if (len >= IDMAP_NAMESZ)
253 goto out;
254 else 253 else
255 memcpy(ent.name, buf1, sizeof(ent.name)); 254 memcpy(ent.name, buf1, sizeof(ent.name));
256 error = -ENOMEM; 255 error = -ENOMEM;
@@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
259 goto out; 258 goto out;
260 259
261 cache_put(&res->h, cd); 260 cache_put(&res->h, cd);
262
263 error = 0; 261 error = 0;
264out: 262out:
265 kfree(buf1); 263 kfree(buf1);
266
267 return error; 264 return error;
268} 265}
269 266
270
271static struct ent * 267static struct ent *
272idtoname_lookup(struct cache_detail *cd, struct ent *item) 268idtoname_lookup(struct cache_detail *cd, struct ent *item)
273{ 269{
@@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
368{ 364{
369 struct ent ent, *res; 365 struct ent ent, *res;
370 char *buf1; 366 char *buf1;
371 int error = -EINVAL; 367 int len, error = -EINVAL;
372 368
373 if (buf[buflen - 1] != '\n') 369 if (buf[buflen - 1] != '\n')
374 return (-EINVAL); 370 return (-EINVAL);
@@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
381 memset(&ent, 0, sizeof(ent)); 377 memset(&ent, 0, sizeof(ent));
382 378
383 /* Authentication name */ 379 /* Authentication name */
384 if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) 380 len = qword_get(&buf, buf1, PAGE_SIZE);
381 if (len <= 0 || len >= IDMAP_NAMESZ)
385 goto out; 382 goto out;
386 memcpy(ent.authname, buf1, sizeof(ent.authname)); 383 memcpy(ent.authname, buf1, sizeof(ent.authname));
387 384
@@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
392 IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; 389 IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
393 390
394 /* Name */ 391 /* Name */
395 error = qword_get(&buf, buf1, PAGE_SIZE); 392 len = qword_get(&buf, buf1, PAGE_SIZE);
396 if (error <= 0 || error >= IDMAP_NAMESZ) 393 if (len <= 0 || len >= IDMAP_NAMESZ)
397 goto out; 394 goto out;
398 memcpy(ent.name, buf1, sizeof(ent.name)); 395 memcpy(ent.name, buf1, sizeof(ent.name));
399 396
@@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
421 error = 0; 418 error = 0;
422out: 419out:
423 kfree(buf1); 420 kfree(buf1);
424
425 return (error); 421 return (error);
426} 422}
427 423
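
The idmap changes above all enforce one invariant: a name that qword_get()
parsed into the PAGE_SIZE scratch buffer must fit, terminator included, in
the IDMAP_NAMESZ-sized fields of struct ent, otherwise the fixed-size
memcpy() would store a string with no NUL. The idiom in isolation, as a
hypothetical helper:

/* Illustrative: bounded copy of a qword_get() result into a fixed field. */
static int copy_idmap_name(char *dst, size_t dstsz, char **bufp, char *scratch)
{
	int len = qword_get(bufp, scratch, PAGE_SIZE);

	/* len >= dstsz would leave no room for the terminating NUL */
	if (len <= 0 || len >= dstsz)
		return -EINVAL;
	memcpy(dst, scratch, dstsz);
	return 0;
}
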
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5e0dc528a0e8..cdeb3cfd6f32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1013,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1013 return status; 1013 return status;
1014} 1014}
1015 1015
1016static __be32
1017nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1018 struct nfsd4_seek *seek)
1019{
1020 int whence;
1021 __be32 status;
1022 struct file *file;
1023
1024 status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
1025 &seek->seek_stateid,
1026 RD_STATE, &file);
1027 if (status) {
1028 dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
1029 return status;
1030 }
1031
1032 switch (seek->seek_whence) {
1033 case NFS4_CONTENT_DATA:
1034 whence = SEEK_DATA;
1035 break;
1036 case NFS4_CONTENT_HOLE:
1037 whence = SEEK_HOLE;
1038 break;
1039 default:
1040 status = nfserr_union_notsupp;
1041 goto out;
1042 }
1043
1044 /*
1045 * Note: This call does change file->f_pos, but nothing in NFSD
1046 * should ever use file->f_pos.
1047 */
1048 seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
1049 if (seek->seek_pos < 0)
1050 status = nfserrno(seek->seek_pos);
1051 else if (seek->seek_pos >= i_size_read(file_inode(file)))
1052 seek->seek_eof = true;
1053
1054out:
1055 fput(file);
1056 return status;
1057}
1058
1016/* This routine never returns NFS_OK! If there are no other errors, it 1059/* This routine never returns NFS_OK! If there are no other errors, it
1017 * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the 1060 * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
1018 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME 1061 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME
@@ -1881,6 +1924,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
1881 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 1924 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1882 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 1925 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1883 }, 1926 },
1927
1928 /* NFSv4.2 operations */
1929 [OP_SEEK] = {
1930 .op_func = (nfsd4op_func)nfsd4_seek,
1931 .op_name = "OP_SEEK",
1932 },
1884}; 1933};
1885 1934
1886int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) 1935int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
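
nfsd4_seek() above maps the NFSv4.2 SEEK operation onto vfs_llseek() with
SEEK_DATA/SEEK_HOLE, so a client can walk a sparse file's extents without
reading it. The userspace analogue of what such a client exercises, a
plain sparse-file walker with nothing NFS-specific in it:

#define _GNU_SOURCE		/* SEEK_DATA / SEEK_HOLE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t pos = 0, data, hole;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
		return 1;

	/* lseek() fails with ENXIO once no further data exists */
	while ((data = lseek(fd, pos, SEEK_DATA)) >= 0) {
		hole = lseek(fd, data, SEEK_HOLE);	/* end of this extent */
		if (hole < 0)
			break;
		printf("data: %lld..%lld\n", (long long)data, (long long)hole);
		pos = hole;
	}
	close(fd);
	return 0;
}
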
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..ea95a2bc21b5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops {
58 void (*create)(struct nfs4_client *); 58 void (*create)(struct nfs4_client *);
59 void (*remove)(struct nfs4_client *); 59 void (*remove)(struct nfs4_client *);
60 int (*check)(struct nfs4_client *); 60 int (*check)(struct nfs4_client *);
61 void (*grace_done)(struct nfsd_net *, time_t); 61 void (*grace_done)(struct nfsd_net *);
62}; 62};
63 63
64/* Globals */ 64/* Globals */
@@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
188 188
189 status = mnt_want_write_file(nn->rec_file); 189 status = mnt_want_write_file(nn->rec_file);
190 if (status) 190 if (status)
191 return; 191 goto out_creds;
192 192
193 dir = nn->rec_file->f_path.dentry; 193 dir = nn->rec_file->f_path.dentry;
194 /* lock the parent */ 194 /* lock the parent */
@@ -228,6 +228,7 @@ out_unlock:
228 user_recovery_dirname); 228 user_recovery_dirname);
229 } 229 }
230 mnt_drop_write_file(nn->rec_file); 230 mnt_drop_write_file(nn->rec_file);
231out_creds:
231 nfs4_reset_creds(original_cred); 232 nfs4_reset_creds(original_cred);
232} 233}
233 234
@@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
392} 393}
393 394
394static void 395static void
395nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) 396nfsd4_recdir_purge_old(struct nfsd_net *nn)
396{ 397{
397 int status; 398 int status;
398 399
@@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net)
479 return status; 480 return status;
480} 481}
481 482
483static void
484nfsd4_shutdown_recdir(struct net *net)
485{
486 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
487
488 if (!nn->rec_file)
489 return;
490 fput(nn->rec_file);
491 nn->rec_file = NULL;
492}
482 493
483static int 494static int
484nfs4_legacy_state_init(struct net *net) 495nfs4_legacy_state_init(struct net *net)
@@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net)
512 int status; 523 int status;
513 524
514 status = nfsd4_init_recdir(net); 525 status = nfsd4_init_recdir(net);
515 if (!status)
516 status = nfsd4_recdir_load(net);
517 if (status) 526 if (status)
518 printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); 527 return status;
528
529 status = nfsd4_recdir_load(net);
530 if (status)
531 nfsd4_shutdown_recdir(net);
532
519 return status; 533 return status;
520} 534}
521 535
@@ -546,21 +560,12 @@ err:
546} 560}
547 561
548static void 562static void
549nfsd4_shutdown_recdir(struct nfsd_net *nn)
550{
551 if (!nn->rec_file)
552 return;
553 fput(nn->rec_file);
554 nn->rec_file = NULL;
555}
556
557static void
558nfsd4_legacy_tracking_exit(struct net *net) 563nfsd4_legacy_tracking_exit(struct net *net)
559{ 564{
560 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 565 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
561 566
562 nfs4_release_reclaim(nn); 567 nfs4_release_reclaim(nn);
563 nfsd4_shutdown_recdir(nn); 568 nfsd4_shutdown_recdir(net);
564 nfs4_legacy_state_shutdown(net); 569 nfs4_legacy_state_shutdown(net);
565} 570}
566 571
@@ -1016,7 +1021,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
1016} 1021}
1017 1022
1018static void 1023static void
1019nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) 1024nfsd4_cld_grace_done(struct nfsd_net *nn)
1020{ 1025{
1021 int ret; 1026 int ret;
1022 struct cld_upcall *cup; 1027 struct cld_upcall *cup;
@@ -1029,7 +1034,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
1029 } 1034 }
1030 1035
1031 cup->cu_msg.cm_cmd = Cld_GraceDone; 1036 cup->cu_msg.cm_cmd = Cld_GraceDone;
1032 cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time; 1037 cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
1033 ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); 1038 ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
1034 if (!ret) 1039 if (!ret)
1035 ret = cup->cu_msg.cm_status; 1040 ret = cup->cu_msg.cm_status;
@@ -1062,6 +1067,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable,
1062 1067
1063#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" 1068#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
1064#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" 1069#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
1070#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
1071#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
1065 1072
1066static char * 1073static char *
1067nfsd4_cltrack_legacy_topdir(void) 1074nfsd4_cltrack_legacy_topdir(void)
@@ -1126,10 +1133,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
1126 return result; 1133 return result;
1127} 1134}
1128 1135
1136static char *
1137nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
1138{
1139 int copied;
1140 size_t len;
1141 char *result;
1142
1143 /* prefix + Y/N character + terminating NULL */
1144 len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1;
1145
1146 result = kmalloc(len, GFP_KERNEL);
1147 if (!result)
1148 return result;
1149
1150 copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c",
1151 clp->cl_minorversion ? 'Y' : 'N');
1152 if (copied >= len) {
1153 /* just return nothing if output was truncated */
1154 kfree(result);
1155 return NULL;
1156 }
1157
1158 return result;
1159}
1160
1161static char *
1162nfsd4_cltrack_grace_start(time_t grace_start)
1163{
1164 int copied;
1165 size_t len;
1166 char *result;
1167
1168 /* prefix + max width of int64_t string + terminating NULL */
1169 len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
1170
1171 result = kmalloc(len, GFP_KERNEL);
1172 if (!result)
1173 return result;
1174
1175 copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
1176 grace_start);
1177 if (copied >= len) {
1178 /* just return nothing if output was truncated */
1179 kfree(result);
1180 return NULL;
1181 }
1182
1183 return result;
1184}
1185
1129static int 1186static int
1130nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) 1187nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
1131{ 1188{
1132 char *envp[2]; 1189 char *envp[3];
1133 char *argv[4]; 1190 char *argv[4];
1134 int ret; 1191 int ret;
1135 1192
@@ -1140,10 +1197,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
1140 1197
1141 dprintk("%s: cmd: %s\n", __func__, cmd); 1198 dprintk("%s: cmd: %s\n", __func__, cmd);
1142 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); 1199 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
1143 dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)"); 1200 dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
1201 dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
1144 1202
1145 envp[0] = legacy; 1203 envp[0] = env0;
1146 envp[1] = NULL; 1204 envp[1] = env1;
1205 envp[2] = NULL;
1147 1206
1148 argv[0] = (char *)cltrack_prog; 1207 argv[0] = (char *)cltrack_prog;
1149 argv[1] = cmd; 1208 argv[1] = cmd;
@@ -1187,28 +1246,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
1187} 1246}
1188 1247
1189static int 1248static int
1190nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) 1249nfsd4_umh_cltrack_init(struct net *net)
1191{ 1250{
1251 int ret;
1252 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1253 char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
1254
1192 /* XXX: The usermode helper is not working in a container yet. */ 1255 /* XXX: The usermode helper is not working in a container yet. */
1193 if (net != &init_net) { 1256 if (net != &init_net) {
1194 WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " 1257 WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
1195 "tracking in a container!\n"); 1258 "tracking in a container!\n");
1196 return -EINVAL; 1259 return -EINVAL;
1197 } 1260 }
1198 return nfsd4_umh_cltrack_upcall("init", NULL, NULL); 1261
1262 ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
1263 kfree(grace_start);
1264 return ret;
1265}
1266
1267static void
1268nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
1269{
1270 wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
1271 TASK_UNINTERRUPTIBLE);
1272}
1273
1274static void
1275nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
1276{
1277 smp_mb__before_atomic();
1278 clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
1279 smp_mb__after_atomic();
1280 wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
1199} 1281}
1200 1282
1201static void 1283static void
1202nfsd4_umh_cltrack_create(struct nfs4_client *clp) 1284nfsd4_umh_cltrack_create(struct nfs4_client *clp)
1203{ 1285{
1204 char *hexid; 1286 char *hexid, *has_session, *grace_start;
1287 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1288
1289 /*
1290 * With v4.0 clients, there's little difference in outcome between a
1291 * create and check operation, and we can end up calling into this
1292 * function multiple times per client (once for each openowner). So,
1293 * for v4.0 clients skip upcalling once the client has been recorded
1294 * on stable storage.
1295 *
1296 * For v4.1+ clients, the outcome of the two operations is different,
1297 * so we must ensure that we upcall for the create operation. v4.1+
1298 * clients call this on RECLAIM_COMPLETE though, so we should only end
1299 * up doing a single create upcall per client.
1300 */
1301 if (clp->cl_minorversion == 0 &&
1302 test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1303 return;
1205 1304
1206 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1305 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1207 if (!hexid) { 1306 if (!hexid) {
1208 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1307 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1209 return; 1308 return;
1210 } 1309 }
1211 nfsd4_umh_cltrack_upcall("create", hexid, NULL); 1310
1311 has_session = nfsd4_cltrack_client_has_session(clp);
1312 grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
1313
1314 nfsd4_cltrack_upcall_lock(clp);
1315 if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start))
1316 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1317 nfsd4_cltrack_upcall_unlock(clp);
1318
1319 kfree(has_session);
1320 kfree(grace_start);
1212 kfree(hexid); 1321 kfree(hexid);
1213} 1322}
1214 1323
@@ -1217,12 +1326,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
1217{ 1326{
1218 char *hexid; 1327 char *hexid;
1219 1328
1329 if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1330 return;
1331
1220 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1332 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1221 if (!hexid) { 1333 if (!hexid) {
1222 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1334 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1223 return; 1335 return;
1224 } 1336 }
1225 nfsd4_umh_cltrack_upcall("remove", hexid, NULL); 1337
1338 nfsd4_cltrack_upcall_lock(clp);
1339 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
1340 nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
1341 clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1342 nfsd4_cltrack_upcall_unlock(clp);
1343
1226 kfree(hexid); 1344 kfree(hexid);
1227} 1345}
1228 1346
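Editor's note: the remove path above is a double-checked pattern: NFSD4_CLIENT_STABLE is tested once without the lock as a cheap early exit, and again under the upcall lock so a concurrent create cannot race the bit out of sync with stable storage. Stripped of the diff context, the shape is:

	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
		return;			/* nothing recorded, nothing to do */

	nfsd4_cltrack_upcall_lock(clp);
	/* recheck under the lock: the bit may have changed while we slept */
	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
	    nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
		clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
	nfsd4_cltrack_upcall_unlock(clp);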
@@ -1230,30 +1348,45 @@ static int
1230nfsd4_umh_cltrack_check(struct nfs4_client *clp) 1348nfsd4_umh_cltrack_check(struct nfs4_client *clp)
1231{ 1349{
1232 int ret; 1350 int ret;
1233 char *hexid, *legacy; 1351 char *hexid, *has_session, *legacy;
1352
1353 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1354 return 0;
1234 1355
1235 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1356 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1236 if (!hexid) { 1357 if (!hexid) {
1237 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1358 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1238 return -ENOMEM; 1359 return -ENOMEM;
1239 } 1360 }
1361
1362 has_session = nfsd4_cltrack_client_has_session(clp);
1240 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); 1363 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
1241 ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); 1364
1365 nfsd4_cltrack_upcall_lock(clp);
1366 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
1367 ret = 0;
1368 } else {
1369 ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy);
1370 if (ret == 0)
1371 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1372 }
1373 nfsd4_cltrack_upcall_unlock(clp);
1374 kfree(has_session);
1242 kfree(legacy); 1375 kfree(legacy);
1243 kfree(hexid); 1376 kfree(hexid);
1377
1244 return ret; 1378 return ret;
1245} 1379}
1246 1380
1247static void 1381static void
1248nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn, 1382nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
1249 time_t boot_time)
1250{ 1383{
1251 char *legacy; 1384 char *legacy;
1252 char timestr[22]; /* FIXME: better way to determine max size? */ 1385 char timestr[22]; /* FIXME: better way to determine max size? */
1253 1386
1254 sprintf(timestr, "%ld", boot_time); 1387 sprintf(timestr, "%ld", nn->boot_time);
1255 legacy = nfsd4_cltrack_legacy_topdir(); 1388 legacy = nfsd4_cltrack_legacy_topdir();
1256 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy); 1389 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
1257 kfree(legacy); 1390 kfree(legacy);
1258} 1391}
1259 1392
@@ -1356,10 +1489,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
1356} 1489}
1357 1490
1358void 1491void
1359nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time) 1492nfsd4_record_grace_done(struct nfsd_net *nn)
1360{ 1493{
1361 if (nn->client_tracking_ops) 1494 if (nn->client_tracking_ops)
1362 nn->client_tracking_ops->grace_done(nn, boot_time); 1495 nn->client_tracking_ops->grace_done(nn);
1363} 1496}
1364 1497
1365static int 1498static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2e80a59e7e91..5c0cac173068 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -96,6 +96,8 @@ static struct kmem_cache *deleg_slab;
96 96
97static void free_session(struct nfsd4_session *); 97static void free_session(struct nfsd4_session *);
98 98
99static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
100
99static bool is_session_dead(struct nfsd4_session *ses) 101static bool is_session_dead(struct nfsd4_session *ses)
100{ 102{
101 return ses->se_flags & NFS4_SESSION_DEAD; 103 return ses->se_flags & NFS4_SESSION_DEAD;
@@ -645,7 +647,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
645 INIT_LIST_HEAD(&dp->dl_perclnt); 647 INIT_LIST_HEAD(&dp->dl_perclnt);
646 INIT_LIST_HEAD(&dp->dl_recall_lru); 648 INIT_LIST_HEAD(&dp->dl_recall_lru);
647 dp->dl_type = NFS4_OPEN_DELEGATE_READ; 649 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
648 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_run_cb_recall); 650 dp->dl_retries = 1;
651 nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
652 &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
649 return dp; 653 return dp;
650out_dec: 654out_dec:
651 atomic_long_dec(&num_delegations); 655 atomic_long_dec(&num_delegations);
@@ -673,15 +677,20 @@ nfs4_put_stid(struct nfs4_stid *s)
673 677
674static void nfs4_put_deleg_lease(struct nfs4_file *fp) 678static void nfs4_put_deleg_lease(struct nfs4_file *fp)
675{ 679{
676 lockdep_assert_held(&state_lock); 680 struct file *filp = NULL;
681 struct file_lock *fl;
677 682
678 if (!fp->fi_lease) 683 spin_lock(&fp->fi_lock);
679 return; 684 if (fp->fi_lease && atomic_dec_and_test(&fp->fi_delegees)) {
680 if (atomic_dec_and_test(&fp->fi_delegees)) { 685 swap(filp, fp->fi_deleg_file);
681 vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); 686 fl = fp->fi_lease;
682 fp->fi_lease = NULL; 687 fp->fi_lease = NULL;
683 fput(fp->fi_deleg_file); 688 }
684 fp->fi_deleg_file = NULL; 689 spin_unlock(&fp->fi_lock);
690
691 if (filp) {
692 vfs_setlease(filp, F_UNLCK, &fl);
693 fput(filp);
685 } 694 }
686} 695}
687 696
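Editor's note: the rewritten nfs4_put_deleg_lease() is a detach-then-release pattern: the file and lease pointers are unhooked under fi_lock, but vfs_setlease() and fput() -- both of which can sleep -- run only after the spinlock is dropped. In isolation:

	struct file *filp = NULL;
	struct file_lock *fl = NULL;

	spin_lock(&fp->fi_lock);
	if (fp->fi_lease && atomic_dec_and_test(&fp->fi_delegees)) {
		swap(filp, fp->fi_deleg_file);	/* detach while locked */
		fl = fp->fi_lease;
		fp->fi_lease = NULL;
	}
	spin_unlock(&fp->fi_lock);

	if (filp) {
		vfs_setlease(filp, F_UNLCK, &fl);	/* may sleep */
		fput(filp);				/* may sleep */
	}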
@@ -717,8 +726,6 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
717 list_del_init(&dp->dl_recall_lru); 726 list_del_init(&dp->dl_recall_lru);
718 list_del_init(&dp->dl_perfile); 727 list_del_init(&dp->dl_perfile);
719 spin_unlock(&fp->fi_lock); 728 spin_unlock(&fp->fi_lock);
720 if (fp)
721 nfs4_put_deleg_lease(fp);
722} 729}
723 730
724static void destroy_delegation(struct nfs4_delegation *dp) 731static void destroy_delegation(struct nfs4_delegation *dp)
@@ -726,6 +733,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
726 spin_lock(&state_lock); 733 spin_lock(&state_lock);
727 unhash_delegation_locked(dp); 734 unhash_delegation_locked(dp);
728 spin_unlock(&state_lock); 735 spin_unlock(&state_lock);
736 nfs4_put_deleg_lease(dp->dl_stid.sc_file);
729 nfs4_put_stid(&dp->dl_stid); 737 nfs4_put_stid(&dp->dl_stid);
730} 738}
731 739
@@ -735,6 +743,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
735 743
736 WARN_ON(!list_empty(&dp->dl_recall_lru)); 744 WARN_ON(!list_empty(&dp->dl_recall_lru));
737 745
746 nfs4_put_deleg_lease(dp->dl_stid.sc_file);
747
738 if (clp->cl_minorversion == 0) 748 if (clp->cl_minorversion == 0)
739 nfs4_put_stid(&dp->dl_stid); 749 nfs4_put_stid(&dp->dl_stid);
740 else { 750 else {
@@ -1635,6 +1645,7 @@ __destroy_client(struct nfs4_client *clp)
1635 while (!list_empty(&reaplist)) { 1645 while (!list_empty(&reaplist)) {
1636 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); 1646 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
1637 list_del_init(&dp->dl_recall_lru); 1647 list_del_init(&dp->dl_recall_lru);
1648 nfs4_put_deleg_lease(dp->dl_stid.sc_file);
1638 nfs4_put_stid(&dp->dl_stid); 1649 nfs4_put_stid(&dp->dl_stid);
1639 } 1650 }
1640 while (!list_empty(&clp->cl_revoked)) { 1651 while (!list_empty(&clp->cl_revoked)) {
@@ -1862,7 +1873,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1862 free_client(clp); 1873 free_client(clp);
1863 return NULL; 1874 return NULL;
1864 } 1875 }
1865 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_run_cb_null); 1876 nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
1866 clp->cl_time = get_seconds(); 1877 clp->cl_time = get_seconds();
1867 clear_bit(0, &clp->cl_cb_slot_busy); 1878 clear_bit(0, &clp->cl_cb_slot_busy);
1868 copy_verf(clp, verf); 1879 copy_verf(clp, verf);
@@ -3349,8 +3360,9 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
3349 return ret; 3360 return ret;
3350} 3361}
3351 3362
3352void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp) 3363static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
3353{ 3364{
3365 struct nfs4_delegation *dp = cb_to_delegation(cb);
3354 struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, 3366 struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
3355 nfsd_net_id); 3367 nfsd_net_id);
3356 3368
@@ -3371,6 +3383,43 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
3371 spin_unlock(&state_lock); 3383 spin_unlock(&state_lock);
3372} 3384}
3373 3385
3386static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
3387 struct rpc_task *task)
3388{
3389 struct nfs4_delegation *dp = cb_to_delegation(cb);
3390
3391 switch (task->tk_status) {
3392 case 0:
3393 return 1;
3394 case -EBADHANDLE:
3395 case -NFS4ERR_BAD_STATEID:
3396 /*
3397 * Race: client probably got cb_recall before open reply
3398 * granting delegation.
3399 */
3400 if (dp->dl_retries--) {
3401 rpc_delay(task, 2 * HZ);
3402 return 0;
3403 }
3404 /*FALLTHRU*/
3405 default:
3406 return -1;
3407 }
3408}
3409
3410static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
3411{
3412 struct nfs4_delegation *dp = cb_to_delegation(cb);
3413
3414 nfs4_put_stid(&dp->dl_stid);
3415}
3416
3417static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
3418 .prepare = nfsd4_cb_recall_prepare,
3419 .done = nfsd4_cb_recall_done,
3420 .release = nfsd4_cb_recall_release,
3421};
3422
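Editor's note: the return values of the new ->done() callback encode a small contract, as inferred from this patch: 1 means the callback is finished, 0 asks the RPC layer to rerun the task (here after rpc_delay() has queued a two-second backoff), and a negative value gives up. A minimal handler along these lines (name and error set are illustrative):

	static int example_cb_done(struct nfsd4_callback *cb,
				   struct rpc_task *task)
	{
		switch (task->tk_status) {
		case 0:
			return 1;	/* done; ->release() will run */
		case -NFS4ERR_BAD_STATEID:
			/* client may not have processed the delegation
			 * grant yet; back off and let the RPC run again */
			rpc_delay(task, 2 * HZ);
			return 0;
		default:
			return -1;	/* permanent failure */
		}
	}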
3374static void nfsd_break_one_deleg(struct nfs4_delegation *dp) 3423static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
3375{ 3424{
3376 /* 3425 /*
@@ -3381,7 +3430,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
3381 * it's safe to take a reference. 3430 * it's safe to take a reference.
3382 */ 3431 */
3383 atomic_inc(&dp->dl_stid.sc_count); 3432 atomic_inc(&dp->dl_stid.sc_count);
3384 nfsd4_cb_recall(dp); 3433 nfsd4_run_cb(&dp->dl_recall);
3385} 3434}
3386 3435
3387/* Called from break_lease() with i_lock held. */ 3436/* Called from break_lease() with i_lock held. */
@@ -3759,7 +3808,6 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
3759 fl = locks_alloc_lock(); 3808 fl = locks_alloc_lock();
3760 if (!fl) 3809 if (!fl)
3761 return NULL; 3810 return NULL;
3762 locks_init_lock(fl);
3763 fl->fl_lmops = &nfsd_lease_mng_ops; 3811 fl->fl_lmops = &nfsd_lease_mng_ops;
3764 fl->fl_flags = FL_DELEG; 3812 fl->fl_flags = FL_DELEG;
3765 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; 3813 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
@@ -4107,7 +4155,7 @@ out:
4107 return status; 4155 return status;
4108} 4156}
4109 4157
4110static void 4158void
4111nfsd4_end_grace(struct nfsd_net *nn) 4159nfsd4_end_grace(struct nfsd_net *nn)
4112{ 4160{
4113 /* do nothing if grace period already ended */ 4161 /* do nothing if grace period already ended */
@@ -4116,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn)
4116 4164
4117 dprintk("NFSD: end of grace period\n"); 4165 dprintk("NFSD: end of grace period\n");
4118 nn->grace_ended = true; 4166 nn->grace_ended = true;
4119 nfsd4_record_grace_done(nn, nn->boot_time); 4167 /*
4168 * If the server goes down again right now, an NFSv4
4169 * client will still be allowed to reclaim after it comes back up,
4170 * even if it hasn't yet had a chance to reclaim state this time.
4171 *
4172 */
4173 nfsd4_record_grace_done(nn);
4174 /*
4175 * At this point, NFSv4 clients can still reclaim. But if the
4176 * server crashes, any that have not yet reclaimed will be out
4177 * of luck on the next boot.
4178 *
4179 * (NFSv4.1+ clients are considered to have reclaimed once they
4180 * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
4181 * have reclaimed after their first OPEN.)
4182 */
4120 locks_end_grace(&nn->nfsd4_manager); 4183 locks_end_grace(&nn->nfsd4_manager);
4121 /* 4184 /*
4122 * Now that every NFSv4 client has had the chance to recover and 4185 * At this point, and once lockd and/or any other containers
4123 * to see the (possibly new, possibly shorter) lease time, we 4186 * exit their grace period, further reclaims will fail and
4124 * can safely set the next grace time to the current lease time: 4187 * regular locking can resume.
4125 */ 4188 */
4126 nn->nfsd4_grace = nn->nfsd4_lease;
4127} 4189}
4128 4190
4129static time_t 4191static time_t
@@ -5210,7 +5272,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
5210 } 5272 }
5211 5273
5212 fp = lock_stp->st_stid.sc_file; 5274 fp = lock_stp->st_stid.sc_file;
5213 locks_init_lock(file_lock);
5214 switch (lock->lk_type) { 5275 switch (lock->lk_type) {
5215 case NFS4_READ_LT: 5276 case NFS4_READ_LT:
5216 case NFS4_READW_LT: 5277 case NFS4_READW_LT:
@@ -5354,7 +5415,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
5354 status = nfserr_jukebox; 5415 status = nfserr_jukebox;
5355 goto out; 5416 goto out;
5356 } 5417 }
5357 locks_init_lock(file_lock); 5418
5358 switch (lockt->lt_type) { 5419 switch (lockt->lt_type) {
5359 case NFS4_READ_LT: 5420 case NFS4_READ_LT:
5360 case NFS4_READW_LT: 5421 case NFS4_READW_LT:
@@ -5432,7 +5493,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
5432 status = nfserr_jukebox; 5493 status = nfserr_jukebox;
5433 goto fput; 5494 goto fput;
5434 } 5495 }
5435 locks_init_lock(file_lock); 5496
5436 file_lock->fl_type = F_UNLCK; 5497 file_lock->fl_type = F_UNLCK;
5437 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); 5498 file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
5438 file_lock->fl_pid = current->tgid; 5499 file_lock->fl_pid = current->tgid;
@@ -5645,6 +5706,9 @@ nfs4_check_open_reclaim(clientid_t *clid,
5645 if (status) 5706 if (status)
5646 return nfserr_reclaim_bad; 5707 return nfserr_reclaim_bad;
5647 5708
5709 if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
5710 return nfserr_no_grace;
5711
5648 if (nfsd4_client_record_check(cstate->clp)) 5712 if (nfsd4_client_record_check(cstate->clp))
5649 return nfserr_reclaim_bad; 5713 return nfserr_reclaim_bad;
5650 5714
@@ -6342,10 +6406,10 @@ nfs4_state_start_net(struct net *net)
6342 ret = nfs4_state_create_net(net); 6406 ret = nfs4_state_create_net(net);
6343 if (ret) 6407 if (ret)
6344 return ret; 6408 return ret;
6345 nfsd4_client_tracking_init(net);
6346 nn->boot_time = get_seconds(); 6409 nn->boot_time = get_seconds();
6347 locks_start_grace(net, &nn->nfsd4_manager);
6348 nn->grace_ended = false; 6410 nn->grace_ended = false;
6411 locks_start_grace(net, &nn->nfsd4_manager);
6412 nfsd4_client_tracking_init(net);
6349 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", 6413 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
6350 nn->nfsd4_grace, net); 6414 nn->nfsd4_grace, net);
6351 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); 6415 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
@@ -6402,6 +6466,7 @@ nfs4_state_shutdown_net(struct net *net)
6402 list_for_each_safe(pos, next, &reaplist) { 6466 list_for_each_safe(pos, next, &reaplist) {
6403 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 6467 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
6404 list_del_init(&dp->dl_recall_lru); 6468 list_del_init(&dp->dl_recall_lru);
6469 nfs4_put_deleg_lease(dp->dl_stid.sc_file);
6405 nfs4_put_stid(&dp->dl_stid); 6470 nfs4_put_stid(&dp->dl_stid);
6406 } 6471 }
6407 6472
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b01f6e100ee8..eeea7a90eb87 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -31,13 +31,6 @@
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 *
35 * TODO: Neil Brown made the following observation: We currently
36 * initially reserve NFSD_BUFSIZE space on the transmit queue and
37 * never release any of that until the request is complete.
38 * It would be good to calculate a new maximum response size while
39 * decoding the COMPOUND, and call svc_reserve with this number
40 * at the end of nfs4svc_decode_compoundargs.
41 */ 34 */
42 35
43#include <linux/slab.h> 36#include <linux/slab.h>
@@ -1521,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1521} 1514}
1522 1515
1523static __be32 1516static __be32
1517nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
1518{
1519 DECODE_HEAD;
1520
1521 status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
1522 if (status)
1523 return status;
1524
1525 READ_BUF(8 + 4);
1526 p = xdr_decode_hyper(p, &seek->seek_offset);
1527 seek->seek_whence = be32_to_cpup(p);
1528
1529 DECODE_TAIL;
1530}
1531
1532static __be32
1524nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) 1533nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1525{ 1534{
1526 return nfs_ok; 1535 return nfs_ok;
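Editor's note: the decode side reads the SEEK arguments in their on-the-wire order: a stateid, then a 64-bit offset and a 32-bit whence -- hence READ_BUF(8 + 4). As a layout sketch (field names are illustrative, not from the patch):

	struct seek_args_wire {
		/* stateid decoded separately by nfsd4_decode_stateid() */
		__be64 offset;	/* xdr_decode_hyper(): starting offset */
		__be32 whence;	/* be32_to_cpup(): data vs. hole */
	};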
@@ -1593,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1593 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1602 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1594 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, 1603 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
1595 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, 1604 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
1605
1606 /* new operations for NFSv4.2 */
1607 [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
1608 [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
1609 [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
1610 [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
1615 [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
1616 [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
1617 [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
1618 [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
1596}; 1619};
1597 1620
1598static inline bool 1621static inline bool
@@ -1670,6 +1693,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1670 readbytes += nfsd4_max_reply(argp->rqstp, op); 1693 readbytes += nfsd4_max_reply(argp->rqstp, op);
1671 } else 1694 } else
1672 max_reply += nfsd4_max_reply(argp->rqstp, op); 1695 max_reply += nfsd4_max_reply(argp->rqstp, op);
1696 /*
1697 * OP_LOCK may return a conflicting lock. (Special case
1698 * because it will just skip encoding this if it runs
1699 * out of xdr buffer space, and it is the only operation
1700 * that behaves this way.)
1701 */
1702 if (op->opnum == OP_LOCK)
1703 max_reply += NFS4_OPAQUE_LIMIT;
1673 1704
1674 if (op->status) { 1705 if (op->status) {
1675 argp->opcnt = i+1; 1706 argp->opcnt = i+1;
@@ -3764,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3764} 3795}
3765 3796
3766static __be32 3797static __be32
3798nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3799 struct nfsd4_seek *seek)
3800{
3801 __be32 *p;
3802
3803 if (nfserr)
3804 return nfserr;
3805
3806 p = xdr_reserve_space(&resp->xdr, 4 + 8);
3807 *p++ = cpu_to_be32(seek->seek_eof);
3808 p = xdr_encode_hyper(p, seek->seek_pos);
3809
3810 return nfserr;
3811}
3812
3813static __be32
3767nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3814nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
3768{ 3815{
3769 return nfserr; 3816 return nfserr;
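Editor's note: encoding reserves 12 bytes and writes the eof flag before the 64-bit position. A defensive variant would also check the reservation, since xdr_reserve_space() can return NULL when the reply buffer is exhausted (that check is not in this hunk):

	p = xdr_reserve_space(&resp->xdr, 4 + 8);
	if (!p)
		return nfserr_resource;		/* reply buffer exhausted */
	*p++ = cpu_to_be32(seek->seek_eof);	/* did we reach end of file? */
	p = xdr_encode_hyper(p, seek->seek_pos);/* resulting 64-bit offset */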
@@ -3835,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3835 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 3882 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3836 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, 3883 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3837 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, 3884 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
3885
3886 /* NFSv4.2 operations */
3887 [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
3888 [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop,
3889 [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop,
3890 [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
3893 [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
3898 [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
3838}; 3899};
3839 3900
3840/* 3901/*
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index ff9567633245..122f69185ef5 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -27,8 +27,12 @@
27 */ 27 */
28#define TARGET_BUCKET_SIZE 64 28#define TARGET_BUCKET_SIZE 64
29 29
30static struct hlist_head * cache_hash; 30struct nfsd_drc_bucket {
31static struct list_head lru_head; 31 struct list_head lru_head;
32 spinlock_t cache_lock;
33};
34
35static struct nfsd_drc_bucket *drc_hashtbl;
32static struct kmem_cache *drc_slab; 36static struct kmem_cache *drc_slab;
33 37
34/* max number of entries allowed in the cache */ 38/* max number of entries allowed in the cache */
@@ -36,6 +40,7 @@ static unsigned int max_drc_entries;
36 40
37/* number of significant bits in the hash value */ 41/* number of significant bits in the hash value */
38static unsigned int maskbits; 42static unsigned int maskbits;
43static unsigned int drc_hashsize;
39 44
40/* 45/*
41 * Stats and other tracking of the duplicate reply cache. All of these and 46
@@ -43,7 +48,7 @@ static unsigned int maskbits;
43 */ 48 */
44 49
45/* total number of entries */ 50/* total number of entries */
46static unsigned int num_drc_entries; 51static atomic_t num_drc_entries;
47 52
48/* cache misses due only to checksum comparison failures */ 53/* cache misses due only to checksum comparison failures */
49static unsigned int payload_misses; 54static unsigned int payload_misses;
@@ -75,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
75 * A cache entry is "single use" if c_state == RC_INPROG 80 * A cache entry is "single use" if c_state == RC_INPROG
76 * Otherwise, when accessing _prev or _next, the lock must be held. 81 * Otherwise, when accessing _prev or _next, the lock must be held.
77 */ 82 */
78static DEFINE_SPINLOCK(cache_lock);
79static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); 83static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
80 84
81/* 85/*
@@ -116,6 +120,12 @@ nfsd_hashsize(unsigned int limit)
116 return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); 120 return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
117} 121}
118 122
123static u32
124nfsd_cache_hash(__be32 xid)
125{
126 return hash_32(be32_to_cpu(xid), maskbits);
127}
128
119static struct svc_cacherep * 129static struct svc_cacherep *
120nfsd_reply_cache_alloc(void) 130nfsd_reply_cache_alloc(void)
121{ 131{
@@ -126,7 +136,6 @@ nfsd_reply_cache_alloc(void)
126 rp->c_state = RC_UNUSED; 136 rp->c_state = RC_UNUSED;
127 rp->c_type = RC_NOCACHE; 137 rp->c_type = RC_NOCACHE;
128 INIT_LIST_HEAD(&rp->c_lru); 138 INIT_LIST_HEAD(&rp->c_lru);
129 INIT_HLIST_NODE(&rp->c_hash);
130 } 139 }
131 return rp; 140 return rp;
132} 141}
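Editor's note: with the flat hlist replaced by per-bucket structures, bucket selection reduces to hashing the XID over 1 << maskbits buckets. A hypothetical helper capturing the two lookups used in the hunks below:

	static struct nfsd_drc_bucket *xid_to_bucket(__be32 xid)
	{
		/* hash_32() spreads the 32-bit XID evenly across the
		 * buckets, each of which has its own LRU list and lock */
		return &drc_hashtbl[hash_32(be32_to_cpu(xid), maskbits)];
	}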
@@ -138,29 +147,27 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
138 drc_mem_usage -= rp->c_replvec.iov_len; 147 drc_mem_usage -= rp->c_replvec.iov_len;
139 kfree(rp->c_replvec.iov_base); 148 kfree(rp->c_replvec.iov_base);
140 } 149 }
141 if (!hlist_unhashed(&rp->c_hash))
142 hlist_del(&rp->c_hash);
143 list_del(&rp->c_lru); 150 list_del(&rp->c_lru);
144 --num_drc_entries; 151 atomic_dec(&num_drc_entries);
145 drc_mem_usage -= sizeof(*rp); 152 drc_mem_usage -= sizeof(*rp);
146 kmem_cache_free(drc_slab, rp); 153 kmem_cache_free(drc_slab, rp);
147} 154}
148 155
149static void 156static void
150nfsd_reply_cache_free(struct svc_cacherep *rp) 157nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
151{ 158{
152 spin_lock(&cache_lock); 159 spin_lock(&b->cache_lock);
153 nfsd_reply_cache_free_locked(rp); 160 nfsd_reply_cache_free_locked(rp);
154 spin_unlock(&cache_lock); 161 spin_unlock(&b->cache_lock);
155} 162}
156 163
157int nfsd_reply_cache_init(void) 164int nfsd_reply_cache_init(void)
158{ 165{
159 unsigned int hashsize; 166 unsigned int hashsize;
167 unsigned int i;
160 168
161 INIT_LIST_HEAD(&lru_head);
162 max_drc_entries = nfsd_cache_size_limit(); 169 max_drc_entries = nfsd_cache_size_limit();
163 num_drc_entries = 0; 170 atomic_set(&num_drc_entries, 0);
164 hashsize = nfsd_hashsize(max_drc_entries); 171 hashsize = nfsd_hashsize(max_drc_entries);
165 maskbits = ilog2(hashsize); 172 maskbits = ilog2(hashsize);
166 173
@@ -170,9 +177,14 @@ int nfsd_reply_cache_init(void)
170 if (!drc_slab) 177 if (!drc_slab)
171 goto out_nomem; 178 goto out_nomem;
172 179
173 cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL); 180 drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
174 if (!cache_hash) 181 if (!drc_hashtbl)
175 goto out_nomem; 182 goto out_nomem;
183 for (i = 0; i < hashsize; i++) {
184 INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
185 spin_lock_init(&drc_hashtbl[i].cache_lock);
186 }
187 drc_hashsize = hashsize;
176 188
177 return 0; 189 return 0;
178out_nomem: 190out_nomem:
@@ -184,17 +196,22 @@ out_nomem:
184void nfsd_reply_cache_shutdown(void) 196void nfsd_reply_cache_shutdown(void)
185{ 197{
186 struct svc_cacherep *rp; 198 struct svc_cacherep *rp;
199 unsigned int i;
187 200
188 unregister_shrinker(&nfsd_reply_cache_shrinker); 201 unregister_shrinker(&nfsd_reply_cache_shrinker);
189 cancel_delayed_work_sync(&cache_cleaner); 202 cancel_delayed_work_sync(&cache_cleaner);
190 203
191 while (!list_empty(&lru_head)) { 204 for (i = 0; i < drc_hashsize; i++) {
192 rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); 205 struct list_head *head = &drc_hashtbl[i].lru_head;
193 nfsd_reply_cache_free_locked(rp); 206 while (!list_empty(head)) {
207 rp = list_first_entry(head, struct svc_cacherep, c_lru);
208 nfsd_reply_cache_free_locked(rp);
209 }
194 } 210 }
195 211
196 kfree (cache_hash); 212 kfree (drc_hashtbl);
197 cache_hash = NULL; 213 drc_hashtbl = NULL;
214 drc_hashsize = 0;
198 215
199 if (drc_slab) { 216 if (drc_slab) {
200 kmem_cache_destroy(drc_slab); 217 kmem_cache_destroy(drc_slab);
@@ -207,61 +224,63 @@ void nfsd_reply_cache_shutdown(void)
207 * not already scheduled. 224 * not already scheduled.
208 */ 225 */
209static void 226static void
210lru_put_end(struct svc_cacherep *rp) 227lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
211{ 228{
212 rp->c_timestamp = jiffies; 229 rp->c_timestamp = jiffies;
213 list_move_tail(&rp->c_lru, &lru_head); 230 list_move_tail(&rp->c_lru, &b->lru_head);
214 schedule_delayed_work(&cache_cleaner, RC_EXPIRE); 231 schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
215} 232}
216 233
217/*
218 * Move a cache entry from one hash list to another
219 */
220static void
221hash_refile(struct svc_cacherep *rp)
222{
223 hlist_del_init(&rp->c_hash);
224 /*
225 * No point in byte swapping c_xid since we're just using it to pick
226 * a hash bucket.
227 */
228 hlist_add_head(&rp->c_hash, cache_hash +
229 hash_32((__force u32)rp->c_xid, maskbits));
230}
231
232/*
233 * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
234 * Also prune the oldest ones when the total exceeds the max number of entries.
235 */
236static long 234static long
237prune_cache_entries(void) 235prune_bucket(struct nfsd_drc_bucket *b)
238{ 236{
239 struct svc_cacherep *rp, *tmp; 237 struct svc_cacherep *rp, *tmp;
240 long freed = 0; 238 long freed = 0;
241 239
242 list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { 240 list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
243 /* 241 /*
244 * Don't free entries attached to calls that are still 242 * Don't free entries attached to calls that are still
245 * in-progress, but do keep scanning the list. 243 * in-progress, but do keep scanning the list.
246 */ 244 */
247 if (rp->c_state == RC_INPROG) 245 if (rp->c_state == RC_INPROG)
248 continue; 246 continue;
249 if (num_drc_entries <= max_drc_entries && 247 if (atomic_read(&num_drc_entries) <= max_drc_entries &&
250 time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) 248 time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
251 break; 249 break;
252 nfsd_reply_cache_free_locked(rp); 250 nfsd_reply_cache_free_locked(rp);
253 freed++; 251 freed++;
254 } 252 }
253 return freed;
254}
255
256/*
257 * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
258 * Also prune the oldest ones when the total exceeds the max number of entries.
259 */
260static long
261prune_cache_entries(void)
262{
263 unsigned int i;
264 long freed = 0;
265 bool cancel = true;
266
267 for (i = 0; i < drc_hashsize; i++) {
268 struct nfsd_drc_bucket *b = &drc_hashtbl[i];
269
270 if (list_empty(&b->lru_head))
271 continue;
272 spin_lock(&b->cache_lock);
273 freed += prune_bucket(b);
274 if (!list_empty(&b->lru_head))
275 cancel = false;
276 spin_unlock(&b->cache_lock);
277 }
255 278
256 /* 279 /*
257 * Conditionally rearm the job. If we cleaned out the list, then 280 * Conditionally rearm the job to run in RC_EXPIRE since we just
258 * cancel any pending run (since there won't be any work to do). 281 * ran the pruner.
259 * Otherwise, we rearm the job or modify the existing one to run in
260 * RC_EXPIRE since we just ran the pruner.
261 */ 282 */
262 if (list_empty(&lru_head)) 283 if (!cancel)
263 cancel_delayed_work(&cache_cleaner);
264 else
265 mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); 284 mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
266 return freed; 285 return freed;
267} 286}
@@ -269,32 +288,19 @@ prune_cache_entries(void)
269static void 288static void
270cache_cleaner_func(struct work_struct *unused) 289cache_cleaner_func(struct work_struct *unused)
271{ 290{
272 spin_lock(&cache_lock);
273 prune_cache_entries(); 291 prune_cache_entries();
274 spin_unlock(&cache_lock);
275} 292}
276 293
277static unsigned long 294static unsigned long
278nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) 295nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
279{ 296{
280 unsigned long num; 297 return atomic_read(&num_drc_entries);
281
282 spin_lock(&cache_lock);
283 num = num_drc_entries;
284 spin_unlock(&cache_lock);
285
286 return num;
287} 298}
288 299
289static unsigned long 300static unsigned long
290nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) 301nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
291{ 302{
292 unsigned long freed; 303 return prune_cache_entries();
293
294 spin_lock(&cache_lock);
295 freed = prune_cache_entries();
296 spin_unlock(&cache_lock);
297 return freed;
298} 304}
299/* 305/*
300 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes 306 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
@@ -332,20 +338,24 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
332static bool 338static bool
333nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) 339nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
334{ 340{
335 /* Check RPC header info first */ 341 /* Check RPC XID first */
336 if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc || 342 if (rqstp->rq_xid != rp->c_xid)
337 rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
338 rqstp->rq_arg.len != rp->c_len ||
339 !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
340 rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
341 return false; 343 return false;
342
343 /* compare checksum of NFS data */ 344 /* compare checksum of NFS data */
344 if (csum != rp->c_csum) { 345 if (csum != rp->c_csum) {
345 ++payload_misses; 346 ++payload_misses;
346 return false; 347 return false;
347 } 348 }
348 349
350 /* Other discriminators */
351 if (rqstp->rq_proc != rp->c_proc ||
352 rqstp->rq_prot != rp->c_prot ||
353 rqstp->rq_vers != rp->c_vers ||
354 rqstp->rq_arg.len != rp->c_len ||
355 !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
356 rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
357 return false;
358
349 return true; 359 return true;
350} 360}
351 361
@@ -355,18 +365,14 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
355 * NULL on failure. 365 * NULL on failure.
356 */ 366 */
357static struct svc_cacherep * 367static struct svc_cacherep *
358nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) 368nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
369 __wsum csum)
359{ 370{
360 struct svc_cacherep *rp, *ret = NULL; 371 struct svc_cacherep *rp, *ret = NULL;
361 struct hlist_head *rh; 372 struct list_head *rh = &b->lru_head;
362 unsigned int entries = 0; 373 unsigned int entries = 0;
363 374
364 /* 375 list_for_each_entry(rp, rh, c_lru) {
365 * No point in byte swapping rq_xid since we're just using it to pick
366 * a hash bucket.
367 */
368 rh = &cache_hash[hash_32((__force u32)rqstp->rq_xid, maskbits)];
369 hlist_for_each_entry(rp, rh, c_hash) {
370 ++entries; 376 ++entries;
371 if (nfsd_cache_match(rqstp, csum, rp)) { 377 if (nfsd_cache_match(rqstp, csum, rp)) {
372 ret = rp; 378 ret = rp;
@@ -377,11 +383,12 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
377 /* tally hash chain length stats */ 383 /* tally hash chain length stats */
378 if (entries > longest_chain) { 384 if (entries > longest_chain) {
379 longest_chain = entries; 385 longest_chain = entries;
380 longest_chain_cachesize = num_drc_entries; 386 longest_chain_cachesize = atomic_read(&num_drc_entries);
381 } else if (entries == longest_chain) { 387 } else if (entries == longest_chain) {
382 /* prefer to keep the smallest cachesize possible here */ 388 /* prefer to keep the smallest cachesize possible here */
383 longest_chain_cachesize = min(longest_chain_cachesize, 389 longest_chain_cachesize = min_t(unsigned int,
384 num_drc_entries); 390 longest_chain_cachesize,
391 atomic_read(&num_drc_entries));
385 } 392 }
386 393
387 return ret; 394 return ret;
@@ -403,6 +410,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
403 vers = rqstp->rq_vers, 410 vers = rqstp->rq_vers,
404 proc = rqstp->rq_proc; 411 proc = rqstp->rq_proc;
405 __wsum csum; 412 __wsum csum;
413 u32 hash = nfsd_cache_hash(xid);
414 struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
406 unsigned long age; 415 unsigned long age;
407 int type = rqstp->rq_cachetype; 416 int type = rqstp->rq_cachetype;
408 int rtn = RC_DOIT; 417 int rtn = RC_DOIT;
@@ -420,16 +429,16 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
420 * preallocate an entry. 429 * preallocate an entry.
421 */ 430 */
422 rp = nfsd_reply_cache_alloc(); 431 rp = nfsd_reply_cache_alloc();
423 spin_lock(&cache_lock); 432 spin_lock(&b->cache_lock);
424 if (likely(rp)) { 433 if (likely(rp)) {
425 ++num_drc_entries; 434 atomic_inc(&num_drc_entries);
426 drc_mem_usage += sizeof(*rp); 435 drc_mem_usage += sizeof(*rp);
427 } 436 }
428 437
429 /* go ahead and prune the cache */ 438 /* go ahead and prune the cache */
430 prune_cache_entries(); 439 prune_bucket(b);
431 440
432 found = nfsd_cache_search(rqstp, csum); 441 found = nfsd_cache_search(b, rqstp, csum);
433 if (found) { 442 if (found) {
434 if (likely(rp)) 443 if (likely(rp))
435 nfsd_reply_cache_free_locked(rp); 444 nfsd_reply_cache_free_locked(rp);
@@ -454,8 +463,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
454 rp->c_len = rqstp->rq_arg.len; 463 rp->c_len = rqstp->rq_arg.len;
455 rp->c_csum = csum; 464 rp->c_csum = csum;
456 465
457 hash_refile(rp); 466 lru_put_end(b, rp);
458 lru_put_end(rp);
459 467
460 /* release any buffer */ 468 /* release any buffer */
461 if (rp->c_type == RC_REPLBUFF) { 469 if (rp->c_type == RC_REPLBUFF) {
@@ -465,14 +473,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
465 } 473 }
466 rp->c_type = RC_NOCACHE; 474 rp->c_type = RC_NOCACHE;
467 out: 475 out:
468 spin_unlock(&cache_lock); 476 spin_unlock(&b->cache_lock);
469 return rtn; 477 return rtn;
470 478
471found_entry: 479found_entry:
472 nfsdstats.rchits++; 480 nfsdstats.rchits++;
473 /* We found a matching entry which is either in progress or done. */ 481 /* We found a matching entry which is either in progress or done. */
474 age = jiffies - rp->c_timestamp; 482 age = jiffies - rp->c_timestamp;
475 lru_put_end(rp); 483 lru_put_end(b, rp);
476 484
477 rtn = RC_DROPIT; 485 rtn = RC_DROPIT;
478 /* Request being processed or excessive rexmits */ 486 /* Request being processed or excessive rexmits */
@@ -527,18 +535,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
527{ 535{
528 struct svc_cacherep *rp = rqstp->rq_cacherep; 536 struct svc_cacherep *rp = rqstp->rq_cacherep;
529 struct kvec *resv = &rqstp->rq_res.head[0], *cachv; 537 struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
538 u32 hash;
539 struct nfsd_drc_bucket *b;
530 int len; 540 int len;
531 size_t bufsize = 0; 541 size_t bufsize = 0;
532 542
533 if (!rp) 543 if (!rp)
534 return; 544 return;
535 545
546 hash = nfsd_cache_hash(rp->c_xid);
547 b = &drc_hashtbl[hash];
548
536 len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); 549 len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
537 len >>= 2; 550 len >>= 2;
538 551
539 /* Don't cache excessive amounts of data and XDR failures */ 552 /* Don't cache excessive amounts of data and XDR failures */
540 if (!statp || len > (256 >> 2)) { 553 if (!statp || len > (256 >> 2)) {
541 nfsd_reply_cache_free(rp); 554 nfsd_reply_cache_free(b, rp);
542 return; 555 return;
543 } 556 }
544 557
@@ -553,23 +566,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
553 bufsize = len << 2; 566 bufsize = len << 2;
554 cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); 567 cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
555 if (!cachv->iov_base) { 568 if (!cachv->iov_base) {
556 nfsd_reply_cache_free(rp); 569 nfsd_reply_cache_free(b, rp);
557 return; 570 return;
558 } 571 }
559 cachv->iov_len = bufsize; 572 cachv->iov_len = bufsize;
560 memcpy(cachv->iov_base, statp, bufsize); 573 memcpy(cachv->iov_base, statp, bufsize);
561 break; 574 break;
562 case RC_NOCACHE: 575 case RC_NOCACHE:
563 nfsd_reply_cache_free(rp); 576 nfsd_reply_cache_free(b, rp);
564 return; 577 return;
565 } 578 }
566 spin_lock(&cache_lock); 579 spin_lock(&b->cache_lock);
567 drc_mem_usage += bufsize; 580 drc_mem_usage += bufsize;
568 lru_put_end(rp); 581 lru_put_end(b, rp);
569 rp->c_secure = rqstp->rq_secure; 582 rp->c_secure = rqstp->rq_secure;
570 rp->c_type = cachetype; 583 rp->c_type = cachetype;
571 rp->c_state = RC_DONE; 584 rp->c_state = RC_DONE;
572 spin_unlock(&cache_lock); 585 spin_unlock(&b->cache_lock);
573 return; 586 return;
574} 587}
575 588
@@ -600,9 +613,9 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
600 */ 613 */
601static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) 614static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
602{ 615{
603 spin_lock(&cache_lock);
604 seq_printf(m, "max entries: %u\n", max_drc_entries); 616 seq_printf(m, "max entries: %u\n", max_drc_entries);
605 seq_printf(m, "num entries: %u\n", num_drc_entries); 617 seq_printf(m, "num entries: %u\n",
618 atomic_read(&num_drc_entries));
606 seq_printf(m, "hash buckets: %u\n", 1 << maskbits); 619 seq_printf(m, "hash buckets: %u\n", 1 << maskbits);
607 seq_printf(m, "mem usage: %u\n", drc_mem_usage); 620 seq_printf(m, "mem usage: %u\n", drc_mem_usage);
608 seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); 621 seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
@@ -611,7 +624,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
611 seq_printf(m, "payload misses: %u\n", payload_misses); 624 seq_printf(m, "payload misses: %u\n", payload_misses);
612 seq_printf(m, "longest chain len: %u\n", longest_chain); 625 seq_printf(m, "longest chain len: %u\n", longest_chain);
613 seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize); 626 seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize);
614 spin_unlock(&cache_lock);
615 return 0; 627 return 0;
616} 628}
617 629
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4e042105fb6e..ca73ca79a0ee 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -49,6 +49,7 @@ enum {
49 NFSD_Leasetime, 49 NFSD_Leasetime,
50 NFSD_Gracetime, 50 NFSD_Gracetime,
51 NFSD_RecoveryDir, 51 NFSD_RecoveryDir,
52 NFSD_V4EndGrace,
52#endif 53#endif
53}; 54};
54 55
@@ -68,6 +69,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
68static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 69static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
69static ssize_t write_gracetime(struct file *file, char *buf, size_t size); 70static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
70static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 71static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
72static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
71#endif 73#endif
72 74
73static ssize_t (*write_op[])(struct file *, char *, size_t) = { 75static ssize_t (*write_op[])(struct file *, char *, size_t) = {
@@ -84,6 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
84 [NFSD_Leasetime] = write_leasetime, 86 [NFSD_Leasetime] = write_leasetime,
85 [NFSD_Gracetime] = write_gracetime, 87 [NFSD_Gracetime] = write_gracetime,
86 [NFSD_RecoveryDir] = write_recoverydir, 88 [NFSD_RecoveryDir] = write_recoverydir,
89 [NFSD_V4EndGrace] = write_v4_end_grace,
87#endif 90#endif
88}; 91};
89 92
@@ -1077,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1077 return rv; 1080 return rv;
1078} 1081}
1079 1082
1083/**
1084 * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
1085 *
1086 * Input:
1087 * buf: ignored
1088 * size: zero
1089 * OR
1090 *
1091 * Input:
1092 * buf: any value
1093 * size: non-zero length of C string in @buf
1094 * Output:
1095 * passed-in buffer filled with "Y" or "N" with a newline
1096 * and NULL-terminated C string. This indicates whether
1097 * the grace period has ended in the current net
1098 * namespace. Return code is the size in bytes of the
1099 * string. Writing a string that starts with 'Y', 'y', or
1100 * '1' to the file will end the grace period for nfsd's v4
1101 * lock manager.
1102 */
1103static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
1104{
1105 struct net *net = file->f_dentry->d_sb->s_fs_info;
1106 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1107
1108 if (size > 0) {
1109 switch(buf[0]) {
1110 case 'Y':
1111 case 'y':
1112 case '1':
1113 nfsd4_end_grace(nn);
1114 break;
1115 default:
1116 return -EINVAL;
1117 }
1118 }
1119
1120 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
1121 nn->grace_ended ? 'Y' : 'N');
1122}
1123
1080#endif 1124#endif
1081 1125
1082/*----------------------------------------------------------------------------*/ 1126/*----------------------------------------------------------------------------*/
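Editor's note: from userspace the new file behaves like the other nfsctl transaction files. A sketch of ending the grace period early, assuming the usual /proc/fs/nfsd mount point (path and helper name are assumptions, not part of the patch):

	#include <fcntl.h>
	#include <unistd.h>

	int end_nfsd_grace(void)
	{
		int fd = open("/proc/fs/nfsd/v4_end_grace", O_WRONLY);

		if (fd < 0)
			return -1;
		/* 'Y', 'y', or '1' all end the grace period; reading the
		 * file back reports "Y\n" or "N\n" */
		if (write(fd, "Y", 1) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}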
@@ -1110,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1110 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1154 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1111 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1155 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1112 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1156 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1157 [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
1113#endif 1158#endif
1114 /* last one */ {""} 1159 /* last one */ {""}
1115 }; 1160 };
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 847daf37e566..747f3b95bd11 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -251,7 +251,7 @@ void nfsd_lockd_shutdown(void);
251#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) 251#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
252#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP) 252#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
253#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH) 253#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
254#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP) 254#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP)
255#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) 255#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
256#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) 256#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
257#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) 257#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e883a5868be6..88026fc6a981 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -209,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
209 * fix that case easily. 209 * fix that case easily.
210 */ 210 */
211 struct cred *new = prepare_creds(); 211 struct cred *new = prepare_creds();
212 if (!new) 212 if (!new) {
213 return nfserrno(-ENOMEM); 213 error = nfserrno(-ENOMEM);
214 goto out;
215 }
214 new->cap_effective = 216 new->cap_effective =
215 cap_raise_nfsd_set(new->cap_effective, 217 cap_raise_nfsd_set(new->cap_effective,
216 new->cap_permitted); 218 new->cap_permitted);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4a89e00d7461..0a47c6a6b301 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -62,16 +62,21 @@ typedef struct {
62 (s)->si_generation 62 (s)->si_generation
63 63
64struct nfsd4_callback { 64struct nfsd4_callback {
65 void *cb_op;
66 struct nfs4_client *cb_clp; 65 struct nfs4_client *cb_clp;
67 struct list_head cb_per_client; 66 struct list_head cb_per_client;
68 u32 cb_minorversion; 67 u32 cb_minorversion;
69 struct rpc_message cb_msg; 68 struct rpc_message cb_msg;
70 const struct rpc_call_ops *cb_ops; 69 struct nfsd4_callback_ops *cb_ops;
71 struct work_struct cb_work; 70 struct work_struct cb_work;
72 bool cb_done; 71 bool cb_done;
73}; 72};
74 73
74struct nfsd4_callback_ops {
75 void (*prepare)(struct nfsd4_callback *);
76 int (*done)(struct nfsd4_callback *, struct rpc_task *);
77 void (*release)(struct nfsd4_callback *);
78};
79
75/* 80/*
76 * A core object that represents a "common" stateid. These are generally 81 * A core object that represents a "common" stateid. These are generally
77 * embedded within the different (more specific) stateid objects and contain 82 * embedded within the different (more specific) stateid objects and contain
@@ -127,6 +132,9 @@ struct nfs4_delegation {
127 struct nfsd4_callback dl_recall; 132 struct nfsd4_callback dl_recall;
128}; 133};
129 134
135#define cb_to_delegation(cb) \
136 container_of(cb, struct nfs4_delegation, dl_recall)
137
130/* client delegation callback info */ 138/* client delegation callback info */
131struct nfs4_cb_conn { 139struct nfs4_cb_conn {
132 /* SETCLIENTID info */ 140 /* SETCLIENTID info */
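Editor's note: cb_to_delegation() is the standard container_of() idiom: given the embedded nfsd4_callback, recover the enclosing delegation. Written out as a function it would read:

	static inline struct nfs4_delegation *
	cb_to_delegation(struct nfsd4_callback *cb)
	{
		return container_of(cb, struct nfs4_delegation, dl_recall);
	}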
@@ -306,6 +314,7 @@ struct nfs4_client {
306#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ 314#define NFSD4_CLIENT_STABLE (2) /* client on stable storage */
307#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ 315#define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */
308#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */ 316#define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */
317#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
309#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 318#define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
310 1 << NFSD4_CLIENT_CB_KILL) 319 1 << NFSD4_CLIENT_CB_KILL)
311 unsigned long cl_flags; 320 unsigned long cl_flags;
@@ -517,6 +526,13 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
517#define RD_STATE 0x00000010 526#define RD_STATE 0x00000010
518#define WR_STATE 0x00000020 527#define WR_STATE 0x00000020
519 528
529enum nfsd4_cb_op {
530 NFSPROC4_CLNT_CB_NULL = 0,
531 NFSPROC4_CLNT_CB_RECALL,
532 NFSPROC4_CLNT_CB_SEQUENCE,
533};
534
535
520struct nfsd4_compound_state; 536struct nfsd4_compound_state;
521struct nfsd_net; 537struct nfsd_net;
522 538
@@ -531,12 +547,12 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
531extern __be32 nfs4_check_open_reclaim(clientid_t *clid, 547extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
532 struct nfsd4_compound_state *cstate, struct nfsd_net *nn); 548 struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
533extern int set_callback_cred(void); 549extern int set_callback_cred(void);
534void nfsd4_run_cb_null(struct work_struct *w);
535void nfsd4_run_cb_recall(struct work_struct *w);
536extern void nfsd4_probe_callback(struct nfs4_client *clp); 550extern void nfsd4_probe_callback(struct nfs4_client *clp);
537extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); 551extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
538extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); 552extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
539extern void nfsd4_cb_recall(struct nfs4_delegation *dp); 553extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
554 struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
555extern void nfsd4_run_cb(struct nfsd4_callback *cb);
540extern int nfsd4_create_callback_queue(void); 556extern int nfsd4_create_callback_queue(void);
541extern void nfsd4_destroy_callback_queue(void); 557extern void nfsd4_destroy_callback_queue(void);
542extern void nfsd4_shutdown_callback(struct nfs4_client *); 558extern void nfsd4_shutdown_callback(struct nfs4_client *);
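Editor's note: taken together, the new interface replaces the per-callback work functions with an ops table: a caller defines the table once, binds it to a callback with nfsd4_init_cb(), and queues work with nfsd4_run_cb(). A usage sketch (the handler names are placeholders):

	static struct nfsd4_callback_ops my_cb_ops = {
		.prepare = my_prepare,
		.done    = my_done,
		.release = my_release,
	};

	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
		      &my_cb_ops, NFSPROC4_CLNT_CB_RECALL);
	nfsd4_run_cb(&dp->dl_recall);	/* queue on the callback workqueue */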
@@ -545,13 +561,16 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
545 struct nfsd_net *nn); 561 struct nfsd_net *nn);
546extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); 562extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
547 563
564/* grace period management */
565void nfsd4_end_grace(struct nfsd_net *nn);
566
548/* nfs4recover operations */ 567/* nfs4recover operations */
549extern int nfsd4_client_tracking_init(struct net *net); 568extern int nfsd4_client_tracking_init(struct net *net);
550extern void nfsd4_client_tracking_exit(struct net *net); 569extern void nfsd4_client_tracking_exit(struct net *net);
551extern void nfsd4_client_record_create(struct nfs4_client *clp); 570extern void nfsd4_client_record_create(struct nfs4_client *clp);
552extern void nfsd4_client_record_remove(struct nfs4_client *clp); 571extern void nfsd4_client_record_remove(struct nfs4_client *clp);
553extern int nfsd4_client_record_check(struct nfs4_client *clp); 572extern int nfsd4_client_record_check(struct nfs4_client *clp);
554extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); 573extern void nfsd4_record_grace_done(struct nfsd_net *nn);
555 574
556/* nfs fault injection functions */ 575/* nfs fault injection functions */
557#ifdef CONFIG_NFSD_FAULT_INJECTION 576#ifdef CONFIG_NFSD_FAULT_INJECTION
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f501a9b5c9df..965cffd17a0c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -445,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
445 if (err) 445 if (err)
446 goto out; 446 goto out;
447 size_change = 1; 447 size_change = 1;
448
449 /*
450 * RFC5661, Section 18.30.4:
451 * Changing the size of a file with SETATTR indirectly
452 * changes the time_modify and change attributes.
453 *
454 * (and similar for the older RFCs)
455 */
456 if (iap->ia_size != i_size_read(inode))
457 iap->ia_valid |= ATTR_MTIME;
448 } 458 }
449 459
450 iap->ia_valid |= ATTR_CTIME; 460 iap->ia_valid |= ATTR_CTIME;
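Editor's note: the rule this hunk implements comes straight from RFC 5661, section 18.30.4 (and the equivalent text in the older RFCs): a SETATTR that changes the file size implicitly changes time_modify, so the server must bump mtime alongside the ctime it already sets. Reduced to its essentials:

	if (size_change && iap->ia_size != i_size_read(inode))
		iap->ia_valid |= ATTR_MTIME;	/* size change implies a
						 * data modification */
	iap->ia_valid |= ATTR_CTIME;		/* every SETATTR bumps the
						 * change attribute */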
@@ -649,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
649{ 659{
650 struct path path; 660 struct path path;
651 struct inode *inode; 661 struct inode *inode;
662 struct file *file;
652 int flags = O_RDONLY|O_LARGEFILE; 663 int flags = O_RDONLY|O_LARGEFILE;
653 __be32 err; 664 __be32 err;
654 int host_err = 0; 665 int host_err = 0;
@@ -703,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
703 else 714 else
704 flags = O_WRONLY|O_LARGEFILE; 715 flags = O_WRONLY|O_LARGEFILE;
705 } 716 }
706 *filp = dentry_open(&path, flags, current_cred());
707 if (IS_ERR(*filp)) {
708 host_err = PTR_ERR(*filp);
709 *filp = NULL;
710 } else {
711 host_err = ima_file_check(*filp, may_flags);
712 717
713 if (may_flags & NFSD_MAY_64BIT_COOKIE) 718 file = dentry_open(&path, flags, current_cred());
714 (*filp)->f_mode |= FMODE_64BITHASH; 719 if (IS_ERR(file)) {
715 else 720 host_err = PTR_ERR(file);
716 (*filp)->f_mode |= FMODE_32BITHASH; 721 goto out_nfserr;
717 } 722 }
718 723
724 host_err = ima_file_check(file, may_flags);
725 if (host_err) {
726 nfsd_close(file);
727 goto out_nfserr;
728 }
729
730 if (may_flags & NFSD_MAY_64BIT_COOKIE)
731 file->f_mode |= FMODE_64BITHASH;
732 else
733 file->f_mode |= FMODE_32BITHASH;
734
735 *filp = file;
719out_nfserr: 736out_nfserr:
720 err = nfserrno(host_err); 737 err = nfserrno(host_err);
721out: 738out:
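Editor's note: the nfsd_open() rework is the publish-on-success idiom: operate on a local struct file * and assign to *filp only after dentry_open() and ima_file_check() have both succeeded, so callers never receive a partially set up file. The shape, minus the hunk markup:

	file = dentry_open(&path, flags, current_cred());
	if (IS_ERR(file)) {
		host_err = PTR_ERR(file);
		goto out_nfserr;
	}

	host_err = ima_file_check(file, may_flags);
	if (host_err) {
		nfsd_close(file);	/* undo the open on IMA failure */
		goto out_nfserr;
	}

	*filp = file;			/* publish only on full success */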
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 465e7799742a..5720e9457f33 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,17 @@ struct nfsd4_reclaim_complete {
428 u32 rca_one_fs; 428 u32 rca_one_fs;
429}; 429};
430 430
431struct nfsd4_seek {
432 /* request */
433 stateid_t seek_stateid;
434 loff_t seek_offset;
435 u32 seek_whence;
436
437 /* response */
438 u32 seek_eof;
439 loff_t seek_pos;
440};
441
431struct nfsd4_op { 442struct nfsd4_op {
432 int opnum; 443 int opnum;
433 __be32 status; 444 __be32 status;
@@ -473,6 +484,9 @@ struct nfsd4_op {
473 struct nfsd4_reclaim_complete reclaim_complete; 484 struct nfsd4_reclaim_complete reclaim_complete;
474 struct nfsd4_test_stateid test_stateid; 485 struct nfsd4_test_stateid test_stateid;
475 struct nfsd4_free_stateid free_stateid; 486 struct nfsd4_free_stateid free_stateid;
487
488 /* NFSv4.2 */
489 struct nfsd4_seek seek;
476 } u; 490 } u;
477 struct nfs4_replay * replay; 491 struct nfs4_replay * replay;
478}; 492};
diff --git a/fs/stack.c b/fs/stack.c
index 5b5388250e29..a54e33ed10f1 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -44,7 +44,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
44 * include/linux/fs.h). We don't necessarily hold i_mutex when this 44 * include/linux/fs.h). We don't necessarily hold i_mutex when this
45 * is called, so take i_lock for that case. 45 * is called, so take i_lock for that case.
46 * 46 *
47 * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the 47 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
48 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock 48 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
49 * for that case too, and do both at once by combining the tests. 49 * for that case too, and do both at once by combining the tests.
50 * 50 *
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 80c350216ea8..b46ffa94372a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -333,8 +333,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg
333 spin_lock_irq(&ctx->wqh.lock); 333 spin_lock_irq(&ctx->wqh.lock);
334 if (!timerfd_canceled(ctx)) { 334 if (!timerfd_canceled(ctx)) {
335 ctx->ticks = ticks; 335 ctx->ticks = ticks;
336 if (ticks) 336 wake_up_locked(&ctx->wqh);
337 wake_up_locked(&ctx->wqh);
338 } else 337 } else
339 ret = -ECANCELED; 338 ret = -ECANCELED;
340 spin_unlock_irq(&ctx->wqh.lock); 339 spin_unlock_irq(&ctx->wqh.lock);