Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 6
-rw-r--r--  fs/aio.c | 94
-rw-r--r--  fs/block_dev.c | 7
-rw-r--r--  fs/btrfs/async-thread.c | 55
-rw-r--r--  fs/btrfs/async-thread.h | 29
-rw-r--r--  fs/btrfs/backref.c | 123
-rw-r--r--  fs/btrfs/backref.h | 3
-rw-r--r--  fs/btrfs/btrfs_inode.h | 40
-rw-r--r--  fs/btrfs/check-integrity.c | 18
-rw-r--r--  fs/btrfs/compression.c | 21
-rw-r--r--  fs/btrfs/ctree.c | 106
-rw-r--r--  fs/btrfs/ctree.h | 93
-rw-r--r--  fs/btrfs/delayed-inode.c | 12
-rw-r--r--  fs/btrfs/dev-replace.c | 82
-rw-r--r--  fs/btrfs/dir-item.c | 12
-rw-r--r--  fs/btrfs/disk-io.c | 344
-rw-r--r--  fs/btrfs/disk-io.h | 16
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 290
-rw-r--r--  fs/btrfs/extent_io.c | 488
-rw-r--r--  fs/btrfs/extent_io.h | 60
-rw-r--r--  fs/btrfs/file-item.c | 30
-rw-r--r--  fs/btrfs/file.c | 170
-rw-r--r--  fs/btrfs/free-space-cache.c | 157
-rw-r--r--  fs/btrfs/hash.c | 4
-rw-r--r--  fs/btrfs/inode-item.c | 12
-rw-r--r--  fs/btrfs/inode-map.c | 68
-rw-r--r--  fs/btrfs/inode.c | 942
-rw-r--r--  fs/btrfs/ioctl.c | 128
-rw-r--r--  fs/btrfs/lzo.c | 3
-rw-r--r--  fs/btrfs/ordered-data.c | 1
-rw-r--r--  fs/btrfs/orphan.c | 4
-rw-r--r--  fs/btrfs/print-tree.c | 3
-rw-r--r--  fs/btrfs/qgroup.c | 33
-rw-r--r--  fs/btrfs/raid56.c | 17
-rw-r--r--  fs/btrfs/reada.c | 5
-rw-r--r--  fs/btrfs/relocation.c | 142
-rw-r--r--  fs/btrfs/scrub.c | 92
-rw-r--r--  fs/btrfs/send.c | 47
-rw-r--r--  fs/btrfs/super.c | 137
-rw-r--r--  fs/btrfs/sysfs.c | 43
-rw-r--r--  fs/btrfs/sysfs.h | 16
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 516
-rw-r--r--  fs/btrfs/transaction.c | 52
-rw-r--r--  fs/btrfs/transaction.h | 2
-rw-r--r--  fs/btrfs/tree-log.c | 334
-rw-r--r--  fs/btrfs/tree-log.h | 4
-rw-r--r--  fs/btrfs/uuid-tree.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 722
-rw-r--r--  fs/btrfs/volumes.h | 166
-rw-r--r--  fs/btrfs/xattr.c | 4
-rw-r--r--  fs/btrfs/zlib.c | 141
-rw-r--r--  fs/buffer.c | 34
-rw-r--r--  fs/cachefiles/bind.c | 8
-rw-r--r--  fs/cachefiles/daemon.c | 30
-rw-r--r--  fs/cachefiles/internal.h | 2
-rw-r--r--  fs/cachefiles/main.c | 2
-rw-r--r--  fs/cachefiles/namei.c | 17
-rw-r--r--  fs/cachefiles/rdwr.c | 6
-rw-r--r--  fs/cachefiles/xattr.c | 10
-rw-r--r--  fs/cifs/Kconfig | 35
-rw-r--r--  fs/cifs/cifsfs.c | 24
-rw-r--r--  fs/cifs/cifsfs.h | 2
-rw-r--r--  fs/cifs/cifsglob.h | 10
-rw-r--r--  fs/cifs/cifspdu.h | 23
-rw-r--r--  fs/cifs/connect.c | 2
-rw-r--r--  fs/cifs/dir.c | 8
-rw-r--r--  fs/cifs/file.c | 18
-rw-r--r--  fs/cifs/inode.c | 11
-rw-r--r--  fs/cifs/link.c | 12
-rw-r--r--  fs/cifs/misc.c | 7
-rw-r--r--  fs/cifs/netmisc.c | 20
-rw-r--r--  fs/cifs/readdir.c | 6
-rw-r--r--  fs/cifs/sess.c | 24
-rw-r--r--  fs/cifs/smb1ops.c | 9
-rw-r--r--  fs/cifs/smb2file.c | 2
-rw-r--r--  fs/cifs/smb2inode.c | 2
-rw-r--r--  fs/cifs/smb2maperror.c | 6
-rw-r--r--  fs/cifs/smb2misc.c | 17
-rw-r--r--  fs/cifs/smb2ops.c | 172
-rw-r--r--  fs/cifs/smb2pdu.c | 23
-rw-r--r--  fs/cifs/smb2pdu.h | 6
-rw-r--r--  fs/cifs/smbfsctl.h | 2
-rw-r--r--  fs/dcache.c | 111
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/ecryptfs/file.c | 4
-rw-r--r--  fs/ecryptfs/inode.c | 25
-rw-r--r--  fs/ecryptfs/keystore.c | 2
-rw-r--r--  fs/ecryptfs/messaging.c | 3
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  fs/ext2/super.c | 6
-rw-r--r--  fs/ext3/ext3.h | 12
-rw-r--r--  fs/ext3/super.c | 25
-rw-r--r--  fs/ext4/ext4.h | 18
-rw-r--r--  fs/ext4/extents.c | 88
-rw-r--r--  fs/ext4/inode.c | 44
-rw-r--r--  fs/ext4/mballoc.c | 5
-rw-r--r--  fs/ext4/namei.c | 58
-rw-r--r--  fs/ext4/resize.c | 2
-rw-r--r--  fs/ext4/super.c | 19
-rw-r--r--  fs/f2fs/Kconfig | 4
-rw-r--r--  fs/f2fs/checkpoint.c | 175
-rw-r--r--  fs/f2fs/data.c | 86
-rw-r--r--  fs/f2fs/debug.c | 24
-rw-r--r--  fs/f2fs/dir.c | 25
-rw-r--r--  fs/f2fs/f2fs.h | 187
-rw-r--r--  fs/f2fs/file.c | 315
-rw-r--r--  fs/f2fs/gc.c | 34
-rw-r--r--  fs/f2fs/gc.h | 2
-rw-r--r--  fs/f2fs/hash.c | 7
-rw-r--r--  fs/f2fs/inline.c | 58
-rw-r--r--  fs/f2fs/inode.c | 37
-rw-r--r--  fs/f2fs/namei.c | 66
-rw-r--r--  fs/f2fs/node.c | 536
-rw-r--r--  fs/f2fs/node.h | 60
-rw-r--r--  fs/f2fs/recovery.c | 219
-rw-r--r--  fs/f2fs/segment.c | 573
-rw-r--r--  fs/f2fs/segment.h | 162
-rw-r--r--  fs/f2fs/super.c | 77
-rw-r--r--  fs/f2fs/xattr.c | 10
-rw-r--r--  fs/file_table.c | 2
-rw-r--r--  fs/fscache/object.c | 1
-rw-r--r--  fs/fscache/page.c | 25
-rw-r--r--  fs/fuse/file.c | 1
-rw-r--r--  fs/gfs2/bmap.c | 9
-rw-r--r--  fs/gfs2/dir.c | 9
-rw-r--r--  fs/gfs2/dir.h | 1
-rw-r--r--  fs/gfs2/file.c | 15
-rw-r--r--  fs/gfs2/glock.c | 4
-rw-r--r--  fs/gfs2/glops.c | 2
-rw-r--r--  fs/gfs2/incore.h | 7
-rw-r--r--  fs/gfs2/inode.c | 16
-rw-r--r--  fs/gfs2/rgrp.c | 30
-rw-r--r--  fs/gfs2/rgrp.h | 1
-rw-r--r--  fs/gfs2/super.c | 20
-rw-r--r--  fs/gfs2/trans.c | 2
-rw-r--r--  fs/internal.h | 5
-rw-r--r--  fs/isofs/inode.c | 15
-rw-r--r--  fs/isofs/isofs.h | 23
-rw-r--r--  fs/isofs/rock.c | 39
-rw-r--r--  fs/jbd2/commit.c | 21
-rw-r--r--  fs/jbd2/journal.c | 56
-rw-r--r--  fs/jbd2/recovery.c | 33
-rw-r--r--  fs/jbd2/revoke.c | 6
-rw-r--r--  fs/lockd/Makefile | 3
-rw-r--r--  fs/lockd/mon.c | 6
-rw-r--r--  fs/lockd/netns.h | 1
-rw-r--r--  fs/lockd/procfs.c | 92
-rw-r--r--  fs/lockd/procfs.h | 28
-rw-r--r--  fs/lockd/svc.c | 15
-rw-r--r--  fs/mpage.c | 2
-rw-r--r--  fs/namei.c | 96
-rw-r--r--  fs/namespace.c | 10
-rw-r--r--  fs/nfs/blocklayout/Makefile | 3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 1386
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 213
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 384
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c | 108
-rw-r--r--  fs/nfs/blocklayout/dev.c | 363
-rw-r--r--  fs/nfs/blocklayout/extent_tree.c | 602
-rw-r--r--  fs/nfs/blocklayout/extents.c | 908
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c | 285
-rw-r--r--  fs/nfs/callback_proc.c | 23
-rw-r--r--  fs/nfs/client.c | 18
-rw-r--r--  fs/nfs/direct.c | 14
-rw-r--r--  fs/nfs/file.c | 52
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 39
-rw-r--r--  fs/nfs/filelayout/filelayout.h | 7
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 108
-rw-r--r--  fs/nfs/fscache-index.c | 3
-rw-r--r--  fs/nfs/inode.c | 4
-rw-r--r--  fs/nfs/internal.h | 7
-rw-r--r--  fs/nfs/nfs3_fs.h | 34
-rw-r--r--  fs/nfs/nfs3acl.c | 6
-rw-r--r--  fs/nfs/nfs3client.c | 1
-rw-r--r--  fs/nfs/nfs3proc.c | 1
-rw-r--r--  fs/nfs/nfs3super.c | 1
-rw-r--r--  fs/nfs/nfs4_fs.h | 13
-rw-r--r--  fs/nfs/nfs4client.c | 38
-rw-r--r--  fs/nfs/nfs4proc.c | 188
-rw-r--r--  fs/nfs/nfs4renewd.c | 12
-rw-r--r--  fs/nfs/nfs4state.c | 42
-rw-r--r--  fs/nfs/nfs4xdr.c | 179
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 113
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 70
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 5
-rw-r--r--  fs/nfs/pagelist.c | 92
-rw-r--r--  fs/nfs/pnfs.c | 105
-rw-r--r--  fs/nfs/pnfs.h | 50
-rw-r--r--  fs/nfs/pnfs_dev.c | 150
-rw-r--r--  fs/nfs/super.c | 11
-rw-r--r--  fs/nfs/write.c | 171
-rw-r--r--  fs/nfs_common/Makefile | 3
-rw-r--r--  fs/nfs_common/grace.c (renamed from fs/lockd/grace.c) | 68
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/nfs4callback.c | 144
-rw-r--r--  fs/nfsd/nfs4idmap.c | 20
-rw-r--r--  fs/nfsd/nfs4proc.c | 49
-rw-r--r--  fs/nfsd/nfs4recover.c | 205
-rw-r--r--  fs/nfsd/nfs4state.c | 83
-rw-r--r--  fs/nfsd/nfs4xdr.c | 77
-rw-r--r--  fs/nfsd/nfsctl.c | 45
-rw-r--r--  fs/nfsd/nfsfh.c | 6
-rw-r--r--  fs/nfsd/state.h | 31
-rw-r--r--  fs/nfsd/vfs.c | 37
-rw-r--r--  fs/nfsd/xdr4.h | 14
-rw-r--r--  fs/nilfs2/inode.c | 7
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 2
-rw-r--r--  fs/notify/fdinfo.c | 4
-rw-r--r--  fs/notify/fsnotify.h | 3
-rw-r--r--  fs/notify/group.c | 2
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 6
-rw-r--r--  fs/ntfs/debug.c | 2
-rw-r--r--  fs/ntfs/file.c | 5
-rw-r--r--  fs/ntfs/super.c | 2
-rw-r--r--  fs/ocfs2/aops.c | 15
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 19
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 1
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 78
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 13
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 88
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 39
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 44
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 25
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 7
-rw-r--r--  fs/ocfs2/dlmglue.c | 23
-rw-r--r--  fs/ocfs2/file.c | 49
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 129
-rw-r--r--  fs/ocfs2/move_extents.c | 2
-rw-r--r--  fs/ocfs2/quota.h | 5
-rw-r--r--  fs/ocfs2/quota_global.c | 4
-rw-r--r--  fs/ocfs2/quota_local.c | 33
-rw-r--r--  fs/ocfs2/stack_user.c | 2
-rw-r--r--  fs/ocfs2/super.c | 31
-rw-r--r--  fs/pnode.c | 1
-rw-r--r--  fs/proc/base.c | 75
-rw-r--r--  fs/proc/internal.h | 5
-rw-r--r--  fs/proc/kcore.c | 4
-rw-r--r--  fs/proc/page.c | 3
-rw-r--r--  fs/proc/task_mmu.c | 361
-rw-r--r--  fs/proc/task_nommu.c | 88
-rw-r--r--  fs/quota/dquot.c | 2
-rw-r--r--  fs/reiserfs/reiserfs.h | 5
-rw-r--r--  fs/reiserfs/super.c | 16
-rw-r--r--  fs/stack.c | 2
-rw-r--r--  fs/super.c | 3
-rw-r--r--  fs/sync.c | 2
-rw-r--r--  fs/timerfd.c | 3
-rw-r--r--  fs/udf/file.c | 9
-rw-r--r--  fs/udf/ialloc.c | 28
-rw-r--r--  fs/udf/inode.c | 165
-rw-r--r--  fs/udf/namei.c | 156
-rw-r--r--  fs/udf/super.c | 71
-rw-r--r--  fs/udf/udfdecl.h | 16
-rw-r--r--  fs/udf/udftime.c | 2
-rw-r--r--  fs/ufs/ialloc.c | 6
-rw-r--r--  fs/ufs/inode.c | 7
-rw-r--r--  fs/ufs/namei.c | 18
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 18
-rw-r--r--  fs/xfs/xfs_aops.c | 61
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 20
-rw-r--r--  fs/xfs/xfs_file.c | 27
264 files changed, 10943 insertions(+), 7595 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..db5dc1598716 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS
 source "fs/nfs/Kconfig"
 source "fs/nfsd/Kconfig"
 
+config GRACE_PERIOD
+	tristate
+
 config LOCKD
 	tristate
 	depends on FILE_LOCKING
+	select GRACE_PERIOD
 
 config LOCKD_V4
 	bool
@@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT
 
 config NFS_COMMON
 	bool
-	depends on NFSD || NFS_FS
+	depends on NFSD || NFS_FS || LOCKD
 	default y
 
 source "net/sunrpc/Kconfig"
diff --git a/fs/aio.c b/fs/aio.c
index ae635872affb..84a751005f5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -660,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
 		goto err;
 
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
 		goto err;
 
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
@@ -792,6 +793,8 @@ void exit_aio(struct mm_struct *mm)
 
 	for (i = 0; i < table->nr; ++i) {
 		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
 		if (!ctx)
 			continue;
@@ -803,7 +806,10 @@ void exit_aio(struct mm_struct *mm)
 		 * that it needs to unmap the area, just set it to 0.
 		 */
 		ctx->mmap_size = 0;
-		kill_ioctx(mm, ctx, NULL);
+		kill_ioctx(mm, ctx, &requests_done);
+
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
 
 	RCU_INIT_POINTER(mm->ioctx_table, NULL);
@@ -857,6 +863,68 @@ out:
 	return ret;
 }
 
+/* refill_reqs_available
+ *	Updates the reqs_available reference counts used for tracking the
+ *	number of free slots in the completion ring.  This can be called
+ *	from aio_complete() (to optimistically update reqs_available) or
+ *	from aio_get_req() (the we're out of events case).  It must be
+ *	called holding ctx->completion_lock.
+ */
+static void refill_reqs_available(struct kioctx *ctx, unsigned head,
+				  unsigned tail)
+{
+	unsigned events_in_ring, completed;
+
+	/* Clamp head since userland can write to it. */
+	head %= ctx->nr_events;
+	if (head <= tail)
+		events_in_ring = tail - head;
+	else
+		events_in_ring = ctx->nr_events - (head - tail);
+
+	completed = ctx->completed_events;
+	if (events_in_ring < completed)
+		completed -= events_in_ring;
+	else
+		completed = 0;
+
+	if (!completed)
+		return;
+
+	ctx->completed_events -= completed;
+	put_reqs_available(ctx, completed);
+}
+
+/* user_refill_reqs_available
+ *	Called to refill reqs_available when aio_get_req() encounters an
+ *	out of space in the completion ring.
+ */
+static void user_refill_reqs_available(struct kioctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	if (ctx->completed_events) {
+		struct aio_ring *ring;
+		unsigned head;
+
+		/* Access of ring->head may race with aio_read_events_ring()
+		 * here, but that's okay since whether we read the old version
+		 * or the new version, and either will be valid.  The important
+		 * part is that head cannot pass tail since we prevent
+		 * aio_complete() from updating tail by holding
+		 * ctx->completion_lock.  Even if head is invalid, the check
+		 * against ctx->completed_events below will make sure we do the
+		 * safe/right thing.
+		 */
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		head = ring->head;
+		kunmap_atomic(ring);
+
+		refill_reqs_available(ctx, head, ctx->tail);
+	}
+
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 /* aio_get_req
  *	Allocate a slot for an aio request.
  * Returns NULL if no requests are free.
@@ -865,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (!get_reqs_available(ctx))
-		return NULL;
+	if (!get_reqs_available(ctx)) {
+		user_refill_reqs_available(ctx);
+		if (!get_reqs_available(ctx))
+			return NULL;
+	}
 
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
@@ -925,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	struct kioctx	*ctx = iocb->ki_ctx;
 	struct aio_ring	*ring;
 	struct io_event	*ev_page, *event;
+	unsigned tail, pos, head;
 	unsigned long	flags;
-	unsigned tail, pos;
 
 	/*
 	 * Special case handling for sync iocbs:
@@ -987,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	ctx->tail = tail;
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
+	head = ring->head;
 	ring->tail = tail;
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
+	ctx->completed_events++;
+	if (ctx->completed_events > 1)
+		refill_reqs_available(ctx, head, tail);
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1005,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 
 	/* everything turned out well, dispose of the aiocb. */
 	kiocb_free(iocb);
-	put_reqs_available(ctx, 1);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -1042,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	tail = ring->tail;
 	kunmap_atomic(ring);
 
+	/*
+	 * Ensure that once we've read the current tail pointer, that
+	 * we also see the events that were stored up to the tail.
+	 */
+	smp_rmb();
+
 	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
 	if (head == tail)
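The refill_reqs_available() arithmetic above is self-contained enough to check in isolation. Below is a minimal user-space sketch of the same modular head/tail accounting (plain C; the names mirror the patch, but the kioctx, per-cpu counters and completion_lock are deliberately left out, so this demonstrates only the ring-occupancy math):

/*
 * Stand-alone sketch of the completion-ring accounting from the
 * fs/aio.c hunks above.  "reclaimable" is how many completed slots
 * userland has already consumed and can be returned to reqs_available.
 */
#include <stdio.h>

static unsigned reclaimable(unsigned nr_events, unsigned completed_events,
			    unsigned head, unsigned tail)
{
	unsigned events_in_ring;

	head %= nr_events;	/* clamp: userland can scribble on head */
	if (head <= tail)
		events_in_ring = tail - head;
	else
		events_in_ring = nr_events - (head - tail);

	/* events still sitting in the ring keep their slots reserved */
	return completed_events > events_in_ring ?
	       completed_events - events_in_ring : 0;
}

int main(void)
{
	/* 128-slot ring, 16 completions, 10 events still unconsumed */
	printf("%u\n", reclaimable(128, 16, 6, 16));	/* prints 6 */
	return 0;
}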
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6d7274619bf9..e2f3ad0879ce 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -304,6 +304,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	return block_read_full_page(page, blkdev_get_block);
 }
 
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -1622,6 +1628,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
+	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5a201d81049c..4dabeb893b7c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -22,7 +22,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/workqueue.h>
 #include "async-thread.h"
 #include "ctree.h"
 
@@ -55,13 +54,45 @@ struct btrfs_workqueue {
 	struct __btrfs_workqueue *high;
 };
 
-static inline struct __btrfs_workqueue
-*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+static void normal_work_helper(struct btrfs_work *work);
+
+#define BTRFS_WORK_HELPER(name)					\
+void btrfs_##name(struct work_struct *arg)				\
+{									\
+	struct btrfs_work *work = container_of(arg, struct btrfs_work,	\
+					       normal_work);		\
+	normal_work_helper(work);					\
+}
+
+BTRFS_WORK_HELPER(worker_helper);
+BTRFS_WORK_HELPER(delalloc_helper);
+BTRFS_WORK_HELPER(flush_delalloc_helper);
+BTRFS_WORK_HELPER(cache_helper);
+BTRFS_WORK_HELPER(submit_helper);
+BTRFS_WORK_HELPER(fixup_helper);
+BTRFS_WORK_HELPER(endio_helper);
+BTRFS_WORK_HELPER(endio_meta_helper);
+BTRFS_WORK_HELPER(endio_meta_write_helper);
+BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
+BTRFS_WORK_HELPER(rmw_helper);
+BTRFS_WORK_HELPER(endio_write_helper);
+BTRFS_WORK_HELPER(freespace_write_helper);
+BTRFS_WORK_HELPER(delayed_meta_helper);
+BTRFS_WORK_HELPER(readahead_helper);
+BTRFS_WORK_HELPER(qgroup_rescan_helper);
+BTRFS_WORK_HELPER(extent_refs_helper);
+BTRFS_WORK_HELPER(scrub_helper);
+BTRFS_WORK_HELPER(scrubwrc_helper);
+BTRFS_WORK_HELPER(scrubnc_helper);
+
+static struct __btrfs_workqueue *
+__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
 			 int thresh)
 {
 	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-	if (unlikely(!ret))
+	if (!ret)
 		return NULL;
 
 	ret->max_active = max_active;
@@ -85,7 +116,7 @@ static inline struct __btrfs_workqueue
 	ret->normal_wq = alloc_workqueue("%s-%s", flags,
 					 ret->max_active, "btrfs",
 					 name);
-	if (unlikely(!ret->normal_wq)) {
+	if (!ret->normal_wq) {
 		kfree(ret);
 		return NULL;
 	}
@@ -107,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 {
 	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-	if (unlikely(!ret))
+	if (!ret)
 		return NULL;
 
 	ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
 					      max_active, thresh);
-	if (unlikely(!ret->normal)) {
+	if (!ret->normal) {
 		kfree(ret);
 		return NULL;
 	}
@@ -120,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 	if (flags & WQ_HIGHPRI) {
 		ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
 						    thresh);
-		if (unlikely(!ret->high)) {
+		if (!ret->high) {
 			__btrfs_destroy_workqueue(ret->normal);
 			kfree(ret);
 			return NULL;
@@ -232,13 +263,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 	spin_unlock_irqrestore(lock, flags);
 }
 
-static void normal_work_helper(struct work_struct *arg)
+static void normal_work_helper(struct btrfs_work *work)
 {
-	struct btrfs_work *work;
 	struct __btrfs_workqueue *wq;
 	int need_order = 0;
 
-	work = container_of(arg, struct btrfs_work, normal_work);
 	/*
 	 * We should not touch things inside work in the following cases:
 	 * 1) after work->func() if it has no ordered_free
@@ -262,7 +291,7 @@ static void normal_work_helper(struct work_struct *arg)
 	trace_btrfs_all_work_done(work);
 }
 
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
 		     btrfs_func_t func,
 		     btrfs_func_t ordered_func,
 		     btrfs_func_t ordered_free)
@@ -270,7 +299,7 @@ void btrfs_init_work(struct btrfs_work *work,
 	work->func = func;
 	work->ordered_func = ordered_func;
 	work->ordered_free = ordered_free;
-	INIT_WORK(&work->normal_work, normal_work_helper);
+	INIT_WORK(&work->normal_work, uniq_func);
 	INIT_LIST_HEAD(&work->ordered_list);
 	work->flags = 0;
 }
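The BTRFS_WORK_HELPER() change above gives every workqueue user a distinct work-function address while still sharing one body, which matters for anything that keys off the function pointer (tracing, lockdep). A stand-alone sketch of the same container_of trampoline pattern (user-space C; the struct layout, names and the local container_of are simplified stand-ins, not the kernel's):

/*
 * Each WORK_HELPER() expansion is a distinct function whose address
 * identifies its user; the body just recovers the containing struct
 * and calls the shared helper, exactly as in the patch above.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct { int pending; };

struct btrfs_work {
	const char *name;
	struct work_struct normal_work;
};

static void normal_work_helper(struct btrfs_work *work)
{
	printf("running %s\n", work->name);
}

#define WORK_HELPER(helper)					\
static void helper(struct work_struct *arg)			\
{								\
	struct btrfs_work *work = container_of(arg,		\
			struct btrfs_work, normal_work);	\
	normal_work_helper(work);				\
}

WORK_HELPER(endio_helper)
WORK_HELPER(submit_helper)

int main(void)
{
	struct btrfs_work w = { .name = "endio" };
	void (*fn)(struct work_struct *) = endio_helper;

	fn(&w.normal_work);	/* fn's address now identifies the user */
	return 0;
}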
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 9c6b66d15fb0..e386c29ef1f6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -19,12 +19,14 @@
 
 #ifndef __BTRFS_ASYNC_THREAD_
 #define __BTRFS_ASYNC_THREAD_
+#include <linux/workqueue.h>
 
 struct btrfs_workqueue;
 /* Internal use only */
 struct __btrfs_workqueue;
 struct btrfs_work;
 typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_work_func_t)(struct work_struct *arg);
 
 struct btrfs_work {
 	btrfs_func_t func;
@@ -38,11 +40,36 @@ struct btrfs_work {
 	unsigned long flags;
 };
 
+#define BTRFS_WORK_HELPER_PROTO(name)					\
+void btrfs_##name(struct work_struct *arg)
+
+BTRFS_WORK_HELPER_PROTO(worker_helper);
+BTRFS_WORK_HELPER_PROTO(delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(cache_helper);
+BTRFS_WORK_HELPER_PROTO(submit_helper);
+BTRFS_WORK_HELPER_PROTO(fixup_helper);
+BTRFS_WORK_HELPER_PROTO(endio_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
+BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
+BTRFS_WORK_HELPER_PROTO(rmw_helper);
+BTRFS_WORK_HELPER_PROTO(endio_write_helper);
+BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
+BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
+BTRFS_WORK_HELPER_PROTO(readahead_helper);
+BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
+BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
+BTRFS_WORK_HELPER_PROTO(scrub_helper);
+BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 					      int flags,
 					      int max_active,
 					      int thresh);
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
 		     btrfs_func_t func,
 		     btrfs_func_t ordered_func,
 		     btrfs_func_t ordered_free);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 54a201dac7f9..2d3e32ebfd15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -25,6 +25,9 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
 struct extent_inode_elem {
 	u64 inum;
 	u64 offset;
@@ -377,7 +380,8 @@ out:
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
-				   const u64 *extent_item_pos, u64 total_refs)
+				   const u64 *extent_item_pos, u64 total_refs,
+				   u64 root_objectid)
 {
 	int err;
 	int ret = 0;
@@ -402,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		if (ref->count == 0)
 			continue;
+		if (root_objectid && ref->root_id != root_objectid) {
+			ret = BACKREF_FOUND_SHARED;
+			goto out;
+		}
 		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
 					     parents, extent_item_pos,
 					     total_refs);
@@ -482,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 			continue;
 		BUG_ON(!ref->wanted_disk_byte);
 		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
-				     fs_info->tree_root->leafsize, 0);
+				     0);
 		if (!eb || !extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
@@ -561,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct list_head *prefs, u64 *total_refs)
+			      struct list_head *prefs, u64 *total_refs,
+			      u64 inum)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
@@ -625,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			key.objectid = ref->objectid;
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = ref->offset;
+
+			/*
+			 * Found a inum that doesn't match our known inum, we
+			 * know it's shared.
+			 */
+			if (inum && ref->objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
 					       node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
@@ -659,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
 			     int *info_level, struct list_head *prefs,
-			     u64 *total_refs)
+			     u64 *total_refs, u64 inum)
 {
 	int ret = 0;
 	int slot;
@@ -744,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 						      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
@@ -765,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 bytenr,
-			    int info_level, struct list_head *prefs)
+			    int info_level, struct list_head *prefs, u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -827,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 						      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
@@ -854,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
-			     struct ulist *roots, const u64 *extent_item_pos)
+			     struct ulist *roots, const u64 *extent_item_pos,
+			     u64 root_objectid, u64 inum)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -929,7 +961,8 @@ again:
 		}
 		spin_unlock(&delayed_refs->lock);
 		ret = __add_delayed_refs(head, time_seq,
-					 &prefs_delayed, &total_refs);
+					 &prefs_delayed, &total_refs,
+					 inum);
 		mutex_unlock(&head->mutex);
 		if (ret)
 			goto out;
@@ -951,11 +984,11 @@ again:
 	    key.type == BTRFS_METADATA_ITEM_KEY)) {
 		ret = __add_inline_refs(fs_info, path, bytenr,
 					&info_level, &prefs,
-					&total_refs);
+					&total_refs, inum);
 		if (ret)
 			goto out;
 		ret = __add_keyed_refs(fs_info, path, bytenr,
-				       info_level, &prefs);
+				       info_level, &prefs, inum);
 		if (ret)
 			goto out;
 	}
@@ -971,7 +1004,8 @@ again:
 	__merge_refs(&prefs, 1);
 
 	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-				      extent_item_pos, total_refs);
+				      extent_item_pos, total_refs,
+				      root_objectid);
 	if (ret)
 		goto out;
 
@@ -981,6 +1015,11 @@ again:
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
+			if (root_objectid && ref->root_id != root_objectid) {
+				ret = BACKREF_FOUND_SHARED;
+				goto out;
+			}
+
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
@@ -989,12 +1028,10 @@ again:
 		if (ref->count && ref->parent) {
 			if (extent_item_pos && !ref->inode_list &&
 			    ref->level == 0) {
-				u32 bsz;
 				struct extent_buffer *eb;
-				bsz = btrfs_level_size(fs_info->extent_root,
-						       ref->level);
+
 				eb = read_tree_block(fs_info->extent_root,
-						     ref->parent, bsz, 0);
+						     ref->parent, 0);
 				if (!eb || !extent_buffer_uptodate(eb)) {
 					free_extent_buffer(eb);
 					ret = -EIO;
@@ -1087,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_parent_nodes(trans, fs_info, bytenr,
-				time_seq, *leafs, NULL, extent_item_pos);
+				time_seq, *leafs, NULL, extent_item_pos, 0, 0);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1130,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr,
-					time_seq, tmp, *roots, NULL);
+					time_seq, tmp, *roots, NULL, 0, 0);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
@@ -1161,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr)
+{
+	struct ulist *tmp = NULL;
+	struct ulist *roots = NULL;
+	struct ulist_iterator uiter;
+	struct ulist_node *node;
+	struct seq_list elem = {};
+	int ret = 0;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	roots = ulist_alloc(GFP_NOFS);
+	if (!tmp || !roots) {
+		ulist_free(tmp);
+		ulist_free(roots);
+		return -ENOMEM;
+	}
+
+	if (trans)
+		btrfs_get_tree_mod_seq(fs_info, &elem);
+	else
+		down_read(&fs_info->commit_root_sem);
+	ULIST_ITER_INIT(&uiter);
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+					roots, NULL, root_objectid, inum);
+		if (ret == BACKREF_FOUND_SHARED) {
+			ret = 1;
+			break;
+		}
+		if (ret < 0 && ret != -ENOENT)
+			break;
+		node = ulist_next(tmp, &uiter);
+		if (!node)
+			break;
+		bytenr = node->val;
+		cond_resched();
+	}
+	if (trans)
+		btrfs_put_tree_mod_seq(fs_info, &elem);
+	else
+		up_read(&fs_info->commit_root_sem);
+	ulist_free(tmp);
+	ulist_free(roots);
+	return ret;
+}
+
 /*
  * this makes the path point to (inum INODE_ITEM ioff)
  */
@@ -1193,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	unsigned long ptr;
 
 	key.objectid = inode_objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.type = BTRFS_INODE_EXTREF_KEY;
 	key.offset = start_off;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1233,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 		ret = -ENOENT;
 		if (found_key.objectid != inode_objectid)
 			break;
-		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+		if (found_key.type != BTRFS_INODE_EXTREF_KEY)
 			break;
 
 		ret = 0;
@@ -1366,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	}
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
-		size = fs_info->extent_root->leafsize;
+		size = fs_info->extent_root->nodesize;
 	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
 		size = found_key->offset;
 
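btrfs_check_shared() above threads root_objectid/inum down the backref walk so the walk can stop at the first reference owned by anyone else, instead of collecting the full reference set first. A reduced user-space sketch of that early-exit shape (the struct and data here are invented purely for illustration; the real walk is over btree items, not an array):

/*
 * Early-exit "is it shared?" check: bail the moment a reference is
 * seen that does not belong to (root, inum), mirroring the
 * BACKREF_FOUND_SHARED short-circuit in the patch above.
 */
#include <stdio.h>

struct ref { unsigned long root, inum; };

static int check_shared(const struct ref *refs, int nr,
			unsigned long root, unsigned long inum)
{
	for (int i = 0; i < nr; i++)
		if (refs[i].root != root || refs[i].inum != inum)
			return 1;	/* analogous to BACKREF_FOUND_SHARED */
	return 0;
}

int main(void)
{
	struct ref refs[] = { { 5, 257 }, { 5, 258 } };

	printf("%d\n", check_shared(refs, 2, 5, 257));	/* prints 1 */
	return 0;
}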
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 86fc20fec282..2a1ac6bfc724 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 			  u64 start_off, struct btrfs_path *path,
 			  struct btrfs_inode_extref **ret_extref,
 			  u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr);
 
 int __init btrfs_prelim_ref_init(void);
 void btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43527fd78825..4aadadcfab20 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,17 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS			11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR			12
+#define BTRFS_INODE_BTREE_LOG1_ERR		13
+#define BTRFS_INODE_BTREE_LOG2_ERR		14
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -121,6 +132,12 @@ struct btrfs_inode {
 	u64 delalloc_bytes;
 
 	/*
+	 * total number of bytes pending defrag, used by stat to check whether
+	 * it needs COW.
+	 */
+	u64 defrag_bytes;
+
+	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
 	 * because not all the blocks are written yet.
@@ -234,13 +251,25 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	    BTRFS_I(inode)->last_sub_trans <=
 	    BTRFS_I(inode)->last_log_commit &&
 	    BTRFS_I(inode)->last_sub_trans <=
-	    BTRFS_I(inode)->root->last_log_commit)
-		return 1;
+	    BTRFS_I(inode)->root->last_log_commit) {
+		/*
+		 * After a ranged fsync we might have left some extent maps
+		 * (that fall outside the fsync's range). So return false
+		 * here if the list isn't empty, to make sure btrfs_log_inode()
+		 * will be called and process those extent maps.
+		 */
+		smp_mb();
+		if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+			return 1;
+	}
 	return 0;
 }
 
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
+
 struct btrfs_dio_private {
 	struct inode *inode;
+	unsigned long flags;
 	u64 logical_offset;
 	u64 disk_bytenr;
 	u64 bytes;
@@ -257,7 +286,12 @@ struct btrfs_dio_private {
 
 	/* dio_bio came from fs/direct-io.c */
 	struct bio *dio_bio;
-	u8 csum[0];
+
+	/*
+	 * The original bio may be splited to several sub-bios, this is
+	 * done during endio of sub-bios
+	 */
+	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
 };
 
 /*
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce92ae30250f..cb7f3fe9c9f6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror(
 
 	/* super block bytenr is always the unmapped device bytenr */
 	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
 		return -1;
 	bh = __bread(superblock_bdev, dev_bytenr / 4096,
 		     BTRFS_SUPER_INFO_SIZE);
@@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror(
 	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
-	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
 	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
 		brelse(bh);
 		return 0;
@@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data(
 
 	while (len > 0) {
 		cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
-		BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
-		       PAGE_CACHE_SHIFT);
+		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
 		kaddr = block_ctx->datav[i];
 		memcpy(dst, kaddr + offset_in_page, cur);
 
@@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root,
 	struct list_head *dev_head = &fs_devices->devices;
 	struct btrfs_device *device;
 
-	if (root->nodesize != root->leafsize) {
-		printk(KERN_INFO
-		       "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
-		       root->nodesize, root->leafsize);
-		return -1;
-	}
 	if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
 		       root->nodesize, PAGE_CACHE_SIZE);
 		return -1;
 	}
-	if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
-		printk(KERN_INFO
-		       "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->leafsize, PAGE_CACHE_SIZE);
-		return -1;
-	}
 	if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1daea0b47187..d3220d31d3cb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	return sizeof(struct compressed_bio) +
-		((disk_size + root->sectorsize - 1) / root->sectorsize) *
-		csum_size;
+		(DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
 }
 
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
@@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	 * freed before we're done setting it up
 	 */
 	atomic_inc(&cb->pending_bios);
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+				  BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!skip_sum) {
@@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	}
 	bio_get(bio);
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!skip_sum) {
@@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;
 
-	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
-		   PAGE_CACHE_SIZE;
+	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
 	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
 				       GFP_NOFS);
 	if (!cb->compressed_pages)
@@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		    PAGE_CACHE_SIZE) {
 			bio_get(comp_bio);
 
-			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+						  BTRFS_WQ_ENDIO_DATA);
 			BUG_ON(ret); /* -ENOMEM */
 
 			/*
@@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 							comp_bio, sums);
 				BUG_ON(ret); /* -ENOMEM */
 			}
-			sums += (comp_bio->bi_iter.bi_size +
-				 root->sectorsize - 1) / root->sectorsize;
+			sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+					     root->sectorsize);
 
 			ret = btrfs_map_bio(root, READ, comp_bio,
 					    mirror_num, 0);
@@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	bio_get(comp_bio);
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+				  BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
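The compression.c hunks above (and the BUG_ON change in check-integrity.c) replace open-coded round-up division with the kernel's DIV_ROUND_UP(). The identity is easy to verify stand-alone; a minimal check, with the macro body copied from include/linux/kernel.h and the sectorsize value chosen only as an example:

/*
 * DIV_ROUND_UP(n, d) is exactly the open-coded (n + d - 1) / d for
 * positive integers, which is what each conversion above relies on.
 */
#include <assert.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long sectorsize = 4096;	/* example value */

	assert(DIV_ROUND_UP(1UL, sectorsize) == 1);
	assert(DIV_ROUND_UP(4096UL, sectorsize) == 1);
	assert(DIV_ROUND_UP(4097UL, sectorsize) == 2);
	/* matches the form the patch removed */
	assert((8191UL + sectorsize - 1) / sectorsize ==
	       DIV_ROUND_UP(8191UL, sectorsize));
	return 0;
}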
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 44ee5d2e52a4..19bc6162fb8e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
258 else 258 else
259 btrfs_node_key(buf, &disk_key, 0); 259 btrfs_node_key(buf, &disk_key, 0);
260 260
261 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 261 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
262 new_root_objectid, &disk_key, level, 262 &disk_key, level, buf->start, 0);
263 buf->start, 0);
264 if (IS_ERR(cow)) 263 if (IS_ERR(cow))
265 return PTR_ERR(cow); 264 return PTR_ERR(cow);
266 265
@@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1133 } else 1132 } else
1134 parent_start = 0; 1133 parent_start = 0;
1135 1134
1136 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 1135 cow = btrfs_alloc_tree_block(trans, root, parent_start,
1137 root->root_key.objectid, &disk_key, 1136 root->root_key.objectid, &disk_key, level,
1138 level, search_start, empty_size); 1137 search_start, empty_size);
1139 if (IS_ERR(cow)) 1138 if (IS_ERR(cow))
1140 return PTR_ERR(cow); 1139 return PTR_ERR(cow);
1141 1140
@@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1425 struct tree_mod_root *old_root = NULL; 1424 struct tree_mod_root *old_root = NULL;
1426 u64 old_generation = 0; 1425 u64 old_generation = 0;
1427 u64 logical; 1426 u64 logical;
1428 u32 blocksize;
1429 1427
1430 eb_root = btrfs_read_lock_root_node(root); 1428 eb_root = btrfs_read_lock_root_node(root);
1431 tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); 1429 tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1444 if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 1442 if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1445 btrfs_tree_read_unlock(eb_root); 1443 btrfs_tree_read_unlock(eb_root);
1446 free_extent_buffer(eb_root); 1444 free_extent_buffer(eb_root);
1447 blocksize = btrfs_level_size(root, old_root->level); 1445 old = read_tree_block(root, logical, 0);
1448 old = read_tree_block(root, logical, blocksize, 0);
1449 if (WARN_ON(!old || !extent_buffer_uptodate(old))) { 1446 if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1450 free_extent_buffer(old); 1447 free_extent_buffer(old);
1451 btrfs_warn(root->fs_info, 1448 btrfs_warn(root->fs_info,
@@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
1506 struct btrfs_root *root, 1503 struct btrfs_root *root,
1507 struct extent_buffer *buf) 1504 struct extent_buffer *buf)
1508{ 1505{
1509#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1506 if (btrfs_test_is_dummy_root(root))
1510 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
1511 return 0; 1507 return 0;
1512#endif 1508
1513 /* ensure we can see the force_cow */ 1509 /* ensure we can see the force_cow */
1514 smp_rmb(); 1510 smp_rmb();
1515 1511
@@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1651 WARN_ON(trans->transid != root->fs_info->generation); 1647 WARN_ON(trans->transid != root->fs_info->generation);
1652 1648
1653 parent_nritems = btrfs_header_nritems(parent); 1649 parent_nritems = btrfs_header_nritems(parent);
1654 blocksize = btrfs_level_size(root, parent_level - 1); 1650 blocksize = root->nodesize;
1655 end_slot = parent_nritems; 1651 end_slot = parent_nritems;
1656 1652
1657 if (parent_nritems == 1) 1653 if (parent_nritems == 1)
@@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1685 continue; 1681 continue;
1686 } 1682 }
1687 1683
1688 cur = btrfs_find_tree_block(root, blocknr, blocksize); 1684 cur = btrfs_find_tree_block(root, blocknr);
1689 if (cur) 1685 if (cur)
1690 uptodate = btrfs_buffer_uptodate(cur, gen, 0); 1686 uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1691 else 1687 else
1692 uptodate = 0; 1688 uptodate = 0;
1693 if (!cur || !uptodate) { 1689 if (!cur || !uptodate) {
1694 if (!cur) { 1690 if (!cur) {
1695 cur = read_tree_block(root, blocknr, 1691 cur = read_tree_block(root, blocknr, gen);
1696 blocksize, gen);
1697 if (!cur || !extent_buffer_uptodate(cur)) { 1692 if (!cur || !extent_buffer_uptodate(cur)) {
1698 free_extent_buffer(cur); 1693 free_extent_buffer(cur);
1699 return -EIO; 1694 return -EIO;
@@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1872 BUG_ON(level == 0); 1867 BUG_ON(level == 0);
1873 1868
1874 eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), 1869 eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1875 btrfs_level_size(root, level - 1),
1876 btrfs_node_ptr_generation(parent, slot)); 1870 btrfs_node_ptr_generation(parent, slot));
1877 if (eb && !extent_buffer_uptodate(eb)) { 1871 if (eb && !extent_buffer_uptodate(eb)) {
1878 free_extent_buffer(eb); 1872 free_extent_buffer(eb);
@@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root,
2267 node = path->nodes[level]; 2261 node = path->nodes[level];
2268 2262
2269 search = btrfs_node_blockptr(node, slot); 2263 search = btrfs_node_blockptr(node, slot);
2270 blocksize = btrfs_level_size(root, level - 1); 2264 blocksize = root->nodesize;
2271 eb = btrfs_find_tree_block(root, search, blocksize); 2265 eb = btrfs_find_tree_block(root, search);
2272 if (eb) { 2266 if (eb) {
2273 free_extent_buffer(eb); 2267 free_extent_buffer(eb);
2274 return; 2268 return;
@@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root,
2298 if ((search <= target && target - search <= 65536) || 2292 if ((search <= target && target - search <= 65536) ||
2299 (search > target && search - target <= 65536)) { 2293 (search > target && search - target <= 65536)) {
2300 gen = btrfs_node_ptr_generation(node, nr); 2294 gen = btrfs_node_ptr_generation(node, nr);
2301 readahead_tree_block(root, search, blocksize, gen); 2295 readahead_tree_block(root, search, blocksize);
2302 nread += blocksize; 2296 nread += blocksize;
2303 } 2297 }
2304 nscan++; 2298 nscan++;
@@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2325 2319
2326 nritems = btrfs_header_nritems(parent); 2320 nritems = btrfs_header_nritems(parent);
2327 slot = path->slots[level + 1]; 2321 slot = path->slots[level + 1];
2328 blocksize = btrfs_level_size(root, level); 2322 blocksize = root->nodesize;
2329 2323
2330 if (slot > 0) { 2324 if (slot > 0) {
2331 block1 = btrfs_node_blockptr(parent, slot - 1); 2325 block1 = btrfs_node_blockptr(parent, slot - 1);
2332 gen = btrfs_node_ptr_generation(parent, slot - 1); 2326 gen = btrfs_node_ptr_generation(parent, slot - 1);
2333 eb = btrfs_find_tree_block(root, block1, blocksize); 2327 eb = btrfs_find_tree_block(root, block1);
2334 /* 2328 /*
 2335 * if we get -EAGAIN from btrfs_buffer_uptodate, we 2329 * if we get -EAGAIN from btrfs_buffer_uptodate, we
 2336 * don't want to return EAGAIN here. That will loop 2330 * don't want to return EAGAIN here. That will loop
@@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2343 if (slot + 1 < nritems) { 2337 if (slot + 1 < nritems) {
2344 block2 = btrfs_node_blockptr(parent, slot + 1); 2338 block2 = btrfs_node_blockptr(parent, slot + 1);
2345 gen = btrfs_node_ptr_generation(parent, slot + 1); 2339 gen = btrfs_node_ptr_generation(parent, slot + 1);
2346 eb = btrfs_find_tree_block(root, block2, blocksize); 2340 eb = btrfs_find_tree_block(root, block2);
2347 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2341 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2348 block2 = 0; 2342 block2 = 0;
2349 free_extent_buffer(eb); 2343 free_extent_buffer(eb);
2350 } 2344 }
2351 2345
2352 if (block1) 2346 if (block1)
2353 readahead_tree_block(root, block1, blocksize, 0); 2347 readahead_tree_block(root, block1, blocksize);
2354 if (block2) 2348 if (block2)
2355 readahead_tree_block(root, block2, blocksize, 0); 2349 readahead_tree_block(root, block2, blocksize);
2356} 2350}
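
The two readahead calls above prefetch the left and right siblings of the slot being rebalanced, when they exist. A reduced model of just the slot arithmetic (stand-in arrays, made-up block numbers):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* pretend node: block pointers of the children, as in a btrfs node */
        uint64_t blockptr[] = { 4096, 8192, 12288, 16384 };
        int nritems = 4;
        int slot = 2;                   /* child being balanced */
        uint64_t block1 = 0, block2 = 0;

        if (slot > 0)                   /* left sibling exists */
                block1 = blockptr[slot - 1];
        if (slot + 1 < nritems)         /* right sibling exists */
                block2 = blockptr[slot + 1];

        if (block1)
                printf("readahead left sibling at %llu\n",
                       (unsigned long long)block1);
        if (block2)
                printf("readahead right sibling at %llu\n",
                       (unsigned long long)block2);
        return 0;
}
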
2357 2351
2358 2352
@@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2454{ 2448{
2455 u64 blocknr; 2449 u64 blocknr;
2456 u64 gen; 2450 u64 gen;
2457 u32 blocksize;
2458 struct extent_buffer *b = *eb_ret; 2451 struct extent_buffer *b = *eb_ret;
2459 struct extent_buffer *tmp; 2452 struct extent_buffer *tmp;
2460 int ret; 2453 int ret;
2461 2454
2462 blocknr = btrfs_node_blockptr(b, slot); 2455 blocknr = btrfs_node_blockptr(b, slot);
2463 gen = btrfs_node_ptr_generation(b, slot); 2456 gen = btrfs_node_ptr_generation(b, slot);
2464 blocksize = btrfs_level_size(root, level - 1);
2465 2457
2466 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2458 tmp = btrfs_find_tree_block(root, blocknr);
2467 if (tmp) { 2459 if (tmp) {
2468 /* first we do an atomic uptodate check */ 2460 /* first we do an atomic uptodate check */
2469 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2461 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2507 btrfs_release_path(p); 2499 btrfs_release_path(p);
2508 2500
2509 ret = -EAGAIN; 2501 ret = -EAGAIN;
2510 tmp = read_tree_block(root, blocknr, blocksize, 0); 2502 tmp = read_tree_block(root, blocknr, 0);
2511 if (tmp) { 2503 if (tmp) {
2512 /* 2504 /*
2513 * If the read above didn't mark this buffer up to date, 2505 * If the read above didn't mark this buffer up to date,
@@ -2792,8 +2784,6 @@ again:
2792 if (!should_cow_block(trans, root, b)) 2784 if (!should_cow_block(trans, root, b))
2793 goto cow_done; 2785 goto cow_done;
2794 2786
2795 btrfs_set_path_blocking(p);
2796
2797 /* 2787 /*
2798 * must have write locks on this node and the 2788 * must have write locks on this node and the
2799 * parent 2789 * parent
@@ -2807,6 +2797,7 @@ again:
2807 goto again; 2797 goto again;
2808 } 2798 }
2809 2799
2800 btrfs_set_path_blocking(p);
2810 err = btrfs_cow_block(trans, root, b, 2801 err = btrfs_cow_block(trans, root, b,
2811 p->nodes[level + 1], 2802 p->nodes[level + 1],
2812 p->slots[level + 1], &b); 2803 p->slots[level + 1], &b);
@@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3362 else 3353 else
3363 btrfs_node_key(lower, &lower_key, 0); 3354 btrfs_node_key(lower, &lower_key, 0);
3364 3355
3365 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3356 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3366 root->root_key.objectid, &lower_key, 3357 &lower_key, level, root->node->start, 0);
3367 level, root->node->start, 0);
3368 if (IS_ERR(c)) 3358 if (IS_ERR(c))
3369 return PTR_ERR(c); 3359 return PTR_ERR(c);
3370 3360
@@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3502 mid = (c_nritems + 1) / 2; 3492 mid = (c_nritems + 1) / 2;
3503 btrfs_node_key(c, &disk_key, mid); 3493 btrfs_node_key(c, &disk_key, mid);
3504 3494
3505 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3495 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3506 root->root_key.objectid, 3496 &disk_key, level, c->start, 0);
3507 &disk_key, level, c->start, 0);
3508 if (IS_ERR(split)) 3497 if (IS_ERR(split))
3509 return PTR_ERR(split); 3498 return PTR_ERR(split);
3510 3499
@@ -4282,13 +4271,12 @@ again:
4282 else 4271 else
4283 btrfs_item_key(l, &disk_key, mid); 4272 btrfs_item_key(l, &disk_key, mid);
4284 4273
4285 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 4274 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4286 root->root_key.objectid, 4275 &disk_key, 0, l->start, 0);
4287 &disk_key, 0, l->start, 0);
4288 if (IS_ERR(right)) 4276 if (IS_ERR(right))
4289 return PTR_ERR(right); 4277 return PTR_ERR(right);
4290 4278
4291 root_add_used(root, root->leafsize); 4279 root_add_used(root, root->nodesize);
4292 4280
4293 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 4281 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4294 btrfs_set_header_bytenr(right, right->start); 4282 btrfs_set_header_bytenr(right, right->start);
@@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4626 ptr = btrfs_item_ptr_offset(leaf, slot); 4614 ptr = btrfs_item_ptr_offset(leaf, slot);
4627 memmove_extent_buffer(leaf, ptr, 4615 memmove_extent_buffer(leaf, ptr,
4628 (unsigned long)fi, 4616 (unsigned long)fi,
4629 offsetof(struct btrfs_file_extent_item, 4617 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4630 disk_bytenr));
4631 } 4618 }
4632 } 4619 }
4633 4620
@@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4738 int slot; 4725 int slot;
4739 struct btrfs_map_token token; 4726 struct btrfs_map_token token;
4740 4727
4728 if (path->slots[0] == 0) {
4729 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4730 fixup_low_keys(root, path, &disk_key, 1);
4731 }
4732 btrfs_unlock_up_safe(path, 1);
4733
4741 btrfs_init_map_token(&token); 4734 btrfs_init_map_token(&token);
4742 4735
4743 leaf = path->nodes[0]; 4736 leaf = path->nodes[0];
@@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4798 } 4791 }
4799 4792
4800 btrfs_set_header_nritems(leaf, nritems + nr); 4793 btrfs_set_header_nritems(leaf, nritems + nr);
4801
4802 if (slot == 0) {
4803 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4804 fixup_low_keys(root, path, &disk_key, 1);
4805 }
4806 btrfs_unlock_up_safe(path, 1);
4807 btrfs_mark_buffer_dirty(leaf); 4794 btrfs_mark_buffer_dirty(leaf);
4808 4795
4809 if (btrfs_leaf_free_space(root, leaf) < 0) { 4796 if (btrfs_leaf_free_space(root, leaf) < 0) {
@@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5145 u32 nritems; 5132 u32 nritems;
5146 int level; 5133 int level;
5147 int ret = 1; 5134 int ret = 1;
5135 int keep_locks = path->keep_locks;
5148 5136
5149 WARN_ON(!path->keep_locks); 5137 path->keep_locks = 1;
5150again: 5138again:
5151 cur = btrfs_read_lock_root_node(root); 5139 cur = btrfs_read_lock_root_node(root);
5152 level = btrfs_header_level(cur); 5140 level = btrfs_header_level(cur);
@@ -5210,7 +5198,6 @@ find_next_key:
5210 path->slots[level] = slot; 5198 path->slots[level] = slot;
5211 if (level == path->lowest_level) { 5199 if (level == path->lowest_level) {
5212 ret = 0; 5200 ret = 0;
5213 unlock_up(path, level, 1, 0, NULL);
5214 goto out; 5201 goto out;
5215 } 5202 }
5216 btrfs_set_path_blocking(path); 5203 btrfs_set_path_blocking(path);
@@ -5225,9 +5212,12 @@ find_next_key:
5225 btrfs_clear_path_blocking(path, NULL, 0); 5212 btrfs_clear_path_blocking(path, NULL, 0);
5226 } 5213 }
5227out: 5214out:
5228 if (ret == 0) 5215 path->keep_locks = keep_locks;
5216 if (ret == 0) {
5217 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5218 btrfs_set_path_blocking(path);
5229 memcpy(min_key, &found_key, sizeof(found_key)); 5219 memcpy(min_key, &found_key, sizeof(found_key));
5230 btrfs_set_path_blocking(path); 5220 }
5231 return ret; 5221 return ret;
5232} 5222}
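
Instead of warning when the caller forgot path->keep_locks, btrfs_search_forward() now saves the flag, forces it on for the duration of the search, and restores it on exit. A cut-down model of that save/set/restore flow, with the path reduced to the one field involved:

#include <stdio.h>

struct btrfs_path { int keep_locks; };

static int btrfs_search_forward(struct btrfs_path *path)
{
        int keep_locks = path->keep_locks;      /* save the caller's setting */
        int ret;

        path->keep_locks = 1;                   /* enforce what we need */
        ret = 0;                                /* ... the actual search ... */
        path->keep_locks = keep_locks;          /* restore on every exit */
        return ret;
}

int main(void)
{
        struct btrfs_path path = { .keep_locks = 0 };

        btrfs_search_forward(&path);
        printf("keep_locks after the call: %d\n", path.keep_locks);
        return 0;
}
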
5233 5223
@@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5375 goto out; 5365 goto out;
5376 } 5366 }
5377 5367
5378 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); 5368 tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5379 if (!tmp_buf) { 5369 if (!tmp_buf) {
5380 ret = -ENOMEM; 5370 ret = -ENOMEM;
5381 goto out; 5371 goto out;
@@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5520 goto out; 5510 goto out;
5521 advance_right = ADVANCE; 5511 advance_right = ADVANCE;
5522 } else { 5512 } else {
5523 enum btrfs_compare_tree_result cmp; 5513 enum btrfs_compare_tree_result result;
5524 5514
5525 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5515 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5526 ret = tree_compare_item(left_root, left_path, 5516 ret = tree_compare_item(left_root, left_path,
5527 right_path, tmp_buf); 5517 right_path, tmp_buf);
5528 if (ret) 5518 if (ret)
5529 cmp = BTRFS_COMPARE_TREE_CHANGED; 5519 result = BTRFS_COMPARE_TREE_CHANGED;
5530 else 5520 else
5531 cmp = BTRFS_COMPARE_TREE_SAME; 5521 result = BTRFS_COMPARE_TREE_SAME;
5532 ret = changed_cb(left_root, right_root, 5522 ret = changed_cb(left_root, right_root,
5533 left_path, right_path, 5523 left_path, right_path,
5534 &left_key, cmp, ctx); 5524 &left_key, result, ctx);
5535 if (ret < 0) 5525 if (ret < 0)
5536 goto out; 5526 goto out;
5537 advance_left = ADVANCE; 5527 advance_left = ADVANCE;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8e29b614fe93..d557264ee974 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
34#include <linux/pagemap.h> 34#include <linux/pagemap.h>
35#include <linux/btrfs.h> 35#include <linux/btrfs.h>
36#include <linux/workqueue.h> 36#include <linux/workqueue.h>
37#include <linux/security.h>
37#include "extent_io.h" 38#include "extent_io.h"
38#include "extent_map.h" 39#include "extent_map.h"
39#include "async-thread.h" 40#include "async-thread.h"
@@ -62,13 +63,6 @@ struct btrfs_ordered_sum;
62 63
63#define BTRFS_COMPAT_EXTENT_TREE_V0 64#define BTRFS_COMPAT_EXTENT_TREE_V0
64 65
65/*
66 * files bigger than this get some pre-flushing when they are added
67 * to the ordered operations list. That way we limit the total
68 * work done by the commit
69 */
70#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
71
72/* holds pointers to all of the tree roots */ 66/* holds pointers to all of the tree roots */
73#define BTRFS_ROOT_TREE_OBJECTID 1ULL 67#define BTRFS_ROOT_TREE_OBJECTID 1ULL
74 68
@@ -391,10 +385,12 @@ struct btrfs_header {
391 sizeof(struct btrfs_header)) / \ 385 sizeof(struct btrfs_header)) / \
392 sizeof(struct btrfs_key_ptr)) 386 sizeof(struct btrfs_key_ptr))
393#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) 387#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
394#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) 388#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
389#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
390 (offsetof(struct btrfs_file_extent_item, disk_bytenr))
395#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ 391#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
396 sizeof(struct btrfs_item) - \ 392 sizeof(struct btrfs_item) - \
397 sizeof(struct btrfs_file_extent_item)) 393 BTRFS_FILE_EXTENT_INLINE_DATA_START)
398#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ 394#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
399 sizeof(struct btrfs_item) -\ 395 sizeof(struct btrfs_item) -\
400 sizeof(struct btrfs_dir_item)) 396 sizeof(struct btrfs_dir_item))
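
BTRFS_FILE_EXTENT_INLINE_DATA_START names the convention that inline file data begins at the disk_bytenr member, replacing repeated offsetof() expressions. A compile-checked userspace sketch; the struct below is trimmed to the leading fields, so treat the exact layout as illustrative:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct btrfs_file_extent_item {
        uint64_t generation;
        uint64_t ram_bytes;
        uint8_t  compression;
        uint8_t  encryption;
        uint16_t other_encoding;
        uint8_t  type;
        /* for inline extents, file data starts here */
        uint64_t disk_bytenr;
        uint64_t disk_num_bytes;
} __attribute__((packed));

#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
        (offsetof(struct btrfs_file_extent_item, disk_bytenr))

/* mirrors btrfs_file_extent_inline_item_len() after the cleanup */
static uint32_t inline_item_len(uint32_t item_size)
{
        return item_size - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}

int main(void)
{
        printf("inline data starts %zu bytes into the item\n",
               BTRFS_FILE_EXTENT_INLINE_DATA_START);
        printf("a 100-byte item holds %u bytes of file data\n",
               inline_item_len(100));
        return 0;
}
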
@@ -474,7 +470,7 @@ struct btrfs_super_block {
474 __le64 num_devices; 470 __le64 num_devices;
475 __le32 sectorsize; 471 __le32 sectorsize;
476 __le32 nodesize; 472 __le32 nodesize;
477 __le32 leafsize; 473 __le32 __unused_leafsize;
478 __le32 stripesize; 474 __le32 stripesize;
479 __le32 sys_chunk_array_size; 475 __le32 sys_chunk_array_size;
480 __le64 chunk_root_generation; 476 __le64 chunk_root_generation;
@@ -903,6 +899,8 @@ struct btrfs_file_extent_item {
903 /* 899 /*
904 * disk space consumed by the extent, checksum blocks are included 900 * disk space consumed by the extent, checksum blocks are included
905 * in these numbers 901 * in these numbers
902 *
 903 * At this offset in the structure, the inline extent data starts.
906 */ 904 */
907 __le64 disk_bytenr; 905 __le64 disk_bytenr;
908 __le64 disk_num_bytes; 906 __le64 disk_num_bytes;
@@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache {
1305 */ 1303 */
1306 struct list_head cluster_list; 1304 struct list_head cluster_list;
1307 1305
1308 /* For delayed block group creation */ 1306 /* For delayed block group creation or deletion of empty block groups */
1309 struct list_head new_bg_list; 1307 struct list_head bg_list;
1310}; 1308};
1311 1309
1312/* delayed seq elem */ 1310/* delayed seq elem */
@@ -1545,6 +1543,7 @@ struct btrfs_fs_info {
1545 struct btrfs_workqueue *endio_workers; 1543 struct btrfs_workqueue *endio_workers;
1546 struct btrfs_workqueue *endio_meta_workers; 1544 struct btrfs_workqueue *endio_meta_workers;
1547 struct btrfs_workqueue *endio_raid56_workers; 1545 struct btrfs_workqueue *endio_raid56_workers;
1546 struct btrfs_workqueue *endio_repair_workers;
1548 struct btrfs_workqueue *rmw_workers; 1547 struct btrfs_workqueue *rmw_workers;
1549 struct btrfs_workqueue *endio_meta_write_workers; 1548 struct btrfs_workqueue *endio_meta_write_workers;
1550 struct btrfs_workqueue *endio_write_workers; 1549 struct btrfs_workqueue *endio_write_workers;
@@ -1574,6 +1573,7 @@ struct btrfs_fs_info {
1574 int do_barriers; 1573 int do_barriers;
1575 int closing; 1574 int closing;
1576 int log_root_recovering; 1575 int log_root_recovering;
1576 int open;
1577 1577
1578 u64 total_pinned; 1578 u64 total_pinned;
1579 1579
@@ -1723,6 +1723,12 @@ struct btrfs_fs_info {
1723 1723
1724 /* Used to reclaim the metadata space in the background. */ 1724 /* Used to reclaim the metadata space in the background. */
1725 struct work_struct async_reclaim_work; 1725 struct work_struct async_reclaim_work;
1726
1727 spinlock_t unused_bgs_lock;
1728 struct list_head unused_bgs;
1729
1730 /* For btrfs to record security options */
1731 struct security_mnt_opts security_opts;
1726}; 1732};
1727 1733
1728struct btrfs_subvolume_writers { 1734struct btrfs_subvolume_writers {
@@ -1776,12 +1782,12 @@ struct btrfs_root {
1776 1782
1777 /* free ino cache stuff */ 1783 /* free ino cache stuff */
1778 struct btrfs_free_space_ctl *free_ino_ctl; 1784 struct btrfs_free_space_ctl *free_ino_ctl;
1779 enum btrfs_caching_type cached; 1785 enum btrfs_caching_type ino_cache_state;
1780 spinlock_t cache_lock; 1786 spinlock_t ino_cache_lock;
1781 wait_queue_head_t cache_wait; 1787 wait_queue_head_t ino_cache_wait;
1782 struct btrfs_free_space_ctl *free_ino_pinned; 1788 struct btrfs_free_space_ctl *free_ino_pinned;
1783 u64 cache_progress; 1789 u64 ino_cache_progress;
1784 struct inode *cache_inode; 1790 struct inode *ino_cache_inode;
1785 1791
1786 struct mutex log_mutex; 1792 struct mutex log_mutex;
1787 wait_queue_head_t log_writer_wait; 1793 wait_queue_head_t log_writer_wait;
@@ -1806,18 +1812,14 @@ struct btrfs_root {
1806 /* node allocations are done in nodesize units */ 1812 /* node allocations are done in nodesize units */
1807 u32 nodesize; 1813 u32 nodesize;
1808 1814
1809 /* leaf allocations are done in leafsize units */
1810 u32 leafsize;
1811
1812 u32 stripesize; 1815 u32 stripesize;
1813 1816
1814 u32 type; 1817 u32 type;
1815 1818
1816 u64 highest_objectid; 1819 u64 highest_objectid;
1817 1820
1818#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1821 /* only used when CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
1819 u64 alloc_bytenr; 1822 u64 alloc_bytenr;
1820#endif
1821 1823
1822 u64 defrag_trans_start; 1824 u64 defrag_trans_start;
1823 struct btrfs_key defrag_progress; 1825 struct btrfs_key defrag_progress;
@@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args {
2094#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) 2096#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
2095 2097
2096#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2098#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2099#define BTRFS_DEFAULT_MAX_INLINE (8192)
2097 2100
2098#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 2101#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
2099#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 2102#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
2995 sectorsize, 32); 2998 sectorsize, 32);
2996BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, 2999BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
2997 nodesize, 32); 3000 nodesize, 32);
2998BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
2999 leafsize, 32);
3000BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, 3001BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
3001 stripesize, 32); 3002 stripesize, 32);
3002BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, 3003BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
@@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
3049static inline unsigned long 3050static inline unsigned long
3050btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) 3051btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
3051{ 3052{
3052 unsigned long offset = (unsigned long)e; 3053 return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
3053 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
3054 return offset;
3055} 3054}
3056 3055
3057static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) 3056static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
3058{ 3057{
3059 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; 3058 return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
3060} 3059}
3061 3060
3062BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, 3061BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
@@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
3086static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, 3085static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
3087 struct btrfs_item *e) 3086 struct btrfs_item *e)
3088{ 3087{
3089 unsigned long offset; 3088 return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
3090 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
3091 return btrfs_item_size(eb, e) - offset;
3092} 3089}
3093 3090
3094/* this returns the number of file bytes represented by the inline item. 3091/* this returns the number of file bytes represented by the inline item.
@@ -3232,13 +3229,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
3232 return sb->s_fs_info; 3229 return sb->s_fs_info;
3233} 3230}
3234 3231
3235static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
3236{
3237 if (level == 0)
3238 return root->leafsize;
3239 return root->nodesize;
3240}
3241
3242/* helper function to cast into the data area of the leaf. */ 3232/* helper function to cast into the data area of the leaf. */
3243#define btrfs_item_ptr(leaf, slot, type) \ 3233#define btrfs_item_ptr(leaf, slot, type) \
3244 ((type *)(btrfs_leaf_data(leaf) + \ 3234 ((type *)(btrfs_leaf_data(leaf) + \
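
The deleted btrfs_level_size() only distinguished level 0 (leafsize) from everything else (nodesize), and mkfs has long written the two values equal, so every caller can read root->nodesize directly. A tiny check of that invariant, with the removed helper reproduced for comparison and the root cut down to one field:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct btrfs_root { uint32_t nodesize; };

/* the removed helper, kept here for comparison only; leafsize is passed
 * explicitly because the stub root no longer carries the field */
static uint32_t btrfs_level_size(const struct btrfs_root *root,
                                 uint32_t leafsize, int level)
{
        if (level == 0)
                return leafsize;
        return root->nodesize;
}

int main(void)
{
        struct btrfs_root root = { .nodesize = 16384 };
        uint32_t leafsize = root.nodesize;  /* the invariant the patch relies on */

        for (int level = 0; level < 8; level++)
                assert(btrfs_level_size(&root, leafsize, level) == root.nodesize);
        printf("every level is %u bytes\n", root.nodesize);
        return 0;
}
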
@@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
3263static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3253static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3264 unsigned num_items) 3254 unsigned num_items)
3265{ 3255{
3266 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3256 return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3267 2 * num_items; 3257 2 * num_items;
3268} 3258}
3269 3259
@@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3274static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, 3264static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
3275 unsigned num_items) 3265 unsigned num_items)
3276{ 3266{
3277 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3267 return root->nodesize * BTRFS_MAX_LEVEL * num_items;
3278 num_items;
3279} 3268}
3280 3269
3281int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 3270int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
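
With leafsize == nodesize, the old (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) term is simply nodesize * BTRFS_MAX_LEVEL, which is what the truncate variant now spells out. A quick numeric check using the common 16K node size (values illustrative only):

#include <stdio.h>
#include <stdint.h>

#define BTRFS_MAX_LEVEL 8

/* one fully COW'd path per item, doubled for splits; algebraically equal
 * to the (nodesize + nodesize * (MAX_LEVEL - 1)) * 2 form kept in the tree */
static uint64_t calc_trans_metadata_size(uint32_t nodesize, unsigned num_items)
{
        return (uint64_t)nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}

/* truncate only needs the single path */
static uint64_t calc_trunc_metadata_size(uint32_t nodesize, unsigned num_items)
{
        return (uint64_t)nodesize * BTRFS_MAX_LEVEL * num_items;
}

int main(void)
{
        uint32_t nodesize = 16384;

        printf("insert 4 items: reserve %llu bytes\n",
               (unsigned long long)calc_trans_metadata_size(nodesize, 4));
        printf("truncate 4 items: reserve %llu bytes\n",
               (unsigned long long)calc_trunc_metadata_size(nodesize, 4));
        return 0;
}
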
@@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
3305 u64 bytenr); 3294 u64 bytenr);
3306void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3295void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3307int get_block_group_index(struct btrfs_block_group_cache *cache); 3296int get_block_group_index(struct btrfs_block_group_cache *cache);
3308struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 3297struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root, u32 blocksize, 3298 struct btrfs_root *root, u64 parent,
3310 u64 parent, u64 root_objectid, 3299 u64 root_objectid,
3311 struct btrfs_disk_key *key, int level, 3300 struct btrfs_disk_key *key, int level,
3312 u64 hint, u64 empty_size); 3301 u64 hint, u64 empty_size);
3313void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 3302void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
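
Caller-side view of the btrfs_alloc_free_block() to btrfs_alloc_tree_block() rename: the u32 blocksize argument disappears and everything else keeps its meaning. The stub allocator below only echoes the implied size; all types here are stand-ins for the real ones:

#include <stdio.h>
#include <stdint.h>

struct btrfs_trans_handle { int unused; };
struct btrfs_root { uint32_t nodesize; };
struct btrfs_disk_key { uint64_t objectid; };
struct extent_buffer { uint32_t len; };

static struct extent_buffer *btrfs_alloc_tree_block(
                struct btrfs_trans_handle *trans, struct btrfs_root *root,
                uint64_t parent, uint64_t root_objectid,
                struct btrfs_disk_key *key, int level,
                uint64_t hint, uint64_t empty_size)
{
        static struct extent_buffer eb;

        (void)trans; (void)parent; (void)root_objectid;
        (void)key; (void)level; (void)hint; (void)empty_size;
        eb.len = root->nodesize;  /* the size is implied, no longer passed */
        return &eb;
}

int main(void)
{
        struct btrfs_trans_handle trans = { 0 };
        struct btrfs_root root = { .nodesize = 16384 };
        struct extent_buffer *eb =
                btrfs_alloc_tree_block(&trans, &root, 0, 1, NULL, 0, 0, 0);

        printf("allocated a %u byte tree block\n", eb->len);
        return 0;
}
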
@@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3363 u64 size); 3352 u64 size);
3364int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3353int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3365 struct btrfs_root *root, u64 group_start); 3354 struct btrfs_root *root, u64 group_start);
3355void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
3366void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 3356void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
3367 struct btrfs_root *root); 3357 struct btrfs_root *root);
3368u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3358u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
3604 kfree(fs_info->uuid_root); 3594 kfree(fs_info->uuid_root);
3605 kfree(fs_info->super_copy); 3595 kfree(fs_info->super_copy);
3606 kfree(fs_info->super_for_commit); 3596 kfree(fs_info->super_for_commit);
3597 security_free_mnt_opts(&fs_info->security_opts);
3607 kfree(fs_info); 3598 kfree(fs_info);
3608} 3599}
3609 3600
@@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
3739int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 3730int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
3740 struct bio *bio, u32 *dst); 3731 struct bio *bio, u32 *dst);
3741int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 3732int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
3742 struct btrfs_dio_private *dip, struct bio *bio, 3733 struct bio *bio, u64 logical_offset);
3743 u64 logical_offset);
3744int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 3734int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
3745 struct btrfs_root *root, 3735 struct btrfs_root *root,
3746 u64 objectid, u64 pos, 3736 u64 objectid, u64 pos,
@@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
4141/* Sanity test specific functions */ 4131/* Sanity test specific functions */
4142#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4132#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4143void btrfs_test_destroy_inode(struct inode *inode); 4133void btrfs_test_destroy_inode(struct inode *inode);
4144int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
4145 u64 rfer, u64 excl);
4146#endif 4134#endif
4147 4135
4136static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
4137{
4138#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4139 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
4140 return 1;
4141#endif
4142 return 0;
4143}
4144
4148#endif 4145#endif
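
btrfs_test_is_dummy_root() hoists the sanity-test #ifdef out of every call site: with the option disabled it is a constant 0 and the compiler drops the test-only branch. A stand-alone analogue; the bit number and state encoding are placeholders, not the kernel's:

#include <stdio.h>

/* #define CONFIG_BTRFS_FS_RUN_SANITY_TESTS */

struct btrfs_root { unsigned long state; };
#define BTRFS_ROOT_DUMMY_ROOT 0UL       /* placeholder bit position */

static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
        if (root->state & (1UL << BTRFS_ROOT_DUMMY_ROOT))
                return 1;
#endif
        (void)root;
        return 0;
}

int main(void)
{
        struct btrfs_root root = { .state = 1UL };

        if (btrfs_test_is_dummy_root(&root))
                printf("dummy root: take the test shortcut\n");
        else
                printf("real root: normal code path\n");
        return 0;
}
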
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index da775bfdebc9..054577bddaf2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1042 int ret; 1042 int ret;
1043 1043
1044 key.objectid = node->inode_id; 1044 key.objectid = node->inode_id;
1045 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 1045 key.type = BTRFS_INODE_ITEM_KEY;
1046 key.offset = 0; 1046 key.offset = 0;
1047 1047
1048 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) 1048 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
@@ -1099,7 +1099,7 @@ err_out:
1099search: 1099search:
1100 btrfs_release_path(path); 1100 btrfs_release_path(path);
1101 1101
1102 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); 1102 key.type = BTRFS_INODE_EXTREF_KEY;
1103 key.offset = -1; 1103 key.offset = -1;
1104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1105 if (ret < 0) 1105 if (ret < 0)
@@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1395 return -ENOMEM; 1395 return -ENOMEM;
1396 1396
1397 async_work->delayed_root = delayed_root; 1397 async_work->delayed_root = delayed_root;
1398 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, 1398 btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
1399 NULL, NULL); 1399 btrfs_async_run_delayed_root, NULL, NULL);
1400 async_work->nr = nr; 1400 async_work->nr = nr;
1401 1401
1402 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); 1402 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
@@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1473 } 1473 }
1474 1474
1475 delayed_item->key.objectid = btrfs_ino(dir); 1475 delayed_item->key.objectid = btrfs_ino(dir);
1476 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1476 delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
1477 delayed_item->key.offset = index; 1477 delayed_item->key.offset = index;
1478 1478
1479 dir_item = (struct btrfs_dir_item *)delayed_item->data; 1479 dir_item = (struct btrfs_dir_item *)delayed_item->data;
@@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1542 return PTR_ERR(node); 1542 return PTR_ERR(node);
1543 1543
1544 item_key.objectid = btrfs_ino(dir); 1544 item_key.objectid = btrfs_ino(dir);
1545 btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); 1545 item_key.type = BTRFS_DIR_INDEX_KEY;
1546 item_key.offset = index; 1546 item_key.offset = index;
1547 1547
1548 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); 1548 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
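
The btrfs_set_key_type() wrapper bought nothing: the type field is a plain u8 in both the cpu and the on-disk key, so no endian conversion is needed and direct assignment is equivalent. A sketch with the key trimmed to its three logical fields (the real cpu key is declared differently):

#include <stdio.h>
#include <stdint.h>

#define BTRFS_DIR_INDEX_KEY 96  /* on-disk type value */

struct btrfs_key {
        uint64_t objectid;
        uint8_t  type;          /* u8 on disk and in memory alike */
        uint64_t offset;
};

int main(void)
{
        struct btrfs_key key;

        key.objectid = 256;                     /* some inode number */
        key.type = BTRFS_DIR_INDEX_KEY;         /* was btrfs_set_key_type() */
        key.offset = 2;                         /* directory index */

        printf("key (%llu %u %llu)\n", (unsigned long long)key.objectid,
               key.type, (unsigned long long)key.offset);
        return 0;
}
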
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index eea26e1b2fda..6f662b34ba0e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found:
168 dev_replace->srcdev->total_bytes; 168 dev_replace->srcdev->total_bytes;
169 dev_replace->tgtdev->disk_total_bytes = 169 dev_replace->tgtdev->disk_total_bytes =
170 dev_replace->srcdev->disk_total_bytes; 170 dev_replace->srcdev->disk_total_bytes;
171 dev_replace->tgtdev->commit_total_bytes =
172 dev_replace->srcdev->commit_total_bytes;
171 dev_replace->tgtdev->bytes_used = 173 dev_replace->tgtdev->bytes_used =
172 dev_replace->srcdev->bytes_used; 174 dev_replace->srcdev->bytes_used;
175 dev_replace->tgtdev->commit_bytes_used =
176 dev_replace->srcdev->commit_bytes_used;
173 } 177 }
174 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; 178 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
175 btrfs_init_dev_replace_tgtdev_for_resume(fs_info, 179 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
@@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
329 args->start.tgtdev_name[0] == '\0') 333 args->start.tgtdev_name[0] == '\0')
330 return -EINVAL; 334 return -EINVAL;
331 335
332 mutex_lock(&fs_info->volume_mutex); 336 /*
333 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 337 * Here we commit the transaction to make sure commit_total_bytes
334 &tgt_device); 338 * of all the devices are updated.
335 if (ret) { 339 */
336 btrfs_err(fs_info, "target device %s is invalid!", 340 trans = btrfs_attach_transaction(root);
337 args->start.tgtdev_name); 341 if (!IS_ERR(trans)) {
338 mutex_unlock(&fs_info->volume_mutex); 342 ret = btrfs_commit_transaction(trans, root);
339 return -EINVAL; 343 if (ret)
344 return ret;
345 } else if (PTR_ERR(trans) != -ENOENT) {
346 return PTR_ERR(trans);
340 } 347 }
341 348
349 /* the disk copy procedure reuses the scrub code */
350 mutex_lock(&fs_info->volume_mutex);
342 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, 351 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
343 args->start.srcdev_name, 352 args->start.srcdev_name,
344 &src_device); 353 &src_device);
345 mutex_unlock(&fs_info->volume_mutex);
346 if (ret) { 354 if (ret) {
347 ret = -EINVAL; 355 mutex_unlock(&fs_info->volume_mutex);
348 goto leave_no_lock; 356 return ret;
349 } 357 }
350 358
351 if (tgt_device->total_bytes < src_device->total_bytes) { 359 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
352 btrfs_err(fs_info, "target device is smaller than source device!"); 360 src_device, &tgt_device);
353 ret = -EINVAL; 361 mutex_unlock(&fs_info->volume_mutex);
354 goto leave_no_lock; 362 if (ret)
355 } 363 return ret;
356 364
357 btrfs_dev_replace_lock(dev_replace); 365 btrfs_dev_replace_lock(dev_replace);
358 switch (dev_replace->replace_state) { 366 switch (dev_replace->replace_state) {
@@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
380 src_device->devid, 388 src_device->devid,
381 rcu_str_deref(tgt_device->name)); 389 rcu_str_deref(tgt_device->name));
382 390
383 tgt_device->total_bytes = src_device->total_bytes;
384 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
385 tgt_device->bytes_used = src_device->bytes_used;
386
387 /* 391 /*
388 * from now on, the writes to the srcdev are all duplicated to 392 * from now on, the writes to the srcdev are all duplicated to
389 * go to the tgtdev as well (refer to btrfs_map_block()). 393 * go to the tgtdev as well (refer to btrfs_map_block()).
@@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
414 418
415 /* the disk copy procedure reuses the scrub code */ 419 /* the disk copy procedure reuses the scrub code */
416 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 420 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
417 src_device->total_bytes, 421 btrfs_device_get_total_bytes(src_device),
418 &dev_replace->scrub_progress, 0, 1); 422 &dev_replace->scrub_progress, 0, 1);
419 423
420 ret = btrfs_dev_replace_finishing(root->fs_info, ret); 424 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
@@ -426,9 +430,7 @@ leave:
426 dev_replace->srcdev = NULL; 430 dev_replace->srcdev = NULL;
427 dev_replace->tgtdev = NULL; 431 dev_replace->tgtdev = NULL;
428 btrfs_dev_replace_unlock(dev_replace); 432 btrfs_dev_replace_unlock(dev_replace);
429leave_no_lock: 433 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
430 if (tgt_device)
431 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
432 return ret; 434 return ret;
433} 435}
434 436
@@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
507 ret = btrfs_commit_transaction(trans, root); 509 ret = btrfs_commit_transaction(trans, root);
508 WARN_ON(ret); 510 WARN_ON(ret);
509 511
512 mutex_lock(&uuid_mutex);
510 /* keep away write_all_supers() during the finishing procedure */ 513 /* keep away write_all_supers() during the finishing procedure */
511 mutex_lock(&root->fs_info->chunk_mutex);
512 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 514 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
515 mutex_lock(&root->fs_info->chunk_mutex);
513 btrfs_dev_replace_lock(dev_replace); 516 btrfs_dev_replace_lock(dev_replace);
514 dev_replace->replace_state = 517 dev_replace->replace_state =
515 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 518 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 src_device->devid, 535 src_device->devid,
533 rcu_str_deref(tgt_device->name), scrub_ret); 536 rcu_str_deref(tgt_device->name), scrub_ret);
534 btrfs_dev_replace_unlock(dev_replace); 537 btrfs_dev_replace_unlock(dev_replace);
535 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
536 mutex_unlock(&root->fs_info->chunk_mutex); 538 mutex_unlock(&root->fs_info->chunk_mutex);
539 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
540 mutex_unlock(&uuid_mutex);
537 if (tgt_device) 541 if (tgt_device)
538 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 542 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
539 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 543 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
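
The reordered lock calls make the finishing path take device_list_mutex before chunk_mutex, with uuid_mutex outermost, and release them in reverse; acquiring the same locks in different orders on different paths is the classic ABBA deadlock. A userspace illustration of the discipline using pthreads:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t uuid_mutex        = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t device_list_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t chunk_mutex       = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
        /* acquire outermost to innermost, always in this order */
        pthread_mutex_lock(&uuid_mutex);
        pthread_mutex_lock(&device_list_mutex);
        pthread_mutex_lock(&chunk_mutex);

        printf("swap src/tgt device state under all three locks\n");

        /* release innermost to outermost */
        pthread_mutex_unlock(&chunk_mutex);
        pthread_mutex_unlock(&device_list_mutex);
        pthread_mutex_unlock(&uuid_mutex);
        return 0;
}
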
@@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
542 } 546 }
543 547
544 printk_in_rcu(KERN_INFO 548 printk_in_rcu(KERN_INFO
545 "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", 549 "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
546 src_device->missing ? "<missing disk>" : 550 src_device->missing ? "<missing disk>" :
547 rcu_str_deref(src_device->name), 551 rcu_str_deref(src_device->name),
548 src_device->devid, 552 src_device->devid,
@@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
550 tgt_device->is_tgtdev_for_dev_replace = 0; 554 tgt_device->is_tgtdev_for_dev_replace = 0;
551 tgt_device->devid = src_device->devid; 555 tgt_device->devid = src_device->devid;
552 src_device->devid = BTRFS_DEV_REPLACE_DEVID; 556 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
553 tgt_device->bytes_used = src_device->bytes_used;
554 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); 557 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
555 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); 558 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
556 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); 559 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
557 tgt_device->total_bytes = src_device->total_bytes; 560 btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
558 tgt_device->disk_total_bytes = src_device->disk_total_bytes; 561 btrfs_device_set_disk_total_bytes(tgt_device,
559 tgt_device->bytes_used = src_device->bytes_used; 562 src_device->disk_total_bytes);
563 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
564 ASSERT(list_empty(&src_device->resized_list));
565 tgt_device->commit_total_bytes = src_device->commit_total_bytes;
566 tgt_device->commit_bytes_used = src_device->bytes_used;
560 if (fs_info->sb->s_bdev == src_device->bdev) 567 if (fs_info->sb->s_bdev == src_device->bdev)
561 fs_info->sb->s_bdev = tgt_device->bdev; 568 fs_info->sb->s_bdev = tgt_device->bdev;
562 if (fs_info->fs_devices->latest_bdev == src_device->bdev) 569 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
563 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 570 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
564 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 571 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
572 fs_info->fs_devices->rw_devices++;
565 573
566 /* replace the sysfs entry */ 574 /* replace the sysfs entry */
567 btrfs_kobj_rm_device(fs_info, src_device); 575 btrfs_kobj_rm_device(fs_info, src_device);
568 btrfs_kobj_add_device(fs_info, tgt_device); 576 btrfs_kobj_add_device(fs_info, tgt_device);
569 577
578 btrfs_dev_replace_unlock(dev_replace);
579
570 btrfs_rm_dev_replace_blocked(fs_info); 580 btrfs_rm_dev_replace_blocked(fs_info);
571 581
572 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 582 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
@@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
580 * superblock is scratched out so that it is no longer marked to 590 * superblock is scratched out so that it is no longer marked to
581 * belong to this filesystem. 591 * belong to this filesystem.
582 */ 592 */
583 btrfs_dev_replace_unlock(dev_replace);
584 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
585 mutex_unlock(&root->fs_info->chunk_mutex); 593 mutex_unlock(&root->fs_info->chunk_mutex);
594 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
595 mutex_unlock(&uuid_mutex);
586 596
587 /* write back the superblocks */ 597 /* write back the superblocks */
588 trans = btrfs_start_transaction(root, 0); 598 trans = btrfs_start_transaction(root, 0);
@@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
643 struct btrfs_ioctl_dev_replace_args *args) 653 struct btrfs_ioctl_dev_replace_args *args)
644{ 654{
645 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 655 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
656 struct btrfs_device *srcdev;
646 657
647 btrfs_dev_replace_lock(dev_replace); 658 btrfs_dev_replace_lock(dev_replace);
648 /* even if !dev_replace_is_valid, the values are good enough for 659 /* even if !dev_replace_is_valid, the values are good enough for
@@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
665 break; 676 break;
666 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 677 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
667 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 678 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
679 srcdev = dev_replace->srcdev;
668 args->status.progress_1000 = div64_u64(dev_replace->cursor_left, 680 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
669 div64_u64(dev_replace->srcdev->total_bytes, 1000)); 681 div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
670 break; 682 break;
671 } 683 }
672 btrfs_dev_replace_unlock(dev_replace); 684 btrfs_dev_replace_unlock(dev_replace);
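
The status ioctl reports replace progress in tenths of a percent: cursor_left divided by a thousandth of the source device's size, with the size now read through the btrfs_device_get_total_bytes() accessor. A numeric check with invented device sizes:

#include <stdio.h>
#include <stdint.h>

static uint64_t progress_1000(uint64_t cursor_left, uint64_t total_bytes)
{
        return cursor_left / (total_bytes / 1000);
}

int main(void)
{
        uint64_t total = 500ULL << 30;  /* 500 GiB source device */
        uint64_t done  = 125ULL << 30;  /* copy cursor at 125 GiB */

        printf("replace progress: %llu/1000\n",
               (unsigned long long)progress_1000(done, total));
        return 0;
}
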
@@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
825 837
826 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 838 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
827 dev_replace->committed_cursor_left, 839 dev_replace->committed_cursor_left,
828 dev_replace->srcdev->total_bytes, 840 btrfs_device_get_total_bytes(dev_replace->srcdev),
829 &dev_replace->scrub_progress, 0, 1); 841 &dev_replace->scrub_progress, 0, 1);
830 ret = btrfs_dev_replace_finishing(fs_info, ret); 842 ret = btrfs_dev_replace_finishing(fs_info, ret);
831 WARN_ON(ret); 843 WARN_ON(ret);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a0691df5dcea..fc8df866e919 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
86 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); 86 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
87 87
88 key.objectid = objectid; 88 key.objectid = objectid;
89 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 89 key.type = BTRFS_XATTR_ITEM_KEY;
90 key.offset = btrfs_name_hash(name, name_len); 90 key.offset = btrfs_name_hash(name, name_len);
91 91
92 data_size = sizeof(*dir_item) + name_len + data_len; 92 data_size = sizeof(*dir_item) + name_len + data_len;
@@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
137 u32 data_size; 137 u32 data_size;
138 138
139 key.objectid = btrfs_ino(dir); 139 key.objectid = btrfs_ino(dir);
140 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 140 key.type = BTRFS_DIR_ITEM_KEY;
141 key.offset = btrfs_name_hash(name, name_len); 141 key.offset = btrfs_name_hash(name, name_len);
142 142
143 path = btrfs_alloc_path(); 143 path = btrfs_alloc_path();
@@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
204 int cow = mod != 0; 204 int cow = mod != 0;
205 205
206 key.objectid = dir; 206 key.objectid = dir;
207 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 207 key.type = BTRFS_DIR_ITEM_KEY;
208 208
209 key.offset = btrfs_name_hash(name, name_len); 209 key.offset = btrfs_name_hash(name, name_len);
210 210
@@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
234 return -ENOMEM; 234 return -ENOMEM;
235 235
236 key.objectid = dir; 236 key.objectid = dir;
237 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 237 key.type = BTRFS_DIR_ITEM_KEY;
238 key.offset = btrfs_name_hash(name, name_len); 238 key.offset = btrfs_name_hash(name, name_len);
239 239
240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
297 int cow = mod != 0; 297 int cow = mod != 0;
298 298
299 key.objectid = dir; 299 key.objectid = dir;
300 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 300 key.type = BTRFS_DIR_INDEX_KEY;
301 key.offset = objectid; 301 key.offset = objectid;
302 302
303 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 303 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
@@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
367 int cow = mod != 0; 367 int cow = mod != 0;
368 368
369 key.objectid = dir; 369 key.objectid = dir;
370 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 370 key.type = BTRFS_XATTR_ITEM_KEY;
371 key.offset = btrfs_name_hash(name, name_len); 371 key.offset = btrfs_name_hash(name, name_len);
372 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 372 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
373 if (ret < 0) 373 if (ret < 0)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d0ed9e664f7d..fa45e3cae40d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -39,7 +39,6 @@
39#include "btrfs_inode.h" 39#include "btrfs_inode.h"
40#include "volumes.h" 40#include "volumes.h"
41#include "print-tree.h" 41#include "print-tree.h"
42#include "async-thread.h"
43#include "locking.h" 42#include "locking.h"
44#include "tree-log.h" 43#include "tree-log.h"
45#include "free-space-cache.h" 44#include "free-space-cache.h"
@@ -73,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root);
73static void btrfs_error_commit_super(struct btrfs_root *root); 72static void btrfs_error_commit_super(struct btrfs_root *root);
74 73
75/* 74/*
76 * end_io_wq structs are used to do processing in task context when an IO is 75 * btrfs_end_io_wq structs are used to do processing in task context when an IO
77 * complete. This is used during reads to verify checksums, and it is used 76 * is complete. This is used during reads to verify checksums, and it is used
78 * by writes to insert metadata for new file extents after IO is complete. 77 * by writes to insert metadata for new file extents after IO is complete.
79 */ 78 */
80struct end_io_wq { 79struct btrfs_end_io_wq {
81 struct bio *bio; 80 struct bio *bio;
82 bio_end_io_t *end_io; 81 bio_end_io_t *end_io;
83 void *private; 82 void *private;
84 struct btrfs_fs_info *info; 83 struct btrfs_fs_info *info;
85 int error; 84 int error;
86 int metadata; 85 enum btrfs_wq_endio_type metadata;
87 struct list_head list; 86 struct list_head list;
88 struct btrfs_work work; 87 struct btrfs_work work;
89}; 88};
90 89
90static struct kmem_cache *btrfs_end_io_wq_cache;
91
92int __init btrfs_end_io_wq_init(void)
93{
94 btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
95 sizeof(struct btrfs_end_io_wq),
96 0,
97 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
98 NULL);
99 if (!btrfs_end_io_wq_cache)
100 return -ENOMEM;
101 return 0;
102}
103
104void btrfs_end_io_wq_exit(void)
105{
106 if (btrfs_end_io_wq_cache)
107 kmem_cache_destroy(btrfs_end_io_wq_cache);
108}
109
91/* 110/*
92 * async submit bios are used to offload expensive checksumming 111 * async submit bios are used to offload expensive checksumming
93 * onto the worker threads. They checksum file and metadata bios 112 * onto the worker threads. They checksum file and metadata bios
@@ -328,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
328{ 347{
329 struct extent_state *cached_state = NULL; 348 struct extent_state *cached_state = NULL;
330 int ret; 349 int ret;
331 bool need_lock = (current->journal_info == 350 bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
332 (void *)BTRFS_SEND_TRANS_STUB);
333 351
334 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 352 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
335 return 0; 353 return 0;
@@ -349,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
349 ret = 0; 367 ret = 0;
350 goto out; 368 goto out;
351 } 369 }
352 printk_ratelimited("parent transid verify failed on %llu wanted %llu " 370 printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
353 "found %llu\n", 371 eb->fs_info->sb->s_id, eb->start,
354 eb->start, parent_transid, btrfs_header_generation(eb)); 372 parent_transid, btrfs_header_generation(eb));
355 ret = 1; 373 ret = 1;
356 374
357 /* 375 /*
@@ -608,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
608 goto err; 626 goto err;
609 627
610 eb->read_mirror = mirror; 628 eb->read_mirror = mirror;
611 if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 629 if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
612 ret = -EIO; 630 ret = -EIO;
613 goto err; 631 goto err;
614 } 632 }
615 633
616 found_start = btrfs_header_bytenr(eb); 634 found_start = btrfs_header_bytenr(eb);
617 if (found_start != eb->start) { 635 if (found_start != eb->start) {
618 printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " 636 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
619 "%llu %llu\n", 637 "%llu %llu\n",
620 found_start, eb->start); 638 eb->fs_info->sb->s_id, found_start, eb->start);
621 ret = -EIO; 639 ret = -EIO;
622 goto err; 640 goto err;
623 } 641 }
624 if (check_tree_block_fsid(root, eb)) { 642 if (check_tree_block_fsid(root, eb)) {
625 printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", 643 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
626 eb->start); 644 eb->fs_info->sb->s_id, eb->start);
627 ret = -EIO; 645 ret = -EIO;
628 goto err; 646 goto err;
629 } 647 }
@@ -681,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
681 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 699 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
682 700
683 eb = (struct extent_buffer *)page->private; 701 eb = (struct extent_buffer *)page->private;
684 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 702 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
685 eb->read_mirror = failed_mirror; 703 eb->read_mirror = failed_mirror;
686 atomic_dec(&eb->io_pages); 704 atomic_dec(&eb->io_pages);
687 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 705 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -691,52 +709,55 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
691 709
692static void end_workqueue_bio(struct bio *bio, int err) 710static void end_workqueue_bio(struct bio *bio, int err)
693{ 711{
694 struct end_io_wq *end_io_wq = bio->bi_private; 712 struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
695 struct btrfs_fs_info *fs_info; 713 struct btrfs_fs_info *fs_info;
714 struct btrfs_workqueue *wq;
715 btrfs_work_func_t func;
696 716
697 fs_info = end_io_wq->info; 717 fs_info = end_io_wq->info;
698 end_io_wq->error = err; 718 end_io_wq->error = err;
699 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
700 719
701 if (bio->bi_rw & REQ_WRITE) { 720 if (bio->bi_rw & REQ_WRITE) {
702 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 721 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
703 btrfs_queue_work(fs_info->endio_meta_write_workers, 722 wq = fs_info->endio_meta_write_workers;
704 &end_io_wq->work); 723 func = btrfs_endio_meta_write_helper;
705 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 724 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
706 btrfs_queue_work(fs_info->endio_freespace_worker, 725 wq = fs_info->endio_freespace_worker;
707 &end_io_wq->work); 726 func = btrfs_freespace_write_helper;
708 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 727 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
709 btrfs_queue_work(fs_info->endio_raid56_workers, 728 wq = fs_info->endio_raid56_workers;
710 &end_io_wq->work); 729 func = btrfs_endio_raid56_helper;
711 else 730 } else {
712 btrfs_queue_work(fs_info->endio_write_workers, 731 wq = fs_info->endio_write_workers;
713 &end_io_wq->work); 732 func = btrfs_endio_write_helper;
733 }
714 } else { 734 } else {
715 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 735 if (unlikely(end_io_wq->metadata ==
716 btrfs_queue_work(fs_info->endio_raid56_workers, 736 BTRFS_WQ_ENDIO_DIO_REPAIR)) {
717 &end_io_wq->work); 737 wq = fs_info->endio_repair_workers;
718 else if (end_io_wq->metadata) 738 func = btrfs_endio_repair_helper;
719 btrfs_queue_work(fs_info->endio_meta_workers, 739 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
720 &end_io_wq->work); 740 wq = fs_info->endio_raid56_workers;
721 else 741 func = btrfs_endio_raid56_helper;
722 btrfs_queue_work(fs_info->endio_workers, 742 } else if (end_io_wq->metadata) {
723 &end_io_wq->work); 743 wq = fs_info->endio_meta_workers;
744 func = btrfs_endio_meta_helper;
745 } else {
746 wq = fs_info->endio_workers;
747 func = btrfs_endio_helper;
748 }
724 } 749 }
750
751 btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
752 btrfs_queue_work(wq, &end_io_wq->work);
725} 753}
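
The rewritten end_workqueue_bio() separates choosing from doing: each branch only selects a (workqueue, helper) pair, and a single init-and-queue tail replaces the duplicated btrfs_queue_work() calls. A userspace model of that shape, with strings standing in for the queues and trivial functions for the helpers:

#include <stdio.h>
#include <stdbool.h>

typedef void (*endio_func_t)(const char *wq);

static void endio_meta(const char *wq)  { printf("meta endio on %s\n", wq); }
static void endio_data(const char *wq)  { printf("data endio on %s\n", wq); }
static void endio_write(const char *wq) { printf("write endio on %s\n", wq); }

static void end_workqueue_bio(bool is_write, bool metadata)
{
        const char *wq;
        endio_func_t func;

        if (is_write) {
                wq = metadata ? "endio_meta_write_workers"
                              : "endio_write_workers";
                func = endio_write;
        } else if (metadata) {
                wq = "endio_meta_workers";
                func = endio_meta;
        } else {
                wq = "endio_workers";
                func = endio_data;
        }

        /* single tail: init the work item once and queue it once */
        func(wq);
}

int main(void)
{
        end_workqueue_bio(true, true);
        end_workqueue_bio(false, false);
        return 0;
}
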
726 754
727/*
728 * For the metadata arg you want
729 *
730 * 0 - if data
 731 * 1 - if normal metadata
732 * 2 - if writing to the free space cache area
733 * 3 - raid parity work
734 */
735int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 755int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
736 int metadata) 756 enum btrfs_wq_endio_type metadata)
737{ 757{
738 struct end_io_wq *end_io_wq; 758 struct btrfs_end_io_wq *end_io_wq;
739 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); 759
760 end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
740 if (!end_io_wq) 761 if (!end_io_wq)
741 return -ENOMEM; 762 return -ENOMEM;
742 763
@@ -828,7 +849,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
828 async->submit_bio_start = submit_bio_start; 849 async->submit_bio_start = submit_bio_start;
829 async->submit_bio_done = submit_bio_done; 850 async->submit_bio_done = submit_bio_done;
830 851
831 btrfs_init_work(&async->work, run_one_async_start, 852 btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
832 run_one_async_done, run_one_async_free); 853 run_one_async_done, run_one_async_free);
833 854
834 async->bio_flags = bio_flags; 855 async->bio_flags = bio_flags;
@@ -920,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
920 * can happen in the async kernel threads 941 * can happen in the async kernel threads
921 */ 942 */
922 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 943 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
923 bio, 1); 944 bio, BTRFS_WQ_ENDIO_METADATA);
924 if (ret) 945 if (ret)
925 goto out_w_error; 946 goto out_w_error;
926 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 947 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
@@ -1052,20 +1073,17 @@ static const struct address_space_operations btree_aops = {
1052 .set_page_dirty = btree_set_page_dirty, 1073 .set_page_dirty = btree_set_page_dirty,
1053}; 1074};
1054 1075
1055int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1076void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
1056 u64 parent_transid)
1057{ 1077{
1058 struct extent_buffer *buf = NULL; 1078 struct extent_buffer *buf = NULL;
1059 struct inode *btree_inode = root->fs_info->btree_inode; 1079 struct inode *btree_inode = root->fs_info->btree_inode;
1060 int ret = 0;
1061 1080
1062 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1081 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1063 if (!buf) 1082 if (!buf)
1064 return 0; 1083 return;
1065 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1084 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
1066 buf, 0, WAIT_NONE, btree_get_extent, 0); 1085 buf, 0, WAIT_NONE, btree_get_extent, 0);
1067 free_extent_buffer(buf); 1086 free_extent_buffer(buf);
1068 return ret;
1069} 1087}
1070 1088
1071int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1089int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1101,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1101} 1119}
1102 1120
1103struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1121struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1104 u64 bytenr, u32 blocksize) 1122 u64 bytenr)
1105{ 1123{
1106 return find_extent_buffer(root->fs_info, bytenr); 1124 return find_extent_buffer(root->fs_info, bytenr);
1107} 1125}
@@ -1109,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1109struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 1127struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1110 u64 bytenr, u32 blocksize) 1128 u64 bytenr, u32 blocksize)
1111{ 1129{
1112#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1130 if (btrfs_test_is_dummy_root(root))
1113 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
1114 return alloc_test_extent_buffer(root->fs_info, bytenr, 1131 return alloc_test_extent_buffer(root->fs_info, bytenr,
1115 blocksize); 1132 blocksize);
1116#endif
1117 return alloc_extent_buffer(root->fs_info, bytenr, blocksize); 1133 return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
1118} 1134}
1119 1135
@@ -1131,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u32 blocksize, u64 parent_transid)
+				      u64 parent_transid)
 {
 	struct extent_buffer *buf = NULL;
 	int ret;
 
-	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+	buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
 	if (!buf)
 		return NULL;
 
@@ -1178,7 +1194,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
 	if (!writers)
 		return ERR_PTR(-ENOMEM);
 
-	ret = percpu_counter_init(&writers->counter, 0);
+	ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
 	if (ret < 0) {
 		kfree(writers);
 		return ERR_PTR(ret);
@@ -1195,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
 	kfree(writers);
 }
 
-static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
-			 u32 stripesize, struct btrfs_root *root,
-			 struct btrfs_fs_info *fs_info,
+static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
+			 struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 			 u64 objectid)
 {
 	root->node = NULL;
 	root->commit_root = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
-	root->leafsize = leafsize;
 	root->stripesize = stripesize;
 	root->state = 0;
 	root->orphan_cleanup_state = 0;
@@ -1290,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
 	root = btrfs_alloc_root(NULL);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
-	__setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+	__setup_root(4096, 4096, 4096, root, NULL, 1);
 	set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
 	root->alloc_bytenr = 0;
 
@@ -1313,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, objectid);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info, objectid);
 	root->root_key.objectid = objectid;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = 0;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-				      0, objectid, NULL, 0, 0, 0);
+	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		leaf = NULL;
@@ -1391,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info,
+		     BTRFS_TREE_LOG_OBJECTID);
 
 	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1408,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	 * updated (along with back refs to the log tree).
 	 */
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID, NULL,
-				      0, 0, 0);
+	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+				      NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
@@ -1460,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_stack_inode_generation(inode_item, 1);
 	btrfs_set_stack_inode_size(inode_item, 3);
 	btrfs_set_stack_inode_nlink(inode_item, 1);
-	btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+	btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
 	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
 
 	btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1480,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
 	u64 generation;
-	u32 blocksize;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -1493,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 		goto alloc_fail;
 	}
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, key->objectid);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info, key->objectid);
 
 	ret = btrfs_find_root(tree_root, key, path,
 			      &root->root_item, &root->root_key);
@@ -1506,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	}
 
 	generation = btrfs_root_generation(&root->root_item);
-	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, generation);
+				     generation);
 	if (!root->node) {
 		ret = -ENOMEM;
 		goto find_fail;
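The blocksize locals and btrfs_level_size() calls dropped throughout this series follow from the leafsize/nodesize unification: once leaves and nodes are the same size, the per-level size computation degenerates to a constant. A sketch of the simplification, assuming the historical helper selected leafsize for level 0 and nodesize otherwise:

/* Before (sketch): the block size depended on the level being read. */
static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
{
	return level ? root->nodesize : root->leafsize;
}

/* After: leafsize == nodesize always held in practice, so callers
 * read root->nodesize directly and the helper disappears. */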
@@ -1568,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 	root->subv_writers = writers;
 
 	btrfs_init_free_ino_ctl(root);
-	spin_lock_init(&root->cache_lock);
-	init_waitqueue_head(&root->cache_wait);
+	spin_lock_init(&root->ino_cache_lock);
+	init_waitqueue_head(&root->ino_cache_wait);
 
 	ret = get_anon_bdev(&root->anon_dev);
 	if (ret)
@@ -1703,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	return ret;
 }
 
-/*
- * If this fails, caller must call bdi_destroy() to get rid of the
- * bdi again.
- */
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
@@ -1729,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 static void end_workqueue_fn(struct btrfs_work *work)
 {
 	struct bio *bio;
-	struct end_io_wq *end_io_wq;
+	struct btrfs_end_io_wq *end_io_wq;
 	int error;
 
-	end_io_wq = container_of(work, struct end_io_wq, work);
+	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
 	bio = end_io_wq->bio;
 
 	error = end_io_wq->error;
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
-	kfree(end_io_wq);
+	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
 	bio_endio_nodec(bio, error);
 }
 
@@ -1767,6 +1771,7 @@ static int cleaner_kthread(void *arg)
 		}
 
 		btrfs_run_delayed_iputs(root);
+		btrfs_delete_unused_bgs(root->fs_info);
 		again = btrfs_clean_one_deleted_snapshot(root);
 		mutex_unlock(&root->fs_info->cleaner_mutex);
 
@@ -2058,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->endio_workers);
 	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
 	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
 	btrfs_destroy_workqueue(fs_info->rmw_workers);
 	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2138,8 +2144,6 @@ int open_ctree(struct super_block *sb,
 {
 	u32 sectorsize;
 	u32 nodesize;
-	u32 leafsize;
-	u32 blocksize;
 	u32 stripesize;
 	u64 generation;
 	u64 features;
@@ -2183,7 +2187,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_srcu;
 	}
 
-	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_bdi;
@@ -2191,13 +2195,13 @@ int open_ctree(struct super_block *sb,
 	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
 					(1 + ilog2(nr_cpu_ids));
 
-	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_dirty_metadata_bytes;
 	}
 
-	ret = percpu_counter_init(&fs_info->bio_counter, 0);
+	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_delalloc_bytes;
@@ -2228,6 +2232,7 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->super_lock);
 	spin_lock_init(&fs_info->qgroup_op_lock);
 	spin_lock_init(&fs_info->buffer_lock);
+	spin_lock_init(&fs_info->unused_bgs_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
@@ -2237,6 +2242,7 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+	INIT_LIST_HEAD(&fs_info->unused_bgs);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
 			     BTRFS_BLOCK_RSV_GLOBAL);
@@ -2255,7 +2261,7 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->qgroup_op_seq, 0);
 	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->sb = sb;
-	fs_info->max_inline = 8192 * 1024;
+	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->free_chunk_space = 0;
@@ -2384,7 +2390,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	__setup_root(4096, 4096, 4096, 4096, tree_root,
+	__setup_root(4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	invalidate_bdev(fs_devices->latest_bdev);
@@ -2464,19 +2470,22 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	if (btrfs_super_leafsize(disk_super) !=
+	/*
+	 * Leafsize and nodesize were always equal, this is only a sanity check.
+	 */
+	if (le32_to_cpu(disk_super->__unused_leafsize) !=
 	    btrfs_super_nodesize(disk_super)) {
 		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
 		       "blocksizes don't match. node %d leaf %d\n",
 		       btrfs_super_nodesize(disk_super),
-		       btrfs_super_leafsize(disk_super));
+		       le32_to_cpu(disk_super->__unused_leafsize));
 		err = -EINVAL;
 		goto fail_alloc;
 	}
-	if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+	if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
 		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
 		       "blocksize (%d) was too large\n",
-		       btrfs_super_leafsize(disk_super));
+		       btrfs_super_nodesize(disk_super));
 		err = -EINVAL;
 		goto fail_alloc;
 	}
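The btrfs_super_leafsize() accessor is gone, but the on-disk slot survives as __unused_leafsize because the superblock layout is fixed; it is read raw with le32_to_cpu() purely for this compatibility check. A sketch of the relevant fragment of struct btrfs_super_block, with member order assumed per the on-disk format and surrounding fields omitted:

struct btrfs_super_block {
	/* ... preceding on-disk fields ... */
	__le32 sectorsize;
	__le32 nodesize;
	__le32 __unused_leafsize;	/* was leafsize; must equal nodesize */
	__le32 stripesize;
	/* ... remaining on-disk fields ... */
} __attribute__ ((__packed__));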
@@ -2493,17 +2502,16 @@ int open_ctree(struct super_block *sb,
 	 * flag our filesystem as having big metadata blocks if
 	 * they are bigger than the page size
 	 */
-	if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+	if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
 		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
 			printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
 		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
 	}
 
 	nodesize = btrfs_super_nodesize(disk_super);
-	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
 	stripesize = btrfs_super_stripesize(disk_super);
-	fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
 	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
 	/*
@@ -2511,7 +2519,7 @@ int open_ctree(struct super_block *sb,
 	 * extent buffers for the same range. It leads to corruptions
 	 */
 	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
-	    (sectorsize != leafsize)) {
+	    (sectorsize != nodesize)) {
 		printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
 		       "are not allowed for mixed block groups on %s\n",
 		       sb->s_id);
@@ -2574,6 +2582,8 @@ int open_ctree(struct super_block *sb,
 		btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
 	fs_info->endio_raid56_workers =
 		btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+	fs_info->endio_repair_workers =
+		btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
 	fs_info->rmw_workers =
 		btrfs_alloc_workqueue("rmw", flags, max_active, 2);
 	fs_info->endio_write_workers =
@@ -2595,11 +2605,12 @@ int open_ctree(struct super_block *sb,
 	      fs_info->submit_workers && fs_info->flush_workers &&
 	      fs_info->endio_workers && fs_info->endio_meta_workers &&
 	      fs_info->endio_meta_write_workers &&
+	      fs_info->endio_repair_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
 	      fs_info->caching_workers && fs_info->readahead_workers &&
 	      fs_info->fixup_workers && fs_info->delayed_workers &&
-	      fs_info->fixup_workers && fs_info->extent_workers &&
+	      fs_info->extent_workers &&
 	      fs_info->qgroup_rescan_workers)) {
 		err = -ENOMEM;
 		goto fail_sb_buffer;
@@ -2610,7 +2621,6 @@ int open_ctree(struct super_block *sb,
 					    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	tree_root->nodesize = nodesize;
-	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
 	tree_root->stripesize = stripesize;
 
@@ -2637,16 +2647,14 @@ int open_ctree(struct super_block *sb,
 		goto fail_sb_buffer;
 	}
 
-	blocksize = btrfs_level_size(tree_root,
-				     btrfs_super_chunk_root_level(disk_super));
 	generation = btrfs_super_chunk_root_generation(disk_super);
 
-	__setup_root(nodesize, leafsize, sectorsize, stripesize,
-		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+	__setup_root(nodesize, sectorsize, stripesize, chunk_root,
 		     fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
-					   blocksize, generation);
+					   generation);
 	if (!chunk_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
@@ -2679,13 +2687,11 @@ int open_ctree(struct super_block *sb,
 	}
 
 retry_root_backup:
-	blocksize = btrfs_level_size(tree_root,
-				     btrfs_super_root_level(disk_super));
 	generation = btrfs_super_generation(disk_super);
 
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
-					  blocksize, generation);
+					  generation);
 	if (!tree_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
@@ -2854,9 +2860,6 @@ retry_root_backup:
 			err = -EIO;
 			goto fail_qgroup;
 		}
-		blocksize =
-		     btrfs_level_size(tree_root,
-				      btrfs_super_log_root_level(disk_super));
 
 		log_tree_root = btrfs_alloc_root(fs_info);
 		if (!log_tree_root) {
@@ -2864,11 +2867,10 @@ retry_root_backup:
 			goto fail_qgroup;
 		}
 
-		__setup_root(nodesize, leafsize, sectorsize, stripesize,
+		__setup_root(nodesize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
 		log_tree_root->node = read_tree_block(tree_root, bytenr,
-						      blocksize,
 						      generation + 1);
 		if (!log_tree_root->node ||
 		    !extent_buffer_uptodate(log_tree_root->node)) {
@@ -2975,6 +2977,8 @@ retry_root_backup:
 		fs_info->update_uuid_tree_gen = 1;
 	}
 
+	fs_info->open = 1;
+
 	return 0;
 
fail_qgroup:
@@ -3134,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device *device,
 
 	for (i = 0; i < max_mirrors; i++) {
 		bytenr = btrfs_sb_offset(i);
-		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+		    device->commit_total_bytes)
 			break;
 
 		if (wait) {
@@ -3450,8 +3455,10 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		btrfs_set_stack_device_generation(dev_item, 0);
 		btrfs_set_stack_device_type(dev_item, dev->type);
 		btrfs_set_stack_device_id(dev_item, dev->devid);
-		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
-		btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+		btrfs_set_stack_device_total_bytes(dev_item,
+						   dev->commit_total_bytes);
+		btrfs_set_stack_device_bytes_used(dev_item,
						   dev->commit_bytes_used);
 		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
 		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@ -3526,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 
 static void free_fs_root(struct btrfs_root *root)
 {
-	iput(root->cache_inode);
+	iput(root->ino_cache_inode);
 	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	btrfs_free_block_rsv(root, root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
@@ -3617,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root)
 	return btrfs_commit_transaction(trans, root);
 }
 
-int close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
@@ -3683,6 +3690,7 @@ int close_ctree(struct btrfs_root *root)
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	btrfs_stop_all_workers(fs_info);
 
+	fs_info->open = 0;
 	free_root_pointers(fs_info, 1);
 
 	iput(fs_info->btree_inode);
@@ -3705,8 +3713,6 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_block_rsv(root, root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
-
-	return 0;
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3808,10 +3814,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
+	struct btrfs_super_block *sb = fs_info->super_copy;
+	int ret = 0;
+
+	if (sb->root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
+				sb->root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
+				sb->chunk_root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (sb->log_root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
+				sb->log_root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+
 	/*
-	 * Placeholder for checks
+	 * The common minimum, we don't know if we can trust the nodesize/sectorsize
+	 * items yet, they'll be verified later. Issue just a warning.
 	 */
-	return 0;
+	if (!IS_ALIGNED(sb->root, 4096))
+		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+				sb->root);
+	if (!IS_ALIGNED(sb->chunk_root, 4096))
+		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+				sb->chunk_root);
+	if (!IS_ALIGNED(sb->log_root, 4096))
+		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+				sb->log_root);
+
+	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+		printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+				fs_info->fsid, sb->dev_item.fsid);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+	 * done later
+	 */
+	if (sb->num_devices > (1UL << 31))
+		printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+				sb->num_devices);
+
+	if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+		printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+				sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * The generation is a global counter, we'll trust it more than the others
+	 * but it's still possible that it's the one that's wrong.
+	 */
+	if (sb->generation < sb->chunk_root_generation)
+		printk(KERN_WARNING
+			"BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+			sb->generation, sb->chunk_root_generation);
+	if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+		printk(KERN_WARNING
+			"BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+			sb->generation, sb->cache_generation);
+
+	return ret;
 }
 
 static void btrfs_error_commit_super(struct btrfs_root *root)
@@ -4003,9 +4072,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
 		while (start <= end) {
-			eb = btrfs_find_tree_block(root, start,
-						   root->leafsize);
-			start += root->leafsize;
+			eb = btrfs_find_tree_block(root, start);
+			start += root->nodesize;
			if (!eb)
				continue;
			wait_on_extent_buffer_writeback(eb);
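btrfs_check_super_valid() above follows a deliberate pattern: hard violations set ret = -EINVAL but checking continues, so a single mount attempt reports every problem, while merely suspicious values (possible bitflips) only warn because stricter checks run later once more state is available. Condensed into a standalone sketch, with the two example checks lifted from the real function:

/* Minimal sketch of the accumulate-and-warn validation style. */
static int check_super_sketch(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *sb = fs_info->super_copy;
	int ret = 0;

	if (sb->root_level > BTRFS_MAX_LEVEL)
		ret = -EINVAL;		/* fatal, but keep checking */
	if (!IS_ALIGNED(sb->root, 4096))
		printk(KERN_WARNING "BTRFS: tree_root block unaligned\n");
	return ret;			/* every fatal problem was logged */
}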
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 23ce3ceba0a9..414651821fb3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,11 +25,12 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
-enum {
+enum btrfs_wq_endio_type {
 	BTRFS_WQ_ENDIO_DATA = 0,
 	BTRFS_WQ_ENDIO_METADATA = 1,
 	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
 	BTRFS_WQ_ENDIO_RAID56 = 3,
+	BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
@@ -44,9 +45,8 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u32 blocksize, u64 parent_transid);
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
-			 u64 parent_transid);
+				      u64 parent_transid);
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
 int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 			 int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options);
-int close_ctree(struct btrfs_root *root);
+void close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize);
+					    u64 bytenr);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 				      struct btrfs_key *location);
 int btrfs_init_fs_root(struct btrfs_root *root);
@@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
-			int metadata);
+			enum btrfs_wq_endio_type metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			unsigned long bio_flags, u64 bio_offset,
@@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data,
 		    void (*flush_fn)(void *));
 int btrfs_calc_num_tolerated_disk_barrier_failures(
 	struct btrfs_fs_info *fs_info);
+int __init btrfs_end_io_wq_init(void);
+void btrfs_end_io_wq_exit(void);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
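The btrfs_end_io_wq_init()/btrfs_end_io_wq_exit() pair declared here matches the kmem_cache_free() seen earlier in end_workqueue_fn(): end-io work items move from kmalloc to a dedicated slab cache created once at module init. A plausible implementation sketch; the cache name and flags are assumptions, only the two symbol names come from this header:

static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
	/* one object per in-flight bio, so a private cache pays off */
	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
			sizeof(struct btrfs_end_io_wq), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_end_io_wq_cache)
		return -ENOMEM;
	return 0;
}

void btrfs_end_io_wq_exit(void)
{
	kmem_cache_destroy(btrfs_end_io_wq_cache);
}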
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 41422a3de8ed..37d164540c3a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 		return ERR_PTR(-ESTALE);
 
 	key.objectid = root_objectid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
 
 	index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	}
 
 	key.objectid = objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 
 	inode = btrfs_iget(sb, &key, root, NULL);
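btrfs_set_key_type() was presumably a thin setter for the type field, so the direct assignment is a mechanical, equivalent substitution. The on-stack lookup idiom being simplified, in isolation:

struct btrfs_key key;

key.objectid = objectid;		/* which object */
key.type = BTRFS_INODE_ITEM_KEY;	/* which kind of item */
key.offset = 0;				/* type-specific meaning */
/* key is now ready for btrfs_iget()/btrfs_search_slot() and friends */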
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 102ed3143976..d56589571012 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -491,7 +491,7 @@ next:
 					  key.objectid);
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
 				last = key.objectid +
-					fs_info->tree_root->leafsize;
+					fs_info->tree_root->nodesize;
 			else
 				last = key.objectid + key.offset;
 
@@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	caching_ctl->block_group = cache;
 	caching_ctl->progress = cache->key.objectid;
 	atomic_set(&caching_ctl->count, 1);
-	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+			caching_thread, NULL, NULL);
 
 	spin_lock(&cache->lock);
 	/*
@@ -764,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 	 * different
 	 */
 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
-		offset = root->leafsize;
+		offset = root->nodesize;
 		metadata = 0;
 	}
 
@@ -798,13 +799,13 @@ again:
 					      path->slots[0]);
 			if (key.objectid == bytenr &&
 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
-			    key.offset == root->leafsize)
+			    key.offset == root->nodesize)
 				ret = 0;
 		}
 		if (ret) {
 			key.objectid = bytenr;
 			key.type = BTRFS_EXTENT_ITEM_KEY;
-			key.offset = root->leafsize;
+			key.offset = root->nodesize;
 			btrfs_release_path(path);
 			goto again;
 		}
@@ -2650,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	num_heads = heads_to_leaves(root, num_heads);
 	if (num_heads > 1)
-		num_bytes += (num_heads - 1) * root->leafsize;
+		num_bytes += (num_heads - 1) * root->nodesize;
 	num_bytes <<= 1;
 	global_rsv = &root->fs_info->global_block_rsv;
 
@@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 	async->sync = 0;
 	init_completion(&async->wait);
 
-	btrfs_init_work(&async->work, delayed_ref_async_start,
-			NULL, NULL);
+	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
			delayed_ref_async_start, NULL, NULL);
 
 	btrfs_queue_work(root->fs_info->extent_workers, &async->work);
 
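btrfs_init_work() grows a leading helper argument (btrfs_cache_helper, btrfs_extent_refs_helper, ...) so each work source dispatches through its own function. A plausible shape for one such helper, assuming the common normal_work_helper() dispatcher from async-thread.c; the exact definition is not part of this diff and the body here is hypothetical:

/* Hypothetical expansion: a per-source trampoline whose only job is
 * to give lockdep and stack traces a distinct symbol per work type. */
static void btrfs_cache_helper(struct work_struct *arg)
{
	struct btrfs_work *work;

	work = container_of(arg, struct btrfs_work, normal_work);
	normal_work_helper(work);	/* assumed common dispatch */
}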
@@ -3072,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 			    u64, u64, u64, u64, u64, u64, int);
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+
+	if (btrfs_test_is_dummy_root(root))
 		return 0;
-#endif
+
 	ref_root = btrfs_header_owner(buf);
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
@@ -3096,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	for (i = 0; i < nritems; i++) {
 		if (level == 0) {
 			btrfs_item_key_to_cpu(buf, &key, i);
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			if (key.type != BTRFS_EXTENT_DATA_KEY)
 				continue;
 			fi = btrfs_item_ptr(buf, i,
 					    struct btrfs_file_extent_item);
@@ -3116,7 +3117,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			num_bytes = btrfs_level_size(root, level - 1);
+			num_bytes = root->nodesize;
 			ret = process_func(trans, root, bytenr, num_bytes,
 					   parent, ref_root, level - 1, 0,
 					   1);
@@ -3493,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+	ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
 	if (ret) {
 		kfree(found);
 		return ret;
@@ -3586,13 +3587,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
 */
 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	/*
-	 * we add in the count of missing devices because we want
-	 * to make sure that any RAID levels on a degraded FS
-	 * continue to be honored.
-	 */
-	u64 num_devices = root->fs_info->fs_devices->rw_devices +
-		root->fs_info->fs_devices->missing_devices;
+	u64 num_devices = root->fs_info->fs_devices->rw_devices;
 	u64 target;
 	u64 tmp;
 
@@ -4348,11 +4343,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 }
 
 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
-				       struct btrfs_fs_info *fs_info)
+				       struct btrfs_fs_info *fs_info,
+				       int flush_state)
 {
 	u64 used;
 
 	spin_lock(&space_info->lock);
+	/*
+	 * We run out of space and have not got any free space via flush_space,
+	 * so don't bother doing async reclaim.
+	 */
+	if (flush_state > COMMIT_TRANS && space_info->full) {
+		spin_unlock(&space_info->lock);
+		return 0;
+	}
+
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_pinned + space_info->bytes_readonly +
 	       space_info->bytes_may_use;
@@ -4385,11 +4390,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		flush_space(fs_info->fs_root, space_info, to_reclaim,
 			    to_reclaim, flush_state);
 		flush_state++;
-		if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+		if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+						 flush_state))
 			return;
 	} while (flush_state <= COMMIT_TRANS);
 
-	if (btrfs_need_do_async_reclaim(space_info, fs_info))
+	if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
 		queue_work(system_unbound_wq, work);
 }
 
@@ -4507,7 +4513,13 @@ again:
 		space_info->flush = 1;
 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		used += orig_bytes;
-		if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+		/*
+		 * We will do the space reservation dance during log replay,
+		 * which means we won't have fs_info->fs_root set, so don't do
+		 * the async reclaim as we will panic.
+		 */
+		if (!root->fs_info->log_root_recovering &&
+		    need_do_async_reclaim(space_info, root->fs_info, used) &&
 		    !work_busy(&root->fs_info->async_reclaim_work))
 			queue_work(system_unbound_wq,
 				   &root->fs_info->async_reclaim_work);
@@ -4844,7 +4856,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 	if (num_bytes * 3 > meta_used)
 		num_bytes = div64_u64(meta_used, 3);
 
-	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+	return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
 }
 
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4993,7 +5005,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 
 	if (root->fs_info->quota_enabled) {
 		/* One for parent inode, two for dir entries */
-		num_bytes = 3 * root->leafsize;
+		num_bytes = 3 * root->nodesize;
 		ret = btrfs_qgroup_reserve(root, num_bytes);
 		if (ret)
 			return ret;
@@ -5181,7 +5193,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 	if (root->fs_info->quota_enabled) {
 		ret = btrfs_qgroup_reserve(root, num_bytes +
-					   nr_extents * root->leafsize);
+					   nr_extents * root->nodesize);
 		if (ret)
 			goto out_fail;
 	}
@@ -5190,7 +5202,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	if (unlikely(ret)) {
 		if (root->fs_info->quota_enabled)
 			btrfs_qgroup_free(root, num_bytes +
-					  nr_extents * root->leafsize);
+					  nr_extents * root->nodesize);
 		goto out_fail;
 	}
 
@@ -5306,7 +5318,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 					      btrfs_ino(inode), to_free, 0);
 	if (root->fs_info->quota_enabled) {
 		btrfs_qgroup_free(root, num_bytes +
-					dropped * root->leafsize);
+					dropped * root->nodesize);
 	}
 
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5427,6 +5439,20 @@ static int update_block_group(struct btrfs_root *root,
 			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
+
+			/*
+			 * No longer have used bytes in this block group, queue
+			 * it for deletion.
+			 */
+			if (old_val == 0) {
+				spin_lock(&info->unused_bgs_lock);
+				if (list_empty(&cache->bg_list)) {
+					btrfs_get_block_group(cache);
+					list_add_tail(&cache->bg_list,
+						      &info->unused_bgs);
+				}
+				spin_unlock(&info->unused_bgs_lock);
+			}
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->pinned += num_bytes;
 			cache->space_info->bytes_pinned += num_bytes;
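This is the producer half of the new unused-block-group reclaim: the extra reference pins the group, and list_empty() on its bg_list node doubles as the already-queued test. The consumer, btrfs_delete_unused_bgs() called from the cleaner thread, is not shown in this diff; a hypothetical drain loop with the matching locking would look like:

/* Sketch only: drop the lock while operating on each group, retake it
 * to pop the next one; the reference taken by the producer is dropped
 * here once the group has been dealt with. */
static void drain_unused_bgs_sketch(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *bg;

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		bg = list_first_entry(&info->unused_bgs,
				      struct btrfs_block_group_cache,
				      bg_list);
		list_del_init(&bg->bg_list);
		spin_unlock(&info->unused_bgs_lock);

		/* delete the now-empty block group, or requeue on failure */

		btrfs_put_block_group(bg);
		spin_lock(&info->unused_bgs_lock);
	}
	spin_unlock(&info->unused_bgs_lock);
}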
@@ -6238,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6238 int ret; 6264 int ret;
6239 struct btrfs_fs_info *fs_info = root->fs_info; 6265 struct btrfs_fs_info *fs_info = root->fs_info;
6240 6266
6241#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 6267 if (btrfs_test_is_dummy_root(root))
6242 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
6243 return 0; 6268 return 0;
6244#endif 6269
6245 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); 6270 add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
6246 6271
6247 /* 6272 /*
@@ -6268,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6268 return ret; 6293 return ret;
6269} 6294}
6270 6295
6271static u64 stripe_align(struct btrfs_root *root,
6272 struct btrfs_block_group_cache *cache,
6273 u64 val, u64 num_bytes)
6274{
6275 u64 ret = ALIGN(val, root->stripesize);
6276 return ret;
6277}
6278
6279/* 6296/*
6280 * when we wait for progress in the block group caching, its because 6297 * when we wait for progress in the block group caching, its because
6281 * our allocation attempt failed at least once. So, we must sleep 6298 * our allocation attempt failed at least once. So, we must sleep
@@ -6469,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
6469 bool have_caching_bg = false; 6486 bool have_caching_bg = false;
6470 6487
6471 WARN_ON(num_bytes < root->sectorsize); 6488 WARN_ON(num_bytes < root->sectorsize);
6472 btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); 6489 ins->type = BTRFS_EXTENT_ITEM_KEY;
6473 ins->objectid = 0; 6490 ins->objectid = 0;
6474 ins->offset = 0; 6491 ins->offset = 0;
6475 6492
@@ -6756,8 +6773,7 @@ unclustered_alloc:
6756 goto loop; 6773 goto loop;
6757 } 6774 }
6758checks: 6775checks:
6759 search_start = stripe_align(root, block_group, 6776 search_start = ALIGN(offset, root->stripesize);
6760 offset, num_bytes);
6761 6777
6762 /* move on to the next group */ 6778 /* move on to the next group */
6763 if (search_start + num_bytes > 6779 if (search_start + num_bytes >
@@ -7082,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7082 path = btrfs_alloc_path(); 7098 path = btrfs_alloc_path();
7083 if (!path) { 7099 if (!path) {
7084 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7100 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7085 root->leafsize); 7101 root->nodesize);
7086 return -ENOMEM; 7102 return -ENOMEM;
7087 } 7103 }
7088 7104
@@ -7091,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7091 ins, size); 7107 ins, size);
7092 if (ret) { 7108 if (ret) {
7093 btrfs_free_and_pin_reserved_extent(root, ins->objectid, 7109 btrfs_free_and_pin_reserved_extent(root, ins->objectid,
7094 root->leafsize); 7110 root->nodesize);
7095 btrfs_free_path(path); 7111 btrfs_free_path(path);
7096 return ret; 7112 return ret;
7097 } 7113 }
@@ -7106,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7106 7122
7107 if (skinny_metadata) { 7123 if (skinny_metadata) {
7108 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); 7124 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
7109 num_bytes = root->leafsize; 7125 num_bytes = root->nodesize;
7110 } else { 7126 } else {
7111 block_info = (struct btrfs_tree_block_info *)(extent_item + 1); 7127 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
7112 btrfs_set_tree_block_key(leaf, block_info, key); 7128 btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7136,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7136 return ret; 7152 return ret;
7137 } 7153 }
7138 7154
7139 ret = update_block_group(root, ins->objectid, root->leafsize, 1); 7155 ret = update_block_group(root, ins->objectid, root->nodesize, 1);
7140 if (ret) { /* -ENOENT, logic error */ 7156 if (ret) { /* -ENOENT, logic error */
7141 btrfs_err(fs_info, "update block group failed for %llu %llu", 7157 btrfs_err(fs_info, "update block group failed for %llu %llu",
7142 ins->objectid, ins->offset); 7158 ins->objectid, ins->offset);
7143 BUG(); 7159 BUG();
7144 } 7160 }
7145 7161
7146 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); 7162 trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
7147 return ret; 7163 return ret;
7148} 7164}
7149 7165
@@ -7218,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7218 btrfs_set_buffer_uptodate(buf); 7234 btrfs_set_buffer_uptodate(buf);
7219 7235
7220 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { 7236 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
7237 buf->log_index = root->log_transid % 2;
7221 /* 7238 /*
7222 * we allow two log transactions at a time, use different 7239 * we allow two log transactions at a time, use different
7223 * EXENT bit to differentiate dirty pages. 7240 * EXENT bit to differentiate dirty pages.
7224 */ 7241 */
7225 if (root->log_transid % 2 == 0) 7242 if (buf->log_index == 0)
7226 set_extent_dirty(&root->dirty_log_pages, buf->start, 7243 set_extent_dirty(&root->dirty_log_pages, buf->start,
7227 buf->start + buf->len - 1, GFP_NOFS); 7244 buf->start + buf->len - 1, GFP_NOFS);
7228 else 7245 else
7229 set_extent_new(&root->dirty_log_pages, buf->start, 7246 set_extent_new(&root->dirty_log_pages, buf->start,
7230 buf->start + buf->len - 1, GFP_NOFS); 7247 buf->start + buf->len - 1, GFP_NOFS);
7231 } else { 7248 } else {
7249 buf->log_index = -1;
7232 set_extent_dirty(&trans->transaction->dirty_pages, buf->start, 7250 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
7233 buf->start + buf->len - 1, GFP_NOFS); 7251 buf->start + buf->len - 1, GFP_NOFS);
7234 } 7252 }
@@ -7305,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
7305 * 7323 *
7306 * returns the tree buffer or NULL. 7324 * returns the tree buffer or NULL.
7307 */ 7325 */
7308struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 7326struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7309 struct btrfs_root *root, u32 blocksize, 7327 struct btrfs_root *root,
7310 u64 parent, u64 root_objectid, 7328 u64 parent, u64 root_objectid,
7311 struct btrfs_disk_key *key, int level, 7329 struct btrfs_disk_key *key, int level,
7312 u64 hint, u64 empty_size) 7330 u64 hint, u64 empty_size)
@@ -7316,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
7316 struct extent_buffer *buf; 7334 struct extent_buffer *buf;
7317 u64 flags = 0; 7335 u64 flags = 0;
7318 int ret; 7336 int ret;
7337 u32 blocksize = root->nodesize;
7319 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 7338 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
7320 SKINNY_METADATA); 7339 SKINNY_METADATA);
7321 7340
7322#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 7341 if (btrfs_test_is_dummy_root(root)) {
7323 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
7324 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7342 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7325 blocksize, level); 7343 blocksize, level);
7326 if (!IS_ERR(buf)) 7344 if (!IS_ERR(buf))
7327 root->alloc_bytenr += blocksize; 7345 root->alloc_bytenr += blocksize;
7328 return buf; 7346 return buf;
7329 } 7347 }
7330#endif 7348
7331 block_rsv = use_block_rsv(trans, root, blocksize); 7349 block_rsv = use_block_rsv(trans, root, blocksize);
7332 if (IS_ERR(block_rsv)) 7350 if (IS_ERR(block_rsv))
7333 return ERR_CAST(block_rsv); 7351 return ERR_CAST(block_rsv);
@@ -7422,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7422 7440
7423 eb = path->nodes[wc->level]; 7441 eb = path->nodes[wc->level];
7424 nritems = btrfs_header_nritems(eb); 7442 nritems = btrfs_header_nritems(eb);
7425 blocksize = btrfs_level_size(root, wc->level - 1); 7443 blocksize = root->nodesize;
7426 7444
7427 for (slot = path->slots[wc->level]; slot < nritems; slot++) { 7445 for (slot = path->slots[wc->level]; slot < nritems; slot++) {
7428 if (nread >= wc->reada_count) 7446 if (nread >= wc->reada_count)
@@ -7469,10 +7487,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7469 continue; 7487 continue;
7470 } 7488 }
7471reada: 7489reada:
7472 ret = readahead_tree_block(root, bytenr, blocksize, 7490 readahead_tree_block(root, bytenr, blocksize);
7473 generation);
7474 if (ret)
7475 break;
7476 nread++; 7491 nread++;
7477 } 7492 }
7478 wc->reada_slot = slot; 7493 wc->reada_slot = slot;
@@ -7631,7 +7646,6 @@ walk_down:
7631 level = root_level; 7646 level = root_level;
7632 while (level >= 0) { 7647 while (level >= 0) {
7633 if (path->nodes[level] == NULL) { 7648 if (path->nodes[level] == NULL) {
7634 int child_bsize = root->nodesize;
7635 int parent_slot; 7649 int parent_slot;
7636 u64 child_gen; 7650 u64 child_gen;
7637 u64 child_bytenr; 7651 u64 child_bytenr;
@@ -7643,8 +7657,7 @@ walk_down:
7643 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 7657 child_bytenr = btrfs_node_blockptr(eb, parent_slot);
7644 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 7658 child_gen = btrfs_node_ptr_generation(eb, parent_slot);
7645 7659
7646 eb = read_tree_block(root, child_bytenr, child_bsize, 7660 eb = read_tree_block(root, child_bytenr, child_gen);
7647 child_gen);
7648 if (!eb || !extent_buffer_uptodate(eb)) { 7661 if (!eb || !extent_buffer_uptodate(eb)) {
7649 ret = -EIO; 7662 ret = -EIO;
7650 goto out; 7663 goto out;
@@ -7660,7 +7673,7 @@ walk_down:
 			ret = btrfs_qgroup_record_ref(trans, root->fs_info,
 						      root->objectid,
 						      child_bytenr,
-						      child_bsize,
+						      root->nodesize,
 						      BTRFS_QGROUP_OPER_SUB_SUBTREE,
 						      0);
 			if (ret)
@@ -7811,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	}

 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
-	blocksize = btrfs_level_size(root, level - 1);
+	blocksize = root->nodesize;

-	next = btrfs_find_tree_block(root, bytenr, blocksize);
+	next = btrfs_find_tree_block(root, bytenr);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
 		if (!next)
@@ -7875,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	if (!next) {
 		if (reada && level == 1)
 			reada_walk_down(trans, root, wc, path);
-		next = read_tree_block(root, bytenr, blocksize, generation);
+		next = read_tree_block(root, bytenr, generation);
 		if (!next || !extent_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			return -EIO;
@@ -8440,13 +8453,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	if (stripped)
 		return extended_to_chunk(stripped);

-	/*
-	 * we add in the count of missing devices because we want
-	 * to make sure that any RAID levels on a degraded FS
-	 * continue to be honored.
-	 */
-	num_devices = root->fs_info->fs_devices->rw_devices +
-		root->fs_info->fs_devices->missing_devices;
+	num_devices = root->fs_info->fs_devices->rw_devices;

 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
@@ -8864,6 +8871,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	}
 	up_write(&info->commit_root_sem);

+	spin_lock(&info->unused_bgs_lock);
+	while (!list_empty(&info->unused_bgs)) {
+		block_group = list_first_entry(&info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&info->unused_bgs_lock);
+
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8998,7 +9015,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	init_rwsem(&cache->data_rwsem);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
-	INIT_LIST_HEAD(&cache->new_bg_list);
+	INIT_LIST_HEAD(&cache->bg_list);
 	btrfs_init_free_space_ctl(cache);

 	return cache;
@@ -9020,7 +9037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -9139,8 +9156,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		__link_block_group(space_info, cache);

 		set_avail_alloc_bits(root->fs_info, cache->flags);
-		if (btrfs_chunk_readonly(root, cache->key.objectid))
+		if (btrfs_chunk_readonly(root, cache->key.objectid)) {
 			set_block_group_ro(cache, 1);
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			spin_lock(&info->unused_bgs_lock);
+			/* Should always be true but just in case. */
+			if (list_empty(&cache->bg_list)) {
+				btrfs_get_block_group(cache);
+				list_add_tail(&cache->bg_list,
+					      &info->unused_bgs);
+			}
+			spin_unlock(&info->unused_bgs_lock);
+		}
 	}

 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -9181,10 +9208,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret = 0;

-	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
-				 new_bg_list) {
-		list_del_init(&block_group->new_bg_list);
-
+	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+		list_del_init(&block_group->bg_list);
 		if (ret)
 			continue;

@@ -9270,7 +9295,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,

 	__link_block_group(cache->space_info, cache);

-	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+	list_add_tail(&cache->bg_list, &trans->new_bgs);

 	set_avail_alloc_bits(extent_root->fs_info, type);

@@ -9424,8 +9449,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,

 	memcpy(&key, &block_group->key, sizeof(key));

-	btrfs_clear_space_info_full(root->fs_info);
-
 	btrfs_put_block_group(block_group);
 	btrfs_put_block_group(block_group);

@@ -9441,6 +9464,101 @@ out:
 	return ret;
 }

+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (!fs_info->open)
+		return;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	while (!list_empty(&fs_info->unused_bgs)) {
+		u64 start, end;
+
+		block_group = list_first_entry(&fs_info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		space_info = block_group->space_info;
+		list_del_init(&block_group->bg_list);
+		if (ret || btrfs_mixed_space_info(space_info)) {
+			btrfs_put_block_group(block_group);
+			continue;
+		}
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		/* Don't want to race with allocators so take the groups_sem */
+		down_write(&space_info->groups_sem);
+		spin_lock(&block_group->lock);
+		if (block_group->reserved ||
+		    btrfs_block_group_used(&block_group->item) ||
+		    block_group->ro) {
+			/*
+			 * We want to bail if we made new allocations or have
+			 * outstanding allocations in this block group.  We do
+			 * the ro check in case balance is currently acting on
+			 * this block group.
+			 */
+			spin_unlock(&block_group->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+		spin_unlock(&block_group->lock);
+
+		/* We don't want to force the issue, only flip if it's ok. */
+		ret = set_block_group_ro(block_group, 0);
+		up_write(&space_info->groups_sem);
+		if (ret < 0) {
+			ret = 0;
+			goto next;
+		}
+
+		/*
+		 * Want to do this before we do anything else so we can recover
+		 * properly if we fail to join the transaction.
+		 */
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			btrfs_set_block_group_rw(root, block_group);
+			ret = PTR_ERR(trans);
+			goto next;
+		}
+
+		/*
+		 * We could have pending pinned extents for this block group,
+		 * just delete them, we don't care about them anymore.
+		 */
+		start = block_group->key.objectid;
+		end = start + block_group->key.offset - 1;
+		clear_extent_bits(&fs_info->freed_extents[0], start, end,
+				  EXTENT_DIRTY, GFP_NOFS);
+		clear_extent_bits(&fs_info->freed_extents[1], start, end,
+				  EXTENT_DIRTY, GFP_NOFS);
+
+		/* Reset pinned so btrfs_put_block_group doesn't complain */
+		block_group->pinned = 0;
+
+		/*
+		 * Btrfs_remove_chunk will abort the transaction if things go
+		 * horribly wrong.
+		 */
+		ret = btrfs_remove_chunk(trans, root,
+					 block_group->key.objectid);
+		btrfs_end_transaction(trans, root);
+next:
+		btrfs_put_block_group(block_group);
+		spin_lock(&fs_info->unused_bgs_lock);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_space_info *space_info;
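btrfs_delete_unused_bgs() above walks a list that other contexts append to concurrently, so it claims one entry under unused_bgs_lock, drops the lock for the blocking work (the groups_sem rwsem, transaction join, chunk removal), and re-takes it before looking at the next entry. The reference taken when a group is queued in btrfs_read_block_groups() is what keeps each entry alive across the unlocked window. Reduced to the bare pattern, with names reused from the hunk and do_blocking_work() as a placeholder:

    spin_lock(&fs_info->unused_bgs_lock);
    while (!list_empty(&fs_info->unused_bgs)) {
    	block_group = list_first_entry(&fs_info->unused_bgs,
    				       struct btrfs_block_group_cache,
    				       bg_list);
    	list_del_init(&block_group->bg_list);	/* claim the entry */
    	spin_unlock(&fs_info->unused_bgs_lock);

    	do_blocking_work(block_group);		/* may sleep */

    	btrfs_put_block_group(block_group);	/* drop the queueing ref */
    	spin_lock(&fs_info->unused_bgs_lock);
    }
    spin_unlock(&fs_info->unused_bgs_lock);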
@@ -9572,7 +9690,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)

 int btrfs_start_nocow_write(struct btrfs_root *root)
 {
-	if (unlikely(atomic_read(&root->will_be_snapshoted)))
+	if (atomic_read(&root->will_be_snapshoted))
 		return 0;

 	percpu_counter_inc(&root->subv_writers->counter);
@@ -9580,7 +9698,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
 	 * Make sure counter is updated before we check for snapshot creation.
 	 */
 	smp_mb();
-	if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+	if (atomic_read(&root->will_be_snapshoted)) {
 		btrfs_end_nocow_write(root);
 		return 0;
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3e11aab9f391..bf3f424e0013 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
 static struct bio_set *btrfs_bioset;

+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+	return !RB_EMPTY_NODE(&state->rb_node);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
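The run of hunks that follows removes the extent_state->tree back-pointer entirely: membership is now derived from the rb_node itself, with RB_CLEAR_NODE() marking a node as unlinked and RB_EMPTY_NODE() testing that mark. The invariant every erase site below maintains, in two lines:

    /* whenever a state leaves the tree, re-mark its node as "empty"
     * so that extent_state_in_tree() stays accurate */
    rb_erase(&state->rb_node, &tree->state);
    RB_CLEAR_NODE(&state->rb_node);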
@@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void)

 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, leak_list);
-		printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
-		       "state %lu in tree %p refs %d\n",
-		       state->start, state->end, state->state, state->tree,
+		pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+		       state->start, state->end, state->state,
+		       extent_state_in_tree(state),
 		       atomic_read(&state->refs));
 		list_del(&state->leak_list);
 		kmem_cache_free(extent_state_cache, state);
@@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 		return state;
 	state->state = 0;
 	state->private = 0;
-	state->tree = NULL;
+	RB_CLEAR_NODE(&state->rb_node);
 	btrfs_leak_debug_add(&state->leak_list, &states);
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
@@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-		WARN_ON(state->tree);
+		WARN_ON(extent_state_in_tree(state));
 		btrfs_leak_debug_del(&state->leak_list);
 		trace_free_extent_state(state, _RET_IP_);
 		kmem_cache_free(extent_state_cache, state);
@@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			merge_cb(tree, state, other);
 			state->start = other->start;
-			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
+			RB_CLEAR_NODE(&other->rb_node);
 			free_extent_state(other);
 		}
 	}
@@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			merge_cb(tree, state, other);
 			state->end = other->end;
-			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
+			RB_CLEAR_NODE(&other->rb_node);
 			free_extent_state(other);
 		}
 	}
@@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree,
 			  found->start, found->end, start, end);
 		return -EEXIST;
 	}
-	state->tree = tree;
 	merge_state(tree, state);
 	return 0;
 }
@@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
-	prealloc->tree = tree;
 	return 0;
 }

@@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 		wake_up(&state->wq);
 	if (state->state == 0) {
 		next = next_state(state);
-		if (state->tree) {
+		if (extent_state_in_tree(state)) {
 			rb_erase(&state->rb_node, &tree->state);
-			state->tree = NULL;
+			RB_CLEAR_NODE(&state->rb_node);
 			free_extent_state(state);
 		} else {
 			WARN_ON(1);
@@ -606,8 +609,8 @@ again:
 		cached_state = NULL;
 	}

-	if (cached && cached->tree && cached->start <= start &&
-	    cached->end > start) {
+	if (cached && extent_state_in_tree(cached) &&
+	    cached->start <= start && cached->end > start) {
 		if (clear)
 			atomic_dec(&cached->refs);
 		state = cached;
@@ -843,7 +846,7 @@ again:
 	if (cached_state && *cached_state) {
 		state = *cached_state;
 		if (state->start <= start && state->end > start &&
-		    state->tree) {
+		    extent_state_in_tree(state)) {
 			node = &state->rb_node;
 			goto hit_next;
 		}
@@ -1069,7 +1072,7 @@ again:
 	if (cached_state && *cached_state) {
 		state = *cached_state;
 		if (state->start <= start && state->end > start &&
-		    state->tree) {
+		    extent_state_in_tree(state)) {
 			node = &state->rb_node;
 			goto hit_next;
 		}
@@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	spin_lock(&tree->lock);
 	if (cached_state && *cached_state) {
 		state = *cached_state;
-		if (state->end == start - 1 && state->tree) {
+		if (state->end == start - 1 && extent_state_in_tree(state)) {
 			n = rb_next(&state->rb_node);
 			while (n) {
 				state = rb_entry(n, struct extent_state,
@@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int bitset = 0;

 	spin_lock(&tree->lock);
-	if (cached && cached->tree && cached->start <= start &&
+	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
 	    cached->end > start)
 		node = &cached->rb_node;
 	else
@@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 	SetPageUptodate(page);
 }

-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
-	struct page *page;
-	u64 start;
-	u64 len;
-	u64 logical;
-	unsigned long bio_flags;
-	int this_mirror;
-	int failed_mirror;
-	int in_validation;
-};
-
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
-			   int did_repair)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 {
 	int ret;
 	int err = 0;
@@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
  * currently, there can be no more than two copies of every data bit. thus,
  * exactly one rewrite is required.
  */
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
-		      u64 length, u64 logical, struct page *page,
-		      int mirror_num)
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+		      struct page *page, unsigned int pg_offset, int mirror_num)
 {
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct bio *bio;
 	struct btrfs_device *dev;
 	u64 map_length = 0;
@@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 		return -EIO;
 	}
 	bio->bi_bdev = dev->bdev;
-	bio_add_page(bio, page, length, start - page_offset(page));
+	bio_add_page(bio, page, length, pg_offset);

 	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
 		/* try to remap that extent elsewhere? */
@@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	}

 	printk_ratelimited_in_rcu(KERN_INFO
-				  "BTRFS: read error corrected: ino %lu off %llu "
-				  "(dev %s sector %llu)\n", page->mapping->host->i_ino,
-				  start, rcu_str_deref(dev->name), sector);
-
+				  "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+				  btrfs_ino(inode), start,
+				  rcu_str_deref(dev->name), sector);
 	bio_put(bio);
 	return 0;
 }
@@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 		return -EROFS;

 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
-		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
-					start, p, mirror_num);
+		struct page *p = eb->pages[i];
+
+		ret = repair_io_failure(root->fs_info->btree_inode, start,
+					PAGE_CACHE_SIZE, start, p,
+					start - page_offset(p), mirror_num);
 		if (ret)
 			break;
 		start += PAGE_CACHE_SIZE;
@@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
  */
-static int clean_io_failure(u64 start, struct page *page)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+		     unsigned int pg_offset)
 {
 	u64 private;
 	u64 private_failure;
 	struct io_failure_record *failrec;
-	struct inode *inode = page->mapping->host;
 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct extent_state *state;
 	int num_copies;
-	int did_repair = 0;
 	int ret;

 	private = 0;
@@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page)
 		/* there was no real error, just free the record */
 		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
 			 failrec->start);
-		did_repair = 1;
 		goto out;
 	}
 	if (fs_info->sb->s_flags & MS_RDONLY)
@@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page)
 	num_copies = btrfs_num_copies(fs_info, failrec->logical,
 				      failrec->len);
 	if (num_copies > 1) {
-		ret = repair_io_failure(fs_info, start, failrec->len,
-					failrec->logical, page,
-					failrec->failed_mirror);
-		did_repair = !ret;
+		repair_io_failure(inode, start, failrec->len,
+				  failrec->logical, page,
+				  pg_offset, failrec->failed_mirror);
 	}
-	ret = 0;
 }

 out:
-	if (!ret)
-		ret = free_io_failure(inode, failrec, did_repair);
+	free_io_failure(inode, failrec);

-	return ret;
+	return 0;
 }

 /*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
  */
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+{
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct io_failure_record *failrec;
+	struct extent_state *state, *next;

-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
-			      struct page *page, u64 start, u64 end,
-			      int failed_mirror)
+	if (RB_EMPTY_ROOT(&failure_tree->state))
+		return;
+
+	spin_lock(&failure_tree->lock);
+	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
+	while (state) {
+		if (state->start > end)
+			break;
+
+		ASSERT(state->end <= end);
+
+		next = next_state(state);
+
+		failrec = (struct io_failure_record *)state->private;
+		free_extent_state(state);
+		kfree(failrec);
+
+		state = next;
+	}
+	spin_unlock(&failure_tree->lock);
+}
+
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+				struct io_failure_record **failrec_ret)
 {
-	struct io_failure_record *failrec = NULL;
+	struct io_failure_record *failrec;
 	u64 private;
 	struct extent_map *em;
-	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct bio *bio;
-	struct btrfs_io_bio *btrfs_failed_bio;
-	struct btrfs_io_bio *btrfs_bio;
-	int num_copies;
 	int ret;
-	int read_mode;
 	u64 logical;

-	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
-
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
 		if (!failrec)
 			return -ENOMEM;
+
 		failrec->start = start;
 		failrec->len = end - start + 1;
 		failrec->this_mirror = 0;
@@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 			em = NULL;
 		}
 		read_unlock(&em_tree->lock);
-
 		if (!em) {
 			kfree(failrec);
 			return -EIO;
 		}
+
 		logical = start - em->start;
 		logical = em->block_start + logical;
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
@@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 			extent_set_compress_type(&failrec->bio_flags,
 						 em->compress_type);
 		}
-		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
-			 "len=%llu\n", logical, start, failrec->len);
+
+		pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
+			 logical, start, failrec->len);
+
 		failrec->logical = logical;
 		free_extent_map(em);

@@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		}
 	} else {
 		failrec = (struct io_failure_record *)(unsigned long)private;
-		pr_debug("bio_readpage_error: (found) logical=%llu, "
-			 "start=%llu, len=%llu, validation=%d\n",
+		pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
 			 failrec->logical, failrec->start, failrec->len,
 			 failrec->in_validation);
 		/*
@@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		 * clean_io_failure() clean all those errors at once.
 		 */
 	}
+
+	*failrec_ret = failrec;
+
+	return 0;
+}
+
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+			   struct io_failure_record *failrec, int failed_mirror)
+{
+	int num_copies;
+
 	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
 				      failrec->logical, failrec->len);
 	if (num_copies == 1) {
@@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		 * all the retry and error correction code that follows. no
 		 * matter what the error is, it is very likely to persist.
 		 */
-		pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+		pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
 			 num_copies, failrec->this_mirror, failed_mirror);
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
+		return 0;
 	}

 	/*
@@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		BUG_ON(failrec->in_validation);
 		failrec->in_validation = 1;
 		failrec->this_mirror = failed_mirror;
-		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
 	} else {
 		/*
 		 * we're ready to fulfill a) and b) alongside. get a good copy
@@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		failrec->this_mirror++;
 		if (failrec->this_mirror == failed_mirror)
 			failrec->this_mirror++;
-		read_mode = READ_SYNC;
 	}

 	if (failrec->this_mirror > num_copies) {
-		pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+		pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
 			 num_copies, failrec->this_mirror, failed_mirror);
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
+		return 0;
 	}

+	return 1;
+}
+
+
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+				    struct io_failure_record *failrec,
+				    struct page *page, int pg_offset, int icsum,
+				    bio_end_io_t *endio_func, void *data)
+{
+	struct bio *bio;
+	struct btrfs_io_bio *btrfs_failed_bio;
+	struct btrfs_io_bio *btrfs_bio;
+
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-	if (!bio) {
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
-	}
-	bio->bi_end_io = failed_bio->bi_end_io;
+	if (!bio)
+		return NULL;
+
+	bio->bi_end_io = endio_func;
 	bio->bi_iter.bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_iter.bi_size = 0;
+	bio->bi_private = data;

 	btrfs_failed_bio = btrfs_io_bio(failed_bio);
 	if (btrfs_failed_bio->csum) {
@@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,

 		btrfs_bio = btrfs_io_bio(bio);
 		btrfs_bio->csum = btrfs_bio->csum_inline;
-		phy_offset >>= inode->i_sb->s_blocksize_bits;
-		phy_offset *= csum_size;
-		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+		icsum *= csum_size;
+		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
 		       csum_size);
 	}

-	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+	bio_add_page(bio, page, failrec->len, pg_offset);
+
+	return bio;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+			      struct page *page, u64 start, u64 end,
+			      int failed_mirror)
+{
+	struct io_failure_record *failrec;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct bio *bio;
+	int read_mode;
+	int ret;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+	if (ret)
+		return ret;
+
+	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
+	if (!ret) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	if (failed_bio->bi_vcnt > 1)
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	else
+		read_mode = READ_SYNC;
+
+	phy_offset >>= inode->i_sb->s_blocksize_bits;
+	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+				      start - page_offset(page),
+				      (int)phy_offset, failed_bio->bi_end_io,
+				      NULL);
+	if (!bio) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}

-	pr_debug("bio_readpage_error: submitting new read[%#x] to "
-		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
-		 failrec->this_mirror, num_copies, failrec->in_validation);
+	pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
+		 read_mode, failrec->this_mirror, failrec->in_validation);

 	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
 					 failrec->this_mirror,
 					 failrec->bio_flags, 0);
+	if (ret) {
+		free_io_failure(inode, failrec);
+		bio_put(bio);
+	}
+
 	return ret;
 }

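With this split the read-repair path becomes three reusable steps that the rewritten bio_readpage_error() composes; a caller with its own completion routine (the direct-IO path, presumably, given the helpers are no longer static) can follow the same shape. Condensed flow, with my_end_io and my_private as hypothetical caller-supplied values:

    /* 1) find or create the failure record for this byte range */
    ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
    if (ret)
    	return ret;

    /* 2) returns 1 if another mirror is worth trying, 0 otherwise */
    if (!btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror)) {
    	free_io_failure(inode, failrec);
    	return -EIO;
    }

    /* 3) build the retry bio aimed at failrec->this_mirror, then submit */
    bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
    				  pg_offset, icsum, my_end_io, my_private);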
@@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct inode *inode = page->mapping->host;

 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
-			 "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
+			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
 			 io_bio->mirror_num);
 		tree = &BTRFS_I(inode)->io_tree;

@@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			if (ret)
 				uptodate = 0;
 			else
-				clean_io_failure(start, page);
+				clean_io_failure(inode, start, page, 0);
 		}

 		if (likely(uptodate))
@@ -2532,6 +2602,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
+				offset += len;
 				continue;
 			}
 		}
@@ -2539,12 +2610,12 @@ readpage_ok:
 		if (likely(uptodate)) {
 			loff_t i_size = i_size_read(inode);
 			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-			unsigned offset;
+			unsigned off;

 			/* Zero out the end if this page straddles i_size */
-			offset = i_size & (PAGE_CACHE_SIZE-1);
-			if (page->index == end_index && offset)
-				zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+			off = i_size & (PAGE_CACHE_SIZE-1);
+			if (page->index == end_index && off)
+				zero_user_segment(page, off, PAGE_CACHE_SIZE);
 			SetPageUptodate(page);
 		} else {
 			ClearPageUptodate(page);
@@ -2617,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,

 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
-	return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
-}
+	struct btrfs_io_bio *btrfs_bio;
+	struct bio *new;

+	new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+	if (new) {
+		btrfs_bio = btrfs_io_bio(new);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
+	}
+	return new;
+}

 /* this also allocates from the btrfs_bioset */
 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
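The clone fix matters because bio_clone_bioset() only initializes the generic struct bio; the btrfs-private wrapper in front of it comes from the bioset's front_pad and is never zeroed, so csum, csum_allocated and end_io would otherwise hold stale memory that the completion path might free or call. Simplified wrapper layout, assumed from the btrfs_io_bio() accessor used above (field list abridged, not the exact upstream definition):

    struct btrfs_io_bio {
    	unsigned int mirror_num;
    	u8 *csum;		/* uninitialized after a bare clone */
    	u8 *csum_allocated;	/* ditto: must be reset explicitly */
    	btrfs_io_bio_end_io_t *end_io;
    	struct bio bio;		/* embedded last; bio_alloc returns &bio */
    };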
@@ -3500,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,

 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
+		struct page *p = eb->pages[i];

 		if (!trylock_page(p)) {
 			if (!flush) {
@@ -3521,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
 }

+static void set_btree_ioerr(struct page *page)
+{
+	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
+
+	SetPageError(page);
+	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+		return;
+
+	/*
+	 * If writeback for a btree extent that doesn't belong to a log tree
+	 * failed, increment the counter transaction->eb_write_errors.
+	 * We do this because while the transaction is running and before it's
+	 * committing (when we call filemap_fdata[write|wait]_range against
+	 * the btree inode), we might have
+	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+	 * returns an error or an error happens during writeback, when we're
+	 * committing the transaction we wouldn't know about it, since the pages
+	 * can be no longer dirty nor marked anymore for writeback (if a
+	 * subsequent modification to the extent buffer didn't happen before the
+	 * transaction commit), which makes filemap_fdata[write|wait]_range not
+	 * able to find the pages tagged with SetPageError at transaction
+	 * commit time. So if this happens we must abort the transaction,
+	 * otherwise we commit a super block with btree roots that point to
+	 * btree nodes/leafs whose content on disk is invalid - either garbage
+	 * or the content of some node/leaf from a past generation that got
+	 * cowed or deleted and is no longer valid.
+	 *
+	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+	 * not be enough - we need to distinguish between log tree extents vs
+	 * non-log tree extents, and the next filemap_fdatawait_range() call
+	 * will catch and clear such errors in the mapping - and that call might
+	 * be from a log sync and not from a transaction commit. Also, checking
+	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+	 * not done and would not be reliable - the eb might have been released
+	 * from memory and reading it back again means that flag would not be
+	 * set (since it's a runtime flag, not persisted on disk).
+	 *
+	 * Using the flags below in the btree inode also makes us achieve the
+	 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+	 * writeback for all dirty pages and before filemap_fdatawait_range()
+	 * is called, the writeback for all dirty pages had already finished
+	 * with errors - because we were not using AS_EIO/AS_ENOSPC,
+	 * filemap_fdatawait_range() would return success, as it could not know
+	 * that writeback errors happened (the pages were no longer tagged for
+	 * writeback).
+	 */
+	switch (eb->log_index) {
+	case -1:
+		set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+		break;
+	case 0:
+		set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+		break;
+	case 1:
+		set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+		break;
+	default:
+		BUG(); /* unexpected, logic error */
+	}
+}
+
 static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
 {
 	struct bio_vec *bvec;
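set_btree_ioerr() records failures in the btree inode's runtime_flags precisely because those bits, unlike page flags, survive writeback completion and page reclaim. The consumer side is not part of this hunk; presumably the transaction commit (or log sync) performs an atomic test-and-clear along these lines and aborts rather than writing a superblock that points at bad nodes:

    /* sketch of the assumed commit-time check, not shown in this diff */
    if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags))
    	ret = -EIO;	/* abort the commit; on-disk btree data is suspect */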
@@ -3534,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
 		BUG_ON(!eb);
 		done = atomic_dec_and_test(&eb->io_pages);

-		if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
-			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+		if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
 			ClearPageUptodate(page);
-			SetPageError(page);
+			set_btree_ioerr(page);
 		}

 		end_page_writeback(page);
@@ -3564,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
 	int ret = 0;

-	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	atomic_set(&eb->io_pages, num_pages);
 	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
 		bio_flags = EXTENT_BIO_TREE_LOG;

 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
+		struct page *p = eb->pages[i];

 		clear_page_dirty_for_io(p);
 		set_page_writeback(p);
@@ -3581,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 			       0, epd->bio_flags, bio_flags);
 		epd->bio_flags = bio_flags;
 		if (ret) {
-			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-			SetPageError(p);
+			set_btree_ioerr(p);
+			end_page_writeback(p);
 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
 				end_extent_buffer_writeback(eb);
 			ret = -EIO;
3595 3736
3596 if (unlikely(ret)) { 3737 if (unlikely(ret)) {
3597 for (; i < num_pages; i++) { 3738 for (; i < num_pages; i++) {
3598 struct page *p = extent_buffer_page(eb, i); 3739 struct page *p = eb->pages[i];
3740 clear_page_dirty_for_io(p);
3599 unlock_page(p); 3741 unlock_page(p);
3600 } 3742 }
3601 } 3743 }
@@ -4165,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
 	return NULL;
 }

-static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
-{
-	unsigned long cnt = *((unsigned long *)ctx);
-
-	cnt++;
-	*((unsigned long *)ctx) = cnt;
-
-	/* Now we're sure that the extent is shared. */
-	if (cnt > 1)
-		return 1;
-	return 0;
-}
-
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		   __u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -4194,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_path *path;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int end = 0;
 	u64 em_start = 0;
 	u64 em_len = 0;
@@ -4207,15 +4337,15 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		return -ENOMEM;
 	path->leave_spinning = 1;

-	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
-	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
+	start = round_down(start, BTRFS_I(inode)->root->sectorsize);
+	len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;

 	/*
 	 * lookup the last file extent. We're not using i_size here
 	 * because there might be preallocation past i_size
 	 */
-	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
-				       path, btrfs_ino(inode), -1, 0);
+	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
+				       0);
 	if (ret < 0) {
 		btrfs_free_path(path);
 		return ret;
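Rounding the start down and the end up keeps the mapped range a superset of the caller's [start, start + len); the old code aligned both values upward, which could shift the window past unaligned bytes at the front. (The max value fed to round_up() is set up outside the visible context.) For a power-of-two sectorsize the two kernel macros reduce to bit masks; a runnable userspace illustration:

    #include <stdio.h>
    #include <stdint.h>

    /* same arithmetic as the kernel's round_down()/round_up()
     * for a power-of-two alignment such as a 4096-byte sector */
    #define RD(x, a) ((x) & ~((uint64_t)(a) - 1))
    #define RU(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
    	uint64_t start = 5000, len = 100, sectorsize = 4096;

    	/* new scheme covers bytes 5000..5099: prints [4096, 8192) */
    	printf("[%llu, %llu)\n",
    	       (unsigned long long)RD(start, sectorsize),
    	       (unsigned long long)RU(start + len, sectorsize));
    	/* old ALIGN(start) would have started at 8192, past the data */
    	return 0;
    }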
@@ -4223,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	WARN_ON(!ret);
 	path->slots[0]--;
 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
-	found_type = btrfs_key_type(&found_key);
+	found_type = found_key.type;

 	/* No extents, but there might be delalloc bits */
 	if (found_key.objectid != btrfs_ino(inode) ||
@@ -4308,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
-		} else {
-			unsigned long ref_cnt = 0;
+		} else if (fieinfo->fi_extents_max) {
+			u64 bytenr = em->block_start -
+				(em->start - em->orig_start);

 			disko = em->block_start + offset_in_extent;

 			/*
 			 * As btrfs supports shared space, this information
 			 * can be exported to userspace tools via
-			 * flag FIEMAP_EXTENT_SHARED.
+			 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
+			 * then we're just getting a count and we can skip the
+			 * lookup stuff.
 			 */
-			ret = iterate_inodes_from_logical(
-					em->block_start,
-					BTRFS_I(inode)->root->fs_info,
-					path, count_ext_ref, &ref_cnt);
-			if (ret < 0 && ret != -ENOENT)
+			ret = btrfs_check_shared(NULL, root->fs_info,
+						 root->objectid,
+						 btrfs_ino(inode), bytenr);
+			if (ret < 0)
 				goto out_free;
-
-			if (ref_cnt > 1)
+			if (ret)
 				flags |= FIEMAP_EXTENT_SHARED;
+			ret = 0;
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
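fi_extents_max is zero on the first half of the standard two-pass FIEMAP idiom, where userspace only asks how many extents exist before allocating the result array; skipping the shared-extent lookup there avoids paying for backref walks on a pure counting call. A runnable sketch of that idiom against the regular ioctl interface:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>
    #include <linux/fiemap.h>

    int main(int argc, char **argv)
    {
    	struct fiemap probe, *fm;
    	int fd;

    	if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
    		return 1;

    	/* pass 1: fm_extent_count == 0 maps to fi_extents_max == 0
    	 * in the kernel, so only the extent count comes back */
    	memset(&probe, 0, sizeof(probe));
    	probe.fm_length = ~0ULL;
    	if (ioctl(fd, FS_IOC_FIEMAP, &probe) < 0)
    		return 1;

    	/* pass 2: fetch the extents, FIEMAP_EXTENT_SHARED included */
    	fm = calloc(1, sizeof(*fm) +
    		    probe.fm_mapped_extents * sizeof(struct fiemap_extent));
    	if (!fm)
    		return 1;
    	fm->fm_length = ~0ULL;
    	fm->fm_extent_count = probe.fm_mapped_extents;
    	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
    		return 1;
    	printf("%u extents\n", fm->fm_mapped_extents);
    	return 0;
    }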
@@ -4380,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb)
 /*
  * Helper for releasing extent buffer page.
  */
-static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
-					     unsigned long start_idx)
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
 {
 	unsigned long index;
-	unsigned long num_pages;
 	struct page *page;
 	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);

 	BUG_ON(extent_buffer_under_io(eb));

-	num_pages = num_extent_pages(eb->start, eb->len);
-	index = start_idx + num_pages;
-	if (start_idx >= index)
+	index = num_extent_pages(eb->start, eb->len);
+	if (index == 0)
 		return;

 	do {
 		index--;
-		page = extent_buffer_page(eb, index);
+		page = eb->pages[index];
 		if (page && mapped) {
 			spin_lock(&page->mapping->private_lock);
 			/*
@@ -4428,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
 			/* One for when we alloced the page */
 			page_cache_release(page);
 		}
-	} while (index != start_idx);
+	} while (index != 0);
 }

 /*
@@ -4436,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
  */
 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 {
-	btrfs_release_extent_buffer_page(eb, 0);
+	btrfs_release_extent_buffer_page(eb);
 	__free_extent_buffer(eb);
 }

@@ -4579,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,

 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
+		struct page *p = eb->pages[i];
+
 		if (p != accessed)
 			mark_page_accessed(p);
 	}
@@ -4748,7 +4878,7 @@ again:
 	 */
 	SetPageChecked(eb->pages[0]);
 	for (i = 1; i < num_pages; i++) {
-		p = extent_buffer_page(eb, i);
+		p = eb->pages[i];
 		ClearPageChecked(p);
 		unlock_page(p);
 	}
@@ -4793,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
 	}

 	/* Should be safe to release our pages at this point */
-	btrfs_release_extent_buffer_page(eb, 0);
+	btrfs_release_extent_buffer_page(eb);
 	call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
 	return 1;
 }
@@ -4859,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
 	num_pages = num_extent_pages(eb->start, eb->len);

 	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		if (!PageDirty(page))
 			continue;

@@ -4895,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));

 	for (i = 0; i < num_pages; i++)
-		set_page_dirty(extent_buffer_page(eb, i));
+		set_page_dirty(eb->pages[i]);
 	return was_dirty;
 }

@@ -4908,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		if (page)
 			ClearPageUptodate(page);
 	}
@@ -4924,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		SetPageUptodate(page);
 	}
 	return 0;
@@ -4964,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,

 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = start_i; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		if (wait == WAIT_NONE) {
 			if (!trylock_page(page))
 				goto unlock_exit;
@@ -4983,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			goto unlock_exit;
 		}

-	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 	eb->read_mirror = 0;
 	atomic_set(&eb->io_pages, num_reads);
 	for (i = start_i; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
+		page = eb->pages[i];
 		if (!PageUptodate(page)) {
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
@@ -5012,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5012 return ret; 5142 return ret;
5013 5143
5014 for (i = start_i; i < num_pages; i++) { 5144 for (i = start_i; i < num_pages; i++) {
5015 page = extent_buffer_page(eb, i); 5145 page = eb->pages[i];
5016 wait_on_page_locked(page); 5146 wait_on_page_locked(page);
5017 if (!PageUptodate(page)) 5147 if (!PageUptodate(page))
5018 ret = -EIO; 5148 ret = -EIO;
@@ -5023,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5023unlock_exit: 5153unlock_exit:
5024 i = start_i; 5154 i = start_i;
5025 while (locked_pages > 0) { 5155 while (locked_pages > 0) {
5026 page = extent_buffer_page(eb, i); 5156 page = eb->pages[i];
5027 i++; 5157 i++;
5028 unlock_page(page); 5158 unlock_page(page);
5029 locked_pages--; 5159 locked_pages--;
@@ -5049,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
5049 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5179 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5050 5180
5051 while (len > 0) { 5181 while (len > 0) {
5052 page = extent_buffer_page(eb, i); 5182 page = eb->pages[i];
5053 5183
5054 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5184 cur = min(len, (PAGE_CACHE_SIZE - offset));
5055 kaddr = page_address(page); 5185 kaddr = page_address(page);
@@ -5081,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
5081 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5211 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5082 5212
5083 while (len > 0) { 5213 while (len > 0) {
5084 page = extent_buffer_page(eb, i); 5214 page = eb->pages[i];
5085 5215
5086 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5216 cur = min(len, (PAGE_CACHE_SIZE - offset));
5087 kaddr = page_address(page); 5217 kaddr = page_address(page);
@@ -5130,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
5130 return -EINVAL; 5260 return -EINVAL;
5131 } 5261 }
5132 5262
5133 p = extent_buffer_page(eb, i); 5263 p = eb->pages[i];
5134 kaddr = page_address(p); 5264 kaddr = page_address(p);
5135 *map = kaddr + offset; 5265 *map = kaddr + offset;
5136 *map_len = PAGE_CACHE_SIZE - offset; 5266 *map_len = PAGE_CACHE_SIZE - offset;
@@ -5156,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
5156 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5286 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5157 5287
5158 while (len > 0) { 5288 while (len > 0) {
5159 page = extent_buffer_page(eb, i); 5289 page = eb->pages[i];
5160 5290
5161 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5291 cur = min(len, (PAGE_CACHE_SIZE - offset));
5162 5292
@@ -5190,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5190 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5320 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5191 5321
5192 while (len > 0) { 5322 while (len > 0) {
5193 page = extent_buffer_page(eb, i); 5323 page = eb->pages[i];
5194 WARN_ON(!PageUptodate(page)); 5324 WARN_ON(!PageUptodate(page));
5195 5325
5196 cur = min(len, PAGE_CACHE_SIZE - offset); 5326 cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5220,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
5220 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5350 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5221 5351
5222 while (len > 0) { 5352 while (len > 0) {
5223 page = extent_buffer_page(eb, i); 5353 page = eb->pages[i];
5224 WARN_ON(!PageUptodate(page)); 5354 WARN_ON(!PageUptodate(page));
5225 5355
5226 cur = min(len, PAGE_CACHE_SIZE - offset); 5356 cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5251,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5251 (PAGE_CACHE_SIZE - 1); 5381 (PAGE_CACHE_SIZE - 1);
5252 5382
5253 while (len > 0) { 5383 while (len > 0) {
5254 page = extent_buffer_page(dst, i); 5384 page = dst->pages[i];
5255 WARN_ON(!PageUptodate(page)); 5385 WARN_ON(!PageUptodate(page));
5256 5386
5257 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 5387 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
@@ -5329,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5329 cur = min_t(unsigned long, cur, 5459 cur = min_t(unsigned long, cur,
5330 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 5460 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
5331 5461
5332 copy_pages(extent_buffer_page(dst, dst_i), 5462 copy_pages(dst->pages[dst_i], dst->pages[src_i],
5333 extent_buffer_page(dst, src_i),
5334 dst_off_in_page, src_off_in_page, cur); 5463 dst_off_in_page, src_off_in_page, cur);
5335 5464
5336 src_offset += cur; 5465 src_offset += cur;
@@ -5376,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5376 5505
5377 cur = min_t(unsigned long, len, src_off_in_page + 1); 5506 cur = min_t(unsigned long, len, src_off_in_page + 1);
5378 cur = min(cur, dst_off_in_page + 1); 5507 cur = min(cur, dst_off_in_page + 1);
5379 copy_pages(extent_buffer_page(dst, dst_i), 5508 copy_pages(dst->pages[dst_i], dst->pages[src_i],
5380 extent_buffer_page(dst, src_i),
5381 dst_off_in_page - cur + 1, 5509 dst_off_in_page - cur + 1,
5382 src_off_in_page - cur + 1, cur); 5510 src_off_in_page - cur + 1, cur);
5383 5511
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ccc264e7bde1..6d4b938be986 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -11,8 +11,6 @@
11#define EXTENT_NEW (1 << 4) 11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5) 12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6) 13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_BOUNDARY (1 << 9) 14#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 15#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 16#define EXTENT_DO_ACCOUNTING (1 << 11)
@@ -34,16 +32,16 @@
34 32
35/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
36#define EXTENT_BUFFER_UPTODATE 0 34#define EXTENT_BUFFER_UPTODATE 0
37#define EXTENT_BUFFER_BLOCKING 1
38#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
39#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
40#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ 37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
41#define EXTENT_BUFFER_TREE_REF 5 38#define EXTENT_BUFFER_TREE_REF 5
42#define EXTENT_BUFFER_STALE 6 39#define EXTENT_BUFFER_STALE 6
43#define EXTENT_BUFFER_WRITEBACK 7 40#define EXTENT_BUFFER_WRITEBACK 7
44#define EXTENT_BUFFER_IOERR 8 41#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
45#define EXTENT_BUFFER_DUMMY 9 42#define EXTENT_BUFFER_DUMMY 9
46#define EXTENT_BUFFER_IN_TREE 10 43#define EXTENT_BUFFER_IN_TREE 10
44#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
47 45
48/* these are flags for extent_clear_unlock_delalloc */ 46/* these are flags for extent_clear_unlock_delalloc */
49#define PAGE_UNLOCK (1 << 0) 47#define PAGE_UNLOCK (1 << 0)
@@ -57,7 +55,6 @@
57 * map has page->private set to one. 55 * map has page->private set to one.
58 */ 56 */
59#define EXTENT_PAGE_PRIVATE 1 57#define EXTENT_PAGE_PRIVATE 1
60#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
61 58
62struct extent_state; 59struct extent_state;
63struct btrfs_root; 60struct btrfs_root;
@@ -108,7 +105,6 @@ struct extent_state {
108 struct rb_node rb_node; 105 struct rb_node rb_node;
109 106
110 /* ADD NEW ELEMENTS AFTER THIS */ 107 /* ADD NEW ELEMENTS AFTER THIS */
111 struct extent_io_tree *tree;
112 wait_queue_head_t wq; 108 wait_queue_head_t wq;
113 atomic_t refs; 109 atomic_t refs;
114 unsigned long state; 110 unsigned long state;
@@ -126,8 +122,6 @@ struct extent_state {
126struct extent_buffer { 122struct extent_buffer {
127 u64 start; 123 u64 start;
128 unsigned long len; 124 unsigned long len;
129 unsigned long map_start;
130 unsigned long map_len;
131 unsigned long bflags; 125 unsigned long bflags;
132 struct btrfs_fs_info *fs_info; 126 struct btrfs_fs_info *fs_info;
133 spinlock_t refs_lock; 127 spinlock_t refs_lock;
@@ -144,7 +138,9 @@ struct extent_buffer {
144 atomic_t blocking_readers; 138 atomic_t blocking_readers;
145 atomic_t spinning_readers; 139 atomic_t spinning_readers;
146 atomic_t spinning_writers; 140 atomic_t spinning_writers;
147 int lock_nested; 141 short lock_nested;
142 /* >= 0 if eb belongs to a log tree, -1 otherwise */
143 short log_index;
148 144
149 /* protects write locks */ 145 /* protects write locks */
150 rwlock_t lock; 146 rwlock_t lock;
@@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len)
286 (start >> PAGE_CACHE_SHIFT); 282 (start >> PAGE_CACHE_SHIFT);
287} 283}
288 284
289static inline struct page *extent_buffer_page(struct extent_buffer *eb,
290 unsigned long i)
291{
292 return eb->pages[i];
293}
294
295static inline void extent_buffer_get(struct extent_buffer *eb) 285static inline void extent_buffer_get(struct extent_buffer *eb)
296{ 286{
297 atomic_inc(&eb->refs); 287 atomic_inc(&eb->refs);
@@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
341 331
342struct btrfs_fs_info; 332struct btrfs_fs_info;
343 333
344int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 334int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
345 u64 length, u64 logical, struct page *page, 335 struct page *page, unsigned int pg_offset,
346 int mirror_num); 336 int mirror_num);
337int clean_io_failure(struct inode *inode, u64 start, struct page *page,
338 unsigned int pg_offset);
347int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 339int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
348int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 340int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
349 int mirror_num); 341 int mirror_num);
342
343/*
344 * When IO fails, either with EIO or a csum verification failure, we
345 * try other mirrors that might have a good copy of the data. This
346 * io_failure_record is used to record state as we go through all the
347 * mirrors. If another mirror has good data, the page is set up to date
348 * and things continue. If a good mirror can't be found, the original
349 * bio end_io callback is called to indicate things have failed.
350 */
351struct io_failure_record {
352 struct page *page;
353 u64 start;
354 u64 len;
355 u64 logical;
356 unsigned long bio_flags;
357 int this_mirror;
358 int failed_mirror;
359 int in_validation;
360};
361
362void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
363int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
364 struct io_failure_record **failrec_ret);
365int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
366 struct io_failure_record *failrec, int fail_mirror);
367struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
368 struct io_failure_record *failrec,
369 struct page *page, int pg_offset, int icsum,
370 bio_end_io_t *endio_func, void *data);
371int free_io_failure(struct inode *inode, struct io_failure_record *rec);
350#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 372#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
351noinline u64 find_lock_delalloc_range(struct inode *inode, 373noinline u64 find_lock_delalloc_range(struct inode *inode,
352 struct extent_io_tree *tree, 374 struct extent_io_tree *tree,
353 struct page *locked_page, u64 *start, 375 struct page *locked_page, u64 *start,
354 u64 *end, u64 max_bytes); 376 u64 *end, u64 max_bytes);
377#endif
355struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 378struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
356 u64 start, unsigned long len); 379 u64 start, unsigned long len);
357#endif 380#endif
358#endif
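The io failure helpers exported in this hunk encode the retry flow the comment above describes. A minimal sketch of a caller, hypothetical code rather than anything in this patch, with error handling trimmed:

	/* Sketch only: retry a failed read on another mirror. */
	static int try_repair_read(struct inode *inode, struct bio *failed_bio,
				   struct page *page, u64 start, u64 end,
				   int failed_mirror, bio_end_io_t *endio)
	{
		struct io_failure_record *failrec;
		struct bio *bio;
		int ret;

		ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
		if (ret)
			return ret;

		if (!btrfs_check_repairable(inode, failed_bio, failrec,
					    failed_mirror)) {
			free_io_failure(inode, failrec);
			return -EIO;
		}

		bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
					      0 /* pg_offset */, 0 /* icsum */,
					      endio, NULL);
		if (!bio) {
			free_io_failure(inode, failrec);
			return -EIO;
		}
		/* submit bio against failrec->this_mirror and return */
		return 0;
	}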
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54c84daec9b5..783a94355efd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
55 return -ENOMEM; 55 return -ENOMEM;
56 file_key.objectid = objectid; 56 file_key.objectid = objectid;
57 file_key.offset = pos; 57 file_key.offset = pos;
58 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 58 file_key.type = BTRFS_EXTENT_DATA_KEY;
59 59
60 path->leave_spinning = 1; 60 path->leave_spinning = 1;
61 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 61 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
@@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
100 100
101 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 101 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
102 file_key.offset = bytenr; 102 file_key.offset = bytenr;
103 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 103 file_key.type = BTRFS_EXTENT_CSUM_KEY;
104 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); 104 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
105 if (ret < 0) 105 if (ret < 0)
106 goto fail; 106 goto fail;
@@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
111 goto fail; 111 goto fail;
112 path->slots[0]--; 112 path->slots[0]--;
113 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 113 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
114 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) 114 if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
115 goto fail; 115 goto fail;
116 116
117 csum_offset = (bytenr - found_key.offset) >> 117 csum_offset = (bytenr - found_key.offset) >>
@@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148 148
149 file_key.objectid = objectid; 149 file_key.objectid = objectid;
150 file_key.offset = offset; 150 file_key.offset = offset;
151 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 151 file_key.type = BTRFS_EXTENT_DATA_KEY;
152 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); 152 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
153 return ret; 153 return ret;
154} 154}
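The btrfs_set_key_type()/btrfs_key_type() calls removed in this file (and in file.c, inode-item.c and inode.c below) were trivial wrappers around the CPU-order key's type field; from memory they looked roughly like this, so open-coding the assignment loses nothing:

	static inline void btrfs_set_key_type(struct btrfs_key *key, u32 val)
	{
		key->type = (u8)val;
	}

	static inline u32 btrfs_key_type(struct btrfs_key *key)
	{
		return key->type;
	}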
@@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
299} 299}
300 300
301int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 301int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
302 struct btrfs_dio_private *dip, struct bio *bio, 302 struct bio *bio, u64 offset)
303 u64 offset)
304{ 303{
305 int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; 304 return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
306 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
307 int ret;
308
309 len >>= inode->i_sb->s_blocksize_bits;
310 len *= csum_size;
311
312 ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
313 (u32 *)(dip->csum + len), 1);
314 return ret;
315} 305}
316 306
317int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 307int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
329 u64 csum_end; 319 u64 csum_end;
330 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 320 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
331 321
332 ASSERT(start == ALIGN(start, root->sectorsize) && 322 ASSERT(IS_ALIGNED(start, root->sectorsize) &&
333 (end + 1) == ALIGN(end + 1, root->sectorsize)); 323 IS_ALIGNED(end + 1, root->sectorsize));
334 324
335 path = btrfs_alloc_path(); 325 path = btrfs_alloc_path();
336 if (!path) 326 if (!path)
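IS_ALIGNED() is the generic macro from include/linux/kernel.h and, for a power-of-two sectorsize, is equivalent to the ALIGN() comparison it replaces:

	#define IS_ALIGNED(x, a)	(((x) & ((typeof(x))(a) - 1)) == 0)

	/* start == ALIGN(start, sectorsize)  <=>  IS_ALIGNED(start, sectorsize) */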
@@ -720,7 +710,7 @@ again:
720 bytenr = sums->bytenr + total_bytes; 710 bytenr = sums->bytenr + total_bytes;
721 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 711 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
722 file_key.offset = bytenr; 712 file_key.offset = bytenr;
723 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 713 file_key.type = BTRFS_EXTENT_CSUM_KEY;
724 714
725 item = btrfs_lookup_csum(trans, root, path, bytenr, 1); 715 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
726 if (!IS_ERR(item)) { 716 if (!IS_ERR(item)) {
@@ -790,7 +780,7 @@ again:
790 csum_offset = (bytenr - found_key.offset) >> 780 csum_offset = (bytenr - found_key.offset) >>
791 root->fs_info->sb->s_blocksize_bits; 781 root->fs_info->sb->s_blocksize_bits;
792 782
793 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || 783 if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
794 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || 784 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
795 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { 785 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
796 goto insert; 786 goto insert;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d3afac292d67..a18ceabd99a8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
299 299
300 /* get the inode */ 300 /* get the inode */
301 key.objectid = defrag->root; 301 key.objectid = defrag->root;
302 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 302 key.type = BTRFS_ROOT_ITEM_KEY;
303 key.offset = (u64)-1; 303 key.offset = (u64)-1;
304 304
305 index = srcu_read_lock(&fs_info->subvol_srcu); 305 index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
311 } 311 }
312 312
313 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 key.type = BTRFS_INODE_ITEM_KEY;
315 key.offset = 0; 315 key.offset = 0;
316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
317 if (IS_ERR(inode)) { 317 if (IS_ERR(inode)) {
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
452 if (unlikely(copied == 0)) 452 if (unlikely(copied == 0))
453 break; 453 break;
454 454
455 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 455 if (copied < PAGE_CACHE_SIZE - offset) {
456 offset += copied; 456 offset += copied;
457 } else { 457 } else {
458 pg++; 458 pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1481 bool force_page_uptodate = false; 1481 bool force_page_uptodate = false;
1482 bool need_unlock; 1482 bool need_unlock;
1483 1483
1484 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1484 nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
1485 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1485 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1486 (sizeof(struct page *)));
1487 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1486 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1488 nrptrs = max(nrptrs, 8); 1487 nrptrs = max(nrptrs, 8);
1489 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1488 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1497 size_t write_bytes = min(iov_iter_count(i), 1496 size_t write_bytes = min(iov_iter_count(i),
1498 nrptrs * (size_t)PAGE_CACHE_SIZE - 1497 nrptrs * (size_t)PAGE_CACHE_SIZE -
1499 offset); 1498 offset);
1500 size_t num_pages = (write_bytes + offset + 1499 size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1501 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1500 PAGE_CACHE_SIZE);
1502 size_t reserve_bytes; 1501 size_t reserve_bytes;
1503 size_t dirty_pages; 1502 size_t dirty_pages;
1504 size_t copied; 1503 size_t copied;
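The DIV_ROUND_UP() conversions here are mechanical. The macro comes from include/linux/kernel.h:

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

	/*
	 * The old open-coded form
	 *	(write_bytes + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT
	 * equals
	 *	DIV_ROUND_UP(write_bytes + offset, PAGE_CACHE_SIZE)
	 * because PAGE_CACHE_SIZE == 1UL << PAGE_CACHE_SHIFT.
	 */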
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1526 * our prealloc extent may be smaller than 1525 * our prealloc extent may be smaller than
1527 * write_bytes, so scale down. 1526 * write_bytes, so scale down.
1528 */ 1527 */
1529 num_pages = (write_bytes + offset + 1528 num_pages = DIV_ROUND_UP(write_bytes + offset,
1530 PAGE_CACHE_SIZE - 1) >> 1529 PAGE_CACHE_SIZE);
1531 PAGE_CACHE_SHIFT;
1532 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1530 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1533 ret = 0; 1531 ret = 0;
1534 } else { 1532 } else {
@@ -1590,9 +1588,8 @@ again:
1590 dirty_pages = 0; 1588 dirty_pages = 0;
1591 } else { 1589 } else {
1592 force_page_uptodate = false; 1590 force_page_uptodate = false;
1593 dirty_pages = (copied + offset + 1591 dirty_pages = DIV_ROUND_UP(copied + offset,
1594 PAGE_CACHE_SIZE - 1) >> 1592 PAGE_CACHE_SIZE);
1595 PAGE_CACHE_SHIFT;
1596 } 1593 }
1597 1594
1598 /* 1595 /*
@@ -1653,7 +1650,7 @@ again:
1653 cond_resched(); 1650 cond_resched();
1654 1651
1655 balance_dirty_pages_ratelimited(inode->i_mapping); 1652 balance_dirty_pages_ratelimited(inode->i_mapping);
1656 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1653 if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
1657 btrfs_btree_balance_dirty(root); 1654 btrfs_btree_balance_dirty(root);
1658 1655
1659 pos += copied; 1656 pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1795 if (sync) 1792 if (sync)
1796 atomic_inc(&BTRFS_I(inode)->sync_writers); 1793 atomic_inc(&BTRFS_I(inode)->sync_writers);
1797 1794
1798 if (unlikely(file->f_flags & O_DIRECT)) { 1795 if (file->f_flags & O_DIRECT) {
1799 num_written = __btrfs_direct_write(iocb, from, pos); 1796 num_written = __btrfs_direct_write(iocb, from, pos);
1800 } else { 1797 } else {
1801 num_written = __btrfs_buffered_write(file, from, pos); 1798 num_written = __btrfs_buffered_write(file, from, pos);
@@ -1840,10 +1837,32 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1840{ 1837{
1841 if (filp->private_data) 1838 if (filp->private_data)
1842 btrfs_ioctl_trans_end(filp); 1839 btrfs_ioctl_trans_end(filp);
1843 filemap_flush(inode->i_mapping); 1840 /*
1841 * ordered_data_close is set by setattr when we are about to truncate
1842 * a file from a non-zero size to a zero size. This tries to
1843 * flush down new bytes that may have been written if the
1844 * application were using truncate to replace a file in place.
1845 */
1846 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1847 &BTRFS_I(inode)->runtime_flags))
1848 filemap_flush(inode->i_mapping);
1844 return 0; 1849 return 0;
1845} 1850}
1846 1851
1852static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1853{
1854 int ret;
1855
1856 atomic_inc(&BTRFS_I(inode)->sync_writers);
1857 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1858 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1859 &BTRFS_I(inode)->runtime_flags))
1860 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1861 atomic_dec(&BTRFS_I(inode)->sync_writers);
1862
1863 return ret;
1864}
1865
1847/* 1866/*
1848 * fsync call for both files and directories. This logs the inode into 1867 * fsync call for both files and directories. This logs the inode into
1849 * the tree log instead of forcing full commits whenever possible. 1868 * the tree log instead of forcing full commits whenever possible.
@@ -1873,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1873 * multiple tasks and improve the performance. See 1892 * multiple tasks and improve the performance. See
1874 * btrfs_wait_ordered_range for an explanation of the ASYNC check. 1893 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1875 */ 1894 */
1876 atomic_inc(&BTRFS_I(inode)->sync_writers); 1895 ret = start_ordered_ops(inode, start, end);
1877 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1878 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1879 &BTRFS_I(inode)->runtime_flags))
1880 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1881 atomic_dec(&BTRFS_I(inode)->sync_writers);
1882 if (ret) 1896 if (ret)
1883 return ret; 1897 return ret;
1884 1898
1885 mutex_lock(&inode->i_mutex); 1899 mutex_lock(&inode->i_mutex);
1886
1887 /*
1888 * We flush the dirty pages again to avoid some dirty pages in the
1889 * range being left.
1890 */
1891 atomic_inc(&root->log_batch); 1900 atomic_inc(&root->log_batch);
1892 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1901 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1893 &BTRFS_I(inode)->runtime_flags); 1902 &BTRFS_I(inode)->runtime_flags);
1903 /*
1904 * We might have had more pages made dirty after calling
1905 * start_ordered_ops and before acquiring the inode's i_mutex.
1906 */
1894 if (full_sync) { 1907 if (full_sync) {
1908 /*
1909 * For a full sync, we need to make sure any ordered operations
1910 * start and finish before we start logging the inode, so that
1911 * all extents are persisted and the respective file extent
1912 * items are in the fs/subvol btree.
1913 */
1895 ret = btrfs_wait_ordered_range(inode, start, end - start + 1); 1914 ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
1896 if (ret) { 1915 } else {
1897 mutex_unlock(&inode->i_mutex); 1916 /*
1898 goto out; 1917 * Start any new ordered operations before starting to log the
1899 } 1918 * inode. We will wait for them to finish in btrfs_sync_log().
1919 *
1920 * Right before acquiring the inode's mutex, we might have new
1921 * writes dirtying pages, which won't immediately start the
1922 * respective ordered operations - that is done through the
1923 * fill_delalloc callbacks invoked from the writepage and
1924 * writepages address space operations. So make sure we start
1925 * all ordered operations before starting to log our inode. Not
1926 * doing this means that while logging the inode, writeback
1927 * could start and invoke writepage/writepages, which would call
1928 * the fill_delalloc callbacks (cow_file_range,
1929 * submit_compressed_extents). These callbacks first add an
1930 * extent map to the modified list of extents and then create
1931 * the respective ordered operation, which means in
1932 * tree-log.c:btrfs_log_inode() we might capture all existing
1933 * ordered operations (with btrfs_get_logged_extents()) before
1934 * the fill_delalloc callback adds its ordered operation, and by
1935 * the time we visit the modified list of extent maps (with
1936 * btrfs_log_changed_extents()), we see and process the extent
1937 * map they created. We then use the extent map to construct a
1938 * file extent item for logging without waiting for the
1939 * respective ordered operation to finish - this file extent
1940 * item points to a disk location that might not have yet been
1941 * written to, containing random data - so after a crash a log
1942 * replay will make our inode have file extent items that point
1943 * to disk locations containing invalid data, as we returned
1944 * success to userspace without waiting for the respective
1945 * ordered operation to finish, because it wasn't captured by
1946 * btrfs_get_logged_extents().
1947 */
1948 ret = start_ordered_ops(inode, start, end);
1949 }
1950 if (ret) {
1951 mutex_unlock(&inode->i_mutex);
1952 goto out;
1900 } 1953 }
1901 atomic_inc(&root->log_batch); 1954 atomic_inc(&root->log_batch);
1902 1955
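Condensed, the ordering this hunk establishes: flush delalloc, take i_mutex, then either wait for ordered extents to complete (full sync) or only make sure they have started (fast sync) before logging. An illustrative reduction of the control flow, not the literal function body:

	ret = start_ordered_ops(inode, start, end);	/* kick off delalloc */
	if (ret)
		return ret;

	mutex_lock(&inode->i_mutex);
	atomic_inc(&root->log_batch);
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags);
	if (full_sync)
		/* extents and file extent items must reach the subvol tree */
		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
	else
		/* started is enough; btrfs_sync_log() waits for them later */
		ret = start_ordered_ops(inode, start, end);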
@@ -1958,7 +2011,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1958 2011
1959 btrfs_init_log_ctx(&ctx); 2012 btrfs_init_log_ctx(&ctx);
1960 2013
1961 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); 2014 ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
1962 if (ret < 0) { 2015 if (ret < 0) {
1963 /* Fallthrough and commit/free transaction. */ 2016 /* Fallthrough and commit/free transaction. */
1964 ret = 1; 2017 ret = 1;
@@ -1976,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1976 */ 2029 */
1977 mutex_unlock(&inode->i_mutex); 2030 mutex_unlock(&inode->i_mutex);
1978 2031
2032 /*
2033 * If any of the ordered extents had an error, just return it to user
2034 * space, so that the application knows some writes didn't succeed and
2035 * can take proper action (e.g. retry). Blindly committing the
2036 * transaction in this case would fool userspace into thinking everything was
2037 * successful. And we also want to make sure our log doesn't contain
2038 * file extent items pointing to extents that weren't fully written to -
2039 * just like in the non fast fsync path, where we check for the ordered
2040 * operation's error flag before writing to the log tree and return -EIO
2041 * if any of them had this flag set (btrfs_wait_ordered_range) -
2042 * therefore we need to check for errors in the ordered operations,
2043 * which are indicated by ctx.io_err.
2044 */
2045 if (ctx.io_err) {
2046 btrfs_end_transaction(trans, root);
2047 ret = ctx.io_err;
2048 goto out;
2049 }
2050
1979 if (ret != BTRFS_NO_LOG_SYNC) { 2051 if (ret != BTRFS_NO_LOG_SYNC) {
1980 if (!ret) { 2052 if (!ret) {
1981 ret = btrfs_sync_log(trans, root, &ctx); 2053 ret = btrfs_sync_log(trans, root, &ctx);
@@ -2088,10 +2160,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2088 goto out; 2160 goto out;
2089 } 2161 }
2090 2162
2091 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 2163 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2092 u64 num_bytes; 2164 u64 num_bytes;
2093 2165
2094 path->slots[0]++;
2095 key.offset = offset; 2166 key.offset = offset;
2096 btrfs_set_item_key_safe(root, path, &key); 2167 btrfs_set_item_key_safe(root, path, &key);
2097 fi = btrfs_item_ptr(leaf, path->slots[0], 2168 fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -2216,7 +2287,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2216 goto out_only_mutex; 2287 goto out_only_mutex;
2217 } 2288 }
2218 2289
2219 lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); 2290 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2220 lockend = round_down(offset + len, 2291 lockend = round_down(offset + len,
2221 BTRFS_I(inode)->root->sectorsize) - 1; 2292 BTRFS_I(inode)->root->sectorsize) - 1;
2222 same_page = ((offset >> PAGE_CACHE_SHIFT) == 2293 same_page = ((offset >> PAGE_CACHE_SHIFT) ==
@@ -2277,7 +2348,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2277 tail_start + tail_len, 0, 1); 2348 tail_start + tail_len, 0, 1);
2278 if (ret) 2349 if (ret)
2279 goto out_only_mutex; 2350 goto out_only_mutex;
2280 } 2351 }
2281 } 2352 }
2282 } 2353 }
2283 2354
@@ -2614,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2614 struct btrfs_root *root = BTRFS_I(inode)->root; 2685 struct btrfs_root *root = BTRFS_I(inode)->root;
2615 struct extent_map *em = NULL; 2686 struct extent_map *em = NULL;
2616 struct extent_state *cached_state = NULL; 2687 struct extent_state *cached_state = NULL;
2617 u64 lockstart = *offset; 2688 u64 lockstart;
2618 u64 lockend = i_size_read(inode); 2689 u64 lockend;
2619 u64 start = *offset; 2690 u64 start;
2620 u64 len = i_size_read(inode); 2691 u64 len;
2621 int ret = 0; 2692 int ret = 0;
2622 2693
2623 lockend = max_t(u64, root->sectorsize, lockend); 2694 if (inode->i_size == 0)
2695 return -ENXIO;
2696
2697 /*
2698 * *offset can be negative; in this case we start finding DATA/HOLE from
2699 * the very start of the file.
2700 */
2701 start = max_t(loff_t, 0, *offset);
2702
2703 lockstart = round_down(start, root->sectorsize);
2704 lockend = round_up(i_size_read(inode), root->sectorsize);
2624 if (lockend <= lockstart) 2705 if (lockend <= lockstart)
2625 lockend = lockstart + root->sectorsize; 2706 lockend = lockstart + root->sectorsize;
2626
2627 lockend--; 2707 lockend--;
2628 len = lockend - lockstart + 1; 2708 len = lockend - lockstart + 1;
2629 2709
2630 len = max_t(u64, len, root->sectorsize);
2631 if (inode->i_size == 0)
2632 return -ENXIO;
2633
2634 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, 2710 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2635 &cached_state); 2711 &cached_state);
2636 2712
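round_down()/round_up() are the standard power-of-two helpers from include/linux/kernel.h. A worked example of the new lock-range computation, assuming sectorsize == 4096, *offset == 5000 and i_size == 10000:

	#define __round_mask(x, y)	((__typeof__(x))((y) - 1))
	#define round_up(x, y)		((((x) - 1) | __round_mask(x, y)) + 1)
	#define round_down(x, y)	((x) & ~__round_mask(x, y))

	/*
	 * start     = max_t(loff_t, 0, 5000)  = 5000
	 * lockstart = round_down(5000, 4096)  = 4096
	 * lockend   = round_up(10000, 4096)   = 12288, then lockend-- = 12287
	 * so [4096, 12287] locks whole sectors around the search range.
	 */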
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2b0a627cb5f9..33848196550e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
279 int num_pages; 279 int num_pages;
280 int check_crcs = 0; 280 int check_crcs = 0;
281 281
282 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 282 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
283 PAGE_CACHE_SHIFT;
284 283
285 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) 284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
286 check_crcs = 1; 285 check_crcs = 1;
@@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
1998 return merged; 1997 return merged;
1999} 1998}
2000 1999
2000static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2001 struct btrfs_free_space *info,
2002 bool update_stat)
2003{
2004 struct btrfs_free_space *bitmap;
2005 unsigned long i;
2006 unsigned long j;
2007 const u64 end = info->offset + info->bytes;
2008 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2009 u64 bytes;
2010
2011 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2012 if (!bitmap)
2013 return false;
2014
2015 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2016 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2017 if (j == i)
2018 return false;
2019 bytes = (j - i) * ctl->unit;
2020 info->bytes += bytes;
2021
2022 if (update_stat)
2023 bitmap_clear_bits(ctl, bitmap, end, bytes);
2024 else
2025 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2026
2027 if (!bitmap->bytes)
2028 free_bitmap(ctl, bitmap);
2029
2030 return true;
2031}
2032
2033static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2034 struct btrfs_free_space *info,
2035 bool update_stat)
2036{
2037 struct btrfs_free_space *bitmap;
2038 u64 bitmap_offset;
2039 unsigned long i;
2040 unsigned long j;
2041 unsigned long prev_j;
2042 u64 bytes;
2043
2044 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2045 /* If we're on a boundary, try the previous logical bitmap. */
2046 if (bitmap_offset == info->offset) {
2047 if (info->offset == 0)
2048 return false;
2049 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2050 }
2051
2052 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2053 if (!bitmap)
2054 return false;
2055
2056 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2057 j = 0;
2058 prev_j = (unsigned long)-1;
2059 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2060 if (j > i)
2061 break;
2062 prev_j = j;
2063 }
2064 if (prev_j == i)
2065 return false;
2066
2067 if (prev_j == (unsigned long)-1)
2068 bytes = (i + 1) * ctl->unit;
2069 else
2070 bytes = (i - prev_j) * ctl->unit;
2071
2072 info->offset -= bytes;
2073 info->bytes += bytes;
2074
2075 if (update_stat)
2076 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2077 else
2078 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2079
2080 if (!bitmap->bytes)
2081 free_bitmap(ctl, bitmap);
2082
2083 return true;
2084}
2085
2086/*
2087 * We always prefer to allocate from extent entries, both for clustered and
2088 * non-clustered allocation requests. So when attempting to add a new extent
2089 * entry, try to see if there's adjacent free space in bitmap entries, and if
2090 * there is, migrate that space from the bitmaps to the extent.
2091 * This way we get better chances of satisfying space allocation requests
2092 * because we attempt to satisfy them based on a single cache entry, and never
2093 * on 2 or more entries - even if the entries represent a contiguous free space
2094 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2095 * ends).
2096 */
2097static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2098 struct btrfs_free_space *info,
2099 bool update_stat)
2100{
2101 /*
2102 * Only work with disconnected entries, as we can change their offset,
2103 * and they must be extent entries, not bitmaps.
2104 */
2105 ASSERT(!info->bitmap);
2106 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2107
2108 if (ctl->total_bitmaps > 0) {
2109 bool stole_end;
2110 bool stole_front = false;
2111
2112 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2113 if (ctl->total_bitmaps > 0)
2114 stole_front = steal_from_bitmap_to_front(ctl, info,
2115 update_stat);
2116
2117 if (stole_end || stole_front)
2118 try_merge_free_space(ctl, info, update_stat);
2119 }
2120}
2121
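A concrete trace of steal_from_bitmap_to_end() above, assuming ctl->unit == 4096 and a bitmap entry starting right at the extent's end:

	/*
	 * extent entry: offset = 0, bytes = 8192       -> end = 8192
	 * bitmap at offset 8192 with bits 0 and 1 set  -> [8192, 16384) free
	 *
	 *   i = offset_to_bit(8192, 4096, 8192) = 0
	 *   j = find_next_zero_bit(...) = 2
	 *   bytes = (2 - 0) * 4096 = 8192
	 *
	 * The extent entry grows to [0, 16384), the two bits are cleared, and
	 * a later 16K allocation is served from a single cache entry.
	 */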
2001int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, 2122int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2002 u64 offset, u64 bytes) 2123 u64 offset, u64 bytes)
2003{ 2124{
@@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2010 2131
2011 info->offset = offset; 2132 info->offset = offset;
2012 info->bytes = bytes; 2133 info->bytes = bytes;
2134 RB_CLEAR_NODE(&info->offset_index);
2013 2135
2014 spin_lock(&ctl->tree_lock); 2136 spin_lock(&ctl->tree_lock);
2015 2137
@@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2029 goto out; 2151 goto out;
2030 } 2152 }
2031link: 2153link:
2154 /*
2155 * Only steal free space from adjacent bitmaps if we're sure we're not
2156 * going to add the new free space to existing bitmap entries - because
2157 * that would mean unnecessary work that would be reverted. Therefore
2158 * attempt to steal space from bitmaps if we're adding an extent entry.
2159 */
2160 steal_from_bitmap(ctl, info, true);
2161
2032 ret = link_free_space(ctl, info); 2162 ret = link_free_space(ctl, info);
2033 if (ret) 2163 if (ret)
2034 kmem_cache_free(btrfs_free_space_cachep, info); 2164 kmem_cache_free(btrfs_free_space_cachep, info);
@@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space(
2205 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2335 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2206 node = rb_next(&entry->offset_index); 2336 node = rb_next(&entry->offset_index);
2207 rb_erase(&entry->offset_index, &cluster->root); 2337 rb_erase(&entry->offset_index, &cluster->root);
2338 RB_CLEAR_NODE(&entry->offset_index);
2208 2339
2209 bitmap = (entry->bitmap != NULL); 2340 bitmap = (entry->bitmap != NULL);
2210 if (!bitmap) 2341 if (!bitmap) {
2211 try_merge_free_space(ctl, entry, false); 2342 try_merge_free_space(ctl, entry, false);
2343 steal_from_bitmap(ctl, entry, false);
2344 }
2212 tree_insert_offset(&ctl->free_space_offset, 2345 tree_insert_offset(&ctl->free_space_offset,
2213 entry->offset, &entry->offset_index, bitmap); 2346 entry->offset, &entry->offset_index, bitmap);
2214 } 2347 }
@@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3033{ 3166{
3034 struct inode *inode = NULL; 3167 struct inode *inode = NULL;
3035 3168
3036 spin_lock(&root->cache_lock); 3169 spin_lock(&root->ino_cache_lock);
3037 if (root->cache_inode) 3170 if (root->ino_cache_inode)
3038 inode = igrab(root->cache_inode); 3171 inode = igrab(root->ino_cache_inode);
3039 spin_unlock(&root->cache_lock); 3172 spin_unlock(&root->ino_cache_lock);
3040 if (inode) 3173 if (inode)
3041 return inode; 3174 return inode;
3042 3175
@@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3044 if (IS_ERR(inode)) 3177 if (IS_ERR(inode))
3045 return inode; 3178 return inode;
3046 3179
3047 spin_lock(&root->cache_lock); 3180 spin_lock(&root->ino_cache_lock);
3048 if (!btrfs_fs_closing(root->fs_info)) 3181 if (!btrfs_fs_closing(root->fs_info))
3049 root->cache_inode = igrab(inode); 3182 root->ino_cache_inode = igrab(inode);
3050 spin_unlock(&root->cache_lock); 3183 spin_unlock(&root->ino_cache_lock);
3051 3184
3052 return inode; 3185 return inode;
3053} 3186}
@@ -3176,6 +3309,7 @@ again:
3176 map = NULL; 3309 map = NULL;
3177 add_new_bitmap(ctl, info, offset); 3310 add_new_bitmap(ctl, info, offset);
3178 bitmap_info = info; 3311 bitmap_info = info;
3312 info = NULL;
3179 } 3313 }
3180 3314
3181 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 3315 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
@@ -3186,6 +3320,8 @@ again:
3186 if (bytes) 3320 if (bytes)
3187 goto again; 3321 goto again;
3188 3322
3323 if (info)
3324 kmem_cache_free(btrfs_free_space_cachep, info);
3189 if (map) 3325 if (map)
3190 kfree(map); 3326 kfree(map);
3191 return 0; 3327 return 0;
@@ -3260,6 +3396,7 @@ have_info:
3260 goto have_info; 3396 goto have_info;
3261 } 3397 }
3262 3398
3399 ret = 0;
3263 goto out; 3400 goto out;
3264 } 3401 }
3265 3402
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 85889aa82c62..64f15bb30a81 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -20,10 +20,8 @@ static struct crypto_shash *tfm;
20int __init btrfs_hash_init(void) 20int __init btrfs_hash_init(void)
21{ 21{
22 tfm = crypto_alloc_shash("crc32c", 0, 0); 22 tfm = crypto_alloc_shash("crc32c", 0, 0);
23 if (IS_ERR(tfm))
24 return PTR_ERR(tfm);
25 23
26 return 0; 24 return PTR_ERR_OR_ZERO(tfm);
27} 25}
28 26
29void btrfs_hash_exit(void) 27void btrfs_hash_exit(void)
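PTR_ERR_OR_ZERO() is from include/linux/err.h and simply folds the removed IS_ERR() branch into the return value:

	static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
	{
		if (IS_ERR(ptr))
			return PTR_ERR(ptr);
		else
			return 0;
	}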
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 2be38df703c9..8ffa4783cbf4 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
135 u32 item_size; 135 u32 item_size;
136 136
137 key.objectid = inode_objectid; 137 key.objectid = inode_objectid;
138 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); 138 key.type = BTRFS_INODE_EXTREF_KEY;
139 key.offset = btrfs_extref_hash(ref_objectid, name, name_len); 139 key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
140 140
141 path = btrfs_alloc_path(); 141 path = btrfs_alloc_path();
@@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
209 209
210 key.objectid = inode_objectid; 210 key.objectid = inode_objectid;
211 key.offset = ref_objectid; 211 key.offset = ref_objectid;
212 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); 212 key.type = BTRFS_INODE_REF_KEY;
213 213
214 path = btrfs_alloc_path(); 214 path = btrfs_alloc_path();
215 if (!path) 215 if (!path)
@@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
337 337
338 key.objectid = inode_objectid; 338 key.objectid = inode_objectid;
339 key.offset = ref_objectid; 339 key.offset = ref_objectid;
340 btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); 340 key.type = BTRFS_INODE_REF_KEY;
341 341
342 path = btrfs_alloc_path(); 342 path = btrfs_alloc_path();
343 if (!path) 343 if (!path)
@@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
400 struct btrfs_key key; 400 struct btrfs_key key;
401 int ret; 401 int ret;
402 key.objectid = objectid; 402 key.objectid = objectid;
403 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 403 key.type = BTRFS_INODE_ITEM_KEY;
404 key.offset = 0; 404 key.offset = 0;
405 405
406 ret = btrfs_insert_empty_item(trans, root, path, &key, 406 ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
420 struct btrfs_key found_key; 420 struct btrfs_key found_key;
421 421
422 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); 422 ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
423 if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && 423 if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
424 location->offset == (u64)-1 && path->slots[0] != 0) { 424 location->offset == (u64)-1 && path->slots[0] != 0) {
425 slot = path->slots[0] - 1; 425 slot = path->slots[0] - 1;
426 leaf = path->nodes[0]; 426 leaf = path->nodes[0];
427 btrfs_item_key_to_cpu(leaf, &found_key, slot); 427 btrfs_item_key_to_cpu(leaf, &found_key, slot);
428 if (found_key.objectid == location->objectid && 428 if (found_key.objectid == location->objectid &&
429 btrfs_key_type(&found_key) == btrfs_key_type(location)) { 429 found_key.type == location->type) {
430 path->slots[0]--; 430 path->slots[0]--;
431 return 0; 431 return 0;
432 } 432 }
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 888fbe19079f..83d646bd2e4b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -87,7 +87,7 @@ again:
87 */ 87 */
88 btrfs_item_key_to_cpu(leaf, &key, 0); 88 btrfs_item_key_to_cpu(leaf, &key, 0);
89 btrfs_release_path(path); 89 btrfs_release_path(path);
90 root->cache_progress = last; 90 root->ino_cache_progress = last;
91 up_read(&fs_info->commit_root_sem); 91 up_read(&fs_info->commit_root_sem);
92 schedule_timeout(1); 92 schedule_timeout(1);
93 goto again; 93 goto again;
@@ -106,7 +106,7 @@ again:
106 if (last != (u64)-1 && last + 1 != key.objectid) { 106 if (last != (u64)-1 && last + 1 != key.objectid) {
107 __btrfs_add_free_space(ctl, last + 1, 107 __btrfs_add_free_space(ctl, last + 1,
108 key.objectid - last - 1); 108 key.objectid - last - 1);
109 wake_up(&root->cache_wait); 109 wake_up(&root->ino_cache_wait);
110 } 110 }
111 111
112 last = key.objectid; 112 last = key.objectid;
@@ -119,14 +119,14 @@ next:
119 root->highest_objectid - last - 1); 119 root->highest_objectid - last - 1);
120 } 120 }
121 121
122 spin_lock(&root->cache_lock); 122 spin_lock(&root->ino_cache_lock);
123 root->cached = BTRFS_CACHE_FINISHED; 123 root->ino_cache_state = BTRFS_CACHE_FINISHED;
124 spin_unlock(&root->cache_lock); 124 spin_unlock(&root->ino_cache_lock);
125 125
126 root->cache_progress = (u64)-1; 126 root->ino_cache_progress = (u64)-1;
127 btrfs_unpin_free_ino(root); 127 btrfs_unpin_free_ino(root);
128out: 128out:
129 wake_up(&root->cache_wait); 129 wake_up(&root->ino_cache_wait);
130 up_read(&fs_info->commit_root_sem); 130 up_read(&fs_info->commit_root_sem);
131 131
132 btrfs_free_path(path); 132 btrfs_free_path(path);
@@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root)
144 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 144 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
145 return; 145 return;
146 146
147 spin_lock(&root->cache_lock); 147 spin_lock(&root->ino_cache_lock);
148 if (root->cached != BTRFS_CACHE_NO) { 148 if (root->ino_cache_state != BTRFS_CACHE_NO) {
149 spin_unlock(&root->cache_lock); 149 spin_unlock(&root->ino_cache_lock);
150 return; 150 return;
151 } 151 }
152 152
153 root->cached = BTRFS_CACHE_STARTED; 153 root->ino_cache_state = BTRFS_CACHE_STARTED;
154 spin_unlock(&root->cache_lock); 154 spin_unlock(&root->ino_cache_lock);
155 155
156 ret = load_free_ino_cache(root->fs_info, root); 156 ret = load_free_ino_cache(root->fs_info, root);
157 if (ret == 1) { 157 if (ret == 1) {
158 spin_lock(&root->cache_lock); 158 spin_lock(&root->ino_cache_lock);
159 root->cached = BTRFS_CACHE_FINISHED; 159 root->ino_cache_state = BTRFS_CACHE_FINISHED;
160 spin_unlock(&root->cache_lock); 160 spin_unlock(&root->ino_cache_lock);
161 return; 161 return;
162 } 162 }
163 163
@@ -196,11 +196,11 @@ again:
196 196
197 start_caching(root); 197 start_caching(root);
198 198
199 wait_event(root->cache_wait, 199 wait_event(root->ino_cache_wait,
200 root->cached == BTRFS_CACHE_FINISHED || 200 root->ino_cache_state == BTRFS_CACHE_FINISHED ||
201 root->free_ino_ctl->free_space > 0); 201 root->free_ino_ctl->free_space > 0);
202 202
203 if (root->cached == BTRFS_CACHE_FINISHED && 203 if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
204 root->free_ino_ctl->free_space == 0) 204 root->free_ino_ctl->free_space == 0)
205 return -ENOSPC; 205 return -ENOSPC;
206 else 206 else
@@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
214 if (!btrfs_test_opt(root, INODE_MAP_CACHE)) 214 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
215 return; 215 return;
216again: 216again:
217 if (root->cached == BTRFS_CACHE_FINISHED) { 217 if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
218 __btrfs_add_free_space(pinned, objectid, 1); 218 __btrfs_add_free_space(pinned, objectid, 1);
219 } else { 219 } else {
220 down_write(&root->fs_info->commit_root_sem); 220 down_write(&root->fs_info->commit_root_sem);
221 spin_lock(&root->cache_lock); 221 spin_lock(&root->ino_cache_lock);
222 if (root->cached == BTRFS_CACHE_FINISHED) { 222 if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
223 spin_unlock(&root->cache_lock); 223 spin_unlock(&root->ino_cache_lock);
224 up_write(&root->fs_info->commit_root_sem); 224 up_write(&root->fs_info->commit_root_sem);
225 goto again; 225 goto again;
226 } 226 }
227 spin_unlock(&root->cache_lock); 227 spin_unlock(&root->ino_cache_lock);
228 228
229 start_caching(root); 229 start_caching(root);
230 230
@@ -235,10 +235,10 @@ again:
235} 235}
236 236
237/* 237/*
238 * When a transaction is committed, we'll move those inode numbers which 238 * When a transaction is committed, we'll move those inode numbers which are
239 * are smaller than root->cache_progress from pinned tree to free_ino tree, 239 * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
240 * and others will just be dropped, because the commit root we were 240 * others will just be dropped, because the commit root we were searching has
241 * searching has changed. 241 * changed.
242 * 242 *
243 * Must be called with root->fs_info->commit_root_sem held 243 * Must be called with root->fs_info->commit_root_sem held
244 */ 244 */
@@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
261 info = rb_entry(n, struct btrfs_free_space, offset_index); 261 info = rb_entry(n, struct btrfs_free_space, offset_index);
262 BUG_ON(info->bitmap); /* Logic error */ 262 BUG_ON(info->bitmap); /* Logic error */
263 263
264 if (info->offset > root->cache_progress) 264 if (info->offset > root->ino_cache_progress)
265 goto free; 265 goto free;
266 else if (info->offset + info->bytes > root->cache_progress) 266 else if (info->offset + info->bytes > root->ino_cache_progress)
267 count = root->cache_progress - info->offset + 1; 267 count = root->ino_cache_progress - info->offset + 1;
268 else 268 else
269 count = info->bytes; 269 count = info->bytes;
270 270
@@ -462,13 +462,13 @@ again:
462 } 462 }
463 } 463 }
464 464
465 spin_lock(&root->cache_lock); 465 spin_lock(&root->ino_cache_lock);
466 if (root->cached != BTRFS_CACHE_FINISHED) { 466 if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
467 ret = -1; 467 ret = -1;
468 spin_unlock(&root->cache_lock); 468 spin_unlock(&root->ino_cache_lock);
469 goto out_put; 469 goto out_put;
470 } 470 }
471 spin_unlock(&root->cache_lock); 471 spin_unlock(&root->ino_cache_lock);
472 472
473 spin_lock(&ctl->tree_lock); 473 spin_lock(&ctl->tree_lock);
474 prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; 474 prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 03708ef3deef..fc9c0439caa3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
153 153
154 key.objectid = btrfs_ino(inode); 154 key.objectid = btrfs_ino(inode);
155 key.offset = start; 155 key.offset = start;
156 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 156 key.type = BTRFS_EXTENT_DATA_KEY;
157 157
158 datasize = btrfs_file_extent_calc_inline_size(cur_size); 158 datasize = btrfs_file_extent_calc_inline_size(cur_size);
159 path->leave_spinning = 1; 159 path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
249 data_len = compressed_size; 249 data_len = compressed_size;
250 250
251 if (start > 0 || 251 if (start > 0 ||
252 actual_end >= PAGE_CACHE_SIZE || 252 actual_end > PAGE_CACHE_SIZE ||
253 data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || 253 data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
254 (!compressed_size && 254 (!compressed_size &&
255 (actual_end & (root->sectorsize - 1)) == 0) || 255 (actual_end & (root->sectorsize - 1)) == 0) ||
256 end + 1 < isize || 256 end + 1 < isize ||
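The >= to > changes above are off-by-one fixes: data of exactly PAGE_CACHE_SIZE, or exactly BTRFS_MAX_INLINE_DATA_SIZE(root), was previously refused an inline extent. With 4K pages:

	/*
	 * actual_end == 4096:
	 *   old: actual_end >= PAGE_CACHE_SIZE -> true  -> no inline extent
	 *   new: actual_end >  PAGE_CACHE_SIZE -> false -> inlining allowed
	 * (the same applies to data_len vs BTRFS_MAX_INLINE_DATA_SIZE(root))
	 */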
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
348 return 0; 348 return 0;
349} 349}
350 350
351static inline int inode_need_compress(struct inode *inode)
352{
353 struct btrfs_root *root = BTRFS_I(inode)->root;
354
355 /* force compress */
356 if (btrfs_test_opt(root, FORCE_COMPRESS))
357 return 1;
358 /* bad compression ratios */
359 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
360 return 0;
361 if (btrfs_test_opt(root, COMPRESS) ||
362 BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
363 BTRFS_I(inode)->force_compress)
364 return 1;
365 return 0;
366}
367
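Reading inode_need_compress() top to bottom gives the precedence of the compression knobs (force_compress is set by defragment when compression is requested):

	/*
	 * 1. compress-force mount option  -> compress, even over NOCOMPRESS
	 * 2. BTRFS_INODE_NOCOMPRESS       -> skip, bad ratios seen earlier
	 * 3. compress mount option, per-inode BTRFS_INODE_COMPRESS,
	 *    or force_compress            -> compress
	 * 4. otherwise                    -> don't
	 */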
351/* 368/*
352 * we create compressed extents in two phases. The first 369 * we create compressed extents in two phases. The first
353 * phase compresses a range of pages that have already been 370 * phase compresses a range of pages that have already been
@@ -444,10 +461,7 @@ again:
444 * inode has not been flagged as nocompress. This flag can 461 * inode has not been flagged as nocompress. This flag can
445 * change at any time if we discover bad compression ratios. 462 * change at any time if we discover bad compression ratios.
446 */ 463 */
447 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 464 if (inode_need_compress(inode)) {
448 (btrfs_test_opt(root, COMPRESS) ||
449 (BTRFS_I(inode)->force_compress) ||
450 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
451 WARN_ON(pages); 465 WARN_ON(pages);
452 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 466 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
453 if (!pages) { 467 if (!pages) {
@@ -778,8 +792,12 @@ retry:
778 ins.offset, 792 ins.offset,
779 BTRFS_ORDERED_COMPRESSED, 793 BTRFS_ORDERED_COMPRESSED,
780 async_extent->compress_type); 794 async_extent->compress_type);
781 if (ret) 795 if (ret) {
796 btrfs_drop_extent_cache(inode, async_extent->start,
797 async_extent->start +
798 async_extent->ram_size - 1, 0);
782 goto out_free_reserve; 799 goto out_free_reserve;
800 }
783 801
784 /* 802 /*
785 * clear dirty, set writeback and unlock the pages. 803 * clear dirty, set writeback and unlock the pages.
@@ -971,14 +989,14 @@ static noinline int cow_file_range(struct inode *inode,
971 ret = btrfs_add_ordered_extent(inode, start, ins.objectid, 989 ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
972 ram_size, cur_alloc_size, 0); 990 ram_size, cur_alloc_size, 0);
973 if (ret) 991 if (ret)
974 goto out_reserve; 992 goto out_drop_extent_cache;
975 993
976 if (root->root_key.objectid == 994 if (root->root_key.objectid ==
977 BTRFS_DATA_RELOC_TREE_OBJECTID) { 995 BTRFS_DATA_RELOC_TREE_OBJECTID) {
978 ret = btrfs_reloc_clone_csums(inode, start, 996 ret = btrfs_reloc_clone_csums(inode, start,
979 cur_alloc_size); 997 cur_alloc_size);
980 if (ret) 998 if (ret)
981 goto out_reserve; 999 goto out_drop_extent_cache;
982 } 1000 }
983 1001
984 if (disk_num_bytes < cur_alloc_size) 1002 if (disk_num_bytes < cur_alloc_size)
@@ -1006,6 +1024,8 @@ static noinline int cow_file_range(struct inode *inode,
1006out: 1024out:
1007 return ret; 1025 return ret;
1008 1026
1027out_drop_extent_cache:
1028 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1009out_reserve: 1029out_reserve:
1010 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 1030 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1011out_unlock: 1031out_unlock:
@@ -1088,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1088 async_cow->locked_page = locked_page; 1108 async_cow->locked_page = locked_page;
1089 async_cow->start = start; 1109 async_cow->start = start;
1090 1110
1091 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) 1111 if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1112 !btrfs_test_opt(root, FORCE_COMPRESS))
1092 cur_end = end; 1113 cur_end = end;
1093 else 1114 else
1094 cur_end = min(end, start + 512 * 1024 - 1); 1115 cur_end = min(end, start + 512 * 1024 - 1);
@@ -1096,8 +1117,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1096 async_cow->end = cur_end; 1117 async_cow->end = cur_end;
1097 INIT_LIST_HEAD(&async_cow->extents); 1118 INIT_LIST_HEAD(&async_cow->extents);
1098 1119
1099 btrfs_init_work(&async_cow->work, async_cow_start, 1120 btrfs_init_work(&async_cow->work,
1100 async_cow_submit, async_cow_free); 1121 btrfs_delalloc_helper,
1122 async_cow_start, async_cow_submit,
1123 async_cow_free);
1101 1124
1102 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1125 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1103 PAGE_CACHE_SHIFT; 1126 PAGE_CACHE_SHIFT;
@@ -1437,6 +1460,26 @@ error:
1437 return ret; 1460 return ret;
1438} 1461}
1439 1462
1463static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1464{
1465
1466 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1467 !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1468 return 0;
1469
1470 /*
1471 * @defrag_bytes is a hint value, no spinlock held here,
1472 * if is not zero, it means the file is defragging.
1473 * Force cow if given extent needs to be defragged.
1474 */
1475 if (BTRFS_I(inode)->defrag_bytes &&
1476 test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1477 EXTENT_DEFRAG, 0, NULL))
1478 return 1;
1479
1480 return 0;
1481}
1482
1440/* 1483/*
1441 * extent_io.c call back to do delayed allocation processing 1484 * extent_io.c call back to do delayed allocation processing
1442 */ 1485 */
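
Note: need_force_cow() is the new gate that lets defragmentation work on nodatacow and preallocated files. Those files normally skip COW, but an extent flagged EXTENT_DEFRAG must be rewritten. defrag_bytes is read without a lock, so it only serves as a cheap filter before the authoritative test_range_bit() check. A toy userspace model of that two-stage test (all names and the range check below are illustrative stand-ins, not kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_inode {
        unsigned int flags;
        uint64_t defrag_bytes;               /* lockless hint, like BTRFS_I(inode)->defrag_bytes */
        uint64_t defrag_start, defrag_end;   /* stand-in for the EXTENT_DEFRAG bits */
};

#define TOY_NODATACOW 0x1
#define TOY_PREALLOC  0x2

static bool range_marked_defrag(const struct toy_inode *inode, uint64_t start, uint64_t end)
{
        return start <= inode->defrag_end && end >= inode->defrag_start;
}

static int toy_need_force_cow(const struct toy_inode *inode, uint64_t start, uint64_t end)
{
        /* only nodatacow/prealloc writes can skip COW at all */
        if (!(inode->flags & (TOY_NODATACOW | TOY_PREALLOC)))
                return 0;
        /* cheap counter first, then the authoritative per-range check */
        if (inode->defrag_bytes && range_marked_defrag(inode, start, end))
                return 1;
        return 0;
}

int main(void)
{
        struct toy_inode inode = {
                .flags = TOY_NODATACOW,
                .defrag_bytes = 8192,
                .defrag_start = 0, .defrag_end = 8191,
        };
        printf("overlapping write forces cow: %d\n",
               toy_need_force_cow(&inode, 4096, 12287));
        printf("disjoint write keeps nocow:  %d\n",
               toy_need_force_cow(&inode, 65536, 69631));
        return 0;
}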
@@ -1445,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                              unsigned long *nr_written)
 {
         int ret;
-        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int force_cow = need_force_cow(inode, start, end);
 
-        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 1, nr_written);
-        } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+        } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
                 ret = run_delalloc_nocow(inode, locked_page, start, end,
                                          page_started, 0, nr_written);
-        } else if (!btrfs_test_opt(root, COMPRESS) &&
-                   !(BTRFS_I(inode)->force_compress) &&
-                   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+        } else if (!inode_need_compress(inode)) {
                 ret = cow_file_range(inode, locked_page, start, end,
                                      page_started, nr_written, 1);
         } else {
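
Note: with the helper above, run_delalloc_range() becomes a four-way dispatch, and the order matters: forced COW overrides both nocow branches, and compression is only consulted last. A compact restatement of just that ordering, with invented names:

#include <stdio.h>

enum delalloc_path { NOCOW_ALWAYS, NOCOW_PREALLOC, PLAIN_COW, ASYNC_COMPRESS };

/* restatement of the dispatch order in run_delalloc_range() after this
 * hunk; names are made up, only the ordering mirrors the kernel code */
static enum delalloc_path pick_path(int nodatacow, int prealloc,
                                    int force_cow, int need_compress)
{
        if (nodatacow && !force_cow)
                return NOCOW_ALWAYS;
        if (prealloc && !force_cow)
                return NOCOW_PREALLOC;
        if (!need_compress)
                return PLAIN_COW;
        return ASYNC_COMPRESS;
}

int main(void)
{
        /* a nodatacow inode under defrag is forced through the COW path */
        printf("%d\n", pick_path(1, 0, 1, 0) == PLAIN_COW);
        return 0;
}

The design point is that need_force_cow() is computed once up front, so both nocow branches see the same answer for the whole range.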
@@ -1547,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1547 struct extent_state *state, unsigned long *bits) 1588 struct extent_state *state, unsigned long *bits)
1548{ 1589{
1549 1590
1591 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1592 WARN_ON(1);
1550 /* 1593 /*
1551 * set_bit and clear bit hooks normally require _irqsave/restore 1594 * set_bit and clear bit hooks normally require _irqsave/restore
1552 * but in this case, we are only testing for the DELALLOC 1595 * but in this case, we are only testing for the DELALLOC
@@ -1569,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
1569 root->fs_info->delalloc_batch); 1612 root->fs_info->delalloc_batch);
1570 spin_lock(&BTRFS_I(inode)->lock); 1613 spin_lock(&BTRFS_I(inode)->lock);
1571 BTRFS_I(inode)->delalloc_bytes += len; 1614 BTRFS_I(inode)->delalloc_bytes += len;
1615 if (*bits & EXTENT_DEFRAG)
1616 BTRFS_I(inode)->defrag_bytes += len;
1572 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, 1617 if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1573 &BTRFS_I(inode)->runtime_flags)) 1618 &BTRFS_I(inode)->runtime_flags))
1574 btrfs_add_delalloc_inodes(root, inode); 1619 btrfs_add_delalloc_inodes(root, inode);
@@ -1583,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1583 struct extent_state *state, 1628 struct extent_state *state,
1584 unsigned long *bits) 1629 unsigned long *bits)
1585{ 1630{
1631 u64 len = state->end + 1 - state->start;
1632
1633 spin_lock(&BTRFS_I(inode)->lock);
1634 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1635 BTRFS_I(inode)->defrag_bytes -= len;
1636 spin_unlock(&BTRFS_I(inode)->lock);
1637
1586 /* 1638 /*
1587 * set_bit and clear bit hooks normally require _irqsave/restore 1639 * set_bit and clear bit hooks normally require _irqsave/restore
1588 * but in this case, we are only testing for the DELALLOC 1640 * but in this case, we are only testing for the DELALLOC
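
Note: the set/clear bit hooks now keep BTRFS_I(inode)->defrag_bytes balanced: the set hook adds the state length when EXTENT_DEFRAG arrives with DELALLOC, and the clear hook subtracts it only when the state being cleared actually carried the bit. The WARN_ON(BTRFS_I(inode)->defrag_bytes) added to btrfs_destroy_inode() later in this diff depends on that symmetry. A minimal model of the invariant:

#include <assert.h>
#include <stdint.h>

/* toy model of the defrag_bytes balance kept by the set/clear bit hooks */
int main(void)
{
        uint64_t defrag_bytes = 0;
        uint64_t start = 4096, end = 12287;       /* an 8 KiB extent state */
        uint64_t len = end + 1 - start;           /* same formula as the hunk */

        defrag_bytes += len;                      /* set hook: EXTENT_DEFRAG set */
        defrag_bytes -= len;                      /* clear hook: bit actually cleared */
        assert(defrag_bytes == 0);                /* invariant checked at inode destroy */
        return 0;
}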
@@ -1590,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1590 */ 1642 */
1591 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1643 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1592 struct btrfs_root *root = BTRFS_I(inode)->root; 1644 struct btrfs_root *root = BTRFS_I(inode)->root;
1593 u64 len = state->end + 1 - state->start;
1594 bool do_list = !btrfs_is_free_space_inode(inode); 1645 bool do_list = !btrfs_is_free_space_inode(inode);
1595 1646
1596 if (*bits & EXTENT_FIRST_DELALLOC) { 1647 if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -1881,7 +1932,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 
         SetPageChecked(page);
         page_cache_get(page);
-        btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+        btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+                        btrfs_writepage_fixup_worker, NULL, NULL);
         fixup->page = page;
         btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
         return -EBUSY;
@@ -2651,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2651 goto out; 2703 goto out;
2652 } 2704 }
2653 2705
2706 btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
2707 ordered_extent->file_offset +
2708 ordered_extent->len - 1);
2709
2654 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { 2710 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2655 truncated = true; 2711 truncated = true;
2656 logical_len = ordered_extent->truncated_len; 2712 logical_len = ordered_extent->truncated_len;
@@ -2822,7 +2878,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
         struct inode *inode = page->mapping->host;
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_ordered_extent *ordered_extent = NULL;
-        struct btrfs_workqueue *workers;
+        struct btrfs_workqueue *wq;
+        btrfs_work_func_t func;
 
         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
 
@@ -2831,17 +2888,55 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                             end - start + 1, uptodate))
                 return 0;
 
-        btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
+        if (btrfs_is_free_space_inode(inode)) {
+                wq = root->fs_info->endio_freespace_worker;
+                func = btrfs_freespace_write_helper;
+        } else {
+                wq = root->fs_info->endio_write_workers;
+                func = btrfs_endio_write_helper;
+        }
 
-        if (btrfs_is_free_space_inode(inode))
-                workers = root->fs_info->endio_freespace_worker;
-        else
-                workers = root->fs_info->endio_write_workers;
-        btrfs_queue_work(workers, &ordered_extent->work);
+        btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+                        NULL);
+        btrfs_queue_work(wq, &ordered_extent->work);
 
         return 0;
 }
 
+static int __readpage_endio_check(struct inode *inode,
+                                  struct btrfs_io_bio *io_bio,
+                                  int icsum, struct page *page,
+                                  int pgoff, u64 start, size_t len)
+{
+        char *kaddr;
+        u32 csum_expected;
+        u32 csum = ~(u32)0;
+        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+                                      DEFAULT_RATELIMIT_BURST);
+
+        csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+        kaddr = kmap_atomic(page);
+        csum = btrfs_csum_data(kaddr + pgoff, csum, len);
+        btrfs_csum_final(csum, (char *)&csum);
+        if (csum != csum_expected)
+                goto zeroit;
+
+        kunmap_atomic(kaddr);
+        return 0;
+zeroit:
+        if (__ratelimit(&_rs))
+                btrfs_info(BTRFS_I(inode)->root->fs_info,
+                           "csum failed ino %llu off %llu csum %u expected csum %u",
+                           btrfs_ino(inode), start, csum, csum_expected);
+        memset(kaddr + pgoff, 1, len);
+        flush_dcache_page(page);
+        kunmap_atomic(kaddr);
+        if (csum_expected == 0)
+                return 0;
+        return -EIO;
+}
+
 /*
  * when reads are done, we need to check csums to verify the data is correct
  * if there's a match, we allow the bio to finish. If not, the code in
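
Note: __readpage_endio_check() factors the per-block verification out of btrfs_readpage_end_io_hook() so the new direct-IO retry code later in this diff can reuse it: compute the checksum over the page range, compare against the stored value, and poison the buffer (memset to 1) on mismatch so stale data is never returned silently. A rough userspace analogue; zlib's plain CRC32 stands in here for the btrfs_csum_data()/btrfs_csum_final() pair, which is an illustrative substitution (btrfs actually uses CRC32C):

#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <zlib.h>

/* verify one block against its expected checksum; poison it on mismatch,
 * mirroring the memset(kaddr + pgoff, 1, len) in __readpage_endio_check() */
static int check_block(unsigned char *buf, size_t len, uint32_t expected)
{
        uint32_t csum = crc32(0L, buf, len);
        if (csum == expected)
                return 0;
        fprintf(stderr, "csum failed: got %u expected %u\n", csum, expected);
        memset(buf, 1, len);
        return -1;
}

int main(void)
{
        unsigned char block[4096] = "hello";
        uint32_t good = crc32(0L, block, sizeof(block));

        printf("intact: %d\n", check_block(block, sizeof(block), good));
        block[0] ^= 0xff;                        /* simulate corruption */
        printf("corrupt: %d\n", check_block(block, sizeof(block), good));
        return 0;
}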
@@ -2854,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         size_t offset = start - page_offset(page);
         struct inode *inode = page->mapping->host;
         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-        char *kaddr;
         struct btrfs_root *root = BTRFS_I(inode)->root;
-        u32 csum_expected;
-        u32 csum = ~(u32)0;
-        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                      DEFAULT_RATELIMIT_BURST);
 
         if (PageChecked(page)) {
                 ClearPageChecked(page);
-                goto good;
+                return 0;
         }
 
         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-                goto good;
+                return 0;
 
         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
             test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2877,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
         }
 
         phy_offset >>= inode->i_sb->s_blocksize_bits;
-        csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
-        kaddr = kmap_atomic(page);
-        csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
-        btrfs_csum_final(csum, (char *)&csum);
-        if (csum != csum_expected)
-                goto zeroit;
-
-        kunmap_atomic(kaddr);
-good:
-        return 0;
-
-zeroit:
-        if (__ratelimit(&_rs))
-                btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                           btrfs_ino(page->mapping->host), start, csum, csum_expected);
-        memset(kaddr + offset, 1, end - start + 1);
-        flush_dcache_page(page);
-        kunmap_atomic(kaddr);
-        if (csum_expected == 0)
-                return 0;
-        return -EIO;
+        return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+                                      start, (size_t)(end - start + 1));
 }
 
 struct delayed_iput {
@@ -3145,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
         path->reada = -1;
 
         key.objectid = BTRFS_ORPHAN_OBJECTID;
-        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+        key.type = BTRFS_ORPHAN_ITEM_KEY;
         key.offset = (u64)-1;
 
         while (1) {
@@ -3172,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 /* make sure the item matches what we want */
                 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
                         break;
-                if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+                if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
                         break;
 
                 /* release the path since we're done with it */
@@ -3648,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
          * without delay
          */
         if (!btrfs_is_free_space_inode(inode)
-            && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+            && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+            && !root->fs_info->log_root_recovering) {
                 btrfs_update_root_times(trans, root);
 
                 ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4071,7 +4142,7 @@ search_again:
                 fi = NULL;
                 leaf = path->nodes[0];
                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                found_type = btrfs_key_type(&found_key);
+                found_type = found_key.type;
 
                 if (found_key.objectid != ino)
                         break;
@@ -4234,7 +4305,8 @@ out:
                 btrfs_abort_transaction(trans, root, ret);
         }
 error:
-        if (last_size != (u64)-1)
+        if (last_size != (u64)-1 &&
+            root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                 btrfs_ordered_update_i_size(inode, last_size, NULL);
         btrfs_free_path(path);
         return err;
@@ -4674,6 +4746,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
4674 clear_bit(EXTENT_FLAG_LOGGING, &em->flags); 4746 clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
4675 remove_extent_mapping(map_tree, em); 4747 remove_extent_mapping(map_tree, em);
4676 free_extent_map(em); 4748 free_extent_map(em);
4749 if (need_resched()) {
4750 write_unlock(&map_tree->lock);
4751 cond_resched();
4752 write_lock(&map_tree->lock);
4753 }
4677 } 4754 }
4678 write_unlock(&map_tree->lock); 4755 write_unlock(&map_tree->lock);
4679 4756
@@ -4696,6 +4773,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
4696 &cached_state, GFP_NOFS); 4773 &cached_state, GFP_NOFS);
4697 free_extent_state(state); 4774 free_extent_state(state);
4698 4775
4776 cond_resched();
4699 spin_lock(&io_tree->lock); 4777 spin_lock(&io_tree->lock);
4700 } 4778 }
4701 spin_unlock(&io_tree->lock); 4779 spin_unlock(&io_tree->lock);
@@ -4726,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode)
4726 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ 4804 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4727 btrfs_wait_ordered_range(inode, 0, (u64)-1); 4805 btrfs_wait_ordered_range(inode, 0, (u64)-1);
4728 4806
4807 btrfs_free_io_failure_record(inode, 0, (u64)-1);
4808
4729 if (root->fs_info->log_root_recovering) { 4809 if (root->fs_info->log_root_recovering) {
4730 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 4810 BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4731 &BTRFS_I(inode)->runtime_flags)); 4811 &BTRFS_I(inode)->runtime_flags));
@@ -5181,6 +5261,42 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5181 iput(inode); 5261 iput(inode);
5182 inode = ERR_PTR(ret); 5262 inode = ERR_PTR(ret);
5183 } 5263 }
5264 /*
5265 * If orphan cleanup did remove any orphans, it means the tree
5266 * was modified and therefore the commit root is not the same as
5267 * the current root anymore. This is a problem, because send
5268 * uses the commit root and therefore can see inode items that
5269 * don't exist in the current root anymore, and for example make
5270 * calls to btrfs_iget, which will do tree lookups based on the
5271 * current root and not on the commit root. Those lookups will
5272 * fail, returning a -ESTALE error, and making send fail with
5273 * that error. So make sure a send does not see any orphans we
5274 * have just removed, and that it will see the same inodes
5275 * regardless of whether a transaction commit happened before
5276 * it started (meaning that the commit root will be the same as
5277 * the current root) or not.
5278 */
5279 if (sub_root->node != sub_root->commit_root) {
5280 u64 sub_flags = btrfs_root_flags(&sub_root->root_item);
5281
5282 if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) {
5283 struct extent_buffer *eb;
5284
5285 /*
5286 * Assert we can't have races between dentry
5287 * lookup called through the snapshot creation
5288 * ioctl and the VFS.
5289 */
5290 ASSERT(mutex_is_locked(&dir->i_mutex));
5291
5292 down_write(&root->fs_info->commit_root_sem);
5293 eb = sub_root->commit_root;
5294 sub_root->commit_root =
5295 btrfs_root_node(sub_root);
5296 up_write(&root->fs_info->commit_root_sem);
5297 free_extent_buffer(eb);
5298 }
5299 }
5184 } 5300 }
5185 5301
5186 return inode; 5302 return inode;
@@ -5274,7 +5390,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
                 btrfs_get_delayed_items(inode, &ins_list, &del_list);
         }
 
-        btrfs_set_key_type(&key, key_type);
+        key.type = key_type;
         key.offset = ctx->pos;
         key.objectid = btrfs_ino(inode);
 
@@ -5299,7 +5415,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 
                 if (found_key.objectid != key.objectid)
                         break;
-                if (btrfs_key_type(&found_key) != key_type)
+                if (found_key.type != key_type)
                         break;
                 if (found_key.offset < ctx->pos)
                         goto next;
@@ -5511,7 +5627,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
         int ret;
 
         key.objectid = btrfs_ino(inode);
-        btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+        key.type = BTRFS_DIR_INDEX_KEY;
         key.offset = (u64)-1;
 
         path = btrfs_alloc_path();
@@ -5543,7 +5659,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
         btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
         if (found_key.objectid != btrfs_ino(inode) ||
-            btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+            found_key.type != BTRFS_DIR_INDEX_KEY) {
                 BTRFS_I(inode)->index_cnt = 2;
                 goto out;
         }
@@ -5577,6 +5693,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
5577 return ret; 5693 return ret;
5578} 5694}
5579 5695
5696static int btrfs_insert_inode_locked(struct inode *inode)
5697{
5698 struct btrfs_iget_args args;
5699 args.location = &BTRFS_I(inode)->location;
5700 args.root = BTRFS_I(inode)->root;
5701
5702 return insert_inode_locked4(inode,
5703 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
5704 btrfs_find_actor, &args);
5705}
5706
5580static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, 5707static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5581 struct btrfs_root *root, 5708 struct btrfs_root *root,
5582 struct inode *dir, 5709 struct inode *dir,
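
Note: btrfs_insert_inode_locked() wraps insert_inode_locked4() with the same hash (inode number plus root) and comparison callback that btrfs lookups use, so a new inode becomes visible in the inode hash, locked and marked I_NEW, before any of its items exist on disk; concurrent lookups then wait in iget until unlock_new_inode(). A heavily stubbed sketch of the ordering the reworked creation paths follow (the stub bodies below are placeholders, not real VFS behaviour):

#include <stdio.h>

/* stubs standing in for the kernel pieces (illustrative only) */
static int toy_insert_inode_locked(void *inode) { return 0; }   /* hash + lock, I_NEW */
static void toy_unlock_new_inode(void *inode) { puts("I_NEW cleared, waiters woken"); }
static void toy_d_instantiate(void *dentry, void *inode) { puts("dentry wired up"); }

int main(void)
{
        int dummy_inode, dummy_dentry;

        /* order used by the reworked btrfs_new_inode()/creation paths:
         * publish the locked inode first, fill it in, then unlock */
        if (toy_insert_inode_locked(&dummy_inode) < 0)
                return 1;                /* the new fail_unlock path unlocks here */
        /* ... insert inode item, init security, add the directory entry ... */
        toy_unlock_new_inode(&dummy_inode);
        toy_d_instantiate(&dummy_dentry, &dummy_inode);
        return 0;
}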
@@ -5606,6 +5733,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5606 } 5733 }
5607 5734
5608 /* 5735 /*
5736 * O_TMPFILE, set link count to 0, so that after this point,
5737 * we fill in an inode item with the correct link count.
5738 */
5739 if (!name)
5740 set_nlink(inode, 0);
5741
5742 /*
5609 * we have to initialize this early, so we can reclaim the inode 5743 * we have to initialize this early, so we can reclaim the inode
5610 * number if we fail afterwards in this function. 5744 * number if we fail afterwards in this function.
5611 */ 5745 */
@@ -5643,7 +5777,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 
         key[0].objectid = objectid;
-        btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+        key[0].type = BTRFS_INODE_ITEM_KEY;
         key[0].offset = 0;
 
         sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5656,16 +5790,25 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
          * add more hard links than can fit in the ref item.
          */
         key[1].objectid = objectid;
-        btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+        key[1].type = BTRFS_INODE_REF_KEY;
         key[1].offset = ref_objectid;
 
         sizes[1] = name_len + sizeof(*ref);
         }
 
+        location = &BTRFS_I(inode)->location;
+        location->objectid = objectid;
+        location->offset = 0;
+        location->type = BTRFS_INODE_ITEM_KEY;
+
+        ret = btrfs_insert_inode_locked(inode);
+        if (ret < 0)
+                goto fail;
+
         path->leave_spinning = 1;
         ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
         if (ret != 0)
-                goto fail;
+                goto fail_unlock;
 
         inode_init_owner(inode, dir, mode);
         inode_set_bytes(inode, 0);
@@ -5688,11 +5831,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5688 btrfs_mark_buffer_dirty(path->nodes[0]); 5831 btrfs_mark_buffer_dirty(path->nodes[0]);
5689 btrfs_free_path(path); 5832 btrfs_free_path(path);
5690 5833
5691 location = &BTRFS_I(inode)->location;
5692 location->objectid = objectid;
5693 location->offset = 0;
5694 btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5695
5696 btrfs_inherit_iflags(inode, dir); 5834 btrfs_inherit_iflags(inode, dir);
5697 5835
5698 if (S_ISREG(mode)) { 5836 if (S_ISREG(mode)) {
@@ -5703,7 +5841,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5703 BTRFS_INODE_NODATASUM; 5841 BTRFS_INODE_NODATASUM;
5704 } 5842 }
5705 5843
5706 btrfs_insert_inode_hash(inode);
5707 inode_tree_add(inode); 5844 inode_tree_add(inode);
5708 5845
5709 trace_btrfs_inode_new(inode); 5846 trace_btrfs_inode_new(inode);
@@ -5718,6 +5855,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5718 btrfs_ino(inode), root->root_key.objectid, ret); 5855 btrfs_ino(inode), root->root_key.objectid, ret);
5719 5856
5720 return inode; 5857 return inode;
5858
5859fail_unlock:
5860 unlock_new_inode(inode);
5721fail: 5861fail:
5722 if (dir && name) 5862 if (dir && name)
5723 BTRFS_I(dir)->index_cnt--; 5863 BTRFS_I(dir)->index_cnt--;
@@ -5751,7 +5891,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
                 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
         } else {
                 key.objectid = ino;
-                btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+                key.type = BTRFS_INODE_ITEM_KEY;
                 key.offset = 0;
         }
 
@@ -5852,28 +5992,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
                 goto out_unlock;
         }
 
-        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-        if (err) {
-                drop_inode = 1;
-                goto out_unlock;
-        }
-
         /*
          * If the active LSM wants to access the inode during
          * d_instantiate it needs these. Smack checks to see
          * if the filesystem supports xattrs by looking at the
          * ops vector.
          */
-
         inode->i_op = &btrfs_special_inode_operations;
-        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+        init_special_inode(inode, inode->i_mode, rdev);
+
+        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
         if (err)
-                drop_inode = 1;
-        else {
-                init_special_inode(inode, inode->i_mode, rdev);
+                goto out_unlock_inode;
+
+        err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+        if (err) {
+                goto out_unlock_inode;
+        } else {
                 btrfs_update_inode(trans, root, inode);
+                unlock_new_inode(inode);
                 d_instantiate(dentry, inode);
         }
+
 out_unlock:
         btrfs_end_transaction(trans, root);
         btrfs_balance_delayed_items(root);
@@ -5883,6 +6023,12 @@ out_unlock:
                 iput(inode);
         }
         return err;
+
+out_unlock_inode:
+        drop_inode = 1;
+        unlock_new_inode(inode);
+        goto out_unlock;
+
 }
 
 static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5917,15 +6063,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
5917 goto out_unlock; 6063 goto out_unlock;
5918 } 6064 }
5919 drop_inode_on_err = 1; 6065 drop_inode_on_err = 1;
5920
5921 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5922 if (err)
5923 goto out_unlock;
5924
5925 err = btrfs_update_inode(trans, root, inode);
5926 if (err)
5927 goto out_unlock;
5928
5929 /* 6066 /*
5930 * If the active LSM wants to access the inode during 6067 * If the active LSM wants to access the inode during
5931 * d_instantiate it needs these. Smack checks to see 6068 * d_instantiate it needs these. Smack checks to see
@@ -5934,14 +6071,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
          */
         inode->i_fop = &btrfs_file_operations;
         inode->i_op = &btrfs_file_inode_operations;
+        inode->i_mapping->a_ops = &btrfs_aops;
+        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+
+        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+        if (err)
+                goto out_unlock_inode;
+
+        err = btrfs_update_inode(trans, root, inode);
+        if (err)
+                goto out_unlock_inode;
 
         err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
         if (err)
-                goto out_unlock;
+                goto out_unlock_inode;
 
-        inode->i_mapping->a_ops = &btrfs_aops;
-        inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
         BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+        unlock_new_inode(inode);
         d_instantiate(dentry, inode);
 
 out_unlock:
@@ -5953,6 +6099,11 @@ out_unlock:
5953 btrfs_balance_delayed_items(root); 6099 btrfs_balance_delayed_items(root);
5954 btrfs_btree_balance_dirty(root); 6100 btrfs_btree_balance_dirty(root);
5955 return err; 6101 return err;
6102
6103out_unlock_inode:
6104 unlock_new_inode(inode);
6105 goto out_unlock;
6106
5956} 6107}
5957 6108
5958static int btrfs_link(struct dentry *old_dentry, struct inode *dir, 6109static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6060,25 +6211,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
         }
 
         drop_on_err = 1;
+        /* these must be set before we unlock the inode */
+        inode->i_op = &btrfs_dir_inode_operations;
+        inode->i_fop = &btrfs_dir_file_operations;
 
         err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
         if (err)
-                goto out_fail;
-
-        inode->i_op = &btrfs_dir_inode_operations;
-        inode->i_fop = &btrfs_dir_file_operations;
+                goto out_fail_inode;
 
         btrfs_i_size_write(inode, 0);
         err = btrfs_update_inode(trans, root, inode);
         if (err)
-                goto out_fail;
+                goto out_fail_inode;
 
         err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
                              dentry->d_name.len, 0, index);
         if (err)
-                goto out_fail;
+                goto out_fail_inode;
 
         d_instantiate(dentry, inode);
+        /*
+         * mkdir is special. We're unlocking after we call d_instantiate
+         * to avoid a race with nfsd calling d_instantiate.
+         */
+        unlock_new_inode(inode);
         drop_on_err = 0;
 
 out_fail:
@@ -6088,23 +6244,66 @@ out_fail:
         btrfs_balance_delayed_items(root);
         btrfs_btree_balance_dirty(root);
         return err;
+
+out_fail_inode:
+        unlock_new_inode(inode);
+        goto out_fail;
+}
+
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+        struct rb_node *next;
+
+        next = rb_next(&em->rb_node);
+        if (!next)
+                return NULL;
+        return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+        struct rb_node *prev;
+
+        prev = rb_prev(&em->rb_node);
+        if (!prev)
+                return NULL;
+        return container_of(prev, struct extent_map, rb_node);
 }
 
 /* helper for btfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
  * and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
  */
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
                                 struct extent_map *existing,
                                 struct extent_map *em,
-                                u64 map_start, u64 map_len)
+                                u64 map_start)
 {
+        struct extent_map *prev;
+        struct extent_map *next;
+        u64 start;
+        u64 end;
         u64 start_diff;
 
         BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-        start_diff = map_start - em->start;
-        em->start = map_start;
-        em->len = map_len;
+
+        if (existing->start > map_start) {
+                next = existing;
+                prev = prev_extent_map(next);
+        } else {
+                prev = existing;
+                next = next_extent_map(prev);
+        }
+
+        start = prev ? extent_map_end(prev) : em->start;
+        start = max_t(u64, start, em->start);
+        end = next ? next->start : extent_map_end(em);
+        end = min_t(u64, end, extent_map_end(em));
+        start_diff = start - em->start;
+        em->start = start;
+        em->len = end - start;
         if (em->block_start < EXTENT_MAP_LAST_BYTE &&
             !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
                 em->block_start += start_diff;
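
Note: merge_extent_mapping() no longer blindly trims the new extent map to [map_start, map_start + map_len); it now clamps it to the hole between the rb-tree neighbours of the nearest existing map, which is what makes the "best fitted" insertion safe against overlaps on both sides. A worked example of the clamping arithmetic, with assumed offsets:

#include <stdio.h>
#include <stdint.h>

#define MAX_T(a, b) ((a) > (b) ? (a) : (b))
#define MIN_T(a, b) ((a) < (b) ? (a) : (b))

/* model of the clamping math in the reworked merge_extent_mapping():
 * the new em is trimmed to the hole between prev and next */
int main(void)
{
        /* assumed layout: prev covers [0, 8K), next starts at 20K,
         * and the em we want to insert covers [4K, 24K) */
        uint64_t prev_end = 8192, next_start = 20480;
        uint64_t em_start = 4096, em_end = 24576;      /* extent_map_end(em) */

        uint64_t start = MAX_T(prev_end, em_start);
        uint64_t end = MIN_T(next_start, em_end);
        uint64_t start_diff = start - em_start;

        /* em is clamped to [8K, 20K); block_start advances by start_diff */
        printf("em trimmed to [%llu, %llu), start_diff=%llu\n",
               (unsigned long long)start, (unsigned long long)end,
               (unsigned long long)start_diff);
        return 0;
}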
@@ -6232,7 +6431,7 @@ again:
                                     struct btrfs_file_extent_item);
                 /* are we inside the extent that was found? */
                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-                found_type = btrfs_key_type(&found_key);
+                found_type = found_key.type;
                 if (found_key.objectid != objectid ||
                     found_type != BTRFS_EXTENT_DATA_KEY) {
                         /*
@@ -6275,6 +6474,8 @@ next:
6275 goto not_found; 6474 goto not_found;
6276 if (start + len <= found_key.offset) 6475 if (start + len <= found_key.offset)
6277 goto not_found; 6476 goto not_found;
6477 if (start > found_key.offset)
6478 goto next;
6278 em->start = start; 6479 em->start = start;
6279 em->orig_start = start; 6480 em->orig_start = start;
6280 em->len = found_key.offset - start; 6481 em->len = found_key.offset - start;
@@ -6379,26 +6580,21 @@ insert:
 
         ret = 0;
 
-        existing = lookup_extent_mapping(em_tree, start, len);
-        if (existing && (existing->start > start ||
-            existing->start + existing->len <= start)) {
+        existing = search_extent_mapping(em_tree, start, len);
+        /*
+         * existing will always be non-NULL, since there must be
+         * extent causing the -EEXIST.
+         */
+        if (start >= extent_map_end(existing) ||
+            start <= existing->start) {
+                /*
+                 * The existing extent map is the one nearest to
+                 * the [start, start + len) range which overlaps
+                 */
+                err = merge_extent_mapping(em_tree, existing,
+                                           em, start);
                 free_extent_map(existing);
-                existing = NULL;
-        }
-        if (!existing) {
-                existing = lookup_extent_mapping(em_tree, em->start,
-                                                 em->len);
-                if (existing) {
-                        err = merge_extent_mapping(em_tree, existing,
-                                                   em, start,
-                                                   root->sectorsize);
-                        free_extent_map(existing);
-                        if (err) {
-                                free_extent_map(em);
-                                em = NULL;
-                        }
-                } else {
-                        err = -EIO;
+                if (err) {
                         free_extent_map(em);
                         em = NULL;
                 }
@@ -7010,8 +7206,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                        block_start, len,
                                        orig_block_len,
                                        ram_bytes, type);
-                if (IS_ERR(em))
+                if (IS_ERR(em)) {
+                        ret = PTR_ERR(em);
                         goto unlock_err;
+                }
         }
 
         ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7086,45 +7284,277 @@ unlock_err:
         return ret;
 }
 
-static void btrfs_endio_direct_read(struct bio *bio, int err)
-{
-        struct btrfs_dio_private *dip = bio->bi_private;
-        struct bio_vec *bvec;
-        struct inode *inode = dip->inode;
-        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct bio *dio_bio;
-        u32 *csums = (u32 *)dip->csum;
-        u64 start;
-        int i;
-
-        start = dip->logical_offset;
-        bio_for_each_segment_all(bvec, bio, i) {
-                if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-                        struct page *page = bvec->bv_page;
-                        char *kaddr;
-                        u32 csum = ~(u32)0;
-                        unsigned long flags;
-
-                        local_irq_save(flags);
-                        kaddr = kmap_atomic(page);
-                        csum = btrfs_csum_data(kaddr + bvec->bv_offset,
-                                               csum, bvec->bv_len);
-                        btrfs_csum_final(csum, (char *)&csum);
-                        kunmap_atomic(kaddr);
-                        local_irq_restore(flags);
-
-                        flush_dcache_page(bvec->bv_page);
-                        if (csum != csums[i]) {
-                                btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-                                          btrfs_ino(inode), start, csum,
-                                          csums[i]);
-                                err = -EIO;
-                        }
-                }
-
-                start += bvec->bv_len;
-        }
-
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+                                        int rw, int mirror_num)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret;
+
+        BUG_ON(rw & REQ_WRITE);
+
+        bio_get(bio);
+
+        ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                  BTRFS_WQ_ENDIO_DIO_REPAIR);
+        if (ret)
+                goto err;
+
+        ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+        bio_put(bio);
+        return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+                                      struct bio *failed_bio,
+                                      struct io_failure_record *failrec,
+                                      int failed_mirror)
+{
+        int num_copies;
+
+        num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+                                      failrec->logical, failrec->len);
+        if (num_copies == 1) {
+                /*
+                 * we only have a single copy of the data, so don't bother with
+                 * all the retry and error correction code that follows. no
+                 * matter what the error is, it is very likely to persist.
+                 */
+                pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                         num_copies, failrec->this_mirror, failed_mirror);
+                return 0;
+        }
+
+        failrec->failed_mirror = failed_mirror;
+        failrec->this_mirror++;
+        if (failrec->this_mirror == failed_mirror)
+                failrec->this_mirror++;
+
+        if (failrec->this_mirror > num_copies) {
+                pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+                         num_copies, failrec->this_mirror, failed_mirror);
+                return 0;
+        }
+
+        return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+                          struct page *page, u64 start, u64 end,
+                          int failed_mirror, bio_end_io_t *repair_endio,
+                          void *repair_arg)
+{
+        struct io_failure_record *failrec;
+        struct bio *bio;
+        int isector;
+        int read_mode;
+        int ret;
+
+        BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+        ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+        if (ret)
+                return ret;
+
+        ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+                                         failed_mirror);
+        if (!ret) {
+                free_io_failure(inode, failrec);
+                return -EIO;
+        }
+
+        if (failed_bio->bi_vcnt > 1)
+                read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+        else
+                read_mode = READ_SYNC;
+
+        isector = start - btrfs_io_bio(failed_bio)->logical;
+        isector >>= inode->i_sb->s_blocksize_bits;
+        bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+                                      0, isector, repair_endio, repair_arg);
+        if (!bio) {
+                free_io_failure(inode, failrec);
+                return -EIO;
+        }
+
+        btrfs_debug(BTRFS_I(inode)->root->fs_info,
+                    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+                    read_mode, failrec->this_mirror, failrec->in_validation);
+
+        ret = submit_dio_repair_bio(inode, bio, read_mode,
+                                    failrec->this_mirror);
+        if (ret) {
+                free_io_failure(inode, failrec);
+                bio_put(bio);
+        }
+
+        return ret;
+}
+
+struct btrfs_retry_complete {
+        struct completion done;
+        struct inode *inode;
+        u64 start;
+        int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+        struct btrfs_retry_complete *done = bio->bi_private;
+        struct bio_vec *bvec;
+        int i;
+
+        if (err)
+                goto end;
+
+        done->uptodate = 1;
+        bio_for_each_segment_all(bvec, bio, i)
+                clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+        complete(&done->done);
+        bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+                                       struct btrfs_io_bio *io_bio)
+{
+        struct bio_vec *bvec;
+        struct btrfs_retry_complete done;
+        u64 start;
+        int i;
+        int ret;
+
+        start = io_bio->logical;
+        done.inode = inode;
+
+        bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+                done.uptodate = 0;
+                done.start = start;
+                init_completion(&done.done);
+
+                ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                     start + bvec->bv_len - 1,
+                                     io_bio->mirror_num,
+                                     btrfs_retry_endio_nocsum, &done);
+                if (ret)
+                        return ret;
+
+                wait_for_completion(&done.done);
+
+                if (!done.uptodate) {
+                        /* We might have another mirror, so try again */
+                        goto try_again;
+                }
+
+                start += bvec->bv_len;
+        }
+
+        return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+        struct btrfs_retry_complete *done = bio->bi_private;
+        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+        struct bio_vec *bvec;
+        int uptodate;
+        int ret;
+        int i;
+
+        if (err)
+                goto end;
+
+        uptodate = 1;
+        bio_for_each_segment_all(bvec, bio, i) {
+                ret = __readpage_endio_check(done->inode, io_bio, i,
+                                             bvec->bv_page, 0,
+                                             done->start, bvec->bv_len);
+                if (!ret)
+                        clean_io_failure(done->inode, done->start,
+                                         bvec->bv_page, 0);
+                else
+                        uptodate = 0;
+        }
+
+        done->uptodate = uptodate;
+end:
+        complete(&done->done);
+        bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+                                    struct btrfs_io_bio *io_bio, int err)
+{
+        struct bio_vec *bvec;
+        struct btrfs_retry_complete done;
+        u64 start;
+        u64 offset = 0;
+        int i;
+        int ret;
+
+        err = 0;
+        start = io_bio->logical;
+        done.inode = inode;
+
+        bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+                ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+                                             0, start, bvec->bv_len);
+                if (likely(!ret))
+                        goto next;
+try_again:
+                done.uptodate = 0;
+                done.start = start;
+                init_completion(&done.done);
+
+                ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+                                     start + bvec->bv_len - 1,
+                                     io_bio->mirror_num,
+                                     btrfs_retry_endio, &done);
+                if (ret) {
+                        err = ret;
+                        goto next;
+                }
+
+                wait_for_completion(&done.done);
+
+                if (!done.uptodate) {
+                        /* We might have another mirror, so try again */
+                        goto try_again;
+                }
next:
+                offset += bvec->bv_len;
+                start += bvec->bv_len;
+        }
+
+        return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+                                  struct btrfs_io_bio *io_bio, int err)
+{
+        bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+        if (skip_csum) {
+                if (unlikely(err))
+                        return __btrfs_correct_data_nocsum(inode, io_bio);
+                else
+                        return 0;
+        } else {
+                return __btrfs_subio_endio_read(inode, io_bio, err);
+        }
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+        struct btrfs_dio_private *dip = bio->bi_private;
+        struct inode *inode = dip->inode;
+        struct bio *dio_bio;
+        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+        if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+                err = btrfs_subio_endio_read(inode, io_bio, err);
+
         unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
                       dip->logical_offset + dip->bytes - 1);
         dio_bio = dip->dio_bio;
@@ -7135,6 +7565,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
7135 if (err) 7565 if (err)
7136 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); 7566 clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
7137 dio_end_io(dio_bio, err); 7567 dio_end_io(dio_bio, err);
7568
7569 if (io_bio->end_io)
7570 io_bio->end_io(io_bio, err);
7138 bio_put(bio); 7571 bio_put(bio);
7139} 7572}
7140 7573
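
Note: the large block above adds read repair for direct IO, mirroring the buffered read path: on a failed or checksum-bad block, dio_read_error() builds a one-page repair bio and resubmits it against another copy, and btrfs_check_dio_repairable() decides which mirror to try next. A userspace rendering of the mirror-stepping rule, assuming this_mirror starts at 0 for a fresh io_failure_record (that initial value is an assumption; it is set outside this hunk):

#include <stdio.h>

/* userspace model of the stepping in btrfs_check_dio_repairable():
 * skip the mirror that failed, give up once every copy has been tried */
static int next_mirror(int *this_mirror, int failed_mirror, int num_copies)
{
        if (num_copies == 1)
                return 0;                /* nothing else to read from */
        (*this_mirror)++;
        if (*this_mirror == failed_mirror)
                (*this_mirror)++;
        return *this_mirror <= num_copies;
}

int main(void)
{
        int this_mirror = 0, failed_mirror = 1, num_copies = 2;

        /* RAID1: mirror 1 failed, so the retry goes straight to mirror 2 */
        while (next_mirror(&this_mirror, failed_mirror, num_copies))
                printf("retrying from mirror %d\n", this_mirror);
        return 0;
}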
@@ -7158,7 +7591,8 @@ again:
         if (!ret)
                 goto out_test;
 
-        btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+        btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
+                        finish_ordered_fn, NULL, NULL);
         btrfs_queue_work(root->fs_info->endio_write_workers,
                          &ordered->work);
 out_test:
@@ -7199,12 +7633,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 {
         struct btrfs_dio_private *dip = bio->bi_private;
 
+        if (err)
+                btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
+                           "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
+                           btrfs_ino(dip->inode), bio->bi_rw,
+                           (unsigned long long)bio->bi_iter.bi_sector,
+                           bio->bi_iter.bi_size, err);
+
+        if (dip->subio_endio)
+                err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
+
         if (err) {
-                btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
-                          "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
-                          btrfs_ino(dip->inode), bio->bi_rw,
-                          (unsigned long long)bio->bi_iter.bi_sector,
-                          bio->bi_iter.bi_size, err);
                 dip->errors = 1;
 
                 /*
@@ -7235,6 +7674,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7235 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 7674 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7236} 7675}
7237 7676
7677static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
7678 struct inode *inode,
7679 struct btrfs_dio_private *dip,
7680 struct bio *bio,
7681 u64 file_offset)
7682{
7683 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7684 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
7685 int ret;
7686
7687 /*
7688 * We load all the csum data we need when we submit
7689 * the first bio to reduce the csum tree search and
7690 * contention.
7691 */
7692 if (dip->logical_offset == file_offset) {
7693 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
7694 file_offset);
7695 if (ret)
7696 return ret;
7697 }
7698
7699 if (bio == dip->orig_bio)
7700 return 0;
7701
7702 file_offset -= dip->logical_offset;
7703 file_offset >>= inode->i_sb->s_blocksize_bits;
7704 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
7705
7706 return 0;
7707}
7708
7238static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 7709static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7239 int rw, u64 file_offset, int skip_sum, 7710 int rw, u64 file_offset, int skip_sum,
7240 int async_submit) 7711 int async_submit)
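
Note: btrfs_lookup_and_bind_dio_csum() loads the checksums for the whole original bio once, then points each split bio at its slice of the parent's csum array; the index is just the byte distance from the start of the DIO, converted to blocks. A worked example of that pointer arithmetic, assuming 4 KiB blocks and 4-byte crc32c items:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* assumptions for illustration: 4 KiB blocks, 4-byte csum items */
        unsigned int blocksize_bits = 12;
        uint32_t csums[8] = {0};                 /* parent bio's csum array */

        uint64_t dip_logical_offset = 1048576;   /* dio starts at 1 MiB */
        uint64_t file_offset = 1048576 + 16384;  /* split bio starts 16 KiB in */

        /* same arithmetic as btrfs_lookup_and_bind_dio_csum() */
        uint64_t idx = (file_offset - dip_logical_offset) >> blocksize_bits;
        uint32_t *slice = csums + idx;

        /* the split bio's csums start at entry 4 of the parent array */
        printf("csum slice starts at index %llu (%p)\n",
               (unsigned long long)idx, (void *)slice);
        return 0;
}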
@@ -7250,7 +7721,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
         bio_get(bio);
 
         if (!write) {
-                ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+                ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+                                          BTRFS_WQ_ENDIO_DATA);
                 if (ret)
                         goto err;
         }
@@ -7273,13 +7745,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
                 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
                 if (ret)
                         goto err;
-        } else if (!skip_sum) {
-                ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
+        } else {
+                ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
                                                 file_offset);
                 if (ret)
                         goto err;
         }
-
 map:
         ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
 err:
@@ -7300,19 +7771,18 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
         u64 submit_len = 0;
         u64 map_length;
         int nr_pages = 0;
-        int ret = 0;
+        int ret;
         int async_submit = 0;
 
         map_length = orig_bio->bi_iter.bi_size;
         ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
                               &map_length, NULL, 0);
-        if (ret) {
-                bio_put(orig_bio);
+        if (ret)
                 return -EIO;
-        }
 
         if (map_length >= orig_bio->bi_iter.bi_size) {
                 bio = orig_bio;
+                dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
                 goto submit;
         }
 
@@ -7326,14 +7796,16 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
         bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
         if (!bio)
                 return -ENOMEM;
+
         bio->bi_private = dip;
         bio->bi_end_io = btrfs_end_dio_bio;
+        btrfs_io_bio(bio)->logical = file_offset;
         atomic_inc(&dip->pending_bios);
 
         while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
-                if (unlikely(map_length < submit_len + bvec->bv_len ||
+                if (map_length < submit_len + bvec->bv_len ||
                     bio_add_page(bio, bvec->bv_page, bvec->bv_len,
-                                 bvec->bv_offset) < bvec->bv_len)) {
+                                 bvec->bv_offset) < bvec->bv_len) {
                         /*
                          * inc the count before we submit the bio so
                          * we know the end IO handler won't happen before
@@ -7362,6 +7834,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
                                 goto out_err;
                         bio->bi_private = dip;
                         bio->bi_end_io = btrfs_end_dio_bio;
+                        btrfs_io_bio(bio)->logical = file_offset;
 
                         map_length = orig_bio->bi_iter.bi_size;
                         ret = btrfs_map_block(root->fs_info, rw,
@@ -7405,11 +7878,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_dio_private *dip;
         struct bio *io_bio;
+        struct btrfs_io_bio *btrfs_bio;
         int skip_sum;
-        int sum_len;
         int write = rw & REQ_WRITE;
         int ret = 0;
-        u16 csum_size;
 
         skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
@@ -7419,16 +7891,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
                 goto free_ordered;
         }
 
-        if (!skip_sum && !write) {
-                csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-                sum_len = dio_bio->bi_iter.bi_size >>
-                        inode->i_sb->s_blocksize_bits;
-                sum_len *= csum_size;
-        } else {
-                sum_len = 0;
-        }
-
-        dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+        dip = kzalloc(sizeof(*dip), GFP_NOFS);
         if (!dip) {
                 ret = -ENOMEM;
                 goto free_io_bio;
@@ -7440,20 +7903,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
         dip->bytes = dio_bio->bi_iter.bi_size;
         dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
         io_bio->bi_private = dip;
-        dip->errors = 0;
         dip->orig_bio = io_bio;
         dip->dio_bio = dio_bio;
         atomic_set(&dip->pending_bios, 0);
+        btrfs_bio = btrfs_io_bio(io_bio);
+        btrfs_bio->logical = file_offset;
 
-        if (write)
+        if (write) {
                 io_bio->bi_end_io = btrfs_endio_direct_write;
-        else
+        } else {
                 io_bio->bi_end_io = btrfs_endio_direct_read;
+                dip->subio_endio = btrfs_subio_endio_read;
+        }
 
         ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
         if (!ret)
                 return;
 
+        if (btrfs_bio->end_io)
+                btrfs_bio->end_io(btrfs_bio, ret);
 free_io_bio:
         bio_put(io_bio);
 
@@ -7534,7 +8002,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
         count = iov_iter_count(iter);
         if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
                      &BTRFS_I(inode)->runtime_flags))
-                filemap_fdatawrite_range(inode->i_mapping, offset, count);
+                filemap_fdatawrite_range(inode->i_mapping, offset,
+                                         offset + count - 1);
 
         if (rw & WRITE) {
                 /*
@@ -7549,8 +8018,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                 ret = btrfs_delalloc_reserve_space(inode, count);
                 if (ret)
                         goto out;
-        } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
-                                     &BTRFS_I(inode)->runtime_flags))) {
+        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+                            &BTRFS_I(inode)->runtime_flags)) {
                 inode_dio_done(inode);
                 flags = DIO_LOCKING | DIO_SKIP_HOLES;
                 wakeup = false;
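
Note: filemap_fdatawrite_range() takes an inclusive last-byte offset, not a length, so the old call flushed the range [offset, count] and did essentially nothing for writes at offsets beyond count; the fix passes offset + count - 1. A trivial illustration with assumed numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* a 1 MiB direct write at offset 64 MiB */
        uint64_t offset = 64ULL << 20, count = 1ULL << 20;

        /* old call: lend = count = 1 MiB, i.e. the byte range [64M, 1M],
         * which is empty, so nothing was flushed ahead of the DIO */
        printf("old: [%llu, %llu]\n", (unsigned long long)offset,
               (unsigned long long)count);

        /* fixed call: lend is the last byte of the write, inclusive */
        printf("new: [%llu, %llu]\n", (unsigned long long)offset,
               (unsigned long long)(offset + count - 1));
        return 0;
}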
@@ -8041,6 +8510,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8041 8510
8042 set_nlink(inode, 1); 8511 set_nlink(inode, 1);
8043 btrfs_i_size_write(inode, 0); 8512 btrfs_i_size_write(inode, 0);
8513 unlock_new_inode(inode);
8044 8514
8045 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8515 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8046 if (err) 8516 if (err)
@@ -8069,6 +8539,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8069 ei->last_sub_trans = 0; 8539 ei->last_sub_trans = 0;
8070 ei->logged_trans = 0; 8540 ei->logged_trans = 0;
8071 ei->delalloc_bytes = 0; 8541 ei->delalloc_bytes = 0;
8542 ei->defrag_bytes = 0;
8072 ei->disk_i_size = 0; 8543 ei->disk_i_size = 0;
8073 ei->flags = 0; 8544 ei->flags = 0;
8074 ei->csum_bytes = 0; 8545 ei->csum_bytes = 0;
@@ -8127,6 +8598,7 @@ void btrfs_destroy_inode(struct inode *inode)
8127 WARN_ON(BTRFS_I(inode)->reserved_extents); 8598 WARN_ON(BTRFS_I(inode)->reserved_extents);
8128 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 8599 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8129 WARN_ON(BTRFS_I(inode)->csum_bytes); 8600 WARN_ON(BTRFS_I(inode)->csum_bytes);
8601 WARN_ON(BTRFS_I(inode)->defrag_bytes);
8130 8602
8131 /* 8603 /*
8132 * This can happen where we create an inode, but somebody else also 8604 * This can happen where we create an inode, but somebody else also
@@ -8495,7 +8967,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
         work->inode = inode;
         work->wait = wait;
         work->delay_iput = delay_iput;
-        btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
+        WARN_ON_ONCE(!inode);
+        btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
+                        btrfs_run_delalloc_work, NULL, NULL);
 
         return work;
 }
@@ -8540,7 +9014,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
                 spin_unlock(&root->delalloc_lock);
 
                 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-                if (unlikely(!work)) {
+                if (!work) {
                         if (delay_iput)
                                 btrfs_add_delayed_iput(inode);
                         else
@@ -8699,12 +9173,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8699 goto out_unlock; 9173 goto out_unlock;
8700 } 9174 }
8701 9175
8702 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8703 if (err) {
8704 drop_inode = 1;
8705 goto out_unlock;
8706 }
8707
8708 /* 9176 /*
8709 * If the active LSM wants to access the inode during 9177 * If the active LSM wants to access the inode during
8710 * d_instantiate it needs these. Smack checks to see 9178 * d_instantiate it needs these. Smack checks to see
@@ -8713,34 +9181,32 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8713 */ 9181 */
8714 inode->i_fop = &btrfs_file_operations; 9182 inode->i_fop = &btrfs_file_operations;
8715 inode->i_op = &btrfs_file_inode_operations; 9183 inode->i_op = &btrfs_file_inode_operations;
9184 inode->i_mapping->a_ops = &btrfs_aops;
9185 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9186 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9187
9188 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
9189 if (err)
9190 goto out_unlock_inode;
8716 9191
8717 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 9192 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8718 if (err) 9193 if (err)
8719 drop_inode = 1; 9194 goto out_unlock_inode;
8720 else {
8721 inode->i_mapping->a_ops = &btrfs_aops;
8722 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8723 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8724 }
8725 if (drop_inode)
8726 goto out_unlock;
8727 9195
8728 path = btrfs_alloc_path(); 9196 path = btrfs_alloc_path();
8729 if (!path) { 9197 if (!path) {
8730 err = -ENOMEM; 9198 err = -ENOMEM;
8731 drop_inode = 1; 9199 goto out_unlock_inode;
8732 goto out_unlock;
8733 } 9200 }
8734 key.objectid = btrfs_ino(inode); 9201 key.objectid = btrfs_ino(inode);
8735 key.offset = 0; 9202 key.offset = 0;
8736 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 9203 key.type = BTRFS_EXTENT_DATA_KEY;
8737 datasize = btrfs_file_extent_calc_inline_size(name_len); 9204 datasize = btrfs_file_extent_calc_inline_size(name_len);
8738 err = btrfs_insert_empty_item(trans, root, path, &key, 9205 err = btrfs_insert_empty_item(trans, root, path, &key,
8739 datasize); 9206 datasize);
8740 if (err) { 9207 if (err) {
8741 drop_inode = 1;
8742 btrfs_free_path(path); 9208 btrfs_free_path(path);
8743 goto out_unlock; 9209 goto out_unlock_inode;
8744 } 9210 }
8745 leaf = path->nodes[0]; 9211 leaf = path->nodes[0];
8746 ei = btrfs_item_ptr(leaf, path->slots[0], 9212 ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8764,12 +9230,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8764 inode_set_bytes(inode, name_len); 9230 inode_set_bytes(inode, name_len);
8765 btrfs_i_size_write(inode, name_len); 9231 btrfs_i_size_write(inode, name_len);
8766 err = btrfs_update_inode(trans, root, inode); 9232 err = btrfs_update_inode(trans, root, inode);
8767 if (err) 9233 if (err) {
8768 drop_inode = 1; 9234 drop_inode = 1;
9235 goto out_unlock_inode;
9236 }
9237
9238 unlock_new_inode(inode);
9239 d_instantiate(dentry, inode);
8769 9240
8770out_unlock: 9241out_unlock:
8771 if (!err)
8772 d_instantiate(dentry, inode);
8773 btrfs_end_transaction(trans, root); 9242 btrfs_end_transaction(trans, root);
8774 if (drop_inode) { 9243 if (drop_inode) {
8775 inode_dec_link_count(inode); 9244 inode_dec_link_count(inode);
@@ -8777,6 +9246,11 @@ out_unlock:
8777 } 9246 }
8778 btrfs_btree_balance_dirty(root); 9247 btrfs_btree_balance_dirty(root);
8779 return err; 9248 return err;
9249
9250out_unlock_inode:
9251 drop_inode = 1;
9252 unlock_new_inode(inode);
9253 goto out_unlock;
8780} 9254}
8781 9255
8782static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 9256static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
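The rework above replaces the drop_inode flag juggling with a single out_unlock_inode label: the inode stays locked (I_NEW) while the security attributes, directory entry and inline extent item are set up, and every failure funnels through one exit that marks the inode for dropping and unlocks it exactly once. A compressed userspace sketch of that unwind shape (labels and step contents are hypothetical):

#include <errno.h>
#include <stdio.h>

static int step(int fail) { return fail ? -ENOMEM : 0; }

/* Mirrors the btrfs_symlink() flow: do all setup while the inode is
 * "locked", and funnel every failure through one unlock-and-drop exit. */
static int create_symlink(int fail_at)
{
    int err, drop = 0;

    err = step(fail_at == 1);            /* init security xattrs   */
    if (err)
        goto out_unlock_inode;
    err = step(fail_at == 2);            /* add directory entry    */
    if (err)
        goto out_unlock_inode;
    err = step(fail_at == 3);            /* insert inline extent   */
    if (err)
        goto out_unlock_inode;

    printf("unlock_new_inode + d_instantiate\n");
out:
    if (drop)
        printf("inode_dec_link_count + iput\n");
    return err;

out_unlock_inode:
    drop = 1;
    printf("unlock_new_inode (error path)\n");
    goto out;
}

int main(void)
{
    return create_symlink(2) == -ENOMEM ? 0 : 1;
}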
@@ -8960,14 +9434,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8960 goto out; 9434 goto out;
8961 } 9435 }
8962 9436
8963 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
8964 if (ret)
8965 goto out;
8966
8967 ret = btrfs_update_inode(trans, root, inode);
8968 if (ret)
8969 goto out;
8970
8971 inode->i_fop = &btrfs_file_operations; 9437 inode->i_fop = &btrfs_file_operations;
8972 inode->i_op = &btrfs_file_inode_operations; 9438 inode->i_op = &btrfs_file_inode_operations;
8973 9439
@@ -8975,10 +9441,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8975 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 9441 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8976 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9442 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8977 9443
9444 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
9445 if (ret)
9446 goto out_inode;
9447
9448 ret = btrfs_update_inode(trans, root, inode);
9449 if (ret)
9450 goto out_inode;
8978 ret = btrfs_orphan_add(trans, inode); 9451 ret = btrfs_orphan_add(trans, inode);
8979 if (ret) 9452 if (ret)
8980 goto out; 9453 goto out_inode;
8981 9454
9455 /*
 9456 * We set the number of links to 0 in btrfs_new_inode(), and here we
 9457 * set it to 1 because d_tmpfile() decrements the link count and would
 9458 * warn on underflow, via:
9459 *
9460 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9461 */
9462 set_nlink(inode, 1);
9463 unlock_new_inode(inode);
8982 d_tmpfile(dentry, inode); 9464 d_tmpfile(dentry, inode);
8983 mark_inode_dirty(inode); 9465 mark_inode_dirty(inode);
8984 9466
@@ -8988,8 +9470,12 @@ out:
8988 iput(inode); 9470 iput(inode);
8989 btrfs_balance_delayed_items(root); 9471 btrfs_balance_delayed_items(root);
8990 btrfs_btree_balance_dirty(root); 9472 btrfs_btree_balance_dirty(root);
8991
8992 return ret; 9473 return ret;
9474
9475out_inode:
9476 unlock_new_inode(inode);
9477 goto out;
9478
8993} 9479}
8994 9480
8995static const struct inode_operations btrfs_dir_inode_operations = { 9481static const struct inode_operations btrfs_dir_inode_operations = {
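btrfs_new_inode() hands back an inode with i_nlink == 0 (tmpfiles are born orphaned), but d_tmpfile() immediately decrements the link count and warns on underflow, so the count is bumped to 1 first and only then is the still-new inode unlocked and attached. A toy model of that underflow guard (names hypothetical):

#include <assert.h>
#include <stdio.h>

struct inode { unsigned int nlink; };

/* drop_nlink()-style helper: warns instead of wrapping to UINT_MAX. */
static void dec_link_count(struct inode *i)
{
    if (i->nlink == 0) {
        fprintf(stderr, "WARNING: nlink underflow\n");
        return;
    }
    i->nlink--;
}

int main(void)
{
    struct inode tmp = { .nlink = 0 }; /* as btrfs_new_inode() leaves it */

    tmp.nlink = 1;          /* set_nlink(inode, 1) before d_tmpfile()   */
    dec_link_count(&tmp);   /* d_tmpfile() -> inode_dec_link_count()    */
    assert(tmp.nlink == 0); /* back to orphan, with no warning fired    */
    return 0;
}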
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47aceb494d1d..e732274f1afd 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
332 goto out_drop; 332 goto out_drop;
333 333
334 } else { 334 } else {
335 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
336 if (ret && ret != -ENODATA)
337 goto out_drop;
335 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 338 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
336 } 339 }
337 340
@@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir,
477 if (ret) 480 if (ret)
478 goto fail; 481 goto fail;
479 482
480 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 483 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
481 0, objectid, NULL, 0, 0, 0);
482 if (IS_ERR(leaf)) { 484 if (IS_ERR(leaf)) {
483 ret = PTR_ERR(leaf); 485 ret = PTR_ERR(leaf);
484 goto fail; 486 goto fail;
@@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir,
503 btrfs_set_stack_inode_generation(inode_item, 1); 505 btrfs_set_stack_inode_generation(inode_item, 1);
504 btrfs_set_stack_inode_size(inode_item, 3); 506 btrfs_set_stack_inode_size(inode_item, 3);
505 btrfs_set_stack_inode_nlink(inode_item, 1); 507 btrfs_set_stack_inode_nlink(inode_item, 1);
506 btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); 508 btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
507 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); 509 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
508 510
509 btrfs_set_root_flags(&root_item, 0); 511 btrfs_set_root_flags(&root_item, 0);
@@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir,
535 537
536 key.objectid = objectid; 538 key.objectid = objectid;
537 key.offset = 0; 539 key.offset = 0;
538 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 540 key.type = BTRFS_ROOT_ITEM_KEY;
539 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 541 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
540 &root_item); 542 &root_item);
541 if (ret) 543 if (ret)
@@ -711,39 +713,6 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
711 if (ret) 713 if (ret)
712 goto fail; 714 goto fail;
713 715
714 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
715 if (ret)
716 goto fail;
717
718 /*
719 * If orphan cleanup did remove any orphans, it means the tree was
720 * modified and therefore the commit root is not the same as the
721 * current root anymore. This is a problem, because send uses the
722 * commit root and therefore can see inode items that don't exist
723 * in the current root anymore, and for example make calls to
724 * btrfs_iget, which will do tree lookups based on the current root
725 * and not on the commit root. Those lookups will fail, returning a
726 * -ESTALE error, and making send fail with that error. So make sure
727 * a send does not see any orphans we have just removed, and that it
728 * will see the same inodes regardless of whether a transaction
729 * commit happened before it started (meaning that the commit root
730 * will be the same as the current root) or not.
731 */
732 if (readonly && pending_snapshot->snap->node !=
733 pending_snapshot->snap->commit_root) {
734 trans = btrfs_join_transaction(pending_snapshot->snap);
735 if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
736 ret = PTR_ERR(trans);
737 goto fail;
738 }
739 if (!IS_ERR(trans)) {
740 ret = btrfs_commit_transaction(trans,
741 pending_snapshot->snap);
742 if (ret)
743 goto fail;
744 }
745 }
746
747 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 716 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
748 if (IS_ERR(inode)) { 717 if (IS_ERR(inode)) {
749 ret = PTR_ERR(inode); 718 ret = PTR_ERR(inode);
@@ -915,7 +884,7 @@ out_unlock:
915 * file you want to defrag, we return 0 to let you know to skip this 884 * file you want to defrag, we return 0 to let you know to skip this
916 * part of the file 885 * part of the file
917 */ 886 */
918static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) 887static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
919{ 888{
920 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 889 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
921 struct extent_map *em = NULL; 890 struct extent_map *em = NULL;
@@ -950,7 +919,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
950 */ 919 */
951static int find_new_extents(struct btrfs_root *root, 920static int find_new_extents(struct btrfs_root *root,
952 struct inode *inode, u64 newer_than, 921 struct inode *inode, u64 newer_than,
953 u64 *off, int thresh) 922 u64 *off, u32 thresh)
954{ 923{
955 struct btrfs_path *path; 924 struct btrfs_path *path;
956 struct btrfs_key min_key; 925 struct btrfs_key min_key;
@@ -969,12 +938,9 @@ static int find_new_extents(struct btrfs_root *root,
969 min_key.offset = *off; 938 min_key.offset = *off;
970 939
971 while (1) { 940 while (1) {
972 path->keep_locks = 1;
973 ret = btrfs_search_forward(root, &min_key, path, newer_than); 941 ret = btrfs_search_forward(root, &min_key, path, newer_than);
974 if (ret != 0) 942 if (ret != 0)
975 goto none; 943 goto none;
976 path->keep_locks = 0;
977 btrfs_unlock_up_safe(path, 1);
978process_slot: 944process_slot:
979 if (min_key.objectid != ino) 945 if (min_key.objectid != ino)
980 goto none; 946 goto none;
@@ -1052,15 +1018,17 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
1052 return false; 1018 return false;
1053 1019
1054 next = defrag_lookup_extent(inode, em->start + em->len); 1020 next = defrag_lookup_extent(inode, em->start + em->len);
1055 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || 1021 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
1056 (em->block_start + em->block_len == next->block_start)) 1022 ret = false;
1023 else if ((em->block_start + em->block_len == next->block_start) &&
1024 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
1057 ret = false; 1025 ret = false;
1058 1026
1059 free_extent_map(next); 1027 free_extent_map(next);
1060 return ret; 1028 return ret;
1061} 1029}
1062 1030
1063static int should_defrag_range(struct inode *inode, u64 start, int thresh, 1031static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
1064 u64 *last_len, u64 *skip, u64 *defrag_end, 1032 u64 *last_len, u64 *skip, u64 *defrag_end,
1065 int compress) 1033 int compress)
1066{ 1034{
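The new test in defrag_check_next_extent() only declares the next extent not worth touching when it is physically contiguous with the current one and both pieces already exceed 128 KiB; two small contiguous extents are still worth rewriting into one. A standalone predicate with the same logic (the 128 KiB threshold is taken from the hunk, everything else is hypothetical):

#include <stdbool.h>
#include <stdio.h>

#define DEFRAG_MIN_LEN (128 * 1024ULL)

struct extent { unsigned long long block_start, block_len; };

/* true when defrag should still consider the pair worth merging */
static bool next_extent_mergeable(const struct extent *cur,
                                  const struct extent *next)
{
    bool contiguous = cur->block_start + cur->block_len ==
                      next->block_start;
    bool both_large = cur->block_len > DEFRAG_MIN_LEN &&
                      next->block_len > DEFRAG_MIN_LEN;

    /* contiguous and already large: leave them alone */
    return !(contiguous && both_large);
}

int main(void)
{
    struct extent a = { 0, 64 * 1024 }, b = { 64 * 1024, 64 * 1024 };
    struct extent c = { 0, 256 * 1024 }, d = { 256 * 1024, 256 * 1024 };

    printf("small contiguous pair mergeable: %d\n",
           next_extent_mergeable(&a, &b));   /* 1: still defrag them */
    printf("large contiguous pair mergeable: %d\n",
           next_extent_mergeable(&c, &d));   /* 0: skip              */
    return 0;
}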
@@ -1088,7 +1056,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
1088 } 1056 }
1089 1057
1090 next_mergeable = defrag_check_next_extent(inode, em); 1058 next_mergeable = defrag_check_next_extent(inode, em);
1091
1092 /* 1059 /*
1093 * we hit a real extent, if it is big or the next extent is not a 1060 * we hit a real extent, if it is big or the next extent is not a
1094 * real extent, don't bother defragging it 1061 * real extent, don't bother defragging it
@@ -1291,7 +1258,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1291 int ret; 1258 int ret;
1292 int defrag_count = 0; 1259 int defrag_count = 0;
1293 int compress_type = BTRFS_COMPRESS_ZLIB; 1260 int compress_type = BTRFS_COMPRESS_ZLIB;
1294 int extent_thresh = range->extent_thresh; 1261 u32 extent_thresh = range->extent_thresh;
1295 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 1262 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1296 unsigned long cluster = max_cluster; 1263 unsigned long cluster = max_cluster;
1297 u64 new_align = ~((u64)128 * 1024 - 1); 1264 u64 new_align = ~((u64)128 * 1024 - 1);
@@ -1367,8 +1334,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1367 inode->i_mapping->writeback_index = i; 1334 inode->i_mapping->writeback_index = i;
1368 1335
1369 while (i <= last_index && defrag_count < max_to_defrag && 1336 while (i <= last_index && defrag_count < max_to_defrag &&
1370 (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 1337 (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
1371 PAGE_CACHE_SHIFT)) {
1372 /* 1338 /*
1373 * make sure we stop running if someone unmounts 1339 * make sure we stop running if someone unmounts
1374 * the FS 1340 * the FS
@@ -1391,7 +1357,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1391 * the should_defrag function tells us how much to skip 1357 * the should_defrag function tells us how much to skip
1392 * bump our counter by the suggested amount 1358 * bump our counter by the suggested amount
1393 */ 1359 */
1394 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1360 next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
1395 i = max(i + 1, next); 1361 i = max(i + 1, next);
1396 continue; 1362 continue;
1397 } 1363 }
@@ -1586,7 +1552,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1586 goto out_free; 1552 goto out_free;
1587 } 1553 }
1588 1554
1589 old_size = device->total_bytes; 1555 old_size = btrfs_device_get_total_bytes(device);
1590 1556
1591 if (mod < 0) { 1557 if (mod < 0) {
1592 if (new_size > old_size) { 1558 if (new_size > old_size) {
@@ -1735,7 +1701,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1735 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | 1701 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
1736 BTRFS_SUBVOL_QGROUP_INHERIT)) { 1702 BTRFS_SUBVOL_QGROUP_INHERIT)) {
1737 ret = -EOPNOTSUPP; 1703 ret = -EOPNOTSUPP;
1738 goto out; 1704 goto free_args;
1739 } 1705 }
1740 1706
1741 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) 1707 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
@@ -1745,27 +1711,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1745 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { 1711 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1746 if (vol_args->size > PAGE_CACHE_SIZE) { 1712 if (vol_args->size > PAGE_CACHE_SIZE) {
1747 ret = -EINVAL; 1713 ret = -EINVAL;
1748 goto out; 1714 goto free_args;
1749 } 1715 }
1750 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); 1716 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1751 if (IS_ERR(inherit)) { 1717 if (IS_ERR(inherit)) {
1752 ret = PTR_ERR(inherit); 1718 ret = PTR_ERR(inherit);
1753 goto out; 1719 goto free_args;
1754 } 1720 }
1755 } 1721 }
1756 1722
1757 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1723 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1758 vol_args->fd, subvol, ptr, 1724 vol_args->fd, subvol, ptr,
1759 readonly, inherit); 1725 readonly, inherit);
1726 if (ret)
1727 goto free_inherit;
1760 1728
1761 if (ret == 0 && ptr && 1729 if (ptr && copy_to_user(arg +
1762 copy_to_user(arg + 1730 offsetof(struct btrfs_ioctl_vol_args_v2,
1763 offsetof(struct btrfs_ioctl_vol_args_v2, 1731 transid),
1764 transid), ptr, sizeof(*ptr))) 1732 ptr, sizeof(*ptr)))
1765 ret = -EFAULT; 1733 ret = -EFAULT;
1766out: 1734
1767 kfree(vol_args); 1735free_inherit:
1768 kfree(inherit); 1736 kfree(inherit);
1737free_args:
1738 kfree(vol_args);
1769 return ret; 1739 return ret;
1770} 1740}
1771 1741
@@ -2117,8 +2087,6 @@ static noinline int search_ioctl(struct inode *inode,
2117 key.type = sk->min_type; 2087 key.type = sk->min_type;
2118 key.offset = sk->min_offset; 2088 key.offset = sk->min_offset;
2119 2089
2120 path->keep_locks = 1;
2121
2122 while (1) { 2090 while (1) {
2123 ret = btrfs_search_forward(root, &key, path, sk->min_transid); 2091 ret = btrfs_search_forward(root, &key, path, sk->min_transid);
2124 if (ret != 0) { 2092 if (ret != 0) {
@@ -2554,9 +2522,9 @@ out_unlock:
2554 ASSERT(dest->send_in_progress == 0); 2522 ASSERT(dest->send_in_progress == 0);
2555 2523
2556 /* the last ref */ 2524 /* the last ref */
2557 if (dest->cache_inode) { 2525 if (dest->ino_cache_inode) {
2558 iput(dest->cache_inode); 2526 iput(dest->ino_cache_inode);
2559 dest->cache_inode = NULL; 2527 dest->ino_cache_inode = NULL;
2560 } 2528 }
2561 } 2529 }
2562out_dput: 2530out_dput:
@@ -2662,6 +2630,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2662 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2630 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2663 ret = btrfs_init_new_device(root, vol_args->name); 2631 ret = btrfs_init_new_device(root, vol_args->name);
2664 2632
2633 if (!ret)
2634 btrfs_info(root->fs_info, "disk added %s",vol_args->name);
2635
2665 kfree(vol_args); 2636 kfree(vol_args);
2666out: 2637out:
2667 mutex_unlock(&root->fs_info->volume_mutex); 2638 mutex_unlock(&root->fs_info->volume_mutex);
@@ -2685,7 +2656,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2685 vol_args = memdup_user(arg, sizeof(*vol_args)); 2656 vol_args = memdup_user(arg, sizeof(*vol_args));
2686 if (IS_ERR(vol_args)) { 2657 if (IS_ERR(vol_args)) {
2687 ret = PTR_ERR(vol_args); 2658 ret = PTR_ERR(vol_args);
2688 goto out; 2659 goto err_drop;
2689 } 2660 }
2690 2661
2691 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2662 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
@@ -2701,8 +2672,12 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2701 mutex_unlock(&root->fs_info->volume_mutex); 2672 mutex_unlock(&root->fs_info->volume_mutex);
2702 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2673 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2703 2674
2675 if (!ret)
 2676 btrfs_info(root->fs_info, "disk deleted %s", vol_args->name);
2677
2704out: 2678out:
2705 kfree(vol_args); 2679 kfree(vol_args);
2680err_drop:
2706 mnt_drop_write_file(file); 2681 mnt_drop_write_file(file);
2707 return ret; 2682 return ret;
2708} 2683}
@@ -2764,8 +2739,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2764 } 2739 }
2765 2740
2766 di_args->devid = dev->devid; 2741 di_args->devid = dev->devid;
2767 di_args->bytes_used = dev->bytes_used; 2742 di_args->bytes_used = btrfs_device_get_bytes_used(dev);
2768 di_args->total_bytes = dev->total_bytes; 2743 di_args->total_bytes = btrfs_device_get_total_bytes(dev);
2769 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2744 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2770 if (dev->name) { 2745 if (dev->name) {
2771 struct rcu_string *name; 2746 struct rcu_string *name;
@@ -3191,7 +3166,7 @@ static void clone_update_extent_map(struct inode *inode,
3191 em->start + em->len - 1, 0); 3166 em->start + em->len - 1, 0);
3192 } 3167 }
3193 3168
3194 if (unlikely(ret)) 3169 if (ret)
3195 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3170 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3196 &BTRFS_I(inode)->runtime_flags); 3171 &BTRFS_I(inode)->runtime_flags);
3197} 3172}
@@ -3226,7 +3201,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3226 u64 last_dest_end = destoff; 3201 u64 last_dest_end = destoff;
3227 3202
3228 ret = -ENOMEM; 3203 ret = -ENOMEM;
3229 buf = vmalloc(btrfs_level_size(root, 0)); 3204 buf = vmalloc(root->nodesize);
3230 if (!buf) 3205 if (!buf)
3231 return ret; 3206 return ret;
3232 3207
@@ -3279,11 +3254,11 @@ process_slot:
3279 slot = path->slots[0]; 3254 slot = path->slots[0];
3280 3255
3281 btrfs_item_key_to_cpu(leaf, &key, slot); 3256 btrfs_item_key_to_cpu(leaf, &key, slot);
3282 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || 3257 if (key.type > BTRFS_EXTENT_DATA_KEY ||
3283 key.objectid != btrfs_ino(src)) 3258 key.objectid != btrfs_ino(src))
3284 break; 3259 break;
3285 3260
3286 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { 3261 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3287 struct btrfs_file_extent_item *extent; 3262 struct btrfs_file_extent_item *extent;
3288 int type; 3263 int type;
3289 u32 size; 3264 u32 size;
@@ -3527,7 +3502,8 @@ process_slot:
3527 btrfs_mark_buffer_dirty(leaf); 3502 btrfs_mark_buffer_dirty(leaf);
3528 btrfs_release_path(path); 3503 btrfs_release_path(path);
3529 3504
3530 last_dest_end = new_key.offset + datal; 3505 last_dest_end = ALIGN(new_key.offset + datal,
3506 root->sectorsize);
3531 ret = clone_finish_inode_update(trans, inode, 3507 ret = clone_finish_inode_update(trans, inode,
3532 last_dest_end, 3508 last_dest_end,
3533 destoff, olen); 3509 destoff, olen);
@@ -5309,6 +5285,12 @@ long btrfs_ioctl(struct file *file, unsigned int
5309 if (ret) 5285 if (ret)
5310 return ret; 5286 return ret;
5311 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 5287 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
5288 /*
5289 * The transaction thread may want to do more work,
 5290 * namely it pokes the cleaner kthread that will start
5291 * processing uncleaned subvols.
5292 */
5293 wake_up_process(root->fs_info->transaction_kthread);
5312 return ret; 5294 return ret;
5313 } 5295 }
5314 case BTRFS_IOC_START_SYNC: 5296 case BTRFS_IOC_START_SYNC:
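The BTRFS_IOC_SYNC change wakes the transaction kthread after the sync returns, because committing can leave deleted subvolumes for the cleaner thread and nothing else would prod it promptly. A small pthread sketch of the poke-a-sleeping-worker pattern (all names hypothetical; build with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool work_pending;

/* cleaner-kthread analogue: sleeps until poked, then drains work */
static void *cleaner(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (!work_pending)
        pthread_cond_wait(&cond, &lock);
    work_pending = false;
    pthread_mutex_unlock(&lock);
    printf("cleaner: processing uncleaned subvols\n");
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, cleaner, NULL);

    /* sync path: finish the commit, then wake the worker so queued
     * cleanup starts now instead of at the next periodic wakeup */
    printf("sync: commit done\n");
    pthread_mutex_lock(&lock);
    work_pending = true;
    pthread_cond_signal(&cond);    /* wake_up_process() analogue */
    pthread_mutex_unlock(&lock);

    pthread_join(t, NULL);
    return 0;
}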
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index dfad8514f0da..78285f30909e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
266 char *data_in; 266 char *data_in;
267 unsigned long page_in_index = 0; 267 unsigned long page_in_index = 0;
268 unsigned long page_out_index = 0; 268 unsigned long page_out_index = 0;
269 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 269 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
270 PAGE_CACHE_SIZE;
271 unsigned long buf_start; 270 unsigned long buf_start;
272 unsigned long buf_offset = 0; 271 unsigned long buf_offset = 0;
273 unsigned long bytes; 272 unsigned long bytes;
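Several hunks in this series replace open-coded `(n + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE` (or the equivalent shift) with DIV_ROUND_UP(), which computes the same ceiling division. A quick check that the forms agree (the 4096-byte page size here is illustrative):

#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    const unsigned long page = 4096, shift = 12;

    for (unsigned long n = 0; n < 5 * page; n += 511) {
        unsigned long a = DIV_ROUND_UP(n, page);
        unsigned long b = (n + page - 1) >> shift; /* old open-coded form */
        assert(a == b);
    }
    printf("DIV_ROUND_UP(%lu, %lu) = %lu\n", 3 * page + 1, page,
           DIV_ROUND_UP(3 * page + 1, page));      /* prints 4 */
    return 0;
}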
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 963895c1f801..ac734ec4cc20 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -615,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
615 spin_unlock(&root->ordered_extent_lock); 615 spin_unlock(&root->ordered_extent_lock);
616 616
617 btrfs_init_work(&ordered->flush_work, 617 btrfs_init_work(&ordered->flush_work,
618 btrfs_flush_delalloc_helper,
618 btrfs_run_ordered_extent_work, NULL, NULL); 619 btrfs_run_ordered_extent_work, NULL, NULL);
619 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
620 btrfs_queue_work(root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 65793edb38ca..47767d5b8f0b 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
27 int ret = 0; 27 int ret = 0;
28 28
29 key.objectid = BTRFS_ORPHAN_OBJECTID; 29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 30 key.type = BTRFS_ORPHAN_ITEM_KEY;
31 key.offset = offset; 31 key.offset = offset;
32 32
33 path = btrfs_alloc_path(); 33 path = btrfs_alloc_path();
@@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
48 int ret = 0; 48 int ret = 0;
49 49
50 key.objectid = BTRFS_ORPHAN_OBJECTID; 50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 51 key.type = BTRFS_ORPHAN_ITEM_KEY;
52 key.offset = offset; 52 key.offset = offset;
53 53
54 path = btrfs_alloc_path(); 54 path = btrfs_alloc_path();
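The btrfs_set_key_type()/btrfs_key_type() wrappers are dropped in favour of plain field access throughout: struct btrfs_key is the CPU-order form of the key, so the setter added nothing over `key.type = ...`. In miniature (the struct layout is illustrative, not the on-disk format):

#include <stdio.h>

/* CPU-order key: plain integers, safe to read and assign directly. */
struct cpu_key {
    unsigned long long objectid;
    unsigned char type;
    unsigned long long offset;
};

#define ORPHAN_ITEM_KEY 48  /* assumed to match BTRFS_ORPHAN_ITEM_KEY */

int main(void)
{
    struct cpu_key key;

    key.objectid = (unsigned long long)-5; /* BTRFS_ORPHAN_OBJECTID */
    key.type = ORPHAN_ITEM_KEY;            /* was btrfs_set_key_type() */
    key.offset = 257;

    printf("key (%llu %u %llu)\n", key.objectid,
           (unsigned)key.type, key.offset);
    return 0;
}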
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9626b4ad3b9a..647ab12fdf5d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
195 for (i = 0 ; i < nr ; i++) { 195 for (i = 0 ; i < nr ; i++) {
196 item = btrfs_item_nr(i); 196 item = btrfs_item_nr(i);
197 btrfs_item_key_to_cpu(l, &key, i); 197 btrfs_item_key_to_cpu(l, &key, i);
198 type = btrfs_key_type(&key); 198 type = key.type;
199 printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " 199 printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
200 "itemsize %d\n", 200 "itemsize %d\n",
201 i, key.objectid, type, key.offset, 201 i, key.objectid, type, key.offset,
@@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
336 for (i = 0; i < nr; i++) { 336 for (i = 0; i < nr; i++) {
337 struct extent_buffer *next = read_tree_block(root, 337 struct extent_buffer *next = read_tree_block(root,
338 btrfs_node_blockptr(c, i), 338 btrfs_node_blockptr(c, i),
339 btrfs_level_size(root, level - 1),
340 btrfs_node_ptr_generation(c, i)); 339 btrfs_node_ptr_generation(c, i));
341 if (btrfs_is_leaf(next) && 340 if (btrfs_is_leaf(next) &&
342 level != 1) 341 level != 1)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b497498484be..48b60dbf807f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
539 struct extent_buffer *leaf; 539 struct extent_buffer *leaf;
540 struct btrfs_key key; 540 struct btrfs_key key;
541 541
542#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 542 if (btrfs_test_is_dummy_root(quota_root))
543 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
544 return 0; 543 return 0;
545#endif 544
546 path = btrfs_alloc_path(); 545 path = btrfs_alloc_path();
547 if (!path) 546 if (!path)
548 return -ENOMEM; 547 return -ENOMEM;
@@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
551 key.type = BTRFS_QGROUP_INFO_KEY; 550 key.type = BTRFS_QGROUP_INFO_KEY;
552 key.offset = qgroupid; 551 key.offset = qgroupid;
553 552
553 /*
554 * Avoid a transaction abort by catching -EEXIST here. In that
555 * case, we proceed by re-initializing the existing structure
556 * on disk.
557 */
558
554 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 559 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
555 sizeof(*qgroup_info)); 560 sizeof(*qgroup_info));
556 if (ret) 561 if (ret && ret != -EEXIST)
557 goto out; 562 goto out;
558 563
559 leaf = path->nodes[0]; 564 leaf = path->nodes[0];
@@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
572 key.type = BTRFS_QGROUP_LIMIT_KEY; 577 key.type = BTRFS_QGROUP_LIMIT_KEY;
573 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 578 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
574 sizeof(*qgroup_limit)); 579 sizeof(*qgroup_limit));
575 if (ret) 580 if (ret && ret != -EEXIST)
576 goto out; 581 goto out;
577 582
578 leaf = path->nodes[0]; 583 leaf = path->nodes[0];
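add_qgroup_item() now treats -EEXIST from btrfs_insert_empty_item() as acceptable: instead of aborting the transaction over a leftover item, it rewrites the existing one in place. The pattern in miniature (storage is a toy array; names hypothetical):

#include <errno.h>
#include <stdio.h>

#define NKEYS 8
static int present[NKEYS];
static int value[NKEYS];

static int insert_item(int key)
{
    if (present[key])
        return -EEXIST;
    present[key] = 1;
    return 0;
}

/* Insert-or-reinitialize: -EEXIST is not fatal, any other error is. */
static int add_item(int key, int val)
{
    int ret = insert_item(key);

    if (ret && ret != -EEXIST)
        return ret;
    value[key] = val;   /* (re)write the payload either way */
    return 0;
}

int main(void)
{
    add_item(3, 10);
    int ret = add_item(3, 42);  /* stale item: re-initialized, not fatal */

    printf("ret=%d value=%d\n", ret, value[3]); /* ret=0 value=42 */
    return 0;
}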
@@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
692 int ret; 697 int ret;
693 int slot; 698 int slot;
694 699
695#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 700 if (btrfs_test_is_dummy_root(root))
696 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
697 return 0; 701 return 0;
698#endif 702
699 key.objectid = 0; 703 key.objectid = 0;
700 key.type = BTRFS_QGROUP_INFO_KEY; 704 key.type = BTRFS_QGROUP_INFO_KEY;
701 key.offset = qgroup->qgroupid; 705 key.offset = qgroup->qgroupid;
@@ -1335,6 +1339,8 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1335 INIT_LIST_HEAD(&oper->elem.list); 1339 INIT_LIST_HEAD(&oper->elem.list);
1336 oper->elem.seq = 0; 1340 oper->elem.seq = 0;
1337 1341
1342 trace_btrfs_qgroup_record_ref(oper);
1343
1338 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { 1344 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1339 /* 1345 /*
1340 * If any operation for this bytenr/ref_root combo 1346 * If any operation for this bytenr/ref_root combo
@@ -1973,7 +1979,7 @@ static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
1973 elem.seq, &roots); 1979 elem.seq, &roots);
1974 btrfs_put_tree_mod_seq(fs_info, &elem); 1980 btrfs_put_tree_mod_seq(fs_info, &elem);
1975 if (ret < 0) 1981 if (ret < 0)
1976 return ret; 1982 goto out;
1977 1983
1978 if (roots->nnodes != 1) 1984 if (roots->nnodes != 1)
1979 goto out; 1985 goto out;
@@ -2077,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
2077 2083
2078 ASSERT(is_fstree(oper->ref_root)); 2084 ASSERT(is_fstree(oper->ref_root));
2079 2085
2086 trace_btrfs_qgroup_account(oper);
2087
2080 switch (oper->type) { 2088 switch (oper->type) {
2081 case BTRFS_QGROUP_OPER_ADD_EXCL: 2089 case BTRFS_QGROUP_OPER_ADD_EXCL:
2082 case BTRFS_QGROUP_OPER_SUB_EXCL: 2090 case BTRFS_QGROUP_OPER_SUB_EXCL:
@@ -2237,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2237 if (srcid) { 2245 if (srcid) {
2238 struct btrfs_root *srcroot; 2246 struct btrfs_root *srcroot;
2239 struct btrfs_key srckey; 2247 struct btrfs_key srckey;
2240 int srcroot_level;
2241 2248
2242 srckey.objectid = srcid; 2249 srckey.objectid = srcid;
2243 srckey.type = BTRFS_ROOT_ITEM_KEY; 2250 srckey.type = BTRFS_ROOT_ITEM_KEY;
@@ -2249,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2249 } 2256 }
2250 2257
2251 rcu_read_lock(); 2258 rcu_read_lock();
2252 srcroot_level = btrfs_header_level(srcroot->node); 2259 level_size = srcroot->nodesize;
2253 level_size = btrfs_level_size(srcroot, srcroot_level);
2254 rcu_read_unlock(); 2260 rcu_read_unlock();
2255 } 2261 }
2256 2262
@@ -2566,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2566 found.type != BTRFS_METADATA_ITEM_KEY) 2572 found.type != BTRFS_METADATA_ITEM_KEY)
2567 continue; 2573 continue;
2568 if (found.type == BTRFS_METADATA_ITEM_KEY) 2574 if (found.type == BTRFS_METADATA_ITEM_KEY)
2569 num_bytes = fs_info->extent_root->leafsize; 2575 num_bytes = fs_info->extent_root->nodesize;
2570 else 2576 else
2571 num_bytes = found.offset; 2577 num_bytes = found.offset;
2572 2578
@@ -2720,6 +2726,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2720 memset(&fs_info->qgroup_rescan_work, 0, 2726 memset(&fs_info->qgroup_rescan_work, 0,
2721 sizeof(fs_info->qgroup_rescan_work)); 2727 sizeof(fs_info->qgroup_rescan_work));
2722 btrfs_init_work(&fs_info->qgroup_rescan_work, 2728 btrfs_init_work(&fs_info->qgroup_rescan_work,
2729 btrfs_qgroup_rescan_helper,
2723 btrfs_qgroup_rescan_worker, NULL, NULL); 2730 btrfs_qgroup_rescan_worker, NULL, NULL);
2724 2731
2725 if (ret) { 2732 if (ret) {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4a88f073fdd7..6a41631cb959 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
913{ 913{
914 unsigned long nr = stripe_len * nr_stripes; 914 unsigned long nr = stripe_len * nr_stripes;
915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 915 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
916} 916}
917 917
918/* 918/*
@@ -1416,7 +1416,8 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); 1419 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1420 rmw_work, NULL, NULL);
1420 1421
1421 btrfs_queue_work(rbio->fs_info->rmw_workers, 1422 btrfs_queue_work(rbio->fs_info->rmw_workers,
1422 &rbio->work); 1423 &rbio->work);
@@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1424 1425
1425static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1426static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1426{ 1427{
1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); 1428 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1429 read_rebuild_work, NULL, NULL);
1428 1430
1429 btrfs_queue_work(rbio->fs_info->rmw_workers, 1431 btrfs_queue_work(rbio->fs_info->rmw_workers,
1430 &rbio->work); 1432 &rbio->work);
@@ -1440,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440 struct btrfs_bio *bbio = rbio->bbio; 1442 struct btrfs_bio *bbio = rbio->bbio;
1441 struct bio_list bio_list; 1443 struct bio_list bio_list;
1442 int ret; 1444 int ret;
1443 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1444 int pagenr; 1446 int pagenr;
1445 int stripe; 1447 int stripe;
1446 struct bio *bio; 1448 struct bio *bio;
@@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1665 plug = container_of(cb, struct btrfs_plug_cb, cb); 1667 plug = container_of(cb, struct btrfs_plug_cb, cb);
1666 1668
1667 if (from_schedule) { 1669 if (from_schedule) {
1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1670 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1671 unplug_work, NULL, NULL);
1669 btrfs_queue_work(plug->info->rmw_workers, 1672 btrfs_queue_work(plug->info->rmw_workers,
1670 &plug->work); 1673 &plug->work);
1671 return; 1674 return;
@@ -1722,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1722 int pagenr, stripe; 1725 int pagenr, stripe;
1723 void **pointers; 1726 void **pointers;
1724 int faila = -1, failb = -1; 1727 int faila = -1, failb = -1;
1725 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1728 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1726 struct page *page; 1729 struct page *page;
1727 int err; 1730 int err;
1728 int i; 1731 int i;
@@ -1937,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1937 struct btrfs_bio *bbio = rbio->bbio; 1940 struct btrfs_bio *bbio = rbio->bbio;
1938 struct bio_list bio_list; 1941 struct bio_list bio_list;
1939 int ret; 1942 int ret;
1940 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1941 int pagenr; 1944 int pagenr;
1942 int stripe; 1945 int stripe;
1943 struct bio *bio; 1946 struct bio *bio;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 09230cf3a244..b63ae20618fb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
347 if (!re) 347 if (!re)
348 return NULL; 348 return NULL;
349 349
350 blocksize = btrfs_level_size(root, level); 350 blocksize = root->nodesize;
351 re->logical = logical; 351 re->logical = logical;
352 re->blocksize = blocksize; 352 re->blocksize = blocksize;
353 re->top = *top; 353 re->top = *top;
@@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
798 /* FIXME we cannot handle this properly right now */ 798 /* FIXME we cannot handle this properly right now */
799 BUG(); 799 BUG();
800 } 800 }
801 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); 801 btrfs_init_work(&rmw->work, btrfs_readahead_helper,
802 reada_start_machine_worker, NULL, NULL);
802 rmw->fs_info = fs_info; 803 rmw->fs_info = fs_info;
803 804
804 btrfs_queue_work(fs_info->readahead_workers, &rmw->work); 805 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65245a07275b..74257d6436ad 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
736 err = ret; 736 err = ret;
737 goto out; 737 goto out;
738 } 738 }
739 BUG_ON(!ret || !path1->slots[0]); 739 ASSERT(ret);
740 ASSERT(path1->slots[0]);
740 741
741 path1->slots[0]--; 742 path1->slots[0]--;
742 743
@@ -746,10 +747,10 @@ again:
746 * the backref was added previously when processing 747 * the backref was added previously when processing
747 * backref of type BTRFS_TREE_BLOCK_REF_KEY 748 * backref of type BTRFS_TREE_BLOCK_REF_KEY
748 */ 749 */
749 BUG_ON(!list_is_singular(&cur->upper)); 750 ASSERT(list_is_singular(&cur->upper));
750 edge = list_entry(cur->upper.next, struct backref_edge, 751 edge = list_entry(cur->upper.next, struct backref_edge,
751 list[LOWER]); 752 list[LOWER]);
752 BUG_ON(!list_empty(&edge->list[UPPER])); 753 ASSERT(list_empty(&edge->list[UPPER]));
753 exist = edge->node[UPPER]; 754 exist = edge->node[UPPER];
754 /* 755 /*
755 * add the upper level block to pending list if we need 756 * add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
831 cur->cowonly = 1; 832 cur->cowonly = 1;
832 } 833 }
833#else 834#else
834 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 835 ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
835 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { 836 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
836#endif 837#endif
837 if (key.objectid == key.offset) { 838 if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
840 * backref of this type. 841 * backref of this type.
841 */ 842 */
842 root = find_reloc_root(rc, cur->bytenr); 843 root = find_reloc_root(rc, cur->bytenr);
843 BUG_ON(!root); 844 ASSERT(root);
844 cur->root = root; 845 cur->root = root;
845 break; 846 break;
846 } 847 }
@@ -868,7 +869,7 @@ again:
868 } else { 869 } else {
869 upper = rb_entry(rb_node, struct backref_node, 870 upper = rb_entry(rb_node, struct backref_node,
870 rb_node); 871 rb_node);
871 BUG_ON(!upper->checked); 872 ASSERT(upper->checked);
872 INIT_LIST_HEAD(&edge->list[UPPER]); 873 INIT_LIST_HEAD(&edge->list[UPPER]);
873 } 874 }
874 list_add_tail(&edge->list[LOWER], &cur->upper); 875 list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
892 893
893 if (btrfs_root_level(&root->root_item) == cur->level) { 894 if (btrfs_root_level(&root->root_item) == cur->level) {
894 /* tree root */ 895 /* tree root */
895 BUG_ON(btrfs_root_bytenr(&root->root_item) != 896 ASSERT(btrfs_root_bytenr(&root->root_item) ==
896 cur->bytenr); 897 cur->bytenr);
897 if (should_ignore_root(root)) 898 if (should_ignore_root(root))
898 list_add(&cur->list, &useless); 899 list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
927 need_check = true; 928 need_check = true;
928 for (; level < BTRFS_MAX_LEVEL; level++) { 929 for (; level < BTRFS_MAX_LEVEL; level++) {
929 if (!path2->nodes[level]) { 930 if (!path2->nodes[level]) {
930 BUG_ON(btrfs_root_bytenr(&root->root_item) != 931 ASSERT(btrfs_root_bytenr(&root->root_item) ==
931 lower->bytenr); 932 lower->bytenr);
932 if (should_ignore_root(root)) 933 if (should_ignore_root(root))
933 list_add(&lower->list, &useless); 934 list_add(&lower->list, &useless);
@@ -977,12 +978,15 @@ again:
977 need_check = false; 978 need_check = false;
978 list_add_tail(&edge->list[UPPER], 979 list_add_tail(&edge->list[UPPER],
979 &list); 980 &list);
980 } else 981 } else {
982 if (upper->checked)
983 need_check = true;
981 INIT_LIST_HEAD(&edge->list[UPPER]); 984 INIT_LIST_HEAD(&edge->list[UPPER]);
985 }
982 } else { 986 } else {
983 upper = rb_entry(rb_node, struct backref_node, 987 upper = rb_entry(rb_node, struct backref_node,
984 rb_node); 988 rb_node);
985 BUG_ON(!upper->checked); 989 ASSERT(upper->checked);
986 INIT_LIST_HEAD(&edge->list[UPPER]); 990 INIT_LIST_HEAD(&edge->list[UPPER]);
987 if (!upper->owner) 991 if (!upper->owner)
988 upper->owner = btrfs_header_owner(eb); 992 upper->owner = btrfs_header_owner(eb);
@@ -1026,7 +1030,7 @@ next:
1026 * everything goes well, connect backref nodes and insert backref nodes 1030 * everything goes well, connect backref nodes and insert backref nodes
1027 * into the cache. 1031 * into the cache.
1028 */ 1032 */
1029 BUG_ON(!node->checked); 1033 ASSERT(node->checked);
1030 cowonly = node->cowonly; 1034 cowonly = node->cowonly;
1031 if (!cowonly) { 1035 if (!cowonly) {
1032 rb_node = tree_insert(&cache->rb_root, node->bytenr, 1036 rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1062,8 +1066,21 @@ next:
1062 continue; 1066 continue;
1063 } 1067 }
1064 1068
1065 BUG_ON(!upper->checked); 1069 if (!upper->checked) {
1066 BUG_ON(cowonly != upper->cowonly); 1070 /*
1071 * Still want to blow up for developers since this is a
1072 * logic bug.
1073 */
1074 ASSERT(0);
1075 err = -EINVAL;
1076 goto out;
1077 }
1078 if (cowonly != upper->cowonly) {
1079 ASSERT(0);
1080 err = -EINVAL;
1081 goto out;
1082 }
1083
1067 if (!cowonly) { 1084 if (!cowonly) {
1068 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1085 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1069 &upper->rb_node); 1086 &upper->rb_node);
@@ -1086,7 +1103,7 @@ next:
1086 while (!list_empty(&useless)) { 1103 while (!list_empty(&useless)) {
1087 upper = list_entry(useless.next, struct backref_node, list); 1104 upper = list_entry(useless.next, struct backref_node, list);
1088 list_del_init(&upper->list); 1105 list_del_init(&upper->list);
1089 BUG_ON(!list_empty(&upper->upper)); 1106 ASSERT(list_empty(&upper->upper));
1090 if (upper == node) 1107 if (upper == node)
1091 node = NULL; 1108 node = NULL;
1092 if (upper->lowest) { 1109 if (upper->lowest) {
@@ -1119,29 +1136,45 @@ out:
1119 if (err) { 1136 if (err) {
1120 while (!list_empty(&useless)) { 1137 while (!list_empty(&useless)) {
1121 lower = list_entry(useless.next, 1138 lower = list_entry(useless.next,
1122 struct backref_node, upper); 1139 struct backref_node, list);
1123 list_del_init(&lower->upper); 1140 list_del_init(&lower->list);
1124 } 1141 }
1125 upper = node; 1142 while (!list_empty(&list)) {
1126 INIT_LIST_HEAD(&list); 1143 edge = list_first_entry(&list, struct backref_edge,
1127 while (upper) { 1144 list[UPPER]);
1128 if (RB_EMPTY_NODE(&upper->rb_node)) { 1145 list_del(&edge->list[UPPER]);
1129 list_splice_tail(&upper->upper, &list);
1130 free_backref_node(cache, upper);
1131 }
1132
1133 if (list_empty(&list))
1134 break;
1135
1136 edge = list_entry(list.next, struct backref_edge,
1137 list[LOWER]);
1138 list_del(&edge->list[LOWER]); 1146 list_del(&edge->list[LOWER]);
1147 lower = edge->node[LOWER];
1139 upper = edge->node[UPPER]; 1148 upper = edge->node[UPPER];
1140 free_backref_edge(cache, edge); 1149 free_backref_edge(cache, edge);
1150
1151 /*
1152 * Lower is no longer linked to any upper backref nodes
1153 * and isn't in the cache, we can free it ourselves.
1154 */
1155 if (list_empty(&lower->upper) &&
1156 RB_EMPTY_NODE(&lower->rb_node))
1157 list_add(&lower->list, &useless);
1158
1159 if (!RB_EMPTY_NODE(&upper->rb_node))
1160 continue;
1161
 1162 /* Add this guy's upper edges to the list to process */
1163 list_for_each_entry(edge, &upper->upper, list[LOWER])
1164 list_add_tail(&edge->list[UPPER], &list);
1165 if (list_empty(&upper->upper))
1166 list_add(&upper->list, &useless);
1167 }
1168
1169 while (!list_empty(&useless)) {
1170 lower = list_entry(useless.next,
1171 struct backref_node, list);
1172 list_del_init(&lower->list);
1173 free_backref_node(cache, lower);
1141 } 1174 }
1142 return ERR_PTR(err); 1175 return ERR_PTR(err);
1143 } 1176 }
1144 BUG_ON(node && node->detached); 1177 ASSERT(!node || !node->detached);
1145 return node; 1178 return node;
1146} 1179}
1147 1180
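The relocation changes convert BUG_ON() calls into ASSERT() plus a real error return: debug builds still stop on the logic bug, while production builds unwind with -EINVAL instead of crashing the box. The shape of that conversion (ASSERT here is modeled on the btrfs macro, compiled out unless DEBUG is defined):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#ifdef DEBUG
#define ASSERT(expr) \
    do { if (!(expr)) { fprintf(stderr, "assertion failed: %s\n", \
         #expr); abort(); } } while (0)
#else
#define ASSERT(expr) ((void)0)
#endif

static int build_backref_tree(int checked)
{
    /* old style: BUG_ON(!checked), an unconditional crash */
    if (!checked) {
        ASSERT(0);        /* still blow up for developers */
        return -EINVAL;   /* but fail gracefully in production */
    }
    return 0;
}

int main(void)
{
    printf("ok=%d bad=%d\n", build_backref_tree(1),
           build_backref_tree(0)); /* ok=0 bad=-22 without DEBUG */
    return 0;
}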
@@ -1787,7 +1820,7 @@ again:
1787 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1820 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1788 1821
1789 old_bytenr = btrfs_node_blockptr(parent, slot); 1822 old_bytenr = btrfs_node_blockptr(parent, slot);
1790 blocksize = btrfs_level_size(dest, level - 1); 1823 blocksize = dest->nodesize;
1791 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1824 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1792 1825
1793 if (level <= max_level) { 1826 if (level <= max_level) {
@@ -1813,8 +1846,7 @@ again:
1813 break; 1846 break;
1814 } 1847 }
1815 1848
1816 eb = read_tree_block(dest, old_bytenr, blocksize, 1849 eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
1817 old_ptr_gen);
1818 if (!eb || !extent_buffer_uptodate(eb)) { 1850 if (!eb || !extent_buffer_uptodate(eb)) {
1819 ret = (!eb) ? -ENOMEM : -EIO; 1851 ret = (!eb) ? -ENOMEM : -EIO;
1820 free_extent_buffer(eb); 1852 free_extent_buffer(eb);
@@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1944 u64 bytenr; 1976 u64 bytenr;
1945 u64 ptr_gen = 0; 1977 u64 ptr_gen = 0;
1946 u64 last_snapshot; 1978 u64 last_snapshot;
1947 u32 blocksize;
1948 u32 nritems; 1979 u32 nritems;
1949 1980
1950 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1981 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
@@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1970 } 2001 }
1971 2002
1972 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2003 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1973 blocksize = btrfs_level_size(root, i - 1); 2004 eb = read_tree_block(root, bytenr, ptr_gen);
1974 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1975 if (!eb || !extent_buffer_uptodate(eb)) { 2005 if (!eb || !extent_buffer_uptodate(eb)) {
1976 free_extent_buffer(eb); 2006 free_extent_buffer(eb);
1977 return -EIO; 2007 return -EIO;
@@ -2316,7 +2346,7 @@ void free_reloc_roots(struct list_head *list)
2316} 2346}
2317 2347
2318static noinline_for_stack 2348static noinline_for_stack
2319int merge_reloc_roots(struct reloc_control *rc) 2349void merge_reloc_roots(struct reloc_control *rc)
2320{ 2350{
2321 struct btrfs_root *root; 2351 struct btrfs_root *root;
2322 struct btrfs_root *reloc_root; 2352 struct btrfs_root *reloc_root;
@@ -2397,7 +2427,6 @@ out:
2397 } 2427 }
2398 2428
2399 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2429 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2400 return ret;
2401} 2430}
2402 2431
2403static void free_block_list(struct rb_root *blocks) 2432static void free_block_list(struct rb_root *blocks)
@@ -2544,8 +2573,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
2544 if (next->processed && (reserve || next != node)) 2573 if (next->processed && (reserve || next != node))
2545 break; 2574 break;
2546 2575
2547 num_bytes += btrfs_level_size(rc->extent_root, 2576 num_bytes += rc->extent_root->nodesize;
2548 next->level);
2549 2577
2550 if (list_empty(&next->upper)) 2578 if (list_empty(&next->upper))
2551 break; 2579 break;
@@ -2679,9 +2707,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2679 goto next; 2707 goto next;
2680 } 2708 }
2681 2709
2682 blocksize = btrfs_level_size(root, node->level); 2710 blocksize = root->nodesize;
2683 generation = btrfs_node_ptr_generation(upper->eb, slot); 2711 generation = btrfs_node_ptr_generation(upper->eb, slot);
2684 eb = read_tree_block(root, bytenr, blocksize, generation); 2712 eb = read_tree_block(root, bytenr, generation);
2685 if (!eb || !extent_buffer_uptodate(eb)) { 2713 if (!eb || !extent_buffer_uptodate(eb)) {
2686 free_extent_buffer(eb); 2714 free_extent_buffer(eb);
2687 err = -EIO; 2715 err = -EIO;
@@ -2789,7 +2817,7 @@ static void __mark_block_processed(struct reloc_control *rc,
2789 u32 blocksize; 2817 u32 blocksize;
2790 if (node->level == 0 || 2818 if (node->level == 0 ||
2791 in_block_group(node->bytenr, rc->block_group)) { 2819 in_block_group(node->bytenr, rc->block_group)) {
2792 blocksize = btrfs_level_size(rc->extent_root, node->level); 2820 blocksize = rc->extent_root->nodesize;
2793 mark_block_processed(rc, node->bytenr, blocksize); 2821 mark_block_processed(rc, node->bytenr, blocksize);
2794 } 2822 }
2795 node->processed = 1; 2823 node->processed = 1;
@@ -2843,7 +2871,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2843 2871
2844 BUG_ON(block->key_ready); 2872 BUG_ON(block->key_ready);
2845 eb = read_tree_block(rc->extent_root, block->bytenr, 2873 eb = read_tree_block(rc->extent_root, block->bytenr,
2846 block->key.objectid, block->key.offset); 2874 block->key.offset);
2847 if (!eb || !extent_buffer_uptodate(eb)) { 2875 if (!eb || !extent_buffer_uptodate(eb)) {
2848 free_extent_buffer(eb); 2876 free_extent_buffer(eb);
2849 return -EIO; 2877 return -EIO;
@@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc,
2858 return 0; 2886 return 0;
2859} 2887}
2860 2888
2861static int reada_tree_block(struct reloc_control *rc,
2862 struct tree_block *block)
2863{
2864 BUG_ON(block->key_ready);
2865 if (block->key.type == BTRFS_METADATA_ITEM_KEY)
2866 readahead_tree_block(rc->extent_root, block->bytenr,
2867 block->key.objectid,
2868 rc->extent_root->leafsize);
2869 else
2870 readahead_tree_block(rc->extent_root, block->bytenr,
2871 block->key.objectid, block->key.offset);
2872 return 0;
2873}
2874
2875/* 2889/*
2876 * helper function to relocate a tree block 2890 * helper function to relocate a tree block
2877 */ 2891 */
@@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2951 while (rb_node) { 2965 while (rb_node) {
2952 block = rb_entry(rb_node, struct tree_block, rb_node); 2966 block = rb_entry(rb_node, struct tree_block, rb_node);
2953 if (!block->key_ready) 2967 if (!block->key_ready)
2954 reada_tree_block(rc, block); 2968 readahead_tree_block(rc->extent_root, block->bytenr,
2969 block->key.objectid);
2955 rb_node = rb_next(rb_node); 2970 rb_node = rb_next(rb_node);
2956 } 2971 }
2957 2972
@@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc,
3313 return -ENOMEM; 3328 return -ENOMEM;
3314 3329
3315 block->bytenr = extent_key->objectid; 3330 block->bytenr = extent_key->objectid;
3316 block->key.objectid = rc->extent_root->leafsize; 3331 block->key.objectid = rc->extent_root->nodesize;
3317 block->key.offset = generation; 3332 block->key.offset = generation;
3318 block->level = level; 3333 block->level = level;
3319 block->key_ready = 0; 3334 block->key_ready = 0;
@@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc,
3640 struct btrfs_extent_inline_ref *iref; 3655 struct btrfs_extent_inline_ref *iref;
3641 unsigned long ptr; 3656 unsigned long ptr;
3642 unsigned long end; 3657 unsigned long end;
3643 u32 blocksize = btrfs_level_size(rc->extent_root, 0); 3658 u32 blocksize = rc->extent_root->nodesize;
3644 int ret = 0; 3659 int ret = 0;
3645 int err = 0; 3660 int err = 0;
3646 3661
@@ -3783,7 +3798,7 @@ next:
3783 } 3798 }
3784 3799
3785 if (key.type == BTRFS_METADATA_ITEM_KEY && 3800 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3786 key.objectid + rc->extent_root->leafsize <= 3801 key.objectid + rc->extent_root->nodesize <=
3787 rc->search_start) { 3802 rc->search_start) {
3788 path->slots[0]++; 3803 path->slots[0]++;
3789 goto next; 3804 goto next;
@@ -3801,7 +3816,7 @@ next:
3801 rc->search_start = key.objectid + key.offset; 3816 rc->search_start = key.objectid + key.offset;
3802 else 3817 else
3803 rc->search_start = key.objectid + 3818 rc->search_start = key.objectid +
3804 rc->extent_root->leafsize; 3819 rc->extent_root->nodesize;
3805 memcpy(extent_key, &key, sizeof(key)); 3820 memcpy(extent_key, &key, sizeof(key));
3806 return 0; 3821 return 0;
3807 } 3822 }
@@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4096 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4111 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4097 BTRFS_INODE_PREALLOC); 4112 BTRFS_INODE_PREALLOC);
4098 btrfs_mark_buffer_dirty(leaf); 4113 btrfs_mark_buffer_dirty(leaf);
4099 btrfs_release_path(path);
4100out: 4114out:
4101 btrfs_free_path(path); 4115 btrfs_free_path(path);
4102 return ret; 4116 return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b6d198f5181e..efa083113827 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -137,7 +137,6 @@ struct scrub_ctx {
137 int pages_per_rd_bio; 137 int pages_per_rd_bio;
138 u32 sectorsize; 138 u32 sectorsize;
139 u32 nodesize; 139 u32 nodesize;
140 u32 leafsize;
141 140
142 int is_dev_replace; 141 int is_dev_replace;
143 struct scrub_wr_ctx wr_ctx; 142 struct scrub_wr_ctx wr_ctx;
@@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx {
178struct scrub_warning { 177struct scrub_warning {
179 struct btrfs_path *path; 178 struct btrfs_path *path;
180 u64 extent_item_size; 179 u64 extent_item_size;
181 char *scratch_buf;
182 char *msg_buf;
183 const char *errstr; 180 const char *errstr;
184 sector_t sector; 181 sector_t sector;
185 u64 logical; 182 u64 logical;
186 struct btrfs_device *dev; 183 struct btrfs_device *dev;
187 int msg_bufsize;
188 int scratch_bufsize;
189}; 184};
190 185
191
192static void scrub_pending_bio_inc(struct scrub_ctx *sctx); 186static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
193static void scrub_pending_bio_dec(struct scrub_ctx *sctx); 187static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
194static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 188static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -428,8 +422,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
428 sbio->index = i; 422 sbio->index = i;
429 sbio->sctx = sctx; 423 sbio->sctx = sctx;
430 sbio->page_count = 0; 424 sbio->page_count = 0;
431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, 425 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
432 NULL, NULL); 426 scrub_bio_end_io_worker, NULL, NULL);
433 427
434 if (i != SCRUB_BIOS_PER_SCTX - 1) 428 if (i != SCRUB_BIOS_PER_SCTX - 1)
435 sctx->bios[i]->next_free = i + 1; 429 sctx->bios[i]->next_free = i + 1;
@@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
438 } 432 }
439 sctx->first_free = 0; 433 sctx->first_free = 0;
440 sctx->nodesize = dev->dev_root->nodesize; 434 sctx->nodesize = dev->dev_root->nodesize;
441 sctx->leafsize = dev->dev_root->leafsize;
442 sctx->sectorsize = dev->dev_root->sectorsize; 435 sctx->sectorsize = dev->dev_root->sectorsize;
443 atomic_set(&sctx->bios_in_flight, 0); 436 atomic_set(&sctx->bios_in_flight, 0);
444 atomic_set(&sctx->workers_pending, 0); 437 atomic_set(&sctx->workers_pending, 0);
@@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
553 u64 ref_root; 546 u64 ref_root;
554 u32 item_size; 547 u32 item_size;
555 u8 ref_level; 548 u8 ref_level;
556 const int bufsize = 4096;
557 int ret; 549 int ret;
558 550
559 WARN_ON(sblock->page_count < 1); 551 WARN_ON(sblock->page_count < 1);
@@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
561 fs_info = sblock->sctx->dev_root->fs_info; 553 fs_info = sblock->sctx->dev_root->fs_info;
562 554
563 path = btrfs_alloc_path(); 555 path = btrfs_alloc_path();
556 if (!path)
557 return;
564 558
565 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
566 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
567 swarn.sector = (sblock->pagev[0]->physical) >> 9; 559 swarn.sector = (sblock->pagev[0]->physical) >> 9;
568 swarn.logical = sblock->pagev[0]->logical; 560 swarn.logical = sblock->pagev[0]->logical;
569 swarn.errstr = errstr; 561 swarn.errstr = errstr;
570 swarn.dev = NULL; 562 swarn.dev = NULL;
571 swarn.msg_bufsize = bufsize;
572 swarn.scratch_bufsize = bufsize;
573
574 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
575 goto out;
576 563
577 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 564 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
578 &flags); 565 &flags);
@@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
613 600
614out: 601out:
615 btrfs_free_path(path); 602 btrfs_free_path(path);
616 kfree(swarn.scratch_buf);
617 kfree(swarn.msg_buf);
618} 603}
619 604
620static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) 605static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
@@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
681 ret = -EIO; 666 ret = -EIO;
682 goto out; 667 goto out;
683 } 668 }
684 fs_info = BTRFS_I(inode)->root->fs_info; 669 ret = repair_io_failure(inode, offset, PAGE_SIZE,
685 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
686 fixup->logical, page, 670 fixup->logical, page,
671 offset - page_offset(page),
687 fixup->mirror_num); 672 fixup->mirror_num);
688 unlock_page(page); 673 unlock_page(page);
689 corrected = !ret; 674 corrected = !ret;
@@ -999,8 +984,8 @@ nodatasum_case:
999 fixup_nodatasum->root = fs_info->extent_root; 984 fixup_nodatasum->root = fs_info->extent_root;
1000 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 985 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1001 scrub_pending_trans_workers_inc(sctx); 986 scrub_pending_trans_workers_inc(sctx);
1002 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, 987 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1003 NULL, NULL); 988 scrub_fixup_nodatasum, NULL, NULL);
1004 btrfs_queue_work(fs_info->scrub_workers, 989 btrfs_queue_work(fs_info->scrub_workers,
1005 &fixup_nodatasum->work); 990 &fixup_nodatasum->work);
1006 goto out; 991 goto out;
@@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1361 return; 1346 return;
1362} 1347}
1363 1348
1349static inline int scrub_check_fsid(u8 fsid[],
1350 struct scrub_page *spage)
1351{
1352 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1353 int ret;
1354
1355 ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1356 return !ret;
1357}
1358
1364static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1359static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1365 struct scrub_block *sblock, 1360 struct scrub_block *sblock,
1366 int is_metadata, int have_csum, 1361 int is_metadata, int have_csum,
@@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1380 h = (struct btrfs_header *)mapped_buffer; 1375 h = (struct btrfs_header *)mapped_buffer;
1381 1376
1382 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || 1377 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1383 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1378 !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1384 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1379 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1385 BTRFS_UUID_SIZE)) { 1380 BTRFS_UUID_SIZE)) {
1386 sblock->header_error = 1; 1381 sblock->header_error = 1;
@@ -1616,7 +1611,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1616 sbio->err = err; 1611 sbio->err = err;
1617 sbio->bio = bio; 1612 sbio->bio = bio;
1618 1613
1619 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); 1614 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1615 scrub_wr_bio_end_io_worker, NULL, NULL);
1620 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 1616 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1621} 1617}
1622 1618
@@ -1750,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1750 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) 1746 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1751 ++fail; 1747 ++fail;
1752 1748
1753 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1749 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1754 ++fail; 1750 ++fail;
1755 1751
1756 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1752 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1757 BTRFS_UUID_SIZE)) 1753 BTRFS_UUID_SIZE))
1758 ++fail; 1754 ++fail;
1759 1755
1760 WARN_ON(sctx->nodesize != sctx->leafsize);
1761 len = sctx->nodesize - BTRFS_CSUM_SIZE; 1756 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1762 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1757 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1763 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1758 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1790,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1790{ 1785{
1791 struct btrfs_super_block *s; 1786 struct btrfs_super_block *s;
1792 struct scrub_ctx *sctx = sblock->sctx; 1787 struct scrub_ctx *sctx = sblock->sctx;
1793 struct btrfs_root *root = sctx->dev_root;
1794 struct btrfs_fs_info *fs_info = root->fs_info;
1795 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1788 u8 calculated_csum[BTRFS_CSUM_SIZE];
1796 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1789 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1797 struct page *page; 1790 struct page *page;
@@ -1816,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1816 if (sblock->pagev[0]->generation != btrfs_super_generation(s)) 1809 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1817 ++fail_gen; 1810 ++fail_gen;
1818 1811
1819 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1812 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1820 ++fail_cor; 1813 ++fail_cor;
1821 1814
1822 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1815 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
@@ -2195,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2195 sctx->stat.data_bytes_scrubbed += len; 2188 sctx->stat.data_bytes_scrubbed += len;
2196 spin_unlock(&sctx->stat_lock); 2189 spin_unlock(&sctx->stat_lock);
2197 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2190 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2198 WARN_ON(sctx->nodesize != sctx->leafsize);
2199 blocksize = sctx->nodesize; 2191 blocksize = sctx->nodesize;
2200 spin_lock(&sctx->stat_lock); 2192 spin_lock(&sctx->stat_lock);
2201 sctx->stat.tree_extents_scrubbed++; 2193 sctx->stat.tree_extents_scrubbed++;
@@ -2486,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2486 btrfs_item_key_to_cpu(l, &key, slot); 2478 btrfs_item_key_to_cpu(l, &key, slot);
2487 2479
2488 if (key.type == BTRFS_METADATA_ITEM_KEY) 2480 if (key.type == BTRFS_METADATA_ITEM_KEY)
2489 bytes = root->leafsize; 2481 bytes = root->nodesize;
2490 else 2482 else
2491 bytes = key.offset; 2483 bytes = key.offset;
2492 2484
@@ -2713,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2713 if (found_key.objectid != scrub_dev->devid) 2705 if (found_key.objectid != scrub_dev->devid)
2714 break; 2706 break;
2715 2707
2716 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2708 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
2717 break; 2709 break;
2718 2710
2719 if (found_key.offset >= end) 2711 if (found_key.offset >= end)
@@ -2827,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2827 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 2819 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2828 return -EIO; 2820 return -EIO;
2829 2821
2830	gen = root->fs_info->last_trans_committed;	2822	/* Seed devices of a new filesystem have their own generation. */
2823 if (scrub_dev->fs_devices != root->fs_info->fs_devices)
2824 gen = scrub_dev->generation;
2825 else
2826 gen = root->fs_info->last_trans_committed;
2831 2827
2832 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2828 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2833 bytenr = btrfs_sb_offset(i); 2829 bytenr = btrfs_sb_offset(i);
2834 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) 2830 if (bytenr + BTRFS_SUPER_INFO_SIZE >
2831 scrub_dev->commit_total_bytes)
2835 break; 2832 break;
2836 2833
2837 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2834 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
@@ -2904,21 +2901,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2904 struct scrub_ctx *sctx; 2901 struct scrub_ctx *sctx;
2905 int ret; 2902 int ret;
2906 struct btrfs_device *dev; 2903 struct btrfs_device *dev;
2904 struct rcu_string *name;
2907 2905
2908 if (btrfs_fs_closing(fs_info)) 2906 if (btrfs_fs_closing(fs_info))
2909 return -EINVAL; 2907 return -EINVAL;
2910 2908
2911 /*
2912 * check some assumptions
2913 */
2914 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2915 btrfs_err(fs_info,
2916 "scrub: size assumption nodesize == leafsize (%d == %d) fails",
2917 fs_info->chunk_root->nodesize,
2918 fs_info->chunk_root->leafsize);
2919 return -EINVAL;
2920 }
2921
2922 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { 2909 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2923 /* 2910 /*
2924 * in this case scrub is unable to calculate the checksum 2911 * in this case scrub is unable to calculate the checksum
@@ -2965,6 +2952,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2965 return -ENODEV; 2952 return -ENODEV;
2966 } 2953 }
2967 2954
2955 if (!is_dev_replace && !readonly && !dev->writeable) {
2956 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2957 rcu_read_lock();
2958 name = rcu_dereference(dev->name);
2959 btrfs_err(fs_info, "scrub: device %s is not writable",
2960 name->str);
2961 rcu_read_unlock();
2962 return -EROFS;
2963 }
2964
2968 mutex_lock(&fs_info->scrub_lock); 2965 mutex_lock(&fs_info->scrub_lock);
2969 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { 2966 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2970 mutex_unlock(&fs_info->scrub_lock); 2967 mutex_unlock(&fs_info->scrub_lock);
@@ -3203,7 +3200,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3203 nocow_ctx->len = len; 3200 nocow_ctx->len = len;
3204 nocow_ctx->mirror_num = mirror_num; 3201 nocow_ctx->mirror_num = mirror_num;
3205 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3202 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3206 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); 3203 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3204 copy_nocow_pages_worker, NULL, NULL);
3207 INIT_LIST_HEAD(&nocow_ctx->inodes); 3205 INIT_LIST_HEAD(&nocow_ctx->inodes);
3208 btrfs_queue_work(fs_info->scrub_nocow_workers, 3206 btrfs_queue_work(fs_info->scrub_nocow_workers,
3209 &nocow_ctx->work); 3207 &nocow_ctx->work);
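
The scrub_supers() hunk above hinges on one observation: a seed device that was sprouted into a new filesystem keeps the generation of its original filesystem, so its superblocks must not be validated against the new filesystem's last committed transaction. A minimal sketch of that selection logic, using scrub_super_generation() as a hypothetical helper name (the patch open-codes it):

static u64 scrub_super_generation(struct btrfs_device *scrub_dev,
				  struct btrfs_fs_info *fs_info)
{
	/* A device from a foreign fs_devices list is a seed device. */
	if (scrub_dev->fs_devices != fs_info->fs_devices)
		return scrub_dev->generation;
	return fs_info->last_trans_committed;
}
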
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6528aa662181..874828dd0a86 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
515 set_fs(KERNEL_DS); 515 set_fs(KERNEL_DS);
516 516
517 while (pos < len) { 517 while (pos < len) {
518 ret = vfs_write(filp, (char *)buf + pos, len - pos, off); 518 ret = vfs_write(filp, (__force const char __user *)buf + pos,
519 len - pos, off);
519 /* TODO handle that correctly */ 520 /* TODO handle that correctly */
520 /*if (ret == -ERESTARTSYS) { 521 /*if (ret == -ERESTARTSYS) {
521 continue; 522 continue;
@@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
985 int num; 986 int num;
986 u8 type; 987 u8 type;
987 988
988 if (found_key->type == BTRFS_XATTR_ITEM_KEY) 989 /*
989 buf_len = BTRFS_MAX_XATTR_SIZE(root); 990 * Start with a small buffer (1 page). If later we end up needing more
990 else 991 * space, which can happen for xattrs on a fs with a leaf size greater
991	buf_len = PATH_MAX;	992	 * than the page size, attempt to increase the buffer. Typically xattr
992 993 * values are small.
994 */
995 buf_len = PATH_MAX;
993 buf = kmalloc(buf_len, GFP_NOFS); 996 buf = kmalloc(buf_len, GFP_NOFS);
994 if (!buf) { 997 if (!buf) {
995 ret = -ENOMEM; 998 ret = -ENOMEM;
@@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1016 ret = -ENAMETOOLONG; 1019 ret = -ENAMETOOLONG;
1017 goto out; 1020 goto out;
1018 } 1021 }
1019 if (name_len + data_len > buf_len) { 1022 if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
1020 ret = -E2BIG; 1023 ret = -E2BIG;
1021 goto out; 1024 goto out;
1022 } 1025 }
@@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1024 /* 1027 /*
1025 * Path too long 1028 * Path too long
1026 */ 1029 */
1027 if (name_len + data_len > buf_len) { 1030 if (name_len + data_len > PATH_MAX) {
1028 ret = -ENAMETOOLONG; 1031 ret = -ENAMETOOLONG;
1029 goto out; 1032 goto out;
1030 } 1033 }
1031 } 1034 }
1032 1035
1036 if (name_len + data_len > buf_len) {
1037 buf_len = name_len + data_len;
1038 if (is_vmalloc_addr(buf)) {
1039 vfree(buf);
1040 buf = NULL;
1041 } else {
1042 char *tmp = krealloc(buf, buf_len,
1043 GFP_NOFS | __GFP_NOWARN);
1044
1045 if (!tmp)
1046 kfree(buf);
1047 buf = tmp;
1048 }
1049 if (!buf) {
1050 buf = vmalloc(buf_len);
1051 if (!buf) {
1052 ret = -ENOMEM;
1053 goto out;
1054 }
1055 }
1056 }
1057
1033 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1058 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
1034 name_len + data_len); 1059 name_len + data_len);
1035 1060
@@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1050 } 1075 }
1051 1076
1052out: 1077out:
1053 kfree(buf); 1078 kvfree(buf);
1054 return ret; 1079 return ret;
1055} 1080}
1056 1081
@@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3302 if (ret < 0 && ret != -ENOENT) { 3327 if (ret < 0 && ret != -ENOENT) {
3303 goto out; 3328 goto out;
3304 } else if (ret == -ENOENT) { 3329 } else if (ret == -ENOENT) {
3305 ret = 1; 3330 ret = 0;
3306 break; 3331 break;
3307 } 3332 }
3308 3333
@@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5703 NULL); 5728 NULL);
5704 sort_clone_roots = 1; 5729 sort_clone_roots = 1;
5705 5730
5706 current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; 5731 current->journal_info = BTRFS_SEND_TRANS_STUB;
5707 ret = send_subvol(sctx); 5732 ret = send_subvol(sctx);
5708 current->journal_info = NULL; 5733 current->journal_info = NULL;
5709 if (ret < 0) 5734 if (ret < 0)
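
The iterate_dir_item() changes above follow a common kernel allocation pattern: grow a kmalloc'ed buffer in place with krealloc(), fall back to vmalloc() when the slab allocator cannot satisfy the request, and free with kvfree() either way. A condensed sketch of just that pattern, assuming a hypothetical grow_buf() helper (not a function in fs/btrfs/send.c):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

static void *grow_buf(void *buf, size_t new_len)
{
	if (is_vmalloc_addr(buf)) {
		/* krealloc() cannot resize a vmalloc'ed buffer. */
		vfree(buf);
		buf = NULL;
	} else {
		/* __GFP_NOWARN: an allocation failure here is handled. */
		void *tmp = krealloc(buf, new_len, GFP_NOFS | __GFP_NOWARN);

		if (!tmp)
			kfree(buf);
		buf = tmp;
	}
	if (!buf)
		buf = vmalloc(new_len);
	return buf;	/* the caller frees the result with kvfree() */
}
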
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c4124de4435b..a2b97ef10317 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
60#include "backref.h" 60#include "backref.h"
61#include "tests/btrfs-tests.h" 61#include "tests/btrfs-tests.h"
62 62
63#include "qgroup.h"
63#define CREATE_TRACE_POINTS 64#define CREATE_TRACE_POINTS
64#include <trace/events/btrfs.h> 65#include <trace/events/btrfs.h>
65 66
@@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
307 308
308static void btrfs_put_super(struct super_block *sb) 309static void btrfs_put_super(struct super_block *sb)
309{ 310{
310 (void)close_ctree(btrfs_sb(sb)->tree_root); 311 close_ctree(btrfs_sb(sb)->tree_root);
311 /* FIXME: need to fix VFS to return error? */
312 /* AV: return it _where_? ->put_super() can be triggered by any number
313 * of async events, up to and including delivery of SIGKILL to the
314 * last process that kept it busy. Or segfault in the aforementioned
315 * process... Whom would you report that to?
316 */
317} 312}
318 313
319enum { 314enum {
@@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
400 int ret = 0; 395 int ret = 0;
401 char *compress_type; 396 char *compress_type;
402 bool compress_force = false; 397 bool compress_force = false;
403 bool compress = false;
404 398
405 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 399 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
406 if (cache_gen) 400 if (cache_gen)
@@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
478 /* Fallthrough */ 472 /* Fallthrough */
479 case Opt_compress: 473 case Opt_compress:
480 case Opt_compress_type: 474 case Opt_compress_type:
481 compress = true;
482 if (token == Opt_compress || 475 if (token == Opt_compress ||
483 token == Opt_compress_force || 476 token == Opt_compress_force ||
484 strcmp(args[0].from, "zlib") == 0) { 477 strcmp(args[0].from, "zlib") == 0) {
@@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
508 btrfs_set_and_info(root, FORCE_COMPRESS, 501 btrfs_set_and_info(root, FORCE_COMPRESS,
509 "force %s compression", 502 "force %s compression",
510 compress_type); 503 compress_type);
511 } else if (compress) { 504 } else {
512 if (!btrfs_test_opt(root, COMPRESS)) 505 if (!btrfs_test_opt(root, COMPRESS))
513 btrfs_info(root->fs_info, 506 btrfs_info(root->fs_info,
514 "btrfs: use %s compression", 507 "btrfs: use %s compression",
515 compress_type); 508 compress_type);
509 /*
510 * If we remount from compress-force=xxx to
511	 * compress=xxx, we need to clear the FORCE_COMPRESS
512	 * flag; otherwise there is no way for users
513 * to disable forcible compression separately.
514 */
515 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
516 } 516 }
517 break; 517 break;
518 case Opt_ssd: 518 case Opt_ssd:
@@ -1014,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1014 seq_puts(seq, ",nodatacow"); 1014 seq_puts(seq, ",nodatacow");
1015 if (btrfs_test_opt(root, NOBARRIER)) 1015 if (btrfs_test_opt(root, NOBARRIER))
1016 seq_puts(seq, ",nobarrier"); 1016 seq_puts(seq, ",nobarrier");
1017 if (info->max_inline != 8192 * 1024) 1017 if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
1018 seq_printf(seq, ",max_inline=%llu", info->max_inline); 1018 seq_printf(seq, ",max_inline=%llu", info->max_inline);
1019 if (info->alloc_start != 0) 1019 if (info->alloc_start != 0)
1020 seq_printf(seq, ",alloc_start=%llu", info->alloc_start); 1020 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
@@ -1215,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1215 return root; 1215 return root;
1216} 1216}
1217 1217
1218static int parse_security_options(char *orig_opts,
1219 struct security_mnt_opts *sec_opts)
1220{
1221 char *secdata = NULL;
1222 int ret = 0;
1223
1224 secdata = alloc_secdata();
1225 if (!secdata)
1226 return -ENOMEM;
1227 ret = security_sb_copy_data(orig_opts, secdata);
1228 if (ret) {
1229 free_secdata(secdata);
1230 return ret;
1231 }
1232 ret = security_sb_parse_opts_str(secdata, sec_opts);
1233 free_secdata(secdata);
1234 return ret;
1235}
1236
1237static int setup_security_options(struct btrfs_fs_info *fs_info,
1238 struct super_block *sb,
1239 struct security_mnt_opts *sec_opts)
1240{
1241 int ret = 0;
1242
1243 /*
1244	 * Call security_sb_set_mnt_opts() to check whether the new sec_opts
1245	 * are valid.
1246 */
1247 ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
1248 if (ret)
1249 return ret;
1250
1251#ifdef CONFIG_SECURITY
1252 if (!fs_info->security_opts.num_mnt_opts) {
1253 /* first time security setup, copy sec_opts to fs_info */
1254 memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
1255 } else {
1256 /*
1257	 * Since SELinux (the only LSM that supports security_mnt_opts)
1258	 * does NOT support changing the context during a remount/mount
1259	 * of the same sb, these must be the same or a subset of the
1260	 * existing security options, so just free them.
1261 */
1262 security_free_mnt_opts(sec_opts);
1263 }
1264#endif
1265 return ret;
1266}
1267
1218/* 1268/*
1219 * Find a superblock for the given device / mount point. 1269 * Find a superblock for the given device / mount point.
1220 * 1270 *
@@ -1229,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1229 struct dentry *root; 1279 struct dentry *root;
1230 struct btrfs_fs_devices *fs_devices = NULL; 1280 struct btrfs_fs_devices *fs_devices = NULL;
1231 struct btrfs_fs_info *fs_info = NULL; 1281 struct btrfs_fs_info *fs_info = NULL;
1282 struct security_mnt_opts new_sec_opts;
1232 fmode_t mode = FMODE_READ; 1283 fmode_t mode = FMODE_READ;
1233 char *subvol_name = NULL; 1284 char *subvol_name = NULL;
1234 u64 subvol_objectid = 0; 1285 u64 subvol_objectid = 0;
@@ -1251,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1251 return root; 1302 return root;
1252 } 1303 }
1253 1304
1305 security_init_mnt_opts(&new_sec_opts);
1306 if (data) {
1307 error = parse_security_options(data, &new_sec_opts);
1308 if (error)
1309 return ERR_PTR(error);
1310 }
1311
1254 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 1312 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
1255 if (error) 1313 if (error)
1256 return ERR_PTR(error); 1314 goto error_sec_opts;
1257 1315
1258 /* 1316 /*
1259 * Setup a dummy root and fs_info for test/set super. This is because 1317 * Setup a dummy root and fs_info for test/set super. This is because
@@ -1262,13 +1320,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1262 * then open_ctree will properly initialize everything later. 1320 * then open_ctree will properly initialize everything later.
1263 */ 1321 */
1264 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 1322 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
1265 if (!fs_info) 1323 if (!fs_info) {
1266 return ERR_PTR(-ENOMEM); 1324 error = -ENOMEM;
1325 goto error_sec_opts;
1326 }
1267 1327
1268 fs_info->fs_devices = fs_devices; 1328 fs_info->fs_devices = fs_devices;
1269 1329
1270 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 1330 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1271 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 1331 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1332 security_init_mnt_opts(&fs_info->security_opts);
1272 if (!fs_info->super_copy || !fs_info->super_for_commit) { 1333 if (!fs_info->super_copy || !fs_info->super_for_commit) {
1273 error = -ENOMEM; 1334 error = -ENOMEM;
1274 goto error_fs_info; 1335 goto error_fs_info;
@@ -1306,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1306 } 1367 }
1307 1368
1308 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); 1369 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
1309 if (IS_ERR(root)) 1370 if (IS_ERR(root)) {
1371 deactivate_locked_super(s);
1372 error = PTR_ERR(root);
1373 goto error_sec_opts;
1374 }
1375
1376 fs_info = btrfs_sb(s);
1377 error = setup_security_options(fs_info, s, &new_sec_opts);
1378 if (error) {
1379 dput(root);
1310 deactivate_locked_super(s); 1380 deactivate_locked_super(s);
1381 goto error_sec_opts;
1382 }
1311 1383
1312 return root; 1384 return root;
1313 1385
@@ -1315,6 +1387,8 @@ error_close_devices:
1315 btrfs_close_devices(fs_devices); 1387 btrfs_close_devices(fs_devices);
1316error_fs_info: 1388error_fs_info:
1317 free_fs_info(fs_info); 1389 free_fs_info(fs_info);
1390error_sec_opts:
1391 security_free_mnt_opts(&new_sec_opts);
1318 return ERR_PTR(error); 1392 return ERR_PTR(error);
1319} 1393}
1320 1394
@@ -1396,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1396 sync_filesystem(sb); 1470 sync_filesystem(sb);
1397 btrfs_remount_prepare(fs_info); 1471 btrfs_remount_prepare(fs_info);
1398 1472
1473 if (data) {
1474 struct security_mnt_opts new_sec_opts;
1475
1476 security_init_mnt_opts(&new_sec_opts);
1477 ret = parse_security_options(data, &new_sec_opts);
1478 if (ret)
1479 goto restore;
1480 ret = setup_security_options(fs_info, sb,
1481 &new_sec_opts);
1482 if (ret) {
1483 security_free_mnt_opts(&new_sec_opts);
1484 goto restore;
1485 }
1486 }
1487
1399 ret = btrfs_parse_options(root, data); 1488 ret = btrfs_parse_options(root, data);
1400 if (ret) { 1489 if (ret) {
1401 ret = -EINVAL; 1490 ret = -EINVAL;
@@ -1694,7 +1783,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1694 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 1783 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
1695 int ret; 1784 int ret;
1696 1785
1697	/* holding chunk_mutex to avoid allocating new chunks */	1786	/*
	1787	 * holding chunk_mutex to avoid allocating new chunks, holding
1788 * device_list_mutex to avoid the device being removed
1789 */
1790 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1698 mutex_lock(&fs_info->chunk_mutex); 1791 mutex_lock(&fs_info->chunk_mutex);
1699 rcu_read_lock(); 1792 rcu_read_lock();
1700 list_for_each_entry_rcu(found, head, list) { 1793 list_for_each_entry_rcu(found, head, list) {
@@ -1735,11 +1828,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1735 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1828 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1736 if (ret) { 1829 if (ret) {
1737 mutex_unlock(&fs_info->chunk_mutex); 1830 mutex_unlock(&fs_info->chunk_mutex);
1831 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1738 return ret; 1832 return ret;
1739 } 1833 }
1740 buf->f_bavail += div_u64(total_free_data, factor); 1834 buf->f_bavail += div_u64(total_free_data, factor);
1741 buf->f_bavail = buf->f_bavail >> bits; 1835 buf->f_bavail = buf->f_bavail >> bits;
1742 mutex_unlock(&fs_info->chunk_mutex); 1836 mutex_unlock(&fs_info->chunk_mutex);
1837 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1743 1838
1744 buf->f_type = BTRFS_SUPER_MAGIC; 1839 buf->f_type = BTRFS_SUPER_MAGIC;
1745 buf->f_bsize = dentry->d_sb->s_blocksize; 1840 buf->f_bsize = dentry->d_sb->s_blocksize;
@@ -1769,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = {
1769 .name = "btrfs", 1864 .name = "btrfs",
1770 .mount = btrfs_mount, 1865 .mount = btrfs_mount,
1771 .kill_sb = btrfs_kill_super, 1866 .kill_sb = btrfs_kill_super,
1772 .fs_flags = FS_REQUIRES_DEV, 1867 .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
1773}; 1868};
1774MODULE_ALIAS_FS("btrfs"); 1869MODULE_ALIAS_FS("btrfs");
1775 1870
@@ -1993,11 +2088,15 @@ static int __init init_btrfs_fs(void)
1993 2088
1994 err = btrfs_prelim_ref_init(); 2089 err = btrfs_prelim_ref_init();
1995 if (err) 2090 if (err)
2091 goto free_delayed_ref;
2092
2093 err = btrfs_end_io_wq_init();
2094 if (err)
1996 goto free_prelim_ref; 2095 goto free_prelim_ref;
1997 2096
1998 err = btrfs_interface_init(); 2097 err = btrfs_interface_init();
1999 if (err) 2098 if (err)
2000 goto free_delayed_ref; 2099 goto free_end_io_wq;
2001 2100
2002 btrfs_init_lockdep(); 2101 btrfs_init_lockdep();
2003 2102
@@ -2015,6 +2114,8 @@ static int __init init_btrfs_fs(void)
2015 2114
2016unregister_ioctl: 2115unregister_ioctl:
2017 btrfs_interface_exit(); 2116 btrfs_interface_exit();
2117free_end_io_wq:
2118 btrfs_end_io_wq_exit();
2018free_prelim_ref: 2119free_prelim_ref:
2019 btrfs_prelim_ref_exit(); 2120 btrfs_prelim_ref_exit();
2020free_delayed_ref: 2121free_delayed_ref:
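
The init_btrfs_fs() hunk above slots btrfs_end_io_wq_init() into the module's goto-based unwind chain: each init step that fails jumps to the label that tears down everything initialized before it, in reverse order, so adding a step means adding exactly one call site and one label. A schematic illustration with hypothetical step_*() stand-ins (not the real btrfs init functions):

static int step_a_init(void) { return 0; }
static void step_a_exit(void) { }
static int step_b_init(void) { return 0; }
static void step_b_exit(void) { }
static int step_c_init(void) { return 0; }

static int example_init(void)
{
	int err;

	err = step_a_init();
	if (err)
		return err;
	err = step_b_init();
	if (err)
		goto free_a;
	err = step_c_init();	/* the newly inserted step */
	if (err)
		goto free_b;
	return 0;

free_b:
	step_b_exit();
free_a:
	step_a_exit();
	return err;
}
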
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 78699364f537..b2e7bb4393f6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
242 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 242 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
243 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); 243 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
244} 244}
245BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); 245BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
246 246
247static ssize_t global_rsv_reserved_show(struct kobject *kobj, 247static ssize_t global_rsv_reserved_show(struct kobject *kobj,
248 struct kobj_attribute *a, char *buf) 248 struct kobj_attribute *a, char *buf)
@@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
251 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 251 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
252 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); 252 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
253} 253}
254BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); 254BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
255 255
256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) 256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) 257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
@@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
306 struct btrfs_space_info *sinfo = to_space_info(kobj); \ 306 struct btrfs_space_info *sinfo = to_space_info(kobj); \
307 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ 307 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
308} \ 308} \
309BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) 309BTRFS_ATTR(field, btrfs_space_info_show_##field)
310 310
311static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, 311static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
312 struct kobj_attribute *a, 312 struct kobj_attribute *a,
@@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved);
325SPACE_INFO_ATTR(bytes_may_use); 325SPACE_INFO_ATTR(bytes_may_use);
326SPACE_INFO_ATTR(disk_used); 326SPACE_INFO_ATTR(disk_used);
327SPACE_INFO_ATTR(disk_total); 327SPACE_INFO_ATTR(disk_total);
328BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); 328BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
329 329
330static struct attribute *space_info_attrs[] = { 330static struct attribute *space_info_attrs[] = {
331 BTRFS_ATTR_PTR(flags), 331 BTRFS_ATTR_PTR(flags),
@@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
363 struct kobj_attribute *a, char *buf) 363 struct kobj_attribute *a, char *buf)
364{ 364{
365 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 365 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
366 return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); 366 char *label = fs_info->super_copy->label;
367 return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
367} 368}
368 369
369static ssize_t btrfs_label_store(struct kobject *kobj, 370static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
374 struct btrfs_trans_handle *trans; 375 struct btrfs_trans_handle *trans;
375 struct btrfs_root *root = fs_info->fs_root; 376 struct btrfs_root *root = fs_info->fs_root;
376 int ret; 377 int ret;
378 size_t p_len;
377 379
378 if (len >= BTRFS_LABEL_SIZE) 380 if (fs_info->sb->s_flags & MS_RDONLY)
381 return -EROFS;
382
383 /*
384	 * p_len is the length until the first occurrence of either
385 * '\n' or '\0'
386 */
387 p_len = strcspn(buf, "\n");
388
389 if (p_len >= BTRFS_LABEL_SIZE)
379 return -EINVAL; 390 return -EINVAL;
380 391
381 trans = btrfs_start_transaction(root, 0); 392 trans = btrfs_start_transaction(root, 0);
@@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
383 return PTR_ERR(trans); 394 return PTR_ERR(trans);
384 395
385 spin_lock(&root->fs_info->super_lock); 396 spin_lock(&root->fs_info->super_lock);
386 strcpy(fs_info->super_copy->label, buf); 397 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
398 memcpy(fs_info->super_copy->label, buf, p_len);
387 spin_unlock(&root->fs_info->super_lock); 399 spin_unlock(&root->fs_info->super_lock);
388 ret = btrfs_commit_transaction(trans, root); 400 ret = btrfs_commit_transaction(trans, root);
389 401
@@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
392 404
393 return ret; 405 return ret;
394} 406}
395BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); 407BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
396
397static ssize_t btrfs_no_store(struct kobject *kobj,
398 struct kobj_attribute *a,
399 const char *buf, size_t len)
400{
401 return -EPERM;
402}
403 408
404static ssize_t btrfs_nodesize_show(struct kobject *kobj, 409static ssize_t btrfs_nodesize_show(struct kobject *kobj,
405 struct kobj_attribute *a, char *buf) 410 struct kobj_attribute *a, char *buf)
@@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
409 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); 414 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
410} 415}
411 416
412BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); 417BTRFS_ATTR(nodesize, btrfs_nodesize_show);
413 418
414static ssize_t btrfs_sectorsize_show(struct kobject *kobj, 419static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
415 struct kobj_attribute *a, char *buf) 420 struct kobj_attribute *a, char *buf)
@@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
419 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); 424 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
420} 425}
421 426
422BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); 427BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
423 428
424static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, 429static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
425 struct kobj_attribute *a, char *buf) 430 struct kobj_attribute *a, char *buf)
@@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
429 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); 434 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
430} 435}
431 436
432BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); 437BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
433 438
434static struct attribute *btrfs_attrs[] = { 439static struct attribute *btrfs_attrs[] = {
435 BTRFS_ATTR_PTR(label), 440 BTRFS_ATTR_PTR(label),
@@ -614,7 +619,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
614 if (!fs_info->device_dir_kobj) 619 if (!fs_info->device_dir_kobj)
615 return -EINVAL; 620 return -EINVAL;
616 621
617 if (one_device) { 622 if (one_device && one_device->bdev) {
618 disk = one_device->bdev->bd_part; 623 disk = one_device->bdev->bd_part;
619 disk_kobj = &part_to_dev(disk)->kobj; 624 disk_kobj = &part_to_dev(disk)->kobj;
620 625
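
Two details carry the btrfs_label_store() rework above: strcspn(buf, "\n") makes a trailing newline from `echo` count as the end of the label, and the memset() before the memcpy() ensures no bytes of a previous, longer label survive in super_copy->label. The same logic reduced to standalone C (LABEL_SIZE and set_label() are illustrative stand-ins, not kernel symbols):

#include <stdio.h>
#include <string.h>

#define LABEL_SIZE 256			/* stand-in for BTRFS_LABEL_SIZE */

static int set_label(char label[LABEL_SIZE], const char *buf)
{
	size_t p_len = strcspn(buf, "\n");	/* stops at '\n' or '\0' */

	if (p_len >= LABEL_SIZE)
		return -1;			/* -EINVAL in the kernel */
	memset(label, 0, LABEL_SIZE);		/* wipe any older, longer label */
	memcpy(label, buf, p_len);
	return 0;
}

int main(void)
{
	char label[LABEL_SIZE];

	set_label(label, "mylabel\n");	/* as written by: echo mylabel > .../label */
	printf("%s\n", label);		/* prints: mylabel */
	return 0;
}
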
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index ac46df37504c..f7dd298b3cf6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -20,16 +20,20 @@ enum btrfs_feature_set {
20 .store = _store, \ 20 .store = _store, \
21} 21}
22 22
23#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ 23#define BTRFS_ATTR_RW(_name, _show, _store) \
24static struct kobj_attribute btrfs_attr_##_name = \ 24 static struct kobj_attribute btrfs_attr_##_name = \
25 __INIT_KOBJ_ATTR(_name, _mode, _show, _store) 25 __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
26#define BTRFS_ATTR(_name, _mode, _show) \ 26
27 BTRFS_ATTR_RW(_name, _mode, _show, NULL) 27#define BTRFS_ATTR(_name, _show) \
28 static struct kobj_attribute btrfs_attr_##_name = \
29 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
30
28#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) 31#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
29 32
30#define BTRFS_RAID_ATTR(_name, _show) \ 33#define BTRFS_RAID_ATTR(_name, _show) \
31static struct kobj_attribute btrfs_raid_attr_##_name = \ 34 static struct kobj_attribute btrfs_raid_attr_##_name = \
32 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) 35 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
36
33#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) 37#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
34 38
35 39
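
With the reworked macros, the mode is no longer a caller-supplied argument: BTRFS_ATTR() always produces a 0444 attribute with a NULL ->store, and BTRFS_ATTR_RW() always produces 0644, which is what let sysfs.c drop the btrfs_no_store() stub. A hedged expansion example (example_show() is hypothetical):

static ssize_t example_show(struct kobject *kobj,
			    struct kobj_attribute *a, char *buf)
{
	return snprintf(buf, PAGE_SIZE, "%d\n", 42);
}
BTRFS_ATTR(example, example_show);
/*
 * The macro above expands to roughly:
 * static struct kobj_attribute btrfs_attr_example =
 *	__INIT_KOBJ_ATTR(example, 0444, example_show, NULL);
 */
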
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8d9ddf84c69..2299bfde39ee 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
40 cache->key.offset = 1024 * 1024 * 1024; 40 cache->key.offset = 1024 * 1024 * 1024;
41 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 41 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
42 cache->sectorsize = 4096; 42 cache->sectorsize = 4096;
43 cache->full_stripe_len = 4096;
43 44
44 spin_lock_init(&cache->lock); 45 spin_lock_init(&cache->lock);
45 INIT_LIST_HEAD(&cache->list); 46 INIT_LIST_HEAD(&cache->list);
46 INIT_LIST_HEAD(&cache->cluster_list); 47 INIT_LIST_HEAD(&cache->cluster_list);
47 INIT_LIST_HEAD(&cache->new_bg_list); 48 INIT_LIST_HEAD(&cache->bg_list);
48 49
49 btrfs_init_free_space_ctl(cache); 50 btrfs_init_free_space_ctl(cache);
50 51
@@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
364 return 0; 365 return 0;
365} 366}
366 367
368/* Used by test_steal_space_from_bitmap_to_extent(). */
369static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
370 struct btrfs_free_space *info)
371{
372 return ctl->free_extents > 0;
373}
374
375/* Used by test_steal_space_from_bitmap_to_extent(). */
376static int
377check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
378 const int num_extents,
379 const int num_bitmaps)
380{
381 if (cache->free_space_ctl->free_extents != num_extents) {
382 test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
383 cache->free_space_ctl->free_extents, num_extents);
384 return -EINVAL;
385 }
386 if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
387 test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
388 cache->free_space_ctl->total_bitmaps, num_bitmaps);
389 return -EINVAL;
390 }
391 return 0;
392}
393
394/* Used by test_steal_space_from_bitmap_to_extent(). */
395static int check_cache_empty(struct btrfs_block_group_cache *cache)
396{
397 u64 offset;
398 u64 max_extent_size;
399
400 /*
401	 * Now let's confirm that there's absolutely no free space left to
402 * allocate.
403 */
404 if (cache->free_space_ctl->free_space != 0) {
405 test_msg("Cache free space is not 0\n");
406 return -EINVAL;
407 }
408
409 /* And any allocation request, no matter how small, should fail now. */
410 offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
411 &max_extent_size);
412 if (offset != 0) {
413 test_msg("Space allocation did not fail, returned offset: %llu",
414 offset);
415 return -EINVAL;
416 }
417
418	/* And no extent or bitmap entries in the cache anymore. */
419 return check_num_extents_and_bitmaps(cache, 0, 0);
420}
421
422/*
423 * Before we were able to steal free space from a bitmap entry to an extent
424 * entry, we could end up with 2 entries representing a contiguous free space.
425 * One would be an extent entry and the other a bitmap entry. Since in order
426 * to allocate space to a caller we use only 1 entry, we couldn't return that
427 * whole range to the caller if it was requested. This forced the caller to
428 * either assume ENOSPC or perform several smaller space allocations, which
429 * wasn't optimal as they could be spread all over the block group while under
430 * concurrency (extra overhead and fragmentation).
431 *
432	 * This stealing approach is beneficial, since we always prefer to allocate from
433 * extent entries, both for clustered and non-clustered allocation requests.
434 */
435static int
436test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
437{
438 int ret;
439 u64 offset;
440 u64 max_extent_size;
441
442 bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
443 struct btrfs_free_space *);
444
445 test_msg("Running space stealing from bitmap to extent\n");
446
447 /*
448 * For this test, we want to ensure we end up with an extent entry
449 * immediately adjacent to a bitmap entry, where the bitmap starts
450 * at an offset where the extent entry ends. We keep adding and
451	 * removing free space to reach this state, but to get there
452 * we need to reach a point where marking new free space doesn't
453 * result in adding new extent entries or merging the new space
454 * with existing extent entries - the space ends up being marked
455 * in an existing bitmap that covers the new free space range.
456 *
457	 * To get there, we need to reach the threshold set at
458	 * cache->free_space_ctl->extents_thresh, which currently is
459	 * 256 extents on an x86_64 system at least, and a few other
460 * conditions (check free_space_cache.c). Instead of making the
461 * test much longer and complicated, use a "use_bitmap" operation
462 * that forces use of bitmaps as soon as we have at least 1
463 * extent entry.
464 */
465 use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
466 cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
467
468 /*
469 * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
470 */
471 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
472 128 * 1024, 0);
473 if (ret) {
474 test_msg("Couldn't add extent entry %d\n", ret);
475 return ret;
476 }
477
478 /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
479 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
480 128 * 1024 * 1024 - 512 * 1024, 1);
481 if (ret) {
482 test_msg("Couldn't add bitmap entry %d\n", ret);
483 return ret;
484 }
485
486 ret = check_num_extents_and_bitmaps(cache, 2, 1);
487 if (ret)
488 return ret;
489
490 /*
491 * Now make only the first 256Kb of the bitmap marked as free, so that
492 * we end up with only the following ranges marked as free space:
493 *
494 * [128Mb - 256Kb, 128Mb - 128Kb[
495 * [128Mb + 512Kb, 128Mb + 768Kb[
496 */
497 ret = btrfs_remove_free_space(cache,
498 128 * 1024 * 1024 + 768 * 1024,
499 128 * 1024 * 1024 - 768 * 1024);
500 if (ret) {
501 test_msg("Failed to free part of bitmap space %d\n", ret);
502 return ret;
503 }
504
505 /* Confirm that only those 2 ranges are marked as free. */
506 if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
507 128 * 1024)) {
508 test_msg("Free space range missing\n");
509 return -ENOENT;
510 }
511 if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
512 256 * 1024)) {
513 test_msg("Free space range missing\n");
514 return -ENOENT;
515 }
516
517 /*
518 * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
519 * as free anymore.
520 */
521 if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
522 128 * 1024 * 1024 - 768 * 1024)) {
523 test_msg("Bitmap region not removed from space cache\n");
524 return -EINVAL;
525 }
526
527 /*
528 * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
529 * covered by the bitmap, isn't marked as free.
530 */
531 if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
532 256 * 1024)) {
533 test_msg("Invalid bitmap region marked as free\n");
534 return -EINVAL;
535 }
536
537 /*
538 * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
539 * by the bitmap too, isn't marked as free either.
540 */
541 if (test_check_exists(cache, 128 * 1024 * 1024,
542 256 * 1024)) {
543 test_msg("Invalid bitmap region marked as free\n");
544 return -EINVAL;
545 }
546
547 /*
548 * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
549 * lets make sure the free space cache marks it as free in the bitmap,
550 * and doesn't insert a new extent entry to represent this region.
551 */
552 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
553 if (ret) {
554 test_msg("Error adding free space: %d\n", ret);
555 return ret;
556 }
557 /* Confirm the region is marked as free. */
558 if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
559 test_msg("Bitmap region not marked as free\n");
560 return -ENOENT;
561 }
562
563 /*
564 * Confirm that no new extent entries or bitmap entries were added to
565 * the cache after adding that free space region.
566 */
567 ret = check_num_extents_and_bitmaps(cache, 2, 1);
568 if (ret)
569 return ret;
570
571 /*
572 * Now lets add a small free space region to the right of the previous
573 * one, which is not contiguous with it and is part of the bitmap too.
574 * The goal is to test that the bitmap entry space stealing doesn't
575 * steal this space region.
576 */
577 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
578 4096);
579 if (ret) {
580 test_msg("Error adding free space: %d\n", ret);
581 return ret;
582 }
583
584 /*
585 * Confirm that no new extent entries or bitmap entries were added to
586 * the cache after adding that free space region.
587 */
588 ret = check_num_extents_and_bitmaps(cache, 2, 1);
589 if (ret)
590 return ret;
591
592 /*
593 * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
594 * expand the range covered by the existing extent entry that represents
595 * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
596 */
597 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
598 128 * 1024);
599 if (ret) {
600 test_msg("Error adding free space: %d\n", ret);
601 return ret;
602 }
603 /* Confirm the region is marked as free. */
604 if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
605 128 * 1024)) {
606 test_msg("Extent region not marked as free\n");
607 return -ENOENT;
608 }
609
610 /*
611	 * Confirm that our extent entry didn't steal all free space from the
612 * bitmap, because of the small 4Kb free space region.
613 */
614 ret = check_num_extents_and_bitmaps(cache, 2, 1);
615 if (ret)
616 return ret;
617
618 /*
619 * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
620 * space. Without stealing bitmap free space into extent entry space,
621 * we would have all this free space represented by 2 entries in the
622 * cache:
623 *
624 * extent entry covering range: [128Mb - 256Kb, 128Mb[
625 * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
626 *
627 * Attempting to allocate the whole free space (1Mb) would fail, because
628 * we can't allocate from multiple entries.
629 * With the bitmap free space stealing, we get a single extent entry
630 * that represents the 1Mb free space, and therefore we're able to
631 * allocate the whole free space at once.
632 */
633 if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
634 1 * 1024 * 1024)) {
635 test_msg("Expected region not marked as free\n");
636 return -ENOENT;
637 }
638
639 if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
640 test_msg("Cache free space is not 1Mb + 4Kb\n");
641 return -EINVAL;
642 }
643
644 offset = btrfs_find_space_for_alloc(cache,
645 0, 1 * 1024 * 1024, 0,
646 &max_extent_size);
647 if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
648 test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
649 offset);
650 return -EINVAL;
651 }
652
653 /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
654 ret = check_num_extents_and_bitmaps(cache, 1, 1);
655 if (ret)
656 return ret;
657
658 if (cache->free_space_ctl->free_space != 4096) {
659 test_msg("Cache free space is not 4Kb\n");
660 return -EINVAL;
661 }
662
663 offset = btrfs_find_space_for_alloc(cache,
664 0, 4096, 0,
665 &max_extent_size);
666 if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
667 test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
668 offset);
669 return -EINVAL;
670 }
671
672 ret = check_cache_empty(cache);
673 if (ret)
674 return ret;
675
676 __btrfs_remove_free_space_cache(cache->free_space_ctl);
677
678 /*
679 * Now test a similar scenario, but where our extent entry is located
680 * to the right of the bitmap entry, so that we can check that stealing
681 * space from a bitmap to the front of an extent entry works.
682 */
683
684 /*
685 * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
686 */
687 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
688 128 * 1024, 0);
689 if (ret) {
690 test_msg("Couldn't add extent entry %d\n", ret);
691 return ret;
692 }
693
694 /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
695 ret = test_add_free_space_entry(cache, 0,
696 128 * 1024 * 1024 - 512 * 1024, 1);
697 if (ret) {
698 test_msg("Couldn't add bitmap entry %d\n", ret);
699 return ret;
700 }
701
702 ret = check_num_extents_and_bitmaps(cache, 2, 1);
703 if (ret)
704 return ret;
705
706 /*
707 * Now make only the last 256Kb of the bitmap marked as free, so that
708 * we end up with only the following ranges marked as free space:
709 *
710	 * [128Mb + 128Kb, 128Mb + 256Kb[
711 * [128Mb - 768Kb, 128Mb - 512Kb[
712 */
713 ret = btrfs_remove_free_space(cache,
714 0,
715 128 * 1024 * 1024 - 768 * 1024);
716 if (ret) {
717 test_msg("Failed to free part of bitmap space %d\n", ret);
718 return ret;
719 }
720
721 /* Confirm that only those 2 ranges are marked as free. */
722 if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
723 128 * 1024)) {
724 test_msg("Free space range missing\n");
725 return -ENOENT;
726 }
727 if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
728 256 * 1024)) {
729 test_msg("Free space range missing\n");
730 return -ENOENT;
731 }
732
733 /*
734 * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
735 * as free anymore.
736 */
737 if (test_check_exists(cache, 0,
738 128 * 1024 * 1024 - 768 * 1024)) {
739 test_msg("Bitmap region not removed from space cache\n");
740 return -EINVAL;
741 }
742
743 /*
744 * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
745 * covered by the bitmap, isn't marked as free.
746 */
747 if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
748 512 * 1024)) {
749 test_msg("Invalid bitmap region marked as free\n");
750 return -EINVAL;
751 }
752
753 /*
754 * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
755 * lets make sure the free space cache marks it as free in the bitmap,
756 * and doesn't insert a new extent entry to represent this region.
757 */
758 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
759 512 * 1024);
760 if (ret) {
761 test_msg("Error adding free space: %d\n", ret);
762 return ret;
763 }
764 /* Confirm the region is marked as free. */
765 if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
766 512 * 1024)) {
767 test_msg("Bitmap region not marked as free\n");
768 return -ENOENT;
769 }
770
771 /*
772 * Confirm that no new extent entries or bitmap entries were added to
773 * the cache after adding that free space region.
774 */
775 ret = check_num_extents_and_bitmaps(cache, 2, 1);
776 if (ret)
777 return ret;
778
779 /*
780 * Now lets add a small free space region to the left of the previous
781 * one, which is not contiguous with it and is part of the bitmap too.
782 * The goal is to test that the bitmap entry space stealing doesn't
783 * steal this space region.
784 */
785 ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
786 if (ret) {
787 test_msg("Error adding free space: %d\n", ret);
788 return ret;
789 }
790
791 /*
792 * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
793 * expand the range covered by the existing extent entry that represents
794 * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
795 */
796 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
797 if (ret) {
798 test_msg("Error adding free space: %d\n", ret);
799 return ret;
800 }
801 /* Confirm the region is marked as free. */
802 if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
803 test_msg("Extent region not marked as free\n");
804 return -ENOENT;
805 }
806
807 /*
808	 * Confirm that our extent entry didn't steal all free space from the
809 * bitmap, because of the small 8Kb free space region.
810 */
811 ret = check_num_extents_and_bitmaps(cache, 2, 1);
812 if (ret)
813 return ret;
814
815 /*
816 * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
817 * space. Without stealing bitmap free space into extent entry space,
818 * we would have all this free space represented by 2 entries in the
819 * cache:
820 *
821 * extent entry covering range: [128Mb, 128Mb + 256Kb[
822 * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
823 *
824 * Attempting to allocate the whole free space (1Mb) would fail, because
825 * we can't allocate from multiple entries.
826 * With the bitmap free space stealing, we get a single extent entry
827 * that represents the 1Mb free space, and therefore we're able to
828 * allocate the whole free space at once.
829 */
830 if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
831 1 * 1024 * 1024)) {
832 test_msg("Expected region not marked as free\n");
833 return -ENOENT;
834 }
835
836 if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
837 test_msg("Cache free space is not 1Mb + 8Kb\n");
838 return -EINVAL;
839 }
840
841 offset = btrfs_find_space_for_alloc(cache,
842 0, 1 * 1024 * 1024, 0,
843 &max_extent_size);
844 if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
845 test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
846 offset);
847 return -EINVAL;
848 }
849
850	/* All that remains is an 8Kb free space region in a bitmap. Confirm. */
851 ret = check_num_extents_and_bitmaps(cache, 1, 1);
852 if (ret)
853 return ret;
854
855 if (cache->free_space_ctl->free_space != 8192) {
856 test_msg("Cache free space is not 8Kb\n");
857 return -EINVAL;
858 }
859
860 offset = btrfs_find_space_for_alloc(cache,
861 0, 8192, 0,
862 &max_extent_size);
863 if (offset != (32 * 1024 * 1024)) {
864 test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
865 offset);
866 return -EINVAL;
867 }
868
869 ret = check_cache_empty(cache);
870 if (ret)
871 return ret;
872
873 cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
874 __btrfs_remove_free_space_cache(cache->free_space_ctl);
875
876 return 0;
877}
878
367int btrfs_test_free_space_cache(void) 879int btrfs_test_free_space_cache(void)
368{ 880{
369 struct btrfs_block_group_cache *cache; 881 struct btrfs_block_group_cache *cache;
@@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void)
386 ret = test_bitmaps_and_extents(cache); 898 ret = test_bitmaps_and_extents(cache);
387 if (ret) 899 if (ret)
388 goto out; 900 goto out;
901
902 ret = test_steal_space_from_bitmap_to_extent(cache);
389out: 903out:
390 __btrfs_remove_free_space_cache(cache->free_space_ctl); 904 __btrfs_remove_free_space_cache(cache->free_space_ctl);
391 kfree(cache->free_space_ctl); 905 kfree(cache->free_space_ctl);
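
The new test gets bitmaps to kick in early by swapping the ctl's use_bitmap operation for the duration of the run and restoring it before returning, so later tests see the default policy again. The same save-override-restore seam reduced to standalone C (struct ctl_ops and the thresholds are simplified stand-ins, not the real btrfs_free_space_op):

#include <stdbool.h>
#include <stdio.h>

struct ctl_ops {
	bool (*use_bitmap)(int free_extents);
};

static bool default_use_bitmap(int free_extents)
{
	return free_extents > 256;	/* stand-in for extents_thresh */
}

static bool test_use_bitmap(int free_extents)
{
	return free_extents > 0;	/* force bitmaps almost immediately */
}

int main(void)
{
	struct ctl_ops ops = { .use_bitmap = default_use_bitmap };
	bool (*saved)(int) = ops.use_bitmap;

	ops.use_bitmap = test_use_bitmap;
	printf("under override: %d\n", ops.use_bitmap(1));	/* prints 1 */
	ops.use_bitmap = saved;					/* restore */
	printf("default: %d\n", ops.use_bitmap(1));		/* prints 0 */
	return 0;
}
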
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d89c6d3542ca..dcaae3616728 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -386,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	int ret;

 	/* Send isn't supposed to start transactions. */
-	ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
+	ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);

 	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return ERR_PTR(-EROFS);
@@ -408,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	if (num_items > 0 && root != root->fs_info->chunk_root) {
 		if (root->fs_info->quota_enabled &&
 		    is_fstree(root->root_key.objectid)) {
-			qgroup_reserved = num_items * root->leafsize;
+			qgroup_reserved = num_items * root->nodesize;
 			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
 			if (ret)
 				return ERR_PTR(ret);
@@ -418,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
 	/*
 	 * Do the reservation for the relocation root creation
 	 */
-	if (unlikely(need_reserve_reloc_root(root))) {
+	if (need_reserve_reloc_root(root)) {
 		num_bytes += root->nodesize;
 		reloc_reserved = true;
 	}
@@ -609,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 	if (transid <= root->fs_info->last_trans_committed)
 		goto out;

-	ret = -EINVAL;
 	/* find specified transaction */
 	spin_lock(&root->fs_info->trans_lock);
 	list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -625,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 			}
 		}
 		spin_unlock(&root->fs_info->trans_lock);
-		/* The specified transaction doesn't exist */
-		if (!cur_trans)
+
+		/*
+		 * The specified transaction doesn't exist, or we
+		 * raced with btrfs_commit_transaction
+		 */
+		if (!cur_trans) {
+			if (transid > root->fs_info->last_trans_committed)
+				ret = -EINVAL;
 			goto out;
+		}
 	} else {
 		/* find newest transaction that is committing | committed */
 		spin_lock(&root->fs_info->trans_lock);
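Aside: the btrfs_wait_for_commit() hunk above turns a hard -EINVAL into a race check - a transid missing from the running list is only an error when it is newer than the last committed transaction. A minimal runnable model of that decision (plain C, illustrative names only, not kernel code):

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* lc_before/lc_after are snapshots of last_trans_committed taken at
     * the early-exit check and after the (unsuccessful) list search. */
    static int wait_for_commit(unsigned long long transid,
                               unsigned long long lc_before,
                               unsigned long long lc_after,
                               bool found_running)
    {
        if (transid <= lc_before)
            return 0;                                 /* already committed */
        if (!found_running)
            return transid > lc_after ? -EINVAL : 0;  /* race vs. bogus id */
        return 0;                                     /* kernel blocks here */
    }

    int main(void)
    {
        /* transid 10 committed between the check and the search: success */
        printf("%d\n", wait_for_commit(10, 9, 10, false));
        /* a transid from the future stays an error */
        printf("%d\n", wait_for_commit(99, 9, 10, false));
        return 0;
    }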
@@ -851,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
+	struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
+	bool errors = false;

 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
 				      EXTENT_NEED_WAIT, &cached_state)) {
@@ -864,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 	}
 	if (err)
 		werr = err;
+
+	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		if ((mark & EXTENT_DIRTY) &&
+		    test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+
+		if ((mark & EXTENT_NEW) &&
+		    test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+	} else {
+		if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
+				       &btree_ino->runtime_flags))
+			errors = true;
+	}
+
+	if (errors && !werr)
+		werr = -EIO;
+
 	return werr;
 }

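Aside: the new tail of btrfs_wait_marked_extents() folds write errors recorded on the btree inode back into the return value - log trees consult a per-log-half bit chosen by the extent mark, everything else a single bit. A small runnable model of that mapping (flag values are arbitrary placeholders, not the kernel's):

    #include <stdbool.h>
    #include <stdio.h>

    enum { EXTENT_DIRTY = 1 << 0, EXTENT_NEW = 1 << 1 };
    enum { BTREE_ERR = 1 << 0, BTREE_LOG1_ERR = 1 << 1, BTREE_LOG2_ERR = 1 << 2 };
    #define EIO 5

    static bool test_and_clear(unsigned *flags, unsigned bit)
    {
        bool was_set = *flags & bit;
        *flags &= ~bit;
        return was_set;
    }

    static int fold_errors(bool log_tree, unsigned mark, unsigned *flags, int werr)
    {
        bool errors = false;

        if (log_tree) {
            if ((mark & EXTENT_DIRTY) && test_and_clear(flags, BTREE_LOG1_ERR))
                errors = true;
            if ((mark & EXTENT_NEW) && test_and_clear(flags, BTREE_LOG2_ERR))
                errors = true;
        } else if (test_and_clear(flags, BTREE_ERR)) {
            errors = true;
        }
        return (errors && !werr) ? -EIO : werr;
    }

    int main(void)
    {
        unsigned flags = BTREE_LOG1_ERR;
        /* log tree, first log half dirty: the error bit is consumed */
        printf("%d\n", fold_errors(true, EXTENT_DIRTY, &flags, 0));  /* -5 */
        printf("%d\n", fold_errors(true, EXTENT_DIRTY, &flags, 0));  /* 0 */
        return 0;
    }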
@@ -1629,6 +1657,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_transaction *prev_trans = NULL;
+	struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
 	int ret;

 	/* Stop the commit early if ->aborted is set */
@@ -1868,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
 	       sizeof(*root->fs_info->super_copy));

+	btrfs_update_commit_device_size(root->fs_info);
+	btrfs_update_commit_device_bytes_used(root, cur_trans);
+
+	clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+	clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+
 	spin_lock(&root->fs_info->trans_lock);
 	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	root->fs_info->running_transaction = NULL;
@@ -1981,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 		ret = btrfs_drop_snapshot(root, NULL, 0, 0);
 	else
 		ret = btrfs_drop_snapshot(root, NULL, 1, 0);
-	/*
-	 * If we encounter a transaction abort during snapshot cleaning, we
-	 * don't want to crash here
-	 */
+
 	return (ret < 0) ? 0 : 1;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 579be51b27e5..d8f40e1a5d2d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -79,7 +79,7 @@ struct btrfs_transaction {
 #define TRANS_EXTWRITERS	(__TRANS_USERSPACE | __TRANS_START | \
				 __TRANS_ATTACH)

-#define BTRFS_SEND_TRANS_STUB	1
+#define BTRFS_SEND_TRANS_STUB	((void *)1)

 struct btrfs_trans_handle {
 	u64 transid;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9e1f2cd5e67a..1475979e5718 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,11 @@
 #define LOG_WALK_REPLAY_ALL	3

 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct inode *inode,
-			    int inode_only);
+			    int inode_only,
+			    const loff_t start,
+			    const loff_t end,
+			    struct btrfs_log_ctx *ctx);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct btrfs_path *path, u64 objectid);
@@ -1496,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 		return -EIO;

 	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
-	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
 	key.offset = objectid;

 	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1635,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 	    found_key.type == log_key.type &&
 	    found_key.offset == log_key.offset &&
 	    btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
+		update_size = false;
 		goto out;
 	}

@@ -2155,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,

 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
-		blocksize = btrfs_level_size(root, *level - 1);
+		blocksize = root->nodesize;

 		parent = path->nodes[*level];
 		root_owner = btrfs_header_owner(parent);
@@ -2981,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
 	min_key.type = key_type;
 	min_key.offset = min_offset;

-	path->keep_locks = 1;
-
 	ret = btrfs_search_forward(root, &min_key, path, trans->transid);

 	/*
@@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	struct list_head ordered_sums;
 	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 	bool has_extents = false;
-	bool need_find_last_extent = (*last_extent == 0);
+	bool need_find_last_extent = true;
 	bool done = false;

 	INIT_LIST_HEAD(&ordered_sums);
@@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 		 */
 		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
 			has_extents = true;
-			if (need_find_last_extent &&
-			    first_key.objectid == (u64)-1)
+			if (first_key.objectid == (u64)-1)
 				first_key = ins_keys[i];
 		} else {
 			need_find_last_extent = false;
@@ -3363,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 		 * or deletes of this inode don't have to relog the inode
 		 * again
 		 */
-		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+		if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
 		    !skip_csum) {
 			int found_type;
 			extent = btrfs_item_ptr(src, start_slot + i,
@@ -3427,6 +3428,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 	if (!has_extents)
 		return ret;

+	if (need_find_last_extent && *last_extent == first_key.offset) {
+		/*
+		 * We don't have any leafs between our current one and the one
+		 * we processed before that can have file extent items for our
+		 * inode (and have a generation number smaller than our current
+		 * transaction id).
+		 */
+		need_find_last_extent = false;
+	}
+
 	/*
	 * Because we use btrfs_search_forward we could skip leaves that were
	 * not modified and then assume *last_extent is valid when it really
@@ -3537,7 +3548,7 @@ fill_holes:
					       0, 0);
 			if (ret)
 				break;
-			*last_extent = offset + len;
+			*last_extent = extent_end;
 		}
 		/*
		 * Need to let the callers know we dropped the path so they should
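Aside: the `*last_extent = extent_end` fix above matters when filling holes between logged extent items - the cursor must advance past the real extent that follows the hole, not just past the hole itself, or the next iteration would compute a bogus overlapping hole. A runnable toy version of that loop (offsets invented for illustration):

    #include <stdio.h>

    struct ext { unsigned long long off, len; };

    int main(void)
    {
        /* three file extents with holes between them */
        struct ext exts[] = { { 0, 4096 }, { 8192, 4096 }, { 20480, 4096 } };
        unsigned long long last_extent = 0;

        for (int i = 0; i < 3; i++) {
            if (exts[i].off > last_extent)
                printf("log hole [%llu, %llu)\n", last_extent, exts[i].off);
            /* the fix: advance to the end of the real extent */
            last_extent = exts[i].off + exts[i].len;
        }
        return 0;
    }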
@@ -3562,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
 	return 0;
 }

-static int log_one_extent(struct btrfs_trans_handle *trans,
-			  struct inode *inode, struct btrfs_root *root,
-			  struct extent_map *em, struct btrfs_path *path,
-			  struct list_head *logged_list)
+static int wait_ordered_extents(struct btrfs_trans_handle *trans,
+				struct inode *inode,
+				struct btrfs_root *root,
+				const struct extent_map *em,
+				const struct list_head *logged_list,
+				bool *ordered_io_error)
 {
-	struct btrfs_root *log = root->log_root;
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *leaf;
 	struct btrfs_ordered_extent *ordered;
-	struct list_head ordered_sums;
-	struct btrfs_map_token token;
-	struct btrfs_key key;
+	struct btrfs_root *log = root->log_root;
 	u64 mod_start = em->mod_start;
 	u64 mod_len = em->mod_len;
+	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 	u64 csum_offset;
 	u64 csum_len;
-	u64 extent_offset = em->start - em->orig_start;
-	u64 block_len;
-	int ret;
-	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-	int extent_inserted = 0;
-
-	INIT_LIST_HEAD(&ordered_sums);
-	btrfs_init_map_token(&token);
-
-	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
-				   em->start + em->len, NULL, 0, 1,
-				   sizeof(*fi), &extent_inserted);
-	if (ret)
-		return ret;
-
-	if (!extent_inserted) {
-		key.objectid = btrfs_ino(inode);
-		key.type = BTRFS_EXTENT_DATA_KEY;
-		key.offset = em->start;
-
-		ret = btrfs_insert_empty_item(trans, log, path, &key,
-					      sizeof(*fi));
-		if (ret)
-			return ret;
-	}
-	leaf = path->nodes[0];
-	fi = btrfs_item_ptr(leaf, path->slots[0],
-			    struct btrfs_file_extent_item);
-
-	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
-					       &token);
-	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
-		skip_csum = true;
-		btrfs_set_token_file_extent_type(leaf, fi,
-						 BTRFS_FILE_EXTENT_PREALLOC,
-						 &token);
-	} else {
-		btrfs_set_token_file_extent_type(leaf, fi,
-						 BTRFS_FILE_EXTENT_REG,
-						 &token);
-		if (em->block_start == EXTENT_MAP_HOLE)
-			skip_csum = true;
-	}
-
-	block_len = max(em->block_len, em->orig_block_len);
-	if (em->compress_type != BTRFS_COMPRESS_NONE) {
-		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
-							em->block_start,
-							&token);
-		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
-							   &token);
-	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
-		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
-							em->block_start -
-							extent_offset, &token);
-		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
-							   &token);
-	} else {
-		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
-		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
-							   &token);
-	}
+	LIST_HEAD(ordered_sums);
+	int ret = 0;

-	btrfs_set_token_file_extent_offset(leaf, fi,
-					   em->start - em->orig_start,
-					   &token);
-	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
-	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
-	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
-						&token);
-	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
-	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
-	btrfs_mark_buffer_dirty(leaf);
+	*ordered_io_error = false;

-	btrfs_release_path(path);
-	if (ret) {
-		return ret;
-	}
-
-	if (skip_csum)
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+	    em->block_start == EXTENT_MAP_HOLE)
 		return 0;

 	/*
-	 * First check and see if our csums are on our outstanding ordered
-	 * extents.
+	 * Wait for any ordered extent that covers our extent map. If it
+	 * finishes without an error, first check and see if our csums are on
+	 * our outstanding ordered extents.
	 */
 	list_for_each_entry(ordered, logged_list, log_list) {
 		struct btrfs_ordered_sum *sum;
@@ -3674,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 		    mod_start + mod_len <= ordered->file_offset)
 			continue;

+		if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
+		    !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+			const u64 start = ordered->file_offset;
+			const u64 end = ordered->file_offset + ordered->len - 1;
+
+			WARN_ON(ordered->inode != inode);
+			filemap_fdatawrite_range(inode->i_mapping, start, end);
+		}
+
+		wait_event(ordered->wait,
+			   (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
+			    test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
+
+		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
+			*ordered_io_error = true;
+			break;
+		}
 		/*
		 * We are going to copy all the csums on this ordered extent, so
		 * go ahead and adjust mod_start and mod_len in case this
@@ -3705,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 			}
 		}

+		if (skip_csum)
+			continue;
+
 		/*
		 * To keep us from looping for the above case of an ordered
		 * extent that falls inside of the logged extent.
@@ -3722,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
 		list_for_each_entry(sum, &ordered->list, list) {
 			ret = btrfs_csum_file_blocks(trans, log, sum);
 			if (ret)
-				goto unlocked;
+				break;
 		}
-
 	}
-unlocked:

-	if (!mod_len || ret)
+	if (*ordered_io_error || !mod_len || ret || skip_csum)
 		return ret;

 	if (em->compress_type) {
 		csum_offset = 0;
-		csum_len = block_len;
+		csum_len = max(em->block_len, em->orig_block_len);
 	} else {
 		csum_offset = mod_start - em->start;
 		csum_len = mod_len;
@@ -3760,11 +3716,106 @@ unlocked:
 	return ret;
 }

+static int log_one_extent(struct btrfs_trans_handle *trans,
+			  struct inode *inode, struct btrfs_root *root,
+			  const struct extent_map *em,
+			  struct btrfs_path *path,
+			  const struct list_head *logged_list,
+			  struct btrfs_log_ctx *ctx)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	struct btrfs_map_token token;
+	struct btrfs_key key;
+	u64 extent_offset = em->start - em->orig_start;
+	u64 block_len;
+	int ret;
+	int extent_inserted = 0;
+	bool ordered_io_err = false;
+
+	ret = wait_ordered_extents(trans, inode, root, em, logged_list,
+				   &ordered_io_err);
+	if (ret)
+		return ret;
+
+	if (ordered_io_err) {
+		ctx->io_err = -EIO;
+		return 0;
+	}
+
+	btrfs_init_map_token(&token);
+
+	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+				   em->start + em->len, NULL, 0, 1,
+				   sizeof(*fi), &extent_inserted);
+	if (ret)
+		return ret;
+
+	if (!extent_inserted) {
+		key.objectid = btrfs_ino(inode);
+		key.type = BTRFS_EXTENT_DATA_KEY;
+		key.offset = em->start;
+
+		ret = btrfs_insert_empty_item(trans, log, path, &key,
+					      sizeof(*fi));
+		if (ret)
+			return ret;
+	}
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+
+	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+					       &token);
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		btrfs_set_token_file_extent_type(leaf, fi,
+						 BTRFS_FILE_EXTENT_PREALLOC,
+						 &token);
+	else
+		btrfs_set_token_file_extent_type(leaf, fi,
+						 BTRFS_FILE_EXTENT_REG,
+						 &token);
+
+	block_len = max(em->block_len, em->orig_block_len);
+	if (em->compress_type != BTRFS_COMPRESS_NONE) {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+							em->block_start,
+							&token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+							   &token);
+	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+							em->block_start -
+							extent_offset, &token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+							   &token);
+	} else {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+							   &token);
+	}
+
+	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
+	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+						&token);
+	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct inode *inode,
				     struct btrfs_path *path,
-				     struct list_head *logged_list)
+				     struct list_head *logged_list,
+				     struct btrfs_log_ctx *ctx)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
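Aside: the new wait_ordered_extents() helper makes sure every ordered extent overlapping the logged range has finished its IO before the file extent item is logged, kicking writeback first when nothing has started it; an IO error is reported through *ordered_io_error rather than the return value. A compact runnable model of the per-extent decision (flag bits invented for the sketch):

    #include <stdbool.h>
    #include <stdio.h>

    enum { ORD_IO_DONE = 1, ORD_IOERR = 2, ORD_DIRECT = 4 };

    /* Returns true when the caller would have to kick writeback before
     * sleeping on the ordered extent's wait queue. */
    static bool process_ordered(unsigned flags, bool *ordered_io_error)
    {
        bool must_kick = !(flags & (ORD_IO_DONE | ORD_IOERR | ORD_DIRECT));

        /* the kernel sleeps here until ORD_IO_DONE or ORD_IOERR is set */
        if (flags & ORD_IOERR)
            *ordered_io_error = true;
        return must_kick;
    }

    int main(void)
    {
        bool err = false;

        printf("kick=%d\n", process_ordered(0, &err));                    /* 1 */
        printf("kick=%d err=%d\n", process_ordered(ORD_IOERR, &err), err); /* 0 1 */
        return 0;
    }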
@@ -3822,7 +3873,8 @@ process:

 		write_unlock(&tree->lock);

-		ret = log_one_extent(trans, inode, root, em, path, logged_list);
+		ret = log_one_extent(trans, inode, root, em, path, logged_list,
+				     ctx);
 		write_lock(&tree->lock);
 		clear_em_logging(tree, em);
 		free_extent_map(em);
@@ -3849,8 +3901,11 @@ process:
 * This handles both files and directories.
 */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct inode *inode,
-			    int inode_only)
+			    int inode_only,
+			    const loff_t start,
+			    const loff_t end,
+			    struct btrfs_log_ctx *ctx)
 {
 	struct btrfs_path *path;
 	struct btrfs_path *dst_path;
@@ -3867,6 +3922,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	int ins_nr;
 	bool fast_search = false;
 	u64 ino = btrfs_ino(inode);
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;

 	path = btrfs_alloc_path();
 	if (!path)
@@ -3950,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		err = ret;
 		goto out_unlock;
 	}
-	path->keep_locks = 1;

 	while (1) {
 		ins_nr = 0;
@@ -3980,7 +4035,8 @@ again:
 		if (ret < 0) {
 			err = ret;
 			goto out_unlock;
-		} if (ret) {
+		}
+		if (ret) {
 			ins_nr = 0;
 			btrfs_release_path(path);
 			continue;
@@ -4034,19 +4090,41 @@ log_extents:
 	btrfs_release_path(dst_path);
 	if (fast_search) {
 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-						&logged_list);
+						&logged_list, ctx);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
 		}
 	} else if (inode_only == LOG_INODE_ALL) {
-		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
 		struct extent_map *em, *n;

-		write_lock(&tree->lock);
-		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
-			list_del_init(&em->list);
-		write_unlock(&tree->lock);
+		write_lock(&em_tree->lock);
+		/*
+		 * We can't just remove every em if we're called for a ranged
+		 * fsync - that is, one that doesn't cover the whole possible
+		 * file range (0 to LLONG_MAX). This is because we can have
+		 * em's that fall outside the range we're logging and therefore
+		 * their ordered operations haven't completed yet
+		 * (btrfs_finish_ordered_io() not invoked yet). This means we
+		 * didn't get their respective file extent item in the fs/subvol
+		 * tree yet, and need to let the next fast fsync (one which
+		 * consults the list of modified extent maps) find the em so
+		 * that it logs a matching file extent item and waits for the
+		 * respective ordered operation to complete (if it's still
+		 * running).
+		 *
+		 * Removing every em outside the range we're logging would make
+		 * the next fast fsync not log their matching file extent items,
+		 * therefore making us lose data after a log replay.
+		 */
+		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+					 list) {
+			const u64 mod_end = em->mod_start + em->mod_len - 1;
+
+			if (em->mod_start >= start && mod_end <= end)
+				list_del_init(&em->list);
+		}
+		write_unlock(&em_tree->lock);
 	}

 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4056,6 +4134,7 @@ log_extents:
 			goto out_unlock;
 		}
 	}
+
 	BTRFS_I(inode)->logged_trans = trans->transid;
 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
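Aside: the long comment in the hunk above is the heart of the ranged-fsync fix - extent maps outside the synced range must stay on the modified list so a later fast fsync still logs them. A runnable toy version of the pruning rule (sizes invented):

    #include <stdio.h>

    struct em { unsigned long long mod_start, mod_len; int listed; };

    static void prune(struct em *ems, int n,
                      unsigned long long start, unsigned long long end)
    {
        for (int i = 0; i < n; i++) {
            unsigned long long mod_end = ems[i].mod_start + ems[i].mod_len - 1;

            /* only drop an em whose whole range this fsync covered */
            if (ems[i].mod_start >= start && mod_end <= end)
                ems[i].listed = 0;   /* list_del_init() in the kernel */
        }
    }

    int main(void)
    {
        struct em ems[] = { { 0, 4096, 1 }, { 1 << 20, 4096, 1 } };

        prune(ems, 2, 0, 65535);     /* ranged fsync of the first 64Kb */
        printf("%d %d\n", ems[0].listed, ems[1].listed);  /* 0 1 */
        return 0;
    }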
@@ -4152,7 +4231,10 @@ out:
 */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root, struct inode *inode,
-				  struct dentry *parent, int exists_only,
+				  struct dentry *parent,
+				  const loff_t start,
+				  const loff_t end,
+				  int exists_only,
				  struct btrfs_log_ctx *ctx)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4198,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto end_no_trans;

-	ret = btrfs_log_inode(trans, root, inode, inode_only);
+	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
 	if (ret)
 		goto end_trans;

@@ -4226,7 +4308,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,

 		if (BTRFS_I(inode)->generation >
 		    root->fs_info->last_trans_committed) {
-			ret = btrfs_log_inode(trans, root, inode, inode_only);
+			ret = btrfs_log_inode(trans, root, inode, inode_only,
+					      0, LLONG_MAX, ctx);
 			if (ret)
 				goto end_trans;
 		}
@@ -4260,13 +4343,15 @@ end_no_trans:
 */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
			  struct btrfs_log_ctx *ctx)
 {
 	struct dentry *parent = dget_parent(dentry);
 	int ret;

 	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
-				     0, ctx);
+				     start, end, 0, ctx);
 	dput(parent);

 	return ret;
@@ -4316,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
again:
 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	key.offset = (u64)-1;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.type = BTRFS_ROOT_ITEM_KEY;

 	while (1) {
 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
@@ -4503,6 +4588,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
		     root->fs_info->last_trans_committed))
 		return 0;

-	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
+	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+				      LLONG_MAX, 1, NULL);
 }

diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41bd5373..154990c26dcb 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -28,6 +28,7 @@
 struct btrfs_log_ctx {
 	int log_ret;
 	int log_transid;
+	int io_err;
 	struct list_head list;
 };

@@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
 {
 	ctx->log_ret = 0;
 	ctx->log_transid = 0;
+	ctx->io_err = 0;
 	INIT_LIST_HEAD(&ctx->list);
 }

@@ -59,6 +61,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
			  struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
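Aside: the io_err field added to btrfs_log_ctx gives the logging code a side channel for ordered-extent write errors - as seen in log_one_extent() above, -EIO is parked there while the call still returns success, so the fsync caller can surface the error after the log sync finishes. A tiny runnable model of that flow (struct and names local to the sketch):

    #include <stdio.h>

    #define EIO 5

    struct log_ctx { int log_ret; int log_transid; int io_err; };

    /* Models log_one_extent(): an ordered-extent IO error does not fail
     * the logging call itself, it is recorded for the caller. */
    static int log_extent(struct log_ctx *ctx, int ordered_io_error)
    {
        if (ordered_io_error) {
            ctx->io_err = -EIO;
            return 0;
        }
        return 0;   /* the real function writes the extent item here */
    }

    int main(void)
    {
        struct log_ctx ctx = { 0, 0, 0 };
        int ret = log_extent(&ctx, 1);

        printf("ret=%d io_err=%d\n", ret, ctx.io_err);  /* ret=0 io_err=-5 */
        return 0;
    }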
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index f6a4c03ee7d8..778282944530 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
 	key.offset = 0;

again_search_slot:
-	path->keep_locks = 1;
 	ret = btrfs_search_forward(root, &key, path, 0);
 	if (ret) {
 		if (ret > 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6cb82f62cb7c..d47289c715c8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

-static DEFINE_MUTEX(uuid_mutex);
+DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);

 static void lock_chunks(struct btrfs_root *root)
@@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
 	mutex_init(&fs_devs->device_list_mutex);

 	INIT_LIST_HEAD(&fs_devs->devices);
+	INIT_LIST_HEAD(&fs_devs->resized_devices);
 	INIT_LIST_HEAD(&fs_devs->alloc_list);
 	INIT_LIST_HEAD(&fs_devs->list);

@@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void)

 	INIT_LIST_HEAD(&dev->dev_list);
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
+	INIT_LIST_HEAD(&dev->resized_list);

 	spin_lock_init(&dev->io_lock);

 	spin_lock_init(&dev->reada_lock);
 	atomic_set(&dev->reada_in_flight, 0);
+	atomic_set(&dev->dev_stats_ccnt, 0);
 	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
 	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);

@@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path,
			return PTR_ERR(fs_devices);

 		list_add(&fs_devices->list, &fs_uuids);
-		fs_devices->latest_devid = devid;
-		fs_devices->latest_trans = found_transid;

 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
 	}
+
 	if (!device) {
 		if (fs_devices->opened)
 			return -EBUSY;
@@ -508,6 +510,43 @@ static noinline int device_list_add(const char *path,
 		ret = 1;
 		device->fs_devices = fs_devices;
 	} else if (!device->name || strcmp(device->name->str, path)) {
+		/*
+		 * When FS is already mounted.
+		 * 1. If you are here and if the device->name is NULL that
+		 *    means this device was missing at time of FS mount.
+		 * 2. If you are here and if the device->name is different
+		 *    from 'path' that means either
+		 *	a. The same device disappeared and reappeared with
+		 *	   different name. or
+		 *	b. The missing-disk-which-was-replaced, has
+		 *	   reappeared now.
+		 *
+		 * We must allow 1 and 2a above. But 2b would be spurious
+		 * and unintentional.
+		 *
+		 * Further in case of 1 and 2a above, the disk at 'path'
+		 * would have missed some transaction when it was away and
+		 * in case of 2a the stale bdev has to be updated as well.
+		 * 2b must not be allowed at any time.
+		 */
+
+		/*
+		 * For now, we do allow update to btrfs_fs_device through the
+		 * btrfs dev scan cli after FS has been mounted. We're still
+		 * tracking a problem where systems fail mount by subvolume id
+		 * when we reject replacement on a mounted FS.
+		 */
+		if (!fs_devices->opened && found_transid < device->generation) {
+			/*
+			 * That is if the FS is _not_ mounted and if you
+			 * are here, that means there is more than one
+			 * disk with the same uuid and devid. We keep the one
+			 * with the larger generation number or the last-in if
+			 * generations are equal.
+			 */
+			return -EEXIST;
+		}
+
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name)
 			return -ENOMEM;
@@ -519,10 +558,15 @@ static noinline int device_list_add(const char *path,
 		}
 	}

-	if (found_transid > fs_devices->latest_trans) {
-		fs_devices->latest_devid = devid;
-		fs_devices->latest_trans = found_transid;
-	}
+	/*
+	 * Unmount does not free the btrfs_device struct but would zero
+	 * generation along with most of the other members. So just update
+	 * it back. We need it to pick the disk with largest generation
+	 * (as above).
+	 */
+	if (!fs_devices->opened)
+		device->generation = found_transid;
+
 	*fs_devices_ret = fs_devices;

 	return ret;
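Aside: the duplicate-devid logic above reduces to "on an unmounted filesystem, the copy with the newer superblock generation wins (last-in on a tie), and the stale copy is rejected with -EEXIST". A runnable model of that decision (plain C, names local to the sketch):

    #include <stdio.h>

    #define EEXIST 17

    static int scan_duplicate(int fs_opened,
                              unsigned long long found_transid,
                              unsigned long long *device_generation)
    {
        if (!fs_opened && found_transid < *device_generation)
            return -EEXIST;                          /* stale copy, reject */
        if (!fs_opened)
            *device_generation = found_transid;      /* newer or equal wins */
        return 0;
    }

    int main(void)
    {
        unsigned long long gen = 100;

        printf("%d\n", scan_duplicate(0, 90, &gen));   /* -17: stale disk */
        printf("%d\n", scan_duplicate(0, 120, &gen));  /* 0: replaces */
        printf("%llu\n", gen);                         /* 120 */
        return 0;
    }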
@@ -538,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	if (IS_ERR(fs_devices))
 		return fs_devices;

-	fs_devices->latest_devid = orig->latest_devid;
-	fs_devices->latest_trans = orig->latest_trans;
+	mutex_lock(&orig->device_list_mutex);
 	fs_devices->total_devices = orig->total_devices;

 	/* We have held the volume lock, it is safe to get the devices. */
@@ -568,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
+	mutex_unlock(&orig->device_list_mutex);
 	return fs_devices;
error:
+	mutex_unlock(&orig->device_list_mutex);
 	free_fs_devices(fs_devices);
 	return ERR_PTR(-ENOMEM);
 }
@@ -578,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *fs_devices, int step)
 {
 	struct btrfs_device *device, *next;
-
-	struct block_device *latest_bdev = NULL;
-	u64 latest_devid = 0;
-	u64 latest_transid = 0;
+	struct btrfs_device *latest_dev = NULL;

 	mutex_lock(&uuid_mutex);
again:
@@ -589,11 +631,9 @@ again:
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->in_fs_metadata) {
 			if (!device->is_tgtdev_for_dev_replace &&
-			    (!latest_transid ||
-			     device->generation > latest_transid)) {
-				latest_devid = device->devid;
-				latest_transid = device->generation;
-				latest_bdev = device->bdev;
+			    (!latest_dev ||
+			     device->generation > latest_dev->generation)) {
+				latest_dev = device;
 			}
 			continue;
 		}
@@ -635,9 +675,7 @@ again:
 		goto again;
 	}

-	fs_devices->latest_bdev = latest_bdev;
-	fs_devices->latest_devid = latest_devid;
-	fs_devices->latest_trans = latest_transid;
+	fs_devices->latest_bdev = latest_dev->bdev;

 	mutex_unlock(&uuid_mutex);
 }
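Aside: these hunks collapse the latest_bdev/latest_devid/latest_trans triple into a single latest_dev pointer, so picking the newest device becomes a one-pass max-by-generation scan. A runnable model (plain structs, not the kernel types):

    #include <stdio.h>

    struct dev { unsigned long long devid, generation; };

    static struct dev *pick_latest(struct dev *devs, int n)
    {
        struct dev *latest = NULL;

        for (int i = 0; i < n; i++)
            if (!latest || devs[i].generation > latest->generation)
                latest = &devs[i];
        return latest;
    }

    int main(void)
    {
        struct dev devs[] = { { 1, 40 }, { 2, 42 }, { 3, 41 } };

        printf("latest devid=%llu\n", pick_latest(devs, 3)->devid);  /* 2 */
        return 0;
    }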
@@ -686,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 			fs_devices->rw_devices--;
 		}

-		if (device->can_discard)
-			fs_devices->num_can_discard--;
 		if (device->missing)
 			fs_devices->missing_devices--;

@@ -752,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
 	struct btrfs_device *device;
-	struct block_device *latest_bdev = NULL;
+	struct btrfs_device *latest_dev = NULL;
 	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
-	u64 latest_devid = 0;
-	u64 latest_transid = 0;
 	u64 devid;
 	int seeding = 1;
 	int ret = 0;
@@ -784,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
			goto error_brelse;

 		device->generation = btrfs_super_generation(disk_super);
-		if (!latest_transid || device->generation > latest_transid) {
-			latest_devid = devid;
-			latest_transid = device->generation;
-			latest_bdev = bdev;
-		}
+		if (!latest_dev ||
+		    device->generation > latest_dev->generation)
+			latest_dev = device;

 		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 			device->writeable = 0;
@@ -798,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		}

 		q = bdev_get_queue(bdev);
-		if (blk_queue_discard(q)) {
+		if (blk_queue_discard(q))
 			device->can_discard = 1;
-			fs_devices->num_can_discard++;
-		}

 		device->bdev = bdev;
 		device->in_fs_metadata = 0;
@@ -831,9 +861,7 @@ error_brelse:
 	}
 	fs_devices->seeding = seeding;
 	fs_devices->opened = 1;
-	fs_devices->latest_bdev = latest_bdev;
-	fs_devices->latest_devid = latest_devid;
-	fs_devices->latest_trans = latest_transid;
+	fs_devices->latest_bdev = latest_dev->bdev;
 	fs_devices->total_rw_bytes = 0;
out:
 	return ret;
@@ -1007,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 		if (key.objectid > device->devid)
 			break;

-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+		if (key.type != BTRFS_DEV_EXTENT_KEY)
 			goto next;

 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1159,7 +1187,7 @@ again:
 		if (key.objectid > device->devid)
 			break;

-		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
+		if (key.type != BTRFS_DEV_EXTENT_KEY)
 			goto next;

 		if (key.offset > search_start) {
@@ -1238,7 +1266,7 @@ out:

 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_device *device,
-				 u64 start)
+				 u64 start, u64 *dev_extent_len)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1280,13 +1308,8 @@ again:
 		goto out;
 	}

-	if (device->bytes_used > 0) {
-		u64 len = btrfs_dev_extent_length(leaf, extent);
-		device->bytes_used -= len;
-		spin_lock(&root->fs_info->free_chunk_lock);
-		root->fs_info->free_chunk_space += len;
-		spin_unlock(&root->fs_info->free_chunk_lock);
-	}
+	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
+
 	ret = btrfs_del_item(trans, root, path);
 	if (ret) {
 		btrfs_error(root->fs_info, ret,
@@ -1436,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
-	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
-	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+	btrfs_set_device_total_bytes(leaf, dev_item,
+				     btrfs_device_get_disk_total_bytes(device));
+	btrfs_set_device_bytes_used(leaf, dev_item,
+				    btrfs_device_get_bytes_used(device));
 	btrfs_set_device_group(leaf, dev_item, 0);
 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
@@ -1493,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = device->devid;
-	lock_chunks(root);

 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
@@ -1509,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
 		goto out;
out:
 	btrfs_free_path(path);
-	unlock_chunks(root);
 	btrfs_commit_transaction(trans, root);
 	return ret;
 }
@@ -1625,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (device->writeable) {
 		lock_chunks(root);
 		list_del_init(&device->dev_alloc_list);
+		device->fs_devices->rw_devices--;
 		unlock_chunks(root);
-		root->fs_info->fs_devices->rw_devices--;
 		clear_super = true;
 	}

@@ -1645,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto error_undo;

-	spin_lock(&root->fs_info->free_chunk_lock);
-	root->fs_info->free_chunk_space = device->total_bytes -
-		device->bytes_used;
-	spin_unlock(&root->fs_info->free_chunk_lock);
-
 	device->in_fs_metadata = 0;
 	btrfs_scrub_cancel_dev(root->fs_info, device);

@@ -1671,7 +1689,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	device->fs_devices->total_devices--;

 	if (device->missing)
-		root->fs_info->fs_devices->missing_devices--;
+		device->fs_devices->missing_devices--;

 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
@@ -1703,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
			fs_devices = fs_devices->seed;
 		}
 		cur_devices->seed = NULL;
-		lock_chunks(root);
 		__btrfs_close_devices(cur_devices);
-		unlock_chunks(root);
 		free_fs_devices(cur_devices);
 	}

@@ -1778,8 +1794,8 @@ error_undo:
 		lock_chunks(root);
 		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
+		device->fs_devices->rw_devices++;
 		unlock_chunks(root);
-		root->fs_info->fs_devices->rw_devices++;
 	}
 	goto error_brelse;
 }
@@ -1787,25 +1803,57 @@ error_undo:
 void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
				 struct btrfs_device *srcdev)
 {
+	struct btrfs_fs_devices *fs_devices;
+
 	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));

+	/*
+	 * in case of fs with no seed, srcdev->fs_devices will point
+	 * to fs_devices of fs_info. However when the dev being replaced is
+	 * a seed dev it will point to the seed's local fs_devices. In short
+	 * srcdev will have its correct fs_devices in both the cases.
+	 */
+	fs_devices = srcdev->fs_devices;
+
 	list_del_rcu(&srcdev->dev_list);
 	list_del_rcu(&srcdev->dev_alloc_list);
-	fs_info->fs_devices->num_devices--;
-	if (srcdev->missing) {
-		fs_info->fs_devices->missing_devices--;
-		fs_info->fs_devices->rw_devices++;
-	}
-	if (srcdev->can_discard)
-		fs_info->fs_devices->num_can_discard--;
-	if (srcdev->bdev) {
-		fs_info->fs_devices->open_devices--;
+	fs_devices->num_devices--;
+	if (srcdev->missing)
+		fs_devices->missing_devices--;

-		/* zero out the old super */
+	if (srcdev->writeable) {
+		fs_devices->rw_devices--;
+		/* zero out the old super if it is writable */
 		btrfs_scratch_superblock(srcdev);
 	}

+	if (srcdev->bdev)
+		fs_devices->open_devices--;
+
 	call_rcu(&srcdev->rcu, free_device);
+
+	/*
+	 * unless fs_devices is seed fs, num_devices shouldn't go
+	 * zero
+	 */
+	BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
+
+	/* if there are no devices left we rather delete the fs_devices */
+	if (!fs_devices->num_devices) {
+		struct btrfs_fs_devices *tmp_fs_devices;
+
+		tmp_fs_devices = fs_info->fs_devices;
+		while (tmp_fs_devices) {
+			if (tmp_fs_devices->seed == fs_devices) {
+				tmp_fs_devices->seed = fs_devices->seed;
+				break;
+			}
+			tmp_fs_devices = tmp_fs_devices->seed;
+		}
+		fs_devices->seed = NULL;
+		__btrfs_close_devices(fs_devices);
+		free_fs_devices(fs_devices);
+	}
 }

 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
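Aside: when the replaced source device was the last member of a seed fs_devices, the new code splices that empty fs_devices out of the ->seed chain before closing and freeing it. A small runnable model of the unlink step (plain structs, not the kernel types):

    #include <stdio.h>

    struct fs_devices { int num_devices; struct fs_devices *seed; };

    static void drop_empty_seed(struct fs_devices *mounted,
                                struct fs_devices *victim)
    {
        if (victim->num_devices)
            return;                        /* still has members, keep it */
        for (struct fs_devices *cur = mounted; cur; cur = cur->seed) {
            if (cur->seed == victim) {
                cur->seed = victim->seed;  /* splice out of the chain */
                break;
            }
        }
        victim->seed = NULL;
        /* the kernel then calls __btrfs_close_devices() and
         * free_fs_devices() on the victim */
    }

    int main(void)
    {
        struct fs_devices seed2 = { 1, NULL };
        struct fs_devices seed1 = { 0, &seed2 };
        struct fs_devices mounted = { 2, &seed1 };

        drop_empty_seed(&mounted, &seed1);
        printf("%s\n", mounted.seed == &seed2 ? "spliced" : "kept");
        return 0;
    }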
@@ -1813,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1813{ 1861{
1814 struct btrfs_device *next_device; 1862 struct btrfs_device *next_device;
1815 1863
1864 mutex_lock(&uuid_mutex);
1816 WARN_ON(!tgtdev); 1865 WARN_ON(!tgtdev);
1817 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1866 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1818 if (tgtdev->bdev) { 1867 if (tgtdev->bdev) {
@@ -1820,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1820 fs_info->fs_devices->open_devices--; 1869 fs_info->fs_devices->open_devices--;
1821 } 1870 }
1822 fs_info->fs_devices->num_devices--; 1871 fs_info->fs_devices->num_devices--;
1823 if (tgtdev->can_discard)
1824 fs_info->fs_devices->num_can_discard++;
1825 1872
1826 next_device = list_entry(fs_info->fs_devices->devices.next, 1873 next_device = list_entry(fs_info->fs_devices->devices.next,
1827 struct btrfs_device, dev_list); 1874 struct btrfs_device, dev_list);
@@ -1834,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1834 call_rcu(&tgtdev->rcu, free_device); 1881 call_rcu(&tgtdev->rcu, free_device);
1835 1882
1836 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1883 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1884 mutex_unlock(&uuid_mutex);
1837} 1885}
1838 1886
1839static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1887static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -1932,15 +1980,18 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1932 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1980 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1933 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 1981 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1934 synchronize_rcu); 1982 synchronize_rcu);
1983 list_for_each_entry(device, &seed_devices->devices, dev_list)
1984 device->fs_devices = seed_devices;
1935 1985
1986 lock_chunks(root);
1936 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1987 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1937 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1988 unlock_chunks(root);
1938 device->fs_devices = seed_devices;
1939 }
1940 1989
1941 fs_devices->seeding = 0; 1990 fs_devices->seeding = 0;
1942 fs_devices->num_devices = 0; 1991 fs_devices->num_devices = 0;
1943 fs_devices->open_devices = 0; 1992 fs_devices->open_devices = 0;
1993 fs_devices->missing_devices = 0;
1994 fs_devices->rotating = 0;
1944 fs_devices->seed = seed_devices; 1995 fs_devices->seed = seed_devices;
1945 1996
1946 generate_random_uuid(fs_devices->fsid); 1997 generate_random_uuid(fs_devices->fsid);
@@ -2039,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2039 struct list_head *devices; 2090 struct list_head *devices;
2040 struct super_block *sb = root->fs_info->sb; 2091 struct super_block *sb = root->fs_info->sb;
2041 struct rcu_string *name; 2092 struct rcu_string *name;
2042 u64 total_bytes; 2093 u64 tmp;
2043 int seeding_dev = 0; 2094 int seeding_dev = 0;
2044 int ret = 0; 2095 int ret = 0;
2045 2096
@@ -2095,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2095 goto error; 2146 goto error;
2096 } 2147 }
2097 2148
2098 lock_chunks(root);
2099
2100 q = bdev_get_queue(bdev); 2149 q = bdev_get_queue(bdev);
2101 if (blk_queue_discard(q)) 2150 if (blk_queue_discard(q))
2102 device->can_discard = 1; 2151 device->can_discard = 1;
@@ -2107,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2107 device->sector_size = root->sectorsize; 2156 device->sector_size = root->sectorsize;
2108 device->total_bytes = i_size_read(bdev->bd_inode); 2157 device->total_bytes = i_size_read(bdev->bd_inode);
2109 device->disk_total_bytes = device->total_bytes; 2158 device->disk_total_bytes = device->total_bytes;
2159 device->commit_total_bytes = device->total_bytes;
2110 device->dev_root = root->fs_info->dev_root; 2160 device->dev_root = root->fs_info->dev_root;
2111 device->bdev = bdev; 2161 device->bdev = bdev;
2112 device->in_fs_metadata = 1; 2162 device->in_fs_metadata = 1;
@@ -2124,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2124 device->fs_devices = root->fs_info->fs_devices; 2174 device->fs_devices = root->fs_info->fs_devices;
2125 2175
2126 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2176 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2177 lock_chunks(root);
2127 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2178 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2128 list_add(&device->dev_alloc_list, 2179 list_add(&device->dev_alloc_list,
2129 &root->fs_info->fs_devices->alloc_list); 2180 &root->fs_info->fs_devices->alloc_list);
@@ -2131,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2131 root->fs_info->fs_devices->open_devices++; 2182 root->fs_info->fs_devices->open_devices++;
2132 root->fs_info->fs_devices->rw_devices++; 2183 root->fs_info->fs_devices->rw_devices++;
2133 root->fs_info->fs_devices->total_devices++; 2184 root->fs_info->fs_devices->total_devices++;
2134 if (device->can_discard)
2135 root->fs_info->fs_devices->num_can_discard++;
2136 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2185 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2137 2186
2138 spin_lock(&root->fs_info->free_chunk_lock); 2187 spin_lock(&root->fs_info->free_chunk_lock);
@@ -2142,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2142 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2191 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2143 root->fs_info->fs_devices->rotating = 1; 2192 root->fs_info->fs_devices->rotating = 1;
2144 2193
2145 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 2194 tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
2146 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2195 btrfs_set_super_total_bytes(root->fs_info->super_copy,
2147 total_bytes + device->total_bytes); 2196 tmp + device->total_bytes);
2148 2197
2149 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 2198 tmp = btrfs_super_num_devices(root->fs_info->super_copy);
2150 btrfs_set_super_num_devices(root->fs_info->super_copy, 2199 btrfs_set_super_num_devices(root->fs_info->super_copy,
2151 total_bytes + 1); 2200 tmp + 1);
2152 2201
2153 /* add sysfs device entry */ 2202 /* add sysfs device entry */
2154 btrfs_kobj_add_device(root->fs_info, device); 2203 btrfs_kobj_add_device(root->fs_info, device);
2155 2204
2205 /*
2206 * we've got more storage, clear any full flags on the space
2207 * infos
2208 */
2209 btrfs_clear_space_info_full(root->fs_info);
2210
2211 unlock_chunks(root);
2156 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2212 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2157 2213
2158 if (seeding_dev) { 2214 if (seeding_dev) {
2159 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2215 lock_chunks(root);
2160 ret = init_first_rw_device(trans, root, device); 2216 ret = init_first_rw_device(trans, root, device);
2217 unlock_chunks(root);
2161 if (ret) { 2218 if (ret) {
2162 btrfs_abort_transaction(trans, root, ret); 2219 btrfs_abort_transaction(trans, root, ret);
2163 goto error_trans; 2220 goto error_trans;
2164 } 2221 }
2222 }
2223
2224 ret = btrfs_add_device(trans, root, device);
2225 if (ret) {
2226 btrfs_abort_transaction(trans, root, ret);
2227 goto error_trans;
2228 }
2229
2230 if (seeding_dev) {
2231 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2232
2165 ret = btrfs_finish_sprout(trans, root); 2233 ret = btrfs_finish_sprout(trans, root);
2166 if (ret) { 2234 if (ret) {
2167 btrfs_abort_transaction(trans, root, ret); 2235 btrfs_abort_transaction(trans, root, ret);
@@ -2175,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2175 root->fs_info->fsid); 2243 root->fs_info->fsid);
2176 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2244 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2177 goto error_trans; 2245 goto error_trans;
2178 } else {
2179 ret = btrfs_add_device(trans, root, device);
2180 if (ret) {
2181 btrfs_abort_transaction(trans, root, ret);
2182 goto error_trans;
2183 }
2184 } 2246 }
2185 2247
2186 /*
2187 * we've got more storage, clear any full flags on the space
2188 * infos
2189 */
2190 btrfs_clear_space_info_full(root->fs_info);
2191
2192 unlock_chunks(root);
2193 root->fs_info->num_tolerated_disk_barrier_failures = 2248 root->fs_info->num_tolerated_disk_barrier_failures =
2194 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2249 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2195 ret = btrfs_commit_transaction(trans, root); 2250 ret = btrfs_commit_transaction(trans, root);
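The hunks above change two things at once: the chunk mutex no longer covers the whole of btrfs_init_new_device(), and btrfs_add_device() is now called for seeding and non-seeding filesystems alike, after the in-memory bookkeeping. The resulting lock nesting, sketched for orientation (illustrative, not a verbatim excerpt of the function):

    mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
    lock_chunks(root);      /* nests inside device_list_mutex */
    /* list_add_rcu(), super total_bytes/num_devices updates,
     * btrfs_clear_space_info_full() */
    unlock_chunks(root);
    mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

    if (seeding_dev) {      /* chunk mutex retaken only where needed */
            lock_chunks(root);
            ret = init_first_rw_device(trans, root, device);
            unlock_chunks(root);
    }
    ret = btrfs_add_device(trans, root, device);

This is also why the error_trans label just below drops its unlock_chunks() call: the chunk mutex is never held on entry to the error path any more.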
@@ -2221,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2221 return ret; 2276 return ret;
2222 2277
2223error_trans: 2278error_trans:
2224 unlock_chunks(root);
2225 btrfs_end_transaction(trans, root); 2279 btrfs_end_transaction(trans, root);
2226 rcu_string_free(device->name); 2280 rcu_string_free(device->name);
2227 btrfs_kobj_rm_device(root->fs_info, device); 2281 btrfs_kobj_rm_device(root->fs_info, device);
@@ -2236,6 +2290,7 @@ error:
2236} 2290}
2237 2291
2238int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2292int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2293 struct btrfs_device *srcdev,
2239 struct btrfs_device **device_out) 2294 struct btrfs_device **device_out)
2240{ 2295{
2241 struct request_queue *q; 2296 struct request_queue *q;
@@ -2248,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2248 int ret = 0; 2303 int ret = 0;
2249 2304
2250 *device_out = NULL; 2305 *device_out = NULL;
2251 if (fs_info->fs_devices->seeding) 2306 if (fs_info->fs_devices->seeding) {
2307 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2252 return -EINVAL; 2308 return -EINVAL;
2309 }
2253 2310
2254 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2311 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2255 fs_info->bdev_holder); 2312 fs_info->bdev_holder);
2256 if (IS_ERR(bdev)) 2313 if (IS_ERR(bdev)) {
2314 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2257 return PTR_ERR(bdev); 2315 return PTR_ERR(bdev);
2316 }
2258 2317
2259 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2318 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2260 2319
2261 devices = &fs_info->fs_devices->devices; 2320 devices = &fs_info->fs_devices->devices;
2262 list_for_each_entry(device, devices, dev_list) { 2321 list_for_each_entry(device, devices, dev_list) {
2263 if (device->bdev == bdev) { 2322 if (device->bdev == bdev) {
2323 btrfs_err(fs_info, "target device is in the filesystem!");
2264 ret = -EEXIST; 2324 ret = -EEXIST;
2265 goto error; 2325 goto error;
2266 } 2326 }
2267 } 2327 }
2268 2328
2329
2330 if (i_size_read(bdev->bd_inode) <
2331 btrfs_device_get_total_bytes(srcdev)) {
2332 btrfs_err(fs_info, "target device is smaller than source device!");
2333 ret = -EINVAL;
2334 goto error;
2335 }
2336
2337
2269 device = btrfs_alloc_device(NULL, &devid, NULL); 2338 device = btrfs_alloc_device(NULL, &devid, NULL);
2270 if (IS_ERR(device)) { 2339 if (IS_ERR(device)) {
2271 ret = PTR_ERR(device); 2340 ret = PTR_ERR(device);
@@ -2289,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2289 device->io_width = root->sectorsize; 2358 device->io_width = root->sectorsize;
2290 device->io_align = root->sectorsize; 2359 device->io_align = root->sectorsize;
2291 device->sector_size = root->sectorsize; 2360 device->sector_size = root->sectorsize;
2292 device->total_bytes = i_size_read(bdev->bd_inode); 2361 device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2293 device->disk_total_bytes = device->total_bytes; 2362 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2363 device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2364 ASSERT(list_empty(&srcdev->resized_list));
2365 device->commit_total_bytes = srcdev->commit_total_bytes;
2366 device->commit_bytes_used = device->bytes_used;
2294 device->dev_root = fs_info->dev_root; 2367 device->dev_root = fs_info->dev_root;
2295 device->bdev = bdev; 2368 device->bdev = bdev;
2296 device->in_fs_metadata = 1; 2369 device->in_fs_metadata = 1;
@@ -2302,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2302 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2375 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2303 fs_info->fs_devices->num_devices++; 2376 fs_info->fs_devices->num_devices++;
2304 fs_info->fs_devices->open_devices++; 2377 fs_info->fs_devices->open_devices++;
2305 if (device->can_discard)
2306 fs_info->fs_devices->num_can_discard++;
2307 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2378 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2308 2379
2309 *device_out = device; 2380 *device_out = device;
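btrfs_init_dev_replace_tgtdev() now receives the source device and uses it both to validate the target (a replacement disk smaller than the source is rejected up front, with a clear error message, rather than failing somewhere mid-copy) and to size the new device. A condensed view of the sizing, using the getters defined later in volumes.h (sketch):

    if (i_size_read(bdev->bd_inode) < btrfs_device_get_total_bytes(srcdev))
            return -EINVAL;                 /* target smaller than source */

    device->total_bytes      = btrfs_device_get_total_bytes(srcdev);
    device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
    device->bytes_used       = btrfs_device_get_bytes_used(srcdev);

Copying the source's sizes rather than the target's own i_size keeps the replacement byte-for-byte compatible with the device it mirrors.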
@@ -2362,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2362 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2433 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2363 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2434 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2364 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2435 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2365 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 2436 btrfs_set_device_total_bytes(leaf, dev_item,
2366 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 2437 btrfs_device_get_disk_total_bytes(device));
2438 btrfs_set_device_bytes_used(leaf, dev_item,
2439 btrfs_device_get_bytes_used(device));
2367 btrfs_mark_buffer_dirty(leaf); 2440 btrfs_mark_buffer_dirty(leaf);
2368 2441
2369out: 2442out:
@@ -2371,40 +2444,44 @@ out:
2371 return ret; 2444 return ret;
2372} 2445}
2373 2446
2374static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 2447int btrfs_grow_device(struct btrfs_trans_handle *trans,
2375 struct btrfs_device *device, u64 new_size) 2448 struct btrfs_device *device, u64 new_size)
2376{ 2449{
2377 struct btrfs_super_block *super_copy = 2450 struct btrfs_super_block *super_copy =
2378 device->dev_root->fs_info->super_copy; 2451 device->dev_root->fs_info->super_copy;
2379 u64 old_total = btrfs_super_total_bytes(super_copy); 2452 struct btrfs_fs_devices *fs_devices;
2380 u64 diff = new_size - device->total_bytes; 2453 u64 old_total;
2454 u64 diff;
2381 2455
2382 if (!device->writeable) 2456 if (!device->writeable)
2383 return -EACCES; 2457 return -EACCES;
2458
2459 lock_chunks(device->dev_root);
2460 old_total = btrfs_super_total_bytes(super_copy);
2461 diff = new_size - device->total_bytes;
2462
2384 if (new_size <= device->total_bytes || 2463 if (new_size <= device->total_bytes ||
2385 device->is_tgtdev_for_dev_replace) 2464 device->is_tgtdev_for_dev_replace) {
2465 unlock_chunks(device->dev_root);
2386 return -EINVAL; 2466 return -EINVAL;
2467 }
2468
2469 fs_devices = device->dev_root->fs_info->fs_devices;
2387 2470
2388 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2471 btrfs_set_super_total_bytes(super_copy, old_total + diff);
2389 device->fs_devices->total_rw_bytes += diff; 2472 device->fs_devices->total_rw_bytes += diff;
2390 2473
2391 device->total_bytes = new_size; 2474 btrfs_device_set_total_bytes(device, new_size);
2392 device->disk_total_bytes = new_size; 2475 btrfs_device_set_disk_total_bytes(device, new_size);
2393 btrfs_clear_space_info_full(device->dev_root->fs_info); 2476 btrfs_clear_space_info_full(device->dev_root->fs_info);
2477 if (list_empty(&device->resized_list))
2478 list_add_tail(&device->resized_list,
2479 &fs_devices->resized_devices);
2480 unlock_chunks(device->dev_root);
2394 2481
2395 return btrfs_update_device(trans, device); 2482 return btrfs_update_device(trans, device);
2396} 2483}
2397 2484
2398int btrfs_grow_device(struct btrfs_trans_handle *trans,
2399 struct btrfs_device *device, u64 new_size)
2400{
2401 int ret;
2402 lock_chunks(device->dev_root);
2403 ret = __btrfs_grow_device(trans, device, new_size);
2404 unlock_chunks(device->dev_root);
2405 return ret;
2406}
2407
2408static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2485static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root, 2486 struct btrfs_root *root,
2410 u64 chunk_tree, u64 chunk_objectid, 2487 u64 chunk_tree, u64 chunk_objectid,
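The __btrfs_grow_device()/btrfs_grow_device() pair above collapses into a single function that takes the chunk mutex itself, and growth is no longer written straight through: the new size lands in total_bytes/disk_total_bytes via the set helpers, and the device is queued on resized_list so that commit_total_bytes only catches up at transaction commit (see btrfs_update_commit_device_size() near the end of this file). The deferred-commit pattern, reduced to its core (sketch):

    lock_chunks(device->dev_root);
    btrfs_device_set_total_bytes(device, new_size);
    btrfs_device_set_disk_total_bytes(device, new_size);
    if (list_empty(&device->resized_list))
            list_add_tail(&device->resized_list,
                          &fs_devices->resized_devices);
    unlock_chunks(device->dev_root);
    /* commit_total_bytes is refreshed in the commit path, so a super
     * block written mid-resize never sees a half-applied size. */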
@@ -2456,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2456 u32 cur; 2533 u32 cur;
2457 struct btrfs_key key; 2534 struct btrfs_key key;
2458 2535
2536 lock_chunks(root);
2459 array_size = btrfs_super_sys_array_size(super_copy); 2537 array_size = btrfs_super_sys_array_size(super_copy);
2460 2538
2461 ptr = super_copy->sys_chunk_array; 2539 ptr = super_copy->sys_chunk_array;
@@ -2485,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2485 cur += len; 2563 cur += len;
2486 } 2564 }
2487 } 2565 }
2566 unlock_chunks(root);
2488 return ret; 2567 return ret;
2489} 2568}
2490 2569
2491static int btrfs_relocate_chunk(struct btrfs_root *root, 2570int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2492 u64 chunk_tree, u64 chunk_objectid, 2571 struct btrfs_root *root, u64 chunk_offset)
2493 u64 chunk_offset)
2494{ 2572{
2495 struct extent_map_tree *em_tree; 2573 struct extent_map_tree *em_tree;
2496 struct btrfs_root *extent_root;
2497 struct btrfs_trans_handle *trans;
2498 struct extent_map *em; 2574 struct extent_map *em;
2575 struct btrfs_root *extent_root = root->fs_info->extent_root;
2499 struct map_lookup *map; 2576 struct map_lookup *map;
2500 int ret; 2577 u64 dev_extent_len = 0;
2501 int i; 2578 u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2579 u64 chunk_tree = root->fs_info->chunk_root->objectid;
2580 int i, ret = 0;
2502 2581
2582 /* Just in case */
2503 root = root->fs_info->chunk_root; 2583 root = root->fs_info->chunk_root;
2504 extent_root = root->fs_info->extent_root;
2505 em_tree = &root->fs_info->mapping_tree.map_tree; 2584 em_tree = &root->fs_info->mapping_tree.map_tree;
2506 2585
2507 ret = btrfs_can_relocate(extent_root, chunk_offset);
2508 if (ret)
2509 return -ENOSPC;
2510
2511 /* step one, relocate all the extents inside this chunk */
2512 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2513 if (ret)
2514 return ret;
2515
2516 trans = btrfs_start_transaction(root, 0);
2517 if (IS_ERR(trans)) {
2518 ret = PTR_ERR(trans);
2519 btrfs_std_error(root->fs_info, ret);
2520 return ret;
2521 }
2522
2523 lock_chunks(root);
2524
2525 /*
2526 * step two, delete the device extents and the
2527 * chunk tree entries
2528 */
2529 read_lock(&em_tree->lock); 2586 read_lock(&em_tree->lock);
2530 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2587 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2531 read_unlock(&em_tree->lock); 2588 read_unlock(&em_tree->lock);
2532 2589
2533 BUG_ON(!em || em->start > chunk_offset || 2590 if (!em || em->start > chunk_offset ||
2534 em->start + em->len < chunk_offset); 2591 em->start + em->len < chunk_offset) {
2592 /*
2593 * This is a logic error, but we don't want to just rely on the
 2594 * user having built with ASSERT enabled, so if ASSERT doesn't
2595 * do anything we still error out.
2596 */
2597 ASSERT(0);
2598 if (em)
2599 free_extent_map(em);
2600 return -EINVAL;
2601 }
2535 map = (struct map_lookup *)em->bdev; 2602 map = (struct map_lookup *)em->bdev;
2536 2603
2537 for (i = 0; i < map->num_stripes; i++) { 2604 for (i = 0; i < map->num_stripes; i++) {
2538 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2605 struct btrfs_device *device = map->stripes[i].dev;
2539 map->stripes[i].physical); 2606 ret = btrfs_free_dev_extent(trans, device,
2540 BUG_ON(ret); 2607 map->stripes[i].physical,
2608 &dev_extent_len);
2609 if (ret) {
2610 btrfs_abort_transaction(trans, root, ret);
2611 goto out;
2612 }
2613
2614 if (device->bytes_used > 0) {
2615 lock_chunks(root);
2616 btrfs_device_set_bytes_used(device,
2617 device->bytes_used - dev_extent_len);
2618 spin_lock(&root->fs_info->free_chunk_lock);
2619 root->fs_info->free_chunk_space += dev_extent_len;
2620 spin_unlock(&root->fs_info->free_chunk_lock);
2621 btrfs_clear_space_info_full(root->fs_info);
2622 unlock_chunks(root);
2623 }
2541 2624
2542 if (map->stripes[i].dev) { 2625 if (map->stripes[i].dev) {
2543 ret = btrfs_update_device(trans, map->stripes[i].dev); 2626 ret = btrfs_update_device(trans, map->stripes[i].dev);
2544 BUG_ON(ret); 2627 if (ret) {
2628 btrfs_abort_transaction(trans, root, ret);
2629 goto out;
2630 }
2545 } 2631 }
2546 } 2632 }
2547 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2633 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2548 chunk_offset); 2634 chunk_offset);
2549 2635 if (ret) {
2550 BUG_ON(ret); 2636 btrfs_abort_transaction(trans, root, ret);
2637 goto out;
2638 }
2551 2639
2552 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2640 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2553 2641
2554 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2642 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2555 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2643 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2556 BUG_ON(ret); 2644 if (ret) {
2645 btrfs_abort_transaction(trans, root, ret);
2646 goto out;
2647 }
2557 } 2648 }
2558 2649
2559 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2650 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2560 BUG_ON(ret); 2651 if (ret) {
2652 btrfs_abort_transaction(trans, extent_root, ret);
2653 goto out;
2654 }
2561 2655
2562 write_lock(&em_tree->lock); 2656 write_lock(&em_tree->lock);
2563 remove_extent_mapping(em_tree, em); 2657 remove_extent_mapping(em_tree, em);
@@ -2565,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
2565 2659
2566 /* once for the tree */ 2660 /* once for the tree */
2567 free_extent_map(em); 2661 free_extent_map(em);
2662out:
2568 /* once for us */ 2663 /* once for us */
2569 free_extent_map(em); 2664 free_extent_map(em);
2665 return ret;
2666}
2570 2667
2571 unlock_chunks(root); 2668static int btrfs_relocate_chunk(struct btrfs_root *root,
2669 u64 chunk_tree, u64 chunk_objectid,
2670 u64 chunk_offset)
2671{
2672 struct btrfs_root *extent_root;
2673 struct btrfs_trans_handle *trans;
2674 int ret;
2675
2676 root = root->fs_info->chunk_root;
2677 extent_root = root->fs_info->extent_root;
2678
2679 ret = btrfs_can_relocate(extent_root, chunk_offset);
2680 if (ret)
2681 return -ENOSPC;
2682
2683 /* step one, relocate all the extents inside this chunk */
2684 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2685 if (ret)
2686 return ret;
2687
2688 trans = btrfs_start_transaction(root, 0);
2689 if (IS_ERR(trans)) {
2690 ret = PTR_ERR(trans);
2691 btrfs_std_error(root->fs_info, ret);
2692 return ret;
2693 }
2694
2695 /*
2696 * step two, delete the device extents and the
2697 * chunk tree entries
2698 */
2699 ret = btrfs_remove_chunk(trans, root, chunk_offset);
2572 btrfs_end_transaction(trans, root); 2700 btrfs_end_transaction(trans, root);
2573 return 0; 2701 return ret;
2574} 2702}
2575 2703
2576static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2704static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
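Chunk removal is now its own exported helper: btrfs_relocate_chunk() keeps step one (btrfs_can_relocate() plus btrfs_relocate_block_group()) and delegates step two to btrfs_remove_chunk(), which runs entirely inside the caller's transaction and aborts it on failure instead of the old BUG_ON() calls. Exporting it lets other transactional contexts, such as the device replace code, delete a chunk directly. A hypothetical external caller (illustrative; the real call site is not part of this diff):

    trans = btrfs_start_transaction(root, 0);
    if (IS_ERR(trans))
            return PTR_ERR(trans);
    ret = btrfs_remove_chunk(trans, root, chunk_offset);
    /* on error the transaction is already aborted inside the helper */
    btrfs_end_transaction(trans, root);

Note also that per-device bytes_used is given back under lock_chunks() as each device extent is freed, so allocator state stays consistent while the chunk is dismantled.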
@@ -2623,8 +2751,8 @@ again:
2623 found_key.offset); 2751 found_key.offset);
2624 if (ret == -ENOSPC) 2752 if (ret == -ENOSPC)
2625 failed++; 2753 failed++;
2626 else if (ret) 2754 else
2627 BUG(); 2755 BUG_ON(ret);
2628 } 2756 }
2629 2757
2630 if (found_key.offset == 0) 2758 if (found_key.offset == 0)
@@ -3031,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3031 /* step one make some room on all the devices */ 3159 /* step one make some room on all the devices */
3032 devices = &fs_info->fs_devices->devices; 3160 devices = &fs_info->fs_devices->devices;
3033 list_for_each_entry(device, devices, dev_list) { 3161 list_for_each_entry(device, devices, dev_list) {
3034 old_size = device->total_bytes; 3162 old_size = btrfs_device_get_total_bytes(device);
3035 size_to_free = div_factor(old_size, 1); 3163 size_to_free = div_factor(old_size, 1);
3036 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 3164 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3037 if (!device->writeable || 3165 if (!device->writeable ||
3038 device->total_bytes - device->bytes_used > size_to_free || 3166 btrfs_device_get_total_bytes(device) -
3167 btrfs_device_get_bytes_used(device) > size_to_free ||
3039 device->is_tgtdev_for_dev_replace) 3168 device->is_tgtdev_for_dev_replace)
3040 continue; 3169 continue;
3041 3170
@@ -3590,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data)
3590 max_key.type = BTRFS_ROOT_ITEM_KEY; 3719 max_key.type = BTRFS_ROOT_ITEM_KEY;
3591 max_key.offset = (u64)-1; 3720 max_key.offset = (u64)-1;
3592 3721
3593 path->keep_locks = 1;
3594
3595 while (1) { 3722 while (1) {
3596 ret = btrfs_search_forward(root, &key, path, 0); 3723 ret = btrfs_search_forward(root, &key, path, 0);
3597 if (ret) { 3724 if (ret) {
@@ -3843,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3843 struct btrfs_key key; 3970 struct btrfs_key key;
3844 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3971 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3845 u64 old_total = btrfs_super_total_bytes(super_copy); 3972 u64 old_total = btrfs_super_total_bytes(super_copy);
3846 u64 old_size = device->total_bytes; 3973 u64 old_size = btrfs_device_get_total_bytes(device);
3847 u64 diff = device->total_bytes - new_size; 3974 u64 diff = old_size - new_size;
3848 3975
3849 if (device->is_tgtdev_for_dev_replace) 3976 if (device->is_tgtdev_for_dev_replace)
3850 return -EINVAL; 3977 return -EINVAL;
@@ -3857,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3857 3984
3858 lock_chunks(root); 3985 lock_chunks(root);
3859 3986
3860 device->total_bytes = new_size; 3987 btrfs_device_set_total_bytes(device, new_size);
3861 if (device->writeable) { 3988 if (device->writeable) {
3862 device->fs_devices->total_rw_bytes -= diff; 3989 device->fs_devices->total_rw_bytes -= diff;
3863 spin_lock(&root->fs_info->free_chunk_lock); 3990 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3923,7 +4050,7 @@ again:
3923 ret = -ENOSPC; 4050 ret = -ENOSPC;
3924 lock_chunks(root); 4051 lock_chunks(root);
3925 4052
3926 device->total_bytes = old_size; 4053 btrfs_device_set_total_bytes(device, old_size);
3927 if (device->writeable) 4054 if (device->writeable)
3928 device->fs_devices->total_rw_bytes += diff; 4055 device->fs_devices->total_rw_bytes += diff;
3929 spin_lock(&root->fs_info->free_chunk_lock); 4056 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3941,18 +4068,17 @@ again:
3941 } 4068 }
3942 4069
3943 lock_chunks(root); 4070 lock_chunks(root);
4071 btrfs_device_set_disk_total_bytes(device, new_size);
4072 if (list_empty(&device->resized_list))
4073 list_add_tail(&device->resized_list,
4074 &root->fs_info->fs_devices->resized_devices);
3944 4075
3945 device->disk_total_bytes = new_size;
3946 /* Now btrfs_update_device() will change the on-disk size. */
3947 ret = btrfs_update_device(trans, device);
3948 if (ret) {
3949 unlock_chunks(root);
3950 btrfs_end_transaction(trans, root);
3951 goto done;
3952 }
3953 WARN_ON(diff > old_total); 4076 WARN_ON(diff > old_total);
3954 btrfs_set_super_total_bytes(super_copy, old_total - diff); 4077 btrfs_set_super_total_bytes(super_copy, old_total - diff);
3955 unlock_chunks(root); 4078 unlock_chunks(root);
4079
4080 /* Now btrfs_update_device() will change the on-disk size. */
4081 ret = btrfs_update_device(trans, device);
3956 btrfs_end_transaction(trans, root); 4082 btrfs_end_transaction(trans, root);
3957done: 4083done:
3958 btrfs_free_path(path); 4084 btrfs_free_path(path);
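btrfs_shrink_device() now follows the same scheme as the grow path: the shrunken disk_total_bytes is set through the helper, the device is queued on resized_list, and the super block total is adjusted, all under the chunk mutex; btrfs_update_device() moves after unlock_chunks(), presumably because the item update takes tree locks of its own and no longer needs chunk-mutex coverage. The ordering this preserves (sketch):

    lock_chunks(root);
    btrfs_device_set_disk_total_bytes(device, new_size);
    if (list_empty(&device->resized_list))
            list_add_tail(&device->resized_list,
                          &root->fs_info->fs_devices->resized_devices);
    btrfs_set_super_total_bytes(super_copy, old_total - diff);
    unlock_chunks(root);
    ret = btrfs_update_device(trans, device);  /* outside the chunk mutex */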
@@ -3968,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
3968 u32 array_size; 4094 u32 array_size;
3969 u8 *ptr; 4095 u8 *ptr;
3970 4096
4097 lock_chunks(root);
3971 array_size = btrfs_super_sys_array_size(super_copy); 4098 array_size = btrfs_super_sys_array_size(super_copy);
3972 if (array_size + item_size + sizeof(disk_key) 4099 if (array_size + item_size + sizeof(disk_key)
3973 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 4100 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4101 unlock_chunks(root);
3974 return -EFBIG; 4102 return -EFBIG;
4103 }
3975 4104
3976 ptr = super_copy->sys_chunk_array + array_size; 4105 ptr = super_copy->sys_chunk_array + array_size;
3977 btrfs_cpu_key_to_disk(&disk_key, key); 4106 btrfs_cpu_key_to_disk(&disk_key, key);
@@ -3980,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
3980 memcpy(ptr, chunk, item_size); 4109 memcpy(ptr, chunk, item_size);
3981 item_size += sizeof(disk_key); 4110 item_size += sizeof(disk_key);
3982 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4111 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4112 unlock_chunks(root);
4113
3983 return 0; 4114 return 0;
3984} 4115}
3985 4116
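Both manipulators of the in-memory system chunk array, btrfs_del_sys_chunk() earlier and btrfs_add_system_chunk() here, now take and release the chunk mutex themselves rather than relying on every caller holding it. Their critical sections bracket all reads and writes of sys_chunk_array and its size field (sketch of the shape):

    lock_chunks(root);
    array_size = btrfs_super_sys_array_size(super_copy);
    if (array_size + item_size + sizeof(disk_key)
                    > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
            unlock_chunks(root);
            return -EFBIG;          /* every early return must unlock */
    }
    /* memmove()/memcpy() within sys_chunk_array ... */
    btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
    unlock_chunks(root);

Self-locking makes the helpers safe for the new callers, such as btrfs_remove_chunk(), that no longer run with the chunk mutex held throughout.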
@@ -4349,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4349 if (ret) 4480 if (ret)
4350 goto error_del_extent; 4481 goto error_del_extent;
4351 4482
4483 for (i = 0; i < map->num_stripes; i++) {
4484 num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4485 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4486 }
4487
4488 spin_lock(&extent_root->fs_info->free_chunk_lock);
4489 extent_root->fs_info->free_chunk_space -= (stripe_size *
4490 map->num_stripes);
4491 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4492
4352 free_extent_map(em); 4493 free_extent_map(em);
4353 check_raid56_incompat_flag(extent_root->fs_info, type); 4494 check_raid56_incompat_flag(extent_root->fs_info, type);
4354 4495
@@ -4420,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4420 device = map->stripes[i].dev; 4561 device = map->stripes[i].dev;
4421 dev_offset = map->stripes[i].physical; 4562 dev_offset = map->stripes[i].physical;
4422 4563
4423 device->bytes_used += stripe_size;
4424 ret = btrfs_update_device(trans, device); 4564 ret = btrfs_update_device(trans, device);
4425 if (ret) 4565 if (ret)
4426 goto out; 4566 goto out;
@@ -4433,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4433 goto out; 4573 goto out;
4434 } 4574 }
4435 4575
4436 spin_lock(&extent_root->fs_info->free_chunk_lock);
4437 extent_root->fs_info->free_chunk_space -= (stripe_size *
4438 map->num_stripes);
4439 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4440
4441 stripe = &chunk->stripe; 4576 stripe = &chunk->stripe;
4442 for (i = 0; i < map->num_stripes; i++) { 4577 for (i = 0; i < map->num_stripes; i++) {
4443 device = map->stripes[i].dev; 4578 device = map->stripes[i].dev;
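Per-device accounting moves from btrfs_finish_chunk_alloc() back into __btrfs_alloc_chunk(): bytes_used is bumped and free_chunk_space reduced at the moment the mapping is created, not when the chunk item is eventually written out. Since finish_chunk_alloc can run noticeably later (pending chunks are flushed toward transaction commit), the old placement left a window where the allocator's view of free space lagged behind reality. The relocated accounting (sketch):

    for (i = 0; i < map->num_stripes; i++) {
            num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
            btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
    }
    spin_lock(&extent_root->fs_info->free_chunk_lock);
    extent_root->fs_info->free_chunk_space -= stripe_size * map->num_stripes;
    spin_unlock(&extent_root->fs_info->free_chunk_lock);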
@@ -4517,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4517 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4652 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4518 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4653 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4519 alloc_profile); 4654 alloc_profile);
4520 if (ret) { 4655 return ret;
4521 btrfs_abort_transaction(trans, root, ret); 4656}
4522 goto out; 4657
4658static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4659{
4660 int max_errors;
4661
4662 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4663 BTRFS_BLOCK_GROUP_RAID10 |
4664 BTRFS_BLOCK_GROUP_RAID5 |
4665 BTRFS_BLOCK_GROUP_DUP)) {
4666 max_errors = 1;
4667 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4668 max_errors = 2;
4669 } else {
4670 max_errors = 0;
4523 } 4671 }
4524 4672
4525 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4673 return max_errors;
4526 if (ret)
4527 btrfs_abort_transaction(trans, root, ret);
4528out:
4529 return ret;
4530} 4674}
4531 4675
4532int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4676int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -4535,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4535 struct map_lookup *map; 4679 struct map_lookup *map;
4536 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4680 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4537 int readonly = 0; 4681 int readonly = 0;
4682 int miss_ndevs = 0;
4538 int i; 4683 int i;
4539 4684
4540 read_lock(&map_tree->map_tree.lock); 4685 read_lock(&map_tree->map_tree.lock);
@@ -4543,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4543 if (!em) 4688 if (!em)
4544 return 1; 4689 return 1;
4545 4690
4546 if (btrfs_test_opt(root, DEGRADED)) {
4547 free_extent_map(em);
4548 return 0;
4549 }
4550
4551 map = (struct map_lookup *)em->bdev; 4691 map = (struct map_lookup *)em->bdev;
4552 for (i = 0; i < map->num_stripes; i++) { 4692 for (i = 0; i < map->num_stripes; i++) {
4693 if (map->stripes[i].dev->missing) {
4694 miss_ndevs++;
4695 continue;
4696 }
4697
4553 if (!map->stripes[i].dev->writeable) { 4698 if (!map->stripes[i].dev->writeable) {
4554 readonly = 1; 4699 readonly = 1;
4555 break; 4700 goto end;
4556 } 4701 }
4557 } 4702 }
4703
4704 /*
4705 * If the number of missing devices is larger than max errors,
 4706 * we cannot write the data into that chunk successfully, so
4707 * set it readonly.
4708 */
4709 if (miss_ndevs > btrfs_chunk_max_errors(map))
4710 readonly = 1;
4711end:
4558 free_extent_map(em); 4712 free_extent_map(em);
4559 return readonly; 4713 return readonly;
4560} 4714}
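btrfs_chunk_readonly() previously treated a DEGRADED mount as globally writable; it now counts the missing devices in each chunk and compares against the profile's tolerance via the new btrfs_chunk_max_errors() helper (1 for RAID1/RAID10/RAID5/DUP, 2 for RAID6, 0 otherwise). A chunk goes read-only only when more devices are missing than the profile can absorb; the decision, simplified (the real function also drops its extent_map reference):

    for (i = 0; i < map->num_stripes; i++) {
            if (map->stripes[i].dev->missing) {
                    miss_ndevs++;
                    continue;       /* missing != unwritable per se */
            }
            if (!map->stripes[i].dev->writeable)
                    return 1;       /* any read-only member wins */
    }
    return miss_ndevs > btrfs_chunk_max_errors(map);

So a degraded RAID1 chunk stays writable while a degraded RAID0 chunk correctly goes read-only.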
@@ -4955,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4955 num_stripes = min_t(u64, map->num_stripes, 5109 num_stripes = min_t(u64, map->num_stripes,
4956 stripe_nr_end - stripe_nr_orig); 5110 stripe_nr_end - stripe_nr_orig);
4957 stripe_index = do_div(stripe_nr, map->num_stripes); 5111 stripe_index = do_div(stripe_nr, map->num_stripes);
5112 if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
5113 mirror_num = 1;
4958 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5114 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
4959 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 5115 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
4960 num_stripes = map->num_stripes; 5116 num_stripes = map->num_stripes;
@@ -5058,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5058 /* We distribute the parity blocks across stripes */ 5214 /* We distribute the parity blocks across stripes */
5059 tmp = stripe_nr + stripe_index; 5215 tmp = stripe_nr + stripe_index;
5060 stripe_index = do_div(tmp, map->num_stripes); 5216 stripe_index = do_div(tmp, map->num_stripes);
5217 if (!(rw & (REQ_WRITE | REQ_DISCARD |
5218 REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
5219 mirror_num = 1;
5061 } 5220 }
5062 } else { 5221 } else {
5063 /* 5222 /*
@@ -5165,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5165 } 5324 }
5166 } 5325 }
5167 5326
5168 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 5327 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5169 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5328 max_errors = btrfs_chunk_max_errors(map);
5170 BTRFS_BLOCK_GROUP_RAID10 |
5171 BTRFS_BLOCK_GROUP_RAID5 |
5172 BTRFS_BLOCK_GROUP_DUP)) {
5173 max_errors = 1;
5174 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5175 max_errors = 2;
5176 }
5177 }
5178 5329
5179 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5330 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5180 dev_replace->tgtdev != NULL) { 5331 dev_replace->tgtdev != NULL) {
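Two of the hunks above make __btrfs_map_block() report a definite mirror_num for plain reads on striped profiles: RAID0, and the RAID5/6 case where parity is distributed, now return mirror_num = 1 when none of REQ_WRITE, REQ_DISCARD or REQ_GET_READ_MIRRORS is set, instead of leaving it zero. Callers that key error handling off the mirror number, such as read-repair, then always have a valid copy index:

    if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
            mirror_num = 1;   /* striped profiles have exactly one copy */

The third hunk simply reuses btrfs_chunk_max_errors() for the max_errors computation that used to be open-coded here.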
@@ -5557,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5557 name = rcu_dereference(dev->name); 5708 name = rcu_dereference(dev->name);
5558 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5709 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5559 "(%s id %llu), size=%u\n", rw, 5710 "(%s id %llu), size=%u\n", rw,
5560 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 5711 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
5561 name->str, dev->devid, bio->bi_size); 5712 name->str, dev->devid, bio->bi_iter.bi_size);
5562 rcu_read_unlock(); 5713 rcu_read_unlock();
5563 } 5714 }
5564#endif 5715#endif
@@ -5736,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5736} 5887}
5737 5888
5738static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5889static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5890 struct btrfs_fs_devices *fs_devices,
5739 u64 devid, u8 *dev_uuid) 5891 u64 devid, u8 *dev_uuid)
5740{ 5892{
5741 struct btrfs_device *device; 5893 struct btrfs_device *device;
5742 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5743 5894
5744 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5895 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5745 if (IS_ERR(device)) 5896 if (IS_ERR(device))
@@ -5800,7 +5951,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5800 else 5951 else
5801 generate_random_uuid(dev->uuid); 5952 generate_random_uuid(dev->uuid);
5802 5953
5803 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); 5954 btrfs_init_work(&dev->work, btrfs_submit_helper,
5955 pending_bios_fn, NULL, NULL);
5804 5956
5805 return dev; 5957 return dev;
5806} 5958}
@@ -5875,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5875 } 6027 }
5876 if (!map->stripes[i].dev) { 6028 if (!map->stripes[i].dev) {
5877 map->stripes[i].dev = 6029 map->stripes[i].dev =
5878 add_missing_dev(root, devid, uuid); 6030 add_missing_dev(root, root->fs_info->fs_devices,
6031 devid, uuid);
5879 if (!map->stripes[i].dev) { 6032 if (!map->stripes[i].dev) {
5880 free_extent_map(em); 6033 free_extent_map(em);
5881 return -EIO; 6034 return -EIO;
@@ -5902,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf,
5902 device->devid = btrfs_device_id(leaf, dev_item); 6055 device->devid = btrfs_device_id(leaf, dev_item);
5903 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6056 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
5904 device->total_bytes = device->disk_total_bytes; 6057 device->total_bytes = device->disk_total_bytes;
6058 device->commit_total_bytes = device->disk_total_bytes;
5905 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6059 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6060 device->commit_bytes_used = device->bytes_used;
5906 device->type = btrfs_device_type(leaf, dev_item); 6061 device->type = btrfs_device_type(leaf, dev_item);
5907 device->io_align = btrfs_device_io_align(leaf, dev_item); 6062 device->io_align = btrfs_device_io_align(leaf, dev_item);
5908 device->io_width = btrfs_device_io_width(leaf, dev_item); 6063 device->io_width = btrfs_device_io_width(leaf, dev_item);
@@ -5914,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
5914 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6069 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
5915} 6070}
5916 6071
5917static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 6072static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
6073 u8 *fsid)
5918{ 6074{
5919 struct btrfs_fs_devices *fs_devices; 6075 struct btrfs_fs_devices *fs_devices;
5920 int ret; 6076 int ret;
@@ -5923,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
5923 6079
5924 fs_devices = root->fs_info->fs_devices->seed; 6080 fs_devices = root->fs_info->fs_devices->seed;
5925 while (fs_devices) { 6081 while (fs_devices) {
5926 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 6082 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
5927 ret = 0; 6083 return fs_devices;
5928 goto out; 6084
5929 }
5930 fs_devices = fs_devices->seed; 6085 fs_devices = fs_devices->seed;
5931 } 6086 }
5932 6087
5933 fs_devices = find_fsid(fsid); 6088 fs_devices = find_fsid(fsid);
5934 if (!fs_devices) { 6089 if (!fs_devices) {
5935 ret = -ENOENT; 6090 if (!btrfs_test_opt(root, DEGRADED))
5936 goto out; 6091 return ERR_PTR(-ENOENT);
6092
6093 fs_devices = alloc_fs_devices(fsid);
6094 if (IS_ERR(fs_devices))
6095 return fs_devices;
6096
6097 fs_devices->seeding = 1;
6098 fs_devices->opened = 1;
6099 return fs_devices;
5937 } 6100 }
5938 6101
5939 fs_devices = clone_fs_devices(fs_devices); 6102 fs_devices = clone_fs_devices(fs_devices);
5940 if (IS_ERR(fs_devices)) { 6103 if (IS_ERR(fs_devices))
5941 ret = PTR_ERR(fs_devices); 6104 return fs_devices;
5942 goto out;
5943 }
5944 6105
5945 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6106 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
5946 root->fs_info->bdev_holder); 6107 root->fs_info->bdev_holder);
5947 if (ret) { 6108 if (ret) {
5948 free_fs_devices(fs_devices); 6109 free_fs_devices(fs_devices);
6110 fs_devices = ERR_PTR(ret);
5949 goto out; 6111 goto out;
5950 } 6112 }
5951 6113
5952 if (!fs_devices->seeding) { 6114 if (!fs_devices->seeding) {
5953 __btrfs_close_devices(fs_devices); 6115 __btrfs_close_devices(fs_devices);
5954 free_fs_devices(fs_devices); 6116 free_fs_devices(fs_devices);
5955 ret = -EINVAL; 6117 fs_devices = ERR_PTR(-EINVAL);
5956 goto out; 6118 goto out;
5957 } 6119 }
5958 6120
5959 fs_devices->seed = root->fs_info->fs_devices->seed; 6121 fs_devices->seed = root->fs_info->fs_devices->seed;
5960 root->fs_info->fs_devices->seed = fs_devices; 6122 root->fs_info->fs_devices->seed = fs_devices;
5961out: 6123out:
5962 return ret; 6124 return fs_devices;
5963} 6125}
5964 6126
5965static int read_one_dev(struct btrfs_root *root, 6127static int read_one_dev(struct btrfs_root *root,
5966 struct extent_buffer *leaf, 6128 struct extent_buffer *leaf,
5967 struct btrfs_dev_item *dev_item) 6129 struct btrfs_dev_item *dev_item)
5968{ 6130{
6131 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5969 struct btrfs_device *device; 6132 struct btrfs_device *device;
5970 u64 devid; 6133 u64 devid;
5971 int ret; 6134 int ret;
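open_seed_devices() changes calling convention from an int return to returning the struct btrfs_fs_devices pointer itself (ERR_PTR on failure), which read_one_dev() needs in order to attach devices to the correct seed filesystem. On a DEGRADED mount, a seed fsid with no discovered devices no longer fails with -ENOENT; instead a placeholder is allocated so missing seed devices can be stubbed in later (sketch, names from this patch):

    fs_devices = find_fsid(fsid);
    if (!fs_devices) {
            if (!btrfs_test_opt(root, DEGRADED))
                    return ERR_PTR(-ENOENT);
            fs_devices = alloc_fs_devices(fsid);    /* placeholder */
            if (IS_ERR(fs_devices))
                    return fs_devices;
            fs_devices->seeding = 1;
            fs_devices->opened = 1;
    }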
@@ -5979,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root,
5979 BTRFS_UUID_SIZE); 6142 BTRFS_UUID_SIZE);
5980 6143
5981 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 6144 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
5982 ret = open_seed_devices(root, fs_uuid); 6145 fs_devices = open_seed_devices(root, fs_uuid);
5983 if (ret && !btrfs_test_opt(root, DEGRADED)) 6146 if (IS_ERR(fs_devices))
5984 return ret; 6147 return PTR_ERR(fs_devices);
5985 } 6148 }
5986 6149
5987 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 6150 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
5988 if (!device || !device->bdev) { 6151 if (!device) {
5989 if (!btrfs_test_opt(root, DEGRADED)) 6152 if (!btrfs_test_opt(root, DEGRADED))
5990 return -EIO; 6153 return -EIO;
5991 6154
5992 if (!device) { 6155 btrfs_warn(root->fs_info, "devid %llu missing", devid);
5993 btrfs_warn(root->fs_info, "devid %llu missing", devid); 6156 device = add_missing_dev(root, fs_devices, devid, dev_uuid);
5994 device = add_missing_dev(root, devid, dev_uuid); 6157 if (!device)
5995 if (!device) 6158 return -ENOMEM;
5996 return -ENOMEM; 6159 } else {
5997 } else if (!device->missing) { 6160 if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
6161 return -EIO;
6162
 6163 if (!device->bdev && !device->missing) {
5998 /* 6164 /*
5999 * this happens when a device that was properly setup 6165 * this happens when a device that was properly setup
6000 * in the device info lists suddenly goes bad. 6166 * in the device info lists suddenly goes bad.
6001 * device->bdev is NULL, and so we have to set 6167 * device->bdev is NULL, and so we have to set
6002 * device->missing to one here 6168 * device->missing to one here
6003 */ 6169 */
6004 root->fs_info->fs_devices->missing_devices++; 6170 device->fs_devices->missing_devices++;
6005 device->missing = 1; 6171 device->missing = 1;
6006 } 6172 }
6173
6174 /* Move the device to its own fs_devices */
6175 if (device->fs_devices != fs_devices) {
6176 ASSERT(device->missing);
6177
6178 list_move(&device->dev_list, &fs_devices->devices);
6179 device->fs_devices->num_devices--;
6180 fs_devices->num_devices++;
6181
6182 device->fs_devices->missing_devices--;
6183 fs_devices->missing_devices++;
6184
6185 device->fs_devices = fs_devices;
6186 }
6007 } 6187 }
6008 6188
6009 if (device->fs_devices != root->fs_info->fs_devices) { 6189 if (device->fs_devices != root->fs_info->fs_devices) {
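read_one_dev() is restructured around that pointer: a device that exists but has no bdev is only tolerated under DEGRADED, the missing_devices counter is bumped on the device's own fs_devices rather than always on the mounted one, and a missing device found hanging off the wrong fs_devices is migrated, with both num_devices and missing_devices moved along with it (sketch of the migration):

    if (device->fs_devices != fs_devices) {
            ASSERT(device->missing);
            list_move(&device->dev_list, &fs_devices->devices);
            device->fs_devices->num_devices--;
            fs_devices->num_devices++;
            device->fs_devices->missing_devices--;
            fs_devices->missing_devices++;
            device->fs_devices = fs_devices;
    }

Keeping the counters per-fs_devices is what lets a sprouted filesystem account for missing seed devices separately from its own.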
@@ -6319,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6319 struct btrfs_root *dev_root = fs_info->dev_root; 6499 struct btrfs_root *dev_root = fs_info->dev_root;
6320 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6500 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6321 struct btrfs_device *device; 6501 struct btrfs_device *device;
6502 int stats_cnt;
6322 int ret = 0; 6503 int ret = 0;
6323 6504
6324 mutex_lock(&fs_devices->device_list_mutex); 6505 mutex_lock(&fs_devices->device_list_mutex);
6325 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6506 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6326 if (!device->dev_stats_valid || !device->dev_stats_dirty) 6507 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
6327 continue; 6508 continue;
6328 6509
6510 stats_cnt = atomic_read(&device->dev_stats_ccnt);
6329 ret = update_dev_stat_item(trans, dev_root, device); 6511 ret = update_dev_stat_item(trans, dev_root, device);
6330 if (!ret) 6512 if (!ret)
6331 device->dev_stats_dirty = 0; 6513 atomic_sub(stats_cnt, &device->dev_stats_ccnt);
6332 } 6514 }
6333 mutex_unlock(&fs_devices->device_list_mutex); 6515 mutex_unlock(&fs_devices->device_list_mutex);
6334 6516
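The dev_stats_dirty flag gives way to a change counter, dev_stats_ccnt, closing a race: with a plain flag, an increment landing between the item update and the flag clear was silently lost. Snapshotting the counter before the write and subtracting that snapshot afterwards means any concurrent increments leave a nonzero remainder, so the device stays dirty for the next commit:

    stats_cnt = atomic_read(&device->dev_stats_ccnt);   /* snapshot */
    ret = update_dev_stat_item(trans, dev_root, device);
    if (!ret)
            atomic_sub(stats_cnt, &device->dev_stats_ccnt);
    /* increments after the snapshot keep ccnt > 0 -> still dirty */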
@@ -6427,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
6427 6609
6428 return 0; 6610 return 0;
6429} 6611}
6612
6613/*
6614 * Update the size of all devices, which is used for writing out the
6615 * super blocks.
6616 */
6617void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
6618{
6619 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6620 struct btrfs_device *curr, *next;
6621
6622 if (list_empty(&fs_devices->resized_devices))
6623 return;
6624
6625 mutex_lock(&fs_devices->device_list_mutex);
6626 lock_chunks(fs_info->dev_root);
6627 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
6628 resized_list) {
6629 list_del_init(&curr->resized_list);
6630 curr->commit_total_bytes = curr->disk_total_bytes;
6631 }
6632 unlock_chunks(fs_info->dev_root);
6633 mutex_unlock(&fs_devices->device_list_mutex);
6634}
6635
6636/* Must be invoked during the transaction commit */
6637void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
6638 struct btrfs_transaction *transaction)
6639{
6640 struct extent_map *em;
6641 struct map_lookup *map;
6642 struct btrfs_device *dev;
6643 int i;
6644
6645 if (list_empty(&transaction->pending_chunks))
6646 return;
6647
 6648 /* In order to kick off the device replace finish process */
6649 lock_chunks(root);
6650 list_for_each_entry(em, &transaction->pending_chunks, list) {
6651 map = (struct map_lookup *)em->bdev;
6652
6653 for (i = 0; i < map->num_stripes; i++) {
6654 dev = map->stripes[i].dev;
6655 dev->commit_bytes_used = dev->bytes_used;
6656 }
6657 }
6658 unlock_chunks(root);
6659}
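These two helpers exist so that super block writeout sees stable numbers: btrfs_update_commit_device_size() folds disk_total_bytes into commit_total_bytes for every queued resized device, and btrfs_update_commit_device_bytes_used() snapshots bytes_used into commit_bytes_used for each chunk pending in the transaction. A plausible placement in the commit path (illustrative; the actual call sites live in the transaction code, which is not part of this diff):

    /* inside btrfs_commit_transaction(), once no further chunk
     * allocations can join this transaction: */
    btrfs_update_commit_device_size(root->fs_info);
    btrfs_update_commit_device_bytes_used(root, cur_trans);
    /* the supers written for this commit then read the commit_*
     * fields, which no longer move underneath the writeout. */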
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2aaa00c47816..08980fa23039 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
24#include <linux/btrfs.h> 24#include <linux/btrfs.h>
25#include "async-thread.h" 25#include "async-thread.h"
26 26
27extern struct mutex uuid_mutex;
28
27#define BTRFS_STRIPE_LEN (64 * 1024) 29#define BTRFS_STRIPE_LEN (64 * 1024)
28 30
29struct buffer_head; 31struct buffer_head;
@@ -32,41 +34,59 @@ struct btrfs_pending_bios {
32 struct bio *tail; 34 struct bio *tail;
33}; 35};
34 36
37/*
38 * Use sequence counter to get consistent device stat data on
39 * 32-bit processors.
40 */
41#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
42#include <linux/seqlock.h>
43#define __BTRFS_NEED_DEVICE_DATA_ORDERED
44#define btrfs_device_data_ordered_init(device) \
45 seqcount_init(&device->data_seqcount)
46#else
47#define btrfs_device_data_ordered_init(device) do { } while (0)
48#endif
49
35struct btrfs_device { 50struct btrfs_device {
36 struct list_head dev_list; 51 struct list_head dev_list;
37 struct list_head dev_alloc_list; 52 struct list_head dev_alloc_list;
38 struct btrfs_fs_devices *fs_devices; 53 struct btrfs_fs_devices *fs_devices;
54
39 struct btrfs_root *dev_root; 55 struct btrfs_root *dev_root;
40 56
57 struct rcu_string *name;
58
59 u64 generation;
60
61 spinlock_t io_lock ____cacheline_aligned;
62 int running_pending;
41 /* regular prio bios */ 63 /* regular prio bios */
42 struct btrfs_pending_bios pending_bios; 64 struct btrfs_pending_bios pending_bios;
43 /* WRITE_SYNC bios */ 65 /* WRITE_SYNC bios */
44 struct btrfs_pending_bios pending_sync_bios; 66 struct btrfs_pending_bios pending_sync_bios;
45 67
46 u64 generation; 68 struct block_device *bdev;
47 int running_pending; 69
70 /* the mode sent to blkdev_get */
71 fmode_t mode;
72
48 int writeable; 73 int writeable;
49 int in_fs_metadata; 74 int in_fs_metadata;
50 int missing; 75 int missing;
51 int can_discard; 76 int can_discard;
52 int is_tgtdev_for_dev_replace; 77 int is_tgtdev_for_dev_replace;
53 78
54 spinlock_t io_lock; 79#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
55 /* the mode sent to blkdev_get */ 80 seqcount_t data_seqcount;
56 fmode_t mode; 81#endif
57
58 struct block_device *bdev;
59
60
61 struct rcu_string *name;
62 82
63 /* the internal btrfs device id */ 83 /* the internal btrfs device id */
64 u64 devid; 84 u64 devid;
65 85
66 /* size of the device */ 86 /* size of the device in memory */
67 u64 total_bytes; 87 u64 total_bytes;
68 88
69 /* size of the disk */ 89 /* size of the device on disk */
70 u64 disk_total_bytes; 90 u64 disk_total_bytes;
71 91
72 /* bytes used */ 92 /* bytes used */
@@ -83,10 +103,26 @@ struct btrfs_device {
83 /* minimal io size for this device */ 103 /* minimal io size for this device */
84 u32 sector_size; 104 u32 sector_size;
85 105
86
87 /* physical drive uuid (or lvm uuid) */ 106 /* physical drive uuid (or lvm uuid) */
88 u8 uuid[BTRFS_UUID_SIZE]; 107 u8 uuid[BTRFS_UUID_SIZE];
89 108
109 /*
 110 * size of the device in the current transaction
 111 *
 112 * This value is updated when committing the transaction,
 113 * and is protected by device_list_mutex
114 */
115 u64 commit_total_bytes;
116
 117 /* bytes used in the current transaction */
118 u64 commit_bytes_used;
119 /*
 120 * used to track a device that has been resized
121 *
122 * It is protected by chunk_lock.
123 */
124 struct list_head resized_list;
125
90 /* for sending down flush barriers */ 126 /* for sending down flush barriers */
91 int nobarriers; 127 int nobarriers;
92 struct bio *flush_bio; 128 struct bio *flush_bio;
@@ -107,26 +143,90 @@ struct btrfs_device {
107 struct radix_tree_root reada_zones; 143 struct radix_tree_root reada_zones;
108 struct radix_tree_root reada_extents; 144 struct radix_tree_root reada_extents;
109 145
110
111 /* disk I/O failure stats. For detailed description refer to 146 /* disk I/O failure stats. For detailed description refer to
112 * enum btrfs_dev_stat_values in ioctl.h */ 147 * enum btrfs_dev_stat_values in ioctl.h */
113 int dev_stats_valid; 148 int dev_stats_valid;
114 int dev_stats_dirty; /* counters need to be written to disk */ 149
150 /* Counter to record the change of device stats */
151 atomic_t dev_stats_ccnt;
115 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; 152 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
116}; 153};
117 154
155/*
 156 * If we read these values while already holding their protecting lock,
 157 * we needn't use the following helpers; reading them directly is safe.
158 */
159#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
160#define BTRFS_DEVICE_GETSET_FUNCS(name) \
161static inline u64 \
162btrfs_device_get_##name(const struct btrfs_device *dev) \
163{ \
164 u64 size; \
165 unsigned int seq; \
166 \
167 do { \
168 seq = read_seqcount_begin(&dev->data_seqcount); \
169 size = dev->name; \
170 } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
171 return size; \
172} \
173 \
174static inline void \
175btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
176{ \
177 preempt_disable(); \
178 write_seqcount_begin(&dev->data_seqcount); \
179 dev->name = size; \
180 write_seqcount_end(&dev->data_seqcount); \
181 preempt_enable(); \
182}
183#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
184#define BTRFS_DEVICE_GETSET_FUNCS(name) \
185static inline u64 \
186btrfs_device_get_##name(const struct btrfs_device *dev) \
187{ \
188 u64 size; \
189 \
190 preempt_disable(); \
191 size = dev->name; \
192 preempt_enable(); \
193 return size; \
194} \
195 \
196static inline void \
197btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
198{ \
199 preempt_disable(); \
200 dev->name = size; \
201 preempt_enable(); \
202}
203#else
204#define BTRFS_DEVICE_GETSET_FUNCS(name) \
205static inline u64 \
206btrfs_device_get_##name(const struct btrfs_device *dev) \
207{ \
208 return dev->name; \
209} \
210 \
211static inline void \
212btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
213{ \
214 dev->name = size; \
215}
216#endif
217
218BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
219BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
220BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
221
118struct btrfs_fs_devices { 222struct btrfs_fs_devices {
119 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 223 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
120 224
121 /* the device with this id has the most recent copy of the super */
122 u64 latest_devid;
123 u64 latest_trans;
124 u64 num_devices; 225 u64 num_devices;
125 u64 open_devices; 226 u64 open_devices;
126 u64 rw_devices; 227 u64 rw_devices;
127 u64 missing_devices; 228 u64 missing_devices;
128 u64 total_rw_bytes; 229 u64 total_rw_bytes;
129 u64 num_can_discard;
130 u64 total_devices; 230 u64 total_devices;
131 struct block_device *latest_bdev; 231 struct block_device *latest_bdev;
132 232
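BTRFS_DEVICE_GETSET_FUNCS() stamps out accessor pairs for total_bytes, disk_total_bytes and bytes_used in three flavors: a seqcount retry loop on 32-bit SMP, where a bare u64 read can tear; preempt-disabled plain access on 32-bit preemptible kernels; and direct access everywhere else, where 64-bit loads and stores are naturally atomic. Typical use, as seen throughout volumes.c above:

    u64 old_size = btrfs_device_get_total_bytes(device);
    btrfs_device_set_total_bytes(device, old_size + diff);

On 64-bit builds both lines compile down to plain member access, so the abstraction costs nothing where it is not needed. The same hunk also drops the now-unused latest_devid/latest_trans pair and the num_can_discard counter from struct btrfs_fs_devices, matching the volumes.c hunks that stopped maintaining them.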
@@ -139,6 +239,7 @@ struct btrfs_fs_devices {
139 struct mutex device_list_mutex; 239 struct mutex device_list_mutex;
140 struct list_head devices; 240 struct list_head devices;
141 241
242 struct list_head resized_devices;
142 /* devices not currently being allocated */ 243 /* devices not currently being allocated */
143 struct list_head alloc_list; 244 struct list_head alloc_list;
144 struct list_head list; 245 struct list_head list;
@@ -167,8 +268,9 @@ struct btrfs_fs_devices {
167 */ 268 */
168typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); 269typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
169struct btrfs_io_bio { 270struct btrfs_io_bio {
170 unsigned long mirror_num; 271 unsigned int mirror_num;
171 unsigned long stripe_index; 272 unsigned int stripe_index;
273 u64 logical;
172 u8 *csum; 274 u8 *csum;
173 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; 275 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
174 u8 *csum_allocated; 276 u8 *csum_allocated;
@@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
325int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 427int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
326int btrfs_init_new_device(struct btrfs_root *root, char *path); 428int btrfs_init_new_device(struct btrfs_root *root, char *path);
327int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 429int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
430 struct btrfs_device *srcdev,
328 struct btrfs_device **device_out); 431 struct btrfs_device **device_out);
329int btrfs_balance(struct btrfs_balance_control *bctl, 432int btrfs_balance(struct btrfs_balance_control *bctl,
330 struct btrfs_ioctl_balance_args *bargs); 433 struct btrfs_ioctl_balance_args *bargs);
@@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
360int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 463int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
361 struct btrfs_root *extent_root, 464 struct btrfs_root *extent_root,
362 u64 chunk_offset, u64 chunk_size); 465 u64 chunk_offset, u64 chunk_size);
466int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
467 struct btrfs_root *root, u64 chunk_offset);
468
469static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
470{
471 return atomic_read(&dev->dev_stats_ccnt);
472}
473
363static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 474static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
364 int index) 475 int index)
365{ 476{
366 atomic_inc(dev->dev_stat_values + index); 477 atomic_inc(dev->dev_stat_values + index);
367 dev->dev_stats_dirty = 1; 478 smp_mb__before_atomic();
479 atomic_inc(&dev->dev_stats_ccnt);
368} 480}
369 481
370static inline int btrfs_dev_stat_read(struct btrfs_device *dev, 482static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
@@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
379 int ret; 491 int ret;
380 492
381 ret = atomic_xchg(dev->dev_stat_values + index, 0); 493 ret = atomic_xchg(dev->dev_stat_values + index, 0);
382 dev->dev_stats_dirty = 1; 494 smp_mb__before_atomic();
495 atomic_inc(&dev->dev_stats_ccnt);
383 return ret; 496 return ret;
384} 497}
385 498
@@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
387 int index, unsigned long val) 500 int index, unsigned long val)
388{ 501{
389 atomic_set(dev->dev_stat_values + index, val); 502 atomic_set(dev->dev_stat_values + index, val);
390 dev->dev_stats_dirty = 1; 503 smp_mb__before_atomic();
504 atomic_inc(&dev->dev_stats_ccnt);
391} 505}
392 506
393static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, 507static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
@@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
395{ 509{
396 btrfs_dev_stat_set(dev, index, 0); 510 btrfs_dev_stat_set(dev, index, 0);
397} 511}
512
513void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
514void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
515 struct btrfs_transaction *transaction);
398#endif 516#endif
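Every stat mutator now pairs its data update with smp_mb__before_atomic() ahead of the ccnt increment, ordering the write to dev_stat_values before the counter bump. A commit thread that observes a nonzero ccnt is then expected to also observe the stat values that made it dirty; on the read side, the device_list_mutex serialization in btrfs_run_dev_stats() appears to supply the pairing ordering in practice. The writer-side contract in miniature:

    atomic_inc(dev->dev_stat_values + index);   /* 1: publish the data */
    smp_mb__before_atomic();                    /* order 1 before 2    */
    atomic_inc(&dev->dev_stats_ccnt);           /* 2: signal "dirty"   */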
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ad8328d797ea..dcf20131fbe4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
237 * first xattr that we find and walk forward 237 * first xattr that we find and walk forward
238 */ 238 */
239 key.objectid = btrfs_ino(inode); 239 key.objectid = btrfs_ino(inode);
240 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 240 key.type = BTRFS_XATTR_ITEM_KEY;
241 key.offset = 0; 241 key.offset = 0;
242 242
243 path = btrfs_alloc_path(); 243 path = btrfs_alloc_path();
@@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
273 /* check to make sure this item is what we want */ 273 /* check to make sure this item is what we want */
274 if (found_key.objectid != key.objectid) 274 if (found_key.objectid != key.objectid)
275 break; 275 break;
276 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) 276 if (found_key.type != BTRFS_XATTR_ITEM_KEY)
277 break; 277 break;
278 278
279 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 279 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b67d8fc81277..759fa4e2de8f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -33,8 +33,7 @@
33#include "compression.h" 33#include "compression.h"
34 34
35struct workspace { 35struct workspace {
36 z_stream inf_strm; 36 z_stream strm;
37 z_stream def_strm;
38 char *buf; 37 char *buf;
39 struct list_head list; 38 struct list_head list;
40}; 39};
@@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws)
43{ 42{
44 struct workspace *workspace = list_entry(ws, struct workspace, list); 43 struct workspace *workspace = list_entry(ws, struct workspace, list);
45 44
46 vfree(workspace->def_strm.workspace); 45 vfree(workspace->strm.workspace);
47 vfree(workspace->inf_strm.workspace);
48 kfree(workspace->buf); 46 kfree(workspace->buf);
49 kfree(workspace); 47 kfree(workspace);
50} 48}
@@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws)
52static struct list_head *zlib_alloc_workspace(void) 50static struct list_head *zlib_alloc_workspace(void)
53{ 51{
54 struct workspace *workspace; 52 struct workspace *workspace;
53 int workspacesize;
55 54
56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 55 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
57 if (!workspace) 56 if (!workspace)
58 return ERR_PTR(-ENOMEM); 57 return ERR_PTR(-ENOMEM);
59 58
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( 59 workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
61 MAX_WBITS, MAX_MEM_LEVEL)); 60 zlib_inflate_workspacesize());
62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->strm.workspace = vmalloc(workspacesize);
63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
64 if (!workspace->def_strm.workspace || 63 if (!workspace->strm.workspace || !workspace->buf)
65 !workspace->inf_strm.workspace || !workspace->buf)
66 goto fail; 64 goto fail;
67 65
68 INIT_LIST_HEAD(&workspace->list); 66 INIT_LIST_HEAD(&workspace->list);
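With compression and decompression never running concurrently on the same workspace, one z_stream suffices; the single backing allocation is sized for the larger of the two uses, saving the smaller allocation (the inflate workspace) per context. The sizing in isolation (sketch):

    int workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
                            zlib_inflate_workspacesize());
    workspace->strm.workspace = vmalloc(workspacesize);
    if (!workspace->strm.workspace)
            goto fail;   /* as in the function above */

Everything from here on is the mechanical def_strm/inf_strm -> strm rename through the compress and decompress loops.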
@@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws,
96 *total_out = 0; 94 *total_out = 0;
97 *total_in = 0; 95 *total_in = 0;
98 96
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 97 if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
100 printk(KERN_WARNING "BTRFS: deflateInit failed\n"); 98 printk(KERN_WARNING "BTRFS: deflateInit failed\n");
101 ret = -EIO; 99 ret = -EIO;
102 goto out; 100 goto out;
103 } 101 }
104 102
105 workspace->def_strm.total_in = 0; 103 workspace->strm.total_in = 0;
106 workspace->def_strm.total_out = 0; 104 workspace->strm.total_out = 0;
107 105
108 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 106 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
109 data_in = kmap(in_page); 107 data_in = kmap(in_page);
@@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws,
117 pages[0] = out_page; 115 pages[0] = out_page;
118 nr_pages = 1; 116 nr_pages = 1;
119 117
120 workspace->def_strm.next_in = data_in; 118 workspace->strm.next_in = data_in;
121 workspace->def_strm.next_out = cpage_out; 119 workspace->strm.next_out = cpage_out;
122 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 120 workspace->strm.avail_out = PAGE_CACHE_SIZE;
123 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 121 workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
124 122
125 while (workspace->def_strm.total_in < len) { 123 while (workspace->strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 124 ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 125 if (ret != Z_OK) {
128 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", 126 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
129 ret); 127 ret);
130 zlib_deflateEnd(&workspace->def_strm); 128 zlib_deflateEnd(&workspace->strm);
131 ret = -EIO; 129 ret = -EIO;
132 goto out; 130 goto out;
133 } 131 }
134 132
135 /* we're making it bigger, give up */ 133 /* we're making it bigger, give up */
136 if (workspace->def_strm.total_in > 8192 && 134 if (workspace->strm.total_in > 8192 &&
137 workspace->def_strm.total_in < 135 workspace->strm.total_in <
138 workspace->def_strm.total_out) { 136 workspace->strm.total_out) {
139 ret = -E2BIG; 137 ret = -E2BIG;
140 goto out; 138 goto out;
141 } 139 }
@@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws,
143 * before the total_in so we will pull in a new page for 141 * before the total_in so we will pull in a new page for
144 * the stream end if required 142 * the stream end if required
145 */ 143 */
146 if (workspace->def_strm.avail_out == 0) { 144 if (workspace->strm.avail_out == 0) {
147 kunmap(out_page); 145 kunmap(out_page);
148 if (nr_pages == nr_dest_pages) { 146 if (nr_pages == nr_dest_pages) {
149 out_page = NULL; 147 out_page = NULL;
@@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws,
158 cpage_out = kmap(out_page); 156 cpage_out = kmap(out_page);
159 pages[nr_pages] = out_page; 157 pages[nr_pages] = out_page;
160 nr_pages++; 158 nr_pages++;
161 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 159 workspace->strm.avail_out = PAGE_CACHE_SIZE;
162 workspace->def_strm.next_out = cpage_out; 160 workspace->strm.next_out = cpage_out;
163 } 161 }
164 /* we're all done */ 162 /* we're all done */
165 if (workspace->def_strm.total_in >= len) 163 if (workspace->strm.total_in >= len)
166 break; 164 break;
167 165
168 /* we've read in a full page, get a new one */ 166 /* we've read in a full page, get a new one */
169 if (workspace->def_strm.avail_in == 0) { 167 if (workspace->strm.avail_in == 0) {
170 if (workspace->def_strm.total_out > max_out) 168 if (workspace->strm.total_out > max_out)
171 break; 169 break;
172 170
173 bytes_left = len - workspace->def_strm.total_in; 171 bytes_left = len - workspace->strm.total_in;
174 kunmap(in_page); 172 kunmap(in_page);
175 page_cache_release(in_page); 173 page_cache_release(in_page);
176 174
@@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws,
178 in_page = find_get_page(mapping, 176 in_page = find_get_page(mapping,
179 start >> PAGE_CACHE_SHIFT); 177 start >> PAGE_CACHE_SHIFT);
180 data_in = kmap(in_page); 178 data_in = kmap(in_page);
181 workspace->def_strm.avail_in = min(bytes_left, 179 workspace->strm.avail_in = min(bytes_left,
182 PAGE_CACHE_SIZE); 180 PAGE_CACHE_SIZE);
183 workspace->def_strm.next_in = data_in; 181 workspace->strm.next_in = data_in;
184 } 182 }
185 } 183 }
186 workspace->def_strm.avail_in = 0; 184 workspace->strm.avail_in = 0;
187 ret = zlib_deflate(&workspace->def_strm, Z_FINISH); 185 ret = zlib_deflate(&workspace->strm, Z_FINISH);
188 zlib_deflateEnd(&workspace->def_strm); 186 zlib_deflateEnd(&workspace->strm);
189 187
190 if (ret != Z_STREAM_END) { 188 if (ret != Z_STREAM_END) {
191 ret = -EIO; 189 ret = -EIO;
192 goto out; 190 goto out;
193 } 191 }
194 192
195 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { 193 if (workspace->strm.total_out >= workspace->strm.total_in) {
196 ret = -E2BIG; 194 ret = -E2BIG;
197 goto out; 195 goto out;
198 } 196 }
199 197
200 ret = 0; 198 ret = 0;
201 *total_out = workspace->def_strm.total_out; 199 *total_out = workspace->strm.total_out;
202 *total_in = workspace->def_strm.total_in; 200 *total_in = workspace->strm.total_in;
203out: 201out:
204 *out_pages = nr_pages; 202 *out_pages = nr_pages;
205 if (out_page) 203 if (out_page)
@@ -225,19 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
225 size_t total_out = 0; 223 size_t total_out = 0;
226 unsigned long page_in_index = 0; 224 unsigned long page_in_index = 0;
227 unsigned long page_out_index = 0; 225 unsigned long page_out_index = 0;
228 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 226 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
229 PAGE_CACHE_SIZE;
230 unsigned long buf_start; 227 unsigned long buf_start;
231 unsigned long pg_offset; 228 unsigned long pg_offset;
232 229
233 data_in = kmap(pages_in[page_in_index]); 230 data_in = kmap(pages_in[page_in_index]);
234 workspace->inf_strm.next_in = data_in; 231 workspace->strm.next_in = data_in;
235 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); 232 workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
236 workspace->inf_strm.total_in = 0; 233 workspace->strm.total_in = 0;
237 234
238 workspace->inf_strm.total_out = 0; 235 workspace->strm.total_out = 0;
239 workspace->inf_strm.next_out = workspace->buf; 236 workspace->strm.next_out = workspace->buf;
240 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 237 workspace->strm.avail_out = PAGE_CACHE_SIZE;
241 pg_offset = 0; 238 pg_offset = 0;
242 239
243 /* If it's deflate, and it's got no preset dictionary, then 240 /* If it's deflate, and it's got no preset dictionary, then
@@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
247 !(((data_in[0]<<8) + data_in[1]) % 31)) { 244 !(((data_in[0]<<8) + data_in[1]) % 31)) {
248 245
249 wbits = -((data_in[0] >> 4) + 8); 246 wbits = -((data_in[0] >> 4) + 8);
250 workspace->inf_strm.next_in += 2; 247 workspace->strm.next_in += 2;
251 workspace->inf_strm.avail_in -= 2; 248 workspace->strm.avail_in -= 2;
252 } 249 }
253 250
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 251 if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
255 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 252 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
256 return -EIO; 253 return -EIO;
257 } 254 }
258 while (workspace->inf_strm.total_in < srclen) { 255 while (workspace->strm.total_in < srclen) {
259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 256 ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
260 if (ret != Z_OK && ret != Z_STREAM_END) 257 if (ret != Z_OK && ret != Z_STREAM_END)
261 break; 258 break;
262 259
263 buf_start = total_out; 260 buf_start = total_out;
264 total_out = workspace->inf_strm.total_out; 261 total_out = workspace->strm.total_out;
265 262
266 /* we didn't make progress in this inflate call, we're done */ 263 /* we didn't make progress in this inflate call, we're done */
267 if (buf_start == total_out) 264 if (buf_start == total_out)
@@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
276 goto done; 273 goto done;
277 } 274 }
278 275
279 workspace->inf_strm.next_out = workspace->buf; 276 workspace->strm.next_out = workspace->buf;
280 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 277 workspace->strm.avail_out = PAGE_CACHE_SIZE;
281 278
282 if (workspace->inf_strm.avail_in == 0) { 279 if (workspace->strm.avail_in == 0) {
283 unsigned long tmp; 280 unsigned long tmp;
284 kunmap(pages_in[page_in_index]); 281 kunmap(pages_in[page_in_index]);
285 page_in_index++; 282 page_in_index++;
@@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
288 break; 285 break;
289 } 286 }
290 data_in = kmap(pages_in[page_in_index]); 287 data_in = kmap(pages_in[page_in_index]);
291 workspace->inf_strm.next_in = data_in; 288 workspace->strm.next_in = data_in;
292 tmp = srclen - workspace->inf_strm.total_in; 289 tmp = srclen - workspace->strm.total_in;
293 workspace->inf_strm.avail_in = min(tmp, 290 workspace->strm.avail_in = min(tmp,
294 PAGE_CACHE_SIZE); 291 PAGE_CACHE_SIZE);
295 } 292 }
296 } 293 }
@@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
299 else 296 else
300 ret = 0; 297 ret = 0;
301done: 298done:
302 zlib_inflateEnd(&workspace->inf_strm); 299 zlib_inflateEnd(&workspace->strm);
303 if (data_in) 300 if (data_in)
304 kunmap(pages_in[page_in_index]); 301 kunmap(pages_in[page_in_index]);
305 return ret; 302 return ret;
@@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
317 unsigned long total_out = 0; 314 unsigned long total_out = 0;
318 char *kaddr; 315 char *kaddr;
319 316
320 workspace->inf_strm.next_in = data_in; 317 workspace->strm.next_in = data_in;
321 workspace->inf_strm.avail_in = srclen; 318 workspace->strm.avail_in = srclen;
322 workspace->inf_strm.total_in = 0; 319 workspace->strm.total_in = 0;
323 320
324 workspace->inf_strm.next_out = workspace->buf; 321 workspace->strm.next_out = workspace->buf;
325 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 322 workspace->strm.avail_out = PAGE_CACHE_SIZE;
326 workspace->inf_strm.total_out = 0; 323 workspace->strm.total_out = 0;
327 /* If it's deflate, and it's got no preset dictionary, then 324 /* If it's deflate, and it's got no preset dictionary, then
328 we can tell zlib to skip the adler32 check. */ 325 we can tell zlib to skip the adler32 check. */
329 if (srclen > 2 && !(data_in[1] & PRESET_DICT) && 326 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
@@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
331 !(((data_in[0]<<8) + data_in[1]) % 31)) { 328 !(((data_in[0]<<8) + data_in[1]) % 31)) {
332 329
333 wbits = -((data_in[0] >> 4) + 8); 330 wbits = -((data_in[0] >> 4) + 8);
334 workspace->inf_strm.next_in += 2; 331 workspace->strm.next_in += 2;
335 workspace->inf_strm.avail_in -= 2; 332 workspace->strm.avail_in -= 2;
336 } 333 }
337 334
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 335 if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
339 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 336 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
340 return -EIO; 337 return -EIO;
341 } 338 }
@@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
346 unsigned long bytes; 343 unsigned long bytes;
347 unsigned long pg_offset = 0; 344 unsigned long pg_offset = 0;
348 345
349 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 346 ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
350 if (ret != Z_OK && ret != Z_STREAM_END) 347 if (ret != Z_OK && ret != Z_STREAM_END)
351 break; 348 break;
352 349
353 buf_start = total_out; 350 buf_start = total_out;
354 total_out = workspace->inf_strm.total_out; 351 total_out = workspace->strm.total_out;
355 352
356 if (total_out == buf_start) { 353 if (total_out == buf_start) {
357 ret = -EIO; 354 ret = -EIO;
@@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
377 pg_offset += bytes; 374 pg_offset += bytes;
378 bytes_left -= bytes; 375 bytes_left -= bytes;
379next: 376next:
380 workspace->inf_strm.next_out = workspace->buf; 377 workspace->strm.next_out = workspace->buf;
381 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 378 workspace->strm.avail_out = PAGE_CACHE_SIZE;
382 } 379 }
383 380
384 if (ret != Z_STREAM_END && bytes_left != 0) 381 if (ret != Z_STREAM_END && bytes_left != 0)
@@ -386,7 +383,7 @@ next:
386 else 383 else
387 ret = 0; 384 ret = 0;
388 385
389 zlib_inflateEnd(&workspace->inf_strm); 386 zlib_inflateEnd(&workspace->strm);
390 return ret; 387 return ret;
391} 388}
392 389
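The zlib.c rework above folds the separate inf_strm/def_strm pair into a single strm whose workspace is sized as the max() of the deflate and inflate requirements, on the assumption that a given workspace only ever drives one direction at a time. The other idiom worth calling out is the raw-deflate probe repeated in both decompress paths: a valid RFC 1950 zlib header has (CMF<<8 | FLG) divisible by 31, FLG bit 0x20 marks a preset dictionary, and the high nibble of CMF encodes the window size, so passing -((CMF >> 4) + 8) as windowBits makes inflate consume raw deflate data and skip the adler32 check. A minimal userspace sketch of that probe (the helper name and test data are illustrative, not the kernel code):

#include <stdio.h>
#include <stdint.h>

#define PRESET_DICT 0x20  /* FLG bit 5: a preset dictionary follows the header */

/* Mirror of the check in btrfs zlib.c: validate the 2-byte zlib header
 * and derive the negative windowBits value that selects raw deflate. */
static int raw_inflate_wbits(const uint8_t *data, size_t len, int *wbits)
{
    if (len > 2 && !(data[1] & PRESET_DICT) &&
        ((data[0] << 8) + data[1]) % 31 == 0) {
        *wbits = -((data[0] >> 4) + 8);
        return 1;              /* caller may step past the 2 header bytes */
    }
    *wbits = 15;               /* MAX_WBITS: treat as a normal zlib stream */
    return 0;
}

int main(void)
{
    /* 0x78 0x9c is the most common zlib header: deflate, 32K window */
    const uint8_t hdr[] = { 0x78, 0x9c, 0x00 };
    int wbits;

    if (raw_inflate_wbits(hdr, sizeof(hdr), &wbits))
        printf("raw deflate, wbits=%d\n", wbits);   /* prints -15 */
    return 0;
}

On the common 0x78 0x9c header this prints wbits=-15, matching what the code above passes to zlib_inflateInit2() after advancing next_in by two bytes.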
diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111bbb8b..44c14a87750e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1022,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
1022 bh = page_buffers(page); 1022 bh = page_buffers(page);
1023 if (bh->b_size == size) { 1023 if (bh->b_size == size) {
1024 end_block = init_page_buffers(page, bdev, 1024 end_block = init_page_buffers(page, bdev,
1025 index << sizebits, size); 1025 (sector_t)index << sizebits,
1026 size);
1026 goto done; 1027 goto done;
1027 } 1028 }
1028 if (!try_to_free_buffers(page)) 1029 if (!try_to_free_buffers(page))
@@ -1043,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
1043 */ 1044 */
1044 spin_lock(&inode->i_mapping->private_lock); 1045 spin_lock(&inode->i_mapping->private_lock);
1045 link_dev_buffers(page, bh); 1046 link_dev_buffers(page, bh);
1046 end_block = init_page_buffers(page, bdev, index << sizebits, size); 1047 end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1048 size);
1047 spin_unlock(&inode->i_mapping->private_lock); 1049 spin_unlock(&inode->i_mapping->private_lock);
1048done: 1050done:
1049 ret = (block < end_block) ? 1 : -ENXIO; 1051 ret = (block < end_block) ? 1 : -ENXIO;
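The two grow_dev_page() hunks fix a 32-bit truncation: index is a pgoff_t (an unsigned long, 32 bits on 32-bit architectures), so index << sizebits wraps before the result is widened to sector_t, mis-addressing buffers near the top of large block devices. Casting to sector_t first makes the shift happen in 64 bits. A small userspace model of the difference, with uint32_t standing in for a 32-bit pgoff_t:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t index = 0x40000000u;   /* a legal page index on 32-bit */
    unsigned sizebits = 3;          /* e.g. 512-byte blocks, 4K pages */

    /* shift happens in 32 bits, wraps to 0, then is widened too late */
    uint64_t wrong = (uint64_t)(index << sizebits);
    /* widen first, then shift: the whole result survives */
    uint64_t right = (uint64_t)index << sizebits;

    printf("wrong=%llu right=%llu\n",
           (unsigned long long)wrong, (unsigned long long)right);
    return 0;
}

The un-cast shift wraps to 0 while the widened one yields sector 8589934592, which is exactly the class of mis-addressing the (sector_t) cast prevents.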
@@ -1251,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
1251 * a local interrupt disable for that. 1253 * a local interrupt disable for that.
1252 */ 1254 */
1253 1255
1254#define BH_LRU_SIZE 8 1256#define BH_LRU_SIZE 16
1255 1257
1256struct bh_lru { 1258struct bh_lru {
1257 struct buffer_head *bhs[BH_LRU_SIZE]; 1259 struct buffer_head *bhs[BH_LRU_SIZE];
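The hunk above doubles BH_LRU_SIZE from 8 to 16. struct bh_lru is a small per-CPU, move-to-front array of recently used buffer heads, so a larger array trades a few words of per-CPU memory for a better lookup hit rate. A toy sketch of the same shape, with ints standing in for buffer_head pointers (illustrative, not the kernel code):

#include <stdio.h>
#include <string.h>

#define LRU_SIZE 16   /* mirrors the new BH_LRU_SIZE */

/* Slot 0 is the most recently used entry; a hit shifts everything in
 * front of it down by one and reinstalls the entry at the front. */
struct lru {
    int slots[LRU_SIZE];
    int n;
};

static int lru_lookup(struct lru *l, int key)
{
    for (int i = 0; i < l->n; i++) {
        if (l->slots[i] == key) {
            memmove(&l->slots[1], &l->slots[0], i * sizeof(int));
            l->slots[0] = key;
            return 1;                       /* hit, now at the front */
        }
    }
    if (l->n < LRU_SIZE)
        l->n++;
    memmove(&l->slots[1], &l->slots[0], (l->n - 1) * sizeof(int));
    l->slots[0] = key;                      /* miss: install at front */
    return 0;
}

int main(void)
{
    struct lru l = { .n = 0 };
    lru_lookup(&l, 42);
    printf("hit=%d\n", lru_lookup(&l, 42)); /* prints hit=1 */
    return 0;
}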
@@ -2954,7 +2956,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2954 2956
2955/* 2957/*
2956 * This allows us to do IO even on the odd last sectors 2958 * This allows us to do IO even on the odd last sectors
2957 * of a device, even if the bh block size is some multiple 2959 * of a device, even if the block size is some multiple
2958 * of the physical sector size. 2960 * of the physical sector size.
2959 * 2961 *
2960 * We'll just truncate the bio to the size of the device, 2962 * We'll just truncate the bio to the size of the device,
@@ -2964,10 +2966,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2964 * errors, this only handles the "we need to be able to 2966 * errors, this only handles the "we need to be able to
2965 * do IO at the final sector" case. 2967 * do IO at the final sector" case.
2966 */ 2968 */
2967static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) 2969void guard_bio_eod(int rw, struct bio *bio)
2968{ 2970{
2969 sector_t maxsector; 2971 sector_t maxsector;
2970 unsigned bytes; 2972 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2973 unsigned truncated_bytes;
2971 2974
2972 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; 2975 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2973 if (!maxsector) 2976 if (!maxsector)
@@ -2982,23 +2985,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2982 return; 2985 return;
2983 2986
2984 maxsector -= bio->bi_iter.bi_sector; 2987 maxsector -= bio->bi_iter.bi_sector;
2985 bytes = bio->bi_iter.bi_size; 2988 if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2986 if (likely((bytes >> 9) <= maxsector))
2987 return; 2989 return;
2988 2990
2989 /* Uhhuh. We've got a bh that straddles the device size! */ 2991 /* Uhhuh. We've got a bio that straddles the device size! */
2990 bytes = maxsector << 9; 2992 truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2991 2993
2992 /* Truncate the bio.. */ 2994 /* Truncate the bio.. */
2993 bio->bi_iter.bi_size = bytes; 2995 bio->bi_iter.bi_size -= truncated_bytes;
2994 bio->bi_io_vec[0].bv_len = bytes; 2996 bvec->bv_len -= truncated_bytes;
2995 2997
2996 /* ..and clear the end of the buffer for reads */ 2998 /* ..and clear the end of the buffer for reads */
2997 if ((rw & RW_MASK) == READ) { 2999 if ((rw & RW_MASK) == READ) {
2998 void *kaddr = kmap_atomic(bh->b_page); 3000 zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
2999 memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); 3001 truncated_bytes);
3000 kunmap_atomic(kaddr);
3001 flush_dcache_page(bh->b_page);
3002 } 3002 }
3003} 3003}
3004 3004
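guard_bh_eod() becomes guard_bio_eod() and now works on the bio's last segment instead of assuming a single-buffer bio: it computes how many bytes of the bio hang past the end of the device, shrinks bi_size and the final bvec by that amount, and for reads zeroes the clipped tail with zero_user(). The arithmetic, modeled in plain userspace C:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    /* Example: a 4K bio starting 2 sectors before the end of the device */
    uint64_t dev_sectors = 1000;     /* device size in 512-byte sectors */
    uint64_t bi_sector   = 998;      /* bio start sector */
    uint32_t bi_size     = 4096;     /* bio length in bytes */

    uint64_t maxsector = dev_sectors - bi_sector;   /* sectors left: 2 */
    if ((bi_size >> 9) > maxsector) {
        uint32_t truncated = bi_size - (uint32_t)(maxsector << 9);
        bi_size -= truncated;        /* bio now covers only what exists */
        printf("truncated=%u new_size=%u\n", truncated, bi_size);
        /* a read would then zero the last 'truncated' bytes of the
         * final segment, as guard_bio_eod() does with zero_user() */
    }
    return 0;
}

With these numbers the bio is trimmed from 4096 to 1024 bytes, so the request never reads or writes past the final sector.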
@@ -3039,7 +3039,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3039 bio->bi_flags |= bio_flags; 3039 bio->bi_flags |= bio_flags;
3040 3040
3041 /* Take care of bh's that straddle the end of the device */ 3041 /* Take care of bh's that straddle the end of the device */
3042 guard_bh_eod(rw, bio, bh); 3042 guard_bio_eod(rw, bio);
3043 3043
3044 if (buffer_meta(bh)) 3044 if (buffer_meta(bh))
3045 rw |= REQ_META; 3045 rw |= REQ_META;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index d749731dc0ee..fbb08e97438d 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
50 cache->brun_percent < 100); 50 cache->brun_percent < 100);
51 51
52 if (*args) { 52 if (*args) {
53 pr_err("'bind' command doesn't take an argument"); 53 pr_err("'bind' command doesn't take an argument\n");
54 return -EINVAL; 54 return -EINVAL;
55 } 55 }
56 56
57 if (!cache->rootdirname) { 57 if (!cache->rootdirname) {
58 pr_err("No cache directory specified"); 58 pr_err("No cache directory specified\n");
59 return -EINVAL; 59 return -EINVAL;
60 } 60 }
61 61
62 /* don't permit already bound caches to be re-bound */ 62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) { 63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 pr_err("Cache already bound"); 64 pr_err("Cache already bound\n");
65 return -EBUSY; 65 return -EBUSY;
66 } 66 }
67 67
@@ -248,7 +248,7 @@ error_open_root:
248 kmem_cache_free(cachefiles_object_jar, fsdef); 248 kmem_cache_free(cachefiles_object_jar, fsdef);
249error_root_object: 249error_root_object:
250 cachefiles_end_secure(cache, saved_cred); 250 cachefiles_end_secure(cache, saved_cred);
251 pr_err("Failed to register: %d", ret); 251 pr_err("Failed to register: %d\n", ret);
252 return ret; 252 return ret;
253} 253}
254 254
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index b078d3081d6c..ce1b115dcc28 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -315,7 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file,
315static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, 315static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
316 char *args) 316 char *args)
317{ 317{
318 pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%"); 318 pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");
319 319
320 return -EINVAL; 320 return -EINVAL;
321} 321}
@@ -475,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
475 _enter(",%s", args); 475 _enter(",%s", args);
476 476
477 if (!*args) { 477 if (!*args) {
478 pr_err("Empty directory specified"); 478 pr_err("Empty directory specified\n");
479 return -EINVAL; 479 return -EINVAL;
480 } 480 }
481 481
482 if (cache->rootdirname) { 482 if (cache->rootdirname) {
483 pr_err("Second cache directory specified"); 483 pr_err("Second cache directory specified\n");
484 return -EEXIST; 484 return -EEXIST;
485 } 485 }
486 486
@@ -503,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
503 _enter(",%s", args); 503 _enter(",%s", args);
504 504
505 if (!*args) { 505 if (!*args) {
506 pr_err("Empty security context specified"); 506 pr_err("Empty security context specified\n");
507 return -EINVAL; 507 return -EINVAL;
508 } 508 }
509 509
510 if (cache->secctx) { 510 if (cache->secctx) {
511 pr_err("Second security context specified"); 511 pr_err("Second security context specified\n");
512 return -EINVAL; 512 return -EINVAL;
513 } 513 }
514 514
@@ -531,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
531 _enter(",%s", args); 531 _enter(",%s", args);
532 532
533 if (!*args) { 533 if (!*args) {
534 pr_err("Empty tag specified"); 534 pr_err("Empty tag specified\n");
535 return -EINVAL; 535 return -EINVAL;
536 } 536 }
537 537
@@ -562,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
562 goto inval; 562 goto inval;
563 563
564 if (!test_bit(CACHEFILES_READY, &cache->flags)) { 564 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
565 pr_err("cull applied to unready cache"); 565 pr_err("cull applied to unready cache\n");
566 return -EIO; 566 return -EIO;
567 } 567 }
568 568
569 if (test_bit(CACHEFILES_DEAD, &cache->flags)) { 569 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
570 pr_err("cull applied to dead cache"); 570 pr_err("cull applied to dead cache\n");
571 return -EIO; 571 return -EIO;
572 } 572 }
573 573
@@ -587,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
587 587
588notdir: 588notdir:
589 path_put(&path); 589 path_put(&path);
590 pr_err("cull command requires dirfd to be a directory"); 590 pr_err("cull command requires dirfd to be a directory\n");
591 return -ENOTDIR; 591 return -ENOTDIR;
592 592
593inval: 593inval:
594 pr_err("cull command requires dirfd and filename"); 594 pr_err("cull command requires dirfd and filename\n");
595 return -EINVAL; 595 return -EINVAL;
596} 596}
597 597
@@ -614,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
614 return 0; 614 return 0;
615 615
616inval: 616inval:
617 pr_err("debug command requires mask"); 617 pr_err("debug command requires mask\n");
618 return -EINVAL; 618 return -EINVAL;
619} 619}
620 620
@@ -634,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
634 goto inval; 634 goto inval;
635 635
636 if (!test_bit(CACHEFILES_READY, &cache->flags)) { 636 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
637 pr_err("inuse applied to unready cache"); 637 pr_err("inuse applied to unready cache\n");
638 return -EIO; 638 return -EIO;
639 } 639 }
640 640
641 if (test_bit(CACHEFILES_DEAD, &cache->flags)) { 641 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
642 pr_err("inuse applied to dead cache"); 642 pr_err("inuse applied to dead cache\n");
643 return -EIO; 643 return -EIO;
644 } 644 }
645 645
@@ -659,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
659 659
660notdir: 660notdir:
661 path_put(&path); 661 path_put(&path);
662 pr_err("inuse command requires dirfd to be a directory"); 662 pr_err("inuse command requires dirfd to be a directory\n");
663 return -ENOTDIR; 663 return -ENOTDIR;
664 664
665inval: 665inval:
666 pr_err("inuse command requires dirfd and filename"); 666 pr_err("inuse command requires dirfd and filename\n");
667 return -EINVAL; 667 return -EINVAL;
668} 668}
669 669
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 3d50998abf57..8c52472d2efa 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -255,7 +255,7 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
255 255
256#define cachefiles_io_error(___cache, FMT, ...) \ 256#define cachefiles_io_error(___cache, FMT, ...) \
257do { \ 257do { \
258 pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ 258 pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
259 fscache_io_error(&(___cache)->cache); \ 259 fscache_io_error(&(___cache)->cache); \
260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ 260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
261} while (0) 261} while (0)
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 180edfb45f66..711f13d8c2de 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -84,7 +84,7 @@ error_proc:
84error_object_jar: 84error_object_jar:
85 misc_deregister(&cachefiles_dev); 85 misc_deregister(&cachefiles_dev);
86error_dev: 86error_dev:
87 pr_err("failed to register: %d", ret); 87 pr_err("failed to register: %d\n", ret);
88 return ret; 88 return ret;
89} 89}
90 90
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 5bf2b41e66d3..dad7d9542a24 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -543,7 +543,7 @@ lookup_again:
543 next, next->d_inode, next->d_inode->i_ino); 543 next, next->d_inode, next->d_inode->i_ino);
544 544
545 } else if (!S_ISDIR(next->d_inode->i_mode)) { 545 } else if (!S_ISDIR(next->d_inode->i_mode)) {
546 pr_err("inode %lu is not a directory", 546 pr_err("inode %lu is not a directory\n",
547 next->d_inode->i_ino); 547 next->d_inode->i_ino);
548 ret = -ENOBUFS; 548 ret = -ENOBUFS;
549 goto error; 549 goto error;
@@ -574,7 +574,7 @@ lookup_again:
574 } else if (!S_ISDIR(next->d_inode->i_mode) && 574 } else if (!S_ISDIR(next->d_inode->i_mode) &&
575 !S_ISREG(next->d_inode->i_mode) 575 !S_ISREG(next->d_inode->i_mode)
576 ) { 576 ) {
577 pr_err("inode %lu is not a file or directory", 577 pr_err("inode %lu is not a file or directory\n",
578 next->d_inode->i_ino); 578 next->d_inode->i_ino);
579 ret = -ENOBUFS; 579 ret = -ENOBUFS;
580 goto error; 580 goto error;
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
768 ASSERT(subdir->d_inode); 768 ASSERT(subdir->d_inode);
769 769
770 if (!S_ISDIR(subdir->d_inode->i_mode)) { 770 if (!S_ISDIR(subdir->d_inode->i_mode)) {
771 pr_err("%s is not a directory", dirname); 771 pr_err("%s is not a directory\n", dirname);
772 ret = -EIO; 772 ret = -EIO;
773 goto check_error; 773 goto check_error;
774 } 774 }
@@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
779 !subdir->d_inode->i_op->lookup || 779 !subdir->d_inode->i_op->lookup ||
780 !subdir->d_inode->i_op->mkdir || 780 !subdir->d_inode->i_op->mkdir ||
781 !subdir->d_inode->i_op->create || 781 !subdir->d_inode->i_op->create ||
782 !subdir->d_inode->i_op->rename || 782 (!subdir->d_inode->i_op->rename &&
783 !subdir->d_inode->i_op->rename2) ||
783 !subdir->d_inode->i_op->rmdir || 784 !subdir->d_inode->i_op->rmdir ||
784 !subdir->d_inode->i_op->unlink) 785 !subdir->d_inode->i_op->unlink)
785 goto check_error; 786 goto check_error;
@@ -795,13 +796,13 @@ check_error:
795mkdir_error: 796mkdir_error:
796 mutex_unlock(&dir->d_inode->i_mutex); 797 mutex_unlock(&dir->d_inode->i_mutex);
797 dput(subdir); 798 dput(subdir);
798 pr_err("mkdir %s failed with error %d", dirname, ret); 799 pr_err("mkdir %s failed with error %d\n", dirname, ret);
799 return ERR_PTR(ret); 800 return ERR_PTR(ret);
800 801
801lookup_error: 802lookup_error:
802 mutex_unlock(&dir->d_inode->i_mutex); 803 mutex_unlock(&dir->d_inode->i_mutex);
803 ret = PTR_ERR(subdir); 804 ret = PTR_ERR(subdir);
804 pr_err("Lookup %s failed with error %d", dirname, ret); 805 pr_err("Lookup %s failed with error %d\n", dirname, ret);
805 return ERR_PTR(ret); 806 return ERR_PTR(ret);
806 807
807nomem_d_alloc: 808nomem_d_alloc:
@@ -891,7 +892,7 @@ lookup_error:
891 if (ret == -EIO) { 892 if (ret == -EIO) {
892 cachefiles_io_error(cache, "Lookup failed"); 893 cachefiles_io_error(cache, "Lookup failed");
893 } else if (ret != -ENOMEM) { 894 } else if (ret != -ENOMEM) {
894 pr_err("Internal error: %d", ret); 895 pr_err("Internal error: %d\n", ret);
895 ret = -EIO; 896 ret = -EIO;
896 } 897 }
897 898
@@ -950,7 +951,7 @@ error:
950 } 951 }
951 952
952 if (ret != -ENOMEM) { 953 if (ret != -ENOMEM) {
953 pr_err("Internal error: %d", ret); 954 pr_err("Internal error: %d\n", ret);
954 ret = -EIO; 955 ret = -EIO;
955 } 956 }
956 957
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 4b1fb5ca65b8..25e745b8eb1b 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
151 struct cachefiles_one_read *monitor; 151 struct cachefiles_one_read *monitor;
152 struct cachefiles_object *object; 152 struct cachefiles_object *object;
153 struct fscache_retrieval *op; 153 struct fscache_retrieval *op;
154 struct pagevec pagevec;
155 int error, max; 154 int error, max;
156 155
157 op = container_of(_op, struct fscache_retrieval, op); 156 op = container_of(_op, struct fscache_retrieval, op);
@@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
160 159
161 _enter("{ino=%lu}", object->backer->d_inode->i_ino); 160 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
162 161
163 pagevec_init(&pagevec, 0);
164
165 max = 8; 162 max = 8;
166 spin_lock_irq(&object->work_lock); 163 spin_lock_irq(&object->work_lock);
167 164
@@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
396{ 393{
397 struct cachefiles_object *object; 394 struct cachefiles_object *object;
398 struct cachefiles_cache *cache; 395 struct cachefiles_cache *cache;
399 struct pagevec pagevec;
400 struct inode *inode; 396 struct inode *inode;
401 sector_t block0, block; 397 sector_t block0, block;
402 unsigned shift; 398 unsigned shift;
@@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
427 op->op.flags |= FSCACHE_OP_ASYNC; 423 op->op.flags |= FSCACHE_OP_ASYNC;
428 op->op.processor = cachefiles_read_copier; 424 op->op.processor = cachefiles_read_copier;
429 425
430 pagevec_init(&pagevec, 0);
431
432 /* we assume the absence or presence of the first block is a good 426 /* we assume the absence or presence of the first block is a good
433 * enough indication for the page as a whole 427 * enough indication for the page as a whole
434 * - TODO: don't use bmap() for this as it is _not_ actually good 428 * - TODO: don't use bmap() for this as it is _not_ actually good
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 1ad51ffbb275..acbc1f094fb1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
51 } 51 }
52 52
53 if (ret != -EEXIST) { 53 if (ret != -EEXIST) {
54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)", 54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
55 dentry->d_name.len, dentry->d_name.len, 55 dentry->d_name.len, dentry->d_name.len,
56 dentry->d_name.name, dentry->d_inode->i_ino, 56 dentry->d_name.name, dentry->d_inode->i_ino,
57 -ret); 57 -ret);
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
64 if (ret == -ERANGE) 64 if (ret == -ERANGE)
65 goto bad_type_length; 65 goto bad_type_length;
66 66
67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)", 67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
68 dentry->d_name.len, dentry->d_name.len, 68 dentry->d_name.len, dentry->d_name.len,
69 dentry->d_name.name, dentry->d_inode->i_ino, 69 dentry->d_name.name, dentry->d_inode->i_ino,
70 -ret); 70 -ret);
@@ -85,14 +85,14 @@ error:
85 return ret; 85 return ret;
86 86
87bad_type_length: 87bad_type_length:
88 pr_err("Cache object %lu type xattr length incorrect", 88 pr_err("Cache object %lu type xattr length incorrect\n",
89 dentry->d_inode->i_ino); 89 dentry->d_inode->i_ino);
90 ret = -EIO; 90 ret = -EIO;
91 goto error; 91 goto error;
92 92
93bad_type: 93bad_type:
94 xtype[2] = 0; 94 xtype[2] = 0;
95 pr_err("Cache object %*.*s [%lu] type %s not %s", 95 pr_err("Cache object %*.*s [%lu] type %s not %s\n",
96 dentry->d_name.len, dentry->d_name.len, 96 dentry->d_name.len, dentry->d_name.len,
97 dentry->d_name.name, dentry->d_inode->i_ino, 97 dentry->d_name.name, dentry->d_inode->i_ino,
98 xtype, type); 98 xtype, type);
@@ -293,7 +293,7 @@ error:
293 return ret; 293 return ret;
294 294
295bad_type_length: 295bad_type_length:
296 pr_err("Cache object %lu xattr length incorrect", 296 pr_err("Cache object %lu xattr length incorrect\n",
297 dentry->d_inode->i_ino); 297 dentry->d_inode->i_ino);
298 ret = -EIO; 298 ret = -EIO;
299 goto error; 299 goto error;
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 603f18a65c12..a2172f3f69e3 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -22,6 +22,11 @@ config CIFS
22 support for OS/2 and Windows ME and similar servers is provided as 22 support for OS/2 and Windows ME and similar servers is provided as
23 well. 23 well.
24 24
 25 The module also provides optional support for the follow-on
26 protocols for CIFS including SMB3, which enables
27 useful performance and security features (see the description
28 of CONFIG_CIFS_SMB2).
29
25 The cifs module provides an advanced network file system 30 The cifs module provides an advanced network file system
26 client for mounting to CIFS compliant servers. It includes 31 client for mounting to CIFS compliant servers. It includes
27 support for DFS (hierarchical name space), secure per-user 32 support for DFS (hierarchical name space), secure per-user
@@ -121,7 +126,8 @@ config CIFS_ACL
121 depends on CIFS_XATTR && KEYS 126 depends on CIFS_XATTR && KEYS
122 help 127 help
123 Allows fetching CIFS/NTFS ACL from the server. The DACL blob 128 Allows fetching CIFS/NTFS ACL from the server. The DACL blob
124 is handed over to the application/caller. 129 is handed over to the application/caller. See the man
130 page for getcifsacl for more information.
125 131
126config CIFS_DEBUG 132config CIFS_DEBUG
127 bool "Enable CIFS debugging routines" 133 bool "Enable CIFS debugging routines"
@@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT
162 Allows NFS server to export a CIFS mounted share (nfsd over cifs) 168 Allows NFS server to export a CIFS mounted share (nfsd over cifs)
163 169
164config CIFS_SMB2 170config CIFS_SMB2
165 bool "SMB2 network file system support" 171 bool "SMB2 and SMB3 network file system support"
166 depends on CIFS && INET 172 depends on CIFS && INET
167 select NLS 173 select NLS
168 select KEYS 174 select KEYS
@@ -170,16 +176,21 @@ config CIFS_SMB2
170 select DNS_RESOLVER 176 select DNS_RESOLVER
171 177
172 help 178 help
173 This enables experimental support for the SMB2 (Server Message Block 179 This enables support for the Server Message Block version 2
174 version 2) protocol. The SMB2 protocol is the successor to the 180 family of protocols, including SMB3. SMB3 support is
175 popular CIFS and SMB network file sharing protocols. SMB2 is the 181 enabled on mount by specifying "vers=3.0" in the mount
176 native file sharing mechanism for recent versions of Windows 182 options. These protocols are the successors to the popular
177 operating systems (since Vista). SMB2 enablement will eventually 183 CIFS and SMB network file sharing protocols. SMB3 is the
178 allow users better performance, security and features, than would be 184 native file sharing mechanism for the more recent
179 possible with cifs. Note that smb2 mount options also are simpler 185 versions of Windows (Windows 8 and Windows 2012 and
 180 (compared to cifs) due to protocol improvements. 186 later); Samba and many other servers support SMB3 well.
181 187 In general SMB3 enables better performance, security
 182 Unless you are a developer or tester, say N. 188 and features than would be possible with CIFS (note that
189 when mounting to Samba, due to the CIFS POSIX extensions,
190 CIFS mounts can provide slightly better POSIX compatibility
191 than SMB3 mounts do though). Note that SMB2/SMB3 mount
192 options are also slightly simpler (compared to CIFS) due
193 to protocol improvements.
183 194
184config CIFS_FSCACHE 195config CIFS_FSCACHE
185 bool "Provide CIFS client caching support" 196 bool "Provide CIFS client caching support"
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 85c70d5969ac..9d7996e8e793 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -207,6 +207,19 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
207 return 0; 207 return 0;
208} 208}
209 209
210static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
211{
212 struct super_block *sb = file->f_path.dentry->d_sb;
213 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
214 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
215 struct TCP_Server_Info *server = tcon->ses->server;
216
217 if (server->ops->fallocate)
218 return server->ops->fallocate(file, tcon, mode, off, len);
219
220 return -EOPNOTSUPP;
221}
222
210static int cifs_permission(struct inode *inode, int mask) 223static int cifs_permission(struct inode *inode, int mask)
211{ 224{
212 struct cifs_sb_info *cifs_sb; 225 struct cifs_sb_info *cifs_sb;
@@ -813,8 +826,9 @@ cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv
813 if (!(S_ISREG(inode->i_mode))) 826 if (!(S_ISREG(inode->i_mode)))
814 return -EINVAL; 827 return -EINVAL;
815 828
816 /* check if file is oplocked */ 829 /* Check if file is oplocked if this is request for new lease */
817 if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || 830 if (arg == F_UNLCK ||
831 ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
818 ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) 832 ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
819 return generic_setlease(file, arg, lease, priv); 833 return generic_setlease(file, arg, lease, priv);
820 else if (tlink_tcon(cfile->tlink)->local_lease && 834 else if (tlink_tcon(cfile->tlink)->local_lease &&
@@ -909,6 +923,7 @@ const struct file_operations cifs_file_ops = {
909 .unlocked_ioctl = cifs_ioctl, 923 .unlocked_ioctl = cifs_ioctl,
910#endif /* CONFIG_CIFS_POSIX */ 924#endif /* CONFIG_CIFS_POSIX */
911 .setlease = cifs_setlease, 925 .setlease = cifs_setlease,
926 .fallocate = cifs_fallocate,
912}; 927};
913 928
914const struct file_operations cifs_file_strict_ops = { 929const struct file_operations cifs_file_strict_ops = {
@@ -928,6 +943,7 @@ const struct file_operations cifs_file_strict_ops = {
928 .unlocked_ioctl = cifs_ioctl, 943 .unlocked_ioctl = cifs_ioctl,
929#endif /* CONFIG_CIFS_POSIX */ 944#endif /* CONFIG_CIFS_POSIX */
930 .setlease = cifs_setlease, 945 .setlease = cifs_setlease,
946 .fallocate = cifs_fallocate,
931}; 947};
932 948
933const struct file_operations cifs_file_direct_ops = { 949const struct file_operations cifs_file_direct_ops = {
@@ -948,6 +964,7 @@ const struct file_operations cifs_file_direct_ops = {
948#endif /* CONFIG_CIFS_POSIX */ 964#endif /* CONFIG_CIFS_POSIX */
949 .llseek = cifs_llseek, 965 .llseek = cifs_llseek,
950 .setlease = cifs_setlease, 966 .setlease = cifs_setlease,
967 .fallocate = cifs_fallocate,
951}; 968};
952 969
953const struct file_operations cifs_file_nobrl_ops = { 970const struct file_operations cifs_file_nobrl_ops = {
@@ -966,6 +983,7 @@ const struct file_operations cifs_file_nobrl_ops = {
966 .unlocked_ioctl = cifs_ioctl, 983 .unlocked_ioctl = cifs_ioctl,
967#endif /* CONFIG_CIFS_POSIX */ 984#endif /* CONFIG_CIFS_POSIX */
968 .setlease = cifs_setlease, 985 .setlease = cifs_setlease,
986 .fallocate = cifs_fallocate,
969}; 987};
970 988
971const struct file_operations cifs_file_strict_nobrl_ops = { 989const struct file_operations cifs_file_strict_nobrl_ops = {
@@ -984,6 +1002,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
984 .unlocked_ioctl = cifs_ioctl, 1002 .unlocked_ioctl = cifs_ioctl,
985#endif /* CONFIG_CIFS_POSIX */ 1003#endif /* CONFIG_CIFS_POSIX */
986 .setlease = cifs_setlease, 1004 .setlease = cifs_setlease,
1005 .fallocate = cifs_fallocate,
987}; 1006};
988 1007
989const struct file_operations cifs_file_direct_nobrl_ops = { 1008const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -1003,6 +1022,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
1003#endif /* CONFIG_CIFS_POSIX */ 1022#endif /* CONFIG_CIFS_POSIX */
1004 .llseek = cifs_llseek, 1023 .llseek = cifs_llseek,
1005 .setlease = cifs_setlease, 1024 .setlease = cifs_setlease,
1025 .fallocate = cifs_fallocate,
1006}; 1026};
1007 1027
1008const struct file_operations cifs_dir_ops = { 1028const struct file_operations cifs_dir_ops = {
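The new cifs_fallocate() above is a thin dispatcher: it resolves the mount's master tcon and calls the negotiated dialect's ->fallocate method, returning -EOPNOTSUPP when the ops table has none; in this series the SMB3 ops table is the one expected to provide an implementation. From userspace it is reached through the ordinary syscall. A hedged usage sketch (the mount path is hypothetical; the mount must negotiate a dialect whose ops table implements fallocate, e.g. vers=3.0):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* Hypothetical file on an SMB3 mount; on older dialects the new
     * cifs_fallocate() hook fails with EOPNOTSUPP. */
    int fd = open("/mnt/smb3/testfile", O_WRONLY | O_CREAT, 0644);
    if (fd < 0)
        return 1;

    /* Preallocate 1 MiB without changing the visible file size */
    if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0)
        fprintf(stderr, "fallocate: %s\n", strerror(errno));

    close(fd);
    return 0;
}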
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b0fafa499505..002e0c173939 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
136extern const struct export_operations cifs_export_ops; 136extern const struct export_operations cifs_export_ops;
137#endif /* CONFIG_CIFS_NFSD_EXPORT */ 137#endif /* CONFIG_CIFS_NFSD_EXPORT */
138 138
139#define CIFS_VERSION "2.04" 139#define CIFS_VERSION "2.05"
140#endif /* _CIFSFS_H */ 140#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0012e1e291d4..25b8392bfdd2 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -70,11 +70,6 @@
70#define SERVER_NAME_LENGTH 40 70#define SERVER_NAME_LENGTH 40
71#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) 71#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
72 72
73/* used to define string lengths for reversing unicode strings */
74/* (256+1)*2 = 514 */
75/* (max path length + 1 for null) * 2 for unicode */
76#define MAX_NAME 514
77
78/* SMB echo "timeout" -- FIXME: tunable? */ 73/* SMB echo "timeout" -- FIXME: tunable? */
79#define SMB_ECHO_INTERVAL (60 * HZ) 74#define SMB_ECHO_INTERVAL (60 * HZ)
80 75
@@ -409,6 +404,10 @@ struct smb_version_operations {
409 /* get mtu credits */ 404 /* get mtu credits */
410 int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int, 405 int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
411 unsigned int *, unsigned int *); 406 unsigned int *, unsigned int *);
407 /* check if we need to issue closedir */
408 bool (*dir_needs_close)(struct cifsFileInfo *);
409 long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
410 loff_t);
412}; 411};
413 412
414struct smb_version_values { 413struct smb_version_values {
@@ -883,6 +882,7 @@ struct cifs_tcon {
883 for this mount even if server would support */ 882 for this mount even if server would support */
884 bool local_lease:1; /* check leases (only) on local system not remote */ 883 bool local_lease:1; /* check leases (only) on local system not remote */
885 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ 884 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
885 bool broken_sparse_sup; /* if server or share does not support sparse */
886 bool need_reconnect:1; /* connection reset, tid now invalid */ 886 bool need_reconnect:1; /* connection reset, tid now invalid */
887#ifdef CONFIG_CIFS_SMB2 887#ifdef CONFIG_CIFS_SMB2
888 bool print:1; /* set if connection to printer share */ 888 bool print:1; /* set if connection to printer share */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 33df36ef9d52..5f9822ac0245 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2253,6 +2253,29 @@ typedef struct {
2253/* minimum includes first three fields, and empty FS Name */ 2253/* minimum includes first three fields, and empty FS Name */
2254#define MIN_FS_ATTR_INFO_SIZE 12 2254#define MIN_FS_ATTR_INFO_SIZE 12
2255 2255
2256
2257/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
2258#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000
2259#define FILE_SUPPORTS_USN_JOURNAL 0x02000000
2260#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000
2261#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000
2262#define FILE_SUPPORTS_HARD_LINKS 0x00400000
2263#define FILE_SUPPORTS_TRANSACTIONS 0x00200000
2264#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000
2265#define FILE_READ_ONLY_VOLUME 0x00080000
2266#define FILE_NAMED_STREAMS 0x00040000
2267#define FILE_SUPPORTS_ENCRYPTION 0x00020000
2268#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
2269#define FILE_VOLUME_IS_COMPRESSED 0x00008000
2270#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
2271#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
2272#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
2273#define FILE_VOLUME_QUOTAS 0x00000020
2274#define FILE_FILE_COMPRESSION 0x00000010
2275#define FILE_PERSISTENT_ACLS 0x00000008
2276#define FILE_UNICODE_ON_DISK 0x00000004
2277#define FILE_CASE_PRESERVED_NAMES 0x00000002
2278#define FILE_CASE_SENSITIVE_SEARCH 0x00000001
2256typedef struct { 2279typedef struct {
2257 __le32 Attributes; 2280 __le32 Attributes;
2258 __le32 MaxPathNameComponentLength; 2281 __le32 MaxPathNameComponentLength;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 03ed8a09581c..36ca2045009b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1600,6 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1600 tmp_end++; 1600 tmp_end++;
1601 if (!(tmp_end < end && tmp_end[1] == delim)) { 1601 if (!(tmp_end < end && tmp_end[1] == delim)) {
1602 /* No it is not. Set the password to NULL */ 1602 /* No it is not. Set the password to NULL */
1603 kfree(vol->password);
1603 vol->password = NULL; 1604 vol->password = NULL;
1604 break; 1605 break;
1605 } 1606 }
@@ -1637,6 +1638,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1637 options = end; 1638 options = end;
1638 } 1639 }
1639 1640
1641 kfree(vol->password);
1640 /* Now build new password string */ 1642 /* Now build new password string */
1641 temp_len = strlen(value); 1643 temp_len = strlen(value);
1642 vol->password = kzalloc(temp_len+1, GFP_KERNEL); 1644 vol->password = kzalloc(temp_len+1, GFP_KERNEL);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3db0c5fd9a11..6cbd9c688cfe 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
497 goto out; 497 goto out;
498 } 498 }
499 499
500 if (file->f_flags & O_DIRECT &&
501 CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
502 if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
503 file->f_op = &cifs_file_direct_nobrl_ops;
504 else
505 file->f_op = &cifs_file_direct_ops;
506 }
507
500 file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); 508 file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
501 if (file_info == NULL) { 509 if (file_info == NULL) {
502 if (server->ops->close) 510 if (server->ops->close)
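This hunk and the matching one in fs/cifs/file.c below swap the file's f_op at open time: when the file was opened O_DIRECT on a mount with strict cache semantics, the direct (uncached) operations table is installed, with the nobrl variant chosen when byte-range locking is disabled. The shape of that open-time dispatch, reduced to a toy with illustrative names:

#include <stdio.h>

struct file_ops { const char *name; };

static const struct file_ops direct_ops       = { "direct" };
static const struct file_ops direct_nobrl_ops = { "direct_nobrl" };
static const struct file_ops cached_ops       = { "cached" };

#define O_DIRECT_FLAG 1   /* stand-ins for O_DIRECT and the mount flags */
#define MNT_STRICT_IO 2
#define MNT_NO_BRL    4

/* Pick the ops table once, at open time, from file and mount flags */
static const struct file_ops *pick_ops(int fflags, int mntflags)
{
    if ((fflags & O_DIRECT_FLAG) && (mntflags & MNT_STRICT_IO))
        return (mntflags & MNT_NO_BRL) ? &direct_nobrl_ops : &direct_ops;
    return &cached_ops;
}

int main(void)
{
    printf("%s\n", pick_ops(O_DIRECT_FLAG, MNT_STRICT_IO | MNT_NO_BRL)->name);
    return 0;   /* prints direct_nobrl */
}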
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 4ab2f79ffa7a..5f29354b072a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file)
467 cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", 467 cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
468 inode, file->f_flags, full_path); 468 inode, file->f_flags, full_path);
469 469
470 if (file->f_flags & O_DIRECT &&
471 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
472 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
473 file->f_op = &cifs_file_direct_nobrl_ops;
474 else
475 file->f_op = &cifs_file_direct_ops;
476 }
477
470 if (server->oplocks) 478 if (server->oplocks)
471 oplock = REQ_OPLOCK; 479 oplock = REQ_OPLOCK;
472 else 480 else
@@ -762,7 +770,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
762 770
763 cifs_dbg(FYI, "Freeing private data in close dir\n"); 771 cifs_dbg(FYI, "Freeing private data in close dir\n");
764 spin_lock(&cifs_file_list_lock); 772 spin_lock(&cifs_file_list_lock);
765 if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { 773 if (server->ops->dir_needs_close(cfile)) {
766 cfile->invalidHandle = true; 774 cfile->invalidHandle = true;
767 spin_unlock(&cifs_file_list_lock); 775 spin_unlock(&cifs_file_list_lock);
768 if (server->ops->close_dir) 776 if (server->ops->close_dir)
@@ -3560,15 +3568,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3560 lru_cache_add_file(page); 3568 lru_cache_add_file(page);
3561 unlock_page(page); 3569 unlock_page(page);
3562 page_cache_release(page); 3570 page_cache_release(page);
3563 if (rc == -EAGAIN)
3564 list_add_tail(&page->lru, &tmplist);
3565 } 3571 }
3572 /* Fallback to the readpage in error/reconnect cases */
3566 kref_put(&rdata->refcount, cifs_readdata_release); 3573 kref_put(&rdata->refcount, cifs_readdata_release);
3567 if (rc == -EAGAIN) {
3568 /* Re-add pages to the page_list and retry */
3569 list_splice(&tmplist, page_list);
3570 continue;
3571 }
3572 break; 3574 break;
3573 } 3575 }
3574 3576
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 426d6c6ad8bf..7899a40465b3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1720,13 +1720,22 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
1720unlink_target: 1720unlink_target:
1721 /* Try unlinking the target dentry if it's not negative */ 1721 /* Try unlinking the target dentry if it's not negative */
1722 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { 1722 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
1723 tmprc = cifs_unlink(target_dir, target_dentry); 1723 if (d_is_dir(target_dentry))
1724 tmprc = cifs_rmdir(target_dir, target_dentry);
1725 else
1726 tmprc = cifs_unlink(target_dir, target_dentry);
1724 if (tmprc) 1727 if (tmprc)
1725 goto cifs_rename_exit; 1728 goto cifs_rename_exit;
1726 rc = cifs_do_rename(xid, source_dentry, from_name, 1729 rc = cifs_do_rename(xid, source_dentry, from_name,
1727 target_dentry, to_name); 1730 target_dentry, to_name);
1728 } 1731 }
1729 1732
1733 /* force revalidate to go get info when needed */
1734 CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
1735
1736 source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
1737 target_dir->i_mtime = current_fs_time(source_dir->i_sb);
1738
1730cifs_rename_exit: 1739cifs_rename_exit:
1731 kfree(info_buf_source); 1740 kfree(info_buf_source);
1732 kfree(from_name); 1741 kfree(from_name);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 68559fd557fb..5657416d3483 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
213 if (rc) 213 if (rc)
214 goto out; 214 goto out;
215 215
216 rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, 216 if (tcon->ses->server->ops->create_mf_symlink)
217 fromName, buf, &bytes_written); 217 rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
218 cifs_sb, fromName, buf, &bytes_written);
219 else
220 rc = -EOPNOTSUPP;
221
218 if (rc) 222 if (rc)
219 goto out; 223 goto out;
220 224
@@ -339,9 +343,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
339 if (rc) 343 if (rc)
340 return rc; 344 return rc;
341 345
342 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) 346 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
347 rc = -ENOENT;
343 /* it's not a symlink */ 348 /* it's not a symlink */
344 goto out; 349 goto out;
350 }
345 351
346 io_parms.netfid = fid.netfid; 352 io_parms.netfid = fid.netfid;
347 io_parms.pid = current->tgid; 353 io_parms.pid = current->tgid;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 81340c6253eb..b7415d596dbd 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -574,13 +574,6 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
574 cinode->oplock = 0; 574 cinode->oplock = 0;
575} 575}
576 576
577static int
578cifs_oplock_break_wait(void *unused)
579{
580 schedule();
581 return signal_pending(current) ? -ERESTARTSYS : 0;
582}
583
584/* 577/*
585 * We wait for oplock breaks to be processed before we attempt to perform 578 * We wait for oplock breaks to be processed before we attempt to perform
586 * writes. 579 * writes.
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6834b9c3bec1..b333ff60781d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc)
925 /* BB what about the timezone? BB */ 925 /* BB what about the timezone? BB */
926 926
927 /* Subtract the NTFS time offset, then convert to 1s intervals. */ 927 /* Subtract the NTFS time offset, then convert to 1s intervals. */
928 u64 t; 928 s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
929
930 /*
931 * Unfortunately can not use normal 64 bit division on 32 bit arch, but
932 * the alternative, do_div, does not work with negative numbers so have
933 * to special case them
934 */
935 if (t < 0) {
936 t = -t;
937 ts.tv_nsec = (long)(do_div(t, 10000000) * 100);
938 ts.tv_nsec = -ts.tv_nsec;
939 ts.tv_sec = -t;
940 } else {
941 ts.tv_nsec = (long)do_div(t, 10000000) * 100;
942 ts.tv_sec = t;
943 }
929 944
930 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
931 ts.tv_nsec = do_div(t, 10000000) * 100;
932 ts.tv_sec = t;
933 return ts; 945 return ts;
934} 946}
935 947
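cifs_NTtimeToUnix() converts NT time, 100 ns ticks since 1601-01-01, to a Unix timespec by subtracting the epoch offset and dividing by 10^7. The rewrite makes t signed and special-cases t < 0 because do_div(), the only 64-by-32 division helper usable on 32-bit kernels, takes an unsigned dividend, so pre-1970 timestamps previously came out garbled. The same conversion in portable userspace C, where signed 64-bit division is available directly:

#include <stdio.h>
#include <stdint.h>

/* Seconds between 1601-01-01 and 1970-01-01, scaled to 100 ns ticks */
#define NTFS_TIME_OFFSET ((int64_t)11644473600 * 10000000)

struct ts { int64_t tv_sec; long tv_nsec; };

static struct ts nt_to_unix(int64_t ntutc)
{
    int64_t t = ntutc - NTFS_TIME_OFFSET;
    struct ts ts;

    if (t < 0) {                 /* timestamp before the Unix epoch */
        t = -t;
        ts.tv_nsec = -(long)((t % 10000000) * 100);
        ts.tv_sec  = -(t / 10000000);
    } else {
        ts.tv_nsec = (long)((t % 10000000) * 100);
        ts.tv_sec  = t / 10000000;
    }
    return ts;
}

int main(void)
{
    struct ts t = nt_to_unix(0);  /* NT epoch, well before 1970 */
    printf("%lld sec\n", (long long)t.tv_sec);  /* prints -11644473600 sec */
    return 0;
}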
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b15862e0f68c..b334a89d6a66 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -593,11 +593,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
593 /* close and restart search */ 593 /* close and restart search */
594 cifs_dbg(FYI, "search backing up - close and restart search\n"); 594 cifs_dbg(FYI, "search backing up - close and restart search\n");
595 spin_lock(&cifs_file_list_lock); 595 spin_lock(&cifs_file_list_lock);
596 if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { 596 if (server->ops->dir_needs_close(cfile)) {
597 cfile->invalidHandle = true; 597 cfile->invalidHandle = true;
598 spin_unlock(&cifs_file_list_lock); 598 spin_unlock(&cifs_file_list_lock);
599 if (server->ops->close) 599 if (server->ops->close_dir)
600 server->ops->close(xid, tcon, &cfile->fid); 600 server->ops->close_dir(xid, tcon, &cfile->fid);
601 } else 601 } else
602 spin_unlock(&cifs_file_list_lock); 602 spin_unlock(&cifs_file_list_lock);
603 if (cfile->srch_inf.ntwrk_buf_start) { 603 if (cfile->srch_inf.ntwrk_buf_start) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 39ee32688eac..57db63ff88da 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
243 kfree(ses->serverOS); 243 kfree(ses->serverOS);
244 244
245 ses->serverOS = kzalloc(len + 1, GFP_KERNEL); 245 ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
246 if (ses->serverOS) 246 if (ses->serverOS) {
247 strncpy(ses->serverOS, bcc_ptr, len); 247 strncpy(ses->serverOS, bcc_ptr, len);
248 if (strncmp(ses->serverOS, "OS/2", 4) == 0) 248 if (strncmp(ses->serverOS, "OS/2", 4) == 0)
249 cifs_dbg(FYI, "OS/2 server\n"); 249 cifs_dbg(FYI, "OS/2 server\n");
250 }
250 251
251 bcc_ptr += len + 1; 252 bcc_ptr += len + 1;
252 bleft -= len + 1; 253 bleft -= len + 1;
@@ -744,14 +745,6 @@ out:
744 sess_free_buffer(sess_data); 745 sess_free_buffer(sess_data);
745} 746}
746 747
747#else
748
749static void
750sess_auth_lanman(struct sess_data *sess_data)
751{
752 sess_data->result = -EOPNOTSUPP;
753 sess_data->func = NULL;
754}
755#endif 748#endif
756 749
757static void 750static void
@@ -1102,15 +1095,6 @@ out:
1102 ses->auth_key.response = NULL; 1095 ses->auth_key.response = NULL;
1103} 1096}
1104 1097
1105#else
1106
1107static void
1108sess_auth_kerberos(struct sess_data *sess_data)
1109{
1110 cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
1111 sess_data->result = -ENOSYS;
1112 sess_data->func = NULL;
1113}
1114#endif /* ! CONFIG_CIFS_UPCALL */ 1098#endif /* ! CONFIG_CIFS_UPCALL */
1115 1099
1116/* 1100/*
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 5e8c22d6c7b9..52131d8cb4d5 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -586,7 +586,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
586 tmprc = CIFS_open(xid, &oparms, &oplock, NULL); 586 tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
587 if (tmprc == -EOPNOTSUPP) 587 if (tmprc == -EOPNOTSUPP)
588 *symlink = true; 588 *symlink = true;
589 else 589 else if (tmprc == 0)
590 CIFSSMBClose(xid, tcon, fid.netfid); 590 CIFSSMBClose(xid, tcon, fid.netfid);
591 } 591 }
592 592
@@ -1015,6 +1015,12 @@ cifs_wp_retry_size(struct inode *inode)
1015 return CIFS_SB(inode->i_sb)->wsize; 1015 return CIFS_SB(inode->i_sb)->wsize;
1016} 1016}
1017 1017
1018static bool
1019cifs_dir_needs_close(struct cifsFileInfo *cfile)
1020{
1021 return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
1022}
1023
1018struct smb_version_operations smb1_operations = { 1024struct smb_version_operations smb1_operations = {
1019 .send_cancel = send_nt_cancel, 1025 .send_cancel = send_nt_cancel,
1020 .compare_fids = cifs_compare_fids, 1026 .compare_fids = cifs_compare_fids,
@@ -1086,6 +1092,7 @@ struct smb_version_operations smb1_operations = {
1086 .create_mf_symlink = cifs_create_mf_symlink, 1092 .create_mf_symlink = cifs_create_mf_symlink,
1087 .is_read_op = cifs_is_read_op, 1093 .is_read_op = cifs_is_read_op,
1088 .wp_retry_size = cifs_wp_retry_size, 1094 .wp_retry_size = cifs_wp_retry_size,
1095 .dir_needs_close = cifs_dir_needs_close,
1089#ifdef CONFIG_CIFS_XATTR 1096#ifdef CONFIG_CIFS_XATTR
1090 .query_all_EAs = CIFSSMBQAllEAs, 1097 .query_all_EAs = CIFSSMBQAllEAs,
1091 .set_EA = CIFSSMBSetEA, 1098 .set_EA = CIFSSMBSetEA,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 3f17b4550831..45992944e238 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
50 goto out; 50 goto out;
51 } 51 }
52 52
53 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 53 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
54 GFP_KERNEL); 54 GFP_KERNEL);
55 if (smb2_data == NULL) { 55 if (smb2_data == NULL) {
56 rc = -ENOMEM; 56 rc = -ENOMEM;
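This MAX_NAME -> PATH_MAX bump (repeated below for smb2inode.c, smb2ops.c and smb2pdu.c) sizes the buffer for smb2_file_all_info, whose fixed header is followed by a variable-length UTF-16 name, hence the `* 2`. The sizing pattern in isolation, with a hypothetical struct standing in for the real wire format:

#include <limits.h>	/* PATH_MAX */
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical fixed header followed by a variable-length UTF-16 name. */
struct file_all_info {
	unsigned int name_len;	/* bytes of name[] actually used */
	unsigned short name[];	/* flexible array member, 2 bytes per unit */
};

int main(void)
{
	/* Worst case: PATH_MAX characters, two bytes each on the wire. */
	size_t sz = sizeof(struct file_all_info) + PATH_MAX * 2;
	struct file_all_info *info = calloc(1, sz);

	if (!info) {
		perror("calloc");
		return 1;
	}
	printf("allocated %zu bytes (%d header + %d name)\n",
	       sz, (int)sizeof(struct file_all_info), PATH_MAX * 2);
	free(info);
	return 0;
}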
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 0150182a4494..899bbc86f73e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
131 *adjust_tz = false; 131 *adjust_tz = false;
132 *symlink = false; 132 *symlink = false;
133 133
134 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 134 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
135 GFP_KERNEL); 135 GFP_KERNEL);
136 if (smb2_data == NULL) 136 if (smb2_data == NULL)
137 return -ENOMEM; 137 return -ENOMEM;
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index e31a9dfdcd39..8257a5a97cc0 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
214 {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, 214 {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
215 {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, 215 {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
216 {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, 216 {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
217 {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"}, 217 {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
218 {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, 218 {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
219 {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, 219 {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
220 {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, 220 {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
@@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
256 {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO, 256 {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
257 "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"}, 257 "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
258 {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"}, 258 {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
259 {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
260 "STATUS_REPARSE_NOT_HANDLED"},
259 {STATUS_DEVICE_REQUIRES_CLEANING, -EIO, 261 {STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
260 "STATUS_DEVICE_REQUIRES_CLEANING"}, 262 "STATUS_DEVICE_REQUIRES_CLEANING"},
261 {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"}, 263 {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
@@ -298,7 +300,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
298 {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"}, 300 {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"},
299 {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"}, 301 {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"},
300 {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"}, 302 {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"},
301 {STATUS_INVALID_DEVICE_REQUEST, -EIO, "STATUS_INVALID_DEVICE_REQUEST"}, 303 {STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"},
302 {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"}, 304 {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"},
303 {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"}, 305 {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"},
304 {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"}, 306 {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index f2e6ac29a8d6..4aa7a0f07d6e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -178,9 +178,24 @@ smb2_check_message(char *buf, unsigned int length)
178 /* Windows 7 server returns 24 bytes more */ 178 /* Windows 7 server returns 24 bytes more */
179 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) 179 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
180 return 0; 180 return 0;
181 /* server can return one byte more */ 181 /* server can return one byte more due to implied bcc[0] */
182 if (clc_len == 4 + len + 1) 182 if (clc_len == 4 + len + 1)
183 return 0; 183 return 0;
184
185 /*
186 * MacOS server pads after SMB2.1 write response with 3 bytes
187 * of junk. Other servers match RFC1001 len to actual
188 * SMB2/SMB3 frame length (header + SMB2 response-specific data).
189 * Log the server error (once), but allow it and continue
190 * since the frame is parseable.
191 */
192 if (clc_len < 4 /* RFC1001 header size */ + len) {
193 printk_once(KERN_WARNING
194 "SMB2 server sent bad RFC1001 len %d not %d\n",
195 len, clc_len - 4);
196 return 0;
197 }
198
184 return 1; 199 return 1;
185 } 200 }
186 return 0; 201 return 0;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 77f8aeb9c2fc..f522193b7184 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -389,7 +389,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
389 int rc; 389 int rc;
390 struct smb2_file_all_info *smb2_data; 390 struct smb2_file_all_info *smb2_data;
391 391
392 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 392 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
393 GFP_KERNEL); 393 GFP_KERNEL);
394 if (smb2_data == NULL) 394 if (smb2_data == NULL)
395 return -ENOMEM; 395 return -ENOMEM;
@@ -731,11 +731,72 @@ smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile,
731 return SMB2_write(xid, parms, written, iov, nr_segs); 731 return SMB2_write(xid, parms, written, iov, nr_segs);
732} 732}
733 733
734/* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */
735static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
736 struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse)
737{
738 struct cifsInodeInfo *cifsi;
739 int rc;
740
741 cifsi = CIFS_I(inode);
742
743 /* if file already sparse don't bother setting sparse again */
744 if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse)
745 return true; /* already sparse */
746
747 if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse)
748 return true; /* already not sparse */
749
750 /*
751 * Can't check for sparse support on the share the usual way, via the
752 * FS attribute info (FILE_SUPPORTS_SPARSE_FILES) on the share, since
753 * the Samba server doesn't set the flag on the share yet still
754 * supports the set sparse FSCTL and returns sparse correctly in the
755 * file attributes. If setting sparse fails, though, we mark that the
756 * server does not support sparse files for this share, to avoid
757 * repeatedly sending the unsupported fsctl to the server
758 * if the file is repeatedly extended.
759 */
760 if (tcon->broken_sparse_sup)
761 return false;
762
763 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
764 cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
765 true /* is_fctl */, &setsparse, 1, NULL, NULL);
766 if (rc) {
767 tcon->broken_sparse_sup = true;
768 cifs_dbg(FYI, "set sparse rc = %d\n", rc);
769 return false;
770 }
771
772 if (setsparse)
773 cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE;
774 else
775 cifsi->cifsAttrs &= (~FILE_ATTRIBUTE_SPARSE_FILE);
776
777 return true;
778}
779
734static int 780static int
735smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, 781smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
736 struct cifsFileInfo *cfile, __u64 size, bool set_alloc) 782 struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
737{ 783{
738 __le64 eof = cpu_to_le64(size); 784 __le64 eof = cpu_to_le64(size);
785 struct inode *inode;
786
787 /*
788 * If extending the file by more than 8K (a couple of pages), make it
789 * sparse. Many Linux filesystems make files sparse by default when extending via ftruncate.
790 */
791 inode = cfile->dentry->d_inode;
792
793 if (!set_alloc && (size > inode->i_size + 8192)) {
794 __u8 set_sparse = 1;
795
796 /* whether set sparse succeeds or not, extend the file */
797 smb2_set_sparse(xid, tcon, cfile, inode, set_sparse);
798 }
799
739 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, 800 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
740 cfile->fid.volatile_fid, cfile->pid, &eof, false); 801 cfile->fid.volatile_fid, cfile->pid, &eof, false);
741} 802}
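The new smb2_set_sparse() helper and the 8K heuristic in smb2_set_file_size() lean on the observation that local Linux filesystems leave ftruncate-extended ranges unallocated. That behavior is easy to confirm from userspace on any local filesystem; st_blocks stays near zero after a large extension:

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;
	int fd = open("sparse-demo.tmp", O_CREAT | O_RDWR | O_TRUNC, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Extend far beyond one page; most Linux fs allocate nothing. */
	if (ftruncate(fd, 64LL * 1024 * 1024) < 0) {
		perror("ftruncate");
		return 1;
	}
	if (fstat(fd, &st) == 0)
		printf("size=%lld blocks=%lld (512-byte units)\n",
		       (long long)st.st_size, (long long)st.st_blocks);
	close(fd);
	unlink("sparse-demo.tmp");
	return 0;
}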
@@ -954,6 +1015,105 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
954 return rc; 1015 return rc;
955} 1016}
956 1017
1018static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
1019 loff_t offset, loff_t len, bool keep_size)
1020{
1021 struct inode *inode;
1022 struct cifsInodeInfo *cifsi;
1023 struct cifsFileInfo *cfile = file->private_data;
1024 struct file_zero_data_information fsctl_buf;
1025 long rc;
1026 unsigned int xid;
1027
1028 xid = get_xid();
1029
1030 inode = cfile->dentry->d_inode;
1031 cifsi = CIFS_I(inode);
1032
1033 /* if the file is not oplocked, we can't be sure whether we are asked to extend its size */
1034 if (!CIFS_CACHE_READ(cifsi))
1035 if (keep_size == false)
1036 return -EOPNOTSUPP;
1037
1038 /*
1039 * Must check whether the file is sparse, since fallocate -z (zero
1040 * range) assumes non-sparse allocation.
1041 */
1042 if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE))
1043 return -EOPNOTSUPP;
1044
1045 /*
1046 * Need to make sure we are not asked to extend the file, since the SMB3
1047 * fsctl does not change the file size. In the future we could change
1048 * this to zero the first part of the range and then set the file size,
1049 * which for a non-sparse file would zero the newly extended range.
1050 */
1051 if (keep_size == false)
1052 if (i_size_read(inode) < offset + len)
1053 return -EOPNOTSUPP;
1054
1055 cifs_dbg(FYI, "offset %lld len %lld", offset, len);
1056
1057 fsctl_buf.FileOffset = cpu_to_le64(offset);
1058 fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
1059
1060 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
1061 cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
1062 true /* is_fctl */, (char *)&fsctl_buf,
1063 sizeof(struct file_zero_data_information), NULL, NULL);
1064 free_xid(xid);
1065 return rc;
1066}
1067
1068static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
1069 loff_t offset, loff_t len)
1070{
1071 struct inode *inode;
1072 struct cifsInodeInfo *cifsi;
1073 struct cifsFileInfo *cfile = file->private_data;
1074 struct file_zero_data_information fsctl_buf;
1075 long rc;
1076 unsigned int xid;
1077 __u8 set_sparse = 1;
1078
1079 xid = get_xid();
1080
1081 inode = cfile->dentry->d_inode;
1082 cifsi = CIFS_I(inode);
1083
1084 /* Need to make file sparse, if not already, before freeing range. */
1085 /* Consider adding an equivalent for compressed files, since it could also work */
1086 if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse))
1087 return -EOPNOTSUPP;
1088
1089 cifs_dbg(FYI, "offset %lld len %lld", offset, len);
1090
1091 fsctl_buf.FileOffset = cpu_to_le64(offset);
1092 fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
1093
1094 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
1095 cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
1096 true /* is_fctl */, (char *)&fsctl_buf,
1097 sizeof(struct file_zero_data_information), NULL, NULL);
1098 free_xid(xid);
1099 return rc;
1100}
1101
1102static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
1103 loff_t off, loff_t len)
1104{
1105 /* KEEP_SIZE already checked for by do_fallocate */
1106 if (mode & FALLOC_FL_PUNCH_HOLE)
1107 return smb3_punch_hole(file, tcon, off, len);
1108 else if (mode & FALLOC_FL_ZERO_RANGE) {
1109 if (mode & FALLOC_FL_KEEP_SIZE)
1110 return smb3_zero_range(file, tcon, off, len, true);
1111 return smb3_zero_range(file, tcon, off, len, false);
1112 }
1113
1114 return -EOPNOTSUPP;
1115}
1116
957static void 1117static void
958smb2_downgrade_oplock(struct TCP_Server_Info *server, 1118smb2_downgrade_oplock(struct TCP_Server_Info *server,
959 struct cifsInodeInfo *cinode, bool set_level2) 1119 struct cifsInodeInfo *cinode, bool set_level2)
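smb3_fallocate() above is the handler that ends up behind fallocate(2) on an SMB3 mount: only hole punching and zero-range are wired up, and an extending zero-range is refused unless the oplock and sparse checks pass. A minimal caller exercising both supported modes (runs against any filesystem that implements them, not just cifs):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Punch a 64K hole at offset 0; PUNCH_HOLE requires KEEP_SIZE. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 65536) < 0)
		perror("punch hole");
	/* Zero a 64K range without changing the file size. */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      65536, 65536) < 0)
		perror("zero range");
	close(fd);
	return 0;
}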
@@ -1161,6 +1321,12 @@ smb2_wp_retry_size(struct inode *inode)
1161 SMB2_MAX_BUFFER_SIZE); 1321 SMB2_MAX_BUFFER_SIZE);
1162} 1322}
1163 1323
1324static bool
1325smb2_dir_needs_close(struct cifsFileInfo *cfile)
1326{
1327 return !cfile->invalidHandle;
1328}
1329
1164struct smb_version_operations smb20_operations = { 1330struct smb_version_operations smb20_operations = {
1165 .compare_fids = smb2_compare_fids, 1331 .compare_fids = smb2_compare_fids,
1166 .setup_request = smb2_setup_request, 1332 .setup_request = smb2_setup_request,
@@ -1236,6 +1402,7 @@ struct smb_version_operations smb20_operations = {
1236 .parse_lease_buf = smb2_parse_lease_buf, 1402 .parse_lease_buf = smb2_parse_lease_buf,
1237 .clone_range = smb2_clone_range, 1403 .clone_range = smb2_clone_range,
1238 .wp_retry_size = smb2_wp_retry_size, 1404 .wp_retry_size = smb2_wp_retry_size,
1405 .dir_needs_close = smb2_dir_needs_close,
1239}; 1406};
1240 1407
1241struct smb_version_operations smb21_operations = { 1408struct smb_version_operations smb21_operations = {
@@ -1313,6 +1480,7 @@ struct smb_version_operations smb21_operations = {
1313 .parse_lease_buf = smb2_parse_lease_buf, 1480 .parse_lease_buf = smb2_parse_lease_buf,
1314 .clone_range = smb2_clone_range, 1481 .clone_range = smb2_clone_range,
1315 .wp_retry_size = smb2_wp_retry_size, 1482 .wp_retry_size = smb2_wp_retry_size,
1483 .dir_needs_close = smb2_dir_needs_close,
1316}; 1484};
1317 1485
1318struct smb_version_operations smb30_operations = { 1486struct smb_version_operations smb30_operations = {
@@ -1393,6 +1561,8 @@ struct smb_version_operations smb30_operations = {
1393 .clone_range = smb2_clone_range, 1561 .clone_range = smb2_clone_range,
1394 .validate_negotiate = smb3_validate_negotiate, 1562 .validate_negotiate = smb3_validate_negotiate,
1395 .wp_retry_size = smb2_wp_retry_size, 1563 .wp_retry_size = smb2_wp_retry_size,
1564 .dir_needs_close = smb2_dir_needs_close,
1565 .fallocate = smb3_fallocate,
1396}; 1566};
1397 1567
1398struct smb_version_values smb20_values = { 1568struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 42ebc1a8be6c..74b3a6684383 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -530,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
530 struct smb2_sess_setup_rsp *rsp = NULL; 530 struct smb2_sess_setup_rsp *rsp = NULL;
531 struct kvec iov[2]; 531 struct kvec iov[2];
532 int rc = 0; 532 int rc = 0;
533 int resp_buftype; 533 int resp_buftype = CIFS_NO_BUFFER;
534 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 534 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
535 struct TCP_Server_Info *server = ses->server; 535 struct TCP_Server_Info *server = ses->server;
536 u16 blob_length = 0; 536 u16 blob_length = 0;
@@ -907,7 +907,8 @@ tcon_exit:
907tcon_error_exit: 907tcon_error_exit:
908 if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { 908 if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
909 cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); 909 cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
910 tcon->bad_network_name = true; 910 if (tcon)
911 tcon->bad_network_name = true;
911 } 912 }
912 goto tcon_exit; 913 goto tcon_exit;
913} 914}
@@ -1224,7 +1225,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1224 1225
1225 cifs_dbg(FYI, "SMB2 IOCTL\n"); 1226 cifs_dbg(FYI, "SMB2 IOCTL\n");
1226 1227
1227 *out_data = NULL; 1228 if (out_data != NULL)
1229 *out_data = NULL;
1230
1228 /* zero out returned data len, in case of error */ 1231 /* zero out returned data len, in case of error */
1229 if (plen) 1232 if (plen)
1230 *plen = 0; 1233 *plen = 0;
@@ -1400,8 +1403,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
1400 rsp = (struct smb2_close_rsp *)iov[0].iov_base; 1403 rsp = (struct smb2_close_rsp *)iov[0].iov_base;
1401 1404
1402 if (rc != 0) { 1405 if (rc != 0) {
1403 if (tcon) 1406 cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
1404 cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
1405 goto close_exit; 1407 goto close_exit;
1406 } 1408 }
1407 1409
@@ -1530,7 +1532,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
1530{ 1532{
1531 return query_info(xid, tcon, persistent_fid, volatile_fid, 1533 return query_info(xid, tcon, persistent_fid, volatile_fid,
1532 FILE_ALL_INFORMATION, 1534 FILE_ALL_INFORMATION,
1533 sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 1535 sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
1534 sizeof(struct smb2_file_all_info), data); 1536 sizeof(struct smb2_file_all_info), data);
1535} 1537}
1536 1538
@@ -2177,6 +2179,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
2177 rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; 2179 rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
2178 2180
2179 if (rc) { 2181 if (rc) {
2182 if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
2183 srch_inf->endOfSearch = true;
2184 rc = 0;
2185 }
2180 cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); 2186 cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
2181 goto qdir_exit; 2187 goto qdir_exit;
2182 } 2188 }
@@ -2214,11 +2220,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
2214 else 2220 else
2215 cifs_dbg(VFS, "illegal search buffer type\n"); 2221 cifs_dbg(VFS, "illegal search buffer type\n");
2216 2222
2217 if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
2218 srch_inf->endOfSearch = 1;
2219 else
2220 srch_inf->endOfSearch = 0;
2221
2222 return rc; 2223 return rc;
2223 2224
2224qdir_exit: 2225qdir_exit:
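Taken together with the STATUS_NO_MORE_FILES -> -ENODATA remapping in smb2maperror.c, the two smb2pdu.c hunks above move end-of-search detection into the error path: the sentinel status now arrives as -ENODATA and is converted back into endOfSearch = true with rc = 0. The same in-band sentinel pattern, sketched standalone with a hypothetical next_entry() producer:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical producer: yields entry ids 0..4, then "no more files". */
static int next_entry(int *cursor)
{
	if (*cursor >= 5)
		return -ENODATA;	/* plays the STATUS_NO_MORE_FILES role */
	return (*cursor)++;
}

int main(void)
{
	int cursor = 0;
	bool end_of_search = false;

	while (!end_of_search) {
		int rc = next_entry(&cursor);

		if (rc == -ENODATA) {	/* clean termination, not an error */
			end_of_search = true;
		} else if (rc < 0) {
			fprintf(stderr, "enumeration failed: %d\n", rc);
			return 1;
		} else {
			printf("entry %d\n", rc);
		}
	}
	return 0;
}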
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 69f3595d3952..fbe486c285a9 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -573,6 +573,12 @@ struct copychunk_ioctl {
573 __u32 Reserved2; 573 __u32 Reserved2;
574} __packed; 574} __packed;
575 575
576/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
577struct file_zero_data_information {
578 __le64 FileOffset;
579 __le64 BeyondFinalZero;
580} __packed;
581
576struct copychunk_ioctl_rsp { 582struct copychunk_ioctl_rsp {
577 __le32 ChunksWritten; 583 __le32 ChunksWritten;
578 __le32 ChunkBytesWritten; 584 __le32 ChunkBytesWritten;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 0e538b5c9622..83efa59535be 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -63,7 +63,7 @@
63#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ 63#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
64#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ 64#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
65#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ 65#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */
66#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */ 66#define FSCTL_SET_ZERO_DATA 0x000980C8
67#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ 67#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */
68#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ 68#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */
69#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ 69#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */
diff --git a/fs/dcache.c b/fs/dcache.c
index d30ce699ae4b..cb25a1a5e307 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
106 unsigned int hash) 106 unsigned int hash)
107{ 107{
108 hash += (unsigned long) parent / L1_CACHE_BYTES; 108 hash += (unsigned long) parent / L1_CACHE_BYTES;
109 hash = hash + (hash >> d_hash_shift); 109 return dentry_hashtable + hash_32(hash, d_hash_shift);
110 return dentry_hashtable + (hash & d_hash_mask);
111} 110}
112 111
113/* Statistics gathering. */ 112/* Statistics gathering. */
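The d_hash() rewrite drops the hand-rolled fold (add the shifted high bits, then mask) in favor of hash_32(), the kernel's multiplicative hash, which keeps the best-mixed high bits of the product. A userspace approximation of what hash_32() computed in kernels of this era; treat the exact golden-ratio constant as an assumption rather than gospel:

#include <stdint.h>
#include <stdio.h>

/* Golden-ratio prime used by the kernel's 32-bit hash at the time. */
#define GOLDEN_RATIO_PRIME_32 0x9e370001U

/* Multiply, then keep the top 'bits' bits: the high bits mix best. */
static inline uint32_t hash_32(uint32_t val, unsigned int bits)
{
	return (val * GOLDEN_RATIO_PRIME_32) >> (32 - bits);
}

int main(void)
{
	unsigned int d_hash_shift = 10;	/* e.g. a 1024-bucket table */

	for (uint32_t h = 0; h < 4; h++)
		printf("hash_32(%u, %u) = %u\n", (unsigned)h, d_hash_shift,
		       (unsigned)hash_32(h, d_hash_shift));
	return 0;
}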
@@ -2373,7 +2372,8 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2373} 2372}
2374EXPORT_SYMBOL(dentry_update_name_case); 2373EXPORT_SYMBOL(dentry_update_name_case);
2375 2374
2376static void switch_names(struct dentry *dentry, struct dentry *target) 2375static void switch_names(struct dentry *dentry, struct dentry *target,
2376 bool exchange)
2377{ 2377{
2378 if (dname_external(target)) { 2378 if (dname_external(target)) {
2379 if (dname_external(dentry)) { 2379 if (dname_external(dentry)) {
@@ -2407,13 +2407,19 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
2407 */ 2407 */
2408 unsigned int i; 2408 unsigned int i;
2409 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); 2409 BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
2410 if (!exchange) {
2411 memcpy(dentry->d_iname, target->d_name.name,
2412 target->d_name.len + 1);
2413 dentry->d_name.hash_len = target->d_name.hash_len;
2414 return;
2415 }
2410 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { 2416 for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
2411 swap(((long *) &dentry->d_iname)[i], 2417 swap(((long *) &dentry->d_iname)[i],
2412 ((long *) &target->d_iname)[i]); 2418 ((long *) &target->d_iname)[i]);
2413 } 2419 }
2414 } 2420 }
2415 } 2421 }
2416 swap(dentry->d_name.len, target->d_name.len); 2422 swap(dentry->d_name.hash_len, target->d_name.hash_len);
2417} 2423}
2418 2424
2419static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) 2425static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
@@ -2443,25 +2449,29 @@ static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
2443 } 2449 }
2444} 2450}
2445 2451
2446static void dentry_unlock_parents_for_move(struct dentry *dentry, 2452static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
2447 struct dentry *target)
2448{ 2453{
2449 if (target->d_parent != dentry->d_parent) 2454 if (target->d_parent != dentry->d_parent)
2450 spin_unlock(&dentry->d_parent->d_lock); 2455 spin_unlock(&dentry->d_parent->d_lock);
2451 if (target->d_parent != target) 2456 if (target->d_parent != target)
2452 spin_unlock(&target->d_parent->d_lock); 2457 spin_unlock(&target->d_parent->d_lock);
2458 spin_unlock(&target->d_lock);
2459 spin_unlock(&dentry->d_lock);
2453} 2460}
2454 2461
2455/* 2462/*
2456 * When switching names, the actual string doesn't strictly have to 2463 * When switching names, the actual string doesn't strictly have to
2457 * be preserved in the target - because we're dropping the target 2464 * be preserved in the target - because we're dropping the target
2458 * anyway. As such, we can just do a simple memcpy() to copy over 2465 * anyway. As such, we can just do a simple memcpy() to copy over
2459 * the new name before we switch. 2466 * the new name before we switch, unless we are going to rehash
2460 * 2467 * it. Note that if we *do* unhash the target, we are not allowed
2461 * Note that we have to be a lot more careful about getting the hash 2468 * to rehash it without giving it a new name/hash key - whether
2462 * switched - we have to switch the hash value properly even if it 2469 * we swap or overwrite the names here, the resulting name won't match
2463 * then no longer matches the actual (corrupted) string of the target. 2470 * the reality in the filesystem; it's only there for d_path() purposes.
2464 * The hash value has to match the hash queue that the dentry is on.. 2471 * Note that all of this is happening under rename_lock, so
2472 * any hash lookup seeing it in the middle of manipulations will
2473 * be discarded anyway. So we do not care what happens to the hash
2474 * key in that case.
2465 */ 2475 */
2466/* 2476/*
2467 * __d_move - move a dentry 2477 * __d_move - move a dentry
@@ -2507,36 +2517,30 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
2507 d_hash(dentry->d_parent, dentry->d_name.hash)); 2517 d_hash(dentry->d_parent, dentry->d_name.hash));
2508 } 2518 }
2509 2519
2510 list_del(&dentry->d_u.d_child);
2511 list_del(&target->d_u.d_child);
2512
2513 /* Switch the names.. */ 2520 /* Switch the names.. */
2514 switch_names(dentry, target); 2521 switch_names(dentry, target, exchange);
2515 swap(dentry->d_name.hash, target->d_name.hash);
2516 2522
2517 /* ... and switch the parents */ 2523 /* ... and switch them in the tree */
2518 if (IS_ROOT(dentry)) { 2524 if (IS_ROOT(dentry)) {
2525 /* splicing a tree */
2519 dentry->d_parent = target->d_parent; 2526 dentry->d_parent = target->d_parent;
2520 target->d_parent = target; 2527 target->d_parent = target;
2521 INIT_LIST_HEAD(&target->d_u.d_child); 2528 list_del_init(&target->d_u.d_child);
2529 list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2522 } else { 2530 } else {
2531 /* swapping two dentries */
2523 swap(dentry->d_parent, target->d_parent); 2532 swap(dentry->d_parent, target->d_parent);
2524 2533 list_move(&target->d_u.d_child, &target->d_parent->d_subdirs);
2525 /* And add them back to the (new) parent lists */ 2534 list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2526 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); 2535 if (exchange)
2536 fsnotify_d_move(target);
2537 fsnotify_d_move(dentry);
2527 } 2538 }
2528 2539
2529 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2530
2531 write_seqcount_end(&target->d_seq); 2540 write_seqcount_end(&target->d_seq);
2532 write_seqcount_end(&dentry->d_seq); 2541 write_seqcount_end(&dentry->d_seq);
2533 2542
2534 dentry_unlock_parents_for_move(dentry, target); 2543 dentry_unlock_for_move(dentry, target);
2535 if (exchange)
2536 fsnotify_d_move(target);
2537 spin_unlock(&target->d_lock);
2538 fsnotify_d_move(dentry);
2539 spin_unlock(&dentry->d_lock);
2540} 2544}
2541 2545
2542/* 2546/*
@@ -2634,39 +2638,6 @@ out_err:
2634 return ret; 2638 return ret;
2635} 2639}
2636 2640
2637/*
2638 * Prepare an anonymous dentry for life in the superblock's dentry tree as a
2639 * named dentry in place of the dentry to be replaced.
2640 * returns with anon->d_lock held!
2641 */
2642static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
2643{
2644 struct dentry *dparent;
2645
2646 dentry_lock_for_move(anon, dentry);
2647
2648 write_seqcount_begin(&dentry->d_seq);
2649 write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED);
2650
2651 dparent = dentry->d_parent;
2652
2653 switch_names(dentry, anon);
2654 swap(dentry->d_name.hash, anon->d_name.hash);
2655
2656 dentry->d_parent = dentry;
2657 list_del_init(&dentry->d_u.d_child);
2658 anon->d_parent = dparent;
2659 list_move(&anon->d_u.d_child, &dparent->d_subdirs);
2660
2661 write_seqcount_end(&dentry->d_seq);
2662 write_seqcount_end(&anon->d_seq);
2663
2664 dentry_unlock_parents_for_move(anon, dentry);
2665 spin_unlock(&dentry->d_lock);
2666
2667 /* anon->d_lock still locked, returns locked */
2668}
2669
2670/** 2641/**
2671 * d_splice_alias - splice a disconnected dentry into the tree if one exists 2642 * d_splice_alias - splice a disconnected dentry into the tree if one exists
2672 * @inode: the inode which may have a disconnected dentry 2643 * @inode: the inode which may have a disconnected dentry
@@ -2712,11 +2683,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2712 return ERR_PTR(-EIO); 2683 return ERR_PTR(-EIO);
2713 } 2684 }
2714 write_seqlock(&rename_lock); 2685 write_seqlock(&rename_lock);
2715 __d_materialise_dentry(dentry, new); 2686 __d_move(new, dentry, false);
2716 write_sequnlock(&rename_lock); 2687 write_sequnlock(&rename_lock);
2717 __d_drop(new);
2718 _d_rehash(new);
2719 spin_unlock(&new->d_lock);
2720 spin_unlock(&inode->i_lock); 2688 spin_unlock(&inode->i_lock);
2721 security_d_instantiate(new, inode); 2689 security_d_instantiate(new, inode);
2722 iput(inode); 2690 iput(inode);
@@ -2776,9 +2744,8 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2776 } else if (IS_ROOT(alias)) { 2744 } else if (IS_ROOT(alias)) {
2777 /* Is this an anonymous mountpoint that we 2745 /* Is this an anonymous mountpoint that we
2778 * could splice into our tree? */ 2746 * could splice into our tree? */
2779 __d_materialise_dentry(dentry, alias); 2747 __d_move(alias, dentry, false);
2780 write_sequnlock(&rename_lock); 2748 write_sequnlock(&rename_lock);
2781 __d_drop(alias);
2782 goto found; 2749 goto found;
2783 } else { 2750 } else {
2784 /* Nope, but we must(!) avoid directory 2751 /* Nope, but we must(!) avoid directory
@@ -2804,13 +2771,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2804 actual = __d_instantiate_unique(dentry, inode); 2771 actual = __d_instantiate_unique(dentry, inode);
2805 if (!actual) 2772 if (!actual)
2806 actual = dentry; 2773 actual = dentry;
2807 else
2808 BUG_ON(!d_unhashed(actual));
2809 2774
2810 spin_lock(&actual->d_lock); 2775 d_rehash(actual);
2811found: 2776found:
2812 _d_rehash(actual);
2813 spin_unlock(&actual->d_lock);
2814 spin_unlock(&inode->i_lock); 2777 spin_unlock(&inode->i_lock);
2815out_nolock: 2778out_nolock:
2816 if (actual == dentry) { 2779 if (actual == dentry) {
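The exchange flag threaded through switch_names() and __d_move() above exists to service renameat2(2) with RENAME_EXCHANGE, where both dentries survive the operation and both names must stay valid (hence the full swap instead of the one-way memcpy, and the second fsnotify_d_move()). Driving that path from userspace, via the raw syscall since libc wrappers were not yet universal when this code landed:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE (1 << 1)	/* from linux/fs.h */
#endif

int main(int argc, char **argv)
{
	if (argc < 3) {
		fprintf(stderr, "usage: %s <path1> <path2>\n", argv[0]);
		return 1;
	}
	/* Atomically swap the two names; both must already exist.
	 * Requires a kernel (>= 3.15) and libc exposing SYS_renameat2. */
	if (syscall(SYS_renameat2, AT_FDCWD, argv[1],
		    AT_FDCWD, argv[2], RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}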
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c3116404ab49..e181b6b2e297 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
158{ 158{
159 ssize_t ret; 159 ssize_t ret;
160 160
161 ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES, 161 ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
162 &sdio->from); 162 &sdio->from);
163 163
164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { 164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index db0fad3269c0..b4b6ab9873ae 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -229,8 +229,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
229 if (rc) { 229 if (rc) {
230 printk(KERN_ERR "%s: Error attempting to initialize " 230 printk(KERN_ERR "%s: Error attempting to initialize "
231 "the lower file for the dentry with name " 231 "the lower file for the dentry with name "
232 "[%s]; rc = [%d]\n", __func__, 232 "[%pd]; rc = [%d]\n", __func__,
233 ecryptfs_dentry->d_name.name, rc); 233 ecryptfs_dentry, rc);
234 goto out_free; 234 goto out_free;
235 } 235 }
236 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) 236 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d4a9431ec73c..1686dc2da9fd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -53,9 +53,7 @@ static void unlock_dir(struct dentry *dir)
53 53
54static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) 54static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
55{ 55{
56 if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode) 56 return ecryptfs_inode_to_lower(inode) == lower_inode;
57 return 1;
58 return 0;
59} 57}
60 58
61static int ecryptfs_inode_set(struct inode *inode, void *opaque) 59static int ecryptfs_inode_set(struct inode *inode, void *opaque)
@@ -192,12 +190,6 @@ ecryptfs_do_create(struct inode *directory_inode,
192 190
193 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 191 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
194 lower_dir_dentry = lock_parent(lower_dentry); 192 lower_dir_dentry = lock_parent(lower_dentry);
195 if (IS_ERR(lower_dir_dentry)) {
196 ecryptfs_printk(KERN_ERR, "Error locking directory of "
197 "dentry\n");
198 inode = ERR_CAST(lower_dir_dentry);
199 goto out;
200 }
201 rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); 193 rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true);
202 if (rc) { 194 if (rc) {
203 printk(KERN_ERR "%s: Failure to create dentry in lower fs; " 195 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
@@ -215,7 +207,6 @@ ecryptfs_do_create(struct inode *directory_inode,
215 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); 207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
216out_lock: 208out_lock:
217 unlock_dir(lower_dir_dentry); 209 unlock_dir(lower_dir_dentry);
218out:
219 return inode; 210 return inode;
220} 211}
221 212
@@ -250,8 +241,8 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
250 if (rc) { 241 if (rc) {
251 printk(KERN_ERR "%s: Error attempting to initialize " 242 printk(KERN_ERR "%s: Error attempting to initialize "
252 "the lower file for the dentry with name " 243 "the lower file for the dentry with name "
253 "[%s]; rc = [%d]\n", __func__, 244 "[%pd]; rc = [%d]\n", __func__,
254 ecryptfs_dentry->d_name.name, rc); 245 ecryptfs_dentry, rc);
255 goto out; 246 goto out;
256 } 247 }
257 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); 248 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
@@ -313,8 +304,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
313 if (rc) { 304 if (rc) {
314 printk(KERN_ERR "%s: Error attempting to initialize " 305 printk(KERN_ERR "%s: Error attempting to initialize "
315 "the lower file for the dentry with name " 306 "the lower file for the dentry with name "
316 "[%s]; rc = [%d]\n", __func__, 307 "[%pd]; rc = [%d]\n", __func__,
317 dentry->d_name.name, rc); 308 dentry, rc);
318 return rc; 309 return rc;
319 } 310 }
320 311
@@ -418,8 +409,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
418 if (IS_ERR(lower_dentry)) { 409 if (IS_ERR(lower_dentry)) {
419 rc = PTR_ERR(lower_dentry); 410 rc = PTR_ERR(lower_dentry);
420 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 411 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
421 "[%d] on lower_dentry = [%s]\n", __func__, rc, 412 "[%d] on lower_dentry = [%pd]\n", __func__, rc,
422 ecryptfs_dentry->d_name.name); 413 ecryptfs_dentry);
423 goto out; 414 goto out;
424 } 415 }
425 if (lower_dentry->d_inode) 416 if (lower_dentry->d_inode)
@@ -1039,7 +1030,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1039 } 1030 }
1040 1031
1041 rc = vfs_setxattr(lower_dentry, name, value, size, flags); 1032 rc = vfs_setxattr(lower_dentry, name, value, size, flags);
1042 if (!rc) 1033 if (!rc && dentry->d_inode)
1043 fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); 1034 fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
1044out: 1035out:
1045 return rc; 1036 return rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 4725a07f003c..635e8e16a5b7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -26,7 +26,6 @@
26 */ 26 */
27 27
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/syscalls.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/key.h> 30#include <linux/key.h>
32#include <linux/random.h> 31#include <linux/random.h>
@@ -1846,7 +1845,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1846 "(Tag 11 not allowed by itself)\n"); 1845 "(Tag 11 not allowed by itself)\n");
1847 rc = -EIO; 1846 rc = -EIO;
1848 goto out_wipe_list; 1847 goto out_wipe_list;
1849 break;
1850 default: 1848 default:
1851 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " 1849 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
1852 "of the file header; hex value of " 1850 "of the file header; hex value of "
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index e57380e5f6bd..286f10b0363b 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -434,8 +434,7 @@ void ecryptfs_release_messaging(void)
434 mutex_lock(&ecryptfs_msg_ctx_lists_mux); 434 mutex_lock(&ecryptfs_msg_ctx_lists_mux);
435 for (i = 0; i < ecryptfs_message_buf_len; i++) { 435 for (i = 0; i < ecryptfs_message_buf_len; i++) {
436 mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); 436 mutex_lock(&ecryptfs_msg_ctx_arr[i].mux);
437 if (ecryptfs_msg_ctx_arr[i].msg) 437 kfree(ecryptfs_msg_ctx_arr[i].msg);
438 kfree(ecryptfs_msg_ctx_arr[i].msg);
439 mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); 438 mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
440 } 439 }
441 kfree(ecryptfs_msg_ctx_arr); 440 kfree(ecryptfs_msg_ctx_arr);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b10b48c2a7af..7bcfff900f05 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1852 goto error_tgt_fput; 1852 goto error_tgt_fput;
1853 1853
1854 /* Check if EPOLLWAKEUP is allowed */ 1854 /* Check if EPOLLWAKEUP is allowed */
1855 ep_take_care_of_epollwakeup(&epds); 1855 if (ep_op_has_event(op))
1856 ep_take_care_of_epollwakeup(&epds);
1856 1857
1857 /* 1858 /*
1858 * We have to check that the file structure underneath the file descriptor 1859 * We have to check that the file structure underneath the file descriptor
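The ep_op_has_event() guard added above matters because EPOLL_CTL_DEL carries no event structure, so for that op epds holds uninitialized stack and must not be inspected. The newly-guarded case is perfectly legal from userspace:

#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	struct epoll_event ev = { .events = EPOLLIN };
	int epfd = epoll_create1(0);
	int efd = eventfd(0, 0);

	if (epfd < 0 || efd < 0) {
		perror("setup");
		return 1;
	}
	ev.data.fd = efd;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev) < 0)
		perror("EPOLL_CTL_ADD");
	/* DEL takes no event: a NULL pointer here is valid per epoll_ctl(2). */
	if (epoll_ctl(epfd, EPOLL_CTL_DEL, efd, NULL) < 0)
		perror("EPOLL_CTL_DEL");
	close(efd);
	close(epfd);
	return 0;
}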
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b88edc05c230..170dc41e8bf4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); 1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head);
1068 1068
1069 err = percpu_counter_init(&sbi->s_freeblocks_counter, 1069 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1070 ext2_count_free_blocks(sb)); 1070 ext2_count_free_blocks(sb), GFP_KERNEL);
1071 if (!err) { 1071 if (!err) {
1072 err = percpu_counter_init(&sbi->s_freeinodes_counter, 1072 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1073 ext2_count_free_inodes(sb)); 1073 ext2_count_free_inodes(sb), GFP_KERNEL);
1074 } 1074 }
1075 if (!err) { 1075 if (!err) {
1076 err = percpu_counter_init(&sbi->s_dirs_counter, 1076 err = percpu_counter_init(&sbi->s_dirs_counter,
1077 ext2_count_dirs(sb)); 1077 ext2_count_dirs(sb), GFP_KERNEL);
1078 } 1078 }
1079 if (err) { 1079 if (err) {
1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory"); 1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
index e85ff15a060e..fc3cdcf24aed 100644
--- a/fs/ext3/ext3.h
+++ b/fs/ext3/ext3.h
@@ -237,6 +237,8 @@ struct ext3_new_group_data {
237#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 237#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
238#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 238#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
239 239
240/* Number of supported quota types */
241#define EXT3_MAXQUOTAS 2
240 242
241/* 243/*
242 * Mount options 244 * Mount options
@@ -248,7 +250,7 @@ struct ext3_mount_options {
248 unsigned long s_commit_interval; 250 unsigned long s_commit_interval;
249#ifdef CONFIG_QUOTA 251#ifdef CONFIG_QUOTA
250 int s_jquota_fmt; 252 int s_jquota_fmt;
251 char *s_qf_names[MAXQUOTAS]; 253 char *s_qf_names[EXT3_MAXQUOTAS];
252#endif 254#endif
253}; 255};
254 256
@@ -669,7 +671,7 @@ struct ext3_sb_info {
669 unsigned long s_commit_interval; 671 unsigned long s_commit_interval;
670 struct block_device *journal_bdev; 672 struct block_device *journal_bdev;
671#ifdef CONFIG_QUOTA 673#ifdef CONFIG_QUOTA
672 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ 674 char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */
673 int s_jquota_fmt; /* Format of quota to use */ 675 int s_jquota_fmt; /* Format of quota to use */
674#endif 676#endif
675}; 677};
@@ -1183,9 +1185,9 @@ extern const struct inode_operations ext3_fast_symlink_inode_operations;
1183#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 1185#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
1184#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 1186#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
1185#endif 1187#endif
1186#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) 1188#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
1187#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) 1189#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
1188#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) 1190#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
1189 1191
1190int 1192int
1191ext3_mark_iloc_dirty(handle_t *handle, 1193ext3_mark_iloc_dirty(handle_t *handle,
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 08cdfe5461e3..7015db0bafd1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -441,7 +441,7 @@ static void ext3_put_super (struct super_block * sb)
441 percpu_counter_destroy(&sbi->s_dirs_counter); 441 percpu_counter_destroy(&sbi->s_dirs_counter);
442 brelse(sbi->s_sbh); 442 brelse(sbi->s_sbh);
443#ifdef CONFIG_QUOTA 443#ifdef CONFIG_QUOTA
444 for (i = 0; i < MAXQUOTAS; i++) 444 for (i = 0; i < EXT3_MAXQUOTAS; i++)
445 kfree(sbi->s_qf_names[i]); 445 kfree(sbi->s_qf_names[i]);
446#endif 446#endif
447 447
@@ -1555,7 +1555,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1555 /* Needed for iput() to work correctly and not trash data */ 1555 /* Needed for iput() to work correctly and not trash data */
1556 sb->s_flags |= MS_ACTIVE; 1556 sb->s_flags |= MS_ACTIVE;
1557 /* Turn on quotas so that they are updated correctly */ 1557 /* Turn on quotas so that they are updated correctly */
1558 for (i = 0; i < MAXQUOTAS; i++) { 1558 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1559 if (EXT3_SB(sb)->s_qf_names[i]) { 1559 if (EXT3_SB(sb)->s_qf_names[i]) {
1560 int ret = ext3_quota_on_mount(sb, i); 1560 int ret = ext3_quota_on_mount(sb, i);
1561 if (ret < 0) 1561 if (ret < 0)
@@ -1606,7 +1606,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1606 PLURAL(nr_truncates)); 1606 PLURAL(nr_truncates));
1607#ifdef CONFIG_QUOTA 1607#ifdef CONFIG_QUOTA
1608 /* Turn quotas off */ 1608 /* Turn quotas off */
1609 for (i = 0; i < MAXQUOTAS; i++) { 1609 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1610 if (sb_dqopt(sb)->files[i]) 1610 if (sb_dqopt(sb)->files[i])
1611 dquot_quota_off(sb, i); 1611 dquot_quota_off(sb, i);
1612 } 1612 }
@@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2039 goto failed_mount2; 2039 goto failed_mount2;
2040 } 2040 }
2041 err = percpu_counter_init(&sbi->s_freeblocks_counter, 2041 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2042 ext3_count_free_blocks(sb)); 2042 ext3_count_free_blocks(sb), GFP_KERNEL);
2043 if (!err) { 2043 if (!err) {
2044 err = percpu_counter_init(&sbi->s_freeinodes_counter, 2044 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2045 ext3_count_free_inodes(sb)); 2045 ext3_count_free_inodes(sb), GFP_KERNEL);
2046 } 2046 }
2047 if (!err) { 2047 if (!err) {
2048 err = percpu_counter_init(&sbi->s_dirs_counter, 2048 err = percpu_counter_init(&sbi->s_dirs_counter,
2049 ext3_count_dirs(sb)); 2049 ext3_count_dirs(sb), GFP_KERNEL);
2050 } 2050 }
2051 if (err) { 2051 if (err) {
2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory"); 2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
@@ -2139,7 +2139,7 @@ failed_mount2:
2139 kfree(sbi->s_group_desc); 2139 kfree(sbi->s_group_desc);
2140failed_mount: 2140failed_mount:
2141#ifdef CONFIG_QUOTA 2141#ifdef CONFIG_QUOTA
2142 for (i = 0; i < MAXQUOTAS; i++) 2142 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2143 kfree(sbi->s_qf_names[i]); 2143 kfree(sbi->s_qf_names[i]);
2144#endif 2144#endif
2145 ext3_blkdev_remove(sbi); 2145 ext3_blkdev_remove(sbi);
@@ -2659,7 +2659,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2659 old_opts.s_commit_interval = sbi->s_commit_interval; 2659 old_opts.s_commit_interval = sbi->s_commit_interval;
2660#ifdef CONFIG_QUOTA 2660#ifdef CONFIG_QUOTA
2661 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 2661 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2662 for (i = 0; i < MAXQUOTAS; i++) 2662 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2663 if (sbi->s_qf_names[i]) { 2663 if (sbi->s_qf_names[i]) {
2664 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], 2664 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
2665 GFP_KERNEL); 2665 GFP_KERNEL);
@@ -2763,7 +2763,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2763 } 2763 }
2764#ifdef CONFIG_QUOTA 2764#ifdef CONFIG_QUOTA
2765 /* Release old quota file names */ 2765 /* Release old quota file names */
2766 for (i = 0; i < MAXQUOTAS; i++) 2766 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2767 kfree(old_opts.s_qf_names[i]); 2767 kfree(old_opts.s_qf_names[i]);
2768#endif 2768#endif
2769 if (enable_quota) 2769 if (enable_quota)
@@ -2777,7 +2777,7 @@ restore_opts:
2777 sbi->s_commit_interval = old_opts.s_commit_interval; 2777 sbi->s_commit_interval = old_opts.s_commit_interval;
2778#ifdef CONFIG_QUOTA 2778#ifdef CONFIG_QUOTA
2779 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 2779 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2780 for (i = 0; i < MAXQUOTAS; i++) { 2780 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
2781 kfree(sbi->s_qf_names[i]); 2781 kfree(sbi->s_qf_names[i]);
2782 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 2782 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2783 } 2783 }
@@ -2828,8 +2828,9 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2828 */ 2828 */
2829 overhead += ngroups * (2 + sbi->s_itb_per_group); 2829 overhead += ngroups * (2 + sbi->s_itb_per_group);
2830 2830
2831 /* Add the journal blocks as well */ 2831 /* Add the internal journal blocks as well */
2832 overhead += sbi->s_journal->j_maxlen; 2832 if (sbi->s_journal && !sbi->journal_bdev)
2833 overhead += sbi->s_journal->j_maxlen;
2833 2834
2834 sbi->s_overhead_last = overhead; 2835 sbi->s_overhead_last = overhead;
2835 smp_wmb(); 2836 smp_wmb();
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5b19760b1de5..b0c225cdb52c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1825,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
1825/* 1825/*
1826 * Special error return code only used by dx_probe() and its callers. 1826 * Special error return code only used by dx_probe() and its callers.
1827 */ 1827 */
1828#define ERR_BAD_DX_DIR -75000 1828#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
1829 1829
1830/* 1830/*
1831 * Timeout and state flag for lazy initialization inode thread. 1831 * Timeout and state flag for lazy initialization inode thread.
@@ -2454,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
2454 up_write(&EXT4_I(inode)->i_data_sem); 2454 up_write(&EXT4_I(inode)->i_data_sem);
2455} 2455}
2456 2456
2457/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
2458static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
2459{
2460 int changed = 0;
2461
2462 if (newsize > inode->i_size) {
2463 i_size_write(inode, newsize);
2464 changed = 1;
2465 }
2466 if (newsize > EXT4_I(inode)->i_disksize) {
2467 ext4_update_i_disksize(inode, newsize);
2468 changed |= 2;
2469 }
2470 return changed;
2471}
2472
2457struct ext4_group_info { 2473struct ext4_group_info {
2458 unsigned long bb_state; 2474 unsigned long bb_state;
2459 struct rb_root bb_free_root; 2475 struct rb_root bb_free_root;
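The new ext4_update_inode_size() helper folds the two size watermarks (in-memory i_size and on-disk i_disksize) into one i_mutex-protected update and reports which of them moved through a two-bit return value; callers test the bits, as the `& 0x1` in the extents code below does. The return-flag convention in isolation, with a toy inode:

#include <stdio.h>

/* Toy stand-in for the two ext4 size watermarks. */
struct toy_inode {
	long long i_size;	/* visible size */
	long long i_disksize;	/* size guaranteed on disk */
};

/* Bit 0: i_size grew; bit 1: i_disksize grew. Mirrors the helper above. */
static int update_inode_size(struct toy_inode *inode, long long newsize)
{
	int changed = 0;

	if (newsize > inode->i_size) {
		inode->i_size = newsize;
		changed = 1;
	}
	if (newsize > inode->i_disksize) {
		inode->i_disksize = newsize;
		changed |= 2;
	}
	return changed;
}

int main(void)
{
	struct toy_inode ino = { .i_size = 4096, .i_disksize = 0 };

	/* Delalloc-style case: the on-disk size lags the visible size. */
	printf("changed=%d\n", update_inode_size(&ino, 2048)); /* 2 */
	printf("changed=%d\n", update_inode_size(&ino, 8192)); /* 3 */
	printf("changed=%d\n", update_inode_size(&ino, 8192)); /* 0 */
	return 0;
}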
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 76c2df382b7d..74292a71b384 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4665,7 +4665,8 @@ retry:
4665} 4665}
4666 4666
4667static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, 4667static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4668 ext4_lblk_t len, int flags, int mode) 4668 ext4_lblk_t len, loff_t new_size,
4669 int flags, int mode)
4669{ 4670{
4670 struct inode *inode = file_inode(file); 4671 struct inode *inode = file_inode(file);
4671 handle_t *handle; 4672 handle_t *handle;
@@ -4674,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4674 int retries = 0; 4675 int retries = 0;
4675 struct ext4_map_blocks map; 4676 struct ext4_map_blocks map;
4676 unsigned int credits; 4677 unsigned int credits;
4678 loff_t epos;
4677 4679
4678 map.m_lblk = offset; 4680 map.m_lblk = offset;
4681 map.m_len = len;
4679 /* 4682 /*
4680 * Don't normalize the request if it can fit in one extent so 4683 * Don't normalize the request if it can fit in one extent so
4681 * that it doesn't get unnecessarily split into multiple 4684 * that it doesn't get unnecessarily split into multiple
@@ -4690,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4690 credits = ext4_chunk_trans_blocks(inode, len); 4693 credits = ext4_chunk_trans_blocks(inode, len);
4691 4694
4692retry: 4695retry:
4693 while (ret >= 0 && ret < len) { 4696 while (ret >= 0 && len) {
4694 map.m_lblk = map.m_lblk + ret;
4695 map.m_len = len = len - ret;
4696 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4697 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4697 credits); 4698 credits);
4698 if (IS_ERR(handle)) { 4699 if (IS_ERR(handle)) {
@@ -4709,6 +4710,21 @@ retry:
4709 ret2 = ext4_journal_stop(handle); 4710 ret2 = ext4_journal_stop(handle);
4710 break; 4711 break;
4711 } 4712 }
4713 map.m_lblk += ret;
4714 map.m_len = len = len - ret;
4715 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4716 inode->i_ctime = ext4_current_time(inode);
4717 if (new_size) {
4718 if (epos > new_size)
4719 epos = new_size;
4720 if (ext4_update_inode_size(inode, epos) & 0x1)
4721 inode->i_mtime = inode->i_ctime;
4722 } else {
4723 if (epos > inode->i_size)
4724 ext4_set_inode_flag(inode,
4725 EXT4_INODE_EOFBLOCKS);
4726 }
4727 ext4_mark_inode_dirty(handle, inode);
4712 ret2 = ext4_journal_stop(handle); 4728 ret2 = ext4_journal_stop(handle);
4713 if (ret2) 4729 if (ret2)
4714 break; 4730 break;
@@ -4731,7 +4747,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4731 loff_t new_size = 0; 4747 loff_t new_size = 0;
4732 int ret = 0; 4748 int ret = 0;
4733 int flags; 4749 int flags;
4734 int partial; 4750 int credits;
4751 int partial_begin, partial_end;
4735 loff_t start, end; 4752 loff_t start, end;
4736 ext4_lblk_t lblk; 4753 ext4_lblk_t lblk;
4737 struct address_space *mapping = inode->i_mapping; 4754 struct address_space *mapping = inode->i_mapping;
@@ -4771,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4771 4788
4772 if (start < offset || end > offset + len) 4789 if (start < offset || end > offset + len)
4773 return -EINVAL; 4790 return -EINVAL;
4774 partial = (offset + len) & ((1 << blkbits) - 1); 4791 partial_begin = offset & ((1 << blkbits) - 1);
4792 partial_end = (offset + len) & ((1 << blkbits) - 1);
4775 4793
4776 lblk = start >> blkbits; 4794 lblk = start >> blkbits;
4777 max_blocks = (end >> blkbits); 4795 max_blocks = (end >> blkbits);
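partial_begin and partial_end are simply the sub-block remainders of the two range endpoints; a nonzero value means that edge cannot be handled by unwritten-extent conversion and needs explicit zeroing. A sketch of the arithmetic, assuming the round_up/round_down of start and end that the function performs just above the quoted hunk:

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* 4K blocks */
	long long offset = 5000, len = 20000;	/* example range */
	long long mask = (1LL << blkbits) - 1;

	long long partial_begin = offset & mask;
	long long partial_end = (offset + len) & mask;
	/* Round the interior inward to whole blocks. */
	long long start = (offset + mask) & ~mask;	/* round up */
	long long end = (offset + len) & ~mask;		/* round down */

	printf("partial_begin=%lld partial_end=%lld\n",
	       partial_begin, partial_end);
	printf("block-aligned interior: [%lld, %lld)\n", start, end);
	return 0;
}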
@@ -4805,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4805 * If we have a partial block after EOF we have to allocate 4823 * If we have a partial block after EOF we have to allocate
4806 * the entire block. 4824 * the entire block.
4807 */ 4825 */
4808 if (partial) 4826 if (partial_end)
4809 max_blocks += 1; 4827 max_blocks += 1;
4810 } 4828 }
4811 4829
@@ -4813,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4813 4831
4814 /* Now release the pages and zero block aligned part of pages*/ 4832 /* Now release the pages and zero block aligned part of pages*/
4815 truncate_pagecache_range(inode, start, end - 1); 4833 truncate_pagecache_range(inode, start, end - 1);
4834 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4816 4835
4817 /* Wait all existing dio workers, newcomers will block on i_mutex */ 4836 /* Wait all existing dio workers, newcomers will block on i_mutex */
4818 ext4_inode_block_unlocked_dio(inode); 4837 ext4_inode_block_unlocked_dio(inode);
@@ -4825,13 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4825 if (ret) 4844 if (ret)
4826 goto out_dio; 4845 goto out_dio;
4827 4846
4828 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, 4847 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4829 mode); 4848 flags, mode);
4830 if (ret) 4849 if (ret)
4831 goto out_dio; 4850 goto out_dio;
4832 } 4851 }
4852 if (!partial_begin && !partial_end)
4853 goto out_dio;
4833 4854
4834 handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); 4855 /*
4856 * In the worst case we have to write out two nonadjacent unwritten
4857 * blocks and update the inode.
4858 */
4859 credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4860 if (ext4_should_journal_data(inode))
4861 credits += 2;
4862 handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4835 if (IS_ERR(handle)) { 4863 if (IS_ERR(handle)) {
4836 ret = PTR_ERR(handle); 4864 ret = PTR_ERR(handle);
4837 ext4_std_error(inode->i_sb, ret); 4865 ext4_std_error(inode->i_sb, ret);
@@ -4839,12 +4867,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4839 } 4867 }
4840 4868
4841 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4869 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4842
4843 if (new_size) { 4870 if (new_size) {
4844 if (new_size > i_size_read(inode)) 4871 ext4_update_inode_size(inode, new_size);
4845 i_size_write(inode, new_size);
4846 if (new_size > EXT4_I(inode)->i_disksize)
4847 ext4_update_i_disksize(inode, new_size);
4848 } else { 4872 } else {
4849 /* 4873 /*
4850 * Mark that we allocate beyond EOF so the subsequent truncate 4874 * Mark that we allocate beyond EOF so the subsequent truncate
@@ -4853,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4853 if ((offset + len) > i_size_read(inode)) 4877 if ((offset + len) > i_size_read(inode))
4854 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4878 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4855 } 4879 }
4856
4857 ext4_mark_inode_dirty(handle, inode); 4880 ext4_mark_inode_dirty(handle, inode);
4858 4881
4859 /* Zero out partial block at the edges of the range */ 4882 /* Zero out partial block at the edges of the range */
@@ -4880,13 +4903,11 @@ out_mutex:
4880long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 4903long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4881{ 4904{
4882 struct inode *inode = file_inode(file); 4905 struct inode *inode = file_inode(file);
4883 handle_t *handle;
4884 loff_t new_size = 0; 4906 loff_t new_size = 0;
4885 unsigned int max_blocks; 4907 unsigned int max_blocks;
4886 int ret = 0; 4908 int ret = 0;
4887 int flags; 4909 int flags;
4888 ext4_lblk_t lblk; 4910 ext4_lblk_t lblk;
4889 struct timespec tv;
4890 unsigned int blkbits = inode->i_blkbits; 4911 unsigned int blkbits = inode->i_blkbits;
4891 4912
4892 /* Return error if mode is not supported */ 4913 /* Return error if mode is not supported */
@@ -4937,36 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4937 goto out; 4958 goto out;
4938 } 4959 }
4939 4960
4940 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); 4961 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4962 flags, mode);
4941 if (ret) 4963 if (ret)
4942 goto out; 4964 goto out;
4943 4965
4944 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4966 if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4945 if (IS_ERR(handle)) 4967 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4946 goto out; 4968 EXT4_I(inode)->i_sync_tid);
4947
4948 tv = inode->i_ctime = ext4_current_time(inode);
4949
4950 if (new_size) {
4951 if (new_size > i_size_read(inode)) {
4952 i_size_write(inode, new_size);
4953 inode->i_mtime = tv;
4954 }
4955 if (new_size > EXT4_I(inode)->i_disksize)
4956 ext4_update_i_disksize(inode, new_size);
4957 } else {
4958 /*
4959 * Mark that we allocate beyond EOF so the subsequent truncate
4960 * can proceed even if the new size is the same as i_size.
4961 */
4962 if ((offset + len) > i_size_read(inode))
4963 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4964 } 4969 }
4965 ext4_mark_inode_dirty(handle, inode);
4966 if (file->f_flags & O_SYNC)
4967 ext4_handle_sync(handle);
4968
4969 ext4_journal_stop(handle);
4970out: 4970out:
4971 mutex_unlock(&inode->i_mutex); 4971 mutex_unlock(&inode->i_mutex);
4972 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4972 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
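The rewritten ext4_fallocate() no longer opens a journal handle of its own: the size and timestamp updates moved into ext4_alloc_file_blocks(), and the O_SYNC guarantee is now provided by waiting on the transaction recorded in i_sync_tid. A minimal sketch of that pattern follows; the wrapper name is hypothetical, only the jbd2_complete_transaction() call is taken from the hunk above.

/* Hypothetical helper showing the O_SYNC wait used above. Assumes
 * i_sync_tid was recorded by the transaction that last touched the
 * inode (done inside ext4_alloc_file_blocks() in this series). */
static int fallocate_wait_for_commit(struct file *file)
{
	struct inode *inode = file_inode(file);
	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

	if (!(file->f_flags & O_SYNC) || !journal)
		return 0;
	/* blocks until that tid reaches disk, starting the commit if needed */
	return jbd2_complete_transaction(journal, EXT4_I(inode)->i_sync_tid);
}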
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 367a60c07cf0..3aa26e9117c4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1055,27 +1055,11 @@ static int ext4_write_end(struct file *file,
1055 } else 1055 } else
1056 copied = block_write_end(file, mapping, pos, 1056 copied = block_write_end(file, mapping, pos,
1057 len, copied, page, fsdata); 1057 len, copied, page, fsdata);
1058
1059 /* 1058 /*
1060 * No need to use i_size_read() here, the i_size 1059 * it's important to update i_size while still holding page lock:
1061 * cannot change under us because we hole i_mutex.
1062 *
1063 * But it's important to update i_size while still holding page lock:
1064 * page writeout could otherwise come in and zero beyond i_size. 1060 * page writeout could otherwise come in and zero beyond i_size.
1065 */ 1061 */
1066 if (pos + copied > inode->i_size) { 1062 i_size_changed = ext4_update_inode_size(inode, pos + copied);
1067 i_size_write(inode, pos + copied);
1068 i_size_changed = 1;
1069 }
1070
1071 if (pos + copied > EXT4_I(inode)->i_disksize) {
1072 /* We need to mark inode dirty even if
1073 * new_i_size is less that inode->i_size
1074 * but greater than i_disksize. (hint delalloc)
1075 */
1076 ext4_update_i_disksize(inode, (pos + copied));
1077 i_size_changed = 1;
1078 }
1079 unlock_page(page); 1063 unlock_page(page);
1080 page_cache_release(page); 1064 page_cache_release(page);
1081 1065
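Both write_end paths now funnel through ext4_update_inode_size(), whose definition is not part of these hunks. Reconstructed from the call sites (the return value feeds i_size_changed above and size_changed below), a plausible shape is:

/* Reconstruction from the call sites, not the verbatim ext4 helper:
 * grow the in-core i_size and the on-disk i_disksize together, and tell
 * the caller whether anything changed so it can mark the inode dirty. */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
	int changed = 0;

	if (newsize > inode->i_size) {
		i_size_write(inode, newsize);
		changed = 1;
	}
	if (newsize > EXT4_I(inode)->i_disksize) {
		ext4_update_i_disksize(inode, newsize);
		changed |= 2;
	}
	return changed;
}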
@@ -1123,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file,
1123 int ret = 0, ret2; 1107 int ret = 0, ret2;
1124 int partial = 0; 1108 int partial = 0;
1125 unsigned from, to; 1109 unsigned from, to;
1126 loff_t new_i_size; 1110 int size_changed = 0;
1127 1111
1128 trace_ext4_journalled_write_end(inode, pos, len, copied); 1112 trace_ext4_journalled_write_end(inode, pos, len, copied);
1129 from = pos & (PAGE_CACHE_SIZE - 1); 1113 from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1146,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file,
1146 if (!partial) 1130 if (!partial)
1147 SetPageUptodate(page); 1131 SetPageUptodate(page);
1148 } 1132 }
1149 new_i_size = pos + copied; 1133 size_changed = ext4_update_inode_size(inode, pos + copied);
1150 if (new_i_size > inode->i_size)
1151 i_size_write(inode, pos+copied);
1152 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1134 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1153 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1135 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1154 if (new_i_size > EXT4_I(inode)->i_disksize) { 1136 unlock_page(page);
1155 ext4_update_i_disksize(inode, new_i_size); 1137 page_cache_release(page);
1138
1139 if (size_changed) {
1156 ret2 = ext4_mark_inode_dirty(handle, inode); 1140 ret2 = ext4_mark_inode_dirty(handle, inode);
1157 if (!ret) 1141 if (!ret)
1158 ret = ret2; 1142 ret = ret2;
1159 } 1143 }
1160 1144
1161 unlock_page(page);
1162 page_cache_release(page);
1163 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1145 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1164 /* if we have allocated more blocks and copied 1146 /* if we have allocated more blocks and copied
1165 * less. We will have blocks allocated outside 1147 * less. We will have blocks allocated outside
@@ -2095,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2095 struct ext4_map_blocks *map = &mpd->map; 2077 struct ext4_map_blocks *map = &mpd->map;
2096 int err; 2078 int err;
2097 loff_t disksize; 2079 loff_t disksize;
2080 int progress = 0;
2098 2081
2099 mpd->io_submit.io_end->offset = 2082 mpd->io_submit.io_end->offset =
2100 ((loff_t)map->m_lblk) << inode->i_blkbits; 2083 ((loff_t)map->m_lblk) << inode->i_blkbits;
@@ -2111,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2111 * is non-zero, a commit should free up blocks. 2094 * is non-zero, a commit should free up blocks.
2112 */ 2095 */
2113 if ((err == -ENOMEM) || 2096 if ((err == -ENOMEM) ||
2114 (err == -ENOSPC && ext4_count_free_clusters(sb))) 2097 (err == -ENOSPC && ext4_count_free_clusters(sb))) {
2098 if (progress)
2099 goto update_disksize;
2115 return err; 2100 return err;
2101 }
2116 ext4_msg(sb, KERN_CRIT, 2102 ext4_msg(sb, KERN_CRIT,
2117 "Delayed block allocation failed for " 2103 "Delayed block allocation failed for "
2118 "inode %lu at logical offset %llu with" 2104 "inode %lu at logical offset %llu with"
@@ -2129,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2129 *give_up_on_write = true; 2115 *give_up_on_write = true;
2130 return err; 2116 return err;
2131 } 2117 }
2118 progress = 1;
2132 /* 2119 /*
2133 * Update buffer state, submit mapped pages, and get us new 2120 * Update buffer state, submit mapped pages, and get us new
2134 * extent to map 2121 * extent to map
2135 */ 2122 */
2136 err = mpage_map_and_submit_buffers(mpd); 2123 err = mpage_map_and_submit_buffers(mpd);
2137 if (err < 0) 2124 if (err < 0)
2138 return err; 2125 goto update_disksize;
2139 } while (map->m_len); 2126 } while (map->m_len);
2140 2127
2128update_disksize:
2141 /* 2129 /*
2142 * Update on-disk size after IO is submitted. Races with 2130 * Update on-disk size after IO is submitted. Races with
2143 * truncate are avoided by checking i_size under i_data_sem. 2131 * truncate are avoided by checking i_size under i_data_sem.
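The mpage_map_and_submit_extent() change introduces a progress flag so that, once at least one extent has been mapped and its pages submitted, a transient failure jumps to the i_disksize update instead of returning immediately; the on-disk size then covers IO that was already issued. The control flow in minimal form, with every name below an illustrative stand-in for the ext4 internals:

struct mapping_ctx;
int map_next_extent(struct mapping_ctx *c);       /* like mpage_map_one_extent() */
int submit_mapped_buffers(struct mapping_ctx *c); /* like mpage_map_and_submit_buffers() */
int work_remaining(struct mapping_ctx *c);
void publish_new_disksize(struct mapping_ctx *c);

static int map_and_submit(struct mapping_ctx *c)
{
	int progress = 0;
	int err;

	do {
		err = map_next_extent(c);
		if (err) {
			/* transient allocation failure: keep what we did */
			if (progress && (err == -ENOMEM || err == -ENOSPC))
				goto update_disksize;
			return err;
		}
		progress = 1;	/* at least one extent was mapped */
		err = submit_mapped_buffers(c);
		if (err < 0)
			goto update_disksize;
	} while (work_remaining(c));
update_disksize:
	publish_new_disksize(c);	/* i_disksize covers submitted IO */
	return err;
}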
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 956027711faf..8b0f9ef517d6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1412 int last = first + count - 1; 1412 int last = first + count - 1;
1413 struct super_block *sb = e4b->bd_sb; 1413 struct super_block *sb = e4b->bd_sb;
1414 1414
1415 if (WARN_ON(count == 0))
1416 return;
1415 BUG_ON(last >= (sb->s_blocksize << 3)); 1417 BUG_ON(last >= (sb->s_blocksize << 3));
1416 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1418 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1417 /* Don't bother if the block group is corrupt. */ 1419 /* Don't bother if the block group is corrupt. */
@@ -3221,6 +3223,8 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3221 int err; 3223 int err;
3222 3224
3223 if (pa == NULL) { 3225 if (pa == NULL) {
3226 if (ac->ac_f_ex.fe_len == 0)
3227 return;
3224 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); 3228 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3225 if (err) { 3229 if (err) {
3226 /* 3230 /*
@@ -3235,6 +3239,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3235 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, 3239 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3236 ac->ac_f_ex.fe_len); 3240 ac->ac_f_ex.fe_len);
3237 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); 3241 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3242 ext4_mb_unload_buddy(&e4b);
3238 return; 3243 return;
3239 } 3244 }
3240 if (pa->pa_type == MB_INODE_PA) 3245 if (pa->pa_type == MB_INODE_PA)
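The second mballoc hunk plugs a buddy reference leak: ext4_discard_allocated_blocks() took the early return inside the locked region without releasing the buddy it had loaded. Reduced to its essentials, using only what the hunk shows:

/* Every successful ext4_mb_load_buddy() must be paired with
 * ext4_mb_unload_buddy(), including on the early return taken right
 * after freeing the blocks under the group lock. */
err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
if (err)
	return;
ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
	       ac->ac_f_ex.fe_len);
ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
ext4_mb_unload_buddy(&e4b);	/* the call this patch adds */
return;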
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index b147a67baa0d..603e4ebbd0ac 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1227,7 +1227,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1227 buffer */ 1227 buffer */
1228 int num = 0; 1228 int num = 0;
1229 ext4_lblk_t nblocks; 1229 ext4_lblk_t nblocks;
1230 int i, err; 1230 int i, err = 0;
1231 int namelen; 1231 int namelen;
1232 1232
1233 *res_dir = NULL; 1233 *res_dir = NULL;
@@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1264 * return. Otherwise, fall back to doing a search the 1264 * return. Otherwise, fall back to doing a search the
1265 * old fashioned way. 1265 * old fashioned way.
1266 */ 1266 */
1267 if (bh || (err != ERR_BAD_DX_DIR)) 1267 if (err == -ENOENT)
1268 return NULL;
1269 if (err && err != ERR_BAD_DX_DIR)
1270 return ERR_PTR(err);
1271 if (bh)
1268 return bh; 1272 return bh;
1269 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1273 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1270 "falling back\n")); 1274 "falling back\n"));
@@ -1295,6 +1299,11 @@ restart:
1295 } 1299 }
1296 num++; 1300 num++;
1297 bh = ext4_getblk(NULL, dir, b++, 0, &err); 1301 bh = ext4_getblk(NULL, dir, b++, 0, &err);
1302 if (unlikely(err)) {
1303 if (ra_max == 0)
1304 return ERR_PTR(err);
1305 break;
1306 }
1298 bh_use[ra_max] = bh; 1307 bh_use[ra_max] = bh;
1299 if (bh) 1308 if (bh)
1300 ll_rw_block(READ | REQ_META | REQ_PRIO, 1309 ll_rw_block(READ | REQ_META | REQ_PRIO,
@@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1417 return ERR_PTR(-ENAMETOOLONG); 1426 return ERR_PTR(-ENAMETOOLONG);
1418 1427
1419 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 1428 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
1429 if (IS_ERR(bh))
1430 return (struct dentry *) bh;
1420 inode = NULL; 1431 inode = NULL;
1421 if (bh) { 1432 if (bh) {
1422 __u32 ino = le32_to_cpu(de->inode); 1433 __u32 ino = le32_to_cpu(de->inode);
@@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
1450 struct buffer_head *bh; 1461 struct buffer_head *bh;
1451 1462
1452 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); 1463 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
1464 if (IS_ERR(bh))
1465 return (struct dentry *) bh;
1453 if (!bh) 1466 if (!bh)
1454 return ERR_PTR(-ENOENT); 1467 return ERR_PTR(-ENOENT);
1455 ino = le32_to_cpu(de->inode); 1468 ino = le32_to_cpu(de->inode);
@@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2727 2740
2728 retval = -ENOENT; 2741 retval = -ENOENT;
2729 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2742 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2743 if (IS_ERR(bh))
2744 return PTR_ERR(bh);
2730 if (!bh) 2745 if (!bh)
2731 goto end_rmdir; 2746 goto end_rmdir;
2732 2747
@@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2794 2809
2795 retval = -ENOENT; 2810 retval = -ENOENT;
2796 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2811 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2812 if (IS_ERR(bh))
2813 return PTR_ERR(bh);
2797 if (!bh) 2814 if (!bh)
2798 goto end_unlink; 2815 goto end_unlink;
2799 2816
@@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
3121 struct ext4_dir_entry_2 *de; 3138 struct ext4_dir_entry_2 *de;
3122 3139
3123 bh = ext4_find_entry(dir, d_name, &de, NULL); 3140 bh = ext4_find_entry(dir, d_name, &de, NULL);
3141 if (IS_ERR(bh))
3142 return PTR_ERR(bh);
3124 if (bh) { 3143 if (bh) {
3125 retval = ext4_delete_entry(handle, dir, de, bh); 3144 retval = ext4_delete_entry(handle, dir, de, bh);
3126 brelse(bh); 3145 brelse(bh);
@@ -3128,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
3128 return retval; 3147 return retval;
3129} 3148}
3130 3149
3131static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) 3150static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
3151 int force_reread)
3132{ 3152{
3133 int retval; 3153 int retval;
3134 /* 3154 /*
@@ -3140,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
3140 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || 3160 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
3141 ent->de->name_len != ent->dentry->d_name.len || 3161 ent->de->name_len != ent->dentry->d_name.len ||
3142 strncmp(ent->de->name, ent->dentry->d_name.name, 3162 strncmp(ent->de->name, ent->dentry->d_name.name,
3143 ent->de->name_len)) { 3163 ent->de->name_len) ||
3164 force_reread) {
3144 retval = ext4_find_delete_entry(handle, ent->dir, 3165 retval = ext4_find_delete_entry(handle, ent->dir,
3145 &ent->dentry->d_name); 3166 &ent->dentry->d_name);
3146 } else { 3167 } else {
@@ -3191,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3191 .dentry = new_dentry, 3212 .dentry = new_dentry,
3192 .inode = new_dentry->d_inode, 3213 .inode = new_dentry->d_inode,
3193 }; 3214 };
3215 int force_reread;
3194 int retval; 3216 int retval;
3195 3217
3196 dquot_initialize(old.dir); 3218 dquot_initialize(old.dir);
@@ -3202,6 +3224,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3202 dquot_initialize(new.inode); 3224 dquot_initialize(new.inode);
3203 3225
3204 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); 3226 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
3227 if (IS_ERR(old.bh))
3228 return PTR_ERR(old.bh);
3205 /* 3229 /*
3206 * Check for inode number is _not_ due to possible IO errors. 3230 * Check for inode number is _not_ due to possible IO errors.
3207 * We might rmdir the source, keep it as pwd of some process 3231 * We might rmdir the source, keep it as pwd of some process
@@ -3214,6 +3238,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3214 3238
3215 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3239 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3216 &new.de, &new.inlined); 3240 &new.de, &new.inlined);
3241 if (IS_ERR(new.bh)) {
3242 retval = PTR_ERR(new.bh);
3243 new.bh = NULL;
3244 goto end_rename;
3245 }
3217 if (new.bh) { 3246 if (new.bh) {
3218 if (!new.inode) { 3247 if (!new.inode) {
3219 brelse(new.bh); 3248 brelse(new.bh);
@@ -3246,6 +3275,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3246 if (retval) 3275 if (retval)
3247 goto end_rename; 3276 goto end_rename;
3248 } 3277 }
3278 /*
3279 * If we're renaming a file within an inline_data dir and adding or
3280 * setting the new dirent causes a conversion from inline_data to
3281 * extents/blockmap, we need to force the dirent delete code to
3282 * re-read the directory, or else we end up trying to delete a dirent
3283 * from what is now the extent tree root (or a block map).
3284 */
3285 force_reread = (new.dir->i_ino == old.dir->i_ino &&
3286 ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
3249 if (!new.bh) { 3287 if (!new.bh) {
3250 retval = ext4_add_entry(handle, new.dentry, old.inode); 3288 retval = ext4_add_entry(handle, new.dentry, old.inode);
3251 if (retval) 3289 if (retval)
@@ -3256,6 +3294,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3256 if (retval) 3294 if (retval)
3257 goto end_rename; 3295 goto end_rename;
3258 } 3296 }
3297 if (force_reread)
3298 force_reread = !ext4_test_inode_flag(new.dir,
3299 EXT4_INODE_INLINE_DATA);
3259 3300
3260 /* 3301 /*
3261 * Like most other Unix systems, set the ctime for inodes on a 3302 * Like most other Unix systems, set the ctime for inodes on a
@@ -3267,7 +3308,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3267 /* 3308 /*
3268 * ok, that's it 3309 * ok, that's it
3269 */ 3310 */
3270 ext4_rename_delete(handle, &old); 3311 ext4_rename_delete(handle, &old, force_reread);
3271 3312
3272 if (new.inode) { 3313 if (new.inode) {
3273 ext4_dec_count(handle, new.inode); 3314 ext4_dec_count(handle, new.inode);
@@ -3330,6 +3371,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3330 3371
3331 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, 3372 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
3332 &old.de, &old.inlined); 3373 &old.de, &old.inlined);
3374 if (IS_ERR(old.bh))
3375 return PTR_ERR(old.bh);
3333 /* 3376 /*
3334 * Check for inode number is _not_ due to possible IO errors. 3377 * Check for inode number is _not_ due to possible IO errors.
3335 * We might rmdir the source, keep it as pwd of some process 3378 * We might rmdir the source, keep it as pwd of some process
@@ -3342,6 +3385,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3342 3385
3343 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3386 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3344 &new.de, &new.inlined); 3387 &new.de, &new.inlined);
3388 if (IS_ERR(new.bh)) {
3389 retval = PTR_ERR(new.bh);
3390 new.bh = NULL;
3391 goto end_rename;
3392 }
3345 3393
3346 /* RENAME_EXCHANGE case: old *and* new must both exist */ 3394 /* RENAME_EXCHANGE case: old *and* new must both exist */
3347 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) 3395 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f03e2e..1e43b905ff98 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -575,6 +575,7 @@ handle_bb:
575 bh = bclean(handle, sb, block); 575 bh = bclean(handle, sb, block);
576 if (IS_ERR(bh)) { 576 if (IS_ERR(bh)) {
577 err = PTR_ERR(bh); 577 err = PTR_ERR(bh);
578 bh = NULL;
578 goto out; 579 goto out;
579 } 580 }
580 overhead = ext4_group_overhead_blocks(sb, group); 581 overhead = ext4_group_overhead_blocks(sb, group);
@@ -603,6 +604,7 @@ handle_ib:
603 bh = bclean(handle, sb, block); 604 bh = bclean(handle, sb, block);
604 if (IS_ERR(bh)) { 605 if (IS_ERR(bh)) {
605 err = PTR_ERR(bh); 606 err = PTR_ERR(bh);
607 bh = NULL;
606 goto out; 608 goto out;
607 } 609 }
608 610
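Both resize.c hunks fix the same cleanup hazard: bclean() returns an ERR_PTR() on failure, and the shared exit label releases bh, so without nulling it brelse() would be handed an error pointer. The shape of the fix:

bh = bclean(handle, sb, block);
if (IS_ERR(bh)) {
	err = PTR_ERR(bh);
	bh = NULL;		/* brelse(NULL) below is a safe no-op */
	goto out;
}
/* ... */
out:
	brelse(bh);
	return err;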
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 32b43ad154b9..05c159218bc2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3181,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3181 3181
3182 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3182 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3183 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 3183 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3184 /* journal checksum v2 */ 3184 /* journal checksum v3 */
3185 compat = 0; 3185 compat = 0;
3186 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; 3186 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3187 } else { 3187 } else {
3188 /* journal checksum v1 */ 3188 /* journal checksum v1 */
3189 compat = JBD2_FEATURE_COMPAT_CHECKSUM; 3189 compat = JBD2_FEATURE_COMPAT_CHECKSUM;
@@ -3205,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3205 jbd2_journal_clear_features(sbi->s_journal, 3205 jbd2_journal_clear_features(sbi->s_journal,
3206 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3206 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3207 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | 3207 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3208 JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3208 JBD2_FEATURE_INCOMPAT_CSUM_V2); 3209 JBD2_FEATURE_INCOMPAT_CSUM_V2);
3209 } 3210 }
3210 3211
@@ -3891,7 +3892,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3891 /* Register extent status tree shrinker */ 3892 /* Register extent status tree shrinker */
3892 ext4_es_register_shrinker(sbi); 3893 ext4_es_register_shrinker(sbi);
3893 3894
3894 if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { 3895 err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
3896 if (err) {
3895 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3897 ext4_msg(sb, KERN_ERR, "insufficient memory");
3896 goto failed_mount3; 3898 goto failed_mount3;
3897 } 3899 }
@@ -4105,17 +4107,20 @@ no_journal:
4105 block = ext4_count_free_clusters(sb); 4107 block = ext4_count_free_clusters(sb);
4106 ext4_free_blocks_count_set(sbi->s_es, 4108 ext4_free_blocks_count_set(sbi->s_es,
4107 EXT4_C2B(sbi, block)); 4109 EXT4_C2B(sbi, block));
4108 err = percpu_counter_init(&sbi->s_freeclusters_counter, block); 4110 err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4111 GFP_KERNEL);
4109 if (!err) { 4112 if (!err) {
4110 unsigned long freei = ext4_count_free_inodes(sb); 4113 unsigned long freei = ext4_count_free_inodes(sb);
4111 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 4114 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4112 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); 4115 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4116 GFP_KERNEL);
4113 } 4117 }
4114 if (!err) 4118 if (!err)
4115 err = percpu_counter_init(&sbi->s_dirs_counter, 4119 err = percpu_counter_init(&sbi->s_dirs_counter,
4116 ext4_count_dirs(sb)); 4120 ext4_count_dirs(sb), GFP_KERNEL);
4117 if (!err) 4121 if (!err)
4118 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); 4122 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4123 GFP_KERNEL);
4119 if (err) { 4124 if (err) {
4120 ext4_msg(sb, KERN_ERR, "insufficient memory"); 4125 ext4_msg(sb, KERN_ERR, "insufficient memory");
4121 goto failed_mount6; 4126 goto failed_mount6;
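These super.c hunks track an API change: percpu_counter_init() gained an explicit gfp_t parameter, so the allocation context is stated by the caller rather than assumed. A minimal, self-contained usage sketch (the counter name is illustrative; mount-time paths like the ones above can safely pass GFP_KERNEL):

#include <linux/percpu_counter.h>

static struct percpu_counter nr_things;

static int things_init(void)
{
	int err = percpu_counter_init(&nr_things, 0, GFP_KERNEL);

	if (err)
		return err;
	percpu_counter_inc(&nr_things);
	percpu_counter_destroy(&nr_things);
	return 0;
}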
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 214fe1054fce..736a348509f7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -23,7 +23,7 @@ config F2FS_STAT_FS
23 mounted as f2fs. Each file shows the whole f2fs information. 23 mounted as f2fs. Each file shows the whole f2fs information.
24 24
25 /sys/kernel/debug/f2fs/status includes: 25 /sys/kernel/debug/f2fs/status includes:
26 - major file system information managed by f2fs currently 26 - major filesystem information managed by f2fs currently
27 - average SIT information about whole segments 27 - average SIT information about whole segments
28 - current memory footprint consumed by f2fs. 28 - current memory footprint consumed by f2fs.
29 29
@@ -68,6 +68,6 @@ config F2FS_CHECK_FS
68 bool "F2FS consistency checking feature" 68 bool "F2FS consistency checking feature"
69 depends on F2FS_FS 69 depends on F2FS_FS
70 help 70 help
71 Enables BUG_ONs which check the file system consistency in runtime. 71 Enables BUG_ONs which check the filesystem consistency at runtime.
72 72
73 If you want to improve the performance, say N. 73 If you want to improve the performance, say N.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 6aeed5bada52..dd10a031c052 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -72,7 +72,22 @@ out:
72 return page; 72 return page;
73} 73}
74 74
75static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) 75struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
76{
77 bool readahead = false;
78 struct page *page;
79
80 page = find_get_page(META_MAPPING(sbi), index);
81 if (!page || !PageUptodate(page))
82 readahead = true;
83 f2fs_put_page(page, 0);
84
85 if (readahead)
86 ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
87 return get_meta_page(sbi, index);
88}
89
90static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
76{ 91{
77 switch (type) { 92 switch (type) {
78 case META_NAT: 93 case META_NAT:
@@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
82 case META_SSA: 97 case META_SSA:
83 case META_CP: 98 case META_CP:
84 return 0; 99 return 0;
100 case META_POR:
101 return MAX_BLKADDR(sbi);
85 default: 102 default:
86 BUG(); 103 BUG();
87 } 104 }
@@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
90/* 107/*
91 * Readahead CP/NAT/SIT/SSA pages 108 * Readahead CP/NAT/SIT/SSA pages
92 */ 109 */
93int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) 110int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
94{ 111{
95 block_t prev_blk_addr = 0; 112 block_t prev_blk_addr = 0;
96 struct page *page; 113 struct page *page;
97 int blkno = start; 114 block_t blkno = start;
98 int max_blks = get_max_meta_blks(sbi, type); 115 block_t max_blks = get_max_meta_blks(sbi, type);
99 116
100 struct f2fs_io_info fio = { 117 struct f2fs_io_info fio = {
101 .type = META, 118 .type = META,
@@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
125 break; 142 break;
126 case META_SSA: 143 case META_SSA:
127 case META_CP: 144 case META_CP:
128 /* get ssa/cp block addr */ 145 case META_POR:
146 if (unlikely(blkno >= max_blks))
147 goto out;
148 if (unlikely(blkno < SEG0_BLKADDR(sbi)))
149 goto out;
129 blk_addr = blkno; 150 blk_addr = blkno;
130 break; 151 break;
131 default: 152 default:
@@ -151,8 +172,7 @@ out:
151static int f2fs_write_meta_page(struct page *page, 172static int f2fs_write_meta_page(struct page *page,
152 struct writeback_control *wbc) 173 struct writeback_control *wbc)
153{ 174{
154 struct inode *inode = page->mapping->host; 175 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
155 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
156 176
157 trace_f2fs_writepage(page, META); 177 trace_f2fs_writepage(page, META);
158 178
@@ -160,14 +180,11 @@ static int f2fs_write_meta_page(struct page *page,
160 goto redirty_out; 180 goto redirty_out;
161 if (wbc->for_reclaim) 181 if (wbc->for_reclaim)
162 goto redirty_out; 182 goto redirty_out;
163 183 if (unlikely(f2fs_cp_error(sbi)))
164 /* Should not write any meta pages, if any IO error was occurred */ 184 goto redirty_out;
165 if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
166 goto no_write;
167 185
168 f2fs_wait_on_page_writeback(page, META); 186 f2fs_wait_on_page_writeback(page, META);
169 write_meta_page(sbi, page); 187 write_meta_page(sbi, page);
170no_write:
171 dec_page_count(sbi, F2FS_DIRTY_META); 188 dec_page_count(sbi, F2FS_DIRTY_META);
172 unlock_page(page); 189 unlock_page(page);
173 return 0; 190 return 0;
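The repeated page->mapping->host->i_sb chains throughout f2fs collapse into accessor helpers in this series. Judging from the call sites, they are thin wrappers of roughly this shape; this is a reconstruction, not the verbatim f2fs.h text:

static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
{
	return F2FS_SB(inode->i_sb);
}

static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
{
	return F2FS_I_SB(mapping->host);
}

static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
{
	return F2FS_M_SB(page->mapping);
}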
@@ -180,7 +197,7 @@ redirty_out:
180static int f2fs_write_meta_pages(struct address_space *mapping, 197static int f2fs_write_meta_pages(struct address_space *mapping,
181 struct writeback_control *wbc) 198 struct writeback_control *wbc)
182{ 199{
183 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 200 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
184 long diff, written; 201 long diff, written;
185 202
186 trace_f2fs_writepages(mapping->host, wbc, META); 203 trace_f2fs_writepages(mapping->host, wbc, META);
@@ -262,15 +279,12 @@ continue_unlock:
262 279
263static int f2fs_set_meta_page_dirty(struct page *page) 280static int f2fs_set_meta_page_dirty(struct page *page)
264{ 281{
265 struct address_space *mapping = page->mapping;
266 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
267
268 trace_f2fs_set_page_dirty(page, META); 282 trace_f2fs_set_page_dirty(page, META);
269 283
270 SetPageUptodate(page); 284 SetPageUptodate(page);
271 if (!PageDirty(page)) { 285 if (!PageDirty(page)) {
272 __set_page_dirty_nobuffers(page); 286 __set_page_dirty_nobuffers(page);
273 inc_page_count(sbi, F2FS_DIRTY_META); 287 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
274 return 1; 288 return 1;
275 } 289 }
276 return 0; 290 return 0;
@@ -348,7 +362,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
348 return e ? true : false; 362 return e ? true : false;
349} 363}
350 364
351static void release_dirty_inode(struct f2fs_sb_info *sbi) 365void release_dirty_inode(struct f2fs_sb_info *sbi)
352{ 366{
353 struct ino_entry *e, *tmp; 367 struct ino_entry *e, *tmp;
354 int i; 368 int i;
@@ -381,7 +395,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi)
381void release_orphan_inode(struct f2fs_sb_info *sbi) 395void release_orphan_inode(struct f2fs_sb_info *sbi)
382{ 396{
383 spin_lock(&sbi->ino_lock[ORPHAN_INO]); 397 spin_lock(&sbi->ino_lock[ORPHAN_INO]);
384 f2fs_bug_on(sbi->n_orphans == 0); 398 f2fs_bug_on(sbi, sbi->n_orphans == 0);
385 sbi->n_orphans--; 399 sbi->n_orphans--;
386 spin_unlock(&sbi->ino_lock[ORPHAN_INO]); 400 spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
387} 401}
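f2fs_bug_on() now takes the sbi as its first argument so a failed consistency check can be downgraded from a crash to a deferred fsck request. A plausible definition, reconstructed from the converted call sites and the new need_fsck flag written into the checkpoint below; the exact macro in f2fs.h may differ:

#ifdef CONFIG_F2FS_CHECK_FS
#define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
#else
#define f2fs_bug_on(sbi, condition)					\
	do {								\
		if (unlikely(condition)) {				\
			WARN_ON(1);					\
			(sbi)->need_fsck = true;			\
		}							\
	} while (0)
#endif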
@@ -401,7 +415,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
401static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 415static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
402{ 416{
403 struct inode *inode = f2fs_iget(sbi->sb, ino); 417 struct inode *inode = f2fs_iget(sbi->sb, ino);
404 f2fs_bug_on(IS_ERR(inode)); 418 f2fs_bug_on(sbi, IS_ERR(inode));
405 clear_nlink(inode); 419 clear_nlink(inode);
406 420
407 /* truncate all the data during iput */ 421 /* truncate all the data during iput */
@@ -446,8 +460,8 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
446 struct f2fs_orphan_block *orphan_blk = NULL; 460 struct f2fs_orphan_block *orphan_blk = NULL;
447 unsigned int nentries = 0; 461 unsigned int nentries = 0;
448 unsigned short index; 462 unsigned short index;
449 unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + 463 unsigned short orphan_blocks =
450 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); 464 (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
451 struct page *page = NULL; 465 struct page *page = NULL;
452 struct ino_entry *orphan = NULL; 466 struct ino_entry *orphan = NULL;
453 467
@@ -462,7 +476,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
462 list_for_each_entry(orphan, head, list) { 476 list_for_each_entry(orphan, head, list) {
463 if (!page) { 477 if (!page) {
464 page = find_get_page(META_MAPPING(sbi), start_blk++); 478 page = find_get_page(META_MAPPING(sbi), start_blk++);
465 f2fs_bug_on(!page); 479 f2fs_bug_on(sbi, !page);
466 orphan_blk = 480 orphan_blk =
467 (struct f2fs_orphan_block *)page_address(page); 481 (struct f2fs_orphan_block *)page_address(page);
468 memset(orphan_blk, 0, sizeof(*orphan_blk)); 482 memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -622,7 +636,7 @@ fail_no_cp:
622 636
623static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) 637static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
624{ 638{
625 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 639 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
626 640
627 if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) 641 if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
628 return -EEXIST; 642 return -EEXIST;
@@ -634,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
634 return 0; 648 return 0;
635} 649}
636 650
637void set_dirty_dir_page(struct inode *inode, struct page *page) 651void update_dirty_page(struct inode *inode, struct page *page)
638{ 652{
639 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 653 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
640 struct dir_inode_entry *new; 654 struct dir_inode_entry *new;
641 int ret = 0; 655 int ret = 0;
642 656
643 if (!S_ISDIR(inode->i_mode)) 657 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
644 return; 658 return;
645 659
660 if (!S_ISDIR(inode->i_mode)) {
661 inode_inc_dirty_pages(inode);
662 goto out;
663 }
664
646 new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 665 new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
647 new->inode = inode; 666 new->inode = inode;
648 INIT_LIST_HEAD(&new->list); 667 INIT_LIST_HEAD(&new->list);
649 668
650 spin_lock(&sbi->dir_inode_lock); 669 spin_lock(&sbi->dir_inode_lock);
651 ret = __add_dirty_inode(inode, new); 670 ret = __add_dirty_inode(inode, new);
652 inode_inc_dirty_dents(inode); 671 inode_inc_dirty_pages(inode);
653 SetPagePrivate(page);
654 spin_unlock(&sbi->dir_inode_lock); 672 spin_unlock(&sbi->dir_inode_lock);
655 673
656 if (ret) 674 if (ret)
657 kmem_cache_free(inode_entry_slab, new); 675 kmem_cache_free(inode_entry_slab, new);
676out:
677 SetPagePrivate(page);
658} 678}
659 679
660void add_dirty_dir_inode(struct inode *inode) 680void add_dirty_dir_inode(struct inode *inode)
661{ 681{
662 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 682 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
663 struct dir_inode_entry *new = 683 struct dir_inode_entry *new =
664 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 684 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
665 int ret = 0; 685 int ret = 0;
@@ -677,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode)
677 697
678void remove_dirty_dir_inode(struct inode *inode) 698void remove_dirty_dir_inode(struct inode *inode)
679{ 699{
680 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 700 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
681 struct dir_inode_entry *entry; 701 struct dir_inode_entry *entry;
682 702
683 if (!S_ISDIR(inode->i_mode)) 703 if (!S_ISDIR(inode->i_mode))
684 return; 704 return;
685 705
686 spin_lock(&sbi->dir_inode_lock); 706 spin_lock(&sbi->dir_inode_lock);
687 if (get_dirty_dents(inode) || 707 if (get_dirty_pages(inode) ||
688 !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { 708 !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
689 spin_unlock(&sbi->dir_inode_lock); 709 spin_unlock(&sbi->dir_inode_lock);
690 return; 710 return;
@@ -737,7 +757,7 @@ retry:
737/* 757/*
738 * Freeze all the FS-operations for checkpoint. 758 * Freeze all the FS-operations for checkpoint.
739 */ 759 */
740static void block_operations(struct f2fs_sb_info *sbi) 760static int block_operations(struct f2fs_sb_info *sbi)
741{ 761{
742 struct writeback_control wbc = { 762 struct writeback_control wbc = {
743 .sync_mode = WB_SYNC_ALL, 763 .sync_mode = WB_SYNC_ALL,
@@ -745,6 +765,7 @@ static void block_operations(struct f2fs_sb_info *sbi)
745 .for_reclaim = 0, 765 .for_reclaim = 0,
746 }; 766 };
747 struct blk_plug plug; 767 struct blk_plug plug;
768 int err = 0;
748 769
749 blk_start_plug(&plug); 770 blk_start_plug(&plug);
750 771
@@ -754,11 +775,15 @@ retry_flush_dents:
754 if (get_pages(sbi, F2FS_DIRTY_DENTS)) { 775 if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
755 f2fs_unlock_all(sbi); 776 f2fs_unlock_all(sbi);
756 sync_dirty_dir_inodes(sbi); 777 sync_dirty_dir_inodes(sbi);
778 if (unlikely(f2fs_cp_error(sbi))) {
779 err = -EIO;
780 goto out;
781 }
757 goto retry_flush_dents; 782 goto retry_flush_dents;
758 } 783 }
759 784
760 /* 785 /*
761 * POR: we should ensure that there is no dirty node pages 786 * POR: we should ensure that there are no dirty node pages
762 * until finishing nat/sit flush. 787 * until finishing nat/sit flush.
763 */ 788 */
764retry_flush_nodes: 789retry_flush_nodes:
@@ -767,9 +792,16 @@ retry_flush_nodes:
767 if (get_pages(sbi, F2FS_DIRTY_NODES)) { 792 if (get_pages(sbi, F2FS_DIRTY_NODES)) {
768 up_write(&sbi->node_write); 793 up_write(&sbi->node_write);
769 sync_node_pages(sbi, 0, &wbc); 794 sync_node_pages(sbi, 0, &wbc);
795 if (unlikely(f2fs_cp_error(sbi))) {
796 f2fs_unlock_all(sbi);
797 err = -EIO;
798 goto out;
799 }
770 goto retry_flush_nodes; 800 goto retry_flush_nodes;
771 } 801 }
802out:
772 blk_finish_plug(&plug); 803 blk_finish_plug(&plug);
804 return err;
773} 805}
774 806
775static void unblock_operations(struct f2fs_sb_info *sbi) 807static void unblock_operations(struct f2fs_sb_info *sbi)
@@ -793,11 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
793 finish_wait(&sbi->cp_wait, &wait); 825 finish_wait(&sbi->cp_wait, &wait);
794} 826}
795 827
796static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) 828static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
797{ 829{
798 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 830 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
799 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 831 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
800 nid_t last_nid = 0; 832 struct f2fs_nm_info *nm_i = NM_I(sbi);
833 nid_t last_nid = nm_i->next_scan_nid;
801 block_t start_blk; 834 block_t start_blk;
802 struct page *cp_page; 835 struct page *cp_page;
803 unsigned int data_sum_blocks, orphan_blocks; 836 unsigned int data_sum_blocks, orphan_blocks;
@@ -813,8 +846,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
813 discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg)); 846 discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
814 847
815 /* Flush all the NAT/SIT pages */ 848 /* Flush all the NAT/SIT pages */
816 while (get_pages(sbi, F2FS_DIRTY_META)) 849 while (get_pages(sbi, F2FS_DIRTY_META)) {
817 sync_meta_pages(sbi, META, LONG_MAX); 850 sync_meta_pages(sbi, META, LONG_MAX);
851 if (unlikely(f2fs_cp_error(sbi)))
852 return;
853 }
818 854
819 next_free_nid(sbi, &last_nid); 855 next_free_nid(sbi, &last_nid);
820 856
@@ -825,7 +861,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
825 ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); 861 ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
826 ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); 862 ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
827 ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); 863 ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
828 for (i = 0; i < 3; i++) { 864 for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
829 ckpt->cur_node_segno[i] = 865 ckpt->cur_node_segno[i] =
830 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); 866 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
831 ckpt->cur_node_blkoff[i] = 867 ckpt->cur_node_blkoff[i] =
@@ -833,7 +869,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
833 ckpt->alloc_type[i + CURSEG_HOT_NODE] = 869 ckpt->alloc_type[i + CURSEG_HOT_NODE] =
834 curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); 870 curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
835 } 871 }
836 for (i = 0; i < 3; i++) { 872 for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
837 ckpt->cur_data_segno[i] = 873 ckpt->cur_data_segno[i] =
838 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); 874 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
839 ckpt->cur_data_blkoff[i] = 875 ckpt->cur_data_blkoff[i] =
@@ -848,24 +884,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
848 884
849 /* 2 cp + n data seg summary + orphan inode blocks */ 885 /* 2 cp + n data seg summary + orphan inode blocks */
850 data_sum_blocks = npages_for_summary_flush(sbi); 886 data_sum_blocks = npages_for_summary_flush(sbi);
851 if (data_sum_blocks < 3) 887 if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
852 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); 888 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
853 else 889 else
854 clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); 890 clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
855 891
856 orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) 892 orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
857 / F2FS_ORPHANS_PER_BLOCK;
858 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + 893 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
859 orphan_blocks); 894 orphan_blocks);
860 895
861 if (is_umount) { 896 if (cpc->reason == CP_UMOUNT) {
862 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); 897 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
863 ckpt->cp_pack_total_block_count = cpu_to_le32(2 + 898 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
864 cp_payload_blks + data_sum_blocks + 899 cp_payload_blks + data_sum_blocks +
865 orphan_blocks + NR_CURSEG_NODE_TYPE); 900 orphan_blocks + NR_CURSEG_NODE_TYPE);
866 } else { 901 } else {
867 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); 902 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
868 ckpt->cp_pack_total_block_count = cpu_to_le32(2 + 903 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
869 cp_payload_blks + data_sum_blocks + 904 cp_payload_blks + data_sum_blocks +
870 orphan_blocks); 905 orphan_blocks);
871 } 906 }
@@ -875,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
875 else 910 else
876 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); 911 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
877 912
913 if (sbi->need_fsck)
914 set_ckpt_flags(ckpt, CP_FSCK_FLAG);
915
878 /* update SIT/NAT bitmap */ 916 /* update SIT/NAT bitmap */
879 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); 917 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
880 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); 918 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
@@ -909,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
909 947
910 write_data_summaries(sbi, start_blk); 948 write_data_summaries(sbi, start_blk);
911 start_blk += data_sum_blocks; 949 start_blk += data_sum_blocks;
912 if (is_umount) { 950 if (cpc->reason == CP_UMOUNT) {
913 write_node_summaries(sbi, start_blk); 951 write_node_summaries(sbi, start_blk);
914 start_blk += NR_CURSEG_NODE_TYPE; 952 start_blk += NR_CURSEG_NODE_TYPE;
915 } 953 }
@@ -924,6 +962,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
924 /* wait for previous submitted node/meta pages writeback */ 962 /* wait for previous submitted node/meta pages writeback */
925 wait_on_all_pages_writeback(sbi); 963 wait_on_all_pages_writeback(sbi);
926 964
965 if (unlikely(f2fs_cp_error(sbi)))
966 return;
967
927 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); 968 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
928 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); 969 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
929 970
@@ -934,27 +975,35 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
934 /* Here, we only have one bio having CP pack */ 975 /* Here, we only have one bio having CP pack */
935 sync_meta_pages(sbi, META_FLUSH, LONG_MAX); 976 sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
936 977
937 if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) { 978 release_dirty_inode(sbi);
938 clear_prefree_segments(sbi); 979
939 release_dirty_inode(sbi); 980 if (unlikely(f2fs_cp_error(sbi)))
940 F2FS_RESET_SB_DIRT(sbi); 981 return;
941 } 982
983 clear_prefree_segments(sbi);
984 F2FS_RESET_SB_DIRT(sbi);
942} 985}
943 986
944/* 987/*
945 * We guarantee that this checkpoint procedure should not fail. 988 * We guarantee that this checkpoint procedure will not fail.
946 */ 989 */
947void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) 990void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
948{ 991{
949 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 992 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
950 unsigned long long ckpt_ver; 993 unsigned long long ckpt_ver;
951 994
952 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); 995 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
953 996
954 mutex_lock(&sbi->cp_mutex); 997 mutex_lock(&sbi->cp_mutex);
955 block_operations(sbi);
956 998
957 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); 999 if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
1000 goto out;
1001 if (unlikely(f2fs_cp_error(sbi)))
1002 goto out;
1003 if (block_operations(sbi))
1004 goto out;
1005
1006 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
958 1007
959 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1008 f2fs_submit_merged_bio(sbi, DATA, WRITE);
960 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1009 f2fs_submit_merged_bio(sbi, NODE, WRITE);
@@ -970,16 +1019,16 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
970 1019
971 /* write cached NAT/SIT entries to NAT/SIT area */ 1020 /* write cached NAT/SIT entries to NAT/SIT area */
972 flush_nat_entries(sbi); 1021 flush_nat_entries(sbi);
973 flush_sit_entries(sbi); 1022 flush_sit_entries(sbi, cpc);
974 1023
975 /* unlock all the fs_lock[] in do_checkpoint() */ 1024 /* unlock all the fs_lock[] in do_checkpoint() */
976 do_checkpoint(sbi, is_umount); 1025 do_checkpoint(sbi, cpc);
977 1026
978 unblock_operations(sbi); 1027 unblock_operations(sbi);
979 mutex_unlock(&sbi->cp_mutex);
980
981 stat_inc_cp_count(sbi->stat_info); 1028 stat_inc_cp_count(sbi->stat_info);
982 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); 1029out:
1030 mutex_unlock(&sbi->cp_mutex);
1031 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
983} 1032}
984 1033
985void init_ino_entry_info(struct f2fs_sb_info *sbi) 1034void init_ino_entry_info(struct f2fs_sb_info *sbi)
@@ -999,8 +1048,8 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi)
999 * for cp pack we can have max 1020*504 orphan entries 1048 * for cp pack we can have max 1020*504 orphan entries
1000 */ 1049 */
1001 sbi->n_orphans = 0; 1050 sbi->n_orphans = 0;
1002 sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) 1051 sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
1003 * F2FS_ORPHANS_PER_BLOCK; 1052 NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
1004} 1053}
1005 1054
1006int __init create_checkpoint_caches(void) 1055int __init create_checkpoint_caches(void)
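The write_checkpoint() signature change replaces the old is_umount boolean with a struct cp_control, so callers can express more than two checkpoint reasons. Only CP_UMOUNT and CP_DISCARD are visible in these hunks; the other fields below are assumptions based on flush_sit_entries(sbi, cpc) taking the structure for the discard path:

/* Implied shape, reconstructed from the call sites above: */
struct cp_control {
	int reason;		/* CP_UMOUNT, CP_DISCARD, ... */
	__u64 trim_start;	/* assumed: consumed by the CP_DISCARD path */
	__u64 trim_end;
	__u64 trim_minlen;
};

A caller then looks like:

	struct cp_control cpc = { .reason = CP_UMOUNT };
	write_checkpoint(sbi, &cpc);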
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 03313099c51c..8e58c4cc2cb9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -53,7 +53,7 @@ static void f2fs_write_end_io(struct bio *bio, int err)
53 struct page *page = bvec->bv_page; 53 struct page *page = bvec->bv_page;
54 54
55 if (unlikely(err)) { 55 if (unlikely(err)) {
56 SetPageError(page); 56 set_page_dirty(page);
57 set_bit(AS_EIO, &page->mapping->flags); 57 set_bit(AS_EIO, &page->mapping->flags);
58 f2fs_stop_checkpoint(sbi); 58 f2fs_stop_checkpoint(sbi);
59 } 59 }
@@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
85 bio = bio_alloc(GFP_NOIO, npages); 85 bio = bio_alloc(GFP_NOIO, npages);
86 86
87 bio->bi_bdev = sbi->sb->s_bdev; 87 bio->bi_bdev = sbi->sb->s_bdev;
88 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); 88 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
89 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; 89 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
90 bio->bi_private = sbi; 90 bio->bi_private = sbi;
91 91
@@ -193,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
193 __submit_merged_bio(io); 193 __submit_merged_bio(io);
194alloc_new: 194alloc_new:
195 if (io->bio == NULL) { 195 if (io->bio == NULL) {
196 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 196 int bio_blocks = MAX_BIO_BLOCKS(sbi);
197 197
198 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); 198 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
199 io->fio = *fio; 199 io->fio = *fio;
@@ -236,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
236 236
237int reserve_new_block(struct dnode_of_data *dn) 237int reserve_new_block(struct dnode_of_data *dn)
238{ 238{
239 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 239 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
240 240
241 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 241 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
242 return -EPERM; 242 return -EPERM;
@@ -258,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
258 int err; 258 int err;
259 259
260 /* if inode_page exists, index should be zero */ 260 /* if inode_page exists, index should be zero */
261 f2fs_bug_on(!need_put && index); 261 f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
262 262
263 err = get_dnode_of_data(dn, index, ALLOC_NODE); 263 err = get_dnode_of_data(dn, index, ALLOC_NODE);
264 if (err) 264 if (err)
@@ -321,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
321 block_t start_blkaddr, end_blkaddr; 321 block_t start_blkaddr, end_blkaddr;
322 int need_update = true; 322 int need_update = true;
323 323
324 f2fs_bug_on(blk_addr == NEW_ADDR); 324 f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
325 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 325 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
326 dn->ofs_in_node; 326 dn->ofs_in_node;
327 327
@@ -396,7 +396,6 @@ end_update:
396 396
397struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) 397struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
398{ 398{
399 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
400 struct address_space *mapping = inode->i_mapping; 399 struct address_space *mapping = inode->i_mapping;
401 struct dnode_of_data dn; 400 struct dnode_of_data dn;
402 struct page *page; 401 struct page *page;
@@ -429,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
429 return page; 428 return page;
430 } 429 }
431 430
432 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 431 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
433 sync ? READ_SYNC : READA); 432 sync ? READ_SYNC : READA);
434 if (err) 433 if (err)
435 return ERR_PTR(err); 434 return ERR_PTR(err);
@@ -451,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
451 */ 450 */
452struct page *get_lock_data_page(struct inode *inode, pgoff_t index) 451struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
453{ 452{
454 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
455 struct address_space *mapping = inode->i_mapping; 453 struct address_space *mapping = inode->i_mapping;
456 struct dnode_of_data dn; 454 struct dnode_of_data dn;
457 struct page *page; 455 struct page *page;
@@ -490,7 +488,8 @@ repeat:
490 return page; 488 return page;
491 } 489 }
492 490
493 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); 491 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
492 dn.data_blkaddr, READ_SYNC);
494 if (err) 493 if (err)
495 return ERR_PTR(err); 494 return ERR_PTR(err);
496 495
@@ -517,7 +516,6 @@ repeat:
517struct page *get_new_data_page(struct inode *inode, 516struct page *get_new_data_page(struct inode *inode,
518 struct page *ipage, pgoff_t index, bool new_i_size) 517 struct page *ipage, pgoff_t index, bool new_i_size)
519{ 518{
520 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
521 struct address_space *mapping = inode->i_mapping; 519 struct address_space *mapping = inode->i_mapping;
522 struct page *page; 520 struct page *page;
523 struct dnode_of_data dn; 521 struct dnode_of_data dn;
@@ -541,8 +539,8 @@ repeat:
541 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 539 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
542 SetPageUptodate(page); 540 SetPageUptodate(page);
543 } else { 541 } else {
544 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 542 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
545 READ_SYNC); 543 dn.data_blkaddr, READ_SYNC);
546 if (err) 544 if (err)
547 goto put_err; 545 goto put_err;
548 546
@@ -573,10 +571,12 @@ put_err:
573 571
574static int __allocate_data_block(struct dnode_of_data *dn) 572static int __allocate_data_block(struct dnode_of_data *dn)
575{ 573{
576 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 574 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
575 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
577 struct f2fs_summary sum; 576 struct f2fs_summary sum;
578 block_t new_blkaddr; 577 block_t new_blkaddr;
579 struct node_info ni; 578 struct node_info ni;
579 pgoff_t fofs;
580 int type; 580 int type;
581 581
582 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 582 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
@@ -599,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn)
599 update_extent_cache(new_blkaddr, dn); 599 update_extent_cache(new_blkaddr, dn);
600 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); 600 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
601 601
602 /* update i_size */
603 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
604 dn->ofs_in_node;
605 if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
606 i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
607
602 dn->data_blkaddr = new_blkaddr; 608 dn->data_blkaddr = new_blkaddr;
603 return 0; 609 return 0;
604} 610}
@@ -614,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
614static int __get_data_block(struct inode *inode, sector_t iblock, 620static int __get_data_block(struct inode *inode, sector_t iblock,
615 struct buffer_head *bh_result, int create, bool fiemap) 621 struct buffer_head *bh_result, int create, bool fiemap)
616{ 622{
617 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
618 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 623 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
619 unsigned maxblocks = bh_result->b_size >> blkbits; 624 unsigned maxblocks = bh_result->b_size >> blkbits;
620 struct dnode_of_data dn; 625 struct dnode_of_data dn;
@@ -630,8 +635,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
630 goto out; 635 goto out;
631 636
632 if (create) { 637 if (create) {
633 f2fs_balance_fs(sbi); 638 f2fs_balance_fs(F2FS_I_SB(inode));
634 f2fs_lock_op(sbi); 639 f2fs_lock_op(F2FS_I_SB(inode));
635 } 640 }
636 641
637 /* When reading holes, we need its node page */ 642 /* When reading holes, we need its node page */
@@ -691,7 +696,7 @@ get_next:
691 allocated = true; 696 allocated = true;
692 blkaddr = dn.data_blkaddr; 697 blkaddr = dn.data_blkaddr;
693 } 698 }
694 /* Give more consecutive addresses for the read ahead */ 699 /* Give more consecutive addresses for the readahead */
695 if (blkaddr == (bh_result->b_blocknr + ofs)) { 700 if (blkaddr == (bh_result->b_blocknr + ofs)) {
696 ofs++; 701 ofs++;
697 dn.ofs_in_node++; 702 dn.ofs_in_node++;
@@ -707,7 +712,7 @@ put_out:
707 f2fs_put_dnode(&dn); 712 f2fs_put_dnode(&dn);
708unlock_out: 713unlock_out:
709 if (create) 714 if (create)
710 f2fs_unlock_op(sbi); 715 f2fs_unlock_op(F2FS_I_SB(inode));
711out: 716out:
712 trace_f2fs_get_data_block(inode, iblock, bh_result, err); 717 trace_f2fs_get_data_block(inode, iblock, bh_result, err);
713 return err; 718 return err;
@@ -739,7 +744,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
739 744
740 trace_f2fs_readpage(page, DATA); 745 trace_f2fs_readpage(page, DATA);
741 746
742 /* If the file has inline data, try to read it directlly */ 747 /* If the file has inline data, try to read it directly */
743 if (f2fs_has_inline_data(inode)) 748 if (f2fs_has_inline_data(inode))
744 ret = f2fs_read_inline_data(inode, page); 749 ret = f2fs_read_inline_data(inode, page);
745 else 750 else
@@ -804,7 +809,7 @@ static int f2fs_write_data_page(struct page *page,
804 struct writeback_control *wbc) 809 struct writeback_control *wbc)
805{ 810{
806 struct inode *inode = page->mapping->host; 811 struct inode *inode = page->mapping->host;
807 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 812 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
808 loff_t i_size = i_size_read(inode); 813 loff_t i_size = i_size_read(inode);
809 const pgoff_t end_index = ((unsigned long long) i_size) 814 const pgoff_t end_index = ((unsigned long long) i_size)
810 >> PAGE_CACHE_SHIFT; 815 >> PAGE_CACHE_SHIFT;
@@ -836,10 +841,19 @@ write:
836 841
837 /* Dentry blocks are controlled by checkpoint */ 842 /* Dentry blocks are controlled by checkpoint */
838 if (S_ISDIR(inode->i_mode)) { 843 if (S_ISDIR(inode->i_mode)) {
844 if (unlikely(f2fs_cp_error(sbi)))
845 goto redirty_out;
839 err = do_write_data_page(page, &fio); 846 err = do_write_data_page(page, &fio);
840 goto done; 847 goto done;
841 } 848 }
842 849
850 /* we should bypass data pages to let the kworker jobs proceed */
851 if (unlikely(f2fs_cp_error(sbi))) {
852 SetPageError(page);
853 unlock_page(page);
854 goto out;
855 }
856
843 if (!wbc->for_reclaim) 857 if (!wbc->for_reclaim)
844 need_balance_fs = true; 858 need_balance_fs = true;
845 else if (has_not_enough_free_secs(sbi, 0)) 859 else if (has_not_enough_free_secs(sbi, 0))
@@ -857,7 +871,7 @@ done:
857 871
858 clear_cold_data(page); 872 clear_cold_data(page);
859out: 873out:
860 inode_dec_dirty_dents(inode); 874 inode_dec_dirty_pages(inode);
861 unlock_page(page); 875 unlock_page(page);
862 if (need_balance_fs) 876 if (need_balance_fs)
863 f2fs_balance_fs(sbi); 877 f2fs_balance_fs(sbi);
@@ -883,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
883 struct writeback_control *wbc) 897 struct writeback_control *wbc)
884{ 898{
885 struct inode *inode = mapping->host; 899 struct inode *inode = mapping->host;
886 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 900 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
887 bool locked = false; 901 bool locked = false;
888 int ret; 902 int ret;
889 long diff; 903 long diff;
@@ -895,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
895 return 0; 909 return 0;
896 910
897 if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && 911 if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
898 get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) && 912 get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
899 available_free_memory(sbi, DIRTY_DENTS)) 913 available_free_memory(sbi, DIRTY_DENTS))
900 goto skip_write; 914 goto skip_write;
901 915
@@ -917,7 +931,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
917 return ret; 931 return ret;
918 932
919skip_write: 933skip_write:
920 wbc->pages_skipped += get_dirty_dents(inode); 934 wbc->pages_skipped += get_dirty_pages(inode);
921 return 0; 935 return 0;
922} 936}
923 937
@@ -927,7 +941,7 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
927 941
928 if (to > inode->i_size) { 942 if (to > inode->i_size) {
929 truncate_pagecache(inode, inode->i_size); 943 truncate_pagecache(inode, inode->i_size);
930 truncate_blocks(inode, inode->i_size); 944 truncate_blocks(inode, inode->i_size, true);
931 } 945 }
932} 946}
933 947
@@ -936,7 +950,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
936 struct page **pagep, void **fsdata) 950 struct page **pagep, void **fsdata)
937{ 951{
938 struct inode *inode = mapping->host; 952 struct inode *inode = mapping->host;
939 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 953 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
940 struct page *page; 954 struct page *page;
941 pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; 955 pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
942 struct dnode_of_data dn; 956 struct dnode_of_data dn;
@@ -946,7 +960,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
946 960
947 f2fs_balance_fs(sbi); 961 f2fs_balance_fs(sbi);
948repeat: 962repeat:
949 err = f2fs_convert_inline_data(inode, pos + len); 963 err = f2fs_convert_inline_data(inode, pos + len, NULL);
950 if (err) 964 if (err)
951 goto fail; 965 goto fail;
952 966
@@ -1038,7 +1052,10 @@ static int f2fs_write_end(struct file *file,
1038 1052
1039 trace_f2fs_write_end(inode, pos, len, copied); 1053 trace_f2fs_write_end(inode, pos, len, copied);
1040 1054
1041 set_page_dirty(page); 1055 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
1056 register_inmem_page(inode, page);
1057 else
1058 set_page_dirty(page);
1042 1059
1043 if (pos + copied > i_size_read(inode)) { 1060 if (pos + copied > i_size_read(inode)) {
1044 i_size_write(inode, pos + copied); 1061 i_size_write(inode, pos + copied);
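For atomic and volatile files, write_end now queues the page on the inode's new inmem_pages list instead of dirtying it, so nothing reaches disk until an explicit commit. A sketch of a commit step assembled from symbols this patch adds; the pairing with f2fs_sync_file() and the meaning of the bool argument are assumptions:

	/* Hypothetical commit helper, not the patch's exact ioctl body. */
	static int my_commit_atomic_write(struct file *filp)
	{
		struct inode *inode = file_inode(filp);

		if (!f2fs_is_atomic_file(inode))
			return 0;			/* nothing staged */

		commit_inmem_pages(inode, false);	/* false assumed = commit */
		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
		return f2fs_sync_file(filp, 0, LLONG_MAX, 0);
	}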
@@ -1083,9 +1100,6 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
1083 if (check_direct_IO(inode, rw, iter, offset)) 1100 if (check_direct_IO(inode, rw, iter, offset))
1084 return 0; 1101 return 0;
1085 1102
1086 /* clear fsync mark to recover these blocks */
1087 fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
1088
1089 trace_f2fs_direct_IO_enter(inode, offset, count, rw); 1103 trace_f2fs_direct_IO_enter(inode, offset, count, rw);
1090 1104
1091 err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); 1105 err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
@@ -1101,8 +1115,12 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
1101 unsigned int length) 1115 unsigned int length)
1102{ 1116{
1103 struct inode *inode = page->mapping->host; 1117 struct inode *inode = page->mapping->host;
1118
1119 if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
1120 return;
1121
1104 if (PageDirty(page)) 1122 if (PageDirty(page))
1105 inode_dec_dirty_dents(inode); 1123 inode_dec_dirty_pages(inode);
1106 ClearPagePrivate(page); 1124 ClearPagePrivate(page);
1107} 1125}
1108 1126
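The new early return means a sub-page invalidation, as punch-hole can produce, keeps the page's dirty accounting and PagePrivate intact; only a whole-page invalidation drops them. Hypothetical call patterns for contrast:

	/* Hypothetical offsets, contrasting the two cases of the guard: */
	f2fs_invalidate_data_page(page, 512, 1024);	/* partial: early
							 * return, accounting
							 * preserved */
	f2fs_invalidate_data_page(page, 0, PAGE_CACHE_SIZE); /* full page:
							 * dirty count dropped */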
@@ -1124,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
1124 1142
1125 if (!PageDirty(page)) { 1143 if (!PageDirty(page)) {
1126 __set_page_dirty_nobuffers(page); 1144 __set_page_dirty_nobuffers(page);
1127 set_dirty_dir_page(inode, page); 1145 update_dirty_page(inode, page);
1128 return 1; 1146 return 1;
1129 } 1147 }
1130 return 0; 1148 return 0;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a441ba33be11..0a91ab813a9e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -32,7 +32,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
32 struct f2fs_stat_info *si = F2FS_STAT(sbi); 32 struct f2fs_stat_info *si = F2FS_STAT(sbi);
33 int i; 33 int i;
34 34
35 /* valid check of the segment numbers */ 35 /* validation check of the segment numbers */
36 si->hit_ext = sbi->read_hit_ext; 36 si->hit_ext = sbi->read_hit_ext;
37 si->total_ext = sbi->total_hit_ext; 37 si->total_ext = sbi->total_hit_ext;
38 si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); 38 si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
93 total_vblocks = 0; 93 total_vblocks = 0;
94 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); 94 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
95 hblks_per_sec = blks_per_sec / 2; 95 hblks_per_sec = blks_per_sec / 2;
96 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { 96 for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
97 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); 97 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
98 dist = abs(vblocks - hblks_per_sec); 98 dist = abs(vblocks - hblks_per_sec);
99 bimodal += dist * dist; 99 bimodal += dist * dist;
@@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
103 ndirty++; 103 ndirty++;
104 } 104 }
105 } 105 }
106 dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; 106 dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
107 si->bimodal = bimodal / dist; 107 si->bimodal = bimodal / dist;
108 if (si->dirty_count) 108 if (si->dirty_count)
109 si->avg_vblocks = total_vblocks / ndirty; 109 si->avg_vblocks = total_vblocks / ndirty;
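For reference, the bimodality figure computed in this function reduces to a normalized second moment; this is a reading of the code above, not text from the patch:

	/*
	 * bimodal = 100 * sum_i (v_i - B/2)^2 / (N * (B/2)^2)
	 *
	 * where v_i = valid blocks in section i, B = blocks per section
	 * (so B/2 = hblks_per_sec), and N = MAIN_SECS(sbi). The result
	 * is 0 when every section is exactly half valid and approaches
	 * 100 as sections become all-valid or all-free, i.e. as the
	 * utilization distribution turns bimodal.
	 */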
@@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
131 131
132 /* build sit */ 132 /* build sit */
133 si->base_mem += sizeof(struct sit_info); 133 si->base_mem += sizeof(struct sit_info);
134 si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry); 134 si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
135 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); 135 si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
136 si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); 136 si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
137 if (sbi->segs_per_sec > 1) 137 if (sbi->segs_per_sec > 1)
138 si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); 138 si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
139 si->base_mem += __bitmap_size(sbi, SIT_BITMAP); 139 si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
140 140
141 /* build free segmap */ 141 /* build free segmap */
142 si->base_mem += sizeof(struct free_segmap_info); 142 si->base_mem += sizeof(struct free_segmap_info);
143 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); 143 si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
144 si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); 144 si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
145 145
146 /* build curseg */ 146 /* build curseg */
147 si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; 147 si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -149,10 +149,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
149 149
150 /* build dirty segmap */ 150 /* build dirty segmap */
151 si->base_mem += sizeof(struct dirty_seglist_info); 151 si->base_mem += sizeof(struct dirty_seglist_info);
152 si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); 152 si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
153 si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); 153 si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
154 154
155 /* buld nm */ 155 /* build nm */
156 si->base_mem += sizeof(struct f2fs_nm_info); 156 si->base_mem += sizeof(struct f2fs_nm_info);
157 si->base_mem += __bitmap_size(sbi, NAT_BITMAP); 157 si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
158 158
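The TOTAL_SEGS/TOTAL_SECS to MAIN_SEGS/MAIN_SECS rename narrows these loops and sizings to the main area, excluding metadata segments. The macro bodies live in segment.h and are not part of this diff; a plausible shape, stated purely as an assumption:

	/* Assumed definitions (segment.h is not shown in this diff): */
	#define MAIN_SEGS(sbi)	(SM_I(sbi)->main_segments)
	#define MAIN_SECS(sbi)	((sbi)->total_sections)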
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index bcf893c3d903..b54f87149c09 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -124,9 +124,9 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
124 124
125 /* 125 /*
126 * For the most part, it should be a bug when name_len is zero. 126 * For the most part, it should be a bug when name_len is zero.
127 * We stop here for figuring out where the bugs are occurred. 127 * We stop here for figuring out where the bugs have occurred.
128 */ 128 */
129 f2fs_bug_on(!de->name_len); 129 f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
130 130
131 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); 131 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
132 } 132 }
@@ -151,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
151 bool room = false; 151 bool room = false;
152 int max_slots = 0; 152 int max_slots = 0;
153 153
154 f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); 154 f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
155 155
156 nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); 156 nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
157 nblock = bucket_blocks(level); 157 nblock = bucket_blocks(level);
@@ -284,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
284 284
285int update_dent_inode(struct inode *inode, const struct qstr *name) 285int update_dent_inode(struct inode *inode, const struct qstr *name)
286{ 286{
287 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
288 struct page *page; 287 struct page *page;
289 288
290 page = get_node_page(sbi, inode->i_ino); 289 page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
291 if (IS_ERR(page)) 290 if (IS_ERR(page))
292 return PTR_ERR(page); 291 return PTR_ERR(page);
293 292
@@ -337,7 +336,6 @@ static int make_empty_dir(struct inode *inode,
337static struct page *init_inode_metadata(struct inode *inode, 336static struct page *init_inode_metadata(struct inode *inode,
338 struct inode *dir, const struct qstr *name) 337 struct inode *dir, const struct qstr *name)
339{ 338{
340 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
341 struct page *page; 339 struct page *page;
342 int err; 340 int err;
343 341
@@ -360,7 +358,7 @@ static struct page *init_inode_metadata(struct inode *inode,
360 if (err) 358 if (err)
361 goto put_error; 359 goto put_error;
362 } else { 360 } else {
363 page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); 361 page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
364 if (IS_ERR(page)) 362 if (IS_ERR(page))
365 return page; 363 return page;
366 364
@@ -381,7 +379,7 @@ static struct page *init_inode_metadata(struct inode *inode,
381 * we should remove this inode from orphan list. 379 * we should remove this inode from orphan list.
382 */ 380 */
383 if (inode->i_nlink == 0) 381 if (inode->i_nlink == 0)
384 remove_orphan_inode(sbi, inode->i_ino); 382 remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
385 inc_nlink(inode); 383 inc_nlink(inode);
386 } 384 }
387 return page; 385 return page;
@@ -391,7 +389,7 @@ put_error:
391error: 389error:
392 /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ 390 /* once the failed inode becomes a bad inode, i_mode is S_IFREG */
393 truncate_inode_pages(&inode->i_data, 0); 391 truncate_inode_pages(&inode->i_data, 0);
394 truncate_blocks(inode, 0); 392 truncate_blocks(inode, 0, false);
395 remove_dirty_dir_inode(inode); 393 remove_dirty_dir_inode(inode);
396 remove_inode_page(inode); 394 remove_inode_page(inode);
397 return ERR_PTR(err); 395 return ERR_PTR(err);
@@ -563,7 +561,7 @@ fail:
563} 561}
564 562
565/* 563/*
566 * It only removes the dentry from the dentry page,corresponding name 564 * It only removes the dentry from the dentry page, corresponding name
567 * entry in name page does not need to be touched during deletion. 565 * entry in name page does not need to be touched during deletion.
568 */ 566 */
569void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, 567void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
@@ -571,8 +569,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
571{ 569{
572 struct f2fs_dentry_block *dentry_blk; 570 struct f2fs_dentry_block *dentry_blk;
573 unsigned int bit_pos; 571 unsigned int bit_pos;
574 struct address_space *mapping = page->mapping; 572 struct inode *dir = page->mapping->host;
575 struct inode *dir = mapping->host;
576 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); 573 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
577 int i; 574 int i;
578 575
@@ -594,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
594 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 591 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
595 592
596 if (inode) { 593 if (inode) {
597 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 594 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
598 595
599 down_write(&F2FS_I(inode)->i_sem); 596 down_write(&F2FS_I(inode)->i_sem);
600 597
@@ -621,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
621 truncate_hole(dir, page->index, page->index + 1); 618 truncate_hole(dir, page->index, page->index + 1);
622 clear_page_dirty_for_io(page); 619 clear_page_dirty_for_io(page);
623 ClearPageUptodate(page); 620 ClearPageUptodate(page);
624 inode_dec_dirty_dents(dir); 621 inode_dec_dirty_pages(dir);
625 } 622 }
626 f2fs_put_page(page, 1); 623 f2fs_put_page(page, 1);
627} 624}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 4dab5338a97a..8171e80b2ee9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,16 @@
21#include <linux/sched.h> 21#include <linux/sched.h>
22 22
23#ifdef CONFIG_F2FS_CHECK_FS 23#ifdef CONFIG_F2FS_CHECK_FS
24#define f2fs_bug_on(condition) BUG_ON(condition) 24#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
25#define f2fs_down_write(x, y) down_write_nest_lock(x, y) 25#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
26#else 26#else
27#define f2fs_bug_on(condition) 27#define f2fs_bug_on(sbi, condition) \
28 do { \
29 if (unlikely(condition)) { \
30 WARN_ON(1); \
31 sbi->need_fsck = true; \
32 } \
33 } while (0)
28#define f2fs_down_write(x, y) down_write(x) 34#define f2fs_down_write(x, y) down_write(x)
29#endif 35#endif
30 36
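With CONFIG_F2FS_CHECK_FS disabled, a failed invariant no longer panics the machine: the fallback warns once and flags the superblock so fsck.f2fs can repair the image offline. The contrast, using an assertion that appears later in this patch:

	/*
	 * Same assertion, two behaviors:
	 *   CONFIG_F2FS_CHECK_FS=y : BUG_ON(count < 0), the kernel panics.
	 *   otherwise              : WARN_ON(1), sbi->need_fsck = true,
	 *                            and execution continues.
	 */
	f2fs_bug_on(sbi, count < 0);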
@@ -90,6 +96,20 @@ enum {
90 SIT_BITMAP 96 SIT_BITMAP
91}; 97};
92 98
99enum {
100 CP_UMOUNT,
101 CP_SYNC,
102 CP_DISCARD,
103};
104
105struct cp_control {
106 int reason;
107 __u64 trim_start;
108 __u64 trim_end;
109 __u64 trim_minlen;
110 __u64 trimmed;
111};
112
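write_checkpoint() callers now describe what they want through cp_control rather than a bare bool. A sketch of a CP_DISCARD (FITRIM-style) caller; the variable names and field units are assumptions, not something this hunk specifies:

	/* Sketch only; start_segno, end_segno, minlen_blocks are assumed. */
	struct cp_control cpc = {
		.reason		= CP_DISCARD,
		.trim_start	= start_segno,
		.trim_end	= end_segno,
		.trim_minlen	= minlen_blocks,
	};
	write_checkpoint(sbi, &cpc);
	/* cpc.trimmed presumably reports back how much was discarded */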
93/* 113/*
94 * For CP/NAT/SIT/SSA readahead 114 * For CP/NAT/SIT/SSA readahead
95 */ 115 */
@@ -97,7 +117,8 @@ enum {
97 META_CP, 117 META_CP,
98 META_NAT, 118 META_NAT,
99 META_SIT, 119 META_SIT,
100 META_SSA 120 META_SSA,
121 META_POR,
101}; 122};
102 123
103/* for the list of ino */ 124/* for the list of ino */
@@ -130,7 +151,9 @@ struct discard_entry {
130struct fsync_inode_entry { 151struct fsync_inode_entry {
131 struct list_head list; /* list head */ 152 struct list_head list; /* list head */
132 struct inode *inode; /* vfs inode pointer */ 153 struct inode *inode; /* vfs inode pointer */
133 block_t blkaddr; /* block address locating the last inode */ 154 block_t blkaddr; /* block address locating the last fsync */
155 block_t last_dentry; /* block address locating the last dentry */
156 block_t last_inode; /* block address locating the last inode */
134}; 157};
135 158
136#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) 159#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
@@ -141,6 +164,9 @@ struct fsync_inode_entry {
141#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) 164#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
142#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) 165#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
143 166
167#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
168#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
169
144static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) 170static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
145{ 171{
146 int before = nats_in_cursum(rs); 172 int before = nats_in_cursum(rs);
@@ -155,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
155 return before; 181 return before;
156} 182}
157 183
184static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
185 int type)
186{
187 if (type == NAT_JOURNAL)
188 return size <= MAX_NAT_JENTRIES(sum);
189 return size <= MAX_SIT_JENTRIES(sum);
190}
191
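__has_cursum_space() centralizes the test "does the journal area of the current summary block still fit N more entries". A hypothetical call site:

	/* Hypothetical call site with made-up helper names: journal NAT
	 * updates in place only while the summary block has room. */
	if (__has_cursum_space(sum, nr_dirty_nats, NAT_JOURNAL))
		journal_nat_entries(sum, nr_dirty_nats);
	else
		flush_nat_blocks(sbi);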
158/* 192/*
159 * ioctl commands 193 * ioctl commands
160 */ 194 */
161#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS 195#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
162#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS 196#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
197
198#define F2FS_IOCTL_MAGIC 0xf5
199#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
200#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
201#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
163 202
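The three new ioctls give userspace transactional writes: begin, buffer pages in memory, commit atomically. A userspace sketch; it assumes the numbers above are mirrored into a uapi header (they are redefined locally here), with error handling elided:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	/* Mirrors the kernel definitions above. */
	#define F2FS_IOCTL_MAGIC		0xf5
	#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
	#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)

	static int atomic_update(const char *path, const void *buf, size_t len)
	{
		int fd = open(path, O_RDWR);

		if (fd < 0)
			return -1;
		ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE);	/* begin transaction */
		pwrite(fd, buf, len, 0);		/* staged in memory  */
		ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);/* all-or-nothing    */
		close(fd);
		return 0;
	}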
164#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 203#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
165/* 204/*
@@ -222,13 +261,16 @@ struct f2fs_inode_info {
222 /* Use below internally in f2fs */ 261 /* Use below internally in f2fs */
223 unsigned long flags; /* use to pass per-file flags */ 262 unsigned long flags; /* use to pass per-file flags */
224 struct rw_semaphore i_sem; /* protect fi info */ 263 struct rw_semaphore i_sem; /* protect fi info */
225 atomic_t dirty_dents; /* # of dirty dentry pages */ 264 atomic_t dirty_pages; /* # of dirty pages */
226 f2fs_hash_t chash; /* hash value of given file name */ 265 f2fs_hash_t chash; /* hash value of given file name */
227 unsigned int clevel; /* maximum level of given file name */ 266 unsigned int clevel; /* maximum level of given file name */
228 nid_t i_xattr_nid; /* node id that contains xattrs */ 267 nid_t i_xattr_nid; /* node id that contains xattrs */
229 unsigned long long xattr_ver; /* cp version of xattr modification */ 268 unsigned long long xattr_ver; /* cp version of xattr modification */
230 struct extent_info ext; /* in-memory extent cache entry */ 269 struct extent_info ext; /* in-memory extent cache entry */
231 struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ 270 struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
271
272 struct list_head inmem_pages; /* inmemory pages managed by f2fs */
273 struct mutex inmem_lock; /* lock for inmemory pages */
232}; 274};
233 275
234static inline void get_extent_info(struct extent_info *ext, 276static inline void get_extent_info(struct extent_info *ext,
@@ -260,11 +302,10 @@ struct f2fs_nm_info {
260 302
261 /* NAT cache management */ 303 /* NAT cache management */
262 struct radix_tree_root nat_root;/* root of the nat entry cache */ 304 struct radix_tree_root nat_root;/* root of the nat entry cache */
305 struct radix_tree_root nat_set_root;/* root of the nat set cache */
263 rwlock_t nat_tree_lock; /* protect nat entry cache */ 306 rwlock_t nat_tree_lock; /* protect nat entry cache */
264 unsigned int nat_cnt; /* the # of cached nat entries */
265 struct list_head nat_entries; /* cached nat entry list (clean) */ 307 struct list_head nat_entries; /* cached nat entry list (clean) */
266 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ 308 unsigned int nat_cnt; /* the # of cached nat entries */
267 struct list_head nat_entry_set; /* nat entry set list */
268 unsigned int dirty_nat_cnt; /* total num of nat entries in set */ 309 unsigned int dirty_nat_cnt; /* total num of nat entries in set */
269 310
270 /* free node ids management */ 311 /* free node ids management */
@@ -332,18 +373,16 @@ enum {
332}; 373};
333 374
334struct flush_cmd { 375struct flush_cmd {
335 struct flush_cmd *next;
336 struct completion wait; 376 struct completion wait;
377 struct llist_node llnode;
337 int ret; 378 int ret;
338}; 379};
339 380
340struct flush_cmd_control { 381struct flush_cmd_control {
341 struct task_struct *f2fs_issue_flush; /* flush thread */ 382 struct task_struct *f2fs_issue_flush; /* flush thread */
342 wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ 383 wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
343 struct flush_cmd *issue_list; /* list for command issue */ 384 struct llist_head issue_list; /* list for command issue */
344 struct flush_cmd *dispatch_list; /* list for command dispatch */ 385 struct llist_node *dispatch_list; /* list for command dispatch */
345 spinlock_t issue_lock; /* for issue list lock */
346 struct flush_cmd *issue_tail; /* list tail of issue list */
347}; 386};
348 387
349struct f2fs_sm_info { 388struct f2fs_sm_info {
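Replacing the hand-rolled list plus spinlock with an llist makes the producer side lock-free. A sketch of both halves of the pattern; the helper names are hypothetical (the real ones live in segment.c, outside this hunk):

	/* Producer: lockless push, then sleep until the flusher answers. */
	static int my_issue_flush(struct flush_cmd_control *fcc)
	{
		struct flush_cmd cmd;

		init_completion(&cmd.wait);
		llist_add(&cmd.llnode, &fcc->issue_list);	/* atomic push */
		wake_up(&fcc->flush_wait_queue);
		wait_for_completion(&cmd.wait);
		return cmd.ret;
	}

	/* Consumer: detach the whole pending list in one atomic operation. */
	static void my_dispatch_flushes(struct flush_cmd_control *fcc)
	{
		struct flush_cmd *cmd, *next;

		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
		fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
		llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) {
			cmd->ret = 0;	/* the flush bio would be issued here */
			complete(&cmd->wait);
		}
	}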
@@ -369,8 +408,11 @@ struct f2fs_sm_info {
369 int nr_discards; /* # of discards in the list */ 408 int nr_discards; /* # of discards in the list */
370 int max_discards; /* max. discards to be issued */ 409 int max_discards; /* max. discards to be issued */
371 410
411 struct list_head sit_entry_set; /* sit entry set list */
412
372 unsigned int ipu_policy; /* in-place-update policy */ 413 unsigned int ipu_policy; /* in-place-update policy */
373 unsigned int min_ipu_util; /* in-place-update threshold */ 414 unsigned int min_ipu_util; /* in-place-update threshold */
415 unsigned int min_fsync_blocks; /* threshold for fsync */
374 416
375 /* for flush command control */ 417 /* for flush command control */
376 struct flush_cmd_control *cmd_control_info; 418 struct flush_cmd_control *cmd_control_info;
@@ -395,7 +437,7 @@ enum count_type {
395}; 437};
396 438
397/* 439/*
398 * The below are the page types of bios used in submti_bio(). 440 * The below are the page types of bios used in submit_bio().
399 * The available types are: 441 * The available types are:
400 * DATA User data pages. It operates as async mode. 442 * DATA User data pages. It operates as async mode.
401 * NODE Node pages. It operates as async mode. 443 * NODE Node pages. It operates as async mode.
@@ -434,6 +476,7 @@ struct f2fs_sb_info {
434 struct buffer_head *raw_super_buf; /* buffer head of raw sb */ 476 struct buffer_head *raw_super_buf; /* buffer head of raw sb */
435 struct f2fs_super_block *raw_super; /* raw super block pointer */ 477 struct f2fs_super_block *raw_super; /* raw super block pointer */
436 int s_dirty; /* dirty flag for checkpoint */ 478 int s_dirty; /* dirty flag for checkpoint */
479 bool need_fsck; /* need fsck.f2fs to fix */
437 480
438 /* for node-related operations */ 481 /* for node-related operations */
439 struct f2fs_nm_info *nm_info; /* node manager */ 482 struct f2fs_nm_info *nm_info; /* node manager */
@@ -470,7 +513,7 @@ struct f2fs_sb_info {
470 struct list_head dir_inode_list; /* dir inode list */ 513 struct list_head dir_inode_list; /* dir inode list */
471 spinlock_t dir_inode_lock; /* for dir inode list lock */ 514 spinlock_t dir_inode_lock; /* for dir inode list lock */
472 515
473 /* basic file system units */ 516 /* basic filesystem units */
474 unsigned int log_sectors_per_block; /* log2 sectors per block */ 517 unsigned int log_sectors_per_block; /* log2 sectors per block */
475 unsigned int log_blocksize; /* log2 block size */ 518 unsigned int log_blocksize; /* log2 block size */
476 unsigned int blocksize; /* block size */ 519 unsigned int blocksize; /* block size */
@@ -539,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
539 return sb->s_fs_info; 582 return sb->s_fs_info;
540} 583}
541 584
585static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
586{
587 return F2FS_SB(inode->i_sb);
588}
589
590static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
591{
592 return F2FS_I_SB(mapping->host);
593}
594
595static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
596{
597 return F2FS_M_SB(page->mapping);
598}
599
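The three accessors encode the derivation chain page -> mapping -> host inode -> super_block -> f2fs_sb_info, which is why so many F2FS_SB(inode->i_sb) calls elsewhere in this patch collapse to a single helper. Illustrative uses:

	/* Pick the accessor matching what the context already has in hand. */
	struct f2fs_sb_info *sbi;

	sbi = F2FS_I_SB(inode);		/* from an inode         */
	sbi = F2FS_M_SB(mapping);	/* from an address_space */
	sbi = F2FS_P_SB(page);		/* from a mapped page    */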
542static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) 600static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
543{ 601{
544 return (struct f2fs_super_block *)(sbi->raw_super); 602 return (struct f2fs_super_block *)(sbi->raw_super);
@@ -703,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
703 blkcnt_t count) 761 blkcnt_t count)
704{ 762{
705 spin_lock(&sbi->stat_lock); 763 spin_lock(&sbi->stat_lock);
706 f2fs_bug_on(sbi->total_valid_block_count < (block_t) count); 764 f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
707 f2fs_bug_on(inode->i_blocks < count); 765 f2fs_bug_on(sbi, inode->i_blocks < count);
708 inode->i_blocks -= count; 766 inode->i_blocks -= count;
709 sbi->total_valid_block_count -= (block_t)count; 767 sbi->total_valid_block_count -= (block_t)count;
710 spin_unlock(&sbi->stat_lock); 768 spin_unlock(&sbi->stat_lock);
@@ -716,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
716 F2FS_SET_SB_DIRT(sbi); 774 F2FS_SET_SB_DIRT(sbi);
717} 775}
718 776
719static inline void inode_inc_dirty_dents(struct inode *inode) 777static inline void inode_inc_dirty_pages(struct inode *inode)
720{ 778{
721 inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); 779 atomic_inc(&F2FS_I(inode)->dirty_pages);
722 atomic_inc(&F2FS_I(inode)->dirty_dents); 780 if (S_ISDIR(inode->i_mode))
781 inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
723} 782}
724 783
725static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) 784static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -727,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
727 atomic_dec(&sbi->nr_pages[count_type]); 786 atomic_dec(&sbi->nr_pages[count_type]);
728} 787}
729 788
730static inline void inode_dec_dirty_dents(struct inode *inode) 789static inline void inode_dec_dirty_pages(struct inode *inode)
731{ 790{
732 if (!S_ISDIR(inode->i_mode)) 791 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
733 return; 792 return;
734 793
735 dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); 794 atomic_dec(&F2FS_I(inode)->dirty_pages);
736 atomic_dec(&F2FS_I(inode)->dirty_dents); 795
796 if (S_ISDIR(inode->i_mode))
797 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
737} 798}
738 799
739static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) 800static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -741,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
741 return atomic_read(&sbi->nr_pages[count_type]); 802 return atomic_read(&sbi->nr_pages[count_type]);
742} 803}
743 804
744static inline int get_dirty_dents(struct inode *inode) 805static inline int get_dirty_pages(struct inode *inode)
745{ 806{
746 return atomic_read(&F2FS_I(inode)->dirty_dents); 807 return atomic_read(&F2FS_I(inode)->dirty_pages);
747} 808}
748 809
749static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) 810static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -799,7 +860,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
799 860
800 /* 861 /*
801 * odd numbered checkpoint should be at cp segment 0 862 * odd numbered checkpoint should be at cp segment 0
802 * and even segent must be at cp segment 1 863 * and even segment must be at cp segment 1
803 */ 864 */
804 if (!(ckpt_version & 1)) 865 if (!(ckpt_version & 1))
805 start_addr += sbi->blocks_per_seg; 866 start_addr += sbi->blocks_per_seg;
@@ -848,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
848{ 909{
849 spin_lock(&sbi->stat_lock); 910 spin_lock(&sbi->stat_lock);
850 911
851 f2fs_bug_on(!sbi->total_valid_block_count); 912 f2fs_bug_on(sbi, !sbi->total_valid_block_count);
852 f2fs_bug_on(!sbi->total_valid_node_count); 913 f2fs_bug_on(sbi, !sbi->total_valid_node_count);
853 f2fs_bug_on(!inode->i_blocks); 914 f2fs_bug_on(sbi, !inode->i_blocks);
854 915
855 inode->i_blocks--; 916 inode->i_blocks--;
856 sbi->total_valid_node_count--; 917 sbi->total_valid_node_count--;
@@ -867,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
867static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) 928static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
868{ 929{
869 spin_lock(&sbi->stat_lock); 930 spin_lock(&sbi->stat_lock);
870 f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count); 931 f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
871 sbi->total_valid_inode_count++; 932 sbi->total_valid_inode_count++;
872 spin_unlock(&sbi->stat_lock); 933 spin_unlock(&sbi->stat_lock);
873} 934}
@@ -875,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
875static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) 936static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
876{ 937{
877 spin_lock(&sbi->stat_lock); 938 spin_lock(&sbi->stat_lock);
878 f2fs_bug_on(!sbi->total_valid_inode_count); 939 f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
879 sbi->total_valid_inode_count--; 940 sbi->total_valid_inode_count--;
880 spin_unlock(&sbi->stat_lock); 941 spin_unlock(&sbi->stat_lock);
881} 942}
@@ -891,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
891 return; 952 return;
892 953
893 if (unlock) { 954 if (unlock) {
894 f2fs_bug_on(!PageLocked(page)); 955 f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
895 unlock_page(page); 956 unlock_page(page);
896 } 957 }
897 page_cache_release(page); 958 page_cache_release(page);
@@ -998,7 +1059,9 @@ enum {
998 FI_INLINE_DATA, /* used for inline data*/ 1059 FI_INLINE_DATA, /* used for inline data*/
999 FI_APPEND_WRITE, /* inode has appended data */ 1060 FI_APPEND_WRITE, /* inode has appended data */
1000 FI_UPDATE_WRITE, /* inode has in-place-update data */ 1061 FI_UPDATE_WRITE, /* inode has in-place-update data */
1001 FI_NEED_IPU, /* used fo ipu for fdatasync */ 1062 FI_NEED_IPU, /* used for ipu per file */
1063 FI_ATOMIC_FILE, /* indicate atomic file */
1064 FI_VOLATILE_FILE, /* indicate volatile file */
1002}; 1065};
1003 1066
1004static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 1067static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1085,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode)
1085 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); 1148 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
1086} 1149}
1087 1150
1151static inline bool f2fs_is_atomic_file(struct inode *inode)
1152{
1153 return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
1154}
1155
1156static inline bool f2fs_is_volatile_file(struct inode *inode)
1157{
1158 return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
1159}
1160
1088static inline void *inline_data_addr(struct page *page) 1161static inline void *inline_data_addr(struct page *page)
1089{ 1162{
1090 struct f2fs_inode *ri = F2FS_INODE(page); 1163 struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1096,6 +1169,11 @@ static inline int f2fs_readonly(struct super_block *sb)
1096 return sb->s_flags & MS_RDONLY; 1169 return sb->s_flags & MS_RDONLY;
1097} 1170}
1098 1171
1172static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
1173{
1174 return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
1175}
1176
1099static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) 1177static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
1100{ 1178{
1101 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); 1179 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
@@ -1117,7 +1195,7 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
1117 */ 1195 */
1118int f2fs_sync_file(struct file *, loff_t, loff_t, int); 1196int f2fs_sync_file(struct file *, loff_t, loff_t, int);
1119void truncate_data_blocks(struct dnode_of_data *); 1197void truncate_data_blocks(struct dnode_of_data *);
1120int truncate_blocks(struct inode *, u64); 1198int truncate_blocks(struct inode *, u64, bool);
1121void f2fs_truncate(struct inode *); 1199void f2fs_truncate(struct inode *);
1122int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1200int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
1123int f2fs_setattr(struct dentry *, struct iattr *); 1201int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1136,6 +1214,7 @@ void update_inode(struct inode *, struct page *);
1136void update_inode_page(struct inode *); 1214void update_inode_page(struct inode *);
1137int f2fs_write_inode(struct inode *, struct writeback_control *); 1215int f2fs_write_inode(struct inode *, struct writeback_control *);
1138void f2fs_evict_inode(struct inode *); 1216void f2fs_evict_inode(struct inode *);
1217void handle_failed_inode(struct inode *);
1139 1218
1140/* 1219/*
1141 * namei.c 1220 * namei.c
@@ -1183,9 +1262,9 @@ struct dnode_of_data;
1183struct node_info; 1262struct node_info;
1184 1263
1185bool available_free_memory(struct f2fs_sb_info *, int); 1264bool available_free_memory(struct f2fs_sb_info *, int);
1186int is_checkpointed_node(struct f2fs_sb_info *, nid_t); 1265bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
1187bool fsync_mark_done(struct f2fs_sb_info *, nid_t); 1266bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
1188void fsync_mark_clear(struct f2fs_sb_info *, nid_t); 1267bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
1189void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); 1268void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
1190int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 1269int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1191int truncate_inode_blocks(struct inode *, pgoff_t); 1270int truncate_inode_blocks(struct inode *, pgoff_t);
@@ -1202,10 +1281,8 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
1202bool alloc_nid(struct f2fs_sb_info *, nid_t *); 1281bool alloc_nid(struct f2fs_sb_info *, nid_t *);
1203void alloc_nid_done(struct f2fs_sb_info *, nid_t); 1282void alloc_nid_done(struct f2fs_sb_info *, nid_t);
1204void alloc_nid_failed(struct f2fs_sb_info *, nid_t); 1283void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
1205void recover_node_page(struct f2fs_sb_info *, struct page *,
1206 struct f2fs_summary *, struct node_info *, block_t);
1207void recover_inline_xattr(struct inode *, struct page *); 1284void recover_inline_xattr(struct inode *, struct page *);
1208bool recover_xattr_data(struct inode *, struct page *, block_t); 1285void recover_xattr_data(struct inode *, struct page *, block_t);
1209int recover_inode_page(struct f2fs_sb_info *, struct page *); 1286int recover_inode_page(struct f2fs_sb_info *, struct page *);
1210int restore_node_summary(struct f2fs_sb_info *, unsigned int, 1287int restore_node_summary(struct f2fs_sb_info *, unsigned int,
1211 struct f2fs_summary_block *); 1288 struct f2fs_summary_block *);
@@ -1218,6 +1295,8 @@ void destroy_node_manager_caches(void);
1218/* 1295/*
1219 * segment.c 1296 * segment.c
1220 */ 1297 */
1298void register_inmem_page(struct inode *, struct page *);
1299void commit_inmem_pages(struct inode *, bool);
1221void f2fs_balance_fs(struct f2fs_sb_info *); 1300void f2fs_balance_fs(struct f2fs_sb_info *);
1222void f2fs_balance_fs_bg(struct f2fs_sb_info *); 1301void f2fs_balance_fs_bg(struct f2fs_sb_info *);
1223int f2fs_issue_flush(struct f2fs_sb_info *); 1302int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1226,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
1226void invalidate_blocks(struct f2fs_sb_info *, block_t); 1305void invalidate_blocks(struct f2fs_sb_info *, block_t);
1227void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); 1306void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
1228void clear_prefree_segments(struct f2fs_sb_info *); 1307void clear_prefree_segments(struct f2fs_sb_info *);
1308void release_discard_addrs(struct f2fs_sb_info *);
1229void discard_next_dnode(struct f2fs_sb_info *, block_t); 1309void discard_next_dnode(struct f2fs_sb_info *, block_t);
1230int npages_for_summary_flush(struct f2fs_sb_info *); 1310int npages_for_summary_flush(struct f2fs_sb_info *);
1231void allocate_new_segments(struct f2fs_sb_info *); 1311void allocate_new_segments(struct f2fs_sb_info *);
1312int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
1232struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1313struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
1233void write_meta_page(struct f2fs_sb_info *, struct page *); 1314void write_meta_page(struct f2fs_sb_info *, struct page *);
1234void write_node_page(struct f2fs_sb_info *, struct page *, 1315void write_node_page(struct f2fs_sb_info *, struct page *,
@@ -1238,8 +1319,6 @@ void write_data_page(struct page *, struct dnode_of_data *, block_t *,
1238void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); 1319void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
1239void recover_data_page(struct f2fs_sb_info *, struct page *, 1320void recover_data_page(struct f2fs_sb_info *, struct page *,
1240 struct f2fs_summary *, block_t, block_t); 1321 struct f2fs_summary *, block_t, block_t);
1241void rewrite_node_page(struct f2fs_sb_info *, struct page *,
1242 struct f2fs_summary *, block_t, block_t);
1243void allocate_data_block(struct f2fs_sb_info *, struct page *, 1322void allocate_data_block(struct f2fs_sb_info *, struct page *,
1244 block_t, block_t *, struct f2fs_summary *, int); 1323 block_t, block_t *, struct f2fs_summary *, int);
1245void f2fs_wait_on_page_writeback(struct page *, enum page_type); 1324void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1247,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t);
1247void write_node_summaries(struct f2fs_sb_info *, block_t); 1326void write_node_summaries(struct f2fs_sb_info *, block_t);
1248int lookup_journal_in_cursum(struct f2fs_summary_block *, 1327int lookup_journal_in_cursum(struct f2fs_summary_block *,
1249 int, unsigned int, int); 1328 int, unsigned int, int);
1250void flush_sit_entries(struct f2fs_sb_info *); 1329void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
1251int build_segment_manager(struct f2fs_sb_info *); 1330int build_segment_manager(struct f2fs_sb_info *);
1252void destroy_segment_manager(struct f2fs_sb_info *); 1331void destroy_segment_manager(struct f2fs_sb_info *);
1253int __init create_segment_manager_caches(void); 1332int __init create_segment_manager_caches(void);
@@ -1258,10 +1337,12 @@ void destroy_segment_manager_caches(void);
1258 */ 1337 */
1259struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); 1338struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
1260struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); 1339struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
1261int ra_meta_pages(struct f2fs_sb_info *, int, int, int); 1340struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
1341int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
1262long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); 1342long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
1263void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); 1343void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
1264void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); 1344void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
1345void release_dirty_inode(struct f2fs_sb_info *);
1265bool exist_written_data(struct f2fs_sb_info *, nid_t, int); 1346bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
1266int acquire_orphan_inode(struct f2fs_sb_info *); 1347int acquire_orphan_inode(struct f2fs_sb_info *);
1267void release_orphan_inode(struct f2fs_sb_info *); 1348void release_orphan_inode(struct f2fs_sb_info *);
@@ -1269,11 +1350,11 @@ void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1269void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1350void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1270void recover_orphan_inodes(struct f2fs_sb_info *); 1351void recover_orphan_inodes(struct f2fs_sb_info *);
1271int get_valid_checkpoint(struct f2fs_sb_info *); 1352int get_valid_checkpoint(struct f2fs_sb_info *);
1272void set_dirty_dir_page(struct inode *, struct page *); 1353void update_dirty_page(struct inode *, struct page *);
1273void add_dirty_dir_inode(struct inode *); 1354void add_dirty_dir_inode(struct inode *);
1274void remove_dirty_dir_inode(struct inode *); 1355void remove_dirty_dir_inode(struct inode *);
1275void sync_dirty_dir_inodes(struct f2fs_sb_info *); 1356void sync_dirty_dir_inodes(struct f2fs_sb_info *);
1276void write_checkpoint(struct f2fs_sb_info *, bool); 1357void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
1277void init_ino_entry_info(struct f2fs_sb_info *); 1358void init_ino_entry_info(struct f2fs_sb_info *);
1278int __init create_checkpoint_caches(void); 1359int __init create_checkpoint_caches(void);
1279void destroy_checkpoint_caches(void); 1360void destroy_checkpoint_caches(void);
@@ -1357,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1357#define stat_inc_inline_inode(inode) \ 1438#define stat_inc_inline_inode(inode) \
1358 do { \ 1439 do { \
1359 if (f2fs_has_inline_data(inode)) \ 1440 if (f2fs_has_inline_data(inode)) \
1360 ((F2FS_SB(inode->i_sb))->inline_inode++); \ 1441 ((F2FS_I_SB(inode))->inline_inode++); \
1361 } while (0) 1442 } while (0)
1362#define stat_dec_inline_inode(inode) \ 1443#define stat_dec_inline_inode(inode) \
1363 do { \ 1444 do { \
1364 if (f2fs_has_inline_data(inode)) \ 1445 if (f2fs_has_inline_data(inode)) \
1365 ((F2FS_SB(inode->i_sb))->inline_inode--); \ 1446 ((F2FS_I_SB(inode))->inline_inode--); \
1366 } while (0) 1447 } while (0)
1367 1448
1368#define stat_inc_seg_type(sbi, curseg) \ 1449#define stat_inc_seg_type(sbi, curseg) \
@@ -1439,8 +1520,8 @@ extern const struct inode_operations f2fs_special_inode_operations;
1439 */ 1520 */
1440bool f2fs_may_inline(struct inode *); 1521bool f2fs_may_inline(struct inode *);
1441int f2fs_read_inline_data(struct inode *, struct page *); 1522int f2fs_read_inline_data(struct inode *, struct page *);
1442int f2fs_convert_inline_data(struct inode *, pgoff_t); 1523int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
1443int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); 1524int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
1444void truncate_inline_data(struct inode *, u64); 1525void truncate_inline_data(struct inode *, u64);
1445int recover_inline_data(struct inode *, struct page *); 1526bool recover_inline_data(struct inode *, struct page *);
1446#endif 1527#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 208f1a9bd569..8e68bb64f835 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
33{ 33{
34 struct page *page = vmf->page; 34 struct page *page = vmf->page;
35 struct inode *inode = file_inode(vma->vm_file); 35 struct inode *inode = file_inode(vma->vm_file);
36 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 36 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
37 struct dnode_of_data dn; 37 struct dnode_of_data dn;
38 int err; 38 int err;
39 39
@@ -41,6 +41,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
41 41
42 sb_start_pagefault(inode->i_sb); 42 sb_start_pagefault(inode->i_sb);
43 43
 44 /* force conversion to normal data indices */
45 err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
46 if (err)
47 goto out;
48
44 /* block allocation */ 49 /* block allocation */
45 f2fs_lock_op(sbi); 50 f2fs_lock_op(sbi);
46 set_new_dnode(&dn, inode, NULL, NULL, 0); 51 set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -110,11 +115,31 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
110 return 1; 115 return 1;
111} 116}
112 117
118static inline bool need_do_checkpoint(struct inode *inode)
119{
120 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
121 bool need_cp = false;
122
123 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
124 need_cp = true;
125 else if (file_wrong_pino(inode))
126 need_cp = true;
127 else if (!space_for_roll_forward(sbi))
128 need_cp = true;
129 else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
130 need_cp = true;
131 else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
132 need_cp = true;
133
134 return need_cp;
135}
136
113int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 137int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
114{ 138{
115 struct inode *inode = file->f_mapping->host; 139 struct inode *inode = file->f_mapping->host;
116 struct f2fs_inode_info *fi = F2FS_I(inode); 140 struct f2fs_inode_info *fi = F2FS_I(inode);
117 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 141 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
142 nid_t ino = inode->i_ino;
118 int ret = 0; 143 int ret = 0;
119 bool need_cp = false; 144 bool need_cp = false;
120 struct writeback_control wbc = { 145 struct writeback_control wbc = {
@@ -129,12 +154,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
129 trace_f2fs_sync_file_enter(inode); 154 trace_f2fs_sync_file_enter(inode);
130 155
131 /* if fdatasync is triggered, let's do in-place-update */ 156 /* if fdatasync is triggered, let's do in-place-update */
132 if (datasync) 157 if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
133 set_inode_flag(fi, FI_NEED_IPU); 158 set_inode_flag(fi, FI_NEED_IPU);
134
135 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 159 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
136 if (datasync) 160 clear_inode_flag(fi, FI_NEED_IPU);
137 clear_inode_flag(fi, FI_NEED_IPU); 161
138 if (ret) { 162 if (ret) {
139 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 163 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
140 return ret; 164 return ret;
@@ -144,33 +168,31 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
144 * if there is no written data, don't waste time to write recovery info. 168 * if there is no written data, don't waste time to write recovery info.
145 */ 169 */
146 if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && 170 if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
147 !exist_written_data(sbi, inode->i_ino, APPEND_INO)) { 171 !exist_written_data(sbi, ino, APPEND_INO)) {
172 struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
173
 174 /* But we need to check for pending inode updates */
175 if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
176 f2fs_put_page(i, 0);
177 goto go_write;
178 }
179 f2fs_put_page(i, 0);
180
148 if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || 181 if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
149 exist_written_data(sbi, inode->i_ino, UPDATE_INO)) 182 exist_written_data(sbi, ino, UPDATE_INO))
150 goto flush_out; 183 goto flush_out;
151 goto out; 184 goto out;
152 } 185 }
153 186go_write:
154 /* guarantee free sections for fsync */ 187 /* guarantee free sections for fsync */
155 f2fs_balance_fs(sbi); 188 f2fs_balance_fs(sbi);
156 189
157 down_read(&fi->i_sem);
158
159 /* 190 /*
160 * Both fdatasync() and fsync() can be recovered from 191 * Both fdatasync() and fsync() can be recovered from
161 * sudden power-off. 192 * sudden power-off.
162 */ 193 */
163 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) 194 down_read(&fi->i_sem);
164 need_cp = true; 195 need_cp = need_do_checkpoint(inode);
165 else if (file_wrong_pino(inode))
166 need_cp = true;
167 else if (!space_for_roll_forward(sbi))
168 need_cp = true;
169 else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
170 need_cp = true;
171 else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
172 need_cp = true;
173
174 up_read(&fi->i_sem); 196 up_read(&fi->i_sem);
175 197
176 if (need_cp) { 198 if (need_cp) {
@@ -194,26 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
194 up_write(&fi->i_sem); 216 up_write(&fi->i_sem);
195 } 217 }
196 } else { 218 } else {
197 /* if there is no written node page, write its inode page */ 219sync_nodes:
198 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { 220 sync_node_pages(sbi, ino, &wbc);
199 if (fsync_mark_done(sbi, inode->i_ino)) 221
200 goto out; 222 if (need_inode_block_update(sbi, ino)) {
201 mark_inode_dirty_sync(inode); 223 mark_inode_dirty_sync(inode);
202 ret = f2fs_write_inode(inode, NULL); 224 ret = f2fs_write_inode(inode, NULL);
203 if (ret) 225 if (ret)
204 goto out; 226 goto out;
227 goto sync_nodes;
205 } 228 }
206 ret = wait_on_node_pages_writeback(sbi, inode->i_ino); 229
230 ret = wait_on_node_pages_writeback(sbi, ino);
207 if (ret) 231 if (ret)
208 goto out; 232 goto out;
209 233
210 /* once recovery info is written, don't need to track this */ 234 /* once recovery info is written, don't need to track this */
211 remove_dirty_inode(sbi, inode->i_ino, APPEND_INO); 235 remove_dirty_inode(sbi, ino, APPEND_INO);
212 clear_inode_flag(fi, FI_APPEND_WRITE); 236 clear_inode_flag(fi, FI_APPEND_WRITE);
213flush_out: 237flush_out:
214 remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO); 238 remove_dirty_inode(sbi, ino, UPDATE_INO);
215 clear_inode_flag(fi, FI_UPDATE_WRITE); 239 clear_inode_flag(fi, FI_UPDATE_WRITE);
216 ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); 240 ret = f2fs_issue_flush(F2FS_I_SB(inode));
217 } 241 }
218out: 242out:
219 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 243 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -288,7 +312,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
288 if (err && err != -ENOENT) { 312 if (err && err != -ENOENT) {
289 goto fail; 313 goto fail;
290 } else if (err == -ENOENT) { 314 } else if (err == -ENOENT) {
291 /* direct node is not exist */ 315 /* direct node does not exist */
292 if (whence == SEEK_DATA) { 316 if (whence == SEEK_DATA) {
293 pgofs = PGOFS_OF_NEXT_DNODE(pgofs, 317 pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
294 F2FS_I(inode)); 318 F2FS_I(inode));
@@ -340,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
340 maxbytes, i_size_read(inode)); 364 maxbytes, i_size_read(inode));
341 case SEEK_DATA: 365 case SEEK_DATA:
342 case SEEK_HOLE: 366 case SEEK_HOLE:
367 if (offset < 0)
368 return -ENXIO;
343 return f2fs_seek_block(file, offset, whence); 369 return f2fs_seek_block(file, offset, whence);
344 } 370 }
345 371
@@ -356,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
356int truncate_data_blocks_range(struct dnode_of_data *dn, int count) 382int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
357{ 383{
358 int nr_free = 0, ofs = dn->ofs_in_node; 384 int nr_free = 0, ofs = dn->ofs_in_node;
359 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 385 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
360 struct f2fs_node *raw_node; 386 struct f2fs_node *raw_node;
361 __le32 *addr; 387 __le32 *addr;
362 388
@@ -417,9 +443,9 @@ out:
417 f2fs_put_page(page, 1); 443 f2fs_put_page(page, 1);
418} 444}
419 445
420int truncate_blocks(struct inode *inode, u64 from) 446int truncate_blocks(struct inode *inode, u64 from, bool lock)
421{ 447{
422 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 448 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
423 unsigned int blocksize = inode->i_sb->s_blocksize; 449 unsigned int blocksize = inode->i_sb->s_blocksize;
424 struct dnode_of_data dn; 450 struct dnode_of_data dn;
425 pgoff_t free_from; 451 pgoff_t free_from;
@@ -433,14 +459,16 @@ int truncate_blocks(struct inode *inode, u64 from)
433 free_from = (pgoff_t) 459 free_from = (pgoff_t)
434 ((from + blocksize - 1) >> (sbi->log_blocksize)); 460 ((from + blocksize - 1) >> (sbi->log_blocksize));
435 461
436 f2fs_lock_op(sbi); 462 if (lock)
463 f2fs_lock_op(sbi);
437 464
438 set_new_dnode(&dn, inode, NULL, NULL, 0); 465 set_new_dnode(&dn, inode, NULL, NULL, 0);
439 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); 466 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
440 if (err) { 467 if (err) {
441 if (err == -ENOENT) 468 if (err == -ENOENT)
442 goto free_next; 469 goto free_next;
443 f2fs_unlock_op(sbi); 470 if (lock)
471 f2fs_unlock_op(sbi);
444 trace_f2fs_truncate_blocks_exit(inode, err); 472 trace_f2fs_truncate_blocks_exit(inode, err);
445 return err; 473 return err;
446 } 474 }
@@ -448,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from)
448 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 476 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
449 477
450 count -= dn.ofs_in_node; 478 count -= dn.ofs_in_node;
451 f2fs_bug_on(count < 0); 479 f2fs_bug_on(sbi, count < 0);
452 480
453 if (dn.ofs_in_node || IS_INODE(dn.node_page)) { 481 if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
454 truncate_data_blocks_range(&dn, count); 482 truncate_data_blocks_range(&dn, count);
@@ -458,7 +486,8 @@ int truncate_blocks(struct inode *inode, u64 from)
458 f2fs_put_dnode(&dn); 486 f2fs_put_dnode(&dn);
459free_next: 487free_next:
460 err = truncate_inode_blocks(inode, free_from); 488 err = truncate_inode_blocks(inode, free_from);
461 f2fs_unlock_op(sbi); 489 if (lock)
490 f2fs_unlock_op(sbi);
462done: 491done:
463 /* lastly zero out the first data page */ 492 /* lastly zero out the first data page */
464 truncate_partial_data_page(inode, from); 493 truncate_partial_data_page(inode, from);
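The new bool tells truncate_blocks() whether to take f2fs_lock_op() itself. Both variants as they appear in this patch; the reason for the false case, that the caller already holds the op lock, is an inference from the error path in init_inode_metadata():

	/* Regular truncate: take and drop f2fs_lock_op() internally. */
	truncate_blocks(inode, i_size_read(inode), true);

	/* Error path in init_inode_metadata(): presumably already under
	 * the caller's f2fs_lock_op(), so do not re-acquire it. */
	truncate_blocks(inode, 0, false);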
@@ -475,7 +504,7 @@ void f2fs_truncate(struct inode *inode)
475 504
476 trace_f2fs_truncate(inode); 505 trace_f2fs_truncate(inode);
477 506
478 if (!truncate_blocks(inode, i_size_read(inode))) { 507 if (!truncate_blocks(inode, i_size_read(inode), true)) {
479 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 508 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
480 mark_inode_dirty(inode); 509 mark_inode_dirty(inode);
481 } 510 }
@@ -531,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
531 if (err) 560 if (err)
532 return err; 561 return err;
533 562
534 if ((attr->ia_valid & ATTR_SIZE) && 563 if (attr->ia_valid & ATTR_SIZE) {
535 attr->ia_size != i_size_read(inode)) { 564 err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
536 err = f2fs_convert_inline_data(inode, attr->ia_size);
537 if (err) 565 if (err)
538 return err; 566 return err;
539 567
540 truncate_setsize(inode, attr->ia_size); 568 if (attr->ia_size != i_size_read(inode)) {
541 f2fs_truncate(inode); 569 truncate_setsize(inode, attr->ia_size);
542 f2fs_balance_fs(F2FS_SB(inode->i_sb)); 570 f2fs_truncate(inode);
571 f2fs_balance_fs(F2FS_I_SB(inode));
572 } else {
573 /*
574 * giving a chance to truncate blocks past EOF which
575 * are fallocated with FALLOC_FL_KEEP_SIZE.
576 */
577 f2fs_truncate(inode);
578 }
543 } 579 }
544 580
545 __setattr_copy(inode, attr); 581 __setattr_copy(inode, attr);
@@ -573,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = {
573static void fill_zero(struct inode *inode, pgoff_t index, 609static void fill_zero(struct inode *inode, pgoff_t index,
574 loff_t start, loff_t len) 610 loff_t start, loff_t len)
575{ 611{
576 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 612 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
577 struct page *page; 613 struct page *page;
578 614
579 if (!len) 615 if (!len)
@@ -622,7 +658,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
622 loff_t off_start, off_end; 658 loff_t off_start, off_end;
623 int ret = 0; 659 int ret = 0;
624 660
625 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); 661 if (!S_ISREG(inode->i_mode))
662 return -EOPNOTSUPP;
663
664 /* skip punching hole beyond i_size */
665 if (offset >= inode->i_size)
666 return ret;
667
668 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
626 if (ret) 669 if (ret)
627 return ret; 670 return ret;
628 671
@@ -645,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
645 if (pg_start < pg_end) { 688 if (pg_start < pg_end) {
646 struct address_space *mapping = inode->i_mapping; 689 struct address_space *mapping = inode->i_mapping;
647 loff_t blk_start, blk_end; 690 loff_t blk_start, blk_end;
648 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 691 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
649 692
650 f2fs_balance_fs(sbi); 693 f2fs_balance_fs(sbi);
651 694
@@ -666,7 +709,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
666static int expand_inode_data(struct inode *inode, loff_t offset, 709static int expand_inode_data(struct inode *inode, loff_t offset,
667 loff_t len, int mode) 710 loff_t len, int mode)
668{ 711{
669 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 712 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
670 pgoff_t index, pg_start, pg_end; 713 pgoff_t index, pg_start, pg_end;
671 loff_t new_size = i_size_read(inode); 714 loff_t new_size = i_size_read(inode);
672 loff_t off_start, off_end; 715 loff_t off_start, off_end;
@@ -678,7 +721,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
678 if (ret) 721 if (ret)
679 return ret; 722 return ret;
680 723
681 ret = f2fs_convert_inline_data(inode, offset + len); 724 ret = f2fs_convert_inline_data(inode, offset + len, NULL);
682 if (ret) 725 if (ret)
683 return ret; 726 return ret;
684 727
@@ -762,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
762 return flags & F2FS_OTHER_FLMASK; 805 return flags & F2FS_OTHER_FLMASK;
763} 806}
764 807
765long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 808static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
809{
810 struct inode *inode = file_inode(filp);
811 struct f2fs_inode_info *fi = F2FS_I(inode);
812 unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
813 return put_user(flags, (int __user *)arg);
814}
815
816static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
766{ 817{
767 struct inode *inode = file_inode(filp); 818 struct inode *inode = file_inode(filp);
768 struct f2fs_inode_info *fi = F2FS_I(inode); 819 struct f2fs_inode_info *fi = F2FS_I(inode);
769 unsigned int flags; 820 unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
821 unsigned int oldflags;
770 int ret; 822 int ret;
771 823
772 switch (cmd) { 824 ret = mnt_want_write_file(filp);
773 case F2FS_IOC_GETFLAGS: 825 if (ret)
774 flags = fi->i_flags & FS_FL_USER_VISIBLE; 826 return ret;
775 return put_user(flags, (int __user *) arg);
776 case F2FS_IOC_SETFLAGS:
777 {
778 unsigned int oldflags;
779 827
780 ret = mnt_want_write_file(filp); 828 if (!inode_owner_or_capable(inode)) {
781 if (ret) 829 ret = -EACCES;
782 return ret; 830 goto out;
831 }
783 832
784 if (!inode_owner_or_capable(inode)) { 833 if (get_user(flags, (int __user *)arg)) {
785 ret = -EACCES; 834 ret = -EFAULT;
786 goto out; 835 goto out;
787 } 836 }
837
838 flags = f2fs_mask_flags(inode->i_mode, flags);
839
840 mutex_lock(&inode->i_mutex);
841
842 oldflags = fi->i_flags;
788 843
789 if (get_user(flags, (int __user *) arg)) { 844 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
790 ret = -EFAULT; 845 if (!capable(CAP_LINUX_IMMUTABLE)) {
846 mutex_unlock(&inode->i_mutex);
847 ret = -EPERM;
791 goto out; 848 goto out;
792 } 849 }
850 }
793 851
794 flags = f2fs_mask_flags(inode->i_mode, flags); 852 flags = flags & FS_FL_USER_MODIFIABLE;
853 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
854 fi->i_flags = flags;
855 mutex_unlock(&inode->i_mutex);
795 856
796 mutex_lock(&inode->i_mutex); 857 f2fs_set_inode_flags(inode);
858 inode->i_ctime = CURRENT_TIME;
859 mark_inode_dirty(inode);
860out:
861 mnt_drop_write_file(filp);
862 return ret;
863}
797 864
798 oldflags = fi->i_flags; 865static int f2fs_ioc_start_atomic_write(struct file *filp)
866{
867 struct inode *inode = file_inode(filp);
868 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
799 869
800 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 870 if (!inode_owner_or_capable(inode))
801 if (!capable(CAP_LINUX_IMMUTABLE)) { 871 return -EACCES;
802 mutex_unlock(&inode->i_mutex); 872
803 ret = -EPERM; 873 f2fs_balance_fs(sbi);
804 goto out;
805 }
806 }
807 874
808 flags = flags & FS_FL_USER_MODIFIABLE; 875 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
809 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
810 fi->i_flags = flags;
811 mutex_unlock(&inode->i_mutex);
812 876
813 f2fs_set_inode_flags(inode); 877 return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
814 inode->i_ctime = CURRENT_TIME; 878}
815 mark_inode_dirty(inode); 879
816out: 880static int f2fs_ioc_commit_atomic_write(struct file *filp)
817 mnt_drop_write_file(filp); 881{
882 struct inode *inode = file_inode(filp);
883 int ret;
884
885 if (!inode_owner_or_capable(inode))
886 return -EACCES;
887
888 if (f2fs_is_volatile_file(inode))
889 return 0;
890
891 ret = mnt_want_write_file(filp);
892 if (ret)
818 return ret; 893 return ret;
819 } 894
895 if (f2fs_is_atomic_file(inode))
896 commit_inmem_pages(inode, false);
897
898 ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
899 mnt_drop_write_file(filp);
900 return ret;
901}
902
903static int f2fs_ioc_start_volatile_write(struct file *filp)
904{
905 struct inode *inode = file_inode(filp);
906
907 if (!inode_owner_or_capable(inode))
908 return -EACCES;
909
910 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
911 return 0;
912}
913
914static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
915{
916 struct inode *inode = file_inode(filp);
917 struct super_block *sb = inode->i_sb;
918 struct request_queue *q = bdev_get_queue(sb->s_bdev);
919 struct fstrim_range range;
920 int ret;
921
922 if (!capable(CAP_SYS_ADMIN))
923 return -EPERM;
924
925 if (!blk_queue_discard(q))
926 return -EOPNOTSUPP;
927
928 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
929 sizeof(range)))
930 return -EFAULT;
931
932 range.minlen = max((unsigned int)range.minlen,
933 q->limits.discard_granularity);
934 ret = f2fs_trim_fs(F2FS_SB(sb), &range);
935 if (ret < 0)
936 return ret;
937
938 if (copy_to_user((struct fstrim_range __user *)arg, &range,
939 sizeof(range)))
940 return -EFAULT;
941 return 0;
942}
943
944long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
945{
946 switch (cmd) {
947 case F2FS_IOC_GETFLAGS:
948 return f2fs_ioc_getflags(filp, arg);
949 case F2FS_IOC_SETFLAGS:
950 return f2fs_ioc_setflags(filp, arg);
951 case F2FS_IOC_START_ATOMIC_WRITE:
952 return f2fs_ioc_start_atomic_write(filp);
953 case F2FS_IOC_COMMIT_ATOMIC_WRITE:
954 return f2fs_ioc_commit_atomic_write(filp);
955 case F2FS_IOC_START_VOLATILE_WRITE:
956 return f2fs_ioc_start_volatile_write(filp);
957 case FITRIM:
958 return f2fs_ioc_fitrim(filp, arg);
820 default: 959 default:
821 return -ENOTTY; 960 return -ENOTTY;
822 } 961 }
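
f2fs_ioc_fitrim() is the kernel half of the standard FITRIM interface. A minimal sketch of the userspace half, assuming an f2fs mount at /mnt/f2fs; FITRIM and struct fstrim_range come from <linux/fs.h>:

#include <fcntl.h>
#include <limits.h>
#include <linux/fs.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct fstrim_range range;
	int fd = open("/mnt/f2fs", O_RDONLY);	/* assumed mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.len = ULLONG_MAX;		/* trim the whole filesystem */
	range.minlen = 0;		/* kernel raises this to the discard granularity */

	if (ioctl(fd, FITRIM, &range) < 0)
		perror("FITRIM");
	else
		printf("trimmed %llu bytes\n", (unsigned long long)range.len);

	close(fd);
	return 0;
}
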
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index d7947d90ccc3..2a8f4acdb86b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -58,7 +58,7 @@ static int gc_thread_func(void *data)
58 * 3. IO subsystem is idle by checking the # of requests in 58 * 3. IO subsystem is idle by checking the # of requests in
59 * bdev's request list. 59 * bdev's request list.
60 * 60 *
61 * Note) We have to avoid triggering GCs too much frequently. 61 * Note) We have to avoid triggering GCs frequently.
62 * Because it is possible that some segments can be 62 * Because it is possible that some segments can be
63 * invalidated soon after by user update or deletion. 63 * invalidated soon after by user update or deletion.
64 * So, I'd like to wait some time to collect dirty segments. 64 * So, I'd like to wait some time to collect dirty segments.
@@ -193,7 +193,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
193 * selected by background GC before. 193 * selected by background GC before.
194 * Those segments guarantee they have small valid blocks. 194 * Those segments guarantee they have small valid blocks.
195 */ 195 */
196 for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) { 196 for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
197 if (sec_usage_check(sbi, secno)) 197 if (sec_usage_check(sbi, secno))
198 continue; 198 continue;
199 clear_bit(secno, dirty_i->victim_secmap); 199 clear_bit(secno, dirty_i->victim_secmap);
@@ -222,7 +222,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
222 222
223 u = (vblocks * 100) >> sbi->log_blocks_per_seg; 223 u = (vblocks * 100) >> sbi->log_blocks_per_seg;
224 224
225 /* Handle if the system time is changed by user */ 225 /* Handle if the system time has been changed by the user */
226 if (mtime < sit_i->min_mtime) 226 if (mtime < sit_i->min_mtime)
227 sit_i->min_mtime = mtime; 227 sit_i->min_mtime = mtime;
228 if (mtime > sit_i->max_mtime) 228 if (mtime > sit_i->max_mtime)
@@ -263,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
263 unsigned int secno, max_cost; 263 unsigned int secno, max_cost;
264 int nsearched = 0; 264 int nsearched = 0;
265 265
266 mutex_lock(&dirty_i->seglist_lock);
267
266 p.alloc_mode = alloc_mode; 268 p.alloc_mode = alloc_mode;
267 select_policy(sbi, gc_type, type, &p); 269 select_policy(sbi, gc_type, type, &p);
268 270
269 p.min_segno = NULL_SEGNO; 271 p.min_segno = NULL_SEGNO;
270 p.min_cost = max_cost = get_max_cost(sbi, &p); 272 p.min_cost = max_cost = get_max_cost(sbi, &p);
271 273
272 mutex_lock(&dirty_i->seglist_lock);
273
274 if (p.alloc_mode == LFS && gc_type == FG_GC) { 274 if (p.alloc_mode == LFS && gc_type == FG_GC) {
275 p.min_segno = check_bg_victims(sbi); 275 p.min_segno = check_bg_victims(sbi);
276 if (p.min_segno != NULL_SEGNO) 276 if (p.min_segno != NULL_SEGNO)
@@ -281,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
281 unsigned long cost; 281 unsigned long cost;
282 unsigned int segno; 282 unsigned int segno;
283 283
284 segno = find_next_bit(p.dirty_segmap, 284 segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
285 TOTAL_SEGS(sbi), p.offset); 285 if (segno >= MAIN_SEGS(sbi)) {
286 if (segno >= TOTAL_SEGS(sbi)) {
287 if (sbi->last_victim[p.gc_mode]) { 286 if (sbi->last_victim[p.gc_mode]) {
288 sbi->last_victim[p.gc_mode] = 0; 287 sbi->last_victim[p.gc_mode] = 0;
289 p.offset = 0; 288 p.offset = 0;
@@ -423,6 +422,12 @@ next_step:
423 if (IS_ERR(node_page)) 422 if (IS_ERR(node_page))
424 continue; 423 continue;
425 424
425 /* block may become invalid during get_node_page */
426 if (check_valid_map(sbi, segno, off) == 0) {
427 f2fs_put_page(node_page, 1);
428 continue;
429 }
430
426 /* set page dirty and write it */ 431 /* set page dirty and write it */
427 if (gc_type == FG_GC) { 432 if (gc_type == FG_GC) {
428 f2fs_wait_on_page_writeback(node_page, NODE); 433 f2fs_wait_on_page_writeback(node_page, NODE);
@@ -531,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
531 f2fs_wait_on_page_writeback(page, DATA); 536 f2fs_wait_on_page_writeback(page, DATA);
532 537
533 if (clear_page_dirty_for_io(page)) 538 if (clear_page_dirty_for_io(page))
534 inode_dec_dirty_dents(inode); 539 inode_dec_dirty_pages(inode);
535 set_cold_data(page); 540 set_cold_data(page);
536 do_write_data_page(page, &fio); 541 do_write_data_page(page, &fio);
537 clear_cold_data(page); 542 clear_cold_data(page);
@@ -593,7 +598,7 @@ next_step:
593 598
594 if (phase == 2) { 599 if (phase == 2) {
595 inode = f2fs_iget(sb, dni.ino); 600 inode = f2fs_iget(sb, dni.ino);
596 if (IS_ERR(inode)) 601 if (IS_ERR(inode) || is_bad_inode(inode))
597 continue; 602 continue;
598 603
599 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); 604 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
@@ -688,17 +693,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
688 int gc_type = BG_GC; 693 int gc_type = BG_GC;
689 int nfree = 0; 694 int nfree = 0;
690 int ret = -1; 695 int ret = -1;
696 struct cp_control cpc = {
697 .reason = CP_SYNC,
698 };
691 699
692 INIT_LIST_HEAD(&ilist); 700 INIT_LIST_HEAD(&ilist);
693gc_more: 701gc_more:
694 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) 702 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
695 goto stop; 703 goto stop;
696 if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) 704 if (unlikely(f2fs_cp_error(sbi)))
697 goto stop; 705 goto stop;
698 706
699 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 707 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
700 gc_type = FG_GC; 708 gc_type = FG_GC;
701 write_checkpoint(sbi, false); 709 write_checkpoint(sbi, &cpc);
702 } 710 }
703 711
704 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) 712 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
@@ -723,7 +731,7 @@ gc_more:
723 goto gc_more; 731 goto gc_more;
724 732
725 if (gc_type == FG_GC) 733 if (gc_type == FG_GC)
726 write_checkpoint(sbi, false); 734 write_checkpoint(sbi, &cpc);
727stop: 735stop:
728 mutex_unlock(&sbi->gc_mutex); 736 mutex_unlock(&sbi->gc_mutex);
729 737
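
The get_cb_cost() hunk above keeps the utilization step u = (vblocks * 100) >> log_blocks_per_seg. A toy check of that arithmetic, assuming 512 blocks per segment (log_blocks_per_seg == 9); the block counts are made up:

#include <stdio.h>

int main(void)
{
	unsigned int log_blocks_per_seg = 9;	/* 512 blocks per 2MB segment */
	unsigned int vblocks = 128;		/* assumed valid blocks in the victim */
	unsigned int u = (vblocks * 100) >> log_blocks_per_seg;

	/* 128 * 100 = 12800; 12800 >> 9 = 25, i.e. the segment is 25% valid */
	printf("u = %u%%\n", u);
	return 0;
}
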
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5d5eb6047bf4..16f0b2b22999 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -91,7 +91,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
91 block_t invalid_user_blocks = sbi->user_block_count - 91 block_t invalid_user_blocks = sbi->user_block_count -
92 written_block_count(sbi); 92 written_block_count(sbi);
93 /* 93 /*
94 * Background GC is triggered with the following condition. 94 * Background GC is triggered with the following conditions.
95 * 1. There are a number of invalid blocks. 95 * 1. There are a number of invalid blocks.
96 * 2. There is not enough free space. 96 * 2. There is not enough free space.
97 */ 97 */
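
has_enough_invalid_blocks() compares the invalid user blocks computed above against a threshold before letting background GC run. A toy computation with assumed block counts, just to make the quantities concrete:

#include <stdio.h>

int main(void)
{
	unsigned long long user_block_count = 1000000;	/* assumed capacity */
	unsigned long long written = 800000;		/* currently valid blocks */
	unsigned long long invalid = user_block_count - written;

	/* 20% of user blocks already invalidated: worth cleaning */
	printf("invalid_user_blocks = %llu (%.0f%%)\n", invalid,
	       100.0 * invalid / user_block_count);
	return 0;
}
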
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 948d17bf7281..a844fcfb9a8d 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -42,7 +42,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[])
42 buf[1] += b1; 42 buf[1] += b1;
43} 43}
44 44
45static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) 45static void str2hashbuf(const unsigned char *msg, size_t len,
46 unsigned int *buf, int num)
46{ 47{
47 unsigned pad, val; 48 unsigned pad, val;
48 int i; 49 int i;
@@ -73,9 +74,9 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
73{ 74{
74 __u32 hash; 75 __u32 hash;
75 f2fs_hash_t f2fs_hash; 76 f2fs_hash_t f2fs_hash;
76 const char *p; 77 const unsigned char *p;
77 __u32 in[8], buf[4]; 78 __u32 in[8], buf[4];
78 const char *name = name_info->name; 79 const unsigned char *name = name_info->name;
79 size_t len = name_info->len; 80 size_t len = name_info->len;
80 81
81 if ((len <= 2) && (name[0] == '.') && 82 if ((len <= 2) && (name[0] == '.') &&
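
The switch from char to unsigned char matters because str2hashbuf() accumulates bytes as val = msg[i] + (val << 8); on targets where plain char is signed, bytes >= 0x80 sign-extend and corrupt the hash. A standalone sketch of just that widening step (the byte value is illustrative):

#include <stdio.h>

int main(void)
{
	char s = (char)0xe9;		/* a high byte, e.g. from a UTF-8 name */
	unsigned char u = 0xe9;
	unsigned int val = 0;

	/* the str2hashbuf() accumulation shape: val = byte + (val << 8) */
	unsigned int via_signed = s + (val << 8);	/* sign-extends */
	unsigned int via_unsigned = u + (val << 8);	/* zero-extends */

	printf("signed char  : %08x\n", via_signed);	/* ffffffe9 */
	printf("unsigned char: %08x\n", via_unsigned);	/* 000000e9 */
	return 0;
}
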
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 5beeccef9ae1..88036fd75797 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,11 +15,13 @@
15 15
16bool f2fs_may_inline(struct inode *inode) 16bool f2fs_may_inline(struct inode *inode)
17{ 17{
18 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
19 block_t nr_blocks; 18 block_t nr_blocks;
20 loff_t i_size; 19 loff_t i_size;
21 20
22 if (!test_opt(sbi, INLINE_DATA)) 21 if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
22 return false;
23
24 if (f2fs_is_atomic_file(inode))
23 return false; 25 return false;
24 26
25 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; 27 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
@@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode)
35 37
36int f2fs_read_inline_data(struct inode *inode, struct page *page) 38int f2fs_read_inline_data(struct inode *inode, struct page *page)
37{ 39{
38 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
39 struct page *ipage; 40 struct page *ipage;
40 void *src_addr, *dst_addr; 41 void *src_addr, *dst_addr;
41 42
@@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
44 goto out; 45 goto out;
45 } 46 }
46 47
47 ipage = get_node_page(sbi, inode->i_ino); 48 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
48 if (IS_ERR(ipage)) { 49 if (IS_ERR(ipage)) {
49 unlock_page(page); 50 unlock_page(page);
50 return PTR_ERR(ipage); 51 return PTR_ERR(ipage);
@@ -68,12 +69,12 @@ out:
68 69
69static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) 70static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
70{ 71{
71 int err; 72 int err = 0;
72 struct page *ipage; 73 struct page *ipage;
73 struct dnode_of_data dn; 74 struct dnode_of_data dn;
74 void *src_addr, *dst_addr; 75 void *src_addr, *dst_addr;
75 block_t new_blk_addr; 76 block_t new_blk_addr;
76 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 77 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
77 struct f2fs_io_info fio = { 78 struct f2fs_io_info fio = {
78 .type = DATA, 79 .type = DATA,
79 .rw = WRITE_SYNC | REQ_PRIO, 80 .rw = WRITE_SYNC | REQ_PRIO,
@@ -86,6 +87,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
86 goto out; 87 goto out;
87 } 88 }
88 89
90 /* someone else converted inline_data already */
91 if (!f2fs_has_inline_data(inode))
92 goto out;
93
89 /* 94 /*
90 * i_addr[0] is not used for inline data, 95 * i_addr[0] is not used for inline data,
91 * so reserving new block will not destroy inline data 96 * so reserving new block will not destroy inline data
@@ -124,9 +129,10 @@ out:
124 return err; 129 return err;
125} 130}
126 131
127int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) 132int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
133 struct page *page)
128{ 134{
129 struct page *page; 135 struct page *new_page = page;
130 int err; 136 int err;
131 137
132 if (!f2fs_has_inline_data(inode)) 138 if (!f2fs_has_inline_data(inode))
@@ -134,17 +140,20 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
134 else if (to_size <= MAX_INLINE_DATA) 140 else if (to_size <= MAX_INLINE_DATA)
135 return 0; 141 return 0;
136 142
137 page = grab_cache_page(inode->i_mapping, 0); 143 if (!page || page->index != 0) {
138 if (!page) 144 new_page = grab_cache_page(inode->i_mapping, 0);
139 return -ENOMEM; 145 if (!new_page)
146 return -ENOMEM;
147 }
140 148
141 err = __f2fs_convert_inline_data(inode, page); 149 err = __f2fs_convert_inline_data(inode, new_page);
142 f2fs_put_page(page, 1); 150 if (!page || page->index != 0)
151 f2fs_put_page(new_page, 1);
143 return err; 152 return err;
144} 153}
145 154
146int f2fs_write_inline_data(struct inode *inode, 155int f2fs_write_inline_data(struct inode *inode,
147 struct page *page, unsigned size) 156 struct page *page, unsigned size)
148{ 157{
149 void *src_addr, *dst_addr; 158 void *src_addr, *dst_addr;
150 struct page *ipage; 159 struct page *ipage;
@@ -181,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode,
181 190
182void truncate_inline_data(struct inode *inode, u64 from) 191void truncate_inline_data(struct inode *inode, u64 from)
183{ 192{
184 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
185 struct page *ipage; 193 struct page *ipage;
186 194
187 if (from >= MAX_INLINE_DATA) 195 if (from >= MAX_INLINE_DATA)
188 return; 196 return;
189 197
190 ipage = get_node_page(sbi, inode->i_ino); 198 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
191 if (IS_ERR(ipage)) 199 if (IS_ERR(ipage))
192 return; 200 return;
193 201
@@ -199,9 +207,9 @@ void truncate_inline_data(struct inode *inode, u64 from)
199 f2fs_put_page(ipage, 1); 207 f2fs_put_page(ipage, 1);
200} 208}
201 209
202int recover_inline_data(struct inode *inode, struct page *npage) 210bool recover_inline_data(struct inode *inode, struct page *npage)
203{ 211{
204 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 212 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
205 struct f2fs_inode *ri = NULL; 213 struct f2fs_inode *ri = NULL;
206 void *src_addr, *dst_addr; 214 void *src_addr, *dst_addr;
207 struct page *ipage; 215 struct page *ipage;
@@ -218,10 +226,10 @@ int recover_inline_data(struct inode *inode, struct page *npage)
218 ri = F2FS_INODE(npage); 226 ri = F2FS_INODE(npage);
219 227
220 if (f2fs_has_inline_data(inode) && 228 if (f2fs_has_inline_data(inode) &&
221 ri && ri->i_inline & F2FS_INLINE_DATA) { 229 ri && (ri->i_inline & F2FS_INLINE_DATA)) {
222process_inline: 230process_inline:
223 ipage = get_node_page(sbi, inode->i_ino); 231 ipage = get_node_page(sbi, inode->i_ino);
224 f2fs_bug_on(IS_ERR(ipage)); 232 f2fs_bug_on(sbi, IS_ERR(ipage));
225 233
226 f2fs_wait_on_page_writeback(ipage, NODE); 234 f2fs_wait_on_page_writeback(ipage, NODE);
227 235
@@ -230,22 +238,22 @@ process_inline:
230 memcpy(dst_addr, src_addr, MAX_INLINE_DATA); 238 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
231 update_inode(inode, ipage); 239 update_inode(inode, ipage);
232 f2fs_put_page(ipage, 1); 240 f2fs_put_page(ipage, 1);
233 return -1; 241 return true;
234 } 242 }
235 243
236 if (f2fs_has_inline_data(inode)) { 244 if (f2fs_has_inline_data(inode)) {
237 ipage = get_node_page(sbi, inode->i_ino); 245 ipage = get_node_page(sbi, inode->i_ino);
238 f2fs_bug_on(IS_ERR(ipage)); 246 f2fs_bug_on(sbi, IS_ERR(ipage));
239 f2fs_wait_on_page_writeback(ipage, NODE); 247 f2fs_wait_on_page_writeback(ipage, NODE);
240 zero_user_segment(ipage, INLINE_DATA_OFFSET, 248 zero_user_segment(ipage, INLINE_DATA_OFFSET,
241 INLINE_DATA_OFFSET + MAX_INLINE_DATA); 249 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
242 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); 250 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
243 update_inode(inode, ipage); 251 update_inode(inode, ipage);
244 f2fs_put_page(ipage, 1); 252 f2fs_put_page(ipage, 1);
245 } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { 253 } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
246 truncate_blocks(inode, 0); 254 truncate_blocks(inode, 0, false);
247 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); 255 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
248 goto process_inline; 256 goto process_inline;
249 } 257 }
250 return 0; 258 return false;
251} 259}
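
The new "someone else converted inline_data already" re-check in __f2fs_convert_inline_data() is the usual check/lock/re-check pattern. A userspace pthread sketch of the same shape; all names here are stand-ins, not f2fs APIs:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool has_inline_data = true;

static void convert_once(void)
{
	if (!has_inline_data)		/* unlocked fast path */
		return;

	pthread_mutex_lock(&lock);
	if (has_inline_data) {		/* re-check: a racer may have won */
		/* ... move the inline payload out to a data block ... */
		has_inline_data = false;
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	convert_once();
	convert_once();			/* second call returns immediately */
	printf("converted: %s\n", has_inline_data ? "no" : "yes");
	return 0;
}
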
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2c39999f3868..0deead4505e7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
69 69
70static int do_read_inode(struct inode *inode) 70static int do_read_inode(struct inode *inode)
71{ 71{
72 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 72 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
73 struct f2fs_inode_info *fi = F2FS_I(inode); 73 struct f2fs_inode_info *fi = F2FS_I(inode);
74 struct page *node_page; 74 struct page *node_page;
75 struct f2fs_inode *ri; 75 struct f2fs_inode *ri;
@@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page)
218 218
219void update_inode_page(struct inode *inode) 219void update_inode_page(struct inode *inode)
220{ 220{
221 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 221 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
222 struct page *node_page; 222 struct page *node_page;
223retry: 223retry:
224 node_page = get_node_page(sbi, inode->i_ino); 224 node_page = get_node_page(sbi, inode->i_ino);
@@ -238,7 +238,7 @@ retry:
238 238
239int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) 239int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
240{ 240{
241 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 241 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
242 242
243 if (inode->i_ino == F2FS_NODE_INO(sbi) || 243 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
244 inode->i_ino == F2FS_META_INO(sbi)) 244 inode->i_ino == F2FS_META_INO(sbi))
@@ -266,9 +266,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
266 */ 266 */
267void f2fs_evict_inode(struct inode *inode) 267void f2fs_evict_inode(struct inode *inode)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 nid_t xnid = F2FS_I(inode)->i_xattr_nid; 270 nid_t xnid = F2FS_I(inode)->i_xattr_nid;
271 271
272 /* some remaining atomic pages should be discarded */
273 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
274 commit_inmem_pages(inode, true);
275
272 trace_f2fs_evict_inode(inode); 276 trace_f2fs_evict_inode(inode);
273 truncate_inode_pages_final(&inode->i_data); 277 truncate_inode_pages_final(&inode->i_data);
274 278
@@ -276,7 +280,7 @@ void f2fs_evict_inode(struct inode *inode)
276 inode->i_ino == F2FS_META_INO(sbi)) 280 inode->i_ino == F2FS_META_INO(sbi))
277 goto out_clear; 281 goto out_clear;
278 282
279 f2fs_bug_on(get_dirty_dents(inode)); 283 f2fs_bug_on(sbi, get_dirty_pages(inode));
280 remove_dirty_dir_inode(inode); 284 remove_dirty_dir_inode(inode);
281 285
282 if (inode->i_nlink || is_bad_inode(inode)) 286 if (inode->i_nlink || is_bad_inode(inode))
@@ -306,3 +310,26 @@ no_delete:
306out_clear: 310out_clear:
307 clear_inode(inode); 311 clear_inode(inode);
308} 312}
313
314/* caller should call f2fs_lock_op() */
315void handle_failed_inode(struct inode *inode)
316{
317 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
318
319 clear_nlink(inode);
320 make_bad_inode(inode);
321 unlock_new_inode(inode);
322
323 i_size_write(inode, 0);
324 if (F2FS_HAS_BLOCKS(inode))
325 f2fs_truncate(inode);
326
327 remove_inode_page(inode);
328 stat_dec_inline_inode(inode);
329
330 alloc_nid_failed(sbi, inode->i_ino);
331 f2fs_unlock_op(sbi);
332
333 /* iput will drop the inode object */
334 iput(inode);
335}
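
f2fs_evict_inode() now calls commit_inmem_pages(inode, true) so pages staged by an atomic or volatile writer are dropped rather than written back. A toy model of that commit-versus-abort life cycle; the structures are stand-ins, not the kernel implementation:

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 4

static bool inmem[NPAGES];	/* pages staged by an atomic writer */

static void commit_inmem(bool drop)
{
	for (int i = 0; i < NPAGES; i++) {
		if (!inmem[i])
			continue;
		printf("page %d %s\n", i, drop ? "dropped" : "written");
		inmem[i] = false;
	}
}

int main(void)
{
	inmem[0] = inmem[2] = true;	/* two dirty staged pages */
	commit_inmem(true);		/* the eviction path: discard both */
	return 0;
}
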
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 27b03776ffd2..0d2526e5aa11 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -23,7 +23,7 @@
23 23
24static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) 24static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
25{ 25{
26 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 26 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
27 nid_t ino; 27 nid_t ino;
28 struct inode *inode; 28 struct inode *inode;
29 bool nid_free = false; 29 bool nid_free = false;
@@ -102,7 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
103 bool excl) 103 bool excl)
104{ 104{
105 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 105 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
106 struct inode *inode; 106 struct inode *inode;
107 nid_t ino = 0; 107 nid_t ino = 0;
108 int err; 108 int err;
@@ -123,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
123 123
124 f2fs_lock_op(sbi); 124 f2fs_lock_op(sbi);
125 err = f2fs_add_link(dentry, inode); 125 err = f2fs_add_link(dentry, inode);
126 f2fs_unlock_op(sbi);
127 if (err) 126 if (err)
128 goto out; 127 goto out;
128 f2fs_unlock_op(sbi);
129 129
130 alloc_nid_done(sbi, ino); 130 alloc_nid_done(sbi, ino);
131 131
@@ -133,11 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
133 unlock_new_inode(inode); 133 unlock_new_inode(inode);
134 return 0; 134 return 0;
135out: 135out:
136 clear_nlink(inode); 136 handle_failed_inode(inode);
137 unlock_new_inode(inode);
138 make_bad_inode(inode);
139 iput(inode);
140 alloc_nid_failed(sbi, ino);
141 return err; 137 return err;
142} 138}
143 139
@@ -145,7 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
145 struct dentry *dentry) 141 struct dentry *dentry)
146{ 142{
147 struct inode *inode = old_dentry->d_inode; 143 struct inode *inode = old_dentry->d_inode;
148 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 144 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
149 int err; 145 int err;
150 146
151 f2fs_balance_fs(sbi); 147 f2fs_balance_fs(sbi);
@@ -156,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
156 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 152 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
157 f2fs_lock_op(sbi); 153 f2fs_lock_op(sbi);
158 err = f2fs_add_link(dentry, inode); 154 err = f2fs_add_link(dentry, inode);
159 f2fs_unlock_op(sbi);
160 if (err) 155 if (err)
161 goto out; 156 goto out;
157 f2fs_unlock_op(sbi);
162 158
163 d_instantiate(dentry, inode); 159 d_instantiate(dentry, inode);
164 return 0; 160 return 0;
165out: 161out:
166 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 162 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
167 iput(inode); 163 iput(inode);
164 f2fs_unlock_op(sbi);
168 return err; 165 return err;
169} 166}
170 167
@@ -205,7 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
205 202
206static int f2fs_unlink(struct inode *dir, struct dentry *dentry) 203static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
207{ 204{
208 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 205 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
209 struct inode *inode = dentry->d_inode; 206 struct inode *inode = dentry->d_inode;
210 struct f2fs_dir_entry *de; 207 struct f2fs_dir_entry *de;
211 struct page *page; 208 struct page *page;
@@ -229,7 +226,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
229 f2fs_delete_entry(de, page, inode); 226 f2fs_delete_entry(de, page, inode);
230 f2fs_unlock_op(sbi); 227 f2fs_unlock_op(sbi);
231 228
232 /* In order to evict this inode, we set it dirty */ 229 /* In order to evict this inode, we set it dirty */
233 mark_inode_dirty(inode); 230 mark_inode_dirty(inode);
234fail: 231fail:
235 trace_f2fs_unlink_exit(inode, err); 232 trace_f2fs_unlink_exit(inode, err);
@@ -239,7 +236,7 @@ fail:
239static int f2fs_symlink(struct inode *dir, struct dentry *dentry, 236static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
240 const char *symname) 237 const char *symname)
241{ 238{
242 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 239 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
243 struct inode *inode; 240 struct inode *inode;
244 size_t symlen = strlen(symname) + 1; 241 size_t symlen = strlen(symname) + 1;
245 int err; 242 int err;
@@ -255,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
255 252
256 f2fs_lock_op(sbi); 253 f2fs_lock_op(sbi);
257 err = f2fs_add_link(dentry, inode); 254 err = f2fs_add_link(dentry, inode);
258 f2fs_unlock_op(sbi);
259 if (err) 255 if (err)
260 goto out; 256 goto out;
257 f2fs_unlock_op(sbi);
261 258
262 err = page_symlink(inode, symname, symlen); 259 err = page_symlink(inode, symname, symlen);
263 alloc_nid_done(sbi, inode->i_ino); 260 alloc_nid_done(sbi, inode->i_ino);
@@ -266,17 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
266 unlock_new_inode(inode); 263 unlock_new_inode(inode);
267 return err; 264 return err;
268out: 265out:
269 clear_nlink(inode); 266 handle_failed_inode(inode);
270 unlock_new_inode(inode);
271 make_bad_inode(inode);
272 iput(inode);
273 alloc_nid_failed(sbi, inode->i_ino);
274 return err; 267 return err;
275} 268}
276 269
277static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 270static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
278{ 271{
279 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 272 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
280 struct inode *inode; 273 struct inode *inode;
281 int err; 274 int err;
282 275
@@ -294,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
294 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 287 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
295 f2fs_lock_op(sbi); 288 f2fs_lock_op(sbi);
296 err = f2fs_add_link(dentry, inode); 289 err = f2fs_add_link(dentry, inode);
297 f2fs_unlock_op(sbi);
298 if (err) 290 if (err)
299 goto out_fail; 291 goto out_fail;
292 f2fs_unlock_op(sbi);
300 293
301 alloc_nid_done(sbi, inode->i_ino); 294 alloc_nid_done(sbi, inode->i_ino);
302 295
@@ -307,11 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
307 300
308out_fail: 301out_fail:
309 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 302 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
310 clear_nlink(inode); 303 handle_failed_inode(inode);
311 unlock_new_inode(inode);
312 make_bad_inode(inode);
313 iput(inode);
314 alloc_nid_failed(sbi, inode->i_ino);
315 return err; 304 return err;
316} 305}
317 306
@@ -326,7 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
326static int f2fs_mknod(struct inode *dir, struct dentry *dentry, 315static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
327 umode_t mode, dev_t rdev) 316 umode_t mode, dev_t rdev)
328{ 317{
329 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 318 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
330 struct inode *inode; 319 struct inode *inode;
331 int err = 0; 320 int err = 0;
332 321
@@ -344,27 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
344 333
345 f2fs_lock_op(sbi); 334 f2fs_lock_op(sbi);
346 err = f2fs_add_link(dentry, inode); 335 err = f2fs_add_link(dentry, inode);
347 f2fs_unlock_op(sbi);
348 if (err) 336 if (err)
349 goto out; 337 goto out;
338 f2fs_unlock_op(sbi);
350 339
351 alloc_nid_done(sbi, inode->i_ino); 340 alloc_nid_done(sbi, inode->i_ino);
352 d_instantiate(dentry, inode); 341 d_instantiate(dentry, inode);
353 unlock_new_inode(inode); 342 unlock_new_inode(inode);
354 return 0; 343 return 0;
355out: 344out:
356 clear_nlink(inode); 345 handle_failed_inode(inode);
357 unlock_new_inode(inode);
358 make_bad_inode(inode);
359 iput(inode);
360 alloc_nid_failed(sbi, inode->i_ino);
361 return err; 346 return err;
362} 347}
363 348
364static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, 349static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
365 struct inode *new_dir, struct dentry *new_dentry) 350 struct inode *new_dir, struct dentry *new_dentry)
366{ 351{
367 struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb); 352 struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
368 struct inode *old_inode = old_dentry->d_inode; 353 struct inode *old_inode = old_dentry->d_inode;
369 struct inode *new_inode = new_dentry->d_inode; 354 struct inode *new_inode = new_dentry->d_inode;
370 struct page *old_dir_page; 355 struct page *old_dir_page;
@@ -488,8 +473,7 @@ out:
488static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, 473static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
489 struct inode *new_dir, struct dentry *new_dentry) 474 struct inode *new_dir, struct dentry *new_dentry)
490{ 475{
491 struct super_block *sb = old_dir->i_sb; 476 struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
492 struct f2fs_sb_info *sbi = F2FS_SB(sb);
493 struct inode *old_inode = old_dentry->d_inode; 477 struct inode *old_inode = old_dentry->d_inode;
494 struct inode *new_inode = new_dentry->d_inode; 478 struct inode *new_inode = new_dentry->d_inode;
495 struct page *old_dir_page, *new_dir_page; 479 struct page *old_dir_page, *new_dir_page;
@@ -650,7 +634,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
650 634
651static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 635static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
652{ 636{
653 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 637 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
654 struct inode *inode; 638 struct inode *inode;
655 int err; 639 int err;
656 640
@@ -686,12 +670,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
686release_out: 670release_out:
687 release_orphan_inode(sbi); 671 release_orphan_inode(sbi);
688out: 672out:
689 f2fs_unlock_op(sbi); 673 handle_failed_inode(inode);
690 clear_nlink(inode);
691 unlock_new_inode(inode);
692 make_bad_inode(inode);
693 iput(inode);
694 alloc_nid_failed(sbi, inode->i_ino);
695 return err; 674 return err;
696} 675}
697 676
@@ -704,7 +683,6 @@ const struct inode_operations f2fs_dir_inode_operations = {
704 .mkdir = f2fs_mkdir, 683 .mkdir = f2fs_mkdir,
705 .rmdir = f2fs_rmdir, 684 .rmdir = f2fs_rmdir,
706 .mknod = f2fs_mknod, 685 .mknod = f2fs_mknod,
707 .rename = f2fs_rename,
708 .rename2 = f2fs_rename2, 686 .rename2 = f2fs_rename2,
709 .tmpfile = f2fs_tmpfile, 687 .tmpfile = f2fs_tmpfile,
710 .getattr = f2fs_getattr, 688 .getattr = f2fs_getattr,
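
Each namei hunk moves f2fs_unlock_op() below the error check: on failure, handle_failed_inode() must tear the inode down inside the same locked operation and release the lock itself. A userspace pthread analog of that ownership transfer (all names are stand-ins):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t op_lock = PTHREAD_MUTEX_INITIALIZER;

static void handle_failed(void)
{
	printf("teardown still under the op lock\n");
	pthread_mutex_unlock(&op_lock);	/* the helper owns the unlock */
}

static int create_file(int fail)
{
	pthread_mutex_lock(&op_lock);	/* f2fs_lock_op() analog */
	if (fail) {
		handle_failed();	/* do not unlock before this */
		return -1;
	}
	pthread_mutex_unlock(&op_lock);	/* success path unlocks here */
	printf("created\n");
	return 0;
}

int main(void)
{
	create_file(1);
	create_file(0);
	return 0;
}
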
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index d3d90d284631..44b8afef43d9 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -54,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
54static void clear_node_page_dirty(struct page *page) 54static void clear_node_page_dirty(struct page *page)
55{ 55{
56 struct address_space *mapping = page->mapping; 56 struct address_space *mapping = page->mapping;
57 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
58 unsigned int long flags; 57 unsigned int long flags;
59 58
60 if (PageDirty(page)) { 59 if (PageDirty(page)) {
@@ -65,7 +64,7 @@ static void clear_node_page_dirty(struct page *page)
65 spin_unlock_irqrestore(&mapping->tree_lock, flags); 64 spin_unlock_irqrestore(&mapping->tree_lock, flags);
66 65
67 clear_page_dirty_for_io(page); 66 clear_page_dirty_for_io(page);
68 dec_page_count(sbi, F2FS_DIRTY_NODES); 67 dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
69 } 68 }
70 ClearPageUptodate(page); 69 ClearPageUptodate(page);
71} 70}
@@ -92,7 +91,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
92 /* get current nat block page with lock */ 91 /* get current nat block page with lock */
93 src_page = get_meta_page(sbi, src_off); 92 src_page = get_meta_page(sbi, src_off);
94 dst_page = grab_meta_page(sbi, dst_off); 93 dst_page = grab_meta_page(sbi, dst_off);
95 f2fs_bug_on(PageDirty(src_page)); 94 f2fs_bug_on(sbi, PageDirty(src_page));
96 95
97 src_addr = page_address(src_page); 96 src_addr = page_address(src_page);
98 dst_addr = page_address(dst_page); 97 dst_addr = page_address(dst_page);
@@ -124,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
124 kmem_cache_free(nat_entry_slab, e); 123 kmem_cache_free(nat_entry_slab, e);
125} 124}
126 125
127int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) 126static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
127 struct nat_entry *ne)
128{
129 nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
130 struct nat_entry_set *head;
131
132 if (get_nat_flag(ne, IS_DIRTY))
133 return;
134retry:
135 head = radix_tree_lookup(&nm_i->nat_set_root, set);
136 if (!head) {
137 head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
138
139 INIT_LIST_HEAD(&head->entry_list);
140 INIT_LIST_HEAD(&head->set_list);
141 head->set = set;
142 head->entry_cnt = 0;
143
144 if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
145 cond_resched();
146 goto retry;
147 }
148 }
149 list_move_tail(&ne->list, &head->entry_list);
150 nm_i->dirty_nat_cnt++;
151 head->entry_cnt++;
152 set_nat_flag(ne, IS_DIRTY, true);
153}
154
155static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
156 struct nat_entry *ne)
157{
158 nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
159 struct nat_entry_set *head;
160
161 head = radix_tree_lookup(&nm_i->nat_set_root, set);
162 if (head) {
163 list_move_tail(&ne->list, &nm_i->nat_entries);
164 set_nat_flag(ne, IS_DIRTY, false);
165 head->entry_cnt--;
166 nm_i->dirty_nat_cnt--;
167 }
168}
169
170static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
171 nid_t start, unsigned int nr, struct nat_entry_set **ep)
172{
173 return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
174 start, nr);
175}
176
177bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
128{ 178{
129 struct f2fs_nm_info *nm_i = NM_I(sbi); 179 struct f2fs_nm_info *nm_i = NM_I(sbi);
130 struct nat_entry *e; 180 struct nat_entry *e;
131 int is_cp = 1; 181 bool is_cp = true;
132 182
133 read_lock(&nm_i->nat_tree_lock); 183 read_lock(&nm_i->nat_tree_lock);
134 e = __lookup_nat_cache(nm_i, nid); 184 e = __lookup_nat_cache(nm_i, nid);
135 if (e && !e->checkpointed) 185 if (e && !get_nat_flag(e, IS_CHECKPOINTED))
136 is_cp = 0; 186 is_cp = false;
137 read_unlock(&nm_i->nat_tree_lock); 187 read_unlock(&nm_i->nat_tree_lock);
138 return is_cp; 188 return is_cp;
139} 189}
140 190
141bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) 191bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
142{ 192{
143 struct f2fs_nm_info *nm_i = NM_I(sbi); 193 struct f2fs_nm_info *nm_i = NM_I(sbi);
144 struct nat_entry *e; 194 struct nat_entry *e;
145 bool fsync_done = false; 195 bool fsynced = false;
146 196
147 read_lock(&nm_i->nat_tree_lock); 197 read_lock(&nm_i->nat_tree_lock);
148 e = __lookup_nat_cache(nm_i, nid); 198 e = __lookup_nat_cache(nm_i, ino);
149 if (e) 199 if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
150 fsync_done = e->fsync_done; 200 fsynced = true;
151 read_unlock(&nm_i->nat_tree_lock); 201 read_unlock(&nm_i->nat_tree_lock);
152 return fsync_done; 202 return fsynced;
153} 203}
154 204
155void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) 205bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
156{ 206{
157 struct f2fs_nm_info *nm_i = NM_I(sbi); 207 struct f2fs_nm_info *nm_i = NM_I(sbi);
158 struct nat_entry *e; 208 struct nat_entry *e;
209 bool need_update = true;
159 210
160 write_lock(&nm_i->nat_tree_lock); 211 read_lock(&nm_i->nat_tree_lock);
161 e = __lookup_nat_cache(nm_i, nid); 212 e = __lookup_nat_cache(nm_i, ino);
162 if (e) 213 if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
163 e->fsync_done = false; 214 (get_nat_flag(e, IS_CHECKPOINTED) ||
164 write_unlock(&nm_i->nat_tree_lock); 215 get_nat_flag(e, HAS_FSYNCED_INODE)))
216 need_update = false;
217 read_unlock(&nm_i->nat_tree_lock);
218 return need_update;
165} 219}
166 220
167static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) 221static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
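
nat_entry_set groups dirty NAT entries by the NAT block that holds them, so flushing a set dirties one on-disk block. A toy of the set computation; 455 entries per 4KB NAT block matches the 9-byte raw entry size of this era, but treat the constant as illustrative:

#include <stdio.h>

#define NAT_ENTRY_PER_BLOCK 455	/* ~4096 / 9-byte raw NAT entries */

int main(void)
{
	unsigned int nids[] = { 3, 454, 455, 900, 910 };

	for (unsigned int i = 0; i < sizeof(nids) / sizeof(nids[0]); i++)
		printf("nid %u -> set %u\n", nids[i],
		       nids[i] / NAT_ENTRY_PER_BLOCK);
	/* nids 3 and 454 land in set 0: one dirty NAT block, one write */
	return 0;
}
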
@@ -177,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
177 } 231 }
178 memset(new, 0, sizeof(struct nat_entry)); 232 memset(new, 0, sizeof(struct nat_entry));
179 nat_set_nid(new, nid); 233 nat_set_nid(new, nid);
180 new->checkpointed = true; 234 nat_reset_flag(new);
181 list_add_tail(&new->list, &nm_i->nat_entries); 235 list_add_tail(&new->list, &nm_i->nat_entries);
182 nm_i->nat_cnt++; 236 nm_i->nat_cnt++;
183 return new; 237 return new;
@@ -216,7 +270,7 @@ retry:
216 goto retry; 270 goto retry;
217 } 271 }
218 e->ni = *ni; 272 e->ni = *ni;
219 f2fs_bug_on(ni->blk_addr == NEW_ADDR); 273 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
220 } else if (new_blkaddr == NEW_ADDR) { 274 } else if (new_blkaddr == NEW_ADDR) {
221 /* 275 /*
222 * when nid is reallocated, 276 * when nid is reallocated,
@@ -224,20 +278,20 @@ retry:
224 * So, reinitialize it with new information. 278 * So, reinitialize it with new information.
225 */ 279 */
226 e->ni = *ni; 280 e->ni = *ni;
227 f2fs_bug_on(ni->blk_addr != NULL_ADDR); 281 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
228 } 282 }
229 283
230 /* sanity check */ 284 /* sanity check */
231 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); 285 f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
232 f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && 286 f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
233 new_blkaddr == NULL_ADDR); 287 new_blkaddr == NULL_ADDR);
234 f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && 288 f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
235 new_blkaddr == NEW_ADDR); 289 new_blkaddr == NEW_ADDR);
236 f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && 290 f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
237 nat_get_blkaddr(e) != NULL_ADDR && 291 nat_get_blkaddr(e) != NULL_ADDR &&
238 new_blkaddr == NEW_ADDR); 292 new_blkaddr == NEW_ADDR);
239 293
240 /* increament version no as node is removed */ 294 /* increment version no as node is removed */
241 if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { 295 if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
242 unsigned char version = nat_get_version(e); 296 unsigned char version = nat_get_version(e);
243 nat_set_version(e, inc_node_version(version)); 297 nat_set_version(e, inc_node_version(version));
@@ -245,12 +299,17 @@ retry:
245 299
246 /* change address */ 300 /* change address */
247 nat_set_blkaddr(e, new_blkaddr); 301 nat_set_blkaddr(e, new_blkaddr);
302 if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
303 set_nat_flag(e, IS_CHECKPOINTED, false);
248 __set_nat_cache_dirty(nm_i, e); 304 __set_nat_cache_dirty(nm_i, e);
249 305
250 /* update fsync_mark if its inode nat entry is still alive */ 306 /* update fsync_mark if its inode nat entry is still alive */
251 e = __lookup_nat_cache(nm_i, ni->ino); 307 e = __lookup_nat_cache(nm_i, ni->ino);
252 if (e) 308 if (e) {
253 e->fsync_done = fsync_done; 309 if (fsync_done && ni->nid == ni->ino)
310 set_nat_flag(e, HAS_FSYNCED_INODE, true);
311 set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
312 }
254 write_unlock(&nm_i->nat_tree_lock); 313 write_unlock(&nm_i->nat_tree_lock);
255} 314}
256 315
@@ -274,7 +333,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
274} 333}
275 334
276/* 335/*
277 * This function returns always success 336 * This function always returns success
278 */ 337 */
279void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) 338void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
280{ 339{
@@ -411,7 +470,7 @@ got:
411 */ 470 */
412int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) 471int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
413{ 472{
414 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 473 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
415 struct page *npage[4]; 474 struct page *npage[4];
416 struct page *parent; 475 struct page *parent;
417 int offset[4]; 476 int offset[4];
@@ -504,15 +563,15 @@ release_out:
504 563
505static void truncate_node(struct dnode_of_data *dn) 564static void truncate_node(struct dnode_of_data *dn)
506{ 565{
507 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 566 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
508 struct node_info ni; 567 struct node_info ni;
509 568
510 get_node_info(sbi, dn->nid, &ni); 569 get_node_info(sbi, dn->nid, &ni);
511 if (dn->inode->i_blocks == 0) { 570 if (dn->inode->i_blocks == 0) {
512 f2fs_bug_on(ni.blk_addr != NULL_ADDR); 571 f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
513 goto invalidate; 572 goto invalidate;
514 } 573 }
515 f2fs_bug_on(ni.blk_addr == NULL_ADDR); 574 f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
516 575
517 /* Deallocate node address */ 576 /* Deallocate node address */
518 invalidate_blocks(sbi, ni.blk_addr); 577 invalidate_blocks(sbi, ni.blk_addr);
@@ -540,14 +599,13 @@ invalidate:
540 599
541static int truncate_dnode(struct dnode_of_data *dn) 600static int truncate_dnode(struct dnode_of_data *dn)
542{ 601{
543 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
544 struct page *page; 602 struct page *page;
545 603
546 if (dn->nid == 0) 604 if (dn->nid == 0)
547 return 1; 605 return 1;
548 606
549 /* get direct node */ 607 /* get direct node */
550 page = get_node_page(sbi, dn->nid); 608 page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
551 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) 609 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
552 return 1; 610 return 1;
553 else if (IS_ERR(page)) 611 else if (IS_ERR(page))
@@ -564,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn)
564static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, 622static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
565 int ofs, int depth) 623 int ofs, int depth)
566{ 624{
567 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
568 struct dnode_of_data rdn = *dn; 625 struct dnode_of_data rdn = *dn;
569 struct page *page; 626 struct page *page;
570 struct f2fs_node *rn; 627 struct f2fs_node *rn;
@@ -578,7 +635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
578 635
579 trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); 636 trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
580 637
581 page = get_node_page(sbi, dn->nid); 638 page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
582 if (IS_ERR(page)) { 639 if (IS_ERR(page)) {
583 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); 640 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
584 return PTR_ERR(page); 641 return PTR_ERR(page);
@@ -636,7 +693,6 @@ out_err:
636static int truncate_partial_nodes(struct dnode_of_data *dn, 693static int truncate_partial_nodes(struct dnode_of_data *dn,
637 struct f2fs_inode *ri, int *offset, int depth) 694 struct f2fs_inode *ri, int *offset, int depth)
638{ 695{
639 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
640 struct page *pages[2]; 696 struct page *pages[2];
641 nid_t nid[3]; 697 nid_t nid[3];
642 nid_t child_nid; 698 nid_t child_nid;
@@ -650,8 +706,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
650 706
651 /* get indirect nodes in the path */ 707 /* get indirect nodes in the path */
652 for (i = 0; i < idx + 1; i++) { 708 for (i = 0; i < idx + 1; i++) {
653 /* refernece count'll be increased */ 709 /* reference count will be increased */
654 pages[i] = get_node_page(sbi, nid[i]); 710 pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
655 if (IS_ERR(pages[i])) { 711 if (IS_ERR(pages[i])) {
656 err = PTR_ERR(pages[i]); 712 err = PTR_ERR(pages[i]);
657 idx = i - 1; 713 idx = i - 1;
@@ -696,7 +752,7 @@ fail:
696 */ 752 */
697int truncate_inode_blocks(struct inode *inode, pgoff_t from) 753int truncate_inode_blocks(struct inode *inode, pgoff_t from)
698{ 754{
699 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 755 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
700 int err = 0, cont = 1; 756 int err = 0, cont = 1;
701 int level, offset[4], noffset[4]; 757 int level, offset[4], noffset[4];
702 unsigned int nofs = 0; 758 unsigned int nofs = 0;
@@ -792,7 +848,7 @@ fail:
792 848
793int truncate_xattr_node(struct inode *inode, struct page *page) 849int truncate_xattr_node(struct inode *inode, struct page *page)
794{ 850{
795 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 851 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
796 nid_t nid = F2FS_I(inode)->i_xattr_nid; 852 nid_t nid = F2FS_I(inode)->i_xattr_nid;
797 struct dnode_of_data dn; 853 struct dnode_of_data dn;
798 struct page *npage; 854 struct page *npage;
@@ -823,22 +879,27 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
823 */ 879 */
824void remove_inode_page(struct inode *inode) 880void remove_inode_page(struct inode *inode)
825{ 881{
826 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
827 struct page *page;
828 nid_t ino = inode->i_ino;
829 struct dnode_of_data dn; 882 struct dnode_of_data dn;
830 883
831 page = get_node_page(sbi, ino); 884 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
832 if (IS_ERR(page)) 885 if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
833 return; 886 return;
834 887
835 if (truncate_xattr_node(inode, page)) { 888 if (truncate_xattr_node(inode, dn.inode_page)) {
836 f2fs_put_page(page, 1); 889 f2fs_put_dnode(&dn);
837 return; 890 return;
838 } 891 }
839 /* 0 is possible, after f2fs_new_inode() is failed */ 892
840 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); 893 /* remove potential inline_data blocks */
841 set_new_dnode(&dn, inode, page, page, ino); 894 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
895 S_ISLNK(inode->i_mode))
896 truncate_data_blocks_range(&dn, 1);
897
898 /* 0 is possible, after f2fs_new_inode() has failed */
899 f2fs_bug_on(F2FS_I_SB(inode),
900 inode->i_blocks != 0 && inode->i_blocks != 1);
901
902 /* will put inode & node pages */
842 truncate_node(&dn); 903 truncate_node(&dn);
843} 904}
844 905
@@ -856,7 +917,7 @@ struct page *new_inode_page(struct inode *inode)
856struct page *new_node_page(struct dnode_of_data *dn, 917struct page *new_node_page(struct dnode_of_data *dn,
857 unsigned int ofs, struct page *ipage) 918 unsigned int ofs, struct page *ipage)
858{ 919{
859 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 920 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
860 struct node_info old_ni, new_ni; 921 struct node_info old_ni, new_ni;
861 struct page *page; 922 struct page *page;
862 int err; 923 int err;
@@ -876,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
876 get_node_info(sbi, dn->nid, &old_ni); 937 get_node_info(sbi, dn->nid, &old_ni);
877 938
878 /* Reinitialize old_ni with new node page */ 939 /* Reinitialize old_ni with new node page */
879 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); 940 f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
880 new_ni = old_ni; 941 new_ni = old_ni;
881 new_ni.ino = dn->inode->i_ino; 942 new_ni.ino = dn->inode->i_ino;
882 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 943 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
@@ -914,7 +975,7 @@ fail:
914 */ 975 */
915static int read_node_page(struct page *page, int rw) 976static int read_node_page(struct page *page, int rw)
916{ 977{
917 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 978 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
918 struct node_info ni; 979 struct node_info ni;
919 980
920 get_node_info(sbi, page->index, &ni); 981 get_node_info(sbi, page->index, &ni);
@@ -990,7 +1051,7 @@ got_it:
990 */ 1051 */
991struct page *get_node_page_ra(struct page *parent, int start) 1052struct page *get_node_page_ra(struct page *parent, int start)
992{ 1053{
993 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); 1054 struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
994 struct blk_plug plug; 1055 struct blk_plug plug;
995 struct page *page; 1056 struct page *page;
996 int err, i, end; 1057 int err, i, end;
@@ -1120,17 +1181,24 @@ continue_unlock:
1120 1181
1121 /* called by fsync() */ 1182 /* called by fsync() */
1122 if (ino && IS_DNODE(page)) { 1183 if (ino && IS_DNODE(page)) {
1123 int mark = !is_checkpointed_node(sbi, ino);
1124 set_fsync_mark(page, 1); 1184 set_fsync_mark(page, 1);
1125 if (IS_INODE(page)) 1185 if (IS_INODE(page)) {
1126 set_dentry_mark(page, mark); 1186 if (!is_checkpointed_node(sbi, ino) &&
1187 !has_fsynced_inode(sbi, ino))
1188 set_dentry_mark(page, 1);
1189 else
1190 set_dentry_mark(page, 0);
1191 }
1127 nwritten++; 1192 nwritten++;
1128 } else { 1193 } else {
1129 set_fsync_mark(page, 0); 1194 set_fsync_mark(page, 0);
1130 set_dentry_mark(page, 0); 1195 set_dentry_mark(page, 0);
1131 } 1196 }
1132 NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); 1197
1133 wrote++; 1198 if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
1199 unlock_page(page);
1200 else
1201 wrote++;
1134 1202
1135 if (--wbc->nr_to_write == 0) 1203 if (--wbc->nr_to_write == 0)
1136 break; 1204 break;
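
The hunk above makes the node-page sync loop check the ->writepage() return, unlocking pages that failed instead of counting them as written. A toy loop with the same accounting shape (the failure is simulated):

#include <stdio.h>

static int writepage(int i)
{
	return i == 2 ? -1 : 0;		/* pretend one writeout fails */
}

int main(void)
{
	int wrote = 0;

	for (int i = 0; i < 4; i++) {
		if (writepage(i))
			printf("page %d unlocked, not counted\n", i);
		else
			wrote++;
	}
	printf("wrote %d pages\n", wrote);
	return 0;
}
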
@@ -1199,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1199static int f2fs_write_node_page(struct page *page, 1267static int f2fs_write_node_page(struct page *page,
1200 struct writeback_control *wbc) 1268 struct writeback_control *wbc)
1201{ 1269{
1202 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1270 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1203 nid_t nid; 1271 nid_t nid;
1204 block_t new_addr; 1272 block_t new_addr;
1205 struct node_info ni; 1273 struct node_info ni;
@@ -1212,12 +1280,14 @@ static int f2fs_write_node_page(struct page *page,
1212 1280
1213 if (unlikely(sbi->por_doing)) 1281 if (unlikely(sbi->por_doing))
1214 goto redirty_out; 1282 goto redirty_out;
1283 if (unlikely(f2fs_cp_error(sbi)))
1284 goto redirty_out;
1215 1285
1216 f2fs_wait_on_page_writeback(page, NODE); 1286 f2fs_wait_on_page_writeback(page, NODE);
1217 1287
1218 /* get old block addr of this node page */ 1288 /* get old block addr of this node page */
1219 nid = nid_of_node(page); 1289 nid = nid_of_node(page);
1220 f2fs_bug_on(page->index != nid); 1290 f2fs_bug_on(sbi, page->index != nid);
1221 1291
1222 get_node_info(sbi, nid, &ni); 1292 get_node_info(sbi, nid, &ni);
1223 1293
@@ -1248,7 +1318,7 @@ redirty_out:
1248static int f2fs_write_node_pages(struct address_space *mapping, 1318static int f2fs_write_node_pages(struct address_space *mapping,
1249 struct writeback_control *wbc) 1319 struct writeback_control *wbc)
1250{ 1320{
1251 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 1321 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1252 long diff; 1322 long diff;
1253 1323
1254 trace_f2fs_writepages(mapping->host, wbc, NODE); 1324 trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -1273,15 +1343,12 @@ skip_write:
1273 1343
1274static int f2fs_set_node_page_dirty(struct page *page) 1344static int f2fs_set_node_page_dirty(struct page *page)
1275{ 1345{
1276 struct address_space *mapping = page->mapping;
1277 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1278
1279 trace_f2fs_set_page_dirty(page, NODE); 1346 trace_f2fs_set_page_dirty(page, NODE);
1280 1347
1281 SetPageUptodate(page); 1348 SetPageUptodate(page);
1282 if (!PageDirty(page)) { 1349 if (!PageDirty(page)) {
1283 __set_page_dirty_nobuffers(page); 1350 __set_page_dirty_nobuffers(page);
1284 inc_page_count(sbi, F2FS_DIRTY_NODES); 1351 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1285 SetPagePrivate(page); 1352 SetPagePrivate(page);
1286 return 1; 1353 return 1;
1287 } 1354 }
@@ -1292,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1292 unsigned int length) 1359 unsigned int length)
1293{ 1360{
1294 struct inode *inode = page->mapping->host; 1361 struct inode *inode = page->mapping->host;
1295 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1296 if (PageDirty(page)) 1362 if (PageDirty(page))
1297 dec_page_count(sbi, F2FS_DIRTY_NODES); 1363 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
1298 ClearPagePrivate(page); 1364 ClearPagePrivate(page);
1299} 1365}
1300 1366
@@ -1347,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
1347 read_lock(&nm_i->nat_tree_lock); 1413 read_lock(&nm_i->nat_tree_lock);
1348 ne = __lookup_nat_cache(nm_i, nid); 1414 ne = __lookup_nat_cache(nm_i, nid);
1349 if (ne && 1415 if (ne &&
1350 (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) 1416 (!get_nat_flag(ne, IS_CHECKPOINTED) ||
1417 nat_get_blkaddr(ne) != NULL_ADDR))
1351 allocated = true; 1418 allocated = true;
1352 read_unlock(&nm_i->nat_tree_lock); 1419 read_unlock(&nm_i->nat_tree_lock);
1353 if (allocated) 1420 if (allocated)
@@ -1404,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
1404 break; 1471 break;
1405 1472
1406 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); 1473 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1407 f2fs_bug_on(blk_addr == NEW_ADDR); 1474 f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
1408 if (blk_addr == NULL_ADDR) { 1475 if (blk_addr == NULL_ADDR) {
1409 if (add_free_nid(sbi, start_nid, true) < 0) 1476 if (add_free_nid(sbi, start_nid, true) < 0)
1410 break; 1477 break;
@@ -1474,12 +1541,12 @@ retry:
1474 1541
1475 /* We should not use stale free nids created by build_free_nids */ 1542 /* We should not use stale free nids created by build_free_nids */
1476 if (nm_i->fcnt && !on_build_free_nids(nm_i)) { 1543 if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
1477 f2fs_bug_on(list_empty(&nm_i->free_nid_list)); 1544 f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
1478 list_for_each_entry(i, &nm_i->free_nid_list, list) 1545 list_for_each_entry(i, &nm_i->free_nid_list, list)
1479 if (i->state == NID_NEW) 1546 if (i->state == NID_NEW)
1480 break; 1547 break;
1481 1548
1482 f2fs_bug_on(i->state != NID_NEW); 1549 f2fs_bug_on(sbi, i->state != NID_NEW);
1483 *nid = i->nid; 1550 *nid = i->nid;
1484 i->state = NID_ALLOC; 1551 i->state = NID_ALLOC;
1485 nm_i->fcnt--; 1552 nm_i->fcnt--;
@@ -1505,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1505 1572
1506 spin_lock(&nm_i->free_nid_list_lock); 1573 spin_lock(&nm_i->free_nid_list_lock);
1507 i = __lookup_free_nid_list(nm_i, nid); 1574 i = __lookup_free_nid_list(nm_i, nid);
1508 f2fs_bug_on(!i || i->state != NID_ALLOC); 1575 f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1509 __del_from_free_nid_list(nm_i, i); 1576 __del_from_free_nid_list(nm_i, i);
1510 spin_unlock(&nm_i->free_nid_list_lock); 1577 spin_unlock(&nm_i->free_nid_list_lock);
1511 1578
@@ -1526,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1526 1593
1527 spin_lock(&nm_i->free_nid_list_lock); 1594 spin_lock(&nm_i->free_nid_list_lock);
1528 i = __lookup_free_nid_list(nm_i, nid); 1595 i = __lookup_free_nid_list(nm_i, nid);
1529 f2fs_bug_on(!i || i->state != NID_ALLOC); 1596 f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1530 if (!available_free_memory(sbi, FREE_NIDS)) { 1597 if (!available_free_memory(sbi, FREE_NIDS)) {
1531 __del_from_free_nid_list(nm_i, i); 1598 __del_from_free_nid_list(nm_i, i);
1532 need_free = true; 1599 need_free = true;
@@ -1540,35 +1607,21 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1540 kmem_cache_free(free_nid_slab, i); 1607 kmem_cache_free(free_nid_slab, i);
1541} 1608}
1542 1609
1543void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
1544 struct f2fs_summary *sum, struct node_info *ni,
1545 block_t new_blkaddr)
1546{
1547 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
1548 set_node_addr(sbi, ni, new_blkaddr, false);
1549 clear_node_page_dirty(page);
1550}
1551
1552void recover_inline_xattr(struct inode *inode, struct page *page) 1610void recover_inline_xattr(struct inode *inode, struct page *page)
1553{ 1611{
1554 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1555 void *src_addr, *dst_addr; 1612 void *src_addr, *dst_addr;
1556 size_t inline_size; 1613 size_t inline_size;
1557 struct page *ipage; 1614 struct page *ipage;
1558 struct f2fs_inode *ri; 1615 struct f2fs_inode *ri;
1559 1616
1560 if (!f2fs_has_inline_xattr(inode)) 1617 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
1561 return; 1618 f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
1562
1563 if (!IS_INODE(page))
1564 return;
1565 1619
1566 ri = F2FS_INODE(page); 1620 ri = F2FS_INODE(page);
1567 if (!(ri->i_inline & F2FS_INLINE_XATTR)) 1621 if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
1568 return; 1622 clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
1569 1623 goto update_inode;
1570 ipage = get_node_page(sbi, inode->i_ino); 1624 }
1571 f2fs_bug_on(IS_ERR(ipage));
1572 1625
1573 dst_addr = inline_xattr_addr(ipage); 1626 dst_addr = inline_xattr_addr(ipage);
1574 src_addr = inline_xattr_addr(page); 1627 src_addr = inline_xattr_addr(page);
@@ -1576,28 +1629,25 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
1576 1629
1577 f2fs_wait_on_page_writeback(ipage, NODE); 1630 f2fs_wait_on_page_writeback(ipage, NODE);
1578 memcpy(dst_addr, src_addr, inline_size); 1631 memcpy(dst_addr, src_addr, inline_size);
1579 1632update_inode:
1580 update_inode(inode, ipage); 1633 update_inode(inode, ipage);
1581 f2fs_put_page(ipage, 1); 1634 f2fs_put_page(ipage, 1);
1582} 1635}
1583 1636
1584bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) 1637void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1585{ 1638{
1586 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1639 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1587 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; 1640 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
1588 nid_t new_xnid = nid_of_node(page); 1641 nid_t new_xnid = nid_of_node(page);
1589 struct node_info ni; 1642 struct node_info ni;
1590 1643
1591 if (!f2fs_has_xattr_block(ofs_of_node(page)))
1592 return false;
1593
1594 /* 1: invalidate the previous xattr nid */ 1644 /* 1: invalidate the previous xattr nid */
1595 if (!prev_xnid) 1645 if (!prev_xnid)
1596 goto recover_xnid; 1646 goto recover_xnid;
1597 1647
1598 /* Deallocate node address */ 1648 /* Deallocate node address */
1599 get_node_info(sbi, prev_xnid, &ni); 1649 get_node_info(sbi, prev_xnid, &ni);
1600 f2fs_bug_on(ni.blk_addr == NULL_ADDR); 1650 f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
1601 invalidate_blocks(sbi, ni.blk_addr); 1651 invalidate_blocks(sbi, ni.blk_addr);
1602 dec_valid_node_count(sbi, inode); 1652 dec_valid_node_count(sbi, inode);
1603 set_node_addr(sbi, &ni, NULL_ADDR, false); 1653 set_node_addr(sbi, &ni, NULL_ADDR, false);
@@ -1605,7 +1655,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1605recover_xnid: 1655recover_xnid:
1606 /* 2: allocate new xattr nid */ 1656 /* 2: allocate new xattr nid */
1607 if (unlikely(!inc_valid_node_count(sbi, inode))) 1657 if (unlikely(!inc_valid_node_count(sbi, inode)))
1608 f2fs_bug_on(1); 1658 f2fs_bug_on(sbi, 1);
1609 1659
1610 remove_free_nid(NM_I(sbi), new_xnid); 1660 remove_free_nid(NM_I(sbi), new_xnid);
1611 get_node_info(sbi, new_xnid, &ni); 1661 get_node_info(sbi, new_xnid, &ni);
@@ -1618,7 +1668,6 @@ recover_xnid:
1618 set_node_addr(sbi, &ni, blkaddr, false); 1668 set_node_addr(sbi, &ni, blkaddr, false);
1619 1669
1620 update_inode_page(inode); 1670 update_inode_page(inode);
1621 return true;
1622} 1671}
1623 1672
1624int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1673int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1637,7 +1686,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1637 if (!ipage) 1686 if (!ipage)
1638 return -ENOMEM; 1687 return -ENOMEM;
1639 1688
1640 /* Should not use this inode from free nid list */ 1689 /* Should not use this inode from free nid list */
1641 remove_free_nid(NM_I(sbi), ino); 1690 remove_free_nid(NM_I(sbi), ino);
1642 1691
1643 SetPageUptodate(ipage); 1692 SetPageUptodate(ipage);
@@ -1651,6 +1700,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1651 dst->i_blocks = cpu_to_le64(1); 1700 dst->i_blocks = cpu_to_le64(1);
1652 dst->i_links = cpu_to_le32(1); 1701 dst->i_links = cpu_to_le32(1);
1653 dst->i_xattr_nid = 0; 1702 dst->i_xattr_nid = 0;
1703 dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
1654 1704
1655 new_ni = old_ni; 1705 new_ni = old_ni;
1656 new_ni.ino = ino; 1706 new_ni.ino = ino;
@@ -1659,13 +1709,14 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1659 WARN_ON(1); 1709 WARN_ON(1);
1660 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 1710 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1661 inc_valid_inode_count(sbi); 1711 inc_valid_inode_count(sbi);
1712 set_page_dirty(ipage);
1662 f2fs_put_page(ipage, 1); 1713 f2fs_put_page(ipage, 1);
1663 return 0; 1714 return 0;
1664} 1715}
1665 1716
1666/* 1717/*
1667 * ra_sum_pages() merge contiguous pages into one bio and submit. 1718 * ra_sum_pages() merge contiguous pages into one bio and submit.
1668 * these pre-readed pages are alloced in bd_inode's mapping tree. 1719 * these pre-read pages are allocated in bd_inode's mapping tree.
1669 */ 1720 */
1670static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, 1721static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
1671 int start, int nrpages) 1722 int start, int nrpages)
@@ -1697,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1697 struct f2fs_summary *sum_entry; 1748 struct f2fs_summary *sum_entry;
1698 struct inode *inode = sbi->sb->s_bdev->bd_inode; 1749 struct inode *inode = sbi->sb->s_bdev->bd_inode;
1699 block_t addr; 1750 block_t addr;
1700 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1751 int bio_blocks = MAX_BIO_BLOCKS(sbi);
1701 struct page *pages[bio_blocks]; 1752 struct page *pages[bio_blocks];
1702 int i, idx, last_offset, nrpages, err = 0; 1753 int i, idx, last_offset, nrpages, err = 0;
1703 1754
@@ -1709,7 +1760,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1709 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { 1760 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
1710 nrpages = min(last_offset - i, bio_blocks); 1761 nrpages = min(last_offset - i, bio_blocks);
1711 1762
1712 /* read ahead node pages */ 1763 /* readahead node pages */
1713 nrpages = ra_sum_pages(sbi, pages, addr, nrpages); 1764 nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
1714 if (!nrpages) 1765 if (!nrpages)
1715 return -ENOMEM; 1766 return -ENOMEM;
@@ -1739,89 +1790,6 @@ skip:
1739 return err; 1790 return err;
1740} 1791}
1741 1792
1742static struct nat_entry_set *grab_nat_entry_set(void)
1743{
1744 struct nat_entry_set *nes =
1745 f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
1746
1747 nes->entry_cnt = 0;
1748 INIT_LIST_HEAD(&nes->set_list);
1749 INIT_LIST_HEAD(&nes->entry_list);
1750 return nes;
1751}
1752
1753static void release_nat_entry_set(struct nat_entry_set *nes,
1754 struct f2fs_nm_info *nm_i)
1755{
1756 f2fs_bug_on(!list_empty(&nes->entry_list));
1757
1758 nm_i->dirty_nat_cnt -= nes->entry_cnt;
1759 list_del(&nes->set_list);
1760 kmem_cache_free(nat_entry_set_slab, nes);
1761}
1762
1763static void adjust_nat_entry_set(struct nat_entry_set *nes,
1764 struct list_head *head)
1765{
1766 struct nat_entry_set *next = nes;
1767
1768 if (list_is_last(&nes->set_list, head))
1769 return;
1770
1771 list_for_each_entry_continue(next, head, set_list)
1772 if (nes->entry_cnt <= next->entry_cnt)
1773 break;
1774
1775 list_move_tail(&nes->set_list, &next->set_list);
1776}
1777
1778static void add_nat_entry(struct nat_entry *ne, struct list_head *head)
1779{
1780 struct nat_entry_set *nes;
1781 nid_t start_nid = START_NID(ne->ni.nid);
1782
1783 list_for_each_entry(nes, head, set_list) {
1784 if (nes->start_nid == start_nid) {
1785 list_move_tail(&ne->list, &nes->entry_list);
1786 nes->entry_cnt++;
1787 adjust_nat_entry_set(nes, head);
1788 return;
1789 }
1790 }
1791
1792 nes = grab_nat_entry_set();
1793
1794 nes->start_nid = start_nid;
1795 list_move_tail(&ne->list, &nes->entry_list);
1796 nes->entry_cnt++;
1797 list_add(&nes->set_list, head);
1798}
1799
1800static void merge_nats_in_set(struct f2fs_sb_info *sbi)
1801{
1802 struct f2fs_nm_info *nm_i = NM_I(sbi);
1803 struct list_head *dirty_list = &nm_i->dirty_nat_entries;
1804 struct list_head *set_list = &nm_i->nat_entry_set;
1805 struct nat_entry *ne, *tmp;
1806
1807 write_lock(&nm_i->nat_tree_lock);
1808 list_for_each_entry_safe(ne, tmp, dirty_list, list) {
1809 if (nat_get_blkaddr(ne) == NEW_ADDR)
1810 continue;
1811 add_nat_entry(ne, set_list);
1812 nm_i->dirty_nat_cnt++;
1813 }
1814 write_unlock(&nm_i->nat_tree_lock);
1815}
1816
1817static bool __has_cursum_space(struct f2fs_summary_block *sum, int size)
1818{
1819 if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES)
1820 return true;
1821 else
1822 return false;
1823}
1824
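
The two-argument, NAT-only helper removed above is superseded by a three-argument
__has_cursum_space(sum, size, type) used by the new code below. A plausible shared
form is sketched here; the SIT names are assumed counterparts of the NAT ones
visible in this diff, not confirmed by it:

	static inline bool __has_cursum_space(struct f2fs_summary_block *sum,
						int size, int type)
	{
		if (type == NAT_JOURNAL)
			return nats_in_cursum(sum) + size <= MAX_NAT_JENTRIES(sum);
		/* sits_in_cursum()/MAX_SIT_JENTRIES() assumed SIT counterparts */
		return sits_in_cursum(sum) + size <= MAX_SIT_JENTRIES(sum);
	}
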
1825static void remove_nats_in_journal(struct f2fs_sb_info *sbi) 1793static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1826{ 1794{
1827 struct f2fs_nm_info *nm_i = NM_I(sbi); 1795 struct f2fs_nm_info *nm_i = NM_I(sbi);
@@ -1856,99 +1824,130 @@ found:
1856 mutex_unlock(&curseg->curseg_mutex); 1824 mutex_unlock(&curseg->curseg_mutex);
1857} 1825}
1858 1826
1859/* 1827static void __adjust_nat_entry_set(struct nat_entry_set *nes,
1860 * This function is called during the checkpointing process. 1828 struct list_head *head, int max)
1861 */
1862void flush_nat_entries(struct f2fs_sb_info *sbi)
1863{ 1829{
1864 struct f2fs_nm_info *nm_i = NM_I(sbi); 1830 struct nat_entry_set *cur;
1865 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1866 struct f2fs_summary_block *sum = curseg->sum_blk;
1867 struct nat_entry_set *nes, *tmp;
1868 struct list_head *head = &nm_i->nat_entry_set;
1869 bool to_journal = true;
1870 1831
1871 /* merge nat entries of dirty list to nat entry set temporarily */ 1832 if (nes->entry_cnt >= max)
1872 merge_nats_in_set(sbi); 1833 goto add_out;
1873 1834
1874 /* 1835 list_for_each_entry(cur, head, set_list) {
1875 * if there are no enough space in journal to store dirty nat 1836 if (cur->entry_cnt >= nes->entry_cnt) {
1876 * entries, remove all entries from journal and merge them 1837 list_add(&nes->set_list, cur->set_list.prev);
1877 * into nat entry set. 1838 return;
1878 */ 1839 }
1879 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) {
1880 remove_nats_in_journal(sbi);
1881
1882 /*
1883 * merge nat entries of dirty list to nat entry set temporarily
1884 */
1885 merge_nats_in_set(sbi);
1886 } 1840 }
1841add_out:
1842 list_add_tail(&nes->set_list, head);
1843}
1887 1844
1888 if (!nm_i->dirty_nat_cnt) 1845static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1889 return; 1846 struct nat_entry_set *set)
1847{
1848 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1849 struct f2fs_summary_block *sum = curseg->sum_blk;
1850 nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
1851 bool to_journal = true;
1852 struct f2fs_nat_block *nat_blk;
1853 struct nat_entry *ne, *cur;
1854 struct page *page = NULL;
1890 1855
1891 /* 1856 /*
1892 * there are two steps to flush nat entries: 1857 * there are two steps to flush nat entries:
1893 * #1, flush nat entries to journal in current hot data summary block. 1858 * #1, flush nat entries to journal in current hot data summary block.
1894 * #2, flush nat entries to nat page. 1859 * #2, flush nat entries to nat page.
1895 */ 1860 */
1896 list_for_each_entry_safe(nes, tmp, head, set_list) { 1861 if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
1897 struct f2fs_nat_block *nat_blk; 1862 to_journal = false;
1898 struct nat_entry *ne, *cur;
1899 struct page *page;
1900 nid_t start_nid = nes->start_nid;
1901 1863
1902 if (to_journal && !__has_cursum_space(sum, nes->entry_cnt)) 1864 if (to_journal) {
1903 to_journal = false; 1865 mutex_lock(&curseg->curseg_mutex);
1866 } else {
1867 page = get_next_nat_page(sbi, start_nid);
1868 nat_blk = page_address(page);
1869 f2fs_bug_on(sbi, !nat_blk);
1870 }
1871
1872 /* flush dirty nats in nat entry set */
1873 list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
1874 struct f2fs_nat_entry *raw_ne;
1875 nid_t nid = nat_get_nid(ne);
1876 int offset;
1877
1878 if (nat_get_blkaddr(ne) == NEW_ADDR)
1879 continue;
1904 1880
1905 if (to_journal) { 1881 if (to_journal) {
1906 mutex_lock(&curseg->curseg_mutex); 1882 offset = lookup_journal_in_cursum(sum,
1883 NAT_JOURNAL, nid, 1);
1884 f2fs_bug_on(sbi, offset < 0);
1885 raw_ne = &nat_in_journal(sum, offset);
1886 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1907 } else { 1887 } else {
1908 page = get_next_nat_page(sbi, start_nid); 1888 raw_ne = &nat_blk->entries[nid - start_nid];
1909 nat_blk = page_address(page);
1910 f2fs_bug_on(!nat_blk);
1911 } 1889 }
1890 raw_nat_from_node_info(raw_ne, &ne->ni);
1912 1891
1913 /* flush dirty nats in nat entry set */ 1892 write_lock(&NM_I(sbi)->nat_tree_lock);
1914 list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { 1893 nat_reset_flag(ne);
1915 struct f2fs_nat_entry *raw_ne; 1894 __clear_nat_cache_dirty(NM_I(sbi), ne);
1916 nid_t nid = nat_get_nid(ne); 1895 write_unlock(&NM_I(sbi)->nat_tree_lock);
1917 int offset;
1918 1896
1919 if (to_journal) { 1897 if (nat_get_blkaddr(ne) == NULL_ADDR)
1920 offset = lookup_journal_in_cursum(sum, 1898 add_free_nid(sbi, nid, false);
1921 NAT_JOURNAL, nid, 1); 1899 }
1922 f2fs_bug_on(offset < 0);
1923 raw_ne = &nat_in_journal(sum, offset);
1924 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1925 } else {
1926 raw_ne = &nat_blk->entries[nid - start_nid];
1927 }
1928 raw_nat_from_node_info(raw_ne, &ne->ni);
1929 1900
1930 if (nat_get_blkaddr(ne) == NULL_ADDR && 1901 if (to_journal)
1931 add_free_nid(sbi, nid, false) <= 0) { 1902 mutex_unlock(&curseg->curseg_mutex);
1932 write_lock(&nm_i->nat_tree_lock); 1903 else
1933 __del_from_nat_cache(nm_i, ne); 1904 f2fs_put_page(page, 1);
1934 write_unlock(&nm_i->nat_tree_lock);
1935 } else {
1936 write_lock(&nm_i->nat_tree_lock);
1937 __clear_nat_cache_dirty(nm_i, ne);
1938 write_unlock(&nm_i->nat_tree_lock);
1939 }
1940 }
1941 1905
1942 if (to_journal) 1906 if (!set->entry_cnt) {
1943 mutex_unlock(&curseg->curseg_mutex); 1907 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
1944 else 1908 kmem_cache_free(nat_entry_set_slab, set);
1945 f2fs_put_page(page, 1); 1909 }
1910}
1946 1911
1947 release_nat_entry_set(nes, nm_i); 1912/*
1913 * This function is called during the checkpointing process.
1914 */
1915void flush_nat_entries(struct f2fs_sb_info *sbi)
1916{
1917 struct f2fs_nm_info *nm_i = NM_I(sbi);
1918 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1919 struct f2fs_summary_block *sum = curseg->sum_blk;
1920 struct nat_entry_set *setvec[NATVEC_SIZE];
1921 struct nat_entry_set *set, *tmp;
1922 unsigned int found;
1923 nid_t set_idx = 0;
1924 LIST_HEAD(sets);
1925
1926 /*
 1927	 * if there is not enough space in the journal to store dirty nat
 1928	 * entries, remove all entries from the journal and merge them
1929 * into nat entry set.
1930 */
1931 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
1932 remove_nats_in_journal(sbi);
1933
1934 if (!nm_i->dirty_nat_cnt)
1935 return;
1936
1937 while ((found = __gang_lookup_nat_set(nm_i,
1938 set_idx, NATVEC_SIZE, setvec))) {
1939 unsigned idx;
1940 set_idx = setvec[found - 1]->set + 1;
1941 for (idx = 0; idx < found; idx++)
1942 __adjust_nat_entry_set(setvec[idx], &sets,
1943 MAX_NAT_JENTRIES(sum));
1948 } 1944 }
1949 1945
1950 f2fs_bug_on(!list_empty(head)); 1946 /* flush dirty nats in nat entry set */
1951 f2fs_bug_on(nm_i->dirty_nat_cnt); 1947 list_for_each_entry_safe(set, tmp, &sets, set_list)
1948 __flush_nat_entry_set(sbi, set);
1949
1950 f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
1952} 1951}
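
This rewrite replaces the single dirty_nat_entries list with per-NAT-block sets
kept in nm_i->nat_set_root, a radix tree keyed by set number (nid divided by
NAT_ENTRY_PER_BLOCK). At checkpoint time the sets are collected with a gang
lookup, inserted into a local list in ascending entry_cnt order by
__adjust_nat_entry_set(), and flushed set by set: the sets small enough for the
remaining journal room go to the journal, and only the rest pay for a NAT page
write. __gang_lookup_nat_set() is presumably a thin wrapper, roughly:

	/* assumed wrapper over the radix tree introduced below */
	static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
			nid_t start, unsigned int nr, struct nat_entry_set **ep)
	{
		return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
						start, nr);
	}

Grouping dirty entries by NAT block also means each __flush_nat_entry_set() call
touches at most one NAT page.
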
1953 1952
1954static int init_node_manager(struct f2fs_sb_info *sbi) 1953static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1967,7 +1966,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1967 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; 1966 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
1968 1967
1969 /* not used nids: 0, node, meta, (and root counted as valid node) */ 1968 /* not used nids: 0, node, meta, (and root counted as valid node) */
1970 nm_i->available_nids = nm_i->max_nid - 3; 1969 nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
1971 nm_i->fcnt = 0; 1970 nm_i->fcnt = 0;
1972 nm_i->nat_cnt = 0; 1971 nm_i->nat_cnt = 0;
1973 nm_i->ram_thresh = DEF_RAM_THRESHOLD; 1972 nm_i->ram_thresh = DEF_RAM_THRESHOLD;
@@ -1975,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1975 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); 1974 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
1976 INIT_LIST_HEAD(&nm_i->free_nid_list); 1975 INIT_LIST_HEAD(&nm_i->free_nid_list);
1977 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1976 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1977 INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
1978 INIT_LIST_HEAD(&nm_i->nat_entries); 1978 INIT_LIST_HEAD(&nm_i->nat_entries);
1979 INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
1980 INIT_LIST_HEAD(&nm_i->nat_entry_set);
1981 1979
1982 mutex_init(&nm_i->build_lock); 1980 mutex_init(&nm_i->build_lock);
1983 spin_lock_init(&nm_i->free_nid_list_lock); 1981 spin_lock_init(&nm_i->free_nid_list_lock);
@@ -2026,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2026 /* destroy free nid list */ 2024 /* destroy free nid list */
2027 spin_lock(&nm_i->free_nid_list_lock); 2025 spin_lock(&nm_i->free_nid_list_lock);
2028 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { 2026 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
2029 f2fs_bug_on(i->state == NID_ALLOC); 2027 f2fs_bug_on(sbi, i->state == NID_ALLOC);
2030 __del_from_free_nid_list(nm_i, i); 2028 __del_from_free_nid_list(nm_i, i);
2031 nm_i->fcnt--; 2029 nm_i->fcnt--;
2032 spin_unlock(&nm_i->free_nid_list_lock); 2030 spin_unlock(&nm_i->free_nid_list_lock);
2033 kmem_cache_free(free_nid_slab, i); 2031 kmem_cache_free(free_nid_slab, i);
2034 spin_lock(&nm_i->free_nid_list_lock); 2032 spin_lock(&nm_i->free_nid_list_lock);
2035 } 2033 }
2036 f2fs_bug_on(nm_i->fcnt); 2034 f2fs_bug_on(sbi, nm_i->fcnt);
2037 spin_unlock(&nm_i->free_nid_list_lock); 2035 spin_unlock(&nm_i->free_nid_list_lock);
2038 2036
2039 /* destroy nat cache */ 2037 /* destroy nat cache */
@@ -2045,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2045 for (idx = 0; idx < found; idx++) 2043 for (idx = 0; idx < found; idx++)
2046 __del_from_nat_cache(nm_i, natvec[idx]); 2044 __del_from_nat_cache(nm_i, natvec[idx]);
2047 } 2045 }
2048 f2fs_bug_on(nm_i->nat_cnt); 2046 f2fs_bug_on(sbi, nm_i->nat_cnt);
2049 write_unlock(&nm_i->nat_tree_lock); 2047 write_unlock(&nm_i->nat_tree_lock);
2050 2048
2051 kfree(nm_i->nat_bitmap); 2049 kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 8a116a407599..8d5e6e0dd840 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -39,10 +39,16 @@ struct node_info {
39 unsigned char version; /* version of the node */ 39 unsigned char version; /* version of the node */
40}; 40};
41 41
42enum {
43 IS_CHECKPOINTED, /* is it checkpointed before? */
44 HAS_FSYNCED_INODE, /* is the inode fsynced before? */
45 HAS_LAST_FSYNC, /* has the latest node fsync mark? */
 46 IS_DIRTY, /* is this nat entry dirty? */
47};
48
42struct nat_entry { 49struct nat_entry {
43 struct list_head list; /* for clean or dirty nat list */ 50 struct list_head list; /* for clean or dirty nat list */
44 bool checkpointed; /* whether it is checkpointed or not */ 51 unsigned char flag; /* for node information bits */
45 bool fsync_done; /* whether the latest node has fsync mark */
46 struct node_info ni; /* in-memory node information */ 52 struct node_info ni; /* in-memory node information */
47}; 53};
48 54
@@ -55,18 +61,32 @@ struct nat_entry {
55#define nat_get_version(nat) (nat->ni.version) 61#define nat_get_version(nat) (nat->ni.version)
56#define nat_set_version(nat, v) (nat->ni.version = v) 62#define nat_set_version(nat, v) (nat->ni.version = v)
57 63
58#define __set_nat_cache_dirty(nm_i, ne) \
59 do { \
60 ne->checkpointed = false; \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
62 } while (0)
63#define __clear_nat_cache_dirty(nm_i, ne) \
64 do { \
65 ne->checkpointed = true; \
66 list_move_tail(&ne->list, &nm_i->nat_entries); \
67 } while (0)
68#define inc_node_version(version) (++version) 64#define inc_node_version(version) (++version)
69 65
66static inline void set_nat_flag(struct nat_entry *ne,
67 unsigned int type, bool set)
68{
69 unsigned char mask = 0x01 << type;
70 if (set)
71 ne->flag |= mask;
72 else
73 ne->flag &= ~mask;
74}
75
76static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
77{
78 unsigned char mask = 0x01 << type;
79 return ne->flag & mask;
80}
81
82static inline void nat_reset_flag(struct nat_entry *ne)
83{
84 /* these states can be set only after checkpoint was done */
85 set_nat_flag(ne, IS_CHECKPOINTED, true);
86 set_nat_flag(ne, HAS_FSYNCED_INODE, false);
87 set_nat_flag(ne, HAS_LAST_FSYNC, true);
88}
89
70static inline void node_info_from_raw_nat(struct node_info *ni, 90static inline void node_info_from_raw_nat(struct node_info *ni,
71 struct f2fs_nat_entry *raw_ne) 91 struct f2fs_nat_entry *raw_ne)
72{ 92{
@@ -90,9 +110,9 @@ enum mem_type {
90}; 110};
91 111
92struct nat_entry_set { 112struct nat_entry_set {
93 struct list_head set_list; /* link with all nat sets */ 113 struct list_head set_list; /* link with other nat sets */
94 struct list_head entry_list; /* link with dirty nat entries */ 114 struct list_head entry_list; /* link with dirty nat entries */
 95 nid_t start_nid; /* start nid of nats in set */ 115 nid_t set; /* set number */
96 unsigned int entry_cnt; /* the # of nat entries in set */ 116 unsigned int entry_cnt; /* the # of nat entries in set */
97}; 117};
98 118
@@ -110,18 +130,19 @@ struct free_nid {
110 int state; /* in use or not: NID_NEW or NID_ALLOC */ 130 int state; /* in use or not: NID_NEW or NID_ALLOC */
111}; 131};
112 132
113static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) 133static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
114{ 134{
115 struct f2fs_nm_info *nm_i = NM_I(sbi); 135 struct f2fs_nm_info *nm_i = NM_I(sbi);
116 struct free_nid *fnid; 136 struct free_nid *fnid;
117 137
118 if (nm_i->fcnt <= 0)
119 return -1;
120 spin_lock(&nm_i->free_nid_list_lock); 138 spin_lock(&nm_i->free_nid_list_lock);
139 if (nm_i->fcnt <= 0) {
140 spin_unlock(&nm_i->free_nid_list_lock);
141 return;
142 }
121 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); 143 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
122 *nid = fnid->nid; 144 *nid = fnid->nid;
123 spin_unlock(&nm_i->free_nid_list_lock); 145 spin_unlock(&nm_i->free_nid_list_lock);
124 return 0;
125} 146}
126 147
127/* 148/*
@@ -197,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src)
197 218
198static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) 219static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
199{ 220{
200 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 221 struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
201 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
202 struct f2fs_node *rn = F2FS_NODE(page); 222 struct f2fs_node *rn = F2FS_NODE(page);
203 223
204 rn->footer.cp_ver = ckpt->checkpoint_ver; 224 rn->footer.cp_ver = ckpt->checkpoint_ver;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index fe1c6d921ba2..ebd013225788 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -14,6 +14,37 @@
14#include "node.h" 14#include "node.h"
15#include "segment.h" 15#include "segment.h"
16 16
17/*
18 * Roll forward recovery scenarios.
19 *
20 * [Term] F: fsync_mark, D: dentry_mark
21 *
22 * 1. inode(x) | CP | inode(x) | dnode(F)
23 * -> Update the latest inode(x).
24 *
25 * 2. inode(x) | CP | inode(F) | dnode(F)
26 * -> No problem.
27 *
28 * 3. inode(x) | CP | dnode(F) | inode(x)
29 * -> Recover to the latest dnode(F), and drop the last inode(x)
30 *
31 * 4. inode(x) | CP | dnode(F) | inode(F)
32 * -> No problem.
33 *
34 * 5. CP | inode(x) | dnode(F)
35 * -> The inode(DF) was missing. Should drop this dnode(F).
36 *
37 * 6. CP | inode(DF) | dnode(F)
38 * -> No problem.
39 *
40 * 7. CP | dnode(F) | inode(DF)
41 * -> If f2fs_iget fails, then goto next to find inode(DF).
42 *
43 * 8. CP | dnode(F) | inode(x)
44 * -> If f2fs_iget fails, then goto next to find inode(DF).
45 * But it will fail due to no inode(DF).
46 */
47
17static struct kmem_cache *fsync_entry_slab; 48static struct kmem_cache *fsync_entry_slab;
18 49
19bool space_for_roll_forward(struct f2fs_sb_info *sbi) 50bool space_for_roll_forward(struct f2fs_sb_info *sbi)
@@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
36 return NULL; 67 return NULL;
37} 68}
38 69
39static int recover_dentry(struct page *ipage, struct inode *inode) 70static int recover_dentry(struct inode *inode, struct page *ipage)
40{ 71{
41 struct f2fs_inode *raw_inode = F2FS_INODE(ipage); 72 struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
42 nid_t pino = le32_to_cpu(raw_inode->i_pino); 73 nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -62,8 +93,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
62 } 93 }
63retry: 94retry:
64 de = f2fs_find_entry(dir, &name, &page); 95 de = f2fs_find_entry(dir, &name, &page);
65 if (de && inode->i_ino == le32_to_cpu(de->ino)) 96 if (de && inode->i_ino == le32_to_cpu(de->ino)) {
97 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
66 goto out_unmap_put; 98 goto out_unmap_put;
99 }
67 if (de) { 100 if (de) {
68 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); 101 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
69 if (IS_ERR(einode)) { 102 if (IS_ERR(einode)) {
@@ -73,7 +106,7 @@ retry:
73 err = -EEXIST; 106 err = -EEXIST;
74 goto out_unmap_put; 107 goto out_unmap_put;
75 } 108 }
76 err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); 109 err = acquire_orphan_inode(F2FS_I_SB(inode));
77 if (err) { 110 if (err) {
78 iput(einode); 111 iput(einode);
79 goto out_unmap_put; 112 goto out_unmap_put;
@@ -108,35 +141,28 @@ out:
108 return err; 141 return err;
109} 142}
110 143
111static int recover_inode(struct inode *inode, struct page *node_page) 144static void recover_inode(struct inode *inode, struct page *page)
112{ 145{
113 struct f2fs_inode *raw_inode = F2FS_INODE(node_page); 146 struct f2fs_inode *raw = F2FS_INODE(page);
114 147
115 if (!IS_INODE(node_page)) 148 inode->i_mode = le16_to_cpu(raw->i_mode);
116 return 0; 149 i_size_write(inode, le64_to_cpu(raw->i_size));
117 150 inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
118 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 151 inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
119 i_size_write(inode, le64_to_cpu(raw_inode->i_size)); 152 inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
120 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 153 inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
121 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); 154 inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
122 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); 155 inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
123 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
124 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
125 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
126
127 if (is_dent_dnode(node_page))
128 return recover_dentry(node_page, inode);
129 156
130 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", 157 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
131 ino_of_node(node_page), raw_inode->i_name); 158 ino_of_node(page), F2FS_INODE(page)->i_name);
132 return 0;
133} 159}
134 160
135static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) 161static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
136{ 162{
137 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); 163 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
138 struct curseg_info *curseg; 164 struct curseg_info *curseg;
139 struct page *page; 165 struct page *page = NULL;
140 block_t blkaddr; 166 block_t blkaddr;
141 int err = 0; 167 int err = 0;
142 168
@@ -144,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
144 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 170 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
145 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 171 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
146 172
147 /* read node page */
148 page = alloc_page(GFP_F2FS_ZERO);
149 if (!page)
150 return -ENOMEM;
151 lock_page(page);
152
153 while (1) { 173 while (1) {
154 struct fsync_inode_entry *entry; 174 struct fsync_inode_entry *entry;
155 175
156 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); 176 if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
157 if (err) 177 return 0;
158 return err;
159 178
160 lock_page(page); 179 page = get_meta_page_ra(sbi, blkaddr);
161 180
162 if (cp_ver != cpver_of_node(page)) 181 if (cp_ver != cpver_of_node(page))
163 break; 182 break;
@@ -178,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
178 } 197 }
179 198
180 /* add this fsync inode to the list */ 199 /* add this fsync inode to the list */
181 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); 200 entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
182 if (!entry) { 201 if (!entry) {
183 err = -ENOMEM; 202 err = -ENOMEM;
184 break; 203 break;
185 } 204 }
186 205 /*
206 * CP | dnode(F) | inode(DF)
207 * For this case, we should not give up now.
208 */
187 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); 209 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
188 if (IS_ERR(entry->inode)) { 210 if (IS_ERR(entry->inode)) {
189 err = PTR_ERR(entry->inode); 211 err = PTR_ERR(entry->inode);
190 kmem_cache_free(fsync_entry_slab, entry); 212 kmem_cache_free(fsync_entry_slab, entry);
213 if (err == -ENOENT)
214 goto next;
191 break; 215 break;
192 } 216 }
193 list_add_tail(&entry->list, head); 217 list_add_tail(&entry->list, head);
194 } 218 }
195 entry->blkaddr = blkaddr; 219 entry->blkaddr = blkaddr;
196 220
197 err = recover_inode(entry->inode, page); 221 if (IS_INODE(page)) {
198 if (err && err != -ENOENT) 222 entry->last_inode = blkaddr;
199 break; 223 if (is_dent_dnode(page))
224 entry->last_dentry = blkaddr;
225 }
200next: 226next:
201 /* check next segment */ 227 /* check next segment */
202 blkaddr = next_blkaddr_of_node(page); 228 blkaddr = next_blkaddr_of_node(page);
229 f2fs_put_page(page, 1);
203 } 230 }
204 231 f2fs_put_page(page, 1);
205 unlock_page(page);
206 __free_pages(page, 0);
207
208 return err; 232 return err;
209} 233}
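
Instead of allocating a private page and issuing a synchronous bio for every node
block, the walk now pulls each block through the meta mapping with readahead
(get_meta_page_ra), and the MAIN_BLKADDR/MAX_BLKADDR bounds check stops the chain
cleanly if a corrupted next_blkaddr pointer ever leaves the main area. A condensed
skeleton of the loop, simplified from the hunk above:

	while (1) {
		if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
			break;			/* out-of-range link: stop */

		page = get_meta_page_ra(sbi, blkaddr);	/* cached + readahead */
		if (cp_ver != cpver_of_node(page)) {
			f2fs_put_page(page, 1);
			break;			/* past the fsynced chain */
		}
		/* ... record or recover this node block ... */
		blkaddr = next_blkaddr_of_node(page);
		f2fs_put_page(page, 1);
	}
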
210 234
@@ -277,16 +301,30 @@ got_it:
277 ino = ino_of_node(node_page); 301 ino = ino_of_node(node_page);
278 f2fs_put_page(node_page, 1); 302 f2fs_put_page(node_page, 1);
279 303
280 /* Deallocate previous index in the node page */ 304 if (ino != dn->inode->i_ino) {
281 inode = f2fs_iget(sbi->sb, ino); 305 /* Deallocate previous index in the node page */
282 if (IS_ERR(inode)) 306 inode = f2fs_iget(sbi->sb, ino);
283 return PTR_ERR(inode); 307 if (IS_ERR(inode))
308 return PTR_ERR(inode);
309 } else {
310 inode = dn->inode;
311 }
284 312
285 bidx = start_bidx_of_node(offset, F2FS_I(inode)) + 313 bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
286 le16_to_cpu(sum.ofs_in_node); 314 le16_to_cpu(sum.ofs_in_node);
287 315
288 truncate_hole(inode, bidx, bidx + 1); 316 if (ino != dn->inode->i_ino) {
289 iput(inode); 317 truncate_hole(inode, bidx, bidx + 1);
318 iput(inode);
319 } else {
320 struct dnode_of_data tdn;
321 set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
322 if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
323 return 0;
324 if (tdn.data_blkaddr != NULL_ADDR)
325 truncate_data_blocks_range(&tdn, 1);
326 f2fs_put_page(tdn.node_page, 1);
327 }
290 return 0; 328 return 0;
291} 329}
292 330
@@ -300,14 +338,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
300 struct node_info ni; 338 struct node_info ni;
301 int err = 0, recovered = 0; 339 int err = 0, recovered = 0;
302 340
303 recover_inline_xattr(inode, page); 341 /* step 1: recover xattr */
304 342 if (IS_INODE(page)) {
305 if (recover_inline_data(inode, page)) 343 recover_inline_xattr(inode, page);
344 } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
345 recover_xattr_data(inode, page, blkaddr);
306 goto out; 346 goto out;
347 }
307 348
308 if (recover_xattr_data(inode, page, blkaddr)) 349 /* step 2: recover inline data */
350 if (recover_inline_data(inode, page))
309 goto out; 351 goto out;
310 352
353 /* step 3: recover data indices */
311 start = start_bidx_of_node(ofs_of_node(page), fi); 354 start = start_bidx_of_node(ofs_of_node(page), fi);
312 end = start + ADDRS_PER_PAGE(page, fi); 355 end = start + ADDRS_PER_PAGE(page, fi);
313 356
@@ -324,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
324 f2fs_wait_on_page_writeback(dn.node_page, NODE); 367 f2fs_wait_on_page_writeback(dn.node_page, NODE);
325 368
326 get_node_info(sbi, dn.nid, &ni); 369 get_node_info(sbi, dn.nid, &ni);
327 f2fs_bug_on(ni.ino != ino_of_node(page)); 370 f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
328 f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); 371 f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
329 372
330 for (; start < end; start++) { 373 for (; start < end; start++) {
331 block_t src, dest; 374 block_t src, dest;
@@ -337,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
337 if (src == NULL_ADDR) { 380 if (src == NULL_ADDR) {
338 err = reserve_new_block(&dn); 381 err = reserve_new_block(&dn);
339 /* We should not get -ENOSPC */ 382 /* We should not get -ENOSPC */
340 f2fs_bug_on(err); 383 f2fs_bug_on(sbi, err);
341 } 384 }
342 385
343 /* Check the previous node page having this index */ 386 /* Check the previous node page having this index */
@@ -364,8 +407,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
364 fill_node_footer(dn.node_page, dn.nid, ni.ino, 407 fill_node_footer(dn.node_page, dn.nid, ni.ino,
365 ofs_of_node(page), false); 408 ofs_of_node(page), false);
366 set_page_dirty(dn.node_page); 409 set_page_dirty(dn.node_page);
367
368 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
369err: 410err:
370 f2fs_put_dnode(&dn); 411 f2fs_put_dnode(&dn);
371 f2fs_unlock_op(sbi); 412 f2fs_unlock_op(sbi);
@@ -381,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
381{ 422{
382 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); 423 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
383 struct curseg_info *curseg; 424 struct curseg_info *curseg;
384 struct page *page; 425 struct page *page = NULL;
385 int err = 0; 426 int err = 0;
386 block_t blkaddr; 427 block_t blkaddr;
387 428
@@ -389,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi,
389 curseg = CURSEG_I(sbi, type); 430 curseg = CURSEG_I(sbi, type);
390 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 431 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
391 432
392 /* read node page */
393 page = alloc_page(GFP_F2FS_ZERO);
394 if (!page)
395 return -ENOMEM;
396
397 lock_page(page);
398
399 while (1) { 433 while (1) {
400 struct fsync_inode_entry *entry; 434 struct fsync_inode_entry *entry;
401 435
402 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); 436 if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
403 if (err) 437 break;
404 return err;
405 438
406 lock_page(page); 439 page = get_meta_page_ra(sbi, blkaddr);
407 440
408 if (cp_ver != cpver_of_node(page)) 441 if (cp_ver != cpver_of_node(page)) {
442 f2fs_put_page(page, 1);
409 break; 443 break;
444 }
410 445
411 entry = get_fsync_inode(head, ino_of_node(page)); 446 entry = get_fsync_inode(head, ino_of_node(page));
412 if (!entry) 447 if (!entry)
413 goto next; 448 goto next;
414 449 /*
450 * inode(x) | CP | inode(x) | dnode(F)
451 * In this case, we can lose the latest inode(x).
452 * So, call recover_inode for the inode update.
453 */
454 if (entry->last_inode == blkaddr)
455 recover_inode(entry->inode, page);
456 if (entry->last_dentry == blkaddr) {
457 err = recover_dentry(entry->inode, page);
458 if (err) {
459 f2fs_put_page(page, 1);
460 break;
461 }
462 }
415 err = do_recover_data(sbi, entry->inode, page, blkaddr); 463 err = do_recover_data(sbi, entry->inode, page, blkaddr);
416 if (err) 464 if (err) {
465 f2fs_put_page(page, 1);
417 break; 466 break;
467 }
418 468
419 if (entry->blkaddr == blkaddr) { 469 if (entry->blkaddr == blkaddr) {
420 iput(entry->inode); 470 iput(entry->inode);
@@ -424,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
424next: 474next:
425 /* check next segment */ 475 /* check next segment */
426 blkaddr = next_blkaddr_of_node(page); 476 blkaddr = next_blkaddr_of_node(page);
477 f2fs_put_page(page, 1);
427 } 478 }
428
429 unlock_page(page);
430 __free_pages(page, 0);
431
432 if (!err) 479 if (!err)
433 allocate_new_segments(sbi); 480 allocate_new_segments(sbi);
434 return err; 481 return err;
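
find_fsync_dnodes() (earlier hunk) now records where each inode's newest inode
block and newest dentry-marked block live, and this loop replays recover_inode()
and recover_dentry() only when the walk reaches exactly those addresses, which
implements cases 1 and 3 of the scenario table at the top of the file. From the
fields referenced here, the bookkeeping presumably lives in fsync_inode_entry,
roughly:

	struct fsync_inode_entry {
		struct list_head list;	/* links entries on inode_list */
		struct inode *inode;	/* vfs inode pointer */
		block_t blkaddr;	/* block of the last fsynced dnode */
		block_t last_dentry;	/* block of the last dentry-marked inode block */
		block_t last_inode;	/* block of the last inode block */
	};
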
@@ -452,6 +499,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
452 /* step #1: find fsynced inode numbers */ 499 /* step #1: find fsynced inode numbers */
453 sbi->por_doing = true; 500 sbi->por_doing = true;
454 501
502 /* prevent checkpoint */
503 mutex_lock(&sbi->cp_mutex);
504
455 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 505 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
456 506
457 err = find_fsync_dnodes(sbi, &inode_list); 507 err = find_fsync_dnodes(sbi, &inode_list);
@@ -465,11 +515,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
465 515
466 /* step #2: recover data */ 516 /* step #2: recover data */
467 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); 517 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
468 f2fs_bug_on(!list_empty(&inode_list)); 518 if (!err)
519 f2fs_bug_on(sbi, !list_empty(&inode_list));
469out: 520out:
470 destroy_fsync_dnodes(&inode_list); 521 destroy_fsync_dnodes(&inode_list);
471 kmem_cache_destroy(fsync_entry_slab); 522 kmem_cache_destroy(fsync_entry_slab);
472 523
524 /* truncate meta pages to be used by the recovery */
525 truncate_inode_pages_range(META_MAPPING(sbi),
526 MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
527
473 if (err) { 528 if (err) {
474 truncate_inode_pages_final(NODE_MAPPING(sbi)); 529 truncate_inode_pages_final(NODE_MAPPING(sbi));
475 truncate_inode_pages_final(META_MAPPING(sbi)); 530 truncate_inode_pages_final(META_MAPPING(sbi));
@@ -482,8 +537,16 @@ out:
482 /* Flush all the NAT/SIT pages */ 537 /* Flush all the NAT/SIT pages */
483 while (get_pages(sbi, F2FS_DIRTY_META)) 538 while (get_pages(sbi, F2FS_DIRTY_META))
484 sync_meta_pages(sbi, META, LONG_MAX); 539 sync_meta_pages(sbi, META, LONG_MAX);
540 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
541 mutex_unlock(&sbi->cp_mutex);
485 } else if (need_writecp) { 542 } else if (need_writecp) {
486 write_checkpoint(sbi, false); 543 struct cp_control cpc = {
544 .reason = CP_SYNC,
545 };
546 mutex_unlock(&sbi->cp_mutex);
547 write_checkpoint(sbi, &cpc);
548 } else {
549 mutex_unlock(&sbi->cp_mutex);
487 } 550 }
488 return err; 551 return err;
489} 552}
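
Recovery now serializes against checkpointing by holding sbi->cp_mutex for the
whole roll-forward: on failure it latches CP_ERROR_FLAG, and on success it drops
the mutex and requests a synchronous checkpoint through the new cp_control
argument. From the fields this diff actually touches (reason, trim_start,
trim_minlen, trimmed), the struct is presumably along these lines; the field
types are assumed:

	struct cp_control {
		int reason;		/* CP_SYNC here; CP_DISCARD for trim */
		__u64 trim_start;	/* first segment considered for discard */
		__u64 trim_minlen;	/* smallest extent worth discarding */
		__u64 trimmed;		/* blocks queued for discard, reported back */
	};
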
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0dfeebae2a50..923cb76fdc46 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -25,6 +25,8 @@
25#define __reverse_ffz(x) __reverse_ffs(~(x)) 25#define __reverse_ffz(x) __reverse_ffs(~(x))
26 26
27static struct kmem_cache *discard_entry_slab; 27static struct kmem_cache *discard_entry_slab;
28static struct kmem_cache *sit_entry_set_slab;
29static struct kmem_cache *inmem_entry_slab;
28 30
29/* 31/*
30 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since 32 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -62,7 +64,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
62} 64}
63 65
64/* 66/*
65 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue 67 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
66 * f2fs_set_bit makes MSB and LSB reversed in a byte. 68 * f2fs_set_bit makes MSB and LSB reversed in a byte.
67 * Example: 69 * Example:
68 * LSB <--> MSB 70 * LSB <--> MSB
@@ -172,6 +174,60 @@ found_middle:
172 return result + __reverse_ffz(tmp); 174 return result + __reverse_ffz(tmp);
173} 175}
174 176
177void register_inmem_page(struct inode *inode, struct page *page)
178{
179 struct f2fs_inode_info *fi = F2FS_I(inode);
180 struct inmem_pages *new;
181
182 new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
183
184 /* add atomic page indices to the list */
185 new->page = page;
186 INIT_LIST_HEAD(&new->list);
187
188 /* increase reference count with clean state */
189 mutex_lock(&fi->inmem_lock);
190 get_page(page);
191 list_add_tail(&new->list, &fi->inmem_pages);
192 mutex_unlock(&fi->inmem_lock);
193}
194
195void commit_inmem_pages(struct inode *inode, bool abort)
196{
197 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
198 struct f2fs_inode_info *fi = F2FS_I(inode);
199 struct inmem_pages *cur, *tmp;
200 bool submit_bio = false;
201 struct f2fs_io_info fio = {
202 .type = DATA,
203 .rw = WRITE_SYNC,
204 };
205
206 f2fs_balance_fs(sbi);
207 f2fs_lock_op(sbi);
208
209 mutex_lock(&fi->inmem_lock);
210 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
211 lock_page(cur->page);
212 if (!abort && cur->page->mapping == inode->i_mapping) {
213 f2fs_wait_on_page_writeback(cur->page, DATA);
214 if (clear_page_dirty_for_io(cur->page))
215 inode_dec_dirty_pages(inode);
216 do_write_data_page(cur->page, &fio);
217 submit_bio = true;
218 }
219 f2fs_put_page(cur->page, 1);
220 list_del(&cur->list);
221 kmem_cache_free(inmem_entry_slab, cur);
222 }
223 if (submit_bio)
224 f2fs_submit_merged_bio(sbi, DATA, WRITE);
225 mutex_unlock(&fi->inmem_lock);
226
227 filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
228 f2fs_unlock_op(sbi);
229}
230
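
register_inmem_page() pins a dirty data page on a per-inode list, and
commit_inmem_pages() later writes the whole list inside one locked operation (or,
with abort set, simply drops it). A hedged usage sketch; the trigger points are
assumptions, not confirmed by this diff:

	/* when a page of an atomic-write file is dirtied */
	register_inmem_page(inode, page);  /* get_page() + queue on fi->inmem_pages */

	/* when the application commits the batch */
	commit_inmem_pages(inode, false);  /* write every queued page */

	/* when the application aborts, or the file closes uncommitted */
	commit_inmem_pages(inode, true);   /* unpin and drop the queued pages */

Because the queued pages are all written inside a single f2fs_lock_op() /
f2fs_unlock_op() span, no checkpoint can land between them, which is what makes
the batch atomic across a crash.
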
175/* 231/*
176 * This function balances dirty node and dentry pages. 232 * This function balances dirty node and dentry pages.
177 * In addition, it controls garbage collection. 233 * In addition, it controls garbage collection.
@@ -205,24 +261,20 @@ repeat:
205 if (kthread_should_stop()) 261 if (kthread_should_stop())
206 return 0; 262 return 0;
207 263
208 spin_lock(&fcc->issue_lock); 264 if (!llist_empty(&fcc->issue_list)) {
209 if (fcc->issue_list) {
210 fcc->dispatch_list = fcc->issue_list;
211 fcc->issue_list = fcc->issue_tail = NULL;
212 }
213 spin_unlock(&fcc->issue_lock);
214
215 if (fcc->dispatch_list) {
216 struct bio *bio = bio_alloc(GFP_NOIO, 0); 265 struct bio *bio = bio_alloc(GFP_NOIO, 0);
217 struct flush_cmd *cmd, *next; 266 struct flush_cmd *cmd, *next;
218 int ret; 267 int ret;
219 268
269 fcc->dispatch_list = llist_del_all(&fcc->issue_list);
270 fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
271
220 bio->bi_bdev = sbi->sb->s_bdev; 272 bio->bi_bdev = sbi->sb->s_bdev;
221 ret = submit_bio_wait(WRITE_FLUSH, bio); 273 ret = submit_bio_wait(WRITE_FLUSH, bio);
222 274
223 for (cmd = fcc->dispatch_list; cmd; cmd = next) { 275 llist_for_each_entry_safe(cmd, next,
276 fcc->dispatch_list, llnode) {
224 cmd->ret = ret; 277 cmd->ret = ret;
225 next = cmd->next;
226 complete(&cmd->wait); 278 complete(&cmd->wait);
227 } 279 }
228 bio_put(bio); 280 bio_put(bio);
@@ -230,7 +282,7 @@ repeat:
230 } 282 }
231 283
232 wait_event_interruptible(*q, 284 wait_event_interruptible(*q,
233 kthread_should_stop() || fcc->issue_list); 285 kthread_should_stop() || !llist_empty(&fcc->issue_list));
234 goto repeat; 286 goto repeat;
235} 287}
236 288
@@ -249,15 +301,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
249 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); 301 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
250 302
251 init_completion(&cmd.wait); 303 init_completion(&cmd.wait);
252 cmd.next = NULL;
253 304
254 spin_lock(&fcc->issue_lock); 305 llist_add(&cmd.llnode, &fcc->issue_list);
255 if (fcc->issue_list)
256 fcc->issue_tail->next = &cmd;
257 else
258 fcc->issue_list = &cmd;
259 fcc->issue_tail = &cmd;
260 spin_unlock(&fcc->issue_lock);
261 306
262 if (!fcc->dispatch_list) 307 if (!fcc->dispatch_list)
263 wake_up(&fcc->flush_wait_queue); 308 wake_up(&fcc->flush_wait_queue);
@@ -276,8 +321,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
276 fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); 321 fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
277 if (!fcc) 322 if (!fcc)
278 return -ENOMEM; 323 return -ENOMEM;
279 spin_lock_init(&fcc->issue_lock);
280 init_waitqueue_head(&fcc->flush_wait_queue); 324 init_waitqueue_head(&fcc->flush_wait_queue);
325 init_llist_head(&fcc->issue_list);
281 SM_I(sbi)->cmd_control_info = fcc; 326 SM_I(sbi)->cmd_control_info = fcc;
282 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, 327 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
283 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); 328 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
@@ -317,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
317 struct seg_entry *sentry = get_seg_entry(sbi, segno); 362 struct seg_entry *sentry = get_seg_entry(sbi, segno);
318 enum dirty_type t = sentry->type; 363 enum dirty_type t = sentry->type;
319 364
365 if (unlikely(t >= DIRTY)) {
366 f2fs_bug_on(sbi, 1);
367 return;
368 }
320 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) 369 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
321 dirty_i->nr_dirty[t]++; 370 dirty_i->nr_dirty[t]++;
322 } 371 }
@@ -376,8 +425,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
376static int f2fs_issue_discard(struct f2fs_sb_info *sbi, 425static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
377 block_t blkstart, block_t blklen) 426 block_t blkstart, block_t blklen)
378{ 427{
379 sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); 428 sector_t start = SECTOR_FROM_BLOCK(blkstart);
380 sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); 429 sector_t len = SECTOR_FROM_BLOCK(blklen);
381 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); 430 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
382 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); 431 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
383} 432}
@@ -392,21 +441,47 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
392 } 441 }
393} 442}
394 443
395static void add_discard_addrs(struct f2fs_sb_info *sbi, 444static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
396 unsigned int segno, struct seg_entry *se)
397{ 445{
398 struct list_head *head = &SM_I(sbi)->discard_list; 446 struct list_head *head = &SM_I(sbi)->discard_list;
399 struct discard_entry *new; 447 struct discard_entry *new;
400 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); 448 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
401 int max_blocks = sbi->blocks_per_seg; 449 int max_blocks = sbi->blocks_per_seg;
450 struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
402 unsigned long *cur_map = (unsigned long *)se->cur_valid_map; 451 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
403 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; 452 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
404 unsigned long dmap[entries]; 453 unsigned long dmap[entries];
405 unsigned int start = 0, end = -1; 454 unsigned int start = 0, end = -1;
455 bool force = (cpc->reason == CP_DISCARD);
406 int i; 456 int i;
407 457
408 if (!test_opt(sbi, DISCARD)) 458 if (!force && !test_opt(sbi, DISCARD))
459 return;
460
461 if (force && !se->valid_blocks) {
462 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
463 /*
464 * if this segment is registered in the prefree list, then
465 * we should skip adding a discard candidate, and let the
466 * checkpoint do that later.
467 */
468 mutex_lock(&dirty_i->seglist_lock);
469 if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
470 mutex_unlock(&dirty_i->seglist_lock);
471 cpc->trimmed += sbi->blocks_per_seg;
472 return;
473 }
474 mutex_unlock(&dirty_i->seglist_lock);
475
476 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
477 INIT_LIST_HEAD(&new->list);
478 new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
479 new->len = sbi->blocks_per_seg;
480 list_add_tail(&new->list, head);
481 SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
482 cpc->trimmed += sbi->blocks_per_seg;
409 return; 483 return;
484 }
410 485
411 /* zero block will be discarded through the prefree list */ 486 /* zero block will be discarded through the prefree list */
412 if (!se->valid_blocks || se->valid_blocks == max_blocks) 487 if (!se->valid_blocks || se->valid_blocks == max_blocks)
@@ -416,23 +491,39 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi,
416 for (i = 0; i < entries; i++) 491 for (i = 0; i < entries; i++)
417 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; 492 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
418 493
419 while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { 494 while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
420 start = __find_rev_next_bit(dmap, max_blocks, end + 1); 495 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
421 if (start >= max_blocks) 496 if (start >= max_blocks)
422 break; 497 break;
423 498
424 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); 499 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
425 500
501 if (end - start < cpc->trim_minlen)
502 continue;
503
426 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); 504 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
427 INIT_LIST_HEAD(&new->list); 505 INIT_LIST_HEAD(&new->list);
428 new->blkaddr = START_BLOCK(sbi, segno) + start; 506 new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
429 new->len = end - start; 507 new->len = end - start;
508 cpc->trimmed += end - start;
430 509
431 list_add_tail(&new->list, head); 510 list_add_tail(&new->list, head);
432 SM_I(sbi)->nr_discards += end - start; 511 SM_I(sbi)->nr_discards += end - start;
433 } 512 }
434} 513}
435 514
515void release_discard_addrs(struct f2fs_sb_info *sbi)
516{
517 struct list_head *head = &(SM_I(sbi)->discard_list);
518 struct discard_entry *entry, *this;
519
520 /* drop caches */
521 list_for_each_entry_safe(entry, this, head, list) {
522 list_del(&entry->list);
523 kmem_cache_free(discard_entry_slab, entry);
524 }
525}
526
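
add_discard_addrs() now serves two callers: the regular checkpoint path (only
when the discard mount option is set) and a trim path signalled by
cpc->reason == CP_DISCARD, where force bypasses the mount option, a wholly empty
segment that is not already prefree becomes one segment-sized candidate, and
extents shorter than cpc->trim_minlen are skipped. cpc->trimmed accumulates the
queued blocks. A hedged sketch of how an FITRIM handler might drive it; the
handler shape is assumed, not shown in this diff:

	struct cp_control cpc = {
		.reason = CP_DISCARD,
		.trim_minlen = minlen_in_blocks,	/* from fstrim_range */
	};
	unsigned int segno;

	for (segno = start_segno; segno <= end_segno; segno++) {
		cpc.trim_start = segno;
		add_discard_addrs(sbi, &cpc);	/* queue candidates, bump trimmed */
	}
	/* the queued extents are issued via f2fs_issue_discard() at checkpoint,
	 * and cpc.trimmed (in blocks) is what gets reported back to userspace */
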
436/* 527/*
437 * Should call clear_prefree_segments after checkpoint is done. 528 * Should call clear_prefree_segments after checkpoint is done.
438 */ 529 */
@@ -440,10 +531,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
440{ 531{
441 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 532 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
442 unsigned int segno; 533 unsigned int segno;
443 unsigned int total_segs = TOTAL_SEGS(sbi);
444 534
445 mutex_lock(&dirty_i->seglist_lock); 535 mutex_lock(&dirty_i->seglist_lock);
446 for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs) 536 for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
447 __set_test_and_free(sbi, segno); 537 __set_test_and_free(sbi, segno);
448 mutex_unlock(&dirty_i->seglist_lock); 538 mutex_unlock(&dirty_i->seglist_lock);
449} 539}
@@ -454,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
454 struct discard_entry *entry, *this; 544 struct discard_entry *entry, *this;
455 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 545 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
456 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 546 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
457 unsigned int total_segs = TOTAL_SEGS(sbi);
458 unsigned int start = 0, end = -1; 547 unsigned int start = 0, end = -1;
459 548
460 mutex_lock(&dirty_i->seglist_lock); 549 mutex_lock(&dirty_i->seglist_lock);
461 550
462 while (1) { 551 while (1) {
463 int i; 552 int i;
464 start = find_next_bit(prefree_map, total_segs, end + 1); 553 start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
465 if (start >= total_segs) 554 if (start >= MAIN_SEGS(sbi))
466 break; 555 break;
467 end = find_next_zero_bit(prefree_map, total_segs, start + 1); 556 end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
557 start + 1);
468 558
469 for (i = start; i < end; i++) 559 for (i = start; i < end; i++)
470 clear_bit(i, prefree_map); 560 clear_bit(i, prefree_map);
@@ -488,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
488 } 578 }
489} 579}
490 580
491static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) 581static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
492{ 582{
493 struct sit_info *sit_i = SIT_I(sbi); 583 struct sit_info *sit_i = SIT_I(sbi);
494 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) 584
585 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
495 sit_i->dirty_sentries++; 586 sit_i->dirty_sentries++;
587 return false;
588 }
589
590 return true;
496} 591}
497 592
498static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, 593static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
@@ -516,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
516 new_vblocks = se->valid_blocks + del; 611 new_vblocks = se->valid_blocks + del;
517 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); 612 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
518 613
519 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || 614 f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
520 (new_vblocks > sbi->blocks_per_seg))); 615 (new_vblocks > sbi->blocks_per_seg)));
521 616
522 se->valid_blocks = new_vblocks; 617 se->valid_blocks = new_vblocks;
@@ -526,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
526 /* Update valid block bitmap */ 621 /* Update valid block bitmap */
527 if (del > 0) { 622 if (del > 0) {
528 if (f2fs_set_bit(offset, se->cur_valid_map)) 623 if (f2fs_set_bit(offset, se->cur_valid_map))
529 BUG(); 624 f2fs_bug_on(sbi, 1);
530 } else { 625 } else {
531 if (!f2fs_clear_bit(offset, se->cur_valid_map)) 626 if (!f2fs_clear_bit(offset, se->cur_valid_map))
532 BUG(); 627 f2fs_bug_on(sbi, 1);
533 } 628 }
534 if (!f2fs_test_bit(offset, se->ckpt_valid_map)) 629 if (!f2fs_test_bit(offset, se->ckpt_valid_map))
535 se->ckpt_valid_blocks += del; 630 se->ckpt_valid_blocks += del;
@@ -558,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
558 unsigned int segno = GET_SEGNO(sbi, addr); 653 unsigned int segno = GET_SEGNO(sbi, addr);
559 struct sit_info *sit_i = SIT_I(sbi); 654 struct sit_info *sit_i = SIT_I(sbi);
560 655
561 f2fs_bug_on(addr == NULL_ADDR); 656 f2fs_bug_on(sbi, addr == NULL_ADDR);
562 if (addr == NEW_ADDR) 657 if (addr == NEW_ADDR)
563 return; 658 return;
564 659
@@ -634,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
634 unsigned int segno = curseg->segno + 1; 729 unsigned int segno = curseg->segno + 1;
635 struct free_segmap_info *free_i = FREE_I(sbi); 730 struct free_segmap_info *free_i = FREE_I(sbi);
636 731
637 if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) 732 if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
638 return !test_bit(segno, free_i->free_segmap); 733 return !test_bit(segno, free_i->free_segmap);
639 return 0; 734 return 0;
640} 735}
@@ -648,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
648{ 743{
649 struct free_segmap_info *free_i = FREE_I(sbi); 744 struct free_segmap_info *free_i = FREE_I(sbi);
650 unsigned int segno, secno, zoneno; 745 unsigned int segno, secno, zoneno;
651 unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; 746 unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
652 unsigned int hint = *newseg / sbi->segs_per_sec; 747 unsigned int hint = *newseg / sbi->segs_per_sec;
653 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); 748 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
654 unsigned int left_start = hint; 749 unsigned int left_start = hint;
@@ -660,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
660 755
661 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { 756 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
662 segno = find_next_zero_bit(free_i->free_segmap, 757 segno = find_next_zero_bit(free_i->free_segmap,
663 TOTAL_SEGS(sbi), *newseg + 1); 758 MAIN_SEGS(sbi), *newseg + 1);
664 if (segno - *newseg < sbi->segs_per_sec - 759 if (segno - *newseg < sbi->segs_per_sec -
665 (*newseg % sbi->segs_per_sec)) 760 (*newseg % sbi->segs_per_sec))
666 goto got_it; 761 goto got_it;
667 } 762 }
668find_other_zone: 763find_other_zone:
669 secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); 764 secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
670 if (secno >= TOTAL_SECS(sbi)) { 765 if (secno >= MAIN_SECS(sbi)) {
671 if (dir == ALLOC_RIGHT) { 766 if (dir == ALLOC_RIGHT) {
672 secno = find_next_zero_bit(free_i->free_secmap, 767 secno = find_next_zero_bit(free_i->free_secmap,
673 TOTAL_SECS(sbi), 0); 768 MAIN_SECS(sbi), 0);
674 f2fs_bug_on(secno >= TOTAL_SECS(sbi)); 769 f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
675 } else { 770 } else {
676 go_left = 1; 771 go_left = 1;
677 left_start = hint - 1; 772 left_start = hint - 1;
@@ -686,8 +781,8 @@ find_other_zone:
686 continue; 781 continue;
687 } 782 }
688 left_start = find_next_zero_bit(free_i->free_secmap, 783 left_start = find_next_zero_bit(free_i->free_secmap,
689 TOTAL_SECS(sbi), 0); 784 MAIN_SECS(sbi), 0);
690 f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); 785 f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
691 break; 786 break;
692 } 787 }
693 secno = left_start; 788 secno = left_start;
@@ -726,7 +821,7 @@ skip_left:
726 } 821 }
727got_it: 822got_it:
728 /* set it as dirty segment in free segmap */ 823 /* set it as dirty segment in free segmap */
729 f2fs_bug_on(test_bit(segno, free_i->free_segmap)); 824 f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
730 __set_inuse(sbi, segno); 825 __set_inuse(sbi, segno);
731 *newseg = segno; 826 *newseg = segno;
732 write_unlock(&free_i->segmap_lock); 827 write_unlock(&free_i->segmap_lock);
@@ -808,7 +903,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
808} 903}
809 904
810/* 905/*
811 * This function always allocates a used segment (from dirty seglist) by SSR 906 * This function always allocates a used segment(from dirty seglist) by SSR
812 * manner, so it should recover the existing segment information of valid blocks 907 * manner, so it should recover the existing segment information of valid blocks
813 */ 908 */
814static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) 909static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
@@ -898,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = {
898 .allocate_segment = allocate_segment_by_default, 993 .allocate_segment = allocate_segment_by_default,
899}; 994};
900 995
996int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
997{
998 __u64 start = range->start >> sbi->log_blocksize;
999 __u64 end = start + (range->len >> sbi->log_blocksize) - 1;
1000 unsigned int start_segno, end_segno;
1001 struct cp_control cpc;
1002
1003 if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
1004 range->len < sbi->blocksize)
1005 return -EINVAL;
1006
1007 if (end <= MAIN_BLKADDR(sbi))
1008 goto out;
1009
1010 /* start/end segment number in main_area */
1011 start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
1012 end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
1013 GET_SEGNO(sbi, end);
1014 cpc.reason = CP_DISCARD;
1015 cpc.trim_start = start_segno;
1016 cpc.trim_end = end_segno;
1017 cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
1018 cpc.trimmed = 0;
1019
1020 /* do checkpoint to issue discard commands safely */
1021 write_checkpoint(sbi, &cpc);
1022out:
1023 range->len = cpc.trimmed << sbi->log_blocksize;
1024 return 0;
1025}
1026
901static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 1027static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
902{ 1028{
903 struct curseg_info *curseg = CURSEG_I(sbi, type); 1029 struct curseg_info *curseg = CURSEG_I(sbi, type);
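
The new f2fs_trim_fs() above maps the byte range handed in by FITRIM onto main-area segment numbers and then drives a CP_DISCARD checkpoint, so discards are only issued against a consistent, checkpointed state. Note that on the early "goto out" path (end <= MAIN_BLKADDR) cpc.trimmed is read before it has been initialized. Below is a minimal user-space model of the range-to-segment arithmetic only; the geometry constants and names are illustrative assumptions, not the kernel's definitions.

	#include <stdint.h>
	#include <stdio.h>

	#define LOG_BLOCKSIZE       12    /* assumed: 4 KiB blocks */
	#define LOG_BLOCKS_PER_SEG   9    /* assumed: 512 blocks (2 MiB) per segment */
	#define MAIN_BLKADDR_      512    /* assumed first block of the main area */
	#define MAIN_SEGS_        1024    /* assumed number of main-area segments */

	int main(void)
	{
		/* FITRIM range: start at 16 MiB, length 64 MiB, converted to blocks */
		uint64_t start = (16ULL << 20) >> LOG_BLOCKSIZE;
		uint64_t end = start + ((64ULL << 20) >> LOG_BLOCKSIZE) - 1;

		if (end <= MAIN_BLKADDR_)
			return 0;	/* nothing inside the main area to trim */

		/* clamp the block range to main-area segments, as f2fs_trim_fs does */
		uint32_t start_segno = (start <= MAIN_BLKADDR_) ? 0 :
			(uint32_t)((start - MAIN_BLKADDR_) >> LOG_BLOCKS_PER_SEG);
		uint32_t end_segno = (uint32_t)((end - MAIN_BLKADDR_) >> LOG_BLOCKS_PER_SEG);

		if (end_segno >= MAIN_SEGS_)
			end_segno = MAIN_SEGS_ - 1;

		printf("trim segments %u..%u\n", start_segno, end_segno);
		return 0;
	}
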
@@ -953,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
953 1079
954static int __get_segment_type(struct page *page, enum page_type p_type) 1080static int __get_segment_type(struct page *page, enum page_type p_type)
955{ 1081{
956 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1082 switch (F2FS_P_SB(page)->active_logs) {
957 switch (sbi->active_logs) {
958 case 2: 1083 case 2:
959 return __get_segment_type_2(page, p_type); 1084 return __get_segment_type_2(page, p_type);
960 case 4: 1085 case 4:
961 return __get_segment_type_4(page, p_type); 1086 return __get_segment_type_4(page, p_type);
962 } 1087 }
963 /* NR_CURSEG_TYPE(6) logs by default */ 1088 /* NR_CURSEG_TYPE(6) logs by default */
964 f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); 1089 f2fs_bug_on(F2FS_P_SB(page),
1090 F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
965 return __get_segment_type_6(page, p_type); 1091 return __get_segment_type_6(page, p_type);
966} 1092}
967 1093
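
__get_segment_type() picks which active log a page is written to: the 2- and 4-log variants collapse the temperature classes, while the default six logs split data and node pages into hot/warm/cold each. For reference, the six curseg types this dispatch resolves to are defined elsewhere in segment.h (not shown in this diff) roughly as:

	enum {
		CURSEG_HOT_DATA = 0,	/* directory entry blocks */
		CURSEG_WARM_DATA,	/* regular data blocks */
		CURSEG_COLD_DATA,	/* multimedia or moved (GC'd) data blocks */
		CURSEG_HOT_NODE,	/* direct node blocks of directories */
		CURSEG_WARM_NODE,	/* direct node blocks of normal files */
		CURSEG_COLD_NODE,	/* indirect node blocks */
		NR_CURSEG_TYPE,		/* == 6, the default active_logs */
	};
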
@@ -1041,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
1041void write_data_page(struct page *page, struct dnode_of_data *dn, 1167void write_data_page(struct page *page, struct dnode_of_data *dn,
1042 block_t *new_blkaddr, struct f2fs_io_info *fio) 1168 block_t *new_blkaddr, struct f2fs_io_info *fio)
1043{ 1169{
1044 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 1170 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1045 struct f2fs_summary sum; 1171 struct f2fs_summary sum;
1046 struct node_info ni; 1172 struct node_info ni;
1047 1173
1048 f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); 1174 f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
1049 get_node_info(sbi, dn->nid, &ni); 1175 get_node_info(sbi, dn->nid, &ni);
1050 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 1176 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
1051 1177
@@ -1055,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
1055void rewrite_data_page(struct page *page, block_t old_blkaddr, 1181void rewrite_data_page(struct page *page, block_t old_blkaddr,
1056 struct f2fs_io_info *fio) 1182 struct f2fs_io_info *fio)
1057{ 1183{
1058 struct inode *inode = page->mapping->host; 1184 f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
1059 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1060 f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
1061} 1185}
1062 1186
1063void recover_data_page(struct f2fs_sb_info *sbi, 1187void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1103,55 +1227,6 @@ void recover_data_page(struct f2fs_sb_info *sbi,
1103 mutex_unlock(&curseg->curseg_mutex); 1227 mutex_unlock(&curseg->curseg_mutex);
1104} 1228}
1105 1229
1106void rewrite_node_page(struct f2fs_sb_info *sbi,
1107 struct page *page, struct f2fs_summary *sum,
1108 block_t old_blkaddr, block_t new_blkaddr)
1109{
1110 struct sit_info *sit_i = SIT_I(sbi);
1111 int type = CURSEG_WARM_NODE;
1112 struct curseg_info *curseg;
1113 unsigned int segno, old_cursegno;
1114 block_t next_blkaddr = next_blkaddr_of_node(page);
1115 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
1116 struct f2fs_io_info fio = {
1117 .type = NODE,
1118 .rw = WRITE_SYNC,
1119 };
1120
1121 curseg = CURSEG_I(sbi, type);
1122
1123 mutex_lock(&curseg->curseg_mutex);
1124 mutex_lock(&sit_i->sentry_lock);
1125
1126 segno = GET_SEGNO(sbi, new_blkaddr);
1127 old_cursegno = curseg->segno;
1128
1129 /* change the current segment */
1130 if (segno != curseg->segno) {
1131 curseg->next_segno = segno;
1132 change_curseg(sbi, type, true);
1133 }
1134 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
1135 __add_sum_entry(sbi, type, sum);
1136
1137 /* change the current log to the next block addr in advance */
1138 if (next_segno != segno) {
1139 curseg->next_segno = next_segno;
1140 change_curseg(sbi, type, true);
1141 }
1142 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
1143
1144 /* rewrite node page */
1145 set_page_writeback(page);
1146 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
1147 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1148 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
1149 locate_dirty_segment(sbi, old_cursegno);
1150
1151 mutex_unlock(&sit_i->sentry_lock);
1152 mutex_unlock(&curseg->curseg_mutex);
1153}
1154
1155static inline bool is_merged_page(struct f2fs_sb_info *sbi, 1230static inline bool is_merged_page(struct f2fs_sb_info *sbi,
1156 struct page *page, enum page_type type) 1231 struct page *page, enum page_type type)
1157{ 1232{
@@ -1179,8 +1254,9 @@ out:
1179void f2fs_wait_on_page_writeback(struct page *page, 1254void f2fs_wait_on_page_writeback(struct page *page,
1180 enum page_type type) 1255 enum page_type type)
1181{ 1256{
1182 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1183 if (PageWriteback(page)) { 1257 if (PageWriteback(page)) {
1258 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1259
1184 if (is_merged_page(sbi, page, type)) 1260 if (is_merged_page(sbi, page, type))
1185 f2fs_submit_merged_bio(sbi, type, WRITE); 1261 f2fs_submit_merged_bio(sbi, type, WRITE);
1186 wait_on_page_writeback(page); 1262 wait_on_page_writeback(page);
@@ -1449,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
1449 unsigned int segno) 1525 unsigned int segno)
1450{ 1526{
1451 struct sit_info *sit_i = SIT_I(sbi); 1527 struct sit_info *sit_i = SIT_I(sbi);
1452 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); 1528 unsigned int offset = SIT_BLOCK_OFFSET(segno);
1453 block_t blk_addr = sit_i->sit_base_addr + offset; 1529 block_t blk_addr = sit_i->sit_base_addr + offset;
1454 1530
1455 check_seg_range(sbi, segno); 1531 check_seg_range(sbi, segno);
@@ -1475,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1475 /* get current sit block page without lock */ 1551 /* get current sit block page without lock */
1476 src_page = get_meta_page(sbi, src_off); 1552 src_page = get_meta_page(sbi, src_off);
1477 dst_page = grab_meta_page(sbi, dst_off); 1553 dst_page = grab_meta_page(sbi, dst_off);
1478 f2fs_bug_on(PageDirty(src_page)); 1554 f2fs_bug_on(sbi, PageDirty(src_page));
1479 1555
1480 src_addr = page_address(src_page); 1556 src_addr = page_address(src_page);
1481 dst_addr = page_address(dst_page); 1557 dst_addr = page_address(dst_page);
@@ -1489,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1489 return dst_page; 1565 return dst_page;
1490} 1566}
1491 1567
1492static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) 1568static struct sit_entry_set *grab_sit_entry_set(void)
1569{
1570 struct sit_entry_set *ses =
1571 f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
1572
1573 ses->entry_cnt = 0;
1574 INIT_LIST_HEAD(&ses->set_list);
1575 return ses;
1576}
1577
1578static void release_sit_entry_set(struct sit_entry_set *ses)
1579{
1580 list_del(&ses->set_list);
1581 kmem_cache_free(sit_entry_set_slab, ses);
1582}
1583
1584static void adjust_sit_entry_set(struct sit_entry_set *ses,
1585 struct list_head *head)
1586{
1587 struct sit_entry_set *next = ses;
1588
1589 if (list_is_last(&ses->set_list, head))
1590 return;
1591
1592 list_for_each_entry_continue(next, head, set_list)
1593 if (ses->entry_cnt <= next->entry_cnt)
1594 break;
1595
1596 list_move_tail(&ses->set_list, &next->set_list);
1597}
1598
1599static void add_sit_entry(unsigned int segno, struct list_head *head)
1600{
1601 struct sit_entry_set *ses;
1602 unsigned int start_segno = START_SEGNO(segno);
1603
1604 list_for_each_entry(ses, head, set_list) {
1605 if (ses->start_segno == start_segno) {
1606 ses->entry_cnt++;
1607 adjust_sit_entry_set(ses, head);
1608 return;
1609 }
1610 }
1611
1612 ses = grab_sit_entry_set();
1613
1614 ses->start_segno = start_segno;
1615 ses->entry_cnt++;
1616 list_add(&ses->set_list, head);
1617}
1618
1619static void add_sits_in_set(struct f2fs_sb_info *sbi)
1620{
1621 struct f2fs_sm_info *sm_info = SM_I(sbi);
1622 struct list_head *set_list = &sm_info->sit_entry_set;
1623 unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
1624 unsigned int segno;
1625
1626 for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
1627 add_sit_entry(segno, set_list);
1628}
1629
1630static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
1493{ 1631{
1494 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1632 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1495 struct f2fs_summary_block *sum = curseg->sum_blk; 1633 struct f2fs_summary_block *sum = curseg->sum_blk;
1496 int i; 1634 int i;
1497 1635
1498 /* 1636 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
1499 * If the journal area in the current summary is full of sit entries, 1637 unsigned int segno;
1500 * all the sit entries will be flushed. Otherwise the sit entries 1638 bool dirtied;
1501 * are not able to replace with newly hot sit entries. 1639
1502 */ 1640 segno = le32_to_cpu(segno_in_journal(sum, i));
1503 if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { 1641 dirtied = __mark_sit_entry_dirty(sbi, segno);
1504 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { 1642
1505 unsigned int segno; 1643 if (!dirtied)
1506 segno = le32_to_cpu(segno_in_journal(sum, i)); 1644 add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
1507 __mark_sit_entry_dirty(sbi, segno);
1508 }
1509 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1510 return true;
1511 } 1645 }
1512 return false; 1646 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1513} 1647}
1514 1648
1515/* 1649/*
1516 * CP calls this function, which flushes SIT entries including sit_journal, 1650 * CP calls this function, which flushes SIT entries including sit_journal,
1517 * and moves prefree segs to free segs. 1651 * and moves prefree segs to free segs.
1518 */ 1652 */
1519void flush_sit_entries(struct f2fs_sb_info *sbi) 1653void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1520{ 1654{
1521 struct sit_info *sit_i = SIT_I(sbi); 1655 struct sit_info *sit_i = SIT_I(sbi);
1522 unsigned long *bitmap = sit_i->dirty_sentries_bitmap; 1656 unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
1523 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1657 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1524 struct f2fs_summary_block *sum = curseg->sum_blk; 1658 struct f2fs_summary_block *sum = curseg->sum_blk;
1525 unsigned long nsegs = TOTAL_SEGS(sbi); 1659 struct sit_entry_set *ses, *tmp;
1526 struct page *page = NULL; 1660 struct list_head *head = &SM_I(sbi)->sit_entry_set;
1527 struct f2fs_sit_block *raw_sit = NULL; 1661 bool to_journal = true;
1528 unsigned int start = 0, end = 0; 1662 struct seg_entry *se;
1529 unsigned int segno;
1530 bool flushed;
1531 1663
1532 mutex_lock(&curseg->curseg_mutex); 1664 mutex_lock(&curseg->curseg_mutex);
1533 mutex_lock(&sit_i->sentry_lock); 1665 mutex_lock(&sit_i->sentry_lock);
1534 1666
1535 /* 1667 /*
1536 * "flushed" indicates whether sit entries in journal are flushed 1668 * add and account sit entries of dirty bitmap in sit entry
1537 * to the SIT area or not. 1669 * set temporarily
1538 */ 1670 */
1539 flushed = flush_sits_in_journal(sbi); 1671 add_sits_in_set(sbi);
1540 1672
1541 for_each_set_bit(segno, bitmap, nsegs) { 1673 /*
1542 struct seg_entry *se = get_seg_entry(sbi, segno); 1674 * if there is not enough space in the journal to store dirty sit
1543 int sit_offset, offset; 1675 * entries, remove all entries from the journal and add and account
1676 * them in the sit entry set.
1677 */
1678 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
1679 remove_sits_in_journal(sbi);
1544 1680
1545 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 1681 if (!sit_i->dirty_sentries)
1682 goto out;
1546 1683
1547 /* add discard candidates */ 1684 /*
1548 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) 1685 * there are two steps to flush sit entries:
1549 add_discard_addrs(sbi, segno, se); 1686 * #1, flush sit entries to journal in current cold data summary block.
1687 * #2, flush sit entries to sit page.
1688 */
1689 list_for_each_entry_safe(ses, tmp, head, set_list) {
1690 struct page *page;
1691 struct f2fs_sit_block *raw_sit = NULL;
1692 unsigned int start_segno = ses->start_segno;
1693 unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
1694 (unsigned long)MAIN_SEGS(sbi));
1695 unsigned int segno = start_segno;
1696
1697 if (to_journal &&
1698 !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
1699 to_journal = false;
1700
1701 if (!to_journal) {
1702 page = get_next_sit_page(sbi, start_segno);
1703 raw_sit = page_address(page);
1704 }
1550 1705
1551 if (flushed) 1706 /* flush dirty sit entries in region of current sit set */
1552 goto to_sit_page; 1707 for_each_set_bit_from(segno, bitmap, end) {
1708 int offset, sit_offset;
1553 1709
1554 offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); 1710 se = get_seg_entry(sbi, segno);
1555 if (offset >= 0) { 1711
1556 segno_in_journal(sum, offset) = cpu_to_le32(segno); 1712 /* add discard candidates */
1557 seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); 1713 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
1558 goto flush_done; 1714 cpc->trim_start = segno;
1559 } 1715 add_discard_addrs(sbi, cpc);
1560to_sit_page:
1561 if (!page || (start > segno) || (segno > end)) {
1562 if (page) {
1563 f2fs_put_page(page, 1);
1564 page = NULL;
1565 } 1716 }
1566 1717
1567 start = START_SEGNO(sit_i, segno); 1718 if (to_journal) {
1568 end = start + SIT_ENTRY_PER_BLOCK - 1; 1719 offset = lookup_journal_in_cursum(sum,
1720 SIT_JOURNAL, segno, 1);
1721 f2fs_bug_on(sbi, offset < 0);
1722 segno_in_journal(sum, offset) =
1723 cpu_to_le32(segno);
1724 seg_info_to_raw_sit(se,
1725 &sit_in_journal(sum, offset));
1726 } else {
1727 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1728 seg_info_to_raw_sit(se,
1729 &raw_sit->entries[sit_offset]);
1730 }
1569 1731
1570 /* read sit block that will be updated */ 1732 __clear_bit(segno, bitmap);
1571 page = get_next_sit_page(sbi, start); 1733 sit_i->dirty_sentries--;
1572 raw_sit = page_address(page); 1734 ses->entry_cnt--;
1573 } 1735 }
1574 1736
1575 /* update entry in SIT block */ 1737 if (!to_journal)
1576 seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); 1738 f2fs_put_page(page, 1);
1577flush_done: 1739
1578 __clear_bit(segno, bitmap); 1740 f2fs_bug_on(sbi, ses->entry_cnt);
1579 sit_i->dirty_sentries--; 1741 release_sit_entry_set(ses);
1742 }
1743
1744 f2fs_bug_on(sbi, !list_empty(head));
1745 f2fs_bug_on(sbi, sit_i->dirty_sentries);
1746out:
1747 if (cpc->reason == CP_DISCARD) {
1748 for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
1749 add_discard_addrs(sbi, cpc);
1580 } 1750 }
1581 mutex_unlock(&sit_i->sentry_lock); 1751 mutex_unlock(&sit_i->sentry_lock);
1582 mutex_unlock(&curseg->curseg_mutex); 1752 mutex_unlock(&curseg->curseg_mutex);
1583 1753
1584 /* writeout last modified SIT block */
1585 f2fs_put_page(page, 1);
1586
1587 set_prefree_as_free_segments(sbi); 1754 set_prefree_as_free_segments(sbi);
1588} 1755}
1589 1756
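
The reworked flush path is easiest to read as a two-pass split: dirty SIT entries are grouped into per-SIT-block sets kept sorted by ascending entry count, the smallest sets are flushed into the SIT journal of the cold-data summary block while they still fit, and once a set no longer fits every remaining set goes to SIT pages. A minimal user-space model of that journal-first split, with an assumed journal capacity:

	#include <stdio.h>

	int main(void)
	{
		/* per-set entry counts, ascending as adjust_sit_entry_set keeps them */
		int sets[] = { 1, 2, 4, 9, 30 };
		int journal_room = 8;	/* assumed journal capacity, illustrative only */
		int to_journal = 1;

		for (int i = 0; i < 5; i++) {
			/* one-way flag: once a set spills to SIT pages, all later sets do */
			if (to_journal && sets[i] > journal_room)
				to_journal = 0;
			if (to_journal)
				journal_room -= sets[i];
			printf("set of %2d entries -> %s\n", sets[i],
					to_journal ? "journal" : "sit page");
		}
		return 0;
	}
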
@@ -1603,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1603 1770
1604 SM_I(sbi)->sit_info = sit_i; 1771 SM_I(sbi)->sit_info = sit_i;
1605 1772
1606 sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); 1773 sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
1607 if (!sit_i->sentries) 1774 if (!sit_i->sentries)
1608 return -ENOMEM; 1775 return -ENOMEM;
1609 1776
1610 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1777 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1611 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); 1778 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
1612 if (!sit_i->dirty_sentries_bitmap) 1779 if (!sit_i->dirty_sentries_bitmap)
1613 return -ENOMEM; 1780 return -ENOMEM;
1614 1781
1615 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1782 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1616 sit_i->sentries[start].cur_valid_map 1783 sit_i->sentries[start].cur_valid_map
1617 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); 1784 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1618 sit_i->sentries[start].ckpt_valid_map 1785 sit_i->sentries[start].ckpt_valid_map
@@ -1623,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1623 } 1790 }
1624 1791
1625 if (sbi->segs_per_sec > 1) { 1792 if (sbi->segs_per_sec > 1) {
1626 sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * 1793 sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
1627 sizeof(struct sec_entry)); 1794 sizeof(struct sec_entry));
1628 if (!sit_i->sec_entries) 1795 if (!sit_i->sec_entries)
1629 return -ENOMEM; 1796 return -ENOMEM;
@@ -1658,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1658 1825
1659static int build_free_segmap(struct f2fs_sb_info *sbi) 1826static int build_free_segmap(struct f2fs_sb_info *sbi)
1660{ 1827{
1661 struct f2fs_sm_info *sm_info = SM_I(sbi);
1662 struct free_segmap_info *free_i; 1828 struct free_segmap_info *free_i;
1663 unsigned int bitmap_size, sec_bitmap_size; 1829 unsigned int bitmap_size, sec_bitmap_size;
1664 1830
@@ -1669,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1669 1835
1670 SM_I(sbi)->free_info = free_i; 1836 SM_I(sbi)->free_info = free_i;
1671 1837
1672 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1838 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1673 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); 1839 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
1674 if (!free_i->free_segmap) 1840 if (!free_i->free_segmap)
1675 return -ENOMEM; 1841 return -ENOMEM;
1676 1842
1677 sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); 1843 sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
1678 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); 1844 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
1679 if (!free_i->free_secmap) 1845 if (!free_i->free_secmap)
1680 return -ENOMEM; 1846 return -ENOMEM;
@@ -1684,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1684 memset(free_i->free_secmap, 0xff, sec_bitmap_size); 1850 memset(free_i->free_secmap, 0xff, sec_bitmap_size);
1685 1851
1686 /* init free segmap information */ 1852 /* init free segmap information */
1687 free_i->start_segno = 1853 free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
1688 (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
1689 free_i->free_segments = 0; 1854 free_i->free_segments = 0;
1690 free_i->free_sections = 0; 1855 free_i->free_sections = 0;
1691 rwlock_init(&free_i->segmap_lock); 1856 rwlock_init(&free_i->segmap_lock);
@@ -1722,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1722 int sit_blk_cnt = SIT_BLK_CNT(sbi); 1887 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1723 unsigned int i, start, end; 1888 unsigned int i, start, end;
1724 unsigned int readed, start_blk = 0; 1889 unsigned int readed, start_blk = 0;
1725 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1890 int nrpages = MAX_BIO_BLOCKS(sbi);
1726 1891
1727 do { 1892 do {
1728 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); 1893 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
@@ -1730,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1730 start = start_blk * sit_i->sents_per_block; 1895 start = start_blk * sit_i->sents_per_block;
1731 end = (start_blk + readed) * sit_i->sents_per_block; 1896 end = (start_blk + readed) * sit_i->sents_per_block;
1732 1897
1733 for (; start < end && start < TOTAL_SEGS(sbi); start++) { 1898 for (; start < end && start < MAIN_SEGS(sbi); start++) {
1734 struct seg_entry *se = &sit_i->sentries[start]; 1899 struct seg_entry *se = &sit_i->sentries[start];
1735 struct f2fs_sit_block *sit_blk; 1900 struct f2fs_sit_block *sit_blk;
1736 struct f2fs_sit_entry sit; 1901 struct f2fs_sit_entry sit;
@@ -1768,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
1768 unsigned int start; 1933 unsigned int start;
1769 int type; 1934 int type;
1770 1935
1771 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1936 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1772 struct seg_entry *sentry = get_seg_entry(sbi, start); 1937 struct seg_entry *sentry = get_seg_entry(sbi, start);
1773 if (!sentry->valid_blocks) 1938 if (!sentry->valid_blocks)
1774 __set_free(sbi, start); 1939 __set_free(sbi, start);
@@ -1785,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1785{ 1950{
1786 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1951 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1787 struct free_segmap_info *free_i = FREE_I(sbi); 1952 struct free_segmap_info *free_i = FREE_I(sbi);
1788 unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); 1953 unsigned int segno = 0, offset = 0;
1789 unsigned short valid_blocks; 1954 unsigned short valid_blocks;
1790 1955
1791 while (1) { 1956 while (1) {
1792 /* find dirty segment based on free segmap */ 1957 /* find dirty segment based on free segmap */
1793 segno = find_next_inuse(free_i, total_segs, offset); 1958 segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
1794 if (segno >= total_segs) 1959 if (segno >= MAIN_SEGS(sbi))
1795 break; 1960 break;
1796 offset = segno + 1; 1961 offset = segno + 1;
1797 valid_blocks = get_valid_blocks(sbi, segno, 0); 1962 valid_blocks = get_valid_blocks(sbi, segno, 0);
1798 if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) 1963 if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
1964 continue;
1965 if (valid_blocks > sbi->blocks_per_seg) {
1966 f2fs_bug_on(sbi, 1);
1799 continue; 1967 continue;
1968 }
1800 mutex_lock(&dirty_i->seglist_lock); 1969 mutex_lock(&dirty_i->seglist_lock);
1801 __locate_dirty_segment(sbi, segno, DIRTY); 1970 __locate_dirty_segment(sbi, segno, DIRTY);
1802 mutex_unlock(&dirty_i->seglist_lock); 1971 mutex_unlock(&dirty_i->seglist_lock);
@@ -1806,7 +1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1806static int init_victim_secmap(struct f2fs_sb_info *sbi) 1975static int init_victim_secmap(struct f2fs_sb_info *sbi)
1807{ 1976{
1808 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1977 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1809 unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); 1978 unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
1810 1979
1811 dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); 1980 dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
1812 if (!dirty_i->victim_secmap) 1981 if (!dirty_i->victim_secmap)
@@ -1827,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
1827 SM_I(sbi)->dirty_info = dirty_i; 1996 SM_I(sbi)->dirty_info = dirty_i;
1828 mutex_init(&dirty_i->seglist_lock); 1997 mutex_init(&dirty_i->seglist_lock);
1829 1998
1830 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1999 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1831 2000
1832 for (i = 0; i < NR_DIRTY_TYPE; i++) { 2001 for (i = 0; i < NR_DIRTY_TYPE; i++) {
1833 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); 2002 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
@@ -1851,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
1851 2020
1852 sit_i->min_mtime = LLONG_MAX; 2021 sit_i->min_mtime = LLONG_MAX;
1853 2022
1854 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { 2023 for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
1855 unsigned int i; 2024 unsigned int i;
1856 unsigned long long mtime = 0; 2025 unsigned long long mtime = 0;
1857 2026
@@ -1889,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1889 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 2058 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1890 sm_info->rec_prefree_segments = sm_info->main_segments * 2059 sm_info->rec_prefree_segments = sm_info->main_segments *
1891 DEF_RECLAIM_PREFREE_SEGMENTS / 100; 2060 DEF_RECLAIM_PREFREE_SEGMENTS / 100;
1892 sm_info->ipu_policy = F2FS_IPU_DISABLE; 2061 sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
1893 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; 2062 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
2063 sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
1894 2064
1895 INIT_LIST_HEAD(&sm_info->discard_list); 2065 INIT_LIST_HEAD(&sm_info->discard_list);
1896 sm_info->nr_discards = 0; 2066 sm_info->nr_discards = 0;
1897 sm_info->max_discards = 0; 2067 sm_info->max_discards = 0;
1898 2068
2069 INIT_LIST_HEAD(&sm_info->sit_entry_set);
2070
1899 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { 2071 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
1900 err = create_flush_cmd_control(sbi); 2072 err = create_flush_cmd_control(sbi);
1901 if (err) 2073 if (err)
@@ -1991,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
1991 return; 2163 return;
1992 2164
1993 if (sit_i->sentries) { 2165 if (sit_i->sentries) {
1994 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 2166 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1995 kfree(sit_i->sentries[start].cur_valid_map); 2167 kfree(sit_i->sentries[start].cur_valid_map);
1996 kfree(sit_i->sentries[start].ckpt_valid_map); 2168 kfree(sit_i->sentries[start].ckpt_valid_map);
1997 } 2169 }
@@ -2025,11 +2197,30 @@ int __init create_segment_manager_caches(void)
2025 discard_entry_slab = f2fs_kmem_cache_create("discard_entry", 2197 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
2026 sizeof(struct discard_entry)); 2198 sizeof(struct discard_entry));
2027 if (!discard_entry_slab) 2199 if (!discard_entry_slab)
2028 return -ENOMEM; 2200 goto fail;
2201
2202 sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
2203 sizeof(struct sit_entry_set));
2204 if (!sit_entry_set_slab)
2205 goto destroy_discard_entry;
2206
2207 inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
2208 sizeof(struct inmem_pages));
2209 if (!inmem_entry_slab)
2210 goto destroy_sit_entry_set;
2029 return 0; 2211 return 0;
2212
2213destroy_sit_entry_set:
2214 kmem_cache_destroy(sit_entry_set_slab);
2215destroy_discard_entry:
2216 kmem_cache_destroy(discard_entry_slab);
2217fail:
2218 return -ENOMEM;
2030} 2219}
2031 2220
2032void destroy_segment_manager_caches(void) 2221void destroy_segment_manager_caches(void)
2033{ 2222{
2223 kmem_cache_destroy(sit_entry_set_slab);
2034 kmem_cache_destroy(discard_entry_slab); 2224 kmem_cache_destroy(discard_entry_slab);
2225 kmem_cache_destroy(inmem_entry_slab);
2035} 2226}
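
create_segment_manager_caches() above grows from one slab to three, so its error path switches from a bare return to a goto-unwind ladder: each label releases exactly what was allocated before the failing step, in reverse order. The same idiom as a self-contained sketch (names and sizes are illustrative):

	#include <stdlib.h>

	static void *a, *b, *c;

	static int create_three_caches(void)
	{
		a = malloc(16);
		if (!a)
			goto fail;
		b = malloc(16);
		if (!b)
			goto free_a;
		c = malloc(16);
		if (!c)
			goto free_b;
		return 0;	/* caller owns a, b, c on success */

	free_b:
		free(b);
	free_a:
		free(a);
	fail:
		return -1;	/* -ENOMEM in the kernel */
	}

	int main(void)
	{
		return create_three_caches() ? 1 : 0;
	}
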
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 55973f7b0330..2495bec1c621 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -45,16 +45,26 @@
45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ 45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
46 sbi->segs_per_sec)) \ 46 sbi->segs_per_sec)) \
47 47
48#define START_BLOCK(sbi, segno) \ 48#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
49 (SM_I(sbi)->seg0_blkaddr + \ 49#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
50
51#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
52#define MAIN_SECS(sbi) (sbi->total_sections)
53
54#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
55#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
56
57#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
58#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
59 sbi->log_blocks_per_seg))
60
61#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
50 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) 62 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
63
51#define NEXT_FREE_BLKADDR(sbi, curseg) \ 64#define NEXT_FREE_BLKADDR(sbi, curseg) \
52 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) 65 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
53 66
54#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr) 67#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
55
56#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ 68#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) 69 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
60#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ 70#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
@@ -77,23 +87,21 @@
77 87
78#define SIT_ENTRY_OFFSET(sit_i, segno) \ 88#define SIT_ENTRY_OFFSET(sit_i, segno) \
79 (segno % sit_i->sents_per_block) 89 (segno % sit_i->sents_per_block)
80#define SIT_BLOCK_OFFSET(sit_i, segno) \ 90#define SIT_BLOCK_OFFSET(segno) \
81 (segno / SIT_ENTRY_PER_BLOCK) 91 (segno / SIT_ENTRY_PER_BLOCK)
82#define START_SEGNO(sit_i, segno) \ 92#define START_SEGNO(segno) \
83 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) 93 (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
84#define SIT_BLK_CNT(sbi) \ 94#define SIT_BLK_CNT(sbi) \
85 ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) 95 ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
86#define f2fs_bitmap_size(nr) \ 96#define f2fs_bitmap_size(nr) \
87 (BITS_TO_LONGS(nr) * sizeof(unsigned long)) 97 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
88#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
89#define TOTAL_SECS(sbi) (sbi->total_sections)
90 98
91#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ 99#define SECTOR_FROM_BLOCK(blk_addr) \
92 (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) 100 (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
93#define SECTOR_TO_BLOCK(sbi, sectors) \ 101#define SECTOR_TO_BLOCK(sectors) \
94 (sectors >> (sbi)->log_sectors_per_block) 102 (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
95#define MAX_BIO_BLOCKS(max_hw_blocks) \ 103#define MAX_BIO_BLOCKS(sbi) \
96 (min((int)max_hw_blocks, BIO_MAX_PAGES)) 104 ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
97 105
98/* 106/*
99 * indicate a block allocation direction: RIGHT and LEFT. 107 * indicate a block allocation direction: RIGHT and LEFT.
@@ -167,6 +175,11 @@ struct segment_allocation {
167 void (*allocate_segment)(struct f2fs_sb_info *, int, bool); 175 void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
168}; 176};
169 177
178struct inmem_pages {
179 struct list_head list;
180 struct page *page;
181};
182
170struct sit_info { 183struct sit_info {
171 const struct segment_allocation *s_ops; 184 const struct segment_allocation *s_ops;
172 185
@@ -237,6 +250,12 @@ struct curseg_info {
237 unsigned int next_segno; /* preallocated segment */ 250 unsigned int next_segno; /* preallocated segment */
238}; 251};
239 252
253struct sit_entry_set {
254 struct list_head set_list; /* link with all sit sets */
255 unsigned int start_segno; /* start segno of sits in set */
256 unsigned int entry_cnt; /* the # of sit entries in set */
257};
258
240/* 259/*
241 * inline functions 260 * inline functions
242 */ 261 */
@@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
316 clear_bit(segno, free_i->free_segmap); 335 clear_bit(segno, free_i->free_segmap);
317 free_i->free_segments++; 336 free_i->free_segments++;
318 337
319 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno); 338 next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
320 if (next >= start_segno + sbi->segs_per_sec) { 339 if (next >= start_segno + sbi->segs_per_sec) {
321 clear_bit(secno, free_i->free_secmap); 340 clear_bit(secno, free_i->free_secmap);
322 free_i->free_sections++; 341 free_i->free_sections++;
@@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
430 449
431static inline bool need_SSR(struct f2fs_sb_info *sbi) 450static inline bool need_SSR(struct f2fs_sb_info *sbi)
432{ 451{
433 return (prefree_segments(sbi) / sbi->segs_per_sec) 452 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
434 + free_sections(sbi) < overprovision_sections(sbi); 453 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
454 return free_sections(sbi) <= (node_secs + 2 * dent_secs +
455 reserved_sections(sbi) + 1);
435} 456}
436 457
437static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 458static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -466,48 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi)
466 * F2FS_IPU_UTIL - if FS utilization is over threshold, 487 * F2FS_IPU_UTIL - if FS utilization is over threshold,
467 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over 488 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
468 * threshold, 489 * threshold,
490 * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
491 * storages. IPU will be triggered only if the # of dirty
492 * pages exceeds min_fsync_blocks.
469 * F2FS_IPU_DISABLE - disable IPU. (=default option) 493 * F2FS_IPU_DISABLE - disable IPU. (=default option)
470 */ 494 */
471#define DEF_MIN_IPU_UTIL 70 495#define DEF_MIN_IPU_UTIL 70
496#define DEF_MIN_FSYNC_BLOCKS 8
472 497
473enum { 498enum {
474 F2FS_IPU_FORCE, 499 F2FS_IPU_FORCE,
475 F2FS_IPU_SSR, 500 F2FS_IPU_SSR,
476 F2FS_IPU_UTIL, 501 F2FS_IPU_UTIL,
477 F2FS_IPU_SSR_UTIL, 502 F2FS_IPU_SSR_UTIL,
478 F2FS_IPU_DISABLE, 503 F2FS_IPU_FSYNC,
479}; 504};
480 505
481static inline bool need_inplace_update(struct inode *inode) 506static inline bool need_inplace_update(struct inode *inode)
482{ 507{
483 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 508 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
509 unsigned int policy = SM_I(sbi)->ipu_policy;
484 510
485 /* IPU can be done only for the user data */ 511 /* IPU can be done only for the user data */
486 if (S_ISDIR(inode->i_mode)) 512 if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
487 return false; 513 return false;
488 514
489 /* this is only set during fdatasync */ 515 if (policy & (0x1 << F2FS_IPU_FORCE))
490 if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) 516 return true;
517 if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
518 return true;
519 if (policy & (0x1 << F2FS_IPU_UTIL) &&
520 utilization(sbi) > SM_I(sbi)->min_ipu_util)
521 return true;
522 if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
523 utilization(sbi) > SM_I(sbi)->min_ipu_util)
491 return true; 524 return true;
492 525
493 switch (SM_I(sbi)->ipu_policy) { 526 /* this is only set during fdatasync */
494 case F2FS_IPU_FORCE: 527 if (policy & (0x1 << F2FS_IPU_FSYNC) &&
528 is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
495 return true; 529 return true;
496 case F2FS_IPU_SSR: 530
497 if (need_SSR(sbi))
498 return true;
499 break;
500 case F2FS_IPU_UTIL:
501 if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
502 return true;
503 break;
504 case F2FS_IPU_SSR_UTIL:
505 if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
506 return true;
507 break;
508 case F2FS_IPU_DISABLE:
509 break;
510 }
511 return false; 531 return false;
512} 532}
513 533
@@ -534,28 +554,21 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
534#ifdef CONFIG_F2FS_CHECK_FS 554#ifdef CONFIG_F2FS_CHECK_FS
535static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) 555static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
536{ 556{
537 unsigned int end_segno = SM_I(sbi)->segment_count - 1; 557 BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
538 BUG_ON(segno > end_segno);
539} 558}
540 559
541static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) 560static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
542{ 561{
543 struct f2fs_sm_info *sm_info = SM_I(sbi); 562 BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
544 block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg; 563 BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
545 block_t start_addr = sm_info->seg0_blkaddr;
546 block_t end_addr = start_addr + total_blks - 1;
547 BUG_ON(blk_addr < start_addr);
548 BUG_ON(blk_addr > end_addr);
549} 564}
550 565
551/* 566/*
552 * Summary block is always treated as invalid block 567 * Summary block is always treated as an invalid block
553 */ 568 */
554static inline void check_block_count(struct f2fs_sb_info *sbi, 569static inline void check_block_count(struct f2fs_sb_info *sbi,
555 int segno, struct f2fs_sit_entry *raw_sit) 570 int segno, struct f2fs_sit_entry *raw_sit)
556{ 571{
557 struct f2fs_sm_info *sm_info = SM_I(sbi);
558 unsigned int end_segno = sm_info->segment_count - 1;
559 bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; 572 bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
560 int valid_blocks = 0; 573 int valid_blocks = 0;
561 int cur_pos = 0, next_pos; 574 int cur_pos = 0, next_pos;
@@ -564,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
564 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); 577 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
565 578
566 /* check boundary of a given segment number */ 579 /* check boundary of a given segment number */
567 BUG_ON(segno > end_segno); 580 BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
568 581
569 /* check bitmap with valid block count */ 582 /* check bitmap with valid block count */
570 do { 583 do {
@@ -583,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
583 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); 596 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
584} 597}
585#else 598#else
586#define check_seg_range(sbi, segno) 599static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
587#define verify_block_addr(sbi, blk_addr) 600{
588#define check_block_count(sbi, segno, raw_sit) 601 if (segno > TOTAL_SEGS(sbi) - 1)
602 sbi->need_fsck = true;
603}
604
605static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
606{
607 if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
608 sbi->need_fsck = true;
609}
610
611/*
612 * Summary block is always treated as an invalid block
613 */
614static inline void check_block_count(struct f2fs_sb_info *sbi,
615 int segno, struct f2fs_sit_entry *raw_sit)
616{
617 /* check segment usage */
618 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
619 sbi->need_fsck = true;
620
621 /* check boundary of a given segment number */
622 if (segno > TOTAL_SEGS(sbi) - 1)
623 sbi->need_fsck = true;
624}
589#endif 625#endif
590 626
591static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, 627static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
592 unsigned int start) 628 unsigned int start)
593{ 629{
594 struct sit_info *sit_i = SIT_I(sbi); 630 struct sit_info *sit_i = SIT_I(sbi);
595 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start); 631 unsigned int offset = SIT_BLOCK_OFFSET(start);
596 block_t blk_addr = sit_i->sit_base_addr + offset; 632 block_t blk_addr = sit_i->sit_base_addr + offset;
597 633
598 check_seg_range(sbi, start); 634 check_seg_range(sbi, start);
@@ -619,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
619 655
620static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) 656static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
621{ 657{
622 unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start); 658 unsigned int block_off = SIT_BLOCK_OFFSET(start);
623 659
624 if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) 660 if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
625 f2fs_clear_bit(block_off, sit_i->sit_bitmap); 661 f2fs_clear_bit(block_off, sit_i->sit_bitmap);
@@ -666,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
666{ 702{
667 struct block_device *bdev = sbi->sb->s_bdev; 703 struct block_device *bdev = sbi->sb->s_bdev;
668 struct request_queue *q = bdev_get_queue(bdev); 704 struct request_queue *q = bdev_get_queue(bdev);
669 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); 705 return SECTOR_TO_BLOCK(queue_max_sectors(q));
670} 706}
671 707
672/* 708/*
@@ -683,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
683 else if (type == NODE) 719 else if (type == NODE)
684 return 3 * sbi->blocks_per_seg; 720 return 3 * sbi->blocks_per_seg;
685 else if (type == META) 721 else if (type == META)
686 return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 722 return MAX_BIO_BLOCKS(sbi);
687 else 723 else
688 return 0; 724 return 0;
689} 725}
@@ -706,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
706 else if (type == NODE) 742 else if (type == NODE)
707 desired = 3 * max_hw_blocks(sbi); 743 desired = 3 * max_hw_blocks(sbi);
708 else 744 else
709 desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 745 desired = MAX_BIO_BLOCKS(sbi);
710 746
711 wbc->nr_to_write = desired; 747 wbc->nr_to_write = desired;
712 return desired - nr_to_write; 748 return desired - nr_to_write;
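
The need_inplace_update() rewrite above also changes the contract of the ipu_policy sysfs knob: it was a single enum value consumed by a switch, and is now a bitmask, so several IPU policies can be enabled at once (and a value of 0 effectively disables IPU, replacing the old F2FS_IPU_DISABLE member). A sketch of the new encoding, using the enum order from this diff:

	#include <stdio.h>

	enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, F2FS_IPU_UTIL,
	       F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC };

	int main(void)
	{
		/* e.g. "echo 18 > .../ipu_policy" sets SSR | FSYNC (2 + 16) */
		unsigned int policy = (1 << F2FS_IPU_SSR) | (1 << F2FS_IPU_FSYNC);

		printf("force: %d\n", !!(policy & (1 << F2FS_IPU_FORCE)));
		printf("ssr:   %d\n", !!(policy & (1 << F2FS_IPU_SSR)));
		printf("fsync: %d\n", !!(policy & (1 << F2FS_IPU_FSYNC)));
		return 0;
	}
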
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 657582fc7601..41d6f700f4ee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -190,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
191F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 191F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
192F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 192F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
193F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
193F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); 194F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
194F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 195F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
195F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); 196F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -204,6 +205,7 @@ static struct attribute *f2fs_attrs[] = {
204 ATTR_LIST(max_small_discards), 205 ATTR_LIST(max_small_discards),
205 ATTR_LIST(ipu_policy), 206 ATTR_LIST(ipu_policy),
206 ATTR_LIST(min_ipu_util), 207 ATTR_LIST(min_ipu_util),
208 ATTR_LIST(min_fsync_blocks),
207 ATTR_LIST(max_victim_search), 209 ATTR_LIST(max_victim_search),
208 ATTR_LIST(dir_level), 210 ATTR_LIST(dir_level),
209 ATTR_LIST(ram_thresh), 211 ATTR_LIST(ram_thresh),
@@ -366,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
366 368
367 /* Initialize f2fs-specific inode info */ 369 /* Initialize f2fs-specific inode info */
368 fi->vfs_inode.i_version = 1; 370 fi->vfs_inode.i_version = 1;
369 atomic_set(&fi->dirty_dents, 0); 371 atomic_set(&fi->dirty_pages, 0);
370 fi->i_current_depth = 1; 372 fi->i_current_depth = 1;
371 fi->i_advise = 0; 373 fi->i_advise = 0;
372 rwlock_init(&fi->ext.ext_lock); 374 rwlock_init(&fi->ext.ext_lock);
373 init_rwsem(&fi->i_sem); 375 init_rwsem(&fi->i_sem);
376 INIT_LIST_HEAD(&fi->inmem_pages);
377 mutex_init(&fi->inmem_lock);
374 378
375 set_inode_flag(fi, FI_NEW_INODE); 379 set_inode_flag(fi, FI_NEW_INODE);
376 380
@@ -432,8 +436,19 @@ static void f2fs_put_super(struct super_block *sb)
432 stop_gc_thread(sbi); 436 stop_gc_thread(sbi);
433 437
434 /* We don't need to do checkpoint when it's clean */ 438 /* We don't need to do checkpoint when it's clean */
435 if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) 439 if (sbi->s_dirty) {
436 write_checkpoint(sbi, true); 440 struct cp_control cpc = {
441 .reason = CP_UMOUNT,
442 };
443 write_checkpoint(sbi, &cpc);
444 }
445
446 /*
447 * normally superblock is clean, so we need to release this.
448 * In addition, when EIO occurs the checkpoint is skipped, so we need this as well.
449 */
450 release_dirty_inode(sbi);
451 release_discard_addrs(sbi);
437 452
438 iput(sbi->node_inode); 453 iput(sbi->node_inode);
439 iput(sbi->meta_inode); 454 iput(sbi->meta_inode);
@@ -457,12 +472,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
457 472
458 trace_f2fs_sync_fs(sb, sync); 473 trace_f2fs_sync_fs(sb, sync);
459 474
460 if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
461 return 0;
462
463 if (sync) { 475 if (sync) {
476 struct cp_control cpc = {
477 .reason = CP_SYNC,
478 };
464 mutex_lock(&sbi->gc_mutex); 479 mutex_lock(&sbi->gc_mutex);
465 write_checkpoint(sbi, false); 480 write_checkpoint(sbi, &cpc);
466 mutex_unlock(&sbi->gc_mutex); 481 mutex_unlock(&sbi->gc_mutex);
467 } else { 482 } else {
468 f2fs_balance_fs(sbi); 483 f2fs_balance_fs(sbi);
@@ -505,8 +520,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
505 buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; 520 buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
506 buf->f_bavail = user_block_count - valid_user_blocks(sbi); 521 buf->f_bavail = user_block_count - valid_user_blocks(sbi);
507 522
508 buf->f_files = sbi->total_node_count; 523 buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
509 buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); 524 buf->f_ffree = buf->f_files - valid_inode_count(sbi);
510 525
511 buf->f_namelen = F2FS_NAME_LEN; 526 buf->f_namelen = F2FS_NAME_LEN;
512 buf->f_fsid.val[0] = (u32)id; 527 buf->f_fsid.val[0] = (u32)id;
@@ -613,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
613 org_mount_opt = sbi->mount_opt; 628 org_mount_opt = sbi->mount_opt;
614 active_logs = sbi->active_logs; 629 active_logs = sbi->active_logs;
615 630
631 sbi->mount_opt.opt = 0;
632 sbi->active_logs = NR_CURSEG_TYPE;
633
616 /* parse mount options */ 634 /* parse mount options */
617 err = parse_options(sb, data); 635 err = parse_options(sb, data);
618 if (err) 636 if (err)
@@ -663,7 +681,7 @@ restore_gc:
663 if (need_restart_gc) { 681 if (need_restart_gc) {
664 if (start_gc_thread(sbi)) 682 if (start_gc_thread(sbi))
665 f2fs_msg(sbi->sb, KERN_WARNING, 683 f2fs_msg(sbi->sb, KERN_WARNING,
666 "background gc thread is stop"); 684 "background gc thread has stopped");
667 } else if (need_stop_gc) { 685 } else if (need_stop_gc) {
668 stop_gc_thread(sbi); 686 stop_gc_thread(sbi);
669 } 687 }
@@ -783,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb,
783 return 1; 801 return 1;
784 } 802 }
785 803
786 if (le32_to_cpu(raw_super->log_sectorsize) != 804 /* Currently, support 512/1024/2048/4096 bytes sector size */
787 F2FS_LOG_SECTOR_SIZE) { 805 if (le32_to_cpu(raw_super->log_sectorsize) >
788 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); 806 F2FS_MAX_LOG_SECTOR_SIZE ||
807 le32_to_cpu(raw_super->log_sectorsize) <
808 F2FS_MIN_LOG_SECTOR_SIZE) {
809 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
810 le32_to_cpu(raw_super->log_sectorsize));
789 return 1; 811 return 1;
790 } 812 }
791 if (le32_to_cpu(raw_super->log_sectors_per_block) != 813 if (le32_to_cpu(raw_super->log_sectors_per_block) +
792 F2FS_LOG_SECTORS_PER_BLOCK) { 814 le32_to_cpu(raw_super->log_sectorsize) !=
793 f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); 815 F2FS_MAX_LOG_SECTOR_SIZE) {
816 f2fs_msg(sb, KERN_INFO,
817 "Invalid log sectors per block(%u) log sectorsize(%u)",
818 le32_to_cpu(raw_super->log_sectors_per_block),
819 le32_to_cpu(raw_super->log_sectorsize));
794 return 1; 820 return 1;
795 } 821 }
796 return 0; 822 return 0;
@@ -812,7 +838,7 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
812 if (unlikely(fsmeta >= total)) 838 if (unlikely(fsmeta >= total))
813 return 1; 839 return 1;
814 840
815 if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { 841 if (unlikely(f2fs_cp_error(sbi))) {
816 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); 842 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
817 return 1; 843 return 1;
818 } 844 }
@@ -846,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
846 atomic_set(&sbi->nr_pages[i], 0); 872 atomic_set(&sbi->nr_pages[i], 0);
847 873
848 sbi->dir_level = DEF_DIR_LEVEL; 874 sbi->dir_level = DEF_DIR_LEVEL;
875 sbi->need_fsck = false;
849} 876}
850 877
851/* 878/*
@@ -899,8 +926,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
899 struct buffer_head *raw_super_buf; 926 struct buffer_head *raw_super_buf;
900 struct inode *root; 927 struct inode *root;
901 long err = -EINVAL; 928 long err = -EINVAL;
929 bool retry = true;
902 int i; 930 int i;
903 931
932try_onemore:
904 /* allocate memory for f2fs-specific super block info */ 933 /* allocate memory for f2fs-specific super block info */
905 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); 934 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
906 if (!sbi) 935 if (!sbi)
@@ -1077,12 +1106,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1077 if (err) 1106 if (err)
1078 goto free_proc; 1107 goto free_proc;
1079 1108
1109 if (!retry)
1110 sbi->need_fsck = true;
1111
1080 /* recover fsynced data */ 1112 /* recover fsynced data */
1081 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1113 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1082 err = recover_fsync_data(sbi); 1114 err = recover_fsync_data(sbi);
1083 if (err) 1115 if (err) {
1084 f2fs_msg(sb, KERN_ERR, 1116 f2fs_msg(sb, KERN_ERR,
1085 "Cannot recover all fsync data errno=%ld", err); 1117 "Cannot recover all fsync data errno=%ld", err);
1118 goto free_kobj;
1119 }
1086 } 1120 }
1087 1121
1088 /* 1122 /*
@@ -1123,6 +1157,13 @@ free_sb_buf:
1123 brelse(raw_super_buf); 1157 brelse(raw_super_buf);
1124free_sbi: 1158free_sbi:
1125 kfree(sbi); 1159 kfree(sbi);
1160
 1161 /* give only one more chance */
1162 if (retry) {
1163 retry = 0;
1164 shrink_dcache_sb(sb);
1165 goto try_onemore;
1166 }
1126 return err; 1167 return err;
1127} 1168}
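
The mount path now retries exactly once: `retry` starts true, a failure tears the partial state down, shrinks the dcache and jumps back to `try_onemore`, and on the second pass `need_fsck` is set so roll-forward recovery errors become fatal rather than being logged and ignored. A compact sketch of that retry-once shape; `setup()` here is a placeholder, not an f2fs function:

    #include <stdbool.h>
    #include <stdio.h>

    static int setup(bool need_fsck) { return need_fsck ? 0 : -1; } /* placeholder */

    /* Retry-once pattern: the first failure frees state and loops back;
     * the second attempt runs in a more conservative mode and its error
     * is final. */
    static int fill_super(void)
    {
        bool retry = true;
        int err;

    try_onemore:
        err = setup(!retry);      /* second pass flags the fs for fsck */
        if (err) {
            if (retry) {          /* give only one more chance */
                retry = false;
                goto try_onemore;
            }
            return err;
        }
        return 0;
    }

    int main(void) { printf("fill_super() = %d\n", fill_super()); return 0; }
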
1128 1169
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8bea941ee309..deca8728117b 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
266 266
267static void *read_all_xattrs(struct inode *inode, struct page *ipage) 267static void *read_all_xattrs(struct inode *inode, struct page *ipage)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 struct f2fs_xattr_header *header; 270 struct f2fs_xattr_header *header;
271 size_t size = PAGE_SIZE, inline_size = 0; 271 size_t size = PAGE_SIZE, inline_size = 0;
272 void *txattr_addr; 272 void *txattr_addr;
@@ -325,7 +325,7 @@ fail:
325static inline int write_all_xattrs(struct inode *inode, __u32 hsize, 325static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
326 void *txattr_addr, struct page *ipage) 326 void *txattr_addr, struct page *ipage)
327{ 327{
328 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 328 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
329 size_t inline_size = 0; 329 size_t inline_size = 0;
330 void *xattr_addr; 330 void *xattr_addr;
331 struct page *xpage; 331 struct page *xpage;
@@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
373 alloc_nid_failed(sbi, new_nid); 373 alloc_nid_failed(sbi, new_nid);
374 return PTR_ERR(xpage); 374 return PTR_ERR(xpage);
375 } 375 }
376 f2fs_bug_on(new_nid); 376 f2fs_bug_on(sbi, new_nid);
377 f2fs_wait_on_page_writeback(xpage, NODE); 377 f2fs_wait_on_page_writeback(xpage, NODE);
378 } else { 378 } else {
379 struct dnode_of_data dn; 379 struct dnode_of_data dn;
@@ -528,7 +528,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
528 int free; 528 int free;
529 /* 529 /*
 530 * If value is NULL, it is a remove operation. 530 * If value is NULL, it is a remove operation.
531 * In case of update operation, we caculate free. 531 * In case of update operation, we calculate free.
532 */ 532 */
533 free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); 533 free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
534 if (found) 534 if (found)
@@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
596 const void *value, size_t size, 596 const void *value, size_t size,
597 struct page *ipage, int flags) 597 struct page *ipage, int flags)
598{ 598{
599 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 599 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
600 int err; 600 int err;
601 601
602 /* this case is only from init_inode_metadata */ 602 /* this case is only from init_inode_metadata */
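
The xattr paths swap the open-coded `F2FS_SB(inode->i_sb)` for an `F2FS_I_SB(inode)` helper, so the inode-to-sb-info mapping lives in one place. A sketch of that accessor-wrapping pattern with stand-in types (none of these are the f2fs definitions):

    #include <stdio.h>

    struct sb_info { int id; };                   /* stand-in for f2fs_sb_info */
    struct super_block { struct sb_info *s_fs_info; };
    struct inode { struct super_block *i_sb; };

    static struct sb_info *SB_INFO(struct super_block *sb) { return sb->s_fs_info; }

    /* One helper instead of SB_INFO(inode->i_sb) at every call site. */
    static struct sb_info *I_SB(struct inode *inode) { return SB_INFO(inode->i_sb); }

    int main(void)
    {
        struct sb_info si = { 42 };
        struct super_block sb = { &si };
        struct inode in = { &sb };
        printf("sb id via inode: %d\n", I_SB(&in)->id);
        return 0;
    }
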
diff --git a/fs/file_table.c b/fs/file_table.c
index 385bfd31512a..0bab12b20460 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages)
331 331
332 n = (mempages * (PAGE_SIZE / 1024)) / 10; 332 n = (mempages * (PAGE_SIZE / 1024)) / 10;
333 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 333 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
334 percpu_counter_init(&nr_files, 0); 334 percpu_counter_init(&nr_files, 0, GFP_KERNEL);
335} 335}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index d3b4539f1651..da032daf0e0d 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -982,6 +982,7 @@ nomem:
982submit_op_failed: 982submit_op_failed:
983 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); 983 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
984 spin_unlock(&cookie->lock); 984 spin_unlock(&cookie->lock);
985 fscache_unuse_cookie(object);
985 kfree(op); 986 kfree(op);
986 _leave(" [EIO]"); 987 _leave(" [EIO]");
987 return transit_to(KILL_OBJECT); 988 return transit_to(KILL_OBJECT);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 85332b9d19d1..de33b3fccca6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -44,6 +44,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
44EXPORT_SYMBOL(__fscache_wait_on_page_write); 44EXPORT_SYMBOL(__fscache_wait_on_page_write);
45 45
46/* 46/*
47 * wait for a page to finish being written to the cache. Put a timeout here
48 * since we might be called recursively via parent fs.
49 */
50static
51bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
52{
53 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
54
55 return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page),
56 HZ);
57}
58
59/*
47 * decide whether a page can be released, possibly by cancelling a store to it 60 * decide whether a page can be released, possibly by cancelling a store to it
48 * - we're allowed to sleep if __GFP_WAIT is flagged 61 * - we're allowed to sleep if __GFP_WAIT is flagged
49 */ 62 */
@@ -115,7 +128,10 @@ page_busy:
115 } 128 }
116 129
117 fscache_stat(&fscache_n_store_vmscan_wait); 130 fscache_stat(&fscache_n_store_vmscan_wait);
118 __fscache_wait_on_page_write(cookie, page); 131 if (!release_page_wait_timeout(cookie, page))
132 _debug("fscache writeout timeout page: %p{%lx}",
133 page, page->index);
134
119 gfp &= ~__GFP_WAIT; 135 gfp &= ~__GFP_WAIT;
120 goto try_again; 136 goto try_again;
121} 137}
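
Bounding the wait with `wait_event_timeout(..., HZ)` keeps page reclaim from stalling indefinitely on a cache write, which can otherwise deadlock when fscache is re-entered through the parent filesystem. A userspace sketch of the same wait-with-deadline shape, using polling where the kernel sleeps on a waitqueue:

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    /* Poll a condition with a deadline; returns true if it became true in
     * time, false on timeout (the caller then just carries on). */
    static bool wait_timeout(bool (*cond)(void), double seconds)
    {
        struct timespec start, now;
        clock_gettime(CLOCK_MONOTONIC, &start);
        for (;;) {
            if (cond())
                return true;
            clock_gettime(CLOCK_MONOTONIC, &now);
            double elapsed = (now.tv_sec - start.tv_sec) +
                             (now.tv_nsec - start.tv_nsec) / 1e9;
            if (elapsed >= seconds)
                return false;
            nanosleep(&(struct timespec){ .tv_nsec = 1000000 }, NULL); /* 1 ms */
        }
    }

    static bool never_done(void) { return false; }

    int main(void)
    {
        if (!wait_timeout(never_done, 0.05))
            printf("timed out, not deadlocked\n");
        return 0;
    }
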
@@ -182,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
182{ 198{
183 struct fscache_operation *op; 199 struct fscache_operation *op;
184 struct fscache_object *object; 200 struct fscache_object *object;
185 bool wake_cookie; 201 bool wake_cookie = false;
186 202
187 _enter("%p", cookie); 203 _enter("%p", cookie);
188 204
@@ -212,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
212 228
213 __fscache_use_cookie(cookie); 229 __fscache_use_cookie(cookie);
214 if (fscache_submit_exclusive_op(object, op) < 0) 230 if (fscache_submit_exclusive_op(object, op) < 0)
215 goto nobufs; 231 goto nobufs_dec;
216 spin_unlock(&cookie->lock); 232 spin_unlock(&cookie->lock);
217 fscache_stat(&fscache_n_attr_changed_ok); 233 fscache_stat(&fscache_n_attr_changed_ok);
218 fscache_put_operation(op); 234 fscache_put_operation(op);
219 _leave(" = 0"); 235 _leave(" = 0");
220 return 0; 236 return 0;
221 237
222nobufs: 238nobufs_dec:
223 wake_cookie = __fscache_unuse_cookie(cookie); 239 wake_cookie = __fscache_unuse_cookie(cookie);
240nobufs:
224 spin_unlock(&cookie->lock); 241 spin_unlock(&cookie->lock);
225 kfree(op); 242 kfree(op);
226 if (wake_cookie) 243 if (wake_cookie)
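
The `__fscache_attr_changed()` fix is a goto-ladder repair: the cookie use count is taken before submission, so late failures must land on `nobufs_dec` to drop it, early failures skip straight to `nobufs`, and `wake_cookie` is initialized so the early path never tests an uninitialized flag. A generic sketch of that layered-cleanup idiom (all names illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    static int uses;
    static void use(void)   { uses++; }
    static bool unuse(void) { return --uses == 0; }   /* true: wake waiters */

    static int do_op(bool fail_early, bool fail_late)
    {
        bool wake = false;        /* initialized: early exit never reads junk */

        if (fail_early)
            goto nobufs;          /* nothing taken yet, nothing to undo */

        use();
        if (fail_late)
            goto nobufs_dec;      /* must undo the use() above */

        return 0;

    nobufs_dec:
        wake = unuse();
    nobufs:
        if (wake)
            printf("waking cookie waiters\n");
        return -1;
    }

    int main(void)
    {
        do_op(true, false);   /* early failure: no unuse, no wake */
        do_op(false, true);   /* late failure: balanced use/unuse */
        return 0;
    }
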
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 912061ac4baf..caa8d95b24e8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1305,6 +1305,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1305 size_t start; 1305 size_t start;
1306 ssize_t ret = iov_iter_get_pages(ii, 1306 ssize_t ret = iov_iter_get_pages(ii,
1307 &req->pages[req->num_pages], 1307 &req->pages[req->num_pages],
1308 *nbytesp - nbytes,
1308 req->max_pages - req->num_pages, 1309 req->max_pages - req->num_pages,
1309 &start); 1310 &start);
1310 if (ret < 0) 1311 if (ret < 0)
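
The extra `*nbytesp - nbytes` argument caps `iov_iter_get_pages()` at the bytes the request still needs, instead of pinning pages up to `max_pages` regardless. The budget arithmetic, with made-up numbers:

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        size_t nbytesp = 1 << 20;   /* total bytes this request may carry */
        size_t nbytes  = 768 << 10; /* bytes already gathered */

        /* cap the next gather by what is still outstanding */
        size_t max = nbytesp - nbytes;
        printf("ask for at most %zu bytes (256 KiB)\n", max);
        return 0;
    }
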
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e6ee5b6e8d99..f0b945ab853e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp)
359 * Returns: The length of the extent (minimum of one block) 359 * Returns: The length of the extent (minimum of one block)
360 */ 360 */
361 361
362static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) 362static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
363{ 363{
364 const __be64 *end = (start + len); 364 const __be64 *end = (start + len);
365 const __be64 *first = ptr; 365 const __be64 *first = ptr;
@@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
449 struct buffer_head *bh_map, struct metapath *mp, 449 struct buffer_head *bh_map, struct metapath *mp,
450 const unsigned int sheight, 450 const unsigned int sheight,
451 const unsigned int height, 451 const unsigned int height,
452 const unsigned int maxlen) 452 const size_t maxlen)
453{ 453{
454 struct gfs2_inode *ip = GFS2_I(inode); 454 struct gfs2_inode *ip = GFS2_I(inode);
455 struct gfs2_sbd *sdp = GFS2_SB(inode); 455 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
483 } else { 483 } else {
484 /* Need to allocate indirect blocks */ 484 /* Need to allocate indirect blocks */
485 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; 485 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
486 dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); 486 dblks = min(maxlen, (size_t)(ptrs_per_blk -
487 mp->mp_list[end_of_metadata]));
487 if (height == ip->i_height) { 488 if (height == ip->i_height) {
488 /* Writing into existing tree, extend tree down */ 489 /* Writing into existing tree, extend tree down */
489 iblks = height - sheight; 490 iblks = height - sheight;
@@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
605 struct gfs2_inode *ip = GFS2_I(inode); 606 struct gfs2_inode *ip = GFS2_I(inode);
606 struct gfs2_sbd *sdp = GFS2_SB(inode); 607 struct gfs2_sbd *sdp = GFS2_SB(inode);
607 unsigned int bsize = sdp->sd_sb.sb_bsize; 608 unsigned int bsize = sdp->sd_sb.sb_bsize;
608 const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; 609 const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
609 const u64 *arr = sdp->sd_heightsize; 610 const u64 *arr = sdp->sd_heightsize;
610 __be64 *ptr; 611 __be64 *ptr;
611 u64 size; 612 u64 size;
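
Widening `maxlen` to `size_t` avoids truncating `bh_map->b_size >> inode->i_blkbits` when a very large mapping is requested; shifted into a 32-bit local, the block count would silently wrap. A demonstration of the hazard, assuming LP64 (32-bit unsigned int, 64-bit size_t):

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        size_t b_size = 16ULL << 40;             /* a huge mapping request */
        int blkbits = 12;                        /* 4 KiB blocks */

        unsigned int narrow = b_size >> blkbits; /* wraps modulo 2^32 */
        size_t wide = b_size >> blkbits;

        printf("narrow maxlen: %u blocks\n", narrow);  /* wrong: 0 */
        printf("wide   maxlen: %zu blocks\n", wide);   /* 4294967296 */
        return 0;
    }
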
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a349f9a9685..5d4261ff5d23 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2100,8 +2100,13 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
2100 } 2100 }
2101 if (IS_ERR(dent)) 2101 if (IS_ERR(dent))
2102 return PTR_ERR(dent); 2102 return PTR_ERR(dent);
2103 da->bh = bh; 2103
2104 da->dent = dent; 2104 if (da->save_loc) {
2105 da->bh = bh;
2106 da->dent = dent;
2107 } else {
2108 brelse(bh);
2109 }
2105 return 0; 2110 return 0;
2106} 2111}
2107 2112
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 126c65dda028..e1b309c24dab 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,6 +23,7 @@ struct gfs2_diradd {
23 unsigned nr_blocks; 23 unsigned nr_blocks;
24 struct gfs2_dirent *dent; 24 struct gfs2_dirent *dent;
25 struct buffer_head *bh; 25 struct buffer_head *bh;
26 int save_loc;
26}; 27};
27 28
28extern struct inode *gfs2_dir_search(struct inode *dir, 29extern struct inode *gfs2_dir_search(struct inode *dir,
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2c02478a86b0..80dd44dca028 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -26,6 +26,7 @@
26#include <linux/dlm.h> 26#include <linux/dlm.h>
27#include <linux/dlm_plock.h> 27#include <linux/dlm_plock.h>
28#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/delay.h>
29 30
30#include "gfs2.h" 31#include "gfs2.h"
31#include "incore.h" 32#include "incore.h"
@@ -959,9 +960,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
959 unsigned int state; 960 unsigned int state;
960 int flags; 961 int flags;
961 int error = 0; 962 int error = 0;
963 int sleeptime;
962 964
963 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 965 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
964 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; 966 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
965 967
966 mutex_lock(&fp->f_fl_mutex); 968 mutex_lock(&fp->f_fl_mutex);
967 969
@@ -981,7 +983,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
981 gfs2_holder_init(gl, state, flags, fl_gh); 983 gfs2_holder_init(gl, state, flags, fl_gh);
982 gfs2_glock_put(gl); 984 gfs2_glock_put(gl);
983 } 985 }
984 error = gfs2_glock_nq(fl_gh); 986 for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
987 error = gfs2_glock_nq(fl_gh);
988 if (error != GLR_TRYFAILED)
989 break;
990 fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
991 fl_gh->gh_error = 0;
992 msleep(sleeptime);
993 }
985 if (error) { 994 if (error) {
986 gfs2_holder_uninit(fl_gh); 995 gfs2_holder_uninit(fl_gh);
987 if (error == GLR_TRYFAILED) 996 if (error == GLR_TRYFAILED)
@@ -1004,7 +1013,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
1004 mutex_lock(&fp->f_fl_mutex); 1013 mutex_lock(&fp->f_fl_mutex);
1005 flock_lock_file_wait(file, fl); 1014 flock_lock_file_wait(file, fl);
1006 if (fl_gh->gh_gl) { 1015 if (fl_gh->gh_gl) {
1007 gfs2_glock_dq_wait(fl_gh); 1016 gfs2_glock_dq(fl_gh);
1008 gfs2_holder_uninit(fl_gh); 1017 gfs2_holder_uninit(fl_gh);
1009 } 1018 }
1010 mutex_unlock(&fp->f_fl_mutex); 1019 mutex_unlock(&fp->f_fl_mutex);
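
do_flock() now requests the glock with LM_FLAG_TRY_1CB (so the current holder gets a callback), and on GLR_TRYFAILED retries with plain LM_FLAG_TRY after sleeping 1, 2 and 4 ms, instead of blocking indefinitely in gfs2_glock_nq(). A userspace sketch of that bounded exponential backoff; `try_lock()` is a stub standing in for the glock request:

    #include <stdio.h>
    #include <time.h>

    #define TRYFAILED 1

    static int attempts;
    static int try_lock(void) { return ++attempts < 3 ? TRYFAILED : 0; } /* stub */

    static int lock_with_backoff(void)
    {
        int error = TRYFAILED;

        /* sleep 1, 2, then 4 ms between trylock attempts */
        for (int sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
            error = try_lock();
            if (error != TRYFAILED)
                break;
            nanosleep(&(struct timespec){ .tv_nsec = sleeptime * 1000000L }, NULL);
        }
        return error;
    }

    int main(void)
    {
        printf("lock result: %d after %d attempts\n", lock_with_backoff(), attempts);
        return 0;
    }
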
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7f513b1ceb2c..8f0c19d1d943 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -811,7 +811,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
811{ 811{
812 INIT_LIST_HEAD(&gh->gh_list); 812 INIT_LIST_HEAD(&gh->gh_list);
813 gh->gh_gl = gl; 813 gh->gh_gl = gl;
814 gh->gh_ip = (unsigned long)__builtin_return_address(0); 814 gh->gh_ip = _RET_IP_;
815 gh->gh_owner_pid = get_pid(task_pid(current)); 815 gh->gh_owner_pid = get_pid(task_pid(current));
816 gh->gh_state = state; 816 gh->gh_state = state;
817 gh->gh_flags = flags; 817 gh->gh_flags = flags;
@@ -835,7 +835,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
835 gh->gh_state = state; 835 gh->gh_state = state;
836 gh->gh_flags = flags; 836 gh->gh_flags = flags;
837 gh->gh_iflags = 0; 837 gh->gh_iflags = 0;
838 gh->gh_ip = (unsigned long)__builtin_return_address(0); 838 gh->gh_ip = _RET_IP_;
839 if (gh->gh_owner_pid) 839 if (gh->gh_owner_pid)
840 put_pid(gh->gh_owner_pid); 840 put_pid(gh->gh_owner_pid);
841 gh->gh_owner_pid = get_pid(task_pid(current)); 841 gh->gh_owner_pid = get_pid(task_pid(current));
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 2ffc67dce87f..1cc0bba6313f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -93,7 +93,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
93 * tr->alloced is not set since the transaction structure is 93 * tr->alloced is not set since the transaction structure is
94 * on the stack */ 94 * on the stack */
95 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); 95 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
96 tr.tr_ip = (unsigned long)__builtin_return_address(0); 96 tr.tr_ip = _RET_IP_;
97 sb_start_intwrite(sdp->sd_vfs); 97 sb_start_intwrite(sdp->sd_vfs);
98 if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { 98 if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
99 sb_end_intwrite(sdp->sd_vfs); 99 sb_end_intwrite(sdp->sd_vfs);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 67d310c9ada3..39e7e9959b74 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -262,6 +262,9 @@ struct gfs2_holder {
262 unsigned long gh_ip; 262 unsigned long gh_ip;
263}; 263};
264 264
265/* Number of quota types we support */
266#define GFS2_MAXQUOTAS 2
267
265/* Resource group multi-block reservation, in order of appearance: 268/* Resource group multi-block reservation, in order of appearance:
266 269
267 Step 1. Function prepares to write, allocates a mb, sets the size hint. 270 Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -282,8 +285,8 @@ struct gfs2_blkreserv {
282 u64 rs_inum; /* Inode number for reservation */ 285 u64 rs_inum; /* Inode number for reservation */
283 286
284 /* ancillary quota stuff */ 287 /* ancillary quota stuff */
285 struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; 288 struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
286 struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS]; 289 struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
287 unsigned int rs_qa_qd_num; 290 unsigned int rs_qa_qd_num;
288}; 291};
289 292
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e62e59477884..fcf42eadb69c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -600,7 +600,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
600 int error, free_vfs_inode = 0; 600 int error, free_vfs_inode = 0;
601 u32 aflags = 0; 601 u32 aflags = 0;
602 unsigned blocks = 1; 602 unsigned blocks = 1;
603 struct gfs2_diradd da = { .bh = NULL, }; 603 struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
604 604
605 if (!name->len || name->len > GFS2_FNAMESIZE) 605 if (!name->len || name->len > GFS2_FNAMESIZE)
606 return -ENAMETOOLONG; 606 return -ENAMETOOLONG;
@@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
626 if (!IS_ERR(inode)) { 626 if (!IS_ERR(inode)) {
627 d = d_splice_alias(inode, dentry); 627 d = d_splice_alias(inode, dentry);
628 error = PTR_ERR(d); 628 error = PTR_ERR(d);
629 if (IS_ERR(d)) 629 if (IS_ERR(d)) {
630 inode = ERR_CAST(d);
630 goto fail_gunlock; 631 goto fail_gunlock;
632 }
631 error = 0; 633 error = 0;
632 if (file) { 634 if (file) {
633 if (S_ISREG(inode->i_mode)) { 635 if (S_ISREG(inode->i_mode)) {
@@ -670,6 +672,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
670 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 672 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
671 gfs2_set_inode_blocks(inode, 1); 673 gfs2_set_inode_blocks(inode, 1);
672 munge_mode_uid_gid(dip, inode); 674 munge_mode_uid_gid(dip, inode);
675 check_and_update_goal(dip);
673 ip->i_goal = dip->i_goal; 676 ip->i_goal = dip->i_goal;
674 ip->i_diskflags = 0; 677 ip->i_diskflags = 0;
675 ip->i_eattr = 0; 678 ip->i_eattr = 0;
@@ -840,8 +843,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
840 int error; 843 int error;
841 844
842 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 845 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
843 if (!inode) 846 if (inode == NULL) {
847 d_add(dentry, NULL);
844 return NULL; 848 return NULL;
849 }
845 if (IS_ERR(inode)) 850 if (IS_ERR(inode))
846 return ERR_CAST(inode); 851 return ERR_CAST(inode);
847 852
@@ -854,7 +859,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
854 859
855 d = d_splice_alias(inode, dentry); 860 d = d_splice_alias(inode, dentry);
856 if (IS_ERR(d)) { 861 if (IS_ERR(d)) {
857 iput(inode);
858 gfs2_glock_dq_uninit(&gh); 862 gfs2_glock_dq_uninit(&gh);
859 return d; 863 return d;
860 } 864 }
@@ -896,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
896 struct gfs2_inode *ip = GFS2_I(inode); 900 struct gfs2_inode *ip = GFS2_I(inode);
897 struct gfs2_holder ghs[2]; 901 struct gfs2_holder ghs[2];
898 struct buffer_head *dibh; 902 struct buffer_head *dibh;
899 struct gfs2_diradd da = { .bh = NULL, }; 903 struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
900 int error; 904 int error;
901 905
902 if (S_ISDIR(inode->i_mode)) 906 if (S_ISDIR(inode->i_mode))
@@ -1334,7 +1338,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1334 struct gfs2_rgrpd *nrgd; 1338 struct gfs2_rgrpd *nrgd;
1335 unsigned int num_gh; 1339 unsigned int num_gh;
1336 int dir_rename = 0; 1340 int dir_rename = 0;
1337 struct gfs2_diradd da = { .nr_blocks = 0, }; 1341 struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, };
1338 unsigned int x; 1342 unsigned int x;
1339 int error; 1343 int error;
1340 1344
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f4cb9c0d6bbd..7474c413ffd1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -577,6 +577,13 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
577 return rgd; 577 return rgd;
578} 578}
579 579
580void check_and_update_goal(struct gfs2_inode *ip)
581{
582 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
583 if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
584 ip->i_goal = ip->i_no_addr;
585}
586
580void gfs2_free_clones(struct gfs2_rgrpd *rgd) 587void gfs2_free_clones(struct gfs2_rgrpd *rgd)
581{ 588{
582 int x; 589 int x;
@@ -1910,6 +1917,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1910 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { 1917 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
1911 rs->rs_rbm.rgd = begin = ip->i_rgd; 1918 rs->rs_rbm.rgd = begin = ip->i_rgd;
1912 } else { 1919 } else {
1920 check_and_update_goal(ip);
1913 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1921 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1914 } 1922 }
1915 if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) 1923 if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
@@ -2089,7 +2097,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2089 u32 blen, unsigned char new_state) 2097 u32 blen, unsigned char new_state)
2090{ 2098{
2091 struct gfs2_rbm rbm; 2099 struct gfs2_rbm rbm;
2092 struct gfs2_bitmap *bi; 2100 struct gfs2_bitmap *bi, *bi_prev = NULL;
2093 2101
2094 rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); 2102 rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
2095 if (!rbm.rgd) { 2103 if (!rbm.rgd) {
@@ -2098,18 +2106,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2098 return NULL; 2106 return NULL;
2099 } 2107 }
2100 2108
2109 gfs2_rbm_from_block(&rbm, bstart);
2101 while (blen--) { 2110 while (blen--) {
2102 gfs2_rbm_from_block(&rbm, bstart);
2103 bi = rbm_bi(&rbm); 2111 bi = rbm_bi(&rbm);
2104 bstart++; 2112 if (bi != bi_prev) {
2105 if (!bi->bi_clone) { 2113 if (!bi->bi_clone) {
2106 bi->bi_clone = kmalloc(bi->bi_bh->b_size, 2114 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2107 GFP_NOFS | __GFP_NOFAIL); 2115 GFP_NOFS | __GFP_NOFAIL);
2108 memcpy(bi->bi_clone + bi->bi_offset, 2116 memcpy(bi->bi_clone + bi->bi_offset,
2109 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); 2117 bi->bi_bh->b_data + bi->bi_offset,
2118 bi->bi_len);
2119 }
2120 gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2121 bi_prev = bi;
2110 } 2122 }
2111 gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2112 gfs2_setbit(&rbm, false, new_state); 2123 gfs2_setbit(&rbm, false, new_state);
2124 gfs2_rbm_incr(&rbm);
2113 } 2125 }
2114 2126
2115 return rbm.rgd; 2127 return rbm.rgd;
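
The rgblk_free() rewrite computes the bitmap position once and then advances it with gfs2_rbm_incr(), and it clones and journals a bitmap buffer only when the walk crosses into a new one (tracked in bi_prev), rather than doing both for every block. A generic sketch of that do-setup-only-on-group-change loop:

    #include <stdio.h>

    struct grp { int prepared; };

    static void prepare(struct grp *g) { g->prepared = 1; } /* clone + journal */

    int main(void)
    {
        struct grp groups[3] = { {0}, {0}, {0} };
        /* which bitmap group each of 8 consecutive blocks lives in */
        int block_group[8] = { 0, 0, 0, 1, 1, 1, 2, 2 };

        struct grp *prev = NULL;
        int preps = 0;
        for (int i = 0; i < 8; i++) {
            struct grp *g = &groups[block_group[i]];
            if (g != prev) {        /* expensive setup only on group change */
                prepare(g);
                preps++;
                prev = g;
            }
            /* ...cheap per-block bit flip would go here... */
        }
        printf("prepared %d groups for 8 blocks\n", preps);  /* 3, not 8 */
        return 0;
    }
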
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 463ab2e95d1c..5d8f085f7ade 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -80,4 +80,5 @@ static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
80 return rs && !RB_EMPTY_NODE(&rs->rs_node); 80 return rs && !RB_EMPTY_NODE(&rs->rs_node);
81} 81}
82 82
83extern void check_and_update_goal(struct gfs2_inode *ip);
83#endif /* __RGRP_DOT_H__ */ 84#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2607ff13d486..a346f56c4c6d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1294,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1294 int val; 1294 int val;
1295 1295
1296 if (is_ancestor(root, sdp->sd_master_dir)) 1296 if (is_ancestor(root, sdp->sd_master_dir))
1297 seq_printf(s, ",meta"); 1297 seq_puts(s, ",meta");
1298 if (args->ar_lockproto[0]) 1298 if (args->ar_lockproto[0])
1299 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 1299 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
1300 if (args->ar_locktable[0]) 1300 if (args->ar_locktable[0])
@@ -1302,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1302 if (args->ar_hostdata[0]) 1302 if (args->ar_hostdata[0])
1303 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1303 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1304 if (args->ar_spectator) 1304 if (args->ar_spectator)
1305 seq_printf(s, ",spectator"); 1305 seq_puts(s, ",spectator");
1306 if (args->ar_localflocks) 1306 if (args->ar_localflocks)
1307 seq_printf(s, ",localflocks"); 1307 seq_puts(s, ",localflocks");
1308 if (args->ar_debug) 1308 if (args->ar_debug)
1309 seq_printf(s, ",debug"); 1309 seq_puts(s, ",debug");
1310 if (args->ar_posix_acl) 1310 if (args->ar_posix_acl)
1311 seq_printf(s, ",acl"); 1311 seq_puts(s, ",acl");
1312 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 1312 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
1313 char *state; 1313 char *state;
1314 switch (args->ar_quota) { 1314 switch (args->ar_quota) {
@@ -1328,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1328 seq_printf(s, ",quota=%s", state); 1328 seq_printf(s, ",quota=%s", state);
1329 } 1329 }
1330 if (args->ar_suiddir) 1330 if (args->ar_suiddir)
1331 seq_printf(s, ",suiddir"); 1331 seq_puts(s, ",suiddir");
1332 if (args->ar_data != GFS2_DATA_DEFAULT) { 1332 if (args->ar_data != GFS2_DATA_DEFAULT) {
1333 char *state; 1333 char *state;
1334 switch (args->ar_data) { 1334 switch (args->ar_data) {
@@ -1345,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1345 seq_printf(s, ",data=%s", state); 1345 seq_printf(s, ",data=%s", state);
1346 } 1346 }
1347 if (args->ar_discard) 1347 if (args->ar_discard)
1348 seq_printf(s, ",discard"); 1348 seq_puts(s, ",discard");
1349 val = sdp->sd_tune.gt_logd_secs; 1349 val = sdp->sd_tune.gt_logd_secs;
1350 if (val != 30) 1350 if (val != 30)
1351 seq_printf(s, ",commit=%d", val); 1351 seq_printf(s, ",commit=%d", val);
@@ -1376,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1376 seq_printf(s, ",errors=%s", state); 1376 seq_printf(s, ",errors=%s", state);
1377 } 1377 }
1378 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1378 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1379 seq_printf(s, ",nobarrier"); 1379 seq_puts(s, ",nobarrier");
1380 if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) 1380 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1381 seq_printf(s, ",demote_interface_used"); 1381 seq_puts(s, ",demote_interface_used");
1382 if (args->ar_rgrplvb) 1382 if (args->ar_rgrplvb)
1383 seq_printf(s, ",rgrplvb"); 1383 seq_puts(s, ",rgrplvb");
1384 return 0; 1384 return 0;
1385} 1385}
1386 1386
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 0546ab4e28e8..42bfd3361979 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -44,7 +44,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
44 if (!tr) 44 if (!tr)
45 return -ENOMEM; 45 return -ENOMEM;
46 46
47 tr->tr_ip = (unsigned long)__builtin_return_address(0); 47 tr->tr_ip = _RET_IP_;
48 tr->tr_blocks = blocks; 48 tr->tr_blocks = blocks;
49 tr->tr_revokes = revokes; 49 tr->tr_revokes = revokes;
50 tr->tr_reserved = 1; 50 tr->tr_reserved = 1;
diff --git a/fs/internal.h b/fs/internal.h
index e325b4f9c799..b2623200107b 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
35#endif 35#endif
36 36
37/* 37/*
38 * buffer.c
39 */
40extern void guard_bio_eod(int rw, struct bio *bio);
41
42/*
38 * char_dev.c 43 * char_dev.c
39 */ 44 */
40extern void __init chrdev_init(void); 45extern void __init chrdev_init(void);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4556ce1af5b0..5ddaf8625d3b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -61,7 +61,7 @@ static void isofs_put_super(struct super_block *sb)
61 return; 61 return;
62} 62}
63 63
64static int isofs_read_inode(struct inode *); 64static int isofs_read_inode(struct inode *, int relocated);
65static int isofs_statfs (struct dentry *, struct kstatfs *); 65static int isofs_statfs (struct dentry *, struct kstatfs *);
66 66
67static struct kmem_cache *isofs_inode_cachep; 67static struct kmem_cache *isofs_inode_cachep;
@@ -1259,7 +1259,7 @@ out_toomany:
1259 goto out; 1259 goto out;
1260} 1260}
1261 1261
1262static int isofs_read_inode(struct inode *inode) 1262static int isofs_read_inode(struct inode *inode, int relocated)
1263{ 1263{
1264 struct super_block *sb = inode->i_sb; 1264 struct super_block *sb = inode->i_sb;
1265 struct isofs_sb_info *sbi = ISOFS_SB(sb); 1265 struct isofs_sb_info *sbi = ISOFS_SB(sb);
@@ -1404,7 +1404,7 @@ static int isofs_read_inode(struct inode *inode)
1404 */ 1404 */
1405 1405
1406 if (!high_sierra) { 1406 if (!high_sierra) {
1407 parse_rock_ridge_inode(de, inode); 1407 parse_rock_ridge_inode(de, inode, relocated);
1408 /* if we want uid/gid set, override the rock ridge setting */ 1408 /* if we want uid/gid set, override the rock ridge setting */
1409 if (sbi->s_uid_set) 1409 if (sbi->s_uid_set)
1410 inode->i_uid = sbi->s_uid; 1410 inode->i_uid = sbi->s_uid;
@@ -1483,9 +1483,10 @@ static int isofs_iget5_set(struct inode *ino, void *data)
1483 * offset that point to the underlying meta-data for the inode. The 1483 * offset that point to the underlying meta-data for the inode. The
1484 * code below is otherwise similar to the iget() code in 1484 * code below is otherwise similar to the iget() code in
1485 * include/linux/fs.h */ 1485 * include/linux/fs.h */
1486struct inode *isofs_iget(struct super_block *sb, 1486struct inode *__isofs_iget(struct super_block *sb,
1487 unsigned long block, 1487 unsigned long block,
1488 unsigned long offset) 1488 unsigned long offset,
1489 int relocated)
1489{ 1490{
1490 unsigned long hashval; 1491 unsigned long hashval;
1491 struct inode *inode; 1492 struct inode *inode;
@@ -1507,7 +1508,7 @@ struct inode *isofs_iget(struct super_block *sb,
1507 return ERR_PTR(-ENOMEM); 1508 return ERR_PTR(-ENOMEM);
1508 1509
1509 if (inode->i_state & I_NEW) { 1510 if (inode->i_state & I_NEW) {
1510 ret = isofs_read_inode(inode); 1511 ret = isofs_read_inode(inode, relocated);
1511 if (ret < 0) { 1512 if (ret < 0) {
1512 iget_failed(inode); 1513 iget_failed(inode);
1513 inode = ERR_PTR(ret); 1514 inode = ERR_PTR(ret);
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 99167238518d..0ac4c1f73fbd 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -107,7 +107,7 @@ extern int iso_date(char *, int);
107 107
108struct inode; /* To make gcc happy */ 108struct inode; /* To make gcc happy */
109 109
110extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *); 110extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated);
111extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); 111extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *);
112extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); 112extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *);
113 113
@@ -118,9 +118,24 @@ extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int
118extern struct buffer_head *isofs_bread(struct inode *, sector_t); 118extern struct buffer_head *isofs_bread(struct inode *, sector_t);
119extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); 119extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long);
120 120
121extern struct inode *isofs_iget(struct super_block *sb, 121struct inode *__isofs_iget(struct super_block *sb,
122 unsigned long block, 122 unsigned long block,
123 unsigned long offset); 123 unsigned long offset,
124 int relocated);
125
126static inline struct inode *isofs_iget(struct super_block *sb,
127 unsigned long block,
128 unsigned long offset)
129{
130 return __isofs_iget(sb, block, offset, 0);
131}
132
133static inline struct inode *isofs_iget_reloc(struct super_block *sb,
134 unsigned long block,
135 unsigned long offset)
136{
137 return __isofs_iget(sb, block, offset, 1);
138}
124 139
125/* Because the inode number is no longer relevant to finding the 140/* Because the inode number is no longer relevant to finding the
126 * underlying meta-data for an inode, we are free to choose a more 141 * underlying meta-data for an inode, we are free to choose a more
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index c0bf42472e40..f488bbae541a 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -288,12 +288,16 @@ eio:
288 goto out; 288 goto out;
289} 289}
290 290
291#define RR_REGARD_XA 1
292#define RR_RELOC_DE 2
293
291static int 294static int
292parse_rock_ridge_inode_internal(struct iso_directory_record *de, 295parse_rock_ridge_inode_internal(struct iso_directory_record *de,
293 struct inode *inode, int regard_xa) 296 struct inode *inode, int flags)
294{ 297{
295 int symlink_len = 0; 298 int symlink_len = 0;
296 int cnt, sig; 299 int cnt, sig;
300 unsigned int reloc_block;
297 struct inode *reloc; 301 struct inode *reloc;
298 struct rock_ridge *rr; 302 struct rock_ridge *rr;
299 int rootflag; 303 int rootflag;
@@ -305,7 +309,7 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de,
305 309
306 init_rock_state(&rs, inode); 310 init_rock_state(&rs, inode);
307 setup_rock_ridge(de, inode, &rs); 311 setup_rock_ridge(de, inode, &rs);
308 if (regard_xa) { 312 if (flags & RR_REGARD_XA) {
309 rs.chr += 14; 313 rs.chr += 14;
310 rs.len -= 14; 314 rs.len -= 14;
311 if (rs.len < 0) 315 if (rs.len < 0)
@@ -485,12 +489,22 @@ repeat:
485 "relocated directory\n"); 489 "relocated directory\n");
486 goto out; 490 goto out;
487 case SIG('C', 'L'): 491 case SIG('C', 'L'):
488 ISOFS_I(inode)->i_first_extent = 492 if (flags & RR_RELOC_DE) {
489 isonum_733(rr->u.CL.location); 493 printk(KERN_ERR
490 reloc = 494 "ISOFS: Recursive directory relocation "
491 isofs_iget(inode->i_sb, 495 "is not supported\n");
492 ISOFS_I(inode)->i_first_extent, 496 goto eio;
493 0); 497 }
498 reloc_block = isonum_733(rr->u.CL.location);
499 if (reloc_block == ISOFS_I(inode)->i_iget5_block &&
500 ISOFS_I(inode)->i_iget5_offset == 0) {
501 printk(KERN_ERR
502 "ISOFS: Directory relocation points to "
503 "itself\n");
504 goto eio;
505 }
506 ISOFS_I(inode)->i_first_extent = reloc_block;
507 reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0);
494 if (IS_ERR(reloc)) { 508 if (IS_ERR(reloc)) {
495 ret = PTR_ERR(reloc); 509 ret = PTR_ERR(reloc);
496 goto out; 510 goto out;
@@ -637,9 +651,11 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit)
637 return rpnt; 651 return rpnt;
638} 652}
639 653
640int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) 654int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
655 int relocated)
641{ 656{
642 int result = parse_rock_ridge_inode_internal(de, inode, 0); 657 int flags = relocated ? RR_RELOC_DE : 0;
658 int result = parse_rock_ridge_inode_internal(de, inode, flags);
643 659
644 /* 660 /*
645 * if rockridge flag was reset and we didn't look for attributes 661 * if rockridge flag was reset and we didn't look for attributes
@@ -647,7 +663,8 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode)
647 */ 663 */
648 if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) 664 if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1)
649 && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { 665 && (ISOFS_SB(inode->i_sb)->s_rock == 2)) {
650 result = parse_rock_ridge_inode_internal(de, inode, 14); 666 result = parse_rock_ridge_inode_internal(de, inode,
667 flags | RR_REGARD_XA);
651 } 668 }
652 return result; 669 return result;
653} 670}
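
The Rock Ridge change closes an unbounded-recursion hole: a crafted image could chain CL (child link) relocations, so the parser now carries RR_RELOC_DE into the relocated directory and refuses a second relocation, alongside the cheap self-reference check. A minimal sketch of the guard; `parse_record()` is illustrative, not the isofs function:

    #include <stdio.h>

    #define RR_RELOC_DE 1  /* set when this record was reached via relocation */

    /* reloc_target < 0 means "no CL entry in this record" */
    static int parse_record(int block, int reloc_target, int flags)
    {
        if (reloc_target >= 0) {
            if (flags & RR_RELOC_DE) {
                fprintf(stderr, "block %d: recursive relocation rejected\n", block);
                return -1;
            }
            if (reloc_target == block) {
                fprintf(stderr, "block %d: relocation points to itself\n", block);
                return -1;
            }
            /* follow the relocation once; the target inherits the flag */
            return parse_record(reloc_target, -1, RR_RELOC_DE);
        }
        printf("block %d parsed\n", block);
        return 0;
    }

    int main(void)
    {
        parse_record(5, 5, 0);             /* self-loop: rejected */
        parse_record(5, 9, 0);             /* one legitimate hop: ok */
        parse_record(9, 12, RR_RELOC_DE);  /* nested relocation: rejected */
        return 0;
    }
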
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6fac74349856..b73e0215baa7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
97 struct commit_header *h; 97 struct commit_header *h;
98 __u32 csum; 98 __u32 csum;
99 99
100 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 100 if (!jbd2_journal_has_csum_v2or3(j))
101 return; 101 return;
102 102
103 h = (struct commit_header *)(bh->b_data); 103 h = (struct commit_header *)(bh->b_data);
@@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
313 return checksum; 313 return checksum;
314} 314}
315 315
316static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, 316static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
317 unsigned long long block) 317 unsigned long long block)
318{ 318{
319 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 319 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
320 if (tag_bytes > JBD2_TAG_SIZE32) 320 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
321 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 321 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
322} 322}
323 323
@@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j,
327 struct jbd2_journal_block_tail *tail; 327 struct jbd2_journal_block_tail *tail;
328 __u32 csum; 328 __u32 csum;
329 329
330 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 330 if (!jbd2_journal_has_csum_v2or3(j))
331 return; 331 return;
332 332
333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - 333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
@@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j,
340static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 340static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
341 struct buffer_head *bh, __u32 sequence) 341 struct buffer_head *bh, __u32 sequence)
342{ 342{
343 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
343 struct page *page = bh->b_page; 344 struct page *page = bh->b_page;
344 __u8 *addr; 345 __u8 *addr;
345 __u32 csum32; 346 __u32 csum32;
346 __be32 seq; 347 __be32 seq;
347 348
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 349 if (!jbd2_journal_has_csum_v2or3(j))
349 return; 350 return;
350 351
351 seq = cpu_to_be32(sequence); 352 seq = cpu_to_be32(sequence);
@@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
355 bh->b_size); 356 bh->b_size);
356 kunmap_atomic(addr); 357 kunmap_atomic(addr);
357 358
358 /* We only have space to store the lower 16 bits of the crc32c. */ 359 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
359 tag->t_checksum = cpu_to_be16(csum32); 360 tag3->t_checksum = cpu_to_be32(csum32);
361 else
362 tag->t_checksum = cpu_to_be16(csum32);
360} 363}
361/* 364/*
362 * jbd2_journal_commit_transaction 365 * jbd2_journal_commit_transaction
@@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
396 LIST_HEAD(io_bufs); 399 LIST_HEAD(io_bufs);
397 LIST_HEAD(log_bufs); 400 LIST_HEAD(log_bufs);
398 401
399 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 402 if (jbd2_journal_has_csum_v2or3(journal))
400 csum_size = sizeof(struct jbd2_journal_block_tail); 403 csum_size = sizeof(struct jbd2_journal_block_tail);
401 404
402 /* 405 /*
@@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
690 tag_flag |= JBD2_FLAG_SAME_UUID; 693 tag_flag |= JBD2_FLAG_SAME_UUID;
691 694
692 tag = (journal_block_tag_t *) tagp; 695 tag = (journal_block_tag_t *) tagp;
693 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 696 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
694 tag->t_flags = cpu_to_be16(tag_flag); 697 tag->t_flags = cpu_to_be16(tag_flag);
695 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 698 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
696 commit_transaction->t_tid); 699 commit_transaction->t_tid);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 67b8e303946c..19d74d86d99c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
124/* Checksumming functions */ 124/* Checksumming functions */
125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
126{ 126{
127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 if (!jbd2_journal_has_csum_v2or3(j))
128 return 1; 128 return 1;
129 129
130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; 130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
145 145
146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
147{ 147{
148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 if (!jbd2_journal_has_csum_v2or3(j))
149 return 1; 149 return 1;
150 150
151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 151 return sb->s_checksum == jbd2_superblock_csum(j, sb);
@@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
153 153
154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
155{ 155{
156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 if (!jbd2_journal_has_csum_v2or3(j))
157 return; 157 return;
158 158
159 sb->s_checksum = jbd2_superblock_csum(j, sb); 159 sb->s_checksum = jbd2_superblock_csum(j, sb);
@@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal)
1522 goto out; 1522 goto out;
1523 } 1523 }
1524 1524
1525 if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && 1525 if (jbd2_journal_has_csum_v2or3(journal) &&
1526 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1526 JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
1527 /* Can't have checksum v1 and v2 on at the same time! */ 1527 /* Can't have checksum v1 and v2 on at the same time! */
1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " 1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
1529 "at the same time!\n"); 1529 "at the same time!\n");
1530 goto out; 1530 goto out;
1531 } 1531 }
1532 1532
1533 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
1534 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1535 /* Can't have checksum v2 and v3 at the same time! */
1536 printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
1537 "at the same time!\n");
1538 goto out;
1539 }
1540
1533 if (!jbd2_verify_csum_type(journal, sb)) { 1541 if (!jbd2_verify_csum_type(journal, sb)) {
1534 printk(KERN_ERR "JBD2: Unknown checksum type\n"); 1542 printk(KERN_ERR "JBD2: Unknown checksum type\n");
1535 goto out; 1543 goto out;
1536 } 1544 }
1537 1545
1538 /* Load the checksum driver */ 1546 /* Load the checksum driver */
1539 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1547 if (jbd2_journal_has_csum_v2or3(journal)) {
1540 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 1548 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1541 if (IS_ERR(journal->j_chksum_driver)) { 1549 if (IS_ERR(journal->j_chksum_driver)) {
1542 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); 1550 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal)
1553 } 1561 }
1554 1562
1555 /* Precompute checksum seed for all metadata */ 1563 /* Precompute checksum seed for all metadata */
1556 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 1564 if (jbd2_journal_has_csum_v2or3(journal))
1557 journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, 1565 journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
1558 sizeof(sb->s_uuid)); 1566 sizeof(sb->s_uuid));
1559 1567
@@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1813 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) 1821 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1814 return 0; 1822 return 0;
1815 1823
1816 /* Asking for checksumming v2 and v1? Only give them v2. */ 1824 /* If enabling v2 checksums, turn on v3 instead */
1817 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && 1825 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
1826 incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
1827 incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
1828 }
1829
1830 /* Asking for checksumming v3 and v1? Only give them v3. */
1831 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
1818 compat & JBD2_FEATURE_COMPAT_CHECKSUM) 1832 compat & JBD2_FEATURE_COMPAT_CHECKSUM)
1819 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; 1833 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
1820 1834
@@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1823 1837
1824 sb = journal->j_superblock; 1838 sb = journal->j_superblock;
1825 1839
1826 /* If enabling v2 checksums, update superblock */ 1840 /* If enabling v3 checksums, update superblock */
1827 if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1841 if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1828 sb->s_checksum_type = JBD2_CRC32C_CHKSUM; 1842 sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
1829 sb->s_feature_compat &= 1843 sb->s_feature_compat &=
1830 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); 1844 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
@@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1842 } 1856 }
1843 1857
1844 /* Precompute checksum seed for all metadata */ 1858 /* Precompute checksum seed for all metadata */
1845 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 1859 if (jbd2_journal_has_csum_v2or3(journal))
1846 JBD2_FEATURE_INCOMPAT_CSUM_V2))
1847 journal->j_csum_seed = jbd2_chksum(journal, ~0, 1860 journal->j_csum_seed = jbd2_chksum(journal, ~0,
1848 sb->s_uuid, 1861 sb->s_uuid,
1849 sizeof(sb->s_uuid)); 1862 sizeof(sb->s_uuid));
@@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1852 /* If enabling v1 checksums, downgrade superblock */ 1865 /* If enabling v1 checksums, downgrade superblock */
1853 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) 1866 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
1854 sb->s_feature_incompat &= 1867 sb->s_feature_incompat &=
1855 ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); 1868 ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
1869 JBD2_FEATURE_INCOMPAT_CSUM_V3);
1856 1870
1857 sb->s_feature_compat |= cpu_to_be32(compat); 1871 sb->s_feature_compat |= cpu_to_be32(compat);
1858 sb->s_feature_ro_compat |= cpu_to_be32(ro); 1872 sb->s_feature_ro_compat |= cpu_to_be32(ro);
@@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
2165 */ 2179 */
2166size_t journal_tag_bytes(journal_t *journal) 2180size_t journal_tag_bytes(journal_t *journal)
2167{ 2181{
2168 journal_block_tag_t tag; 2182 size_t sz;
2169 size_t x = 0; 2183
2184 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
2185 return sizeof(journal_block_tag3_t);
2186
2187 sz = sizeof(journal_block_tag_t);
2170 2188
2171 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 2189 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
2172 x += sizeof(tag.t_checksum); 2190 sz += sizeof(__u16);
2173 2191
2174 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) 2192 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
2175 return x + JBD2_TAG_SIZE64; 2193 return sz;
2176 else 2194 else
2177 return x + JBD2_TAG_SIZE32; 2195 return sz - sizeof(__u32);
2178} 2196}
2179 2197
2180/* 2198/*
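
journal_tag_bytes() now reads: a v3 journal always uses the fixed-size tag3; otherwise start from the v2-era tag struct, add two bytes when the 16-bit checksum is in use, and drop the high block-number word on 32-bit journals. A standalone re-derivation of that arithmetic; the structs mirror the jbd2 layouts but are local to the sketch:

    #include <stdio.h>
    #include <stdint.h>

    #define F_CSUM_V2 0x1
    #define F_CSUM_V3 0x2
    #define F_64BIT   0x4

    struct tag3 { uint32_t blocknr, flags, blocknr_high, checksum; }; /* 16 bytes */
    struct tag  { uint32_t blocknr; uint16_t checksum, flags;
                  uint32_t blocknr_high; };                           /* 12 bytes */

    static size_t tag_bytes(unsigned feat)
    {
        if (feat & F_CSUM_V3)
            return sizeof(struct tag3);   /* v3 tags are always full size */

        size_t sz = sizeof(struct tag);
        if (feat & F_CSUM_V2)
            sz += sizeof(uint16_t);       /* extra room for the 16-bit csum */
        if (feat & F_64BIT)
            return sz;
        return sz - sizeof(uint32_t);     /* 32-bit: no high blocknr word */
    }

    int main(void)
    {
        printf("v1, 32-bit: %zu\n", tag_bytes(0));                   /* 8  */
        printf("v2, 64-bit: %zu\n", tag_bytes(F_CSUM_V2 | F_64BIT)); /* 14 */
        printf("v3:         %zu\n", tag_bytes(F_CSUM_V3));           /* 16 */
        return 0;
    }
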
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3b6bb19d60b1..9b329b55ffe3 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
181 __be32 provided; 181 __be32 provided;
182 __u32 calculated; 182 __u32 calculated;
183 183
184 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 184 if (!jbd2_journal_has_csum_v2or3(j))
185 return 1; 185 return 1;
186 186
187 tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - 187 tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
@@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
205 int nr = 0, size = journal->j_blocksize; 205 int nr = 0, size = journal->j_blocksize;
206 int tag_bytes = journal_tag_bytes(journal); 206 int tag_bytes = journal_tag_bytes(journal);
207 207
208 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 208 if (jbd2_journal_has_csum_v2or3(journal))
209 size -= sizeof(struct jbd2_journal_block_tail); 209 size -= sizeof(struct jbd2_journal_block_tail);
210 210
211 tagp = &bh->b_data[sizeof(journal_header_t)]; 211 tagp = &bh->b_data[sizeof(journal_header_t)];
@@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal)
338 return err; 338 return err;
339} 339}
340 340
341static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) 341static inline unsigned long long read_tag_block(journal_t *journal,
342 journal_block_tag_t *tag)
342{ 343{
343 unsigned long long block = be32_to_cpu(tag->t_blocknr); 344 unsigned long long block = be32_to_cpu(tag->t_blocknr);
344 if (tag_bytes > JBD2_TAG_SIZE32) 345 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
345 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; 346 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
346 return block; 347 return block;
347} 348}
@@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
384 __be32 provided; 385 __be32 provided;
385 __u32 calculated; 386 __u32 calculated;
386 387
387 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 388 if (!jbd2_journal_has_csum_v2or3(j))
388 return 1; 389 return 1;
389 390
390 h = buf; 391 h = buf;
@@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 400static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
400 void *buf, __u32 sequence) 401 void *buf, __u32 sequence)
401{ 402{
403 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
402 __u32 csum32; 404 __u32 csum32;
403 __be32 seq; 405 __be32 seq;
404 406
405 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 407 if (!jbd2_journal_has_csum_v2or3(j))
406 return 1; 408 return 1;
407 409
408 seq = cpu_to_be32(sequence); 410 seq = cpu_to_be32(sequence);
409 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); 411 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); 412 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
411 413
412 return tag->t_checksum == cpu_to_be16(csum32); 414 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
415 return tag3->t_checksum == cpu_to_be32(csum32);
416 else
417 return tag->t_checksum == cpu_to_be16(csum32);
413} 418}
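
Verification must now compare at the right width: a v3 tag stores the whole crc32c, while a v2 tag keeps only its low 16 bits, so the computed value is folded before the compare. A small sketch of that branch (the crc constant is just an example):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    static bool verify(uint32_t computed, uint32_t stored, bool v3)
    {
        if (v3)
            return stored == computed;                 /* full 32-bit compare */
        return (uint16_t)stored == (uint16_t)computed; /* v2: low 16 bits only */
    }

    int main(void)
    {
        uint32_t crc = 0xdeadbeef;                     /* example crc32c value */
        printf("v2 match: %d\n", verify(crc, 0x0000beef, false)); /* 1 */
        printf("v3 miss:  %d\n", verify(crc, 0x0000beef, true));  /* 0 */
        printf("v3 match: %d\n", verify(crc, 0xdeadbeef, true));  /* 1 */
        return 0;
    }
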
414 419
415static int do_one_pass(journal_t *journal, 420static int do_one_pass(journal_t *journal,
@@ -426,6 +431,7 @@ static int do_one_pass(journal_t *journal,
426 int tag_bytes = journal_tag_bytes(journal); 431 int tag_bytes = journal_tag_bytes(journal);
427 __u32 crc32_sum = ~0; /* Transactional Checksums */ 432 __u32 crc32_sum = ~0; /* Transactional Checksums */
428 int descr_csum_size = 0; 433 int descr_csum_size = 0;
434 int block_error = 0;
429 435
430 /* 436 /*
431 * First thing is to establish what we expect to find in the log 437 * First thing is to establish what we expect to find in the log
@@ -512,8 +518,7 @@ static int do_one_pass(journal_t *journal,
512 switch(blocktype) { 518 switch(blocktype) {
513 case JBD2_DESCRIPTOR_BLOCK: 519 case JBD2_DESCRIPTOR_BLOCK:
514 /* Verify checksum first */ 520 /* Verify checksum first */
515 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 521 if (jbd2_journal_has_csum_v2or3(journal))
516 JBD2_FEATURE_INCOMPAT_CSUM_V2))
517 descr_csum_size = 522 descr_csum_size =
518 sizeof(struct jbd2_journal_block_tail); 523 sizeof(struct jbd2_journal_block_tail);
519 if (descr_csum_size > 0 && 524 if (descr_csum_size > 0 &&
@@ -574,7 +579,7 @@ static int do_one_pass(journal_t *journal,
574 unsigned long long blocknr; 579 unsigned long long blocknr;
575 580
576 J_ASSERT(obh != NULL); 581 J_ASSERT(obh != NULL);
577 blocknr = read_tag_block(tag_bytes, 582 blocknr = read_tag_block(journal,
578 tag); 583 tag);
579 584
580 /* If the block has been 585 /* If the block has been
@@ -598,7 +603,8 @@ static int do_one_pass(journal_t *journal,
598 "checksum recovering " 603 "checksum recovering "
599 "block %llu in log\n", 604 "block %llu in log\n",
600 blocknr); 605 blocknr);
601 continue; 606 block_error = 1;
607 goto skip_write;
602 } 608 }
603 609
604 /* Find a buffer for the new 610 /* Find a buffer for the new
@@ -797,7 +803,8 @@ static int do_one_pass(journal_t *journal,
797 success = -EIO; 803 success = -EIO;
798 } 804 }
799 } 805 }
800 806 if (block_error && success == 0)
807 success = -EIO;
801 return success; 808 return success;
802 809
803 failed: 810 failed:
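
Recovery used to `continue` past a block with a bad tag checksum and could still report overall success; the fix latches `block_error` and converts it to -EIO once the pass finishes, so every block is still examined but the failure is surfaced. The record-now-fail-later shape in isolation:

    #include <stdio.h>

    #define EIO 5

    static int bad(int i) { return i == 3; }   /* pretend block 3 is corrupt */

    static int scan(void)
    {
        int success = 0, block_error = 0;

        for (int i = 0; i < 8; i++) {
            if (bad(i)) {
                fprintf(stderr, "checksum error on block %d\n", i);
                block_error = 1;    /* remember, but keep scanning */
                continue;
            }
            /* ...replay block... */
        }
        if (block_error && success == 0)
            success = -EIO;         /* surface the failure at the end */
        return success;
    }

    int main(void) { printf("scan() = %d\n", scan()); return 0; }
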
@@ -811,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
811 __be32 provided; 818 __be32 provided;
812 __u32 calculated; 819 __u32 calculated;
813 820
814 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 821 if (!jbd2_journal_has_csum_v2or3(j))
815 return 1; 822 return 1;
816 823
817 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - 824 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 198c9c10276d..d5e95a175c92 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -91,8 +91,8 @@
91#include <linux/list.h> 91#include <linux/list.h>
92#include <linux/init.h> 92#include <linux/init.h>
93#include <linux/bio.h> 93#include <linux/bio.h>
94#endif
95#include <linux/log2.h> 94#include <linux/log2.h>
95#endif
96 96
97static struct kmem_cache *jbd2_revoke_record_cache; 97static struct kmem_cache *jbd2_revoke_record_cache;
98static struct kmem_cache *jbd2_revoke_table_cache; 98static struct kmem_cache *jbd2_revoke_table_cache;
@@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal,
597 offset = *offsetp; 597 offset = *offsetp;
598 598
599 /* Do we need to leave space at the end for a checksum? */ 599 /* Do we need to leave space at the end for a checksum? */
600 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 600 if (jbd2_journal_has_csum_v2or3(journal))
601 csum_size = sizeof(struct jbd2_journal_revoke_tail); 601 csum_size = sizeof(struct jbd2_journal_revoke_tail);
602 602
603 /* Make sure we have a descriptor with space left for the record */ 603 /* Make sure we have a descriptor with space left for the record */
@@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
644 struct jbd2_journal_revoke_tail *tail; 644 struct jbd2_journal_revoke_tail *tail;
645 __u32 csum; 645 __u32 csum;
646 646
647 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 647 if (!jbd2_journal_has_csum_v2or3(j))
648 return; 648 return;
649 649
650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - 650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ca58d64374ca..9b320cc2a8cf 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,7 @@
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o
9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs-$(CONFIG_PROC_FS) += procfs.o
10lockd-objs := $(lockd-objs-y) 11lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index daa8e7514eae..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
159 159
160 msg.rpc_proc = &clnt->cl_procinfo[proc]; 160 msg.rpc_proc = &clnt->cl_procinfo[proc];
161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); 161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
162 if (status == -ECONNREFUSED) {
163 dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n",
164 status);
165 rpc_force_rebind(clnt);
166 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
167 }
162 if (status < 0) 168 if (status < 0)
163 dprintk("lockd: NSM upcall RPC failed, status=%d\n", 169 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
164 status); 170 status);
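A restarted statd may re-register with rpcbind on a different port, so the first soft-connect call can hit a stale binding and fail with ECONNREFUSED; the hunk above forces a rebind and retries exactly once. The pattern in isolation, as a sketch (nsm_call_once is a hypothetical wrapper; clnt and msg are as in the hunk):

static int nsm_call_once(struct rpc_clnt *clnt, struct rpc_message *msg)
{
	int status = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);

	if (status == -ECONNREFUSED) {
		rpc_force_rebind(clnt);
		status = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN);
	}
	return status;
}
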
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5010b55628b4..097bfa3adb1c 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -11,7 +11,6 @@ struct lockd_net {
11 11
12 struct delayed_work grace_period_end; 12 struct delayed_work grace_period_end;
13 struct lock_manager lockd_manager; 13 struct lock_manager lockd_manager;
14 struct list_head grace_list;
15 14
16 spinlock_t nsm_clnt_lock; 15 spinlock_t nsm_clnt_lock;
17 unsigned int nsm_users; 16 unsigned int nsm_users;
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
new file mode 100644
index 000000000000..2a0a98480e39
--- /dev/null
+++ b/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
1/*
2 * Procfs support for lockd
3 *
4 * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
5 */
6
7#include <linux/fs.h>
8#include <linux/proc_fs.h>
9#include <linux/module.h>
10#include <linux/nsproxy.h>
11#include <net/net_namespace.h>
12
13#include "netns.h"
14#include "procfs.h"
15
16/*
17 * We only allow strings that start with 'Y', 'y', or '1'.
18 */
19static ssize_t
20nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
21 loff_t *pos)
22{
23 char *data;
24 struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
25 lockd_net_id);
26
27 if (size < 1)
28 return -EINVAL;
29
30 data = simple_transaction_get(file, buf, size);
31 if (IS_ERR(data))
32 return PTR_ERR(data);
33
34 switch (data[0]) {
35 case 'Y':
36 case 'y':
37 case '1':
38 locks_end_grace(&ln->lockd_manager);
39 break;
40 default:
41 return -EINVAL;
42 }
43
44 return size;
45}
46
47static ssize_t
48nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
49 loff_t *pos)
50{
51 struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
52 lockd_net_id);
53 char resp[3];
54
55 resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
56 resp[1] = '\n';
57 resp[2] = '\0';
58
59 return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
60}
61
62static const struct file_operations lockd_end_grace_operations = {
63 .write = nlm_end_grace_write,
64 .read = nlm_end_grace_read,
65 .llseek = default_llseek,
66 .release = simple_transaction_release,
67 .owner = THIS_MODULE,
68};
69
70int __init
71lockd_create_procfs(void)
72{
73 struct proc_dir_entry *entry;
74
75 entry = proc_mkdir("fs/lockd", NULL);
76 if (!entry)
77 return -ENOMEM;
78 entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
79 &lockd_end_grace_operations);
80 if (!entry) {
81 remove_proc_entry("fs/lockd", NULL);
82 return -ENOMEM;
83 }
84 return 0;
85}
86
87void __exit
88lockd_remove_procfs(void)
89{
90 remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
91 remove_proc_entry("fs/lockd", NULL);
92}
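The new file accepts strings starting with 'Y', 'y', or '1' to end the NLM grace period early, and reads back 'Y' or 'N' depending on whether the grace period has already ended. A minimal userspace sketch exercising it, with the path taken from lockd_create_procfs() above:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/fs/lockd/nlm_end_grace", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "Y", 1) != 1) {	/* end the grace period now */
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
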
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
new file mode 100644
index 000000000000..2257a1311027
--- /dev/null
+++ b/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
1/*
2 * Procfs support for lockd
3 *
4 * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
5 */
6#ifndef _LOCKD_PROCFS_H
7#define _LOCKD_PROCFS_H
8
9#include <linux/kconfig.h>
10
11#if IS_ENABLED(CONFIG_PROC_FS)
12int lockd_create_procfs(void);
13void lockd_remove_procfs(void);
14#else
15static inline int
16lockd_create_procfs(void)
17{
18 return 0;
19}
20
21static inline void
22lockd_remove_procfs(void)
23{
24 return;
25}
26#endif /* IS_ENABLED(CONFIG_PROC_FS) */
27
28#endif /* _LOCKD_PROCFS_H */
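The #else stubs are the usual idiom for optional kernel features: callers in svc.c invoke lockd_create_procfs() unconditionally, and the calls compile away when CONFIG_PROC_FS is off, with no #ifdef at the call site. The general form of the pattern, sketched with a hypothetical CONFIG_FOO:

#if IS_ENABLED(CONFIG_FOO)
int foo_init(void);		/* real implementation elsewhere */
#else
static inline int foo_init(void)
{
	return 0;		/* success; nothing to set up */
}
#endif
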
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 09857b48d0c3..d1bb7ecfd201 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -36,6 +36,7 @@
36#include <linux/nfs.h> 36#include <linux/nfs.h>
37 37
38#include "netns.h" 38#include "netns.h"
39#include "procfs.h"
39 40
40#define NLMDBG_FACILITY NLMDBG_SVC 41#define NLMDBG_FACILITY NLMDBG_SVC
41#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) 42#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE)
@@ -253,13 +254,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net)
253 254
254 error = make_socks(serv, net); 255 error = make_socks(serv, net);
255 if (error < 0) 256 if (error < 0)
256 goto err_socks; 257 goto err_bind;
257 set_grace_period(net); 258 set_grace_period(net);
258 dprintk("lockd_up_net: per-net data created; net=%p\n", net); 259 dprintk("lockd_up_net: per-net data created; net=%p\n", net);
259 return 0; 260 return 0;
260 261
261err_socks:
262 svc_rpcb_cleanup(serv, net);
263err_bind: 262err_bind:
264 ln->nlmsvc_users--; 263 ln->nlmsvc_users--;
265 return error; 264 return error;
@@ -586,7 +585,7 @@ static int lockd_init_net(struct net *net)
586 struct lockd_net *ln = net_generic(net, lockd_net_id); 585 struct lockd_net *ln = net_generic(net, lockd_net_id);
587 586
588 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); 587 INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
589 INIT_LIST_HEAD(&ln->grace_list); 588 INIT_LIST_HEAD(&ln->lockd_manager.list);
590 spin_lock_init(&ln->nsm_clnt_lock); 589 spin_lock_init(&ln->nsm_clnt_lock);
591 return 0; 590 return 0;
592} 591}
@@ -620,8 +619,15 @@ static int __init init_nlm(void)
620 err = register_pernet_subsys(&lockd_net_ops); 619 err = register_pernet_subsys(&lockd_net_ops);
621 if (err) 620 if (err)
622 goto err_pernet; 621 goto err_pernet;
622
623 err = lockd_create_procfs();
624 if (err)
625 goto err_procfs;
626
623 return 0; 627 return 0;
624 628
629err_procfs:
630 unregister_pernet_subsys(&lockd_net_ops);
625err_pernet: 631err_pernet:
626#ifdef CONFIG_SYSCTL 632#ifdef CONFIG_SYSCTL
627 unregister_sysctl_table(nlm_sysctl_table); 633 unregister_sysctl_table(nlm_sysctl_table);
@@ -634,6 +640,7 @@ static void __exit exit_nlm(void)
634{ 640{
635 /* FIXME: delete all NLM clients */ 641 /* FIXME: delete all NLM clients */
636 nlm_shutdown_hosts(); 642 nlm_shutdown_hosts();
643 lockd_remove_procfs();
637 unregister_pernet_subsys(&lockd_net_ops); 644 unregister_pernet_subsys(&lockd_net_ops);
638#ifdef CONFIG_SYSCTL 645#ifdef CONFIG_SYSCTL
639 unregister_sysctl_table(nlm_sysctl_table); 646 unregister_sysctl_table(nlm_sysctl_table);
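Note that the error unwind in init_nlm() remains a strict mirror of the setup order: the new lockd_create_procfs() step gets its own label, and a failure there unregisters only what was already registered. The general shape, sketched with hypothetical names:

static int __init init_example(void)
{
	int err;

	err = register_pernet_subsys(&example_net_ops);	/* step 1 */
	if (err)
		goto err_pernet;
	err = example_create_procfs();			/* step 2, new */
	if (err)
		goto err_procfs;
	return 0;

err_procfs:
	unregister_pernet_subsys(&example_net_ops);	/* undo step 1 */
err_pernet:
	return err;
}
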
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f9ed622274f..3e79220babac 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -28,6 +28,7 @@
28#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/cleancache.h> 30#include <linux/cleancache.h>
31#include "internal.h"
31 32
32/* 33/*
33 * I/O completion handler for multipage BIOs. 34 * I/O completion handler for multipage BIOs.
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err)
57static struct bio *mpage_bio_submit(int rw, struct bio *bio) 58static struct bio *mpage_bio_submit(int rw, struct bio *bio)
58{ 59{
59 bio->bi_end_io = mpage_end_io; 60 bio->bi_end_io = mpage_end_io;
61 guard_bio_eod(rw, bio);
60 submit_bio(rw, bio); 62 submit_bio(rw, bio);
61 return NULL; 63 return NULL;
62} 64}
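guard_bio_eod() clamps a bio so that I/O never runs past the last sector of the underlying device (the helper itself is factored out in fs/buffer.c within this series, hence the new "internal.h" include). Roughly, as an illustrative sketch over the bvec/iter fields used elsewhere in this diff:

static void clamp_bio_to_eod(struct bio *bio)
{
	sector_t maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
	unsigned int truncated;

	if (!maxsector || bio->bi_iter.bi_sector >= maxsector)
		return;		/* size unknown, or nothing salvageable */
	maxsector -= bio->bi_iter.bi_sector;
	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
		return;		/* fits entirely on the device */
	/* trim the overhang off the final segment */
	truncated = bio->bi_iter.bi_size - (unsigned int)(maxsector << 9);
	bio->bi_iter.bi_size -= truncated;
	bio->bi_io_vec[bio->bi_vcnt - 1].bv_len -= truncated;
}
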
diff --git a/fs/namei.c b/fs/namei.c
index a996bb48dfab..a7b05bf82d31 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -34,6 +34,7 @@
34#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h> 35#include <linux/fs_struct.h>
36#include <linux/posix_acl.h> 36#include <linux/posix_acl.h>
37#include <linux/hash.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38 39
39#include "internal.h" 40#include "internal.h"
@@ -643,24 +644,22 @@ static int complete_walk(struct nameidata *nd)
643 644
644static __always_inline void set_root(struct nameidata *nd) 645static __always_inline void set_root(struct nameidata *nd)
645{ 646{
646 if (!nd->root.mnt) 647 get_fs_root(current->fs, &nd->root);
647 get_fs_root(current->fs, &nd->root);
648} 648}
649 649
650static int link_path_walk(const char *, struct nameidata *); 650static int link_path_walk(const char *, struct nameidata *);
651 651
652static __always_inline void set_root_rcu(struct nameidata *nd) 652static __always_inline unsigned set_root_rcu(struct nameidata *nd)
653{ 653{
654 if (!nd->root.mnt) { 654 struct fs_struct *fs = current->fs;
655 struct fs_struct *fs = current->fs; 655 unsigned seq, res;
656 unsigned seq;
657 656
658 do { 657 do {
659 seq = read_seqcount_begin(&fs->seq); 658 seq = read_seqcount_begin(&fs->seq);
660 nd->root = fs->root; 659 nd->root = fs->root;
661 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); 660 res = __read_seqcount_begin(&nd->root.dentry->d_seq);
662 } while (read_seqcount_retry(&fs->seq, seq)); 661 } while (read_seqcount_retry(&fs->seq, seq));
663 } 662 return res;
664} 663}
665 664
666static void path_put_conditional(struct path *path, struct nameidata *nd) 665static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -860,7 +859,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
860 return PTR_ERR(s); 859 return PTR_ERR(s);
861 } 860 }
862 if (*s == '/') { 861 if (*s == '/') {
863 set_root(nd); 862 if (!nd->root.mnt)
863 set_root(nd);
864 path_put(&nd->path); 864 path_put(&nd->path);
865 nd->path = nd->root; 865 nd->path = nd->root;
866 path_get(&nd->root); 866 path_get(&nd->root);
@@ -1137,13 +1137,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1137 */ 1137 */
1138 *inode = path->dentry->d_inode; 1138 *inode = path->dentry->d_inode;
1139 } 1139 }
1140 return read_seqretry(&mount_lock, nd->m_seq) && 1140 return !read_seqretry(&mount_lock, nd->m_seq) &&
1141 !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); 1141 !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1142} 1142}
1143 1143
1144static int follow_dotdot_rcu(struct nameidata *nd) 1144static int follow_dotdot_rcu(struct nameidata *nd)
1145{ 1145{
1146 set_root_rcu(nd); 1146 struct inode *inode = nd->inode;
1147 if (!nd->root.mnt)
1148 set_root_rcu(nd);
1147 1149
1148 while (1) { 1150 while (1) {
1149 if (nd->path.dentry == nd->root.dentry && 1151 if (nd->path.dentry == nd->root.dentry &&
@@ -1155,6 +1157,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1155 struct dentry *parent = old->d_parent; 1157 struct dentry *parent = old->d_parent;
1156 unsigned seq; 1158 unsigned seq;
1157 1159
1160 inode = parent->d_inode;
1158 seq = read_seqcount_begin(&parent->d_seq); 1161 seq = read_seqcount_begin(&parent->d_seq);
1159 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1162 if (read_seqcount_retry(&old->d_seq, nd->seq))
1160 goto failed; 1163 goto failed;
@@ -1164,6 +1167,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1164 } 1167 }
1165 if (!follow_up_rcu(&nd->path)) 1168 if (!follow_up_rcu(&nd->path))
1166 break; 1169 break;
1170 inode = nd->path.dentry->d_inode;
1167 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1171 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1168 } 1172 }
1169 while (d_mountpoint(nd->path.dentry)) { 1173 while (d_mountpoint(nd->path.dentry)) {
@@ -1173,11 +1177,12 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1173 break; 1177 break;
1174 nd->path.mnt = &mounted->mnt; 1178 nd->path.mnt = &mounted->mnt;
1175 nd->path.dentry = mounted->mnt.mnt_root; 1179 nd->path.dentry = mounted->mnt.mnt_root;
1180 inode = nd->path.dentry->d_inode;
1176 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1181 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1177 if (!read_seqretry(&mount_lock, nd->m_seq)) 1182 if (read_seqretry(&mount_lock, nd->m_seq))
1178 goto failed; 1183 goto failed;
1179 } 1184 }
1180 nd->inode = nd->path.dentry->d_inode; 1185 nd->inode = inode;
1181 return 0; 1186 return 0;
1182 1187
1183failed: 1188failed:
@@ -1256,7 +1261,8 @@ static void follow_mount(struct path *path)
1256 1261
1257static void follow_dotdot(struct nameidata *nd) 1262static void follow_dotdot(struct nameidata *nd)
1258{ 1263{
1259 set_root(nd); 1264 if (!nd->root.mnt)
1265 set_root(nd);
1260 1266
1261 while(1) { 1267 while(1) {
1262 struct dentry *old = nd->path.dentry; 1268 struct dentry *old = nd->path.dentry;
@@ -1634,8 +1640,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
1634 1640
1635static inline unsigned int fold_hash(unsigned long hash) 1641static inline unsigned int fold_hash(unsigned long hash)
1636{ 1642{
1637 hash += hash >> (8*sizeof(int)); 1643 return hash_64(hash, 32);
1638 return hash;
1639} 1644}
1640 1645
1641#else /* 32-bit case */ 1646#else /* 32-bit case */
@@ -1669,9 +1674,9 @@ EXPORT_SYMBOL(full_name_hash);
1669 1674
1670/* 1675/*
1671 * Calculate the length and hash of the path component, and 1676 * Calculate the length and hash of the path component, and
1672 * return the length of the component; 1677 * return the "hash_len" as the result.
1673 */ 1678 */
1674static inline unsigned long hash_name(const char *name, unsigned int *hashp) 1679static inline u64 hash_name(const char *name)
1675{ 1680{
1676 unsigned long a, b, adata, bdata, mask, hash, len; 1681 unsigned long a, b, adata, bdata, mask, hash, len;
1677 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; 1682 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
@@ -1691,9 +1696,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1691 mask = create_zero_mask(adata | bdata); 1696 mask = create_zero_mask(adata | bdata);
1692 1697
1693 hash += a & zero_bytemask(mask); 1698 hash += a & zero_bytemask(mask);
1694 *hashp = fold_hash(hash); 1699 len += find_zero(mask);
1695 1700 return hashlen_create(fold_hash(hash), len);
1696 return len + find_zero(mask);
1697} 1701}
1698 1702
1699#else 1703#else
@@ -1711,7 +1715,7 @@ EXPORT_SYMBOL(full_name_hash);
1711 * We know there's a real path component here of at least 1715 * We know there's a real path component here of at least
1712 * one character. 1716 * one character.
1713 */ 1717 */
1714static inline unsigned long hash_name(const char *name, unsigned int *hashp) 1718static inline u64 hash_name(const char *name)
1715{ 1719{
1716 unsigned long hash = init_name_hash(); 1720 unsigned long hash = init_name_hash();
1717 unsigned long len = 0, c; 1721 unsigned long len = 0, c;
@@ -1722,8 +1726,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1722 hash = partial_name_hash(c, hash); 1726 hash = partial_name_hash(c, hash);
1723 c = (unsigned char)name[len]; 1727 c = (unsigned char)name[len];
1724 } while (c && c != '/'); 1728 } while (c && c != '/');
1725 *hashp = end_name_hash(hash); 1729 return hashlen_create(end_name_hash(hash), len);
1726 return len;
1727} 1730}
1728 1731
1729#endif 1732#endif
@@ -1748,20 +1751,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1748 1751
1749 /* At this point we know we have a real path component. */ 1752 /* At this point we know we have a real path component. */
1750 for(;;) { 1753 for(;;) {
1751 struct qstr this; 1754 u64 hash_len;
1752 long len;
1753 int type; 1755 int type;
1754 1756
1755 err = may_lookup(nd); 1757 err = may_lookup(nd);
1756 if (err) 1758 if (err)
1757 break; 1759 break;
1758 1760
1759 len = hash_name(name, &this.hash); 1761 hash_len = hash_name(name);
1760 this.name = name;
1761 this.len = len;
1762 1762
1763 type = LAST_NORM; 1763 type = LAST_NORM;
1764 if (name[0] == '.') switch (len) { 1764 if (name[0] == '.') switch (hashlen_len(hash_len)) {
1765 case 2: 1765 case 2:
1766 if (name[1] == '.') { 1766 if (name[1] == '.') {
1767 type = LAST_DOTDOT; 1767 type = LAST_DOTDOT;
@@ -1775,29 +1775,32 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1775 struct dentry *parent = nd->path.dentry; 1775 struct dentry *parent = nd->path.dentry;
1776 nd->flags &= ~LOOKUP_JUMPED; 1776 nd->flags &= ~LOOKUP_JUMPED;
1777 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1777 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1778 struct qstr this = { { .hash_len = hash_len }, .name = name };
1778 err = parent->d_op->d_hash(parent, &this); 1779 err = parent->d_op->d_hash(parent, &this);
1779 if (err < 0) 1780 if (err < 0)
1780 break; 1781 break;
1782 hash_len = this.hash_len;
1783 name = this.name;
1781 } 1784 }
1782 } 1785 }
1783 1786
1784 nd->last = this; 1787 nd->last.hash_len = hash_len;
1788 nd->last.name = name;
1785 nd->last_type = type; 1789 nd->last_type = type;
1786 1790
1787 if (!name[len]) 1791 name += hashlen_len(hash_len);
1792 if (!*name)
1788 return 0; 1793 return 0;
1789 /* 1794 /*
1790 * If it wasn't NUL, we know it was '/'. Skip that 1795 * If it wasn't NUL, we know it was '/'. Skip that
1791 * slash, and continue until no more slashes. 1796 * slash, and continue until no more slashes.
1792 */ 1797 */
1793 do { 1798 do {
1794 len++; 1799 name++;
1795 } while (unlikely(name[len] == '/')); 1800 } while (unlikely(*name == '/'));
1796 if (!name[len]) 1801 if (!*name)
1797 return 0; 1802 return 0;
1798 1803
1799 name += len;
1800
1801 err = walk_component(nd, &next, LOOKUP_FOLLOW); 1804 err = walk_component(nd, &next, LOOKUP_FOLLOW);
1802 if (err < 0) 1805 if (err < 0)
1803 return err; 1806 return err;
@@ -1852,7 +1855,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1852 if (*name=='/') { 1855 if (*name=='/') {
1853 if (flags & LOOKUP_RCU) { 1856 if (flags & LOOKUP_RCU) {
1854 rcu_read_lock(); 1857 rcu_read_lock();
1855 set_root_rcu(nd); 1858 nd->seq = set_root_rcu(nd);
1856 } else { 1859 } else {
1857 set_root(nd); 1860 set_root(nd);
1858 path_get(&nd->root); 1861 path_get(&nd->root);
@@ -1903,7 +1906,14 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1903 } 1906 }
1904 1907
1905 nd->inode = nd->path.dentry->d_inode; 1908 nd->inode = nd->path.dentry->d_inode;
1906 return 0; 1909 if (!(flags & LOOKUP_RCU))
1910 return 0;
1911 if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
1912 return 0;
1913 if (!(nd->flags & LOOKUP_ROOT))
1914 nd->root.mnt = NULL;
1915 rcu_read_unlock();
1916 return -ECHILD;
1907} 1917}
1908 1918
1909static inline int lookup_last(struct nameidata *nd, struct path *path) 1919static inline int lookup_last(struct nameidata *nd, struct path *path)
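The payoff of returning a u64 from hash_name() is that the component's hash and length travel together in one register, and nd->last.hash_len is stored with a single write. The packing macros used above (hashlen_create, hashlen_len) live alongside struct qstr in include/linux/dcache.h; their shape, as a sketch:

#define hashlen_hash(hashlen)		((u32)(hashlen))
#define hashlen_len(hashlen)		((u32)((hashlen) >> 32))
#define hashlen_create(hash, len)	(((u64)(len) << 32) | (u32)(hash))
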
diff --git a/fs/namespace.c b/fs/namespace.c
index a01c7730e9af..ef42d9bee212 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1217,6 +1217,11 @@ static void namespace_unlock(void)
1217 head.first->pprev = &head.first; 1217 head.first->pprev = &head.first;
1218 INIT_HLIST_HEAD(&unmounted); 1218 INIT_HLIST_HEAD(&unmounted);
1219 1219
1220 /* undo decrements we'd done in umount_tree() */
1221 hlist_for_each_entry(mnt, &head, mnt_hash)
1222 if (mnt->mnt_ex_mountpoint.mnt)
1223 mntget(mnt->mnt_ex_mountpoint.mnt);
1224
1220 up_write(&namespace_sem); 1225 up_write(&namespace_sem);
1221 1226
1222 synchronize_rcu(); 1227 synchronize_rcu();
@@ -1253,6 +1258,9 @@ void umount_tree(struct mount *mnt, int how)
1253 hlist_add_head(&p->mnt_hash, &tmp_list); 1258 hlist_add_head(&p->mnt_hash, &tmp_list);
1254 } 1259 }
1255 1260
1261 hlist_for_each_entry(p, &tmp_list, mnt_hash)
1262 list_del_init(&p->mnt_child);
1263
1256 if (how) 1264 if (how)
1257 propagate_umount(&tmp_list); 1265 propagate_umount(&tmp_list);
1258 1266
@@ -1263,9 +1271,9 @@ void umount_tree(struct mount *mnt, int how)
1263 p->mnt_ns = NULL; 1271 p->mnt_ns = NULL;
1264 if (how < 2) 1272 if (how < 2)
1265 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1273 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1266 list_del_init(&p->mnt_child);
1267 if (mnt_has_parent(p)) { 1274 if (mnt_has_parent(p)) {
1268 put_mountpoint(p->mnt_mp); 1275 put_mountpoint(p->mnt_mp);
1276 mnt_add_count(p->mnt_parent, -1);
1269 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1277 /* move the reference to mountpoint into ->mnt_ex_mountpoint */
1270 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1278 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
1271 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; 1279 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
2# Makefile for the pNFS block layout driver kernel module 2# Makefile for the pNFS block layout driver kernel module
3# 3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o 4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o 5
6blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index cbb1797149d5..5228f201d3d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/namei.h> 36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */ 37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h> 38#include <linux/prefetch.h>
40#include <linux/pagevec.h> 39#include <linux/pagevec.h>
41 40
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); 49MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
51MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); 50MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
52 51
53static void print_page(struct page *page) 52static bool is_hole(struct pnfs_block_extent *be)
54{ 53{
55 dprintk("PRINTPAGE page %p\n", page); 54 switch (be->be_state) {
56 dprintk(" PagePrivate %d\n", PagePrivate(page)); 55 case PNFS_BLOCK_NONE_DATA:
57 dprintk(" PageUptodate %d\n", PageUptodate(page)); 56 return true;
58 dprintk(" PageError %d\n", PageError(page)); 57 case PNFS_BLOCK_INVALID_DATA:
59 dprintk(" PageDirty %d\n", PageDirty(page)); 58 return be->be_tag ? false : true;
60 dprintk(" PageReferenced %d\n", PageReferenced(page)); 59 default:
61 dprintk(" PageLocked %d\n", PageLocked(page)); 60 return false;
62 dprintk(" PageWriteback %d\n", PageWriteback(page)); 61 }
63 dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
64 dprintk("\n");
65}
66
67/* Given the be associated with isect, determine if page data needs to be
68 * initialized.
69 */
70static int is_hole(struct pnfs_block_extent *be, sector_t isect)
71{
72 if (be->be_state == PNFS_BLOCK_NONE_DATA)
73 return 1;
74 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
75 return 0;
76 else
77 return !bl_is_sector_init(be->be_inval, isect);
78}
79
80/* Given the be associated with isect, determine if page data can be
81 * written to disk.
82 */
83static int is_writable(struct pnfs_block_extent *be, sector_t isect)
84{
85 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
86 be->be_state == PNFS_BLOCK_INVALID_DATA);
87} 62}
88 63
89/* The data we are handed might be spread across several bios. We need 64/* The data we are handed might be spread across several bios. We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
91 */ 66 */
92struct parallel_io { 67struct parallel_io {
93 struct kref refcnt; 68 struct kref refcnt;
94 void (*pnfs_callback) (void *data, int num_se); 69 void (*pnfs_callback) (void *data);
95 void *data; 70 void *data;
96 int bse_count;
97}; 71};
98 72
99static inline struct parallel_io *alloc_parallel(void *data) 73static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
104 if (rv) { 78 if (rv) {
105 rv->data = data; 79 rv->data = data;
106 kref_init(&rv->refcnt); 80 kref_init(&rv->refcnt);
107 rv->bse_count = 0;
108 } 81 }
109 return rv; 82 return rv;
110} 83}
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
119 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 92 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
120 93
121 dprintk("%s enter\n", __func__); 94 dprintk("%s enter\n", __func__);
122 p->pnfs_callback(p->data, p->bse_count); 95 p->pnfs_callback(p->data);
123 kfree(p); 96 kfree(p);
124} 97}
125 98
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
141 return NULL; 114 return NULL;
142} 115}
143 116
144static struct bio *bl_alloc_init_bio(int npg, sector_t isect, 117static struct bio *
145 struct pnfs_block_extent *be, 118bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
146 void (*end_io)(struct bio *, int err), 119 void (*end_io)(struct bio *, int err), struct parallel_io *par)
147 struct parallel_io *par)
148{ 120{
149 struct bio *bio; 121 struct bio *bio;
150 122
@@ -156,58 +128,64 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
156 } 128 }
157 129
158 if (bio) { 130 if (bio) {
159 bio->bi_iter.bi_sector = isect - be->be_f_offset + 131 bio->bi_iter.bi_sector = disk_sector;
160 be->be_v_offset; 132 bio->bi_bdev = bdev;
161 bio->bi_bdev = be->be_mdev;
162 bio->bi_end_io = end_io; 133 bio->bi_end_io = end_io;
163 bio->bi_private = par; 134 bio->bi_private = par;
164 } 135 }
165 return bio; 136 return bio;
166} 137}
167 138
168static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, 139static struct bio *
169 sector_t isect, struct page *page, 140do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
170 struct pnfs_block_extent *be, 141 struct page *page, struct pnfs_block_dev_map *map,
171 void (*end_io)(struct bio *, int err), 142 struct pnfs_block_extent *be,
172 struct parallel_io *par, 143 void (*end_io)(struct bio *, int err),
173 unsigned int offset, int len) 144 struct parallel_io *par, unsigned int offset, int *len)
174{ 145{
175 isect = isect + (offset >> SECTOR_SHIFT); 146 struct pnfs_block_dev *dev =
147 container_of(be->be_device, struct pnfs_block_dev, node);
148 u64 disk_addr, end;
149
176 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, 150 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
177 npg, rw, (unsigned long long)isect, offset, len); 151 npg, rw, (unsigned long long)isect, offset, *len);
152
153 /* translate to device offset */
154 isect += be->be_v_offset;
155 isect -= be->be_f_offset;
156
157 /* translate to physical disk offset */
158 disk_addr = (u64)isect << SECTOR_SHIFT;
159 if (disk_addr < map->start || disk_addr >= map->start + map->len) {
160 if (!dev->map(dev, disk_addr, map))
161 return ERR_PTR(-EIO);
162 bio = bl_submit_bio(rw, bio);
163 }
164 disk_addr += map->disk_offset;
165 disk_addr -= map->start;
166
167 /* limit length to what the device mapping allows */
168 end = disk_addr + *len;
169 if (end >= map->start + map->len)
170 *len = map->start + map->len - disk_addr;
171
178retry: 172retry:
179 if (!bio) { 173 if (!bio) {
180 bio = bl_alloc_init_bio(npg, isect, be, end_io, par); 174 bio = bl_alloc_init_bio(npg, map->bdev,
175 disk_addr >> SECTOR_SHIFT, end_io, par);
181 if (!bio) 176 if (!bio)
182 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
183 } 178 }
184 if (bio_add_page(bio, page, len, offset) < len) { 179 if (bio_add_page(bio, page, *len, offset) < *len) {
185 bio = bl_submit_bio(rw, bio); 180 bio = bl_submit_bio(rw, bio);
186 goto retry; 181 goto retry;
187 } 182 }
188 return bio; 183 return bio;
189} 184}
190 185
191static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
192 sector_t isect, struct page *page,
193 struct pnfs_block_extent *be,
194 void (*end_io)(struct bio *, int err),
195 struct parallel_io *par)
196{
197 return do_add_page_to_bio(bio, npg, rw, isect, page, be,
198 end_io, par, 0, PAGE_CACHE_SIZE);
199}
200
201/* This is basically copied from mpage_end_io_read */
202static void bl_end_io_read(struct bio *bio, int err) 186static void bl_end_io_read(struct bio *bio, int err)
203{ 187{
204 struct parallel_io *par = bio->bi_private; 188 struct parallel_io *par = bio->bi_private;
205 struct bio_vec *bvec;
206 int i;
207
208 if (!err)
209 bio_for_each_segment_all(bvec, bio, i)
210 SetPageUptodate(bvec->bv_page);
211 189
212 if (err) { 190 if (err) {
213 struct nfs_pgio_header *header = par->data; 191 struct nfs_pgio_header *header = par->data;
@@ -216,6 +194,7 @@ static void bl_end_io_read(struct bio *bio, int err)
216 header->pnfs_error = -EIO; 194 header->pnfs_error = -EIO;
217 pnfs_set_lo_fail(header->lseg); 195 pnfs_set_lo_fail(header->lseg);
218 } 196 }
197
219 bio_put(bio); 198 bio_put(bio);
220 put_parallel(par); 199 put_parallel(par);
221} 200}
@@ -231,7 +210,7 @@ static void bl_read_cleanup(struct work_struct *work)
231} 210}
232 211
233static void 212static void
234bl_end_par_io_read(void *data, int unused) 213bl_end_par_io_read(void *data)
235{ 214{
236 struct nfs_pgio_header *hdr = data; 215 struct nfs_pgio_header *hdr = data;
237 216
@@ -241,88 +220,78 @@ bl_end_par_io_read(void *data, int unused)
241} 220}
242 221
243static enum pnfs_try_status 222static enum pnfs_try_status
244bl_read_pagelist(struct nfs_pgio_header *hdr) 223bl_read_pagelist(struct nfs_pgio_header *header)
245{ 224{
246 struct nfs_pgio_header *header = hdr; 225 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
247 int i, hole; 226 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
248 struct bio *bio = NULL; 227 struct bio *bio = NULL;
249 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 228 struct pnfs_block_extent be;
250 sector_t isect, extent_length = 0; 229 sector_t isect, extent_length = 0;
251 struct parallel_io *par; 230 struct parallel_io *par;
252 loff_t f_offset = hdr->args.offset; 231 loff_t f_offset = header->args.offset;
253 size_t bytes_left = hdr->args.count; 232 size_t bytes_left = header->args.count;
254 unsigned int pg_offset, pg_len; 233 unsigned int pg_offset, pg_len;
255 struct page **pages = hdr->args.pages; 234 struct page **pages = header->args.pages;
256 int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT; 235 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
257 const bool is_dio = (header->dreq != NULL); 236 const bool is_dio = (header->dreq != NULL);
237 struct blk_plug plug;
238 int i;
258 239
259 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, 240 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
260 hdr->page_array.npages, f_offset, 241 header->page_array.npages, f_offset,
261 (unsigned int)hdr->args.count); 242 (unsigned int)header->args.count);
262 243
263 par = alloc_parallel(hdr); 244 par = alloc_parallel(header);
264 if (!par) 245 if (!par)
265 goto use_mds; 246 return PNFS_NOT_ATTEMPTED;
266 par->pnfs_callback = bl_end_par_io_read; 247 par->pnfs_callback = bl_end_par_io_read;
267 /* At this point, we can no longer jump to use_mds */ 248
249 blk_start_plug(&plug);
268 250
269 isect = (sector_t) (f_offset >> SECTOR_SHIFT); 251 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
270 /* Code assumes extents are page-aligned */ 252 /* Code assumes extents are page-aligned */
271 for (i = pg_index; i < hdr->page_array.npages; i++) { 253 for (i = pg_index; i < header->page_array.npages; i++) {
272 if (!extent_length) { 254 if (extent_length <= 0) {
273 /* We've used up the previous extent */ 255 /* We've used up the previous extent */
274 bl_put_extent(be);
275 bl_put_extent(cow_read);
276 bio = bl_submit_bio(READ, bio); 256 bio = bl_submit_bio(READ, bio);
257
277 /* Get the next one */ 258 /* Get the next one */
278 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 259 if (!ext_tree_lookup(bl, isect, &be, false)) {
279 isect, &cow_read);
280 if (!be) {
281 header->pnfs_error = -EIO; 260 header->pnfs_error = -EIO;
282 goto out; 261 goto out;
283 } 262 }
284 extent_length = be->be_length - 263 extent_length = be.be_length - (isect - be.be_f_offset);
285 (isect - be->be_f_offset);
286 if (cow_read) {
287 sector_t cow_length = cow_read->be_length -
288 (isect - cow_read->be_f_offset);
289 extent_length = min(extent_length, cow_length);
290 }
291 } 264 }
292 265
266 pg_offset = f_offset & ~PAGE_CACHE_MASK;
293 if (is_dio) { 267 if (is_dio) {
294 pg_offset = f_offset & ~PAGE_CACHE_MASK;
295 if (pg_offset + bytes_left > PAGE_CACHE_SIZE) 268 if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
296 pg_len = PAGE_CACHE_SIZE - pg_offset; 269 pg_len = PAGE_CACHE_SIZE - pg_offset;
297 else 270 else
298 pg_len = bytes_left; 271 pg_len = bytes_left;
299
300 f_offset += pg_len;
301 bytes_left -= pg_len;
302 isect += (pg_offset >> SECTOR_SHIFT);
303 } else { 272 } else {
304 pg_offset = 0; 273 BUG_ON(pg_offset != 0);
305 pg_len = PAGE_CACHE_SIZE; 274 pg_len = PAGE_CACHE_SIZE;
306 } 275 }
307 276
308 hole = is_hole(be, isect); 277 isect += (pg_offset >> SECTOR_SHIFT);
309 if (hole && !cow_read) { 278 extent_length -= (pg_offset >> SECTOR_SHIFT);
279
280 if (is_hole(&be)) {
310 bio = bl_submit_bio(READ, bio); 281 bio = bl_submit_bio(READ, bio);
311 /* Fill hole w/ zeroes w/o accessing device */ 282 /* Fill hole w/ zeroes w/o accessing device */
312 dprintk("%s Zeroing page for hole\n", __func__); 283 dprintk("%s Zeroing page for hole\n", __func__);
313 zero_user_segment(pages[i], pg_offset, pg_len); 284 zero_user_segment(pages[i], pg_offset, pg_len);
314 print_page(pages[i]);
315 SetPageUptodate(pages[i]);
316 } else {
317 struct pnfs_block_extent *be_read;
318 285
319 be_read = (hole && cow_read) ? cow_read : be; 286 /* invalidate map */
287 map.start = NFS4_MAX_UINT64;
288 } else {
320 bio = do_add_page_to_bio(bio, 289 bio = do_add_page_to_bio(bio,
321 hdr->page_array.npages - i, 290 header->page_array.npages - i,
322 READ, 291 READ,
323 isect, pages[i], be_read, 292 isect, pages[i], &map, &be,
324 bl_end_io_read, par, 293 bl_end_io_read, par,
325 pg_offset, pg_len); 294 pg_offset, &pg_len);
326 if (IS_ERR(bio)) { 295 if (IS_ERR(bio)) {
327 header->pnfs_error = PTR_ERR(bio); 296 header->pnfs_error = PTR_ERR(bio);
328 bio = NULL; 297 bio = NULL;
@@ -330,75 +299,21 @@ bl_read_pagelist(struct nfs_pgio_header *hdr)
330 } 299 }
331 } 300 }
332 isect += (pg_len >> SECTOR_SHIFT); 301 isect += (pg_len >> SECTOR_SHIFT);
333 extent_length -= PAGE_CACHE_SECTORS; 302 extent_length -= (pg_len >> SECTOR_SHIFT);
303 f_offset += pg_len;
304 bytes_left -= pg_len;
334 } 305 }
335 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { 306 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
336 hdr->res.eof = 1; 307 header->res.eof = 1;
337 hdr->res.count = header->inode->i_size - hdr->args.offset; 308 header->res.count = header->inode->i_size - header->args.offset;
338 } else { 309 } else {
339 hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset; 310 header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
340 } 311 }
341out: 312out:
342 bl_put_extent(be);
343 bl_put_extent(cow_read);
344 bl_submit_bio(READ, bio); 313 bl_submit_bio(READ, bio);
314 blk_finish_plug(&plug);
345 put_parallel(par); 315 put_parallel(par);
346 return PNFS_ATTEMPTED; 316 return PNFS_ATTEMPTED;
347
348 use_mds:
349 dprintk("Giving up and using normal NFS\n");
350 return PNFS_NOT_ATTEMPTED;
351}
352
353static void mark_extents_written(struct pnfs_block_layout *bl,
354 __u64 offset, __u32 count)
355{
356 sector_t isect, end;
357 struct pnfs_block_extent *be;
358 struct pnfs_block_short_extent *se;
359
360 dprintk("%s(%llu, %u)\n", __func__, offset, count);
361 if (count == 0)
362 return;
363 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
364 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
365 end >>= SECTOR_SHIFT;
366 while (isect < end) {
367 sector_t len;
368 be = bl_find_get_extent(bl, isect, NULL);
369 BUG_ON(!be); /* FIXME */
370 len = min(end, be->be_f_offset + be->be_length) - isect;
371 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
372 se = bl_pop_one_short_extent(be->be_inval);
373 BUG_ON(!se);
374 bl_mark_for_commit(be, isect, len, se);
375 }
376 isect += len;
377 bl_put_extent(be);
378 }
379}
380
381static void bl_end_io_write_zero(struct bio *bio, int err)
382{
383 struct parallel_io *par = bio->bi_private;
384 struct bio_vec *bvec;
385 int i;
386
387 bio_for_each_segment_all(bvec, bio, i) {
388 /* This is the zeroing page we added */
389 end_page_writeback(bvec->bv_page);
390 page_cache_release(bvec->bv_page);
391 }
392
393 if (unlikely(err)) {
394 struct nfs_pgio_header *header = par->data;
395
396 if (!header->pnfs_error)
397 header->pnfs_error = -EIO;
398 pnfs_set_lo_fail(header->lseg);
399 }
400 bio_put(bio);
401 put_parallel(par);
402} 317}
403 318
404static void bl_end_io_write(struct bio *bio, int err) 319static void bl_end_io_write(struct bio *bio, int err)
@@ -421,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
421 */ 336 */
422static void bl_write_cleanup(struct work_struct *work) 337static void bl_write_cleanup(struct work_struct *work)
423{ 338{
424 struct rpc_task *task; 339 struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
425 struct nfs_pgio_header *hdr; 340 struct nfs_pgio_header *hdr =
341 container_of(task, struct nfs_pgio_header, task);
342
426 dprintk("%s enter\n", __func__); 343 dprintk("%s enter\n", __func__);
427 task = container_of(work, struct rpc_task, u.tk_work); 344
428 hdr = container_of(task, struct nfs_pgio_header, task);
429 if (likely(!hdr->pnfs_error)) { 345 if (likely(!hdr->pnfs_error)) {
430 /* Marks for LAYOUTCOMMIT */ 346 struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
431 mark_extents_written(BLK_LSEG2EXT(hdr->lseg), 347 u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
432 hdr->args.offset, hdr->args.count); 348 u64 end = (hdr->args.offset + hdr->args.count +
349 PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
350
351 ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
352 (end - start) >> SECTOR_SHIFT);
433 } 353 }
354
434 pnfs_ld_write_done(hdr); 355 pnfs_ld_write_done(hdr);
435} 356}
436 357
437/* Called when last of bios associated with a bl_write_pagelist call finishes */ 358/* Called when last of bios associated with a bl_write_pagelist call finishes */
438static void bl_end_par_io_write(void *data, int num_se) 359static void bl_end_par_io_write(void *data)
439{ 360{
440 struct nfs_pgio_header *hdr = data; 361 struct nfs_pgio_header *hdr = data;
441 362
442 if (unlikely(hdr->pnfs_error)) {
443 bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval,
444 num_se);
445 }
446
447 hdr->task.tk_status = hdr->pnfs_error; 363 hdr->task.tk_status = hdr->pnfs_error;
448 hdr->verf.committed = NFS_FILE_SYNC; 364 hdr->verf.committed = NFS_FILE_SYNC;
449 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup); 365 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
450 schedule_work(&hdr->task.u.tk_work); 366 schedule_work(&hdr->task.u.tk_work);
451} 367}
452 368
453/* FIXME STUB - mark intersection of layout and page as bad, so is not
454 * used again.
455 */
456static void mark_bad_read(void)
457{
458 return;
459}
460
461/*
462 * map_block: map a requested I/0 block (isect) into an offset in the LVM
463 * block_device
464 */
465static void
466map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
467{
468 dprintk("%s enter be=%p\n", __func__, be);
469
470 set_buffer_mapped(bh);
471 bh->b_bdev = be->be_mdev;
472 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
473 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
474
475 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
476 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
477 bh->b_size);
478 return;
479}
480
481static void
482bl_read_single_end_io(struct bio *bio, int error)
483{
484 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
485 struct page *page = bvec->bv_page;
486
487 /* Only one page in bvec */
488 unlock_page(page);
489}
490
491static int
492bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
493 unsigned int offset, unsigned int len)
494{
495 struct bio *bio;
496 struct page *shadow_page;
497 sector_t isect;
498 char *kaddr, *kshadow_addr;
499 int ret = 0;
500
501 dprintk("%s: offset %u len %u\n", __func__, offset, len);
502
503 shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
504 if (shadow_page == NULL)
505 return -ENOMEM;
506
507 bio = bio_alloc(GFP_NOIO, 1);
508 if (bio == NULL)
509 return -ENOMEM;
510
511 isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
512 (offset / SECTOR_SIZE);
513
514 bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
515 bio->bi_bdev = be->be_mdev;
516 bio->bi_end_io = bl_read_single_end_io;
517
518 lock_page(shadow_page);
519 if (bio_add_page(bio, shadow_page,
520 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
521 unlock_page(shadow_page);
522 bio_put(bio);
523 return -EIO;
524 }
525
526 submit_bio(READ, bio);
527 wait_on_page_locked(shadow_page);
528 if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
529 ret = -EIO;
530 } else {
531 kaddr = kmap_atomic(page);
532 kshadow_addr = kmap_atomic(shadow_page);
533 memcpy(kaddr + offset, kshadow_addr + offset, len);
534 kunmap_atomic(kshadow_addr);
535 kunmap_atomic(kaddr);
536 }
537 __free_page(shadow_page);
538 bio_put(bio);
539
540 return ret;
541}
542
543static int
544bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
545 unsigned int dirty_offset, unsigned int dirty_len,
546 bool full_page)
547{
548 int ret = 0;
549 unsigned int start, end;
550
551 if (full_page) {
552 start = 0;
553 end = PAGE_CACHE_SIZE;
554 } else {
555 start = round_down(dirty_offset, SECTOR_SIZE);
556 end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
557 }
558
559 dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len);
560 if (!be) {
561 zero_user_segments(page, start, dirty_offset,
562 dirty_offset + dirty_len, end);
563 if (start == 0 && end == PAGE_CACHE_SIZE &&
564 trylock_page(page)) {
565 SetPageUptodate(page);
566 unlock_page(page);
567 }
568 return ret;
569 }
570
571 if (start != dirty_offset)
572 ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
573
574 if (!ret && (dirty_offset + dirty_len < end))
575 ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
576 end - dirty_offset - dirty_len);
577
578 return ret;
579}
580
581/* Given an unmapped page, zero it or read in page for COW, page is locked
582 * by caller.
583 */
584static int
585init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
586{
587 struct buffer_head *bh = NULL;
588 int ret = 0;
589 sector_t isect;
590
591 dprintk("%s enter, %p\n", __func__, page);
592 BUG_ON(PageUptodate(page));
593 if (!cow_read) {
594 zero_user_segment(page, 0, PAGE_SIZE);
595 SetPageUptodate(page);
596 goto cleanup;
597 }
598
599 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
600 if (!bh) {
601 ret = -ENOMEM;
602 goto cleanup;
603 }
604
605 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
606 map_block(bh, isect, cow_read);
607 if (!bh_uptodate_or_lock(bh))
608 ret = bh_submit_read(bh);
609 if (ret)
610 goto cleanup;
611 SetPageUptodate(page);
612
613cleanup:
614 if (bh)
615 free_buffer_head(bh);
616 if (ret) {
617 /* Need to mark layout with bad read...should now
618 * just use nfs4 for reads and writes.
619 */
620 mark_bad_read();
621 }
622 return ret;
623}
624
625/* Find or create a zeroing page marked being writeback.
626 * Return ERR_PTR on error, NULL to indicate skip this page and page itself
627 * to indicate write out.
628 */
629static struct page *
630bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
631 struct pnfs_block_extent *cow_read)
632{
633 struct page *page;
634 int locked = 0;
635 page = find_get_page(inode->i_mapping, index);
636 if (page)
637 goto check_page;
638
639 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
640 if (unlikely(!page)) {
641 dprintk("%s oom\n", __func__);
642 return ERR_PTR(-ENOMEM);
643 }
644 locked = 1;
645
646check_page:
647 /* PageDirty: Other will write this out
648 * PageWriteback: Other is writing this out
649 * PageUptodate: It was read before
650 */
651 if (PageDirty(page) || PageWriteback(page)) {
652 print_page(page);
653 if (locked)
654 unlock_page(page);
655 page_cache_release(page);
656 return NULL;
657 }
658
659 if (!locked) {
660 lock_page(page);
661 locked = 1;
662 goto check_page;
663 }
664 if (!PageUptodate(page)) {
665 /* New page, readin or zero it */
666 init_page_for_write(page, cow_read);
667 }
668 set_page_writeback(page);
669 unlock_page(page);
670
671 return page;
672}
673
674static enum pnfs_try_status 369static enum pnfs_try_status
675bl_write_pagelist(struct nfs_pgio_header *header, int sync) 370bl_write_pagelist(struct nfs_pgio_header *header, int sync)
676{ 371{
677 int i, ret, npg_zero, pg_index, last = 0; 372 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
373 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
678 struct bio *bio = NULL; 374 struct bio *bio = NULL;
679 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 375 struct pnfs_block_extent be;
680 sector_t isect, last_isect = 0, extent_length = 0; 376 sector_t isect, extent_length = 0;
681 struct parallel_io *par = NULL; 377 struct parallel_io *par = NULL;
682 loff_t offset = header->args.offset; 378 loff_t offset = header->args.offset;
683 size_t count = header->args.count; 379 size_t count = header->args.count;
684 unsigned int pg_offset, pg_len, saved_len;
685 struct page **pages = header->args.pages; 380 struct page **pages = header->args.pages;
687 struct page *page; 381 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
687 pgoff_t index; 382 unsigned int pg_len;
688 u64 temp; 383 struct blk_plug plug;
689 int npg_per_block = 384 int i;
690 NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
691 385
692 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); 386 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
693 387
694 if (header->dreq != NULL &&
695 (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
696 !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
697 dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
698 goto out_mds;
699 }
700 /* At this point, header->page_array is a (sequential) list of nfs_pages. 388 /* At this point, header->page_array is a (sequential) list of nfs_pages.
701 * We want to write each, and if there is an error set pnfs_error 389 * We want to write each, and if there is an error set pnfs_error
702 * to have it redone using nfs. 390 * to have it redone using nfs.
703 */ 391 */
704 par = alloc_parallel(header); 392 par = alloc_parallel(header);
705 if (!par) 393 if (!par)
706 goto out_mds; 394 return PNFS_NOT_ATTEMPTED;
707 par->pnfs_callback = bl_end_par_io_write; 395 par->pnfs_callback = bl_end_par_io_write;
708 /* At this point, have to be more careful with error handling */
709 396
710 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); 397 blk_start_plug(&plug);
711 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
712 if (!be || !is_writable(be, isect)) {
713 dprintk("%s no matching extents!\n", __func__);
714 goto out_mds;
715 }
716 398
717 /* First page inside INVALID extent */ 399 /* we always write out the whole page */
718 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 400 offset = offset & (loff_t)PAGE_CACHE_MASK;
719 if (likely(!bl_push_one_short_extent(be->be_inval))) 401 isect = offset >> SECTOR_SHIFT;
720 par->bse_count++;
721 else
722 goto out_mds;
723 temp = offset >> PAGE_CACHE_SHIFT;
724 npg_zero = do_div(temp, npg_per_block);
725 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
726 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
727 extent_length = be->be_length - (isect - be->be_f_offset);
728
729fill_invalid_ext:
730 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
731 for (;npg_zero > 0; npg_zero--) {
732 if (bl_is_sector_init(be->be_inval, isect)) {
733 dprintk("isect %llu already init\n",
734 (unsigned long long)isect);
735 goto next_page;
736 }
737 /* page ref released in bl_end_io_write_zero */
738 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
739 dprintk("%s zero %dth page: index %lu isect %llu\n",
740 __func__, npg_zero, index,
741 (unsigned long long)isect);
742 page = bl_find_get_zeroing_page(header->inode, index,
743 cow_read);
744 if (unlikely(IS_ERR(page))) {
745 header->pnfs_error = PTR_ERR(page);
746 goto out;
747 } else if (page == NULL)
748 goto next_page;
749
750 ret = bl_mark_sectors_init(be->be_inval, isect,
751 PAGE_CACHE_SECTORS);
752 if (unlikely(ret)) {
753 dprintk("%s bl_mark_sectors_init fail %d\n",
754 __func__, ret);
755 end_page_writeback(page);
756 page_cache_release(page);
757 header->pnfs_error = ret;
758 goto out;
759 }
760 if (likely(!bl_push_one_short_extent(be->be_inval)))
761 par->bse_count++;
762 else {
763 end_page_writeback(page);
764 page_cache_release(page);
765 header->pnfs_error = -ENOMEM;
766 goto out;
767 }
768 /* FIXME: This should be done in bi_end_io */
769 mark_extents_written(BLK_LSEG2EXT(header->lseg),
770 page->index << PAGE_CACHE_SHIFT,
771 PAGE_CACHE_SIZE);
772
773 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
774 isect, page, be,
775 bl_end_io_write_zero, par);
776 if (IS_ERR(bio)) {
777 header->pnfs_error = PTR_ERR(bio);
778 bio = NULL;
779 goto out;
780 }
781next_page:
782 isect += PAGE_CACHE_SECTORS;
783 extent_length -= PAGE_CACHE_SECTORS;
784 }
785 if (last)
786 goto write_done;
787 }
788 bio = bl_submit_bio(WRITE, bio);
789 402
790 /* Middle pages */
791 pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
792 for (i = pg_index; i < header->page_array.npages; i++) { 403 for (i = pg_index; i < header->page_array.npages; i++) {
793 if (!extent_length) { 404 if (extent_length <= 0) {
794 /* We've used up the previous extent */ 405 /* We've used up the previous extent */
795 bl_put_extent(be);
796 bl_put_extent(cow_read);
797 bio = bl_submit_bio(WRITE, bio); 406 bio = bl_submit_bio(WRITE, bio);
798 /* Get the next one */ 407 /* Get the next one */
799 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 408 if (!ext_tree_lookup(bl, isect, &be, true)) {
800 isect, &cow_read);
801 if (!be || !is_writable(be, isect)) {
802 header->pnfs_error = -EINVAL; 409 header->pnfs_error = -EINVAL;
803 goto out; 410 goto out;
804 } 411 }
805 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
806 if (likely(!bl_push_one_short_extent(
807 be->be_inval)))
808 par->bse_count++;
809 else {
810 header->pnfs_error = -ENOMEM;
811 goto out;
812 }
813 }
814 extent_length = be->be_length -
815 (isect - be->be_f_offset);
816 }
817
818 dprintk("%s offset %lld count %Zu\n", __func__, offset, count);
819 pg_offset = offset & ~PAGE_CACHE_MASK;
820 if (pg_offset + count > PAGE_CACHE_SIZE)
821 pg_len = PAGE_CACHE_SIZE - pg_offset;
822 else
823 pg_len = count;
824
825 saved_len = pg_len;
826 if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
827 !bl_is_sector_init(be->be_inval, isect)) {
828 ret = bl_read_partial_page_sync(pages[i], cow_read,
829 pg_offset, pg_len, true);
830 if (ret) {
831 dprintk("%s bl_read_partial_page_sync fail %d\n",
832 __func__, ret);
833 header->pnfs_error = ret;
834 goto out;
835 }
836
837 ret = bl_mark_sectors_init(be->be_inval, isect,
838 PAGE_CACHE_SECTORS);
839 if (unlikely(ret)) {
840 dprintk("%s bl_mark_sectors_init fail %d\n",
841 __func__, ret);
842 header->pnfs_error = ret;
843 goto out;
844 }
845 412
846 /* Expand to full page write */ 413 extent_length = be.be_length - (isect - be.be_f_offset);
847 pg_offset = 0;
848 pg_len = PAGE_CACHE_SIZE;
849 } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
850 (pg_len & (SECTOR_SIZE - 1))){
851 /* ahh, nasty case. We have to do sync full sector
852 * read-modify-write cycles.
853 */
854 unsigned int saved_offset = pg_offset;
855 ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
856 pg_len, false);
857 pg_offset = round_down(pg_offset, SECTOR_SIZE);
858 pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
859 - pg_offset;
860 } 414 }
861 415
862 416 pg_len = PAGE_CACHE_SIZE;
863 bio = do_add_page_to_bio(bio, header->page_array.npages - i, 417 bio = do_add_page_to_bio(bio, header->page_array.npages - i,
864 WRITE, 418 WRITE, isect, pages[i], &map, &be,
865 isect, pages[i], be,
866 bl_end_io_write, par, 419 bl_end_io_write, par,
867 pg_offset, pg_len); 420 0, &pg_len);
868 if (IS_ERR(bio)) { 421 if (IS_ERR(bio)) {
869 header->pnfs_error = PTR_ERR(bio); 422 header->pnfs_error = PTR_ERR(bio);
870 bio = NULL; 423 bio = NULL;
871 goto out; 424 goto out;
872 } 425 }
873 offset += saved_len;
874 count -= saved_len;
875 isect += PAGE_CACHE_SECTORS;
876 last_isect = isect;
877 extent_length -= PAGE_CACHE_SECTORS;
878 }
879 426
880 /* Last page inside INVALID extent */ 427 offset += pg_len;
881 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 428 count -= pg_len;
882 bio = bl_submit_bio(WRITE, bio); 429 isect += (pg_len >> SECTOR_SHIFT);
883 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; 430 extent_length -= (pg_len >> SECTOR_SHIFT);
884 npg_zero = npg_per_block - do_div(temp, npg_per_block);
885 if (npg_zero < npg_per_block) {
886 last = 1;
887 goto fill_invalid_ext;
888 }
889 } 431 }
890 432
891write_done:
892 header->res.count = header->args.count; 433 header->res.count = header->args.count;
893out: 434out:
894 bl_put_extent(be);
895 bl_put_extent(cow_read);
896 bl_submit_bio(WRITE, bio); 435 bl_submit_bio(WRITE, bio);
436 blk_finish_plug(&plug);
897 put_parallel(par); 437 put_parallel(par);
898 return PNFS_ATTEMPTED; 438 return PNFS_ATTEMPTED;
899out_mds:
900 bl_put_extent(be);
901 bl_put_extent(cow_read);
902 kfree(par);
903 return PNFS_NOT_ATTEMPTED;
904}
905
906/* FIXME - range ignored */
907static void
908release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
909{
910 int i;
911 struct pnfs_block_extent *be;
912
913 spin_lock(&bl->bl_ext_lock);
914 for (i = 0; i < EXTENT_LISTS; i++) {
915 while (!list_empty(&bl->bl_extents[i])) {
916 be = list_first_entry(&bl->bl_extents[i],
917 struct pnfs_block_extent,
918 be_node);
919 list_del(&be->be_node);
920 bl_put_extent(be);
921 }
922 }
923 spin_unlock(&bl->bl_ext_lock);
924}
925
926static void
927release_inval_marks(struct pnfs_inval_markings *marks)
928{
929 struct pnfs_inval_tracking *pos, *temp;
930 struct pnfs_block_short_extent *se, *stemp;
931
932 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
933 list_del(&pos->it_link);
934 kfree(pos);
935 }
936
937 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
938 list_del(&se->bse_node);
939 kfree(se);
940 }
941 return;
942} 439}
943 440
944static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) 441static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
945{ 442{
946 struct pnfs_block_layout *bl = BLK_LO2EXT(lo); 443 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
444 int err;
947 445
948 dprintk("%s enter\n", __func__); 446 dprintk("%s enter\n", __func__);
949 release_extents(bl, NULL); 447
950 release_inval_marks(&bl->bl_inval); 448 err = ext_tree_remove(bl, true, 0, LLONG_MAX);
449 WARN_ON(err);
450
951 kfree(bl); 451 kfree(bl);
952} 452}
953 453
@@ -960,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
960 bl = kzalloc(sizeof(*bl), gfp_flags); 460 bl = kzalloc(sizeof(*bl), gfp_flags);
961 if (!bl) 461 if (!bl)
962 return NULL; 462 return NULL;
463
464 bl->bl_ext_rw = RB_ROOT;
465 bl->bl_ext_ro = RB_ROOT;
963 spin_lock_init(&bl->bl_ext_lock); 466 spin_lock_init(&bl->bl_ext_lock);
964 INIT_LIST_HEAD(&bl->bl_extents[0]); 467
965 INIT_LIST_HEAD(&bl->bl_extents[1]);
966 INIT_LIST_HEAD(&bl->bl_commit);
967 INIT_LIST_HEAD(&bl->bl_committing);
968 bl->bl_count = 0;
969 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
970 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
971 return &bl->bl_layout; 468 return &bl->bl_layout;
972} 469}
973 470
@@ -977,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
977 kfree(lseg); 474 kfree(lseg);
978} 475}
979 476
980/* We pretty much ignore lseg, and store all data layout wide, so we 477/* Tracks info needed to ensure extents in layout obey constraints of spec */
981 * can correctly merge. 478struct layout_verification {
982 */ 479 u32 mode; /* R or RW */
983static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, 480 u64 start; /* Expected start of next non-COW extent */
984 struct nfs4_layoutget_res *lgr, 481 u64 inval; /* Start of INVAL coverage */
985 gfp_t gfp_flags) 482 u64 cowread; /* End of COW read coverage */
986{ 483};
987 struct pnfs_layout_segment *lseg;
988 int status;
989 484
990 dprintk("%s enter\n", __func__); 485/* Verify the extent meets the layout requirements of the pnfs-block draft,
991 lseg = kzalloc(sizeof(*lseg), gfp_flags); 486 * section 2.3.1.
992 if (!lseg) 487 */
993 return ERR_PTR(-ENOMEM); 488static int verify_extent(struct pnfs_block_extent *be,
994 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); 489 struct layout_verification *lv)
995 if (status) { 490{
996 /* We don't want to call the full-blown bl_free_lseg, 491 if (lv->mode == IOMODE_READ) {
997 * since on error extents were not touched. 492 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
998 */ 493 be->be_state == PNFS_BLOCK_INVALID_DATA)
999 kfree(lseg); 494 return -EIO;
1000 return ERR_PTR(status); 495 if (be->be_f_offset != lv->start)
496 return -EIO;
497 lv->start += be->be_length;
498 return 0;
1001 } 499 }
1002 return lseg; 500 /* lv->mode == IOMODE_RW */
501 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
502 if (be->be_f_offset != lv->start)
503 return -EIO;
504 if (lv->cowread > lv->start)
505 return -EIO;
506 lv->start += be->be_length;
507 lv->inval = lv->start;
508 return 0;
509 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
510 if (be->be_f_offset != lv->start)
511 return -EIO;
512 lv->start += be->be_length;
513 return 0;
514 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
515 if (be->be_f_offset > lv->start)
516 return -EIO;
517 if (be->be_f_offset < lv->inval)
518 return -EIO;
519 if (be->be_f_offset < lv->cowread)
520 return -EIO;
521 /* It looks like you might want to min this with lv->start,
522 * but you really don't.
523 */
524 lv->inval = lv->inval + be->be_length;
525 lv->cowread = be->be_f_offset + be->be_length;
526 return 0;
527 } else
528 return -EIO;
1003} 529}
1004 530
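The state machine above is easiest to follow with concrete numbers. Below is a hedged, userspace-only sketch (not part of this patch; simplified names) that runs an RW-mode layout through the same checks: an INVALID_DATA extent, a READ_DATA extent supplying its COW coverage, then a READWRITE_DATA extent.

/* Userspace sketch of the RW-mode checks in verify_extent(); illustrative only. */
#include <stdio.h>
#include <stdint.h>

enum { RW_DATA, READ_DATA, INVALID_DATA, NONE_DATA };

struct lv { uint64_t start, inval, cowread; };

static int verify_rw(int state, uint64_t off, uint64_t len, struct lv *lv)
{
    switch (state) {
    case RW_DATA:
        if (off != lv->start || lv->cowread > lv->start)
            return -1;
        lv->start += len;
        lv->inval = lv->start;
        return 0;
    case INVALID_DATA:
        if (off != lv->start)
            return -1;
        lv->start += len;
        return 0;
    case READ_DATA:
        if (off > lv->start || off < lv->inval || off < lv->cowread)
            return -1;
        lv->inval += len;
        lv->cowread = off + len;
        return 0;
    default:
        return -1;
    }
}

int main(void)
{
    struct lv lv = { 0, 0, 0 };
    /* INVAL [0,8) needing COW, READ [0,8) backing it, then RW [8,24). */
    int ok = verify_rw(INVALID_DATA, 0, 8, &lv) == 0 &&
             verify_rw(READ_DATA, 0, 8, &lv) == 0 &&
             verify_rw(RW_DATA, 8, 16, &lv) == 0;
    printf("layout %s, next expected offset %llu\n",
           ok ? "valid" : "invalid", (unsigned long long)lv.start);
    return 0;
}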
1005static void 531static int decode_sector_number(__be32 **rp, sector_t *sp)
1006bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
1007 const struct nfs4_layoutcommit_args *arg)
1008{ 532{
1009 dprintk("%s enter\n", __func__); 533 uint64_t s;
1010 encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); 534
535 *rp = xdr_decode_hyper(*rp, &s);
536 if (s & 0x1ff) {
537 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
538 return -1;
539 }
540 *sp = s >> SECTOR_SHIFT;
541 return 0;
1011} 542}
1012 543
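For reference, the conversion contract here is simple: server offsets arrive in bytes, must be a multiple of 512, and are stored internally as sector counts. A minimal userspace analogue (illustrative, not kernel code):

/* Userspace analogue of decode_sector_number(); SECTOR_SHIFT is 9 (512 bytes). */
#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9

static int bytes_to_sector(uint64_t bytes, uint64_t *sector)
{
    if (bytes & 0x1ff)          /* not a multiple of 512 */
        return -1;
    *sector = bytes >> SECTOR_SHIFT;
    return 0;
}

int main(void)
{
    uint64_t s;

    if (bytes_to_sector(4096, &s) == 0)
        printf("4096 bytes -> sector %llu\n", (unsigned long long)s); /* 8 */
    if (bytes_to_sector(4097, &s) < 0)
        printf("4097 bytes rejected: not sector aligned\n");
    return 0;
}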
1013static void 544static int
1014bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) 545bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
546 struct layout_verification *lv, struct list_head *extents,
547 gfp_t gfp_mask)
1015{ 548{
1016 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; 549 struct pnfs_block_extent *be;
550 struct nfs4_deviceid id;
551 int error;
552 __be32 *p;
1017 553
1018 dprintk("%s enter\n", __func__); 554 p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
1019 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); 555 if (!p)
1020} 556 return -EIO;
1021 557
1022static void free_blk_mountid(struct block_mount_id *mid) 558 be = kzalloc(sizeof(*be), GFP_NOFS);
1023{ 559 if (!be)
1024 if (mid) { 560 return -ENOMEM;
1025 struct pnfs_block_dev *dev, *tmp;
1026 561
1027 /* No need to take bm_lock as we are last user freeing bm_devlist */ 562 memcpy(&id, p, NFS4_DEVICEID4_SIZE);
1028 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { 563 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
1029 list_del(&dev->bm_node); 564
1030 bl_free_block_dev(dev); 565 error = -EIO;
1031 } 566 be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
1032 kfree(mid); 567 lo->plh_lc_cred, gfp_mask);
568 if (!be->be_device)
569 goto out_free_be;
570
571 /*
572 * The next three values are read in as bytes, but stored in the
573 * extent structure in 512-byte granularity.
574 */
575 if (decode_sector_number(&p, &be->be_f_offset) < 0)
576 goto out_put_deviceid;
577 if (decode_sector_number(&p, &be->be_length) < 0)
578 goto out_put_deviceid;
579 if (decode_sector_number(&p, &be->be_v_offset) < 0)
580 goto out_put_deviceid;
581 be->be_state = be32_to_cpup(p++);
582
583 error = verify_extent(be, lv);
584 if (error) {
585 dprintk("%s: extent verification failed\n", __func__);
586 goto out_put_deviceid;
1033 } 587 }
588
589 list_add_tail(&be->be_list, extents);
590 return 0;
591
592out_put_deviceid:
593 nfs4_put_deviceid_node(be->be_device);
594out_free_be:
595 kfree(be);
596 return error;
1034} 597}
1035 598
1036/* This is mostly copied from the filelayout_get_device_info function. 599static struct pnfs_layout_segment *
1037 * It seems much of this should be at the generic pnfs level. 600bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
1038 */ 601 gfp_t gfp_mask)
1039static struct pnfs_block_dev *
1040nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1041 struct nfs4_deviceid *d_id)
1042{ 602{
1043 struct pnfs_device *dev; 603 struct layout_verification lv = {
1044 struct pnfs_block_dev *rv; 604 .mode = lgr->range.iomode,
1045 u32 max_resp_sz; 605 .start = lgr->range.offset >> SECTOR_SHIFT,
1046 int max_pages; 606 .inval = lgr->range.offset >> SECTOR_SHIFT,
1047 struct page **pages = NULL; 607 .cowread = lgr->range.offset >> SECTOR_SHIFT,
1048 int i, rc; 608 };
609 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
610 struct pnfs_layout_segment *lseg;
611 struct xdr_buf buf;
612 struct xdr_stream xdr;
613 struct page *scratch;
614 int status, i;
615 uint32_t count;
616 __be32 *p;
617 LIST_HEAD(extents);
618
619 dprintk("---> %s\n", __func__);
620
621 lseg = kzalloc(sizeof(*lseg), gfp_mask);
622 if (!lseg)
623 return ERR_PTR(-ENOMEM);
624
625 status = -ENOMEM;
626 scratch = alloc_page(gfp_mask);
627 if (!scratch)
628 goto out;
629
630 xdr_init_decode_pages(&xdr, &buf,
631 lgr->layoutp->pages, lgr->layoutp->len);
632 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
633
634 status = -EIO;
635 p = xdr_inline_decode(&xdr, 4);
636 if (unlikely(!p))
637 goto out_free_scratch;
638
639 count = be32_to_cpup(p++);
640 dprintk("%s: number of extents %d\n", __func__, count);
1049 641
1050 /* 642 /*
1051 * Use the session max response size as the basis for setting 643 * Decode individual extents, putting them in temporary staging area
1052 * GETDEVICEINFO's maxcount 644 * until whole layout is decoded to make error recovery easier.
1053 */ 645 */
1054 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 646 for (i = 0; i < count; i++) {
1055 max_pages = nfs_page_array_len(0, max_resp_sz); 647 status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
1056 dprintk("%s max_resp_sz %u max_pages %d\n", 648 if (status)
1057 __func__, max_resp_sz, max_pages); 649 goto process_extents;
1058
1059 dev = kmalloc(sizeof(*dev), GFP_NOFS);
1060 if (!dev) {
1061 dprintk("%s kmalloc failed\n", __func__);
1062 return ERR_PTR(-ENOMEM);
1063 } 650 }
1064 651
1065 pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS); 652 if (lgr->range.offset + lgr->range.length !=
1066 if (pages == NULL) { 653 lv.start << SECTOR_SHIFT) {
1067 kfree(dev); 654 dprintk("%s Final length mismatch\n", __func__);
1068 return ERR_PTR(-ENOMEM); 655 status = -EIO;
656 goto process_extents;
1069 } 657 }
1070 for (i = 0; i < max_pages; i++) { 658
1071 pages[i] = alloc_page(GFP_NOFS); 659 if (lv.start < lv.cowread) {
1072 if (!pages[i]) { 660 dprintk("%s Final uncovered COW extent\n", __func__);
1073 rv = ERR_PTR(-ENOMEM); 661 status = -EIO;
1074 goto out_free;
1075 }
1076 } 662 }
1077 663
1078 memcpy(&dev->dev_id, d_id, sizeof(*d_id)); 664process_extents:
1079 dev->layout_type = LAYOUT_BLOCK_VOLUME; 665 while (!list_empty(&extents)) {
1080 dev->pages = pages; 666 struct pnfs_block_extent *be =
1081 dev->pgbase = 0; 667 list_first_entry(&extents, struct pnfs_block_extent,
1082 dev->pglen = PAGE_SIZE * max_pages; 668 be_list);
1083 dev->mincount = 0; 669 list_del(&be->be_list);
1084 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; 670
1085 671 if (!status)
1086 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 672 status = ext_tree_insert(bl, be);
1087 rc = nfs4_proc_getdeviceinfo(server, dev, NULL); 673
1088 dprintk("%s getdevice info returns %d\n", __func__, rc); 674 if (status) {
1089 if (rc) { 675 nfs4_put_deviceid_node(be->be_device);
1090 rv = ERR_PTR(rc); 676 kfree(be);
1091 goto out_free; 677 }
1092 } 678 }
1093 679
1094 rv = nfs4_blk_decode_device(server, dev); 680out_free_scratch:
1095 out_free: 681 __free_page(scratch);
1096 for (i = 0; i < max_pages; i++) 682out:
1097 __free_page(pages[i]); 683 dprintk("%s returns %d\n", __func__, status);
1098 kfree(pages); 684 if (status) {
1099 kfree(dev); 685 kfree(lseg);
1100 return rv; 686 return ERR_PTR(status);
687 }
688 return lseg;
1101} 689}
1102 690
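The decode loop above deliberately stages extents on a local list and publishes them into the tree only while status is still zero; once an error is seen, the remaining staged extents are simply released. A toy userspace sketch of this stage-then-commit shape (hypothetical names, not the driver's types):

/* Stage entries locally; publish only if no error occurred, else free them. */
#include <stdio.h>
#include <stdlib.h>

struct ext { struct ext *next; int id; };

int main(void)
{
    struct ext *staged = NULL, *published = NULL;
    int status = 0, i;

    for (i = 0; i < 3; i++) {           /* "decode" three extents */
        struct ext *e = malloc(sizeof(*e));
        if (!e) { status = -1; break; }
        e->id = i;
        e->next = staged;
        staged = e;
    }

    while (staged) {                    /* publish or discard every entry */
        struct ext *e = staged;
        staged = e->next;
        if (!status) {
            e->next = published;
            published = e;
        } else {
            free(e);
        }
    }
    printf("status %d\n", status);
    return 0;
}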
1103static int 691static void
1104bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) 692bl_return_range(struct pnfs_layout_hdr *lo,
693 struct pnfs_layout_range *range)
1105{ 694{
1106 struct block_mount_id *b_mt_id = NULL; 695 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
1107 struct pnfs_devicelist *dlist = NULL; 696 sector_t offset = range->offset >> SECTOR_SHIFT, end;
1108 struct pnfs_block_dev *bdev;
1109 LIST_HEAD(block_disklist);
1110 int status, i;
1111
1112 dprintk("%s enter\n", __func__);
1113 697
1114 if (server->pnfs_blksize == 0) { 698 if (range->offset % 8) {
1115 dprintk("%s Server did not return blksize\n", __func__); 699 dprintk("%s: offset %lld not block size aligned\n",
1116 return -EINVAL; 700 __func__, range->offset);
1117 } 701 return;
1118 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
1119 if (!b_mt_id) {
1120 status = -ENOMEM;
1121 goto out_error;
1122 }
1123 /* Initialize nfs4 block layout mount id */
1124 spin_lock_init(&b_mt_id->bm_lock);
1125 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
1126
1127 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
1128 if (!dlist) {
1129 status = -ENOMEM;
1130 goto out_error;
1131 } 702 }
1132 dlist->eof = 0; 703
1133 while (!dlist->eof) { 704 if (range->length != NFS4_MAX_UINT64) {
1134 status = nfs4_proc_getdevicelist(server, fh, dlist); 705 if (range->length % 8) {
1135 if (status) 706 dprintk("%s: length %lld not block size aligned\n",
1136 goto out_error; 707 __func__, range->length);
1137 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", 708 return;
1138 __func__, dlist->num_devs, dlist->eof);
1139 for (i = 0; i < dlist->num_devs; i++) {
1140 bdev = nfs4_blk_get_deviceinfo(server, fh,
1141 &dlist->dev_id[i]);
1142 if (IS_ERR(bdev)) {
1143 status = PTR_ERR(bdev);
1144 goto out_error;
1145 }
1146 spin_lock(&b_mt_id->bm_lock);
1147 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
1148 spin_unlock(&b_mt_id->bm_lock);
1149 } 709 }
1150 }
1151 dprintk("%s SUCCESS\n", __func__);
1152 server->pnfs_ld_data = b_mt_id;
1153 710
1154 out_return: 711 end = offset + (range->length >> SECTOR_SHIFT);
1155 kfree(dlist); 712 } else {
1156 return status; 713 end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
714 }
1157 715
1158 out_error: 716 ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
1159 free_blk_mountid(b_mt_id);
1160 goto out_return;
1161} 717}
1162 718
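The byte-to-sector conversion above has one special case: a length of NFS4_MAX_UINT64 means "to end of file", and the end value is rounded down to a page multiple so it stays representable. A hedged userspace rendering of the same arithmetic (PAGE_SIZE_ is a local stand-in for the kernel's PAGE_SIZE):

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT    9
#define NFS4_MAX_UINT64 (~0ULL)
#define PAGE_SIZE_      4096ULL

int main(void)
{
    uint64_t offset = 8192, length = NFS4_MAX_UINT64;
    uint64_t start = offset >> SECTOR_SHIFT, end;

    if (length != NFS4_MAX_UINT64)
        end = start + (length >> SECTOR_SHIFT);
    else    /* whole file: round down so "end" stays representable */
        end = (NFS4_MAX_UINT64 / PAGE_SIZE_) * PAGE_SIZE_;

    printf("return sectors [%llu, %llu)\n",
           (unsigned long long)start, (unsigned long long)end);
    return 0;
}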
1163static int 719static int
1164bl_clear_layoutdriver(struct nfs_server *server) 720bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
721{
722 return ext_tree_prepare_commit(arg);
723}
724
725static void
726bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
1165{ 727{
1166 struct block_mount_id *b_mt_id = server->pnfs_ld_data; 728 ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
729}
1167 730
731static int
732bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
733{
1168 dprintk("%s enter\n", __func__); 734 dprintk("%s enter\n", __func__);
1169 free_blk_mountid(b_mt_id); 735
1170 dprintk("%s RETURNS\n", __func__); 736 if (server->pnfs_blksize == 0) {
737 dprintk("%s Server did not return blksize\n", __func__);
738 return -EINVAL;
739 }
740 if (server->pnfs_blksize > PAGE_SIZE) {
741 printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
742 __func__, server->pnfs_blksize);
743 return -EINVAL;
744 }
745
1171 return 0; 746 return 0;
1172} 747}
1173 748
1174static bool 749static bool
1175is_aligned_req(struct nfs_page *req, unsigned int alignment) 750is_aligned_req(struct nfs_pageio_descriptor *pgio,
751 struct nfs_page *req, unsigned int alignment)
1176{ 752{
1177 return IS_ALIGNED(req->wb_offset, alignment) && 753 /*
1178 IS_ALIGNED(req->wb_bytes, alignment); 754 * Always accept buffered writes, higher layers take care of the
755 * right alignment.
756 */
757 if (pgio->pg_dreq == NULL)
758 return true;
759
760 if (!IS_ALIGNED(req->wb_offset, alignment))
761 return false;
762
763 if (IS_ALIGNED(req->wb_bytes, alignment))
764 return true;
765
766 if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
767 /*
768 * If the write goes up to the inode size, just write
769 * the full page. Data past the inode size is
770 * guaranteed to be zeroed by the higher level client
771 * code, and this behaviour is mandated by RFC 5663
772 * section 2.3.2.
773 */
774 return true;
775 }
776
777 return false;
1179} 778}
1180 779
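Concretely, a direct-I/O request passes if both its offset and length are aligned, or if its unaligned tail lands exactly on the inode size (RFC 5663 section 2.3.2 guarantees data past the inode size reads as zero). A userspace sketch of the predicate, with assumed simplified parameters:

#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>

static bool aligned_req(bool direct_io, uint64_t offset, uint64_t bytes,
                        uint64_t isize, uint64_t alignment)
{
    if (!direct_io)             /* buffered I/O is always accepted */
        return true;
    if (offset % alignment)
        return false;
    if (bytes % alignment == 0)
        return true;
    /* An unaligned tail is fine iff it ends exactly at the file size. */
    return offset + bytes == isize;
}

int main(void)
{
    printf("%d\n", aligned_req(true, 0, 4096, 8192, 4096)); /* 1: aligned */
    printf("%d\n", aligned_req(true, 0, 3000, 3000, 4096)); /* 1: ends at EOF */
    printf("%d\n", aligned_req(true, 0, 3000, 8192, 4096)); /* 0: short of EOF */
    return 0;
}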
1181static void 780static void
1182bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 781bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1183{ 782{
1184 if (pgio->pg_dreq != NULL && 783 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
1185 !is_aligned_req(req, SECTOR_SIZE))
1186 nfs_pageio_reset_read_mds(pgio); 784 nfs_pageio_reset_read_mds(pgio);
1187 else 785 return;
1188 pnfs_generic_pg_init_read(pgio, req); 786 }
787
788 pnfs_generic_pg_init_read(pgio, req);
1189} 789}
1190 790
1191/* 791/*
@@ -1196,10 +796,8 @@ static size_t
1196bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 796bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1197 struct nfs_page *req) 797 struct nfs_page *req)
1198{ 798{
1199 if (pgio->pg_dreq != NULL && 799 if (!is_aligned_req(pgio, req, SECTOR_SIZE))
1200 !is_aligned_req(req, SECTOR_SIZE))
1201 return 0; 800 return 0;
1202
1203 return pnfs_generic_pg_test(pgio, prev, req); 801 return pnfs_generic_pg_test(pgio, prev, req);
1204} 802}
1205 803
@@ -1229,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
1229static void 827static void
1230bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 828bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1231{ 829{
1232 if (pgio->pg_dreq != NULL && 830 u64 wb_size;
1233 !is_aligned_req(req, PAGE_CACHE_SIZE)) { 831
832 if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
1234 nfs_pageio_reset_write_mds(pgio); 833 nfs_pageio_reset_write_mds(pgio);
1235 } else { 834 return;
1236 u64 wb_size;
1237 if (pgio->pg_dreq == NULL)
1238 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
1239 req->wb_index);
1240 else
1241 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1242
1243 pnfs_generic_pg_init_write(pgio, req, wb_size);
1244 } 835 }
836
837 if (pgio->pg_dreq == NULL)
838 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
839 req->wb_index);
840 else
841 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
842
843 pnfs_generic_pg_init_write(pgio, req, wb_size);
1245} 844}
1246 845
1247/* 846/*
@@ -1252,10 +851,8 @@ static size_t
1252bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 851bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1253 struct nfs_page *req) 852 struct nfs_page *req)
1254{ 853{
1255 if (pgio->pg_dreq != NULL && 854 if (!is_aligned_req(pgio, req, PAGE_SIZE))
1256 !is_aligned_req(req, PAGE_CACHE_SIZE))
1257 return 0; 855 return 0;
1258
1259 return pnfs_generic_pg_test(pgio, prev, req); 856 return pnfs_generic_pg_test(pgio, prev, req);
1260} 857}
1261 858
@@ -1275,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
1275 .id = LAYOUT_BLOCK_VOLUME, 872 .id = LAYOUT_BLOCK_VOLUME,
1276 .name = "LAYOUT_BLOCK_VOLUME", 873 .name = "LAYOUT_BLOCK_VOLUME",
1277 .owner = THIS_MODULE, 874 .owner = THIS_MODULE,
875 .flags = PNFS_LAYOUTRET_ON_SETATTR |
876 PNFS_READ_WHOLE_PAGE,
1278 .read_pagelist = bl_read_pagelist, 877 .read_pagelist = bl_read_pagelist,
1279 .write_pagelist = bl_write_pagelist, 878 .write_pagelist = bl_write_pagelist,
1280 .alloc_layout_hdr = bl_alloc_layout_hdr, 879 .alloc_layout_hdr = bl_alloc_layout_hdr,
1281 .free_layout_hdr = bl_free_layout_hdr, 880 .free_layout_hdr = bl_free_layout_hdr,
1282 .alloc_lseg = bl_alloc_lseg, 881 .alloc_lseg = bl_alloc_lseg,
1283 .free_lseg = bl_free_lseg, 882 .free_lseg = bl_free_lseg,
1284 .encode_layoutcommit = bl_encode_layoutcommit, 883 .return_range = bl_return_range,
884 .prepare_layoutcommit = bl_prepare_layoutcommit,
1285 .cleanup_layoutcommit = bl_cleanup_layoutcommit, 885 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
1286 .set_layoutdriver = bl_set_layoutdriver, 886 .set_layoutdriver = bl_set_layoutdriver,
1287 .clear_layoutdriver = bl_clear_layoutdriver, 887 .alloc_deviceid_node = bl_alloc_deviceid_node,
888 .free_deviceid_node = bl_free_deviceid_node,
1288 .pg_read_ops = &bl_pg_read_ops, 889 .pg_read_ops = &bl_pg_read_ops,
1289 .pg_write_ops = &bl_pg_write_ops, 890 .pg_write_ops = &bl_pg_write_ops,
1290}; 891};
1291 892
1292static const struct rpc_pipe_ops bl_upcall_ops = {
1293 .upcall = rpc_pipe_generic_upcall,
1294 .downcall = bl_pipe_downcall,
1295 .destroy_msg = bl_pipe_destroy_msg,
1296};
1297
1298static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
1299 struct rpc_pipe *pipe)
1300{
1301 struct dentry *dir, *dentry;
1302
1303 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
1304 if (dir == NULL)
1305 return ERR_PTR(-ENOENT);
1306 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
1307 dput(dir);
1308 return dentry;
1309}
1310
1311static void nfs4blocklayout_unregister_sb(struct super_block *sb,
1312 struct rpc_pipe *pipe)
1313{
1314 if (pipe->dentry)
1315 rpc_unlink(pipe->dentry);
1316}
1317
1318static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
1319 void *ptr)
1320{
1321 struct super_block *sb = ptr;
1322 struct net *net = sb->s_fs_info;
1323 struct nfs_net *nn = net_generic(net, nfs_net_id);
1324 struct dentry *dentry;
1325 int ret = 0;
1326
1327 if (!try_module_get(THIS_MODULE))
1328 return 0;
1329
1330 if (nn->bl_device_pipe == NULL) {
1331 module_put(THIS_MODULE);
1332 return 0;
1333 }
1334
1335 switch (event) {
1336 case RPC_PIPEFS_MOUNT:
1337 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
1338 if (IS_ERR(dentry)) {
1339 ret = PTR_ERR(dentry);
1340 break;
1341 }
1342 nn->bl_device_pipe->dentry = dentry;
1343 break;
1344 case RPC_PIPEFS_UMOUNT:
1345 if (nn->bl_device_pipe->dentry)
1346 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
1347 break;
1348 default:
1349 ret = -ENOTSUPP;
1350 break;
1351 }
1352 module_put(THIS_MODULE);
1353 return ret;
1354}
1355
1356static struct notifier_block nfs4blocklayout_block = {
1357 .notifier_call = rpc_pipefs_event,
1358};
1359
1360static struct dentry *nfs4blocklayout_register_net(struct net *net,
1361 struct rpc_pipe *pipe)
1362{
1363 struct super_block *pipefs_sb;
1364 struct dentry *dentry;
1365
1366 pipefs_sb = rpc_get_sb_net(net);
1367 if (!pipefs_sb)
1368 return NULL;
1369 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1370 rpc_put_sb_net(net);
1371 return dentry;
1372}
1373
1374static void nfs4blocklayout_unregister_net(struct net *net,
1375 struct rpc_pipe *pipe)
1376{
1377 struct super_block *pipefs_sb;
1378
1379 pipefs_sb = rpc_get_sb_net(net);
1380 if (pipefs_sb) {
1381 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
1382 rpc_put_sb_net(net);
1383 }
1384}
1385
1386static int nfs4blocklayout_net_init(struct net *net)
1387{
1388 struct nfs_net *nn = net_generic(net, nfs_net_id);
1389 struct dentry *dentry;
1390
1391 init_waitqueue_head(&nn->bl_wq);
1392 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1393 if (IS_ERR(nn->bl_device_pipe))
1394 return PTR_ERR(nn->bl_device_pipe);
1395 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1396 if (IS_ERR(dentry)) {
1397 rpc_destroy_pipe_data(nn->bl_device_pipe);
1398 return PTR_ERR(dentry);
1399 }
1400 nn->bl_device_pipe->dentry = dentry;
1401 return 0;
1402}
1403
1404static void nfs4blocklayout_net_exit(struct net *net)
1405{
1406 struct nfs_net *nn = net_generic(net, nfs_net_id);
1407
1408 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1409 rpc_destroy_pipe_data(nn->bl_device_pipe);
1410 nn->bl_device_pipe = NULL;
1411}
1412
1413static struct pernet_operations nfs4blocklayout_net_ops = {
1414 .init = nfs4blocklayout_net_init,
1415 .exit = nfs4blocklayout_net_exit,
1416};
1417
1418static int __init nfs4blocklayout_init(void) 893static int __init nfs4blocklayout_init(void)
1419{ 894{
1420 int ret; 895 int ret;
@@ -1424,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
1424 ret = pnfs_register_layoutdriver(&blocklayout_type); 899 ret = pnfs_register_layoutdriver(&blocklayout_type);
1425 if (ret) 900 if (ret)
1426 goto out; 901 goto out;
1427 902 ret = bl_init_pipefs();
1428 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
1429 if (ret) 903 if (ret)
1430 goto out_remove; 904 goto out_unregister;
1431 ret = register_pernet_subsys(&nfs4blocklayout_net_ops); 905 return 0;
1432 if (ret)
1433 goto out_notifier;
1434out:
1435 return ret;
1436 906
1437out_notifier: 907out_unregister:
1438 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1439out_remove:
1440 pnfs_unregister_layoutdriver(&blocklayout_type); 908 pnfs_unregister_layoutdriver(&blocklayout_type);
909out:
1441 return ret; 910 return ret;
1442} 911}
1443 912
@@ -1446,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
1446 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 915 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1447 __func__); 916 __func__);
1448 917
1449 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); 918 bl_cleanup_pipefs();
1450 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
1451 pnfs_unregister_layoutdriver(&blocklayout_type); 919 pnfs_unregister_layoutdriver(&blocklayout_type);
1452} 920}
1453 921
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
45#define SECTOR_SIZE (1 << SECTOR_SHIFT) 45#define SECTOR_SIZE (1 << SECTOR_SHIFT)
46 46
47struct block_mount_id { 47struct pnfs_block_dev;
48 spinlock_t bm_lock; /* protects list */
49 struct list_head bm_devlist; /* holds pnfs_block_dev */
50};
51 48
52struct pnfs_block_dev { 49enum pnfs_block_volume_type {
53 struct list_head bm_node; 50 PNFS_BLOCK_VOLUME_SIMPLE = 0,
54 struct nfs4_deviceid bm_mdevid; /* associated devid */ 51 PNFS_BLOCK_VOLUME_SLICE = 1,
55 struct block_device *bm_mdev; /* meta device itself */ 52 PNFS_BLOCK_VOLUME_CONCAT = 2,
56 struct net *net; 53 PNFS_BLOCK_VOLUME_STRIPE = 3,
57}; 54};
58 55
59enum exstate4 { 56#define PNFS_BLOCK_MAX_UUIDS 4
60 PNFS_BLOCK_READWRITE_DATA = 0, 57#define PNFS_BLOCK_MAX_DEVICES 64
61 PNFS_BLOCK_READ_DATA = 1, 58
62 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ 59/*
63 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ 60 * Random upper cap for the uuid length to avoid unbounded allocation.
61 * Not actually limited by the protocol.
62 */
63#define PNFS_BLOCK_UUID_LEN 128
64
65
66struct pnfs_block_volume {
67 enum pnfs_block_volume_type type;
68 union {
69 struct {
70 int len;
71 int nr_sigs;
72 struct {
73 u64 offset;
74 u32 sig_len;
75 u8 sig[PNFS_BLOCK_UUID_LEN];
76 } sigs[PNFS_BLOCK_MAX_UUIDS];
77 } simple;
78 struct {
79 u64 start;
80 u64 len;
81 u32 volume;
82 } slice;
83 struct {
84 u32 volumes_count;
85 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
86 } concat;
87 struct {
88 u64 chunk_size;
89 u32 volumes_count;
90 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
91 } stripe;
92 };
64}; 93};
65 94
66#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ 95struct pnfs_block_dev_map {
96 sector_t start;
97 sector_t len;
67 98
68struct my_tree { 99 sector_t disk_offset;
69 sector_t mtt_step_size; /* Internal sector alignment */ 100 struct block_device *bdev;
70 struct list_head mtt_stub; /* Should be a radix tree */
71}; 101};
72 102
73struct pnfs_inval_markings { 103struct pnfs_block_dev {
74 spinlock_t im_lock; 104 struct nfs4_deviceid_node node;
75 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 105
76 sector_t im_block_size; /* Server blocksize in sectors */ 106 u64 start;
77 struct list_head im_extents; /* Short extents for INVAL->RW conversion */ 107 u64 len;
108
109 u32 nr_children;
110 struct pnfs_block_dev *children;
111 u64 chunk_size;
112
113 struct block_device *bdev;
114 u64 disk_offset;
115
116 bool (*map)(struct pnfs_block_dev *dev, u64 offset,
117 struct pnfs_block_dev_map *map);
78}; 118};
79 119
80struct pnfs_inval_tracking { 120enum exstate4 {
81 struct list_head it_link; 121 PNFS_BLOCK_READWRITE_DATA = 0,
82 int it_sector; 122 PNFS_BLOCK_READ_DATA = 1,
83 int it_tags; 123 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
124 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
84}; 125};
85 126
86/* sector_t fields are all in 512-byte sectors */ 127/* sector_t fields are all in 512-byte sectors */
87struct pnfs_block_extent { 128struct pnfs_block_extent {
88 struct kref be_refcnt; 129 union {
89 struct list_head be_node; /* link into lseg list */ 130 struct rb_node be_node;
90 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ 131 struct list_head be_list;
91 struct block_device *be_mdev; 132 };
133 struct nfs4_deviceid_node *be_device;
92 sector_t be_f_offset; /* the starting offset in the file */ 134 sector_t be_f_offset; /* the starting offset in the file */
93 sector_t be_length; /* the size of the extent */ 135 sector_t be_length; /* the size of the extent */
94 sector_t be_v_offset; /* the starting offset in the volume */ 136 sector_t be_v_offset; /* the starting offset in the volume */
95 enum exstate4 be_state; /* the state of this extent */ 137 enum exstate4 be_state; /* the state of this extent */
96 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ 138#define EXTENT_WRITTEN 1
139#define EXTENT_COMMITTING 2
140 unsigned int be_tag;
97}; 141};
98 142
99/* Shortened extent used by LAYOUTCOMMIT */ 143/* on the wire size of the extent */
100struct pnfs_block_short_extent { 144#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
101 struct list_head bse_node;
102 struct nfs4_deviceid bse_devid;
103 struct block_device *bse_mdev;
104 sector_t bse_f_offset; /* the starting offset in the file */
105 sector_t bse_length; /* the size of the extent */
106};
107
108static inline void
109BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
110{
111 spin_lock_init(&marks->im_lock);
112 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
113 INIT_LIST_HEAD(&marks->im_extents);
114 marks->im_block_size = blocksize;
115 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
116 blocksize);
117}
118
119enum extentclass4 {
120	RW_EXTENT = 0, /* READWRITE and INVAL */
121 RO_EXTENT = 1, /* READ and NONE */
122 EXTENT_LISTS = 2,
123};
124
125static inline int bl_choose_list(enum exstate4 state)
126{
127 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
128 return RO_EXTENT;
129 else
130 return RW_EXTENT;
131}
132 145
133struct pnfs_block_layout { 146struct pnfs_block_layout {
134 struct pnfs_layout_hdr bl_layout; 147 struct pnfs_layout_hdr bl_layout;
135 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ 148 struct rb_root bl_ext_rw;
149 struct rb_root bl_ext_ro;
136 spinlock_t bl_ext_lock; /* Protects list manipulation */ 150 spinlock_t bl_ext_lock; /* Protects list manipulation */
137 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
138 struct list_head bl_commit; /* Needs layout commit */
139 struct list_head bl_committing; /* Layout committing */
140 unsigned int bl_count; /* entries in bl_commit */
141 sector_t bl_blocksize; /* Server blocksize in sectors */
142}; 151};
143 152
144#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
145
146static inline struct pnfs_block_layout * 153static inline struct pnfs_block_layout *
147BLK_LO2EXT(struct pnfs_layout_hdr *lo) 154BLK_LO2EXT(struct pnfs_layout_hdr *lo)
148{ 155{
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
171#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ 178#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
172#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ 179#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
173 180
174/* blocklayoutdev.c */ 181/* dev.c */
175ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); 182struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
176void bl_pipe_destroy_msg(struct rpc_pipe_msg *); 183 struct pnfs_device *pdev, gfp_t gfp_mask);
177void nfs4_blkdev_put(struct block_device *bdev); 184void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
178struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, 185
179 struct pnfs_device *dev); 186/* extent_tree.c */
180int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, 187int ext_tree_insert(struct pnfs_block_layout *bl,
181 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 188 struct pnfs_block_extent *new);
182 189int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
183/* blocklayoutdm.c */ 190 sector_t end);
184void bl_free_block_dev(struct pnfs_block_dev *bdev); 191int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
185 192 sector_t len);
186/* extents.c */ 193bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
187struct pnfs_block_extent * 194 struct pnfs_block_extent *ret, bool rw);
188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 195int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
189 struct pnfs_block_extent **cow_read); 196void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
190int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 197
191 sector_t offset, sector_t length); 198/* rpc_pipefs.c */
192void bl_put_extent(struct pnfs_block_extent *be); 199dev_t bl_resolve_deviceid(struct nfs_server *server,
193struct pnfs_block_extent *bl_alloc_extent(void); 200 struct pnfs_block_volume *b, gfp_t gfp_mask);
194int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 201int __init bl_init_pipefs(void);
195int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 202void __exit bl_cleanup_pipefs(void);
196 struct xdr_stream *xdr,
197 const struct nfs4_layoutcommit_args *arg);
198void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
199 const struct nfs4_layoutcommit_args *arg,
200 int status);
201int bl_add_merge_extent(struct pnfs_block_layout *bl,
202 struct pnfs_block_extent *new);
203int bl_mark_for_commit(struct pnfs_block_extent *be,
204 sector_t offset, sector_t length,
205 struct pnfs_block_short_extent *new);
206int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
207struct pnfs_block_short_extent *
208bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
209void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
210 203
211#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 204#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c9361..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 file layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/*
57 * Release the block device
58 */
59void nfs4_blkdev_put(struct block_device *bdev)
60{
61 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
62 MINOR(bdev->bd_dev));
63 blkdev_put(bdev, FMODE_READ);
64}
65
66ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
67 size_t mlen)
68{
69 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
70 nfs_net_id);
71
72 if (mlen != sizeof (struct bl_dev_msg))
73 return -EINVAL;
74
75 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
76 return -EFAULT;
77
78 wake_up(&nn->bl_wq);
79
80 return mlen;
81}
82
83void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
84{
85 struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
86
87 if (msg->errno >= 0)
88 return;
89 wake_up(bl_pipe_msg->bl_wq);
90}
91
92/*
93 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
94 */
95struct pnfs_block_dev *
96nfs4_blk_decode_device(struct nfs_server *server,
97 struct pnfs_device *dev)
98{
99 struct pnfs_block_dev *rv;
100 struct block_device *bd = NULL;
101 struct bl_pipe_msg bl_pipe_msg;
102 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
103 struct bl_msg_hdr bl_msg = {
104 .type = BL_DEVICE_MOUNT,
105 .totallen = dev->mincount,
106 };
107 uint8_t *dataptr;
108 DECLARE_WAITQUEUE(wq, current);
109 int offset, len, i, rc;
110 struct net *net = server->nfs_client->cl_net;
111 struct nfs_net *nn = net_generic(net, nfs_net_id);
112 struct bl_dev_msg *reply = &nn->bl_mount_reply;
113
114 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
115 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
116 dev->mincount);
117
118 bl_pipe_msg.bl_wq = &nn->bl_wq;
119 memset(msg, 0, sizeof(*msg));
120 msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
121 if (!msg->data) {
122 rv = ERR_PTR(-ENOMEM);
123 goto out;
124 }
125
126 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
127 dataptr = (uint8_t *) msg->data;
128 len = dev->mincount;
129 offset = sizeof(bl_msg);
130 for (i = 0; len > 0; i++) {
131 memcpy(&dataptr[offset], page_address(dev->pages[i]),
132 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
133 len -= PAGE_CACHE_SIZE;
134 offset += PAGE_CACHE_SIZE;
135 }
136 msg->len = sizeof(bl_msg) + dev->mincount;
137
138 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
139 add_wait_queue(&nn->bl_wq, &wq);
140 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
141 if (rc < 0) {
142 remove_wait_queue(&nn->bl_wq, &wq);
143 rv = ERR_PTR(rc);
144 goto out;
145 }
146
147 set_current_state(TASK_UNINTERRUPTIBLE);
148 schedule();
149 __set_current_state(TASK_RUNNING);
150 remove_wait_queue(&nn->bl_wq, &wq);
151
152 if (reply->status != BL_DEVICE_REQUEST_PROC) {
153 dprintk("%s failed to open device: %d\n",
154 __func__, reply->status);
155 rv = ERR_PTR(-EINVAL);
156 goto out;
157 }
158
159 bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
160 FMODE_READ, NULL);
161 if (IS_ERR(bd)) {
162 dprintk("%s failed to open device : %ld\n", __func__,
163 PTR_ERR(bd));
164 rv = ERR_CAST(bd);
165 goto out;
166 }
167
168 rv = kzalloc(sizeof(*rv), GFP_NOFS);
169 if (!rv) {
170 rv = ERR_PTR(-ENOMEM);
171 goto out;
172 }
173
174 rv->bm_mdev = bd;
175 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
176 rv->net = net;
177 dprintk("%s Created device %s with bd_block_size %u\n",
178 __func__,
179 bd->bd_disk->disk_name,
180 bd->bd_block_size);
181
182out:
183 kfree(msg->data);
184 return rv;
185}
186
187/* Map deviceid returned by the server to constructed block_device */
188static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
189 struct nfs4_deviceid *id)
190{
191 struct block_device *rv = NULL;
192 struct block_mount_id *mid;
193 struct pnfs_block_dev *dev;
194
195 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
196 mid = BLK_ID(lo);
197 spin_lock(&mid->bm_lock);
198 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
199 if (memcmp(id->data, dev->bm_mdevid.data,
200 NFS4_DEVICEID4_SIZE) == 0) {
201 rv = dev->bm_mdev;
202 goto out;
203 }
204 }
205 out:
206 spin_unlock(&mid->bm_lock);
207 dprintk("%s returning %p\n", __func__, rv);
208 return rv;
209}
210
211/* Tracks info needed to ensure extents in layout obey constraints of spec */
212struct layout_verification {
213 u32 mode; /* R or RW */
214 u64 start; /* Expected start of next non-COW extent */
215 u64 inval; /* Start of INVAL coverage */
216 u64 cowread; /* End of COW read coverage */
217};
218
219/* Verify the extent meets the layout requirements of the pnfs-block draft,
220 * section 2.3.1.
221 */
222static int verify_extent(struct pnfs_block_extent *be,
223 struct layout_verification *lv)
224{
225 if (lv->mode == IOMODE_READ) {
226 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
227 be->be_state == PNFS_BLOCK_INVALID_DATA)
228 return -EIO;
229 if (be->be_f_offset != lv->start)
230 return -EIO;
231 lv->start += be->be_length;
232 return 0;
233 }
234 /* lv->mode == IOMODE_RW */
235 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
236 if (be->be_f_offset != lv->start)
237 return -EIO;
238 if (lv->cowread > lv->start)
239 return -EIO;
240 lv->start += be->be_length;
241 lv->inval = lv->start;
242 return 0;
243 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
244 if (be->be_f_offset != lv->start)
245 return -EIO;
246 lv->start += be->be_length;
247 return 0;
248 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
249 if (be->be_f_offset > lv->start)
250 return -EIO;
251 if (be->be_f_offset < lv->inval)
252 return -EIO;
253 if (be->be_f_offset < lv->cowread)
254 return -EIO;
255 /* It looks like you might want to min this with lv->start,
256 * but you really don't.
257 */
258 lv->inval = lv->inval + be->be_length;
259 lv->cowread = be->be_f_offset + be->be_length;
260 return 0;
261 } else
262 return -EIO;
263}
264
265/* XDR decode pnfs_block_layout4 structure */
266int
267nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
268 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
269{
270 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
271 int i, status = -EIO;
272 uint32_t count;
273 struct pnfs_block_extent *be = NULL, *save;
274 struct xdr_stream stream;
275 struct xdr_buf buf;
276 struct page *scratch;
277 __be32 *p;
278 struct layout_verification lv = {
279 .mode = lgr->range.iomode,
280 .start = lgr->range.offset >> SECTOR_SHIFT,
281 .inval = lgr->range.offset >> SECTOR_SHIFT,
282 .cowread = lgr->range.offset >> SECTOR_SHIFT,
283 };
284 LIST_HEAD(extents);
285
286 dprintk("---> %s\n", __func__);
287
288 scratch = alloc_page(gfp_flags);
289 if (!scratch)
290 return -ENOMEM;
291
292 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
293 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
294
295 p = xdr_inline_decode(&stream, 4);
296 if (unlikely(!p))
297 goto out_err;
298
299 count = be32_to_cpup(p++);
300
301 dprintk("%s enter, number of extents %i\n", __func__, count);
302 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
303 if (unlikely(!p))
304 goto out_err;
305
306 /* Decode individual extents, putting them in temporary
307 * staging area until whole layout is decoded to make error
308 * recovery easier.
309 */
310 for (i = 0; i < count; i++) {
311 be = bl_alloc_extent();
312 if (!be) {
313 status = -ENOMEM;
314 goto out_err;
315 }
316 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
317 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
318 be->be_mdev = translate_devid(lo, &be->be_devid);
319 if (!be->be_mdev)
320 goto out_err;
321
322 /* The next three values are read in as bytes,
323 * but stored as 512-byte sector lengths
324 */
325 if (decode_sector_number(&p, &be->be_f_offset) < 0)
326 goto out_err;
327 if (decode_sector_number(&p, &be->be_length) < 0)
328 goto out_err;
329 if (decode_sector_number(&p, &be->be_v_offset) < 0)
330 goto out_err;
331 be->be_state = be32_to_cpup(p++);
332 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
333 be->be_inval = &bl->bl_inval;
334 if (verify_extent(be, &lv)) {
335 dprintk("%s verify failed\n", __func__);
336 goto out_err;
337 }
338 list_add_tail(&be->be_node, &extents);
339 }
340 if (lgr->range.offset + lgr->range.length !=
341 lv.start << SECTOR_SHIFT) {
342 dprintk("%s Final length mismatch\n", __func__);
343 be = NULL;
344 goto out_err;
345 }
346 if (lv.start < lv.cowread) {
347 dprintk("%s Final uncovered COW extent\n", __func__);
348 be = NULL;
349 goto out_err;
350 }
351 /* Extents decoded properly, now try to merge them in to
352 * existing layout extents.
353 */
354 spin_lock(&bl->bl_ext_lock);
355 list_for_each_entry_safe(be, save, &extents, be_node) {
356 list_del(&be->be_node);
357 status = bl_add_merge_extent(bl, be);
358 if (status) {
359 spin_unlock(&bl->bl_ext_lock);
360 /* This is a fairly catastrophic error, as the
361 * entire layout extent lists are now corrupted.
362 * We should have some way to distinguish this.
363 */
364 be = NULL;
365 goto out_err;
366 }
367 }
368 spin_unlock(&bl->bl_ext_lock);
369 status = 0;
370 out:
371 __free_page(scratch);
372 dprintk("%s returns %i\n", __func__, status);
373 return status;
374
375 out_err:
376 bl_put_extent(be);
377 while (!list_empty(&extents)) {
378 be = list_first_entry(&extents, struct pnfs_block_extent,
379 be_node);
380 list_del(&be->be_node);
381 bl_put_extent(be);
382 }
383 goto out;
384}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd866..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(struct net *net, dev_t dev)
42{
43 struct bl_pipe_msg bl_pipe_msg;
44 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
45 struct bl_dev_msg bl_umount_request;
46 struct bl_msg_hdr bl_msg = {
47 .type = BL_DEVICE_UMOUNT,
48 .totallen = sizeof(bl_umount_request),
49 };
50 uint8_t *dataptr;
51 DECLARE_WAITQUEUE(wq, current);
52 struct nfs_net *nn = net_generic(net, nfs_net_id);
53
54 dprintk("Entering %s\n", __func__);
55
56 bl_pipe_msg.bl_wq = &nn->bl_wq;
57 memset(msg, 0, sizeof(*msg));
58 msg->len = sizeof(bl_msg) + bl_msg.totallen;
59 msg->data = kzalloc(msg->len, GFP_NOFS);
60 if (!msg->data)
61 goto out;
62
63 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
64 bl_umount_request.major = MAJOR(dev);
65 bl_umount_request.minor = MINOR(dev);
66
67 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
68 dataptr = (uint8_t *) msg->data;
69 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
70
71 add_wait_queue(&nn->bl_wq, &wq);
72 if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
73 remove_wait_queue(&nn->bl_wq, &wq);
74 goto out;
75 }
76
77 set_current_state(TASK_UNINTERRUPTIBLE);
78 schedule();
79 __set_current_state(TASK_RUNNING);
80 remove_wait_queue(&nn->bl_wq, &wq);
81
82out:
83 kfree(msg->data);
84}
85
86/*
87 * Release meta device
88 */
89static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
90{
91 dprintk("%s Releasing\n", __func__);
92 nfs4_blkdev_put(bdev->bm_mdev);
93 dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
94}
95
96void bl_free_block_dev(struct pnfs_block_dev *bdev)
97{
98 if (bdev) {
99 if (bdev->bm_mdev) {
100 dprintk("%s Removing DM device: %d:%d\n",
101 __func__,
102 MAJOR(bdev->bm_mdev->bd_dev),
103 MINOR(bdev->bm_mdev->bd_dev));
104 nfs4_blk_metadev_release(bdev);
105 }
106 kfree(bdev);
107 }
108}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..5aed4f98df41
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/blkdev.h>
6#include <linux/nfs4.h>
7#include <linux/nfs_fs.h>
8#include <linux/nfs_xdr.h>
9
10#include "blocklayout.h"
11
12#define NFSDBG_FACILITY NFSDBG_PNFS_LD
13
14static void
15bl_free_device(struct pnfs_block_dev *dev)
16{
17 if (dev->nr_children) {
18 int i;
19
20 for (i = 0; i < dev->nr_children; i++)
21 bl_free_device(&dev->children[i]);
22 kfree(dev->children);
23 } else {
24 if (dev->bdev)
25 blkdev_put(dev->bdev, FMODE_READ);
26 }
27}
28
29void
30bl_free_deviceid_node(struct nfs4_deviceid_node *d)
31{
32 struct pnfs_block_dev *dev =
33 container_of(d, struct pnfs_block_dev, node);
34
35 bl_free_device(dev);
36 kfree(dev);
37}
38
39static int
40nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
41{
42 __be32 *p;
43 int i;
44
45 p = xdr_inline_decode(xdr, 4);
46 if (!p)
47 return -EIO;
48 b->type = be32_to_cpup(p++);
49
50 switch (b->type) {
51 case PNFS_BLOCK_VOLUME_SIMPLE:
52 p = xdr_inline_decode(xdr, 4);
53 if (!p)
54 return -EIO;
55 b->simple.nr_sigs = be32_to_cpup(p++);
56 if (!b->simple.nr_sigs) {
57 dprintk("no signature\n");
58 return -EIO;
59 }
60
61 b->simple.len = 4 + 4;
62 for (i = 0; i < b->simple.nr_sigs; i++) {
63 p = xdr_inline_decode(xdr, 8 + 4);
64 if (!p)
65 return -EIO;
66 p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
67 b->simple.sigs[i].sig_len = be32_to_cpup(p++);
68
69 p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
70 if (!p)
71 return -EIO;
72 memcpy(&b->simple.sigs[i].sig, p,
73 b->simple.sigs[i].sig_len);
74
75 b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
76 }
77 break;
78 case PNFS_BLOCK_VOLUME_SLICE:
79 p = xdr_inline_decode(xdr, 8 + 8 + 4);
80 if (!p)
81 return -EIO;
82 p = xdr_decode_hyper(p, &b->slice.start);
83 p = xdr_decode_hyper(p, &b->slice.len);
84 b->slice.volume = be32_to_cpup(p++);
85 break;
86 case PNFS_BLOCK_VOLUME_CONCAT:
87 p = xdr_inline_decode(xdr, 4);
88 if (!p)
89 return -EIO;
90 b->concat.volumes_count = be32_to_cpup(p++);
91
92 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
93 if (!p)
94 return -EIO;
95 for (i = 0; i < b->concat.volumes_count; i++)
96 b->concat.volumes[i] = be32_to_cpup(p++);
97 break;
98 case PNFS_BLOCK_VOLUME_STRIPE:
99 p = xdr_inline_decode(xdr, 8 + 4);
100 if (!p)
101 return -EIO;
102 p = xdr_decode_hyper(p, &b->stripe.chunk_size);
103 b->stripe.volumes_count = be32_to_cpup(p++);
104
105 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
106 if (!p)
107 return -EIO;
108 for (i = 0; i < b->stripe.volumes_count; i++)
109 b->stripe.volumes[i] = be32_to_cpup(p++);
110 break;
111 default:
112 dprintk("unknown volume type!\n");
113 return -EIO;
114 }
115
116 return 0;
117}
118
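Each volume entry is self-describing XDR: a 4-byte type discriminator followed by type-specific fields. As an illustration (not part of this patch), a userspace encoder for one SLICE entry matching the layout decoded above — type, 8-byte start, 8-byte length, 4-byte index of the underlying volume:

#include <stdio.h>
#include <stdint.h>

static uint8_t *put32(uint8_t *p, uint32_t v)
{
    p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
    return p + 4;
}

static uint8_t *put64(uint8_t *p, uint64_t v)
{
    p = put32(p, v >> 32);
    return put32(p, (uint32_t)v);
}

int main(void)
{
    uint8_t buf[24], *p = buf;

    p = put32(p, 1);            /* PNFS_BLOCK_VOLUME_SLICE */
    p = put64(p, 0);            /* start */
    p = put64(p, 1 << 20);      /* len */
    p = put32(p, 0);            /* index of the underlying volume */

    printf("encoded %zu bytes\n", (size_t)(p - buf));   /* 24 */
    return 0;
}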
119static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
120 struct pnfs_block_dev_map *map)
121{
122 map->start = dev->start;
123 map->len = dev->len;
124 map->disk_offset = dev->disk_offset;
125 map->bdev = dev->bdev;
126 return true;
127}
128
129static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
130 struct pnfs_block_dev_map *map)
131{
132 int i;
133
134 for (i = 0; i < dev->nr_children; i++) {
135 struct pnfs_block_dev *child = &dev->children[i];
136
137 if (child->start > offset ||
138 child->start + child->len <= offset)
139 continue;
140
141 child->map(child, offset - child->start, map);
142 return true;
143 }
144
145 dprintk("%s: ran off loop!\n", __func__);
146 return false;
147}
148
149static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
150 struct pnfs_block_dev_map *map)
151{
152 struct pnfs_block_dev *child;
153 u64 chunk;
154 u32 chunk_idx;
155 u64 disk_offset;
156
157 chunk = div_u64(offset, dev->chunk_size);
158 div_u64_rem(chunk, dev->nr_children, &chunk_idx);
159
160	if (chunk_idx >= dev->nr_children) {
161 dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
162 __func__, chunk_idx, offset, dev->chunk_size);
163 /* error, should not happen */
164 return false;
165 }
166
167 /* truncate offset to the beginning of the stripe */
168 offset = chunk * dev->chunk_size;
169
170 /* disk offset of the stripe */
171 disk_offset = div_u64(offset, dev->nr_children);
172
173 child = &dev->children[chunk_idx];
174 child->map(child, disk_offset, map);
175
176 map->start += offset;
177 map->disk_offset += disk_offset;
178 map->len = dev->chunk_size;
179 return true;
180}
181
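The stripe mapping reduces to integer arithmetic: the chunk index selects a child round-robin, and the disk offset of a stripe is the stripe's start offset divided by the child count. A worked userspace example with assumed values (chunk_size 64 sectors, 4 children):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint64_t chunk_size = 64, nr_children = 4;
    uint64_t offset = 300;  /* logical offset into the striped device */

    uint64_t chunk = offset / chunk_size;               /* 4 */
    uint64_t chunk_idx = chunk % nr_children;           /* child 0 */
    uint64_t stripe_start = chunk * chunk_size;         /* 256 */
    uint64_t disk_offset = stripe_start / nr_children;  /* 64 */

    printf("offset %llu -> child %llu, stripe [%llu,%llu), disk off %llu\n",
           (unsigned long long)offset,
           (unsigned long long)chunk_idx,
           (unsigned long long)stripe_start,
           (unsigned long long)(stripe_start + chunk_size),
           (unsigned long long)disk_offset);
    return 0;
}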
182static int
183bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
184 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
185
186
187static int
188bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
189 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
190{
191 struct pnfs_block_volume *v = &volumes[idx];
192 dev_t dev;
193
194 dev = bl_resolve_deviceid(server, v, gfp_mask);
195 if (!dev)
196 return -EIO;
197
198 d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
199 if (IS_ERR(d->bdev)) {
200 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
201 MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
202 return PTR_ERR(d->bdev);
203 }
204
205
206 d->len = i_size_read(d->bdev->bd_inode);
207 d->map = bl_map_simple;
208
209 printk(KERN_INFO "pNFS: using block device %s\n",
210 d->bdev->bd_disk->disk_name);
211 return 0;
212}
213
214static int
215bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
216 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
217{
218 struct pnfs_block_volume *v = &volumes[idx];
219 int ret;
220
221 ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
222 if (ret)
223 return ret;
224
225 d->disk_offset = v->slice.start;
226 d->len = v->slice.len;
227 return 0;
228}
229
230static int
231bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
232 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
233{
234 struct pnfs_block_volume *v = &volumes[idx];
235 u64 len = 0;
236 int ret, i;
237
238 d->children = kcalloc(v->concat.volumes_count,
239 sizeof(struct pnfs_block_dev), GFP_KERNEL);
240 if (!d->children)
241 return -ENOMEM;
242
243 for (i = 0; i < v->concat.volumes_count; i++) {
244 ret = bl_parse_deviceid(server, &d->children[i],
245 volumes, v->concat.volumes[i], gfp_mask);
246 if (ret)
247 return ret;
248
249 d->nr_children++;
250 d->children[i].start += len;
251 len += d->children[i].len;
252 }
253
254 d->len = len;
255 d->map = bl_map_concat;
256 return 0;
257}
258
259static int
260bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
261 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
262{
263 struct pnfs_block_volume *v = &volumes[idx];
264 u64 len = 0;
265 int ret, i;
266
267 d->children = kcalloc(v->stripe.volumes_count,
268 sizeof(struct pnfs_block_dev), GFP_KERNEL);
269 if (!d->children)
270 return -ENOMEM;
271
272 for (i = 0; i < v->stripe.volumes_count; i++) {
273 ret = bl_parse_deviceid(server, &d->children[i],
274 volumes, v->stripe.volumes[i], gfp_mask);
275 if (ret)
276 return ret;
277
278 d->nr_children++;
279 len += d->children[i].len;
280 }
281
282 d->len = len;
283 d->chunk_size = v->stripe.chunk_size;
284 d->map = bl_map_stripe;
285 return 0;
286}
287
288static int
289bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
290 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
291{
292 switch (volumes[idx].type) {
293 case PNFS_BLOCK_VOLUME_SIMPLE:
294 return bl_parse_simple(server, d, volumes, idx, gfp_mask);
295 case PNFS_BLOCK_VOLUME_SLICE:
296 return bl_parse_slice(server, d, volumes, idx, gfp_mask);
297 case PNFS_BLOCK_VOLUME_CONCAT:
298 return bl_parse_concat(server, d, volumes, idx, gfp_mask);
299 case PNFS_BLOCK_VOLUME_STRIPE:
300 return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
301 default:
302 dprintk("unsupported volume type: %d\n", volumes[idx].type);
303 return -EIO;
304 }
305}
306
307struct nfs4_deviceid_node *
308bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
309 gfp_t gfp_mask)
310{
311 struct nfs4_deviceid_node *node = NULL;
312 struct pnfs_block_volume *volumes;
313 struct pnfs_block_dev *top;
314 struct xdr_stream xdr;
315 struct xdr_buf buf;
316 struct page *scratch;
317 int nr_volumes, ret, i;
318 __be32 *p;
319
320 scratch = alloc_page(gfp_mask);
321 if (!scratch)
322 goto out;
323
324 xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
325 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
326
327 p = xdr_inline_decode(&xdr, sizeof(__be32));
328 if (!p)
329 goto out_free_scratch;
330 nr_volumes = be32_to_cpup(p++);
331
332 volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
333 gfp_mask);
334 if (!volumes)
335 goto out_free_scratch;
336
337 for (i = 0; i < nr_volumes; i++) {
338 ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
339 if (ret < 0)
340 goto out_free_volumes;
341 }
342
343 top = kzalloc(sizeof(*top), gfp_mask);
344 if (!top)
345 goto out_free_volumes;
346
347 ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
348 if (ret) {
349 bl_free_device(top);
350 kfree(top);
351 goto out_free_volumes;
352 }
353
354 node = &top->node;
355 nfs4_init_deviceid_node(node, server, &pdev->dev_id);
356
357out_free_volumes:
358 kfree(volumes);
359out_free_scratch:
360 __free_page(scratch);
361out:
362 return node;
363}
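Note that the root volume is parsed at index nr_volumes - 1: the wire format lists component volumes before any volume that references them, so recursion only ever descends to earlier indices and always terminates. A toy userspace model of that convention (hypothetical struct, not the driver's):

#include <stdio.h>

struct vol { int nr_kids; int kids[4]; int len; };

static int total_len(const struct vol *v, int idx)
{
    const struct vol *e = &v[idx];
    int sum = e->len, i;

    for (i = 0; i < e->nr_kids; i++)
        sum += total_len(v, e->kids[i]);    /* kids[i] < idx by construction */
    return sum;
}

int main(void)
{
    /* two simple volumes, concatenated by the root entry at index 2 */
    struct vol vols[] = {
        { 0, {0}, 100 },
        { 0, {0}, 200 },
        { 2, {0, 1}, 0 },
    };
    int nr = 3;

    printf("root length %d\n", total_len(vols, nr - 1));    /* 300 */
    return 0;
}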
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..31d0b5e53dfd
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4
5#include <linux/vmalloc.h>
6
7#include "blocklayout.h"
8
9#define NFSDBG_FACILITY NFSDBG_PNFS_LD
10
11static inline struct pnfs_block_extent *
12ext_node(struct rb_node *node)
13{
14 return rb_entry(node, struct pnfs_block_extent, be_node);
15}
16
17static struct pnfs_block_extent *
18ext_tree_first(struct rb_root *root)
19{
20 struct rb_node *node = rb_first(root);
21 return node ? ext_node(node) : NULL;
22}
23
24static struct pnfs_block_extent *
25ext_tree_prev(struct pnfs_block_extent *be)
26{
27 struct rb_node *node = rb_prev(&be->be_node);
28 return node ? ext_node(node) : NULL;
29}
30
31static struct pnfs_block_extent *
32ext_tree_next(struct pnfs_block_extent *be)
33{
34 struct rb_node *node = rb_next(&be->be_node);
35 return node ? ext_node(node) : NULL;
36}
37
38static inline sector_t
39ext_f_end(struct pnfs_block_extent *be)
40{
41 return be->be_f_offset + be->be_length;
42}
43
44static struct pnfs_block_extent *
45__ext_tree_search(struct rb_root *root, sector_t start)
46{
47 struct rb_node *node = root->rb_node;
48 struct pnfs_block_extent *be = NULL;
49
50 while (node) {
51 be = ext_node(node);
52 if (start < be->be_f_offset)
53 node = node->rb_left;
54 else if (start >= ext_f_end(be))
55 node = node->rb_right;
56 else
57 return be;
58 }
59
60 if (be) {
61 if (start < be->be_f_offset)
62 return be;
63
64 if (start >= ext_f_end(be))
65 return ext_tree_next(be);
66 }
67
68 return NULL;
69}
70
71static bool
72ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
73{
74 if (be1->be_state != be2->be_state)
75 return false;
76 if (be1->be_device != be2->be_device)
77 return false;
78
79 if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
80 return false;
81
82 if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
83 (be1->be_v_offset + be1->be_length != be2->be_v_offset))
84 return false;
85
86 if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
87 be1->be_tag != be2->be_tag)
88 return false;
89
90 return true;
91}
92
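ext_can_merge() only coalesces neighbours that match in state and device, are contiguous in the file, are also contiguous on disk unless the state is NONE_DATA (a hole has no disk address), and share the same tag when INVALID_DATA commit bookkeeping is in play. A standalone model of the predicate; the struct is a loose sketch of pnfs_block_extent, not the kernel definition:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum state { NONE_DATA, READ_DATA, INVALID_DATA, RW_DATA };

struct ext {
	uint64_t f_offset, v_offset, length;	/* in 512-byte sectors */
	enum state state;
	unsigned long tag;
	int device;
};

static bool can_merge(const struct ext *a, const struct ext *b)
{
	if (a->state != b->state || a->device != b->device)
		return false;
	if (a->f_offset + a->length != b->f_offset)
		return false;		/* not contiguous in the file */
	if (a->state != NONE_DATA &&
	    a->v_offset + a->length != b->v_offset)
		return false;		/* not contiguous on disk */
	if (a->state == INVALID_DATA && a->tag != b->tag)
		return false;		/* different commit bookkeeping */
	return true;
}

int main(void)
{
	struct ext a = { 0, 1000, 8, RW_DATA, 0, 1 };
	struct ext b = { 8, 1008, 8, RW_DATA, 0, 1 };

	printf("mergeable: %d\n", can_merge(&a, &b));	/* 1 */
	b.v_offset = 2000;				/* disk gap */
	printf("mergeable: %d\n", can_merge(&a, &b));	/* 0 */
	return 0;
}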
93static struct pnfs_block_extent *
94ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
95{
96 struct pnfs_block_extent *left = ext_tree_prev(be);
97
98 if (left && ext_can_merge(left, be)) {
99 left->be_length += be->be_length;
100 rb_erase(&be->be_node, root);
101 nfs4_put_deviceid_node(be->be_device);
102 kfree(be);
103 return left;
104 }
105
106 return be;
107}
108
109static struct pnfs_block_extent *
110ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
111{
112 struct pnfs_block_extent *right = ext_tree_next(be);
113
114 if (right && ext_can_merge(be, right)) {
115 be->be_length += right->be_length;
116 rb_erase(&right->be_node, root);
117 nfs4_put_deviceid_node(right->be_device);
118 kfree(right);
119 }
120
121 return be;
122}
123
124static void
125__ext_tree_insert(struct rb_root *root,
126 struct pnfs_block_extent *new, bool merge_ok)
127{
128 struct rb_node **p = &root->rb_node, *parent = NULL;
129 struct pnfs_block_extent *be;
130
131 while (*p) {
132 parent = *p;
133 be = ext_node(parent);
134
135 if (new->be_f_offset < be->be_f_offset) {
136 if (merge_ok && ext_can_merge(new, be)) {
137 be->be_f_offset = new->be_f_offset;
138 if (be->be_state != PNFS_BLOCK_NONE_DATA)
139 be->be_v_offset = new->be_v_offset;
140 be->be_length += new->be_length;
141 be = ext_try_to_merge_left(root, be);
142 goto free_new;
143 }
144 p = &(*p)->rb_left;
145 } else if (new->be_f_offset >= ext_f_end(be)) {
146 if (merge_ok && ext_can_merge(be, new)) {
147 be->be_length += new->be_length;
148 be = ext_try_to_merge_right(root, be);
149 goto free_new;
150 }
151 p = &(*p)->rb_right;
152 } else {
153 BUG();
154 }
155 }
156
157 rb_link_node(&new->be_node, parent, p);
158 rb_insert_color(&new->be_node, root);
159 return;
160free_new:
161 nfs4_put_deviceid_node(new->be_device);
162 kfree(new);
163}
164
165static int
166__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
167{
168 struct pnfs_block_extent *be;
169 sector_t len1 = 0, len2 = 0;
170 sector_t orig_v_offset;
171 sector_t orig_len;
172
173 be = __ext_tree_search(root, start);
174 if (!be)
175 return 0;
176 if (be->be_f_offset >= end)
177 return 0;
178
179 orig_v_offset = be->be_v_offset;
180 orig_len = be->be_length;
181
182 if (start > be->be_f_offset)
183 len1 = start - be->be_f_offset;
184 if (ext_f_end(be) > end)
185 len2 = ext_f_end(be) - end;
186
187 if (len2 > 0) {
188 if (len1 > 0) {
189 struct pnfs_block_extent *new;
190
191 new = kzalloc(sizeof(*new), GFP_ATOMIC);
192 if (!new)
193 return -ENOMEM;
194
195 be->be_length = len1;
196
197 new->be_f_offset = end;
198 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
199 new->be_v_offset =
200 orig_v_offset + orig_len - len2;
201 }
202 new->be_length = len2;
203 new->be_state = be->be_state;
204 new->be_tag = be->be_tag;
205 new->be_device = nfs4_get_deviceid(be->be_device);
206
207 __ext_tree_insert(root, new, true);
208 } else {
209 be->be_f_offset = end;
210 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
211 be->be_v_offset =
212 orig_v_offset + orig_len - len2;
213 }
214 be->be_length = len2;
215 }
216 } else {
217 if (len1 > 0) {
218 be->be_length = len1;
219 be = ext_tree_next(be);
220 }
221
222 while (be && ext_f_end(be) <= end) {
223 struct pnfs_block_extent *next = ext_tree_next(be);
224
225 rb_erase(&be->be_node, root);
226 nfs4_put_deviceid_node(be->be_device);
227 kfree(be);
228 be = next;
229 }
230
231 if (be && be->be_f_offset < end) {
232 len1 = ext_f_end(be) - end;
233 be->be_f_offset = end;
234 if (be->be_state != PNFS_BLOCK_NONE_DATA)
235 be->be_v_offset += be->be_length - len1;
236 be->be_length = len1;
237 }
238 }
239
240 return 0;
241}
242
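__ext_tree_remove() above leaves up to two pieces of an extent overlapping the removed range [start, end): a head of length start - be_f_offset, a tail of length ext_f_end(be) - end (whose be_v_offset is advanced by the same amount the file offset moved), or nothing at all. The length arithmetic in isolation, as a sketch with hypothetical values (sector units):

#include <stdint.h>
#include <stdio.h>

/* Model of the case analysis in __ext_tree_remove(): removing
 * [start, end) from an extent [f, f+len) keeps at most a head and
 * a tail piece. */
static void punch(uint64_t f, uint64_t len, uint64_t start, uint64_t end)
{
	uint64_t fend = f + len;
	uint64_t len1 = start > f ? start - f : 0;	/* head kept */
	uint64_t len2 = fend > end ? fend - end : 0;	/* tail kept */

	if (len1)
		printf("keep head [%llu, %llu)\n",
		       (unsigned long long)f, (unsigned long long)(f + len1));
	if (len2)	/* tail: be_v_offset also moves up by len - len2 */
		printf("keep tail [%llu, %llu)\n",
		       (unsigned long long)end, (unsigned long long)fend);
	if (!len1 && !len2)
		printf("extent fully removed\n");
}

int main(void)
{
	punch(0, 100, 10, 20);	/* split: both pieces survive */
	punch(0, 100, 0, 20);	/* trim head */
	punch(0, 100, 0, 100);	/* drop whole extent */
	return 0;
}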
243int
244ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
245{
246 struct pnfs_block_extent *be;
247 struct rb_root *root;
248 int err = 0;
249
250 switch (new->be_state) {
251 case PNFS_BLOCK_READWRITE_DATA:
252 case PNFS_BLOCK_INVALID_DATA:
253 root = &bl->bl_ext_rw;
254 break;
255 case PNFS_BLOCK_READ_DATA:
256 case PNFS_BLOCK_NONE_DATA:
257 root = &bl->bl_ext_ro;
258 break;
259 default:
260 dprintk("invalid extent type\n");
261 return -EINVAL;
262 }
263
264 spin_lock(&bl->bl_ext_lock);
265retry:
266 be = __ext_tree_search(root, new->be_f_offset);
267 if (!be || be->be_f_offset >= ext_f_end(new)) {
268 __ext_tree_insert(root, new, true);
269 } else if (new->be_f_offset >= be->be_f_offset) {
270 if (ext_f_end(new) <= ext_f_end(be)) {
271 nfs4_put_deviceid_node(new->be_device);
272 kfree(new);
273 } else {
274 sector_t new_len = ext_f_end(new) - ext_f_end(be);
275 sector_t diff = new->be_length - new_len;
276
277 new->be_f_offset += diff;
278 new->be_v_offset += diff;
279 new->be_length = new_len;
280 goto retry;
281 }
282 } else if (ext_f_end(new) <= ext_f_end(be)) {
283 new->be_length = be->be_f_offset - new->be_f_offset;
284 __ext_tree_insert(root, new, true);
285 } else {
286 struct pnfs_block_extent *split;
287 sector_t new_len = ext_f_end(new) - ext_f_end(be);
288 sector_t diff = new->be_length - new_len;
289
290 split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
291 if (!split) {
292 err = -EINVAL;
293 goto out;
294 }
295
296 split->be_length = be->be_f_offset - split->be_f_offset;
297 split->be_device = nfs4_get_deviceid(new->be_device);
298 __ext_tree_insert(root, split, true);
299
300 new->be_f_offset += diff;
301 new->be_v_offset += diff;
302 new->be_length = new_len;
303 goto retry;
304 }
305out:
306 spin_unlock(&bl->bl_ext_lock);
307 return err;
308}
309
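The retry step in ext_tree_insert() is worth isolating: when the tail of a new extent extends past an existing one, be_f_offset and be_v_offset are advanced in lockstep so the file-to-disk mapping stays intact, and the shortened extent is re-searched. A userspace sketch of just that adjustment, with made-up numbers:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t f_offset, v_offset, length; };

/* Advance 'new' past an existing extent that ends at be_end, keeping
 * file and disk offsets in step, as in the retry path above. */
static void advance_past(struct range *new, uint64_t be_end)
{
	uint64_t new_end = new->f_offset + new->length;
	uint64_t new_len = new_end - be_end;	/* part beyond 'be' */
	uint64_t diff = new->length - new_len;

	new->f_offset += diff;	/* file offset moves up... */
	new->v_offset += diff;	/* ...and the disk offset with it */
	new->length = new_len;
}

int main(void)
{
	struct range new = { .f_offset = 0, .v_offset = 500, .length = 30 };

	advance_past(&new, 20);	/* existing extent covers up to 20 */
	printf("retry with [%llu,+%llu) v=%llu\n",
	       (unsigned long long)new.f_offset,
	       (unsigned long long)new.length,
	       (unsigned long long)new.v_offset);	/* [20,+10) v=520 */
	return 0;
}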
310static bool
311__ext_tree_lookup(struct rb_root *root, sector_t isect,
312 struct pnfs_block_extent *ret)
313{
314 struct rb_node *node;
315 struct pnfs_block_extent *be;
316
317 node = root->rb_node;
318 while (node) {
319 be = ext_node(node);
320 if (isect < be->be_f_offset)
321 node = node->rb_left;
322 else if (isect >= ext_f_end(be))
323 node = node->rb_right;
324 else {
325 *ret = *be;
326 return true;
327 }
328 }
329
330 return false;
331}
332
333bool
334ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
335 struct pnfs_block_extent *ret, bool rw)
336{
337 bool found = false;
338
339 spin_lock(&bl->bl_ext_lock);
340 if (!rw)
341 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
342 if (!found)
343 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
344 spin_unlock(&bl->bl_ext_lock);
345
346 return found;
347}
348
349int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
350 sector_t start, sector_t end)
351{
352 int err, err2;
353
354 spin_lock(&bl->bl_ext_lock);
355 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
356 if (rw) {
357 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
358 if (!err)
359 err = err2;
360 }
361 spin_unlock(&bl->bl_ext_lock);
362
363 return err;
364}
365
366static int
367ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
368 sector_t split)
369{
370 struct pnfs_block_extent *new;
371 sector_t orig_len = be->be_length;
372
373 new = kzalloc(sizeof(*new), GFP_ATOMIC);
374 if (!new)
375 return -ENOMEM;
376
377 be->be_length = split - be->be_f_offset;
378
379 new->be_f_offset = split;
380 if (be->be_state != PNFS_BLOCK_NONE_DATA)
381 new->be_v_offset = be->be_v_offset + be->be_length;
382 new->be_length = orig_len - be->be_length;
383 new->be_state = be->be_state;
384 new->be_tag = be->be_tag;
385 new->be_device = nfs4_get_deviceid(be->be_device);
386
387 __ext_tree_insert(root, new, false);
388 return 0;
389}
390
391int
392ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
393 sector_t len)
394{
395 struct rb_root *root = &bl->bl_ext_rw;
396 sector_t end = start + len;
397 struct pnfs_block_extent *be;
398 int err = 0;
399
400 spin_lock(&bl->bl_ext_lock);
401 /*
402 * First remove all COW extents or holes from the written-to range.
403 */
404 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
405 if (err)
406 goto out;
407
408 /*
409 * Then mark all invalid extents in the range as written to.
410 */
411 for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
412 if (be->be_f_offset >= end)
413 break;
414
415 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
416 continue;
417
418 if (be->be_f_offset < start) {
419 struct pnfs_block_extent *left = ext_tree_prev(be);
420
421 if (left && ext_can_merge(left, be)) {
422 sector_t diff = start - be->be_f_offset;
423
424 left->be_length += diff;
425
426 be->be_f_offset += diff;
427 be->be_v_offset += diff;
428 be->be_length -= diff;
429 } else {
430 err = ext_tree_split(root, be, start);
431 if (err)
432 goto out;
433 }
434 }
435
436 if (ext_f_end(be) > end) {
437 struct pnfs_block_extent *right = ext_tree_next(be);
438
439 if (right && ext_can_merge(be, right)) {
440 sector_t diff = end - be->be_f_offset;
441
442 be->be_length -= diff;
443
444 right->be_f_offset -= diff;
445 right->be_v_offset -= diff;
446 right->be_length += diff;
447 } else {
448 err = ext_tree_split(root, be, end);
449 if (err)
450 goto out;
451 }
452 }
453
454 if (be->be_f_offset >= start && ext_f_end(be) <= end) {
455 be->be_tag = EXTENT_WRITTEN;
456 be = ext_try_to_merge_left(root, be);
457 be = ext_try_to_merge_right(root, be);
458 }
459 }
460out:
461 spin_unlock(&bl->bl_ext_lock);
462 return err;
463}
464
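One subtlety in ext_tree_mark_written() above: rather than always splitting at start, it will slide the boundary into a mergeable left neighbour, which absorbs the head of the INVALID extent while the extent shrinks from the front. That boundary slide, modelled on its own (sketch; hypothetical sector values):

#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t f_offset, v_offset, length; };

/* Donate the head of 'be' (everything before 'start') to its left
 * neighbour instead of allocating a split extent. */
static void donate_head(struct ext *left, struct ext *be, uint64_t start)
{
	uint64_t diff = start - be->f_offset;

	left->length += diff;	/* left neighbour grows */
	be->f_offset += diff;	/* extent shrinks from the front */
	be->v_offset += diff;
	be->length -= diff;
}

int main(void)
{
	struct ext left = { 0, 100, 10 };
	struct ext be = { 10, 110, 20 };

	donate_head(&left, &be, 16);
	printf("left [0,+%llu)  be [%llu,+%llu)\n",
	       (unsigned long long)left.length,
	       (unsigned long long)be.f_offset,
	       (unsigned long long)be.length);	/* left 16, be [16,+14) */
	return 0;
}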
465static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
466 size_t buffer_size)
467{
468 if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
469 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
470
471 for (i = 0; i < nr_pages; i++)
472 put_page(arg->layoutupdate_pages[i]);
473 kfree(arg->layoutupdate_pages);
474 } else {
475 put_page(arg->layoutupdate_page);
476 }
477}
478
479static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
480 size_t buffer_size, size_t *count)
481{
482 struct pnfs_block_extent *be;
483 int ret = 0;
484
485 spin_lock(&bl->bl_ext_lock);
486 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
487 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
488 be->be_tag != EXTENT_WRITTEN)
489 continue;
490
491 (*count)++;
492 if (*count * BL_EXTENT_SIZE > buffer_size) {
493 /* keep counting.. */
494 ret = -ENOSPC;
495 continue;
496 }
497
498 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
499 NFS4_DEVICEID4_SIZE);
500 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
501 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
502 p = xdr_encode_hyper(p, 0LL);
503 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
504
505 be->be_tag = EXTENT_COMMITTING;
506 }
507 spin_unlock(&bl->bl_ext_lock);
508
509 return ret;
510}
511
512int
513ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
514{
515 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
516 size_t count = 0, buffer_size = PAGE_SIZE;
517 __be32 *start_p;
518 int ret;
519
520 dprintk("%s enter\n", __func__);
521
522 arg->layoutupdate_page = alloc_page(GFP_NOFS);
523 if (!arg->layoutupdate_page)
524 return -ENOMEM;
525 start_p = page_address(arg->layoutupdate_page);
526 arg->layoutupdate_pages = &arg->layoutupdate_page;
527
528retry:
529 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
530 if (unlikely(ret)) {
531 ext_tree_free_commitdata(arg, buffer_size);
532
533 buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
534 count = 0;
535
536 arg->layoutupdate_pages =
537 kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
538 sizeof(struct page *), GFP_NOFS);
539 if (!arg->layoutupdate_pages)
540 return -ENOMEM;
541
542 start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
543 if (!start_p) {
544 kfree(arg->layoutupdate_pages);
545 return -ENOMEM;
546 }
547
548 goto retry;
549 }
550
551 *start_p = cpu_to_be32(count);
552 arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
553
554 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
555 __be32 *p = start_p;
556 int i = 0;
557
558 for (p = start_p;
559 p < start_p + arg->layoutupdate_len;
560 p += PAGE_SIZE) {
561 arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
562 }
563 }
564
565 dprintk("%s found %zu ranges\n", __func__, count);
566 return 0;
567}
568
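ext_tree_prepare_commit() encodes optimistically into a single page; when the counting pass reports -ENOSPC, the retry allocates a buffer sized exactly from the count. The sizing arithmetic, where BL_EXTENT_SIZE is assumed to be the per-extent wire cost implied by ext_tree_encode_commit() (a 16-byte deviceid, three 64-bit hypers and a 32-bit state):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096
#define BL_EXTENT_SIZE	44	/* 16-byte deviceid + 3 hypers + u32 state
				 * (assumed value for this sketch) */

int main(void)
{
	size_t count = 150;	/* extents seen by the counting pass */
	size_t buffer_size = sizeof(uint32_t) + BL_EXTENT_SIZE * count;
	size_t nr_pages = (buffer_size + PAGE_SIZE - 1) / PAGE_SIZE;

	/* 4 + 6600 = 6604 bytes -> 2 pages */
	printf("need %zu bytes -> %zu page(s)\n", buffer_size, nr_pages);
	return 0;
}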
569void
570ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
571{
572 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
573 struct rb_root *root = &bl->bl_ext_rw;
574 struct pnfs_block_extent *be;
575
576 dprintk("%s status %d\n", __func__, status);
577
578 ext_tree_free_commitdata(arg, arg->layoutupdate_len);
579
580 spin_lock(&bl->bl_ext_lock);
581 for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
582 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
583 be->be_tag != EXTENT_COMMITTING)
584 continue;
585
586 if (status) {
587 /*
588 * Mark as written and try again.
589 *
590 * XXX: some real error handling here wouldn't hurt..
591 */
592 be->be_tag = EXTENT_WRITTEN;
593 } else {
594 be->be_state = PNFS_BLOCK_READWRITE_DATA;
595 be->be_tag = 0;
596 }
597
598 be = ext_try_to_merge_left(root, be);
599 be = ext_try_to_merge_right(root, be);
600 }
601 spin_unlock(&bl->bl_ext_lock);
602}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/extents.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
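The tracker being deleted below encodes per-sector state as tag bits, stacking one private INTERNAL_EXISTS bit above the public tags and masking it off before callers see the value. A self-contained illustration of the bit layout; MY_MAX_TAGS lives in blocklayout.h, and the value 3 here is an assumption:

#include <stdio.h>

#define EXTENT_INITIALIZED	0
#define EXTENT_WRITTEN		1
#define EXTENT_IN_COMMIT	2
#define MY_MAX_TAGS		3	/* assumed for this sketch */
#define INTERNAL_EXISTS		MY_MAX_TAGS
#define INTERNAL_MASK		((1 << INTERNAL_EXISTS) - 1)

int main(void)
{
	int tags = (1 << EXTENT_INITIALIZED) | (1 << EXTENT_WRITTEN) |
		   (1 << INTERNAL_EXISTS);

	/* _find_entry() strips the internal bit before reporting */
	printf("public tags: 0x%x\n", tags & INTERNAL_MASK);	/* 0x3 */
	printf("written?     %d\n", !!(tags & (1 << EXTENT_WRITTEN)));
	return 0;
}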
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - sector_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
54
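normalize() and normalize_up() round a sector count down or up to a multiple of base; sector_div() is needed in the kernel only because open-coded 64-bit division is not available on every 32-bit target. A plain-C equivalent for checking the arithmetic:

#include <stdint.h>
#include <stdio.h>

/* Round a sector count down (or up) to a multiple of 'base'. */
static uint64_t normalize(uint64_t s, int base)
{
	return s - (s % base);
}

static uint64_t normalize_up(uint64_t s, int base)
{
	return normalize(s + base - 1, base);
}

int main(void)
{
	printf("%llu %llu\n",
	       (unsigned long long)normalize(1000, 8),		/* 1000 */
	       (unsigned long long)normalize_up(1001, 8));	/* 1008 */
	return 0;
}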
55/* Complete stub using a list while determining the API wanted */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates entry with tag, or if entry already exists, unions tag to it.
89 * If storage is not NULL, newly created entry will use it.
90 * Returns number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 new = storage;
114 new->it_sector = s;
115 new->it_tags = (1 << tag);
116 list_add(&new->it_link, &pos->it_link);
117 return 1;
118 }
119}
120
121/* XXXX Really want option to not create */
122/* Over range, unions tag with existing entries, else creates entry with tag */
123static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
124{
125 u64 i;
126
127 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
128 for (i = normalize(s, tree->mtt_step_size); i < s + length;
129 i += tree->mtt_step_size)
130 if (_add_entry(tree, i, tag, NULL))
131 return -ENOMEM;
132 return 0;
133}
134
135/* Ensure that future operations on given range of tree will not malloc */
136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
138{
139 u64 start, end, s;
140 int count, i, used = 0, status = -ENOMEM;
141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
143
144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
145 start = normalize(offset, tree->mtt_step_size);
146 end = normalize_up(offset + length, tree->mtt_step_size);
147 count = (int)(end - start) / (int)tree->mtt_step_size;
148
149 /* Pre-malloc what memory we might need */
150 storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
151 if (!storage)
152 return -ENOMEM;
153 for (i = 0; i < count; i++) {
154 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
155 GFP_NOFS);
156 if (!storage[i])
157 goto out_cleanup;
158 }
159
160 spin_lock_bh(&marks->im_lock);
161 for (s = start; s < end; s += tree->mtt_step_size)
162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
164
165 status = 0;
166
167 out_cleanup:
168 for (i = used; i < count; i++) {
169 if (!storage[i])
170 break;
171 kfree(storage[i]);
172 }
173 kfree(storage);
174 return status;
175}
176
177/* We are relying on page lock to serialize this */
178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
179{
180 int rv;
181
182 spin_lock_bh(&marks->im_lock);
183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
184 spin_unlock_bh(&marks->im_lock);
185 return rv;
186}
187
188/* Assume start, end already sector aligned */
189static int
190_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
191{
192 struct pnfs_inval_tracking *pos;
193 u64 expect = 0;
194
195 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
196 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
197 if (pos->it_sector >= end)
198 continue;
199 if (!expect) {
200 if ((pos->it_sector == end - tree->mtt_step_size) &&
201 (pos->it_tags & (1 << tag))) {
202 expect = pos->it_sector - tree->mtt_step_size;
203 if (pos->it_sector < tree->mtt_step_size || expect < start)
204 return 1;
205 continue;
206 } else {
207 return 0;
208 }
209 }
210 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
211 return 0;
212 expect -= tree->mtt_step_size;
213 if (expect < start)
214 return 1;
215 }
216 return 0;
217}
218
219static int is_range_written(struct pnfs_inval_markings *marks,
220 sector_t start, sector_t end)
221{
222 int rv;
223
224 spin_lock_bh(&marks->im_lock);
225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
226 spin_unlock_bh(&marks->im_lock);
227 return rv;
228}
229
230/* Marks sectors in [offset, offset + length) as having been initialized.
231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
232 * Currently assumes offset is page-aligned.
233 */
234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
235 sector_t offset, sector_t length)
236{
237 sector_t start, end;
238
239 dprintk("%s(offset=%llu,len=%llu) enter\n",
240 __func__, (u64)offset, (u64)length);
241
242 start = normalize(offset, marks->im_block_size);
243 end = normalize_up(offset + length, marks->im_block_size);
244 if (_preload_range(marks, start, end - start))
245 goto outerr;
246
247 spin_lock_bh(&marks->im_lock);
248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
249 goto out_unlock;
250 spin_unlock_bh(&marks->im_lock);
251
252 return 0;
253
254out_unlock:
255 spin_unlock_bh(&marks->im_lock);
256outerr:
257 return -ENOMEM;
258}
259
260/* Marks sectors in [offset, offset+length) as having been written to disk.
261 * All lengths should be block aligned.
262 */
263static int mark_written_sectors(struct pnfs_inval_markings *marks,
264 sector_t offset, sector_t length)
265{
266 int status;
267
268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
269 (u64)offset, (u64)length);
270 spin_lock_bh(&marks->im_lock);
271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
272 spin_unlock_bh(&marks->im_lock);
273 return status;
274}
275
276static void print_short_extent(struct pnfs_block_short_extent *be)
277{
278 dprintk("PRINT SHORT EXTENT extent %p\n", be);
279 if (be) {
280 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
281 dprintk(" be_length %llu\n", (u64)be->bse_length);
282 }
283}
284
285static void print_clist(struct list_head *list, unsigned int count)
286{
287 struct pnfs_block_short_extent *be;
288 unsigned int i = 0;
289
290 ifdebug(FACILITY) {
291 printk(KERN_DEBUG "****************\n");
292 printk(KERN_DEBUG "Extent list looks like:\n");
293 list_for_each_entry(be, list, bse_node) {
294 i++;
295 print_short_extent(be);
296 }
297 if (i != count)
298 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
299 printk(KERN_DEBUG "****************\n");
300 }
301}
302
303/* Note: In theory, we should do more checking that devids match between
304 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
305 */
306/* Note this is very similar to bl_add_merge_extent */
307static void add_to_commitlist(struct pnfs_block_layout *bl,
308 struct pnfs_block_short_extent *new)
309{
310 struct list_head *clist = &bl->bl_commit;
311 struct pnfs_block_short_extent *old, *save;
312 sector_t end = new->bse_f_offset + new->bse_length;
313
314 dprintk("%s enter\n", __func__);
315 print_short_extent(new);
316 print_clist(clist, bl->bl_count);
317 bl->bl_count++;
318 /* Scan for proper place to insert, extending new to the left
319 * as much as possible.
320 */
321 list_for_each_entry_safe(old, save, clist, bse_node) {
322 if (new->bse_f_offset < old->bse_f_offset)
323 break;
324 if (end <= old->bse_f_offset + old->bse_length) {
325 /* Range is already in list */
326 bl->bl_count--;
327 kfree(new);
328 return;
329 } else if (new->bse_f_offset <=
330 old->bse_f_offset + old->bse_length) {
331 /* new overlaps or abuts existing be */
332 if (new->bse_mdev == old->bse_mdev) {
333 /* extend new to fully replace old */
334 new->bse_length += new->bse_f_offset -
335 old->bse_f_offset;
336 new->bse_f_offset = old->bse_f_offset;
337 list_del(&old->bse_node);
338 bl->bl_count--;
339 kfree(old);
340 }
341 }
342 }
343 /* Note that if we never hit the above break, old will not point to a
344 * valid extent. However, in that case &old->bse_node==list.
345 */
346 list_add_tail(&new->bse_node, &old->bse_node);
347 /* Scan forward for overlaps. If we find any, extend new and
348 * remove the overlapped extent.
349 */
350 old = list_prepare_entry(new, clist, bse_node);
351 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
352 if (end < old->bse_f_offset)
353 break;
354 /* new overlaps or abuts old */
355 if (new->bse_mdev == old->bse_mdev) {
356 if (end < old->bse_f_offset + old->bse_length) {
357 /* extend new to fully cover old */
358 end = old->bse_f_offset + old->bse_length;
359 new->bse_length = end - new->bse_f_offset;
360 }
361 list_del(&old->bse_node);
362 bl->bl_count--;
363 kfree(old);
364 }
365 }
366 dprintk("%s: after merging\n", __func__);
367 print_clist(clist, bl->bl_count);
368}
369
370/* Note the range described by offset, length is guaranteed to be contained
371 * within be.
372 * new will be freed, either by this function or add_to_commitlist if they
373 * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
374 */
375int bl_mark_for_commit(struct pnfs_block_extent *be,
376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
378{
379 sector_t new_end, end = offset + length;
380 struct pnfs_block_layout *bl = container_of(be->be_inval,
381 struct pnfs_block_layout,
382 bl_inval);
383
384 mark_written_sectors(be->be_inval, offset, length);
385 /* We want to add the range to commit list, but it must be
386 * block-normalized, and verified that the normalized range has
387 * been entirely written to disk.
388 */
389 new->bse_f_offset = offset;
390 offset = normalize(offset, bl->bl_blocksize);
391 if (offset < new->bse_f_offset) {
392 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
393 new->bse_f_offset = offset;
394 else
395 new->bse_f_offset = offset + bl->bl_blocksize;
396 }
397 new_end = normalize_up(end, bl->bl_blocksize);
398 if (end < new_end) {
399 if (is_range_written(be->be_inval, end, new_end))
400 end = new_end;
401 else
402 end = new_end - bl->bl_blocksize;
403 }
404 if (end <= new->bse_f_offset) {
405 kfree(new);
406 return 0;
407 }
408 new->bse_length = end - new->bse_f_offset;
409 new->bse_devid = be->be_devid;
410 new->bse_mdev = be->be_mdev;
411
412 spin_lock(&bl->bl_ext_lock);
413 add_to_commitlist(bl, new);
414 spin_unlock(&bl->bl_ext_lock);
415 return 0;
416}
417
418static void print_bl_extent(struct pnfs_block_extent *be)
419{
420 dprintk("PRINT EXTENT extent %p\n", be);
421 if (be) {
422 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
423 dprintk(" be_length %llu\n", (u64)be->be_length);
424 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
425 dprintk(" be_state %d\n", be->be_state);
426 }
427}
428
429static void
430destroy_extent(struct kref *kref)
431{
432 struct pnfs_block_extent *be;
433
434 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
435 dprintk("%s be=%p\n", __func__, be);
436 kfree(be);
437}
438
439void
440bl_put_extent(struct pnfs_block_extent *be)
441{
442 if (be) {
443 dprintk("%s enter %p (%i)\n", __func__, be,
444 atomic_read(&be->be_refcnt.refcount));
445 kref_put(&be->be_refcnt, destroy_extent);
446 }
447}
448
449struct pnfs_block_extent *bl_alloc_extent(void)
450{
451 struct pnfs_block_extent *be;
452
453 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
454 if (!be)
455 return NULL;
456 INIT_LIST_HEAD(&be->be_node);
457 kref_init(&be->be_refcnt);
458 be->be_inval = NULL;
459 return be;
460}
461
462static void print_elist(struct list_head *list)
463{
464 struct pnfs_block_extent *be;
465 dprintk("****************\n");
466 dprintk("Extent list looks like:\n");
467 list_for_each_entry(be, list, be_node) {
468 print_bl_extent(be);
469 }
470 dprintk("****************\n");
471}
472
473static inline int
474extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
475{
476 /* Note this assumes new->be_f_offset >= old->be_f_offset */
477 return (new->be_state == old->be_state) &&
478 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
479 ((new->be_v_offset - old->be_v_offset ==
480 new->be_f_offset - old->be_f_offset) &&
481 new->be_mdev == old->be_mdev));
482}
483
484/* Adds new to appropriate list in bl, modifying new and removing existing
485 * extents as appropriate to deal with overlaps.
486 *
487 * See bl_find_get_extent for list constraints.
488 *
489 * Refcount on new is already set. If we end up not using it, or we error
490 * out, we need to put the reference.
491 *
492 * bl->bl_ext_lock is held by caller.
493 */
494int
495bl_add_merge_extent(struct pnfs_block_layout *bl,
496 struct pnfs_block_extent *new)
497{
498 struct pnfs_block_extent *be, *tmp;
499 sector_t end = new->be_f_offset + new->be_length;
500 struct list_head *list;
501
502 dprintk("%s enter with be=%p\n", __func__, new);
503 print_bl_extent(new);
504 list = &bl->bl_extents[bl_choose_list(new->be_state)];
505 print_elist(list);
506
507 /* Scan for proper place to insert, extending new to the left
508 * as much as possible.
509 */
510 list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
511 if (new->be_f_offset >= be->be_f_offset + be->be_length)
512 break;
513 if (new->be_f_offset >= be->be_f_offset) {
514 if (end <= be->be_f_offset + be->be_length) {
515 /* new is a subset of existing be */
516 if (extents_consistent(be, new)) {
517 dprintk("%s: new is subset, ignoring\n",
518 __func__);
519 bl_put_extent(new);
520 return 0;
521 } else {
522 goto out_err;
523 }
524 } else {
525 /* |<-- be -->|
526 * |<-- new -->| */
527 if (extents_consistent(be, new)) {
528 /* extend new to fully replace be */
529 new->be_length += new->be_f_offset -
530 be->be_f_offset;
531 new->be_f_offset = be->be_f_offset;
532 new->be_v_offset = be->be_v_offset;
533 dprintk("%s: removing %p\n", __func__, be);
534 list_del(&be->be_node);
535 bl_put_extent(be);
536 } else {
537 goto out_err;
538 }
539 }
540 } else if (end >= be->be_f_offset + be->be_length) {
541 /* new extent overlaps existing be */
542 if (extents_consistent(be, new)) {
543 /* extend new to fully replace be */
544 dprintk("%s: removing %p\n", __func__, be);
545 list_del(&be->be_node);
546 bl_put_extent(be);
547 } else {
548 goto out_err;
549 }
550 } else if (end > be->be_f_offset) {
551 /* |<-- be -->|
552 *|<-- new -->| */
553 if (extents_consistent(new, be)) {
554 /* extend new to fully replace be */
555 new->be_length += be->be_f_offset + be->be_length -
556 new->be_f_offset - new->be_length;
557 dprintk("%s: removing %p\n", __func__, be);
558 list_del(&be->be_node);
559 bl_put_extent(be);
560 } else {
561 goto out_err;
562 }
563 }
564 }
565 /* Note that if we never hit the above break, be will not point to a
566 * valid extent. However, in that case &be->be_node==list.
567 */
568 list_add(&new->be_node, &be->be_node);
569 dprintk("%s: inserting new\n", __func__);
570 print_elist(list);
571 /* FIXME - The per-list consistency checks have all been done,
572 * should now check cross-list consistency.
573 */
574 return 0;
575
576 out_err:
577 bl_put_extent(new);
578 return -EIO;
579}
580
581/* Returns extent, or NULL. If a second READ extent exists, it is returned
582 * in cow_read, if given.
583 *
584 * The extents are kept in two separate ordered lists, one for READ and NONE,
585 * one for READWRITE and INVALID. Within each list, we assume:
586 * 1. Extents are ordered by file offset.
587 * 2. For any given isect, there is at most one extent that matches.
588 */
589struct pnfs_block_extent *
590bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
591 struct pnfs_block_extent **cow_read)
592{
593 struct pnfs_block_extent *be, *cow, *ret;
594 int i;
595
596 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
597 cow = ret = NULL;
598 spin_lock(&bl->bl_ext_lock);
599 for (i = 0; i < EXTENT_LISTS; i++) {
600 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
601 if (isect >= be->be_f_offset + be->be_length)
602 break;
603 if (isect >= be->be_f_offset) {
604 /* We have found an extent */
605 dprintk("%s Get %p (%i)\n", __func__, be,
606 atomic_read(&be->be_refcnt.refcount));
607 kref_get(&be->be_refcnt);
608 if (!ret)
609 ret = be;
610 else if (be->be_state != PNFS_BLOCK_READ_DATA)
611 bl_put_extent(be);
612 else
613 cow = be;
614 break;
615 }
616 }
617 if (ret &&
618 (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
619 break;
620 }
621 spin_unlock(&bl->bl_ext_lock);
622 if (cow_read)
623 *cow_read = cow;
624 print_bl_extent(ret);
625 return ret;
626}
627
628/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
629static struct pnfs_block_extent *
630bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
631{
632 struct pnfs_block_extent *be, *ret = NULL;
633 int i;
634
635 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
636 for (i = 0; i < EXTENT_LISTS; i++) {
637 if (ret)
638 break;
639 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
640 if (isect >= be->be_f_offset + be->be_length)
641 break;
642 if (isect >= be->be_f_offset) {
643 /* We have found an extent */
644 dprintk("%s Get %p (%i)\n", __func__, be,
645 atomic_read(&be->be_refcnt.refcount));
646 kref_get(&be->be_refcnt);
647 ret = be;
648 break;
649 }
650 }
651 }
652 print_bl_extent(ret);
653 return ret;
654}
655
656int
657encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
658 struct xdr_stream *xdr,
659 const struct nfs4_layoutcommit_args *arg)
660{
661 struct pnfs_block_short_extent *lce, *save;
662 unsigned int count = 0;
663 __be32 *p, *xdr_start;
664
665 dprintk("%s enter\n", __func__);
666 /* BUG - creation of bl_commit is buggy - need to wait for
667 * entire block to be marked WRITTEN before it can be added.
668 */
669 spin_lock(&bl->bl_ext_lock);
670 /* Want to adjust for possible truncate */
671 /* We now want to adjust argument range */
672
673 /* XDR encode the ranges found */
674 xdr_start = xdr_reserve_space(xdr, 8);
675 if (!xdr_start)
676 goto out;
677 list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
678 p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
679 if (!p)
680 break;
681 p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
682 p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
683 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
684 p = xdr_encode_hyper(p, 0LL);
685 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
686 list_move_tail(&lce->bse_node, &bl->bl_committing);
687 bl->bl_count--;
688 count++;
689 }
690 xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
691 xdr_start[1] = cpu_to_be32(count);
692out:
693 spin_unlock(&bl->bl_ext_lock);
694 dprintk("%s found %i ranges\n", __func__, count);
695 return 0;
696}
697
698/* Helper function to set_to_rw that initializes a new extent */
699static void
700_prep_new_extent(struct pnfs_block_extent *new,
701 struct pnfs_block_extent *orig,
702 sector_t offset, sector_t length, int state)
703{
704 kref_init(&new->be_refcnt);
705 /* don't need to INIT_LIST_HEAD(&new->be_node) */
706 memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
707 new->be_mdev = orig->be_mdev;
708 new->be_f_offset = offset;
709 new->be_length = length;
710 new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
711 new->be_state = state;
712 new->be_inval = orig->be_inval;
713}
714
715/* Tries to merge be with extent in front of it in list.
716 * Frees storage if not used.
717 */
718static struct pnfs_block_extent *
719_front_merge(struct pnfs_block_extent *be, struct list_head *head,
720 struct pnfs_block_extent *storage)
721{
722 struct pnfs_block_extent *prev;
723
724 if (!storage)
725 goto no_merge;
726 if (&be->be_node == head || be->be_node.prev == head)
727 goto no_merge;
728 prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
729 if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
730 !extents_consistent(prev, be))
731 goto no_merge;
732 _prep_new_extent(storage, prev, prev->be_f_offset,
733 prev->be_length + be->be_length, prev->be_state);
734 list_replace(&prev->be_node, &storage->be_node);
735 bl_put_extent(prev);
736 list_del(&be->be_node);
737 bl_put_extent(be);
738 return storage;
739
740 no_merge:
741 kfree(storage);
742 return be;
743}
744
745static u64
746set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
747{
748 u64 rv = offset + length;
749 struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
750 struct pnfs_block_extent *children[3];
751 struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
752 int i = 0, j;
753
754 dprintk("%s(%llu, %llu)\n", __func__, offset, length);
755 /* Create storage for up to three new extents e1, e2, e3 */
756 e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
757 e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
758 e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
759 /* BUG - we are ignoring any failure */
760 if (!e1 || !e2 || !e3)
761 goto out_nosplit;
762
763 spin_lock(&bl->bl_ext_lock);
764 be = bl_find_get_extent_locked(bl, offset);
765 rv = be->be_f_offset + be->be_length;
766 if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
767 spin_unlock(&bl->bl_ext_lock);
768 goto out_nosplit;
769 }
770 /* Add e* to children, bumping e*'s krefs */
771 if (be->be_f_offset != offset) {
772 _prep_new_extent(e1, be, be->be_f_offset,
773 offset - be->be_f_offset,
774 PNFS_BLOCK_INVALID_DATA);
775 children[i++] = e1;
776 print_bl_extent(e1);
777 } else
778 merge1 = e1;
779 _prep_new_extent(e2, be, offset,
780 min(length, be->be_f_offset + be->be_length - offset),
781 PNFS_BLOCK_READWRITE_DATA);
782 children[i++] = e2;
783 print_bl_extent(e2);
784 if (offset + length < be->be_f_offset + be->be_length) {
785 _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
786 be->be_f_offset + be->be_length -
787 offset - length,
788 PNFS_BLOCK_INVALID_DATA);
789 children[i++] = e3;
790 print_bl_extent(e3);
791 } else
792 merge2 = e3;
793
794 /* Remove be from list, and insert the e* */
795 /* We don't get refs on e*, since this list is the base reference
796 * set when init'ed.
797 */
798 if (i < 3)
799 children[i] = NULL;
800 new = children[0];
801 list_replace(&be->be_node, &new->be_node);
802 bl_put_extent(be);
803 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
804 for (j = 1; j < i; j++) {
805 old = new;
806 new = children[j];
807 list_add(&new->be_node, &old->be_node);
808 }
809 if (merge2) {
810 /* This is a HACK, should just create a _back_merge function */
811 new = list_entry(new->be_node.next,
812 struct pnfs_block_extent, be_node);
813 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
814 }
815 spin_unlock(&bl->bl_ext_lock);
816
817 /* Since we removed the base reference above, be is now scheduled for
818 * destruction.
819 */
820 bl_put_extent(be);
821 dprintk("%s returns %llu after split\n", __func__, rv);
822 return rv;
823
824 out_nosplit:
825 kfree(e1);
826 kfree(e2);
827 kfree(e3);
828 dprintk("%s returns %llu without splitting\n", __func__, rv);
829 return rv;
830}
831
832void
833clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
834 const struct nfs4_layoutcommit_args *arg,
835 int status)
836{
837 struct pnfs_block_short_extent *lce, *save;
838
839 dprintk("%s status %d\n", __func__, status);
840 list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
841 if (likely(!status)) {
842 u64 offset = lce->bse_f_offset;
843 u64 end = offset + lce->bse_length;
844
845 do {
846 offset = set_to_rw(bl, offset, end - offset);
847 } while (offset < end);
848 list_del(&lce->bse_node);
849
850 kfree(lce);
851 } else {
852 list_del(&lce->bse_node);
853 spin_lock(&bl->bl_ext_lock);
854 add_to_commitlist(bl, lce);
855 spin_unlock(&bl->bl_ext_lock);
856 }
857 }
858}
859
860int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
861{
862 struct pnfs_block_short_extent *new;
863
864 new = kmalloc(sizeof(*new), GFP_NOFS);
865 if (unlikely(!new))
866 return -ENOMEM;
867
868 spin_lock_bh(&marks->im_lock);
869 list_add(&new->bse_node, &marks->im_extents);
870 spin_unlock_bh(&marks->im_lock);
871
872 return 0;
873}
874
875struct pnfs_block_short_extent *
876bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
877{
878 struct pnfs_block_short_extent *rv = NULL;
879
880 spin_lock_bh(&marks->im_lock);
881 if (!list_empty(&marks->im_extents)) {
882 rv = list_entry((&marks->im_extents)->next,
883 struct pnfs_block_short_extent, bse_node);
884 list_del_init(&rv->bse_node);
885 }
886 spin_unlock_bh(&marks->im_lock);
887
888 return rv;
889}
890
891void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
892{
893 struct pnfs_block_short_extent *se = NULL, *tmp;
894
895 if (num_to_free <= 0)
896 return;
897
898 spin_lock(&marks->im_lock);
899 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
900 list_del(&se->bse_node);
901 kfree(se);
902 if (--num_to_free == 0)
903 break;
904 }
905 spin_unlock(&marks->im_lock);
906
907 BUG_ON(num_to_free > 0);
908}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..8d04bda2bd2e
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,285 @@
1/*
2 * Copyright (c) 2006,2007 The Regents of the University of Michigan.
3 * All rights reserved.
4 *
5 * Andy Adamson <andros@citi.umich.edu>
6 * Fred Isaman <iisaman@umich.edu>
7 *
8 * permission is granted to use, copy, create derivative works and
9 * redistribute this software and such derivative works for any purpose,
10 * so long as the name of the university of michigan is not used in
11 * any advertising or publicity pertaining to the use or distribution
12 * of this software without specific, written prior authorization. if
13 * the above copyright notice or any other identification of the
14 * university of michigan is included in any copy of any portion of
15 * this software, then the disclaimer below must also be included.
16 *
17 * this software is provided as is, without representation from the
18 * university of michigan as to its fitness for any purpose, and without
19 * warranty by the university of michigan of any kind, either express
20 * or implied, including without limitation the implied warranties of
21 * merchantability and fitness for a particular purpose. the regents
22 * of the university of michigan shall not be liable for any damages,
23 * including special, indirect, incidental, or consequential damages,
24 * with respect to any claim arising out or in connection with the use
25 * of the software, even if it has been or is hereafter advised of the
26 * possibility of such damages.
27 */
28
29#include <linux/module.h>
30#include <linux/genhd.h>
31#include <linux/blkdev.h>
32
33#include "blocklayout.h"
34
35#define NFSDBG_FACILITY NFSDBG_PNFS_LD
36
37static void
38nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
39{
40 int i;
41
42 *p++ = cpu_to_be32(1);
43 *p++ = cpu_to_be32(b->type);
44 *p++ = cpu_to_be32(b->simple.nr_sigs);
45 for (i = 0; i < b->simple.nr_sigs; i++) {
46 p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
47 p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
48 b->simple.sigs[i].sig_len);
49 }
50}
51
52dev_t
53bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
54 gfp_t gfp_mask)
55{
56 struct net *net = server->nfs_client->cl_net;
57 struct nfs_net *nn = net_generic(net, nfs_net_id);
58 struct bl_dev_msg *reply = &nn->bl_mount_reply;
59 struct bl_pipe_msg bl_pipe_msg;
60 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
61 struct bl_msg_hdr *bl_msg;
62 DECLARE_WAITQUEUE(wq, current);
63 dev_t dev = 0;
64 int rc;
65
66 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
67
68 bl_pipe_msg.bl_wq = &nn->bl_wq;
69
70 b->simple.len += 4; /* single volume */
71 if (b->simple.len > PAGE_SIZE)
72 return -EIO;
73
74 memset(msg, 0, sizeof(*msg));
75 msg->len = sizeof(*bl_msg) + b->simple.len;
76 msg->data = kzalloc(msg->len, gfp_mask);
77 if (!msg->data)
78 goto out;
79
80 bl_msg = msg->data;
81 bl_msg->type = BL_DEVICE_MOUNT;
82 bl_msg->totallen = b->simple.len;
83 nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
84
85 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
86 add_wait_queue(&nn->bl_wq, &wq);
87 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
88 if (rc < 0) {
89 remove_wait_queue(&nn->bl_wq, &wq);
90 goto out;
91 }
92
93 set_current_state(TASK_UNINTERRUPTIBLE);
94 schedule();
95 __set_current_state(TASK_RUNNING);
96 remove_wait_queue(&nn->bl_wq, &wq);
97
98 if (reply->status != BL_DEVICE_REQUEST_PROC) {
99 printk(KERN_WARNING "%s failed to decode device: %d\n",
100 __func__, reply->status);
101 goto out;
102 }
103
104 dev = MKDEV(reply->major, reply->minor);
105out:
106 kfree(msg->data);
107 return dev;
108}
109
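The upcall assembled above is a small fixed header followed by the XDR-encoded simple volume, which is how the userspace blkmapd daemon frames the message before parsing it. The struct below is an illustrative stand-in for bl_msg_hdr, whose real layout is in blocklayout.h; it is a sketch of the framing, not a stable ABI description:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical header shape: a message type byte plus the length of
 * the XDR payload that follows, as bl_resolve_deviceid() fills in. */
struct msg_hdr {
	uint8_t  type;		/* e.g. BL_DEVICE_MOUNT */
	uint8_t  pad[3];
	uint32_t totallen;	/* length of the XDR payload */
};

int main(void)
{
	unsigned char buf[64];
	struct msg_hdr hdr = { .type = 1 /* stands in for BL_DEVICE_MOUNT */ };
	uint32_t payload_len = 24;	/* b->simple.len in the kernel code */

	hdr.totallen = payload_len;
	memcpy(buf, &hdr, sizeof(hdr));
	/* the XDR payload would be appended here, as nfs4_encode_simple()
	 * does right after the header */
	printf("message length: %zu\n", sizeof(hdr) + (size_t)payload_len);
	return 0;
}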
110static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
111 size_t mlen)
112{
113 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
114 nfs_net_id);
115
116 if (mlen != sizeof (struct bl_dev_msg))
117 return -EINVAL;
118
119 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
120 return -EFAULT;
121
122 wake_up(&nn->bl_wq);
123
124 return mlen;
125}
126
127static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
128{
129 struct bl_pipe_msg *bl_pipe_msg =
130 container_of(msg, struct bl_pipe_msg, msg);
131
132 if (msg->errno >= 0)
133 return;
134 wake_up(bl_pipe_msg->bl_wq);
135}
136
137static const struct rpc_pipe_ops bl_upcall_ops = {
138 .upcall = rpc_pipe_generic_upcall,
139 .downcall = bl_pipe_downcall,
140 .destroy_msg = bl_pipe_destroy_msg,
141};
142
143static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
144 struct rpc_pipe *pipe)
145{
146 struct dentry *dir, *dentry;
147
148 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
149 if (dir == NULL)
150 return ERR_PTR(-ENOENT);
151 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
152 dput(dir);
153 return dentry;
154}
155
156static void nfs4blocklayout_unregister_sb(struct super_block *sb,
157 struct rpc_pipe *pipe)
158{
159 if (pipe->dentry)
160 rpc_unlink(pipe->dentry);
161}
162
163static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
164 void *ptr)
165{
166 struct super_block *sb = ptr;
167 struct net *net = sb->s_fs_info;
168 struct nfs_net *nn = net_generic(net, nfs_net_id);
169 struct dentry *dentry;
170 int ret = 0;
171
172 if (!try_module_get(THIS_MODULE))
173 return 0;
174
175 if (nn->bl_device_pipe == NULL) {
176 module_put(THIS_MODULE);
177 return 0;
178 }
179
180 switch (event) {
181 case RPC_PIPEFS_MOUNT:
182 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
183 if (IS_ERR(dentry)) {
184 ret = PTR_ERR(dentry);
185 break;
186 }
187 nn->bl_device_pipe->dentry = dentry;
188 break;
189 case RPC_PIPEFS_UMOUNT:
190 if (nn->bl_device_pipe->dentry)
191 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
192 break;
193 default:
194 ret = -ENOTSUPP;
195 break;
196 }
197 module_put(THIS_MODULE);
198 return ret;
199}
200
201static struct notifier_block nfs4blocklayout_block = {
202 .notifier_call = rpc_pipefs_event,
203};
204
205static struct dentry *nfs4blocklayout_register_net(struct net *net,
206 struct rpc_pipe *pipe)
207{
208 struct super_block *pipefs_sb;
209 struct dentry *dentry;
210
211 pipefs_sb = rpc_get_sb_net(net);
212 if (!pipefs_sb)
213 return NULL;
214 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
215 rpc_put_sb_net(net);
216 return dentry;
217}
218
219static void nfs4blocklayout_unregister_net(struct net *net,
220 struct rpc_pipe *pipe)
221{
222 struct super_block *pipefs_sb;
223
224 pipefs_sb = rpc_get_sb_net(net);
225 if (pipefs_sb) {
226 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
227 rpc_put_sb_net(net);
228 }
229}
230
231static int nfs4blocklayout_net_init(struct net *net)
232{
233 struct nfs_net *nn = net_generic(net, nfs_net_id);
234 struct dentry *dentry;
235
236 init_waitqueue_head(&nn->bl_wq);
237 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
238 if (IS_ERR(nn->bl_device_pipe))
239 return PTR_ERR(nn->bl_device_pipe);
240 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
241 if (IS_ERR(dentry)) {
242 rpc_destroy_pipe_data(nn->bl_device_pipe);
243 return PTR_ERR(dentry);
244 }
245 nn->bl_device_pipe->dentry = dentry;
246 return 0;
247}
248
249static void nfs4blocklayout_net_exit(struct net *net)
250{
251 struct nfs_net *nn = net_generic(net, nfs_net_id);
252
253 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
254 rpc_destroy_pipe_data(nn->bl_device_pipe);
255 nn->bl_device_pipe = NULL;
256}
257
258static struct pernet_operations nfs4blocklayout_net_ops = {
259 .init = nfs4blocklayout_net_init,
260 .exit = nfs4blocklayout_net_exit,
261};
262
263int __init bl_init_pipefs(void)
264{
265 int ret;
266
267 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
268 if (ret)
269 goto out;
270 ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
271 if (ret)
272 goto out_unregister_notifier;
273 return 0;
274
275out_unregister_notifier:
276 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
277out:
278 return ret;
279}
280
281void __exit bl_cleanup_pipefs(void)
282{
283 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
284 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
285}
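
bl_init_pipefs() and bl_cleanup_pipefs() follow the usual register-then-unwind idiom: on failure, each label undoes exactly the registrations that succeeded before it, in reverse order. The shape of that pattern, with stub registrations standing in for the notifier and pernet subsystem:

#include <stdio.h>

/* Stubs standing in for rpc_pipefs_notifier_register() and
 * register_pernet_subsys(); the second one simulates a failure. */
static int register_notifier(void) { return 0; }
static void unregister_notifier(void) { }
static int register_pernet(void) { return -1; }

static int init(void)
{
	int ret;

	ret = register_notifier();
	if (ret)
		goto out;
	ret = register_pernet();
	if (ret)
		goto out_unregister_notifier;
	return 0;

out_unregister_notifier:
	unregister_notifier();	/* undo only what succeeded */
out:
	return ret;
}

int main(void)
{
	printf("init: %d\n", init());	/* -1, notifier rolled back */
	return 0;
}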
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 		goto out;
 
 	ino = lo->plh_inode;
+
+	spin_lock(&ino->i_lock);
+	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+	spin_unlock(&ino->i_lock);
+
+	pnfs_layoutcommit_inode(ino, false);
+
 	spin_lock(&ino->i_lock);
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
 	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
-					&args->cbl_range))
+					&args->cbl_range)) {
 		rv = NFS4ERR_DELAY;
-	else
-		rv = NFS4ERR_NOMATCHING_LAYOUT;
-	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+		goto unlock;
+	}
+
+	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+				&args->cbl_range);
+	}
+unlock:
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
 	pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
 	}
 
 found:
-	if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
-		dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
-			"deleting instead\n", __func__);
 	nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
 	}
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1c5ff6d58385..f9f4845db989 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file)
  * set up the iterator to start reading from the server list and return the first item
  */
 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
@@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_server_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
@@ -1318,7 +1320,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
  */
 static int nfs_volume_list_open(struct inode *inode, struct file *file)
 {
-	return seq_open_net(inode, file, &nfs_server_list_ops,
+	return seq_open_net(inode, file, &nfs_volume_list_ops,
 			   sizeof(struct seq_net_private));
 }
 
@@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file)
  * set up the iterator to start reading from the volume list and return the first item
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+				__acquires(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
@@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_volume_list_stop(struct seq_file *p, void *v)
+				__releases(&nn->nfs_client_lock)
 {
 	struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
@@ -1412,24 +1416,18 @@ int nfs_fs_proc_net_init(struct net *net)
 	p = proc_create("volumes", S_IFREG|S_IRUGO,
 			nn->proc_nfsfs, &nfs_volume_list_fops);
 	if (!p)
-		goto error_2;
+		goto error_1;
 	return 0;
 
-error_2:
-	remove_proc_entry("servers", nn->proc_nfsfs);
 error_1:
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("nfsfs", net->proc_net);
 error_0:
 	return -ENOMEM;
 }
 
 void nfs_fs_proc_net_exit(struct net *net)
 {
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
-	remove_proc_entry("volumes", nn->proc_nfsfs);
-	remove_proc_entry("servers", nn->proc_nfsfs);
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("nfsfs", net->proc_net);
 }
 
 /*
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 65ef6e00deee..dda4b8667c02 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
 	return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 /*
  * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
  * @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
 	WARN_ON_ONCE(verfp->committed < 0);
 	return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
 }
-#endif
 
 /**
  * nfs_direct_IO - NFS address space operation for direct I/O
@@ -576,7 +574,6 @@ out:
 	return result;
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_pageio_descriptor desc;
@@ -700,17 +697,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 	schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
 }
 
-#else
-static void nfs_direct_write_schedule_work(struct work_struct *work)
-{
-}
-
-static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
-{
-	nfs_direct_complete(dreq, true);
-}
-#endif
-
 static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_direct_req *dreq = hdr->dreq;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8c4048ecdad1..4ea92ce0537f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#include "nfstrace.h" 41#include "nfstrace.h"
41 42
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
327 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); 328 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
328 unsigned int end = offset + len; 329 unsigned int end = offset + len;
329 330
331 if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
332 if (!PageUptodate(page))
333 return 1;
334 return 0;
335 }
336
330 if ((file->f_mode & FMODE_READ) && /* open for read? */ 337 if ((file->f_mode & FMODE_READ) && /* open for read? */
331 !PageUptodate(page) && /* Uptodate? */ 338 !PageUptodate(page) && /* Uptodate? */
332 !PagePrivate(page) && /* i/o request already? */ 339 !PagePrivate(page) && /* i/o request already? */
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
468 475
469 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
470 477
471 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not 478 /* Always try to initiate a 'commit' if relevant, but only
472 * doing this memory reclaim for a fs-related allocation. 479 * wait for it if __GFP_WAIT is set. Even then, only wait 1
480 * second and only if the 'bdi' is not congested.
481 * Waiting indefinitely can cause deadlocks when the NFS
482 * server is on this machine, when a new TCP connection is
483 * needed and in other rare cases. There is no particular
484 * need to wait extensively here. A short wait has the
485 * benefit that someone else can worry about the freezer.
473 */ 486 */
474 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && 487 if (mapping) {
475 !(current->flags & PF_FSTRANS)) { 488 struct nfs_server *nfss = NFS_SERVER(mapping->host);
476 int how = FLUSH_SYNC; 489 nfs_commit_inode(mapping->host, 0);
477 490 if ((gfp & __GFP_WAIT) &&
478 /* Don't let kswapd deadlock waiting for OOM RPC calls */ 491 !bdi_write_congested(&nfss->backing_dev_info)) {
479 if (current_is_kswapd()) 492 wait_on_page_bit_killable_timeout(page, PG_private,
480 how = 0; 493 HZ);
481 nfs_commit_inode(mapping->host, how); 494 if (PagePrivate(page))
495 set_bdi_congested(&nfss->backing_dev_info,
496 BLK_RW_ASYNC);
497 }
482 } 498 }
483 /* If PagePrivate() is set, then the page is not freeable */ 499 /* If PagePrivate() is set, then the page is not freeable */
484 if (PagePrivate(page)) 500 if (PagePrivate(page))
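The replacement comment spells out the policy: always start a commit, but only wait for it when __GFP_WAIT allows, never longer than one second (HZ), and not at all once the bdi is congested, since an unbounded wait can deadlock against a loopback-mounted server. A rough userspace analogue of bounding the wait, assuming nothing about the kernel APIs and using pthread_cond_timedwait() in their place:

    /* Kick off work, then wait at most one second for it instead of
     * blocking indefinitely -- mirroring the bounded
     * wait_on_page_bit_killable_timeout() call above. */
    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int done;

    static void *worker(void *arg)
    {
        (void)arg;
        sleep(2);                       /* pretend the commit is slow */
        pthread_mutex_lock(&lock);
        done = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;
        struct timespec deadline;

        pthread_create(&t, NULL, worker, NULL);
        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += 1;           /* wait at most 1s, like HZ above */

        pthread_mutex_lock(&lock);
        while (!done && pthread_cond_timedwait(&cond, &lock, &deadline) == 0)
            ;
        if (done)
            printf("page freed\n");
        else
            printf("still busy, back off\n");
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);          /* build with -lpthread */
        return 0;
    }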
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
539static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, 555static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
540 sector_t *span) 556 sector_t *span)
541{ 557{
558 int ret;
559 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
560
542 *span = sis->pages; 561 *span = sis->pages;
543 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); 562
563 rcu_read_lock();
564 ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
565 rcu_read_unlock();
566
567 return ret;
544} 568}
545 569
546static void nfs_swap_deactivate(struct file *file) 570static void nfs_swap_deactivate(struct file *file)
547{ 571{
548 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); 572 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
573
574 rcu_read_lock();
575 xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
576 rcu_read_unlock();
549} 577}
550#endif 578#endif
551 579
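Both swap hooks now take rcu_read_lock() and fetch clnt->cl_xprt through rcu_dereference() instead of reading the RCU-managed pointer bare, so a concurrent transport swap cannot hand them a stale xprt. The shape of the read-side discipline, sketched with C11 acquire loads standing in for rcu_dereference() (real RCU needs kernel or liburcu support; this only mirrors the load-once-then-use rule):

    #include <stdatomic.h>
    #include <stdio.h>

    struct xprt { const char *name; };

    static _Atomic(struct xprt *) cl_xprt;

    static void reader(void)
    {
        /* one acquire load, then use the snapshot -- never re-read the
         * shared pointer mid-operation */
        struct xprt *x = atomic_load_explicit(&cl_xprt, memory_order_acquire);
        if (x)
            printf("swapper on %s\n", x->name);
    }

    int main(void)
    {
        static struct xprt tcp = { "tcp" };
        atomic_store_explicit(&cl_xprt, &tcp, memory_order_release);
        reader();
        return 0;
    }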
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 1359c4a27393..abc5056999d6 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
265{ 265{
266 266
267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || 267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
268 hdr->res.verf->committed == NFS_FILE_SYNC) 268 hdr->res.verf->committed != NFS_DATA_SYNC)
269 return; 269 return;
270 270
271 pnfs_set_layoutcommit(hdr); 271 pnfs_set_layoutcommit(hdr);
@@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
403 return -EAGAIN; 403 return -EAGAIN;
404 } 404 }
405 405
406 if (data->verf.committed == NFS_UNSTABLE)
407 pnfs_commit_set_layoutcommit(data);
408
406 return 0; 409 return 0;
407} 410}
408 411
@@ -646,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
646 } 649 }
647 650
648 /* find and reference the deviceid */ 651 /* find and reference the deviceid */
649 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 652 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
650 NFS_SERVER(lo->plh_inode)->nfs_client, id); 653 lo->plh_lc_cred, gfp_flags);
651 if (d == NULL) { 654 if (d == NULL)
652 dsaddr = filelayout_get_device_info(lo->plh_inode, id, 655 goto out;
653 lo->plh_lc_cred, gfp_flags); 656
654 if (dsaddr == NULL) 657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
655 goto out;
656 } else
657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
658 /* Found deviceid is unavailable */ 658 /* Found deviceid is unavailable */
659 if (filelayout_test_devid_unavailable(&dsaddr->id_node)) 659 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
660 goto out_put; 660 goto out_put;
661 661
662 fl->dsaddr = dsaddr; 662 fl->dsaddr = dsaddr;
663 663
@@ -1269,11 +1269,12 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1269static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx) 1269static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1270{ 1270{
1271 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; 1271 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1272 struct pnfs_commit_bucket *bucket = fl_cinfo->buckets; 1272 struct pnfs_commit_bucket *bucket;
1273 struct pnfs_layout_segment *freeme; 1273 struct pnfs_layout_segment *freeme;
1274 int i; 1274 int i;
1275 1275
1276 for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) { 1276 for (i = idx; i < fl_cinfo->nbuckets; i++) {
1277 bucket = &fl_cinfo->buckets[i];
1277 if (list_empty(&bucket->committing)) 1278 if (list_empty(&bucket->committing))
1278 continue; 1279 continue;
1279 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); 1280 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
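The old loop initialized bucket to &fl_cinfo->buckets[0] but started the index at idx, so whenever idx was nonzero the pointer and the index walked different elements; computing &fl_cinfo->buckets[i] on each iteration keeps them in step. A minimal reproduction of the off-by-idx walk:

    #include <stdio.h>

    int main(void)
    {
        int buckets[4] = { 10, 11, 12, 13 };
        int idx = 2, i;
        int *bucket = buckets;          /* buggy: should be &buckets[idx] */

        for (i = idx; i < 4; i++, bucket++)
            printf("i=%d sees %d (wanted %d)\n", i, *bucket, buckets[i]);
        return 0;
    }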
@@ -1367,6 +1368,17 @@ out:
1367 cinfo->ds->ncommitting = 0; 1368 cinfo->ds->ncommitting = 0;
1368 return PNFS_ATTEMPTED; 1369 return PNFS_ATTEMPTED;
1369} 1370}
1371static struct nfs4_deviceid_node *
1372filelayout_alloc_deviceid_node(struct nfs_server *server,
1373 struct pnfs_device *pdev, gfp_t gfp_flags)
1374{
1375 struct nfs4_file_layout_dsaddr *dsaddr;
1376
1377 dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
1378 if (!dsaddr)
1379 return NULL;
1380 return &dsaddr->id_node;
1381}
1370 1382
1371static void 1383static void
1372filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) 1384filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
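filelayout_alloc_deviceid_node() allocates the outer nfs4_file_layout_dsaddr yet hands back only the embedded id_node; callers such as filelayout_check_layout() recover the outer structure with container_of(). A self-contained demo of the embed-and-recover idiom:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct id_node { int id; };
    struct dsaddr {
        int ds_num;
        struct id_node id_node;         /* embedded, like nfs4_deviceid_node */
    };

    static struct id_node *alloc_node(struct dsaddr *d) { return &d->id_node; }

    int main(void)
    {
        struct dsaddr d = { .ds_num = 4, .id_node = { .id = 7 } };
        struct id_node *n = alloc_node(&d);
        struct dsaddr *back = container_of(n, struct dsaddr, id_node);

        printf("ds_num=%d id=%d\n", back->ds_num, n->id);
        return 0;
    }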
@@ -1419,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1419 .commit_pagelist = filelayout_commit_pagelist, 1431 .commit_pagelist = filelayout_commit_pagelist,
1420 .read_pagelist = filelayout_read_pagelist, 1432 .read_pagelist = filelayout_read_pagelist,
1421 .write_pagelist = filelayout_write_pagelist, 1433 .write_pagelist = filelayout_write_pagelist,
1434 .alloc_deviceid_node = filelayout_alloc_deviceid_node,
1422 .free_deviceid_node = filelayout_free_deveiceid_node, 1435 .free_deviceid_node = filelayout_free_deveiceid_node,
1423}; 1436};
1424 1437
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
149 u32 ds_idx); 149 u32 ds_idx);
150
151extern struct nfs4_file_layout_dsaddr *
152nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
153 struct pnfs_device *pdev, gfp_t gfp_flags);
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 154extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 155extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
155 156
156#endif /* FS_NFS_NFS4FILELAYOUT_H */ 157#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 8540516f4d71..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
484} 484}
485 485
486/* Decode opaque device data and return the result */ 486/* Decode opaque device data and return the result */
487static struct nfs4_file_layout_dsaddr* 487struct nfs4_file_layout_dsaddr *
488decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) 488nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
489 gfp_t gfp_flags)
489{ 490{
490 int i; 491 int i;
491 u32 cnt, num; 492 u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
570 dsaddr->stripe_indices = stripe_indices; 571 dsaddr->stripe_indices = stripe_indices;
571 stripe_indices = NULL; 572 stripe_indices = NULL;
572 dsaddr->ds_num = num; 573 dsaddr->ds_num = num;
573 nfs4_init_deviceid_node(&dsaddr->id_node, 574 nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
574 NFS_SERVER(ino)->pnfs_curr_ld,
575 NFS_SERVER(ino)->nfs_client,
576 &pdev->dev_id);
577 575
578 INIT_LIST_HEAD(&dsaddrs); 576 INIT_LIST_HEAD(&dsaddrs);
579 577
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
587 585
588 mp_count = be32_to_cpup(p); /* multipath count */ 586 mp_count = be32_to_cpup(p); /* multipath count */
589 for (j = 0; j < mp_count; j++) { 587 for (j = 0; j < mp_count; j++) {
590 da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, 588 da = decode_ds_addr(server->nfs_client->cl_net,
591 &stream, gfp_flags); 589 &stream, gfp_flags);
592 if (da) 590 if (da)
593 list_add_tail(&da->da_node, &dsaddrs); 591 list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
637 return NULL; 635 return NULL;
638} 636}
639 637
640/*
641 * Decode the opaque device specified in 'dev' and add it to the cache of
642 * available devices.
643 */
644static struct nfs4_file_layout_dsaddr *
645decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
646{
647 struct nfs4_deviceid_node *d;
648 struct nfs4_file_layout_dsaddr *n, *new;
649
650 new = decode_device(inode, dev, gfp_flags);
651 if (!new) {
652 printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
653 __func__);
654 return NULL;
655 }
656
657 d = nfs4_insert_deviceid_node(&new->id_node);
658 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
659 if (n != new) {
660 nfs4_fl_free_deviceid(new);
661 return n;
662 }
663
664 return new;
665}
666
667/*
668 * Retrieve the information for dev_id, add it to the list
669 * of available devices, and return it.
670 */
671struct nfs4_file_layout_dsaddr *
672filelayout_get_device_info(struct inode *inode,
673 struct nfs4_deviceid *dev_id,
674 struct rpc_cred *cred,
675 gfp_t gfp_flags)
676{
677 struct pnfs_device *pdev = NULL;
678 u32 max_resp_sz;
679 int max_pages;
680 struct page **pages = NULL;
681 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
682 int rc, i;
683 struct nfs_server *server = NFS_SERVER(inode);
684
685 /*
686 * Use the session max response size as the basis for setting
687 * GETDEVICEINFO's maxcount
688 */
689 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
690 max_pages = nfs_page_array_len(0, max_resp_sz);
691 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
692 __func__, inode, max_resp_sz, max_pages);
693
694 pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
695 if (pdev == NULL)
696 return NULL;
697
698 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
699 if (pages == NULL) {
700 kfree(pdev);
701 return NULL;
702 }
703 for (i = 0; i < max_pages; i++) {
704 pages[i] = alloc_page(gfp_flags);
705 if (!pages[i])
706 goto out_free;
707 }
708
709 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
710 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
711 pdev->pages = pages;
712 pdev->pgbase = 0;
713 pdev->pglen = max_resp_sz;
714 pdev->mincount = 0;
715 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
716
717 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
718 dprintk("%s getdevice info returns %d\n", __func__, rc);
719 if (rc)
720 goto out_free;
721
722 /*
723 * Found new device, need to decode it and then add it to the
724 * list of known devices for this mountpoint.
725 */
726 dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
727out_free:
728 for (i = 0; i < max_pages; i++)
729 __free_page(pages[i]);
730 kfree(pages);
731 kfree(pdev);
732 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
733 return dsaddr;
734}
735
736void 638void
737nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 639nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
738{ 640{
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b08..777b055063f6 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
74 struct nfs_server_key *key = buffer; 74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key); 75 uint16_t len = sizeof(struct nfs_server_key);
76 76
77 memset(key, 0, len);
77 key->nfsversion = clp->rpc_ops->version; 78 key->nfsversion = clp->rpc_ops->version;
78 key->family = clp->cl_addr.ss_family; 79 key->family = clp->cl_addr.ss_family;
79 80
80 memset(key, 0, len);
81
82 switch (clp->cl_addr.ss_family) { 81 switch (clp->cl_addr.ss_family) {
83 case AF_INET: 82 case AF_INET:
84 key->port = sin->sin_port; 83 key->port = sin->sin_port;
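The fix is purely an ordering one: the key was being populated and then zeroed, so nfsversion and family were wiped before the switch on the address family ran. A small reproduction of why the memset() must come first:

    #include <stdio.h>
    #include <string.h>

    struct key { unsigned char version, family; };

    int main(void)
    {
        struct key k;

        k.version = 4;                  /* buggy order: assign ... */
        k.family = 2;
        memset(&k, 0, sizeof(k));       /* ... then wipe everything */
        printf("version=%u family=%u\n", k.version, k.family);  /* 0 0 */

        memset(&k, 0, sizeof(k));       /* fixed order: wipe first */
        k.version = 4;
        k.family = 2;
        printf("version=%u family=%u\n", k.version, k.family);  /* 4 2 */
        return 0;
    }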
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 577a36f0a510..141c9f4a40de 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
505 attr->ia_valid &= ~ATTR_MODE; 505 attr->ia_valid &= ~ATTR_MODE;
506 506
507 if (attr->ia_valid & ATTR_SIZE) { 507 if (attr->ia_valid & ATTR_SIZE) {
508 if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) 508 BUG_ON(!S_ISREG(inode->i_mode));
509
510 if (attr->ia_size == i_size_read(inode))
509 attr->ia_valid &= ~ATTR_SIZE; 511 attr->ia_valid &= ~ATTR_SIZE;
510 } 512 }
511 513
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 94d922ebb5ac..efaa31c70fbe 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
218int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); 218int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
219#endif 219#endif
220 220
221/* nfs3client.c */
222#if IS_ENABLED(CONFIG_NFS_V3)
223struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
224struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
225 struct nfs_fattr *, rpc_authflavor_t);
226#endif
227
228/* callback_xdr.c */ 221/* callback_xdr.c */
229extern struct svc_version nfs4_callback_version1; 222extern struct svc_version nfs4_callback_version1;
230extern struct svc_version nfs4_callback_version4; 223extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..333ae4068506
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) 2014 Anna Schumaker.
3 *
4 * NFSv3-specific filesystem definitions and declarations
5 */
6#ifndef __LINUX_FS_NFS_NFS3_FS_H
7#define __LINUX_FS_NFS_NFS3_FS_H
8
9/*
10 * nfs3acl.c
11 */
12#ifdef CONFIG_NFS_V3_ACL
13extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
14extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
15extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
16 struct posix_acl *dfacl);
17extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
18extern const struct xattr_handler *nfs3_xattr_handlers[];
19#else
20static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
21 struct posix_acl *dfacl)
22{
23 return 0;
24}
25#define nfs3_listxattr NULL
26#endif /* CONFIG_NFS_V3_ACL */
27
28/* nfs3client.c */
29struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
30struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
31 struct nfs_fattr *, rpc_authflavor_t);
32
33
34#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index d0fec260132a..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
7#include <linux/nfsacl.h> 7#include <linux/nfsacl.h>
8 8
9#include "internal.h" 9#include "internal.h"
10#include "nfs3_fs.h"
10 11
11#define NFSDBG_FACILITY NFSDBG_PROC 12#define NFSDBG_FACILITY NFSDBG_PROC
12 13
@@ -129,7 +130,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
129 .rpc_argp = &args, 130 .rpc_argp = &args,
130 .rpc_resp = &fattr, 131 .rpc_resp = &fattr,
131 }; 132 };
132 int status; 133 int status = 0;
134
135 if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
136 goto out;
133 137
134 status = -EOPNOTSUPP; 138 status = -EOPNOTSUPP;
135 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 139 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
1#include <linux/nfs_fs.h> 1#include <linux/nfs_fs.h>
2#include <linux/nfs_mount.h> 2#include <linux/nfs_mount.h>
3#include "internal.h" 3#include "internal.h"
4#include "nfs3_fs.h"
4 5
5#ifdef CONFIG_NFS_V3_ACL 6#ifdef CONFIG_NFS_V3_ACL
6static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; 7static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 809670eba52a..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
22 22
23#include "iostat.h" 23#include "iostat.h"
24#include "internal.h" 24#include "internal.h"
25#include "nfs3_fs.h"
25 26
26#define NFSDBG_FACILITY NFSDBG_PROC 27#define NFSDBG_FACILITY NFSDBG_PROC
27 28
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
6#include "internal.h" 6#include "internal.h"
7#include "nfs3_fs.h"
7#include "nfs.h" 8#include "nfs.h"
8 9
9static struct nfs_subversion nfs_v3 = { 10static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 92193eddb41d..a8b855ab4e22 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -130,16 +130,15 @@ enum {
130 */ 130 */
131 131
132struct nfs4_lock_state { 132struct nfs4_lock_state {
133 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
134 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
135#define NFS_LOCK_INITIALIZED 0 135#define NFS_LOCK_INITIALIZED 0
136#define NFS_LOCK_LOST 1 136#define NFS_LOCK_LOST 1
137 unsigned long ls_flags; 137 unsigned long ls_flags;
138 struct nfs_seqid_counter ls_seqid; 138 struct nfs_seqid_counter ls_seqid;
139 nfs4_stateid ls_stateid; 139 nfs4_stateid ls_stateid;
140 atomic_t ls_count; 140 atomic_t ls_count;
141 fl_owner_t ls_owner; 141 fl_owner_t ls_owner;
142 struct work_struct ls_release;
143}; 142};
144 143
145/* bits for nfs4_state->flags */ 144/* bits for nfs4_state->flags */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 53e435a95260..ffdb28d86cf8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
482 482
483 spin_lock(&nn->nfs_client_lock); 483 spin_lock(&nn->nfs_client_lock);
484 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { 484 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
485
486 if (pos->rpc_ops != new->rpc_ops)
487 continue;
488
489 if (pos->cl_proto != new->cl_proto)
490 continue;
491
492 if (pos->cl_minorversion != new->cl_minorversion)
493 continue;
494
485 /* If "pos" isn't marked ready, we can't trust the 495 /* If "pos" isn't marked ready, we can't trust the
486 * remaining fields in "pos" */ 496 * remaining fields in "pos" */
487 if (pos->cl_cons_state > NFS_CS_READY) { 497 if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
501 if (pos->cl_cons_state != NFS_CS_READY) 511 if (pos->cl_cons_state != NFS_CS_READY)
502 continue; 512 continue;
503 513
504 if (pos->rpc_ops != new->rpc_ops)
505 continue;
506
507 if (pos->cl_proto != new->cl_proto)
508 continue;
509
510 if (pos->cl_minorversion != new->cl_minorversion)
511 continue;
512
513 if (pos->cl_clientid != new->cl_clientid) 514 if (pos->cl_clientid != new->cl_clientid)
514 continue; 515 continue;
515 516
@@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
622 623
623 spin_lock(&nn->nfs_client_lock); 624 spin_lock(&nn->nfs_client_lock);
624 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { 625 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
626
627 if (pos->rpc_ops != new->rpc_ops)
628 continue;
629
630 if (pos->cl_proto != new->cl_proto)
631 continue;
632
633 if (pos->cl_minorversion != new->cl_minorversion)
634 continue;
635
625 /* If "pos" isn't marked ready, we can't trust the 636 /* If "pos" isn't marked ready, we can't trust the
626 * remaining fields in "pos", especially the client 637 * remaining fields in "pos", especially the client
627 * ID and serverowner fields. Wait for CREATE_SESSION 638 * ID and serverowner fields. Wait for CREATE_SESSION
@@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
647 if (pos->cl_cons_state != NFS_CS_READY) 658 if (pos->cl_cons_state != NFS_CS_READY)
648 continue; 659 continue;
649 660
650 if (pos->rpc_ops != new->rpc_ops)
651 continue;
652
653 if (pos->cl_proto != new->cl_proto)
654 continue;
655
656 if (pos->cl_minorversion != new->cl_minorversion)
657 continue;
658
659 if (!nfs4_match_clientids(pos, new)) 661 if (!nfs4_match_clientids(pos, new))
660 continue; 662 continue;
661 663
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 75ae8d22f067..5aa55c132aa2 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
77static int _nfs4_proc_open(struct nfs4_opendata *data); 77static int _nfs4_proc_open(struct nfs4_opendata *data);
78static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 78static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
79static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 79static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
80static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 80static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
81static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); 81static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
82static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); 82static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
83static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); 83static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
314 kunmap_atomic(start); 314 kunmap_atomic(start);
315} 315}
316 316
317static long nfs4_update_delay(long *timeout)
318{
319 long ret;
320 if (!timeout)
321 return NFS4_POLL_RETRY_MAX;
322 if (*timeout <= 0)
323 *timeout = NFS4_POLL_RETRY_MIN;
324 if (*timeout > NFS4_POLL_RETRY_MAX)
325 *timeout = NFS4_POLL_RETRY_MAX;
326 ret = *timeout;
327 *timeout <<= 1;
328 return ret;
329}
330
317static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) 331static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
318{ 332{
319 int res = 0; 333 int res = 0;
320 334
321 might_sleep(); 335 might_sleep();
322 336
323 if (*timeout <= 0) 337 freezable_schedule_timeout_killable_unsafe(
324 *timeout = NFS4_POLL_RETRY_MIN; 338 nfs4_update_delay(timeout));
325 if (*timeout > NFS4_POLL_RETRY_MAX)
326 *timeout = NFS4_POLL_RETRY_MAX;
327 freezable_schedule_timeout_killable_unsafe(*timeout);
328 if (fatal_signal_pending(current)) 339 if (fatal_signal_pending(current))
329 res = -ERESTARTSYS; 340 res = -ERESTARTSYS;
330 *timeout <<= 1;
331 return res; 341 return res;
332} 342}
333 343
@@ -1307,15 +1317,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
1307 int ret = -EAGAIN; 1317 int ret = -EAGAIN;
1308 1318
1309 for (;;) { 1319 for (;;) {
1320 spin_lock(&state->owner->so_lock);
1310 if (can_open_cached(state, fmode, open_mode)) { 1321 if (can_open_cached(state, fmode, open_mode)) {
1311 spin_lock(&state->owner->so_lock); 1322 update_open_stateflags(state, fmode);
1312 if (can_open_cached(state, fmode, open_mode)) {
1313 update_open_stateflags(state, fmode);
1314 spin_unlock(&state->owner->so_lock);
1315 goto out_return_state;
1316 }
1317 spin_unlock(&state->owner->so_lock); 1323 spin_unlock(&state->owner->so_lock);
1324 goto out_return_state;
1318 } 1325 }
1326 spin_unlock(&state->owner->so_lock);
1319 rcu_read_lock(); 1327 rcu_read_lock();
1320 delegation = rcu_dereference(nfsi->delegation); 1328 delegation = rcu_dereference(nfsi->delegation);
1321 if (!can_open_delegated(delegation, fmode)) { 1329 if (!can_open_delegated(delegation, fmode)) {
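The simplification above drops the unlocked peek at can_open_cached() followed by a locked re-check: taking so_lock first leaves a single authoritative test, at the cost of briefly holding the lock on the miss path. The same check-under-lock shape in runnable form:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t so_lock = PTHREAD_MUTEX_INITIALIZER;
    static int n_opens;

    static int try_open_cached(void)
    {
        int ok;

        pthread_mutex_lock(&so_lock);
        ok = (n_opens > 0);             /* the only check that counts */
        if (ok)
            n_opens++;                  /* update_open_stateflags() analogue */
        pthread_mutex_unlock(&so_lock);
        return ok;
    }

    int main(void)
    {
        n_opens = 1;
        printf("cached open %s\n", try_open_cached() ? "hit" : "miss");
        return 0;
    }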
@@ -2226,9 +2234,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
2226 ret = _nfs4_proc_open(opendata); 2234 ret = _nfs4_proc_open(opendata);
2227 if (ret != 0) { 2235 if (ret != 0) {
2228 if (ret == -ENOENT) { 2236 if (ret == -ENOENT) {
2229 d_drop(opendata->dentry); 2237 dentry = opendata->dentry;
2230 d_add(opendata->dentry, NULL); 2238 if (dentry->d_inode)
2231 nfs_set_verifier(opendata->dentry, 2239 d_delete(dentry);
2240 else if (d_unhashed(dentry))
2241 d_add(dentry, NULL);
2242
2243 nfs_set_verifier(dentry,
2232 nfs_save_change_attribute(opendata->dir->d_inode)); 2244 nfs_save_change_attribute(opendata->dir->d_inode));
2233 } 2245 }
2234 goto out; 2246 goto out;
@@ -2560,6 +2572,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2560 struct nfs4_closedata *calldata = data; 2572 struct nfs4_closedata *calldata = data;
2561 struct nfs4_state *state = calldata->state; 2573 struct nfs4_state *state = calldata->state;
2562 struct nfs_server *server = NFS_SERVER(calldata->inode); 2574 struct nfs_server *server = NFS_SERVER(calldata->inode);
2575 nfs4_stateid *res_stateid = NULL;
2563 2576
2564 dprintk("%s: begin!\n", __func__); 2577 dprintk("%s: begin!\n", __func__);
2565 if (!nfs4_sequence_done(task, &calldata->res.seq_res)) 2578 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -2570,12 +2583,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2570 */ 2583 */
2571 switch (task->tk_status) { 2584 switch (task->tk_status) {
2572 case 0: 2585 case 0:
2573 if (calldata->roc) 2586 res_stateid = &calldata->res.stateid;
2587 if (calldata->arg.fmode == 0 && calldata->roc)
2574 pnfs_roc_set_barrier(state->inode, 2588 pnfs_roc_set_barrier(state->inode,
2575 calldata->roc_barrier); 2589 calldata->roc_barrier);
2576 nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
2577 renew_lease(server, calldata->timestamp); 2590 renew_lease(server, calldata->timestamp);
2578 goto out_release; 2591 break;
2579 case -NFS4ERR_ADMIN_REVOKED: 2592 case -NFS4ERR_ADMIN_REVOKED:
2580 case -NFS4ERR_STALE_STATEID: 2593 case -NFS4ERR_STALE_STATEID:
2581 case -NFS4ERR_OLD_STATEID: 2594 case -NFS4ERR_OLD_STATEID:
@@ -2584,12 +2597,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2584 if (calldata->arg.fmode == 0) 2597 if (calldata->arg.fmode == 0)
2585 break; 2598 break;
2586 default: 2599 default:
2587 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { 2600 if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
2588 rpc_restart_call_prepare(task); 2601 rpc_restart_call_prepare(task);
2589 goto out_release; 2602 goto out_release;
2590 } 2603 }
2591 } 2604 }
2592 nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); 2605 nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
2593out_release: 2606out_release:
2594 nfs_release_seqid(calldata->arg.seqid); 2607 nfs_release_seqid(calldata->arg.seqid);
2595 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 2608 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2601,6 +2614,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2601 struct nfs4_closedata *calldata = data; 2614 struct nfs4_closedata *calldata = data;
2602 struct nfs4_state *state = calldata->state; 2615 struct nfs4_state *state = calldata->state;
2603 struct inode *inode = calldata->inode; 2616 struct inode *inode = calldata->inode;
2617 bool is_rdonly, is_wronly, is_rdwr;
2604 int call_close = 0; 2618 int call_close = 0;
2605 2619
2606 dprintk("%s: begin!\n", __func__); 2620 dprintk("%s: begin!\n", __func__);
@@ -2608,21 +2622,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2608 goto out_wait; 2622 goto out_wait;
2609 2623
2610 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 2624 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
2611 calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
2612 spin_lock(&state->owner->so_lock); 2625 spin_lock(&state->owner->so_lock);
2626 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
2627 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
2628 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
2613 /* Calculate the change in open mode */ 2629 /* Calculate the change in open mode */
2630 calldata->arg.fmode = 0;
2614 if (state->n_rdwr == 0) { 2631 if (state->n_rdwr == 0) {
2615 if (state->n_rdonly == 0) { 2632 if (state->n_rdonly == 0)
2616 call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); 2633 call_close |= is_rdonly;
2617 call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); 2634 else if (is_rdonly)
2618 calldata->arg.fmode &= ~FMODE_READ; 2635 calldata->arg.fmode |= FMODE_READ;
2619 } 2636 if (state->n_wronly == 0)
2620 if (state->n_wronly == 0) { 2637 call_close |= is_wronly;
2621 call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); 2638 else if (is_wronly)
2622 call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); 2639 calldata->arg.fmode |= FMODE_WRITE;
2623 calldata->arg.fmode &= ~FMODE_WRITE; 2640 } else if (is_rdwr)
2624 } 2641 calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
2625 } 2642
2643 if (calldata->arg.fmode == 0)
2644 call_close |= is_rdwr;
2645
2626 if (!nfs4_valid_open_stateid(state)) 2646 if (!nfs4_valid_open_stateid(state))
2627 call_close = 0; 2647 call_close = 0;
2628 spin_unlock(&state->owner->so_lock); 2648 spin_unlock(&state->owner->so_lock);
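The rewritten nfs4_close_prepare() snapshots the three open-state bits under so_lock, then builds the downgrade mode additively: each mode with remaining opens contributes its FMODE bit, and an empty fmode combined with a held read-write stateid turns the OPEN_DOWNGRADE into a full CLOSE. The decision logic, lifted into a standalone sketch:

    #include <stdio.h>

    #define FMODE_READ  1
    #define FMODE_WRITE 2

    static int downgrade(int n_rdonly, int n_wronly, int n_rdwr,
                         int is_rdonly, int is_wronly, int is_rdwr,
                         int *call_close)
    {
        int fmode = 0;

        *call_close = 0;
        if (n_rdwr == 0) {
            if (n_rdonly == 0)
                *call_close |= is_rdonly;
            else if (is_rdonly)
                fmode |= FMODE_READ;
            if (n_wronly == 0)
                *call_close |= is_wronly;
            else if (is_wronly)
                fmode |= FMODE_WRITE;
        } else if (is_rdwr)
            fmode |= FMODE_READ | FMODE_WRITE;
        if (fmode == 0)
            *call_close |= is_rdwr;
        return fmode;
    }

    int main(void)
    {
        int close;
        /* readers remain, writers gone: downgrade to READ, close needed */
        printf("fmode=%d close=%d\n",
               downgrade(1, 0, 0, 1, 1, 1, &close), close);
        return 0;
    }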
@@ -3205,7 +3225,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
3205 struct nfs4_label *label = NULL; 3225 struct nfs4_label *label = NULL;
3206 int status; 3226 int status;
3207 3227
3208 if (pnfs_ld_layoutret_on_setattr(inode)) 3228 if (pnfs_ld_layoutret_on_setattr(inode) &&
3229 sattr->ia_valid & ATTR_SIZE &&
3230 sattr->ia_size < i_size_read(inode))
3209 pnfs_commit_and_return_layout(inode); 3231 pnfs_commit_and_return_layout(inode);
3210 3232
3211 nfs_fattr_init(fattr); 3233 nfs_fattr_init(fattr);
@@ -3564,7 +3586,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
3564 3586
3565 if (!nfs4_sequence_done(task, &res->seq_res)) 3587 if (!nfs4_sequence_done(task, &res->seq_res))
3566 return 0; 3588 return 0;
3567 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 3589 if (nfs4_async_handle_error(task, res->server, NULL,
3590 &data->timeout) == -EAGAIN)
3568 return 0; 3591 return 0;
3569 update_changeattr(dir, &res->cinfo); 3592 update_changeattr(dir, &res->cinfo);
3570 return 1; 3593 return 1;
@@ -3597,7 +3620,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
3597 3620
3598 if (!nfs4_sequence_done(task, &res->seq_res)) 3621 if (!nfs4_sequence_done(task, &res->seq_res))
3599 return 0; 3622 return 0;
3600 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 3623 if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
3601 return 0; 3624 return 0;
3602 3625
3603 update_changeattr(old_dir, &res->old_cinfo); 3626 update_changeattr(old_dir, &res->old_cinfo);
@@ -4101,7 +4124,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
4101 4124
4102 trace_nfs4_read(hdr, task->tk_status); 4125 trace_nfs4_read(hdr, task->tk_status);
4103 if (nfs4_async_handle_error(task, server, 4126 if (nfs4_async_handle_error(task, server,
4104 hdr->args.context->state) == -EAGAIN) { 4127 hdr->args.context->state,
4128 NULL) == -EAGAIN) {
4105 rpc_restart_call_prepare(task); 4129 rpc_restart_call_prepare(task);
4106 return -EAGAIN; 4130 return -EAGAIN;
4107 } 4131 }
@@ -4169,10 +4193,11 @@ static int nfs4_write_done_cb(struct rpc_task *task,
4169 struct nfs_pgio_header *hdr) 4193 struct nfs_pgio_header *hdr)
4170{ 4194{
4171 struct inode *inode = hdr->inode; 4195 struct inode *inode = hdr->inode;
4172 4196
4173 trace_nfs4_write(hdr, task->tk_status); 4197 trace_nfs4_write(hdr, task->tk_status);
4174 if (nfs4_async_handle_error(task, NFS_SERVER(inode), 4198 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4175 hdr->args.context->state) == -EAGAIN) { 4199 hdr->args.context->state,
4200 NULL) == -EAGAIN) {
4176 rpc_restart_call_prepare(task); 4201 rpc_restart_call_prepare(task);
4177 return -EAGAIN; 4202 return -EAGAIN;
4178 } 4203 }
@@ -4252,7 +4277,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
4252 struct inode *inode = data->inode; 4277 struct inode *inode = data->inode;
4253 4278
4254 trace_nfs4_commit(data, task->tk_status); 4279 trace_nfs4_commit(data, task->tk_status);
4255 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 4280 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4281 NULL, NULL) == -EAGAIN) {
4256 rpc_restart_call_prepare(task); 4282 rpc_restart_call_prepare(task);
4257 return -EAGAIN; 4283 return -EAGAIN;
4258 } 4284 }
@@ -4805,7 +4831,8 @@ out:
4805 4831
4806 4832
4807static int 4833static int
4808nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) 4834nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4835 struct nfs4_state *state, long *timeout)
4809{ 4836{
4810 struct nfs_client *clp = server->nfs_client; 4837 struct nfs_client *clp = server->nfs_client;
4811 4838
@@ -4855,6 +4882,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4855#endif /* CONFIG_NFS_V4_1 */ 4882#endif /* CONFIG_NFS_V4_1 */
4856 case -NFS4ERR_DELAY: 4883 case -NFS4ERR_DELAY:
4857 nfs_inc_server_stats(server, NFSIOS_DELAY); 4884 nfs_inc_server_stats(server, NFSIOS_DELAY);
4885 rpc_delay(task, nfs4_update_delay(timeout));
4886 goto restart_call;
4858 case -NFS4ERR_GRACE: 4887 case -NFS4ERR_GRACE:
4859 rpc_delay(task, NFS4_POLL_RETRY_MAX); 4888 rpc_delay(task, NFS4_POLL_RETRY_MAX);
4860 case -NFS4ERR_RETRY_UNCACHED_REP: 4889 case -NFS4ERR_RETRY_UNCACHED_REP:
@@ -5095,8 +5124,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5095 pnfs_roc_set_barrier(data->inode, data->roc_barrier); 5124 pnfs_roc_set_barrier(data->inode, data->roc_barrier);
5096 break; 5125 break;
5097 default: 5126 default:
5098 if (nfs4_async_handle_error(task, data->res.server, NULL) == 5127 if (nfs4_async_handle_error(task, data->res.server,
5099 -EAGAIN) { 5128 NULL, NULL) == -EAGAIN) {
5100 rpc_restart_call_prepare(task); 5129 rpc_restart_call_prepare(task);
5101 return; 5130 return;
5102 } 5131 }
@@ -5360,7 +5389,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5360 case -NFS4ERR_EXPIRED: 5389 case -NFS4ERR_EXPIRED:
5361 break; 5390 break;
5362 default: 5391 default:
5363 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) 5392 if (nfs4_async_handle_error(task, calldata->server,
5393 NULL, NULL) == -EAGAIN)
5364 rpc_restart_call_prepare(task); 5394 rpc_restart_call_prepare(task);
5365 } 5395 }
5366 nfs_release_seqid(calldata->arg.seqid); 5396 nfs_release_seqid(calldata->arg.seqid);
@@ -5966,7 +5996,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5966 break; 5996 break;
5967 case -NFS4ERR_LEASE_MOVED: 5997 case -NFS4ERR_LEASE_MOVED:
5968 case -NFS4ERR_DELAY: 5998 case -NFS4ERR_DELAY:
5969 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) 5999 if (nfs4_async_handle_error(task, server,
6000 NULL, NULL) == -EAGAIN)
5970 rpc_restart_call_prepare(task); 6001 rpc_restart_call_prepare(task);
5971 } 6002 }
5972} 6003}
@@ -7341,7 +7372,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
7341 int ret = 0; 7372 int ret = 0;
7342 7373
7343 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) 7374 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
7344 return 0; 7375 return -EAGAIN;
7345 task = _nfs41_proc_sequence(clp, cred, false); 7376 task = _nfs41_proc_sequence(clp, cred, false);
7346 if (IS_ERR(task)) 7377 if (IS_ERR(task))
7347 ret = PTR_ERR(task); 7378 ret = PTR_ERR(task);
@@ -7571,14 +7602,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7571 } else { 7602 } else {
7572 LIST_HEAD(head); 7603 LIST_HEAD(head);
7573 7604
7605 /*
7606 * Mark the bad layout state as invalid, then retry
7607 * with the current stateid.
7608 */
7574 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); 7609 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
7575 spin_unlock(&inode->i_lock); 7610 spin_unlock(&inode->i_lock);
7576 /* Mark the bad layout state as invalid, then
7577 * retry using the open stateid. */
7578 pnfs_free_lseg_list(&head); 7611 pnfs_free_lseg_list(&head);
7612
7613 task->tk_status = 0;
7614 rpc_restart_call_prepare(task);
7579 } 7615 }
7580 } 7616 }
7581 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) 7617 if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
7582 rpc_restart_call_prepare(task); 7618 rpc_restart_call_prepare(task);
7583out: 7619out:
7584 dprintk("<-- %s\n", __func__); 7620 dprintk("<-- %s\n", __func__);
@@ -7738,7 +7774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
7738 case 0: 7774 case 0:
7739 break; 7775 break;
7740 case -NFS4ERR_DELAY: 7776 case -NFS4ERR_DELAY:
7741 if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) 7777 if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
7742 break; 7778 break;
7743 rpc_restart_call_prepare(task); 7779 rpc_restart_call_prepare(task);
7744 return; 7780 return;
@@ -7797,54 +7833,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
7797 return status; 7833 return status;
7798} 7834}
7799 7835
7800/*
7801 * Retrieve the list of Data Server devices from the MDS.
7802 */
7803static int _nfs4_getdevicelist(struct nfs_server *server,
7804 const struct nfs_fh *fh,
7805 struct pnfs_devicelist *devlist)
7806{
7807 struct nfs4_getdevicelist_args args = {
7808 .fh = fh,
7809 .layoutclass = server->pnfs_curr_ld->id,
7810 };
7811 struct nfs4_getdevicelist_res res = {
7812 .devlist = devlist,
7813 };
7814 struct rpc_message msg = {
7815 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
7816 .rpc_argp = &args,
7817 .rpc_resp = &res,
7818 };
7819 int status;
7820
7821 dprintk("--> %s\n", __func__);
7822 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
7823 &res.seq_res, 0);
7824 dprintk("<-- %s status=%d\n", __func__, status);
7825 return status;
7826}
7827
7828int nfs4_proc_getdevicelist(struct nfs_server *server,
7829 const struct nfs_fh *fh,
7830 struct pnfs_devicelist *devlist)
7831{
7832 struct nfs4_exception exception = { };
7833 int err;
7834
7835 do {
7836 err = nfs4_handle_exception(server,
7837 _nfs4_getdevicelist(server, fh, devlist),
7838 &exception);
7839 } while (exception.retry);
7840
7841 dprintk("%s: err=%d, num_devs=%u\n", __func__,
7842 err, devlist->num_devs);
7843
7844 return err;
7845}
7846EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
7847
7848static int 7836static int
7849_nfs4_proc_getdeviceinfo(struct nfs_server *server, 7837_nfs4_proc_getdeviceinfo(struct nfs_server *server,
7850 struct pnfs_device *pdev, 7838 struct pnfs_device *pdev,
@@ -7917,7 +7905,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
7917 case 0: 7905 case 0:
7918 break; 7906 break;
7919 default: 7907 default:
7920 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 7908 if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
7921 rpc_restart_call_prepare(task); 7909 rpc_restart_call_prepare(task);
7922 return; 7910 return;
7923 } 7911 }
@@ -8213,7 +8201,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
8213 8201
8214 switch (task->tk_status) { 8202 switch (task->tk_status) {
8215 case -NFS4ERR_DELAY: 8203 case -NFS4ERR_DELAY:
8216 if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) 8204 if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
8217 rpc_restart_call_prepare(task); 8205 rpc_restart_call_prepare(task);
8218 } 8206 }
8219} 8207}
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
88 } 88 }
89 nfs_expire_all_delegations(clp); 89 nfs_expire_all_delegations(clp);
90 } else { 90 } else {
91 int ret;
92
91 /* Queue an asynchronous RENEW. */ 93 /* Queue an asynchronous RENEW. */
92 ops->sched_state_renewal(clp, cred, renew_flags); 94 ret = ops->sched_state_renewal(clp, cred, renew_flags);
93 put_rpccred(cred); 95 put_rpccred(cred);
94 goto out_exp; 96 switch (ret) {
97 default:
98 goto out_exp;
99 case -EAGAIN:
100 case -ENOMEM:
101 break;
102 }
95 } 103 }
96 } else { 104 } else {
97 dprintk("%s: failed to call renewd. Reason: lease not expired \n", 105 dprintk("%s: failed to call renewd. Reason: lease not expired \n",
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a043f618cd5a..5194933ed419 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -799,18 +799,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
799 return NULL; 799 return NULL;
800} 800}
801 801
802static void
803free_lock_state_work(struct work_struct *work)
804{
805 struct nfs4_lock_state *lsp = container_of(work,
806 struct nfs4_lock_state, ls_release);
807 struct nfs4_state *state = lsp->ls_state;
808 struct nfs_server *server = state->owner->so_server;
809 struct nfs_client *clp = server->nfs_client;
810
811 clp->cl_mvops->free_lock_state(server, lsp);
812}
813
814/* 802/*
815 * Return a compatible lock_state. If no initialized lock_state structure 803 * Return a compatible lock_state. If no initialized lock_state structure
816 * exists, return an uninitialized one. 804 * exists, return an uninitialized one.
@@ -832,7 +820,6 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
832 if (lsp->ls_seqid.owner_id < 0) 820 if (lsp->ls_seqid.owner_id < 0)
833 goto out_free; 821 goto out_free;
834 INIT_LIST_HEAD(&lsp->ls_locks); 822 INIT_LIST_HEAD(&lsp->ls_locks);
835 INIT_WORK(&lsp->ls_release, free_lock_state_work);
836 return lsp; 823 return lsp;
837out_free: 824out_free:
838 kfree(lsp); 825 kfree(lsp);
@@ -896,12 +883,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
896 if (list_empty(&state->lock_states)) 883 if (list_empty(&state->lock_states))
897 clear_bit(LK_STATE_IN_USE, &state->flags); 884 clear_bit(LK_STATE_IN_USE, &state->flags);
898 spin_unlock(&state->state_lock); 885 spin_unlock(&state->state_lock);
899 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) 886 server = state->owner->so_server;
900 queue_work(nfsiod_workqueue, &lsp->ls_release); 887 if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
901 else { 888 struct nfs_client *clp = server->nfs_client;
902 server = state->owner->so_server; 889
890 clp->cl_mvops->free_lock_state(server, lsp);
891 } else
903 nfs4_free_lock_state(server, lsp); 892 nfs4_free_lock_state(server, lsp);
904 }
905} 893}
906 894
907static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 895static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
@@ -1717,7 +1705,8 @@ restart:
1717 if (status < 0) { 1705 if (status < 0) {
1718 set_bit(ops->owner_flag_bit, &sp->so_flags); 1706 set_bit(ops->owner_flag_bit, &sp->so_flags);
1719 nfs4_put_state_owner(sp); 1707 nfs4_put_state_owner(sp);
1720 return nfs4_recovery_handle_error(clp, status); 1708 status = nfs4_recovery_handle_error(clp, status);
1709 return (status != 0) ? status : -EAGAIN;
1721 } 1710 }
1722 1711
1723 nfs4_put_state_owner(sp); 1712 nfs4_put_state_owner(sp);
@@ -1726,7 +1715,7 @@ restart:
1726 spin_unlock(&clp->cl_lock); 1715 spin_unlock(&clp->cl_lock);
1727 } 1716 }
1728 rcu_read_unlock(); 1717 rcu_read_unlock();
1729 return status; 1718 return 0;
1730} 1719}
1731 1720
1732static int nfs4_check_lease(struct nfs_client *clp) 1721static int nfs4_check_lease(struct nfs_client *clp)
@@ -1773,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
1773 break; 1762 break;
1774 case -NFS4ERR_STALE_CLIENTID: 1763 case -NFS4ERR_STALE_CLIENTID:
1775 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); 1764 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
1776 nfs4_state_clear_reclaim_reboot(clp);
1777 nfs4_state_start_reclaim_reboot(clp); 1765 nfs4_state_start_reclaim_reboot(clp);
1778 break; 1766 break;
1779 case -NFS4ERR_CLID_INUSE: 1767 case -NFS4ERR_CLID_INUSE:
@@ -2357,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
2357 status = nfs4_check_lease(clp); 2345 status = nfs4_check_lease(clp);
2358 if (status < 0) 2346 if (status < 0)
2359 goto out_error; 2347 goto out_error;
2348 continue;
2360 } 2349 }
2361 2350
2362 if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { 2351 if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2378,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
2378 section = "reclaim reboot"; 2367 section = "reclaim reboot";
2379 status = nfs4_do_reclaim(clp, 2368 status = nfs4_do_reclaim(clp,
2380 clp->cl_mvops->reboot_recovery_ops); 2369 clp->cl_mvops->reboot_recovery_ops);
2381 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 2370 if (status == -EAGAIN)
2382 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
2383 continue;
2384 nfs4_state_end_reclaim_reboot(clp);
2385 if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
2386 continue; 2371 continue;
2387 if (status < 0) 2372 if (status < 0)
2388 goto out_error; 2373 goto out_error;
2374 nfs4_state_end_reclaim_reboot(clp);
2389 } 2375 }
2390 2376
2391 /* Now recover expired state... */ 2377 /* Now recover expired state... */
@@ -2393,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
2393 section = "reclaim nograce"; 2379 section = "reclaim nograce";
2394 status = nfs4_do_reclaim(clp, 2380 status = nfs4_do_reclaim(clp,
2395 clp->cl_mvops->nograce_recovery_ops); 2381 clp->cl_mvops->nograce_recovery_ops);
2396 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 2382 if (status == -EAGAIN)
2397 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
2398 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
2399 continue; 2383 continue;
2400 if (status < 0) 2384 if (status < 0)
2401 goto out_error; 2385 goto out_error;
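With nfs4_do_reclaim() now folding every "state changed underneath me" case into -EAGAIN, the state manager loop no longer re-tests individual NFS4CLNT_* bits after each phase: -EAGAIN restarts the machine, any other negative status is fatal, and success falls through to nfs4_state_end_reclaim_reboot(). The resulting control flow, as a sketch:

    #include <errno.h>
    #include <stdio.h>

    static int attempts;

    static int do_reclaim(void)
    {
        return ++attempts < 3 ? -EAGAIN : 0;    /* succeed on 3rd pass */
    }

    int main(void)
    {
        for (;;) {
            int status = do_reclaim();

            if (status == -EAGAIN)
                continue;               /* state moved, go around */
            if (status < 0)
                return 1;               /* out_error */
            break;                      /* reclaim complete */
        }
        printf("reclaimed after %d attempts\n", attempts);
        return 0;
    }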
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e13b59d8d9aa..005d03c5d274 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
362 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 362 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
363#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 363#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
364#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 364#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
365#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ 365#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
366 encode_verifier_maxsz) 366 XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
367#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ 367 1 /* layout type */ + \
368 2 /* nfs_cookie4 gdlr_cookie */ + \ 368 1 /* maxcount */ + \
369 decode_verifier_maxsz \ 369 1 /* bitmap size */ + \
370 /* verifier4 gdlr_verifier */ + \ 370 1 /* notification bitmap length */ + \
371 1 /* gdlr_deviceid_list count */ + \ 371 1 /* notification bitmap, word 0 */)
372 XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
373 NFS4_DEVICEID4_SIZE) \
374 /* gdlr_deviceid_list */ + \
375 1 /* bool gdlr_eof */)
376#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
377 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
378#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 372#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
379 1 /* layout type */ + \ 373 1 /* layout type */ + \
380 1 /* opaque devaddr4 length */ + \ 374 1 /* opaque devaddr4 length */ + \
381 /* devaddr4 payload is read into page */ \ 375 /* devaddr4 payload is read into page */ \
382 1 /* notification bitmap length */ + \ 376 1 /* notification bitmap length */ + \
383 1 /* notification bitmap */) 377 1 /* notification bitmap, word 0 */)
384#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ 378#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
385 encode_stateid_maxsz) 379 encode_stateid_maxsz)
386#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ 380#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
395 2 /* last byte written */ + \ 389 2 /* last byte written */ + \
396 1 /* nt_timechanged (false) */ + \ 390 1 /* nt_timechanged (false) */ + \
397 1 /* layoutupdate4 layout type */ + \ 391 1 /* layoutupdate4 layout type */ + \
398 1 /* NULL filelayout layoutupdate4 payload */) 392 1 /* layoutupdate4 opaqueue len */)
393 /* the actual content of layoutupdate4 should
394 be allocated by drivers and spliced in
395 using xdr_write_pages */
399#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) 396#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
400#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ 397#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
401 encode_stateid_maxsz + \ 398 encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
809#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 806#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
810 decode_sequence_maxsz + \ 807 decode_sequence_maxsz + \
811 decode_reclaim_complete_maxsz) 808 decode_reclaim_complete_maxsz)
812#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
813 encode_sequence_maxsz + \
814 encode_putfh_maxsz + \
815 encode_getdevicelist_maxsz)
816#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
817 decode_sequence_maxsz + \
818 decode_putfh_maxsz + \
819 decode_getdevicelist_maxsz)
820#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 809#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
821 encode_sequence_maxsz +\ 810 encode_sequence_maxsz +\
822 encode_getdeviceinfo_maxsz) 811 encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
1927 1916
1928#ifdef CONFIG_NFS_V4_1 1917#ifdef CONFIG_NFS_V4_1
1929static void 1918static void
1930encode_getdevicelist(struct xdr_stream *xdr,
1931 const struct nfs4_getdevicelist_args *args,
1932 struct compound_hdr *hdr)
1933{
1934 __be32 *p;
1935 nfs4_verifier dummy = {
1936 .data = "dummmmmy",
1937 };
1938
1939 encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
1940 p = reserve_space(xdr, 16);
1941 *p++ = cpu_to_be32(args->layoutclass);
1942 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1943 xdr_encode_hyper(p, 0ULL); /* cookie */
1944 encode_nfs4_verifier(xdr, &dummy);
1945}
1946
1947static void
1948encode_getdeviceinfo(struct xdr_stream *xdr, 1919encode_getdeviceinfo(struct xdr_stream *xdr,
1949 const struct nfs4_getdeviceinfo_args *args, 1920 const struct nfs4_getdeviceinfo_args *args,
1950 struct compound_hdr *hdr) 1921 struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
1952 __be32 *p; 1923 __be32 *p;
1953 1924
1954 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); 1925 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
1955 p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); 1926 p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
1956 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, 1927 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1957 NFS4_DEVICEID4_SIZE); 1928 NFS4_DEVICEID4_SIZE);
1958 *p++ = cpu_to_be32(args->pdev->layout_type); 1929 *p++ = cpu_to_be32(args->pdev->layout_type);
1959 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ 1930 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
1960 *p++ = cpu_to_be32(0); /* bitmap length 0 */ 1931
1932 p = reserve_space(xdr, 4 + 4);
1933 *p++ = cpu_to_be32(1); /* bitmap length */
1934 *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
1961} 1935}
1962 1936
1963static void 1937static void
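
The encode_getdeviceinfo() change above stops sending an empty attribute bitmap and instead requests the two device notifications the client can act on. A rough userspace sketch of the resulting XDR layout follows — encode_getdeviceinfo_body(), htonl() and the flat word buffer are stand-ins for the kernel's xdr_stream/reserve_space()/cpu_to_be32() machinery, not kernel API:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <arpa/inet.h>  /* htonl() stands in for cpu_to_be32() */

    #define NOTIFY_DEVICEID4_CHANGE (1 << 1)
    #define NOTIFY_DEVICEID4_DELETE (1 << 2)
    #define NFS4_DEVICEID4_SIZE     16

    /* Encode a GETDEVICEINFO4args body into 'p'; returns words written. */
    static size_t encode_getdeviceinfo_body(uint32_t *p,
            const uint8_t dev_id[NFS4_DEVICEID4_SIZE],
            uint32_t layout_type, uint32_t maxcount)
    {
        uint32_t *start = p;

        memcpy(p, dev_id, NFS4_DEVICEID4_SIZE); /* deviceid4, fixed opaque */
        p += NFS4_DEVICEID4_SIZE / 4;
        *p++ = htonl(layout_type);              /* gdia_layout_type */
        *p++ = htonl(maxcount);                 /* gdia_maxcount */
        *p++ = htonl(1);                        /* bitmap4 length: one word now */
        *p++ = htonl(NOTIFY_DEVICEID4_CHANGE |  /* gdia_notify_types */
                     NOTIFY_DEVICEID4_DELETE);
        return p - start;
    }

    int main(void)
    {
        uint8_t id[NFS4_DEVICEID4_SIZE] = { 0 };
        uint32_t buf[16];

        printf("%zu words\n", encode_getdeviceinfo_body(buf, id, 1, 4096));
        return 0;
    }

Asking for NOTIFY_DEVICEID4_CHANGE and NOTIFY_DEVICEID4_DELETE lets the server keep the client's deviceid cache coherent, which is what makes it safe to drop the GETDEVICELIST polling removed elsewhere in this series.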
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
1990static int 1964static int
1991encode_layoutcommit(struct xdr_stream *xdr, 1965encode_layoutcommit(struct xdr_stream *xdr,
1992 struct inode *inode, 1966 struct inode *inode,
1993 const struct nfs4_layoutcommit_args *args, 1967 struct nfs4_layoutcommit_args *args,
1994 struct compound_hdr *hdr) 1968 struct compound_hdr *hdr)
1995{ 1969{
1996 __be32 *p; 1970 __be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
2011 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1985 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
2012 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1986 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
2013 1987
2014 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) 1988 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
2015 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( 1989 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
2016 NFS_I(inode)->layout, xdr, args); 1990 NFS_I(inode)->layout, xdr, args);
2017 else 1991 } else {
2018 encode_uint32(xdr, 0); /* no layout-type payload */ 1992 encode_uint32(xdr, args->layoutupdate_len);
1993 if (args->layoutupdate_pages) {
1994 xdr_write_pages(xdr, args->layoutupdate_pages, 0,
1995 args->layoutupdate_len);
1996 }
1997 }
2019 1998
2020 return 0; 1999 return 0;
2021} 2000}
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2893} 2872}
2894 2873
2895/* 2874/*
2896 * Encode GETDEVICELIST request
2897 */
2898static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
2899 struct xdr_stream *xdr,
2900 struct nfs4_getdevicelist_args *args)
2901{
2902 struct compound_hdr hdr = {
2903 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2904 };
2905
2906 encode_compound_hdr(xdr, req, &hdr);
2907 encode_sequence(xdr, &args->seq_args, &hdr);
2908 encode_putfh(xdr, args->fh, &hdr);
2909 encode_getdevicelist(xdr, args, &hdr);
2910 encode_nops(&hdr);
2911}
2912
2913/*
2914 * Encode GETDEVICEINFO request 2875 * Encode GETDEVICEINFO request
2915 */ 2876 */
2916static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, 2877static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
5765} 5726}
5766 5727
5767#if defined(CONFIG_NFS_V4_1) 5728#if defined(CONFIG_NFS_V4_1)
5768/*
5769 * TODO: Need to handle case when EOF != true;
5770 */
5771static int decode_getdevicelist(struct xdr_stream *xdr,
5772 struct pnfs_devicelist *res)
5773{
5774 __be32 *p;
5775 int status, i;
5776 nfs4_verifier verftemp;
5777
5778 status = decode_op_hdr(xdr, OP_GETDEVICELIST);
5779 if (status)
5780 return status;
5781
5782 p = xdr_inline_decode(xdr, 8 + 8 + 4);
5783 if (unlikely(!p))
5784 goto out_overflow;
5785
5786 /* TODO: Skip cookie for now */
5787 p += 2;
5788
5789 /* Read verifier */
5790 p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
5791
5792 res->num_devs = be32_to_cpup(p);
5793
5794 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5795
5796 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
5797 printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
5798 __func__, res->num_devs);
5799 return -EIO;
5800 }
5801
5802 p = xdr_inline_decode(xdr,
5803 res->num_devs * NFS4_DEVICEID4_SIZE + 4);
5804 if (unlikely(!p))
5805 goto out_overflow;
5806 for (i = 0; i < res->num_devs; i++)
5807 p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
5808 NFS4_DEVICEID4_SIZE);
5809 res->eof = be32_to_cpup(p);
5810 return 0;
5811out_overflow:
5812 print_overflow_msg(__func__, xdr);
5813 return -EIO;
5814}
5815
5816static int decode_getdeviceinfo(struct xdr_stream *xdr, 5729static int decode_getdeviceinfo(struct xdr_stream *xdr,
5817 struct pnfs_device *pdev) 5730 struct pnfs_device *pdev)
5818{ 5731{
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
5862 p = xdr_inline_decode(xdr, 4 * len); 5775 p = xdr_inline_decode(xdr, 4 * len);
5863 if (unlikely(!p)) 5776 if (unlikely(!p))
5864 goto out_overflow; 5777 goto out_overflow;
5865 for (i = 0; i < len; i++, p++) { 5778
5866 if (be32_to_cpup(p)) { 5779 if (be32_to_cpup(p++) &
5867 dprintk("%s: notifications not supported\n", 5780 ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
5781 dprintk("%s: unsupported notification\n",
5782 __func__);
5783 }
5784
5785 for (i = 1; i < len; i++) {
5786 if (be32_to_cpup(p++)) {
5787 dprintk("%s: unsupported notification\n",
5868 __func__); 5788 __func__);
5869 return -EIO; 5789 return -EIO;
5870 } 5790 }
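
On the decode side, the loop above used to reject any set notification bit; it now masks word 0 of the returned bitmap4 against the two bits that were requested and only treats other bits (or any set bit in a later word) as an error. A standalone sketch of that check, with check_notify_bitmap() as a made-up name:

    #include <stdint.h>
    #include <stdio.h>

    #define NOTIFY_DEVICEID4_CHANGE (1 << 1)
    #define NOTIFY_DEVICEID4_DELETE (1 << 2)

    /* Word 0 may carry the two notification types we asked for; any
     * other bit anywhere in the bitmap is unsupported. */
    static int check_notify_bitmap(const uint32_t *words, unsigned len)
    {
        unsigned i;

        if (len && (words[0] & ~(NOTIFY_DEVICEID4_CHANGE |
                                 NOTIFY_DEVICEID4_DELETE)))
            fprintf(stderr, "unsupported notification in word 0\n");

        for (i = 1; i < len; i++)
            if (words[i])
                return -1;      /* -EIO in the kernel */
        return 0;
    }

    int main(void)
    {
        uint32_t ok[1]  = { NOTIFY_DEVICEID4_CHANGE };
        uint32_t bad[2] = { 0, 0x10 };

        printf("%d %d\n", check_notify_bitmap(ok, 1),
                          check_notify_bitmap(bad, 2));
        return 0;
    }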
@@ -7097,32 +7017,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
7097} 7017}
7098 7018
7099/* 7019/*
7100 * Decode GETDEVICELIST response
7101 */
7102static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
7103 struct xdr_stream *xdr,
7104 struct nfs4_getdevicelist_res *res)
7105{
7106 struct compound_hdr hdr;
7107 int status;
7108
7109 dprintk("encoding getdevicelist!\n");
7110
7111 status = decode_compound_hdr(xdr, &hdr);
7112 if (status != 0)
7113 goto out;
7114 status = decode_sequence(xdr, &res->seq_res, rqstp);
7115 if (status != 0)
7116 goto out;
7117 status = decode_putfh(xdr);
7118 if (status != 0)
7119 goto out;
7120 status = decode_getdevicelist(xdr, res->devlist);
7121out:
7122 return status;
7123}
7124
7125/*
7126 * Decode GETDEVINFO response 7020 * Decode GETDEVINFO response
7127 */ 7021 */
7128static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, 7022static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp,
@@ -7490,7 +7384,6 @@ struct rpc_procinfo nfs4_procedures[] = {
7490 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), 7384 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
7491 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), 7385 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
7492 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), 7386 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
7493 PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
7494 PROC(BIND_CONN_TO_SESSION, 7387 PROC(BIND_CONN_TO_SESSION,
7495 enc_bind_conn_to_session, dec_bind_conn_to_session), 7388 enc_bind_conn_to_session, dec_bind_conn_to_session),
7496 PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), 7389 PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index ae05278b3761..c6e4bda63000 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60 kfree(de); 60 kfree(de);
61} 61}
62 62
63static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
64 const struct nfs4_deviceid *d_id)
65{
66 struct nfs4_deviceid_node *d;
67 struct objio_dev_ent *de;
68
69 d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
70 if (!d)
71 return NULL;
72
73 de = container_of(d, struct objio_dev_ent, id_node);
74 return de;
75}
76
77static struct objio_dev_ent *
78_dev_list_add(const struct nfs_server *nfss,
79 const struct nfs4_deviceid *d_id, struct osd_dev *od,
80 gfp_t gfp_flags)
81{
82 struct nfs4_deviceid_node *d;
83 struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
84 struct objio_dev_ent *n;
85
86 if (!de) {
87 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
88 return NULL;
89 }
90
91 dprintk("%s: Adding od=%p\n", __func__, od);
92 nfs4_init_deviceid_node(&de->id_node,
93 nfss->pnfs_curr_ld,
94 nfss->nfs_client,
95 d_id);
96 de->od.od = od;
97
98 d = nfs4_insert_deviceid_node(&de->id_node);
99 n = container_of(d, struct objio_dev_ent, id_node);
100 if (n != de) {
101 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
102 objio_free_deviceid_node(&de->id_node);
103 de = n;
104 }
105
106 return de;
107}
108
109struct objio_segment { 63struct objio_segment {
110 struct pnfs_layout_segment lseg; 64 struct pnfs_layout_segment lseg;
111 65
@@ -130,29 +84,24 @@ struct objio_state {
130 84
131/* Send and wait for a get_device_info of devices in the layout, 85/* Send and wait for a get_device_info of devices in the layout,
132 then look them up with the osd_initiator library */ 86 then look them up with the osd_initiator library */
133static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 87struct nfs4_deviceid_node *
134 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, 88objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
135 gfp_t gfp_flags) 89 gfp_t gfp_flags)
136{ 90{
137 struct pnfs_osd_deviceaddr *deviceaddr; 91 struct pnfs_osd_deviceaddr *deviceaddr;
138 struct objio_dev_ent *ode; 92 struct objio_dev_ent *ode = NULL;
139 struct osd_dev *od; 93 struct osd_dev *od;
140 struct osd_dev_info odi; 94 struct osd_dev_info odi;
141 bool retry_flag = true; 95 bool retry_flag = true;
96 __be32 *p;
142 int err; 97 int err;
143 98
144 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 99 deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
145 if (ode) { 100 if (!deviceaddr)
146 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ 101 return NULL;
147 return 0;
148 }
149 102
150 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 103 p = page_address(pdev->pages[0]);
151 if (unlikely(err)) { 104 pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
152 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
153 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
154 return err;
155 }
156 105
157 odi.systemid_len = deviceaddr->oda_systemid.len; 106 odi.systemid_len = deviceaddr->oda_systemid.len;
158 if (odi.systemid_len > sizeof(odi.systemid)) { 107 if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
188 goto out; 137 goto out;
189 } 138 }
190 139
191 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
192 gfp_flags);
193 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
194 dprintk("Adding new dev_id(%llx:%llx)\n", 140 dprintk("Adding new dev_id(%llx:%llx)\n",
195 _DEVID_LO(d_id), _DEVID_HI(d_id)); 141 _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
142
143 ode = kzalloc(sizeof(*ode), gfp_flags);
144 if (!ode) {
145 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
146 goto out;
147 }
148
149 nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
150 kfree(deviceaddr);
151
152 ode->od.od = od;
153 return &ode->id_node;
154
196out: 155out:
197 objlayout_put_deviceinfo(deviceaddr); 156 kfree(deviceaddr);
198 return err; 157 return NULL;
199} 158}
200 159
201static void copy_single_comp(struct ore_components *oc, unsigned c, 160static void copy_single_comp(struct ore_components *oc, unsigned c,
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
254 struct xdr_stream *xdr, 213 struct xdr_stream *xdr,
255 gfp_t gfp_flags) 214 gfp_t gfp_flags)
256{ 215{
216 struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
257 struct objio_segment *objio_seg; 217 struct objio_segment *objio_seg;
258 struct pnfs_osd_xdr_decode_layout_iter iter; 218 struct pnfs_osd_xdr_decode_layout_iter iter;
259 struct pnfs_osd_layout layout; 219 struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
283 objio_seg->oc.first_dev = layout.olo_comps_index; 243 objio_seg->oc.first_dev = layout.olo_comps_index;
284 cur_comp = 0; 244 cur_comp = 0;
285 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { 245 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
246 struct nfs4_deviceid_node *d;
247 struct objio_dev_ent *ode;
248
286 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); 249 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
287 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, 250
288 &src_comp.oc_object_id.oid_device_id, 251 d = nfs4_find_get_deviceid(server,
289 gfp_flags); 252 &src_comp.oc_object_id.oid_device_id,
290 if (err) 253 pnfslay->plh_lc_cred, gfp_flags);
254 if (!d) {
255 err = -ENXIO;
291 goto err; 256 goto err;
292 ++cur_comp; 257 }
258
259 ode = container_of(d, struct objio_dev_ent, id_node);
260 objio_seg->oc.ods[cur_comp++] = &ode->od;
293 } 261 }
294 /* pnfs_osd_xdr_decode_layout_comp returns false on error */ 262 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
295 if (unlikely(err)) 263 if (unlikely(err))
@@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
653 .flags = PNFS_LAYOUTRET_ON_SETATTR | 621 .flags = PNFS_LAYOUTRET_ON_SETATTR |
654 PNFS_LAYOUTRET_ON_ERROR, 622 PNFS_LAYOUTRET_ON_ERROR,
655 623
624 .max_deviceinfo_size = PAGE_SIZE,
656 .owner = THIS_MODULE, 625 .owner = THIS_MODULE,
657 .alloc_layout_hdr = objlayout_alloc_layout_hdr, 626 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
658 .free_layout_hdr = objlayout_free_layout_hdr, 627 .free_layout_hdr = objlayout_free_layout_hdr,
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 697a16d11fac..c89357c7a914 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -574,76 +574,6 @@ loop_done:
574 dprintk("%s: Return\n", __func__); 574 dprintk("%s: Return\n", __func__);
575} 575}
576 576
577
578/*
579 * Get Device Info API for io engines
580 */
581struct objlayout_deviceinfo {
582 struct page *page;
583 struct pnfs_osd_deviceaddr da; /* This must be last */
584};
585
586/* Initialize and call nfs_getdeviceinfo, then decode and return a
587 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
588 * should be called.
589 */
590int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
591 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
592 gfp_t gfp_flags)
593{
594 struct objlayout_deviceinfo *odi;
595 struct pnfs_device pd;
596 struct page *page, **pages;
597 u32 *p;
598 int err;
599
600 page = alloc_page(gfp_flags);
601 if (!page)
602 return -ENOMEM;
603
604 pages = &page;
605 pd.pages = pages;
606
607 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
608 pd.layout_type = LAYOUT_OSD2_OBJECTS;
609 pd.pages = &page;
610 pd.pgbase = 0;
611 pd.pglen = PAGE_SIZE;
612 pd.mincount = 0;
613 pd.maxcount = PAGE_SIZE;
614
615 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
616 pnfslay->plh_lc_cred);
617 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
618 if (err)
619 goto err_out;
620
621 p = page_address(page);
622 odi = kzalloc(sizeof(*odi), gfp_flags);
623 if (!odi) {
624 err = -ENOMEM;
625 goto err_out;
626 }
627 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
628 odi->page = page;
629 *deviceaddr = &odi->da;
630 return 0;
631
632err_out:
633 __free_page(page);
634 return err;
635}
636
637void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
638{
639 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
640 struct objlayout_deviceinfo,
641 da);
642
643 __free_page(odi->page);
644 kfree(odi);
645}
646
647enum { 577enum {
648 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, 578 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
649 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, 579 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index fd13f1d2f136..3a0828d57339 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
149extern void objlayout_write_done(struct objlayout_io_res *oir, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
153 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
154 gfp_t gfp_flags);
155extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
156
157/* 152/*
158 * exported generic objects function vectors 153 * exported generic objects function vectors
159 */ 154 */
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index ba491926df5f..94e16ec88312 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -116,7 +116,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c)
116 if (atomic_read(&c->io_count) == 0) 116 if (atomic_read(&c->io_count) == 0)
117 break; 117 break;
118 ret = nfs_wait_bit_killable(&q.key); 118 ret = nfs_wait_bit_killable(&q.key);
119 } while (atomic_read(&c->io_count) != 0); 119 } while (atomic_read(&c->io_count) != 0 && !ret);
120 finish_wait(wq, &q.wait); 120 finish_wait(wq, &q.wait);
121 return ret; 121 return ret;
122} 122}
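
The __nfs_iocounter_wait() hunk is a one-character logic fix: nfs_wait_bit_killable() can return an error when a fatal signal arrives, and without "&& !ret" the do/while discarded that result and spun again. A compilable sketch of the corrected loop shape, where wait_killable() is a stub standing in for the killable wait:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Stand-in for nfs_wait_bit_killable(): pretend a fatal signal hit. */
    static int wait_killable(void) { return -512; /* -ERESTARTSYS */ }

    /* Without "&& !ret" a fatal signal would leave ret set but loop
     * again instead of bailing out. */
    static int iocounter_wait(atomic_int *io_count)
    {
        int ret = 0;

        do {
            if (atomic_load(io_count) == 0)
                break;
            ret = wait_killable();
        } while (atomic_load(io_count) != 0 && !ret);

        return ret;
    }

    int main(void)
    {
        atomic_int busy = 1;

        printf("ret=%d\n", iocounter_wait(&busy)); /* terminates, -512 */
        return 0;
    }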
@@ -139,26 +139,49 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
139/* 139/*
140 * nfs_page_group_lock - lock the head of the page group 140 * nfs_page_group_lock - lock the head of the page group
141 * @req - request in group that is to be locked 141 * @req - request in group that is to be locked
142 * @nonblock - if true don't block waiting for lock
142 * 143 *
143 * this lock must be held if modifying the page group list 144 * this lock must be held if modifying the page group list
144 * 145 *
145 returns result from wait_on_bit_lock: 0 on success, < 0 on error 146 return 0 on success, < 0 on error: -EAGAIN if nonblocking or the
147 * result from wait_on_bit_lock
148 *
149 * NOTE: calling with nonblock=false should always have set the
150 * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock
151 * with TASK_UNINTERRUPTIBLE), so there is no need to check the result.
146 */ 152 */
147int 153int
148nfs_page_group_lock(struct nfs_page *req, bool wait) 154nfs_page_group_lock(struct nfs_page *req, bool nonblock)
149{ 155{
150 struct nfs_page *head = req->wb_head; 156 struct nfs_page *head = req->wb_head;
151 int ret;
152 157
153 WARN_ON_ONCE(head != head->wb_head); 158 WARN_ON_ONCE(head != head->wb_head);
154 159
155 do { 160 if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
156 ret = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, 161 return 0;
157 TASK_UNINTERRUPTIBLE);
158 } while (wait && ret != 0);
159 162
160 WARN_ON_ONCE(ret > 0); 163 if (!nonblock)
161 return ret; 164 return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
165 TASK_UNINTERRUPTIBLE);
166
167 return -EAGAIN;
168}
169
170/*
171 * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
172 * @req - a request in the group
173 *
174 * This is a blocking call to wait for the group lock to be cleared.
175 */
176void
177nfs_page_group_lock_wait(struct nfs_page *req)
178{
179 struct nfs_page *head = req->wb_head;
180
181 WARN_ON_ONCE(head != head->wb_head);
182
183 wait_on_bit(&head->wb_flags, PG_HEADLOCK,
184 TASK_UNINTERRUPTIBLE);
162} 185}
163 186
164/* 187/*
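
nfs_page_group_lock() now tries test_and_set_bit() first, so the common uncontended case never enters the wait path; contended callers either sleep in wait_on_bit_lock() or, when nonblock is set, get -EAGAIN back and can drop their other locks before waiting via nfs_page_group_lock_wait(). A userspace analogue of the protocol, with an atomic_flag and sched_yield() standing in for the page-bit wait queue:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <errno.h>
    #include <sched.h>
    #include <stdio.h>

    static atomic_flag headlock = ATOMIC_FLAG_INIT;

    static int group_lock(bool nonblock)
    {
        if (!atomic_flag_test_and_set(&headlock))
            return 0;               /* fast path: got it */
        if (nonblock)
            return -EAGAIN;         /* caller still holds other locks */
        while (atomic_flag_test_and_set(&headlock))
            sched_yield();          /* wait_on_bit_lock() stand-in */
        return 0;
    }

    static void group_unlock(void)
    {
        atomic_flag_clear(&headlock);
    }

    int main(void)
    {
        group_lock(false);                            /* take it */
        printf("nonblock: %d\n", group_lock(true));   /* -EAGAIN */
        group_unlock();
        printf("retry:    %d\n", group_lock(true));   /* 0 */
        return 0;
    }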
@@ -219,7 +242,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
219{ 242{
220 bool ret; 243 bool ret;
221 244
222 nfs_page_group_lock(req, true); 245 nfs_page_group_lock(req, false);
223 ret = nfs_page_group_sync_on_bit_locked(req, bit); 246 ret = nfs_page_group_sync_on_bit_locked(req, bit);
224 nfs_page_group_unlock(req); 247 nfs_page_group_unlock(req);
225 248
@@ -458,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
458 return 0; 481 return 0;
459 } 482 }
460 483
484 /*
485 * Limit the request size so that we can still allocate a page array
486 * for it without upsetting the slab allocator.
487 */
488 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
489 sizeof(struct page *) > PAGE_SIZE)
490 return 0;
491
461 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 492 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
462} 493}
463EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 494EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
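
The new size check caps a coalesced request so that the page-pointer array nfs_generic_pgio() later kmallocs still fits in a single page — with 4 KiB pages and 8-byte pointers that is 512 pages, i.e. a 2 MiB I/O. A small sketch of the arithmetic, with pagevec_too_big() as a made-up helper:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Would an I/O of (count + extra) bytes need a page-pointer array
     * that no longer fits in one page? Assumes 8-byte pointers, as on
     * 64-bit kernels. */
    static int pagevec_too_big(unsigned long count, unsigned long extra)
    {
        return ((count + extra) >> PAGE_SHIFT) * sizeof(void *) > PAGE_SIZE;
    }

    int main(void)
    {
        /* 512 pages -> 512 * 8 == 4096 == PAGE_SIZE: still allowed */
        printf("%d\n", pagevec_too_big(511 * PAGE_SIZE, PAGE_SIZE));
        /* 513 pages -> 4104 bytes of pointers: rejected */
        printf("%d\n", pagevec_too_big(512 * PAGE_SIZE, PAGE_SIZE));
        return 0;
    }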
@@ -701,10 +732,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
701 struct nfs_pgio_header *hdr) 732 struct nfs_pgio_header *hdr)
702{ 733{
703 struct nfs_page *req; 734 struct nfs_page *req;
704 struct page **pages; 735 struct page **pages,
736 *last_page;
705 struct list_head *head = &desc->pg_list; 737 struct list_head *head = &desc->pg_list;
706 struct nfs_commit_info cinfo; 738 struct nfs_commit_info cinfo;
707 unsigned int pagecount; 739 unsigned int pagecount, pageused;
708 740
709 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); 741 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
710 if (!nfs_pgarray_set(&hdr->page_array, pagecount)) 742 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
@@ -712,12 +744,23 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
712 744
713 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); 745 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
714 pages = hdr->page_array.pagevec; 746 pages = hdr->page_array.pagevec;
747 last_page = NULL;
748 pageused = 0;
715 while (!list_empty(head)) { 749 while (!list_empty(head)) {
716 req = nfs_list_entry(head->next); 750 req = nfs_list_entry(head->next);
717 nfs_list_remove_request(req); 751 nfs_list_remove_request(req);
718 nfs_list_add_request(req, &hdr->pages); 752 nfs_list_add_request(req, &hdr->pages);
719 *pages++ = req->wb_page; 753
754 if (WARN_ON_ONCE(pageused >= pagecount))
755 return nfs_pgio_error(desc, hdr);
756
757 if (!last_page || last_page != req->wb_page) {
758 *pages++ = last_page = req->wb_page;
759 pageused++;
760 }
720 } 761 }
762 if (WARN_ON_ONCE(pageused != pagecount))
763 return nfs_pgio_error(desc, hdr);
721 764
722 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 765 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
723 (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) 766 (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
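
Because page groups can now contain several subpage requests sharing one struct page, the pagevec fill above tracks last_page and stores each page only once, with the WARN_ON_ONCE() checks catching any mismatch against the precomputed pagecount. The same dedup logic in isolation — fill_pagevec() is illustrative, not kernel API:

    #include <stdio.h>

    /* Fill a page vector from an ordered request list, storing each
     * page once even when consecutive requests point at the same page. */
    static size_t fill_pagevec(const void **vec, size_t max,
                               const void *const *req_pages, size_t nreq)
    {
        const void *last = NULL;
        size_t used = 0, i;

        for (i = 0; i < nreq; i++) {
            if (req_pages[i] == last)
                continue;               /* subrequest on the same page */
            if (used >= max)
                return (size_t)-1;      /* kernel: WARN_ON_ONCE + error */
            vec[used++] = last = req_pages[i];
        }
        return used;
    }

    int main(void)
    {
        int a, b;
        const void *reqs[] = { &a, &a, &b }; /* two subrequests share a */
        const void *vec[3];

        printf("%zu\n", fill_pagevec(vec, 3, reqs, 3)); /* 2 */
        return 0;
    }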
@@ -788,6 +831,14 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
788 return false; 831 return false;
789 if (req_offset(req) != req_offset(prev) + prev->wb_bytes) 832 if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
790 return false; 833 return false;
834 if (req->wb_page == prev->wb_page) {
835 if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
836 return false;
837 } else {
838 if (req->wb_pgbase != 0 ||
839 prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
840 return false;
841 }
791 } 842 }
792 size = pgio->pg_ops->pg_test(pgio, prev, req); 843 size = pgio->pg_ops->pg_test(pgio, prev, req);
793 WARN_ON_ONCE(size > req->wb_bytes); 844 WARN_ON_ONCE(size > req->wb_bytes);
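
The added coalescing rule spells out byte contiguity for subpage requests: within one page the new request must start exactly where the previous one ended, and across a page boundary the previous request must end at PAGE_CACHE_SIZE with the new one starting at offset 0. A self-contained rendering of the predicate:

    #include <stdio.h>
    #include <stdbool.h>

    #define PAGE_CACHE_SIZE 4096u

    struct req { const void *page; unsigned pgbase, bytes; };

    /* Contiguity rule added to nfs_can_coalesce_requests() above. */
    static bool contiguous(const struct req *prev, const struct req *req)
    {
        if (req->page == prev->page)
            return req->pgbase == prev->pgbase + prev->bytes;
        return req->pgbase == 0 &&
               prev->pgbase + prev->bytes == PAGE_CACHE_SIZE;
    }

    int main(void)
    {
        int a, b;
        struct req p = { &a, 0, 512 },    q = { &a, 512, 512 };
        struct req r = { &a, 3584, 512 }, s = { &b, 0, 512 };

        printf("%d %d\n", contiguous(&p, &q), contiguous(&r, &s)); /* 1 1 */
        return 0;
    }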
@@ -858,13 +909,8 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
858 struct nfs_page *subreq; 909 struct nfs_page *subreq;
859 unsigned int bytes_left = 0; 910 unsigned int bytes_left = 0;
860 unsigned int offset, pgbase; 911 unsigned int offset, pgbase;
861 int ret;
862 912
863 ret = nfs_page_group_lock(req, false); 913 nfs_page_group_lock(req, false);
864 if (ret < 0) {
865 desc->pg_error = ret;
866 return 0;
867 }
868 914
869 subreq = req; 915 subreq = req;
870 bytes_left = subreq->wb_bytes; 916 bytes_left = subreq->wb_bytes;
@@ -886,11 +932,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
886 if (desc->pg_recoalesce) 932 if (desc->pg_recoalesce)
887 return 0; 933 return 0;
888 /* retry add_request for this subreq */ 934 /* retry add_request for this subreq */
889 ret = nfs_page_group_lock(req, false); 935 nfs_page_group_lock(req, false);
890 if (ret < 0) {
891 desc->pg_error = ret;
892 return 0;
893 }
894 continue; 936 continue;
895 } 937 }
896 938
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index a3851debf8a2..76de7f568119 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -594,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
594 dprintk("%s freeing layout for inode %lu\n", __func__, 594 dprintk("%s freeing layout for inode %lu\n", __func__,
595 lo->plh_inode->i_ino); 595 lo->plh_inode->i_ino);
596 inode = lo->plh_inode; 596 inode = lo->plh_inode;
597
598 pnfs_layoutcommit_inode(inode, false);
599
597 spin_lock(&inode->i_lock); 600 spin_lock(&inode->i_lock);
598 list_del_init(&lo->plh_bulk_destroy); 601 list_del_init(&lo->plh_bulk_destroy);
599 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 602 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -682,17 +685,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
682 return (s32)(s1 - s2) > 0; 685 return (s32)(s1 - s2) > 0;
683} 686}
684 687
685static void
686pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
687 const nfs4_stateid *new,
688 struct list_head *free_me_list)
689{
690 if (nfs4_stateid_match_other(&lo->plh_stateid, new))
691 return;
692 /* Layout is new! Kill existing layout segments */
693 pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
694}
695
696/* update lo->plh_stateid with new if is more recent */ 688/* update lo->plh_stateid with new if is more recent */
697void 689void
698pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 690pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -749,7 +741,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
749 status = -EAGAIN; 741 status = -EAGAIN;
750 } else if (!nfs4_valid_open_stateid(open_state)) { 742 } else if (!nfs4_valid_open_stateid(open_state)) {
751 status = -EBADF; 743 status = -EBADF;
752 } else if (list_empty(&lo->plh_segs)) { 744 } else if (list_empty(&lo->plh_segs) ||
745 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
753 int seq; 746 int seq;
754 747
755 do { 748 do {
@@ -864,6 +857,16 @@ _pnfs_return_layout(struct inode *ino)
864 empty = list_empty(&lo->plh_segs); 857 empty = list_empty(&lo->plh_segs);
865 pnfs_clear_layoutcommit(ino, &tmp_list); 858 pnfs_clear_layoutcommit(ino, &tmp_list);
866 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 859 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
860
861 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
862 struct pnfs_layout_range range = {
863 .iomode = IOMODE_ANY,
864 .offset = 0,
865 .length = NFS4_MAX_UINT64,
866 };
867 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
868 }
869
867 /* Don't send a LAYOUTRETURN if list was initially empty */ 870 /* Don't send a LAYOUTRETURN if list was initially empty */
868 if (empty) { 871 if (empty) {
869 spin_unlock(&ino->i_lock); 872 spin_unlock(&ino->i_lock);
@@ -871,6 +874,8 @@ _pnfs_return_layout(struct inode *ino)
871 dprintk("NFS: %s no layout segments to return\n", __func__); 874 dprintk("NFS: %s no layout segments to return\n", __func__);
872 goto out; 875 goto out;
873 } 876 }
877
878 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
874 lo->plh_block_lgets++; 879 lo->plh_block_lgets++;
875 spin_unlock(&ino->i_lock); 880 spin_unlock(&ino->i_lock);
876 pnfs_free_lseg_list(&tmp_list); 881 pnfs_free_lseg_list(&tmp_list);
@@ -1358,25 +1363,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1358 goto out; 1363 goto out;
1359 } 1364 }
1360 1365
1366 init_lseg(lo, lseg);
1367 lseg->pls_range = res->range;
1368
1361 spin_lock(&ino->i_lock); 1369 spin_lock(&ino->i_lock);
1362 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1370 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1363 dprintk("%s forget reply due to recall\n", __func__); 1371 dprintk("%s forget reply due to recall\n", __func__);
1364 goto out_forget_reply; 1372 goto out_forget_reply;
1365 } 1373 }
1366 1374
1367 if (pnfs_layoutgets_blocked(lo, 1) || 1375 if (pnfs_layoutgets_blocked(lo, 1)) {
1368 pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1369 dprintk("%s forget reply due to state\n", __func__); 1376 dprintk("%s forget reply due to state\n", __func__);
1370 goto out_forget_reply; 1377 goto out_forget_reply;
1371 } 1378 }
1372 1379
1373 /* Check that the new stateid matches the old stateid */ 1380 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1374 pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); 1381 /* existing state ID, make sure the sequence number matches. */
1375 /* Done processing layoutget. Set the layout stateid */ 1382 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1376 pnfs_set_layout_stateid(lo, &res->stateid, false); 1383 dprintk("%s forget reply due to sequence\n", __func__);
1384 goto out_forget_reply;
1385 }
1386 pnfs_set_layout_stateid(lo, &res->stateid, false);
1387 } else {
1388 /*
1389 * We got an entirely new state ID. Mark all segments for the
1390 * inode invalid, and don't bother validating the stateid
1391 * sequence number.
1392 */
1393 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
1394
1395 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1396 lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1397 }
1398
1399 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1377 1400
1378 init_lseg(lo, lseg);
1379 lseg->pls_range = res->range;
1380 pnfs_get_lseg(lseg); 1401 pnfs_get_lseg(lseg);
1381 pnfs_layout_insert_lseg(lo, lseg); 1402 pnfs_layout_insert_lseg(lo, lseg);
1382 1403
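
The reworked pnfs_layout_process() splits the "same stateid, verify the seqid" case from the "entirely new stateid" case, in which all cached segments are invalidated and the barrier is reset. The seqid comparison it relies on (pnfs_seqid_is_newer() in the context above) is plain serial-number arithmetic, shown standalone here:

    #include <stdint.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* The subtraction is evaluated modulo 2^32, so the comparison
     * stays correct across seqid wraparound. */
    static bool seqid_is_newer(uint32_t s1, uint32_t s2)
    {
        return (int32_t)(s1 - s2) > 0;
    }

    int main(void)
    {
        printf("%d\n", seqid_is_newer(2, 1));          /* 1 */
        printf("%d\n", seqid_is_newer(0, UINT32_MAX)); /* 1: 0 follows 2^32-1 */
        printf("%d\n", seqid_is_newer(1, 2));          /* 0 */
        return 0;
    }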
@@ -1797,6 +1818,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
1797} 1818}
1798EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); 1819EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
1799 1820
1821void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
1822{
1823 struct inode *inode = data->inode;
1824 struct nfs_inode *nfsi = NFS_I(inode);
1825 bool mark_as_dirty = false;
1826
1827 spin_lock(&inode->i_lock);
1828 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1829 mark_as_dirty = true;
1830 dprintk("%s: Set layoutcommit for inode %lu ",
1831 __func__, inode->i_ino);
1832 }
1833 if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
1834 /* references matched in nfs4_layoutcommit_release */
1835 pnfs_get_lseg(data->lseg);
1836 }
1837 if (data->lwb > nfsi->layout->plh_lwb)
1838 nfsi->layout->plh_lwb = data->lwb;
1839 spin_unlock(&inode->i_lock);
1840 dprintk("%s: lseg %p end_pos %llu\n",
1841 __func__, data->lseg, nfsi->layout->plh_lwb);
1842
1843 /* if pnfs_layoutcommit_inode() runs between inode locks, the next one
1844 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
1845 if (mark_as_dirty)
1846 mark_inode_dirty_sync(inode);
1847}
1848EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
1849
1800void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) 1850void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1801{ 1851{
1802 struct nfs_server *nfss = NFS_SERVER(data->args.inode); 1852 struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1817,6 +1867,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
1817int 1867int
1818pnfs_layoutcommit_inode(struct inode *inode, bool sync) 1868pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1819{ 1869{
1870 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1820 struct nfs4_layoutcommit_data *data; 1871 struct nfs4_layoutcommit_data *data;
1821 struct nfs_inode *nfsi = NFS_I(inode); 1872 struct nfs_inode *nfsi = NFS_I(inode);
1822 loff_t end_pos; 1873 loff_t end_pos;
@@ -1867,6 +1918,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
1867 data->args.lastbytewritten = end_pos - 1; 1918 data->args.lastbytewritten = end_pos - 1;
1868 data->res.server = NFS_SERVER(inode); 1919 data->res.server = NFS_SERVER(inode);
1869 1920
1921 if (ld->prepare_layoutcommit) {
1922 status = ld->prepare_layoutcommit(&data->args);
1923 if (status) {
1924 spin_lock(&inode->i_lock);
1925 if (end_pos < nfsi->layout->plh_lwb)
1926 nfsi->layout->plh_lwb = end_pos;
1927 spin_unlock(&inode->i_lock);
1928 put_rpccred(data->cred);
1929 set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
1930 goto clear_layoutcommitting;
1931 }
1932 }
1933
1934
1870 status = nfs4_proc_layoutcommit(data, sync); 1935 status = nfs4_proc_layoutcommit(data, sync);
1871out: 1936out:
1872 if (status) 1937 if (status)
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index aca3dff5dae6..693ce42ec683 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -65,12 +65,15 @@ enum {
65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
68 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
68}; 69};
69 70
70enum layoutdriver_policy_flags { 71enum layoutdriver_policy_flags {
71 /* Should the pNFS client commit and return the layout upon a setattr */ 72 /* Should the pNFS client commit and return the layout upon truncate to
73 * a smaller size */
72 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, 74 PNFS_LAYOUTRET_ON_SETATTR = 1 << 0,
73 PNFS_LAYOUTRET_ON_ERROR = 1 << 1, 75 PNFS_LAYOUTRET_ON_ERROR = 1 << 1,
76 PNFS_READ_WHOLE_PAGE = 1 << 2,
74}; 77};
75 78
76struct nfs4_deviceid_node; 79struct nfs4_deviceid_node;
@@ -82,6 +85,7 @@ struct pnfs_layoutdriver_type {
82 const char *name; 85 const char *name;
83 struct module *owner; 86 struct module *owner;
84 unsigned flags; 87 unsigned flags;
88 unsigned max_deviceinfo_size;
85 89
86 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); 90 int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
87 int (*clear_layoutdriver) (struct nfs_server *); 91 int (*clear_layoutdriver) (struct nfs_server *);
@@ -92,6 +96,9 @@ struct pnfs_layoutdriver_type {
92 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 96 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
93 void (*free_lseg) (struct pnfs_layout_segment *lseg); 97 void (*free_lseg) (struct pnfs_layout_segment *lseg);
94 98
99 void (*return_range) (struct pnfs_layout_hdr *lo,
100 struct pnfs_layout_range *range);
101
95 /* test for nfs page cache coalescing */ 102 /* test for nfs page cache coalescing */
96 const struct nfs_pageio_ops *pg_read_ops; 103 const struct nfs_pageio_ops *pg_read_ops;
97 const struct nfs_pageio_ops *pg_write_ops; 104 const struct nfs_pageio_ops *pg_write_ops;
@@ -121,14 +128,17 @@ struct pnfs_layoutdriver_type {
121 enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int); 128 enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
122 129
123 void (*free_deviceid_node) (struct nfs4_deviceid_node *); 130 void (*free_deviceid_node) (struct nfs4_deviceid_node *);
131 struct nfs4_deviceid_node * (*alloc_deviceid_node)
132 (struct nfs_server *server, struct pnfs_device *pdev,
133 gfp_t gfp_flags);
124 134
125 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, 135 void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
126 struct xdr_stream *xdr, 136 struct xdr_stream *xdr,
127 const struct nfs4_layoutreturn_args *args); 137 const struct nfs4_layoutreturn_args *args);
128 138
129 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); 139 void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
130 140 int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
131 void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, 141 void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
132 struct xdr_stream *xdr, 142 struct xdr_stream *xdr,
133 const struct nfs4_layoutcommit_args *args); 143 const struct nfs4_layoutcommit_args *args);
134}; 144};
@@ -171,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
171extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 181extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
172 182
173/* nfs4proc.c */ 183/* nfs4proc.c */
174extern int nfs4_proc_getdevicelist(struct nfs_server *server,
175 const struct nfs_fh *fh,
176 struct pnfs_devicelist *devlist);
177extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, 184extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
178 struct pnfs_device *dev, 185 struct pnfs_device *dev,
179 struct rpc_cred *cred); 186 struct rpc_cred *cred);
@@ -219,6 +226,7 @@ void pnfs_roc_release(struct inode *ino);
219void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 226void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
220bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); 227bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
221void pnfs_set_layoutcommit(struct nfs_pgio_header *); 228void pnfs_set_layoutcommit(struct nfs_pgio_header *);
229void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
222void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); 230void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
223int pnfs_layoutcommit_inode(struct inode *inode, bool sync); 231int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
224int _pnfs_return_layout(struct inode *); 232int _pnfs_return_layout(struct inode *);
@@ -255,11 +263,12 @@ struct nfs4_deviceid_node {
255 atomic_t ref; 263 atomic_t ref;
256}; 264};
257 265
258struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 266struct nfs4_deviceid_node *
267nfs4_find_get_deviceid(struct nfs_server *server,
268 const struct nfs4_deviceid *id, struct rpc_cred *cred,
269 gfp_t gfp_mask);
259void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); 270void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
260void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, 271void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
261 const struct pnfs_layoutdriver_type *,
262 const struct nfs_client *,
263 const struct nfs4_deviceid *); 272 const struct nfs4_deviceid *);
264struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); 273struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
265bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); 274bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
@@ -267,6 +276,13 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
267bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 276bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
268void nfs4_deviceid_purge_client(const struct nfs_client *); 277void nfs4_deviceid_purge_client(const struct nfs_client *);
269 278
279static inline struct nfs4_deviceid_node *
280nfs4_get_deviceid(struct nfs4_deviceid_node *d)
281{
282 atomic_inc(&d->ref);
283 return d;
284}
285
270static inline struct pnfs_layout_segment * 286static inline struct pnfs_layout_segment *
271pnfs_get_lseg(struct pnfs_layout_segment *lseg) 287pnfs_get_lseg(struct pnfs_layout_segment *lseg)
272{ 288{
@@ -368,6 +384,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
368} 384}
369 385
370static inline bool 386static inline bool
387pnfs_ld_read_whole_page(struct inode *inode)
388{
389 if (!pnfs_enabled_sb(NFS_SERVER(inode)))
390 return false;
391 return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
392}
393
394static inline bool
371pnfs_layoutcommit_outstanding(struct inode *inode) 395pnfs_layoutcommit_outstanding(struct inode *inode)
372{ 396{
373 struct nfs_inode *nfsi = NFS_I(inode); 397 struct nfs_inode *nfsi = NFS_I(inode);
@@ -443,6 +467,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
443} 467}
444 468
445static inline bool 469static inline bool
470pnfs_ld_read_whole_page(struct inode *inode)
471{
472 return false;
473}
474
475static inline bool
446pnfs_roc(struct inode *ino) 476pnfs_roc(struct inode *ino)
447{ 477{
448 return false; 478 return false;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
29 */ 29 */
30 30
31#include <linux/export.h> 31#include <linux/export.h>
32#include <linux/nfs_fs.h>
33#include "nfs4session.h"
34#include "internal.h"
32#include "pnfs.h" 35#include "pnfs.h"
33 36
34#define NFSDBG_FACILITY NFSDBG_PNFS 37#define NFSDBG_FACILITY NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
89 return NULL; 92 return NULL;
90} 93}
91 94
95static struct nfs4_deviceid_node *
96nfs4_get_device_info(struct nfs_server *server,
97 const struct nfs4_deviceid *dev_id,
98 struct rpc_cred *cred, gfp_t gfp_flags)
99{
100 struct nfs4_deviceid_node *d = NULL;
101 struct pnfs_device *pdev = NULL;
102 struct page **pages = NULL;
103 u32 max_resp_sz;
104 int max_pages;
105 int rc, i;
106
107 /*
108 * Use the session max response size as the basis for setting
109 * GETDEVICEINFO's maxcount
110 */
111 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
112 if (server->pnfs_curr_ld->max_deviceinfo_size &&
113 server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
114 max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
115 max_pages = nfs_page_array_len(0, max_resp_sz);
116 dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
117 __func__, server, max_resp_sz, max_pages);
118
119 pdev = kzalloc(sizeof(*pdev), gfp_flags);
120 if (!pdev)
121 return NULL;
122
123 pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
124 if (!pages)
125 goto out_free_pdev;
126
127 for (i = 0; i < max_pages; i++) {
128 pages[i] = alloc_page(gfp_flags);
129 if (!pages[i])
130 goto out_free_pages;
131 }
132
133 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
134 pdev->layout_type = server->pnfs_curr_ld->id;
135 pdev->pages = pages;
136 pdev->pgbase = 0;
137 pdev->pglen = max_resp_sz;
138 pdev->mincount = 0;
139 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
140
141 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
142 dprintk("%s getdevice info returns %d\n", __func__, rc);
143 if (rc)
144 goto out_free_pages;
145
146 /*
147 * Found new device, need to decode it and then add it to the
148 * list of known devices for this mountpoint.
149 */
150 d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
151 gfp_flags);
152
153out_free_pages:
154 for (i = 0; i < max_pages; i++)
155 __free_page(pages[i]);
156 kfree(pages);
157out_free_pdev:
158 kfree(pdev);
159 dprintk("<-- %s d %p\n", __func__, d);
160 return d;
161}
162
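
nfs4_get_device_info() sizes the GETDEVICEINFO reply buffer from the session's maximum response size, lets the layout driver cap it via the new max_deviceinfo_size field (objlayout sets PAGE_SIZE), and allocates that many pages up front. A rough userspace model of the sizing and cleanup — page_array_len() mirrors nfs_page_array_len(), and malloc/free stand in for alloc_page()/__free_page():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define PAGE_SIZE 4096u

    /* Pages needed for 'len' bytes starting at offset 'base'. */
    static unsigned page_array_len(unsigned base, unsigned len)
    {
        return (base + len + PAGE_SIZE - 1) / PAGE_SIZE;
    }

    int main(void)
    {
        unsigned max_resp_sz = 1024 * 1024; /* session fore-channel attr */
        unsigned ld_cap = PAGE_SIZE;        /* driver's max_deviceinfo_size */
        unsigned max_pages, i;
        char **pages;

        if (ld_cap && ld_cap < max_resp_sz)
            max_resp_sz = ld_cap;
        max_pages = page_array_len(0, max_resp_sz);
        printf("max_resp_sz %u max_pages %u\n", max_resp_sz, max_pages);

        pages = calloc(max_pages, sizeof(*pages));
        if (!pages)
            return 1;
        for (i = 0; i < max_pages; i++) {
            pages[i] = malloc(PAGE_SIZE);
            if (!pages[i])
                break;                      /* partial alloc: clean up below */
            memset(pages[i], 0, PAGE_SIZE);
        }
        for (i = 0; i < max_pages && pages[i]; i++)
            free(pages[i]);
        free(pages);
        return 0;
    }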
92/* 163/*
93 * Lookup a deviceid in cache and get a reference count on it if found 164 * Lookup a deviceid in cache and get a reference count on it if found
94 * 165 *
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
96 * @id deviceid to look up 167 * @id deviceid to look up
97 */ 168 */
98static struct nfs4_deviceid_node * 169static struct nfs4_deviceid_node *
99_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, 170__nfs4_find_get_deviceid(struct nfs_server *server,
100 const struct nfs_client *clp, const struct nfs4_deviceid *id, 171 const struct nfs4_deviceid *id, long hash)
101 long hash)
102{ 172{
103 struct nfs4_deviceid_node *d; 173 struct nfs4_deviceid_node *d;
104 174
105 rcu_read_lock(); 175 rcu_read_lock();
106 d = _lookup_deviceid(ld, clp, id, hash); 176 d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
177 hash);
107 if (d != NULL) 178 if (d != NULL)
108 atomic_inc(&d->ref); 179 atomic_inc(&d->ref);
109 rcu_read_unlock(); 180 rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
111} 182}
112 183
113struct nfs4_deviceid_node * 184struct nfs4_deviceid_node *
114nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, 185nfs4_find_get_deviceid(struct nfs_server *server,
115 const struct nfs_client *clp, const struct nfs4_deviceid *id) 186 const struct nfs4_deviceid *id, struct rpc_cred *cred,
187 gfp_t gfp_mask)
116{ 188{
117 return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); 189 long hash = nfs4_deviceid_hash(id);
190 struct nfs4_deviceid_node *d, *new;
191
192 d = __nfs4_find_get_deviceid(server, id, hash);
193 if (d)
194 return d;
195
196 new = nfs4_get_device_info(server, id, cred, gfp_mask);
197 if (!new)
198 return new;
199
200 spin_lock(&nfs4_deviceid_lock);
201 d = __nfs4_find_get_deviceid(server, id, hash);
202 if (d) {
203 spin_unlock(&nfs4_deviceid_lock);
204 server->pnfs_curr_ld->free_deviceid_node(new);
205 return d;
206 }
207 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
208 atomic_inc(&new->ref);
209 spin_unlock(&nfs4_deviceid_lock);
210
211 return new;
118} 212}
119EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); 213EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
120 214
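
nfs4_find_get_deviceid() now hides the whole lookup-or-fetch dance from layout drivers: an unlocked cache lookup, an expensive GETDEVICEINFO performed without the lock, then a re-check under nfs4_deviceid_lock that frees the freshly built node if another thread won the race — the job nfs4_insert_deviceid_node() used to leave to each caller. A pthread sketch of the pattern, where a single mutex plays the role of both the RCU read side and the spinlock:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node { int id; int refs; struct node *next; };

    static struct node *cache;
    static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct node *find(int id)        /* caller holds cache_lock */
    {
        struct node *n;
        for (n = cache; n; n = n->next)
            if (n->id == id) { n->refs++; return n; }
        return NULL;
    }

    static struct node *fetch_from_server(int id) /* GETDEVICEINFO stand-in */
    {
        struct node *n = calloc(1, sizeof(*n));
        if (n) { n->id = id; n->refs = 1; }
        return n;
    }

    static struct node *find_get(int id)
    {
        struct node *n, *new;

        pthread_mutex_lock(&cache_lock);    /* kernel: rcu_read_lock() */
        n = find(id);
        pthread_mutex_unlock(&cache_lock);
        if (n)
            return n;

        new = fetch_from_server(id);        /* slow, done unlocked */
        if (!new)
            return NULL;

        pthread_mutex_lock(&cache_lock);
        n = find(id);                       /* did someone beat us? */
        if (n) {
            pthread_mutex_unlock(&cache_lock);
            free(new);                      /* kernel: free_deviceid_node() */
            return n;
        }
        new->next = cache;                  /* kernel: hlist_add_head_rcu() */
        cache = new;
        new->refs++;                        /* one reference for the cache */
        pthread_mutex_unlock(&cache_lock);
        return new;
    }

    int main(void)
    {
        struct node *a = find_get(7), *b = find_get(7);
        printf("%p %p refs=%d\n", (void *)a, (void *)b,
               a ? a->refs : 0);            /* same node, refs=3 */
        return 0;
    }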
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
151EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); 245EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
152 246
153void 247void
154nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, 248nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
155 const struct pnfs_layoutdriver_type *ld,
156 const struct nfs_client *nfs_client,
157 const struct nfs4_deviceid *id) 249 const struct nfs4_deviceid *id)
158{ 250{
159 INIT_HLIST_NODE(&d->node); 251 INIT_HLIST_NODE(&d->node);
160 INIT_HLIST_NODE(&d->tmpnode); 252 INIT_HLIST_NODE(&d->tmpnode);
161 d->ld = ld; 253 d->ld = server->pnfs_curr_ld;
162 d->nfs_client = nfs_client; 254 d->nfs_client = server->nfs_client;
163 d->flags = 0; 255 d->flags = 0;
164 d->deviceid = *id; 256 d->deviceid = *id;
165 atomic_set(&d->ref, 1); 257 atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
167EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); 259EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
168 260
169/* 261/*
170 * Uniquely initialize and insert a deviceid node into cache
171 *
172 * @new new deviceid node
173 * Note that the caller must set up the following members:
174 * new->ld
175 * new->nfs_client
176 * new->deviceid
177 *
178 * @ret the inserted node, if none found, otherwise, the found entry.
179 */
180struct nfs4_deviceid_node *
181nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
182{
183 struct nfs4_deviceid_node *d;
184 long hash;
185
186 spin_lock(&nfs4_deviceid_lock);
187 hash = nfs4_deviceid_hash(&new->deviceid);
188 d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
189 if (d) {
190 spin_unlock(&nfs4_deviceid_lock);
191 return d;
192 }
193
194 hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
195 spin_unlock(&nfs4_deviceid_lock);
196 atomic_inc(&new->ref);
197
198 return new;
199}
200EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
201
202/*
203 * Dereference a deviceid node and delete it when its reference count drops 262 * Dereference a deviceid node and delete it when its reference count drops
204 * to zero. 263 * to zero.
205 * 264 *
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
299 } 358 }
300 rcu_read_unlock(); 359 rcu_read_unlock();
301} 360}
302
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e4499d5b51e8..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
2065 return NFS_TEXT_DATA; 2065 return NFS_TEXT_DATA;
2066 } 2066 }
2067 2067
2068#if !IS_ENABLED(CONFIG_NFS_V3)
2069 if (args->version == 3)
2070 goto out_v3_not_compiled;
2071#endif /* !CONFIG_NFS_V3 */
2072
2073 return 0; 2068 return 0;
2074 2069
2075out_no_data: 2070out_no_data:
@@ -2085,12 +2080,6 @@ out_no_sec:
2085 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); 2080 dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
2086 return -EINVAL; 2081 return -EINVAL;
2087 2082
2088#if !IS_ENABLED(CONFIG_NFS_V3)
2089out_v3_not_compiled:
2090 dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
2091 return -EPROTONOSUPPORT;
2092#endif /* !CONFIG_NFS_V3 */
2093
2094out_nomem: 2083out_nomem:
2095 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); 2084 dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
2096 return -ENOMEM; 2085 return -ENOMEM;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e3b5cf28bdc5..12493846a2d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops;
49static void nfs_clear_request_commit(struct nfs_page *req); 49static void nfs_clear_request_commit(struct nfs_page *req);
50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, 50static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
51 struct inode *inode); 51 struct inode *inode);
52static struct nfs_page *
53nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
54 struct page *page);
52 55
53static struct kmem_cache *nfs_wdata_cachep; 56static struct kmem_cache *nfs_wdata_cachep;
54static mempool_t *nfs_wdata_mempool; 57static mempool_t *nfs_wdata_mempool;
@@ -95,38 +98,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
95} 98}
96 99
97/* 100/*
98 * nfs_page_search_commits_for_head_request_locked
99 *
100 * Search through commit lists on @inode for the head request for @page.
101 * Must be called while holding the inode (which is cinfo) lock.
102 *
103 * Returns the head request if found, or NULL if not found.
104 */
105static struct nfs_page *
106nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
107 struct page *page)
108{
109 struct nfs_page *freq, *t;
110 struct nfs_commit_info cinfo;
111 struct inode *inode = &nfsi->vfs_inode;
112
113 nfs_init_cinfo_from_inode(&cinfo, inode);
114
115 /* search through pnfs commit lists */
116 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
117 if (freq)
118 return freq->wb_head;
119
120 /* Linearly search the commit list for the correct request */
121 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
122 if (freq->wb_page == page)
123 return freq->wb_head;
124 }
125
126 return NULL;
127}
128
129/*
130 * nfs_page_find_head_request_locked - find head request associated with @page 101 * nfs_page_find_head_request_locked - find head request associated with @page
131 * 102 *
132 * must be called while holding the inode lock. 103 * must be called while holding the inode lock.
@@ -241,7 +212,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
241 unsigned int pos = 0; 212 unsigned int pos = 0;
242 unsigned int len = nfs_page_length(req->wb_page); 213 unsigned int len = nfs_page_length(req->wb_page);
243 214
244 nfs_page_group_lock(req, true); 215 nfs_page_group_lock(req, false);
245 216
246 do { 217 do {
247 tmp = nfs_page_group_search_locked(req->wb_head, pos); 218 tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -271,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
271 242
272static int wb_priority(struct writeback_control *wbc) 243static int wb_priority(struct writeback_control *wbc)
273{ 244{
245 int ret = 0;
274 if (wbc->for_reclaim) 246 if (wbc->for_reclaim)
275 return FLUSH_HIGHPRI | FLUSH_STABLE; 247 return FLUSH_HIGHPRI | FLUSH_STABLE;
248 if (wbc->sync_mode == WB_SYNC_ALL)
249 ret = FLUSH_COND_STABLE;
276 if (wbc->for_kupdate || wbc->for_background) 250 if (wbc->for_kupdate || wbc->for_background)
277 return FLUSH_LOWPRI | FLUSH_COND_STABLE; 251 ret |= FLUSH_LOWPRI;
278 return FLUSH_COND_STABLE; 252 return ret;
279} 253}
280 254
281/* 255/*
@@ -478,10 +452,23 @@ try_again:
478 return NULL; 452 return NULL;
479 } 453 }
480 454
481 /* lock each request in the page group */ 455 /* holding inode lock, so always make a non-blocking call to try the
482 ret = nfs_page_group_lock(head, false); 456 * page group lock */
483 if (ret < 0) 457 ret = nfs_page_group_lock(head, true);
458 if (ret < 0) {
459 spin_unlock(&inode->i_lock);
460
461 if (!nonblock && ret == -EAGAIN) {
462 nfs_page_group_lock_wait(head);
463 nfs_release_request(head);
464 goto try_again;
465 }
466
467 nfs_release_request(head);
484 return ERR_PTR(ret); 468 return ERR_PTR(ret);
469 }
470
471 /* lock each request in the page group */
485 subreq = head; 472 subreq = head;
486 do { 473 do {
487 /* 474 /*
@@ -718,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
718 if (likely(!PageSwapCache(head->wb_page))) { 705 if (likely(!PageSwapCache(head->wb_page))) {
719 set_page_private(head->wb_page, 0); 706 set_page_private(head->wb_page, 0);
720 ClearPagePrivate(head->wb_page); 707 ClearPagePrivate(head->wb_page);
708 smp_mb__after_atomic();
709 wake_up_page(head->wb_page, PG_private);
721 clear_bit(PG_MAPPED, &head->wb_flags); 710 clear_bit(PG_MAPPED, &head->wb_flags);
722 } 711 }
723 nfsi->npages--; 712 nfsi->npages--;
@@ -736,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
736 __set_page_dirty_nobuffers(req->wb_page); 725 __set_page_dirty_nobuffers(req->wb_page);
737} 726}
738 727
739#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) 728/*
729 * nfs_page_search_commits_for_head_request_locked
730 *
731 * Search through commit lists on @inode for the head request for @page.
732 * Must be called while holding the inode (which is cinfo) lock.
733 *
734 * Returns the head request if found, or NULL if not found.
735 */
736static struct nfs_page *
737nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
738 struct page *page)
739{
740 struct nfs_page *freq, *t;
741 struct nfs_commit_info cinfo;
742 struct inode *inode = &nfsi->vfs_inode;
743
744 nfs_init_cinfo_from_inode(&cinfo, inode);
745
746 /* search through pnfs commit lists */
747 freq = pnfs_search_commit_reqs(inode, &cinfo, page);
748 if (freq)
749 return freq->wb_head;
750
751 /* Linearly search the commit list for the correct request */
752 list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
753 if (freq->wb_page == page)
754 return freq->wb_head;
755 }
756
757 return NULL;
758}
759
740/** 760/**
741 * nfs_request_add_commit_list - add request to a commit list 761 * nfs_request_add_commit_list - add request to a commit list
742 * @req: pointer to a struct nfs_page 762 * @req: pointer to a struct nfs_page
@@ -854,36 +874,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr)
854 return hdr->verf.committed != NFS_FILE_SYNC; 874 return hdr->verf.committed != NFS_FILE_SYNC;
855} 875}
856 876
857#else
858static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
859 struct inode *inode)
860{
861}
862
863void nfs_init_cinfo(struct nfs_commit_info *cinfo,
864 struct inode *inode,
865 struct nfs_direct_req *dreq)
866{
867}
868
869void
870nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
871 struct nfs_commit_info *cinfo)
872{
873}
874
875static void
876nfs_clear_request_commit(struct nfs_page *req)
877{
878}
879
880int nfs_write_need_commit(struct nfs_pgio_header *hdr)
881{
882 return 0;
883}
884
885#endif
886
887static void nfs_write_completion(struct nfs_pgio_header *hdr) 877static void nfs_write_completion(struct nfs_pgio_header *hdr)
888{ 878{
889 struct nfs_commit_info cinfo; 879 struct nfs_commit_info cinfo;
@@ -919,7 +909,6 @@ out:
919 hdr->release(hdr); 909 hdr->release(hdr);
920} 910}
921 911
922#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
923unsigned long 912unsigned long
924nfs_reqs_to_commit(struct nfs_commit_info *cinfo) 913nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
925{ 914{
@@ -976,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
976 return ret; 965 return ret;
977} 966}
978 967
979#else
980unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
981{
982 return 0;
983}
984
985int nfs_scan_commit(struct inode *inode, struct list_head *dst,
986 struct nfs_commit_info *cinfo)
987{
988 return 0;
989}
990#endif
991
992/* 968/*
993 * Search for an existing write request, and attempt to update 969 * Search for an existing write request, and attempt to update
994 * it to reflect a new dirty region on a given page. 970 * it to reflect a new dirty region on a given page.
@@ -1381,7 +1357,6 @@ static int nfs_writeback_done(struct rpc_task *task,
1381 return status; 1357 return status;
1382 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); 1358 nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
1383 1359
1384#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1385 if (hdr->res.verf->committed < hdr->args.stable && 1360 if (hdr->res.verf->committed < hdr->args.stable &&
1386 task->tk_status >= 0) { 1361 task->tk_status >= 0) {
1387 /* We tried a write call, but the server did not 1362 /* We tried a write call, but the server did not
@@ -1403,7 +1378,6 @@ static int nfs_writeback_done(struct rpc_task *task,
1403 complain = jiffies + 300 * HZ; 1378 complain = jiffies + 300 * HZ;
1404 } 1379 }
1405 } 1380 }
1406#endif
1407 1381
1408 /* Deal with the suid/sgid bit corner case */ 1382 /* Deal with the suid/sgid bit corner case */
1409 if (nfs_should_remove_suid(inode)) 1383 if (nfs_should_remove_suid(inode))
@@ -1456,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task,
1456} 1430}
1457 1431
1458 1432
1459#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1460static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1433static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1461{ 1434{
1462 int ret; 1435 int ret;
@@ -1525,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1525} 1498}
1526EXPORT_SYMBOL_GPL(nfs_initiate_commit); 1499EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1527 1500
1501static loff_t nfs_get_lwb(struct list_head *head)
1502{
1503 loff_t lwb = 0;
1504 struct nfs_page *req;
1505
1506 list_for_each_entry(req, head, wb_list)
1507 if (lwb < (req_offset(req) + req->wb_bytes))
1508 lwb = req_offset(req) + req->wb_bytes;
1509
1510 return lwb;
1511}
1512
1528/* 1513/*
1529 * Set up the argument/result storage required for the RPC call. 1514 * Set up the argument/result storage required for the RPC call.
1530 */ 1515 */
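nfs_get_lwb() derives the "last write byte" (lwb) for a pnfs commit: the
highest end offset, req_offset() + wb_bytes, over every request on the
commit list. A minimal sketch of the same reduction, using a simplified
request type in place of struct nfs_page (names here are illustrative only):

	/* Sketch: highest end offset across a set of write requests. */
	#include <stddef.h>

	struct req { long long offset; unsigned int bytes; };

	static long long get_lwb(const struct req *reqs, size_t n)
	{
		long long lwb = 0;

		for (size_t i = 0; i < n; i++) {
			long long end = reqs[i].offset + reqs[i].bytes;
			if (end > lwb)
				lwb = end;	/* keep the furthest endpoint */
		}
		return lwb;
	}

For requests covering [0, 4096) and [8192, 12288), lwb is 12288. As the
next hunk notes, nfs_init_commit() fills data->lwb only when a layout
segment is present, since only the pnfs commit path consumes it.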
@@ -1544,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
1544 data->inode = inode; 1529 data->inode = inode;
1545 data->cred = first->wb_context->cred; 1530 data->cred = first->wb_context->cred;
1546 data->lseg = lseg; /* reference transferred */ 1531 data->lseg = lseg; /* reference transferred */
1532 /* only set lwb for pnfs commit */
1533 if (lseg)
1534 data->lwb = nfs_get_lwb(&data->pages);
1547 data->mds_ops = &nfs_commit_ops; 1535 data->mds_ops = &nfs_commit_ops;
1548 data->completion_ops = cinfo->completion_ops; 1536 data->completion_ops = cinfo->completion_ops;
1549 data->dreq = cinfo->dreq; 1537 data->dreq = cinfo->dreq;
@@ -1623,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1623 struct nfs_page *req; 1611 struct nfs_page *req;
1624 int status = data->task.tk_status; 1612 int status = data->task.tk_status;
1625 struct nfs_commit_info cinfo; 1613 struct nfs_commit_info cinfo;
1614 struct nfs_server *nfss;
1626 1615
1627 while (!list_empty(&data->pages)) { 1616 while (!list_empty(&data->pages)) {
1628 req = nfs_list_entry(data->pages.next); 1617 req = nfs_list_entry(data->pages.next);
@@ -1656,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1656 next: 1645 next:
1657 nfs_unlock_and_release_request(req); 1646 nfs_unlock_and_release_request(req);
1658 } 1647 }
1648 nfss = NFS_SERVER(data->inode);
1649 if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
1650 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
1651
1659 nfs_init_cinfo(&cinfo, data->inode, data->dreq); 1652 nfs_init_cinfo(&cinfo, data->inode, data->dreq);
1660 if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) 1653 if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
1661 nfs_commit_clear_lock(NFS_I(data->inode)); 1654 nfs_commit_clear_lock(NFS_I(data->inode));
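The lines added above pair commit completion with writeback congestion
control: once the per-server writeback count falls below
NFS_CONGESTION_OFF_THRESH, the bdi is marked uncongested so the VM resumes
dispatching writes. A rough sketch of the hysteresis this implies (the
ON-side check lives elsewhere in write.c; the helper below is a simplified
stand-in, not kernel code):

	/* Sketch: bdi congestion hysteresis around the two thresholds.
	 * The OFF threshold sits below the ON threshold so the state
	 * does not flap at the boundary. */
	static void update_congestion(struct nfs_server *nfss)
	{
		long wb = atomic_long_read(&nfss->writeback);

		if (wb > NFS_CONGESTION_ON_THRESH)
			set_bdi_congested(&nfss->backing_dev_info,
					  BLK_RW_ASYNC);
		else if (wb < NFS_CONGESTION_OFF_THRESH)
			clear_bdi_congested(&nfss->backing_dev_info,
					    BLK_RW_ASYNC);
	}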
@@ -1765,12 +1758,6 @@ out_mark_dirty:
1765 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1758 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1766 return ret; 1759 return ret;
1767} 1760}
1768#else
1769static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1770{
1771 return 0;
1772}
1773#endif
1774 1761
1775int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1762int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1776{ 1763{
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index f689ed82af3a..d153ca3ea577 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -3,5 +3,6 @@
3# 3#
4 4
5obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o 5obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
6
7nfs_acl-objs := nfsacl.o 6nfs_acl-objs := nfsacl.o
7
8obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c
index 6d1ee7204c88..ae6e58ea4de5 100644
--- a/fs/lockd/grace.c
+++ b/fs/nfs_common/grace.c
@@ -1,17 +1,20 @@
1/* 1/*
2 * Common code for control of lockd and nfsv4 grace periods. 2 * Common code for control of lockd and nfsv4 grace periods.
3 *
4 * Transplanted from lockd code
3 */ 5 */
4 6
5#include <linux/module.h> 7#include <linux/module.h>
6#include <linux/lockd/bind.h>
7#include <net/net_namespace.h> 8#include <net/net_namespace.h>
9#include <net/netns/generic.h>
10#include <linux/fs.h>
8 11
9#include "netns.h" 12static int grace_net_id;
10
11static DEFINE_SPINLOCK(grace_lock); 13static DEFINE_SPINLOCK(grace_lock);
12 14
13/** 15/**
14 * locks_start_grace 16 * locks_start_grace
17 * @net: net namespace that this lock manager belongs to
15 * @lm: who this grace period is for 18 * @lm: who this grace period is for
16 * 19 *
17 * A grace period is a period during which locks should not be given 20 * A grace period is a period during which locks should not be given
@@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock);
21 * 24 *
22 * This function is called to start a grace period. 25 * This function is called to start a grace period.
23 */ 26 */
24void locks_start_grace(struct net *net, struct lock_manager *lm) 27void
28locks_start_grace(struct net *net, struct lock_manager *lm)
25{ 29{
26 struct lockd_net *ln = net_generic(net, lockd_net_id); 30 struct list_head *grace_list = net_generic(net, grace_net_id);
27 31
28 spin_lock(&grace_lock); 32 spin_lock(&grace_lock);
29 list_add(&lm->list, &ln->grace_list); 33 list_add(&lm->list, grace_list);
30 spin_unlock(&grace_lock); 34 spin_unlock(&grace_lock);
31} 35}
32EXPORT_SYMBOL_GPL(locks_start_grace); 36EXPORT_SYMBOL_GPL(locks_start_grace);
33 37
34/** 38/**
35 * locks_end_grace 39 * locks_end_grace
40 * @net: net namespace that this lock manager belongs to
36 * @lm: who this grace period is for 41 * @lm: who this grace period is for
37 * 42 *
38 * Call this function to state that the given lock manager is ready to 43 * Call this function to state that the given lock manager is ready to
@@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
41 * Note that callers count on it being safe to call this more than once, 46 * Note that callers count on it being safe to call this more than once,
42 * and the second call should be a no-op. 47 * and the second call should be a no-op.
43 */ 48 */
44void locks_end_grace(struct lock_manager *lm) 49void
50locks_end_grace(struct lock_manager *lm)
45{ 51{
46 spin_lock(&grace_lock); 52 spin_lock(&grace_lock);
47 list_del_init(&lm->list); 53 list_del_init(&lm->list);
@@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
56 * to answer ordinary lock requests, and when they should accept only 62 * to answer ordinary lock requests, and when they should accept only
57 * lock reclaims. 63 * lock reclaims.
58 */ 64 */
59int locks_in_grace(struct net *net) 65int
66locks_in_grace(struct net *net)
60{ 67{
61 struct lockd_net *ln = net_generic(net, lockd_net_id); 68 struct list_head *grace_list = net_generic(net, grace_net_id);
62 69
63 return !list_empty(&ln->grace_list); 70 return !list_empty(grace_list);
64} 71}
65EXPORT_SYMBOL_GPL(locks_in_grace); 72EXPORT_SYMBOL_GPL(locks_in_grace);
73
74static int __net_init
75grace_init_net(struct net *net)
76{
77 struct list_head *grace_list = net_generic(net, grace_net_id);
78
79 INIT_LIST_HEAD(grace_list);
80 return 0;
81}
82
83static void __net_exit
84grace_exit_net(struct net *net)
85{
86 struct list_head *grace_list = net_generic(net, grace_net_id);
87
88 BUG_ON(!list_empty(grace_list));
89}
90
91static struct pernet_operations grace_net_ops = {
92 .init = grace_init_net,
93 .exit = grace_exit_net,
94 .id = &grace_net_id,
95 .size = sizeof(struct list_head),
96};
97
98static int __init
99init_grace(void)
100{
101 return register_pernet_subsys(&grace_net_ops);
102}
103
104static void __exit
105exit_grace(void)
106{
107 unregister_pernet_subsys(&grace_net_ops);
108}
109
110MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
111MODULE_LICENSE("GPL");
112module_init(init_grace)
113module_exit(exit_grace)
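With grace-period tracking moved out of lockd, any lock manager (lockd or
nfsd's v4 lock manager) registers on a per-net list through the exported
helpers. A minimal sketch of a caller, using only the interfaces defined
above (my_lm and the two functions are hypothetical):

	/* Sketch: a lock manager entering and leaving its grace period. */
	#include <linux/fs.h>

	static struct lock_manager my_lm;

	static void my_startup(struct net *net)
	{
		/* from here on, only lock reclaims should be honored */
		locks_start_grace(net, &my_lm);
	}

	static void my_grace_over(struct net *net)
	{
		locks_end_grace(&my_lm);	/* safe to call twice */
		if (!locks_in_grace(net))
			pr_info("all lock managers out of grace\n");
	}

Note that locks_in_grace() answers for the whole namespace: it stays true
until every manager registered in that net has called locks_end_grace().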
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f3586b645d7d..73395156bdb4 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -71,6 +71,7 @@ config NFSD_V4
71 select FS_POSIX_ACL 71 select FS_POSIX_ACL
72 select SUNRPC_GSS 72 select SUNRPC_GSS
73 select CRYPTO 73 select CRYPTO
74 select GRACE_PERIOD
74 help 75 help
75 This option enables support in your system's NFS server for 76 This option enables support in your system's NFS server for
76 version 4 of the NFS protocol (RFC 3530). 77 version 4 of the NFS protocol (RFC 3530).
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e0be57b0f79b..ed2b1151b171 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
49 49
50/* Index of predefined Linux callback client operations */ 50/* Index of predefined Linux callback client operations */
51 51
52enum {
53 NFSPROC4_CLNT_CB_NULL = 0,
54 NFSPROC4_CLNT_CB_RECALL,
55 NFSPROC4_CLNT_CB_SEQUENCE,
56};
57
58struct nfs4_cb_compound_hdr { 52struct nfs4_cb_compound_hdr {
59 /* args */ 53 /* args */
60 u32 ident; /* minorversion 0 only */ 54 u32 ident; /* minorversion 0 only */
@@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
494static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, 488static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
495 const struct nfsd4_callback *cb) 489 const struct nfsd4_callback *cb)
496{ 490{
497 const struct nfs4_delegation *args = cb->cb_op; 491 const struct nfs4_delegation *dp = cb_to_delegation(cb);
498 struct nfs4_cb_compound_hdr hdr = { 492 struct nfs4_cb_compound_hdr hdr = {
499 .ident = cb->cb_clp->cl_cb_ident, 493 .ident = cb->cb_clp->cl_cb_ident,
500 .minorversion = cb->cb_minorversion, 494 .minorversion = cb->cb_minorversion,
@@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
502 496
503 encode_cb_compound4args(xdr, &hdr); 497 encode_cb_compound4args(xdr, &hdr);
504 encode_cb_sequence4args(xdr, cb, &hdr); 498 encode_cb_sequence4args(xdr, cb, &hdr);
505 encode_cb_recall4args(xdr, args, &hdr); 499 encode_cb_recall4args(xdr, dp, &hdr);
506 encode_cb_nops(&hdr); 500 encode_cb_nops(&hdr);
507} 501}
508 502
@@ -746,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
746 740
747static struct workqueue_struct *callback_wq; 741static struct workqueue_struct *callback_wq;
748 742
749static void run_nfsd4_cb(struct nfsd4_callback *cb)
750{
751 queue_work(callback_wq, &cb->cb_work);
752}
753
754static void do_probe_callback(struct nfs4_client *clp)
755{
756 struct nfsd4_callback *cb = &clp->cl_cb_null;
757
758 cb->cb_op = NULL;
759 cb->cb_clp = clp;
760
761 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
762 cb->cb_msg.rpc_argp = NULL;
763 cb->cb_msg.rpc_resp = NULL;
764
765 cb->cb_ops = &nfsd4_cb_probe_ops;
766
767 run_nfsd4_cb(cb);
768}
769
770/* 743/*
771 * Poke the callback thread to process any updates to the callback 744 * Poke the callback thread to process any updates to the callback
772 * parameters, and send a null probe. 745 * parameters, and send a null probe.
@@ -775,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
775{ 748{
776 clp->cl_cb_state = NFSD4_CB_UNKNOWN; 749 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
777 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); 750 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
778 do_probe_callback(clp); 751 nfsd4_run_cb(&clp->cl_cb_null);
779} 752}
780 753
781void nfsd4_probe_callback_sync(struct nfs4_client *clp) 754void nfsd4_probe_callback_sync(struct nfs4_client *clp)
@@ -847,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
847 rpc_wake_up_next(&clp->cl_cb_waitq); 820 rpc_wake_up_next(&clp->cl_cb_waitq);
848 dprintk("%s: freed slot, new seqid=%d\n", __func__, 821 dprintk("%s: freed slot, new seqid=%d\n", __func__,
849 clp->cl_cb_session->se_cb_seq_nr); 822 clp->cl_cb_session->se_cb_seq_nr);
850
851 /* We're done looking into the sequence information */
852 task->tk_msg.rpc_resp = NULL;
853 } 823 }
854}
855
856
857static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
858{
859 struct nfsd4_callback *cb = calldata;
860 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
861 struct nfs4_client *clp = cb->cb_clp;
862 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
863
864 nfsd4_cb_done(task, calldata);
865 824
866 if (current_rpc_client != task->tk_client) { 825 if (clp->cl_cb_client != task->tk_client) {
867 /* We're shutting down or changing cl_cb_client; leave 826 /* We're shutting down or changing cl_cb_client; leave
868 * it to nfsd4_process_cb_update to restart the call if 827 * it to nfsd4_process_cb_update to restart the call if
869 * necessary. */ 828 * necessary. */
@@ -872,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
872 831
873 if (cb->cb_done) 832 if (cb->cb_done)
874 return; 833 return;
875 switch (task->tk_status) { 834
835 switch (cb->cb_ops->done(cb, task)) {
876 case 0: 836 case 0:
877 cb->cb_done = true; 837 task->tk_status = 0;
838 rpc_restart_call_prepare(task);
878 return; 839 return;
879 case -EBADHANDLE: 840 case 1:
880 case -NFS4ERR_BAD_STATEID:
881 /* Race: client probably got cb_recall
882 * before open reply granting delegation */
883 break; 841 break;
884 default: 842 case -1:
885 /* Network partition? */ 843 /* Network partition? */
886 nfsd4_mark_cb_down(clp, task->tk_status); 844 nfsd4_mark_cb_down(clp, task->tk_status);
845 break;
846 default:
847 BUG();
887 } 848 }
888 if (dp->dl_retries--) {
889 rpc_delay(task, 2*HZ);
890 task->tk_status = 0;
891 rpc_restart_call_prepare(task);
892 return;
893 }
894 nfsd4_mark_cb_down(clp, task->tk_status);
895 cb->cb_done = true; 849 cb->cb_done = true;
896} 850}
897 851
898static void nfsd4_cb_recall_release(void *calldata) 852static void nfsd4_cb_release(void *calldata)
899{ 853{
900 struct nfsd4_callback *cb = calldata; 854 struct nfsd4_callback *cb = calldata;
901 struct nfs4_client *clp = cb->cb_clp; 855 struct nfs4_client *clp = cb->cb_clp;
902 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
903 856
904 if (cb->cb_done) { 857 if (cb->cb_done) {
905 spin_lock(&clp->cl_lock); 858 spin_lock(&clp->cl_lock);
906 list_del(&cb->cb_per_client); 859 list_del(&cb->cb_per_client);
907 spin_unlock(&clp->cl_lock); 860 spin_unlock(&clp->cl_lock);
908 nfs4_put_stid(&dp->dl_stid); 861
862 cb->cb_ops->release(cb);
909 } 863 }
910} 864}
911 865
912static const struct rpc_call_ops nfsd4_cb_recall_ops = { 866static const struct rpc_call_ops nfsd4_cb_ops = {
913 .rpc_call_prepare = nfsd4_cb_prepare, 867 .rpc_call_prepare = nfsd4_cb_prepare,
914 .rpc_call_done = nfsd4_cb_recall_done, 868 .rpc_call_done = nfsd4_cb_done,
915 .rpc_release = nfsd4_cb_recall_release, 869 .rpc_release = nfsd4_cb_release,
916}; 870};
917 871
918int nfsd4_create_callback_queue(void) 872int nfsd4_create_callback_queue(void)
@@ -937,16 +891,10 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
937 * instead, nfsd4_run_cb_null() will detect the killed 891 * instead, nfsd4_run_cb_null() will detect the killed
938 * client, destroy the rpc client, and stop: 892 * client, destroy the rpc client, and stop:
939 */ 893 */
940 do_probe_callback(clp); 894 nfsd4_run_cb(&clp->cl_cb_null);
941 flush_workqueue(callback_wq); 895 flush_workqueue(callback_wq);
942} 896}
943 897
944static void nfsd4_release_cb(struct nfsd4_callback *cb)
945{
946 if (cb->cb_ops->rpc_release)
947 cb->cb_ops->rpc_release(cb);
948}
949
950/* requires cl_lock: */ 898/* requires cl_lock: */
951static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) 899static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
952{ 900{
@@ -1009,63 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
1009 } 957 }
1010 /* Yay, the callback channel's back! Restart any callbacks: */ 958 /* Yay, the callback channel's back! Restart any callbacks: */
1011 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) 959 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
1012 run_nfsd4_cb(cb); 960 queue_work(callback_wq, &cb->cb_work);
1013} 961}
1014 962
1015static void 963static void
1016nfsd4_run_callback_rpc(struct nfsd4_callback *cb) 964nfsd4_run_cb_work(struct work_struct *work)
1017{ 965{
966 struct nfsd4_callback *cb =
967 container_of(work, struct nfsd4_callback, cb_work);
1018 struct nfs4_client *clp = cb->cb_clp; 968 struct nfs4_client *clp = cb->cb_clp;
1019 struct rpc_clnt *clnt; 969 struct rpc_clnt *clnt;
1020 970
971 if (cb->cb_ops && cb->cb_ops->prepare)
972 cb->cb_ops->prepare(cb);
973
1021 if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) 974 if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
1022 nfsd4_process_cb_update(cb); 975 nfsd4_process_cb_update(cb);
1023 976
1024 clnt = clp->cl_cb_client; 977 clnt = clp->cl_cb_client;
1025 if (!clnt) { 978 if (!clnt) {
1026 /* Callback channel broken, or client killed; give up: */ 979 /* Callback channel broken, or client killed; give up: */
1027 nfsd4_release_cb(cb); 980 if (cb->cb_ops && cb->cb_ops->release)
981 cb->cb_ops->release(cb);
1028 return; 982 return;
1029 } 983 }
1030 cb->cb_msg.rpc_cred = clp->cl_cb_cred; 984 cb->cb_msg.rpc_cred = clp->cl_cb_cred;
1031 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 985 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
1032 cb->cb_ops, cb); 986 cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
1033} 987}
1034 988
1035void 989void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
1036nfsd4_run_cb_null(struct work_struct *w) 990 struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
1037{ 991{
1038 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
1039 cb_work);
1040 nfsd4_run_callback_rpc(cb);
1041}
1042
1043void
1044nfsd4_run_cb_recall(struct work_struct *w)
1045{
1046 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback,
1047 cb_work);
1048
1049 nfsd4_prepare_cb_recall(cb->cb_op);
1050 nfsd4_run_callback_rpc(cb);
1051}
1052
1053void nfsd4_cb_recall(struct nfs4_delegation *dp)
1054{
1055 struct nfsd4_callback *cb = &dp->dl_recall;
1056 struct nfs4_client *clp = dp->dl_stid.sc_client;
1057
1058 dp->dl_retries = 1;
1059 cb->cb_op = dp;
1060 cb->cb_clp = clp; 992 cb->cb_clp = clp;
1061 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 993 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
1062 cb->cb_msg.rpc_argp = cb; 994 cb->cb_msg.rpc_argp = cb;
1063 cb->cb_msg.rpc_resp = cb; 995 cb->cb_msg.rpc_resp = cb;
1064 996 cb->cb_ops = ops;
1065 cb->cb_ops = &nfsd4_cb_recall_ops; 997 INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
1066
1067 INIT_LIST_HEAD(&cb->cb_per_client); 998 INIT_LIST_HEAD(&cb->cb_per_client);
1068 cb->cb_done = true; 999 cb->cb_done = true;
1000}
1069 1001
1070 run_nfsd4_cb(&dp->dl_recall); 1002void nfsd4_run_cb(struct nfsd4_callback *cb)
1003{
1004 queue_work(callback_wq, &cb->cb_work);
1071} 1005}
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a0ab0a847d69..e1b3d3d472da 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
215 memset(&ent, 0, sizeof(ent)); 215 memset(&ent, 0, sizeof(ent));
216 216
217 /* Authentication name */ 217 /* Authentication name */
218 if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) 218 len = qword_get(&buf, buf1, PAGE_SIZE);
219 if (len <= 0 || len >= IDMAP_NAMESZ)
219 goto out; 220 goto out;
220 memcpy(ent.authname, buf1, sizeof(ent.authname)); 221 memcpy(ent.authname, buf1, sizeof(ent.authname));
221 222
@@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
245 /* Name */ 246 /* Name */
246 error = -EINVAL; 247 error = -EINVAL;
247 len = qword_get(&buf, buf1, PAGE_SIZE); 248 len = qword_get(&buf, buf1, PAGE_SIZE);
248 if (len < 0) 249 if (len < 0 || len >= IDMAP_NAMESZ)
249 goto out; 250 goto out;
250 if (len == 0) 251 if (len == 0)
251 set_bit(CACHE_NEGATIVE, &ent.h.flags); 252 set_bit(CACHE_NEGATIVE, &ent.h.flags);
252 else if (len >= IDMAP_NAMESZ)
253 goto out;
254 else 253 else
255 memcpy(ent.name, buf1, sizeof(ent.name)); 254 memcpy(ent.name, buf1, sizeof(ent.name));
256 error = -ENOMEM; 255 error = -ENOMEM;
@@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
259 goto out; 258 goto out;
260 259
261 cache_put(&res->h, cd); 260 cache_put(&res->h, cd);
262
263 error = 0; 261 error = 0;
264out: 262out:
265 kfree(buf1); 263 kfree(buf1);
266
267 return error; 264 return error;
268} 265}
269 266
270
271static struct ent * 267static struct ent *
272idtoname_lookup(struct cache_detail *cd, struct ent *item) 268idtoname_lookup(struct cache_detail *cd, struct ent *item)
273{ 269{
@@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
368{ 364{
369 struct ent ent, *res; 365 struct ent ent, *res;
370 char *buf1; 366 char *buf1;
371 int error = -EINVAL; 367 int len, error = -EINVAL;
372 368
373 if (buf[buflen - 1] != '\n') 369 if (buf[buflen - 1] != '\n')
374 return (-EINVAL); 370 return (-EINVAL);
@@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
381 memset(&ent, 0, sizeof(ent)); 377 memset(&ent, 0, sizeof(ent));
382 378
383 /* Authentication name */ 379 /* Authentication name */
384 if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) 380 len = qword_get(&buf, buf1, PAGE_SIZE);
381 if (len <= 0 || len >= IDMAP_NAMESZ)
385 goto out; 382 goto out;
386 memcpy(ent.authname, buf1, sizeof(ent.authname)); 383 memcpy(ent.authname, buf1, sizeof(ent.authname));
387 384
@@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
392 IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; 389 IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
393 390
394 /* Name */ 391 /* Name */
395 error = qword_get(&buf, buf1, PAGE_SIZE); 392 len = qword_get(&buf, buf1, PAGE_SIZE);
396 if (error <= 0 || error >= IDMAP_NAMESZ) 393 if (len <= 0 || len >= IDMAP_NAMESZ)
397 goto out; 394 goto out;
398 memcpy(ent.name, buf1, sizeof(ent.name)); 395 memcpy(ent.name, buf1, sizeof(ent.name));
399 396
@@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
421 error = 0; 418 error = 0;
422out: 419out:
423 kfree(buf1); 420 kfree(buf1);
424
425 return (error); 421 return (error);
426} 422}
427 423
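Both parsers now apply the same guard: reject any field whose qword_get()
length is out of range for the fixed IDMAP_NAMESZ destination before the
memcpy() runs. Reduced to its essentials (a sketch; 64 stands in for
IDMAP_NAMESZ, and src is assumed to be a PAGE_SIZE scratch buffer as in
nfsd):

	#include <string.h>

	#define NAMESZ 64

	static int copy_name(char *dst, const char *src, int len)
	{
		/* len < 0: parse error; 0: empty; >= NAMESZ: the NUL at
		 * src[len] would fall outside the destination buffer. */
		if (len <= 0 || len >= NAMESZ)
			return -1;
		memcpy(dst, src, NAMESZ);	/* NUL lands inside dst[] */
		return 0;
	}

Without the upper bound, the fixed-size memcpy() would leave dst without a
terminating NUL whenever the input name filled the buffer exactly or more.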
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 5e0dc528a0e8..cdeb3cfd6f32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1013,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1013 return status; 1013 return status;
1014} 1014}
1015 1015
1016static __be32
1017nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1018 struct nfsd4_seek *seek)
1019{
1020 int whence;
1021 __be32 status;
1022 struct file *file;
1023
1024 status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
1025 &seek->seek_stateid,
1026 RD_STATE, &file);
1027 if (status) {
1028 dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
1029 return status;
1030 }
1031
1032 switch (seek->seek_whence) {
1033 case NFS4_CONTENT_DATA:
1034 whence = SEEK_DATA;
1035 break;
1036 case NFS4_CONTENT_HOLE:
1037 whence = SEEK_HOLE;
1038 break;
1039 default:
1040 status = nfserr_union_notsupp;
1041 goto out;
1042 }
1043
1044 /*
1045 * Note: This call does change file->f_pos, but nothing in NFSD
1046 * should ever use file->f_pos.
1047 */
1048 seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
1049 if (seek->seek_pos < 0)
1050 status = nfserrno(seek->seek_pos);
1051 else if (seek->seek_pos >= i_size_read(file_inode(file)))
1052 seek->seek_eof = true;
1053
1054out:
1055 fput(file);
1056 return status;
1057}
1058
1016/* This routine never returns NFS_OK! If there are no other errors, it 1059/* This routine never returns NFS_OK! If there are no other errors, it
1017 * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the 1060 * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
1018 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME 1061 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME
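nfsd4_seek() maps the NFSv4.2 SEEK operation directly onto the VFS
SEEK_DATA/SEEK_HOLE machinery, so its results track lseek(2). A userspace
sketch of the equivalent query (error handling trimmed; requires a
filesystem that reports holes):

	#define _GNU_SOURCE	/* for SEEK_DATA / SEEK_HOLE */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd = open(argv[1], O_RDONLY);
		/* next data byte at or after offset 0 */
		off_t data = lseek(fd, 0, SEEK_DATA);
		/* next hole at or after that data */
		off_t hole = lseek(fd, data, SEEK_HOLE);

		printf("data at %lld, hole at %lld\n",
		       (long long)data, (long long)hole);
		close(fd);
		return 0;
	}

The seek_eof computation is the server-side analogue of ENXIO from
lseek(SEEK_DATA) past the last byte: any result at or beyond i_size is
reported to the client as end-of-file.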
@@ -1881,6 +1924,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
1881 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 1924 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1882 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 1925 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1883 }, 1926 },
1927
1928 /* NFSv4.2 operations */
1929 [OP_SEEK] = {
1930 .op_func = (nfsd4op_func)nfsd4_seek,
1931 .op_name = "OP_SEEK",
1932 },
1884}; 1933};
1885 1934
1886int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) 1935int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..ea95a2bc21b5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops {
58 void (*create)(struct nfs4_client *); 58 void (*create)(struct nfs4_client *);
59 void (*remove)(struct nfs4_client *); 59 void (*remove)(struct nfs4_client *);
60 int (*check)(struct nfs4_client *); 60 int (*check)(struct nfs4_client *);
61 void (*grace_done)(struct nfsd_net *, time_t); 61 void (*grace_done)(struct nfsd_net *);
62}; 62};
63 63
64/* Globals */ 64/* Globals */
@@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
188 188
189 status = mnt_want_write_file(nn->rec_file); 189 status = mnt_want_write_file(nn->rec_file);
190 if (status) 190 if (status)
191 return; 191 goto out_creds;
192 192
193 dir = nn->rec_file->f_path.dentry; 193 dir = nn->rec_file->f_path.dentry;
194 /* lock the parent */ 194 /* lock the parent */
@@ -228,6 +228,7 @@ out_unlock:
228 user_recovery_dirname); 228 user_recovery_dirname);
229 } 229 }
230 mnt_drop_write_file(nn->rec_file); 230 mnt_drop_write_file(nn->rec_file);
231out_creds:
231 nfs4_reset_creds(original_cred); 232 nfs4_reset_creds(original_cred);
232} 233}
233 234
@@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
392} 393}
393 394
394static void 395static void
395nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) 396nfsd4_recdir_purge_old(struct nfsd_net *nn)
396{ 397{
397 int status; 398 int status;
398 399
@@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net)
479 return status; 480 return status;
480} 481}
481 482
483static void
484nfsd4_shutdown_recdir(struct net *net)
485{
486 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
487
488 if (!nn->rec_file)
489 return;
490 fput(nn->rec_file);
491 nn->rec_file = NULL;
492}
482 493
483static int 494static int
484nfs4_legacy_state_init(struct net *net) 495nfs4_legacy_state_init(struct net *net)
@@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net)
512 int status; 523 int status;
513 524
514 status = nfsd4_init_recdir(net); 525 status = nfsd4_init_recdir(net);
515 if (!status)
516 status = nfsd4_recdir_load(net);
517 if (status) 526 if (status)
518 printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); 527 return status;
528
529 status = nfsd4_recdir_load(net);
530 if (status)
531 nfsd4_shutdown_recdir(net);
532
519 return status; 533 return status;
520} 534}
521 535
@@ -546,21 +560,12 @@ err:
546} 560}
547 561
548static void 562static void
549nfsd4_shutdown_recdir(struct nfsd_net *nn)
550{
551 if (!nn->rec_file)
552 return;
553 fput(nn->rec_file);
554 nn->rec_file = NULL;
555}
556
557static void
558nfsd4_legacy_tracking_exit(struct net *net) 563nfsd4_legacy_tracking_exit(struct net *net)
559{ 564{
560 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 565 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
561 566
562 nfs4_release_reclaim(nn); 567 nfs4_release_reclaim(nn);
563 nfsd4_shutdown_recdir(nn); 568 nfsd4_shutdown_recdir(net);
564 nfs4_legacy_state_shutdown(net); 569 nfs4_legacy_state_shutdown(net);
565} 570}
566 571
@@ -1016,7 +1021,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
1016} 1021}
1017 1022
1018static void 1023static void
1019nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) 1024nfsd4_cld_grace_done(struct nfsd_net *nn)
1020{ 1025{
1021 int ret; 1026 int ret;
1022 struct cld_upcall *cup; 1027 struct cld_upcall *cup;
@@ -1029,7 +1034,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
1029 } 1034 }
1030 1035
1031 cup->cu_msg.cm_cmd = Cld_GraceDone; 1036 cup->cu_msg.cm_cmd = Cld_GraceDone;
1032 cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time; 1037 cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
1033 ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); 1038 ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
1034 if (!ret) 1039 if (!ret)
1035 ret = cup->cu_msg.cm_status; 1040 ret = cup->cu_msg.cm_status;
@@ -1062,6 +1067,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable,
1062 1067
1063#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" 1068#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
1064#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" 1069#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
1070#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
1071#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
1065 1072
1066static char * 1073static char *
1067nfsd4_cltrack_legacy_topdir(void) 1074nfsd4_cltrack_legacy_topdir(void)
@@ -1126,10 +1133,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
1126 return result; 1133 return result;
1127} 1134}
1128 1135
1136static char *
1137nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
1138{
1139 int copied;
1140 size_t len;
1141 char *result;
1142
1143 /* prefix + Y/N character + terminating NULL */
1144 len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1;
1145
1146 result = kmalloc(len, GFP_KERNEL);
1147 if (!result)
1148 return result;
1149
1150 copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c",
1151 clp->cl_minorversion ? 'Y' : 'N');
1152 if (copied >= len) {
1153 /* just return nothing if output was truncated */
1154 kfree(result);
1155 return NULL;
1156 }
1157
1158 return result;
1159}
1160
1161static char *
1162nfsd4_cltrack_grace_start(time_t grace_start)
1163{
1164 int copied;
1165 size_t len;
1166 char *result;
1167
1168 /* prefix + max width of int64_t string + terminating NULL */
1169 len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
1170
1171 result = kmalloc(len, GFP_KERNEL);
1172 if (!result)
1173 return result;
1174
1175 copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
1176 grace_start);
1177 if (copied >= len) {
1178 /* just return nothing if output was truncated */
1179 kfree(result);
1180 return NULL;
1181 }
1182
1183 return result;
1184}
1185
1129static int 1186static int
1130nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) 1187nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
1131{ 1188{
1132 char *envp[2]; 1189 char *envp[3];
1133 char *argv[4]; 1190 char *argv[4];
1134 int ret; 1191 int ret;
1135 1192
@@ -1140,10 +1197,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
1140 1197
1141 dprintk("%s: cmd: %s\n", __func__, cmd); 1198 dprintk("%s: cmd: %s\n", __func__, cmd);
1142 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); 1199 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
1143 dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)"); 1200 dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
1201 dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
1144 1202
1145 envp[0] = legacy; 1203 envp[0] = env0;
1146 envp[1] = NULL; 1204 envp[1] = env1;
1205 envp[2] = NULL;
1147 1206
1148 argv[0] = (char *)cltrack_prog; 1207 argv[0] = (char *)cltrack_prog;
1149 argv[1] = cmd; 1208 argv[1] = cmd;
@@ -1187,28 +1246,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
1187} 1246}
1188 1247
1189static int 1248static int
1190nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) 1249nfsd4_umh_cltrack_init(struct net *net)
1191{ 1250{
1251 int ret;
1252 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1253 char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
1254
1192 /* XXX: The usermode helper is not working in containers yet. */ 1255
1193 if (net != &init_net) { 1256 if (net != &init_net) {
1194 WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " 1257 WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
1195 "tracking in a container!\n"); 1258 "tracking in a container!\n");
1196 return -EINVAL; 1259 return -EINVAL;
1197 } 1260 }
1198 return nfsd4_umh_cltrack_upcall("init", NULL, NULL); 1261
1262 ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
1263 kfree(grace_start);
1264 return ret;
1265}
1266
1267static void
1268nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
1269{
1270 wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
1271 TASK_UNINTERRUPTIBLE);
1272}
1273
1274static void
1275nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
1276{
1277 smp_mb__before_atomic();
1278 clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
1279 smp_mb__after_atomic();
1280 wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
1199} 1281}
1200 1282
1201static void 1283static void
1202nfsd4_umh_cltrack_create(struct nfs4_client *clp) 1284nfsd4_umh_cltrack_create(struct nfs4_client *clp)
1203{ 1285{
1204 char *hexid; 1286 char *hexid, *has_session, *grace_start;
1287 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1288
1289 /*
1290 * With v4.0 clients, there's little difference in outcome between a
1291 * create and check operation, and we can end up calling into this
1292 * function multiple times per client (once for each openowner). So,
1293 * for v4.0 clients skip upcalling once the client has been recorded
1294 * on stable storage.
1295 *
1296 * For v4.1+ clients, the outcome of the two operations is different,
1297 * so we must ensure that we upcall for the create operation. v4.1+
1298 * clients call this on RECLAIM_COMPLETE though, so we should only end
1299 * up doing a single create upcall per client.
1300 */
1301 if (clp->cl_minorversion == 0 &&
1302 test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1303 return;
1205 1304
1206 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1305 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1207 if (!hexid) { 1306 if (!hexid) {
1208 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1307 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1209 return; 1308 return;
1210 } 1309 }
1211 nfsd4_umh_cltrack_upcall("create", hexid, NULL); 1310
1311 has_session = nfsd4_cltrack_client_has_session(clp);
1312 grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
1313
1314 nfsd4_cltrack_upcall_lock(clp);
1315 if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start))
1316 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1317 nfsd4_cltrack_upcall_unlock(clp);
1318
1319 kfree(has_session);
1320 kfree(grace_start);
1212 kfree(hexid); 1321 kfree(hexid);
1213} 1322}
1214 1323
@@ -1217,12 +1326,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
1217{ 1326{
1218 char *hexid; 1327 char *hexid;
1219 1328
1329 if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1330 return;
1331
1220 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1332 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1221 if (!hexid) { 1333 if (!hexid) {
1222 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1334 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1223 return; 1335 return;
1224 } 1336 }
1225 nfsd4_umh_cltrack_upcall("remove", hexid, NULL); 1337
1338 nfsd4_cltrack_upcall_lock(clp);
1339 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
1340 nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
1341 clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1342 nfsd4_cltrack_upcall_unlock(clp);
1343
1226 kfree(hexid); 1344 kfree(hexid);
1227} 1345}
1228 1346
@@ -1230,30 +1348,45 @@ static int
1230nfsd4_umh_cltrack_check(struct nfs4_client *clp) 1348nfsd4_umh_cltrack_check(struct nfs4_client *clp)
1231{ 1349{
1232 int ret; 1350 int ret;
1233 char *hexid, *legacy; 1351 char *hexid, *has_session, *legacy;
1352
1353 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1354 return 0;
1234 1355
1235 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1356 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1236 if (!hexid) { 1357 if (!hexid) {
1237 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1358 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1238 return -ENOMEM; 1359 return -ENOMEM;
1239 } 1360 }
1361
1362 has_session = nfsd4_cltrack_client_has_session(clp);
1240 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); 1363 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
1241 ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); 1364
1365 nfsd4_cltrack_upcall_lock(clp);
1366 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
1367 ret = 0;
1368 } else {
1369 ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy);
1370 if (ret == 0)
1371 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1372 }
1373 nfsd4_cltrack_upcall_unlock(clp);
1374 kfree(has_session);
1242 kfree(legacy); 1375 kfree(legacy);
1243 kfree(hexid); 1376 kfree(hexid);
1377
1244 return ret; 1378 return ret;
1245} 1379}
1246 1380
1247static void 1381static void
1248nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn, 1382nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
1249 time_t boot_time)
1250{ 1383{
1251 char *legacy; 1384 char *legacy;
1252 char timestr[22]; /* FIXME: better way to determine max size? */ 1385 char timestr[22]; /* FIXME: better way to determine max size? */
1253 1386
1254 sprintf(timestr, "%ld", boot_time); 1387 sprintf(timestr, "%ld", nn->boot_time);
1255 legacy = nfsd4_cltrack_legacy_topdir(); 1388 legacy = nfsd4_cltrack_legacy_topdir();
1256 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy); 1389 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
1257 kfree(legacy); 1390 kfree(legacy);
1258} 1391}
1259 1392
@@ -1356,10 +1489,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
1356} 1489}
1357 1490
1358void 1491void
1359nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time) 1492nfsd4_record_grace_done(struct nfsd_net *nn)
1360{ 1493{
1361 if (nn->client_tracking_ops) 1494 if (nn->client_tracking_ops)
1362 nn->client_tracking_ops->grace_done(nn, boot_time); 1495 nn->client_tracking_ops->grace_done(nn);
1363} 1496}
1364 1497
1365static int 1498static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d1b851548b7a..e9c3afe4b5d3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -96,6 +96,8 @@ static struct kmem_cache *deleg_slab;
96 96
97static void free_session(struct nfsd4_session *); 97static void free_session(struct nfsd4_session *);
98 98
99static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
100
99static bool is_session_dead(struct nfsd4_session *ses) 101static bool is_session_dead(struct nfsd4_session *ses)
100{ 102{
101 return ses->se_flags & NFS4_SESSION_DEAD; 103 return ses->se_flags & NFS4_SESSION_DEAD;
@@ -650,7 +652,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
650 INIT_LIST_HEAD(&dp->dl_perclnt); 652 INIT_LIST_HEAD(&dp->dl_perclnt);
651 INIT_LIST_HEAD(&dp->dl_recall_lru); 653 INIT_LIST_HEAD(&dp->dl_recall_lru);
652 dp->dl_type = NFS4_OPEN_DELEGATE_READ; 654 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
653 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_run_cb_recall); 655 dp->dl_retries = 1;
656 nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
657 &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
654 return dp; 658 return dp;
655out_dec: 659out_dec:
656 atomic_long_dec(&num_delegations); 660 atomic_long_dec(&num_delegations);
@@ -1870,7 +1874,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1870 free_client(clp); 1874 free_client(clp);
1871 return NULL; 1875 return NULL;
1872 } 1876 }
1873 INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_run_cb_null); 1877 nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
1874 clp->cl_time = get_seconds(); 1878 clp->cl_time = get_seconds();
1875 clear_bit(0, &clp->cl_cb_slot_busy); 1879 clear_bit(0, &clp->cl_cb_slot_busy);
1876 copy_verf(clp, verf); 1880 copy_verf(clp, verf);
@@ -3355,8 +3359,9 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
3355 return ret; 3359 return ret;
3356} 3360}
3357 3361
3358void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp) 3362static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
3359{ 3363{
3364 struct nfs4_delegation *dp = cb_to_delegation(cb);
3360 struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, 3365 struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
3361 nfsd_net_id); 3366 nfsd_net_id);
3362 3367
@@ -3377,6 +3382,43 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp)
3377 spin_unlock(&state_lock); 3382 spin_unlock(&state_lock);
3378} 3383}
3379 3384
3385static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
3386 struct rpc_task *task)
3387{
3388 struct nfs4_delegation *dp = cb_to_delegation(cb);
3389
3390 switch (task->tk_status) {
3391 case 0:
3392 return 1;
3393 case -EBADHANDLE:
3394 case -NFS4ERR_BAD_STATEID:
3395 /*
3396 * Race: client probably got cb_recall before open reply
3397 * granting delegation.
3398 */
3399 if (dp->dl_retries--) {
3400 rpc_delay(task, 2 * HZ);
3401 return 0;
3402 }
3403 /*FALLTHRU*/
3404 default:
3405 return -1;
3406 }
3407}
3408
3409static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
3410{
3411 struct nfs4_delegation *dp = cb_to_delegation(cb);
3412
3413 nfs4_put_stid(&dp->dl_stid);
3414}
3415
3416static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
3417 .prepare = nfsd4_cb_recall_prepare,
3418 .done = nfsd4_cb_recall_done,
3419 .release = nfsd4_cb_recall_release,
3420};
3421
3380static void nfsd_break_one_deleg(struct nfs4_delegation *dp) 3422static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
3381{ 3423{
3382 /* 3424 /*
@@ -3387,7 +3429,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
3387 * it's safe to take a reference. 3429 * it's safe to take a reference.
3388 */ 3430 */
3389 atomic_inc(&dp->dl_stid.sc_count); 3431 atomic_inc(&dp->dl_stid.sc_count);
3390 nfsd4_cb_recall(dp); 3432 nfsd4_run_cb(&dp->dl_recall);
3391} 3433}
3392 3434
3393/* Called from break_lease() with i_lock held. */ 3435/* Called from break_lease() with i_lock held. */
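The refactor replaces per-callback rpc_call_ops with a small
nfsd4_callback_ops vtable (declared in the state.h hunk below): ->prepare
runs before the RPC is issued, ->done interprets the completed task, and
->release drops references. The return convention for ->done, as decoded
by the switch in nfsd4_cb_done() above, is:

	 0  restart the RPC (typically after rpc_delay())
	 1  callback finished; cb_done is set
	-1  treat the channel as dead; nfsd4_mark_cb_down() is called

A sketch of a conforming ->done method (transient_error() and
retries_left() are placeholders for per-callback policy, like dl_retries
in the recall case):

	static int example_done(struct nfsd4_callback *cb,
				struct rpc_task *task)
	{
		if (task->tk_status == 0)
			return 1;		/* done */
		if (transient_error(task->tk_status) && retries_left(cb)) {
			rpc_delay(task, 2 * HZ);
			return 0;		/* restart the call */
		}
		return -1;			/* give up; mark cb down */
	}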
@@ -4113,7 +4155,7 @@ out:
4113 return status; 4155 return status;
4114} 4156}
4115 4157
4116static void 4158void
4117nfsd4_end_grace(struct nfsd_net *nn) 4159nfsd4_end_grace(struct nfsd_net *nn)
4118{ 4160{
4119 /* do nothing if grace period already ended */ 4161 /* do nothing if grace period already ended */
@@ -4122,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn)
4122 4164
4123 dprintk("NFSD: end of grace period\n"); 4165 dprintk("NFSD: end of grace period\n");
4124 nn->grace_ended = true; 4166 nn->grace_ended = true;
4125 nfsd4_record_grace_done(nn, nn->boot_time); 4167 /*
4168 * If the server goes down again right now, an NFSv4
4169 * client will still be allowed to reclaim after it comes back up,
4170 * even if it hasn't yet had a chance to reclaim state this time.
4171 *
4172 */
4173 nfsd4_record_grace_done(nn);
4174 /*
4175 * At this point, NFSv4 clients can still reclaim. But if the
4176 * server crashes, any that have not yet reclaimed will be out
4177 * of luck on the next boot.
4178 *
4179 * (NFSv4.1+ clients are considered to have reclaimed once they
4180 * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
4181 * have reclaimed after their first OPEN.)
4182 */
4126 locks_end_grace(&nn->nfsd4_manager); 4183 locks_end_grace(&nn->nfsd4_manager);
4127 /* 4184 /*
4128 * Now that every NFSv4 client has had the chance to recover and 4185 * At this point, and once lockd and/or any other containers
4129 * to see the (possibly new, possibly shorter) lease time, we 4186 * exit their grace period, further reclaims will fail and
4130 * can safely set the next grace time to the current lease time: 4187 * regular locking can resume.
4131 */ 4188 */
4132 nn->nfsd4_grace = nn->nfsd4_lease;
4133} 4189}
4134 4190
4135static time_t 4191static time_t
@@ -5664,6 +5720,9 @@ nfs4_check_open_reclaim(clientid_t *clid,
5664 if (status) 5720 if (status)
5665 return nfserr_reclaim_bad; 5721 return nfserr_reclaim_bad;
5666 5722
5723 if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
5724 return nfserr_no_grace;
5725
5667 if (nfsd4_client_record_check(cstate->clp)) 5726 if (nfsd4_client_record_check(cstate->clp))
5668 return nfserr_reclaim_bad; 5727 return nfserr_reclaim_bad;
5669 5728
@@ -6361,10 +6420,10 @@ nfs4_state_start_net(struct net *net)
6361 ret = nfs4_state_create_net(net); 6420 ret = nfs4_state_create_net(net);
6362 if (ret) 6421 if (ret)
6363 return ret; 6422 return ret;
6364 nfsd4_client_tracking_init(net);
6365 nn->boot_time = get_seconds(); 6423 nn->boot_time = get_seconds();
6366 locks_start_grace(net, &nn->nfsd4_manager);
6367 nn->grace_ended = false; 6424 nn->grace_ended = false;
6425 locks_start_grace(net, &nn->nfsd4_manager);
6426 nfsd4_client_tracking_init(net);
6368 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", 6427 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
6369 nn->nfsd4_grace, net); 6428 nn->nfsd4_grace, net);
6370 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); 6429 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e771a1a7c6f1..eeea7a90eb87 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1514,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1514} 1514}
1515 1515
1516static __be32 1516static __be32
1517nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
1518{
1519 DECODE_HEAD;
1520
1521 status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
1522 if (status)
1523 return status;
1524
1525 READ_BUF(8 + 4);
1526 p = xdr_decode_hyper(p, &seek->seek_offset);
1527 seek->seek_whence = be32_to_cpup(p);
1528
1529 DECODE_TAIL;
1530}
1531
1532static __be32
1517nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) 1533nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1518{ 1534{
1519 return nfs_ok; 1535 return nfs_ok;
@@ -1586,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1586 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1602 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1587 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, 1603 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
1588 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, 1604 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
1605
1606 /* new operations for NFSv4.2 */
1607 [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
1608 [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
1609 [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
1610 [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
1615 [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
1616 [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
1617 [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
1618 [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
1589}; 1619};
1590 1620
1591static inline bool 1621static inline bool
@@ -2658,6 +2688,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2658 struct xdr_stream *xdr = cd->xdr; 2688 struct xdr_stream *xdr = cd->xdr;
2659 int start_offset = xdr->buf->len; 2689 int start_offset = xdr->buf->len;
2660 int cookie_offset; 2690 int cookie_offset;
2691 u32 name_and_cookie;
2661 int entry_bytes; 2692 int entry_bytes;
2662 __be32 nfserr = nfserr_toosmall; 2693 __be32 nfserr = nfserr_toosmall;
2663 __be64 wire_offset; 2694 __be64 wire_offset;
@@ -2719,7 +2750,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2719 cd->rd_maxcount -= entry_bytes; 2750 cd->rd_maxcount -= entry_bytes;
2720 if (!cd->rd_dircount) 2751 if (!cd->rd_dircount)
2721 goto fail; 2752 goto fail;
2722 cd->rd_dircount--; 2753 /*
2754 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
2755 * let's always let through the first entry, at least:
2756 */
2757 name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
2758 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
2759 goto fail;
2760 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
2723 cd->cookie_offset = cookie_offset; 2761 cd->cookie_offset = cookie_offset;
2724skip_entry: 2762skip_entry:
2725 cd->common.err = nfs_ok; 2763 cd->common.err = nfs_ok;
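The new accounting charges each entry against rd_dircount the way RFC 3530
defines the dircount hint: the XDR-padded name length plus an 8-byte
cookie. Since XDR_QUADLEN(l) is ((l) + 3) >> 2, a 5-byte name rounds up to
2 quads, for a charge of 4 * 2 + 8 = 16 bytes. The arithmetic, as a
standalone sketch:

	/* Sketch: per-entry charge against the READDIR dircount hint. */
	#define QUADLEN(l)	(((l) + 3) >> 2)  /* XDR pads to 4 bytes */

	static unsigned int dircount_charge(unsigned int namlen)
	{
		return 4 * QUADLEN(namlen) + 8;	/* padded name + cookie */
	}

	/* dircount_charge(5) == 16, dircount_charge(12) == 20 */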
@@ -3097,7 +3135,8 @@ static __be32 nfsd4_encode_splice_read(
3097 3135
3098 buf->page_len = maxcount; 3136 buf->page_len = maxcount;
3099 buf->len += maxcount; 3137 buf->len += maxcount;
3100 xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE; 3138 xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
3139 / PAGE_SIZE;
3101 3140
3102 /* Use rest of head for padding and remaining ops: */ 3141 /* Use rest of head for padding and remaining ops: */
3103 buf->tail[0].iov_base = xdr->p; 3142 buf->tail[0].iov_base = xdr->p;
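The one-line fix above matters whenever the reply's page data does not
start on a page boundary: the number of page slots consumed depends on
page_base as well as length. With PAGE_SIZE 4096, page_base 3000, and
maxcount 2000, the data spans two pages ((3000 + 2000 + 4095) / 4096 = 2),
while the old expression charged only one ((2000 + 4095) / 4096 = 1),
leaving xdr->page_ptr pointing into live data.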
@@ -3322,6 +3361,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3322 } 3361 }
3323 maxcount = min_t(int, maxcount-16, bytes_left); 3362 maxcount = min_t(int, maxcount-16, bytes_left);
3324 3363
3364 /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
3365 if (!readdir->rd_dircount)
3366 readdir->rd_dircount = INT_MAX;
3367
3325 readdir->xdr = xdr; 3368 readdir->xdr = xdr;
3326 readdir->rd_maxcount = maxcount; 3369 readdir->rd_maxcount = maxcount;
3327 readdir->common.err = 0; 3370 readdir->common.err = 0;
@@ -3752,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3752} 3795}
3753 3796
3754static __be32 3797static __be32
3798nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3799 struct nfsd4_seek *seek)
3800{
3801 __be32 *p;
3802
3803 if (nfserr)
3804 return nfserr;
3805
3806 p = xdr_reserve_space(&resp->xdr, 4 + 8);
3807 *p++ = cpu_to_be32(seek->seek_eof);
3808 p = xdr_encode_hyper(p, seek->seek_pos);
3809
3810 return nfserr;
3811}
3812
3813static __be32
3755nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3814nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
3756{ 3815{
3757 return nfserr; 3816 return nfserr;
@@ -3823,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3823 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 3882 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3824 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, 3883 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3825 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, 3884 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
3885
3886 /* NFSv4.2 operations */
3887 [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
3888 [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop,
3889 [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop,
3890 [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
3893 [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
3898 [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
3826}; 3899};
3827 3900
3828/* 3901/*
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 4e042105fb6e..ca73ca79a0ee 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -49,6 +49,7 @@ enum {
49 NFSD_Leasetime, 49 NFSD_Leasetime,
50 NFSD_Gracetime, 50 NFSD_Gracetime,
51 NFSD_RecoveryDir, 51 NFSD_RecoveryDir,
52 NFSD_V4EndGrace,
52#endif 53#endif
53}; 54};
54 55
@@ -68,6 +69,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
68static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 69static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
69static ssize_t write_gracetime(struct file *file, char *buf, size_t size); 70static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
70static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 71static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
72static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
71#endif 73#endif
72 74
73static ssize_t (*write_op[])(struct file *, char *, size_t) = { 75static ssize_t (*write_op[])(struct file *, char *, size_t) = {
@@ -84,6 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
84 [NFSD_Leasetime] = write_leasetime, 86 [NFSD_Leasetime] = write_leasetime,
85 [NFSD_Gracetime] = write_gracetime, 87 [NFSD_Gracetime] = write_gracetime,
86 [NFSD_RecoveryDir] = write_recoverydir, 88 [NFSD_RecoveryDir] = write_recoverydir,
89 [NFSD_V4EndGrace] = write_v4_end_grace,
87#endif 90#endif
88}; 91};
89 92
@@ -1077,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1077 return rv; 1080 return rv;
1078} 1081}
1079 1082
1083/**
1084 * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
1085 *
1086 * Input:
1087 * buf: ignored
1088 * size: zero
1089 * OR
1090 *
1091 * Input:
1092 * buf: any value
1093 * size: non-zero length of C string in @buf
1094 * Output:
1095 * passed-in buffer filled with "Y" or "N" with a newline
1096 * and NULL-terminated C string. This indicates whether
1097 * the grace period has ended in the current net
1098 * namespace. Return code is the size in bytes of the
1099 * string. Writing a string that starts with 'Y', 'y', or
1100 * '1' to the file will end the grace period for nfsd's v4
1101 * lock manager.
1102 */
1103static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
1104{
1105 struct net *net = file->f_dentry->d_sb->s_fs_info;
1106 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1107
1108 if (size > 0) {
1109 switch (buf[0]) {
1110 case 'Y':
1111 case 'y':
1112 case '1':
1113 nfsd4_end_grace(nn);
1114 break;
1115 default:
1116 return -EINVAL;
1117 }
1118 }
1119
1120 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
1121 nn->grace_ended ? 'Y' : 'N');
1122}
1123
1080#endif 1124#endif
1081 1125
1082/*----------------------------------------------------------------------------*/ 1126/*----------------------------------------------------------------------------*/
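With the nfsd filesystem mounted at its conventional /proc/fs/nfsd, the new
transaction file can be driven from userspace: reading reports "Y" or "N",
and writing a string starting with 'Y', 'y', or '1' ends the v4 grace
period early. A sketch of a minimal client (the path is the conventional
mount point, not something this patch mandates):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char state[4] = "";
		int fd = open("/proc/fs/nfsd/v4_end_grace", O_RDWR);

		if (fd < 0)
			return 1;
		if (write(fd, "Y", 1) != 1)	/* 'Y', 'y', or '1' */
			perror("write");
		/* transaction files return the reply on a follow-up read */
		pread(fd, state, sizeof(state) - 1, 0);
		printf("grace ended: %s", state);	/* "Y\n" or "N\n" */
		close(fd);
		return 0;
	}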
@@ -1110,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1110 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1154 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1111 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1155 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1112 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1156 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1157 [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
1113#endif 1158#endif
1114 /* last one */ {""} 1159 /* last one */ {""}
1115 }; 1160 };
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index e883a5868be6..88026fc6a981 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -209,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 	 * fix that case easily.
 	 */
 		struct cred *new = prepare_creds();
-		if (!new)
-			return nfserrno(-ENOMEM);
+		if (!new) {
+			error = nfserrno(-ENOMEM);
+			goto out;
+		}
 		new->cap_effective =
 			cap_raise_nfsd_set(new->cap_effective,
 					   new->cap_permitted);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 64f291a25a8c..2712042a66b1 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -62,16 +62,21 @@ typedef struct {
 	(s)->si_generation
 
 struct nfsd4_callback {
-	void *cb_op;
 	struct nfs4_client *cb_clp;
 	struct list_head cb_per_client;
 	u32 cb_minorversion;
 	struct rpc_message cb_msg;
-	const struct rpc_call_ops *cb_ops;
+	struct nfsd4_callback_ops *cb_ops;
 	struct work_struct cb_work;
 	bool cb_done;
 };
 
+struct nfsd4_callback_ops {
+	void (*prepare)(struct nfsd4_callback *);
+	int (*done)(struct nfsd4_callback *, struct rpc_task *);
+	void (*release)(struct nfsd4_callback *);
+};
+
 /*
  * A core object that represents a "common" stateid. These are generally
  * embedded within the different (more specific) stateid objects and contain
@@ -127,6 +132,9 @@ struct nfs4_delegation {
 	struct nfsd4_callback dl_recall;
 };
 
+#define cb_to_delegation(cb) \
+	container_of(cb, struct nfs4_delegation, dl_recall)
+
 /* client delegation callback info */
 struct nfs4_cb_conn {
 	/* SETCLIENTID info */
@@ -306,6 +314,7 @@ struct nfs4_client {
 #define NFSD4_CLIENT_STABLE		(2)	/* client on stable storage */
 #define NFSD4_CLIENT_RECLAIM_COMPLETE	(3)	/* reclaim_complete done */
 #define NFSD4_CLIENT_CONFIRMED		(4)	/* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK	(5)	/* upcall serialization */
 #define NFSD4_CLIENT_CB_FLAG_MASK	(1 << NFSD4_CLIENT_CB_UPDATE | \
 					 1 << NFSD4_CLIENT_CB_KILL)
 	unsigned long	cl_flags;
@@ -516,6 +525,13 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
 #define RD_STATE		0x00000010
 #define WR_STATE		0x00000020
 
+enum nfsd4_cb_op {
+	NFSPROC4_CLNT_CB_NULL = 0,
+	NFSPROC4_CLNT_CB_RECALL,
+	NFSPROC4_CLNT_CB_SEQUENCE,
+};
+
+
 struct nfsd4_compound_state;
 struct nfsd_net;
 
@@ -530,12 +546,12 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
 extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
 		struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
 extern int set_callback_cred(void);
-void nfsd4_run_cb_null(struct work_struct *w);
-void nfsd4_run_cb_recall(struct work_struct *w);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+extern void nfsd4_run_cb(struct nfsd4_callback *cb);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
@@ -544,13 +560,16 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 		struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
 
+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
+
 /* nfs4recover operations */
 extern int nfsd4_client_tracking_init(struct net *net);
 extern void nfsd4_client_tracking_exit(struct net *net);
 extern void nfsd4_client_record_create(struct nfs4_client *clp);
 extern void nfsd4_client_record_remove(struct nfs4_client *clp);
 extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);
 
 /* nfs fault injection functions */
 #ifdef CONFIG_NFSD_FAULT_INJECTION
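
A rough sketch of how a consumer is meant to wire these pieces together, modeled on the delegation-recall case (the op bodies below are illustrative placeholders, not the patch body; the real implementations live in nfsd's callback code, and nfs4_put_stid is assumed as the ref-drop helper):

/* Illustrative sketch of the new callback interface. */
static void my_recall_release(struct nfsd4_callback *cb)
{
	/* recover the containing delegation via the new helper */
	struct nfs4_delegation *dp = cb_to_delegation(cb);

	nfs4_put_stid(&dp->dl_stid);	/* assumed refcount drop */
}

static struct nfsd4_callback_ops my_recall_ops = {
	/* .prepare and .done elided in this sketch */
	.release	= my_recall_release,
};

static void my_start_recall(struct nfs4_delegation *dp, struct nfs4_client *clp)
{
	nfsd4_init_cb(&dp->dl_recall, clp, &my_recall_ops,
		      NFSPROC4_CLNT_CB_RECALL);
	nfsd4_run_cb(&dp->dl_recall);	/* queues cb_work on the callback queue */
}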
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index f501a9b5c9df..965cffd17a0c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -445,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		if (err)
 			goto out;
 		size_change = 1;
+
+		/*
+		 * RFC5661, Section 18.30.4:
+		 * Changing the size of a file with SETATTR indirectly
+		 * changes the time_modify and change attributes.
+		 *
+		 * (and similar for the older RFCs)
+		 */
+		if (iap->ia_size != i_size_read(inode))
+			iap->ia_valid |= ATTR_MTIME;
 	}
 
 	iap->ia_valid |= ATTR_CTIME;
@@ -649,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 {
 	struct path	path;
 	struct inode	*inode;
+	struct file	*file;
 	int		flags = O_RDONLY|O_LARGEFILE;
 	__be32		err;
 	int		host_err = 0;
@@ -703,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		else
 			flags = O_WRONLY|O_LARGEFILE;
 	}
-	*filp = dentry_open(&path, flags, current_cred());
-	if (IS_ERR(*filp)) {
-		host_err = PTR_ERR(*filp);
-		*filp = NULL;
-	} else {
-		host_err = ima_file_check(*filp, may_flags);
 
-		if (may_flags & NFSD_MAY_64BIT_COOKIE)
-			(*filp)->f_mode |= FMODE_64BITHASH;
-		else
-			(*filp)->f_mode |= FMODE_32BITHASH;
-	}
+	file = dentry_open(&path, flags, current_cred());
+	if (IS_ERR(file)) {
+		host_err = PTR_ERR(file);
+		goto out_nfserr;
+	}
+
+	host_err = ima_file_check(file, may_flags);
+	if (host_err) {
+		nfsd_close(file);
+		goto out_nfserr;
+	}
+
+	if (may_flags & NFSD_MAY_64BIT_COOKIE)
+		file->f_mode |= FMODE_64BITHASH;
+	else
+		file->f_mode |= FMODE_32BITHASH;
+
+	*filp = file;
 out_nfserr:
 	err = nfserrno(host_err);
 out:
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 465e7799742a..5720e9457f33 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,17 @@ struct nfsd4_reclaim_complete {
 	u32		rca_one_fs;
 };
 
+struct nfsd4_seek {
+	/* request */
+	stateid_t	seek_stateid;
+	loff_t		seek_offset;
+	u32		seek_whence;
+
+	/* response */
+	u32		seek_eof;
+	loff_t		seek_pos;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
@@ -473,6 +484,9 @@ struct nfsd4_op {
 		struct nfsd4_reclaim_complete	reclaim_complete;
 		struct nfsd4_test_stateid	test_stateid;
 		struct nfsd4_free_stateid	free_stateid;
+
+		/* NFSv4.2 */
+		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
 };
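
The new op maps naturally onto the VFS SEEK_HOLE/SEEK_DATA interface. A condensed sketch of how a server-side handler could fill the response half of struct nfsd4_seek (an illustration of the data flow, not the literal nfsd4_seek implementation; stateid checking and full error mapping are elided):

/* Sketch only: filling a struct nfsd4_seek reply via vfs_llseek(). */
static __be32 seek_sketch(struct file *file, struct nfsd4_seek *seek)
{
	int whence = (seek->seek_whence == NFS4_CONTENT_HOLE) ?
			SEEK_HOLE : SEEK_DATA;
	loff_t pos = vfs_llseek(file, seek->seek_offset, whence);

	if (pos < 0)		/* e.g. -ENXIO: no further data or hole */
		return nfserr_nxio;

	seek->seek_pos = pos;
	seek->seek_eof = (pos >= i_size_read(file_inode(file)));
	return nfs_ok;
}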
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6252b173a465..d071e7f23de2 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -24,6 +24,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
+#include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/aio.h>
 #include "nilfs.h"
@@ -219,10 +220,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
 
 static int nilfs_set_page_dirty(struct page *page)
 {
+	struct inode *inode = page->mapping->host;
 	int ret = __set_page_dirty_nobuffers(page);
 
 	if (page_has_buffers(page)) {
-		struct inode *inode = page->mapping->host;
 		unsigned nr_dirty = 0;
 		struct buffer_head *bh, *head;
 
@@ -245,6 +246,10 @@ static int nilfs_set_page_dirty(struct page *page)
 
 		if (nr_dirty)
 			nilfs_set_file_dirty(inode, nr_dirty);
+	} else if (ret) {
+		unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		nilfs_set_file_dirty(inode, nr_dirty);
 	}
 	return ret;
 }
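
The new else-branch charges a buffer-less page as one full page's worth of filesystem blocks; a quick worked instance of the shift (the concrete sizes below are illustrative):

/* Stand-alone illustration of the nr_dirty computation above. */
#include <stdio.h>

int main(void)
{
	unsigned page_shift = 12;	/* 4 KiB page, i.e. PAGE_CACHE_SHIFT */
	unsigned blkbits = 10;		/* 1 KiB filesystem block */
	unsigned nr_dirty = 1u << (page_shift - blkbits);

	printf("%u blocks accounted per dirtied page\n", nr_dirty); /* 4 */
	return 0;
}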
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b13992a41bd9..c991616acca9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	client_fd = get_unused_fd();
+	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
 	if (client_fd < 0)
 		return client_fd;
 
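
The practical effect is that the event_f_flags requested at fanotify_init() time now reach the per-event descriptors; in particular O_CLOEXEC can finally take effect on them. A userspace illustration (assumed usage, not part of the patch):

/* Userspace view: event fds inherit these flags after this change. */
#include <fcntl.h>
#include <sys/fanotify.h>

int make_group(void)
{
	/* the second argument becomes group->fanotify_data.f_flags and is
	 * now passed through to get_unused_fd_flags() for every event fd */
	return fanotify_init(FAN_CLASS_NOTIF, O_RDONLY | O_CLOEXEC);
}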
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 238a5930cb3c..9d7e2b9659cb 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 {
 	struct {
 		struct file_handle handle;
-		u8 pad[64];
+		u8 pad[MAX_HANDLE_SZ];
 	} f;
 	int size, ret, i;
 
@@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 	size = f.handle.handle_bytes >> 2;
 
 	ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
-	if ((ret == 255) || (ret == -ENOSPC)) {
+	if ((ret == FILEID_INVALID) || (ret < 0)) {
 		WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
 		return 0;
 	}
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 85e7d2b431d9..9c0898c4cfe1 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -23,9 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
 		struct fsnotify_group *group, struct vfsmount *mnt,
 		int allow_dups);
 
-/* final kfree of a group */
-extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
-
 /* vfsmount specific destruction of a mark */
 extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
 /* inode specific destruction of a mark */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ad1995980456..d16b62cb2854 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -31,7 +31,7 @@
 /*
  * Final freeing of a group
  */
-void fsnotify_final_destroy_group(struct fsnotify_group *group)
+static void fsnotify_final_destroy_group(struct fsnotify_group *group)
 {
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 0f88bc0b4e6c..7d888d77d59a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	/* ideally the idr is empty and we won't hit the BUG in the callback */
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_destroy(&group->inotify_data.idr);
-	atomic_dec(&group->inotify_data.user->inotify_devs);
-	free_uid(group->inotify_data.user);
+	if (group->inotify_data.user) {
+		atomic_dec(&group->inotify_data.user->inotify_devs);
+		free_uid(group->inotify_data.user);
+	}
 }
 
 static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index dd6103cc93c1..825a54e8f490 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb,
 /* If 1, output debug messages, and if 0, don't. */
 int debug_msgs = 0;
 
-void __ntfs_debug (const char *file, int line, const char *function,
+void __ntfs_debug(const char *file, int line, const char *function,
 		   const char *fmt, ...)
 {
 	struct va_format vaf;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f5ec1ce7a532..643faa44f22b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
 /*
  * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
  *
- * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
  *
  * This program/include file is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as published
@@ -410,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
 	BUG_ON(!nr_pages);
 	err = nr = 0;
 	do {
-		pages[nr] = find_lock_page(mapping, index);
+		pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
+				FGP_ACCESSED);
 		if (!pages[nr]) {
 			if (!*cached_page) {
 				*cached_page = page_cache_alloc(mapping);
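
find_get_page_flags() with FGP_LOCK is the modern spelling of find_lock_page(); adding FGP_ACCESSED also marks the page referenced so reclaim treats it as recently used, something callers previously had to do by hand. The equivalence in miniature (a sketch against the FGP_* pagecache API introduced in v3.16):

/* Sketch of the lookup equivalence; for orientation only. */
#include <linux/pagemap.h>

static struct page *old_way(struct address_space *mapping, pgoff_t index)
{
	return find_lock_page(mapping, index);	/* locked, no accessed mark */
}

static struct page *new_way(struct address_space *mapping, pgoff_t index)
{
	/* FGP_LOCK: return the page locked, like find_lock_page();
	 * FGP_ACCESSED: additionally mark it accessed for reclaim */
	return find_get_page_flags(mapping, index, FGP_LOCK | FGP_ACCESSED);
}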
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c3296e546c3..9e1e112074fb 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void)
 }
 
 MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc.");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
 MODULE_VERSION(NTFS_VERSION);
 MODULE_LICENSE("GPL");
 #ifdef DEBUG
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4a231a166cf8..1ef547e49373 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 	handle_t *handle;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
 	page = find_or_create_page(mapping, 0, GFP_NOFS);
 	if (!page) {
+		ocfs2_commit_trans(osb, handle);
 		ret = -ENOMEM;
 		mlog_errno(ret);
 		goto out;
@@ -1494,13 +1502,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
 	wc->w_pages[0] = wc->w_target_page = page;
 	wc->w_num_pages = 1;
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		mlog_errno(ret);
-		goto out;
-	}
-
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
 				      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret) {
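
The reordering makes ocfs2_write_begin_inline() take the journal handle before it holds a locked page, matching the ocfs2_write_zero_page() change later in this diff; the usual reading is that this keeps a task holding a locked page from blocking on journal start while the committing thread may be waiting to write that same page. The ordering rule in schematic form (an interpretation of the patch, not text from it):

/* Illustrative shape of the corrected ordering (not literal patch code). */
static int ordered_update(struct ocfs2_super *osb, struct address_space *mapping)
{
	handle_t *handle;
	struct page *page;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); /* 1st */
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	page = find_or_create_page(mapping, 0, GFP_NOFS);	/* 2nd */
	if (!page) {
		ocfs2_commit_trans(osb, handle);	/* unwind in reverse */
		return -ENOMEM;
	}

	/* ... modify the page under the handle ... */

	unlock_page(page);
	page_cache_release(page);
	ocfs2_commit_trans(osb, handle);
	return 0;
}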
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73039295d0d1..d13385448168 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num)
 }
 EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
 
+int o2hb_check_node_heartbeating_no_sem(u8 node_num)
+{
+	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	unsigned long flags;
+
+	spin_lock_irqsave(&o2hb_live_lock, flags);
+	o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
+	spin_unlock_irqrestore(&o2hb_live_lock, flags);
+	if (!test_bit(node_num, testing_map)) {
+		mlog(ML_HEARTBEAT,
+		     "node (%u) does not have heartbeating enabled.\n",
+		     node_num);
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
+
 int o2hb_check_node_heartbeating_from_callback(u8 node_num)
 {
 	unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 00ad8e8fea51..3ef5137dc362 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map,
 void o2hb_exit(void);
 int o2hb_init(void);
 int o2hb_check_node_heartbeating(u8 node_num);
+int o2hb_check_node_heartbeating_no_sem(u8 node_num);
 int o2hb_check_node_heartbeating_from_callback(u8 node_num);
 int o2hb_check_local_node_heartbeating(void);
 void o2hb_stop_all_regions(void);
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 73ba81928bce..27d1242c8383 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = {
 static int nst_fop_open(struct inode *inode, struct file *file)
 {
 	struct o2net_send_tracking *dummy_nst;
-	struct seq_file *seq;
-	int ret;
 
-	dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
-	if (dummy_nst == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	dummy_nst->st_task = NULL;
-
-	ret = seq_open(file, &nst_seq_ops);
-	if (ret)
-		goto out;
-
-	seq = file->private_data;
-	seq->private = dummy_nst;
+	dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
+	if (!dummy_nst)
+		return -ENOMEM;
 	o2net_debug_add_nst(dummy_nst);
 
-	dummy_nst = NULL;
-
-out:
-	kfree(dummy_nst);
-	return ret;
+	return 0;
 }
 
 static int nst_fop_release(struct inode *inode, struct file *file)
@@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = {
 	.show = sc_seq_show,
 };
 
-static int sc_common_open(struct file *file, struct o2net_sock_debug *sd)
+static int sc_common_open(struct file *file, int ctxt)
 {
+	struct o2net_sock_debug *sd;
 	struct o2net_sock_container *dummy_sc;
-	struct seq_file *seq;
-	int ret;
 
-	dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
-	if (dummy_sc == NULL) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	dummy_sc->sc_page = NULL;
+	dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
+	if (!dummy_sc)
+		return -ENOMEM;
 
-	ret = seq_open(file, &sc_seq_ops);
-	if (ret)
-		goto out;
+	sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
+	if (!sd) {
+		kfree(dummy_sc);
+		return -ENOMEM;
+	}
 
-	seq = file->private_data;
-	seq->private = sd;
+	sd->dbg_ctxt = ctxt;
 	sd->dbg_sock = dummy_sc;
-	o2net_debug_add_sc(dummy_sc);
 
-	dummy_sc = NULL;
+	o2net_debug_add_sc(dummy_sc);
 
-out:
-	kfree(dummy_sc);
-	return ret;
+	return 0;
 }
 
 static int sc_fop_release(struct inode *inode, struct file *file)
@@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
 
 static int stats_fop_open(struct inode *inode, struct file *file)
 {
-	struct o2net_sock_debug *sd;
-
-	sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
-	if (sd == NULL)
-		return -ENOMEM;
-
-	sd->dbg_ctxt = SHOW_SOCK_STATS;
-	sd->dbg_sock = NULL;
-
-	return sc_common_open(file, sd);
+	return sc_common_open(file, SHOW_SOCK_STATS);
 }
 
 static const struct file_operations stats_seq_fops = {
@@ -474,16 +443,7 @@ static const struct file_operations stats_seq_fops = {
 
 static int sc_fop_open(struct inode *inode, struct file *file)
 {
-	struct o2net_sock_debug *sd;
-
-	sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
-	if (sd == NULL)
-		return -ENOMEM;
-
-	sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
-	sd->dbg_sock = NULL;
-
-	return sc_common_open(file, sd);
+	return sc_common_open(file, SHOW_SOCK_CONTAINERS);
 }
 
 static const struct file_operations sc_seq_fops = {
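
__seq_open_private(), from <linux/seq_file.h>, allocates a zeroed private area of the requested size, performs seq_open(), and stores the allocation in seq->private, returning it to the caller; that single call is what each of the opens above (and the dlmdebug/dlmglue opens later in this diff) collapses into. The idiom in isolation (names prefixed my_ are placeholders):

/* Minimal sketch of the __seq_open_private() idiom. */
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>

struct my_iter {
	int pos;			/* arrives zeroed */
};

static const struct seq_operations my_seq_ops;	/* .start/.next/.stop/.show elided */

static int my_open(struct inode *inode, struct file *file)
{
	struct my_iter *it;

	/* replaces: kzalloc() + seq_open() + seq->private = it */
	it = __seq_open_private(file, &my_seq_ops, sizeof(*it));
	if (!it)
		return -ENOMEM;
	return 0;
}

static int my_release(struct inode *inode, struct file *file)
{
	/* the matching release must free the private blob */
	return seq_release_private(inode, file);
}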
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 1ec141e758d7..62e8ec619b4c 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work)
 	}
 
 out:
-	spin_unlock(&qs->qs_lock);
-	if (fence)
+	if (fence) {
+		spin_unlock(&qs->qs_lock);
 		o2quo_fence_self();
+	} else {
+		mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
+			"connected: %d, lowest: %d (%sreachable)\n",
+			qs->qs_heartbeating, qs->qs_connected, lowest_hb,
+			lowest_reachable ? "" : "un");
+		spin_unlock(&qs->qs_lock);
+
+	}
+
 }
 
 static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 681691bc233a..97de0fbd9f78 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
 	if (nn->nn_persistent_error || nn->nn_sc_valid)
 		wake_up(&nn->nn_sc_wq);
 
-	if (!was_err && nn->nn_persistent_error) {
+	if (was_valid && !was_err && nn->nn_persistent_error) {
 		o2quo_conn_err(o2net_num_from_nn(nn));
 		queue_delayed_work(o2net_wq, &nn->nn_still_up,
 				   msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
@@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock)
 	return ret;
 }
 
+static int o2net_set_usertimeout(struct socket *sock)
+{
+	int user_timeout = O2NET_TCP_USER_TIMEOUT;
+
+	return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
+			(char *)&user_timeout, sizeof(user_timeout));
+}
+
 static void o2net_initialize_handshake(void)
 {
 	o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
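
TCP_USER_TIMEOUT (Linux 2.6.37+) caps how long transmitted data may stay unacknowledged before TCP aborts the connection; pinning it to O2NET_TCP_USER_TIMEOUT (0x7fffffff ms, roughly 24.8 days) effectively stops the TCP stack from tearing down an o2net link on its own, leaving that decision to o2net's idle timer below. The same knob from userspace, for comparison (illustrative, not part of the patch):

/* Userspace analogue of o2net_set_usertimeout(). */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int set_user_timeout(int fd, unsigned int ms)
{
	/* data left unacked longer than 'ms' aborts the connection */
	return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &ms, sizeof(ms));
}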
@@ -1536,16 +1544,20 @@ static void o2net_idle_timer(unsigned long data)
 #endif
 
 	printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
-	       "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
-	       msecs / 1000, msecs % 1000);
+	       "idle for %lu.%lu secs.\n",
+	       SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000);
 
-	/*
-	 * Initialize the nn_timeout so that the next connection attempt
-	 * will continue in o2net_start_connect.
-	 */
+	/* An idle timeout occurred; don't shut down the connection, but
+	 * make the fencing decision. Maybe the connection can recover
+	 * before the decision is made.
+	 */
 	atomic_set(&nn->nn_timeout, 1);
+	o2quo_conn_err(o2net_num_from_nn(nn));
+	queue_delayed_work(o2net_wq, &nn->nn_still_up,
+			msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
+
+	o2net_sc_reset_idle_timer(sc);
 
-	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
 
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
@@ -1560,6 +1572,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
 
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
 {
+	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
+
+	/* clear the fencing decision, since the connection recovered
+	 * from the timeout */
+	if (atomic_read(&nn->nn_timeout)) {
+		o2quo_conn_up(o2net_num_from_nn(nn));
+		cancel_delayed_work(&nn->nn_still_up);
+		atomic_set(&nn->nn_timeout, 0);
+	}
+
 	/* Only push out an existing timer */
 	if (timer_pending(&sc->sc_idle_timeout))
 		o2net_sc_reset_idle_timer(sc);
@@ -1580,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work)
 	struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
 	int ret = 0, stop;
 	unsigned int timeout;
+	unsigned int noio_flag;
 
+	/*
+	 * sock_create allocates the sock with GFP_KERNEL. We must set
+	 * the per-process flag PF_MEMALLOC_NOIO so that all allocations
+	 * done by this process are done as if GFP_NOIO was specified, so
+	 * we are not reentering the filesystem while doing memory reclaim.
+	 */
+	noio_flag = memalloc_noio_save();
 	/* if we're greater we initiate tx, otherwise we accept */
 	if (o2nm_this_node() <= o2net_num_from_nn(nn))
 		goto out;
@@ -1650,6 +1679,12 @@ static void o2net_start_connect(struct work_struct *work)
 		goto out;
 	}
 
+	ret = o2net_set_usertimeout(sock);
+	if (ret) {
+		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+		goto out;
+	}
+
 	o2net_register_callbacks(sc->sc_sock->sk, sc);
 
 	spin_lock(&nn->nn_lock);
@@ -1683,6 +1718,7 @@ out:
 	if (mynode)
 		o2nm_node_put(mynode);
 
+	memalloc_noio_restore(noio_flag);
 	return;
 }
 
@@ -1694,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work)
 	spin_lock(&nn->nn_lock);
 	if (!nn->nn_sc_valid) {
 		printk(KERN_NOTICE "o2net: No connection established with "
-		       "node %u after %u.%u seconds, giving up.\n",
+		       "node %u after %u.%u seconds, check network and"
+		       " cluster configuration.\n",
 		     o2net_num_from_nn(nn),
 		     o2net_idle_timeout() / 1000,
 		     o2net_idle_timeout() % 1000);
@@ -1808,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more)
 	struct o2nm_node *local_node = NULL;
 	struct o2net_sock_container *sc = NULL;
 	struct o2net_node *nn;
+	unsigned int noio_flag;
+
+	/*
+	 * sock_create_lite allocates the sock with GFP_KERNEL. We must set
+	 * the per-process flag PF_MEMALLOC_NOIO so that all allocations
+	 * done by this process are done as if GFP_NOIO was specified, so
+	 * we are not reentering the filesystem while doing memory reclaim.
+	 */
+	noio_flag = memalloc_noio_save();
 
 	BUG_ON(sock == NULL);
 	*more = 0;
@@ -1831,6 +1877,12 @@ static int o2net_accept_one(struct socket *sock, int *more)
 		goto out;
 	}
 
+	ret = o2net_set_usertimeout(new_sock);
+	if (ret) {
+		mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
+		goto out;
+	}
+
 	slen = sizeof(sin);
 	ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
 				     &slen, 1);
@@ -1918,6 +1970,8 @@ out:
 		o2nm_node_put(local_node);
 	if (sc)
 		sc_put(sc);
+
+	memalloc_noio_restore(noio_flag);
 	return ret;
 }
 
@@ -2113,17 +2167,13 @@ int o2net_init(void)
 	o2quo_init();
 
 	if (o2net_debugfs_init())
-		return -ENOMEM;
+		goto out;
 
 	o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
 	o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
 	o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
-	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) {
-		kfree(o2net_hand);
-		kfree(o2net_keep_req);
-		kfree(o2net_keep_resp);
-		return -ENOMEM;
-	}
+	if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
+		goto out;
 
 	o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
 	o2net_hand->connector_id = cpu_to_be64(1);
@@ -2148,6 +2198,14 @@ int o2net_init(void)
 	}
 
 	return 0;
+
+out:
+	kfree(o2net_hand);
+	kfree(o2net_keep_req);
+	kfree(o2net_keep_resp);
+
+	o2quo_exit();
+	return -ENOMEM;
 }
 
 void o2net_exit(void)
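
The consolidated error path works because kfree(NULL) is defined to be a no-op, so a single out: label can free whichever of the three buffers were actually allocated; it also pairs the early o2quo_init() with o2quo_exit(), which the old early returns skipped. The shape of the idiom (a generic sketch, not o2net code):

/* Generic sketch of the single-error-path idiom from o2net_init(). */
#include <linux/slab.h>

static int init_three(void **a, void **b, void **c)
{
	*a = kzalloc(32, GFP_KERNEL);
	*b = kzalloc(32, GFP_KERNEL);
	*c = kzalloc(32, GFP_KERNEL);
	if (!*a || !*b || !*c)
		goto out;	/* partial failure is fine ... */

	return 0;

out:
	kfree(*a);	/* ... because kfree(NULL) is a no-op */
	kfree(*b);
	kfree(*c);
	*a = *b = *c = NULL;
	return -ENOMEM;
}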
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 5bada2a69b50..c571e849fda4 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
 #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT	2000
 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT		30000
 
+#define O2NET_TCP_USER_TIMEOUT			0x7fffffff
 
 /* TODO: figure this out.... */
 static inline int o2net_link_down(int err, struct socket *sock)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 18f13c2e4a10..149eb556b8c6 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = {
 static int debug_lockres_open(struct inode *inode, struct file *file)
 {
 	struct dlm_ctxt *dlm = inode->i_private;
-	int ret = -ENOMEM;
-	struct seq_file *seq;
-	struct debug_lockres *dl = NULL;
+	struct debug_lockres *dl;
+	void *buf;
 
-	dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
-	if (!dl) {
-		mlog_errno(ret);
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
 		goto bail;
-	}
 
-	dl->dl_len = PAGE_SIZE;
-	dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
-	if (!dl->dl_buf) {
-		mlog_errno(ret);
-		goto bail;
-	}
+	dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl));
+	if (!dl)
+		goto bailfree;
 
-	ret = seq_open(file, &debug_lockres_ops);
-	if (ret) {
-		mlog_errno(ret);
-		goto bail;
-	}
-
-	seq = file->private_data;
-	seq->private = dl;
+	dl->dl_len = PAGE_SIZE;
+	dl->dl_buf = buf;
 
 	dlm_grab(dlm);
 	dl->dl_ctxt = dlm;
 
 	return 0;
+
+bailfree:
+	kfree(buf);
 bail:
-	if (dl)
-		kfree(dl->dl_buf);
-	kfree(dl);
-	return ret;
+	mlog_errno(-ENOMEM);
+	return -ENOMEM;
 }
 
 static int debug_lockres_release(struct inode *inode, struct file *file)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 3fcf205ee900..02d315fef432 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
 	 * to back off and try again. This gives heartbeat a chance
 	 * to catch up.
 	 */
-	if (!o2hb_check_node_heartbeating(query->node_idx)) {
+	if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
 		mlog(0, "node %u is not in our live map yet\n",
 		     query->node_idx);
 
@@ -1975,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 
 	dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
 	if (!dlm) {
-		mlog_errno(-ENOMEM);
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
 	dlm->name = kstrdup(domain, GFP_KERNEL);
 	if (dlm->name == NULL) {
-		mlog_errno(-ENOMEM);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
 	dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
 	if (!dlm->lockres_hash) {
-		mlog_errno(-ENOMEM);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
@@ -2002,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->master_hash = (struct hlist_head **)
 				dlm_alloc_pagevec(DLM_HASH_PAGES);
 	if (!dlm->master_hash) {
-		mlog_errno(-ENOMEM);
-		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+		ret = -ENOMEM;
+		mlog_errno(ret);
 		goto leave;
 	}
 
@@ -2017,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->node_num = o2nm_this_node();
 
 	ret = dlm_create_debugfs_subroot(dlm);
-	if (ret < 0) {
-		dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
-		dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
-		kfree(dlm->name);
-		kfree(dlm);
-		dlm = NULL;
+	if (ret < 0)
 		goto leave;
-	}
 
 	spin_lock_init(&dlm->spinlock);
 	spin_lock_init(&dlm->master_lock);
@@ -2085,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 		  atomic_read(&dlm->dlm_refs.refcount));
 
 leave:
+	if (ret < 0 && dlm) {
+		if (dlm->master_hash)
+			dlm_free_pagevec((void **)dlm->master_hash,
+					DLM_HASH_PAGES);
+
+		if (dlm->lockres_hash)
+			dlm_free_pagevec((void **)dlm->lockres_hash,
+					DLM_HASH_PAGES);
+
+		kfree(dlm->name);
+		kfree(dlm);
+		dlm = NULL;
+	}
 	return dlm;
 }
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3ec906ef5d9a..215e41abf101 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
 	return res;
 
 error:
-	if (res && res->lockname.name)
-		kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
-
 	if (res)
 		kmem_cache_free(dlm_lockres_cache, res);
 	return NULL;
@@ -655,12 +652,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
 	clear_bit(bit, res->refmap);
 }
 
-
-void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 				   struct dlm_lock_resource *res)
 {
-	assert_spin_locked(&res->spinlock);
-
 	res->inflight_locks++;
 
 	mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
@@ -668,6 +662,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
 	     __builtin_return_address(0));
 }
 
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+				   struct dlm_lock_resource *res)
+{
+	assert_spin_locked(&res->spinlock);
+	__dlm_lockres_grab_inflight_ref(dlm, res);
+}
+
 void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
 				   struct dlm_lock_resource *res)
 {
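
Splitting a function into a bare __ helper plus a lock-asserting wrapper is a common kernel idiom: paths that already have exclusive access (such as the brand-new, not-yet-published lockres in the next hunk) call the double-underscore variant directly, while everyone else goes through the wrapper that documents and checks the locking contract. In miniature (illustrative names):

/* Miniature of the locked/unlocked split applied to
 * dlm_lockres_grab_inflight_ref() above. */
#include <linux/spinlock.h>

struct counted {
	spinlock_t lock;
	unsigned int refs;
};

/* __ variant: caller guarantees exclusion (lock held, or the object
 * is not yet visible to other CPUs) */
static void __counted_get(struct counted *c)
{
	c->refs++;
}

/* public variant: checks the locking contract before delegating */
static void counted_get(struct counted *c)
{
	assert_spin_locked(&c->lock);
	__counted_get(c);
}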
@@ -894,10 +895,8 @@ lookup:
 	/* finally add the lockres to its hash bucket */
 	__dlm_insert_lockres(dlm, res);
 
-	/* Grab inflight ref to pin the resource */
-	spin_lock(&res->spinlock);
-	dlm_lockres_grab_inflight_ref(dlm, res);
-	spin_unlock(&res->spinlock);
+	/* since this lockres is new it does not require the spinlock */
+	__dlm_lockres_grab_inflight_ref(dlm, res);
 
 	/* get an extra ref on the mle in case this is a BLOCK
 	 * if so, the creator of the BLOCK may try to put the last
@@ -2037,6 +2036,10 @@ kill:
 	     "and killing the other node now! This node is OK and can continue.\n");
 	__dlm_print_one_lock_resource(res);
 	spin_unlock(&res->spinlock);
+	spin_lock(&dlm->master_lock);
+	if (mle)
+		__dlm_put_mle(mle);
+	spin_unlock(&dlm->master_lock);
 	spin_unlock(&dlm->spinlock);
 	*ret_data = (void *)res;
 	dlm_put(dlm);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 45067faf5695..3365839d2971 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
 			BUG();
 		} else
 			__dlm_lockres_grab_inflight_worker(dlm, res);
-	} else /* put.. in case we are not the master */
+		spin_unlock(&res->spinlock);
+	} else {
+		/* put.. in case we are not the master */
+		spin_unlock(&res->spinlock);
 		dlm_lockres_put(res);
-		spin_unlock(&res->spinlock);
+	}
 	}
 	spin_unlock(&dlm->spinlock);
 
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 52cfe99ae056..21262f2b1654 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
 
 static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
 {
-	int ret;
 	struct ocfs2_dlm_seq_priv *priv;
-	struct seq_file *seq;
 	struct ocfs2_super *osb;
 
-	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
+	priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
 	if (!priv) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
+		mlog_errno(-ENOMEM);
+		return -ENOMEM;
 	}
+
 	osb = inode->i_private;
 	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
 	priv->p_dlm_debug = osb->osb_dlm_debug;
 	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
 
-	ret = seq_open(file, &ocfs2_dlm_seq_ops);
-	if (ret) {
-		kfree(priv);
-		mlog_errno(ret);
-		goto out;
-	}
-
-	seq = file->private_data;
-	seq->private = priv;
-
 	ocfs2_add_lockres_tracking(&priv->p_iter_res,
 				   priv->p_dlm_debug);
 
-out:
-	return ret;
+	return 0;
 }
 
 static const struct file_operations ocfs2_dlm_debug_fops = {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2930e231f3f9..324dc93ac896 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
-	handle_t *handle = NULL;
+	handle_t *handle;
 	int ret = 0;
 	unsigned zero_from, zero_to, block_start, block_end;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
 	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
+	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+
 	page = find_or_create_page(mapping, index, GFP_NOFS);
 	if (!page) {
 		ret = -ENOMEM;
 		mlog_errno(ret);
-		goto out;
+		goto out_commit_trans;
 	}
 
 	/* Get the offsets within the page that we want to zero */
@@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 			goto out_unlock;
 		}
 
-		if (!handle) {
-			handle = ocfs2_zero_start_ordered_transaction(inode,
-								      di_bh);
-			if (IS_ERR(handle)) {
-				ret = PTR_ERR(handle);
-				handle = NULL;
-				break;
-			}
-		}
 
 		/* must not update i_size! */
 		ret = block_commit_write(page, block_start + 1,
@@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 		ret = 0;
 	}
 
+	/*
+	 * fs-writeback will release the dirty pages without page lock
+	 * whose offset are over inode size, the release happens at
+	 * block_write_full_page().
+	 */
+	i_size_write(inode, abs_to);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+	di->i_mtime_nsec = di->i_ctime_nsec;
 	if (handle) {
-		/*
-		 * fs-writeback will release the dirty pages without page lock
-		 * whose offset are over inode size, the release happens at
-		 * block_write_full_page().
-		 */
-		i_size_write(inode, abs_to);
-		inode->i_blocks = ocfs2_inode_sector_count(inode);
-		di->i_size = cpu_to_le64((u64)i_size_read(inode));
-		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-		di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-		di->i_mtime_nsec = di->i_ctime_nsec;
 		ocfs2_journal_dirty(handle, di_bh);
 		ocfs2_update_inode_fsync_trans(handle, inode, 1);
-		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 	}
 
 out_unlock:
 	unlock_page(page);
 	page_cache_release(page);
+out_commit_trans:
+	if (handle)
+		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
 	return ret;
 }
@@ -1253,7 +1252,7 @@ bail:
 	brelse(bh);
 
 	/* Release quota pointers in case we acquired them */
-	for (qtype = 0; qtype < MAXQUOTAS; qtype++)
+	for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
 		dqput(transfer_to[qtype]);
 
 	if (!status && attr->ia_valid & ATTR_MODE) {
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a6c991c0fc98..a9b76de46047 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
 {
 	int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
 
-	return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+	return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
 }
 
 /* Validate that a bh contains a valid inode */
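
The parenthesization is the whole fix: ip_clusters is a 32-bit field, so shifting it first wraps for files past 2 TiB (with 4 KiB clusters, c_to_s_bits is 3) and only then widens; casting to the 64-bit blkcnt_t before shifting performs the shift in 64 bits. A self-contained demonstration of the two orderings:

/* Why the cast must precede the shift (userspace demonstration). */
#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint32_t ip_clusters = 0x30000000;	/* ~3 TiB of 4 KiB clusters */
	int c_to_s_bits = 3;			/* 4 KiB clusters -> 512 B sectors */

	uint64_t wrong = (uint64_t)(ip_clusters << c_to_s_bits); /* wraps at 32 bits */
	uint64_t right = (uint64_t)ip_clusters << c_to_s_bits;	 /* shifts in 64 bits */

	printf("wrong: %" PRIu64 "\nright: %" PRIu64 "\n", wrong, right);
	return 0;
}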
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 6f66b3751ace..53e6c40ed4c6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -35,9 +35,8 @@
 	copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
 
 /*
- * This call is void because we are already reporting an error that may
- * be -EFAULT.  The error will be returned from the ioctl(2) call.  It's
- * just a best-effort to tell userspace that this request caused the error.
+ * This is just a best-effort to tell userspace that this request
+ * caused the error.
  */
 static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
 					    struct ocfs2_info_request __user *req)
@@ -146,136 +145,105 @@ bail:
 static int ocfs2_info_handle_blocksize(struct inode *inode,
 				       struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_blocksize oib;
 
 	if (o2info_from_user(oib, req))
-		goto bail;
+		return -EFAULT;
 
 	oib.ib_blocksize = inode->i_sb->s_blocksize;
 
 	o2info_set_request_filled(&oib.ib_req);
 
 	if (o2info_to_user(oib, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oib.ib_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_clustersize(struct inode *inode,
 					 struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_clustersize oic;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oic, req))
-		goto bail;
+		return -EFAULT;
 
 	oic.ic_clustersize = osb->s_clustersize;
 
 	o2info_set_request_filled(&oic.ic_req);
 
 	if (o2info_to_user(oic, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oic.ic_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_maxslots(struct inode *inode,
 				      struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_maxslots oim;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oim, req))
-		goto bail;
+		return -EFAULT;
 
 	oim.im_max_slots = osb->max_slots;
 
 	o2info_set_request_filled(&oim.im_req);
 
 	if (o2info_to_user(oim, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oim.im_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_label(struct inode *inode,
 				   struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_label oil;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oil, req))
-		goto bail;
+		return -EFAULT;
 
 	memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
 
 	o2info_set_request_filled(&oil.il_req);
 
 	if (o2info_to_user(oil, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oil.il_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_uuid(struct inode *inode,
 				  struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_uuid oiu;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oiu, req))
-		goto bail;
+		return -EFAULT;
 
 	memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
 
 	o2info_set_request_filled(&oiu.iu_req);
 
 	if (o2info_to_user(oiu, req))
-		goto bail;
-
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oiu.iu_req, req);
+		return -EFAULT;
 
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_fs_features(struct inode *inode,
 					 struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_fs_features oif;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oif, req))
-		goto bail;
+		return -EFAULT;
 
 	oif.if_compat_features = osb->s_feature_compat;
 	oif.if_incompat_features = osb->s_feature_incompat;
@@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode,
 	o2info_set_request_filled(&oif.if_req);
 
 	if (o2info_to_user(oif, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oif.if_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_handle_journal_size(struct inode *inode,
 					  struct ocfs2_info_request __user *req)
 {
-	int status = -EFAULT;
 	struct ocfs2_info_journal_size oij;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (o2info_from_user(oij, req))
-		goto bail;
+		return -EFAULT;
 
 	oij.ij_journal_size = i_size_read(osb->journal->j_inode);
 
 	o2info_set_request_filled(&oij.ij_req);
 
 	if (o2info_to_user(oij, req))
-		goto bail;
+		return -EFAULT;
 
-	status = 0;
-bail:
-	if (status)
-		o2info_set_request_error(&oij.ij_req, req);
-
-	return status;
+	return 0;
 }
 
 static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
@@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
373 u32 i; 330 u32 i;
374 u64 blkno = -1; 331 u64 blkno = -1;
375 char namebuf[40]; 332 char namebuf[40];
376 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; 333 int status, type = INODE_ALLOC_SYSTEM_INODE;
377 struct ocfs2_info_freeinode *oifi = NULL; 334 struct ocfs2_info_freeinode *oifi = NULL;
378 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 335 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
379 struct inode *inode_alloc = NULL; 336 struct inode *inode_alloc = NULL;
@@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
385 goto out_err; 342 goto out_err;
386 } 343 }
387 344
388 if (o2info_from_user(*oifi, req)) 345 if (o2info_from_user(*oifi, req)) {
389 goto bail; 346 status = -EFAULT;
347 goto out_free;
348 }
390 349
391 oifi->ifi_slotnum = osb->max_slots; 350 oifi->ifi_slotnum = osb->max_slots;
392 351
@@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
424 383
425 o2info_set_request_filled(&oifi->ifi_req); 384 o2info_set_request_filled(&oifi->ifi_req);
426 385
427 if (o2info_to_user(*oifi, req)) 386 if (o2info_to_user(*oifi, req)) {
428 goto bail; 387 status = -EFAULT;
388 goto out_free;
389 }
429 390
430 status = 0; 391 status = 0;
431bail: 392bail:
432 if (status) 393 if (status)
433 o2info_set_request_error(&oifi->ifi_req, req); 394 o2info_set_request_error(&oifi->ifi_req, req);
434 395out_free:
435 kfree(oifi); 396 kfree(oifi);
436out_err: 397out_err:
437 return status; 398 return status;
@@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
658{ 619{
659 u64 blkno = -1; 620 u64 blkno = -1;
660 char namebuf[40]; 621 char namebuf[40];
661 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; 622 int status, type = GLOBAL_BITMAP_SYSTEM_INODE;
662 623
663 struct ocfs2_info_freefrag *oiff; 624 struct ocfs2_info_freefrag *oiff;
664 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 625 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
671 goto out_err; 632 goto out_err;
672 } 633 }
673 634
674 if (o2info_from_user(*oiff, req)) 635 if (o2info_from_user(*oiff, req)) {
675 goto bail; 636 status = -EFAULT;
637 goto out_free;
638 }
676 /* 639 /*
677 * chunksize from userspace should be a power of 2. 640 * chunksize from userspace should be a power of 2.
678 */ 641 */
@@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
711 674
712 if (o2info_to_user(*oiff, req)) { 675 if (o2info_to_user(*oiff, req)) {
713 status = -EFAULT; 676 status = -EFAULT;
714 goto bail; 677 goto out_free;
715 } 678 }
716 679
717 status = 0; 680 status = 0;
718bail: 681bail:
719 if (status) 682 if (status)
720 o2info_set_request_error(&oiff->iff_req, req); 683 o2info_set_request_error(&oiff->iff_req, req);
721 684out_free:
722 kfree(oiff); 685 kfree(oiff);
723out_err: 686out_err:
724 return status; 687 return status;
@@ -727,23 +690,17 @@ out_err:
727static int ocfs2_info_handle_unknown(struct inode *inode, 690static int ocfs2_info_handle_unknown(struct inode *inode,
728 struct ocfs2_info_request __user *req) 691 struct ocfs2_info_request __user *req)
729{ 692{
730 int status = -EFAULT;
731 struct ocfs2_info_request oir; 693 struct ocfs2_info_request oir;
732 694
733 if (o2info_from_user(oir, req)) 695 if (o2info_from_user(oir, req))
734 goto bail; 696 return -EFAULT;
735 697
736 o2info_clear_request_filled(&oir); 698 o2info_clear_request_filled(&oir);
737 699
738 if (o2info_to_user(oir, req)) 700 if (o2info_to_user(oir, req))
739 goto bail; 701 return -EFAULT;
740 702
741 status = 0; 703 return 0;
742bail:
743 if (status)
744 o2info_set_request_error(&oir, req);
745
746 return status;
747} 704}
748 705
749/* 706/*
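Every o2info handler hunk above applies the same transformation: the old code preset status to -EFAULT and funnelled all exits through a shared bail label that also stamped the error into the user's request, while the new code returns -EFAULT directly at each failing copy and drops the label entirely. A minimal user-space sketch of the two shapes, with hypothetical copy_in()/copy_out() standing in for o2info_from_user()/o2info_to_user():

#include <errno.h>
#include <stdio.h>

struct req { int filled; };

static int copy_in(struct req *r)  { (void)r; return 0; } /* 0 on success */
static int copy_out(struct req *r) { (void)r; return 0; }

/* Old shape: one exit label, status preset to the error code. */
static int handle_old(struct req *r)
{
	int status = -EFAULT;

	if (copy_in(r))
		goto bail;
	r->filled = 1;
	if (copy_out(r))
		goto bail;
	status = 0;
bail:
	return status;
}

/* New shape: fail fast at each copy; no label, no preset error. */
static int handle_new(struct req *r)
{
	if (copy_in(r))
		return -EFAULT;
	r->filled = 1;
	if (copy_out(r))
		return -EFAULT;
	return 0;
}

int main(void)
{
	struct req r = { 0 };

	printf("old=%d new=%d\n", handle_old(&r), handle_new(&r));
	return 0;
}

The freeinode and freefrag hunks above keep bail only for the exits that must record an error in the request before freeing, and gain a separate out_free label for the plain -EFAULT paths, which skip the stamping.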
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 6219aaadeb08..74caffeeee1d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
404 * 'vict_blkno' was out of the valid range. 404 * 'vict_blkno' was out of the valid range.
405 */ 405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || 406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << 407 (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
408 bits_per_unit))) { 408 bits_per_unit))) {
409 ret = -EINVAL; 409 ret = -EINVAL;
410 goto out; 410 goto out;
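The ocfs2_find_victim_alloc_group() fix is a classic width bug: i_total is a 32-bit on-disk count, and shifting it left by bits_per_unit in 32-bit arithmetic can wrap before the comparison against the 64-bit vict_blkno, silently shrinking the accepted range. Casting to u64 widens before the shift. A runnable demonstration with illustrative values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t total = 0x00200000;	/* bitmap units, illustrative */
	int bits_per_unit = 15;		/* hypothetical shift */

	/* Shift in 32 bits, then widen: the product has already wrapped. */
	uint64_t bad  = (uint64_t)(total << bits_per_unit);
	/* Widen first, then shift: the full product survives. */
	uint64_t good = (uint64_t)total << bits_per_unit;

	printf("bad=%#llx good=%#llx\n",
	       (unsigned long long)bad, (unsigned long long)good);
	return 0;
}

Here bad comes out 0 while good is 0x1000000000, so the unfixed check would treat every vict_blkno as out of range once the shift wraps.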
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index f266d67df3c6..1eae330193a6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,6 +17,9 @@
17 17
18#include "ocfs2.h" 18#include "ocfs2.h"
19 19
20/* Number of quota types we support */
21#define OCFS2_MAXQUOTAS 2
22
20/* 23/*
21 * In-memory structures 24 * In-memory structures
22 */ 25 */
@@ -39,7 +42,7 @@ struct ocfs2_recovery_chunk {
39}; 42};
40 43
41struct ocfs2_quota_recovery { 44struct ocfs2_quota_recovery {
42 struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ 45 struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */
43}; 46};
44 47
45/* In-memory structure with quota header information */ 48/* In-memory structure with quota header information */
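ocfs2 provisions only user and group quota files, so its arrays switch from the VFS-wide MAXQUOTAS to a filesystem-local OCFS2_MAXQUOTAS; if the global constant grows (say, a third quota type is added to the VFS), the two-element initializer lists in these hunks stay correct instead of leaving a zeroed extra slot that every for-each-type loop would still visit. A small runnable illustration, with made-up inode numbers and a hypothetical larger global value:

#include <stdio.h>

#define MAXQUOTAS	3	/* hypothetical future VFS-wide value */
#define OCFS2_MAXQUOTAS	2	/* user + group: all ocfs2 provides */

int main(void)
{
	/* Sized by the local constant, the array matches its two-entry
	 * initializer exactly; sized by MAXQUOTAS it would gain a
	 * third, meaningless slot. */
	unsigned int ino[OCFS2_MAXQUOTAS] = { 101, 102 };
	int type;

	for (type = 0; type < OCFS2_MAXQUOTAS; type++)
		printf("quota type %d -> system inode %u\n", type, ino[type]);
	return 0;
}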
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b990a62cff50..c93d67220887 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -336,8 +336,8 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
336int ocfs2_global_read_info(struct super_block *sb, int type) 336int ocfs2_global_read_info(struct super_block *sb, int type)
337{ 337{
338 struct inode *gqinode = NULL; 338 struct inode *gqinode = NULL;
339 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, 339 unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
340 GROUP_QUOTA_SYSTEM_INODE }; 340 GROUP_QUOTA_SYSTEM_INODE };
341 struct ocfs2_global_disk_dqinfo dinfo; 341 struct ocfs2_global_disk_dqinfo dinfo;
342 struct mem_dqinfo *info = sb_dqinfo(sb, type); 342 struct mem_dqinfo *info = sb_dqinfo(sb, type);
343 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 343 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2001862bf2b1..10b653930ee2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -166,12 +166,12 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
166/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
167static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
168{ 168{
169 unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; 169 unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
170 unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; 170 unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
171 unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; 171 unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
172 unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; 172 unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
173 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, 173 unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
174 GROUP_QUOTA_SYSTEM_INODE }; 174 GROUP_QUOTA_SYSTEM_INODE };
175 struct buffer_head *bh = NULL; 175 struct buffer_head *bh = NULL;
176 struct inode *linode = sb_dqopt(sb)->files[type]; 176 struct inode *linode = sb_dqopt(sb)->files[type];
177 struct inode *ginode = NULL; 177 struct inode *ginode = NULL;
@@ -336,7 +336,7 @@ void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
336{ 336{
337 int type; 337 int type;
338 338
339 for (type = 0; type < MAXQUOTAS; type++) 339 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
340 free_recovery_list(&(rec->r_list[type])); 340 free_recovery_list(&(rec->r_list[type]));
341 kfree(rec); 341 kfree(rec);
342} 342}
@@ -382,7 +382,7 @@ static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
382 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); 382 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
383 if (!rec) 383 if (!rec)
384 return NULL; 384 return NULL;
385 for (type = 0; type < MAXQUOTAS; type++) 385 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
386 INIT_LIST_HEAD(&(rec->r_list[type])); 386 INIT_LIST_HEAD(&(rec->r_list[type]));
387 return rec; 387 return rec;
388} 388}
@@ -392,10 +392,11 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
392 struct ocfs2_super *osb, 392 struct ocfs2_super *osb,
393 int slot_num) 393 int slot_num)
394{ 394{
395 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 395 unsigned int feature[OCFS2_MAXQUOTAS] = {
396 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 396 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
397 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 397 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
398 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 398 unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
399 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
399 struct super_block *sb = osb->sb; 400 struct super_block *sb = osb->sb;
400 struct ocfs2_local_disk_dqinfo *ldinfo; 401 struct ocfs2_local_disk_dqinfo *ldinfo;
401 struct inode *lqinode; 402 struct inode *lqinode;
@@ -412,7 +413,7 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
412 return ERR_PTR(-ENOMEM); 413 return ERR_PTR(-ENOMEM);
413 /* First init... */ 414 /* First init... */
414 415
415 for (type = 0; type < MAXQUOTAS; type++) { 416 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
416 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 417 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
417 continue; 418 continue;
418 /* At this point, journal of the slot is already replayed so 419 /* At this point, journal of the slot is already replayed so
@@ -589,8 +590,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
589 struct ocfs2_quota_recovery *rec, 590 struct ocfs2_quota_recovery *rec,
590 int slot_num) 591 int slot_num)
591{ 592{
592 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 593 unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
593 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 594 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
594 struct super_block *sb = osb->sb; 595 struct super_block *sb = osb->sb;
595 struct ocfs2_local_disk_dqinfo *ldinfo; 596 struct ocfs2_local_disk_dqinfo *ldinfo;
596 struct buffer_head *bh; 597 struct buffer_head *bh;
@@ -604,7 +605,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
604 "slot %u\n", osb->dev_str, slot_num); 605 "slot %u\n", osb->dev_str, slot_num);
605 606
606 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 607 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
607 for (type = 0; type < MAXQUOTAS; type++) { 608 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
608 if (list_empty(&(rec->r_list[type]))) 609 if (list_empty(&(rec->r_list[type])))
609 continue; 610 continue;
610 trace_ocfs2_finish_quota_recovery(slot_num); 611 trace_ocfs2_finish_quota_recovery(slot_num);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 13a8537d8e8b..720aa389e0ea 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
591 */ 591 */
592 ocfs2_control_this_node = -1; 592 ocfs2_control_this_node = -1;
593 running_proto.pv_major = 0; 593 running_proto.pv_major = 0;
594 running_proto.pv_major = 0; 594 running_proto.pv_minor = 0;
595 } 595 }
596 596
597out: 597out:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ddb662b32447..93c85bc745e1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -899,11 +899,12 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
899{ 899{
900 int type; 900 int type;
901 struct super_block *sb = osb->sb; 901 struct super_block *sb = osb->sb;
902 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 902 unsigned int feature[OCFS2_MAXQUOTAS] = {
903 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 903 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
904 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
904 int status = 0; 905 int status = 0;
905 906
906 for (type = 0; type < MAXQUOTAS; type++) { 907 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
907 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 908 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
908 continue; 909 continue;
909 if (unsuspend) 910 if (unsuspend)
@@ -927,17 +928,19 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
927 928
928static int ocfs2_enable_quotas(struct ocfs2_super *osb) 929static int ocfs2_enable_quotas(struct ocfs2_super *osb)
929{ 930{
930 struct inode *inode[MAXQUOTAS] = { NULL, NULL }; 931 struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL };
931 struct super_block *sb = osb->sb; 932 struct super_block *sb = osb->sb;
932 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 933 unsigned int feature[OCFS2_MAXQUOTAS] = {
933 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 934 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
934 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 935 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
936 unsigned int ino[OCFS2_MAXQUOTAS] = {
937 LOCAL_USER_QUOTA_SYSTEM_INODE,
935 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 938 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
936 int status; 939 int status;
937 int type; 940 int type;
938 941
939 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; 942 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
940 for (type = 0; type < MAXQUOTAS; type++) { 943 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
941 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 944 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
942 continue; 945 continue;
943 inode[type] = ocfs2_get_system_file_inode(osb, ino[type], 946 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
@@ -952,12 +955,12 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
952 goto out_quota_off; 955 goto out_quota_off;
953 } 956 }
954 957
955 for (type = 0; type < MAXQUOTAS; type++) 958 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
956 iput(inode[type]); 959 iput(inode[type]);
957 return 0; 960 return 0;
958out_quota_off: 961out_quota_off:
959 ocfs2_disable_quotas(osb); 962 ocfs2_disable_quotas(osb);
960 for (type = 0; type < MAXQUOTAS; type++) 963 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
961 iput(inode[type]); 964 iput(inode[type]);
962 mlog_errno(status); 965 mlog_errno(status);
963 return status; 966 return status;
@@ -972,7 +975,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
972 975
973 /* We mostly ignore errors in this function because there's not much 976 /* We mostly ignore errors in this function because there's not much
974 * we can do when we see them */ 977 * we can do when we see them */
975 for (type = 0; type < MAXQUOTAS; type++) { 978 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
976 if (!sb_has_quota_loaded(sb, type)) 979 if (!sb_has_quota_loaded(sb, type))
977 continue; 980 continue;
978 /* Cancel periodic syncing before we grab dqonoff_mutex */ 981 /* Cancel periodic syncing before we grab dqonoff_mutex */
@@ -993,8 +996,9 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
993/* Handle quota on quotactl */ 996/* Handle quota on quotactl */
994static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) 997static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
995{ 998{
996 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 999 unsigned int feature[OCFS2_MAXQUOTAS] = {
997 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 1000 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1001 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
998 1002
999 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 1003 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1000 return -EINVAL; 1004 return -EINVAL;
@@ -2532,6 +2536,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2532 kfree(osb->journal); 2536 kfree(osb->journal);
2533 kfree(osb->local_alloc_copy); 2537 kfree(osb->local_alloc_copy);
2534 kfree(osb->uuid_str); 2538 kfree(osb->uuid_str);
2539 kfree(osb->vol_label);
2535 ocfs2_put_dlm_debug(osb->osb_dlm_debug); 2540 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
2536 memset(osb, 0, sizeof(struct ocfs2_super)); 2541 memset(osb, 0, sizeof(struct ocfs2_super));
2537} 2542}
diff --git a/fs/pnode.c b/fs/pnode.c
index 302bf22c4a30..aae331a5d03b 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt)
381 * other children 381 * other children
382 */ 382 */
383 if (child && list_empty(&child->mnt_mounts)) { 383 if (child && list_empty(&child->mnt_mounts)) {
384 list_del_init(&child->mnt_child);
384 hlist_del_init_rcu(&child->mnt_hash); 385 hlist_del_init_rcu(&child->mnt_hash);
385 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); 386 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
386 } 387 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index baf852b648ad..950100e326a1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -376,37 +376,6 @@ static const struct file_operations proc_lstats_operations = {
376 376
377#endif 377#endif
378 378
379#ifdef CONFIG_CGROUPS
380static int cgroup_open(struct inode *inode, struct file *file)
381{
382 struct pid *pid = PROC_I(inode)->pid;
383 return single_open(file, proc_cgroup_show, pid);
384}
385
386static const struct file_operations proc_cgroup_operations = {
387 .open = cgroup_open,
388 .read = seq_read,
389 .llseek = seq_lseek,
390 .release = single_release,
391};
392#endif
393
394#ifdef CONFIG_PROC_PID_CPUSET
395
396static int cpuset_open(struct inode *inode, struct file *file)
397{
398 struct pid *pid = PROC_I(inode)->pid;
399 return single_open(file, proc_cpuset_show, pid);
400}
401
402static const struct file_operations proc_cpuset_operations = {
403 .open = cpuset_open,
404 .read = seq_read,
405 .llseek = seq_lseek,
406 .release = single_release,
407};
408#endif
409
410static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, 379static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
411 struct pid *pid, struct task_struct *task) 380 struct pid *pid, struct task_struct *task)
412{ 381{
@@ -632,29 +601,35 @@ static const struct file_operations proc_single_file_operations = {
632 .release = single_release, 601 .release = single_release,
633}; 602};
634 603
635static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) 604
605struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
636{ 606{
637 struct task_struct *task = get_proc_task(file_inode(file)); 607 struct task_struct *task = get_proc_task(inode);
638 struct mm_struct *mm; 608 struct mm_struct *mm = ERR_PTR(-ESRCH);
639 609
640 if (!task) 610 if (task) {
641 return -ESRCH; 611 mm = mm_access(task, mode);
612 put_task_struct(task);
642 613
643 mm = mm_access(task, mode); 614 if (!IS_ERR_OR_NULL(mm)) {
644 put_task_struct(task); 615 /* ensure this mm_struct can't be freed */
616 atomic_inc(&mm->mm_count);
617 /* but do not pin its memory */
618 mmput(mm);
619 }
620 }
621
622 return mm;
623}
624
625static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
626{
627 struct mm_struct *mm = proc_mem_open(inode, mode);
645 628
646 if (IS_ERR(mm)) 629 if (IS_ERR(mm))
647 return PTR_ERR(mm); 630 return PTR_ERR(mm);
648 631
649 if (mm) {
650 /* ensure this mm_struct can't be freed */
651 atomic_inc(&mm->mm_count);
652 /* but do not pin its memory */
653 mmput(mm);
654 }
655
656 file->private_data = mm; 632 file->private_data = mm;
657
658 return 0; 633 return 0;
659} 634}
660 635
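proc_mem_open() centralizes a reference dance the old __mem_open() performed inline: mm_access() hands back a user reference (mm_users), which pins the task's entire address space, and the helper immediately converts it into a bare lifetime reference by bumping mm_count and dropping mm_users with mmput(). An open /proc file therefore keeps only the mm_struct itself alive, not an exited task's memory; readers revive a user reference on demand with atomic_inc_not_zero(&mm->mm_users), and the release path pairs the count with mmdrop(). A user-space model of the two-level count (the helper names are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct mm {
	atomic_int users;	/* address space is usable (mm_users) */
	atomic_int count;	/* struct itself is alive (mm_count) */
};

static void mm_drop(struct mm *mm)
{
	if (atomic_fetch_sub(&mm->count, 1) == 1)
		free(mm);			/* last lifetime ref */
}

static void mm_put(struct mm *mm)
{
	if (atomic_fetch_sub(&mm->users, 1) == 1) {
		puts("address space torn down");
		mm_drop(mm);			/* users held one count ref */
	}
}

static int mm_get_unless_zero(struct mm *mm)	/* like inc_not_zero */
{
	int v = atomic_load(&mm->users);

	while (v > 0)
		if (atomic_compare_exchange_weak(&mm->users, &v, v + 1))
			return 1;
	return 0;
}

int main(void)
{
	struct mm *mm = malloc(sizeof(*mm));

	atomic_init(&mm->users, 1);	/* the ref mm_access() returned */
	atomic_init(&mm->count, 1);	/* baseline lifetime ref */

	atomic_fetch_add(&mm->count, 1);	/* open(): keep the struct */
	mm_put(mm);		/* ... but do not pin memory; the "task"
				 * is already gone, so this tears down */

	if (!mm_get_unless_zero(mm))		/* read() after task exit */
		puts("m_start() sees no address space");
	mm_drop(mm);				/* release(): mmdrop() */
	return 0;
}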
@@ -2573,10 +2548,10 @@ static const struct pid_entry tgid_base_stuff[] = {
2573 REG("latency", S_IRUGO, proc_lstats_operations), 2548 REG("latency", S_IRUGO, proc_lstats_operations),
2574#endif 2549#endif
2575#ifdef CONFIG_PROC_PID_CPUSET 2550#ifdef CONFIG_PROC_PID_CPUSET
2576 REG("cpuset", S_IRUGO, proc_cpuset_operations), 2551 ONE("cpuset", S_IRUGO, proc_cpuset_show),
2577#endif 2552#endif
2578#ifdef CONFIG_CGROUPS 2553#ifdef CONFIG_CGROUPS
2579 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2554 ONE("cgroup", S_IRUGO, proc_cgroup_show),
2580#endif 2555#endif
2581 ONE("oom_score", S_IRUGO, proc_oom_score), 2556 ONE("oom_score", S_IRUGO, proc_oom_score),
2582 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2557 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
@@ -2919,10 +2894,10 @@ static const struct pid_entry tid_base_stuff[] = {
2919 REG("latency", S_IRUGO, proc_lstats_operations), 2894 REG("latency", S_IRUGO, proc_lstats_operations),
2920#endif 2895#endif
2921#ifdef CONFIG_PROC_PID_CPUSET 2896#ifdef CONFIG_PROC_PID_CPUSET
2922 REG("cpuset", S_IRUGO, proc_cpuset_operations), 2897 ONE("cpuset", S_IRUGO, proc_cpuset_show),
2923#endif 2898#endif
2924#ifdef CONFIG_CGROUPS 2899#ifdef CONFIG_CGROUPS
2925 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2900 ONE("cgroup", S_IRUGO, proc_cgroup_show),
2926#endif 2901#endif
2927 ONE("oom_score", S_IRUGO, proc_oom_score), 2902 ONE("oom_score", S_IRUGO, proc_oom_score),
2928 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2903 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
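The deleted blocks above are pure boilerplate: each proc file carried its own open() wrapper plus a file_operations struct just to bind one show routine. The ONE() table entries that replace them (the REG()-to-ONE() switches in the two table hunks just above) register the show callback directly, the same way proc_oom_score already appears in the surrounding context. The shape of the saving, modeled in user space with a table of show callbacks (names and output are illustrative):

#include <stdio.h>

struct entry {
	const char *name;
	int (*show)(FILE *);
};

static int cpuset_show(FILE *f) { return fprintf(f, "/\n") < 0; }
static int cgroup_show(FILE *f) { return fprintf(f, "0:cpu:/\n") < 0; }

/* One table row per file replaces an open() wrapper and a
 * file_operations struct per file. */
static const struct entry table[] = {
	{ "cpuset", cpuset_show },
	{ "cgroup", cgroup_show },
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		printf("%s: ", table[i].name);
		table[i].show(stdout);
	}
	return 0;
}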
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7da13e49128a..aa7a0ee182e1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -268,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *);
268 * task_[no]mmu.c 268 * task_[no]mmu.c
269 */ 269 */
270struct proc_maps_private { 270struct proc_maps_private {
271 struct pid *pid; 271 struct inode *inode;
272 struct task_struct *task; 272 struct task_struct *task;
273 struct mm_struct *mm;
273#ifdef CONFIG_MMU 274#ifdef CONFIG_MMU
274 struct vm_area_struct *tail_vma; 275 struct vm_area_struct *tail_vma;
275#endif 276#endif
@@ -278,6 +279,8 @@ struct proc_maps_private {
278#endif 279#endif
279}; 280};
280 281
282struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
283
281extern const struct file_operations proc_pid_maps_operations; 284extern const struct file_operations proc_pid_maps_operations;
282extern const struct file_operations proc_tid_maps_operations; 285extern const struct file_operations proc_tid_maps_operations;
283extern const struct file_operations proc_pid_numa_maps_operations; 286extern const struct file_operations proc_pid_numa_maps_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6df8d0722c97..91a4e6426321 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void)
610struct kcore_list kcore_modules; 610struct kcore_list kcore_modules;
611static void __init add_modules_range(void) 611static void __init add_modules_range(void)
612{ 612{
613 kclist_add(&kcore_modules, (void *)MODULES_VADDR, 613 if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
614 kclist_add(&kcore_modules, (void *)MODULES_VADDR,
614 MODULES_END - MODULES_VADDR, KCORE_VMALLOC); 615 MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
616 }
615} 617}
616#else 618#else
617static void __init add_modules_range(void) 619static void __init add_modules_range(void)
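The guard matters on architectures whose module area is carved out of the vmalloc arena: there MODULES_VADDR/MODULES_END equal VMALLOC_START/VMALLOC_END, and kclist_add() would register the same range twice under two names. The predicate in isolation, with made-up layout constants:

#include <stdio.h>

int main(void)
{
	unsigned long vmalloc_start = 0xf0000000UL, vmalloc_end = 0xff000000UL;
	unsigned long modules_vaddr = vmalloc_start, modules_end = vmalloc_end;

	if (modules_vaddr != vmalloc_start && modules_end != vmalloc_end)
		puts("register a separate modules range");
	else
		puts("modules share the vmalloc range; skip it");
	return 0;
}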
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55275d9..1e3187da1fed 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page)
133 if (PageBuddy(page)) 133 if (PageBuddy(page))
134 u |= 1 << KPF_BUDDY; 134 u |= 1 << KPF_BUDDY;
135 135
136 if (PageBalloon(page))
137 u |= 1 << KPF_BALLOON;
138
136 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); 139 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
137 140
138 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 141 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
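With PageBalloon() wired into stable_page_flags(), balloon-inflated guest pages become visible to user space via /proc/kpageflags. A small probe that scans the first 1024 frames, assuming KPF_BALLOON is bit 23 as in include/uapi/linux/kernel-page-flags.h of this era (verify against your headers; reading kpageflags needs root):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define KPF_BALLOON 23	/* assumption: matches kernel-page-flags.h */

int main(void)
{
	uint64_t pfn, flags;
	int fd = open("/proc/kpageflags", O_RDONLY);

	if (fd < 0) {
		perror("open /proc/kpageflags");
		return 1;
	}
	for (pfn = 0; pfn < 1024; pfn++) {
		if (pread(fd, &flags, sizeof(flags),
			  (off_t)(pfn * sizeof(flags))) != (ssize_t)sizeof(flags))
			break;
		if (flags & (1ULL << KPF_BALLOON))
			printf("pfn %llu: balloon page\n",
			       (unsigned long long)pfn);
	}
	close(fd);
	return 0;
}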
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dfc791c42d64..b7a7dc963a35 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm,
87 87
88#ifdef CONFIG_NUMA 88#ifdef CONFIG_NUMA
89/* 89/*
90 * These functions are for numa_maps but called in generic **maps seq_file 90 * Save get_task_policy() for show_numa_map().
91 * ->start(), ->stop() ops.
92 *
93 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
94 * Each mempolicy object is controlled by reference counting. The problem here
95 * is how to avoid accessing dead mempolicy object.
96 *
97 * Because we're holding mmap_sem while reading seq_file, it's safe to access
98 * each vma's mempolicy; no vma object will drop its ref to a mempolicy.
99 *
100 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
101 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
102 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
103 * gurantee the task never exits under us. But taking task_lock() around
104 * get_vma_plicy() causes lock order problem.
105 *
106 * To access task->mempolicy without lock, we hold a reference count of an
107 * object pointed by task->mempolicy and remember it. This will guarantee
108 * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
109 */ 91 */
110static void hold_task_mempolicy(struct proc_maps_private *priv) 92static void hold_task_mempolicy(struct proc_maps_private *priv)
111{ 93{
112 struct task_struct *task = priv->task; 94 struct task_struct *task = priv->task;
113 95
114 task_lock(task); 96 task_lock(task);
115 priv->task_mempolicy = task->mempolicy; 97 priv->task_mempolicy = get_task_policy(task);
116 mpol_get(priv->task_mempolicy); 98 mpol_get(priv->task_mempolicy);
117 task_unlock(task); 99 task_unlock(task);
118} 100}
@@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
129} 111}
130#endif 112#endif
131 113
132static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) 114static void vma_stop(struct proc_maps_private *priv)
133{ 115{
134 if (vma && vma != priv->tail_vma) { 116 struct mm_struct *mm = priv->mm;
135 struct mm_struct *mm = vma->vm_mm; 117
136 release_task_mempolicy(priv); 118 release_task_mempolicy(priv);
137 up_read(&mm->mmap_sem); 119 up_read(&mm->mmap_sem);
138 mmput(mm); 120 mmput(mm);
139 } 121}
122
123static struct vm_area_struct *
124m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
125{
126 if (vma == priv->tail_vma)
127 return NULL;
128 return vma->vm_next ?: priv->tail_vma;
129}
130
131static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
132{
133 if (m->count < m->size) /* vma is copied successfully */
134 m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
140} 135}
141 136
142static void *m_start(struct seq_file *m, loff_t *pos) 137static void *m_start(struct seq_file *m, loff_t *ppos)
143{ 138{
144 struct proc_maps_private *priv = m->private; 139 struct proc_maps_private *priv = m->private;
145 unsigned long last_addr = m->version; 140 unsigned long last_addr = m->version;
146 struct mm_struct *mm; 141 struct mm_struct *mm;
147 struct vm_area_struct *vma, *tail_vma = NULL; 142 struct vm_area_struct *vma;
148 loff_t l = *pos; 143 unsigned int pos = *ppos;
149
150 /* Clear the per syscall fields in priv */
151 priv->task = NULL;
152 priv->tail_vma = NULL;
153
154 /*
155 * We remember last_addr rather than next_addr to hit with
156 * vmacache most of the time. We have zero last_addr at
157 * the beginning and also after lseek. We will have -1 last_addr
158 * after the end of the vmas.
159 */
160 144
145 /* See m_cache_vma(). Zero at the start or after lseek. */
161 if (last_addr == -1UL) 146 if (last_addr == -1UL)
162 return NULL; 147 return NULL;
163 148
164 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 149 priv->task = get_proc_task(priv->inode);
165 if (!priv->task) 150 if (!priv->task)
166 return ERR_PTR(-ESRCH); 151 return ERR_PTR(-ESRCH);
167 152
168 mm = mm_access(priv->task, PTRACE_MODE_READ); 153 mm = priv->mm;
169 if (!mm || IS_ERR(mm)) 154 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
170 return mm; 155 return NULL;
171 down_read(&mm->mmap_sem);
172 156
173 tail_vma = get_gate_vma(priv->task->mm); 157 down_read(&mm->mmap_sem);
174 priv->tail_vma = tail_vma;
175 hold_task_mempolicy(priv); 158 hold_task_mempolicy(priv);
176 /* Start with last addr hint */ 159 priv->tail_vma = get_gate_vma(mm);
177 vma = find_vma(mm, last_addr); 160
178 if (last_addr && vma) { 161 if (last_addr) {
179 vma = vma->vm_next; 162 vma = find_vma(mm, last_addr);
180 goto out; 163 if (vma && (vma = m_next_vma(priv, vma)))
164 return vma;
181 } 165 }
182 166
183 /* 167 m->version = 0;
184 * Check the vma index is within the range and do 168 if (pos < mm->map_count) {
185 * sequential scan until m_index. 169 for (vma = mm->mmap; pos; pos--) {
186 */ 170 m->version = vma->vm_start;
187 vma = NULL;
188 if ((unsigned long)l < mm->map_count) {
189 vma = mm->mmap;
190 while (l-- && vma)
191 vma = vma->vm_next; 171 vma = vma->vm_next;
192 goto out; 172 }
173 return vma;
193 } 174 }
194 175
195 if (l != mm->map_count) 176 /* we do not bother to update m->version in this case */
196 tail_vma = NULL; /* After gate vma */ 177 if (pos == mm->map_count && priv->tail_vma)
197 178 return priv->tail_vma;
198out:
199 if (vma)
200 return vma;
201 179
202 release_task_mempolicy(priv); 180 vma_stop(priv);
203 /* End of vmas has been reached */ 181 return NULL;
204 m->version = (tail_vma != NULL)? 0: -1UL;
205 up_read(&mm->mmap_sem);
206 mmput(mm);
207 return tail_vma;
208} 182}
209 183
210static void *m_next(struct seq_file *m, void *v, loff_t *pos) 184static void *m_next(struct seq_file *m, void *v, loff_t *pos)
211{ 185{
212 struct proc_maps_private *priv = m->private; 186 struct proc_maps_private *priv = m->private;
213 struct vm_area_struct *vma = v; 187 struct vm_area_struct *next;
214 struct vm_area_struct *tail_vma = priv->tail_vma;
215 188
216 (*pos)++; 189 (*pos)++;
217 if (vma && (vma != tail_vma) && vma->vm_next) 190 next = m_next_vma(priv, v);
218 return vma->vm_next; 191 if (!next)
219 vma_stop(priv, vma); 192 vma_stop(priv);
220 return (vma != tail_vma)? tail_vma: NULL; 193 return next;
221} 194}
222 195
223static void m_stop(struct seq_file *m, void *v) 196static void m_stop(struct seq_file *m, void *v)
224{ 197{
225 struct proc_maps_private *priv = m->private; 198 struct proc_maps_private *priv = m->private;
226 struct vm_area_struct *vma = v;
227 199
228 if (!IS_ERR(vma)) 200 if (!IS_ERR_OR_NULL(v))
229 vma_stop(priv, vma); 201 vma_stop(priv);
230 if (priv->task) 202 if (priv->task) {
231 put_task_struct(priv->task); 203 put_task_struct(priv->task);
204 priv->task = NULL;
205 }
206}
207
208static int proc_maps_open(struct inode *inode, struct file *file,
209 const struct seq_operations *ops, int psize)
210{
211 struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
212
213 if (!priv)
214 return -ENOMEM;
215
216 priv->inode = inode;
217 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
218 if (IS_ERR(priv->mm)) {
219 int err = PTR_ERR(priv->mm);
220
221 seq_release_private(inode, file);
222 return err;
223 }
224
225 return 0;
226}
227
228static int proc_map_release(struct inode *inode, struct file *file)
229{
230 struct seq_file *seq = file->private_data;
231 struct proc_maps_private *priv = seq->private;
232
233 if (priv->mm)
234 mmdrop(priv->mm);
235
236 return seq_release_private(inode, file);
232} 237}
233 238
234static int do_maps_open(struct inode *inode, struct file *file, 239static int do_maps_open(struct inode *inode, struct file *file,
235 const struct seq_operations *ops) 240 const struct seq_operations *ops)
236{ 241{
237 struct proc_maps_private *priv; 242 return proc_maps_open(inode, file, ops,
238 int ret = -ENOMEM; 243 sizeof(struct proc_maps_private));
239 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 244}
240 if (priv) { 245
241 priv->pid = proc_pid(inode); 246static pid_t pid_of_stack(struct proc_maps_private *priv,
242 ret = seq_open(file, ops); 247 struct vm_area_struct *vma, bool is_pid)
243 if (!ret) { 248{
244 struct seq_file *m = file->private_data; 249 struct inode *inode = priv->inode;
245 m->private = priv; 250 struct task_struct *task;
246 } else { 251 pid_t ret = 0;
247 kfree(priv); 252
248 } 253 rcu_read_lock();
254 task = pid_task(proc_pid(inode), PIDTYPE_PID);
255 if (task) {
256 task = task_of_stack(task, vma, is_pid);
257 if (task)
258 ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
249 } 259 }
260 rcu_read_unlock();
261
250 return ret; 262 return ret;
251} 263}
252 264
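The m_start()/m_next()/m_cache_vma() rewrite above replaces an O(n) replay of *ppos VMAs on every read chunk with a resume hint: m->version caches the start address of the last VMA whose output fully fit in the seq_file buffer (and -1UL once the walk has finished), so the next read can find_vma() straight to the continuation, usually a vmacache hit. A compact user-space model of the resume logic, with a sorted array standing in for the VMA list:

#include <stdio.h>

#define END ((unsigned long)-1)

static unsigned long starts[] = { 0x1000, 0x5000, 0x9000 };
static const int nvma = 3;

int main(void)
{
	unsigned long version = 0;	/* m->version: zero at open/lseek */

	while (version != END) {
		int i = 0;

		while (i < nvma && starts[i] <= version)
			i++;		/* resume just past the cached start */
		if (i >= nvma) {
			version = END;	/* walk already finished */
			break;
		}
		printf("show vma @%#lx\n", starts[i]);
		/* m_cache_vma(): cache this start, or -1 after the last */
		version = (i + 1 < nvma) ? starts[i] : END;
	}
	return 0;
}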
@@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
256 struct mm_struct *mm = vma->vm_mm; 268 struct mm_struct *mm = vma->vm_mm;
257 struct file *file = vma->vm_file; 269 struct file *file = vma->vm_file;
258 struct proc_maps_private *priv = m->private; 270 struct proc_maps_private *priv = m->private;
259 struct task_struct *task = priv->task;
260 vm_flags_t flags = vma->vm_flags; 271 vm_flags_t flags = vma->vm_flags;
261 unsigned long ino = 0; 272 unsigned long ino = 0;
262 unsigned long long pgoff = 0; 273 unsigned long long pgoff = 0;
@@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
321 goto done; 332 goto done;
322 } 333 }
323 334
324 tid = vm_is_stack(task, vma, is_pid); 335 tid = pid_of_stack(priv, vma, is_pid);
325
326 if (tid != 0) { 336 if (tid != 0) {
327 /* 337 /*
328 * Thread stack in /proc/PID/task/TID/maps or 338 * Thread stack in /proc/PID/task/TID/maps or
@@ -349,15 +359,8 @@ done:
349 359
350static int show_map(struct seq_file *m, void *v, int is_pid) 360static int show_map(struct seq_file *m, void *v, int is_pid)
351{ 361{
352 struct vm_area_struct *vma = v; 362 show_map_vma(m, v, is_pid);
353 struct proc_maps_private *priv = m->private; 363 m_cache_vma(m, v);
354 struct task_struct *task = priv->task;
355
356 show_map_vma(m, vma, is_pid);
357
358 if (m->count < m->size) /* vma is copied successfully */
359 m->version = (vma != get_gate_vma(task->mm))
360 ? vma->vm_start : 0;
361 return 0; 364 return 0;
362} 365}
363 366
@@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = {
399 .open = pid_maps_open, 402 .open = pid_maps_open,
400 .read = seq_read, 403 .read = seq_read,
401 .llseek = seq_lseek, 404 .llseek = seq_lseek,
402 .release = seq_release_private, 405 .release = proc_map_release,
403}; 406};
404 407
405const struct file_operations proc_tid_maps_operations = { 408const struct file_operations proc_tid_maps_operations = {
406 .open = tid_maps_open, 409 .open = tid_maps_open,
407 .read = seq_read, 410 .read = seq_read,
408 .llseek = seq_lseek, 411 .llseek = seq_lseek,
409 .release = seq_release_private, 412 .release = proc_map_release,
410}; 413};
411 414
412/* 415/*
@@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
583 586
584static int show_smap(struct seq_file *m, void *v, int is_pid) 587static int show_smap(struct seq_file *m, void *v, int is_pid)
585{ 588{
586 struct proc_maps_private *priv = m->private;
587 struct task_struct *task = priv->task;
588 struct vm_area_struct *vma = v; 589 struct vm_area_struct *vma = v;
589 struct mem_size_stats mss; 590 struct mem_size_stats mss;
590 struct mm_walk smaps_walk = { 591 struct mm_walk smaps_walk = {
@@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
637 mss.nonlinear >> 10); 638 mss.nonlinear >> 10);
638 639
639 show_smap_vma_flags(m, vma); 640 show_smap_vma_flags(m, vma);
640 641 m_cache_vma(m, vma);
641 if (m->count < m->size) /* vma is copied successfully */
642 m->version = (vma != get_gate_vma(task->mm))
643 ? vma->vm_start : 0;
644 return 0; 642 return 0;
645} 643}
646 644
@@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = {
682 .open = pid_smaps_open, 680 .open = pid_smaps_open,
683 .read = seq_read, 681 .read = seq_read,
684 .llseek = seq_lseek, 682 .llseek = seq_lseek,
685 .release = seq_release_private, 683 .release = proc_map_release,
686}; 684};
687 685
688const struct file_operations proc_tid_smaps_operations = { 686const struct file_operations proc_tid_smaps_operations = {
689 .open = tid_smaps_open, 687 .open = tid_smaps_open,
690 .read = seq_read, 688 .read = seq_read,
691 .llseek = seq_lseek, 689 .llseek = seq_lseek,
692 .release = seq_release_private, 690 .release = proc_map_release,
693}; 691};
694 692
695/* 693/*
@@ -931,23 +929,32 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
931 while (addr < end) { 929 while (addr < end) {
932 struct vm_area_struct *vma = find_vma(walk->mm, addr); 930 struct vm_area_struct *vma = find_vma(walk->mm, addr);
933 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); 931 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
934 unsigned long vm_end; 932 /* End of address space hole, which we mark as non-present. */
935 933 unsigned long hole_end;
936 if (!vma) { 934
937 vm_end = end; 935 if (vma)
938 } else { 936 hole_end = min(end, vma->vm_start);
939 vm_end = min(end, vma->vm_end); 937 else
940 if (vma->vm_flags & VM_SOFTDIRTY) 938 hole_end = end;
941 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); 939
940 for (; addr < hole_end; addr += PAGE_SIZE) {
941 err = add_to_pagemap(addr, &pme, pm);
942 if (err)
943 goto out;
942 } 944 }
943 945
944 for (; addr < vm_end; addr += PAGE_SIZE) { 946 if (!vma)
947 break;
948
949 /* Addresses in the VMA. */
950 if (vma->vm_flags & VM_SOFTDIRTY)
951 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
952 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
945 err = add_to_pagemap(addr, &pme, pm); 953 err = add_to_pagemap(addr, &pme, pm);
946 if (err) 954 if (err)
947 goto out; 955 goto out;
948 } 956 }
949 } 957 }
950
951out: 958out:
952 return err; 959 return err;
953} 960}
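The pagemap_pte_hole() rewrite above (and the pagemap_pte_range() rewrite below) make the walk two-phase: first emit not-present entries for the hole up to hole_end, the next VMA's start clamped to the walk end, then emit entries for addresses inside the VMA, then refetch the VMA and repeat. A runnable model of that loop over made-up ranges:

#include <stdio.h>

#define PAGE 0x1000UL

struct vma { unsigned long start, end; };

static struct vma vmas[] = { { 0x2000, 0x4000 }, { 0x7000, 0x8000 } };

static struct vma *find_vma(unsigned long addr)
{
	unsigned int i;

	for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		if (vmas[i].end > addr)	/* first vma ending above addr */
			return &vmas[i];
	return NULL;
}

int main(void)
{
	unsigned long addr = 0, end = 0x9000;
	struct vma *vma = find_vma(addr);

	for (;;) {
		/* Phase 1: the hole before the next vma (if any). */
		unsigned long hole_end =
			(vma && vma->start < end) ? vma->start : end;

		for (; addr < hole_end; addr += PAGE)
			printf("%#lx not present\n", addr);

		if (!vma || vma->start >= end)
			break;

		/* Phase 2: addresses inside the vma. */
		for (; addr < (vma->end < end ? vma->end : end); addr += PAGE)
			printf("%#lx in vma\n", addr);

		if (addr == end)
			break;
		vma = find_vma(addr);	/* refetch, as the new loop does */
	}
	return 0;
}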
@@ -1020,7 +1027,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1020 spinlock_t *ptl; 1027 spinlock_t *ptl;
1021 pte_t *pte; 1028 pte_t *pte;
1022 int err = 0; 1029 int err = 0;
1023 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1024 1030
1025 /* find the first VMA at or above 'addr' */ 1031 /* find the first VMA at or above 'addr' */
1026 vma = find_vma(walk->mm, addr); 1032 vma = find_vma(walk->mm, addr);
@@ -1034,6 +1040,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1034 1040
1035 for (; addr != end; addr += PAGE_SIZE) { 1041 for (; addr != end; addr += PAGE_SIZE) {
1036 unsigned long offset; 1042 unsigned long offset;
1043 pagemap_entry_t pme;
1037 1044
1038 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1045 offset = (addr & ~PAGEMAP_WALK_MASK) >>
1039 PAGE_SHIFT; 1046 PAGE_SHIFT;
@@ -1048,32 +1055,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1048 1055
1049 if (pmd_trans_unstable(pmd)) 1056 if (pmd_trans_unstable(pmd))
1050 return 0; 1057 return 0;
1051 for (; addr != end; addr += PAGE_SIZE) { 1058
1052 int flags2; 1059 while (1) {
1053 1060 /* End of address space hole, which we mark as non-present. */
1054 /* check to see if we've left 'vma' behind 1061 unsigned long hole_end;
1055 * and need a new, higher one */ 1062
1056 if (vma && (addr >= vma->vm_end)) { 1063 if (vma)
1057 vma = find_vma(walk->mm, addr); 1064 hole_end = min(end, vma->vm_start);
1058 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1065 else
1059 flags2 = __PM_SOFT_DIRTY; 1066 hole_end = end;
1060 else 1067
1061 flags2 = 0; 1068 for (; addr < hole_end; addr += PAGE_SIZE) {
1062 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 1069 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1070
1071 err = add_to_pagemap(addr, &pme, pm);
1072 if (err)
1073 return err;
1063 } 1074 }
1064 1075
1065 /* check that 'vma' actually covers this address, 1076 if (!vma || vma->vm_start >= end)
1066 * and that it isn't a huge page vma */ 1077 break;
1067 if (vma && (vma->vm_start <= addr) && 1078 /*
1068 !is_vm_hugetlb_page(vma)) { 1079 * We can't possibly be in a hugetlb VMA. In general,
1080 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1081 * the pmd_entry can only be called on addresses in a
1082 * hugetlb if the walk starts in a non-hugetlb VMA and
1083 * spans a hugepage VMA. Since pagemap_read walks are
1084 * PMD-sized and PMD-aligned, this will never be true.
1085 */
1086 BUG_ON(is_vm_hugetlb_page(vma));
1087
1088 /* Addresses in the VMA. */
1089 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1090 pagemap_entry_t pme;
1069 pte = pte_offset_map(pmd, addr); 1091 pte = pte_offset_map(pmd, addr);
1070 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1092 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1071 /* unmap before userspace copy */
1072 pte_unmap(pte); 1093 pte_unmap(pte);
1094 err = add_to_pagemap(addr, &pme, pm);
1095 if (err)
1096 return err;
1073 } 1097 }
1074 err = add_to_pagemap(addr, &pme, pm); 1098
1075 if (err) 1099 if (addr == end)
1076 return err; 1100 break;
1101
1102 vma = find_vma(walk->mm, addr);
1077 } 1103 }
1078 1104
1079 cond_resched(); 1105 cond_resched();
@@ -1406,7 +1432,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1406 struct vm_area_struct *vma = v; 1432 struct vm_area_struct *vma = v;
1407 struct numa_maps *md = &numa_priv->md; 1433 struct numa_maps *md = &numa_priv->md;
1408 struct file *file = vma->vm_file; 1434 struct file *file = vma->vm_file;
1409 struct task_struct *task = proc_priv->task;
1410 struct mm_struct *mm = vma->vm_mm; 1435 struct mm_struct *mm = vma->vm_mm;
1411 struct mm_walk walk = {}; 1436 struct mm_walk walk = {};
1412 struct mempolicy *pol; 1437 struct mempolicy *pol;
@@ -1426,9 +1451,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1426 walk.private = md; 1451 walk.private = md;
1427 walk.mm = mm; 1452 walk.mm = mm;
1428 1453
1429 pol = get_vma_policy(task, vma, vma->vm_start); 1454 pol = __get_vma_policy(vma, vma->vm_start);
1430 mpol_to_str(buffer, sizeof(buffer), pol); 1455 if (pol) {
1431 mpol_cond_put(pol); 1456 mpol_to_str(buffer, sizeof(buffer), pol);
1457 mpol_cond_put(pol);
1458 } else {
1459 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
1460 }
1432 1461
1433 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1462 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1434 1463
@@ -1438,7 +1467,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1438 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1467 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1439 seq_puts(m, " heap"); 1468 seq_puts(m, " heap");
1440 } else { 1469 } else {
1441 pid_t tid = vm_is_stack(task, vma, is_pid); 1470 pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
1442 if (tid != 0) { 1471 if (tid != 0) {
1443 /* 1472 /*
1444 * Thread stack in /proc/PID/task/TID/maps or 1473 * Thread stack in /proc/PID/task/TID/maps or
@@ -1486,9 +1515,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1486 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1515 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1487out: 1516out:
1488 seq_putc(m, '\n'); 1517 seq_putc(m, '\n');
1489 1518 m_cache_vma(m, vma);
1490 if (m->count < m->size)
1491 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1492 return 0; 1519 return 0;
1493} 1520}
1494 1521
@@ -1519,20 +1546,8 @@ static const struct seq_operations proc_tid_numa_maps_op = {
1519static int numa_maps_open(struct inode *inode, struct file *file, 1546static int numa_maps_open(struct inode *inode, struct file *file,
1520 const struct seq_operations *ops) 1547 const struct seq_operations *ops)
1521{ 1548{
1522 struct numa_maps_private *priv; 1549 return proc_maps_open(inode, file, ops,
1523 int ret = -ENOMEM; 1550 sizeof(struct numa_maps_private));
1524 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1525 if (priv) {
1526 priv->proc_maps.pid = proc_pid(inode);
1527 ret = seq_open(file, ops);
1528 if (!ret) {
1529 struct seq_file *m = file->private_data;
1530 m->private = priv;
1531 } else {
1532 kfree(priv);
1533 }
1534 }
1535 return ret;
1536} 1551}
1537 1552
1538static int pid_numa_maps_open(struct inode *inode, struct file *file) 1553static int pid_numa_maps_open(struct inode *inode, struct file *file)
@@ -1549,13 +1564,13 @@ const struct file_operations proc_pid_numa_maps_operations = {
1549 .open = pid_numa_maps_open, 1564 .open = pid_numa_maps_open,
1550 .read = seq_read, 1565 .read = seq_read,
1551 .llseek = seq_lseek, 1566 .llseek = seq_lseek,
1552 .release = seq_release_private, 1567 .release = proc_map_release,
1553}; 1568};
1554 1569
1555const struct file_operations proc_tid_numa_maps_operations = { 1570const struct file_operations proc_tid_numa_maps_operations = {
1556 .open = tid_numa_maps_open, 1571 .open = tid_numa_maps_open,
1557 .read = seq_read, 1572 .read = seq_read,
1558 .llseek = seq_lseek, 1573 .llseek = seq_lseek,
1559 .release = seq_release_private, 1574 .release = proc_map_release,
1560}; 1575};
1561#endif /* CONFIG_NUMA */ 1576#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 678455d2d683..599ec2e20104 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm,
123 return size; 123 return size;
124} 124}
125 125
126static pid_t pid_of_stack(struct proc_maps_private *priv,
127 struct vm_area_struct *vma, bool is_pid)
128{
129 struct inode *inode = priv->inode;
130 struct task_struct *task;
131 pid_t ret = 0;
132
133 rcu_read_lock();
134 task = pid_task(proc_pid(inode), PIDTYPE_PID);
135 if (task) {
136 task = task_of_stack(task, vma, is_pid);
137 if (task)
138 ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
139 }
140 rcu_read_unlock();
141
142 return ret;
143}
144
126/* 145/*
127 * display a single VMA to a sequenced file 146 * display a single VMA to a sequenced file
128 */ 147 */
@@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
163 seq_pad(m, ' '); 182 seq_pad(m, ' ');
164 seq_path(m, &file->f_path, ""); 183 seq_path(m, &file->f_path, "");
165 } else if (mm) { 184 } else if (mm) {
166 pid_t tid = vm_is_stack(priv->task, vma, is_pid); 185 pid_t tid = pid_of_stack(priv, vma, is_pid);
167 186
168 if (tid != 0) { 187 if (tid != 0) {
169 seq_pad(m, ' '); 188 seq_pad(m, ' ');
@@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos)
212 loff_t n = *pos; 231 loff_t n = *pos;
213 232
214 /* pin the task and mm whilst we play with them */ 233 /* pin the task and mm whilst we play with them */
215 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 234 priv->task = get_proc_task(priv->inode);
216 if (!priv->task) 235 if (!priv->task)
217 return ERR_PTR(-ESRCH); 236 return ERR_PTR(-ESRCH);
218 237
219 mm = mm_access(priv->task, PTRACE_MODE_READ); 238 mm = priv->mm;
220 if (!mm || IS_ERR(mm)) { 239 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
221 put_task_struct(priv->task); 240 return NULL;
222 priv->task = NULL;
223 return mm;
224 }
225 down_read(&mm->mmap_sem);
226 241
242 down_read(&mm->mmap_sem);
227 /* start from the Nth VMA */ 243 /* start from the Nth VMA */
228 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) 244 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
229 if (n-- == 0) 245 if (n-- == 0)
230 return p; 246 return p;
247
248 up_read(&mm->mmap_sem);
249 mmput(mm);
231 return NULL; 250 return NULL;
232} 251}
233 252
@@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml)
235{ 254{
236 struct proc_maps_private *priv = m->private; 255 struct proc_maps_private *priv = m->private;
237 256
257 if (!IS_ERR_OR_NULL(_vml)) {
258 up_read(&priv->mm->mmap_sem);
259 mmput(priv->mm);
260 }
238 if (priv->task) { 261 if (priv->task) {
239 struct mm_struct *mm = priv->task->mm;
240 up_read(&mm->mmap_sem);
241 mmput(mm);
242 put_task_struct(priv->task); 262 put_task_struct(priv->task);
263 priv->task = NULL;
243 } 264 }
244} 265}
245 266
@@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file,
269 const struct seq_operations *ops) 290 const struct seq_operations *ops)
270{ 291{
271 struct proc_maps_private *priv; 292 struct proc_maps_private *priv;
272 int ret = -ENOMEM; 293
273 294 priv = __seq_open_private(file, ops, sizeof(*priv));
274 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 295 if (!priv)
275 if (priv) { 296 return -ENOMEM;
276 priv->pid = proc_pid(inode); 297
277 ret = seq_open(file, ops); 298 priv->inode = inode;
278 if (!ret) { 299 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
279 struct seq_file *m = file->private_data; 300 if (IS_ERR(priv->mm)) {
280 m->private = priv; 301 int err = PTR_ERR(priv->mm);
281 } else { 302
282 kfree(priv); 303 seq_release_private(inode, file);
283 } 304 return err;
284 } 305 }
285 return ret; 306
307 return 0;
308}
309
310
311static int map_release(struct inode *inode, struct file *file)
312{
313 struct seq_file *seq = file->private_data;
314 struct proc_maps_private *priv = seq->private;
315
316 if (priv->mm)
317 mmdrop(priv->mm);
318
319 return seq_release_private(inode, file);
286} 320}
287 321
288static int pid_maps_open(struct inode *inode, struct file *file) 322static int pid_maps_open(struct inode *inode, struct file *file)
@@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = {
299 .open = pid_maps_open, 333 .open = pid_maps_open,
300 .read = seq_read, 334 .read = seq_read,
301 .llseek = seq_lseek, 335 .llseek = seq_lseek,
302 .release = seq_release_private, 336 .release = map_release,
303}; 337};
304 338
305const struct file_operations proc_tid_maps_operations = { 339const struct file_operations proc_tid_maps_operations = {
306 .open = tid_maps_open, 340 .open = tid_maps_open,
307 .read = seq_read, 341 .read = seq_read,
308 .llseek = seq_lseek, 342 .llseek = seq_lseek,
309 .release = seq_release_private, 343 .release = map_release,
310}; 344};
311 345
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index f2d0eee9d1f1..8b663b2d9562 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2725,7 +2725,7 @@ static int __init dquot_init(void)
2725 panic("Cannot create dquot hash table"); 2725 panic("Cannot create dquot hash table");
2726 2726
2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) { 2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2728 ret = percpu_counter_init(&dqstats.counter[i], 0); 2728 ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
2729 if (ret) 2729 if (ret)
2730 panic("Cannot create dquot stat counters"); 2730 panic("Cannot create dquot stat counters");
2731 } 2731 }
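percpu_counter_init() grew a gfp_t argument in this series because it allocates per-CPU storage, so each caller now states its allocation context explicitly; dquot_init() here and alloc_super() further down both run in plain process context and pass GFP_KERNEL. The call shape, as a kernel-context sketch rather than a runnable program:

	struct percpu_counter c;
	int err;

	err = percpu_counter_init(&c, 0, GFP_KERNEL);
	if (err)
		return err;
	/* ... use the counter ... */
	percpu_counter_destroy(&c);

A caller that can be reached under filesystem locks would pass GFP_NOFS instead; that is standard gfp discipline rather than anything these hunks show.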
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 735c2c2b4536..1894d96ccb7c 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -506,6 +506,9 @@ typedef struct reiserfs_proc_info_data {
506} reiserfs_proc_info_data_t; 506} reiserfs_proc_info_data_t;
507#endif 507#endif
508 508
509/* Number of quota types we support */
510#define REISERFS_MAXQUOTAS 2
511
509/* reiserfs union of in-core super block data */ 512/* reiserfs union of in-core super block data */
510struct reiserfs_sb_info { 513struct reiserfs_sb_info {
511 /* Buffer containing the super block */ 514 /* Buffer containing the super block */
@@ -615,7 +618,7 @@ struct reiserfs_sb_info {
615 spinlock_t old_work_lock; /* protects old_work and work_queued */ 618 spinlock_t old_work_lock; /* protects old_work and work_queued */
616 619
617#ifdef CONFIG_QUOTA 620#ifdef CONFIG_QUOTA
618 char *s_qf_names[MAXQUOTAS]; 621 char *s_qf_names[REISERFS_MAXQUOTAS];
619 int s_jquota_fmt; 622 int s_jquota_fmt;
620#endif 623#endif
621 char *s_jdev; /* Stored jdev for mount option showing */ 624 char *s_jdev; /* Stored jdev for mount option showing */
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index d46e88a33b02..f1376c92cf74 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -206,7 +206,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	int i;
 	int ms_active_set;
-	int quota_enabled[MAXQUOTAS];
+	int quota_enabled[REISERFS_MAXQUOTAS];
 #endif
 
 	/* compose key to look for "save" links */
@@ -227,7 +227,7 @@ static int finish_unfinished(struct super_block *s)
 		s->s_flags |= MS_ACTIVE;
 	}
 	/* Turn on quotas so that they are updated correctly */
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
 		quota_enabled[i] = 1;
 		if (REISERFS_SB(s)->s_qf_names[i]) {
 			int ret;
@@ -370,7 +370,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	reiserfs_write_unlock(s);
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
 		if (sb_dqopt(s)->files[i] && quota_enabled[i])
 			dquot_quota_off(s, i);
 	}
@@ -1360,7 +1360,7 @@ static void handle_quota_files(struct super_block *s, char **qf_names,
 {
 	int i;
 
-	for (i = 0; i < MAXQUOTAS; i++) {
+	for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
 		if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
 			kfree(REISERFS_SB(s)->s_qf_names[i]);
 		REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
@@ -1381,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	struct reiserfs_journal *journal = SB_JOURNAL(s);
 	char *new_opts = kstrdup(arg, GFP_KERNEL);
 	int err;
-	char *qf_names[MAXQUOTAS];
+	char *qf_names[REISERFS_MAXQUOTAS];
 	unsigned int qfmt = 0;
 #ifdef CONFIG_QUOTA
 	int i;
@@ -1400,7 +1400,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
 	    qf_names, &qfmt)) {
 #ifdef CONFIG_QUOTA
-		for (i = 0; i < MAXQUOTAS; i++)
+		for (i = 0; i < REISERFS_MAXQUOTAS; i++)
 			if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
 				kfree(qf_names[i]);
 #endif
@@ -1844,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 	char *jdev_name;
 	struct reiserfs_sb_info *sbi;
 	int errval = -EINVAL;
-	char *qf_names[MAXQUOTAS] = {};
+	char *qf_names[REISERFS_MAXQUOTAS] = {};
 	unsigned int qfmt = 0;
 
 	save_mount_options(s, data);
@@ -2169,7 +2169,7 @@ error_unlocked:
 #ifdef CONFIG_QUOTA
 	{
 		int j;
-		for (j = 0; j < MAXQUOTAS; j++)
+		for (j = 0; j < REISERFS_MAXQUOTAS; j++)
 			kfree(qf_names[j]);
 	}
 #endif
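The reiserfs hunks above all make the same substitution: every quota array that was sized by the VFS-wide MAXQUOTAS is now sized by a filesystem-private REISERFS_MAXQUOTAS. A plausible reading (not stated in the diff itself) is that this decouples reiserfs, which only knows user and group quotas, from any future growth of the generic constant. A minimal sketch of the pattern, with hypothetical names:

/* sketch: size private arrays by the quota types this fs supports,
 * not by the global maximum (hypothetical example, not kernel code) */
#define MYFS_MAXQUOTAS 2	/* user + group; project quota unsupported */

struct myfs_sb_info {
	char *qf_names[MYFS_MAXQUOTAS];	/* stays 2 even if VFS MAXQUOTAS grows */
};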
diff --git a/fs/stack.c b/fs/stack.c
index 5b5388250e29..a54e33ed10f1 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -44,7 +44,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 	 * include/linux/fs.h).  We don't necessarily hold i_mutex when this
 	 * is called, so take i_lock for that case.
 	 *
-	 * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the
+	 * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
 	 * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
 	 * for that case too, and do both at once by combining the tests.
 	 *
diff --git a/fs/super.c b/fs/super.c
index b9a214d2fe98..1b836107acee 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		goto fail;
 
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-		if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
+		if (percpu_counter_init(&s->s_writers.counter[i], 0,
+					GFP_KERNEL) < 0)
 			goto fail;
 		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
 				 &type->s_writers_key[i], 0);
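The alloc_super() hunk reflects an API change rather than a logic change: percpu_counter_init() now takes a gfp_t that is forwarded to its internal per-cpu allocation. A minimal call-site sketch, assuming process context where GFP_KERNEL is safe (the function name here is hypothetical):

#include <linux/percpu_counter.h>

/* sketch: a caller updated for the three-argument form */
static int example_counter_setup(struct percpu_counter *pc)
{
	/* the new gfp_t argument controls how the per-cpu data is allocated */
	return percpu_counter_init(pc, 0, GFP_KERNEL);
}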
diff --git a/fs/sync.c b/fs/sync.c
index b28d1dd10e8b..bdc729d80e5e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -65,7 +65,7 @@ int sync_filesystem(struct super_block *sb)
 		return ret;
 	return __sync_filesystem(sb, 1);
 }
-EXPORT_SYMBOL_GPL(sync_filesystem);
+EXPORT_SYMBOL(sync_filesystem);
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 80c350216ea8..b46ffa94372a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -333,8 +333,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		spin_lock_irq(&ctx->wqh.lock);
 		if (!timerfd_canceled(ctx)) {
 			ctx->ticks = ticks;
-			if (ticks)
-				wake_up_locked(&ctx->wqh);
+			wake_up_locked(&ctx->wqh);
 		} else
 			ret = -ECANCELED;
 		spin_unlock_irq(&ctx->wqh.lock);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 86c6743ec1fe..bb15771b92ae 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -223,11 +223,18 @@ out:
 
 static int udf_release_file(struct inode *inode, struct file *filp)
 {
-	if (filp->f_mode & FMODE_WRITE) {
+	if (filp->f_mode & FMODE_WRITE &&
+	    atomic_read(&inode->i_writecount) > 1) {
+		/*
+		 * Grab i_mutex to avoid races with writes changing i_size
+		 * while we are running.
+		 */
+		mutex_lock(&inode->i_mutex);
 		down_write(&UDF_I(inode)->i_data_sem);
 		udf_discard_prealloc(inode);
 		udf_truncate_tail_extent(inode);
 		up_write(&UDF_I(inode)->i_data_sem);
+		mutex_unlock(&inode->i_mutex);
 	}
 	return 0;
 }
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6eaf5edf1ea1..e77db621ec89 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode)
 	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
-struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
+struct inode *udf_new_inode(struct inode *dir, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
@@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	struct udf_inode_info *iinfo;
 	struct udf_inode_info *dinfo = UDF_I(dir);
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+	int err;
 
 	inode = new_inode(sb);
 
-	if (!inode) {
-		*err = -ENOMEM;
-		return NULL;
-	}
-	*err = -ENOSPC;
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
 
 	iinfo = UDF_I(inode);
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
@@ -80,21 +78,22 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	}
 	if (!iinfo->i_ext.i_data) {
 		iput(inode);
-		*err = -ENOMEM;
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 
+	err = -ENOSPC;
 	block = udf_new_block(dir->i_sb, NULL,
 			      dinfo->i_location.partitionReferenceNum,
-			      start, err);
-	if (*err) {
+			      start, &err);
+	if (err) {
 		iput(inode);
-		return NULL;
+		return ERR_PTR(err);
 	}
 
 	lvidiu = udf_sb_lvidiu(sb);
 	if (lvidiu) {
 		iinfo->i_unique = lvid_get_unique_id(sb);
+		inode->i_generation = iinfo->i_unique;
 		mutex_lock(&sbi->s_alloc_mutex);
 		if (S_ISDIR(mode))
 			le32_add_cpu(&lvidiu->numDirs, 1);
@@ -123,9 +122,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 		iinfo->i_crtime = current_fs_time(inode->i_sb);
-	insert_inode_hash(inode);
+	if (unlikely(insert_inode_locked(inode) < 0)) {
+		make_bad_inode(inode);
+		iput(inode);
+		return ERR_PTR(-EIO);
+	}
 	mark_inode_dirty(inode);
 
-	*err = 0;
 	return inode;
 }
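With this change udf_new_inode() stops reporting failure through a NULL return plus an int *err out-parameter and instead encodes the errno in the returned pointer. Callers then follow the standard ERR_PTR/IS_ERR/PTR_ERR idiom; a minimal sketch of the caller side, with a hypothetical caller name:

/* sketch: consuming an ERR_PTR-style constructor (hypothetical caller) */
static int example_caller(struct inode *dir, umode_t mode)
{
	struct inode *inode = udf_new_inode(dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);	/* recover the negative errno */
	/* ... set up and use the new inode ... */
	return 0;
}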
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 236cd48184c2..c9b4df5810d5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -51,7 +51,6 @@ MODULE_LICENSE("GPL");
 
 static umode_t udf_convert_permissions(struct fileEntry *);
 static int udf_update_inode(struct inode *, int);
-static void udf_fill_inode(struct inode *, struct buffer_head *);
 static int udf_sync_inode(struct inode *inode);
 static int udf_alloc_i_data(struct inode *inode, size_t size);
 static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
@@ -1271,12 +1270,33 @@ update_time:
 	return 0;
 }
 
-static void __udf_read_inode(struct inode *inode)
+/*
+ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
+static int udf_read_inode(struct inode *inode, bool hidden_inode)
 {
 	struct buffer_head *bh = NULL;
 	struct fileEntry *fe;
+	struct extendedFileEntry *efe;
 	uint16_t ident;
 	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+	struct kernel_lb_addr *iloc = &iinfo->i_location;
+	unsigned int link_count;
+	unsigned int indirections = 0;
+	int ret = -EIO;
+
+reread:
+	if (iloc->logicalBlockNum >=
+	    sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
+		udf_debug("block=%d, partition=%d out of range\n",
+			  iloc->logicalBlockNum, iloc->partitionReferenceNum);
+		return -EIO;
+	}
 
 	/*
 	 * Set defaults, but the inode is still incomplete!
@@ -1290,78 +1310,54 @@ static void __udf_read_inode(struct inode *inode)
 	 *	i_nlink = 1
 	 *	i_op = NULL;
 	 */
-	bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
+	bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident);
 	if (!bh) {
 		udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
-		make_bad_inode(inode);
-		return;
+		return -EIO;
 	}
 
 	if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
 	    ident != TAG_IDENT_USE) {
 		udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
 			inode->i_ino, ident);
-		brelse(bh);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
 
 	fe = (struct fileEntry *)bh->b_data;
+	efe = (struct extendedFileEntry *)bh->b_data;
 
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
 
-		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
-					&ident);
+		ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident);
 		if (ident == TAG_IDENT_IE && ibh) {
-			struct buffer_head *nbh = NULL;
 			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
 
 			ie = (struct indirectEntry *)ibh->b_data;
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
-			if (ie->indirectICB.extLength &&
-			    (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
-						    &ident))) {
-				if (ident == TAG_IDENT_FE ||
-				    ident == TAG_IDENT_EFE) {
-					memcpy(&iinfo->i_location,
-					       &loc,
-					       sizeof(struct kernel_lb_addr));
-					brelse(bh);
-					brelse(ibh);
-					brelse(nbh);
-					__udf_read_inode(inode);
-					return;
+			if (ie->indirectICB.extLength) {
+				brelse(ibh);
+				memcpy(&iinfo->i_location, &loc,
+				       sizeof(struct kernel_lb_addr));
+				if (++indirections > UDF_MAX_ICB_NESTING) {
+					udf_err(inode->i_sb,
+						"too many ICBs in ICB hierarchy"
+						" (max %d supported)\n",
+						UDF_MAX_ICB_NESTING);
+					goto out;
 				}
-				brelse(nbh);
+				brelse(bh);
+				goto reread;
 			}
 		}
 		brelse(ibh);
 	} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
 		udf_err(inode->i_sb, "unsupported strategy type: %d\n",
 			le16_to_cpu(fe->icbTag.strategyType));
-		brelse(bh);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
-	udf_fill_inode(inode, bh);
-
-	brelse(bh);
-}
-
-static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
-{
-	struct fileEntry *fe;
-	struct extendedFileEntry *efe;
-	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
-	struct udf_inode_info *iinfo = UDF_I(inode);
-	unsigned int link_count;
-
-	fe = (struct fileEntry *)bh->b_data;
-	efe = (struct extendedFileEntry *)bh->b_data;
-
 	if (fe->icbTag.strategyType == cpu_to_le16(4))
 		iinfo->i_strat4096 = 0;
 	else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */
@@ -1378,11 +1374,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
 		iinfo->i_efe = 1;
 		iinfo->i_use = 0;
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-					sizeof(struct extendedFileEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+					sizeof(struct extendedFileEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct extendedFileEntry),
 		       inode->i_sb->s_blocksize -
@@ -1390,11 +1385,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
 		iinfo->i_efe = 0;
 		iinfo->i_use = 0;
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-						sizeof(struct fileEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+						sizeof(struct fileEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct fileEntry),
 		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
@@ -1404,18 +1398,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		iinfo->i_lenAlloc = le32_to_cpu(
 				((struct unallocSpaceEntry *)bh->b_data)->
 				 lengthAllocDescs);
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-					sizeof(struct unallocSpaceEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+					sizeof(struct unallocSpaceEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct unallocSpaceEntry),
 		       inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
-		return;
+		return 0;
 	}
 
+	ret = -EIO;
 	read_lock(&sbi->s_cred_lock);
 	i_uid_write(inode, le32_to_cpu(fe->uid));
 	if (!uid_valid(inode->i_uid) ||
@@ -1441,8 +1435,13 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	read_unlock(&sbi->s_cred_lock);
 
 	link_count = le16_to_cpu(fe->fileLinkCount);
-	if (!link_count)
+	if (!link_count) {
+		if (!hidden_inode) {
+			ret = -ESTALE;
+			goto out;
+		}
 		link_count = 1;
+	}
 	set_nlink(inode, link_count);
 
 	inode->i_size = le64_to_cpu(fe->informationLength);
@@ -1488,6 +1487,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
 		iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
 	}
+	inode->i_generation = iinfo->i_unique;
 
 	switch (fe->icbTag.fileType) {
 	case ICBTAG_FILE_TYPE_DIRECTORY:
@@ -1537,8 +1537,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 	default:
 		udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
 			inode->i_ino, fe->icbTag.fileType);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
 		struct deviceSpec *dsea =
@@ -1549,8 +1548,12 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 				le32_to_cpu(dsea->minorDeviceIdent)));
 		/* Developer ID ??? */
 	} else
-		make_bad_inode(inode);
+		goto out;
 	}
+	ret = 0;
+out:
+	brelse(bh);
+	return ret;
 }
 
 static int udf_alloc_i_data(struct inode *inode, size_t size)
@@ -1664,7 +1667,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 			FE_PERM_U_DELETE | FE_PERM_U_CHATTR));
 	fe->permissions = cpu_to_le32(udfperms);
 
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
 		fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
 	else
 		fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
@@ -1826,36 +1829,28 @@ out:
 	return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
+struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
+			 bool hidden_inode)
 {
 	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
 	struct inode *inode = iget_locked(sb, block);
+	int err;
 
 	if (!inode)
-		return NULL;
-
-	if (inode->i_state & I_NEW) {
-		memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
-		__udf_read_inode(inode);
-		unlock_new_inode(inode);
-	}
+		return ERR_PTR(-ENOMEM);
 
-	if (is_bad_inode(inode))
-		goto out_iput;
+	if (!(inode->i_state & I_NEW))
+		return inode;
 
-	if (ino->logicalBlockNum >= UDF_SB(sb)->
-			s_partmaps[ino->partitionReferenceNum].s_partition_len) {
-		udf_debug("block=%d, partition=%d out of range\n",
-			  ino->logicalBlockNum, ino->partitionReferenceNum);
-		make_bad_inode(inode);
-		goto out_iput;
+	memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
+	err = udf_read_inode(inode, hidden_inode);
+	if (err < 0) {
+		iget_failed(inode);
+		return ERR_PTR(err);
 	}
+	unlock_new_inode(inode);
 
 	return inode;
-
- out_iput:
-	iput(inode);
-	return NULL;
 }
 
 int udf_add_aext(struct inode *inode, struct extent_position *epos,
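The rewrite of __udf_read_inode() into udf_read_inode() replaces unbounded recursion through indirect ICBs with an iterative reread loop capped at UDF_MAX_ICB_NESTING, so corrupted (or maliciously crafted) media can no longer recurse the kernel stack to death. The control flow reduces to a bounded follow-the-link loop; a condensed C sketch with hypothetical helper names (this is not the UDF code itself):

#include <errno.h>

#define MAX_NESTING 1024

struct ctx;				/* hypothetical traversal state */
int load_block(struct ctx *c);		/* read + tag-check current block */
int block_is_indirect(struct ctx *c);	/* is it an indirection entry? */
int parse_block(struct ctx *c);		/* fill in-core inode from entry */
void follow_indirection(struct ctx *c);	/* retarget to the linked block */

int read_with_indirection_limit(struct ctx *c)
{
	unsigned int indirections = 0;

	for (;;) {
		if (!load_block(c))
			return -EIO;		/* I/O or tag check failed */
		if (!block_is_indirect(c))
			return parse_block(c);	/* reached the real entry */
		if (++indirections > MAX_NESTING)
			return -EIO;		/* refuse runaway chains */
		follow_indirection(c);		/* then loop and reread */
	}
}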
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 9737cba1357d..c12e260fd6c4 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 					NULL, 0),
 		};
 		inode = udf_iget(dir->i_sb, lb);
-		if (!inode) {
-			return ERR_PTR(-EACCES);
-		}
+		if (IS_ERR(inode))
+			return inode;
 	} else
 #endif /* UDF_RECOVERY */
 
@@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 
 		loc = lelb_to_cpu(cfi.icb.extLocation);
 		inode = udf_iget(dir->i_sb, &loc);
-		if (!inode) {
-			return ERR_PTR(-EACCES);
-		}
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -550,32 +548,18 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
 	return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
 }
 
-static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 {
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct inode *dir = dentry->d_parent->d_inode;
 	struct udf_fileident_bh fibh;
-	struct inode *inode;
 	struct fileIdentDesc cfi, *fi;
 	int err;
-	struct udf_inode_info *iinfo;
-
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode) {
-		return err;
-	}
-
-	iinfo = UDF_I(inode);
-	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		inode->i_data.a_ops = &udf_adinicb_aops;
-	else
-		inode->i_data.a_ops = &udf_aops;
-	inode->i_op = &udf_file_inode_operations;
-	inode->i_fop = &udf_file_operations;
-	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi) {
+	if (unlikely(!fi)) {
 		inode_dec_link_count(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		return err;
 	}
@@ -589,23 +573,21 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
+	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 
 	return 0;
 }
 
-static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl)
 {
-	struct inode *inode;
-	struct udf_inode_info *iinfo;
-	int err;
+	struct inode *inode = udf_new_inode(dir, mode);
 
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode)
-		return err;
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
-	iinfo = UDF_I(inode);
-	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 		inode->i_data.a_ops = &udf_adinicb_aops;
 	else
 		inode->i_data.a_ops = &udf_aops;
@@ -613,7 +595,25 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inode->i_fop = &udf_file_operations;
 	mark_inode_dirty(inode);
 
+	return udf_add_nondir(dentry, inode);
+}
+
+static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = udf_new_inode(dir, mode);
+
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		inode->i_data.a_ops = &udf_adinicb_aops;
+	else
+		inode->i_data.a_ops = &udf_aops;
+	inode->i_op = &udf_file_inode_operations;
+	inode->i_fop = &udf_file_operations;
+	mark_inode_dirty(inode);
 	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
 }
 
@@ -621,44 +621,16 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 		     dev_t rdev)
 {
 	struct inode *inode;
-	struct udf_fileident_bh fibh;
-	struct fileIdentDesc cfi, *fi;
-	int err;
-	struct udf_inode_info *iinfo;
 
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 
-	err = -EIO;
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode)
-		goto out;
+	inode = udf_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
-	iinfo = UDF_I(inode);
 	init_special_inode(inode, mode, rdev);
-	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi) {
-		inode_dec_link_count(inode);
-		iput(inode);
-		return err;
-	}
-	cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
-	cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
-	*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-		cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
-	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		mark_inode_dirty(dir);
-	mark_inode_dirty(inode);
-
-	if (fibh.sbh != fibh.ebh)
-		brelse(fibh.ebh);
-	brelse(fibh.sbh);
-	d_instantiate(dentry, inode);
-	err = 0;
-
-out:
-	return err;
+	return udf_add_nondir(dentry, inode);
 }
 
 static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -670,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct udf_inode_info *dinfo = UDF_I(dir);
 	struct udf_inode_info *iinfo;
 
-	err = -EIO;
-	inode = udf_new_inode(dir, S_IFDIR | mode, &err);
-	if (!inode)
-		goto out;
+	inode = udf_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	iinfo = UDF_I(inode);
 	inode->i_op = &udf_dir_inode_operations;
@@ -681,6 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
 	if (!fi) {
 		inode_dec_link_count(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out;
 	}
@@ -699,6 +671,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (!fi) {
 		clear_nlink(inode);
 		mark_inode_dirty(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out;
 	}
@@ -710,6 +683,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
 	inc_nlink(dir);
 	mark_inode_dirty(dir);
+	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
@@ -876,14 +850,11 @@ out:
 static int udf_symlink(struct inode *dir, struct dentry *dentry,
 		       const char *symname)
 {
-	struct inode *inode;
+	struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	struct pathComponent *pc;
 	const char *compstart;
-	struct udf_fileident_bh fibh;
 	struct extent_position epos = {};
 	int eoffset, elen = 0;
-	struct fileIdentDesc *fi;
-	struct fileIdentDesc cfi;
 	uint8_t *ea;
 	int err;
 	int block;
@@ -892,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	struct udf_inode_info *iinfo;
 	struct super_block *sb = dir->i_sb;
 
-	inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
-	if (!inode)
-		goto out;
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	iinfo = UDF_I(inode);
 	down_write(&iinfo->i_data_sem);
@@ -1012,24 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	mark_inode_dirty(inode);
 	up_write(&iinfo->i_data_sem);
 
-	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi)
-		goto out_no_entry;
-	cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
-	cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
-	if (UDF_SB(inode->i_sb)->s_lvid_bh) {
-		*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-			cpu_to_le32(lvid_get_unique_id(sb));
-	}
-	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		mark_inode_dirty(dir);
-	if (fibh.sbh != fibh.ebh)
-		brelse(fibh.ebh);
-	brelse(fibh.sbh);
-	d_instantiate(dentry, inode);
-	err = 0;
-
+	err = udf_add_nondir(dentry, inode);
 out:
 	kfree(name);
 	return err;
@@ -1037,6 +990,7 @@ out:
 out_no_entry:
 	up_write(&iinfo->i_data_sem);
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	goto out;
 }
@@ -1221,7 +1175,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
 	struct udf_fileident_bh fibh;
 
 	if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
-		goto out_unlock;
+		return ERR_PTR(-EACCES);
 
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
@@ -1229,12 +1183,10 @@ static struct dentry *udf_get_parent(struct dentry *child)
 
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
 	inode = udf_iget(child->d_inode->i_sb, &tloc);
-	if (!inode)
-		goto out_unlock;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	return d_obtain_alias(inode);
-out_unlock:
-	return ERR_PTR(-EACCES);
 }
 
 
@@ -1251,8 +1203,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 	loc.partitionReferenceNum = partref;
 	inode = udf_iget(sb, &loc);
 
-	if (inode == NULL)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	if (generation && inode->i_generation != generation) {
 		iput(inode);
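A recurring detail in these namei hunks: once udf_new_inode() hashes the inode with insert_inode_locked(), the inode stays in the I_NEW state until the caller publishes it, so every path, success and error alike, must call unlock_new_inode() before d_instantiate()/d_tmpfile() or iput(). A condensed sketch of that discipline, assuming an ERR_PTR-returning allocator and a hypothetical directory-link helper:

/* sketch: I_NEW lifecycle for a freshly hashed inode (names illustrative) */
static int example_add(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct inode *inode = udf_new_inode(dir, mode);
	int err;

	if (IS_ERR(inode))
		return PTR_ERR(inode);

	err = example_link_into_dir(dir, dentry, inode);	/* hypothetical */
	if (err) {
		inode_dec_link_count(inode);
		unlock_new_inode(inode);	/* clear I_NEW before dropping it */
		iput(inode);
		return err;
	}
	unlock_new_inode(inode);		/* and before exposing it */
	d_instantiate(dentry, inode);
	return 0;
}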
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 813da94d447b..e229315bbf7a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -959,14 +959,16 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
 	addr.logicalBlockNum = meta_file_loc;
 	addr.partitionReferenceNum = partition_num;
 
-	metadata_fe = udf_iget(sb, &addr);
+	metadata_fe = udf_iget_special(sb, &addr);
 
-	if (metadata_fe == NULL)
+	if (IS_ERR(metadata_fe)) {
 		udf_warn(sb, "metadata inode efe not found\n");
-	else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
+		return metadata_fe;
+	}
+	if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
 		udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
 		iput(metadata_fe);
-		metadata_fe = NULL;
+		return ERR_PTR(-EIO);
 	}
 
 	return metadata_fe;
@@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	struct udf_part_map *map;
 	struct udf_meta_data *mdata;
 	struct kernel_lb_addr addr;
+	struct inode *fe;
 
 	map = &sbi->s_partmaps[partition];
 	mdata = &map->s_type_specific.s_metadata;
@@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	udf_debug("Metadata file location: block = %d part = %d\n",
 		  mdata->s_meta_file_loc, map->s_partition_num);
 
-	mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
-			mdata->s_meta_file_loc, map->s_partition_num);
-
-	if (mdata->s_metadata_fe == NULL) {
+	fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
+					 map->s_partition_num);
+	if (IS_ERR(fe)) {
 		/* mirror file entry */
 		udf_debug("Mirror metadata file location: block = %d part = %d\n",
 			  mdata->s_mirror_file_loc, map->s_partition_num);
 
-		mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
-				mdata->s_mirror_file_loc, map->s_partition_num);
+		fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc,
+						 map->s_partition_num);
 
-		if (mdata->s_mirror_fe == NULL) {
+		if (IS_ERR(fe)) {
 			udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
-			return -EIO;
+			return PTR_ERR(fe);
 		}
-	}
+		mdata->s_mirror_fe = fe;
+	} else
+		mdata->s_metadata_fe = fe;
+
 
 	/*
 	 * bitmap file entry
@@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 		udf_debug("Bitmap file location: block = %d part = %d\n",
 			  addr.logicalBlockNum, addr.partitionReferenceNum);
 
-		mdata->s_bitmap_fe = udf_iget(sb, &addr);
-		if (mdata->s_bitmap_fe == NULL) {
+		fe = udf_iget_special(sb, &addr);
+		if (IS_ERR(fe)) {
 			if (sb->s_flags & MS_RDONLY)
 				udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
 			else {
 				udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
-				return -EIO;
+				return PTR_ERR(fe);
 			}
-		}
+		} else
+			mdata->s_bitmap_fe = fe;
 	}
 
 	udf_debug("udf_load_metadata_files Ok\n");
@@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 					phd->unallocSpaceTable.extPosition),
 			.partitionReferenceNum = p_index,
 		};
+		struct inode *inode;
 
-		map->s_uspace.s_table = udf_iget(sb, &loc);
-		if (!map->s_uspace.s_table) {
+		inode = udf_iget_special(sb, &loc);
+		if (IS_ERR(inode)) {
 			udf_debug("cannot load unallocSpaceTable (part %d)\n",
 				  p_index);
-			return -EIO;
+			return PTR_ERR(inode);
 		}
+		map->s_uspace.s_table = inode;
 		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
 		udf_debug("unallocSpaceTable (part %d) @ %ld\n",
 			  p_index, map->s_uspace.s_table->i_ino);
@@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 					phd->freedSpaceTable.extPosition),
 			.partitionReferenceNum = p_index,
 		};
+		struct inode *inode;
 
-		map->s_fspace.s_table = udf_iget(sb, &loc);
-		if (!map->s_fspace.s_table) {
+		inode = udf_iget_special(sb, &loc);
+		if (IS_ERR(inode)) {
 			udf_debug("cannot load freedSpaceTable (part %d)\n",
 				  p_index);
-			return -EIO;
+			return PTR_ERR(inode);
 		}
-
+		map->s_fspace.s_table = inode;
 		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
 		udf_debug("freedSpaceTable (part %d) @ %ld\n",
 			  p_index, map->s_fspace.s_table->i_ino);
@@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
 	struct udf_part_map *map = &sbi->s_partmaps[p_index];
 	sector_t vat_block;
 	struct kernel_lb_addr ino;
+	struct inode *inode;
 
 	/*
 	 * VAT file entry is in the last recorded block. Some broken disks have
@@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
 	ino.partitionReferenceNum = type1_index;
 	for (vat_block = start_block;
 	     vat_block >= map->s_partition_root &&
-	     vat_block >= start_block - 3 &&
-	     !sbi->s_vat_inode; vat_block--) {
+	     vat_block >= start_block - 3; vat_block--) {
 		ino.logicalBlockNum = vat_block - map->s_partition_root;
-		sbi->s_vat_inode = udf_iget(sb, &ino);
+		inode = udf_iget_special(sb, &ino);
+		if (!IS_ERR(inode)) {
+			sbi->s_vat_inode = inode;
+			break;
+		}
 	}
 }
 
@@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* assign inodes by physical block number */
 	/* perhaps it's not extensible enough, but for now ... */
 	inode = udf_iget(sb, &rootdir);
-	if (!inode) {
+	if (IS_ERR(inode)) {
 		udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
 			rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
-		ret = -EIO;
+		ret = PTR_ERR(inode);
 		goto error_out;
 	}
 
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index be7dabbbcb49..1cc3c993ebd0 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -138,12 +138,22 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 /* file.c */
 extern long udf_ioctl(struct file *, unsigned int, unsigned long);
 /* inode.c */
-extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
+extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *,
+				bool hidden_inode);
+static inline struct inode *udf_iget_special(struct super_block *sb,
+					     struct kernel_lb_addr *ino)
+{
+	return __udf_iget(sb, ino, true);
+}
+static inline struct inode *udf_iget(struct super_block *sb,
+				     struct kernel_lb_addr *ino)
+{
+	return __udf_iget(sb, ino, false);
+}
 extern int udf_expand_file_adinicb(struct inode *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
 extern int udf_setsize(struct inode *, loff_t);
-extern void udf_read_inode(struct inode *);
 extern void udf_evict_inode(struct inode *);
 extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
@@ -209,7 +219,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
 
 /* ialloc.c */
 extern void udf_free_inode(struct inode *);
-extern struct inode *udf_new_inode(struct inode *, umode_t, int *);
+extern struct inode *udf_new_inode(struct inode *, umode_t);
 
 /* truncate.c */
 extern void udf_truncate_tail_extent(struct inode *);
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 1f11483eba6a..77c331f1a770 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -81,8 +81,6 @@ static time_t year_seconds[MAX_YEAR_SECONDS] = {
 /*2038*/ SPY(68, 17, 0)
 };
 
-extern struct timezone sys_tz;
-
 #define SECS_PER_HOUR	(60 * 60)
 #define SECS_PER_DAY	(SECS_PER_HOUR * 24)
 
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a9cc75ffa925..7caa01652888 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -298,7 +298,10 @@ cg_found:
 	ufsi->i_oeftflag = 0;
 	ufsi->i_dir_start_lookup = 0;
 	memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
-	insert_inode_hash(inode);
+	if (insert_inode_locked(inode) < 0) {
+		err = -EIO;
+		goto failed;
+	}
 	mark_inode_dirty(inode);
 
 	if (uspi->fs_magic == UFS2_MAGIC) {
@@ -337,6 +340,7 @@ cg_found:
 fail_remove_inode:
 	unlock_ufs(sb);
 	clear_nlink(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	UFSD("EXIT (FAILED): err %d\n", err);
 	return ERR_PTR(err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 7c580c97990e..be7d42c7d938 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -902,9 +902,6 @@ void ufs_evict_inode(struct inode * inode)
 	invalidate_inode_buffers(inode);
 	clear_inode(inode);
 
-	if (want_delete) {
-		lock_ufs(inode->i_sb);
-		ufs_free_inode (inode);
-		unlock_ufs(inode->i_sb);
-	}
+	if (want_delete)
+		ufs_free_inode(inode);
 }
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 90d74b8f8eba..fd65deb4b5f0 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -38,10 +38,12 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
 {
 	int err = ufs_add_link(dentry, inode);
 	if (!err) {
+		unlock_new_inode(inode);
 		d_instantiate(dentry, inode);
 		return 0;
 	}
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	return err;
 }
@@ -126,12 +128,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 	if (l > sb->s_blocksize)
 		goto out_notlocked;
 
-	lock_ufs(dir->i_sb);
 	inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		goto out;
+		goto out_notlocked;
 
+	lock_ufs(dir->i_sb);
 	if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
 		/* slow symlink */
 		inode->i_op = &ufs_symlink_inode_operations;
@@ -155,6 +157,7 @@ out_notlocked:
 
 out_fail:
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	goto out;
 }
@@ -181,13 +184,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 	struct inode * inode;
 	int err;
 
-	lock_ufs(dir->i_sb);
-	inode_inc_link_count(dir);
-
 	inode = ufs_new_inode(dir, S_IFDIR|mode);
-	err = PTR_ERR(inode);
 	if (IS_ERR(inode))
-		goto out_dir;
+		return PTR_ERR(inode);
 
 	inode->i_op = &ufs_dir_inode_operations;
 	inode->i_fop = &ufs_dir_operations;
@@ -195,6 +194,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
 
 	inode_inc_link_count(inode);
 
+	lock_ufs(dir->i_sb);
+	inode_inc_link_count(dir);
+
 	err = ufs_make_empty(inode, dir);
 	if (err)
 		goto out_fail;
@@ -211,8 +213,8 @@ out:
 out_fail:
 	inode_dec_link_count(inode);
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput (inode);
-out_dir:
 	inode_dec_link_count(dir);
 	unlock_ufs(dir->i_sb);
 	goto out;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index de2d26d32844..86df952d3e24 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5424,7 +5424,7 @@ xfs_bmap_shift_extents(
 	struct xfs_bmap_free	*flist,
 	int			num_exts)
 {
-	struct xfs_btree_cur		*cur;
+	struct xfs_btree_cur		*cur = NULL;
 	struct xfs_bmbt_rec_host	*gotp;
 	struct xfs_bmbt_irec		got;
 	struct xfs_bmbt_irec		left;
@@ -5435,7 +5435,7 @@ xfs_bmap_shift_extents(
 	int				error = 0;
 	int				i;
 	int				whichfork = XFS_DATA_FORK;
-	int				logflags;
+	int				logflags = 0;
 	xfs_filblks_t			blockcount = 0;
 	int				total_extents;
 
@@ -5478,16 +5478,11 @@ xfs_bmap_shift_extents(
 		}
 	}
 
-	/* We are going to change core inode */
-	logflags = XFS_ILOG_CORE;
 	if (ifp->if_flags & XFS_IFBROOT) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
 		cur->bc_private.b.firstblock = *firstblock;
 		cur->bc_private.b.flist = flist;
 		cur->bc_private.b.flags = 0;
-	} else {
-		cur = NULL;
-		logflags |= XFS_ILOG_DEXT;
 	}
 
 	/*
@@ -5545,11 +5540,14 @@ xfs_bmap_shift_extents(
 			blockcount = left.br_blockcount +
 				got.br_blockcount;
 			xfs_iext_remove(ip, *current_ext, 1, 0);
+			logflags |= XFS_ILOG_CORE;
 			if (cur) {
 				error = xfs_btree_delete(cur, &i);
 				if (error)
 					goto del_cursor;
 				XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+			} else {
+				logflags |= XFS_ILOG_DEXT;
 			}
 			XFS_IFORK_NEXT_SET(ip, whichfork,
 				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
@@ -5575,6 +5573,7 @@ xfs_bmap_shift_extents(
 			got.br_startoff = startoff;
 		}
 
+		logflags |= XFS_ILOG_CORE;
 		if (cur) {
 			error = xfs_bmbt_update(cur, got.br_startoff,
 						got.br_startblock,
@@ -5582,6 +5581,8 @@ xfs_bmap_shift_extents(
 						got.br_state);
 			if (error)
 				goto del_cursor;
+		} else {
+			logflags |= XFS_ILOG_DEXT;
 		}
 
 		(*current_ext)++;
@@ -5597,6 +5598,7 @@ del_cursor:
 		xfs_btree_del_cursor(cur,
 			error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
 
-	xfs_trans_log_inode(tp, ip, logflags);
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
 	return error;
 }
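The xfs_bmap_shift_extents() hunks convert unconditional inode logging into lazy accumulation: logflags starts at 0, XFS_ILOG_CORE and XFS_ILOG_DEXT are OR-ed in only when the corresponding state is actually dirtied, and xfs_trans_log_inode() runs only if something was flagged, so paths that end up changing nothing no longer dirty the inode in the log. The shape of the pattern, reduced to a sketch (the function name and use_btree_cursor flag are illustrative, not XFS code):

/* sketch: accumulate log flags, log once at the end */
int shift_extents_sketch(struct xfs_trans *tp, struct xfs_inode *ip,
			 bool use_btree_cursor)
{
	int logflags = 0;
	int error = 0;

	/* ... for each modification actually performed ... */
	logflags |= XFS_ILOG_CORE;		/* core inode fields changed */
	if (!use_btree_cursor)
		logflags |= XFS_ILOG_DEXT;	/* in-core extent list changed */

	if (logflags)
		xfs_trans_log_inode(tp, ip, logflags);
	return error;
}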
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 11e9b4caa54f..b984647c24db 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1753,11 +1753,72 @@ xfs_vm_readpages(
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+	struct page		*page)
+{
+	struct address_space	*mapping = page->mapping;
+	struct inode		*inode = mapping->host;
+	loff_t			end_offset;
+	loff_t			offset;
+	int			newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	end_offset = i_size_read(inode);
+	offset = page_offset(page);
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			if (offset < end_offset)
+				set_buffer_dirty(bh);
+			bh = bh->b_this_page;
+			offset += 1 << inode->i_blkbits;
+		} while (bh != head);
+	}
+	newly_dirty = !TestSetPageDirty(page);
+	spin_unlock(&mapping->private_lock);
+
+	if (newly_dirty) {
+		/* sigh - __set_page_dirty() is static, so copy it here, too */
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (page->mapping) {	/* Race with truncate? */
+			WARN_ON_ONCE(!PageUptodate(page));
+			account_page_dirtied(page, mapping);
+			radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	}
+	return newly_dirty;
+}
+
 const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
+	.set_page_dirty		= xfs_vm_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 2f1e30d39a35..1707980f9a4b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1470,6 +1470,26 @@ xfs_collapse_file_space(
 	start_fsb = XFS_B_TO_FSB(mp, offset + len);
 	shift_fsb = XFS_B_TO_FSB(mp, len);
 
+	/*
+	 * Writeback the entire file and force remove any post-eof blocks. The
+	 * writeback prevents changes to the extent list via concurrent
+	 * writeback and the eofblocks trim prevents the extent shift algorithm
+	 * from running into a post-eof delalloc extent.
+	 *
+	 * XXX: This is a temporary fix until the extent shift loop below is
+	 * converted to use offsets and lookups within the ILOCK rather than
+	 * carrying around the index into the extent list for the next
+	 * iteration.
+	 */
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		return error;
+	if (xfs_can_free_eofblocks(ip, true)) {
+		error = xfs_free_eofblocks(mp, ip, false);
+		if (error)
+			return error;
+	}
+
 	error = xfs_free_file_space(ip, offset, len);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 076b1708d134..de5368c803f9 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -291,12 +291,22 @@ xfs_file_read_iter(
 		if (inode->i_mapping->nrpages) {
 			ret = filemap_write_and_wait_range(
 							VFS_I(ip)->i_mapping,
-							pos, -1);
+							pos, pos + size - 1);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
 			}
-			truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+			/*
+			 * Invalidate whole pages. This can return an error if
+			 * we fail to invalidate a page, but this should never
+			 * happen on XFS. Warn if it does fail.
+			 */
+			ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + size - 1) >> PAGE_CACHE_SHIFT);
+			WARN_ON_ONCE(ret);
+			ret = 0;
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
@@ -632,10 +642,19 @@ xfs_file_dio_aio_write(
 
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						   pos, -1);
+						   pos, pos + count - 1);
 		if (ret)
 			goto out;
-		truncate_pagecache_range(VFS_I(ip), pos, -1);
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 	}
 
 	/*