author      Dave Chinner <david@fromorbit.com>    2016-11-29 22:39:29 -0500
committer   Dave Chinner <david@fromorbit.com>    2016-11-29 22:39:29 -0500
commit      5f1c6d28cfcd11c9df67dad45992fd523727fe1e (patch)
tree        6d1e2ec57356653992937381cd3b5753b22f2ca4
parent      b7b26110edf88bad41b87e96a9f0148bed5e2ff8 (diff)
parent      acdda3aae146d9b69d30e9d8a32a8d8937055523 (diff)
Merge branch 'iomap-4.10-directio' into for-next
-rw-r--r--   block/bio.c                  49
-rw-r--r--   fs/direct-io.c                2
-rw-r--r--   fs/internal.h                 3
-rw-r--r--   fs/iomap.c                  373
-rw-r--r--   fs/xfs/xfs_aops.c           298
-rw-r--r--   fs/xfs/xfs_aops.h             6
-rw-r--r--   fs/xfs/xfs_bmap_util.c       12
-rw-r--r--   fs/xfs/xfs_dir2_readdir.c     2
-rw-r--r--   fs/xfs/xfs_file.c           226
-rw-r--r--   fs/xfs/xfs_icache.c           6
-rw-r--r--   fs/xfs/xfs_inode.c           82
-rw-r--r--   fs/xfs/xfs_inode.h            7
-rw-r--r--   fs/xfs/xfs_ioctl.c            2
-rw-r--r--   fs/xfs/xfs_iomap.c           50
-rw-r--r--   fs/xfs/xfs_iops.c            14
-rw-r--r--   fs/xfs/xfs_pnfs.c             7
-rw-r--r--   fs/xfs/xfs_pnfs.h             4
-rw-r--r--   fs/xfs/xfs_reflink.c         14
-rw-r--r--   fs/xfs/xfs_super.c            2
-rw-r--r--   fs/xfs/xfs_symlink.c          7
-rw-r--r--   include/linux/bio.h           1
-rw-r--r--   include/linux/iomap.h        11
-rw-r--r--   include/linux/lockdep.h      25
-rw-r--r--   kernel/locking/lockdep.c     20
24 files changed, 668 insertions, 555 deletions
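
This merge wires the new iomap-based direct I/O path into XFS in place of __blockdev_direct_IO(), adds bio_iov_iter_get_pages() to feed it, and moves the XFS IOLOCK onto the VFS i_rwsem. For orientation, here is a minimal sketch of how the read side ends up calling the new helper, modelled directly on the xfs_file_dio_aio_read() hunk further down; the wrapper name is illustrative, while xfs_ilock()/xfs_iunlock() and xfs_iomap_ops are the helpers this series itself uses:

static ssize_t
example_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	if (!iov_iter_count(to))
		return 0;		/* skip atime, as the XFS path does */

	file_accessed(iocb->ki_filp);

	/* IOLOCK (now i_rwsem) held shared across submission and completion */
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
	ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}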
diff --git a/block/bio.c b/block/bio.c
index db85c5753a76..2cf6ebabc68c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -847,6 +847,55 @@ done:
847} 847}
848EXPORT_SYMBOL(bio_add_page); 848EXPORT_SYMBOL(bio_add_page);
849 849
850/**
851 * bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio
852 * @bio: bio to add pages to
853 * @iter: iov iterator describing the region to be mapped
854 *
855 * Pins as many pages from *iter and appends them to @bio's bvec array. The
856 * pages will have to be released using put_page() when done.
857 */
858int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
859{
860 unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
861 struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
862 struct page **pages = (struct page **)bv;
863 size_t offset, diff;
864 ssize_t size;
865
866 size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
867 if (unlikely(size <= 0))
868 return size ? size : -EFAULT;
869 nr_pages = (size + offset + PAGE_SIZE - 1) / PAGE_SIZE;
870
871 /*
872 * Deep magic below: We need to walk the pinned pages backwards
873 * because we are abusing the space allocated for the bio_vecs
874 * for the page array. Because the bio_vecs are larger than the
875 * page pointers by definition this will always work. But it also
876 * means we can't use bio_add_page, so any changes to its semantics
877 * need to be reflected here as well.
878 */
879 bio->bi_iter.bi_size += size;
880 bio->bi_vcnt += nr_pages;
881
882 diff = (nr_pages * PAGE_SIZE - offset) - size;
883 while (nr_pages--) {
884 bv[nr_pages].bv_page = pages[nr_pages];
885 bv[nr_pages].bv_len = PAGE_SIZE;
886 bv[nr_pages].bv_offset = 0;
887 }
888
889 bv[0].bv_offset += offset;
890 bv[0].bv_len -= offset;
891 if (diff)
892 bv[bio->bi_vcnt - 1].bv_len -= diff;
893
894 iov_iter_advance(iter, size);
895 return 0;
896}
897EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
898
850struct submit_bio_ret { 899struct submit_bio_ret {
851 struct completion event; 900 struct completion event;
852 int error; 901 int error;
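
The kerneldoc above notes that pages pinned by bio_iov_iter_get_pages() must be dropped with put_page() once the I/O completes. A hedged sketch of the release side, mirroring the non-dirty branch of iomap_dio_bio_end_io() added in fs/iomap.c below (the function name is illustrative):

static void example_bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/* drop the page references taken by bio_iov_iter_get_pages() */
	bio_for_each_segment_all(bvec, bio, i)
		put_page(bvec->bv_page);
	bio_put(bio);
}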
diff --git a/fs/direct-io.c b/fs/direct-io.c
index fb9aa16a7727..19aa448fde6a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
554 * filesystems that don't need it and also allows us to create the workqueue 554 * filesystems that don't need it and also allows us to create the workqueue
555 * late enough so that we can include s_id in the name of the workqueue. 555 * late enough so that we can include s_id in the name of the workqueue.
556 */ 556 */
557static int sb_init_dio_done_wq(struct super_block *sb) 557int sb_init_dio_done_wq(struct super_block *sb)
558{ 558{
559 struct workqueue_struct *old; 559 struct workqueue_struct *old;
560 struct workqueue_struct *wq = alloc_workqueue("dio/%s", 560 struct workqueue_struct *wq = alloc_workqueue("dio/%s",
diff --git a/fs/internal.h b/fs/internal.h
index f4da3341b4a3..4fcf51766d4a 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
184loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, 184loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
185 unsigned flags, struct iomap_ops *ops, void *data, 185 unsigned flags, struct iomap_ops *ops, void *data,
186 iomap_actor_t actor); 186 iomap_actor_t actor);
187
188/* direct-io.c: */
189int sb_init_dio_done_wq(struct super_block *sb);
diff --git a/fs/iomap.c b/fs/iomap.c
index 13dd413b2b9c..fc2446242935 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -24,6 +24,7 @@
24#include <linux/uio.h> 24#include <linux/uio.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/task_io_accounting_ops.h>
27#include <linux/dax.h> 28#include <linux/dax.h>
28#include "internal.h" 29#include "internal.h"
29 30
@@ -584,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
584 return 0; 585 return 0;
585} 586}
586EXPORT_SYMBOL_GPL(iomap_fiemap); 587EXPORT_SYMBOL_GPL(iomap_fiemap);
588
589/*
590 * Private flags for iomap_dio, must not overlap with the public ones in
591 * iomap.h:
592 */
593#define IOMAP_DIO_WRITE (1 << 30)
594#define IOMAP_DIO_DIRTY (1 << 31)
595
596struct iomap_dio {
597 struct kiocb *iocb;
598 iomap_dio_end_io_t *end_io;
599 loff_t i_size;
600 loff_t size;
601 atomic_t ref;
602 unsigned flags;
603 int error;
604
605 union {
606 /* used during submission and for synchronous completion: */
607 struct {
608 struct iov_iter *iter;
609 struct task_struct *waiter;
610 struct request_queue *last_queue;
611 blk_qc_t cookie;
612 } submit;
613
614 /* used for aio completion: */
615 struct {
616 struct work_struct work;
617 } aio;
618 };
619};
620
621static ssize_t iomap_dio_complete(struct iomap_dio *dio)
622{
623 struct kiocb *iocb = dio->iocb;
624 ssize_t ret;
625
626 if (dio->end_io) {
627 ret = dio->end_io(iocb,
628 dio->error ? dio->error : dio->size,
629 dio->flags);
630 } else {
631 ret = dio->error;
632 }
633
634 if (likely(!ret)) {
635 ret = dio->size;
636 /* check for short read */
637 if (iocb->ki_pos + ret > dio->i_size &&
638 !(dio->flags & IOMAP_DIO_WRITE))
639 ret = dio->i_size - iocb->ki_pos;
640 iocb->ki_pos += ret;
641 }
642
643 inode_dio_end(file_inode(iocb->ki_filp));
644 kfree(dio);
645
646 return ret;
647}
648
649static void iomap_dio_complete_work(struct work_struct *work)
650{
651 struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
652 struct kiocb *iocb = dio->iocb;
653 bool is_write = (dio->flags & IOMAP_DIO_WRITE);
654 ssize_t ret;
655
656 ret = iomap_dio_complete(dio);
657 if (is_write && ret > 0)
658 ret = generic_write_sync(iocb, ret);
659 iocb->ki_complete(iocb, ret, 0);
660}
661
662/*
663 * Set an error in the dio if none is set yet. We have to use cmpxchg
664 * as the submission context and the completion context(s) can race to
665 * update the error.
666 */
667static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
668{
669 cmpxchg(&dio->error, 0, ret);
670}
671
672static void iomap_dio_bio_end_io(struct bio *bio)
673{
674 struct iomap_dio *dio = bio->bi_private;
675 bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
676
677 if (bio->bi_error)
678 iomap_dio_set_error(dio, bio->bi_error);
679
680 if (atomic_dec_and_test(&dio->ref)) {
681 if (is_sync_kiocb(dio->iocb)) {
682 struct task_struct *waiter = dio->submit.waiter;
683
684 WRITE_ONCE(dio->submit.waiter, NULL);
685 wake_up_process(waiter);
686 } else if (dio->flags & IOMAP_DIO_WRITE) {
687 struct inode *inode = file_inode(dio->iocb->ki_filp);
688
689 INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
690 queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
691 } else {
692 iomap_dio_complete_work(&dio->aio.work);
693 }
694 }
695
696 if (should_dirty) {
697 bio_check_pages_dirty(bio);
698 } else {
699 struct bio_vec *bvec;
700 int i;
701
702 bio_for_each_segment_all(bvec, bio, i)
703 put_page(bvec->bv_page);
704 bio_put(bio);
705 }
706}
707
708static blk_qc_t
709iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
710 unsigned len)
711{
712 struct page *page = ZERO_PAGE(0);
713 struct bio *bio;
714
715 bio = bio_alloc(GFP_KERNEL, 1);
716 bio->bi_bdev = iomap->bdev;
717 bio->bi_iter.bi_sector =
718 iomap->blkno + ((pos - iomap->offset) >> 9);
719 bio->bi_private = dio;
720 bio->bi_end_io = iomap_dio_bio_end_io;
721
722 get_page(page);
723 if (bio_add_page(bio, page, len, 0) != len)
724 BUG();
725 bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);
726
727 atomic_inc(&dio->ref);
728 return submit_bio(bio);
729}
730
731static loff_t
732iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
733 void *data, struct iomap *iomap)
734{
735 struct iomap_dio *dio = data;
736 unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
737 unsigned fs_block_size = (1 << inode->i_blkbits), pad;
738 unsigned align = iov_iter_alignment(dio->submit.iter);
739 struct iov_iter iter;
740 struct bio *bio;
741 bool need_zeroout = false;
742 int nr_pages, ret;
743
744 if ((pos | length | align) & ((1 << blkbits) - 1))
745 return -EINVAL;
746
747 switch (iomap->type) {
748 case IOMAP_HOLE:
749 if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
750 return -EIO;
751 /*FALLTHRU*/
752 case IOMAP_UNWRITTEN:
753 if (!(dio->flags & IOMAP_DIO_WRITE)) {
754 iov_iter_zero(length, dio->submit.iter);
755 dio->size += length;
756 return length;
757 }
758 dio->flags |= IOMAP_DIO_UNWRITTEN;
759 need_zeroout = true;
760 break;
761 case IOMAP_MAPPED:
762 if (iomap->flags & IOMAP_F_SHARED)
763 dio->flags |= IOMAP_DIO_COW;
764 if (iomap->flags & IOMAP_F_NEW)
765 need_zeroout = true;
766 break;
767 default:
768 WARN_ON_ONCE(1);
769 return -EIO;
770 }
771
772 /*
773 * Operate on a partial iter trimmed to the extent we were called for.
774 * We'll update the iter in the dio once we're done with this extent.
775 */
776 iter = *dio->submit.iter;
777 iov_iter_truncate(&iter, length);
778
779 nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
780 if (nr_pages <= 0)
781 return nr_pages;
782
783 if (need_zeroout) {
784 /* zero out from the start of the block to the write offset */
785 pad = pos & (fs_block_size - 1);
786 if (pad)
787 iomap_dio_zero(dio, iomap, pos - pad, pad);
788 }
789
790 do {
791 if (dio->error)
792 return 0;
793
794 bio = bio_alloc(GFP_KERNEL, nr_pages);
795 bio->bi_bdev = iomap->bdev;
796 bio->bi_iter.bi_sector =
797 iomap->blkno + ((pos - iomap->offset) >> 9);
798 bio->bi_private = dio;
799 bio->bi_end_io = iomap_dio_bio_end_io;
800
801 ret = bio_iov_iter_get_pages(bio, &iter);
802 if (unlikely(ret)) {
803 bio_put(bio);
804 return ret;
805 }
806
807 if (dio->flags & IOMAP_DIO_WRITE) {
808 bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_ODIRECT);
809 task_io_account_write(bio->bi_iter.bi_size);
810 } else {
811 bio_set_op_attrs(bio, REQ_OP_READ, 0);
812 if (dio->flags & IOMAP_DIO_DIRTY)
813 bio_set_pages_dirty(bio);
814 }
815
816 dio->size += bio->bi_iter.bi_size;
817 pos += bio->bi_iter.bi_size;
818
819 nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
820
821 atomic_inc(&dio->ref);
822
823 dio->submit.last_queue = bdev_get_queue(iomap->bdev);
824 dio->submit.cookie = submit_bio(bio);
825 } while (nr_pages);
826
827 if (need_zeroout) {
828 /* zero out from the end of the write to the end of the block */
829 pad = pos & (fs_block_size - 1);
830 if (pad)
831 iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
832 }
833
834 iov_iter_advance(dio->submit.iter, length);
835 return length;
836}
837
838ssize_t
839iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
840 iomap_dio_end_io_t end_io)
841{
842 struct address_space *mapping = iocb->ki_filp->f_mapping;
843 struct inode *inode = file_inode(iocb->ki_filp);
844 size_t count = iov_iter_count(iter);
845 loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
846 unsigned int flags = IOMAP_DIRECT;
847 struct blk_plug plug;
848 struct iomap_dio *dio;
849
850 lockdep_assert_held(&inode->i_rwsem);
851
852 if (!count)
853 return 0;
854
855 dio = kmalloc(sizeof(*dio), GFP_KERNEL);
856 if (!dio)
857 return -ENOMEM;
858
859 dio->iocb = iocb;
860 atomic_set(&dio->ref, 1);
861 dio->size = 0;
862 dio->i_size = i_size_read(inode);
863 dio->end_io = end_io;
864 dio->error = 0;
865 dio->flags = 0;
866
867 dio->submit.iter = iter;
868 if (is_sync_kiocb(iocb)) {
869 dio->submit.waiter = current;
870 dio->submit.cookie = BLK_QC_T_NONE;
871 dio->submit.last_queue = NULL;
872 }
873
874 if (iov_iter_rw(iter) == READ) {
875 if (pos >= dio->i_size)
876 goto out_free_dio;
877
878 if (iter->type == ITER_IOVEC)
879 dio->flags |= IOMAP_DIO_DIRTY;
880 } else {
881 dio->flags |= IOMAP_DIO_WRITE;
882 flags |= IOMAP_WRITE;
883 }
884
885 if (mapping->nrpages) {
886 ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
887 if (ret)
888 goto out_free_dio;
889
890 ret = invalidate_inode_pages2_range(mapping,
891 iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
892 WARN_ON_ONCE(ret);
893 ret = 0;
894 }
895
896 inode_dio_begin(inode);
897
898 blk_start_plug(&plug);
899 do {
900 ret = iomap_apply(inode, pos, count, flags, ops, dio,
901 iomap_dio_actor);
902 if (ret <= 0) {
903 /* magic error code to fall back to buffered I/O */
904 if (ret == -ENOTBLK)
905 ret = 0;
906 break;
907 }
908 pos += ret;
909 } while ((count = iov_iter_count(iter)) > 0);
910 blk_finish_plug(&plug);
911
912 if (ret < 0)
913 iomap_dio_set_error(dio, ret);
914
915 if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
916 !inode->i_sb->s_dio_done_wq) {
917 ret = sb_init_dio_done_wq(inode->i_sb);
918 if (ret < 0)
919 iomap_dio_set_error(dio, ret);
920 }
921
922 if (!atomic_dec_and_test(&dio->ref)) {
923 if (!is_sync_kiocb(iocb))
924 return -EIOCBQUEUED;
925
926 for (;;) {
927 set_current_state(TASK_UNINTERRUPTIBLE);
928 if (!READ_ONCE(dio->submit.waiter))
929 break;
930
931 if (!(iocb->ki_flags & IOCB_HIPRI) ||
932 !dio->submit.last_queue ||
933 !blk_poll(dio->submit.last_queue,
934 dio->submit.cookie))
935 io_schedule();
936 }
937 __set_current_state(TASK_RUNNING);
938 }
939
940 /*
941 * Try again to invalidate clean pages which might have been cached by
942 * non-direct readahead, or faulted in by get_user_pages() if the source
943 * of the write was an mmap'ed region of the file we're writing. Either
944 * one is a pretty crazy thing to do, so we don't support it 100%. If
945 * this invalidation fails, tough, the write still worked...
946 */
947 if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
948 ret = invalidate_inode_pages2_range(mapping,
949 iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
950 WARN_ON_ONCE(ret);
951 }
952
953 return iomap_dio_complete(dio);
954
955out_free_dio:
956 kfree(dio);
957 return ret;
958}
959EXPORT_SYMBOL_GPL(iomap_dio_rw);
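
Callers that need post-I/O processing hand iomap_dio_rw() an iomap_dio_end_io_t callback; it is invoked with the iocb, the completed byte count (or a negative error), and the dio flags, where IOMAP_DIO_UNWRITTEN and IOMAP_DIO_COW report what extent work remains. A trimmed sketch of the shape such a callback takes, following the xfs_dio_write_end_io() addition in fs/xfs/xfs_file.c below (the body is reduced to comments; the real conversion calls are XFS-specific):

static int example_dio_end_io(struct kiocb *iocb, ssize_t size, unsigned flags)
{
	if (size <= 0)
		return size;	/* nothing completed, or an error to pass on */

	if (flags & IOMAP_DIO_COW) {
		/* fold the copy-on-write allocation back into the data fork */
	}
	if (flags & IOMAP_DIO_UNWRITTEN) {
		/* convert unwritten extents now that the data is on disk */
	}
	return 0;
}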
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index ab266d66124d..265000a09327 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -37,11 +37,6 @@
37#include <linux/pagevec.h> 37#include <linux/pagevec.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39 39
40/* flags for direct write completions */
41#define XFS_DIO_FLAG_UNWRITTEN (1 << 0)
42#define XFS_DIO_FLAG_APPEND (1 << 1)
43#define XFS_DIO_FLAG_COW (1 << 2)
44
45/* 40/*
46 * structure owned by writepages passed to individual writepage calls 41 * structure owned by writepages passed to individual writepage calls
47 */ 42 */
@@ -1176,45 +1171,6 @@ xfs_vm_releasepage(
1176} 1171}
1177 1172
1178/* 1173/*
1179 * When we map a DIO buffer, we may need to pass flags to
1180 * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
1181 *
1182 * Note that for DIO, an IO to the highest supported file block offset (i.e.
1183 * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
1184 * bit variable. Hence if we see this overflow, we have to assume that the IO is
1185 * extending the file size. We won't know for sure until IO completion is run
1186 * and the actual max write offset is communicated to the IO completion
1187 * routine.
1188 */
1189static void
1190xfs_map_direct(
1191 struct inode *inode,
1192 struct buffer_head *bh_result,
1193 struct xfs_bmbt_irec *imap,
1194 xfs_off_t offset,
1195 bool is_cow)
1196{
1197 uintptr_t *flags = (uintptr_t *)&bh_result->b_private;
1198 xfs_off_t size = bh_result->b_size;
1199
1200 trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
1201 ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
1202 XFS_IO_OVERWRITE, imap);
1203
1204 if (ISUNWRITTEN(imap)) {
1205 *flags |= XFS_DIO_FLAG_UNWRITTEN;
1206 set_buffer_defer_completion(bh_result);
1207 } else if (is_cow) {
1208 *flags |= XFS_DIO_FLAG_COW;
1209 set_buffer_defer_completion(bh_result);
1210 }
1211 if (offset + size > i_size_read(inode) || offset + size < 0) {
1212 *flags |= XFS_DIO_FLAG_APPEND;
1213 set_buffer_defer_completion(bh_result);
1214 }
1215}
1216
1217/*
1218 * If this is O_DIRECT or the mpage code calling tell them how large the mapping 1174 * If this is O_DIRECT or the mpage code calling tell them how large the mapping
1219 * is, so that we can avoid repeated get_blocks calls. 1175 * is, so that we can avoid repeated get_blocks calls.
1220 * 1176 *
@@ -1254,51 +1210,12 @@ xfs_map_trim_size(
1254 bh_result->b_size = mapping_size; 1210 bh_result->b_size = mapping_size;
1255} 1211}
1256 1212
1257/* Bounce unaligned directio writes to the page cache. */
1258static int 1213static int
1259xfs_bounce_unaligned_dio_write( 1214xfs_get_blocks(
1260 struct xfs_inode *ip,
1261 xfs_fileoff_t offset_fsb,
1262 struct xfs_bmbt_irec *imap)
1263{
1264 struct xfs_bmbt_irec irec;
1265 xfs_fileoff_t delta;
1266 bool shared;
1267 bool x;
1268 int error;
1269
1270 irec = *imap;
1271 if (offset_fsb > irec.br_startoff) {
1272 delta = offset_fsb - irec.br_startoff;
1273 irec.br_blockcount -= delta;
1274 irec.br_startblock += delta;
1275 irec.br_startoff = offset_fsb;
1276 }
1277 error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
1278 if (error)
1279 return error;
1280
1281 /*
1282 * We're here because we're trying to do a directio write to a
1283 * region that isn't aligned to a filesystem block. If any part
1284 * of the extent is shared, fall back to buffered mode to handle
1285 * the RMW. This is done by returning -EREMCHG ("remote addr
1286 * changed"), which is caught further up the call stack.
1287 */
1288 if (shared) {
1289 trace_xfs_reflink_bounce_dio_write(ip, imap);
1290 return -EREMCHG;
1291 }
1292 return 0;
1293}
1294
1295STATIC int
1296__xfs_get_blocks(
1297 struct inode *inode, 1215 struct inode *inode,
1298 sector_t iblock, 1216 sector_t iblock,
1299 struct buffer_head *bh_result, 1217 struct buffer_head *bh_result,
1300 int create, 1218 int create)
1301 bool direct)
1302{ 1219{
1303 struct xfs_inode *ip = XFS_I(inode); 1220 struct xfs_inode *ip = XFS_I(inode);
1304 struct xfs_mount *mp = ip->i_mount; 1221 struct xfs_mount *mp = ip->i_mount;
@@ -1309,10 +1226,8 @@ __xfs_get_blocks(
1309 int nimaps = 1; 1226 int nimaps = 1;
1310 xfs_off_t offset; 1227 xfs_off_t offset;
1311 ssize_t size; 1228 ssize_t size;
1312 int new = 0;
1313 bool is_cow = false;
1314 1229
1315 BUG_ON(create && !direct); 1230 BUG_ON(create);
1316 1231
1317 if (XFS_FORCED_SHUTDOWN(mp)) 1232 if (XFS_FORCED_SHUTDOWN(mp))
1318 return -EIO; 1233 return -EIO;
@@ -1321,7 +1236,7 @@ __xfs_get_blocks(
1321 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1236 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1322 size = bh_result->b_size; 1237 size = bh_result->b_size;
1323 1238
1324 if (!create && offset >= i_size_read(inode)) 1239 if (offset >= i_size_read(inode))
1325 return 0; 1240 return 0;
1326 1241
1327 /* 1242 /*
@@ -1336,73 +1251,12 @@ __xfs_get_blocks(
1336 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); 1251 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1337 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1252 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1338 1253
1339 if (create && direct && xfs_is_reflink_inode(ip)) { 1254 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1340 is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); 1255 &imap, &nimaps, XFS_BMAPI_ENTIRE);
1341 ASSERT(!is_cow || !isnullstartblock(imap.br_startblock));
1342 }
1343
1344 if (!is_cow) {
1345 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1346 &imap, &nimaps, XFS_BMAPI_ENTIRE);
1347 /*
1348 * Truncate an overwrite extent if there's a pending CoW
1349 * reservation before the end of this extent. This
1350 * forces us to come back to get_blocks to take care of
1351 * the CoW.
1352 */
1353 if (create && direct && nimaps &&
1354 imap.br_startblock != HOLESTARTBLOCK &&
1355 imap.br_startblock != DELAYSTARTBLOCK &&
1356 !ISUNWRITTEN(&imap))
1357 xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
1358 &imap);
1359 }
1360 if (error) 1256 if (error)
1361 goto out_unlock; 1257 goto out_unlock;
1362 1258
1363 /* 1259 if (nimaps) {
1364 * The only time we can ever safely find delalloc blocks on direct I/O
1365 * is a dio write to post-eof speculative preallocation. All other
1366 * scenarios are indicative of a problem or misuse (such as mixing
1367 * direct and mapped I/O).
1368 *
1369 * The file may be unmapped by the time we get here so we cannot
1370 * reliably fail the I/O based on mapping. Instead, fail the I/O if this
1371 * is a read or a write within eof. Otherwise, carry on but warn as a
1372 * precaution if the file happens to be mapped.
1373 */
1374 if (direct && imap.br_startblock == DELAYSTARTBLOCK) {
1375 if (!create || offset < i_size_read(VFS_I(ip))) {
1376 WARN_ON_ONCE(1);
1377 error = -EIO;
1378 goto out_unlock;
1379 }
1380 WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping));
1381 }
1382
1383 /* for DAX, we convert unwritten extents directly */
1384 if (create &&
1385 (!nimaps ||
1386 (imap.br_startblock == HOLESTARTBLOCK ||
1387 imap.br_startblock == DELAYSTARTBLOCK) ||
1388 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1389 /*
1390 * xfs_iomap_write_direct() expects the shared lock. It
1391 * is unlocked on return.
1392 */
1393 if (lockmode == XFS_ILOCK_EXCL)
1394 xfs_ilock_demote(ip, lockmode);
1395
1396 error = xfs_iomap_write_direct(ip, offset, size,
1397 &imap, nimaps);
1398 if (error)
1399 return error;
1400 new = 1;
1401
1402 trace_xfs_get_blocks_alloc(ip, offset, size,
1403 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1404 : XFS_IO_DELALLOC, &imap);
1405 } else if (nimaps) {
1406 trace_xfs_get_blocks_found(ip, offset, size, 1260 trace_xfs_get_blocks_found(ip, offset, size,
1407 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1261 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1408 : XFS_IO_OVERWRITE, &imap); 1262 : XFS_IO_OVERWRITE, &imap);
@@ -1412,12 +1266,6 @@ __xfs_get_blocks(
1412 goto out_unlock; 1266 goto out_unlock;
1413 } 1267 }
1414 1268
1415 if (IS_DAX(inode) && create) {
1416 ASSERT(!ISUNWRITTEN(&imap));
1417 /* zeroing is not needed at a higher layer */
1418 new = 0;
1419 }
1420
1421 /* trim mapping down to size requested */ 1269 /* trim mapping down to size requested */
1422 xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); 1270 xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
1423 1271
@@ -1427,43 +1275,14 @@ __xfs_get_blocks(
1427 */ 1275 */
1428 if (imap.br_startblock != HOLESTARTBLOCK && 1276 if (imap.br_startblock != HOLESTARTBLOCK &&
1429 imap.br_startblock != DELAYSTARTBLOCK && 1277 imap.br_startblock != DELAYSTARTBLOCK &&
1430 (create || !ISUNWRITTEN(&imap))) { 1278 !ISUNWRITTEN(&imap))
1431 if (create && direct && !is_cow) {
1432 error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
1433 &imap);
1434 if (error)
1435 return error;
1436 }
1437
1438 xfs_map_buffer(inode, bh_result, &imap, offset); 1279 xfs_map_buffer(inode, bh_result, &imap, offset);
1439 if (ISUNWRITTEN(&imap))
1440 set_buffer_unwritten(bh_result);
1441 /* direct IO needs special help */
1442 if (create)
1443 xfs_map_direct(inode, bh_result, &imap, offset, is_cow);
1444 }
1445 1280
1446 /* 1281 /*
1447 * If this is a realtime file, data may be on a different device. 1282 * If this is a realtime file, data may be on a different device.
1448 * to that pointed to from the buffer_head b_bdev currently. 1283 * to that pointed to from the buffer_head b_bdev currently.
1449 */ 1284 */
1450 bh_result->b_bdev = xfs_find_bdev_for_inode(inode); 1285 bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1451
1452 /*
1453 * If we previously allocated a block out beyond eof and we are now
1454 * coming back to use it then we will need to flag it as new even if it
1455 * has a disk address.
1456 *
1457 * With sub-block writes into unwritten extents we also need to mark
1458 * the buffer as new so that the unwritten parts of the buffer gets
1459 * correctly zeroed.
1460 */
1461 if (create &&
1462 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1463 (offset >= i_size_read(inode)) ||
1464 (new || ISUNWRITTEN(&imap))))
1465 set_buffer_new(bh_result);
1466
1467 return 0; 1286 return 0;
1468 1287
1469out_unlock: 1288out_unlock:
@@ -1471,100 +1290,6 @@ out_unlock:
1471 return error; 1290 return error;
1472} 1291}
1473 1292
1474int
1475xfs_get_blocks(
1476 struct inode *inode,
1477 sector_t iblock,
1478 struct buffer_head *bh_result,
1479 int create)
1480{
1481 return __xfs_get_blocks(inode, iblock, bh_result, create, false);
1482}
1483
1484int
1485xfs_get_blocks_direct(
1486 struct inode *inode,
1487 sector_t iblock,
1488 struct buffer_head *bh_result,
1489 int create)
1490{
1491 return __xfs_get_blocks(inode, iblock, bh_result, create, true);
1492}
1493
1494/*
1495 * Complete a direct I/O write request.
1496 *
1497 * xfs_map_direct passes us some flags in the private data to tell us what to
1498 * do. If no flags are set, then the write IO is an overwrite wholly within
1499 * the existing allocated file size and so there is nothing for us to do.
1500 *
1501 * Note that in this case the completion can be called in interrupt context,
1502 * whereas if we have flags set we will always be called in task context
1503 * (i.e. from a workqueue).
1504 */
1505int
1506xfs_end_io_direct_write(
1507 struct kiocb *iocb,
1508 loff_t offset,
1509 ssize_t size,
1510 void *private)
1511{
1512 struct inode *inode = file_inode(iocb->ki_filp);
1513 struct xfs_inode *ip = XFS_I(inode);
1514 uintptr_t flags = (uintptr_t)private;
1515 int error = 0;
1516
1517 trace_xfs_end_io_direct_write(ip, offset, size);
1518
1519 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1520 return -EIO;
1521
1522 if (size <= 0)
1523 return size;
1524
1525 /*
1526 * The flags tell us whether we are doing unwritten extent conversions
1527 * or an append transaction that updates the on-disk file size. These
1528 * cases are the only cases where we should *potentially* be needing
1529 * to update the VFS inode size.
1530 */
1531 if (flags == 0) {
1532 ASSERT(offset + size <= i_size_read(inode));
1533 return 0;
1534 }
1535
1536 /*
1537 * We need to update the in-core inode size here so that we don't end up
1538 * with the on-disk inode size being outside the in-core inode size. We
1539 * have no other method of updating EOF for AIO, so always do it here
1540 * if necessary.
1541 *
1542 * We need to lock the test/set EOF update as we can be racing with
1543 * other IO completions here to update the EOF. Failing to serialise
1544 * here can result in EOF moving backwards and Bad Things Happen when
1545 * that occurs.
1546 */
1547 spin_lock(&ip->i_flags_lock);
1548 if (offset + size > i_size_read(inode))
1549 i_size_write(inode, offset + size);
1550 spin_unlock(&ip->i_flags_lock);
1551
1552 if (flags & XFS_DIO_FLAG_COW)
1553 error = xfs_reflink_end_cow(ip, offset, size);
1554 if (flags & XFS_DIO_FLAG_UNWRITTEN) {
1555 trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
1556
1557 error = xfs_iomap_write_unwritten(ip, offset, size);
1558 }
1559 if (flags & XFS_DIO_FLAG_APPEND) {
1560 trace_xfs_end_io_direct_write_append(ip, offset, size);
1561
1562 error = xfs_setfilesize(ip, offset, size);
1563 }
1564
1565 return error;
1566}
1567
1568STATIC ssize_t 1293STATIC ssize_t
1569xfs_vm_direct_IO( 1294xfs_vm_direct_IO(
1570 struct kiocb *iocb, 1295 struct kiocb *iocb,
@@ -1585,7 +1310,6 @@ xfs_vm_bmap(
1585 struct xfs_inode *ip = XFS_I(inode); 1310 struct xfs_inode *ip = XFS_I(inode);
1586 1311
1587 trace_xfs_vm_bmap(XFS_I(inode)); 1312 trace_xfs_vm_bmap(XFS_I(inode));
1588 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1589 1313
1590 /* 1314 /*
1591 * The swap code (ab-)uses ->bmap to get a block mapping and then 1315 * The swap code (ab-)uses ->bmap to get a block mapping and then
@@ -1593,12 +1317,10 @@ xfs_vm_bmap(
1593 * that on reflink inodes, so we have to skip out here. And yes, 1317 * that on reflink inodes, so we have to skip out here. And yes,
1594 * 0 is the magic code for a bmap error.. 1318 * 0 is the magic code for a bmap error..
1595 */ 1319 */
1596 if (xfs_is_reflink_inode(ip)) { 1320 if (xfs_is_reflink_inode(ip))
1597 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1598 return 0; 1321 return 0;
1599 } 1322
1600 filemap_write_and_wait(mapping); 1323 filemap_write_and_wait(mapping);
1601 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1602 return generic_block_bmap(mapping, block, xfs_get_blocks); 1324 return generic_block_bmap(mapping, block, xfs_get_blocks);
1603} 1325}
1604 1326
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 34dc00dfb91d..cc174ec6c2fd 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -55,12 +55,6 @@ struct xfs_ioend {
55 55
56extern const struct address_space_operations xfs_address_space_operations; 56extern const struct address_space_operations xfs_address_space_operations;
57 57
58int xfs_get_blocks(struct inode *inode, sector_t offset,
59 struct buffer_head *map_bh, int create);
60int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
61 struct buffer_head *map_bh, int create);
62int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
63 ssize_t size, void *private);
64int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); 58int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
65 59
66extern void xfs_count_page_state(struct page *, int *, int *); 60extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0670a8bd5818..b9abce524c33 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1935,8 +1935,8 @@ xfs_swap_extents(
1935 * page cache safely. Once we have done this we can take the ilocks and 1935 * page cache safely. Once we have done this we can take the ilocks and
1936 * do the rest of the checks. 1936 * do the rest of the checks.
1937 */ 1937 */
1938 lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 1938 lock_two_nondirectories(VFS_I(ip), VFS_I(tip));
1939 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 1939 lock_flags = XFS_MMAPLOCK_EXCL;
1940 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); 1940 xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1941 1941
1942 /* Verify that both files have the same format */ 1942 /* Verify that both files have the same format */
@@ -2076,15 +2076,13 @@ xfs_swap_extents(
2076 trace_xfs_swap_extent_after(ip, 0); 2076 trace_xfs_swap_extent_after(ip, 0);
2077 trace_xfs_swap_extent_after(tip, 1); 2077 trace_xfs_swap_extent_after(tip, 1);
2078 2078
2079out_unlock:
2079 xfs_iunlock(ip, lock_flags); 2080 xfs_iunlock(ip, lock_flags);
2080 xfs_iunlock(tip, lock_flags); 2081 xfs_iunlock(tip, lock_flags);
2082 unlock_two_nondirectories(VFS_I(ip), VFS_I(tip));
2081 return error; 2083 return error;
2082 2084
2083out_trans_cancel: 2085out_trans_cancel:
2084 xfs_trans_cancel(tp); 2086 xfs_trans_cancel(tp);
2085 2087 goto out_unlock;
2086out_unlock:
2087 xfs_iunlock(ip, lock_flags);
2088 xfs_iunlock(tip, lock_flags);
2089 return error;
2090} 2088}
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 29816981b50a..003a99b83bd8 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -677,7 +677,6 @@ xfs_readdir(
677 args.dp = dp; 677 args.dp = dp;
678 args.geo = dp->i_mount->m_dir_geo; 678 args.geo = dp->i_mount->m_dir_geo;
679 679
680 xfs_ilock(dp, XFS_IOLOCK_SHARED);
681 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 680 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
682 rval = xfs_dir2_sf_getdents(&args, ctx); 681 rval = xfs_dir2_sf_getdents(&args, ctx);
683 else if ((rval = xfs_dir2_isblock(&args, &v))) 682 else if ((rval = xfs_dir2_isblock(&args, &v)))
@@ -686,7 +685,6 @@ xfs_readdir(
686 rval = xfs_dir2_block_getdents(&args, ctx); 685 rval = xfs_dir2_block_getdents(&args, ctx);
687 else 686 else
688 rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); 687 rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
689 xfs_iunlock(dp, XFS_IOLOCK_SHARED);
690 688
691 return rval; 689 return rval;
692} 690}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d818c160451f..f5effa68e037 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -48,40 +48,6 @@
48static const struct vm_operations_struct xfs_file_vm_ops; 48static const struct vm_operations_struct xfs_file_vm_ops;
49 49
50/* 50/*
51 * Locking primitives for read and write IO paths to ensure we consistently use
52 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
53 */
54static inline void
55xfs_rw_ilock(
56 struct xfs_inode *ip,
57 int type)
58{
59 if (type & XFS_IOLOCK_EXCL)
60 inode_lock(VFS_I(ip));
61 xfs_ilock(ip, type);
62}
63
64static inline void
65xfs_rw_iunlock(
66 struct xfs_inode *ip,
67 int type)
68{
69 xfs_iunlock(ip, type);
70 if (type & XFS_IOLOCK_EXCL)
71 inode_unlock(VFS_I(ip));
72}
73
74static inline void
75xfs_rw_ilock_demote(
76 struct xfs_inode *ip,
77 int type)
78{
79 xfs_ilock_demote(ip, type);
80 if (type & XFS_IOLOCK_EXCL)
81 inode_unlock(VFS_I(ip));
82}
83
84/*
85 * Clear the specified ranges to zero through either the pagecache or DAX. 51 * Clear the specified ranges to zero through either the pagecache or DAX.
86 * Holes and unwritten extents will be left as-is as they already are zeroed. 52 * Holes and unwritten extents will be left as-is as they already are zeroed.
87 */ 53 */
@@ -244,62 +210,21 @@ xfs_file_dio_aio_read(
244 struct kiocb *iocb, 210 struct kiocb *iocb,
245 struct iov_iter *to) 211 struct iov_iter *to)
246{ 212{
247 struct address_space *mapping = iocb->ki_filp->f_mapping; 213 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
248 struct inode *inode = mapping->host;
249 struct xfs_inode *ip = XFS_I(inode);
250 loff_t isize = i_size_read(inode);
251 size_t count = iov_iter_count(to); 214 size_t count = iov_iter_count(to);
252 loff_t end = iocb->ki_pos + count - 1; 215 ssize_t ret;
253 struct iov_iter data;
254 struct xfs_buftarg *target;
255 ssize_t ret = 0;
256 216
257 trace_xfs_file_direct_read(ip, count, iocb->ki_pos); 217 trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
258 218
259 if (!count) 219 if (!count)
260 return 0; /* skip atime */ 220 return 0; /* skip atime */
261 221
262 if (XFS_IS_REALTIME_INODE(ip))
263 target = ip->i_mount->m_rtdev_targp;
264 else
265 target = ip->i_mount->m_ddev_targp;
266
267 /* DIO must be aligned to device logical sector size */
268 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
269 if (iocb->ki_pos == isize)
270 return 0;
271 return -EINVAL;
272 }
273
274 file_accessed(iocb->ki_filp); 222 file_accessed(iocb->ki_filp);
275 223
276 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 224 xfs_ilock(ip, XFS_IOLOCK_SHARED);
277 if (mapping->nrpages) { 225 ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
278 ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); 226 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
279 if (ret)
280 goto out_unlock;
281
282 /*
283 * Invalidate whole pages. This can return an error if we fail
284 * to invalidate a page, but this should never happen on XFS.
285 * Warn if it does fail.
286 */
287 ret = invalidate_inode_pages2_range(mapping,
288 iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
289 WARN_ON_ONCE(ret);
290 ret = 0;
291 }
292
293 data = *to;
294 ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
295 xfs_get_blocks_direct, NULL, NULL, 0);
296 if (ret >= 0) {
297 iocb->ki_pos += ret;
298 iov_iter_advance(to, ret);
299 }
300 227
301out_unlock:
302 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
303 return ret; 228 return ret;
304} 229}
305 230
@@ -317,9 +242,9 @@ xfs_file_dax_read(
317 if (!count) 242 if (!count)
318 return 0; /* skip atime */ 243 return 0; /* skip atime */
319 244
320 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 245 xfs_ilock(ip, XFS_IOLOCK_SHARED);
321 ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); 246 ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
322 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 247 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
323 248
324 file_accessed(iocb->ki_filp); 249 file_accessed(iocb->ki_filp);
325 return ret; 250 return ret;
@@ -335,9 +260,9 @@ xfs_file_buffered_aio_read(
335 260
336 trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); 261 trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
337 262
338 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 263 xfs_ilock(ip, XFS_IOLOCK_SHARED);
339 ret = generic_file_read_iter(iocb, to); 264 ret = generic_file_read_iter(iocb, to);
340 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 265 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
341 266
342 return ret; 267 return ret;
343} 268}
@@ -418,15 +343,18 @@ restart:
418 if (error <= 0) 343 if (error <= 0)
419 return error; 344 return error;
420 345
421 error = xfs_break_layouts(inode, iolock, true); 346 error = xfs_break_layouts(inode, iolock);
422 if (error) 347 if (error)
423 return error; 348 return error;
424 349
425 /* For changing security info in file_remove_privs() we need i_mutex */ 350 /*
351 * For changing security info in file_remove_privs() we need i_rwsem
352 * exclusively.
353 */
426 if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { 354 if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
427 xfs_rw_iunlock(ip, *iolock); 355 xfs_iunlock(ip, *iolock);
428 *iolock = XFS_IOLOCK_EXCL; 356 *iolock = XFS_IOLOCK_EXCL;
429 xfs_rw_ilock(ip, *iolock); 357 xfs_ilock(ip, *iolock);
430 goto restart; 358 goto restart;
431 } 359 }
432 /* 360 /*
@@ -451,9 +379,9 @@ restart:
451 spin_unlock(&ip->i_flags_lock); 379 spin_unlock(&ip->i_flags_lock);
452 if (!drained_dio) { 380 if (!drained_dio) {
453 if (*iolock == XFS_IOLOCK_SHARED) { 381 if (*iolock == XFS_IOLOCK_SHARED) {
454 xfs_rw_iunlock(ip, *iolock); 382 xfs_iunlock(ip, *iolock);
455 *iolock = XFS_IOLOCK_EXCL; 383 *iolock = XFS_IOLOCK_EXCL;
456 xfs_rw_ilock(ip, *iolock); 384 xfs_ilock(ip, *iolock);
457 iov_iter_reexpand(from, count); 385 iov_iter_reexpand(from, count);
458 } 386 }
459 /* 387 /*
@@ -496,6 +424,58 @@ restart:
496 return 0; 424 return 0;
497} 425}
498 426
427static int
428xfs_dio_write_end_io(
429 struct kiocb *iocb,
430 ssize_t size,
431 unsigned flags)
432{
433 struct inode *inode = file_inode(iocb->ki_filp);
434 struct xfs_inode *ip = XFS_I(inode);
435 loff_t offset = iocb->ki_pos;
436 bool update_size = false;
437 int error = 0;
438
439 trace_xfs_end_io_direct_write(ip, offset, size);
440
441 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
442 return -EIO;
443
444 if (size <= 0)
445 return size;
446
447 /*
448 * We need to update the in-core inode size here so that we don't end up
449 * with the on-disk inode size being outside the in-core inode size. We
450 * have no other method of updating EOF for AIO, so always do it here
451 * if necessary.
452 *
453 * We need to lock the test/set EOF update as we can be racing with
454 * other IO completions here to update the EOF. Failing to serialise
455 * here can result in EOF moving backwards and Bad Things Happen when
456 * that occurs.
457 */
458 spin_lock(&ip->i_flags_lock);
459 if (offset + size > i_size_read(inode)) {
460 i_size_write(inode, offset + size);
461 update_size = true;
462 }
463 spin_unlock(&ip->i_flags_lock);
464
465 if (flags & IOMAP_DIO_COW) {
466 error = xfs_reflink_end_cow(ip, offset, size);
467 if (error)
468 return error;
469 }
470
471 if (flags & IOMAP_DIO_UNWRITTEN)
472 error = xfs_iomap_write_unwritten(ip, offset, size);
473 else if (update_size)
474 error = xfs_setfilesize(ip, offset, size);
475
476 return error;
477}
478
499/* 479/*
500 * xfs_file_dio_aio_write - handle direct IO writes 480 * xfs_file_dio_aio_write - handle direct IO writes
501 * 481 *
@@ -535,9 +515,7 @@ xfs_file_dio_aio_write(
535 int unaligned_io = 0; 515 int unaligned_io = 0;
536 int iolock; 516 int iolock;
537 size_t count = iov_iter_count(from); 517 size_t count = iov_iter_count(from);
538 loff_t end; 518 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
539 struct iov_iter data;
540 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
541 mp->m_rtdev_targp : mp->m_ddev_targp; 519 mp->m_rtdev_targp : mp->m_ddev_targp;
542 520
543 /* DIO must be aligned to device logical sector size */ 521 /* DIO must be aligned to device logical sector size */
@@ -559,29 +537,12 @@ xfs_file_dio_aio_write(
559 iolock = XFS_IOLOCK_SHARED; 537 iolock = XFS_IOLOCK_SHARED;
560 } 538 }
561 539
562 xfs_rw_ilock(ip, iolock); 540 xfs_ilock(ip, iolock);
563 541
564 ret = xfs_file_aio_write_checks(iocb, from, &iolock); 542 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
565 if (ret) 543 if (ret)
566 goto out; 544 goto out;
567 count = iov_iter_count(from); 545 count = iov_iter_count(from);
568 end = iocb->ki_pos + count - 1;
569
570 if (mapping->nrpages) {
571 ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
572 if (ret)
573 goto out;
574
575 /*
576 * Invalidate whole pages. This can return an error if we fail
577 * to invalidate a page, but this should never happen on XFS.
578 * Warn if it does fail.
579 */
580 ret = invalidate_inode_pages2_range(mapping,
581 iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
582 WARN_ON_ONCE(ret);
583 ret = 0;
584 }
585 546
586 /* 547 /*
587 * If we are doing unaligned IO, wait for all other IO to drain, 548 * If we are doing unaligned IO, wait for all other IO to drain,
@@ -591,7 +552,7 @@ xfs_file_dio_aio_write(
591 if (unaligned_io) 552 if (unaligned_io)
592 inode_dio_wait(inode); 553 inode_dio_wait(inode);
593 else if (iolock == XFS_IOLOCK_EXCL) { 554 else if (iolock == XFS_IOLOCK_EXCL) {
594 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 555 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
595 iolock = XFS_IOLOCK_SHARED; 556 iolock = XFS_IOLOCK_SHARED;
596 } 557 }
597 558
@@ -604,24 +565,9 @@ xfs_file_dio_aio_write(
604 goto out; 565 goto out;
605 } 566 }
606 567
607 data = *from; 568 ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
608 ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
609 xfs_get_blocks_direct, xfs_end_io_direct_write,
610 NULL, DIO_ASYNC_EXTEND);
611
612 /* see generic_file_direct_write() for why this is necessary */
613 if (mapping->nrpages) {
614 invalidate_inode_pages2_range(mapping,
615 iocb->ki_pos >> PAGE_SHIFT,
616 end >> PAGE_SHIFT);
617 }
618
619 if (ret > 0) {
620 iocb->ki_pos += ret;
621 iov_iter_advance(from, ret);
622 }
623out: 569out:
624 xfs_rw_iunlock(ip, iolock); 570 xfs_iunlock(ip, iolock);
625 571
626 /* 572 /*
627 * No fallback to buffered IO on errors for XFS, direct IO will either 573 * No fallback to buffered IO on errors for XFS, direct IO will either
@@ -643,7 +589,7 @@ xfs_file_dax_write(
643 size_t count; 589 size_t count;
644 loff_t pos; 590 loff_t pos;
645 591
646 xfs_rw_ilock(ip, iolock); 592 xfs_ilock(ip, iolock);
647 ret = xfs_file_aio_write_checks(iocb, from, &iolock); 593 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
648 if (ret) 594 if (ret)
649 goto out; 595 goto out;
@@ -652,15 +598,13 @@ xfs_file_dax_write(
652 count = iov_iter_count(from); 598 count = iov_iter_count(from);
653 599
654 trace_xfs_file_dax_write(ip, count, pos); 600 trace_xfs_file_dax_write(ip, count, pos);
655
656 ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); 601 ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
657 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { 602 if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
658 i_size_write(inode, iocb->ki_pos); 603 i_size_write(inode, iocb->ki_pos);
659 error = xfs_setfilesize(ip, pos, ret); 604 error = xfs_setfilesize(ip, pos, ret);
660 } 605 }
661
662out: 606out:
663 xfs_rw_iunlock(ip, iolock); 607 xfs_iunlock(ip, iolock);
664 return error ? error : ret; 608 return error ? error : ret;
665} 609}
666 610
@@ -677,7 +621,7 @@ xfs_file_buffered_aio_write(
677 int enospc = 0; 621 int enospc = 0;
678 int iolock = XFS_IOLOCK_EXCL; 622 int iolock = XFS_IOLOCK_EXCL;
679 623
680 xfs_rw_ilock(ip, iolock); 624 xfs_ilock(ip, iolock);
681 625
682 ret = xfs_file_aio_write_checks(iocb, from, &iolock); 626 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
683 if (ret) 627 if (ret)
@@ -721,7 +665,7 @@ write_retry:
721 665
722 current->backing_dev_info = NULL; 666 current->backing_dev_info = NULL;
723out: 667out:
724 xfs_rw_iunlock(ip, iolock); 668 xfs_iunlock(ip, iolock);
725 return ret; 669 return ret;
726} 670}
727 671
@@ -797,7 +741,7 @@ xfs_file_fallocate(
797 return -EOPNOTSUPP; 741 return -EOPNOTSUPP;
798 742
799 xfs_ilock(ip, iolock); 743 xfs_ilock(ip, iolock);
800 error = xfs_break_layouts(inode, &iolock, false); 744 error = xfs_break_layouts(inode, &iolock);
801 if (error) 745 if (error)
802 goto out_unlock; 746 goto out_unlock;
803 747
@@ -1501,15 +1445,9 @@ xfs_filemap_fault(
1501 return xfs_filemap_page_mkwrite(vma, vmf); 1445 return xfs_filemap_page_mkwrite(vma, vmf);
1502 1446
1503 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1447 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1504 if (IS_DAX(inode)) { 1448 if (IS_DAX(inode))
1505 /*
1506 * we do not want to trigger unwritten extent conversion on read
1507 * faults - that is unnecessary overhead and would also require
1508 * changes to xfs_get_blocks_direct() to map unwritten extent
1509 * ioend for conversion on read-only mappings.
1510 */
1511 ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); 1449 ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops);
1512 } else 1450 else
1513 ret = filemap_fault(vma, vmf); 1451 ret = filemap_fault(vma, vmf);
1514 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1452 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1515 1453
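
With the xfs_rw_ilock() wrappers gone, the direct write path reduces to taking the IOLOCK (now i_rwsem) and handing off to iomap_dio_rw() with the completion callback above. A condensed sketch of the resulting flow, with the alignment checks, unaligned-I/O draining and error handling shown in the full hunk omitted:

	xfs_ilock(ip, iolock);
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (!ret)
		ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops,
				   xfs_dio_write_end_io);
	xfs_iunlock(ip, iolock);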
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9c3e5c6ddf20..ff4d6311c7f4 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -70,8 +70,6 @@ xfs_inode_alloc(
70 ASSERT(!xfs_isiflocked(ip)); 70 ASSERT(!xfs_isiflocked(ip));
71 ASSERT(ip->i_ino == 0); 71 ASSERT(ip->i_ino == 0);
72 72
73 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
74
75 /* initialise the xfs inode */ 73 /* initialise the xfs inode */
76 ip->i_ino = ino; 74 ip->i_ino = ino;
77 ip->i_mount = mp; 75 ip->i_mount = mp;
@@ -394,8 +392,8 @@ xfs_iget_cache_hit(
394 xfs_inode_clear_reclaim_tag(pag, ip->i_ino); 392 xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
395 inode->i_state = I_NEW; 393 inode->i_state = I_NEW;
396 394
397 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 395 ASSERT(!rwsem_is_locked(&inode->i_rwsem));
398 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 396 init_rwsem(&inode->i_rwsem);
399 397
400 spin_unlock(&ip->i_flags_lock); 398 spin_unlock(&ip->i_flags_lock);
401 spin_unlock(&pag->pag_ici_lock); 399 spin_unlock(&pag->pag_ici_lock);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 4e560e6a12c1..e9ab42d8965b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared(
142} 142}
143 143
144/* 144/*
145 * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and 145 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
146 * the i_lock. This routine allows various combinations of the locks to be 146 * multi-reader locks: i_mmap_lock and the i_lock. This routine allows
147 * obtained. 147 * various combinations of the locks to be obtained.
148 * 148 *
149 * The 3 locks should always be ordered so that the IO lock is obtained first, 149 * The 3 locks should always be ordered so that the IO lock is obtained first,
150 * the mmap lock second and the ilock last in order to prevent deadlock. 150 * the mmap lock second and the ilock last in order to prevent deadlock.
151 * 151 *
152 * Basic locking order: 152 * Basic locking order:
153 * 153 *
154 * i_iolock -> i_mmap_lock -> page_lock -> i_ilock 154 * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock
155 * 155 *
156 * mmap_sem locking order: 156 * mmap_sem locking order:
157 * 157 *
158 * i_iolock -> page lock -> mmap_sem 158 * i_rwsem -> page lock -> mmap_sem
159 * mmap_sem -> i_mmap_lock -> page_lock 159 * mmap_sem -> i_mmap_lock -> page_lock
160 * 160 *
161 * The difference in mmap_sem locking order means that we cannot hold the 161 * The difference in mmap_sem locking order means that we cannot hold the
162 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can 162 * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can
163 * fault in pages during copy in/out (for buffered IO) or require the mmap_sem 163 * fault in pages during copy in/out (for buffered IO) or require the mmap_sem
164 * in get_user_pages() to map the user pages into the kernel address space for 164 * in get_user_pages() to map the user pages into the kernel address space for
165 * direct IO. Similarly the i_iolock cannot be taken inside a page fault because 165 * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because
166 * page faults already hold the mmap_sem. 166 * page faults already hold the mmap_sem.
167 * 167 *
168 * Hence to serialise fully against both syscall and mmap based IO, we need to 168 * Hence to serialise fully against both syscall and mmap based IO, we need to
169 * take both the i_iolock and the i_mmap_lock. These locks should *only* be both 169 * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both
170 * taken in places where we need to invalidate the page cache in a race 170 * taken in places where we need to invalidate the page cache in a race
171 * free manner (e.g. truncate, hole punch and other extent manipulation 171 * free manner (e.g. truncate, hole punch and other extent manipulation
172 * functions). 172 * functions).
@@ -191,10 +191,13 @@ xfs_ilock(
191 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); 191 (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
192 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); 192 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
193 193
194 if (lock_flags & XFS_IOLOCK_EXCL) 194 if (lock_flags & XFS_IOLOCK_EXCL) {
195 mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); 195 down_write_nested(&VFS_I(ip)->i_rwsem,
196 else if (lock_flags & XFS_IOLOCK_SHARED) 196 XFS_IOLOCK_DEP(lock_flags));
197 mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); 197 } else if (lock_flags & XFS_IOLOCK_SHARED) {
198 down_read_nested(&VFS_I(ip)->i_rwsem,
199 XFS_IOLOCK_DEP(lock_flags));
200 }
198 201
199 if (lock_flags & XFS_MMAPLOCK_EXCL) 202 if (lock_flags & XFS_MMAPLOCK_EXCL)
200 mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); 203 mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags));
@@ -240,10 +243,10 @@ xfs_ilock_nowait(
240 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); 243 ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
241 244
242 if (lock_flags & XFS_IOLOCK_EXCL) { 245 if (lock_flags & XFS_IOLOCK_EXCL) {
243 if (!mrtryupdate(&ip->i_iolock)) 246 if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
244 goto out; 247 goto out;
245 } else if (lock_flags & XFS_IOLOCK_SHARED) { 248 } else if (lock_flags & XFS_IOLOCK_SHARED) {
246 if (!mrtryaccess(&ip->i_iolock)) 249 if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
247 goto out; 250 goto out;
248 } 251 }
249 252
@@ -271,9 +274,9 @@ out_undo_mmaplock:
271 mrunlock_shared(&ip->i_mmaplock); 274 mrunlock_shared(&ip->i_mmaplock);
272out_undo_iolock: 275out_undo_iolock:
273 if (lock_flags & XFS_IOLOCK_EXCL) 276 if (lock_flags & XFS_IOLOCK_EXCL)
274 mrunlock_excl(&ip->i_iolock); 277 up_write(&VFS_I(ip)->i_rwsem);
275 else if (lock_flags & XFS_IOLOCK_SHARED) 278 else if (lock_flags & XFS_IOLOCK_SHARED)
276 mrunlock_shared(&ip->i_iolock); 279 up_read(&VFS_I(ip)->i_rwsem);
277out: 280out:
278 return 0; 281 return 0;
279} 282}
@@ -310,9 +313,9 @@ xfs_iunlock(
310 ASSERT(lock_flags != 0); 313 ASSERT(lock_flags != 0);
311 314
312 if (lock_flags & XFS_IOLOCK_EXCL) 315 if (lock_flags & XFS_IOLOCK_EXCL)
313 mrunlock_excl(&ip->i_iolock); 316 up_write(&VFS_I(ip)->i_rwsem);
314 else if (lock_flags & XFS_IOLOCK_SHARED) 317 else if (lock_flags & XFS_IOLOCK_SHARED)
315 mrunlock_shared(&ip->i_iolock); 318 up_read(&VFS_I(ip)->i_rwsem);
316 319
317 if (lock_flags & XFS_MMAPLOCK_EXCL) 320 if (lock_flags & XFS_MMAPLOCK_EXCL)
318 mrunlock_excl(&ip->i_mmaplock); 321 mrunlock_excl(&ip->i_mmaplock);
@@ -345,7 +348,7 @@ xfs_ilock_demote(
345 if (lock_flags & XFS_MMAPLOCK_EXCL) 348 if (lock_flags & XFS_MMAPLOCK_EXCL)
346 mrdemote(&ip->i_mmaplock); 349 mrdemote(&ip->i_mmaplock);
347 if (lock_flags & XFS_IOLOCK_EXCL) 350 if (lock_flags & XFS_IOLOCK_EXCL)
348 mrdemote(&ip->i_iolock); 351 downgrade_write(&VFS_I(ip)->i_rwsem);
349 352
350 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); 353 trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
351} 354}
@@ -370,8 +373,9 @@ xfs_isilocked(
370 373
371 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { 374 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
372 if (!(lock_flags & XFS_IOLOCK_SHARED)) 375 if (!(lock_flags & XFS_IOLOCK_SHARED))
373 return !!ip->i_iolock.mr_writer; 376 return !debug_locks ||
374 return rwsem_is_locked(&ip->i_iolock.mr_lock); 377 lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0);
378 return rwsem_is_locked(&VFS_I(ip)->i_rwsem);
375 } 379 }
376 380
377 ASSERT(0); 381 ASSERT(0);
@@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
421 425
422 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { 426 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
423 ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); 427 ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
424 ASSERT(xfs_lockdep_subclass_ok(subclass +
425 XFS_IOLOCK_PARENT_VAL));
426 class += subclass << XFS_IOLOCK_SHIFT; 428 class += subclass << XFS_IOLOCK_SHIFT;
427 if (lock_mode & XFS_IOLOCK_PARENT)
428 class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT;
429 } 429 }
430 430
431 if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { 431 if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
@@ -477,8 +477,6 @@ xfs_lock_inodes(
477 XFS_ILOCK_EXCL)); 477 XFS_ILOCK_EXCL));
478 ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | 478 ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
479 XFS_ILOCK_SHARED))); 479 XFS_ILOCK_SHARED)));
480 ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) ||
481 inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1);
482 ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || 480 ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
483 inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); 481 inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
484 ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || 482 ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
@@ -581,10 +579,8 @@ xfs_lock_two_inodes(
581 int attempts = 0; 579 int attempts = 0;
582 xfs_log_item_t *lp; 580 xfs_log_item_t *lp;
583 581
584 if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { 582 ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
585 ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); 583 if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
586 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
587 } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))
588 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); 584 ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)));
589 585
590 ASSERT(ip0->i_ino != ip1->i_ino); 586 ASSERT(ip0->i_ino != ip1->i_ino);
@@ -715,7 +711,6 @@ xfs_lookup(
715 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 711 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
716 return -EIO; 712 return -EIO;
717 713
718 xfs_ilock(dp, XFS_IOLOCK_SHARED);
719 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); 714 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
720 if (error) 715 if (error)
721 goto out_unlock; 716 goto out_unlock;
@@ -724,14 +719,12 @@ xfs_lookup(
724 if (error) 719 if (error)
725 goto out_free_name; 720 goto out_free_name;
726 721
727 xfs_iunlock(dp, XFS_IOLOCK_SHARED);
728 return 0; 722 return 0;
729 723
730out_free_name: 724out_free_name:
731 if (ci_name) 725 if (ci_name)
732 kmem_free(ci_name->name); 726 kmem_free(ci_name->name);
733out_unlock: 727out_unlock:
734 xfs_iunlock(dp, XFS_IOLOCK_SHARED);
735 *ipp = NULL; 728 *ipp = NULL;
736 return error; 729 return error;
737} 730}
@@ -1215,8 +1208,7 @@ xfs_create(
1215 if (error) 1208 if (error)
1216 goto out_release_inode; 1209 goto out_release_inode;
1217 1210
1218 xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | 1211 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
1219 XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
1220 unlock_dp_on_error = true; 1212 unlock_dp_on_error = true;
1221 1213
1222 xfs_defer_init(&dfops, &first_block); 1214 xfs_defer_init(&dfops, &first_block);
@@ -1252,7 +1244,7 @@ xfs_create(
1252 * the transaction cancel unlocking dp so don't do it explicitly in the 1244 * the transaction cancel unlocking dp so don't do it explicitly in the
1253 * error path. 1245 * error path.
1254 */ 1246 */
1255 xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 1247 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1256 unlock_dp_on_error = false; 1248 unlock_dp_on_error = false;
1257 1249
1258 error = xfs_dir_createname(tp, dp, name, ip->i_ino, 1250 error = xfs_dir_createname(tp, dp, name, ip->i_ino,
@@ -1325,7 +1317,7 @@ xfs_create(
1325 xfs_qm_dqrele(pdqp); 1317 xfs_qm_dqrele(pdqp);
1326 1318
1327 if (unlock_dp_on_error) 1319 if (unlock_dp_on_error)
1328 xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 1320 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1329 return error; 1321 return error;
1330} 1322}
1331 1323
@@ -1466,11 +1458,10 @@ xfs_link(
1466 if (error) 1458 if (error)
1467 goto std_return; 1459 goto std_return;
1468 1460
1469 xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
1470 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); 1461 xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
1471 1462
1472 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); 1463 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
1473 xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 1464 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
1474 1465
1475 /* 1466 /*
1476 * If we are using project inheritance, we only allow hard link 1467 * If we are using project inheritance, we only allow hard link
@@ -2579,10 +2570,9 @@ xfs_remove(
2579 goto std_return; 2570 goto std_return;
2580 } 2571 }
2581 2572
2582 xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2583 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); 2573 xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2584 2574
2585 xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 2575 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2586 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 2576 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2587 2577
2588 /* 2578 /*
@@ -2963,12 +2953,6 @@ xfs_rename(
2963 * whether the target directory is the same as the source 2953 * whether the target directory is the same as the source
2964 * directory, we can lock from 2 to 4 inodes. 2954 * directory, we can lock from 2 to 4 inodes.
2965 */ 2955 */
2966 if (!new_parent)
2967 xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2968 else
2969 xfs_lock_two_inodes(src_dp, target_dp,
2970 XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
2971
2972 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 2956 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2973 2957
2974 /* 2958 /*
@@ -2976,9 +2960,9 @@ xfs_rename(
2976 * we can rely on either trans_commit or trans_cancel to unlock 2960 * we can rely on either trans_commit or trans_cancel to unlock
2977 * them. 2961 * them.
2978 */ 2962 */
2979 xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 2963 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2980 if (new_parent) 2964 if (new_parent)
2981 xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 2965 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2982 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 2966 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2983 if (target_ip) 2967 if (target_ip)
2984 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 2968 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 71e8a81c91a3..10dcf27b4c85 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -56,7 +56,6 @@ typedef struct xfs_inode {
56 /* Transaction and locking information. */ 56 /* Transaction and locking information. */
57 struct xfs_inode_log_item *i_itemp; /* logging information */ 57 struct xfs_inode_log_item *i_itemp; /* logging information */
58 mrlock_t i_lock; /* inode lock */ 58 mrlock_t i_lock; /* inode lock */
59 mrlock_t i_iolock; /* inode IO lock */
60 mrlock_t i_mmaplock; /* inode mmap IO lock */ 59 mrlock_t i_mmaplock; /* inode mmap IO lock */
61 atomic_t i_pincount; /* inode pin count */ 60 atomic_t i_pincount; /* inode pin count */
62 spinlock_t i_flags_lock; /* inode i_flags lock */ 61 spinlock_t i_flags_lock; /* inode i_flags lock */
@@ -333,7 +332,7 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
333 * IOLOCK values 332 * IOLOCK values
334 * 333 *
335 * 0-3 subclass value 334 * 0-3 subclass value
336 * 4-7 PARENT subclass values 335 * 4-7 unused
337 * 336 *
338 * MMAPLOCK values 337 * MMAPLOCK values
339 * 338 *
@@ -348,10 +347,8 @@ static inline void xfs_ifunlock(struct xfs_inode *ip)
348 * 347 *
349 */ 348 */
350#define XFS_IOLOCK_SHIFT 16 349#define XFS_IOLOCK_SHIFT 16
351#define XFS_IOLOCK_PARENT_VAL 4 350#define XFS_IOLOCK_MAX_SUBCLASS 3
352#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1)
353#define XFS_IOLOCK_DEP_MASK 0x000f0000 351#define XFS_IOLOCK_DEP_MASK 0x000f0000
354#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT)
355 352
356#define XFS_MMAPLOCK_SHIFT 20 353#define XFS_MMAPLOCK_SHIFT 20
357#define XFS_MMAPLOCK_NUMORDER 0 354#define XFS_MMAPLOCK_NUMORDER 0
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a39197501a7c..fc563b82aea6 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -639,7 +639,7 @@ xfs_ioc_space(
639 return error; 639 return error;
640 640
641 xfs_ilock(ip, iolock); 641 xfs_ilock(ip, iolock);
642 error = xfs_break_layouts(inode, &iolock, false); 642 error = xfs_break_layouts(inode, &iolock);
643 if (error) 643 if (error)
644 goto out_unlock; 644 goto out_unlock;
645 645
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 15a83813b708..0d147428971e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -950,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode,
950 (IS_DAX(inode) && ISUNWRITTEN(imap)); 950 (IS_DAX(inode) && ISUNWRITTEN(imap));
951} 951}
952 952
953static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
954{
955 /*
956 * COW writes will allocate delalloc space, so we need to make sure
957 * to take the lock exclusively here.
958 */
959 if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
960 return true;
961 if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE))
962 return true;
963 return false;
964}
965
953static int 966static int
954xfs_file_iomap_begin( 967xfs_file_iomap_begin(
955 struct inode *inode, 968 struct inode *inode,
@@ -969,18 +982,14 @@ xfs_file_iomap_begin(
969 if (XFS_FORCED_SHUTDOWN(mp)) 982 if (XFS_FORCED_SHUTDOWN(mp))
970 return -EIO; 983 return -EIO;
971 984
972 if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && 985 if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
973 !xfs_get_extsz_hint(ip)) { 986 !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
974 /* Reserve delalloc blocks for regular writeback. */ 987 /* Reserve delalloc blocks for regular writeback. */
975 return xfs_file_iomap_begin_delay(inode, offset, length, flags, 988 return xfs_file_iomap_begin_delay(inode, offset, length, flags,
976 iomap); 989 iomap);
977 } 990 }
978 991
979 /* 992 if (need_excl_ilock(ip, flags)) {
980 * COW writes will allocate delalloc space, so we need to make sure
981 * to take the lock exclusively here.
982 */
983 if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
984 lockmode = XFS_ILOCK_EXCL; 993 lockmode = XFS_ILOCK_EXCL;
985 xfs_ilock(ip, XFS_ILOCK_EXCL); 994 xfs_ilock(ip, XFS_ILOCK_EXCL);
986 } else { 995 } else {
@@ -993,17 +1002,41 @@ xfs_file_iomap_begin(
993 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1002 offset_fsb = XFS_B_TO_FSBT(mp, offset);
994 end_fsb = XFS_B_TO_FSB(mp, offset + length); 1003 end_fsb = XFS_B_TO_FSB(mp, offset + length);
995 1004
1005 if (xfs_is_reflink_inode(ip) &&
1006 (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) {
1007 shared = xfs_reflink_find_cow_mapping(ip, offset, &imap);
1008 if (shared) {
1009 xfs_iunlock(ip, lockmode);
1010 goto alloc_done;
1011 }
1012 ASSERT(!isnullstartblock(imap.br_startblock));
1013 }
1014
996 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, 1015 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
997 &nimaps, 0); 1016 &nimaps, 0);
998 if (error) 1017 if (error)
999 goto out_unlock; 1018 goto out_unlock;
1000 1019
1001 if (flags & IOMAP_REPORT) { 1020 if ((flags & IOMAP_REPORT) ||
1021 (xfs_is_reflink_inode(ip) &&
1022 (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) {
1002 /* Trim the mapping to the nearest shared extent boundary. */ 1023 /* Trim the mapping to the nearest shared extent boundary. */
1003 error = xfs_reflink_trim_around_shared(ip, &imap, &shared, 1024 error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
1004 &trimmed); 1025 &trimmed);
1005 if (error) 1026 if (error)
1006 goto out_unlock; 1027 goto out_unlock;
1028
1029 /*
1030 * We're here because we're trying to do a directio write to a
1031 * region that isn't aligned to a filesystem block. If the
1032 * extent is shared, fall back to buffered mode to handle the
1033 * RMW.
1034 */
1035 if (!(flags & IOMAP_REPORT) && shared) {
1036 trace_xfs_reflink_bounce_dio_write(ip, &imap);
1037 error = -EREMCHG;
1038 goto out_unlock;
1039 }
1007 } 1040 }
1008 1041
1009 if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { 1042 if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
@@ -1038,6 +1071,7 @@ xfs_file_iomap_begin(
1038 if (error) 1071 if (error)
1039 return error; 1072 return error;
1040 1073
1074alloc_done:
1041 iomap->flags = IOMAP_F_NEW; 1075 iomap->flags = IOMAP_F_NEW;
1042 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); 1076 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
1043 } else { 1077 } else {
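The xfs_iomap.c hunk above concentrates the policy for the new direct I/O path into flag checks: plain buffered writes still take the delalloc reservation path, reflink COW writes and direct-I/O writes take the ILOCK exclusively, and a direct write that lands on a shared extent returns -EREMCHG so the caller can fall back to buffered I/O for the read-modify-write. Below is a minimal standalone sketch of that flag logic only, not kernel code; the IOMAP_WRITE bit value is an assumption, the other flag values are copied from the include/linux/iomap.h hunk further down.

	/*
	 * Illustrative sketch, not part of the patch: how the rewritten
	 * xfs_file_iomap_begin() routes requests based on iomap flags.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	#define IOMAP_WRITE	(1 << 0)	/* assumed value */
	#define IOMAP_ZERO	(1 << 1)
	#define IOMAP_REPORT	(1 << 2)
	#define IOMAP_FAULT	(1 << 3)
	#define IOMAP_DIRECT	(1 << 4)	/* new in this series */

	/* Buffered (non-direct) writes still reserve delalloc blocks. */
	static bool use_delalloc_path(unsigned flags, bool is_dax, bool has_extsz_hint)
	{
		return ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
			!is_dax && !has_extsz_hint;
	}

	/* Mirrors need_excl_ilock(): COW or direct-I/O writes need ILOCK_EXCL. */
	static bool need_excl_ilock(bool is_reflink, unsigned flags)
	{
		if (is_reflink && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
			return true;
		if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE))
			return true;
		return false;
	}

	int main(void)
	{
		unsigned buf_write = IOMAP_WRITE;
		unsigned dio_write = IOMAP_WRITE | IOMAP_DIRECT;

		printf("buffered write: delalloc=%d excl_ilock=%d\n",
		       use_delalloc_path(buf_write, false, false),
		       need_excl_ilock(false, buf_write));
		printf("direct write:   delalloc=%d excl_ilock=%d\n",
		       use_delalloc_path(dio_write, false, false),
		       need_excl_ilock(false, dio_write));
		return 0;
	}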
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 405a65cd9d6b..c962999a87ab 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -983,15 +983,13 @@ xfs_vn_setattr(
983 struct xfs_inode *ip = XFS_I(d_inode(dentry)); 983 struct xfs_inode *ip = XFS_I(d_inode(dentry));
984 uint iolock = XFS_IOLOCK_EXCL; 984 uint iolock = XFS_IOLOCK_EXCL;
985 985
986 xfs_ilock(ip, iolock); 986 error = xfs_break_layouts(d_inode(dentry), &iolock);
987 error = xfs_break_layouts(d_inode(dentry), &iolock, true); 987 if (error)
988 if (!error) { 988 return error;
989 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
990 iolock |= XFS_MMAPLOCK_EXCL;
991 989
992 error = xfs_vn_setattr_size(dentry, iattr); 990 xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
993 } 991 error = xfs_setattr_size(ip, iattr);
994 xfs_iunlock(ip, iolock); 992 xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
995 } else { 993 } else {
996 error = xfs_vn_setattr_nonsize(dentry, iattr); 994 error = xfs_vn_setattr_nonsize(dentry, iattr);
997 } 995 }
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 93a7aafa56d6..2f2dc3c09ad0 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -32,8 +32,7 @@
32int 32int
33xfs_break_layouts( 33xfs_break_layouts(
34 struct inode *inode, 34 struct inode *inode,
35 uint *iolock, 35 uint *iolock)
36 bool with_imutex)
37{ 36{
38 struct xfs_inode *ip = XFS_I(inode); 37 struct xfs_inode *ip = XFS_I(inode);
39 int error; 38 int error;
@@ -42,12 +41,8 @@ xfs_break_layouts(
42 41
43 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { 42 while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
44 xfs_iunlock(ip, *iolock); 43 xfs_iunlock(ip, *iolock);
45 if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
46 inode_unlock(inode);
47 error = break_layout(inode, true); 44 error = break_layout(inode, true);
48 *iolock = XFS_IOLOCK_EXCL; 45 *iolock = XFS_IOLOCK_EXCL;
49 if (with_imutex)
50 inode_lock(inode);
51 xfs_ilock(ip, *iolock); 46 xfs_ilock(ip, *iolock);
52 } 47 }
53 48
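With the with_imutex parameter gone, xfs_break_layouts() never touches the VFS inode lock itself; under the new scheme that lock is the IOLOCK and is owned by the caller. A caller-side sketch of the intended pattern follows (a fragment, XFS headers assumed; names follow the xfs_ioc_space() hunk above and error handling is trimmed). Because the helper may drop the IO lock and re-take it exclusively while waiting for the layout to be broken, the caller must unlock with whatever *iolock holds on return.

	/* Sketch only: how callers are expected to use the simplified helper. */
	static int example_break_and_do_work(struct xfs_inode *ip, struct inode *inode)
	{
		uint	iolock = XFS_IOLOCK_EXCL;
		int	error;

		xfs_ilock(ip, iolock);
		error = xfs_break_layouts(inode, &iolock);
		if (error)
			goto out_unlock;

		/* ... operate on the file with the IO lock held ... */

	out_unlock:
		xfs_iunlock(ip, iolock);	/* iolock may have been upgraded above */
		return error;
	}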
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index e8339f74966b..b587cb99b2b7 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, 8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
9 struct iattr *iattr); 9 struct iattr *iattr);
10 10
11int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); 11int xfs_break_layouts(struct inode *inode, uint *iolock);
12#else 12#else
13static inline int 13static inline int
14xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) 14xfs_break_layouts(struct inode *inode, uint *iolock)
15{ 15{
16 return 0; 16 return 0;
17} 17}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index becf2465dd23..88fd03c66e99 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1251,13 +1251,11 @@ xfs_reflink_remap_range(
1251 return -EIO; 1251 return -EIO;
1252 1252
1253 /* Lock both files against IO */ 1253 /* Lock both files against IO */
1254 if (same_inode) { 1254 lock_two_nondirectories(inode_in, inode_out);
1255 xfs_ilock(src, XFS_IOLOCK_EXCL); 1255 if (same_inode)
1256 xfs_ilock(src, XFS_MMAPLOCK_EXCL); 1256 xfs_ilock(src, XFS_MMAPLOCK_EXCL);
1257 } else { 1257 else
1258 xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
1259 xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); 1258 xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
1260 }
1261 1259
1262 /* Don't touch certain kinds of inodes */ 1260 /* Don't touch certain kinds of inodes */
1263 ret = -EPERM; 1261 ret = -EPERM;
@@ -1402,11 +1400,9 @@ xfs_reflink_remap_range(
1402 1400
1403out_unlock: 1401out_unlock:
1404 xfs_iunlock(src, XFS_MMAPLOCK_EXCL); 1402 xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
1405 xfs_iunlock(src, XFS_IOLOCK_EXCL); 1403 if (!same_inode)
1406 if (src->i_ino != dest->i_ino) {
1407 xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); 1404 xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
1408 xfs_iunlock(dest, XFS_IOLOCK_EXCL); 1405 unlock_two_nondirectories(inode_in, inode_out);
1409 }
1410 if (ret) 1406 if (ret)
1411 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); 1407 trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1412 return ret; 1408 return ret;
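Since the IOLOCK is now backed by i_rwsem, the remap path can use the generic lock_two_nondirectories()/unlock_two_nondirectories() helpers to serialise I/O on both files, keeping only the XFS MMAPLOCK as a filesystem-private lock. The sketch below pulls the lock and unlock sides from the two hunks above into one place for readability; it is illustrative only, with XFS headers assumed.

	/* Sketch: the new lock/unlock pairing used by the remap path. */
	static void example_lock_for_remap(struct inode *inode_in, struct inode *inode_out)
	{
		struct xfs_inode *src = XFS_I(inode_in);
		struct xfs_inode *dest = XFS_I(inode_out);

		lock_two_nondirectories(inode_in, inode_out);	/* i_rwsem, ordered */
		if (inode_in == inode_out)
			xfs_ilock(src, XFS_MMAPLOCK_EXCL);
		else
			xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
	}

	static void example_unlock_for_remap(struct inode *inode_in, struct inode *inode_out)
	{
		struct xfs_inode *src = XFS_I(inode_in);
		struct xfs_inode *dest = XFS_I(inode_out);

		xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
		if (inode_in != inode_out)
			xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
		unlock_two_nondirectories(inode_in, inode_out);
	}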
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ade4691e3f74..563d1d146b8c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -943,7 +943,7 @@ xfs_fs_destroy_inode(
943 943
944 trace_xfs_destroy_inode(ip); 944 trace_xfs_destroy_inode(ip);
945 945
946 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 946 ASSERT(!rwsem_is_locked(&inode->i_rwsem));
947 XFS_STATS_INC(ip->i_mount, vn_rele); 947 XFS_STATS_INC(ip->i_mount, vn_rele);
948 XFS_STATS_INC(ip->i_mount, vn_remove); 948 XFS_STATS_INC(ip->i_mount, vn_remove);
949 949
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 58142aeeeea6..f2cb45ed1d54 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -238,8 +238,7 @@ xfs_symlink(
238 if (error) 238 if (error)
239 goto out_release_inode; 239 goto out_release_inode;
240 240
241 xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | 241 xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
242 XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
243 unlock_dp_on_error = true; 242 unlock_dp_on_error = true;
244 243
245 /* 244 /*
@@ -287,7 +286,7 @@ xfs_symlink(
287 * the transaction cancel unlocking dp so don't do it explicitly in the 286 * the transaction cancel unlocking dp so don't do it explicitly in the
288 * error path. 287 * error path.
289 */ 288 */
290 xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 289 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
291 unlock_dp_on_error = false; 290 unlock_dp_on_error = false;
292 291
293 /* 292 /*
@@ -412,7 +411,7 @@ out_release_inode:
412 xfs_qm_dqrele(pdqp); 411 xfs_qm_dqrele(pdqp);
413 412
414 if (unlock_dp_on_error) 413 if (unlock_dp_on_error)
415 xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 414 xfs_iunlock(dp, XFS_ILOCK_EXCL);
416 return error; 415 return error;
417} 416}
418 417
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 97cb48f03dc7..66228c28c621 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -430,6 +430,7 @@ void bio_chain(struct bio *, struct bio *);
430extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); 430extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
431extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, 431extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
432 unsigned int, unsigned int); 432 unsigned int, unsigned int);
433int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
433struct rq_map_data; 434struct rq_map_data;
434extern struct bio *bio_map_user_iov(struct request_queue *, 435extern struct bio *bio_map_user_iov(struct request_queue *,
435 const struct iov_iter *, gfp_t); 436 const struct iov_iter *, gfp_t);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f185156de74d..a4c94b86401e 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -50,6 +50,7 @@ struct iomap {
50#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */ 50#define IOMAP_ZERO (1 << 1) /* zeroing operation, may skip holes */
51#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ 51#define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */
52#define IOMAP_FAULT (1 << 3) /* mapping for page fault */ 52#define IOMAP_FAULT (1 << 3) /* mapping for page fault */
53#define IOMAP_DIRECT (1 << 4) /* direct I/O */
53 54
54struct iomap_ops { 55struct iomap_ops {
55 /* 56 /*
@@ -83,4 +84,14 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
83int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 84int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
84 loff_t start, loff_t len, struct iomap_ops *ops); 85 loff_t start, loff_t len, struct iomap_ops *ops);
85 86
87/*
88 * Flags for direct I/O ->end_io:
89 */
90#define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */
91#define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */
92typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret,
93 unsigned flags);
94ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
95 struct iomap_ops *ops, iomap_dio_end_io_t end_io);
96
86#endif /* LINUX_IOMAP_H */ 97#endif /* LINUX_IOMAP_H */
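The new iomap_dio_rw() entry point drives direct I/O through the iomap infrastructure and hands completion back to the filesystem through the optional end_io callback; IOMAP_DIO_UNWRITTEN and IOMAP_DIO_COW tell that callback what post-processing the written range needs. A usage sketch follows; example_iomap_ops, example_end_cow() and example_convert_unwritten() are hypothetical stand-ins, not functions from this series.

	#include <linux/fs.h>
	#include <linux/uio.h>
	#include <linux/iomap.h>

	/* hypothetical filesystem helpers, assumed to be defined elsewhere */
	int example_end_cow(struct inode *inode, loff_t offset, ssize_t size);
	int example_convert_unwritten(struct inode *inode, loff_t offset, ssize_t size);
	static struct iomap_ops example_iomap_ops;	/* hypothetical ->iomap_begin/->iomap_end */

	static int example_dio_end_io(struct kiocb *iocb, ssize_t size, unsigned flags)
	{
		struct inode *inode = file_inode(iocb->ki_filp);

		if (size <= 0)
			return size;

		if (flags & IOMAP_DIO_COW)
			/* remap the COW fork blocks that were just written */
			return example_end_cow(inode, iocb->ki_pos, size);
		if (flags & IOMAP_DIO_UNWRITTEN)
			/* convert unwritten extents now that the data is on disk */
			return example_convert_unwritten(inode, iocb->ki_pos, size);
		return 0;
	}

	static ssize_t example_dio_write(struct kiocb *iocb, struct iov_iter *from)
	{
		return iomap_dio_rw(iocb, from, &example_iomap_ops, example_dio_end_io);
	}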
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index c1458fede1f9..1e327bb80838 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -338,9 +338,18 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
338extern void lock_release(struct lockdep_map *lock, int nested, 338extern void lock_release(struct lockdep_map *lock, int nested,
339 unsigned long ip); 339 unsigned long ip);
340 340
341#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) 341/*
342 * Same "read" as for lock_acquire(), except -1 means any.
343 */
344extern int lock_is_held_type(struct lockdep_map *lock, int read);
345
346static inline int lock_is_held(struct lockdep_map *lock)
347{
348 return lock_is_held_type(lock, -1);
349}
342 350
343extern int lock_is_held(struct lockdep_map *lock); 351#define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map)
352#define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r))
344 353
345extern void lock_set_class(struct lockdep_map *lock, const char *name, 354extern void lock_set_class(struct lockdep_map *lock, const char *name,
346 struct lock_class_key *key, unsigned int subclass, 355 struct lock_class_key *key, unsigned int subclass,
@@ -372,6 +381,14 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
372 WARN_ON(debug_locks && !lockdep_is_held(l)); \ 381 WARN_ON(debug_locks && !lockdep_is_held(l)); \
373 } while (0) 382 } while (0)
374 383
384#define lockdep_assert_held_exclusive(l) do { \
385 WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \
386 } while (0)
387
388#define lockdep_assert_held_read(l) do { \
389 WARN_ON(debug_locks && !lockdep_is_held_type(l, 1)); \
390 } while (0)
391
375#define lockdep_assert_held_once(l) do { \ 392#define lockdep_assert_held_once(l) do { \
376 WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \ 393 WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \
377 } while (0) 394 } while (0)
@@ -428,7 +445,11 @@ struct lock_class_key { };
428 445
429#define lockdep_depth(tsk) (0) 446#define lockdep_depth(tsk) (0)
430 447
448#define lockdep_is_held_type(l, r) (1)
449
431#define lockdep_assert_held(l) do { (void)(l); } while (0) 450#define lockdep_assert_held(l) do { (void)(l); } while (0)
451#define lockdep_assert_held_exclusive(l) do { (void)(l); } while (0)
452#define lockdep_assert_held_read(l) do { (void)(l); } while (0)
432#define lockdep_assert_held_once(l) do { (void)(l); } while (0) 453#define lockdep_assert_held_once(l) do { (void)(l); } while (0)
433 454
434#define lockdep_recursing(tsk) (0) 455#define lockdep_recursing(tsk) (0)
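lockdep_is_held_type() and the two new assert macros let code check how a lock is held: read = 0 means held for write, 1 means held for read, and -1 (used by lockdep_is_held()) means either. This is what allows the xfs_isilocked() change earlier in this series to verify that i_rwsem is held exclusively. A short usage sketch against an rw_semaphore, with illustrative names:

	#include <linux/rwsem.h>
	#include <linux/lockdep.h>

	static DECLARE_RWSEM(example_sem);

	static void example_reader(void)
	{
		down_read(&example_sem);
		lockdep_assert_held_read(&example_sem);		/* ok: held for read */
		up_read(&example_sem);
	}

	static void example_writer(void)
	{
		down_write(&example_sem);
		lockdep_assert_held_exclusive(&example_sem);	/* ok: held for write */
		/* lockdep_assert_held_read(&example_sem) would WARN here */
		up_write(&example_sem);
	}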
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 589d763a49b3..cff580a6edf9 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3188,7 +3188,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
3188 return 0; 3188 return 0;
3189} 3189}
3190 3190
3191static int __lock_is_held(struct lockdep_map *lock); 3191static int __lock_is_held(struct lockdep_map *lock, int read);
3192 3192
3193/* 3193/*
3194 * This gets called for every mutex_lock*()/spin_lock*() operation. 3194 * This gets called for every mutex_lock*()/spin_lock*() operation.
@@ -3329,7 +3329,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3329 } 3329 }
3330 chain_key = iterate_chain_key(chain_key, class_idx); 3330 chain_key = iterate_chain_key(chain_key, class_idx);
3331 3331
3332 if (nest_lock && !__lock_is_held(nest_lock)) 3332 if (nest_lock && !__lock_is_held(nest_lock, -1))
3333 return print_lock_nested_lock_not_held(curr, hlock, ip); 3333 return print_lock_nested_lock_not_held(curr, hlock, ip);
3334 3334
3335 if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) 3335 if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
@@ -3576,7 +3576,7 @@ found_it:
3576 return 1; 3576 return 1;
3577} 3577}
3578 3578
3579static int __lock_is_held(struct lockdep_map *lock) 3579static int __lock_is_held(struct lockdep_map *lock, int read)
3580{ 3580{
3581 struct task_struct *curr = current; 3581 struct task_struct *curr = current;
3582 int i; 3582 int i;
@@ -3584,8 +3584,12 @@ static int __lock_is_held(struct lockdep_map *lock)
3584 for (i = 0; i < curr->lockdep_depth; i++) { 3584 for (i = 0; i < curr->lockdep_depth; i++) {
3585 struct held_lock *hlock = curr->held_locks + i; 3585 struct held_lock *hlock = curr->held_locks + i;
3586 3586
3587 if (match_held_lock(hlock, lock)) 3587 if (match_held_lock(hlock, lock)) {
3588 return 1; 3588 if (read == -1 || hlock->read == read)
3589 return 1;
3590
3591 return 0;
3592 }
3589 } 3593 }
3590 3594
3591 return 0; 3595 return 0;
@@ -3769,7 +3773,7 @@ void lock_release(struct lockdep_map *lock, int nested,
3769} 3773}
3770EXPORT_SYMBOL_GPL(lock_release); 3774EXPORT_SYMBOL_GPL(lock_release);
3771 3775
3772int lock_is_held(struct lockdep_map *lock) 3776int lock_is_held_type(struct lockdep_map *lock, int read)
3773{ 3777{
3774 unsigned long flags; 3778 unsigned long flags;
3775 int ret = 0; 3779 int ret = 0;
@@ -3781,13 +3785,13 @@ int lock_is_held(struct lockdep_map *lock)
3781 check_flags(flags); 3785 check_flags(flags);
3782 3786
3783 current->lockdep_recursion = 1; 3787 current->lockdep_recursion = 1;
3784 ret = __lock_is_held(lock); 3788 ret = __lock_is_held(lock, read);
3785 current->lockdep_recursion = 0; 3789 current->lockdep_recursion = 0;
3786 raw_local_irq_restore(flags); 3790 raw_local_irq_restore(flags);
3787 3791
3788 return ret; 3792 return ret;
3789} 3793}
3790EXPORT_SYMBOL_GPL(lock_is_held); 3794EXPORT_SYMBOL_GPL(lock_is_held_type);
3791 3795
3792struct pin_cookie lock_pin_lock(struct lockdep_map *lock) 3796struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
3793{ 3797{