summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTrond Myklebust <trond.myklebust@primarydata.com>2016-06-03 17:07:19 -0400
committerTrond Myklebust <trond.myklebust@primarydata.com>2016-07-05 19:11:04 -0400
commita5864c999de6703f7ce908f72337568520c6cad3 (patch)
treec1136f64cb88078a28bfb177d9f0f1e1745716d2
parent18290650b1c8655cfe6e0d63dd34942a037a130b (diff)
NFS: Do not serialise O_DIRECT reads and writes
Allow dio requests to be scheduled in parallel, while ensuring that they do not conflict with buffered I/O. Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/direct.c41
-rw-r--r--fs/nfs/file.c12
-rw-r--r--fs/nfs/internal.h8
-rw-r--r--fs/nfs/io.c147
-rw-r--r--include/linux/nfs_fs.h1
6 files changed, 174 insertions, 37 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8664417955a2..6abdda209642 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
6 6
7CFLAGS_nfstrace.o += -I$(src) 7CFLAGS_nfstrace.o += -I$(src)
8nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ 8nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
9 direct.o pagelist.o read.o symlink.o unlink.o \ 9 io.o direct.o pagelist.o read.o symlink.o unlink.o \
10 write.o namespace.o mount_clnt.o nfstrace.o 10 write.o namespace.o mount_clnt.o nfstrace.o
11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o 11nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
12nfs-$(CONFIG_SYSCTL) += sysctl.o 12nfs-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0169eca8eb42..6d0e88096440 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -578,17 +578,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
578 if (!count) 578 if (!count)
579 goto out; 579 goto out;
580 580
581 inode_lock(inode);
582 result = nfs_sync_mapping(mapping);
583 if (result)
584 goto out_unlock;
585
586 task_io_account_read(count); 581 task_io_account_read(count);
587 582
588 result = -ENOMEM; 583 result = -ENOMEM;
589 dreq = nfs_direct_req_alloc(); 584 dreq = nfs_direct_req_alloc();
590 if (dreq == NULL) 585 if (dreq == NULL)
591 goto out_unlock; 586 goto out;
592 587
593 dreq->inode = inode; 588 dreq->inode = inode;
594 dreq->bytes_left = dreq->max_count = count; 589 dreq->bytes_left = dreq->max_count = count;
@@ -603,10 +598,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
603 if (!is_sync_kiocb(iocb)) 598 if (!is_sync_kiocb(iocb))
604 dreq->iocb = iocb; 599 dreq->iocb = iocb;
605 600
601 nfs_start_io_direct(inode);
602
606 NFS_I(inode)->read_io += count; 603 NFS_I(inode)->read_io += count;
607 result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); 604 result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
608 605
609 inode_unlock(inode); 606 nfs_end_io_direct(inode);
610 607
611 if (!result) { 608 if (!result) {
612 result = nfs_direct_wait(dreq); 609 result = nfs_direct_wait(dreq);
@@ -614,13 +611,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
614 iocb->ki_pos += result; 611 iocb->ki_pos += result;
615 } 612 }
616 613
617 nfs_direct_req_release(dreq);
618 return result;
619
620out_release: 614out_release:
621 nfs_direct_req_release(dreq); 615 nfs_direct_req_release(dreq);
622out_unlock:
623 inode_unlock(inode);
624out: 616out:
625 return result; 617 return result;
626} 618}
@@ -1008,25 +1000,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
1008 pos = iocb->ki_pos; 1000 pos = iocb->ki_pos;
1009 end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; 1001 end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
1010 1002
1011 inode_lock(inode);
1012
1013 result = nfs_sync_mapping(mapping);
1014 if (result)
1015 goto out_unlock;
1016
1017 if (mapping->nrpages) {
1018 result = invalidate_inode_pages2_range(mapping,
1019 pos >> PAGE_SHIFT, end);
1020 if (result)
1021 goto out_unlock;
1022 }
1023
1024 task_io_account_write(count); 1003 task_io_account_write(count);
1025 1004
1026 result = -ENOMEM; 1005 result = -ENOMEM;
1027 dreq = nfs_direct_req_alloc(); 1006 dreq = nfs_direct_req_alloc();
1028 if (!dreq) 1007 if (!dreq)
1029 goto out_unlock; 1008 goto out;
1030 1009
1031 dreq->inode = inode; 1010 dreq->inode = inode;
1032 dreq->bytes_left = dreq->max_count = count; 1011 dreq->bytes_left = dreq->max_count = count;
@@ -1041,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
1041 if (!is_sync_kiocb(iocb)) 1020 if (!is_sync_kiocb(iocb))
1042 dreq->iocb = iocb; 1021 dreq->iocb = iocb;
1043 1022
1023 nfs_start_io_direct(inode);
1024
1044 result = nfs_direct_write_schedule_iovec(dreq, iter, pos); 1025 result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
1045 1026
1046 if (mapping->nrpages) { 1027 if (mapping->nrpages) {
@@ -1048,7 +1029,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
1048 pos >> PAGE_SHIFT, end); 1029 pos >> PAGE_SHIFT, end);
1049 } 1030 }
1050 1031
1051 inode_unlock(inode); 1032 nfs_end_io_direct(inode);
1052 1033
1053 if (!result) { 1034 if (!result) {
1054 result = nfs_direct_wait(dreq); 1035 result = nfs_direct_wait(dreq);
@@ -1058,13 +1039,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
1058 generic_write_sync(iocb, result); 1039 generic_write_sync(iocb, result);
1059 } 1040 }
1060 } 1041 }
1061 nfs_direct_req_release(dreq);
1062 return result;
1063
1064out_release: 1042out_release:
1065 nfs_direct_req_release(dreq); 1043 nfs_direct_req_release(dreq);
1066out_unlock: 1044out:
1067 inode_unlock(inode);
1068 return result; 1045 return result;
1069} 1046}
1070 1047
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 46cf0afe3c0f..9f8da9e1b23f 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
170 iocb->ki_filp, 170 iocb->ki_filp,
171 iov_iter_count(to), (unsigned long) iocb->ki_pos); 171 iov_iter_count(to), (unsigned long) iocb->ki_pos);
172 172
173 result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); 173 nfs_start_io_read(inode);
174 result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
174 if (!result) { 175 if (!result) {
175 result = generic_file_read_iter(iocb, to); 176 result = generic_file_read_iter(iocb, to);
176 if (result > 0) 177 if (result > 0)
177 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); 178 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
178 } 179 }
180 nfs_end_io_read(inode);
179 return result; 181 return result;
180} 182}
181EXPORT_SYMBOL_GPL(nfs_file_read); 183EXPORT_SYMBOL_GPL(nfs_file_read);
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
191 dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", 193 dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
192 filp, (unsigned long) count, (unsigned long long) *ppos); 194 filp, (unsigned long) count, (unsigned long long) *ppos);
193 195
194 res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); 196 nfs_start_io_read(inode);
197 res = nfs_revalidate_mapping(inode, filp->f_mapping);
195 if (!res) { 198 if (!res) {
196 res = generic_file_splice_read(filp, ppos, pipe, count, flags); 199 res = generic_file_splice_read(filp, ppos, pipe, count, flags);
197 if (res > 0) 200 if (res > 0)
198 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); 201 nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
199 } 202 }
203 nfs_end_io_read(inode);
200 return res; 204 return res;
201} 205}
202EXPORT_SYMBOL_GPL(nfs_file_splice_read); 206EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@@ -645,14 +649,14 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
645 goto out; 649 goto out;
646 } 650 }
647 651
648 inode_lock(inode); 652 nfs_start_io_write(inode);
649 result = generic_write_checks(iocb, from); 653 result = generic_write_checks(iocb, from);
650 if (result > 0) { 654 if (result > 0) {
651 current->backing_dev_info = inode_to_bdi(inode); 655 current->backing_dev_info = inode_to_bdi(inode);
652 result = generic_perform_write(file, from, iocb->ki_pos); 656 result = generic_perform_write(file, from, iocb->ki_pos);
653 current->backing_dev_info = NULL; 657 current->backing_dev_info = NULL;
654 } 658 }
655 inode_unlock(inode); 659 nfs_end_io_write(inode);
656 if (result <= 0) 660 if (result <= 0)
657 goto out; 661 goto out;
658 662
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 0eb5c924886d..159b64ede82a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -411,6 +411,14 @@ extern void __exit unregister_nfs_fs(void);
411extern bool nfs_sb_active(struct super_block *sb); 411extern bool nfs_sb_active(struct super_block *sb);
412extern void nfs_sb_deactive(struct super_block *sb); 412extern void nfs_sb_deactive(struct super_block *sb);
413 413
414/* io.c */
415extern void nfs_start_io_read(struct inode *inode);
416extern void nfs_end_io_read(struct inode *inode);
417extern void nfs_start_io_write(struct inode *inode);
418extern void nfs_end_io_write(struct inode *inode);
419extern void nfs_start_io_direct(struct inode *inode);
420extern void nfs_end_io_direct(struct inode *inode);
421
414/* namespace.c */ 422/* namespace.c */
415#define NFS_PATH_CANONICAL 1 423#define NFS_PATH_CANONICAL 1
416extern char *nfs_path(char **p, struct dentry *dentry, 424extern char *nfs_path(char **p, struct dentry *dentry,
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644
index 000000000000..1fc5d1ce327e
--- /dev/null
+++ b/fs/nfs/io.c
@@ -0,0 +1,147 @@
1/*
2 * Copyright (c) 2016 Trond Myklebust
3 *
4 * I/O and data path helper functionality.
5 */
6
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/bitops.h>
10#include <linux/rwsem.h>
11#include <linux/fs.h>
12#include <linux/nfs_fs.h>
13
14#include "internal.h"
15
16/* Call with exclusively locked inode->i_rwsem */
17static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
18{
19 if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
20 clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
21 inode_dio_wait(inode);
22 }
23}
24
25/**
26 * nfs_start_io_read - declare the file is being used for buffered reads
27 * @inode - file inode
28 *
29 * Declare that a buffered read operation is about to start, and ensure
30 * that we block all direct I/O.
31 * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
32 * and holds a shared lock on inode->i_rwsem to ensure that the flag
33 * cannot be changed.
34 * In practice, this means that buffered read operations are allowed to
35 * execute in parallel, thanks to the shared lock, whereas direct I/O
36 * operations need to wait to grab an exclusive lock in order to set
37 * NFS_INO_ODIRECT.
38 * Note that buffered writes and truncates both take a write lock on
39 * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
40 */
41void
42nfs_start_io_read(struct inode *inode)
43{
44 struct nfs_inode *nfsi = NFS_I(inode);
45 /* Be an optimist! */
46 down_read(&inode->i_rwsem);
47 if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
48 return;
49 up_read(&inode->i_rwsem);
50 /* Slow path.... */
51 down_write(&inode->i_rwsem);
52 nfs_block_o_direct(nfsi, inode);
53 downgrade_write(&inode->i_rwsem);
54}
55
56/**
57 * nfs_end_io_read - declare that the buffered read operation is done
58 * @inode - file inode
59 *
60 * Declare that a buffered read operation is done, and release the shared
61 * lock on inode->i_rwsem.
62 */
63void
64nfs_end_io_read(struct inode *inode)
65{
66 up_read(&inode->i_rwsem);
67}
68
69/**
70 * nfs_start_io_write - declare the file is being used for buffered writes
71 * @inode - file inode
72 *
73 * Declare that a buffered write operation is about to start, and ensure
74 * that we block all direct I/O.
75 */
76void
77nfs_start_io_write(struct inode *inode)
78{
79 down_write(&inode->i_rwsem);
80 nfs_block_o_direct(NFS_I(inode), inode);
81}
82
83/**
84 * nfs_end_io_write - declare that the buffered write operation is done
85 * @inode - file inode
86 *
87 * Declare that a buffered write operation is done, and release the
88 * lock on inode->i_rwsem.
89 */
90void
91nfs_end_io_write(struct inode *inode)
92{
93 up_write(&inode->i_rwsem);
94}
95
96/* Call with exclusively locked inode->i_rwsem */
97static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
98{
99 if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
100 set_bit(NFS_INO_ODIRECT, &nfsi->flags);
101 nfs_wb_all(inode);
102 }
103}
104
105/**
106 * nfs_start_io_direct - declare the file is being used for direct i/o
107 * @inode - file inode
108 *
109 * Declare that a direct I/O operation is about to start, and ensure
110 * that we block all buffered I/O.
111 * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
112 * and holds a shared lock on inode->i_rwsem to ensure that the flag
113 * cannot be changed.
114 * In practice, this means that direct I/O operations are allowed to
115 * execute in parallel, thanks to the shared lock, whereas buffered I/O
116 * operations need to wait to grab an exclusive lock in order to clear
117 * NFS_INO_ODIRECT.
118 * Note that buffered writes and truncates both take a write lock on
119 * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
120 */
121void
122nfs_start_io_direct(struct inode *inode)
123{
124 struct nfs_inode *nfsi = NFS_I(inode);
125 /* Be an optimist! */
126 down_read(&inode->i_rwsem);
127 if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
128 return;
129 up_read(&inode->i_rwsem);
130 /* Slow path.... */
131 down_write(&inode->i_rwsem);
132 nfs_block_buffered(nfsi, inode);
133 downgrade_write(&inode->i_rwsem);
134}
135
136/**
137 * nfs_end_io_direct - declare that the direct i/o operation is done
138 * @inode - file inode
139 *
140 * Declare that a direct I/O operation is done, and release the shared
141 * lock on inode->i_rwsem.
142 */
143void
144nfs_end_io_direct(struct inode *inode)
145{
146 up_read(&inode->i_rwsem);
147}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 120dd04b553c..225d17d35277 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -210,6 +210,7 @@ struct nfs_inode {
210#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ 210#define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */
211#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ 211#define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */
212#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ 212#define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */
213#define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */
213 214
214static inline struct nfs_inode *NFS_I(const struct inode *inode) 215static inline struct nfs_inode *NFS_I(const struct inode *inode)
215{ 216{