diff options
author | Trond Myklebust <trond.myklebust@primarydata.com> | 2016-06-03 17:07:19 -0400 |
---|---|---|
committer | Trond Myklebust <trond.myklebust@primarydata.com> | 2016-07-05 19:11:04 -0400 |
commit | a5864c999de6703f7ce908f72337568520c6cad3 (patch) | |
tree | c1136f64cb88078a28bfb177d9f0f1e1745716d2 /fs | |
parent | 18290650b1c8655cfe6e0d63dd34942a037a130b (diff) |
NFS: Do not serialise O_DIRECT reads and writes
Allow dio requests to be scheduled in parallel, while ensuring that they
do not conflict with buffered I/O.
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/nfs/Makefile | 2 | ||||
-rw-r--r-- | fs/nfs/direct.c | 41 | ||||
-rw-r--r-- | fs/nfs/file.c | 12 | ||||
-rw-r--r-- | fs/nfs/internal.h | 8 | ||||
-rw-r--r-- | fs/nfs/io.c | 147 |
5 files changed, 173 insertions, 37 deletions
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 8664417955a2..6abdda209642 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile | |||
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o | |||
6 | 6 | ||
7 | CFLAGS_nfstrace.o += -I$(src) | 7 | CFLAGS_nfstrace.o += -I$(src) |
8 | nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ | 8 | nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ |
9 | direct.o pagelist.o read.o symlink.o unlink.o \ | 9 | io.o direct.o pagelist.o read.o symlink.o unlink.o \ |
10 | write.o namespace.o mount_clnt.o nfstrace.o | 10 | write.o namespace.o mount_clnt.o nfstrace.o |
11 | nfs-$(CONFIG_ROOT_NFS) += nfsroot.o | 11 | nfs-$(CONFIG_ROOT_NFS) += nfsroot.o |
12 | nfs-$(CONFIG_SYSCTL) += sysctl.o | 12 | nfs-$(CONFIG_SYSCTL) += sysctl.o |
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0169eca8eb42..6d0e88096440 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c | |||
@@ -578,17 +578,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) | |||
578 | if (!count) | 578 | if (!count) |
579 | goto out; | 579 | goto out; |
580 | 580 | ||
581 | inode_lock(inode); | ||
582 | result = nfs_sync_mapping(mapping); | ||
583 | if (result) | ||
584 | goto out_unlock; | ||
585 | |||
586 | task_io_account_read(count); | 581 | task_io_account_read(count); |
587 | 582 | ||
588 | result = -ENOMEM; | 583 | result = -ENOMEM; |
589 | dreq = nfs_direct_req_alloc(); | 584 | dreq = nfs_direct_req_alloc(); |
590 | if (dreq == NULL) | 585 | if (dreq == NULL) |
591 | goto out_unlock; | 586 | goto out; |
592 | 587 | ||
593 | dreq->inode = inode; | 588 | dreq->inode = inode; |
594 | dreq->bytes_left = dreq->max_count = count; | 589 | dreq->bytes_left = dreq->max_count = count; |
@@ -603,10 +598,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) | |||
603 | if (!is_sync_kiocb(iocb)) | 598 | if (!is_sync_kiocb(iocb)) |
604 | dreq->iocb = iocb; | 599 | dreq->iocb = iocb; |
605 | 600 | ||
601 | nfs_start_io_direct(inode); | ||
602 | |||
606 | NFS_I(inode)->read_io += count; | 603 | NFS_I(inode)->read_io += count; |
607 | result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); | 604 | result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); |
608 | 605 | ||
609 | inode_unlock(inode); | 606 | nfs_end_io_direct(inode); |
610 | 607 | ||
611 | if (!result) { | 608 | if (!result) { |
612 | result = nfs_direct_wait(dreq); | 609 | result = nfs_direct_wait(dreq); |
@@ -614,13 +611,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) | |||
614 | iocb->ki_pos += result; | 611 | iocb->ki_pos += result; |
615 | } | 612 | } |
616 | 613 | ||
617 | nfs_direct_req_release(dreq); | ||
618 | return result; | ||
619 | |||
620 | out_release: | 614 | out_release: |
621 | nfs_direct_req_release(dreq); | 615 | nfs_direct_req_release(dreq); |
622 | out_unlock: | ||
623 | inode_unlock(inode); | ||
624 | out: | 616 | out: |
625 | return result; | 617 | return result; |
626 | } | 618 | } |
@@ -1008,25 +1000,12 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) | |||
1008 | pos = iocb->ki_pos; | 1000 | pos = iocb->ki_pos; |
1009 | end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; | 1001 | end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT; |
1010 | 1002 | ||
1011 | inode_lock(inode); | ||
1012 | |||
1013 | result = nfs_sync_mapping(mapping); | ||
1014 | if (result) | ||
1015 | goto out_unlock; | ||
1016 | |||
1017 | if (mapping->nrpages) { | ||
1018 | result = invalidate_inode_pages2_range(mapping, | ||
1019 | pos >> PAGE_SHIFT, end); | ||
1020 | if (result) | ||
1021 | goto out_unlock; | ||
1022 | } | ||
1023 | |||
1024 | task_io_account_write(count); | 1003 | task_io_account_write(count); |
1025 | 1004 | ||
1026 | result = -ENOMEM; | 1005 | result = -ENOMEM; |
1027 | dreq = nfs_direct_req_alloc(); | 1006 | dreq = nfs_direct_req_alloc(); |
1028 | if (!dreq) | 1007 | if (!dreq) |
1029 | goto out_unlock; | 1008 | goto out; |
1030 | 1009 | ||
1031 | dreq->inode = inode; | 1010 | dreq->inode = inode; |
1032 | dreq->bytes_left = dreq->max_count = count; | 1011 | dreq->bytes_left = dreq->max_count = count; |
@@ -1041,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) | |||
1041 | if (!is_sync_kiocb(iocb)) | 1020 | if (!is_sync_kiocb(iocb)) |
1042 | dreq->iocb = iocb; | 1021 | dreq->iocb = iocb; |
1043 | 1022 | ||
1023 | nfs_start_io_direct(inode); | ||
1024 | |||
1044 | result = nfs_direct_write_schedule_iovec(dreq, iter, pos); | 1025 | result = nfs_direct_write_schedule_iovec(dreq, iter, pos); |
1045 | 1026 | ||
1046 | if (mapping->nrpages) { | 1027 | if (mapping->nrpages) { |
@@ -1048,7 +1029,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) | |||
1048 | pos >> PAGE_SHIFT, end); | 1029 | pos >> PAGE_SHIFT, end); |
1049 | } | 1030 | } |
1050 | 1031 | ||
1051 | inode_unlock(inode); | 1032 | nfs_end_io_direct(inode); |
1052 | 1033 | ||
1053 | if (!result) { | 1034 | if (!result) { |
1054 | result = nfs_direct_wait(dreq); | 1035 | result = nfs_direct_wait(dreq); |
@@ -1058,13 +1039,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) | |||
1058 | generic_write_sync(iocb, result); | 1039 | generic_write_sync(iocb, result); |
1059 | } | 1040 | } |
1060 | } | 1041 | } |
1061 | nfs_direct_req_release(dreq); | ||
1062 | return result; | ||
1063 | |||
1064 | out_release: | 1042 | out_release: |
1065 | nfs_direct_req_release(dreq); | 1043 | nfs_direct_req_release(dreq); |
1066 | out_unlock: | 1044 | out: |
1067 | inode_unlock(inode); | ||
1068 | return result; | 1045 | return result; |
1069 | } | 1046 | } |
1070 | 1047 | ||
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 46cf0afe3c0f..9f8da9e1b23f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) | |||
170 | iocb->ki_filp, | 170 | iocb->ki_filp, |
171 | iov_iter_count(to), (unsigned long) iocb->ki_pos); | 171 | iov_iter_count(to), (unsigned long) iocb->ki_pos); |
172 | 172 | ||
173 | result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping); | 173 | nfs_start_io_read(inode); |
174 | result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); | ||
174 | if (!result) { | 175 | if (!result) { |
175 | result = generic_file_read_iter(iocb, to); | 176 | result = generic_file_read_iter(iocb, to); |
176 | if (result > 0) | 177 | if (result > 0) |
177 | nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); | 178 | nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result); |
178 | } | 179 | } |
180 | nfs_end_io_read(inode); | ||
179 | return result; | 181 | return result; |
180 | } | 182 | } |
181 | EXPORT_SYMBOL_GPL(nfs_file_read); | 183 | EXPORT_SYMBOL_GPL(nfs_file_read); |
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos, | |||
191 | dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", | 193 | dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n", |
192 | filp, (unsigned long) count, (unsigned long long) *ppos); | 194 | filp, (unsigned long) count, (unsigned long long) *ppos); |
193 | 195 | ||
194 | res = nfs_revalidate_mapping_protected(inode, filp->f_mapping); | 196 | nfs_start_io_read(inode); |
197 | res = nfs_revalidate_mapping(inode, filp->f_mapping); | ||
195 | if (!res) { | 198 | if (!res) { |
196 | res = generic_file_splice_read(filp, ppos, pipe, count, flags); | 199 | res = generic_file_splice_read(filp, ppos, pipe, count, flags); |
197 | if (res > 0) | 200 | if (res > 0) |
198 | nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); | 201 | nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res); |
199 | } | 202 | } |
203 | nfs_end_io_read(inode); | ||
200 | return res; | 204 | return res; |
201 | } | 205 | } |
202 | EXPORT_SYMBOL_GPL(nfs_file_splice_read); | 206 | EXPORT_SYMBOL_GPL(nfs_file_splice_read); |
@@ -645,14 +649,14 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) | |||
645 | goto out; | 649 | goto out; |
646 | } | 650 | } |
647 | 651 | ||
648 | inode_lock(inode); | 652 | nfs_start_io_write(inode); |
649 | result = generic_write_checks(iocb, from); | 653 | result = generic_write_checks(iocb, from); |
650 | if (result > 0) { | 654 | if (result > 0) { |
651 | current->backing_dev_info = inode_to_bdi(inode); | 655 | current->backing_dev_info = inode_to_bdi(inode); |
652 | result = generic_perform_write(file, from, iocb->ki_pos); | 656 | result = generic_perform_write(file, from, iocb->ki_pos); |
653 | current->backing_dev_info = NULL; | 657 | current->backing_dev_info = NULL; |
654 | } | 658 | } |
655 | inode_unlock(inode); | 659 | nfs_end_io_write(inode); |
656 | if (result <= 0) | 660 | if (result <= 0) |
657 | goto out; | 661 | goto out; |
658 | 662 | ||
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 0eb5c924886d..159b64ede82a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
@@ -411,6 +411,14 @@ extern void __exit unregister_nfs_fs(void); | |||
411 | extern bool nfs_sb_active(struct super_block *sb); | 411 | extern bool nfs_sb_active(struct super_block *sb); |
412 | extern void nfs_sb_deactive(struct super_block *sb); | 412 | extern void nfs_sb_deactive(struct super_block *sb); |
413 | 413 | ||
414 | /* io.c */ | ||
415 | extern void nfs_start_io_read(struct inode *inode); | ||
416 | extern void nfs_end_io_read(struct inode *inode); | ||
417 | extern void nfs_start_io_write(struct inode *inode); | ||
418 | extern void nfs_end_io_write(struct inode *inode); | ||
419 | extern void nfs_start_io_direct(struct inode *inode); | ||
420 | extern void nfs_end_io_direct(struct inode *inode); | ||
421 | |||
414 | /* namespace.c */ | 422 | /* namespace.c */ |
415 | #define NFS_PATH_CANONICAL 1 | 423 | #define NFS_PATH_CANONICAL 1 |
416 | extern char *nfs_path(char **p, struct dentry *dentry, | 424 | extern char *nfs_path(char **p, struct dentry *dentry, |
diff --git a/fs/nfs/io.c b/fs/nfs/io.c new file mode 100644 index 000000000000..1fc5d1ce327e --- /dev/null +++ b/fs/nfs/io.c | |||
@@ -0,0 +1,147 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2016 Trond Myklebust | ||
3 | * | ||
4 | * I/O and data path helper functionality. | ||
5 | */ | ||
6 | |||
7 | #include <linux/types.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/bitops.h> | ||
10 | #include <linux/rwsem.h> | ||
11 | #include <linux/fs.h> | ||
12 | #include <linux/nfs_fs.h> | ||
13 | |||
14 | #include "internal.h" | ||
15 | |||
16 | /* Call with exclusively locked inode->i_rwsem */ | ||
17 | static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) | ||
18 | { | ||
19 | if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { | ||
20 | clear_bit(NFS_INO_ODIRECT, &nfsi->flags); | ||
21 | inode_dio_wait(inode); | ||
22 | } | ||
23 | } | ||
24 | |||
25 | /** | ||
26 | * nfs_start_io_read - declare the file is being used for buffered reads | ||
27 | * @inode: file inode | |
28 | * | ||
29 | * Declare that a buffered read operation is about to start, and ensure | ||
30 | * that we block all direct I/O. | ||
31 | * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset, | ||
32 | * and holds a shared lock on inode->i_rwsem to ensure that the flag | ||
33 | * cannot be changed. | ||
34 | * In practice, this means that buffered read operations are allowed to | ||
35 | * execute in parallel, thanks to the shared lock, whereas direct I/O | ||
36 | * operations need to wait to grab an exclusive lock in order to set | ||
37 | * NFS_INO_ODIRECT. | ||
38 | * Note that buffered writes and truncates both take a write lock on | ||
39 | * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. | ||
40 | */ | ||
41 | void | ||
42 | nfs_start_io_read(struct inode *inode) | ||
43 | { | ||
44 | struct nfs_inode *nfsi = NFS_I(inode); | ||
45 | /* Be an optimist! */ | ||
46 | down_read(&inode->i_rwsem); | ||
47 | if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) | ||
48 | return; | ||
49 | up_read(&inode->i_rwsem); | ||
50 | /* Slow path.... */ | ||
51 | down_write(&inode->i_rwsem); | ||
52 | nfs_block_o_direct(nfsi, inode); | ||
53 | downgrade_write(&inode->i_rwsem); | ||
54 | } | ||
55 | |||
56 | /** | ||
57 | * nfs_end_io_read - declare that the buffered read operation is done | ||
58 | * @inode: file inode | |
59 | * | ||
60 | * Declare that a buffered read operation is done, and release the shared | ||
61 | * lock on inode->i_rwsem. | ||
62 | */ | ||
63 | void | ||
64 | nfs_end_io_read(struct inode *inode) | ||
65 | { | ||
66 | up_read(&inode->i_rwsem); | ||
67 | } | ||
68 | |||
69 | /** | ||
70 | * nfs_start_io_write - declare the file is being used for buffered writes | ||
71 | * @inode: file inode | |
72 | * | |
73 | * Declare that a buffered write operation is about to start, and ensure | |
74 | * that we block all direct I/O. | ||
75 | */ | ||
76 | void | ||
77 | nfs_start_io_write(struct inode *inode) | ||
78 | { | ||
79 | down_write(&inode->i_rwsem); | ||
80 | nfs_block_o_direct(NFS_I(inode), inode); | ||
81 | } | ||
82 | |||
83 | /** | ||
84 | * nfs_end_io_write - declare that the buffered write operation is done | ||
85 | * @inode: file inode | |
86 | * | ||
87 | * Declare that a buffered write operation is done, and release the | ||
88 | * lock on inode->i_rwsem. | ||
89 | */ | ||
90 | void | ||
91 | nfs_end_io_write(struct inode *inode) | ||
92 | { | ||
93 | up_write(&inode->i_rwsem); | ||
94 | } | ||
95 | |||
96 | /* Call with exclusively locked inode->i_rwsem */ | ||
97 | static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) | ||
98 | { | ||
99 | if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) { | ||
100 | set_bit(NFS_INO_ODIRECT, &nfsi->flags); | ||
101 | nfs_wb_all(inode); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | /** | ||
106 | * nfs_start_io_direct - declare the file is being used for direct i/o | |
107 | * @inode: file inode | |
108 | * | ||
109 | * Declare that a direct I/O operation is about to start, and ensure | ||
110 | * that we block all buffered I/O. | ||
111 | * On exit, the function ensures that the NFS_INO_ODIRECT flag is set, | ||
112 | * and holds a shared lock on inode->i_rwsem to ensure that the flag | ||
113 | * cannot be changed. | ||
114 | * In practice, this means that direct I/O operations are allowed to | ||
115 | * execute in parallel, thanks to the shared lock, whereas buffered I/O | ||
116 | * operations need to wait to grab an exclusive lock in order to clear | ||
117 | * NFS_INO_ODIRECT. | ||
118 | * Note that buffered writes and truncates both take a write lock on | ||
119 | * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. | ||
120 | */ | ||
121 | void | ||
122 | nfs_start_io_direct(struct inode *inode) | ||
123 | { | ||
124 | struct nfs_inode *nfsi = NFS_I(inode); | ||
125 | /* Be an optimist! */ | ||
126 | down_read(&inode->i_rwsem); | ||
127 | if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) | ||
128 | return; | ||
129 | up_read(&inode->i_rwsem); | ||
130 | /* Slow path.... */ | ||
131 | down_write(&inode->i_rwsem); | ||
132 | nfs_block_buffered(nfsi, inode); | ||
133 | downgrade_write(&inode->i_rwsem); | ||
134 | } | ||
135 | |||
136 | /** | ||
137 | * nfs_end_io_direct - declare that the direct i/o operation is done | ||
138 | * @inode: file inode | |
139 | * | ||
140 | * Declare that a direct I/O operation is done, and release the shared | ||
141 | * lock on inode->i_rwsem. | ||
142 | */ | ||
143 | void | ||
144 | nfs_end_io_direct(struct inode *inode) | ||
145 | { | ||
146 | up_read(&inode->i_rwsem); | ||
147 | } | ||