summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Jeff Layton <jlayton@kernel.org> 2019-08-02 13:15:39 -0400
committer: Ilya Dryomov <idryomov@gmail.com> 2019-09-16 06:06:25 -0400
commit: 321fe13c939876b55fbb7780a243c86e577a2151 (patch)
tree: 9963961399b7648c1207012d31b1dc51f50a389c
parent: 4766815b1179dbe4fe263a5f95795710e29276e2 (diff)
ceph: add buffered/direct exclusionary locking for reads and writes
xfstest generic/451 intermittently fails. The test does O_DIRECT writes to a file, and then reads back the result using buffered I/O, while running a separate set of tasks that are also doing buffered reads. The client will invalidate the cache prior to a direct write, but it's easy for one of the other readers' replies to race in and reinstantiate the invalidated range with stale data. To fix this, we must serialize direct I/O writes and buffered reads. We could just sprinkle in some shared locks on the i_rwsem for reads, and increase the exclusive footprint on the write side, but that would cause O_DIRECT writes to end up serialized vs. other direct requests. Instead, borrow the scheme used by nfs.ko. Buffered writes take the i_rwsem exclusively, but buffered reads take a shared lock, allowing them to run in parallel. O_DIRECT requests also take a shared lock, but we need them not to run in parallel with buffered reads. A flag on the ceph_inode_info is used to indicate whether it's in direct or buffered I/O mode. When a conflicting request is submitted, it will block until the inode can be flipped to the necessary mode. Link: https://tracker.ceph.com/issues/40985 Signed-off-by: Jeff Layton <jlayton@kernel.org> Reviewed-by: "Yan, Zheng" <zyan@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
-rw-r--r-- fs/ceph/Makefile 2
-rw-r--r-- fs/ceph/file.c 35
-rw-r--r-- fs/ceph/io.c 163
-rw-r--r-- fs/ceph/io.h 12
-rw-r--r-- fs/ceph/super.h 4
5 files changed, 200 insertions, 16 deletions
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index a699e320393f..c1da294418d1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@
6obj-$(CONFIG_CEPH_FS) += ceph.o 6obj-$(CONFIG_CEPH_FS) += ceph.o
7 7
8ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 8ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
9 export.o caps.o snap.o xattr.o quota.o \ 9 export.o caps.o snap.o xattr.o quota.o io.o \
10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 10 mds_client.o mdsmap.o strings.o ceph_frag.o \
11 debugfs.o 11 debugfs.o
12 12
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 5182e1a49d6f..ff17a81bf2e2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -15,6 +15,7 @@
15#include "super.h" 15#include "super.h"
16#include "mds_client.h" 16#include "mds_client.h"
17#include "cache.h" 17#include "cache.h"
18#include "io.h"
18 19
19static __le32 ceph_flags_sys2wire(u32 flags) 20static __le32 ceph_flags_sys2wire(u32 flags)
20{ 21{
@@ -930,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
930 struct ceph_aio_request *aio_req = NULL; 931 struct ceph_aio_request *aio_req = NULL;
931 int num_pages = 0; 932 int num_pages = 0;
932 int flags; 933 int flags;
933 int ret; 934 int ret = 0;
934 struct timespec64 mtime = current_time(inode); 935 struct timespec64 mtime = current_time(inode);
935 size_t count = iov_iter_count(iter); 936 size_t count = iov_iter_count(iter);
936 loff_t pos = iocb->ki_pos; 937 loff_t pos = iocb->ki_pos;
@@ -944,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
944 (write ? "write" : "read"), file, pos, (unsigned)count, 945 (write ? "write" : "read"), file, pos, (unsigned)count,
945 snapc, snapc ? snapc->seq : 0); 946 snapc, snapc ? snapc->seq : 0);
946 947
947 ret = filemap_write_and_wait_range(inode->i_mapping,
948 pos, pos + count - 1);
949 if (ret < 0)
950 return ret;
951
952 if (write) { 948 if (write) {
953 int ret2 = invalidate_inode_pages2_range(inode->i_mapping, 949 int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
954 pos >> PAGE_SHIFT, 950 pos >> PAGE_SHIFT,
@@ -1284,12 +1280,16 @@ again:
1284 1280
1285 if (ci->i_inline_version == CEPH_INLINE_NONE) { 1281 if (ci->i_inline_version == CEPH_INLINE_NONE) {
1286 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 1282 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
1283 ceph_start_io_direct(inode);
1287 ret = ceph_direct_read_write(iocb, to, 1284 ret = ceph_direct_read_write(iocb, to,
1288 NULL, NULL); 1285 NULL, NULL);
1286 ceph_end_io_direct(inode);
1289 if (ret >= 0 && ret < len) 1287 if (ret >= 0 && ret < len)
1290 retry_op = CHECK_EOF; 1288 retry_op = CHECK_EOF;
1291 } else { 1289 } else {
1290 ceph_start_io_read(inode);
1292 ret = ceph_sync_read(iocb, to, &retry_op); 1291 ret = ceph_sync_read(iocb, to, &retry_op);
1292 ceph_end_io_read(inode);
1293 } 1293 }
1294 } else { 1294 } else {
1295 retry_op = READ_INLINE; 1295 retry_op = READ_INLINE;
@@ -1300,7 +1300,9 @@ again:
1300 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1300 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
1301 ceph_cap_string(got)); 1301 ceph_cap_string(got));
1302 ceph_add_rw_context(fi, &rw_ctx); 1302 ceph_add_rw_context(fi, &rw_ctx);
1303 ceph_start_io_read(inode);
1303 ret = generic_file_read_iter(iocb, to); 1304 ret = generic_file_read_iter(iocb, to);
1305 ceph_end_io_read(inode);
1304 ceph_del_rw_context(fi, &rw_ctx); 1306 ceph_del_rw_context(fi, &rw_ctx);
1305 } 1307 }
1306 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 1308 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
@@ -1409,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
1409 return -ENOMEM; 1411 return -ENOMEM;
1410 1412
1411retry_snap: 1413retry_snap:
1412 inode_lock(inode); 1414 if (iocb->ki_flags & IOCB_DIRECT)
1415 ceph_start_io_direct(inode);
1416 else
1417 ceph_start_io_write(inode);
1413 1418
1414 /* We can write back this queue in page reclaim */ 1419 /* We can write back this queue in page reclaim */
1415 current->backing_dev_info = inode_to_bdi(inode); 1420 current->backing_dev_info = inode_to_bdi(inode);
@@ -1480,7 +1485,6 @@ retry_snap:
1480 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { 1485 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
1481 struct ceph_snap_context *snapc; 1486 struct ceph_snap_context *snapc;
1482 struct iov_iter data; 1487 struct iov_iter data;
1483 inode_unlock(inode);
1484 1488
1485 spin_lock(&ci->i_ceph_lock); 1489 spin_lock(&ci->i_ceph_lock);
1486 if (__ceph_have_pending_cap_snap(ci)) { 1490 if (__ceph_have_pending_cap_snap(ci)) {
@@ -1497,11 +1501,14 @@ retry_snap:
1497 1501
1498 /* we might need to revert back to that point */ 1502 /* we might need to revert back to that point */
1499 data = *from; 1503 data = *from;
1500 if (iocb->ki_flags & IOCB_DIRECT) 1504 if (iocb->ki_flags & IOCB_DIRECT) {
1501 written = ceph_direct_read_write(iocb, &data, snapc, 1505 written = ceph_direct_read_write(iocb, &data, snapc,
1502 &prealloc_cf); 1506 &prealloc_cf);
1503 else 1507 ceph_end_io_direct(inode);
1508 } else {
1504 written = ceph_sync_write(iocb, &data, pos, snapc); 1509 written = ceph_sync_write(iocb, &data, pos, snapc);
1510 ceph_end_io_write(inode);
1511 }
1505 if (written > 0) 1512 if (written > 0)
1506 iov_iter_advance(from, written); 1513 iov_iter_advance(from, written);
1507 ceph_put_snap_context(snapc); 1514 ceph_put_snap_context(snapc);
@@ -1516,7 +1523,7 @@ retry_snap:
1516 written = generic_perform_write(file, from, pos); 1523 written = generic_perform_write(file, from, pos);
1517 if (likely(written >= 0)) 1524 if (likely(written >= 0))
1518 iocb->ki_pos = pos + written; 1525 iocb->ki_pos = pos + written;
1519 inode_unlock(inode); 1526 ceph_end_io_write(inode);
1520 } 1527 }
1521 1528
1522 if (written >= 0) { 1529 if (written >= 0) {
@@ -1551,9 +1558,11 @@ retry_snap:
1551 } 1558 }
1552 1559
1553 goto out_unlocked; 1560 goto out_unlocked;
1554
1555out: 1561out:
1556 inode_unlock(inode); 1562 if (iocb->ki_flags & IOCB_DIRECT)
1563 ceph_end_io_direct(inode);
1564 else
1565 ceph_end_io_write(inode);
1557out_unlocked: 1566out_unlocked:
1558 ceph_free_cap_flush(prealloc_cf); 1567 ceph_free_cap_flush(prealloc_cf);
1559 current->backing_dev_info = NULL; 1568 current->backing_dev_info = NULL;
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
new file mode 100644
index 000000000000..97602ea92ff4
--- /dev/null
+++ b/fs/ceph/io.c
@@ -0,0 +1,163 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2016 Trond Myklebust
4 * Copyright (c) 2019 Jeff Layton
5 *
6 * I/O and data path helper functionality.
7 *
8 * Heavily borrowed from equivalent code in fs/nfs/io.c
9 */
10
11#include <linux/ceph/ceph_debug.h>
12
13#include <linux/types.h>
14#include <linux/kernel.h>
15#include <linux/rwsem.h>
16#include <linux/fs.h>
17
18#include "super.h"
19#include "io.h"
20
21/* Call with exclusively locked inode->i_rwsem */
22static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
23{
24 lockdep_assert_held_write(&inode->i_rwsem);
25
26 if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
27 spin_lock(&ci->i_ceph_lock);
28 ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
29 spin_unlock(&ci->i_ceph_lock);
30 inode_dio_wait(inode);
31 }
32}
33
34/**
35 * ceph_start_io_read - declare the file is being used for buffered reads
36 * @inode: file inode
37 *
38 * Declare that a buffered read operation is about to start, and ensure
39 * that we block all direct I/O.
40 * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
41 * and holds a shared lock on inode->i_rwsem to ensure that the flag
42 * cannot be changed.
43 * In practice, this means that buffered read operations are allowed to
44 * execute in parallel, thanks to the shared lock, whereas direct I/O
45 * operations need to wait to grab an exclusive lock in order to set
46 * CEPH_I_ODIRECT.
47 * Note that buffered writes and truncates both take a write lock on
48 * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
49 */
50void
51ceph_start_io_read(struct inode *inode)
52{
53 struct ceph_inode_info *ci = ceph_inode(inode);
54
55 /* Be an optimist! */
56 down_read(&inode->i_rwsem);
57 if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
58 return;
59 up_read(&inode->i_rwsem);
60 /* Slow path.... */
61 down_write(&inode->i_rwsem);
62 ceph_block_o_direct(ci, inode);
63 downgrade_write(&inode->i_rwsem);
64}
65
66/**
67 * ceph_end_io_read - declare that the buffered read operation is done
68 * @inode: file inode
69 *
70 * Declare that a buffered read operation is done, and release the shared
71 * lock on inode->i_rwsem.
72 */
73void
74ceph_end_io_read(struct inode *inode)
75{
76 up_read(&inode->i_rwsem);
77}
78
79/**
80 * ceph_start_io_write - declare the file is being used for buffered writes
81 * @inode: file inode
82 *
83 * Declare that a buffered write operation is about to start, and ensure
84 * that we block all direct I/O.
85 */
86void
87ceph_start_io_write(struct inode *inode)
88{
89 down_write(&inode->i_rwsem);
90 ceph_block_o_direct(ceph_inode(inode), inode);
91}
92
93/**
94 * ceph_end_io_write - declare that the buffered write operation is done
95 * @inode: file inode
96 *
97 * Declare that a buffered write operation is done, and release the
98 * lock on inode->i_rwsem.
99 */
100void
101ceph_end_io_write(struct inode *inode)
102{
103 up_write(&inode->i_rwsem);
104}
105
106/* Call with exclusively locked inode->i_rwsem */
107static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
108{
109 lockdep_assert_held_write(&inode->i_rwsem);
110
111 if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
112 spin_lock(&ci->i_ceph_lock);
113 ci->i_ceph_flags |= CEPH_I_ODIRECT;
114 spin_unlock(&ci->i_ceph_lock);
115 /* FIXME: unmap_mapping_range? */
116 filemap_write_and_wait(inode->i_mapping);
117 }
118}
119
120/**
121 * ceph_start_io_direct - declare the file is being used for direct i/o
122 * @inode: file inode
123 *
124 * Declare that a direct I/O operation is about to start, and ensure
125 * that we block all buffered I/O.
126 * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
127 * and holds a shared lock on inode->i_rwsem to ensure that the flag
128 * cannot be changed.
129 * In practice, this means that direct I/O operations are allowed to
130 * execute in parallel, thanks to the shared lock, whereas buffered I/O
131 * operations need to wait to grab an exclusive lock in order to clear
132 * CEPH_I_ODIRECT.
133 * Note that buffered writes and truncates both take a write lock on
134 * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
135 */
136void
137ceph_start_io_direct(struct inode *inode)
138{
139 struct ceph_inode_info *ci = ceph_inode(inode);
140
141 /* Be an optimist! */
142 down_read(&inode->i_rwsem);
143 if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
144 return;
145 up_read(&inode->i_rwsem);
146 /* Slow path.... */
147 down_write(&inode->i_rwsem);
148 ceph_block_buffered(ci, inode);
149 downgrade_write(&inode->i_rwsem);
150}
151
152/**
153 * ceph_end_io_direct - declare that the direct i/o operation is done
154 * @inode: file inode
155 *
156 * Declare that a direct I/O operation is done, and release the shared
157 * lock on inode->i_rwsem.
158 */
159void
160ceph_end_io_direct(struct inode *inode)
161{
162 up_read(&inode->i_rwsem);
163}
diff --git a/fs/ceph/io.h b/fs/ceph/io.h
new file mode 100644
index 000000000000..fa594cd77348
--- /dev/null
+++ b/fs/ceph/io.h
@@ -0,0 +1,12 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _FS_CEPH_IO_H
3#define _FS_CEPH_IO_H
4
5void ceph_start_io_read(struct inode *inode);
6void ceph_end_io_read(struct inode *inode);
7void ceph_start_io_write(struct inode *inode);
8void ceph_end_io_write(struct inode *inode);
9void ceph_start_io_direct(struct inode *inode);
10void ceph_end_io_direct(struct inode *inode);
11
12#endif /* _FS_CEPH_IO_H */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 03e4828c7635..98d7190289c8 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -513,10 +513,10 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
513#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ 513#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
514#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ 514#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
515#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ 515#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
516#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snaps */ 516#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snaps */
517#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ 517#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
518#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ 518#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
519 519#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
520 520
521/* 521/*
522 * Masks of ceph inode work. 522 * Masks of ceph inode work.