diff options
author | Matthew Wilcox <matthew.r.wilcox@intel.com> | 2015-02-16 18:58:56 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-16 20:56:03 -0500 |
commit | d475c6346a38aef3058eba96867bfa726a3cc940 (patch) | |
tree | 4d69d0f50a4a8e649a751dca8f710485848c0249 /fs/dax.c | |
parent | fbbbad4bc2101e452b24e6e65d3d5e11314a0b5f (diff) |
dax,ext2: replace XIP read and write with DAX I/O
Use the generic AIO infrastructure instead of custom read and write
methods. In addition to giving us support for AIO, this adds the missing
locking between read() and truncate().
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/dax.c')
-rw-r--r-- | fs/dax.c | 186 |
1 files changed, 186 insertions, 0 deletions
diff --git a/fs/dax.c b/fs/dax.c new file mode 100644 index 000000000000..1a2bdbfa3ea9 --- /dev/null +++ b/fs/dax.c | |||
@@ -0,0 +1,186 @@ | |||
1 | /* | ||
2 | * fs/dax.c - Direct Access filesystem code | ||
3 | * Copyright (c) 2013-2014 Intel Corporation | ||
4 | * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> | ||
5 | * Author: Ross Zwisler <ross.zwisler@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify it | ||
8 | * under the terms and conditions of the GNU General Public License, | ||
9 | * version 2, as published by the Free Software Foundation. | ||
10 | * | ||
11 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
14 | * more details. | ||
15 | */ | ||
16 | |||
17 | #include <linux/atomic.h> | ||
18 | #include <linux/blkdev.h> | ||
19 | #include <linux/buffer_head.h> | ||
20 | #include <linux/fs.h> | ||
21 | #include <linux/genhd.h> | ||
22 | #include <linux/mutex.h> | ||
23 | #include <linux/uio.h> | ||
24 | |||
25 | static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) | ||
26 | { | ||
27 | unsigned long pfn; | ||
28 | sector_t sector = bh->b_blocknr << (blkbits - 9); | ||
29 | return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); | ||
30 | } | ||
31 | |||
32 | static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, | ||
33 | loff_t end) | ||
34 | { | ||
35 | loff_t final = end - pos + first; /* The final byte of the buffer */ | ||
36 | |||
37 | if (first > 0) | ||
38 | memset(addr, 0, first); | ||
39 | if (final < size) | ||
40 | memset(addr + final, 0, size - final); | ||
41 | } | ||
42 | |||
43 | static bool buffer_written(struct buffer_head *bh) | ||
44 | { | ||
45 | return buffer_mapped(bh) && !buffer_unwritten(bh); | ||
46 | } | ||
47 | |||
48 | /* | ||
49 | * When ext4 encounters a hole, it returns without modifying the buffer_head | ||
50 | * which means that we can't trust b_size. To cope with this, we set b_state | ||
51 | * to 0 before calling get_block and, if any bit is set, we know we can trust | ||
52 | * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is | ||
53 | * and would save us time calling get_block repeatedly. | ||
54 | */ | ||
55 | static bool buffer_size_valid(struct buffer_head *bh) | ||
56 | { | ||
57 | return bh->b_state != 0; | ||
58 | } | ||
59 | |||
60 | static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, | ||
61 | loff_t start, loff_t end, get_block_t get_block, | ||
62 | struct buffer_head *bh) | ||
63 | { | ||
64 | ssize_t retval = 0; | ||
65 | loff_t pos = start; | ||
66 | loff_t max = start; | ||
67 | loff_t bh_max = start; | ||
68 | void *addr; | ||
69 | bool hole = false; | ||
70 | |||
71 | if (rw != WRITE) | ||
72 | end = min(end, i_size_read(inode)); | ||
73 | |||
74 | while (pos < end) { | ||
75 | unsigned len; | ||
76 | if (pos == max) { | ||
77 | unsigned blkbits = inode->i_blkbits; | ||
78 | sector_t block = pos >> blkbits; | ||
79 | unsigned first = pos - (block << blkbits); | ||
80 | long size; | ||
81 | |||
82 | if (pos == bh_max) { | ||
83 | bh->b_size = PAGE_ALIGN(end - pos); | ||
84 | bh->b_state = 0; | ||
85 | retval = get_block(inode, block, bh, | ||
86 | rw == WRITE); | ||
87 | if (retval) | ||
88 | break; | ||
89 | if (!buffer_size_valid(bh)) | ||
90 | bh->b_size = 1 << blkbits; | ||
91 | bh_max = pos - first + bh->b_size; | ||
92 | } else { | ||
93 | unsigned done = bh->b_size - | ||
94 | (bh_max - (pos - first)); | ||
95 | bh->b_blocknr += done >> blkbits; | ||
96 | bh->b_size -= done; | ||
97 | } | ||
98 | |||
99 | hole = (rw != WRITE) && !buffer_written(bh); | ||
100 | if (hole) { | ||
101 | addr = NULL; | ||
102 | size = bh->b_size - first; | ||
103 | } else { | ||
104 | retval = dax_get_addr(bh, &addr, blkbits); | ||
105 | if (retval < 0) | ||
106 | break; | ||
107 | if (buffer_unwritten(bh) || buffer_new(bh)) | ||
108 | dax_new_buf(addr, retval, first, pos, | ||
109 | end); | ||
110 | addr += first; | ||
111 | size = retval - first; | ||
112 | } | ||
113 | max = min(pos + size, end); | ||
114 | } | ||
115 | |||
116 | if (rw == WRITE) | ||
117 | len = copy_from_iter(addr, max - pos, iter); | ||
118 | else if (!hole) | ||
119 | len = copy_to_iter(addr, max - pos, iter); | ||
120 | else | ||
121 | len = iov_iter_zero(max - pos, iter); | ||
122 | |||
123 | if (!len) | ||
124 | break; | ||
125 | |||
126 | pos += len; | ||
127 | addr += len; | ||
128 | } | ||
129 | |||
130 | return (pos == start) ? retval : pos - start; | ||
131 | } | ||
132 | |||
133 | /** | ||
134 | * dax_do_io - Perform I/O to a DAX file | ||
135 | * @rw: READ to read or WRITE to write | ||
136 | * @iocb: The control block for this I/O | ||
137 | * @inode: The file which the I/O is directed at | ||
138 | * @iter: The addresses to do I/O from or to | ||
139 | * @pos: The file offset where the I/O starts | ||
140 | * @get_block: The filesystem method used to translate file offsets to blocks | ||
141 | * @end_io: A filesystem callback for I/O completion | ||
142 | * @flags: See below | ||
143 | * | ||
144 | * This function uses the same locking scheme as do_blockdev_direct_IO: | ||
145 | * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the | ||
146 | * caller for writes. For reads, we take and release the i_mutex ourselves. | ||
147 | * If DIO_LOCKING is not set, the filesystem takes care of its own locking. | ||
148 | * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O | ||
149 | * is in progress. | ||
150 | */ | ||
151 | ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, | ||
152 | struct iov_iter *iter, loff_t pos, | ||
153 | get_block_t get_block, dio_iodone_t end_io, int flags) | ||
154 | { | ||
155 | struct buffer_head bh; | ||
156 | ssize_t retval = -EINVAL; | ||
157 | loff_t end = pos + iov_iter_count(iter); | ||
158 | |||
159 | memset(&bh, 0, sizeof(bh)); | ||
160 | |||
161 | if ((flags & DIO_LOCKING) && (rw == READ)) { | ||
162 | struct address_space *mapping = inode->i_mapping; | ||
163 | mutex_lock(&inode->i_mutex); | ||
164 | retval = filemap_write_and_wait_range(mapping, pos, end - 1); | ||
165 | if (retval) { | ||
166 | mutex_unlock(&inode->i_mutex); | ||
167 | goto out; | ||
168 | } | ||
169 | } | ||
170 | |||
171 | /* Protects against truncate */ | ||
172 | atomic_inc(&inode->i_dio_count); | ||
173 | |||
174 | retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); | ||
175 | |||
176 | if ((flags & DIO_LOCKING) && (rw == READ)) | ||
177 | mutex_unlock(&inode->i_mutex); | ||
178 | |||
179 | if ((retval > 0) && end_io) | ||
180 | end_io(iocb, pos, retval, bh.b_private); | ||
181 | |||
182 | inode_dio_done(inode); | ||
183 | out: | ||
184 | return retval; | ||
185 | } | ||
186 | EXPORT_SYMBOL_GPL(dax_do_io); | ||