diff options
author | Jan Kara <jack@suse.cz> | 2016-11-20 20:48:36 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2016-11-20 20:48:36 -0500 |
commit | dd936e4313fa3f60abd6e67abb3cb66fc9a018d1 (patch) | |
tree | fd9b89b7922a9c28bdb614ae2d49853d94772093 | |
parent | 00697eed386d57c5267aad5b8343f027b10da0c0 (diff) |
dax: rip out get_block based IO support
No one uses functions using the get_block callback anymore. Rip them
out and update documentation.
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r-- | Documentation/filesystems/dax.txt | 22 | ||||
-rw-r--r-- | fs/dax.c | 315 | ||||
-rw-r--r-- | include/linux/dax.h | 12 |
3 files changed, 11 insertions, 338 deletions
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 23d18b8a49d5..a7e6e14aeb08 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers | |||
58 | Filesystem support consists of | 58 | Filesystem support consists of |
59 | - adding support to mark inodes as being DAX by setting the S_DAX flag in | 59 | - adding support to mark inodes as being DAX by setting the S_DAX flag in |
60 | i_flags | 60 | i_flags |
61 | - implementing the direct_IO address space operation, and calling | 61 | - implementing ->read_iter and ->write_iter operations which use dax_iomap_rw() |
62 | dax_do_io() instead of blockdev_direct_IO() if S_DAX is set | 62 | when inode has S_DAX flag set |
63 | - implementing an mmap file operation for DAX files which sets the | 63 | - implementing an mmap file operation for DAX files which sets the |
64 | VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to | 64 | VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to |
65 | include handlers for fault, pmd_fault and page_mkwrite (which should | 65 | include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These |
66 | probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the | 66 | handlers should probably call dax_iomap_fault() (for fault and page_mkwrite |
67 | appropriate get_block() callback) | 67 | handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate |
68 | - calling dax_truncate_page() instead of block_truncate_page() for DAX files | 68 | iomap operations. |
69 | - calling dax_zero_page_range() instead of zero_user() for DAX files | 69 | - calling iomap_zero_range() passing appropriate iomap operations instead of |
70 | block_truncate_page() for DAX files | ||
70 | - ensuring that there is sufficient locking between reads, writes, | 71 | - ensuring that there is sufficient locking between reads, writes, |
71 | truncates and page faults | 72 | truncates and page faults |
72 | 73 | ||
73 | The get_block() callback passed to the DAX functions may return | 74 | The iomap handlers for allocating blocks must make sure that allocated blocks |
74 | uninitialised extents. If it does, it must ensure that simultaneous | 75 | are zeroed out and converted to written extents before being returned to avoid |
75 | calls to get_block() (for example by a page-fault racing with a read() | 76 | exposure of uninitialized data through mmap. |
76 | or a write()) work correctly. | ||
77 | 77 | ||
78 | These filesystems may be used for inspiration: | 78 | These filesystems may be used for inspiration: |
79 | - ext2: see Documentation/filesystems/ext2.txt | 79 | - ext2: see Documentation/filesystems/ext2.txt |
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n) | |||
116 | return page; | 116 | return page; |
117 | } | 117 | } |
118 | 118 | ||
119 | static bool buffer_written(struct buffer_head *bh) | ||
120 | { | ||
121 | return buffer_mapped(bh) && !buffer_unwritten(bh); | ||
122 | } | ||
123 | |||
124 | static sector_t to_sector(const struct buffer_head *bh, | ||
125 | const struct inode *inode) | ||
126 | { | ||
127 | sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); | ||
128 | |||
129 | return sector; | ||
130 | } | ||
131 | |||
132 | static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, | ||
133 | loff_t start, loff_t end, get_block_t get_block, | ||
134 | struct buffer_head *bh) | ||
135 | { | ||
136 | loff_t pos = start, max = start, bh_max = start; | ||
137 | bool hole = false; | ||
138 | struct block_device *bdev = NULL; | ||
139 | int rw = iov_iter_rw(iter), rc; | ||
140 | long map_len = 0; | ||
141 | struct blk_dax_ctl dax = { | ||
142 | .addr = ERR_PTR(-EIO), | ||
143 | }; | ||
144 | unsigned blkbits = inode->i_blkbits; | ||
145 | sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) | ||
146 | >> blkbits; | ||
147 | |||
148 | if (rw == READ) | ||
149 | end = min(end, i_size_read(inode)); | ||
150 | |||
151 | while (pos < end) { | ||
152 | size_t len; | ||
153 | if (pos == max) { | ||
154 | long page = pos >> PAGE_SHIFT; | ||
155 | sector_t block = page << (PAGE_SHIFT - blkbits); | ||
156 | unsigned first = pos - (block << blkbits); | ||
157 | long size; | ||
158 | |||
159 | if (pos == bh_max) { | ||
160 | bh->b_size = PAGE_ALIGN(end - pos); | ||
161 | bh->b_state = 0; | ||
162 | rc = get_block(inode, block, bh, rw == WRITE); | ||
163 | if (rc) | ||
164 | break; | ||
165 | bh_max = pos - first + bh->b_size; | ||
166 | bdev = bh->b_bdev; | ||
167 | /* | ||
168 | * We allow uninitialized buffers for writes | ||
169 | * beyond EOF as those cannot race with faults | ||
170 | */ | ||
171 | WARN_ON_ONCE( | ||
172 | (buffer_new(bh) && block < file_blks) || | ||
173 | (rw == WRITE && buffer_unwritten(bh))); | ||
174 | } else { | ||
175 | unsigned done = bh->b_size - | ||
176 | (bh_max - (pos - first)); | ||
177 | bh->b_blocknr += done >> blkbits; | ||
178 | bh->b_size -= done; | ||
179 | } | ||
180 | |||
181 | hole = rw == READ && !buffer_written(bh); | ||
182 | if (hole) { | ||
183 | size = bh->b_size - first; | ||
184 | } else { | ||
185 | dax_unmap_atomic(bdev, &dax); | ||
186 | dax.sector = to_sector(bh, inode); | ||
187 | dax.size = bh->b_size; | ||
188 | map_len = dax_map_atomic(bdev, &dax); | ||
189 | if (map_len < 0) { | ||
190 | rc = map_len; | ||
191 | break; | ||
192 | } | ||
193 | dax.addr += first; | ||
194 | size = map_len - first; | ||
195 | } | ||
196 | /* | ||
197 | * pos + size is one past the last offset for IO, | ||
198 | * so pos + size can overflow loff_t at extreme offsets. | ||
199 | * Cast to u64 to catch this and get the true minimum. | ||
200 | */ | ||
201 | max = min_t(u64, pos + size, end); | ||
202 | } | ||
203 | |||
204 | if (iov_iter_rw(iter) == WRITE) { | ||
205 | len = copy_from_iter_pmem(dax.addr, max - pos, iter); | ||
206 | } else if (!hole) | ||
207 | len = copy_to_iter((void __force *) dax.addr, max - pos, | ||
208 | iter); | ||
209 | else | ||
210 | len = iov_iter_zero(max - pos, iter); | ||
211 | |||
212 | if (!len) { | ||
213 | rc = -EFAULT; | ||
214 | break; | ||
215 | } | ||
216 | |||
217 | pos += len; | ||
218 | if (!IS_ERR(dax.addr)) | ||
219 | dax.addr += len; | ||
220 | } | ||
221 | |||
222 | dax_unmap_atomic(bdev, &dax); | ||
223 | |||
224 | return (pos == start) ? rc : pos - start; | ||
225 | } | ||
226 | |||
227 | /** | ||
228 | * dax_do_io - Perform I/O to a DAX file | ||
229 | * @iocb: The control block for this I/O | ||
230 | * @inode: The file which the I/O is directed at | ||
231 | * @iter: The addresses to do I/O from or to | ||
232 | * @get_block: The filesystem method used to translate file offsets to blocks | ||
233 | * @end_io: A filesystem callback for I/O completion | ||
234 | * @flags: See below | ||
235 | * | ||
236 | * This function uses the same locking scheme as do_blockdev_direct_IO: | ||
237 | * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the | ||
238 | * caller for writes. For reads, we take and release the i_mutex ourselves. | ||
239 | * If DIO_LOCKING is not set, the filesystem takes care of its own locking. | ||
240 | * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O | ||
241 | * is in progress. | ||
242 | */ | ||
243 | ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, | ||
244 | struct iov_iter *iter, get_block_t get_block, | ||
245 | dio_iodone_t end_io, int flags) | ||
246 | { | ||
247 | struct buffer_head bh; | ||
248 | ssize_t retval = -EINVAL; | ||
249 | loff_t pos = iocb->ki_pos; | ||
250 | loff_t end = pos + iov_iter_count(iter); | ||
251 | |||
252 | memset(&bh, 0, sizeof(bh)); | ||
253 | bh.b_bdev = inode->i_sb->s_bdev; | ||
254 | |||
255 | if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) | ||
256 | inode_lock(inode); | ||
257 | |||
258 | /* Protects against truncate */ | ||
259 | if (!(flags & DIO_SKIP_DIO_COUNT)) | ||
260 | inode_dio_begin(inode); | ||
261 | |||
262 | retval = dax_io(inode, iter, pos, end, get_block, &bh); | ||
263 | |||
264 | if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) | ||
265 | inode_unlock(inode); | ||
266 | |||
267 | if (end_io) { | ||
268 | int err; | ||
269 | |||
270 | err = end_io(iocb, pos, retval, bh.b_private); | ||
271 | if (err) | ||
272 | retval = err; | ||
273 | } | ||
274 | |||
275 | if (!(flags & DIO_SKIP_DIO_COUNT)) | ||
276 | inode_dio_end(inode); | ||
277 | return retval; | ||
278 | } | ||
279 | EXPORT_SYMBOL_GPL(dax_do_io); | ||
280 | |||
281 | /* | 119 | /* |
282 | * DAX radix tree locking | 120 | * DAX radix tree locking |
283 | */ | 121 | */ |
@@ -920,105 +758,6 @@ static int dax_insert_mapping(struct address_space *mapping, | |||
920 | } | 758 | } |
921 | 759 | ||
922 | /** | 760 | /** |
923 | * dax_fault - handle a page fault on a DAX file | ||
924 | * @vma: The virtual memory area where the fault occurred | ||
925 | * @vmf: The description of the fault | ||
926 | * @get_block: The filesystem method used to translate file offsets to blocks | ||
927 | * | ||
928 | * When a page fault occurs, filesystems may call this helper in their | ||
929 | * fault handler for DAX files. dax_fault() assumes the caller has done all | ||
930 | * the necessary locking for the page fault to proceed successfully. | ||
931 | */ | ||
932 | int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | ||
933 | get_block_t get_block) | ||
934 | { | ||
935 | struct file *file = vma->vm_file; | ||
936 | struct address_space *mapping = file->f_mapping; | ||
937 | struct inode *inode = mapping->host; | ||
938 | void *entry; | ||
939 | struct buffer_head bh; | ||
940 | unsigned long vaddr = (unsigned long)vmf->virtual_address; | ||
941 | unsigned blkbits = inode->i_blkbits; | ||
942 | sector_t block; | ||
943 | pgoff_t size; | ||
944 | int error; | ||
945 | int major = 0; | ||
946 | |||
947 | /* | ||
948 | * Check whether offset isn't beyond end of file now. Caller is supposed | ||
949 | * to hold locks serializing us with truncate / punch hole so this is | ||
950 | * a reliable test. | ||
951 | */ | ||
952 | size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
953 | if (vmf->pgoff >= size) | ||
954 | return VM_FAULT_SIGBUS; | ||
955 | |||
956 | memset(&bh, 0, sizeof(bh)); | ||
957 | block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); | ||
958 | bh.b_bdev = inode->i_sb->s_bdev; | ||
959 | bh.b_size = PAGE_SIZE; | ||
960 | |||
961 | entry = grab_mapping_entry(mapping, vmf->pgoff, 0); | ||
962 | if (IS_ERR(entry)) { | ||
963 | error = PTR_ERR(entry); | ||
964 | goto out; | ||
965 | } | ||
966 | |||
967 | error = get_block(inode, block, &bh, 0); | ||
968 | if (!error && (bh.b_size < PAGE_SIZE)) | ||
969 | error = -EIO; /* fs corruption? */ | ||
970 | if (error) | ||
971 | goto unlock_entry; | ||
972 | |||
973 | if (vmf->cow_page) { | ||
974 | struct page *new_page = vmf->cow_page; | ||
975 | if (buffer_written(&bh)) | ||
976 | error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), | ||
977 | bh.b_size, new_page, vaddr); | ||
978 | else | ||
979 | clear_user_highpage(new_page, vaddr); | ||
980 | if (error) | ||
981 | goto unlock_entry; | ||
982 | if (!radix_tree_exceptional_entry(entry)) { | ||
983 | vmf->page = entry; | ||
984 | return VM_FAULT_LOCKED; | ||
985 | } | ||
986 | vmf->entry = entry; | ||
987 | return VM_FAULT_DAX_LOCKED; | ||
988 | } | ||
989 | |||
990 | if (!buffer_mapped(&bh)) { | ||
991 | if (vmf->flags & FAULT_FLAG_WRITE) { | ||
992 | error = get_block(inode, block, &bh, 1); | ||
993 | count_vm_event(PGMAJFAULT); | ||
994 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | ||
995 | major = VM_FAULT_MAJOR; | ||
996 | if (!error && (bh.b_size < PAGE_SIZE)) | ||
997 | error = -EIO; | ||
998 | if (error) | ||
999 | goto unlock_entry; | ||
1000 | } else { | ||
1001 | return dax_load_hole(mapping, entry, vmf); | ||
1002 | } | ||
1003 | } | ||
1004 | |||
1005 | /* Filesystem should not return unwritten buffers to us! */ | ||
1006 | WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); | ||
1007 | error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), | ||
1008 | bh.b_size, &entry, vma, vmf); | ||
1009 | unlock_entry: | ||
1010 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); | ||
1011 | out: | ||
1012 | if (error == -ENOMEM) | ||
1013 | return VM_FAULT_OOM | major; | ||
1014 | /* -EBUSY is fine, somebody else faulted on the same PTE */ | ||
1015 | if ((error < 0) && (error != -EBUSY)) | ||
1016 | return VM_FAULT_SIGBUS | major; | ||
1017 | return VM_FAULT_NOPAGE | major; | ||
1018 | } | ||
1019 | EXPORT_SYMBOL_GPL(dax_fault); | ||
1020 | |||
1021 | /** | ||
1022 | * dax_pfn_mkwrite - handle first write to DAX page | 761 | * dax_pfn_mkwrite - handle first write to DAX page |
1023 | * @vma: The virtual memory area where the fault occurred | 762 | * @vma: The virtual memory area where the fault occurred |
1024 | * @vmf: The description of the fault | 763 | * @vmf: The description of the fault |
@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, | |||
1078 | } | 817 | } |
1079 | EXPORT_SYMBOL_GPL(__dax_zero_page_range); | 818 | EXPORT_SYMBOL_GPL(__dax_zero_page_range); |
1080 | 819 | ||
1081 | /** | ||
1082 | * dax_zero_page_range - zero a range within a page of a DAX file | ||
1083 | * @inode: The file being truncated | ||
1084 | * @from: The file offset that is being truncated to | ||
1085 | * @length: The number of bytes to zero | ||
1086 | * @get_block: The filesystem method used to translate file offsets to blocks | ||
1087 | * | ||
1088 | * This function can be called by a filesystem when it is zeroing part of a | ||
1089 | * page in a DAX file. This is intended for hole-punch operations. If | ||
1090 | * you are truncating a file, the helper function dax_truncate_page() may be | ||
1091 | * more convenient. | ||
1092 | */ | ||
1093 | int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, | ||
1094 | get_block_t get_block) | ||
1095 | { | ||
1096 | struct buffer_head bh; | ||
1097 | pgoff_t index = from >> PAGE_SHIFT; | ||
1098 | unsigned offset = from & (PAGE_SIZE-1); | ||
1099 | int err; | ||
1100 | |||
1101 | /* Block boundary? Nothing to do */ | ||
1102 | if (!length) | ||
1103 | return 0; | ||
1104 | if (WARN_ON_ONCE((offset + length) > PAGE_SIZE)) | ||
1105 | return -EINVAL; | ||
1106 | |||
1107 | memset(&bh, 0, sizeof(bh)); | ||
1108 | bh.b_bdev = inode->i_sb->s_bdev; | ||
1109 | bh.b_size = PAGE_SIZE; | ||
1110 | err = get_block(inode, index, &bh, 0); | ||
1111 | if (err < 0 || !buffer_written(&bh)) | ||
1112 | return err; | ||
1113 | |||
1114 | return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), | ||
1115 | offset, length); | ||
1116 | } | ||
1117 | EXPORT_SYMBOL_GPL(dax_zero_page_range); | ||
1118 | |||
1119 | /** | ||
1120 | * dax_truncate_page - handle a partial page being truncated in a DAX file | ||
1121 | * @inode: The file being truncated | ||
1122 | * @from: The file offset that is being truncated to | ||
1123 | * @get_block: The filesystem method used to translate file offsets to blocks | ||
1124 | * | ||
1125 | * Similar to block_truncate_page(), this function can be called by a | ||
1126 | * filesystem when it is truncating a DAX file to handle the partial page. | ||
1127 | */ | ||
1128 | int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) | ||
1129 | { | ||
1130 | unsigned length = PAGE_ALIGN(from) - from; | ||
1131 | return dax_zero_page_range(inode, from, length, get_block); | ||
1132 | } | ||
1133 | EXPORT_SYMBOL_GPL(dax_truncate_page); | ||
1134 | |||
1135 | #ifdef CONFIG_FS_IOMAP | 820 | #ifdef CONFIG_FS_IOMAP |
1136 | static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) | 821 | static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) |
1137 | { | 822 | { |
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8d1a5c47945f..0afade8bd3d7 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags) | |||
38 | 38 | ||
39 | ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, | 39 | ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, |
40 | struct iomap_ops *ops); | 40 | struct iomap_ops *ops); |
41 | ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, | ||
42 | get_block_t, dio_iodone_t, int flags); | ||
43 | int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); | ||
44 | int dax_truncate_page(struct inode *, loff_t from, get_block_t); | ||
45 | int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | 41 | int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, |
46 | struct iomap_ops *ops); | 42 | struct iomap_ops *ops); |
47 | int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); | ||
48 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); | 43 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); |
49 | void dax_wake_mapping_entry_waiter(struct address_space *mapping, | 44 | void dax_wake_mapping_entry_waiter(struct address_space *mapping, |
50 | pgoff_t index, void *entry, bool wake_all); | 45 | pgoff_t index, void *entry, bool wake_all); |
@@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev, | |||
73 | } | 68 | } |
74 | #endif | 69 | #endif |
75 | 70 | ||
76 | static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | ||
77 | pmd_t *pmd, unsigned int flags, get_block_t gb) | ||
78 | { | ||
79 | return VM_FAULT_FALLBACK; | ||
80 | } | ||
81 | |||
82 | #ifdef CONFIG_FS_DAX_PMD | 71 | #ifdef CONFIG_FS_DAX_PMD |
83 | static inline unsigned int dax_radix_order(void *entry) | 72 | static inline unsigned int dax_radix_order(void *entry) |
84 | { | 73 | { |
@@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma, | |||
101 | } | 90 | } |
102 | #endif | 91 | #endif |
103 | int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); | 92 | int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); |
104 | #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) | ||
105 | 93 | ||
106 | static inline bool vma_is_dax(struct vm_area_struct *vma) | 94 | static inline bool vma_is_dax(struct vm_area_struct *vma) |
107 | { | 95 | { |