summaryrefslogtreecommitdiffstats
path: root/fs/dax.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dax.c')
-rw-r--r--fs/dax.c315
1 files changed, 0 insertions, 315 deletions
diff --git a/fs/dax.c b/fs/dax.c
index 28af41b9da3a..ad131cd2605d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
116 return page; 116 return page;
117} 117}
118 118
119static bool buffer_written(struct buffer_head *bh)
120{
121 return buffer_mapped(bh) && !buffer_unwritten(bh);
122}
123
124static sector_t to_sector(const struct buffer_head *bh,
125 const struct inode *inode)
126{
127 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
128
129 return sector;
130}
131
132static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
133 loff_t start, loff_t end, get_block_t get_block,
134 struct buffer_head *bh)
135{
136 loff_t pos = start, max = start, bh_max = start;
137 bool hole = false;
138 struct block_device *bdev = NULL;
139 int rw = iov_iter_rw(iter), rc;
140 long map_len = 0;
141 struct blk_dax_ctl dax = {
142 .addr = ERR_PTR(-EIO),
143 };
144 unsigned blkbits = inode->i_blkbits;
145 sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
146 >> blkbits;
147
148 if (rw == READ)
149 end = min(end, i_size_read(inode));
150
151 while (pos < end) {
152 size_t len;
153 if (pos == max) {
154 long page = pos >> PAGE_SHIFT;
155 sector_t block = page << (PAGE_SHIFT - blkbits);
156 unsigned first = pos - (block << blkbits);
157 long size;
158
159 if (pos == bh_max) {
160 bh->b_size = PAGE_ALIGN(end - pos);
161 bh->b_state = 0;
162 rc = get_block(inode, block, bh, rw == WRITE);
163 if (rc)
164 break;
165 bh_max = pos - first + bh->b_size;
166 bdev = bh->b_bdev;
167 /*
168 * We allow uninitialized buffers for writes
169 * beyond EOF as those cannot race with faults
170 */
171 WARN_ON_ONCE(
172 (buffer_new(bh) && block < file_blks) ||
173 (rw == WRITE && buffer_unwritten(bh)));
174 } else {
175 unsigned done = bh->b_size -
176 (bh_max - (pos - first));
177 bh->b_blocknr += done >> blkbits;
178 bh->b_size -= done;
179 }
180
181 hole = rw == READ && !buffer_written(bh);
182 if (hole) {
183 size = bh->b_size - first;
184 } else {
185 dax_unmap_atomic(bdev, &dax);
186 dax.sector = to_sector(bh, inode);
187 dax.size = bh->b_size;
188 map_len = dax_map_atomic(bdev, &dax);
189 if (map_len < 0) {
190 rc = map_len;
191 break;
192 }
193 dax.addr += first;
194 size = map_len - first;
195 }
196 /*
197 * pos + size is one past the last offset for IO,
198 * so pos + size can overflow loff_t at extreme offsets.
199 * Cast to u64 to catch this and get the true minimum.
200 */
201 max = min_t(u64, pos + size, end);
202 }
203
204 if (iov_iter_rw(iter) == WRITE) {
205 len = copy_from_iter_pmem(dax.addr, max - pos, iter);
206 } else if (!hole)
207 len = copy_to_iter((void __force *) dax.addr, max - pos,
208 iter);
209 else
210 len = iov_iter_zero(max - pos, iter);
211
212 if (!len) {
213 rc = -EFAULT;
214 break;
215 }
216
217 pos += len;
218 if (!IS_ERR(dax.addr))
219 dax.addr += len;
220 }
221
222 dax_unmap_atomic(bdev, &dax);
223
224 return (pos == start) ? rc : pos - start;
225}
226
227/**
228 * dax_do_io - Perform I/O to a DAX file
229 * @iocb: The control block for this I/O
230 * @inode: The file which the I/O is directed at
231 * @iter: The addresses to do I/O from or to
232 * @get_block: The filesystem method used to translate file offsets to blocks
233 * @end_io: A filesystem callback for I/O completion
234 * @flags: See below
235 *
236 * This function uses the same locking scheme as do_blockdev_direct_IO:
237 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
238 * caller for writes. For reads, we take and release the i_mutex ourselves.
239 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
240 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
241 * is in progress.
242 */
243ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
244 struct iov_iter *iter, get_block_t get_block,
245 dio_iodone_t end_io, int flags)
246{
247 struct buffer_head bh;
248 ssize_t retval = -EINVAL;
249 loff_t pos = iocb->ki_pos;
250 loff_t end = pos + iov_iter_count(iter);
251
252 memset(&bh, 0, sizeof(bh));
253 bh.b_bdev = inode->i_sb->s_bdev;
254
255 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
256 inode_lock(inode);
257
258 /* Protects against truncate */
259 if (!(flags & DIO_SKIP_DIO_COUNT))
260 inode_dio_begin(inode);
261
262 retval = dax_io(inode, iter, pos, end, get_block, &bh);
263
264 if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
265 inode_unlock(inode);
266
267 if (end_io) {
268 int err;
269
270 err = end_io(iocb, pos, retval, bh.b_private);
271 if (err)
272 retval = err;
273 }
274
275 if (!(flags & DIO_SKIP_DIO_COUNT))
276 inode_dio_end(inode);
277 return retval;
278}
279EXPORT_SYMBOL_GPL(dax_do_io);
280
281/* 119/*
282 * DAX radix tree locking 120 * DAX radix tree locking
283 */ 121 */
@@ -920,105 +758,6 @@ static int dax_insert_mapping(struct address_space *mapping,
920} 758}
921 759
922/** 760/**
923 * dax_fault - handle a page fault on a DAX file
924 * @vma: The virtual memory area where the fault occurred
925 * @vmf: The description of the fault
926 * @get_block: The filesystem method used to translate file offsets to blocks
927 *
928 * When a page fault occurs, filesystems may call this helper in their
929 * fault handler for DAX files. dax_fault() assumes the caller has done all
930 * the necessary locking for the page fault to proceed successfully.
931 */
932int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
933 get_block_t get_block)
934{
935 struct file *file = vma->vm_file;
936 struct address_space *mapping = file->f_mapping;
937 struct inode *inode = mapping->host;
938 void *entry;
939 struct buffer_head bh;
940 unsigned long vaddr = (unsigned long)vmf->virtual_address;
941 unsigned blkbits = inode->i_blkbits;
942 sector_t block;
943 pgoff_t size;
944 int error;
945 int major = 0;
946
947 /*
948 * Check whether offset isn't beyond end of file now. Caller is supposed
949 * to hold locks serializing us with truncate / punch hole so this is
950 * a reliable test.
951 */
952 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
953 if (vmf->pgoff >= size)
954 return VM_FAULT_SIGBUS;
955
956 memset(&bh, 0, sizeof(bh));
957 block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
958 bh.b_bdev = inode->i_sb->s_bdev;
959 bh.b_size = PAGE_SIZE;
960
961 entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
962 if (IS_ERR(entry)) {
963 error = PTR_ERR(entry);
964 goto out;
965 }
966
967 error = get_block(inode, block, &bh, 0);
968 if (!error && (bh.b_size < PAGE_SIZE))
969 error = -EIO; /* fs corruption? */
970 if (error)
971 goto unlock_entry;
972
973 if (vmf->cow_page) {
974 struct page *new_page = vmf->cow_page;
975 if (buffer_written(&bh))
976 error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
977 bh.b_size, new_page, vaddr);
978 else
979 clear_user_highpage(new_page, vaddr);
980 if (error)
981 goto unlock_entry;
982 if (!radix_tree_exceptional_entry(entry)) {
983 vmf->page = entry;
984 return VM_FAULT_LOCKED;
985 }
986 vmf->entry = entry;
987 return VM_FAULT_DAX_LOCKED;
988 }
989
990 if (!buffer_mapped(&bh)) {
991 if (vmf->flags & FAULT_FLAG_WRITE) {
992 error = get_block(inode, block, &bh, 1);
993 count_vm_event(PGMAJFAULT);
994 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
995 major = VM_FAULT_MAJOR;
996 if (!error && (bh.b_size < PAGE_SIZE))
997 error = -EIO;
998 if (error)
999 goto unlock_entry;
1000 } else {
1001 return dax_load_hole(mapping, entry, vmf);
1002 }
1003 }
1004
1005 /* Filesystem should not return unwritten buffers to us! */
1006 WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
1007 error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
1008 bh.b_size, &entry, vma, vmf);
1009 unlock_entry:
1010 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1011 out:
1012 if (error == -ENOMEM)
1013 return VM_FAULT_OOM | major;
1014 /* -EBUSY is fine, somebody else faulted on the same PTE */
1015 if ((error < 0) && (error != -EBUSY))
1016 return VM_FAULT_SIGBUS | major;
1017 return VM_FAULT_NOPAGE | major;
1018}
1019EXPORT_SYMBOL_GPL(dax_fault);
1020
1021/**
1022 * dax_pfn_mkwrite - handle first write to DAX page 761 * dax_pfn_mkwrite - handle first write to DAX page
1023 * @vma: The virtual memory area where the fault occurred 762 * @vma: The virtual memory area where the fault occurred
1024 * @vmf: The description of the fault 763 * @vmf: The description of the fault
@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
1078} 817}
1079EXPORT_SYMBOL_GPL(__dax_zero_page_range); 818EXPORT_SYMBOL_GPL(__dax_zero_page_range);
1080 819
1081/**
1082 * dax_zero_page_range - zero a range within a page of a DAX file
1083 * @inode: The file being truncated
1084 * @from: The file offset that is being truncated to
1085 * @length: The number of bytes to zero
1086 * @get_block: The filesystem method used to translate file offsets to blocks
1087 *
1088 * This function can be called by a filesystem when it is zeroing part of a
1089 * page in a DAX file. This is intended for hole-punch operations. If
1090 * you are truncating a file, the helper function dax_truncate_page() may be
1091 * more convenient.
1092 */
1093int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
1094 get_block_t get_block)
1095{
1096 struct buffer_head bh;
1097 pgoff_t index = from >> PAGE_SHIFT;
1098 unsigned offset = from & (PAGE_SIZE-1);
1099 int err;
1100
1101 /* Block boundary? Nothing to do */
1102 if (!length)
1103 return 0;
1104 if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
1105 return -EINVAL;
1106
1107 memset(&bh, 0, sizeof(bh));
1108 bh.b_bdev = inode->i_sb->s_bdev;
1109 bh.b_size = PAGE_SIZE;
1110 err = get_block(inode, index, &bh, 0);
1111 if (err < 0 || !buffer_written(&bh))
1112 return err;
1113
1114 return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
1115 offset, length);
1116}
1117EXPORT_SYMBOL_GPL(dax_zero_page_range);
1118
1119/**
1120 * dax_truncate_page - handle a partial page being truncated in a DAX file
1121 * @inode: The file being truncated
1122 * @from: The file offset that is being truncated to
1123 * @get_block: The filesystem method used to translate file offsets to blocks
1124 *
1125 * Similar to block_truncate_page(), this function can be called by a
1126 * filesystem when it is truncating a DAX file to handle the partial page.
1127 */
1128int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
1129{
1130 unsigned length = PAGE_ALIGN(from) - from;
1131 return dax_zero_page_range(inode, from, length, get_block);
1132}
1133EXPORT_SYMBOL_GPL(dax_truncate_page);
1134
1135#ifdef CONFIG_FS_IOMAP 820#ifdef CONFIG_FS_IOMAP
1136static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) 821static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
1137{ 822{