Diffstat (limited to 'fs/dax.c')
 fs/dax.c | 252
 1 file changed, 240 insertions(+), 12 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 993dc6fe0416..cc025f82ef07 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,8 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-		struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+		struct page *to, unsigned long vaddr)
 {
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
-	struct block_device *bdev = bh->b_bdev;
 	void *vto;
 
 	if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-		struct buffer_head *bh, void **entryp,
-		struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, sector_t sector, size_t size,
+		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct block_device *bdev = bh->b_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, mapping->host),
-		.size = bh->b_size,
+		.sector = sector,
+		.size = size,
 	};
 	void *ret;
 	void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
 		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, inode, &bh, vaddr);
+			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+					bh.b_size, new_page, vaddr);
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	/* Filesystem should not return unwritten buffers to us! */
 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+			bh.b_size, &entry, vma, vmf);
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
 	return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+		struct iomap *iomap)
+{
+	struct iov_iter *iter = data;
+	loff_t end = pos + length, done = 0;
+	ssize_t ret = 0;
+
+	if (iov_iter_rw(iter) == READ) {
+		end = min(end, i_size_read(inode));
+		if (pos >= end)
+			return 0;
+
+		if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+			return iov_iter_zero(min(length, end - pos), iter);
+	}
+
+	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+		return -EIO;
+
+	while (pos < end) {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		struct blk_dax_ctl dax = { 0 };
+		ssize_t map_len;
+
+		dax.sector = iomap->blkno +
+			(((pos & PAGE_MASK) - iomap->offset) >> 9);
+		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+		map_len = dax_map_atomic(iomap->bdev, &dax);
+		if (map_len < 0) {
+			ret = map_len;
+			break;
+		}
+
+		dax.addr += offset;
+		map_len -= offset;
+		if (map_len > end - pos)
+			map_len = end - pos;
+
+		if (iov_iter_rw(iter) == WRITE)
+			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+		else
+			map_len = copy_to_iter(dax.addr, map_len, iter);
+		dax_unmap_atomic(iomap->bdev, &dax);
+		if (map_len <= 0) {
+			ret = map_len ? map_len : -EFAULT;
+			break;
+		}
+
+		pos += map_len;
+		length -= map_len;
+		done += map_len;
+	}
+
+	return done ? done : ret;
+}
+
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb:	The control block for this I/O
+ * @iter:	The addresses to do I/O from or to
+ * @ops:	iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory.  The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+		struct iomap_ops *ops)
+{
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+	struct inode *inode = mapping->host;
+	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+	unsigned flags = 0;
+
+	if (iov_iter_rw(iter) == WRITE)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Yes, even DAX files can have page cache attached to them:  A zeroed
+	 * page is inserted into the pagecache when we have to serve a write
+	 * fault on a hole.  It should never be dirtied and can simply be
+	 * dropped from the pagecache once we get real data for the page.
+	 *
+	 * XXX: This is racy against mmap, and there's nothing we can do about
+	 * it. We'll eventually need to shift this down even further so that
+	 * we can check if we allocated blocks over a hole first.
+	 */
+	if (mapping->nrpages) {
+		ret = invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_SHIFT,
+				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(ret);
+	}
+
+	while (iov_iter_count(iter)) {
+		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+				iter, iomap_dax_actor);
+		if (ret <= 0)
+			break;
+		pos += ret;
+		done += ret;
+	}
+
+	iocb->ki_pos += done;
+	return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma:	The virtual memory area where the fault occurred
+ * @vmf:	The description of the fault
+ * @ops:	iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			struct iomap_ops *ops)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long vaddr = (unsigned long)vmf->virtual_address;
+	loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+	sector_t sector;
+	struct iomap iomap = { 0 };
+	unsigned flags = 0;
+	int error, major = 0;
+	void *entry;
+
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is supposed
+	 * to hold locks serializing us with truncate / punch hole so this is
+	 * a reliable test.
+	 */
+	if (pos >= i_size_read(inode))
+		return VM_FAULT_SIGBUS;
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
+	}
+
+	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+		flags |= IOMAP_WRITE;
+
+	/*
+	 * Note that we don't bother to use iomap_apply here: DAX requires
+	 * the file system block size to be equal to the page size, which means
+	 * that we never have to deal with more than a single extent here.
+	 */
+	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+	if (error)
+		goto unlock_entry;
+	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+		error = -EIO;		/* fs corruption? */
+		goto unlock_entry;
+	}
+
+	sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+	if (vmf->cow_page) {
+		switch (iomap.type) {
+		case IOMAP_HOLE:
+		case IOMAP_UNWRITTEN:
+			clear_user_highpage(vmf->cow_page, vaddr);
+			break;
+		case IOMAP_MAPPED:
+			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+					vmf->cow_page, vaddr);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			error = -EIO;
+			break;
+		}
+
+		if (error)
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+			return VM_FAULT_LOCKED;
+		}
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
+	}
+
+	switch (iomap.type) {
+	case IOMAP_MAPPED:
+		if (iomap.flags & IOMAP_F_NEW) {
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+		}
+		error = dax_insert_mapping(mapping, iomap.bdev, sector,
+				PAGE_SIZE, &entry, vma, vmf);
+		break;
+	case IOMAP_UNWRITTEN:
+	case IOMAP_HOLE:
+		if (!(vmf->flags & FAULT_FLAG_WRITE))
+			return dax_load_hole(mapping, entry, vmf);
+		/*FALLTHRU*/
+	default:
+		WARN_ON_ONCE(1);
+		error = -EIO;
+		break;
+	}
+
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+ out:
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM | major;
+	/* -EBUSY is fine, somebody else faulted on the same PTE */
+	if (error < 0 && error != -EBUSY)
+		return VM_FAULT_SIGBUS | major;
+	return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
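
For reference, a filesystem wired up to this code path calls iomap_dax_rw() from its read_iter/write_iter methods and iomap_dax_fault() from its vm_operations_struct fault handler, supplying its own struct iomap_ops. The sketch below is illustrative only and is not part of the patch: the "example_*" names and the locking shown are assumptions about how a caller might satisfy the exclusion requirements documented in the comments above; only the iomap_dax_rw() and iomap_dax_fault() signatures come from the patch itself.

/*
 * Illustrative caller sketch -- not part of the patch.  The "example_*"
 * names are hypothetical; a real conversion also needs write_iter wiring,
 * an iomap_begin()/iomap_end() implementation that maps a single extent,
 * and filesystem-specific locking.
 */
#include <linux/fs.h>
#include <linux/dax.h>
#include <linux/iomap.h>
#include <linux/uio.h>
#include <linux/mm.h>

extern struct iomap_ops example_iomap_ops;	/* filesystem-provided mapping ops */

static ssize_t example_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/* iomap_dax_rw() leaves read/write exclusion to the caller. */
	inode_lock_shared(inode);
	ret = iomap_dax_rw(iocb, to, &example_iomap_ops);
	inode_unlock_shared(inode);
	return ret;
}

static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/*
	 * The caller is responsible for locking that serializes the fault
	 * against truncate and hole punching (see iomap_dax_fault() above).
	 */
	return iomap_dax_fault(vma, vmf, &example_iomap_ops);
}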