Diffstat (limited to 'fs/dax.c')
-rw-r--r-- | fs/dax.c | 252
1 file changed, 240 insertions, 12 deletions
@@ -31,6 +31,8 @@
 #include <linux/vmstat.h>
 #include <linux/pfn_t.h>
 #include <linux/sizes.h>
+#include <linux/iomap.h>
+#include "internal.h"
 
 /*
  * We use lowest available bit in exceptional entry for locking, other two
@@ -580,14 +582,13 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
         return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct inode *inode,
-                struct buffer_head *bh, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
+                struct page *to, unsigned long vaddr)
 {
         struct blk_dax_ctl dax = {
-                .sector = to_sector(bh, inode),
-                .size = bh->b_size,
+                .sector = sector,
+                .size = size,
         };
-        struct block_device *bdev = bh->b_bdev;
         void *vto;
 
         if (dax_map_atomic(bdev, &dax) < 0)
@@ -790,14 +791,13 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-                struct buffer_head *bh, void **entryp,
-                struct vm_area_struct *vma, struct vm_fault *vmf)
+                struct block_device *bdev, sector_t sector, size_t size,
+                void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
 {
         unsigned long vaddr = (unsigned long)vmf->virtual_address;
-        struct block_device *bdev = bh->b_bdev;
         struct blk_dax_ctl dax = {
-                .sector = to_sector(bh, mapping->host),
-                .size = bh->b_size,
+                .sector = sector,
+                .size = size,
         };
         void *ret;
         void *entry = *entryp;
@@ -868,7 +868,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
         if (vmf->cow_page) {
                 struct page *new_page = vmf->cow_page;
                 if (buffer_written(&bh))
-                        error = copy_user_bh(new_page, inode, &bh, vaddr);
+                        error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
+                                        bh.b_size, new_page, vaddr);
                 else
                         clear_user_highpage(new_page, vaddr);
                 if (error)
@@ -898,7 +899,8 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
         /* Filesystem should not return unwritten buffers to us! */
         WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-        error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+        error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
+                        bh.b_size, &entry, vma, vmf);
 unlock_entry:
         put_locked_mapping_entry(mapping, vmf->pgoff, entry);
 out:
@@ -1241,3 +1243,229 @@ int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
         return dax_zero_page_range(inode, from, length, get_block);
 }
 EXPORT_SYMBOL_GPL(dax_truncate_page);
+
+#ifdef CONFIG_FS_IOMAP
+static loff_t
+iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
+                struct iomap *iomap)
+{
+        struct iov_iter *iter = data;
+        loff_t end = pos + length, done = 0;
+        ssize_t ret = 0;
+
+        if (iov_iter_rw(iter) == READ) {
+                end = min(end, i_size_read(inode));
+                if (pos >= end)
+                        return 0;
+
+                if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
+                        return iov_iter_zero(min(length, end - pos), iter);
+        }
+
+        if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+                return -EIO;
+
+        while (pos < end) {
+                unsigned offset = pos & (PAGE_SIZE - 1);
+                struct blk_dax_ctl dax = { 0 };
+                ssize_t map_len;
+
+                dax.sector = iomap->blkno +
+                        (((pos & PAGE_MASK) - iomap->offset) >> 9);
+                dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
+                map_len = dax_map_atomic(iomap->bdev, &dax);
+                if (map_len < 0) {
+                        ret = map_len;
+                        break;
+                }
+
+                dax.addr += offset;
+                map_len -= offset;
+                if (map_len > end - pos)
+                        map_len = end - pos;
+
+                if (iov_iter_rw(iter) == WRITE)
+                        map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+                else
+                        map_len = copy_to_iter(dax.addr, map_len, iter);
+                dax_unmap_atomic(iomap->bdev, &dax);
+                if (map_len <= 0) {
+                        ret = map_len ? map_len : -EFAULT;
+                        break;
+                }
+
+                pos += map_len;
+                length -= map_len;
+                done += map_len;
+        }
+
+        return done ? done : ret;
+}
+
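The sector computation in the loop above is the heart of the mapping: iomap->offset is the file offset at which the extent begins and iomap->blkno is the 512-byte sector backing that offset, so the page-aligned distance of pos into the extent, shifted right by 9, gives the sector to map. A worked example, with assumed values that are not from this patch:

    /*
     * Illustrative values only:
     *   iomap->offset = 0x200000   (extent starts 2 MiB into the file)
     *   iomap->blkno  = 8192       (sector backing file offset 0x200000)
     *   pos           = 0x201234   (current I/O position)
     *
     *   pos & PAGE_MASK     = 0x201000
     *   0x201000 - 0x200000 = 0x1000 bytes into the extent
     *   0x1000 >> 9         = 8 sectors
     *   dax.sector          = 8192 + 8 = 8200
     */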
+/**
+ * iomap_dax_rw - Perform I/O to a DAX file
+ * @iocb: The control block for this I/O
+ * @iter: The addresses to do I/O from or to
+ * @ops: iomap ops passed from the file system
+ *
+ * This function performs read and write operations to directly mapped
+ * persistent memory. The caller needs to take care of read/write exclusion
+ * and evicting any page cache pages in the region under I/O.
+ */
+ssize_t
+iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter,
+                struct iomap_ops *ops)
+{
+        struct address_space *mapping = iocb->ki_filp->f_mapping;
+        struct inode *inode = mapping->host;
+        loff_t pos = iocb->ki_pos, ret = 0, done = 0;
+        unsigned flags = 0;
+
+        if (iov_iter_rw(iter) == WRITE)
+                flags |= IOMAP_WRITE;
+
+        /*
+         * Yes, even DAX files can have page cache attached to them:  A zeroed
+         * page is inserted into the pagecache when we have to serve a write
+         * fault on a hole. It should never be dirtied and can simply be
+         * dropped from the pagecache once we get real data for the page.
+         *
+         * XXX: This is racy against mmap, and there's nothing we can do about
+         * it. We'll eventually need to shift this down even further so that
+         * we can check if we allocated blocks over a hole first.
+         */
+        if (mapping->nrpages) {
+                ret = invalidate_inode_pages2_range(mapping,
+                                pos >> PAGE_SHIFT,
+                                (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
+                WARN_ON_ONCE(ret);
+        }
+
+        while (iov_iter_count(iter)) {
+                ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
+                                iter, iomap_dax_actor);
+                if (ret <= 0)
+                        break;
+                pos += ret;
+                done += ret;
+        }
+
+        iocb->ki_pos += done;
+        return done ? done : ret;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_rw);
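The kernel-doc above leaves read/write exclusion and page cache eviction to the caller. A minimal sketch of how a filesystem's ->read_iter method might satisfy that contract; foo_dax_read_iter, foo_iomap_ops, and the shared-inode-lock scheme are assumptions for illustration, not part of this patch:

    /* assumes <linux/fs.h>, <linux/uio.h>, and the fs's own iomap ops */
    static ssize_t foo_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
    {
            struct inode *inode = file_inode(iocb->ki_filp);
            ssize_t ret;

            if (!iov_iter_count(to))
                    return 0;       /* skip atime update for zero-length reads */

            /* the shared inode lock provides read vs. write exclusion */
            inode_lock_shared(inode);
            ret = iomap_dax_rw(iocb, to, &foo_iomap_ops);
            inode_unlock_shared(inode);

            file_accessed(iocb->ki_filp);
            return ret;
    }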
+
+/**
+ * iomap_dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @ops: iomap ops passed from the file system
+ *
+ * When a page fault occurs, filesystems may call this helper in their fault
+ * or mkwrite handler for DAX files. Assumes the caller has done all the
+ * necessary locking for the page fault to proceed successfully.
+ */
+int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                struct iomap_ops *ops)
+{
+        struct address_space *mapping = vma->vm_file->f_mapping;
+        struct inode *inode = mapping->host;
+        unsigned long vaddr = (unsigned long)vmf->virtual_address;
+        loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
+        sector_t sector;
+        struct iomap iomap = { 0 };
+        unsigned flags = 0;
+        int error, major = 0;
+        void *entry;
+
+        /*
+         * Check whether offset isn't beyond end of file now. Caller is supposed
+         * to hold locks serializing us with truncate / punch hole so this is
+         * a reliable test.
+         */
+        if (pos >= i_size_read(inode))
+                return VM_FAULT_SIGBUS;
+
+        entry = grab_mapping_entry(mapping, vmf->pgoff);
+        if (IS_ERR(entry)) {
+                error = PTR_ERR(entry);
+                goto out;
+        }
+
+        if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
+                flags |= IOMAP_WRITE;
+
+        /*
+         * Note that we don't bother to use iomap_apply here: DAX requires
+         * the file system block size to be equal to the page size, which means
+         * that we never have to deal with more than a single extent here.
+         */
+        error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
+        if (error)
+                goto unlock_entry;
+        if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
+                error = -EIO;           /* fs corruption? */
+                goto unlock_entry;
+        }
+
+        sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9);
+
+        if (vmf->cow_page) {
+                switch (iomap.type) {
+                case IOMAP_HOLE:
+                case IOMAP_UNWRITTEN:
+                        clear_user_highpage(vmf->cow_page, vaddr);
+                        break;
+                case IOMAP_MAPPED:
+                        error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
+                                        vmf->cow_page, vaddr);
+                        break;
+                default:
+                        WARN_ON_ONCE(1);
+                        error = -EIO;
+                        break;
+                }
+
+                if (error)
+                        goto unlock_entry;
+                if (!radix_tree_exceptional_entry(entry)) {
+                        vmf->page = entry;
+                        return VM_FAULT_LOCKED;
+                }
+                vmf->entry = entry;
+                return VM_FAULT_DAX_LOCKED;
+        }
+
+        switch (iomap.type) {
+        case IOMAP_MAPPED:
+                if (iomap.flags & IOMAP_F_NEW) {
+                        count_vm_event(PGMAJFAULT);
+                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+                        major = VM_FAULT_MAJOR;
+                }
+                error = dax_insert_mapping(mapping, iomap.bdev, sector,
+                                PAGE_SIZE, &entry, vma, vmf);
+                break;
+        case IOMAP_UNWRITTEN:
+        case IOMAP_HOLE:
+                if (!(vmf->flags & FAULT_FLAG_WRITE))
+                        return dax_load_hole(mapping, entry, vmf);
+                /*FALLTHRU*/
+        default:
+                WARN_ON_ONCE(1);
+                error = -EIO;
+                break;
+        }
+
+unlock_entry:
+        put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+out:
+        if (error == -ENOMEM)
+                return VM_FAULT_OOM | major;
+        /* -EBUSY is fine, somebody else faulted on the same PTE */
+        if (error < 0 && error != -EBUSY)
+                return VM_FAULT_SIGBUS | major;
+        return VM_FAULT_NOPAGE | major;
+}
+EXPORT_SYMBOL_GPL(iomap_dax_fault);
+#endif /* CONFIG_FS_IOMAP */
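And a minimal sketch of a ->fault handler built on iomap_dax_fault; foo_dax_fault and foo_iomap_ops are illustrative names, and the sb_start_pagefault()/sb_end_pagefault() bracketing is modeled on existing DAX fault handlers rather than mandated by this patch. The filesystem must still take whatever lock serializes faults against truncate, as the kernel-doc requires:

    /* assumes <linux/fs.h>, <linux/mm.h>, and the fs's own iomap ops */
    static int foo_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            struct inode *inode = file_inode(vma->vm_file);
            int ret;

            if (vmf->flags & FAULT_FLAG_WRITE) {
                    sb_start_pagefault(inode->i_sb);
                    file_update_time(vma->vm_file);
            }
            /* take the fs lock that serializes against truncate here */

            ret = iomap_dax_fault(vma, vmf, &foo_iomap_ops);

            if (vmf->flags & FAULT_FLAG_WRITE)
                    sb_end_pagefault(inode->i_sb);
            return ret;
    }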