diff options
author | Dan Williams <dan.j.williams@intel.com> | 2015-11-30 13:20:29 -0500 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2016-01-09 09:30:49 -0500 |
commit | 5a023cdba50c5f5f2bc351783b3131699deb3937 (patch) | |
tree | bae047e7bf8fb28e48563b33afc2881c226f4e31 | |
parent | 4ebb16ca9a06a54cdb2e7f2ce1e506fa4d432445 (diff) |
block: enable dax for raw block devices
If an application wants exclusive access to all of the persistent memory
provided by an NVDIMM namespace it can use this raw-block-dax facility
to forgo establishing a filesystem. This capability is targeted
primarily at hypervisors wanting to provision persistent memory for
guests. It can be enabled / disabled dynamically via the new BLKDAXSET
ioctl.
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Reviewed-by: Jan Kara <jack@suse.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r-- | block/ioctl.c | 61 | ||||
-rw-r--r-- | fs/block_dev.c | 103 | ||||
-rw-r--r-- | include/linux/fs.h | 11 | ||||
-rw-r--r-- | include/uapi/linux/fs.h | 2 |
4 files changed, 169 insertions, 8 deletions
diff --git a/block/ioctl.c b/block/ioctl.c index 0918aed2d847..7a964d842913 100644 --- a/block/ioctl.c +++ b/block/ioctl.c | |||
@@ -406,6 +406,62 @@ static inline int is_unrecognized_ioctl(int ret) | |||
406 | ret == -ENOIOCTLCMD; | 406 | ret == -ENOIOCTLCMD; |
407 | } | 407 | } |
408 | 408 | ||
409 | #ifdef CONFIG_FS_DAX | ||
410 | bool blkdev_dax_capable(struct block_device *bdev) | ||
411 | { | ||
412 | struct gendisk *disk = bdev->bd_disk; | ||
413 | |||
414 | if (!disk->fops->direct_access) | ||
415 | return false; | ||
416 | |||
417 | /* | ||
418 | * If the partition is not aligned on a page boundary, we can't | ||
419 | * do dax I/O to it. | ||
420 | */ | ||
421 | if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) | ||
422 | || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) | ||
423 | return false; | ||
424 | |||
425 | return true; | ||
426 | } | ||
427 | |||
428 | static int blkdev_daxset(struct block_device *bdev, unsigned long argp) | ||
429 | { | ||
430 | unsigned long arg; | ||
431 | int rc = 0; | ||
432 | |||
433 | if (!capable(CAP_SYS_ADMIN)) | ||
434 | return -EACCES; | ||
435 | |||
436 | if (get_user(arg, (int __user *)(argp))) | ||
437 | return -EFAULT; | ||
438 | arg = !!arg; | ||
439 | if (arg == !!(bdev->bd_inode->i_flags & S_DAX)) | ||
440 | return 0; | ||
441 | |||
442 | if (arg) | ||
443 | arg = S_DAX; | ||
444 | |||
445 | if (arg && !blkdev_dax_capable(bdev)) | ||
446 | return -ENOTTY; | ||
447 | |||
448 | mutex_lock(&bdev->bd_inode->i_mutex); | ||
449 | if (bdev->bd_map_count == 0) | ||
450 | inode_set_flags(bdev->bd_inode, arg, S_DAX); | ||
451 | else | ||
452 | rc = -EBUSY; | ||
453 | mutex_unlock(&bdev->bd_inode->i_mutex); | ||
454 | return rc; | ||
455 | } | ||
456 | #else | ||
457 | static int blkdev_daxset(struct block_device *bdev, int arg) | ||
458 | { | ||
459 | if (arg) | ||
460 | return -ENOTTY; | ||
461 | return 0; | ||
462 | } | ||
463 | #endif | ||
464 | |||
409 | static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, | 465 | static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, |
410 | unsigned cmd, unsigned long arg) | 466 | unsigned cmd, unsigned long arg) |
411 | { | 467 | { |
@@ -568,6 +624,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, | |||
568 | case BLKTRACESETUP: | 624 | case BLKTRACESETUP: |
569 | case BLKTRACETEARDOWN: | 625 | case BLKTRACETEARDOWN: |
570 | return blk_trace_ioctl(bdev, cmd, argp); | 626 | return blk_trace_ioctl(bdev, cmd, argp); |
627 | case BLKDAXSET: | ||
628 | return blkdev_daxset(bdev, arg); | ||
629 | case BLKDAXGET: | ||
630 | return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX)); | ||
631 | break; | ||
571 | case IOC_PR_REGISTER: | 632 | case IOC_PR_REGISTER: |
572 | return blkdev_pr_register(bdev, argp); | 633 | return blkdev_pr_register(bdev, argp); |
573 | case IOC_PR_RESERVE: | 634 | case IOC_PR_RESERVE: |
diff --git a/fs/block_dev.c b/fs/block_dev.c index 52248bce42d2..5c0b2cba870e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -1235,8 +1235,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) | |||
1235 | } | 1235 | } |
1236 | } | 1236 | } |
1237 | 1237 | ||
1238 | if (!ret) | 1238 | if (!ret) { |
1239 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); | 1239 | bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); |
1240 | if (!blkdev_dax_capable(bdev)) | ||
1241 | bdev->bd_inode->i_flags &= ~S_DAX; | ||
1242 | } | ||
1240 | 1243 | ||
1241 | /* | 1244 | /* |
1242 | * If the device is invalidated, rescan partition | 1245 | * If the device is invalidated, rescan partition |
@@ -1250,6 +1253,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) | |||
1250 | else if (ret == -ENOMEDIUM) | 1253 | else if (ret == -ENOMEDIUM) |
1251 | invalidate_partitions(disk, bdev); | 1254 | invalidate_partitions(disk, bdev); |
1252 | } | 1255 | } |
1256 | |||
1253 | if (ret) | 1257 | if (ret) |
1254 | goto out_clear; | 1258 | goto out_clear; |
1255 | } else { | 1259 | } else { |
@@ -1270,12 +1274,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) | |||
1270 | goto out_clear; | 1274 | goto out_clear; |
1271 | } | 1275 | } |
1272 | bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); | 1276 | bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); |
1273 | /* | 1277 | if (!blkdev_dax_capable(bdev)) |
1274 | * If the partition is not aligned on a page | ||
1275 | * boundary, we can't do dax I/O to it. | ||
1276 | */ | ||
1277 | if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) || | ||
1278 | (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) | ||
1279 | bdev->bd_inode->i_flags &= ~S_DAX; | 1278 | bdev->bd_inode->i_flags &= ~S_DAX; |
1280 | } | 1279 | } |
1281 | } else { | 1280 | } else { |
@@ -1713,13 +1712,101 @@ static const struct address_space_operations def_blk_aops = { | |||
1713 | .is_dirty_writeback = buffer_check_dirty_writeback, | 1712 | .is_dirty_writeback = buffer_check_dirty_writeback, |
1714 | }; | 1713 | }; |
1715 | 1714 | ||
1715 | #ifdef CONFIG_FS_DAX | ||
1716 | /* | ||
1717 | * In the raw block case we do not need to contend with truncation nor | ||
1718 | * unwritten file extents. Without those concerns there is no need for | ||
1719 | * additional locking beyond the mmap_sem context that these routines | ||
1720 | * are already executing under. | ||
1721 | * | ||
1722 | * Note, there is no protection if the block device is dynamically | ||
1723 | * resized (partition grow/shrink) during a fault. A stable block device | ||
1724 | * size is already not enforced in the blkdev_direct_IO path. | ||
1725 | * | ||
1726 | * For DAX, it is the responsibility of the block device driver to | ||
1727 | * ensure the whole-disk device size is stable while requests are in | ||
1728 | * flight. | ||
1729 | * | ||
1730 | * Finally, unlike the filemap_page_mkwrite() case there is no | ||
1731 | * filesystem superblock to sync against freezing. We still include a | ||
1732 | * pfn_mkwrite callback for dax drivers to receive write fault | ||
1733 | * notifications. | ||
1734 | */ | ||
1735 | static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | ||
1736 | { | ||
1737 | return __dax_fault(vma, vmf, blkdev_get_block, NULL); | ||
1738 | } | ||
1739 | |||
1740 | static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | ||
1741 | pmd_t *pmd, unsigned int flags) | ||
1742 | { | ||
1743 | return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); | ||
1744 | } | ||
1745 | |||
1746 | static void blkdev_vm_open(struct vm_area_struct *vma) | ||
1747 | { | ||
1748 | struct inode *bd_inode = bdev_file_inode(vma->vm_file); | ||
1749 | struct block_device *bdev = I_BDEV(bd_inode); | ||
1750 | |||
1751 | mutex_lock(&bd_inode->i_mutex); | ||
1752 | bdev->bd_map_count++; | ||
1753 | mutex_unlock(&bd_inode->i_mutex); | ||
1754 | } | ||
1755 | |||
1756 | static void blkdev_vm_close(struct vm_area_struct *vma) | ||
1757 | { | ||
1758 | struct inode *bd_inode = bdev_file_inode(vma->vm_file); | ||
1759 | struct block_device *bdev = I_BDEV(bd_inode); | ||
1760 | |||
1761 | mutex_lock(&bd_inode->i_mutex); | ||
1762 | bdev->bd_map_count--; | ||
1763 | mutex_unlock(&bd_inode->i_mutex); | ||
1764 | } | ||
1765 | |||
1766 | static const struct vm_operations_struct blkdev_dax_vm_ops = { | ||
1767 | .open = blkdev_vm_open, | ||
1768 | .close = blkdev_vm_close, | ||
1769 | .fault = blkdev_dax_fault, | ||
1770 | .pmd_fault = blkdev_dax_pmd_fault, | ||
1771 | .pfn_mkwrite = blkdev_dax_fault, | ||
1772 | }; | ||
1773 | |||
1774 | static const struct vm_operations_struct blkdev_default_vm_ops = { | ||
1775 | .open = blkdev_vm_open, | ||
1776 | .close = blkdev_vm_close, | ||
1777 | .fault = filemap_fault, | ||
1778 | .map_pages = filemap_map_pages, | ||
1779 | }; | ||
1780 | |||
1781 | static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) | ||
1782 | { | ||
1783 | struct inode *bd_inode = bdev_file_inode(file); | ||
1784 | struct block_device *bdev = I_BDEV(bd_inode); | ||
1785 | |||
1786 | file_accessed(file); | ||
1787 | mutex_lock(&bd_inode->i_mutex); | ||
1788 | bdev->bd_map_count++; | ||
1789 | if (IS_DAX(bd_inode)) { | ||
1790 | vma->vm_ops = &blkdev_dax_vm_ops; | ||
1791 | vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; | ||
1792 | } else { | ||
1793 | vma->vm_ops = &blkdev_default_vm_ops; | ||
1794 | } | ||
1795 | mutex_unlock(&bd_inode->i_mutex); | ||
1796 | |||
1797 | return 0; | ||
1798 | } | ||
1799 | #else | ||
1800 | #define blkdev_mmap generic_file_mmap | ||
1801 | #endif | ||
1802 | |||
1716 | const struct file_operations def_blk_fops = { | 1803 | const struct file_operations def_blk_fops = { |
1717 | .open = blkdev_open, | 1804 | .open = blkdev_open, |
1718 | .release = blkdev_close, | 1805 | .release = blkdev_close, |
1719 | .llseek = block_llseek, | 1806 | .llseek = block_llseek, |
1720 | .read_iter = blkdev_read_iter, | 1807 | .read_iter = blkdev_read_iter, |
1721 | .write_iter = blkdev_write_iter, | 1808 | .write_iter = blkdev_write_iter, |
1722 | .mmap = generic_file_mmap, | 1809 | .mmap = blkdev_mmap, |
1723 | .fsync = blkdev_fsync, | 1810 | .fsync = blkdev_fsync, |
1724 | .unlocked_ioctl = block_ioctl, | 1811 | .unlocked_ioctl = block_ioctl, |
1725 | #ifdef CONFIG_COMPAT | 1812 | #ifdef CONFIG_COMPAT |
diff --git a/include/linux/fs.h b/include/linux/fs.h index 3aa514254161..96fabc93b583 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -482,6 +482,9 @@ struct block_device { | |||
482 | int bd_fsfreeze_count; | 482 | int bd_fsfreeze_count; |
483 | /* Mutex for freeze */ | 483 | /* Mutex for freeze */ |
484 | struct mutex bd_fsfreeze_mutex; | 484 | struct mutex bd_fsfreeze_mutex; |
485 | #ifdef CONFIG_FS_DAX | ||
486 | int bd_map_count; | ||
487 | #endif | ||
485 | }; | 488 | }; |
486 | 489 | ||
487 | /* | 490 | /* |
@@ -2264,6 +2267,14 @@ extern struct super_block *freeze_bdev(struct block_device *); | |||
2264 | extern void emergency_thaw_all(void); | 2267 | extern void emergency_thaw_all(void); |
2265 | extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); | 2268 | extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); |
2266 | extern int fsync_bdev(struct block_device *); | 2269 | extern int fsync_bdev(struct block_device *); |
2270 | #ifdef CONFIG_FS_DAX | ||
2271 | extern bool blkdev_dax_capable(struct block_device *bdev); | ||
2272 | #else | ||
2273 | static inline bool blkdev_dax_capable(struct block_device *bdev) | ||
2274 | { | ||
2275 | return false; | ||
2276 | } | ||
2277 | #endif | ||
2267 | 2278 | ||
2268 | extern struct super_block *blockdev_superblock; | 2279 | extern struct super_block *blockdev_superblock; |
2269 | 2280 | ||
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index f15d980249b5..401c409e9239 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h | |||
@@ -152,6 +152,8 @@ struct inodes_stat_t { | |||
152 | #define BLKSECDISCARD _IO(0x12,125) | 152 | #define BLKSECDISCARD _IO(0x12,125) |
153 | #define BLKROTATIONAL _IO(0x12,126) | 153 | #define BLKROTATIONAL _IO(0x12,126) |
154 | #define BLKZEROOUT _IO(0x12,127) | 154 | #define BLKZEROOUT _IO(0x12,127) |
155 | #define BLKDAXSET _IO(0x12,128) | ||
156 | #define BLKDAXGET _IO(0x12,129) | ||
155 | 157 | ||
156 | #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ | 158 | #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ |
157 | #define FIBMAP _IO(0x00,1) /* bmap access */ | 159 | #define FIBMAP _IO(0x00,1) /* bmap access */ |