aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Williams <dan.j.williams@intel.com>2015-11-30 13:20:29 -0500
committerDan Williams <dan.j.williams@intel.com>2016-01-09 09:30:49 -0500
commit5a023cdba50c5f5f2bc351783b3131699deb3937 (patch)
treebae047e7bf8fb28e48563b33afc2881c226f4e31
parent4ebb16ca9a06a54cdb2e7f2ce1e506fa4d432445 (diff)
block: enable dax for raw block devices
If an application wants exclusive access to all of the persistent memory provided by an NVDIMM namespace it can use this raw-block-dax facility to forgo establishing a filesystem. This capability is targeted primarily to hypervisors wanting to provision persistent memory for guests. It can be disabled / enabled dynamically via the new BLKDAXSET ioctl. Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: Dave Chinner <david@fromorbit.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Ross Zwisler <ross.zwisler@linux.intel.com> Reported-by: kbuild test robot <fengguang.wu@intel.com> Reviewed-by: Jan Kara <jack@suse.com> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r--block/ioctl.c61
-rw-r--r--fs/block_dev.c103
-rw-r--r--include/linux/fs.h11
-rw-r--r--include/uapi/linux/fs.h2
4 files changed, 169 insertions, 8 deletions
diff --git a/block/ioctl.c b/block/ioctl.c
index 0918aed2d847..7a964d842913 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -406,6 +406,62 @@ static inline int is_unrecognized_ioctl(int ret)
406 ret == -ENOIOCTLCMD; 406 ret == -ENOIOCTLCMD;
407} 407}
408 408
409#ifdef CONFIG_FS_DAX
410bool blkdev_dax_capable(struct block_device *bdev)
411{
412 struct gendisk *disk = bdev->bd_disk;
413
414 if (!disk->fops->direct_access)
415 return false;
416
417 /*
418 * If the partition is not aligned on a page boundary, we can't
419 * do dax I/O to it.
420 */
421 if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512))
422 || (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
423 return false;
424
425 return true;
426}
427
428static int blkdev_daxset(struct block_device *bdev, unsigned long argp)
429{
430 unsigned long arg;
431 int rc = 0;
432
433 if (!capable(CAP_SYS_ADMIN))
434 return -EACCES;
435
436 if (get_user(arg, (int __user *)(argp)))
437 return -EFAULT;
438 arg = !!arg;
439 if (arg == !!(bdev->bd_inode->i_flags & S_DAX))
440 return 0;
441
442 if (arg)
443 arg = S_DAX;
444
445 if (arg && !blkdev_dax_capable(bdev))
446 return -ENOTTY;
447
448 mutex_lock(&bdev->bd_inode->i_mutex);
449 if (bdev->bd_map_count == 0)
450 inode_set_flags(bdev->bd_inode, arg, S_DAX);
451 else
452 rc = -EBUSY;
453 mutex_unlock(&bdev->bd_inode->i_mutex);
454 return rc;
455}
456#else
457static int blkdev_daxset(struct block_device *bdev, int arg)
458{
459 if (arg)
460 return -ENOTTY;
461 return 0;
462}
463#endif
464
409static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, 465static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
410 unsigned cmd, unsigned long arg) 466 unsigned cmd, unsigned long arg)
411{ 467{
@@ -568,6 +624,11 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
568 case BLKTRACESETUP: 624 case BLKTRACESETUP:
569 case BLKTRACETEARDOWN: 625 case BLKTRACETEARDOWN:
570 return blk_trace_ioctl(bdev, cmd, argp); 626 return blk_trace_ioctl(bdev, cmd, argp);
627 case BLKDAXSET:
628 return blkdev_daxset(bdev, arg);
629 case BLKDAXGET:
630 return put_int(arg, !!(bdev->bd_inode->i_flags & S_DAX));
631 break;
571 case IOC_PR_REGISTER: 632 case IOC_PR_REGISTER:
572 return blkdev_pr_register(bdev, argp); 633 return blkdev_pr_register(bdev, argp);
573 case IOC_PR_RESERVE: 634 case IOC_PR_RESERVE:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 52248bce42d2..5c0b2cba870e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1235,8 +1235,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1235 } 1235 }
1236 } 1236 }
1237 1237
1238 if (!ret) 1238 if (!ret) {
1239 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1239 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1240 if (!blkdev_dax_capable(bdev))
1241 bdev->bd_inode->i_flags &= ~S_DAX;
1242 }
1240 1243
1241 /* 1244 /*
1242 * If the device is invalidated, rescan partition 1245 * If the device is invalidated, rescan partition
@@ -1250,6 +1253,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1250 else if (ret == -ENOMEDIUM) 1253 else if (ret == -ENOMEDIUM)
1251 invalidate_partitions(disk, bdev); 1254 invalidate_partitions(disk, bdev);
1252 } 1255 }
1256
1253 if (ret) 1257 if (ret)
1254 goto out_clear; 1258 goto out_clear;
1255 } else { 1259 } else {
@@ -1270,12 +1274,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1270 goto out_clear; 1274 goto out_clear;
1271 } 1275 }
1272 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1276 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1273 /* 1277 if (!blkdev_dax_capable(bdev))
1274 * If the partition is not aligned on a page
1275 * boundary, we can't do dax I/O to it.
1276 */
1277 if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
1278 (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
1279 bdev->bd_inode->i_flags &= ~S_DAX; 1278 bdev->bd_inode->i_flags &= ~S_DAX;
1280 } 1279 }
1281 } else { 1280 } else {
@@ -1713,13 +1712,101 @@ static const struct address_space_operations def_blk_aops = {
1713 .is_dirty_writeback = buffer_check_dirty_writeback, 1712 .is_dirty_writeback = buffer_check_dirty_writeback,
1714}; 1713};
1715 1714
1715#ifdef CONFIG_FS_DAX
1716/*
1717 * In the raw block case we do not need to contend with truncation nor
1718 * unwritten file extents. Without those concerns there is no need for
1719 * additional locking beyond the mmap_sem context that these routines
1720 * are already executing under.
1721 *
1722 * Note, there is no protection if the block device is dynamically
1723 * resized (partition grow/shrink) during a fault. A stable block device
1724 * size is already not enforced in the blkdev_direct_IO path.
1725 *
1726 * For DAX, it is the responsibility of the block device driver to
1727 * ensure the whole-disk device size is stable while requests are in
1728 * flight.
1729 *
1730 * Finally, unlike the filemap_page_mkwrite() case there is no
1731 * filesystem superblock to sync against freezing. We still include a
1732 * pfn_mkwrite callback for dax drivers to receive write fault
1733 * notifications.
1734 */
1735static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1736{
1737 return __dax_fault(vma, vmf, blkdev_get_block, NULL);
1738}
1739
1740static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
1741 pmd_t *pmd, unsigned int flags)
1742{
1743 return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
1744}
1745
1746static void blkdev_vm_open(struct vm_area_struct *vma)
1747{
1748 struct inode *bd_inode = bdev_file_inode(vma->vm_file);
1749 struct block_device *bdev = I_BDEV(bd_inode);
1750
1751 mutex_lock(&bd_inode->i_mutex);
1752 bdev->bd_map_count++;
1753 mutex_unlock(&bd_inode->i_mutex);
1754}
1755
1756static void blkdev_vm_close(struct vm_area_struct *vma)
1757{
1758 struct inode *bd_inode = bdev_file_inode(vma->vm_file);
1759 struct block_device *bdev = I_BDEV(bd_inode);
1760
1761 mutex_lock(&bd_inode->i_mutex);
1762 bdev->bd_map_count--;
1763 mutex_unlock(&bd_inode->i_mutex);
1764}
1765
1766static const struct vm_operations_struct blkdev_dax_vm_ops = {
1767 .open = blkdev_vm_open,
1768 .close = blkdev_vm_close,
1769 .fault = blkdev_dax_fault,
1770 .pmd_fault = blkdev_dax_pmd_fault,
1771 .pfn_mkwrite = blkdev_dax_fault,
1772};
1773
1774static const struct vm_operations_struct blkdev_default_vm_ops = {
1775 .open = blkdev_vm_open,
1776 .close = blkdev_vm_close,
1777 .fault = filemap_fault,
1778 .map_pages = filemap_map_pages,
1779};
1780
1781static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
1782{
1783 struct inode *bd_inode = bdev_file_inode(file);
1784 struct block_device *bdev = I_BDEV(bd_inode);
1785
1786 file_accessed(file);
1787 mutex_lock(&bd_inode->i_mutex);
1788 bdev->bd_map_count++;
1789 if (IS_DAX(bd_inode)) {
1790 vma->vm_ops = &blkdev_dax_vm_ops;
1791 vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
1792 } else {
1793 vma->vm_ops = &blkdev_default_vm_ops;
1794 }
1795 mutex_unlock(&bd_inode->i_mutex);
1796
1797 return 0;
1798}
1799#else
1800#define blkdev_mmap generic_file_mmap
1801#endif
1802
1716const struct file_operations def_blk_fops = { 1803const struct file_operations def_blk_fops = {
1717 .open = blkdev_open, 1804 .open = blkdev_open,
1718 .release = blkdev_close, 1805 .release = blkdev_close,
1719 .llseek = block_llseek, 1806 .llseek = block_llseek,
1720 .read_iter = blkdev_read_iter, 1807 .read_iter = blkdev_read_iter,
1721 .write_iter = blkdev_write_iter, 1808 .write_iter = blkdev_write_iter,
1722 .mmap = generic_file_mmap, 1809 .mmap = blkdev_mmap,
1723 .fsync = blkdev_fsync, 1810 .fsync = blkdev_fsync,
1724 .unlocked_ioctl = block_ioctl, 1811 .unlocked_ioctl = block_ioctl,
1725#ifdef CONFIG_COMPAT 1812#ifdef CONFIG_COMPAT
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3aa514254161..96fabc93b583 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -482,6 +482,9 @@ struct block_device {
482 int bd_fsfreeze_count; 482 int bd_fsfreeze_count;
483 /* Mutex for freeze */ 483 /* Mutex for freeze */
484 struct mutex bd_fsfreeze_mutex; 484 struct mutex bd_fsfreeze_mutex;
485#ifdef CONFIG_FS_DAX
486 int bd_map_count;
487#endif
485}; 488};
486 489
487/* 490/*
@@ -2264,6 +2267,14 @@ extern struct super_block *freeze_bdev(struct block_device *);
2264extern void emergency_thaw_all(void); 2267extern void emergency_thaw_all(void);
2265extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); 2268extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
2266extern int fsync_bdev(struct block_device *); 2269extern int fsync_bdev(struct block_device *);
2270#ifdef CONFIG_FS_DAX
2271extern bool blkdev_dax_capable(struct block_device *bdev);
2272#else
2273static inline bool blkdev_dax_capable(struct block_device *bdev)
2274{
2275 return false;
2276}
2277#endif
2267 2278
2268extern struct super_block *blockdev_superblock; 2279extern struct super_block *blockdev_superblock;
2269 2280
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index f15d980249b5..401c409e9239 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -152,6 +152,8 @@ struct inodes_stat_t {
152#define BLKSECDISCARD _IO(0x12,125) 152#define BLKSECDISCARD _IO(0x12,125)
153#define BLKROTATIONAL _IO(0x12,126) 153#define BLKROTATIONAL _IO(0x12,126)
154#define BLKZEROOUT _IO(0x12,127) 154#define BLKZEROOUT _IO(0x12,127)
155#define BLKDAXSET _IO(0x12,128)
156#define BLKDAXGET _IO(0x12,129)
155 157
156#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ 158#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
157#define FIBMAP _IO(0x00,1) /* bmap access */ 159#define FIBMAP _IO(0x00,1) /* bmap access */