diff options
author | Dan Williams <dan.j.williams@intel.com> | 2017-04-07 19:42:08 -0400 |
---|---|---|
committer | Dan Williams <dan.j.williams@intel.com> | 2017-04-12 16:45:18 -0400 |
commit | 956a4cd2c957acf638ff29951aabaa9d8e92bbc2 (patch) | |
tree | d4e8def7593d04b58bad676f1329cdd8446bd00f | |
parent | 4aa5615e080a9855e607accc75b07ab79b252dde (diff) |
device-dax: switch to srcu, fix rcu_read_lock() vs pte allocation
The following warning triggers with a new unit test that stresses the
device-dax interface.
===============================
[ ERR: suspicious RCU usage. ]
4.11.0-rc4+ #1049 Tainted: G O
-------------------------------
./include/linux/rcupdate.h:521 Illegal context switch in RCU read-side critical section!
other info that might help us debug this:
rcu_scheduler_active = 2, debug_locks = 0
2 locks held by fio/9070:
#0: (&mm->mmap_sem){++++++}, at: [<ffffffff8d0739d7>] __do_page_fault+0x167/0x4f0
#1: (rcu_read_lock){......}, at: [<ffffffffc03fbd02>] dax_dev_huge_fault+0x32/0x620 [dax]
Call Trace:
dump_stack+0x86/0xc3
lockdep_rcu_suspicious+0xd7/0x110
___might_sleep+0xac/0x250
__might_sleep+0x4a/0x80
__alloc_pages_nodemask+0x23a/0x360
alloc_pages_current+0xa1/0x1f0
pte_alloc_one+0x17/0x80
__pte_alloc+0x1e/0x120
__get_locked_pte+0x1bf/0x1d0
insert_pfn.isra.70+0x3a/0x100
? lookup_memtype+0xa6/0xd0
vm_insert_mixed+0x64/0x90
dax_dev_huge_fault+0x520/0x620 [dax]
? dax_dev_huge_fault+0x32/0x620 [dax]
dax_dev_fault+0x10/0x20 [dax]
__do_fault+0x1e/0x140
__handle_mm_fault+0x9af/0x10d0
handle_mm_fault+0x16d/0x370
? handle_mm_fault+0x47/0x370
__do_page_fault+0x28c/0x4f0
trace_do_page_fault+0x58/0x2a0
do_async_page_fault+0x1a/0xa0
async_page_fault+0x28/0x30
Inserting a page table entry may trigger an allocation while we are
holding a read lock to keep the device instance alive for the duration
of the fault. Use srcu for this keep-alive protection.
Fixes: dee410792419 ("/dev/dax, core: file operations and dax-mmap")
Cc: <stable@vger.kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r-- | drivers/dax/Kconfig | 1 | ||||
-rw-r--r-- | drivers/dax/dax.c | 13 |
2 files changed, 8 insertions, 6 deletions
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 3e2ab3b14eea..9e95bf94eb13 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig | |||
@@ -2,6 +2,7 @@ menuconfig DEV_DAX | |||
2 | tristate "DAX: direct access to differentiated memory" | 2 | tristate "DAX: direct access to differentiated memory" |
3 | default m if NVDIMM_DAX | 3 | default m if NVDIMM_DAX |
4 | depends on TRANSPARENT_HUGEPAGE | 4 | depends on TRANSPARENT_HUGEPAGE |
5 | select SRCU | ||
5 | help | 6 | help |
6 | Support raw access to differentiated (persistence, bandwidth, | 7 | Support raw access to differentiated (persistence, bandwidth, |
7 | latency...) memory via an mmap(2) capable character | 8 | latency...) memory via an mmap(2) capable character |
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 80c6db279ae1..806f180c80d8 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include "dax.h" | 25 | #include "dax.h" |
26 | 26 | ||
27 | static dev_t dax_devt; | 27 | static dev_t dax_devt; |
28 | DEFINE_STATIC_SRCU(dax_srcu); | ||
28 | static struct class *dax_class; | 29 | static struct class *dax_class; |
29 | static DEFINE_IDA(dax_minor_ida); | 30 | static DEFINE_IDA(dax_minor_ida); |
30 | static int nr_dax = CONFIG_NR_DEV_DAX; | 31 | static int nr_dax = CONFIG_NR_DEV_DAX; |
@@ -60,7 +61,7 @@ struct dax_region { | |||
60 | * @region - parent region | 61 | * @region - parent region |
61 | * @dev - device backing the character device | 62 | * @dev - device backing the character device |
62 | * @cdev - core chardev data | 63 | * @cdev - core chardev data |
63 | * @alive - !alive + rcu grace period == no new mappings can be established | 64 | * @alive - !alive + srcu grace period == no new mappings can be established |
64 | * @id - child id in the region | 65 | * @id - child id in the region |
65 | * @num_resources - number of physical address extents in this device | 66 | * @num_resources - number of physical address extents in this device |
66 | * @res - array of physical address ranges | 67 | * @res - array of physical address ranges |
@@ -569,7 +570,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) | |||
569 | static int dax_dev_huge_fault(struct vm_fault *vmf, | 570 | static int dax_dev_huge_fault(struct vm_fault *vmf, |
570 | enum page_entry_size pe_size) | 571 | enum page_entry_size pe_size) |
571 | { | 572 | { |
572 | int rc; | 573 | int rc, id; |
573 | struct file *filp = vmf->vma->vm_file; | 574 | struct file *filp = vmf->vma->vm_file; |
574 | struct dax_dev *dax_dev = filp->private_data; | 575 | struct dax_dev *dax_dev = filp->private_data; |
575 | 576 | ||
@@ -578,7 +579,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, | |||
578 | ? "write" : "read", | 579 | ? "write" : "read", |
579 | vmf->vma->vm_start, vmf->vma->vm_end); | 580 | vmf->vma->vm_start, vmf->vma->vm_end); |
580 | 581 | ||
581 | rcu_read_lock(); | 582 | id = srcu_read_lock(&dax_srcu); |
582 | switch (pe_size) { | 583 | switch (pe_size) { |
583 | case PE_SIZE_PTE: | 584 | case PE_SIZE_PTE: |
584 | rc = __dax_dev_pte_fault(dax_dev, vmf); | 585 | rc = __dax_dev_pte_fault(dax_dev, vmf); |
@@ -592,7 +593,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, | |||
592 | default: | 593 | default: |
593 | return VM_FAULT_FALLBACK; | 594 | return VM_FAULT_FALLBACK; |
594 | } | 595 | } |
595 | rcu_read_unlock(); | 596 | srcu_read_unlock(&dax_srcu, id); |
596 | 597 | ||
597 | return rc; | 598 | return rc; |
598 | } | 599 | } |
@@ -713,11 +714,11 @@ static void unregister_dax_dev(void *dev) | |||
713 | * Note, rcu is not protecting the liveness of dax_dev, rcu is | 714 | * Note, rcu is not protecting the liveness of dax_dev, rcu is |
714 | * ensuring that any fault handlers that might have seen | 715 | * ensuring that any fault handlers that might have seen |
715 | * dax_dev->alive == true, have completed. Any fault handlers | 716 | * dax_dev->alive == true, have completed. Any fault handlers |
716 | * that start after synchronize_rcu() has started will abort | 717 | * that start after synchronize_srcu() has started will abort |
717 | * upon seeing dax_dev->alive == false. | 718 | * upon seeing dax_dev->alive == false. |
718 | */ | 719 | */ |
719 | dax_dev->alive = false; | 720 | dax_dev->alive = false; |
720 | synchronize_rcu(); | 721 | synchronize_srcu(&dax_srcu); |
721 | unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); | 722 | unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); |
722 | cdev_del(cdev); | 723 | cdev_del(cdev); |
723 | device_unregister(dev); | 724 | device_unregister(dev); |