path: root/drivers/dax
author     Dan Williams <dan.j.williams@intel.com>   2016-05-14 15:20:44 -0400
committer  Dan Williams <dan.j.williams@intel.com>   2016-05-21 01:02:55 -0400
commit     dee410792419aaa8bc3e3b35d2ccb6515835916d (patch)
tree       b63a073ff7e6bad601ac7c45d3b0a156d906b052 /drivers/dax
parent     ab68f26221366f92611650e8470e6a926801c7d4 (diff)
/dev/dax, core: file operations and dax-mmap
The "Device DAX" core enables dax mappings of performance / feature
differentiated memory.  An open mapping or file handle keeps the backing
struct device live, but new mappings are only possible while the device is
enabled.  Faults are handled under rcu_read_lock to synchronize with the
enabled state of the device.

Similar to the filesystem-dax case the backing memory may optionally have
struct page entries.  However, unlike fs-dax there is no support for private
mappings, or mappings that are not backed by media (see use of zero-page in
fs-dax).

Mappings are always guaranteed to match the alignment of the dax_region.  If
the dax_region is configured to have a 2MB alignment, all mappings are
guaranteed to be backed by a pmd entry.  Contrast this determinism with the
fs-dax case where pmd mappings are opportunistic.  If userspace attempts to
force a misaligned mapping, the driver will fail the mmap attempt.  See
dax_dev_check_vma() for other scenarios that are rejected, like MAP_PRIVATE
mappings.

Cc: Hannes Reinecke <hare@suse.de>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
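To make the semantics above concrete, here is a minimal userspace sketch of
consuming such a device.  It is an illustration only: the device node name
/dev/dax0.0 and the 2MB region alignment are assumptions for the example, not
something this patch establishes.  The mapping must be MAP_SHARED, and its
offset and length must be multiples of the region alignment; depending on the
region's pfn_flags, madvise(MADV_DONTFORK) may additionally be required (see
check_vma() in the patch below).

/* illustrative sketch: map a hypothetical /dev/dax0.0 backed by a 2MB-aligned region */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const size_t align = 2UL << 20; /* assumed dax_region alignment */
        void *addr;
        int fd;

        fd = open("/dev/dax0.0", O_RDWR);
        if (fd < 0) {
                perror("open");
                return EXIT_FAILURE;
        }

        /* MAP_SHARED is mandatory; a MAP_PRIVATE request is rejected at mmap time */
        addr = mmap(NULL, align, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return EXIT_FAILURE;
        }

        /* loads and stores go directly to the device-dax backed memory */
        ((volatile char *)addr)[0] = 1;

        munmap(addr, align);
        close(fd);
        return EXIT_SUCCESS;
}

With a 2MB-aligned region, the first store above is guaranteed to be satisfied
by a single pmd entry, in contrast to the opportunistic pmd mappings of fs-dax.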
Diffstat (limited to 'drivers/dax')
-rw-r--r--  drivers/dax/Kconfig    1
-rw-r--r--  drivers/dax/dax.c    322
2 files changed, 323 insertions, 0 deletions
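The rcu usage in the patch below follows a common quiescence pattern: rcu is
not protecting the lifetime of the dax_dev object, it only guarantees that
once teardown clears the alive flag and synchronize_rcu() returns, no fault
handler that observed alive == true is still running.  A condensed sketch of
that pattern, with illustrative names rather than the driver's, assuming fault
handlers only act on the device inside the read-side critical section:

#include <linux/rcupdate.h>

/* condensed illustration of the alive-flag + synchronize_rcu() pattern */
static bool example_alive;

static int example_fault(void)
{
        int rc = -ENXIO;

        rcu_read_lock();
        if (example_alive)
                rc = 0;         /* ...establish the mapping here... */
        rcu_read_unlock();
        return rc;
}

static void example_teardown(void)
{
        example_alive = false;
        /* wait for any fault handler that saw example_alive == true */
        synchronize_rcu();
        /* now no new mappings can be established; safe to unregister */
}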
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 86ffbaa891ad..cedab7572de3 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,6 +1,7 @@
 menuconfig DEV_DAX
 	tristate "DAX: direct access to differentiated memory"
 	default m if NVDIMM_DAX
+	depends on TRANSPARENT_HUGEPAGE
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 4c22a40f2335..b891a129b275 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -49,6 +49,7 @@ struct dax_region {
  * @region - parent region
  * @dev - device backing the character device
  * @kref - enable this data to be tracked in filp->private_data
+ * @alive - !alive + rcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
@@ -57,6 +58,7 @@ struct dax_dev {
 	struct dax_region *region;
 	struct device *dev;
 	struct kref kref;
+	bool alive;
 	int id;
 	int num_resources;
 	struct resource res[0];
@@ -150,6 +152,16 @@ static void unregister_dax_dev(void *_dev)
 
 	dev_dbg(dev, "%s\n", __func__);
 
+	/*
+	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
+	 * ensuring that any fault handlers that might have seen
+	 * dax_dev->alive == true, have completed.  Any fault handlers
+	 * that start after synchronize_rcu() has started will abort
+	 * upon seeing dax_dev->alive == false.
+	 */
+	dax_dev->alive = false;
+	synchronize_rcu();
+
 	get_device(dev);
 	device_unregister(dev);
 	ida_simple_remove(&dax_region->ida, dax_dev->id);
@@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 	memcpy(dax_dev->res, res, sizeof(*res) * count);
 	dax_dev->num_resources = count;
 	kref_init(&dax_dev->kref);
+	dax_dev->alive = true;
 	dax_dev->region = dax_region;
 	kref_get(&dax_region->kref);
 
@@ -217,9 +230,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 }
 EXPORT_SYMBOL_GPL(devm_create_dax_dev);
 
+/* return an unmapped area aligned to the dax region specified alignment */
+static unsigned long dax_dev_get_unmapped_area(struct file *filp,
+		unsigned long addr, unsigned long len, unsigned long pgoff,
+		unsigned long flags)
+{
+	unsigned long off, off_end, off_align, len_align, addr_align, align;
+	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
+	struct dax_region *dax_region;
+
+	if (!dax_dev || addr)
+		goto out;
+
+	dax_region = dax_dev->region;
+	align = dax_region->align;
+	off = pgoff << PAGE_SHIFT;
+	off_end = off + len;
+	off_align = round_up(off, align);
+
+	if ((off_end <= off_align) || ((off_end - off_align) < align))
+		goto out;
+
+	len_align = len + align;
+	if ((off + len_align) < off)
+		goto out;
+
+	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
+			pgoff, flags);
+	if (!IS_ERR_VALUE(addr_align)) {
+		addr_align += (off - addr_align) & (align - 1);
+		return addr_align;
+	}
+ out:
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+static int __match_devt(struct device *dev, const void *data)
+{
+	const dev_t *devt = data;
+
+	return dev->devt == *devt;
+}
+
+static struct device *dax_dev_find(dev_t dev_t)
+{
+	return class_find_device(dax_class, NULL, &dev_t, __match_devt);
+}
+
+static int dax_dev_open(struct inode *inode, struct file *filp)
+{
+	struct dax_dev *dax_dev = NULL;
+	struct device *dev;
+
+	dev = dax_dev_find(inode->i_rdev);
+	if (!dev)
+		return -ENXIO;
+
+	device_lock(dev);
+	dax_dev = dev_get_drvdata(dev);
+	if (dax_dev) {
+		dev_dbg(dev, "%s\n", __func__);
+		filp->private_data = dax_dev;
+		kref_get(&dax_dev->kref);
+		inode->i_flags = S_DAX;
+	}
+	device_unlock(dev);
+
+	if (!dax_dev) {
+		put_device(dev);
+		return -ENXIO;
+	}
+	return 0;
+}
+
+static int dax_dev_release(struct inode *inode, struct file *filp)
+{
+	struct dax_dev *dax_dev = filp->private_data;
+	struct device *dev = dax_dev->dev;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	dax_dev_put(dax_dev);
+	put_device(dev);
+
+	return 0;
+}
+
+static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+		const char *func)
+{
+	struct dax_region *dax_region = dax_dev->region;
+	struct device *dev = dax_dev->dev;
+	unsigned long mask;
+
+	if (!dax_dev->alive)
+		return -ENXIO;
+
+	/* prevent private / writable mappings from being established */
+	if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
+		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	mask = dax_region->align - 1;
+	if (vma->vm_start & mask || vma->vm_end & mask) {
+		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
+				current->comm, func, vma->vm_start, vma->vm_end,
+				mask);
+		return -EINVAL;
+	}
+
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
+			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
+		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	if (!vma_is_dax(vma)) {
+		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+		unsigned long size)
+{
+	struct resource *res;
+	phys_addr_t phys;
+	int i;
+
+	for (i = 0; i < dax_dev->num_resources; i++) {
+		res = &dax_dev->res[i];
+		phys = pgoff * PAGE_SIZE + res->start;
+		if (phys >= res->start && phys <= res->end)
+			break;
+		pgoff -= PHYS_PFN(resource_size(res));
+	}
+
+	if (i < dax_dev->num_resources) {
+		res = &dax_dev->res[i];
+		if (phys + size - 1 <= res->end)
+			return phys;
+	}
+
+	return -1;
+}
+
+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	unsigned long vaddr = (unsigned long) vmf->virtual_address;
+	struct device *dev = dax_dev->dev;
+	struct dax_region *dax_region;
+	int rc = VM_FAULT_SIGBUS;
+	phys_addr_t phys;
+	pfn_t pfn;
+
+	if (check_vma(dax_dev, vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dax_dev->region;
+	if (dax_region->align > PAGE_SIZE) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+				vmf->pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	rc = vm_insert_mixed(vma, vaddr, pfn);
+
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (rc < 0 && rc != -EBUSY)
+		return VM_FAULT_SIGBUS;
+
+	return VM_FAULT_NOPAGE;
+}
+
+static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int rc;
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
+			? "write" : "read", vma->vm_start, vma->vm_end);
+	rcu_read_lock();
+	rc = __dax_dev_fault(dax_dev, vma, vmf);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
+		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
+		unsigned int flags)
+{
+	unsigned long pmd_addr = addr & PMD_MASK;
+	struct device *dev = dax_dev->dev;
+	struct dax_region *dax_region;
+	phys_addr_t phys;
+	pgoff_t pgoff;
+	pfn_t pfn;
+
+	if (check_vma(dax_dev, vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dax_dev->region;
+	if (dax_region->align > PMD_SIZE) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	/* dax pmd mappings require pfn_t_devmap() */
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pgoff = linear_page_index(vma, pmd_addr);
+	phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+				pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
+			flags & FAULT_FLAG_WRITE);
+}
+
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned int flags)
+{
+	int rc;
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+			current->comm, (flags & FAULT_FLAG_WRITE)
+			? "write" : "read", vma->vm_start, vma->vm_end);
+
+	rcu_read_lock();
+	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+static void dax_dev_vm_open(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	kref_get(&dax_dev->kref);
+}
+
+static void dax_dev_vm_close(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	dax_dev_put(dax_dev);
+}
+
+static const struct vm_operations_struct dax_dev_vm_ops = {
+	.fault = dax_dev_fault,
+	.pmd_fault = dax_dev_pmd_fault,
+	.open = dax_dev_vm_open,
+	.close = dax_dev_vm_close,
+};
+
+static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct dax_dev *dax_dev = filp->private_data;
+	int rc;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+
+	rc = check_vma(dax_dev, vma, __func__);
+	if (rc)
+		return rc;
+
+	kref_get(&dax_dev->kref);
+	vma->vm_ops = &dax_dev_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	return 0;
+
+}
+
 static const struct file_operations dax_fops = {
 	.llseek = noop_llseek,
 	.owner = THIS_MODULE,
+	.open = dax_dev_open,
+	.release = dax_dev_release,
+	.get_unmapped_area = dax_dev_get_unmapped_area,
+	.mmap = dax_dev_mmap,
 };
 
 static int __init dax_init(void)