author	Jérôme Glisse <jglisse@redhat.com>	2018-04-10 19:29:02 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-04-11 13:28:31 -0400
commit	2aee09d8c1164219971c7b396f2235bd5334018c (patch)
tree	121ed9085293eeb478d1a8465514ab96ed5eb2a8 /mm/hmm.c
parent	53f5c3f489ecddc7570a2e2422a6fc5b25007b9d (diff)
mm/hmm: change hmm_vma_fault() to allow write fault on page basis
This changes hmm_vma_fault() so that it no longer takes a global write fault flag for the whole range. Instead it relies on the caller to populate the HMM pfns array with the proper fault flag for each address: HMM_PFN_VALID if the driver wants a read fault for that address, or HMM_PFN_VALID and HMM_PFN_WRITE for a write fault. Moreover, by setting HMM_PFN_DEVICE_PRIVATE the device driver can ask for device private memory to be migrated back to system memory through a page fault.

This is a more flexible API and it better reflects how the device handles and reports faults.

Link: http://lkml.kernel.org/r/20180323005527.758-15-jglisse@redhat.com
Signed-off-by: Jérôme Glisse <jglisse@redhat.com>
Cc: Evgeny Baskakov <ebaskakov@nvidia.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mark Hairgrove <mhairgrove@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
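For illustration, a minimal caller sketch of the reworked interface (not part of this patch): the driver pre-fills range->pfns with the access it needs for each page, then calls hmm_vma_fault() with only the blocking flag. hmm_vma_fault(), HMM_PFN_VALID, HMM_PFN_WRITE and the range->start/end/pfns fields are taken from this series; the my_drv_fault_range() helper and its simplified error handling are hypothetical.

/*
 * Hypothetical caller sketch: request read access for every page in
 * [range->start, range->end) and write access for the one page the
 * device intends to dirty.
 */
static int my_drv_fault_range(struct hmm_range *range, unsigned long write_addr)
{
	unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
	unsigned long i;

	/* read access is enough for most pages */
	for (i = 0; i < npages; i++)
		range->pfns[i] = HMM_PFN_VALID;

	/* ask for a write fault only where the device will actually write */
	i = (write_addr - range->start) >> PAGE_SHIFT;
	range->pfns[i] = HMM_PFN_VALID | HMM_PFN_WRITE;

	/* block == true: sleep on faults instead of dropping mmap_sem */
	return hmm_vma_fault(range, true);
}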
Diffstat (limited to 'mm/hmm.c')
-rw-r--r--	mm/hmm.c	151
1 file changed, 118 insertions(+), 33 deletions(-)
diff --git a/mm/hmm.c b/mm/hmm.c
index 2cc4dda1fd2e..290c872062a1 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -299,12 +299,10 @@ struct hmm_vma_walk {
 	unsigned long		last;
 	bool			fault;
 	bool			block;
-	bool			write;
 };
 
-static int hmm_vma_do_fault(struct mm_walk *walk,
-			    unsigned long addr,
-			    uint64_t *pfn)
+static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
+			    bool write_fault, uint64_t *pfn)
 {
 	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
@@ -312,7 +310,7 @@ static int hmm_vma_do_fault(struct mm_walk *walk,
 	int r;
 
 	flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
-	flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+	flags |= write_fault ? FAULT_FLAG_WRITE : 0;
 	r = handle_mm_fault(vma, addr, flags);
 	if (r & VM_FAULT_RETRY)
 		return -EBUSY;
@@ -344,15 +342,17 @@ static int hmm_pfns_bad(unsigned long addr,
  * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
  * @start: range virtual start address (inclusive)
  * @end: range virtual end address (exclusive)
+ * @fault: should we fault or not ?
+ * @write_fault: write fault ?
  * @walk: mm_walk structure
  * Returns: 0 on success, -EAGAIN after page fault, or page fault error
  *
  * This function will be called whenever pmd_none() or pte_none() returns true,
  * or whenever there is no page directory covering the virtual address range.
  */
-static int hmm_vma_walk_hole(unsigned long addr,
-			     unsigned long end,
-			     struct mm_walk *walk)
+static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end,
+			      bool fault, bool write_fault,
+			      struct mm_walk *walk)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct hmm_range *range = hmm_vma_walk->range;
@@ -363,16 +363,89 @@ static int hmm_vma_walk_hole(unsigned long addr,
 	i = (addr - range->start) >> PAGE_SHIFT;
 	for (; addr < end; addr += PAGE_SIZE, i++) {
 		pfns[i] = 0;
-		if (hmm_vma_walk->fault) {
+		if (fault || write_fault) {
 			int ret;
 
-			ret = hmm_vma_do_fault(walk, addr, &pfns[i]);
+			ret = hmm_vma_do_fault(walk, addr, write_fault,
+					       &pfns[i]);
 			if (ret != -EAGAIN)
 				return ret;
 		}
 	}
 
-	return hmm_vma_walk->fault ? -EAGAIN : 0;
+	return (fault || write_fault) ? -EAGAIN : 0;
+}
+
+static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+				      uint64_t pfns, uint64_t cpu_flags,
+				      bool *fault, bool *write_fault)
+{
+	*fault = *write_fault = false;
+	if (!hmm_vma_walk->fault)
+		return;
+
+	/* We aren't ask to do anything ... */
+	if (!(pfns & HMM_PFN_VALID))
+		return;
+	/* If CPU page table is not valid then we need to fault */
+	*fault = cpu_flags & HMM_PFN_VALID;
+	/* Need to write fault ? */
+	if ((pfns & HMM_PFN_WRITE) && !(cpu_flags & HMM_PFN_WRITE)) {
+		*fault = *write_fault = true;
+		return;
+	}
+	/* Do we fault on device memory ? */
+	if ((pfns & HMM_PFN_DEVICE_PRIVATE) &&
+	    (cpu_flags & HMM_PFN_DEVICE_PRIVATE)) {
+		*write_fault = pfns & HMM_PFN_WRITE;
+		*fault = true;
+	}
+}
+
+static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
+				 const uint64_t *pfns, unsigned long npages,
+				 uint64_t cpu_flags, bool *fault,
+				 bool *write_fault)
+{
+	unsigned long i;
+
+	if (!hmm_vma_walk->fault) {
+		*fault = *write_fault = false;
+		return;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
+				   fault, write_fault);
+		if ((*fault) || (*write_fault))
+			return;
+	}
+}
+
+static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
+			     struct mm_walk *walk)
+{
+	struct hmm_vma_walk *hmm_vma_walk = walk->private;
+	struct hmm_range *range = hmm_vma_walk->range;
+	bool fault, write_fault;
+	unsigned long i, npages;
+	uint64_t *pfns;
+
+	i = (addr - range->start) >> PAGE_SHIFT;
+	npages = (end - addr) >> PAGE_SHIFT;
+	pfns = &range->pfns[i];
+	hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+			     0, &fault, &write_fault);
+	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+}
+
+static inline uint64_t pmd_to_hmm_pfn_flags(pmd_t pmd)
+{
+	if (pmd_protnone(pmd))
+		return 0;
+	return pmd_write(pmd) ? HMM_PFN_VALID |
+				HMM_PFN_WRITE :
+				HMM_PFN_VALID;
 }
 
 static int hmm_vma_handle_pmd(struct mm_walk *walk,
@@ -382,14 +455,17 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
 			      pmd_t pmd)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
-	unsigned long pfn, i;
-	uint64_t flag = 0;
+	unsigned long pfn, npages, i;
+	uint64_t flag = 0, cpu_flags;
+	bool fault, write_fault;
 
-	if (pmd_protnone(pmd))
-		return hmm_vma_walk_hole(addr, end, walk);
+	npages = (end - addr) >> PAGE_SHIFT;
+	cpu_flags = pmd_to_hmm_pfn_flags(pmd);
+	hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags,
+			     &fault, &write_fault);
 
-	if ((hmm_vma_walk->fault & hmm_vma_walk->write) && !pmd_write(pmd))
-		return hmm_vma_walk_hole(addr, end, walk);
+	if (pmd_protnone(pmd) || fault || write_fault)
+		return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 
 	pfn = pmd_pfn(pmd) + pte_index(addr);
 	flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0;
@@ -399,19 +475,32 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
 	return 0;
 }
 
+static inline uint64_t pte_to_hmm_pfn_flags(pte_t pte)
+{
+	if (pte_none(pte) || !pte_present(pte))
+		return 0;
+	return pte_write(pte) ? HMM_PFN_VALID |
+				HMM_PFN_WRITE :
+				HMM_PFN_VALID;
+}
+
 static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 			      unsigned long end, pmd_t *pmdp, pte_t *ptep,
 			      uint64_t *pfn)
 {
 	struct hmm_vma_walk *hmm_vma_walk = walk->private;
 	struct vm_area_struct *vma = walk->vma;
+	bool fault, write_fault;
+	uint64_t cpu_flags;
 	pte_t pte = *ptep;
 
 	*pfn = 0;
+	cpu_flags = pte_to_hmm_pfn_flags(pte);
+	hmm_pte_need_fault(hmm_vma_walk, *pfn, cpu_flags,
+			   &fault, &write_fault);
 
 	if (pte_none(pte)) {
-		*pfn = 0;
-		if (hmm_vma_walk->fault)
+		if (fault || write_fault)
 			goto fault;
 		return 0;
 	}
@@ -420,7 +509,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		swp_entry_t entry = pte_to_swp_entry(pte);
 
 		if (!non_swap_entry(entry)) {
-			if (hmm_vma_walk->fault)
+			if (fault || write_fault)
 				goto fault;
 			return 0;
 		}
@@ -430,21 +519,20 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		 * device and report anything else as error.
 		 */
 		if (is_device_private_entry(entry)) {
+			cpu_flags = HMM_PFN_VALID | HMM_PFN_DEVICE_PRIVATE;
+			cpu_flags |= is_write_device_private_entry(entry) ?
+				HMM_PFN_WRITE : 0;
 			*pfn = hmm_pfn_from_pfn(swp_offset(entry));
-			if (is_write_device_private_entry(entry)) {
-				*pfn |= HMM_PFN_WRITE;
-			} else if ((hmm_vma_walk->fault & hmm_vma_walk->write))
-				goto fault;
 			*pfn |= HMM_PFN_DEVICE_PRIVATE;
 			return 0;
 		}
 
 		if (is_migration_entry(entry)) {
-			if (hmm_vma_walk->fault) {
+			if (fault || write_fault) {
 				pte_unmap(ptep);
 				hmm_vma_walk->last = addr;
 				migration_entry_wait(vma->vm_mm,
 						     pmdp, addr);
 				return -EAGAIN;
 			}
 			return 0;
@@ -455,17 +543,16 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
 		return -EFAULT;
 	}
 
-	if ((hmm_vma_walk->fault & hmm_vma_walk->write) && !pte_write(pte))
+	if (fault || write_fault)
 		goto fault;
 
-	*pfn = hmm_pfn_from_pfn(pte_pfn(pte));
-	*pfn |= pte_write(pte) ? HMM_PFN_WRITE : 0;
+	*pfn = hmm_pfn_from_pfn(pte_pfn(pte)) | cpu_flags;
 	return 0;
 
 fault:
 	pte_unmap(ptep);
 	/* Fault any virtual address we were asked to fault */
-	return hmm_vma_walk_hole(addr, end, walk);
+	return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
 }
 
 static int hmm_vma_walk_pmd(pmd_t *pmdp,
@@ -686,7 +773,6 @@ EXPORT_SYMBOL(hmm_vma_range_done);
 /*
  * hmm_vma_fault() - try to fault some address in a virtual address range
  * @range: range being faulted
- * @write: is it a write fault
  * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
  * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
  *
@@ -731,7 +817,7 @@ EXPORT_SYMBOL(hmm_vma_range_done);
  *
  * YOU HAVE BEEN WARNED !
  */
-int hmm_vma_fault(struct hmm_range *range, bool write, bool block)
+int hmm_vma_fault(struct hmm_range *range, bool block)
 {
 	struct vm_area_struct *vma = range->vma;
 	unsigned long start = range->start;
@@ -779,7 +865,6 @@ int hmm_vma_fault(struct hmm_range *range, bool write, bool block)
 	spin_unlock(&hmm->lock);
 
 	hmm_vma_walk.fault = true;
-	hmm_vma_walk.write = write;
 	hmm_vma_walk.block = block;
 	hmm_vma_walk.range = range;
 	mm_walk.private = &hmm_vma_walk;
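As a side note, the per-page decision this patch introduces can be illustrated with a small self-contained sketch (plain userspace C, not the kernel code above): fault when the driver requested access (HMM_PFN_VALID) that the CPU page table does not currently provide, and escalate to a write fault when HMM_PFN_WRITE is requested but the CPU mapping is read-only. The flag values and the need_fault() helper below are illustrative stand-ins for the intent described in the commit message and code comments, not the kernel's definitions or a line-for-line copy of hmm_pte_need_fault().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative placeholder values; the kernel defines its own encoding. */
#define PFN_VALID          (1ULL << 0)
#define PFN_WRITE          (1ULL << 1)
#define PFN_DEVICE_PRIVATE (1ULL << 2)

/* Decide whether a (write) fault is needed for one page. */
static void need_fault(uint64_t requested, uint64_t cpu_flags,
		       bool *fault, bool *write_fault)
{
	*fault = *write_fault = false;
	if (!(requested & PFN_VALID))		/* driver asked for nothing */
		return;
	if (!(cpu_flags & PFN_VALID))		/* page not present at all */
		*fault = true;
	if ((requested & PFN_WRITE) && !(cpu_flags & PFN_WRITE))
		*fault = *write_fault = true;	/* need write, CPU is read-only */
	if ((requested & PFN_DEVICE_PRIVATE) &&
	    (cpu_flags & PFN_DEVICE_PRIVATE)) {
		/* migrate device private memory back through a fault */
		*fault = true;
		*write_fault = requested & PFN_WRITE;
	}
}

int main(void)
{
	bool f, wf;

	need_fault(PFN_VALID, PFN_VALID, &f, &wf);
	printf("read wanted, page mapped      -> fault=%d write=%d\n", f, wf);
	need_fault(PFN_VALID | PFN_WRITE, PFN_VALID, &f, &wf);
	printf("write wanted, read-only page  -> fault=%d write=%d\n", f, wf);
	need_fault(PFN_VALID, 0, &f, &wf);
	printf("read wanted, page not present -> fault=%d write=%d\n", f, wf);
	return 0;
}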