about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-08-29 12:11:06 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2017-08-29 12:11:06 -0400
commit785373b4c38719f4af6775845df6be1dfaea120f (patch)
tree36ddccb8d7d97def30c1830ab575e7c856ca8c40
parent9c3a815f471a84811cf8021cf64aae3b8081dfde (diff)
Revert "rmap: do not call mmu_notifier_invalidate_page() under ptl"
This reverts commit aac2fea94f7a3df8ad1eeb477eb2643f81fd5393. It turns out that that patch was complete and utter garbage, and broke KVM, resulting in odd oopses. Quoting Andrea Arcangeli: "The aforementioned commit has 3 bugs. 1) mmu_notifier_invalidate_range cannot be used in replacement of mmu_notifier_invalidate_range_start/end. For KVM mmu_notifier_invalidate_range is a noop and rightfully so. A MMU notifier implementation has to implement either ->invalidate_range method or the invalidate_range_start/end methods, not both. And if you implement invalidate_range_start/end like KVM is forced to do, calling mmu_notifier_invalidate_range in common code is a noop for KVM. For those MMU notifiers that can get away only implementing ->invalidate_range, the ->invalidate_range is implicitly called by mmu_notifier_invalidate_range_end(). And only those secondary MMUs that share the same pagetable with the primary MMU (like AMD iommuv2) can get away only implementing ->invalidate_range. So all cases (THP on/off) are broken right now. To fix this is enough to replace mmu_notifier_invalidate_range with mmu_notifier_invalidate_range_start;mmu_notifier_invalidate_range_end. Either that or call multiple mmu_notifier_invalidate_page like before. 2) address + (1UL << compound_order(page)) is buggy, it should be PAGE_SIZE << compound_order(page), it's bytes not pages, 2M not 512. 3) The whole invalidate_range thing was an attempt to call a single invalidate while walking multiple 4k ptes that maps the same THP (after a pmd virtual split without physical compound page THP split). It's unclear if the rmap_walk will always provide an address that is 2M aligned as parameter to try_to_unmap_one, in presence of THP. I think it needs also an address &= (PAGE_SIZE << compound_order(page)) - 1 to be safe" In general, we should stop making excuses for horrible MMU notifier users. It's much more important that the core VM is sane and safe, than letting MMU notifiers sleep.
So if some MMU notifier is sleeping under a spinlock, we need to fix the notifier, not try to make excuses for that garbage in the core VM. Reported-and-tested-by: Bernhard Held <berny156@gmx.de> Reported-and-tested-by: Adam Borowski <kilobyte@angband.pl> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Radim Krčmář <rkrcmar@redhat.com> Cc: Wanpeng Li <kernellwp@gmail.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Takashi Iwai <tiwai@suse.de> Cc: Nadav Amit <nadav.amit@gmail.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Jérôme Glisse <jglisse@redhat.com> Cc: axie <axie@amd.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/rmap.c52
1 files changed, 22 insertions, 30 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index c1286d47aa1f..c8993c63eb25 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -888,10 +888,10 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
888 .flags = PVMW_SYNC, 888 .flags = PVMW_SYNC,
889 }; 889 };
890 int *cleaned = arg; 890 int *cleaned = arg;
891 bool invalidation_needed = false;
892 891
893 while (page_vma_mapped_walk(&pvmw)) { 892 while (page_vma_mapped_walk(&pvmw)) {
894 int ret = 0; 893 int ret = 0;
894 address = pvmw.address;
895 if (pvmw.pte) { 895 if (pvmw.pte) {
896 pte_t entry; 896 pte_t entry;
897 pte_t *pte = pvmw.pte; 897 pte_t *pte = pvmw.pte;
@@ -899,11 +899,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
899 if (!pte_dirty(*pte) && !pte_write(*pte)) 899 if (!pte_dirty(*pte) && !pte_write(*pte))
900 continue; 900 continue;
901 901
902 flush_cache_page(vma, pvmw.address, pte_pfn(*pte)); 902 flush_cache_page(vma, address, pte_pfn(*pte));
903 entry = ptep_clear_flush(vma, pvmw.address, pte); 903 entry = ptep_clear_flush(vma, address, pte);
904 entry = pte_wrprotect(entry); 904 entry = pte_wrprotect(entry);
905 entry = pte_mkclean(entry); 905 entry = pte_mkclean(entry);
906 set_pte_at(vma->vm_mm, pvmw.address, pte, entry); 906 set_pte_at(vma->vm_mm, address, pte, entry);
907 ret = 1; 907 ret = 1;
908 } else { 908 } else {
909#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE 909#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -913,11 +913,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
913 if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) 913 if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
914 continue; 914 continue;
915 915
916 flush_cache_page(vma, pvmw.address, page_to_pfn(page)); 916 flush_cache_page(vma, address, page_to_pfn(page));
917 entry = pmdp_huge_clear_flush(vma, pvmw.address, pmd); 917 entry = pmdp_huge_clear_flush(vma, address, pmd);
918 entry = pmd_wrprotect(entry); 918 entry = pmd_wrprotect(entry);
919 entry = pmd_mkclean(entry); 919 entry = pmd_mkclean(entry);
920 set_pmd_at(vma->vm_mm, pvmw.address, pmd, entry); 920 set_pmd_at(vma->vm_mm, address, pmd, entry);
921 ret = 1; 921 ret = 1;
922#else 922#else
923 /* unexpected pmd-mapped page? */ 923 /* unexpected pmd-mapped page? */
@@ -926,16 +926,11 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
926 } 926 }
927 927
928 if (ret) { 928 if (ret) {
929 mmu_notifier_invalidate_page(vma->vm_mm, address);
929 (*cleaned)++; 930 (*cleaned)++;
930 invalidation_needed = true;
931 } 931 }
932 } 932 }
933 933
934 if (invalidation_needed) {
935 mmu_notifier_invalidate_range(vma->vm_mm, address,
936 address + (1UL << compound_order(page)));
937 }
938
939 return true; 934 return true;
940} 935}
941 936
@@ -1328,7 +1323,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1328 }; 1323 };
1329 pte_t pteval; 1324 pte_t pteval;
1330 struct page *subpage; 1325 struct page *subpage;
1331 bool ret = true, invalidation_needed = false; 1326 bool ret = true;
1332 enum ttu_flags flags = (enum ttu_flags)arg; 1327 enum ttu_flags flags = (enum ttu_flags)arg;
1333 1328
1334 /* munlock has nothing to gain from examining un-locked vmas */ 1329 /* munlock has nothing to gain from examining un-locked vmas */
@@ -1368,9 +1363,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1368 VM_BUG_ON_PAGE(!pvmw.pte, page); 1363 VM_BUG_ON_PAGE(!pvmw.pte, page);
1369 1364
1370 subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte); 1365 subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
1366 address = pvmw.address;
1367
1371 1368
1372 if (!(flags & TTU_IGNORE_ACCESS)) { 1369 if (!(flags & TTU_IGNORE_ACCESS)) {
1373 if (ptep_clear_flush_young_notify(vma, pvmw.address, 1370 if (ptep_clear_flush_young_notify(vma, address,
1374 pvmw.pte)) { 1371 pvmw.pte)) {
1375 ret = false; 1372 ret = false;
1376 page_vma_mapped_walk_done(&pvmw); 1373 page_vma_mapped_walk_done(&pvmw);
@@ -1379,7 +1376,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1379 } 1376 }
1380 1377
1381 /* Nuke the page table entry. */ 1378 /* Nuke the page table entry. */
1382 flush_cache_page(vma, pvmw.address, pte_pfn(*pvmw.pte)); 1379 flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
1383 if (should_defer_flush(mm, flags)) { 1380 if (should_defer_flush(mm, flags)) {
1384 /* 1381 /*
1385 * We clear the PTE but do not flush so potentially 1382 * We clear the PTE but do not flush so potentially
@@ -1389,12 +1386,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1389 * transition on a cached TLB entry is written through 1386 * transition on a cached TLB entry is written through
1390 * and traps if the PTE is unmapped. 1387 * and traps if the PTE is unmapped.
1391 */ 1388 */
1392 pteval = ptep_get_and_clear(mm, pvmw.address, 1389 pteval = ptep_get_and_clear(mm, address, pvmw.pte);
1393 pvmw.pte);
1394 1390
1395 set_tlb_ubc_flush_pending(mm, pte_dirty(pteval)); 1391 set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
1396 } else { 1392 } else {
1397 pteval = ptep_clear_flush(vma, pvmw.address, pvmw.pte); 1393 pteval = ptep_clear_flush(vma, address, pvmw.pte);
1398 } 1394 }
1399 1395
1400 /* Move the dirty bit to the page. Now the pte is gone. */ 1396 /* Move the dirty bit to the page. Now the pte is gone. */
@@ -1409,12 +1405,12 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1409 if (PageHuge(page)) { 1405 if (PageHuge(page)) {
1410 int nr = 1 << compound_order(page); 1406 int nr = 1 << compound_order(page);
1411 hugetlb_count_sub(nr, mm); 1407 hugetlb_count_sub(nr, mm);
1412 set_huge_swap_pte_at(mm, pvmw.address, 1408 set_huge_swap_pte_at(mm, address,
1413 pvmw.pte, pteval, 1409 pvmw.pte, pteval,
1414 vma_mmu_pagesize(vma)); 1410 vma_mmu_pagesize(vma));
1415 } else { 1411 } else {
1416 dec_mm_counter(mm, mm_counter(page)); 1412 dec_mm_counter(mm, mm_counter(page));
1417 set_pte_at(mm, pvmw.address, pvmw.pte, pteval); 1413 set_pte_at(mm, address, pvmw.pte, pteval);
1418 } 1414 }
1419 1415
1420 } else if (pte_unused(pteval)) { 1416 } else if (pte_unused(pteval)) {
@@ -1438,7 +1434,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1438 swp_pte = swp_entry_to_pte(entry); 1434 swp_pte = swp_entry_to_pte(entry);
1439 if (pte_soft_dirty(pteval)) 1435 if (pte_soft_dirty(pteval))
1440 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1436 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1441 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); 1437 set_pte_at(mm, address, pvmw.pte, swp_pte);
1442 } else if (PageAnon(page)) { 1438 } else if (PageAnon(page)) {
1443 swp_entry_t entry = { .val = page_private(subpage) }; 1439 swp_entry_t entry = { .val = page_private(subpage) };
1444 pte_t swp_pte; 1440 pte_t swp_pte;
@@ -1464,7 +1460,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1464 * If the page was redirtied, it cannot be 1460 * If the page was redirtied, it cannot be
1465 * discarded. Remap the page to page table. 1461 * discarded. Remap the page to page table.
1466 */ 1462 */
1467 set_pte_at(mm, pvmw.address, pvmw.pte, pteval); 1463 set_pte_at(mm, address, pvmw.pte, pteval);
1468 SetPageSwapBacked(page); 1464 SetPageSwapBacked(page);
1469 ret = false; 1465 ret = false;
1470 page_vma_mapped_walk_done(&pvmw); 1466 page_vma_mapped_walk_done(&pvmw);
@@ -1472,7 +1468,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1472 } 1468 }
1473 1469
1474 if (swap_duplicate(entry) < 0) { 1470 if (swap_duplicate(entry) < 0) {
1475 set_pte_at(mm, pvmw.address, pvmw.pte, pteval); 1471 set_pte_at(mm, address, pvmw.pte, pteval);
1476 ret = false; 1472 ret = false;
1477 page_vma_mapped_walk_done(&pvmw); 1473 page_vma_mapped_walk_done(&pvmw);
1478 break; 1474 break;
@@ -1488,18 +1484,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1488 swp_pte = swp_entry_to_pte(entry); 1484 swp_pte = swp_entry_to_pte(entry);
1489 if (pte_soft_dirty(pteval)) 1485 if (pte_soft_dirty(pteval))
1490 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1486 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1491 set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); 1487 set_pte_at(mm, address, pvmw.pte, swp_pte);
1492 } else 1488 } else
1493 dec_mm_counter(mm, mm_counter_file(page)); 1489 dec_mm_counter(mm, mm_counter_file(page));
1494discard: 1490discard:
1495 page_remove_rmap(subpage, PageHuge(page)); 1491 page_remove_rmap(subpage, PageHuge(page));
1496 put_page(page); 1492 put_page(page);
1497 invalidation_needed = true; 1493 mmu_notifier_invalidate_page(mm, address);
1498 } 1494 }
1499
1500 if (invalidation_needed)
1501 mmu_notifier_invalidate_range(mm, address,
1502 address + (1UL << compound_order(page)));
1503 return ret; 1495 return ret;
1504} 1496}
1505 1497