author    Mel Gorman <mgorman@suse.de>  2015-03-25 18:55:40 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-03-25 19:20:31 -0400
commit    b191f9b106ea1a24a711dbebb2925d3313da5852 (patch)
tree      d47cd29412ed7c10fbd5415e2bf2d8ebcc8366d0 /mm
parent    bea66fbd11af1ca98ae26855eea41eda8582923e (diff)
mm: numa: preserve PTE write permissions across a NUMA hinting fault
Protecting a PTE to trap a NUMA hinting fault clears the writable bit,
and further faults are needed after trapping a NUMA hinting fault to set
the writable bit again.  This patch preserves the writable bit when
trapping NUMA hinting faults.  The impact is obvious from the number of
minor faults trapped during the basic balancing benchmark and the system
CPU usage:

autonumabench
                                          4.0.0-rc4             4.0.0-rc4
                                           baseline              preserve
Time System-NUMA01                 107.13 (  0.00%)      103.13 (  3.73%)
Time System-NUMA01_THEADLOCAL      131.87 (  0.00%)       83.30 ( 36.83%)
Time System-NUMA02                   8.95 (  0.00%)       10.72 (-19.78%)
Time System-NUMA02_SMT               4.57 (  0.00%)        3.99 ( 12.69%)
Time Elapsed-NUMA01                515.78 (  0.00%)      517.26 ( -0.29%)
Time Elapsed-NUMA01_THEADLOCAL     384.10 (  0.00%)      384.31 ( -0.05%)
Time Elapsed-NUMA02                 48.86 (  0.00%)       48.78 (  0.16%)
Time Elapsed-NUMA02_SMT             47.98 (  0.00%)       48.12 ( -0.29%)

               4.0.0-rc4   4.0.0-rc4
                baseline    preserve
User            44383.95    43971.89
System            252.61      201.24
Elapsed           998.68     1000.94

Minor Faults     2597249     1981230
Major Faults         365         364

There is a similar drop in system CPU usage using Dave Chinner's
xfsrepair workload:

                                 4.0.0-rc4             4.0.0-rc4
                                  baseline              preserve
Amean    real-xfsrepair   454.14 (  0.00%)      442.36 (  2.60%)
Amean    syst-xfsrepair   277.20 (  0.00%)      204.68 ( 26.16%)

The patch looks hacky but the alternatives looked worse.  The tidiest
was to rewalk the page tables after a hinting fault, but it was more
complex than this approach and the performance was worse.  It's not
generally safe to just mark the page writable during the fault if it's
a write fault, as it may have been read-only for COW, so that approach
was discarded.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reported-by: Dave Chinner <david@fromorbit.com>
Tested-by: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
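[Editor's sketch] The essence of the patch, repeated for the regular PTE,
huge PMD and mprotect paths in the diff below, is a save-and-restore of
the writable bit around the protection rebuild.  The following is a toy
userspace model, not kernel code: pte_t is reduced to a plain bitmask and
pte_modify(), pte_write() and pte_mkwrite() are simplified stand-ins for
the real helpers, just to show why rebuilding an entry from the VMA's
base protection drops the write bit and how capturing it first avoids
the extra write fault.

/*
 * Toy model only -- NOT the kernel implementation.  pte_t and the
 * helpers below are simplified stand-ins for the kernel's versions.
 */
#include <stdbool.h>
#include <stdio.h>

#define PTE_PRESENT  (1u << 0)
#define PTE_WRITE    (1u << 1)
#define PTE_PROTNONE (1u << 2)  /* stand-in for the NUMA hinting state */

typedef unsigned int pte_t;

/*
 * Rebuild the entry from the base protection.  Like the real
 * pte_modify() with a read-only vma->vm_page_prot, this drops
 * PTE_WRITE along with the PROT_NONE-style hinting bit.
 */
static pte_t pte_modify(pte_t pte, pte_t prot)
{
        return (pte & ~(PTE_WRITE | PTE_PROTNONE)) | prot;
}

static bool pte_write(pte_t pte)    { return pte & PTE_WRITE; }
static pte_t pte_mkwrite(pte_t pte) { return pte | PTE_WRITE; }

int main(void)
{
        pte_t prot = PTE_PRESENT;       /* read-only base protection */
        pte_t pte  = PTE_PRESENT | PTE_WRITE | PTE_PROTNONE;

        /* Without preservation: the mapping comes back read-only and
         * the next store takes a second, avoidable minor fault. */
        pte_t lost = pte_modify(pte, prot);

        /* With preservation: the pattern the patch adds in three places. */
        bool was_writable = pte_write(pte);
        pte_t kept = pte_modify(pte, prot);
        if (was_writable)
                kept = pte_mkwrite(kept);

        printf("without preservation: writable=%d\n", pte_write(lost));
        printf("with preservation:    writable=%d\n", pte_write(kept));
        return 0;
}

Compiled with a plain cc invocation, the model prints writable=0 for the
unpreserved entry and writable=1 for the preserved one; that difference
is what shows up as roughly 600K fewer minor faults in the autonumabench
figures above.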
Diffstat (limited to 'mm')
-rw-r--r--  mm/huge_memory.c  9
-rw-r--r--  mm/memory.c       8
-rw-r--r--  mm/mprotect.c     3
3 files changed, 14 insertions(+), 6 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2f12e9fcf1a2..0a42d1521aa4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1260,6 +1260,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         int target_nid, last_cpupid = -1;
         bool page_locked;
         bool migrated = false;
+        bool was_writable;
         int flags = 0;

         /* A PROT_NONE fault should not end up here */
@@ -1354,7 +1355,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto out;
 clear_pmdnuma:
         BUG_ON(!PageLocked(page));
+        was_writable = pmd_write(pmd);
         pmd = pmd_modify(pmd, vma->vm_page_prot);
+        if (was_writable)
+                pmd = pmd_mkwrite(pmd);
         set_pmd_at(mm, haddr, pmdp, pmd);
         update_mmu_cache_pmd(vma, addr, pmdp);
         unlock_page(page);
@@ -1478,6 +1482,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,

         if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                 pmd_t entry;
+                bool preserve_write = prot_numa && pmd_write(*pmd);
                 ret = 1;

                 /*
@@ -1493,9 +1498,11 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                 if (!prot_numa || !pmd_protnone(*pmd)) {
                         entry = pmdp_get_and_clear_notify(mm, addr, pmd);
                         entry = pmd_modify(entry, newprot);
+                        if (preserve_write)
+                                entry = pmd_mkwrite(entry);
                         ret = HPAGE_PMD_NR;
                         set_pmd_at(mm, addr, pmd, entry);
-                        BUG_ON(pmd_write(entry));
+                        BUG_ON(!preserve_write && pmd_write(entry));
                 }
                 spin_unlock(ptl);
         }
diff --git a/mm/memory.c b/mm/memory.c
index 20beb6647dba..d20e12da3a3c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3035,6 +3035,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         int last_cpupid;
         int target_nid;
         bool migrated = false;
+        bool was_writable = pte_write(pte);
         int flags = 0;

         /* A PROT_NONE fault should not end up here */
@@ -3059,6 +3060,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         /* Make it present again */
         pte = pte_modify(pte, vma->vm_page_prot);
         pte = pte_mkyoung(pte);
+        if (was_writable)
+                pte = pte_mkwrite(pte);
         set_pte_at(mm, addr, ptep, pte);
         update_mmu_cache(vma, addr, ptep);

@@ -3075,11 +3078,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * to it but pte_write gets cleared during protection updates and
          * pte_dirty has unpredictable behaviour between PTE scan updates,
          * background writeback, dirty balancing and application behaviour.
-         *
-         * TODO: Note that the ideal here would be to avoid a situation where a
-         * NUMA fault is taken immediately followed by a write fault in
-         * some cases which would have lower overhead overall but would be
-         * invasive as the fault paths would need to be unified.
          */
         if (!(vma->vm_flags & VM_WRITE))
                 flags |= TNF_NO_GROUP;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 44727811bf4c..88584838e704 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                 oldpte = *pte;
                 if (pte_present(oldpte)) {
                         pte_t ptent;
+                        bool preserve_write = prot_numa && pte_write(oldpte);

                         /*
                          * Avoid trapping faults against the zero or KSM
@@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,

                         ptent = ptep_modify_prot_start(mm, addr, pte);
                         ptent = pte_modify(ptent, newprot);
+                        if (preserve_write)
+                                ptent = pte_mkwrite(ptent);

                         /* Avoid taking write faults for known dirty pages */
                         if (dirty_accountable && pte_dirty(ptent) &&