author     Linus Torvalds <torvalds@linux-foundation.org>   2012-12-16 17:33:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-12-16 18:18:08 -0500
commit     3d59eebc5e137bd89c6351e4c70e90ba1d0dc234
tree       b4ddfd0b057454a7437a3b4e3074a3b8b4b03817 /mm/huge_memory.c
parent     11520e5e7c1855fc3bf202bb3be35a39d9efa034
parent     4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6
Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma
Pull Automatic NUMA Balancing bare-bones from Mel Gorman:
 "There are three implementations for NUMA balancing: this tree
  (balancenuma), numacore, which has been developed in tip/master, and
  autonuma, which is in aa.git.  In almost all respects balancenuma is
  the dumbest of the three because its main impact is on the VM side
  with no attempt to be smart about scheduling.  In the interest of
  getting the ball rolling, it would be desirable to see this much
  merged for 3.8 with the view to building scheduler smarts on top and
  adapting the VM where required for 3.9.

  The most recent set of comparisons available from different people
  are:

    mel:    https://lkml.org/lkml/2012/12/9/108
    mingo:  https://lkml.org/lkml/2012/12/7/331
    tglx:   https://lkml.org/lkml/2012/12/10/437
    srikar: https://lkml.org/lkml/2012/12/10/397

  The results are a mixed bag.  In my own tests, balancenuma does
  reasonably well.  It's dumb as rocks and does not regress against
  mainline.  On the other hand, Ingo's tests show that balancenuma is
  incapable of converging for the workloads driven by perf, which is
  bad but is potentially explained by the lack of scheduler smarts.
  Thomas' results show balancenuma improves on mainline but falls far
  short of numacore or autonuma.  Srikar's results indicate we all
  suffer on a large machine with imbalanced node sizes.

  My own testing showed that recent numacore results have improved
  dramatically, particularly in the last week, but not universally.
  We've butted heads heavily on system CPU usage and high levels of
  migration even when it shows that overall performance is better.
  There are also cases where it regresses.  Of interest is that for
  specjbb in some configurations it will regress for lower numbers of
  warehouses and show gains for higher numbers, which is not reported
  by the tool by default and is sometimes missed in reports.  Recently
  I reported for numacore that the JVM was crashing with
  NullPointerExceptions, but currently it's unclear what the source of
  this problem is.  Initially I thought it was in how numacore handles
  PTEs in batch, but I no longer think that is the case.  It's possible
  numacore is just able to trigger it due to higher rates of migration.

  These reports were quite late in the cycle, so I/we would like to
  start with this tree as it contains much of the code we can agree on
  and has not changed significantly over the last 2-3 weeks."

* tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits)
  mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
  mm/rmap: Convert the struct anon_vma::mutex to an rwsem
  mm: migrate: Account a transhuge page properly when rate limiting
  mm: numa: Account for failed allocations and isolations as migration failures
  mm: numa: Add THP migration for the NUMA working set scanning fault case build fix
  mm: numa: Add THP migration for the NUMA working set scanning fault case.
  mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node
  mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG
  mm: sched: numa: Control enabling and disabling of NUMA balancing
  mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate
  mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships
  mm: numa: migrate: Set last_nid on newly allocated page
  mm: numa: split_huge_page: Transfer last_nid on tail page
  mm: numa: Introduce last_nid to the page frame
  sched: numa: Slowly increase the scanning period as NUMA faults are handled
  mm: numa: Rate limit setting of pte_numa if node is saturated
  mm: numa: Rate limit the amount of memory that is migrated between nodes
  mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting
  mm: numa: Migrate pages handled during a pmd_numa hinting fault
  mm: numa: Migrate on reference policy
  ...
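For orientation before the diff: the do_huge_pmd_numa_page() handler added below is reached from the generic page fault path when a fault hits a trans huge PMD that the NUMA scanner has marked with _PAGE_NUMA. That dispatch lives in mm/memory.c in this series rather than in this file, so the following is only a rough sketch of the call, not part of this diff:

	/* Rough sketch of the caller (mm/memory.c side of the series; the
	 * details here are an approximation, not lines from this patch). */
	if (pmd_trans_huge(orig_pmd)) {
		/* a _PAGE_NUMA huge pmd takes the hinting-fault path ... */
		if (pmd_numa(orig_pmd))
			return do_huge_pmd_numa_page(mm, vma, address,
						     orig_pmd, pmd);
		/* ... otherwise the usual huge-pmd write/split handling runs */
	}

The handler itself (added in the first large hunk below) confirms the PMD is unchanged, asks mpol_misplaced() where the page should live, and then either migrates the THP with migrate_misplaced_transhuge_page() or clears the NUMA bit again with pmd_mknonnuma().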
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--  mm/huge_memory.c  108
1 file changed, 99 insertions(+), 9 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 827d9c813051..d7ee1691fd21 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -19,6 +19,7 @@
 #include <linux/freezer.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
+#include <linux/migrate.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -690,7 +691,7 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
 	if (likely(vma->vm_flags & VM_WRITE))
 		pmd = pmd_mkwrite(pmd);
@@ -848,7 +849,8 @@ out:
 	 * run pte_offset_map on the pmd, if an huge pmd could
 	 * materialize from under us from a different thread.
 	 */
-	if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+	if (unlikely(pmd_none(*pmd)) &&
+	    unlikely(__pte_alloc(mm, vma, pmd, address)))
 		return VM_FAULT_OOM;
 	/* if an huge pmd materialized from under us just retry later */
 	if (unlikely(pmd_trans_huge(*pmd)))
@@ -1287,6 +1289,81 @@ out:
 	return page;
 }
 
+/* NUMA hinting page fault entry point for trans huge pmds */
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+				unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+	struct page *page;
+	unsigned long haddr = addr & HPAGE_PMD_MASK;
+	int target_nid;
+	int current_nid = -1;
+	bool migrated;
+	bool page_locked = false;
+
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(pmd, *pmdp)))
+		goto out_unlock;
+
+	page = pmd_page(pmd);
+	get_page(page);
+	current_nid = page_to_nid(page);
+	count_vm_numa_event(NUMA_HINT_FAULTS);
+	if (current_nid == numa_node_id())
+		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+	target_nid = mpol_misplaced(page, vma, haddr);
+	if (target_nid == -1) {
+		put_page(page);
+		goto clear_pmdnuma;
+	}
+
+	/* Acquire the page lock to serialise THP migrations */
+	spin_unlock(&mm->page_table_lock);
+	lock_page(page);
+	page_locked = true;
+
+	/* Confirm the PMD did not change while the page lock was taken */
+	spin_lock(&mm->page_table_lock);
+	if (unlikely(!pmd_same(pmd, *pmdp))) {
+		unlock_page(page);
+		put_page(page);
+		goto out_unlock;
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	/* Migrate the THP to the requested node */
+	migrated = migrate_misplaced_transhuge_page(mm, vma,
+				pmdp, pmd, addr,
+				page, target_nid);
+	if (migrated)
+		current_nid = target_nid;
+	else {
+		spin_lock(&mm->page_table_lock);
+		if (unlikely(!pmd_same(pmd, *pmdp))) {
+			unlock_page(page);
+			goto out_unlock;
+		}
+		goto clear_pmdnuma;
+	}
+
+	task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+	return 0;
+
+clear_pmdnuma:
+	pmd = pmd_mknonnuma(pmd);
+	set_pmd_at(mm, haddr, pmdp, pmd);
+	VM_BUG_ON(pmd_numa(*pmdp));
+	update_mmu_cache_pmd(vma, addr, pmdp);
+	if (page_locked)
+		unlock_page(page);
+
+out_unlock:
+	spin_unlock(&mm->page_table_lock);
+	if (current_nid != -1)
+		task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+	return 0;
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		 pmd_t *pmd, unsigned long addr)
 {
@@ -1375,7 +1452,7 @@ out:
 }
 
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-		unsigned long addr, pgprot_t newprot)
+		unsigned long addr, pgprot_t newprot, int prot_numa)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	int ret = 0;
@@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	if (__pmd_trans_huge_lock(pmd, vma) == 1) {
 		pmd_t entry;
 		entry = pmdp_get_and_clear(mm, addr, pmd);
-		entry = pmd_modify(entry, newprot);
+		if (!prot_numa)
+			entry = pmd_modify(entry, newprot);
+		else {
+			struct page *page = pmd_page(*pmd);
+
+			/* only check non-shared pages */
+			if (page_mapcount(page) == 1 &&
+			    !pmd_numa(*pmd)) {
+				entry = pmd_mknuma(entry);
+			}
+		}
 		BUG_ON(pmd_write(entry));
 		set_pmd_at(mm, addr, pmd, entry);
 		spin_unlock(&vma->vm_mm->page_table_lock);
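The prot_numa argument added above lets the periodic NUMA scanner reuse the protection-change code to mark resident huge PMDs with _PAGE_NUMA (via pmd_mknuma()) instead of applying a new protection. The scanner-side wrapper lives in mm/mempolicy.c in this series, not in this file; the following is a sketch for context only, with the exact signature taken as an assumption rather than quoted from this patch:

	/* Hypothetical sketch of the scanner-side caller, change_prot_numa()
	 * (mm/mempolicy.c in this series); shown for context only. */
	unsigned long change_prot_numa(struct vm_area_struct *vma,
				       unsigned long addr, unsigned long end)
	{
		int nr_updated;

		/* the final argument selects prot_numa mode, which reaches
		 * the change_huge_pmd() branch above for trans huge mappings */
		nr_updated = change_protection(vma, addr, end,
					       vma->vm_page_prot, 0, 1);
		if (nr_updated)
			count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);

		return nr_updated;
	}

The next NUMA hinting fault on a PMD marked this way then lands in do_huge_pmd_numa_page() above.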
@@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
 	 * We can't temporarily set the pmd to null in order
 	 * to split it, the pmd must remain marked huge at all
 	 * times or the VM won't take the pmd_trans_huge paths
-	 * and it won't wait on the anon_vma->root->mutex to
+	 * and it won't wait on the anon_vma->root->rwsem to
 	 * serialize against split_huge_page*.
 	 */
 	pmdp_splitting_flush(vma, address, pmd);
@@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
 		page_tail->mapping = page->mapping;
 
 		page_tail->index = page->index + i;
+		page_xchg_last_nid(page_tail, page_last_nid(page));
 
 		BUG_ON(!PageAnon(page_tail));
 		BUG_ON(!PageUptodate(page_tail));
@@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
 			BUG_ON(page_mapcount(page) != 1);
 			if (!pmd_young(*pmd))
 				entry = pte_mkold(entry);
+			if (pmd_numa(*pmd))
+				entry = pte_mknuma(entry);
 			pte = pte_offset_map(&_pmd, haddr);
 			BUG_ON(!pte_none(*pte));
 			set_pte_at(mm, haddr, pte, entry);
@@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
 	return ret;
 }
 
-/* must be called with anon_vma->root->mutex hold */
+/* must be called with anon_vma->root->rwsem held */
 static void __split_huge_page(struct page *page,
 			      struct anon_vma *anon_vma)
 {
@@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page)
 
 	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
 	BUG_ON(!PageAnon(page));
-	anon_vma = page_lock_anon_vma(page);
+	anon_vma = page_lock_anon_vma_read(page);
 	if (!anon_vma)
 		goto out;
 	ret = 0;
@@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page)
 
 	BUG_ON(PageCompound(page));
 out_unlock:
-	page_unlock_anon_vma(anon_vma);
+	page_unlock_anon_vma_read(anon_vma);
 out:
 	return ret;
 }
@@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (pmd_trans_huge(*pmd))
 		goto out;
 
-	anon_vma_lock(vma->anon_vma);
+	anon_vma_lock_write(vma->anon_vma);
 
 	pte = pte_offset_map(pmd, address);
 	ptl = pte_lockptr(mm, pmd);