path: root/mm/huge_memory.c
author    Bob Liu <lliubbo@gmail.com>    2013-11-12 18:07:37 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2013-11-12 22:09:06 -0500
commit    9f1b868a13ac36bd207a571f5ea1193d823ab18d (patch)
tree      56b43b02ba6f859424bfb43983740b648de33e66 /mm/huge_memory.c
parent    10dc4155c7714f508fe2e4667164925ea971fb25 (diff)
mm: thp: khugepaged: add policy for finding target node
Khugepaged will scan/free HPAGE_PMD_NR normal pages and replace them with a hugepage allocated from the node of the first scanned normal page. This policy is too coarse and can give unexpected results to upper users: the original page balancing among all nodes is broken once khugepaged starts. Consider the case where the first scanned normal page is allocated from node A while most of the other scanned normal pages are allocated from node B or C. Khugepaged will always allocate the hugepage from node A, causing extra memory pressure on node A that did not exist before khugepaged started.

This patch fixes the problem by making khugepaged allocate the hugepage from the node that has the highest count of scanned normal-page hits, so that the effect on the original page balancing is minimized.

A second problem remains: if the scanned normal pages are allocated equally from nodes A, B and C, node A would still suffer extra memory pressure after khugepaged starts. Andrew Davidoff reported a related issue several days ago. He wanted his application to interleave among all nodes and ran the testcase with "numactl --interleave=all ./test", but the result was not as expected.

cat /proc/2814/numa_maps:
7f50bd440000 interleave:0-3 anon=51403 dirty=51403 N0=435 N1=435 N2=435 N3=50098

The result shows that most pages come from node 3 instead of being interleaved among nodes 0-3, which is unreasonable. This patch also fixes that issue by allocating the hugepage round-robin from all nodes that share the same hit record. After this patch the result is as expected:

7f78399c0000 interleave:0-3 anon=51403 dirty=51403 N0=12723 N1=12723 N2=13235 N3=12722

The simple testcase is:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main()
{
	char *p;
	int i;
	int j;

	for (i = 0; i < 200; i++) {
		p = (char *)malloc(1048576);
		printf("malloc done\n");

		if (p == 0) {
			printf("Out of memory\n");
			return 1;
		}
		for (j = 0; j < 1048576; j++) {
			p[j] = 'A';
		}
		printf("touched memory\n");

		sleep(1);
	}
	printf("enter sleep\n");
	while (1) {
		sleep(100);
	}
}

[akpm@linux-foundation.org: make last_khugepaged_target_node local to khugepaged_find_target_node()]
Reported-by: Andrew Davidoff <davidoff@qedmf.net>
Tested-by: Andrew Davidoff <davidoff@qedmf.net>
Signed-off-by: Bob Liu <bob.liu@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
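For illustration only, the standalone userspace sketch below mimics the policy described above: pick the node with the most scanned-page hits, and rotate among nodes that tie. The node count, the load values and the names used here (find_target_node, node_load) are made up for the demo; the authoritative implementation is khugepaged_find_target_node() in the diff that follows.

#include <stdio.h>

#define MAX_NUMNODES	4
#define NUMA_NO_NODE	(-1)

/* mock of the per-scan hit counters kept by khugepaged */
static int node_load[MAX_NUMNODES];

static int find_target_node(void)
{
	static int last_target_node = NUMA_NO_NODE;
	int nid, target_node = 0, max_value = 0;

	/* pick the first node with the maximum hit count */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		if (node_load[nid] > max_value) {
			max_value = node_load[nid];
			target_node = nid;
		}

	/* round-robin among nodes that share the same maximum */
	if (target_node <= last_target_node)
		for (nid = last_target_node + 1; nid < MAX_NUMNODES; nid++)
			if (node_load[nid] == max_value) {
				target_node = nid;
				break;
			}

	last_target_node = target_node;
	return target_node;
}

int main(void)
{
	int i, nid;

	/* interleaved workload: every node records the same hit count */
	for (nid = 0; nid < MAX_NUMNODES; nid++)
		node_load[nid] = 128;

	/* successive collapses rotate through nodes 0, 1, 2, 3, 0, ... */
	for (i = 0; i < 8; i++)
		printf("collapse %d -> node %d\n", i, find_target_node());

	return 0;
}

With equal hit counts the selected node rotates across collapses, which is what spreads the hugepages evenly in the interleave case shown above.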
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--    mm/huge_memory.c    53
1 file changed, 44 insertions(+), 9 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 411c4f2c0492..0556c6a44959 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2191,7 +2191,34 @@ static void khugepaged_alloc_sleep(void)
 		msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
 }
 
+static int khugepaged_node_load[MAX_NUMNODES];
+
 #ifdef CONFIG_NUMA
+static int khugepaged_find_target_node(void)
+{
+	static int last_khugepaged_target_node = NUMA_NO_NODE;
+	int nid, target_node = 0, max_value = 0;
+
+	/* find first node with max normal pages hit */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		if (khugepaged_node_load[nid] > max_value) {
+			max_value = khugepaged_node_load[nid];
+			target_node = nid;
+		}
+
+	/* do some balance if several nodes have the same hit record */
+	if (target_node <= last_khugepaged_target_node)
+		for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
+				nid++)
+			if (max_value == khugepaged_node_load[nid]) {
+				target_node = nid;
+				break;
+			}
+
+	last_khugepaged_target_node = target_node;
+	return target_node;
+}
+
 static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
 {
 	if (IS_ERR(*hpage)) {
@@ -2225,9 +2252,8 @@ static struct page
 	 * mmap_sem in read mode is good idea also to allow greater
 	 * scalability.
 	 */
-	*hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
-				      node, __GFP_OTHER_NODE);
-
+	*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
+		khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
 	/*
 	 * After allocating the hugepage, release the mmap_sem read lock in
 	 * preparation for taking it in write mode.
@@ -2243,6 +2269,11 @@ static struct page
 	return *hpage;
 }
 #else
+static int khugepaged_find_target_node(void)
+{
+	return 0;
+}
+
 static inline struct page *alloc_hugepage(int defrag)
 {
 	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
@@ -2455,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 	if (pmd_trans_huge(*pmd))
 		goto out;
 
+	memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
 	     _pte++, _address += PAGE_SIZE) {
@@ -2471,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		if (unlikely(!page))
 			goto out_unmap;
 		/*
-		 * Chose the node of the first page. This could
-		 * be more sophisticated and look at more pages,
-		 * but isn't for now.
+		 * Record which node the original page is from and save this
+		 * information to khugepaged_node_load[].
+		 * Khupaged will allocate hugepage from the node has the max
+		 * hit record.
 		 */
-		if (node == NUMA_NO_NODE)
-			node = page_to_nid(page);
+		node = page_to_nid(page);
+		khugepaged_node_load[node]++;
 		VM_BUG_ON(PageCompound(page));
 		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
 			goto out_unmap;
@@ -2491,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		ret = 1;
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
-	if (ret)
+	if (ret) {
+		node = khugepaged_find_target_node();
 		/* collapse_huge_page will return with the mmap_sem released */
 		collapse_huge_page(mm, address, hpage, vma, node);
+	}
 out:
 	return ret;
 }