author    Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>    2013-09-11 17:22:09 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2013-09-11 18:57:48 -0400
commit    c8721bbbdd36382de51cd6b7a56322e0acca2414 (patch)
tree      8fb7b55974defcde9a4b07f571f0dd2dd1ad591f /mm
parent    71ea2efb1e936a127690a0a540b3a6162f95e48a (diff)
mm: memory-hotplug: enable memory hotplug to handle hugepage
Until now we could not offline memory blocks that contain hugepages, because a hugepage was considered an unmovable page. With this patch series a hugepage becomes movable, so by using hugepage migration we can offline such memory blocks.

What is different from other users of hugepage migration is that, after migration, we need to decompose all the hugepages inside the target memory block into free buddy pages, because otherwise free hugepages remaining in the memory block interfere with the memory offlining. For this reason we introduce the new functions dissolve_free_huge_page() and dissolve_free_huge_pages().

Other than that, this patch simply adds the hugepage migration code: hugepage handling in the functions that scan over pfns and collect pages to be migrated, and a hugepage allocation path in alloc_migrate_target().

As for larger hugepages (1GB for x86_64), hot-remove over them is not easy because such a hugepage is larger than a memory block, so for now we simply let it fail.

[yongjun_wei@trendmicro.com.cn: remove duplicated include]

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Andi Kleen <ak@linux.intel.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Hugh Dickins <hughd@google.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
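The offlining path this series extends is normally driven from userspace through the memory-hotplug sysfs interface. As a rough illustration only (the block number below is made up; a real tool would first pick a block reported as removable), the request looks like this:

/* Minimal sketch: ask the kernel to offline one memory block via sysfs.
 * Block 42 is an arbitrary example. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char *path = "/sys/devices/system/memory/memory42/state";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Writing "offline" triggers offline_pages(); with this series the
         * kernel migrates in-use hugepages in the block and dissolves the
         * free ones instead of failing the request. */
        if (write(fd, "offline", strlen("offline")) < 0)
                perror("write");
        close(fd);
        return 0;
}

Before this series, such a request failed as soon as the block contained a hugepage; with it, in-use hugepages are migrated away and free ones are dissolved into buddy pages.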
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c         71
-rw-r--r--  mm/memory_hotplug.c  42
-rw-r--r--  mm/page_alloc.c      11
-rw-r--r--  mm/page_isolation.c  14
4 files changed, 129 insertions, 9 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d37b3b95c439..fb4293b93fd0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@
 #include <linux/rmap.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/page-isolation.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -522,9 +523,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
 {
         struct page *page;
 
-        if (list_empty(&h->hugepage_freelists[nid]))
+        list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
+                if (!is_migrate_isolate_page(page))
+                        break;
+        /*
+         * if 'non-isolated free hugepage' not found on the list,
+         * the allocation fails.
+         */
+        if (&h->hugepage_freelists[nid] == &page->lru)
                 return NULL;
-        page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
         list_move(&page->lru, &h->hugepage_activelist);
         set_page_refcounted(page);
         h->free_huge_pages--;
@@ -878,6 +885,44 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
         return ret;
 }
 
+/*
+ * Dissolve a given free hugepage into free buddy pages. This function does
+ * nothing for in-use (including surplus) hugepages.
+ */
+static void dissolve_free_huge_page(struct page *page)
+{
+        spin_lock(&hugetlb_lock);
+        if (PageHuge(page) && !page_count(page)) {
+                struct hstate *h = page_hstate(page);
+                int nid = page_to_nid(page);
+                list_del(&page->lru);
+                h->free_huge_pages--;
+                h->free_huge_pages_node[nid]--;
+                update_and_free_page(h, page);
+        }
+        spin_unlock(&hugetlb_lock);
+}
+
+/*
+ * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
+ * make specified memory blocks removable from the system.
+ * Note that start_pfn should be aligned with (minimum) hugepage size.
+ */
+void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+        unsigned int order = 8 * sizeof(void *);
+        unsigned long pfn;
+        struct hstate *h;
+
+        /* Set scan step to minimum hugepage size */
+        for_each_hstate(h)
+                if (order > huge_page_order(h))
+                        order = huge_page_order(h);
+        VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
+        for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+                dissolve_free_huge_page(pfn_to_page(pfn));
+}
+
 static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
         struct page *page;
@@ -3457,3 +3502,25 @@ void putback_active_hugepage(struct page *page)
         spin_unlock(&hugetlb_lock);
         put_page(page);
 }
+
+bool is_hugepage_active(struct page *page)
+{
+        VM_BUG_ON(!PageHuge(page));
+        /*
+         * This function can be called for a tail page because the caller,
+         * scan_movable_pages, scans through a given pfn-range which typically
+         * covers one memory block. In systems using gigantic hugepages (1GB
+         * for x86_64), a hugepage is larger than a memory block, and we don't
+         * support migrating such large hugepages for now, so return false
+         * when called for tail pages.
+         */
+        if (PageTail(page))
+                return false;
+        /*
+         * Refcount of hwpoisoned hugepages is 1, but they are not active,
+         * so we should return false for them.
+         */
+        if (unlikely(PageHWPoison(page)))
+                return false;
+        return page_count(page) > 0;
+}
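A note on the dequeue_huge_page_node() change above: list_for_each_entry() leaves its cursor aliasing the list head when the walk finishes without hitting the break, which is why the patch can test &h->hugepage_freelists[nid] == &page->lru as "no non-isolated free hugepage found". The following standalone sketch demonstrates the same idiom with a simplified stand-in for <linux/list.h> (the macro here takes an explicit type instead of using typeof(); names are local to this sketch, not kernel code):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Simplified list_for_each_entry(): when the loop completes without a
 * break, the cursor points back at the head sentinel. */
#define list_for_each_entry(pos, head, type, member)                  \
        for (pos = container_of((head)->next, type, member);          \
             &pos->member != (head);                                  \
             pos = container_of(pos->member.next, type, member))

struct fake_page { int isolated; struct list_head lru; };

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

int main(void)
{
        struct list_head freelist = { &freelist, &freelist };
        struct fake_page pages[3] = { { 1 }, { 1 }, { 1 } }; /* all isolated */
        struct fake_page *page;
        int i;

        for (i = 0; i < 3; i++)
                list_add_tail(&pages[i].lru, &freelist);

        list_for_each_entry(page, &freelist, struct fake_page, lru)
                if (!page->isolated)
                        break;

        /* Same test as the patch: cursor aliases the head => nothing usable. */
        if (&freelist == &page->lru)
                printf("no non-isolated page found, allocation fails\n");
        else
                printf("found page at index %ld\n", (long)(page - pages));
        return 0;
}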
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d595606728f9..0eb1a1df649d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -30,6 +30,7 @@
 #include <linux/mm_inline.h>
 #include <linux/firmware-map.h>
 #include <linux/stop_machine.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h>
 
@@ -1230,10 +1231,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
 }
 
 /*
- * Scanning pfn is much easier than scanning lru list.
- * Scan pfn from start to end and Find LRU page.
+ * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
+ * and hugepages). We scan pfn because it's much easier than scanning over
+ * linked list. This function returns the pfn of the first found movable
+ * page if it's found, otherwise 0.
  */
-static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
+static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
 {
         unsigned long pfn;
         struct page *page;
@@ -1242,6 +1245,13 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
                         page = pfn_to_page(pfn);
                         if (PageLRU(page))
                                 return pfn;
+                        if (PageHuge(page)) {
+                                if (is_hugepage_active(page))
+                                        return pfn;
+                                else
+                                        pfn = round_up(pfn + 1,
+                                                1 << compound_order(page)) - 1;
+                        }
                 }
         }
         return 0;
@@ -1262,6 +1272,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                 if (!pfn_valid(pfn))
                         continue;
                 page = pfn_to_page(pfn);
+
+                if (PageHuge(page)) {
+                        struct page *head = compound_head(page);
+                        pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
+                        if (compound_order(head) > PFN_SECTION_SHIFT) {
+                                ret = -EBUSY;
+                                break;
+                        }
+                        if (isolate_huge_page(page, &source))
+                                move_pages -= 1 << compound_order(head);
+                        continue;
+                }
+
                 if (!get_page_unless_zero(page))
                         continue;
                 /*
@@ -1294,7 +1317,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
         }
         if (!list_empty(&source)) {
                 if (not_managed) {
-                        putback_lru_pages(&source);
+                        putback_movable_pages(&source);
                         goto out;
                 }
 
@@ -1305,7 +1328,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                 ret = migrate_pages(&source, alloc_migrate_target, 0,
                                         MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
                 if (ret)
-                        putback_lru_pages(&source);
+                        putback_movable_pages(&source);
         }
 out:
         return ret;
@@ -1548,8 +1571,8 @@ repeat:
                 drain_all_pages();
         }
 
-        pfn = scan_lru_pages(start_pfn, end_pfn);
-        if (pfn) { /* We have page on LRU */
+        pfn = scan_movable_pages(start_pfn, end_pfn);
+        if (pfn) { /* We have movable pages */
                 ret = do_migrate_range(pfn, end_pfn);
                 if (!ret) {
                         drain = 1;
@@ -1568,6 +1591,11 @@ repeat:
         yield();
         /* drain pcp pages, this is synchronous. */
         drain_all_pages();
+        /*
+         * dissolve free hugepages in the memory block before doing offlining
+         * actually in order to make hugetlbfs's object counting consistent.
+         */
+        dissolve_free_huge_pages(start_pfn, end_pfn);
         /* check again */
         offlined_pages = check_pages_isolated(start_pfn, end_pfn);
         if (offlined_pages < 0) {
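The compound_order(head) > PFN_SECTION_SHIFT test in do_migrate_range() above is where gigantic hugepages get rejected, matching the commit message's note that a 1GB hugepage on x86_64 is larger than a memory block and is simply left to fail. Below is a small sketch of that comparison using assumed x86_64 constants (4KB pages, 128MB sparsemem sections); the constant values are an assumption of this sketch, not taken from the patch:

/* PAGE_SHIFT = 12 (4KB pages), SECTION_SIZE_BITS = 27 (128MB sections),
 * so PFN_SECTION_SHIFT = 27 - 12 = 15 (pfns per section, as a shift). */
#include <stdio.h>

#define PAGE_SHIFT              12
#define SECTION_SIZE_BITS       27
#define PFN_SECTION_SHIFT       (SECTION_SIZE_BITS - PAGE_SHIFT)

static const char *verdict(unsigned int compound_order)
{
        /* Same comparison as the patch: a hugepage spanning more than one
         * memory section is refused with -EBUSY. */
        return compound_order > PFN_SECTION_SHIFT ? "-EBUSY" : "migrate";
}

int main(void)
{
        printf("2MB hugepage (order 9):  %s\n", verdict(21 - PAGE_SHIFT));
        printf("1GB hugepage (order 18): %s\n", verdict(30 - PAGE_SHIFT));
        return 0;
}

With these assumed values the program prints "migrate" for 2MB hugepages and "-EBUSY" for 1GB ones.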
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7c3f8d7e2d8e..f7cc08dad26a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6008,6 +6008,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
                         continue;
 
                 page = pfn_to_page(check);
+
+                /*
+                 * Hugepages are not in LRU lists, but they're movable.
+                 * We need not scan over tail pages because we don't
+                 * handle each tail page individually in migration.
+                 */
+                if (PageHuge(page)) {
+                        iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
+                        continue;
+                }
+
                 /*
                  * We can't use page_count without pin a page
                  * because another CPU can free compound page.
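Both scan_movable_pages() and has_unmovable_pages() skip the rest of a hugepage with the same round_up(x + 1, 1 << compound_order(page)) - 1 expression, landing on the hugepage's last pfn so the enclosing loop's own increment moves past it. A quick standalone check of that arithmetic (the pfn and the order are arbitrary example values):

/* Demonstrates the skip arithmetic used in the patch. */
#include <stdio.h>

/* Same result as the kernel's round_up() for power-of-two alignment. */
#define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
        unsigned long pfn = 0x1234;   /* somewhere inside the hugepage        */
        unsigned int order = 9;       /* 2MB hugepage with 4KB pages          */
        unsigned long last = round_up(pfn + 1, 1UL << order) - 1;

        /* Prints: pfn 0x1234 -> last pfn 0x13ff, next scan starts at 0x1400 */
        printf("pfn 0x%lx -> last pfn 0x%lx, next scan starts at 0x%lx\n",
               pfn, last, last + 1);
        return 0;
}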
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 0cee10ffb98d..d1473b2e9481 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -6,6 +6,7 @@
 #include <linux/page-isolation.h>
 #include <linux/pageblock-flags.h>
 #include <linux/memory.h>
+#include <linux/hugetlb.h>
 #include "internal.h"
 
 int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
@@ -252,6 +253,19 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
 {
         gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
 
+        /*
+         * TODO: allocate a destination hugepage from a nearest neighbor node,
+         * in accordance with the memory policy of the user process if possible.
+         * For now, as a simple work-around, we use the next node for destination.
+         */
+        if (PageHuge(page)) {
+                nodemask_t src = nodemask_of_node(page_to_nid(page));
+                nodemask_t dst;
+                nodes_complement(dst, src);
+                return alloc_huge_page_node(page_hstate(compound_head(page)),
+                                            next_node(page_to_nid(page), dst));
+        }
+
         if (PageHighMem(page))
                 gfp_mask |= __GFP_HIGHMEM;
 
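The hugepage branch added to alloc_migrate_target() chooses a destination by complementing the source node's mask and taking the next node after the source, i.e. "any node but the one being vacated, preferring the next one", as the TODO comment admits. A plain-bitmask sketch of that selection (the node count, node IDs, and helper names below are illustrative; this is not the kernel's nodemask_t API):

#include <stdio.h>

#define MAX_NODES 8

/* Return the first set bit above 'node'; like the kernel's next_node(),
 * this scans upward only and returns MAX_NODES when nothing is found. */
static int next_node(int node, unsigned int mask)
{
        int n;

        for (n = node + 1; n < MAX_NODES; n++)
                if (mask & (1u << n))
                        return n;
        return MAX_NODES;
}

int main(void)
{
        int src_nid = 2;                                   /* node being vacated  */
        unsigned int src = 1u << src_nid;                  /* nodemask_of_node()  */
        unsigned int dst = ~src & ((1u << MAX_NODES) - 1); /* nodes_complement()  */

        /* Prints: destination node: 3 */
        printf("destination node: %d\n", next_node(src_nid, dst));
        return 0;
}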