author	Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>	2010-05-27 20:29:16 -0400
committer	Andi Kleen <ak@linux.intel.com>	2010-08-11 03:21:15 -0400
commit	0fe6e20b9c4c53b3e97096ee73a0857f60aad43f (patch)
tree	3014636f2ed66fdebecb6f6bab338b39c3543a07
parent	8edf344c66a3f214d709dad1421c29d678915b3f (diff)
hugetlb, rmap: add reverse mapping for hugepage
This patch adds a reverse mapping feature for hugepages by introducing a
mapcount for shared/private-mapped hugepages and an anon_vma for
private-mapped hugepages.

While hugepages are not currently swappable, reverse mapping is useful for
the memory error handler.  Without this patch, the memory error handler can
neither identify the processes using a bad hugepage nor unmap it from them.
That is:

 - for a shared hugepage, we can collect the processes using it through the
   pagecache, but cannot unmap it because of the lack of a mapcount;

 - for a privately mapped hugepage, we can neither collect the processes nor
   unmap the hugepage.

This patch solves both problems.

This patch includes the bug fix given by commit 23be7468e8, so reverts it.

Dependency:

  "hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h"

ChangeLog since May 24:
 - create hugetlb_inline.h and move is_vm_hugetlb_page() into it.
 - move the functions that set up anon_vma for hugepages into mm/rmap.c.

ChangeLog since May 13:
 - rebased to 2.6.34
 - fix logic error (in case a private mapping and a shared mapping coexist)
 - move is_vm_hugetlb_page() into include/linux/mm.h so it can be used from
   linear_page_index()
 - define and use linear_hugepage_index() instead of compound_order()
 - use page_move_anon_rmap() in hugetlb_cow()
 - copy the exclusive switch of __set_page_anon_rmap() into the hugepage
   counterpart.
 - revert commit 23be7468 completely

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Acked-by: Fengguang Wu <fengguang.wu@intel.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
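To see why the mapcount matters for the error handler, the sketch below is a
minimal userspace model (plain C11, not kernel code) of the _mapcount
bookkeeping this patch extends to hugepages: the counter starts at -1, each
new mapping increments it, each unmap decrements it, so a handler can ask
whether any process still maps a poisoned hugepage.  The names mirror
page_dup_rmap()/page_remove_rmap()/page_mapcount() but only the counting
behaviour is reproduced; everything else is a simplifying assumption.

/*
 * Simplified userspace model of the hugepage mapcount added by this patch.
 * Not kernel code: only the counting behaviour is reproduced.
 */
#include <stdatomic.h>
#include <stdio.h>

struct page {
        atomic_int _mapcount;           /* -1 means "mapped by nobody" */
};

static void page_init(struct page *p)        { atomic_init(&p->_mapcount, -1); }
static void page_add_rmap(struct page *p)    { atomic_fetch_add(&p->_mapcount, 1); }
static void page_remove_rmap(struct page *p) { atomic_fetch_sub(&p->_mapcount, 1); }
static int  page_mapcount(struct page *p)    { return atomic_load(&p->_mapcount) + 1; }

int main(void)
{
        struct page hugepage;

        page_init(&hugepage);
        page_add_rmap(&hugepage);       /* first fault maps it */
        page_add_rmap(&hugepage);       /* fork() duplicates the mapping */
        printf("mappers: %d\n", page_mapcount(&hugepage));     /* 2: both must be unmapped */

        page_remove_rmap(&hugepage);
        page_remove_rmap(&hugepage);
        printf("mappers: %d\n", page_mapcount(&hugepage));     /* 0: page can be freed */
        return 0;
}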
-rw-r--r--	include/linux/hugetlb.h	 1
-rw-r--r--	include/linux/pagemap.h	 8
-rw-r--r--	include/linux/poison.h	 9
-rw-r--r--	include/linux/rmap.h	 5
-rw-r--r--	mm/hugetlb.c	44
-rw-r--r--	mm/rmap.c	59
6 files changed, 114 insertions(+), 12 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d47a7c41745d..e688fd89354d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -99,6 +99,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
+#define huge_pte_offset(mm, address)	0
 
 #define hugetlb_change_protection(vma, address, end, newprot)
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index b2bd2bae9775..a547d9689170 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -282,10 +282,16 @@ static inline loff_t page_offset(struct page *page)
 	return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
 }
 
+extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+				     unsigned long address);
+
 static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 					unsigned long address)
 {
-	pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
+	pgoff_t pgoff;
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		return linear_hugepage_index(vma, address);
+	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
 	pgoff += vma->vm_pgoff;
 	return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 }
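The hugetlb branch above matters because a hugepage's index is kept in
hugepage-sized units rather than PAGE_SIZE units.  As a rough illustration of
the arithmetic that linear_hugepage_index()/vma_hugecache_offset() perform
(see the mm/hugetlb.c hunk further down), here is a small standalone C
sketch; the 4 KiB base page and 2 MiB hugepage sizes are assumptions for the
example, not part of the patch.

/* Standalone illustration of the hugepage index arithmetic (assumed sizes). */
#include <stdio.h>

#define PAGE_SHIFT      12                              /* assumed 4 KiB base pages */
#define HPAGE_SHIFT     21                              /* assumed 2 MiB hugepages */
#define HPAGE_ORDER     (HPAGE_SHIFT - PAGE_SHIFT)      /* 9 */

/* Mirrors vma_hugecache_offset(): offset into the mapping, counted in hugepages. */
static unsigned long hugepage_index(unsigned long vm_start,
                                    unsigned long vm_pgoff,     /* in base pages */
                                    unsigned long address)
{
        return ((address - vm_start) >> HPAGE_SHIFT) + (vm_pgoff >> HPAGE_ORDER);
}

int main(void)
{
        unsigned long vm_start = 0x40000000UL;          /* hypothetical VMA start */

        /* A fault 4 MiB into the mapping lands in the third hugepage: index 2. */
        printf("index = %lu\n",
               hugepage_index(vm_start, 0, vm_start + (4UL << 20)));
        return 0;
}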
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 34066ffd893d..2110a81c5e2a 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -48,15 +48,6 @@
 #define POISON_FREE	0x6b	/* for use-after-free poisoning */
 #define	POISON_END	0xa5	/* end-byte of poisoning */
 
-/********** mm/hugetlb.c **********/
-/*
- * Private mappings of hugetlb pages use this poisoned value for
- * page->mapping. The core VM should not be doing anything with this mapping
- * but futex requires the existence of some page->mapping value even though it
- * is unused if PAGE_MAPPING_ANON is set.
- */
-#define HUGETLB_POISON	((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON))
-
 /********** arch/$ARCH/mm/init.c **********/
 #define POISON_FREE_INITMEM	0xcc
 
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 77216742c178..9d50e7ef5f5a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -140,6 +140,11 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned lon
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);
 
+void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
+				unsigned long);
+void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+				unsigned long);
+
 static inline void page_dup_rmap(struct page *page)
 {
 	atomic_inc(&page->_mapcount);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009dbe..aa3c51739378 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/rmap.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 			(vma->vm_pgoff >> huge_page_order(h));
 }
 
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+			      unsigned long address)
+{
+	return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority
  * cases this will be same size as used by the page table entries.
@@ -552,6 +559,7 @@ static void free_huge_page(struct page *page)
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
+	BUG_ON(page_mapcount(page));
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
@@ -2129,6 +2137,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
 			get_page(ptepage);
+			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
@@ -2207,6 +2216,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
+		page_remove_rmap(page);
 		list_del(&page->lru);
 		put_page(page);
 	}
@@ -2272,6 +2282,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	return 1;
 }
 
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
 			struct page *pagecache_page)
@@ -2286,8 +2299,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 retry_avoidcopy:
 	/* If no-one else is actually using this page, avoid the copy
 	 * and just make the page writable */
-	avoidcopy = (page_count(old_page) == 1);
+	avoidcopy = (page_mapcount(old_page) == 1);
 	if (avoidcopy) {
+		if (!trylock_page(old_page))
+			if (PageAnon(old_page))
+				page_move_anon_rmap(old_page, vma, address);
 		set_huge_ptep_writable(vma, address, ptep);
 		return 0;
 	}
@@ -2338,6 +2354,13 @@ retry_avoidcopy:
 		return -PTR_ERR(new_page);
 	}
 
+	/*
+	 * When the original hugepage is shared one, it does not have
+	 * anon_vma prepared.
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+
 	copy_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
@@ -2352,6 +2375,8 @@ retry_avoidcopy:
 		huge_ptep_clear_flush(vma, address, ptep);
 		set_huge_pte_at(mm, address, ptep,
 				make_huge_pte(vma, new_page, 1));
+		page_remove_rmap(old_page);
+		hugepage_add_anon_rmap(new_page, vma, address);
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
@@ -2452,10 +2477,17 @@ retry:
 			spin_lock(&inode->i_lock);
 			inode->i_blocks += blocks_per_huge_page(h);
 			spin_unlock(&inode->i_lock);
+			page_dup_rmap(page);
 		} else {
 			lock_page(page);
-			page->mapping = HUGETLB_POISON;
+			if (unlikely(anon_vma_prepare(vma))) {
+				ret = VM_FAULT_OOM;
+				goto backout_unlocked;
+			}
+			hugepage_add_new_anon_rmap(page, vma, address);
 		}
+	} else {
+		page_dup_rmap(page);
 	}
 
 	/*
@@ -2507,6 +2539,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
@@ -2548,6 +2581,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 								vma, address);
 	}
 
+	if (!pagecache_page) {
+		page = pte_page(entry);
+		lock_page(page);
+	}
+
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2573,6 +2611,8 @@ out_page_table_lock:
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
+	} else {
+		unlock_page(page);
 	}
 
 out_mutex:
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..0ad53572eaf2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/hugetlb.h>
 
 #include <asm/tlbflush.h>
 
@@ -326,6 +327,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	unsigned long address;
 
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		pgoff = page->index << huge_page_order(page_hstate(page));
 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 		/* page should be within @vma mapping range */
@@ -369,6 +372,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	pte_t *pte;
 	spinlock_t *ptl;
 
+	if (unlikely(PageHuge(page))) {
+		pte = huge_pte_offset(mm, address);
+		ptl = &mm->page_table_lock;
+		goto check;
+	}
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		return NULL;
@@ -389,6 +398,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	}
 
 	ptl = pte_lockptr(mm, pmd);
+check:
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
@@ -873,6 +883,12 @@ void page_remove_rmap(struct page *page)
 		page_clear_dirty(page);
 		set_page_dirty(page);
 	}
+	/*
+	 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+	 * and not charged by memcg for now.
+	 */
+	if (unlikely(PageHuge(page)))
+		return;
 	if (PageAnon(page)) {
 		mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1445,3 +1461,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
 		return rmap_walk_file(page, rmap_one, arg);
 }
 #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLBFS
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	BUG_ON(!anon_vma);
+	if (!exclusive) {
+		struct anon_vma_chain *avc;
+		avc = list_entry(vma->anon_vma_chain.prev,
+				 struct anon_vma_chain, same_vma);
+		anon_vma = avc->anon_vma;
+	}
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+	page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+			    struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	int first;
+	BUG_ON(!anon_vma);
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	first = atomic_inc_and_test(&page->_mapcount);
+	if (first)
+		__hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	atomic_set(&page->_mapcount, 0);
+	__hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLBFS */