From 0fe6e20b9c4c53b3e97096ee73a0857f60aad43f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 28 May 2010 09:29:16 +0900 Subject: hugetlb, rmap: add reverse mapping for hugepage This patch adds reverse mapping feature for hugepage by introducing mapcount for shared/private-mapped hugepage and anon_vma for private-mapped hugepage. While hugepage is not currently swappable, reverse mapping can be useful for memory error handler. Without this patch, memory error handler cannot identify processes using the bad hugepage nor unmap it from them. That is: - for shared hugepage: we can collect processes using a hugepage through pagecache, but can not unmap the hugepage because of the lack of mapcount. - for privately mapped hugepage: we can neither collect processes nor unmap the hugepage. This patch solves these problems. This patch include the bug fix given by commit 23be7468e8, so reverts it. Dependency: "hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h" ChangeLog since May 24. - create hugetlb_inline.h and move is_vm_hugetlb_index() in it. - move functions setting up anon_vma for hugepage into mm/rmap.c. ChangeLog since May 13. - rebased to 2.6.34 - fix logic error (in case that private mapping and shared mapping coexist) - move is_vm_hugetlb_page() into include/linux/mm.h to use this function from linear_page_index() - define and use linear_hugepage_index() instead of compound_order() - use page_move_anon_rmap() in hugetlb_cow() - copy exclusive switch of __set_page_anon_rmap() into hugepage counterpart. - revert commit 24be7468 completely Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Andrew Morton Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Larry Woodman Cc: Lee Schermerhorn Acked-by: Fengguang Wu Acked-by: Mel Gorman Signed-off-by: Andi Kleen --- mm/hugetlb.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 54d42b009dbe..aa3c51739378 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. @@ -552,6 +559,7 @@ static void free_huge_page(struct page *page) set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); @@ -2129,6 +2137,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); + page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -2207,6 +2216,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { + page_remove_rmap(page); list_del(&page->lru); put_page(page); } @@ -2272,6 +2282,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, return 1; } +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. + */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) @@ -2286,8 +2299,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_count(old_page) == 1); + avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { + if (!trylock_page(old_page)) + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2338,6 +2354,13 @@ retry_avoidcopy: return -PTR_ERR(new_page); } + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. + */ + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + copy_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -2352,6 +2375,8 @@ retry_avoidcopy: huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; } @@ -2452,10 +2477,17 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); + page_dup_rmap(page); } else { lock_page(page); - page->mapping = HUGETLB_POISON; + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + hugepage_add_new_anon_rmap(page, vma, address); } + } else { + page_dup_rmap(page); } /* @@ -2507,6 +2539,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *page = NULL; struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -2548,6 +2581,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + if (!pagecache_page) { + page = pte_page(entry); + lock_page(page); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) @@ -2573,6 +2611,8 @@ out_page_table_lock: if (pagecache_page) { unlock_page(pagecache_page); put_page(pagecache_page); + } else { + unlock_page(page); } out_mutex: -- cgit v1.2.2 From 93f70f900da36fbc19c13c2aa04b2e468c8d00fb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 28 May 2010 09:29:20 +0900 Subject: HWPOISON, hugetlb: isolate corrupted hugepage If error hugepage is not in-use, we can fully recovery from error by dequeuing it from freelist, so return RECOVERY. Otherwise whether or not we can recovery depends on user processes, so return DELAYED. Dependency: "HWPOISON, hugetlb: enable error handling path for hugepage" Signed-off-by: Naoya Horiguchi Cc: Andrew Morton Acked-by: Fengguang Wu Signed-off-by: Andi Kleen --- mm/hugetlb.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aa3c51739378..8c163f64cf10 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2825,3 +2825,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_put_quota(inode->i_mapping, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); } + +/* + * This function is called from memory failure code. + * Assume the caller holds page lock of the head page. + */ +void __isolate_hwpoisoned_huge_page(struct page *hpage) +{ + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + spin_lock(&hugetlb_lock); + list_del(&hpage->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + spin_unlock(&hugetlb_lock); +} -- cgit v1.2.2 From fd6a03edd271cf2d69a61aa8df98dd05fa6b9afd Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 28 May 2010 09:29:21 +0900 Subject: HWPOISON, hugetlb: detect hwpoison in hugetlb code This patch enables to block access to hwpoisoned hugepage and also enables to block unmapping for it. Dependency: "HWPOISON, hugetlb: enable error handling path for hugepage" Signed-off-by: Naoya Horiguchi Cc: Andrew Morton Acked-by: Fengguang Wu Acked-by: Mel Gorman Signed-off-by: Andi Kleen --- mm/hugetlb.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8c163f64cf10..4c2efc0f3919 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include @@ -2149,6 +2151,19 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + return 1; + } else + return 0; +} + void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -2207,6 +2222,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pte_none(pte)) continue; + /* + * HWPoisoned hugepage is already unmapped and dropped reference + */ + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + page = pte_page(pte); if (pte_dirty(pte)) set_page_dirty(page); @@ -2490,6 +2511,18 @@ retry: page_dup_rmap(page); } + /* + * Since memory error handler replaces pte into hwpoison swap entry + * at the time of error handling, a process which reserved but not have + * the mapping to the error hugepage does not have hwpoison swap entry. + * So we need to block accesses from such a process by checking + * PG_hwpoison bit here. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON; + goto backout_unlocked; + } + /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that @@ -2544,6 +2577,13 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + ptep = huge_pte_offset(mm, address); + if (ptep) { + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON; + } + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; -- cgit v1.2.2 From 43131e141abdb44c487cf79af3ef1fe5164dcef9 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 28 May 2010 09:29:22 +0900 Subject: HWPOISON, hugetlb: support hwpoison injection for hugepage This patch enables hwpoison injection through debug/hwpoison interfaces, with which we can test memory error handling for free or reserved hugepages (which cannot be tested by madvise() injector). [AK: Export PageHuge too for the injection module] Signed-off-by: Naoya Horiguchi Cc: Andrew Morton Acked-by: Fengguang Wu Signed-off-by: Andi Kleen --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c2efc0f3919..3c275ffd32a7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -615,6 +615,8 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } +EXPORT_SYMBOL_GPL(PageHuge); + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; -- cgit v1.2.2 From 28957a5467bab9ed51a237d21e31055fad987887 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 2 Jul 2010 14:47:20 +0900 Subject: hugetlb: add missing unlock in avoidcopy path in hugetlb_cow() This patch fixes possible deadlock in hugepage lock_page() by adding missing unlock_page(). libhugetlbfs test will hit this bug when the next patch in this patchset ("hugetlb, HWPOISON: move PG_HWPoison bit check") is applied. Signed-off-by: Naoya Horiguchi Signed-off-by: Jun'ichi Nomura Acked-by: Fengguang Wu Signed-off-by: Andi Kleen --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c275ffd32a7..303fb0c02364 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2324,9 +2324,11 @@ retry_avoidcopy: * and just make the page writable */ avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { - if (!trylock_page(old_page)) + if (!trylock_page(old_page)) { if (PageAnon(old_page)) page_move_anon_rmap(old_page, vma, address); + } else + unlock_page(old_page); set_huge_ptep_writable(vma, address, ptep); return 0; } -- cgit v1.2.2