aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-08-12 13:15:10 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-08-12 13:15:10 -0400
commit1021a645344d4a77333e19e60d37b9343be0d7b7 (patch)
tree7a78ab55f27f97209ed1b85ccfd88c6d5b8416d3
parent7367f5b013fee33f7d40a5a10a39d5134f529ec8 (diff)
parent28957a5467bab9ed51a237d21e31055fad987887 (diff)
Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6
* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: hugetlb: add missing unlock in avoidcopy path in hugetlb_cow() hwpoison: rename CONFIG HWPOISON, hugetlb: support hwpoison injection for hugepage HWPOISON, hugetlb: detect hwpoison in hugetlb code HWPOISON, hugetlb: isolate corrupted hugepage HWPOISON, hugetlb: maintain mce_bad_pages in handling hugepage error HWPOISON, hugetlb: set/clear PG_hwpoison bits on hugepage HWPOISON, hugetlb: enable error handling path for hugepage hugetlb, rmap: add reverse mapping for hugepage hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h Fix up trivial conflicts in mm/memory-failure.c
-rw-r--r--include/linux/hugetlb.h14
-rw-r--r--include/linux/hugetlb_inline.h22
-rw-r--r--include/linux/pagemap.h9
-rw-r--r--include/linux/poison.h9
-rw-r--r--include/linux/rmap.h5
-rw-r--r--mm/hugetlb.c104
-rw-r--r--mm/hwpoison-inject.c15
-rw-r--r--mm/memory-failure.c120
-rw-r--r--mm/rmap.c59
9 files changed, 299 insertions, 58 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 78b4bc64c006..f479700df61b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -2,6 +2,7 @@
2#define _LINUX_HUGETLB_H 2#define _LINUX_HUGETLB_H
3 3
4#include <linux/fs.h> 4#include <linux/fs.h>
5#include <linux/hugetlb_inline.h>
5 6
6struct ctl_table; 7struct ctl_table;
7struct user_struct; 8struct user_struct;
@@ -14,11 +15,6 @@ struct user_struct;
14 15
15int PageHuge(struct page *page); 16int PageHuge(struct page *page);
16 17
17static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
18{
19 return vma->vm_flags & VM_HUGETLB;
20}
21
22void reset_vma_resv_huge_pages(struct vm_area_struct *vma); 18void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
23int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 19int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
24int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); 20int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -47,6 +43,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
47 struct vm_area_struct *vma, 43 struct vm_area_struct *vma,
48 int acctflags); 44 int acctflags);
49void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 45void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
46void __isolate_hwpoisoned_huge_page(struct page *page);
50 47
51extern unsigned long hugepages_treat_as_movable; 48extern unsigned long hugepages_treat_as_movable;
52extern const unsigned long hugetlb_zero, hugetlb_infinity; 49extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -77,11 +74,6 @@ static inline int PageHuge(struct page *page)
77 return 0; 74 return 0;
78} 75}
79 76
80static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
81{
82 return 0;
83}
84
85static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 77static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
86{ 78{
87} 79}
@@ -108,6 +100,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
108#define is_hugepage_only_range(mm, addr, len) 0 100#define is_hugepage_only_range(mm, addr, len) 0
109#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) 101#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
110#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) 102#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
103#define huge_pte_offset(mm, address) 0
104#define __isolate_hwpoisoned_huge_page(page) 0
111 105
112#define hugetlb_change_protection(vma, address, end, newprot) 106#define hugetlb_change_protection(vma, address, end, newprot)
113 107
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
new file mode 100644
index 000000000000..6931489a5c14
--- /dev/null
+++ b/include/linux/hugetlb_inline.h
@@ -0,0 +1,22 @@
1#ifndef _LINUX_HUGETLB_INLINE_H
2#define _LINUX_HUGETLB_INLINE_H
3
4#ifdef CONFIG_HUGETLB_PAGE
5
6#include <linux/mm.h>
7
8static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
9{
10 return vma->vm_flags & VM_HUGETLB;
11}
12
13#else
14
15static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
16{
17 return 0;
18}
19
20#endif
21
22#endif
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 78a702ce4fcb..e12cdc6d79ee 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -13,6 +13,7 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/hardirq.h> /* for in_interrupt() */ 15#include <linux/hardirq.h> /* for in_interrupt() */
16#include <linux/hugetlb_inline.h>
16 17
17/* 18/*
18 * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page 19 * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page
@@ -281,10 +282,16 @@ static inline loff_t page_offset(struct page *page)
281 return ((loff_t)page->index) << PAGE_CACHE_SHIFT; 282 return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
282} 283}
283 284
285extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
286 unsigned long address);
287
284static inline pgoff_t linear_page_index(struct vm_area_struct *vma, 288static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
285 unsigned long address) 289 unsigned long address)
286{ 290{
287 pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT; 291 pgoff_t pgoff;
292 if (unlikely(is_vm_hugetlb_page(vma)))
293 return linear_hugepage_index(vma, address);
294 pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
288 pgoff += vma->vm_pgoff; 295 pgoff += vma->vm_pgoff;
289 return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT); 296 return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
290} 297}
diff --git a/include/linux/poison.h b/include/linux/poison.h
index 34066ffd893d..2110a81c5e2a 100644
--- a/include/linux/poison.h
+++ b/include/linux/poison.h
@@ -48,15 +48,6 @@
48#define POISON_FREE 0x6b /* for use-after-free poisoning */ 48#define POISON_FREE 0x6b /* for use-after-free poisoning */
49#define POISON_END 0xa5 /* end-byte of poisoning */ 49#define POISON_END 0xa5 /* end-byte of poisoning */
50 50
51/********** mm/hugetlb.c **********/
52/*
53 * Private mappings of hugetlb pages use this poisoned value for
54 * page->mapping. The core VM should not be doing anything with this mapping
55 * but futex requires the existence of some page->mapping value even though it
56 * is unused if PAGE_MAPPING_ANON is set.
57 */
58#define HUGETLB_POISON ((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON))
59
60/********** arch/$ARCH/mm/init.c **********/ 51/********** arch/$ARCH/mm/init.c **********/
61#define POISON_FREE_INITMEM 0xcc 52#define POISON_FREE_INITMEM 0xcc
62 53
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d6661de56f30..31b2fd75dcba 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -168,6 +168,11 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned lon
168void page_add_file_rmap(struct page *); 168void page_add_file_rmap(struct page *);
169void page_remove_rmap(struct page *); 169void page_remove_rmap(struct page *);
170 170
171void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
172 unsigned long);
173void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
174 unsigned long);
175
171static inline void page_dup_rmap(struct page *page) 176static inline void page_dup_rmap(struct page *page)
172{ 177{
173 atomic_inc(&page->_mapcount); 178 atomic_inc(&page->_mapcount);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b61d2db9f34e..cc5be788a39f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/sysfs.h> 19#include <linux/sysfs.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rmap.h>
22#include <linux/swap.h>
23#include <linux/swapops.h>
21 24
22#include <asm/page.h> 25#include <asm/page.h>
23#include <asm/pgtable.h> 26#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
220 (vma->vm_pgoff >> huge_page_order(h)); 223 (vma->vm_pgoff >> huge_page_order(h));
221} 224}
222 225
226pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
227 unsigned long address)
228{
229 return vma_hugecache_offset(hstate_vma(vma), vma, address);
230}
231
223/* 232/*
224 * Return the size of the pages allocated when backing a VMA. In the majority 233 * Return the size of the pages allocated when backing a VMA. In the majority
225 * cases this will be same size as used by the page table entries. 234 * cases this will be same size as used by the page table entries.
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page)
552 set_page_private(page, 0); 561 set_page_private(page, 0);
553 page->mapping = NULL; 562 page->mapping = NULL;
554 BUG_ON(page_count(page)); 563 BUG_ON(page_count(page));
564 BUG_ON(page_mapcount(page));
555 INIT_LIST_HEAD(&page->lru); 565 INIT_LIST_HEAD(&page->lru);
556 566
557 spin_lock(&hugetlb_lock); 567 spin_lock(&hugetlb_lock);
@@ -605,6 +615,8 @@ int PageHuge(struct page *page)
605 return dtor == free_huge_page; 615 return dtor == free_huge_page;
606} 616}
607 617
618EXPORT_SYMBOL_GPL(PageHuge);
619
608static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 620static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
609{ 621{
610 struct page *page; 622 struct page *page;
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2129 entry = huge_ptep_get(src_pte); 2141 entry = huge_ptep_get(src_pte);
2130 ptepage = pte_page(entry); 2142 ptepage = pte_page(entry);
2131 get_page(ptepage); 2143 get_page(ptepage);
2144 page_dup_rmap(ptepage);
2132 set_huge_pte_at(dst, addr, dst_pte, entry); 2145 set_huge_pte_at(dst, addr, dst_pte, entry);
2133 } 2146 }
2134 spin_unlock(&src->page_table_lock); 2147 spin_unlock(&src->page_table_lock);
@@ -2140,6 +2153,19 @@ nomem:
2140 return -ENOMEM; 2153 return -ENOMEM;
2141} 2154}
2142 2155
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{
2158 swp_entry_t swp;
2159
2160 if (huge_pte_none(pte) || pte_present(pte))
2161 return 0;
2162 swp = pte_to_swp_entry(pte);
2163 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
2164 return 1;
2165 } else
2166 return 0;
2167}
2168
2143void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2169void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2144 unsigned long end, struct page *ref_page) 2170 unsigned long end, struct page *ref_page)
2145{ 2171{
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2198 if (huge_pte_none(pte)) 2224 if (huge_pte_none(pte))
2199 continue; 2225 continue;
2200 2226
2227 /*
2228 * HWPoisoned hugepage is already unmapped and dropped reference
2229 */
2230 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2231 continue;
2232
2201 page = pte_page(pte); 2233 page = pte_page(pte);
2202 if (pte_dirty(pte)) 2234 if (pte_dirty(pte))
2203 set_page_dirty(page); 2235 set_page_dirty(page);
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2207 flush_tlb_range(vma, start, end); 2239 flush_tlb_range(vma, start, end);
2208 mmu_notifier_invalidate_range_end(mm, start, end); 2240 mmu_notifier_invalidate_range_end(mm, start, end);
2209 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2241 list_for_each_entry_safe(page, tmp, &page_list, lru) {
2242 page_remove_rmap(page);
2210 list_del(&page->lru); 2243 list_del(&page->lru);
2211 put_page(page); 2244 put_page(page);
2212 } 2245 }
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2272 return 1; 2305 return 1;
2273} 2306}
2274 2307
2308/*
2309 * Hugetlb_cow() should be called with page lock of the original hugepage held.
2310 */
2275static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2311static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2276 unsigned long address, pte_t *ptep, pte_t pte, 2312 unsigned long address, pte_t *ptep, pte_t pte,
2277 struct page *pagecache_page) 2313 struct page *pagecache_page)
@@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2286retry_avoidcopy: 2322retry_avoidcopy:
2287 /* If no-one else is actually using this page, avoid the copy 2323 /* If no-one else is actually using this page, avoid the copy
2288 * and just make the page writable */ 2324 * and just make the page writable */
2289 avoidcopy = (page_count(old_page) == 1); 2325 avoidcopy = (page_mapcount(old_page) == 1);
2290 if (avoidcopy) { 2326 if (avoidcopy) {
2327 if (!trylock_page(old_page)) {
2328 if (PageAnon(old_page))
2329 page_move_anon_rmap(old_page, vma, address);
2330 } else
2331 unlock_page(old_page);
2291 set_huge_ptep_writable(vma, address, ptep); 2332 set_huge_ptep_writable(vma, address, ptep);
2292 return 0; 2333 return 0;
2293 } 2334 }
@@ -2338,6 +2379,13 @@ retry_avoidcopy:
2338 return -PTR_ERR(new_page); 2379 return -PTR_ERR(new_page);
2339 } 2380 }
2340 2381
2382 /*
2383 * When the original hugepage is shared one, it does not have
2384 * anon_vma prepared.
2385 */
2386 if (unlikely(anon_vma_prepare(vma)))
2387 return VM_FAULT_OOM;
2388
2341 copy_huge_page(new_page, old_page, address, vma); 2389 copy_huge_page(new_page, old_page, address, vma);
2342 __SetPageUptodate(new_page); 2390 __SetPageUptodate(new_page);
2343 2391
@@ -2355,6 +2403,8 @@ retry_avoidcopy:
2355 huge_ptep_clear_flush(vma, address, ptep); 2403 huge_ptep_clear_flush(vma, address, ptep);
2356 set_huge_pte_at(mm, address, ptep, 2404 set_huge_pte_at(mm, address, ptep,
2357 make_huge_pte(vma, new_page, 1)); 2405 make_huge_pte(vma, new_page, 1));
2406 page_remove_rmap(old_page);
2407 hugepage_add_anon_rmap(new_page, vma, address);
2358 /* Make the old page be freed below */ 2408 /* Make the old page be freed below */
2359 new_page = old_page; 2409 new_page = old_page;
2360 mmu_notifier_invalidate_range_end(mm, 2410 mmu_notifier_invalidate_range_end(mm,
@@ -2458,10 +2508,29 @@ retry:
2458 spin_lock(&inode->i_lock); 2508 spin_lock(&inode->i_lock);
2459 inode->i_blocks += blocks_per_huge_page(h); 2509 inode->i_blocks += blocks_per_huge_page(h);
2460 spin_unlock(&inode->i_lock); 2510 spin_unlock(&inode->i_lock);
2511 page_dup_rmap(page);
2461 } else { 2512 } else {
2462 lock_page(page); 2513 lock_page(page);
2463 page->mapping = HUGETLB_POISON; 2514 if (unlikely(anon_vma_prepare(vma))) {
2515 ret = VM_FAULT_OOM;
2516 goto backout_unlocked;
2517 }
2518 hugepage_add_new_anon_rmap(page, vma, address);
2464 } 2519 }
2520 } else {
2521 page_dup_rmap(page);
2522 }
2523
2524 /*
2525 * Since memory error handler replaces pte into hwpoison swap entry
2526 * at the time of error handling, a process which reserved but not have
2527 * the mapping to the error hugepage does not have hwpoison swap entry.
2528 * So we need to block accesses from such a process by checking
2529 * PG_hwpoison bit here.
2530 */
2531 if (unlikely(PageHWPoison(page))) {
2532 ret = VM_FAULT_HWPOISON;
2533 goto backout_unlocked;
2465 } 2534 }
2466 2535
2467 /* 2536 /*
@@ -2513,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2513 pte_t *ptep; 2582 pte_t *ptep;
2514 pte_t entry; 2583 pte_t entry;
2515 int ret; 2584 int ret;
2585 struct page *page = NULL;
2516 struct page *pagecache_page = NULL; 2586 struct page *pagecache_page = NULL;
2517 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 2587 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2518 struct hstate *h = hstate_vma(vma); 2588 struct hstate *h = hstate_vma(vma);
2519 2589
2590 ptep = huge_pte_offset(mm, address);
2591 if (ptep) {
2592 entry = huge_ptep_get(ptep);
2593 if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2594 return VM_FAULT_HWPOISON;
2595 }
2596
2520 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2597 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2521 if (!ptep) 2598 if (!ptep)
2522 return VM_FAULT_OOM; 2599 return VM_FAULT_OOM;
@@ -2554,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2554 vma, address); 2631 vma, address);
2555 } 2632 }
2556 2633
2634 if (!pagecache_page) {
2635 page = pte_page(entry);
2636 lock_page(page);
2637 }
2638
2557 spin_lock(&mm->page_table_lock); 2639 spin_lock(&mm->page_table_lock);
2558 /* Check for a racing update before calling hugetlb_cow */ 2640 /* Check for a racing update before calling hugetlb_cow */
2559 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 2641 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2579,6 +2661,8 @@ out_page_table_lock:
2579 if (pagecache_page) { 2661 if (pagecache_page) {
2580 unlock_page(pagecache_page); 2662 unlock_page(pagecache_page);
2581 put_page(pagecache_page); 2663 put_page(pagecache_page);
2664 } else {
2665 unlock_page(page);
2582 } 2666 }
2583 2667
2584out_mutex: 2668out_mutex:
@@ -2791,3 +2875,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2791 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2875 hugetlb_put_quota(inode->i_mapping, (chg - freed));
2792 hugetlb_acct_memory(h, -(chg - freed)); 2876 hugetlb_acct_memory(h, -(chg - freed));
2793} 2877}
2878
2879/*
2880 * This function is called from memory failure code.
2881 * Assume the caller holds page lock of the head page.
2882 */
2883void __isolate_hwpoisoned_huge_page(struct page *hpage)
2884{
2885 struct hstate *h = page_hstate(hpage);
2886 int nid = page_to_nid(hpage);
2887
2888 spin_lock(&hugetlb_lock);
2889 list_del(&hpage->lru);
2890 h->free_huge_pages--;
2891 h->free_huge_pages_node[nid]--;
2892 spin_unlock(&hugetlb_lock);
2893}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea71905c1f..0948f1072d6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/swap.h> 6#include <linux/swap.h>
7#include <linux/pagemap.h> 7#include <linux/pagemap.h>
8#include <linux/hugetlb.h>
8#include "internal.h" 9#include "internal.h"
9 10
10static struct dentry *hwpoison_dir; 11static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
13{ 14{
14 unsigned long pfn = val; 15 unsigned long pfn = val;
15 struct page *p; 16 struct page *p;
17 struct page *hpage;
16 int err; 18 int err;
17 19
18 if (!capable(CAP_SYS_ADMIN)) 20 if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
24 return -ENXIO; 26 return -ENXIO;
25 27
26 p = pfn_to_page(pfn); 28 p = pfn_to_page(pfn);
29 hpage = compound_head(p);
27 /* 30 /*
28 * This implies unable to support free buddy pages. 31 * This implies unable to support free buddy pages.
29 */ 32 */
30 if (!get_page_unless_zero(p)) 33 if (!get_page_unless_zero(hpage))
31 return 0; 34 return 0;
32 35
33 if (!PageLRU(p)) 36 if (!PageLRU(p) && !PageHuge(p))
34 shake_page(p, 0); 37 shake_page(p, 0);
35 /* 38 /*
36 * This implies unable to support non-LRU pages. 39 * This implies unable to support non-LRU pages.
37 */ 40 */
38 if (!PageLRU(p)) 41 if (!PageLRU(p) && !PageHuge(p))
39 return 0; 42 return 0;
40 43
41 /* 44 /*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
44 * We temporarily take page lock for try_get_mem_cgroup_from_page(). 47 * We temporarily take page lock for try_get_mem_cgroup_from_page().
45 * __memory_failure() will redo the check reliably inside page lock. 48 * __memory_failure() will redo the check reliably inside page lock.
46 */ 49 */
47 lock_page(p); 50 lock_page(hpage);
48 err = hwpoison_filter(p); 51 err = hwpoison_filter(hpage);
49 unlock_page(p); 52 unlock_page(hpage);
50 if (err) 53 if (err)
51 return 0; 54 return 0;
52 55
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6b44e52cacaa..9c26eeca1342 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
46#include <linux/suspend.h> 46#include <linux/suspend.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/swapops.h> 48#include <linux/swapops.h>
49#include <linux/hugetlb.h>
49#include "internal.h" 50#include "internal.h"
50 51
51int sysctl_memory_failure_early_kill __read_mostly = 0; 52int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
690/* 691/*
691 * Huge pages. Needs work. 692 * Huge pages. Needs work.
692 * Issues: 693 * Issues:
693 * No rmap support so we cannot find the original mapper. In theory could walk 694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
694 * all MMs and look for the mappings, but that would be non atomic and racy. 695 * To narrow down kill region to one page, we need to break up pmd.
695 * Need rmap for hugepages for this. Alternatively we could employ a heuristic, 696 * - To support soft-offlining for hugepage, we need to support hugepage
696 * like just walking the current process and hoping it has it mapped (that 697 * migration.
697 * should be usually true for the common "shared database cache" case)
698 * Should handle free huge pages and dequeue them too, but this needs to
699 * handle huge page accounting correctly.
700 */ 698 */
701static int me_huge_page(struct page *p, unsigned long pfn) 699static int me_huge_page(struct page *p, unsigned long pfn)
702{ 700{
703 return FAILED; 701 struct page *hpage = compound_head(p);
702 /*
703 * We can safely recover from error on free or reserved (i.e.
704 * not in-use) hugepage by dequeuing it from freelist.
705 * To check whether a hugepage is in-use or not, we can't use
706 * page->lru because it can be used in other hugepage operations,
707 * such as __unmap_hugepage_range() and gather_surplus_pages().
708 * So instead we use page_mapping() and PageAnon().
709 * We assume that this function is called with page lock held,
710 * so there is no race between isolation and mapping/unmapping.
711 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage);
714 return RECOVERED;
715 }
716 return DELAYED;
704} 717}
705 718
706/* 719/*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
838 int ret; 851 int ret;
839 int i; 852 int i;
840 int kill = 1; 853 int kill = 1;
854 struct page *hpage = compound_head(p);
841 855
842 if (PageReserved(p) || PageSlab(p)) 856 if (PageReserved(p) || PageSlab(p))
843 return SWAP_SUCCESS; 857 return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
846 * This check implies we don't kill processes if their pages 860 * This check implies we don't kill processes if their pages
847 * are in the swap cache early. Those are always late kills. 861 * are in the swap cache early. Those are always late kills.
848 */ 862 */
849 if (!page_mapped(p)) 863 if (!page_mapped(hpage))
850 return SWAP_SUCCESS; 864 return SWAP_SUCCESS;
851 865
852 if (PageCompound(p) || PageKsm(p)) 866 if (PageKsm(p))
853 return SWAP_FAIL; 867 return SWAP_FAIL;
854 868
855 if (PageSwapCache(p)) { 869 if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
864 * XXX: the dirty test could be racy: set_page_dirty() may not always 878 * XXX: the dirty test could be racy: set_page_dirty() may not always
865 * be called inside page lock (it's recommended but not enforced). 879 * be called inside page lock (it's recommended but not enforced).
866 */ 880 */
867 mapping = page_mapping(p); 881 mapping = page_mapping(hpage);
868 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 882 if (!PageDirty(hpage) && mapping &&
869 if (page_mkclean(p)) { 883 mapping_cap_writeback_dirty(mapping)) {
870 SetPageDirty(p); 884 if (page_mkclean(hpage)) {
885 SetPageDirty(hpage);
871 } else { 886 } else {
872 kill = 0; 887 kill = 0;
873 ttu |= TTU_IGNORE_HWPOISON; 888 ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
886 * there's nothing that can be done. 901 * there's nothing that can be done.
887 */ 902 */
888 if (kill) 903 if (kill)
889 collect_procs(p, &tokill); 904 collect_procs(hpage, &tokill);
890 905
891 /* 906 /*
892 * try_to_unmap can fail temporarily due to races. 907 * try_to_unmap can fail temporarily due to races.
893 * Try a few times (RED-PEN better strategy?) 908 * Try a few times (RED-PEN better strategy?)
894 */ 909 */
895 for (i = 0; i < N_UNMAP_TRIES; i++) { 910 for (i = 0; i < N_UNMAP_TRIES; i++) {
896 ret = try_to_unmap(p, ttu); 911 ret = try_to_unmap(hpage, ttu);
897 if (ret == SWAP_SUCCESS) 912 if (ret == SWAP_SUCCESS)
898 break; 913 break;
899 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); 914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
901 916
902 if (ret != SWAP_SUCCESS) 917 if (ret != SWAP_SUCCESS)
903 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
904 pfn, page_mapcount(p)); 919 pfn, page_mapcount(hpage));
905 920
906 /* 921 /*
907 * Now that the dirty bit has been propagated to the 922 * Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
912 * use a more force-full uncatchable kill to prevent 927 * use a more force-full uncatchable kill to prevent
913 * any accesses to the poisoned memory. 928 * any accesses to the poisoned memory.
914 */ 929 */
915 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 930 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
916 ret != SWAP_SUCCESS, pfn); 931 ret != SWAP_SUCCESS, pfn);
917 932
918 return ret; 933 return ret;
919} 934}
920 935
936static void set_page_hwpoison_huge_page(struct page *hpage)
937{
938 int i;
939 int nr_pages = 1 << compound_order(hpage);
940 for (i = 0; i < nr_pages; i++)
941 SetPageHWPoison(hpage + i);
942}
943
944static void clear_page_hwpoison_huge_page(struct page *hpage)
945{
946 int i;
947 int nr_pages = 1 << compound_order(hpage);
948 for (i = 0; i < nr_pages; i++)
949 ClearPageHWPoison(hpage + i);
950}
951
921int __memory_failure(unsigned long pfn, int trapno, int flags) 952int __memory_failure(unsigned long pfn, int trapno, int flags)
922{ 953{
923 struct page_state *ps; 954 struct page_state *ps;
924 struct page *p; 955 struct page *p;
956 struct page *hpage;
925 int res; 957 int res;
958 unsigned int nr_pages;
926 959
927 if (!sysctl_memory_failure_recovery) 960 if (!sysctl_memory_failure_recovery)
928 panic("Memory failure from trap %d on page %lx", trapno, pfn); 961 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
935 } 968 }
936 969
937 p = pfn_to_page(pfn); 970 p = pfn_to_page(pfn);
971 hpage = compound_head(p);
938 if (TestSetPageHWPoison(p)) { 972 if (TestSetPageHWPoison(p)) {
939 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 973 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
940 return 0; 974 return 0;
941 } 975 }
942 976
943 atomic_long_add(1, &mce_bad_pages); 977 nr_pages = 1 << compound_order(hpage);
978 atomic_long_add(nr_pages, &mce_bad_pages);
944 979
945 /* 980 /*
946 * We need/can do nothing about count=0 pages. 981 * We need/can do nothing about count=0 pages.
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
954 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 989 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
955 */ 990 */
956 if (!(flags & MF_COUNT_INCREASED) && 991 if (!(flags & MF_COUNT_INCREASED) &&
957 !get_page_unless_zero(compound_head(p))) { 992 !get_page_unless_zero(hpage)) {
958 if (is_free_buddy_page(p)) { 993 if (is_free_buddy_page(p)) {
959 action_result(pfn, "free buddy", DELAYED); 994 action_result(pfn, "free buddy", DELAYED);
960 return 0; 995 return 0;
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
972 * The check (unnecessarily) ignores LRU pages being isolated and 1007 * The check (unnecessarily) ignores LRU pages being isolated and
973 * walked by the page reclaim code, however that's not a big loss. 1008 * walked by the page reclaim code, however that's not a big loss.
974 */ 1009 */
975 if (!PageLRU(p)) 1010 if (!PageLRU(p) && !PageHuge(p))
976 shake_page(p, 0); 1011 shake_page(p, 0);
977 if (!PageLRU(p)) { 1012 if (!PageLRU(p) && !PageHuge(p)) {
978 /* 1013 /*
979 * shake_page could have turned it free. 1014 * shake_page could have turned it free.
980 */ 1015 */
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
992 * It's very difficult to mess with pages currently under IO 1027 * It's very difficult to mess with pages currently under IO
993 * and in many cases impossible, so we just avoid it here. 1028 * and in many cases impossible, so we just avoid it here.
994 */ 1029 */
995 lock_page_nosync(p); 1030 lock_page_nosync(hpage);
996 1031
997 /* 1032 /*
998 * unpoison always clear PG_hwpoison inside page lock 1033 * unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1004 } 1039 }
1005 if (hwpoison_filter(p)) { 1040 if (hwpoison_filter(p)) {
1006 if (TestClearPageHWPoison(p)) 1041 if (TestClearPageHWPoison(p))
1007 atomic_long_dec(&mce_bad_pages); 1042 atomic_long_sub(nr_pages, &mce_bad_pages);
1008 unlock_page(p); 1043 unlock_page(hpage);
1009 put_page(p); 1044 put_page(hpage);
1045 return 0;
1046 }
1047
1048 /*
1049 * For error on the tail page, we should set PG_hwpoison
1050 * on the head page to show that the hugepage is hwpoisoned
1051 */
1052 if (PageTail(p) && TestSetPageHWPoison(hpage)) {
1053 action_result(pfn, "hugepage already hardware poisoned",
1054 IGNORED);
1055 unlock_page(hpage);
1056 put_page(hpage);
1010 return 0; 1057 return 0;
1011 } 1058 }
1059 /*
1060 * Set PG_hwpoison on all pages in an error hugepage,
1061 * because containment is done in hugepage unit for now.
1062 * Since we have done TestSetPageHWPoison() for the head page with
1063 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1064 */
1065 if (PageHuge(p))
1066 set_page_hwpoison_huge_page(hpage);
1012 1067
1013 wait_on_page_writeback(p); 1068 wait_on_page_writeback(p);
1014 1069
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1039 } 1094 }
1040 } 1095 }
1041out: 1096out:
1042 unlock_page(p); 1097 unlock_page(hpage);
1043 return res; 1098 return res;
1044} 1099}
1045EXPORT_SYMBOL_GPL(__memory_failure); 1100EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
1083 struct page *page; 1138 struct page *page;
1084 struct page *p; 1139 struct page *p;
1085 int freeit = 0; 1140 int freeit = 0;
1141 unsigned int nr_pages;
1086 1142
1087 if (!pfn_valid(pfn)) 1143 if (!pfn_valid(pfn))
1088 return -ENXIO; 1144 return -ENXIO;
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
1095 return 0; 1151 return 0;
1096 } 1152 }
1097 1153
1154 nr_pages = 1 << compound_order(page);
1155
1098 if (!get_page_unless_zero(page)) { 1156 if (!get_page_unless_zero(page)) {
1099 if (TestClearPageHWPoison(p)) 1157 if (TestClearPageHWPoison(p))
1100 atomic_long_dec(&mce_bad_pages); 1158 atomic_long_sub(nr_pages, &mce_bad_pages);
1101 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1102 return 0; 1160 return 0;
1103 } 1161 }
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
1109 * the PG_hwpoison page will be caught and isolated on the entrance to 1167 * the PG_hwpoison page will be caught and isolated on the entrance to
1110 * the free buddy page pool. 1168 * the free buddy page pool.
1111 */ 1169 */
1112 if (TestClearPageHWPoison(p)) { 1170 if (TestClearPageHWPoison(page)) {
1113 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1114 atomic_long_dec(&mce_bad_pages); 1172 atomic_long_sub(nr_pages, &mce_bad_pages);
1115 freeit = 1; 1173 freeit = 1;
1116 } 1174 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1117 unlock_page(page); 1177 unlock_page(page);
1118 1178
1119 put_page(page); 1179 put_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index a7d0f5482634..87b9e8ad4509 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
56#include <linux/memcontrol.h> 56#include <linux/memcontrol.h>
57#include <linux/mmu_notifier.h> 57#include <linux/mmu_notifier.h>
58#include <linux/migrate.h> 58#include <linux/migrate.h>
59#include <linux/hugetlb.h>
59 60
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61 62
@@ -350,6 +351,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
350 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 351 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
351 unsigned long address; 352 unsigned long address;
352 353
354 if (unlikely(is_vm_hugetlb_page(vma)))
355 pgoff = page->index << huge_page_order(page_hstate(page));
353 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 356 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
354 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 357 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
355 /* page should be within @vma mapping range */ 358 /* page should be within @vma mapping range */
@@ -394,6 +397,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
394 pte_t *pte; 397 pte_t *pte;
395 spinlock_t *ptl; 398 spinlock_t *ptl;
396 399
400 if (unlikely(PageHuge(page))) {
401 pte = huge_pte_offset(mm, address);
402 ptl = &mm->page_table_lock;
403 goto check;
404 }
405
397 pgd = pgd_offset(mm, address); 406 pgd = pgd_offset(mm, address);
398 if (!pgd_present(*pgd)) 407 if (!pgd_present(*pgd))
399 return NULL; 408 return NULL;
@@ -414,6 +423,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
414 } 423 }
415 424
416 ptl = pte_lockptr(mm, pmd); 425 ptl = pte_lockptr(mm, pmd);
426check:
417 spin_lock(ptl); 427 spin_lock(ptl);
418 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 428 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
419 *ptlp = ptl; 429 *ptlp = ptl;
@@ -916,6 +926,12 @@ void page_remove_rmap(struct page *page)
916 page_clear_dirty(page); 926 page_clear_dirty(page);
917 set_page_dirty(page); 927 set_page_dirty(page);
918 } 928 }
929 /*
930 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
931 * and not charged by memcg for now.
932 */
933 if (unlikely(PageHuge(page)))
934 return;
919 if (PageAnon(page)) { 935 if (PageAnon(page)) {
920 mem_cgroup_uncharge_page(page); 936 mem_cgroup_uncharge_page(page);
921 __dec_zone_page_state(page, NR_ANON_PAGES); 937 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1524,3 +1540,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
1524 return rmap_walk_file(page, rmap_one, arg); 1540 return rmap_walk_file(page, rmap_one, arg);
1525} 1541}
1526#endif /* CONFIG_MIGRATION */ 1542#endif /* CONFIG_MIGRATION */
1543
1544#ifdef CONFIG_HUGETLB_PAGE
1545/*
1546 * The following three functions are for anonymous (private mapped) hugepages.
1547 * Unlike common anonymous pages, anonymous hugepages have no accounting code
1548 * and no lru code, because we handle hugepages differently from common pages.
1549 */
1550static void __hugepage_set_anon_rmap(struct page *page,
1551 struct vm_area_struct *vma, unsigned long address, int exclusive)
1552{
1553 struct anon_vma *anon_vma = vma->anon_vma;
1554 BUG_ON(!anon_vma);
1555 if (!exclusive) {
1556 struct anon_vma_chain *avc;
1557 avc = list_entry(vma->anon_vma_chain.prev,
1558 struct anon_vma_chain, same_vma);
1559 anon_vma = avc->anon_vma;
1560 }
1561 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1562 page->mapping = (struct address_space *) anon_vma;
1563 page->index = linear_page_index(vma, address);
1564}
1565
1566void hugepage_add_anon_rmap(struct page *page,
1567 struct vm_area_struct *vma, unsigned long address)
1568{
1569 struct anon_vma *anon_vma = vma->anon_vma;
1570 int first;
1571 BUG_ON(!anon_vma);
1572 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1573 first = atomic_inc_and_test(&page->_mapcount);
1574 if (first)
1575 __hugepage_set_anon_rmap(page, vma, address, 0);
1576}
1577
1578void hugepage_add_new_anon_rmap(struct page *page,
1579 struct vm_area_struct *vma, unsigned long address)
1580{
1581 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1582 atomic_set(&page->_mapcount, 0);
1583 __hugepage_set_anon_rmap(page, vma, address, 1);
1584}
1585#endif /* CONFIG_HUGETLB_PAGE */