author     Andi Kleen <ak@linux.intel.com>   2010-10-22 11:40:48 -0400
committer  Andi Kleen <ak@linux.intel.com>   2010-10-22 11:40:48 -0400
commit     46e387bbd82d438b9131e237e6e2cb55a825da49
tree       414948afd6b4d63c6ea8cc79ce022128bc1bf2eb
parent     e9d08567ef72a2d0fb9b14dded386352d3136442
parent     3ef8fd7f720fc4f462fcdcae2fcde6f1c0536bfe

Merge branch 'hwpoison-hugepages' into hwpoison

Conflicts:
	mm/memory-failure.c
Diffstat:
 arch/x86/mm/fault.c      |  19
 fs/hugetlbfs/inode.c     |  15
 include/linux/hugetlb.h  |  17
 include/linux/migrate.h  |  16
 include/linux/mm.h       |  12
 mm/hugetlb.c             | 233
 mm/memory-failure.c      | 102
 mm/memory.c              |   3
 mm/migrate.c             | 234
 mm/rmap.c                |  25
10 files changed, 551 insertions, 125 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 79b0b372d2d0..852b319edbdc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,7 @@
 #include <linux/kprobes.h>		/* __kprobes, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
+#include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
 
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 
 static void
 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
-		     struct task_struct *tsk)
+		     struct task_struct *tsk, int fault)
 {
+	unsigned lsb = 0;
 	siginfo_t info;
 
 	info.si_signo	= si_signo;
 	info.si_errno	= 0;
 	info.si_code	= si_code;
 	info.si_addr	= (void __user *)address;
-	info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
+	if (fault & VM_FAULT_HWPOISON_LARGE)
+		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+	if (fault & VM_FAULT_HWPOISON)
+		lsb = PAGE_SHIFT;
+	info.si_addr_lsb = lsb;
 
 	force_sig_info(si_signo, &info, tsk);
 }
@@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		tsk->thread.error_code	= error_code | (address >= TASK_SIZE);
 		tsk->thread.trap_no	= 14;
 
-		force_sig_info_fault(SIGSEGV, si_code, address, tsk);
+		force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
 
 		return;
 	}
@@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	tsk->thread.trap_no	= 14;
 
 #ifdef CONFIG_MEMORY_FAILURE
-	if (fault & VM_FAULT_HWPOISON) {
+	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
 		printk(KERN_ERR
 	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
 			tsk->comm, tsk->pid, address);
 		code = BUS_MCEERR_AR;
 	}
 #endif
-	force_sig_info_fault(SIGBUS, code, address, tsk);
+	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
 }
 
 static noinline void
@@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	if (fault & VM_FAULT_OOM) {
 		out_of_memory(regs, error_code, address);
 	} else {
-		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
+		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
+			     VM_FAULT_HWPOISON_LARGE))
 			do_sigbus(regs, error_code, address, fault);
 		else
 			BUG();
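Not part of the patch: for context, the si_addr_lsb plumbing above is what userspace sees in the SIGBUS it receives. A minimal handler sketch, assuming a libc that exposes the Linux-specific BUS_MCEERR_* codes and the si_addr_lsb siginfo field:

/*
 * Illustrative sketch only: decode the poison granularity the kernel now
 * reports. With VM_FAULT_HWPOISON_LARGE, si_addr_lsb carries the hugepage
 * shift (e.g. 21 for a 2MB page) instead of PAGE_SHIFT.
 */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static void hwpoison_handler(int sig, siginfo_t *si, void *ctx)
{
	if (si->si_code == BUS_MCEERR_AR || si->si_code == BUS_MCEERR_AO)
		/* fprintf() is not async-signal-safe; fine for a demo. */
		fprintf(stderr, "hwpoison at %p, 2^%d bytes affected\n",
			si->si_addr, si->si_addr_lsb);
	_exit(EXIT_FAILURE);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = hwpoison_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGBUS, &sa, NULL);
	pause();	/* ...touch memory; a poisoned hugepage reports its real size */
	return 0;
}
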
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6e5bd42f3860..1f7ca505d48e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -31,6 +31,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/magic.h>
+#include <linux/migrate.h>
 
 #include <asm/uaccess.h>
 
@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
 	return 0;
 }
 
+static int hugetlbfs_migrate_page(struct address_space *mapping,
+				struct page *newpage, struct page *page)
+{
+	int rc;
+
+	rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+	if (rc)
+		return rc;
+	migrate_page_copy(newpage, page);
+
+	return 0;
+}
+
 static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
 	.write_begin	= hugetlbfs_write_begin,
 	.write_end	= hugetlbfs_write_end,
 	.set_page_dirty	= hugetlbfs_set_page_dirty,
+	.migratepage    = hugetlbfs_migrate_page,
 };
 
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f479700df61b..943c76b3d4bb 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
 						struct vm_area_struct *vma,
 						int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
-void __isolate_hwpoisoned_huge_page(struct page *page);
+int dequeue_hwpoisoned_huge_page(struct page *page);
+void copy_huge_page(struct page *dst, struct page *src);
 
 extern unsigned long hugepages_treat_as_movable;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, flags)	({ BUG(); 0; })
 #define huge_pte_offset(mm, address)	0
-#define __isolate_hwpoisoned_huge_page(page)	0
+#define dequeue_hwpoisoned_huge_page(page)	0
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}
 
 #define hugetlb_change_protection(vma, address, end, newprot)
 
@@ -228,6 +232,8 @@ struct huge_bootmem_page {
 	struct hstate *hstate;
 };
 
+struct page *alloc_huge_page_node(struct hstate *h, int nid);
+
 /* arch callback */
 int __init alloc_bootmem_huge_page(struct hstate *h);
 
@@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page)
 	return size_to_hstate(PAGE_SIZE << compound_order(page));
 }
 
+static inline unsigned hstate_index_to_shift(unsigned index)
+{
+	return hstates[index].order + PAGE_SHIFT;
+}
+
 #else
 struct hstate {};
+#define alloc_huge_page_node(h, nid) NULL
 #define alloc_bootmem_huge_page(h) NULL
 #define hstate_file(f) NULL
 #define hstate_vma(v) NULL
@@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
 {
 	return 1;
 }
+#define hstate_index_to_shift(index) 0
 #endif
 
 #endif /* _LINUX_HUGETLB_H */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7238231b8dd4..085527fb8261 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 			unsigned long private, int offlining);
+extern int migrate_huge_pages(struct list_head *l, new_page_t x,
+			unsigned long private, int offlining);
 
 extern int fail_migrate_page(struct address_space *,
 			struct page *, struct page *);
@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
 extern int migrate_vmas(struct mm_struct *mm,
 		const nodemask_t *from, const nodemask_t *to,
 		unsigned long flags);
+extern void migrate_page_copy(struct page *newpage, struct page *page);
+extern int migrate_huge_page_move_mapping(struct address_space *mapping,
+				  struct page *newpage, struct page *page);
 #else
 #define PAGE_MIGRATION 0
 
 static inline void putback_lru_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, int offlining) { return -ENOSYS; }
+static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
+		unsigned long private, int offlining) { return -ENOSYS; }
 
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
 	return -ENOSYS;
 }
 
+static inline void migrate_page_copy(struct page *newpage,
+				     struct page *page) {}
+
+static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
+				  struct page *newpage, struct page *page)
+{
+	return -ENOSYS;
+}
+
 /* Possible settings for the migrate_page() method in address_operations */
 #define migrate_page NULL
 #define fail_migrate_page NULL
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7687228dd3b7..a4c66846fb8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_SIGBUS	0x0002
 #define VM_FAULT_MAJOR	0x0004
 #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
-#define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned page */
+#define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
+#define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 
-#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
+#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
+
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
+			 VM_FAULT_HWPOISON_LARGE)
+
+/* Encode hstate index for a hwpoisoned large page */
+#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
+#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
 
 /*
  * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
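Not part of the patch: the hstate-index encoding above is easiest to sanity-check outside the kernel. The macros below are copied verbatim from this hunk into a stand-alone program; the 1GB-hstate index is only an example.

#include <assert.h>
#include <stdio.h>

#define VM_FAULT_HWPOISON	0x0010
#define VM_FAULT_HWPOISON_LARGE	0x0020
#define VM_FAULT_SET_HINDEX(x)	((x) << 12)
#define VM_FAULT_GET_HINDEX(x)	(((x) >> 12) & 0xf)

int main(void)
{
	unsigned int hindex = 1;	/* e.g. hstates[1], the 1GB pool on x86_64 */
	unsigned int fault = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hindex);

	/* hugetlb_fault() encodes h - hstates; the arch fault code decodes it. */
	assert(fault & VM_FAULT_HWPOISON_LARGE);
	assert(VM_FAULT_GET_HINDEX(fault) == hindex);
	printf("fault=%#x, hstate index=%u\n", fault, VM_FAULT_GET_HINDEX(fault));
	return 0;
}
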
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..96991ded82fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
 	}
 }
 
-static void copy_gigantic_page(struct page *dst, struct page *src,
+static void copy_user_gigantic_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 	struct page *dst_base = dst;
 	struct page *src_base = src;
-	might_sleep();
+
 	for (i = 0; i < pages_per_huge_page(h); ) {
 		cond_resched();
 		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
 		src = mem_map_next(src, src_base, i);
 	}
 }
-static void copy_huge_page(struct page *dst, struct page *src,
+
+static void copy_user_huge_page(struct page *dst, struct page *src,
 			   unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
 	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
-		copy_gigantic_page(dst, src, addr, vma);
+		copy_user_gigantic_page(dst, src, addr, vma);
 		return;
 	}
 
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_highpage(dst, src);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+	int i;
+	struct hstate *h = page_hstate(src);
+
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src);
+		return;
+	}
+
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); i++) {
+		cond_resched();
+		copy_highpage(dst + i, src + i);
+	}
+}
+
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
 	int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
+static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	if (list_empty(&h->hugepage_freelists[nid]))
+		return NULL;
+	page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
+	list_del(&page->lru);
+	set_page_refcounted(page);
+	h->free_huge_pages--;
+	h->free_huge_pages_node[nid]--;
+	return page;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 				struct vm_area_struct *vma,
 				unsigned long address, int avoid_reserve)
 {
-	int nid;
 	struct page *page = NULL;
 	struct mempolicy *mpol;
 	nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
-		nid = zone_to_nid(zone);
-		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
-		    !list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-
-			if (!avoid_reserve)
-				decrement_hugepage_resv_vma(h, vma);
-
-			break;
+		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+			page = dequeue_huge_page_node(h, zone_to_nid(zone));
+			if (page) {
+				if (!avoid_reserve)
+					decrement_hugepage_resv_vma(h, vma);
+				break;
+			}
 		}
 	}
 err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 	return ret;
 }
 
-static struct page *alloc_buddy_huge_page(struct hstate *h,
-			struct vm_area_struct *vma, unsigned long address)
+static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
 {
 	struct page *page;
-	unsigned int nid;
+	unsigned int r_nid;
 
 	if (h->order >= MAX_ORDER)
 		return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
-			   __GFP_REPEAT|__GFP_NOWARN,
-			   huge_page_order(h));
+	if (nid == NUMA_NO_NODE)
+		page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+				   __GFP_REPEAT|__GFP_NOWARN,
+				   huge_page_order(h));
+	else
+		page = alloc_pages_exact_node(nid,
+			htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+			__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
 	if (page && arch_prepare_hugepage(page)) {
 		__free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
-		/*
-		 * This page is now managed by the hugetlb allocator and has
-		 * no users -- drop the buddy allocator's reference.
-		 */
-		put_page_testzero(page);
-		VM_BUG_ON(page_count(page));
-		nid = page_to_nid(page);
+		r_nid = page_to_nid(page);
 		set_compound_page_dtor(page, free_huge_page);
 		/*
 		 * We incremented the global counters already
 		 */
-		h->nr_huge_pages_node[nid]++;
-		h->surplus_huge_pages_node[nid]++;
+		h->nr_huge_pages_node[r_nid]++;
+		h->surplus_huge_pages_node[r_nid]++;
 		__count_vm_event(HTLB_BUDDY_PGALLOC);
 	} else {
 		h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 }
 
 /*
+ * This allocation function is useful in the context where vma is irrelevant.
+ * E.g. soft-offlining uses this function because it only cares physical
+ * address of error page.
+ */
+struct page *alloc_huge_page_node(struct hstate *h, int nid)
+{
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page_node(h, nid);
+	spin_unlock(&hugetlb_lock);
+
+	if (!page)
+		page = alloc_buddy_huge_page(h, nid);
+
+	return page;
+}
+
+/*
  * Increase the hugetlb pool such that it can accomodate a reservation
  * of size 'delta'.
  */
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = alloc_buddy_huge_page(h, NULL, 0);
-		if (!page) {
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+		if (!page)
 			/*
 			 * We were not able to allocate enough pages to
 			 * satisfy the entire reservation so we free what
 			 * we've allocated so far.
 			 */
-			spin_lock(&hugetlb_lock);
-			needed = 0;
 			goto free;
-		}
 
 		list_add(&page->lru, &surplus_list);
 	}
@@ -908,31 +964,31 @@ retry:
 	needed += allocated;
 	h->resv_huge_pages += delta;
 	ret = 0;
-free:
+
+	spin_unlock(&hugetlb_lock);
 	/* Free the needed pages to the hugetlb pool */
 	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 		if ((--needed) < 0)
 			break;
 		list_del(&page->lru);
+		/*
+		 * This page is now managed by the hugetlb allocator and has
+		 * no users -- drop the buddy allocator's reference.
+		 */
+		put_page_testzero(page);
+		VM_BUG_ON(page_count(page));
 		enqueue_huge_page(h, page);
 	}
 
 	/* Free unnecessary surplus pages to the buddy allocator */
+free:
 	if (!list_empty(&surplus_list)) {
-		spin_unlock(&hugetlb_lock);
 		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
 			list_del(&page->lru);
-			/*
-			 * The page has a reference count of zero already, so
-			 * call free_huge_page directly instead of using
-			 * put_page. This must be done with hugetlb_lock
-			 * unlocked which is safe because free_huge_page takes
-			 * hugetlb_lock before deciding how to free the page.
-			 */
-			free_huge_page(page);
+			put_page(page);
 		}
-		spin_lock(&hugetlb_lock);
 	}
+	spin_lock(&hugetlb_lock);
 
 	return ret;
 }
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	spin_unlock(&hugetlb_lock);
 
 	if (!page) {
-		page = alloc_buddy_huge_page(h, vma, addr);
+		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
 			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_refcounted(page);
 	set_page_private(page, (unsigned long) mapping);
 
 	vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
 	return -ENOMEM;
 }
 
+static int is_hugetlb_entry_migration(pte_t pte)
+{
+	swp_entry_t swp;
+
+	if (huge_pte_none(pte) || pte_present(pte))
+		return 0;
+	swp = pte_to_swp_entry(pte);
+	if (non_swap_entry(swp) && is_migration_entry(swp)) {
+		return 1;
+	} else
+		return 0;
+}
+
 static int is_hugetlb_entry_hwpoisoned(pte_t pte)
 {
 	swp_entry_t swp;
@@ -2383,7 +2451,7 @@ retry_avoidcopy:
 	if (unlikely(anon_vma_prepare(vma)))
 		return VM_FAULT_OOM;
 
-	copy_huge_page(new_page, old_page, address, vma);
+	copy_user_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
 	/*
@@ -2515,22 +2583,20 @@ retry:
 			hugepage_add_new_anon_rmap(page, vma, address);
 		}
 	} else {
+		/*
+		 * If memory error occurs between mmap() and fault, some process
+		 * don't have hwpoisoned swap entry for errored virtual address.
+		 * So we need to block hugepage fault by PG_hwpoison bit check.
+		 */
+		if (unlikely(PageHWPoison(page))) {
+			ret = VM_FAULT_HWPOISON |
+			      VM_FAULT_SET_HINDEX(h - hstates);
+			goto backout_unlocked;
+		}
 		page_dup_rmap(page);
 	}
 
 	/*
-	 * Since memory error handler replaces pte into hwpoison swap entry
-	 * at the time of error handling, a process which reserved but not have
-	 * the mapping to the error hugepage does not have hwpoison swap entry.
-	 * So we need to block accesses from such a process by checking
-	 * PG_hwpoison bit here.
-	 */
-	if (unlikely(PageHWPoison(page))) {
-		ret = VM_FAULT_HWPOISON;
-		goto backout_unlocked;
-	}
-
-	/*
 	 * If we are going to COW a private mapping later, we examine the
 	 * pending reservations for this page now. This will ensure that
 	 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ptep = huge_pte_offset(mm, address);
 	if (ptep) {
 		entry = huge_ptep_get(ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
-			return VM_FAULT_HWPOISON;
+		if (unlikely(is_hugetlb_entry_migration(entry))) {
+			migration_entry_wait(mm, (pmd_t *)ptep, address);
+			return 0;
+		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+			return VM_FAULT_HWPOISON_LARGE |
+				VM_FAULT_SET_HINDEX(h - hstates);
 	}
 
 	ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_MEMORY_FAILURE
+
+/* Should be called in hugetlb_lock */
+static int is_hugepage_on_freelist(struct page *hpage)
+{
+	struct page *page;
+	struct page *tmp;
+	struct hstate *h = page_hstate(hpage);
+	int nid = page_to_nid(hpage);
+
+	list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
+		if (page == hpage)
+			return 1;
+	return 0;
+}
+
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
  */
-void __isolate_hwpoisoned_huge_page(struct page *hpage)
+int dequeue_hwpoisoned_huge_page(struct page *hpage)
 {
 	struct hstate *h = page_hstate(hpage);
 	int nid = page_to_nid(hpage);
+	int ret = -EBUSY;
 
 	spin_lock(&hugetlb_lock);
-	list_del(&hpage->lru);
-	h->free_huge_pages--;
-	h->free_huge_pages_node[nid]--;
+	if (is_hugepage_on_freelist(hpage)) {
+		list_del(&hpage->lru);
+		set_page_refcounted(hpage);
+		h->free_huge_pages--;
+		h->free_huge_pages_node[nid]--;
+		ret = 0;
+	}
 	spin_unlock(&hugetlb_lock);
+	return ret;
 }
+#endif
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2044fe8920c2..44a8cefeae6e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -697,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  * Issues:
  * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
  *   To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- *   migration.
  */
 static int me_huge_page(struct page *p, unsigned long pfn)
 {
+	int res = 0;
 	struct page *hpage = compound_head(p);
 	/*
 	 * We can safely recover from error on free or reserved (i.e.
@@ -714,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 	 * so there is no race between isolation and mapping/unmapping.
 	 */
 	if (!(page_mapping(hpage) || PageAnon(hpage))) {
-		__isolate_hwpoisoned_huge_page(hpage);
-		return RECOVERED;
+		res = dequeue_hwpoisoned_huge_page(hpage);
+		if (!res)
+			return RECOVERED;
 	}
 	return DELAYED;
 }
@@ -972,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 	 * We need/can do nothing about count=0 pages.
 	 * 1) it's a free page, and therefore in safe hand:
 	 *    prep_new_page() will be the gate keeper.
-	 * 2) it's part of a non-compound high order page.
+	 * 2) it's a free hugepage, which is also safe:
+	 *    an affected hugepage will be dequeued from hugepage freelist,
+	 *    so there's no concern about reusing it ever after.
+	 * 3) it's part of a non-compound high order page.
 	 *    Implies some kernel user: cannot stop them from
 	 *    R/W the page; let's pray that the page has been
 	 *    used and will be freed some time later.
@@ -984,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
 		if (is_free_buddy_page(p)) {
 			action_result(pfn, "free buddy", DELAYED);
 			return 0;
+		} else if (PageHuge(hpage)) {
+			/*
+			 * Check "just unpoisoned", "filter hit", and
+			 * "race with other subpage."
+			 */
+			lock_page_nosync(hpage);
+			if (!PageHWPoison(hpage)
+			    || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+			    || (p != hpage && TestSetPageHWPoison(hpage))) {
+				atomic_long_sub(nr_pages, &mce_bad_pages);
+				return 0;
+			}
+			set_page_hwpoison_huge_page(hpage);
+			res = dequeue_hwpoisoned_huge_page(hpage);
+			action_result(pfn, "free huge",
+				      res ? IGNORED : DELAYED);
+			unlock_page(hpage);
+			return res;
 		} else {
 			action_result(pfn, "high order kernel", IGNORED);
 			return -EBUSY;
@@ -1145,6 +1166,16 @@ int unpoison_memory(unsigned long pfn)
 	nr_pages = 1 << compound_order(page);
 
 	if (!get_page_unless_zero(page)) {
+		/*
+		 * Since HWPoisoned hugepage should have non-zero refcount,
+		 * race between memory failure and unpoison seems to happen.
+		 * In such case unpoison fails and memory failure runs
+		 * to the end.
+		 */
+		if (PageHuge(page)) {
+			pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+			return 0;
+		}
 		if (TestClearPageHWPoison(p))
 			atomic_long_sub(nr_pages, &mce_bad_pages);
 		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
@@ -1162,9 +1193,9 @@ int unpoison_memory(unsigned long pfn)
 		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
 		atomic_long_sub(nr_pages, &mce_bad_pages);
 		freeit = 1;
+		if (PageHuge(page))
+			clear_page_hwpoison_huge_page(page);
 	}
-	if (PageHuge(p))
-		clear_page_hwpoison_huge_page(page);
 	unlock_page(page);
 
 	put_page(page);
@@ -1178,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
 static struct page *new_page(struct page *p, unsigned long private, int **x)
 {
 	int nid = page_to_nid(p);
-	return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+	if (PageHuge(p))
+		return alloc_huge_page_node(page_hstate(compound_head(p)),
+						   nid);
+	else
+		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1206,8 +1241,15 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	 * was free.
 	 */
 	set_migratetype_isolate(p);
+	/*
+	 * When the target page is a free hugepage, just remove it
+	 * from free hugepage list.
+	 */
 	if (!get_page_unless_zero(compound_head(p))) {
-		if (is_free_buddy_page(p)) {
+		if (PageHuge(p)) {
+			pr_info("get_any_page: %#lx free huge page\n", pfn);
+			ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+		} else if (is_free_buddy_page(p)) {
 			pr_info("get_any_page: %#lx free buddy page\n", pfn);
 			/* Set hwpoison bit while page is still isolated */
 			SetPageHWPoison(p);
@@ -1226,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
 	return ret;
 }
 
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+	int ret;
+	unsigned long pfn = page_to_pfn(page);
+	struct page *hpage = compound_head(page);
+	LIST_HEAD(pagelist);
+
+	ret = get_any_page(page, pfn, flags);
+	if (ret < 0)
+		return ret;
+	if (ret == 0)
+		goto done;
+
+	if (PageHWPoison(hpage)) {
+		put_page(hpage);
+		pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+		return -EBUSY;
+	}
+
+	/* Keep page count to indicate a given hugepage is isolated. */
+
+	list_add(&hpage->lru, &pagelist);
+	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+	if (ret) {
+		pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+			 pfn, ret, page->flags);
+		if (ret > 0)
+			ret = -EIO;
+		return ret;
+	}
+done:
+	if (!PageHWPoison(hpage))
+		atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+	set_page_hwpoison_huge_page(hpage);
+	dequeue_hwpoisoned_huge_page(hpage);
+	/* keep elevated page count for bad page */
+	return ret;
+}
+
 /**
  * soft_offline_page - Soft offline a page.
  * @page: page to offline
@@ -1253,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags)
 	int ret;
 	unsigned long pfn = page_to_pfn(page);
 
+	if (PageHuge(page))
+		return soft_offline_huge_page(page, flags);
+
 	ret = get_any_page(page, pfn, flags);
 	if (ret < 0)
 		return ret;
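Not part of the patch: with the PageHuge() branch added to soft_offline_page() above, the pre-existing MADV_SOFT_OFFLINE injector can drive hugepage soft-offlining from userspace. A hedged sketch, assuming CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN, a 2MB default hugepage size, and free pages in the hugepage pool:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101		/* value from asm-generic/mman-common.h */
#endif
#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000		/* x86 value; needs 2.6.32+ */
#endif

#define HPAGE_SIZE (2UL * 1024 * 1024)	/* assumes 2MB hugepages */

int main(void)
{
	/* MAP_HUGETLB needs a populated hugepage pool (vm.nr_hugepages > 0). */
	void *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap(MAP_HUGETLB)");
		return 1;
	}
	memset(p, 0, HPAGE_SIZE);	/* fault the hugepage in */

	/* The kernel migrates the contents and retires the source hugepage. */
	if (madvise(p, HPAGE_SIZE, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	return 0;
}
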
diff --git a/mm/memory.c b/mm/memory.c
index 98b58fecedef..af82741caaa4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			if (ret & VM_FAULT_OOM)
 				return i ? i : -ENOMEM;
 			if (ret &
-			    (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
+			    (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
+			     VM_FAULT_SIGBUS))
 				return i ? i : -EFAULT;
 			BUG();
 		}
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..f8c9bccf2520 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/hugetlb.h>
 #include <linux/gfp.h>
 
 #include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
 
-	pgd = pgd_offset(mm, addr);
-	if (!pgd_present(*pgd))
-		goto out;
+	if (unlikely(PageHuge(new))) {
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep)
+			goto out;
+		ptl = &mm->page_table_lock;
+	} else {
+		pgd = pgd_offset(mm, addr);
+		if (!pgd_present(*pgd))
+			goto out;
 
-	pud = pud_offset(pgd, addr);
-	if (!pud_present(*pud))
-		goto out;
+		pud = pud_offset(pgd, addr);
+		if (!pud_present(*pud))
+			goto out;
 
-	pmd = pmd_offset(pud, addr);
-	if (!pmd_present(*pmd))
-		goto out;
+		pmd = pmd_offset(pud, addr);
+		if (!pmd_present(*pmd))
+			goto out;
 
-	ptep = pte_offset_map(pmd, addr);
+		ptep = pte_offset_map(pmd, addr);
 
-	if (!is_swap_pte(*ptep)) {
-		pte_unmap(ptep);
-		goto out;
-	}
+		if (!is_swap_pte(*ptep)) {
+			pte_unmap(ptep);
+			goto out;
+		}
+
+		ptl = pte_lockptr(mm, pmd);
+	}
 
-	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
 	if (!is_swap_pte(pte))
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 	if (is_write_migration_entry(entry))
 		pte = pte_mkwrite(pte);
+#ifdef CONFIG_HUGETLB_PAGE
+	if (PageHuge(new))
+		pte = pte_mkhuge(pte);
+#endif
 	flush_cache_page(vma, addr, pte_pfn(pte));
 	set_pte_at(mm, addr, ptep, pte);
 
-	if (PageAnon(new))
+	if (PageHuge(new)) {
+		if (PageAnon(new))
+			hugepage_add_anon_rmap(new, vma, addr);
+		else
+			page_dup_rmap(new);
+	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
 		page_add_file_rmap(new);
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 }
 
 /*
+ * The expected number of remaining references is the same as that
+ * of migrate_page_move_mapping().
+ */
+int migrate_huge_page_move_mapping(struct address_space *mapping,
+				   struct page *newpage, struct page *page)
+{
+	int expected_count;
+	void **pslot;
+
+	if (!mapping) {
+		if (page_count(page) != 1)
+			return -EAGAIN;
+		return 0;
+	}
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	pslot = radix_tree_lookup_slot(&mapping->page_tree,
+					page_index(page));
+
+	expected_count = 2 + page_has_private(page);
+	if (page_count(page) != expected_count ||
+	    (struct page *)radix_tree_deref_slot(pslot) != page) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	if (!page_freeze_refs(page, expected_count)) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return -EAGAIN;
+	}
+
+	get_page(newpage);
+
+	radix_tree_replace_slot(pslot, newpage);
+
+	page_unfreeze_refs(page, expected_count);
+
+	__put_page(page);
+
+	spin_unlock_irq(&mapping->tree_lock);
+	return 0;
+}
+
+/*
  * Copy the page to its new location
  */
-static void migrate_page_copy(struct page *newpage, struct page *page)
+void migrate_page_copy(struct page *newpage, struct page *page)
 {
-	copy_highpage(newpage, page);
+	if (PageHuge(page))
+		copy_huge_page(newpage, page);
+	else
+		copy_highpage(newpage, page);
 
 	if (PageError(page))
 		SetPageError(newpage);
@@ -724,6 +790,92 @@ move_newpage:
 }
 
 /*
+ * Counterpart of unmap_and_move_page() for hugepage migration.
+ *
+ * This function doesn't wait the completion of hugepage I/O
+ * because there is no race between I/O and migration for hugepage.
+ * Note that currently hugepage I/O occurs only in direct I/O
+ * where no lock is held and PG_writeback is irrelevant,
+ * and writeback status of all subpages are counted in the reference
+ * count of the head page (i.e. if all subpages of a 2MB hugepage are
+ * under direct I/O, the reference of the head page is 512 and a bit more.)
+ * This means that when we try to migrate hugepage whose subpages are
+ * doing direct I/O, some references remain after try_to_unmap() and
+ * hugepage migration fails without data corruption.
+ *
+ * There is also no race when direct I/O is issued on the page under migration,
+ * because then pte is replaced with migration swap entry and direct I/O code
+ * will wait in the page fault for migration to complete.
+ */
+static int unmap_and_move_huge_page(new_page_t get_new_page,
+				unsigned long private, struct page *hpage,
+				int force, int offlining)
+{
+	int rc = 0;
+	int *result = NULL;
+	struct page *new_hpage = get_new_page(hpage, private, &result);
+	int rcu_locked = 0;
+	struct anon_vma *anon_vma = NULL;
+
+	if (!new_hpage)
+		return -ENOMEM;
+
+	rc = -EAGAIN;
+
+	if (!trylock_page(hpage)) {
+		if (!force)
+			goto out;
+		lock_page(hpage);
+	}
+
+	if (PageAnon(hpage)) {
+		rcu_read_lock();
+		rcu_locked = 1;
+
+		if (page_mapped(hpage)) {
+			anon_vma = page_anon_vma(hpage);
+			atomic_inc(&anon_vma->external_refcount);
+		}
+	}
+
+	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+
+	if (!page_mapped(hpage))
+		rc = move_to_new_page(new_hpage, hpage, 1);
+
+	if (rc)
+		remove_migration_ptes(hpage, hpage);
+
+	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
+					    &anon_vma->lock)) {
+		int empty = list_empty(&anon_vma->head);
+		spin_unlock(&anon_vma->lock);
+		if (empty)
+			anon_vma_free(anon_vma);
+	}
+
+	if (rcu_locked)
+		rcu_read_unlock();
+out:
+	unlock_page(hpage);
+
+	if (rc != -EAGAIN) {
+		list_del(&hpage->lru);
+		put_page(hpage);
+	}
+
+	put_page(new_hpage);
+
+	if (result) {
+		if (rc)
+			*result = rc;
+		else
+			*result = page_to_nid(new_hpage);
+	}
+	return rc;
+}
+
+/*
  * migrate_pages
  *
  * The function takes one list of pages to migrate and a function
@@ -788,6 +940,52 @@ out:
 	return nr_failed + retry;
 }
 
+int migrate_huge_pages(struct list_head *from,
+		new_page_t get_new_page, unsigned long private, int offlining)
+{
+	int retry = 1;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int rc;
+
+	for (pass = 0; pass < 10 && retry; pass++) {
+		retry = 0;
+
+		list_for_each_entry_safe(page, page2, from, lru) {
+			cond_resched();
+
+			rc = unmap_and_move_huge_page(get_new_page,
+					private, page, pass > 2, offlining);
+
+			switch(rc) {
+			case -ENOMEM:
+				goto out;
+			case -EAGAIN:
+				retry++;
+				break;
+			case 0:
+				break;
+			default:
+				/* Permanent failure */
+				nr_failed++;
+				break;
+			}
+		}
+	}
+	rc = 0;
+out:
+
+	list_for_each_entry_safe(page, page2, from, lru)
+		put_page(page);
+
+	if (rc)
+		return rc;
+
+	return nr_failed + retry;
+}
+
 #ifdef CONFIG_NUMA
 /*
  * Move a list of individual pages
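Not part of the patch: the calling convention of migrate_huge_pages() above, condensed from its only in-tree user so far (soft_offline_huge_page() in mm/memory-failure.c). new_huge_page() and migrate_one_hugepage() are hypothetical names, the same-node allocation is just a sample policy, and the caller is expected to hold a reference on the isolated hugepage.

#include <linux/hugetlb.h>
#include <linux/list.h>
#include <linux/migrate.h>
#include <linux/mm.h>

/* new_page_t callback: allocate a destination hugepage near the source. */
static struct page *new_huge_page(struct page *p, unsigned long private,
				  int **result)
{
	return alloc_huge_page_node(page_hstate(compound_head(p)),
				    page_to_nid(p));
}

/* Migrate one referenced, isolated hugepage; returns 0 on success. */
static int migrate_one_hugepage(struct page *hpage)
{
	LIST_HEAD(pagelist);

	list_add(&hpage->lru, &pagelist);
	return migrate_huge_pages(&pagelist, new_huge_page, 0, 0);
}
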
diff --git a/mm/rmap.c b/mm/rmap.c
index 92e6757f196e..8adc6e3b09b3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page,
 }
 
 /**
- * __page_set_anon_rmap - setup new anonymous rmap
- * @page:	the page to add the mapping to
- * @vma:	the vm area in which the mapping is added
- * @address:	the user virtual address mapped
+ * __page_set_anon_rmap - set up new anonymous rmap
+ * @page:	Page to add to rmap
+ * @vma:	VM area to add page to.
+ * @address:	User virtual address of the mapping
  * @exclusive:	the page is exclusively owned by the current process
  */
 static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page,
 
 	BUG_ON(!anon_vma);
 
+	if (PageAnon(page))
+		return;
+
 	/*
 	 * If the page isn't exclusively mapped into this vma,
 	 * we must use the _oldest_ possible anon_vma for the
 	 * page mapping!
 	 */
-	if (!exclusive) {
-		if (PageAnon(page))
-			return;
+	if (!exclusive)
 		anon_vma = anon_vma->root;
-	} else {
-		/*
-		 * In this case, swapped-out-but-not-discarded swap-cache
-		 * is remapped. So, no need to update page->mapping here.
-		 * We convice anon_vma poitned by page->mapping is not obsolete
-		 * because vma->anon_vma is necessary to be a family of it.
-		 */
-		if (PageAnon(page))
-			return;
-	}
 
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 	page->mapping = (struct address_space *) anon_vma;