24 files changed, 358 insertions, 417 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f2e574dbc300..801c08b046e6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -176,6 +176,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
        int ret = 0;
        struct device *dev;
+        if (bdi->dev)   /* The driver needs to use separate queues per device */
+                goto exit;
        va_start(args, fmt);
        dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
        va_end(args);
diff --git a/mm/filemap.c b/mm/filemap.c
index ab8553658af3..f3e5f8944d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2029,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping,
 {
        const struct address_space_operations *aops = mapping->a_ops;
-        if (aops->write_begin) {
+        return aops->write_begin(file, mapping, pos, len, flags,
-                return aops->write_begin(file, mapping, pos, len, flags,
                                                        pagep, fsdata);
-        } else {
-                int ret;
-                pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-                unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-                struct inode *inode = mapping->host;
-                struct page *page;
-again:
-                page = __grab_cache_page(mapping, index);
-                *pagep = page;
-                if (!page)
-                        return -ENOMEM;
-                if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
-                        /*
-                         * There is no way to resolve a short write situation
-                         * for a !Uptodate page (except by double copying in
-                         * the caller done by generic_perform_write_2copy).
-                         *
-                         * Instead, we have to bring it uptodate here.
-                         */
-                        ret = aops->readpage(file, page);
-                        page_cache_release(page);
-                        if (ret) {
-                                if (ret == AOP_TRUNCATED_PAGE)
-                                        goto again;
-                                return ret;
-                        }
-                        goto again;
-                }
-                ret = aops->prepare_write(file, page, offset, offset+len);
-                if (ret) {
-                        unlock_page(page);
-                        page_cache_release(page);
-                        if (pos + len > inode->i_size)
-                                vmtruncate(inode, inode->i_size);
-                }
-                return ret;
-        }
 }
 EXPORT_SYMBOL(pagecache_write_begin);
@@ -2079,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
                                struct page *page, void *fsdata)
 {
        const struct address_space_operations *aops = mapping->a_ops;
-        int ret;
-        if (aops->write_end) {
-                mark_page_accessed(page);
-                ret = aops->write_end(file, mapping, pos, len, copied,
-                                                        page, fsdata);
-        } else {
-                unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-                struct inode *inode = mapping->host;
-                flush_dcache_page(page);
-                ret = aops->commit_write(file, page, offset, offset+len);
-                unlock_page(page);
-                mark_page_accessed(page);
-                page_cache_release(page);
-                if (ret < 0) {
-                        if (pos + len > inode->i_size)
-                                vmtruncate(inode, inode->i_size);
-                } else if (ret > 0)
-                        ret = min_t(size_t, copied, ret);
-                else
-                        ret = copied;
-        }
-        return ret;
+        mark_page_accessed(page);
+        return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
@@ -2226,174 +2163,6 @@ repeat:
 }
 EXPORT_SYMBOL(__grab_cache_page);
-static ssize_t generic_perform_write_2copy(struct file *file,
-                                struct iov_iter *i, loff_t pos)
-{
-        struct address_space *mapping = file->f_mapping;
-        const struct address_space_operations *a_ops = mapping->a_ops;
-        struct inode *inode = mapping->host;
-        long status = 0;
-        ssize_t written = 0;
-        do {
-                struct page *src_page;
-                struct page *page;
-                pgoff_t index;          /* Pagecache index for current page */
-                unsigned long offset;   /* Offset into pagecache page */
-                unsigned long bytes;    /* Bytes to write to page */
-                size_t copied;          /* Bytes copied from user */
-                offset = (pos & (PAGE_CACHE_SIZE - 1));
-                index = pos >> PAGE_CACHE_SHIFT;
-                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-                                                iov_iter_count(i));
-                /*
-                 * a non-NULL src_page indicates that we're doing the
-                 * copy via get_user_pages and kmap.
-                 */
-                src_page = NULL;
-                /*
-                 * Bring in the user page that we will copy from _first_.
-                 * Otherwise there's a nasty deadlock on copying from the
-                 * same page as we're writing to, without it being marked
-                 * up-to-date.
-                 *
-                 * Not only is this an optimisation, but it is also required
-                 * to check that the address is actually valid, when atomic
-                 * usercopies are used, below.
-                 */
-                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-                        status = -EFAULT;
-                        break;
-                }
-                page = __grab_cache_page(mapping, index);
-                if (!page) {
-                        status = -ENOMEM;
-                        break;
-                }
-                /*
-                 * non-uptodate pages cannot cope with short copies, and we
-                 * cannot take a pagefault with the destination page locked.
-                 * So pin the source page to copy it.
-                 */
-                if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
-                        unlock_page(page);
-                        src_page = alloc_page(GFP_KERNEL);
-                        if (!src_page) {
-                                page_cache_release(page);
-                                status = -ENOMEM;
-                                break;
-                        }
-                        /*
-                         * Cannot get_user_pages with a page locked for the
-                         * same reason as we can't take a page fault with a
-                         * page locked (as explained below).
-                         */
-                        copied = iov_iter_copy_from_user(src_page, i,
-                                                                offset, bytes);
-                        if (unlikely(copied == 0)) {
-                                status = -EFAULT;
-                                page_cache_release(page);
-                                page_cache_release(src_page);
-                                break;
-                        }
-                        bytes = copied;
-                        lock_page(page);
-                        /*
-                         * Can't handle the page going uptodate here, because
-                         * that means we would use non-atomic usercopies, which
-                         * zero out the tail of the page, which can cause
-                         * zeroes to become transiently visible. We could just
-                         * use a non-zeroing copy, but the APIs aren't too
-                         * consistent.
-                         */
-                        if (unlikely(!page->mapping || PageUptodate(page))) {
-                                unlock_page(page);
-                                page_cache_release(page);
-                                page_cache_release(src_page);
-                                continue;
-                        }
-                }
-                status = a_ops->prepare_write(file, page, offset, offset+bytes);
-                if (unlikely(status))
-                        goto fs_write_aop_error;
-                if (!src_page) {
-                        /*
-                         * Must not enter the pagefault handler here, because
-                         * we hold the page lock, so we might recursively
-                         * deadlock on the same lock, or get an ABBA deadlock
-                         * against a different lock, or against the mmap_sem
-                         * (which nests outside the page lock).  So increment
-                         * preempt count, and use _atomic usercopies.
-                         *
-                         * The page is uptodate so we are OK to encounter a
-                         * short copy: if unmodified parts of the page are
-                         * marked dirty and written out to disk, it doesn't
-                         * really matter.
-                         */
-                        pagefault_disable();
-                        copied = iov_iter_copy_from_user_atomic(page, i,
-                                                                offset, bytes);
-                        pagefault_enable();
-                } else {
-                        void *src, *dst;
-                        src = kmap_atomic(src_page, KM_USER0);
-                        dst = kmap_atomic(page, KM_USER1);
-                        memcpy(dst + offset, src + offset, bytes);
-                        kunmap_atomic(dst, KM_USER1);
-                        kunmap_atomic(src, KM_USER0);
-                        copied = bytes;
-                }
-                flush_dcache_page(page);
-                status = a_ops->commit_write(file, page, offset, offset+bytes);
-                if (unlikely(status < 0))
-                        goto fs_write_aop_error;
-                if (unlikely(status > 0)) /* filesystem did partial write */
-                        copied = min_t(size_t, copied, status);
-                unlock_page(page);
-                mark_page_accessed(page);
-                page_cache_release(page);
-                if (src_page)
-                        page_cache_release(src_page);
-                iov_iter_advance(i, copied);
-                pos += copied;
-                written += copied;
-                balance_dirty_pages_ratelimited(mapping);
-                cond_resched();
-                continue;
-fs_write_aop_error:
-                unlock_page(page);
-                page_cache_release(page);
-                if (src_page)
-                        page_cache_release(src_page);
-                /*
-                 * prepare_write() may have instantiated a few blocks
-                 * outside i_size.  Trim these off again. Don't need
-                 * i_size_read because we hold i_mutex.
-                 */
-                if (pos + bytes > inode->i_size)
-                        vmtruncate(inode, inode->i_size);
-                break;
-        } while (iov_iter_count(i));
-        return written ? written : status;
-}
 static ssize_t generic_perform_write(struct file *file,
                                struct iov_iter *i, loff_t pos)
 {
@@ -2494,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
        struct iov_iter i;
        iov_iter_init(&i, iov, nr_segs, count, written);
-        if (a_ops->write_begin)
+        status = generic_perform_write(file, &i, pos);
-                status = generic_perform_write(file, &i, pos);
-        else
-                status = generic_perform_write_2copy(file, &i, pos);
        if (likely(status >= 0)) {
                written += status;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 421aee99b84a..6058b53dcb89 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -354,11 +354,26 @@ static int vma_has_reserves(struct vm_area_struct *vma)
        return 0;
 }
+static void clear_gigantic_page(struct page *page,
+                        unsigned long addr, unsigned long sz)
+{
+        int i;
+        struct page *p = page;
+        might_sleep();
+        for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
+                cond_resched();
+                clear_user_highpage(p, addr + i * PAGE_SIZE);
+        }
+}
 static void clear_huge_page(struct page *page,
                        unsigned long addr, unsigned long sz)
 {
        int i;
+        if (unlikely(sz > MAX_ORDER_NR_PAGES))
+                return clear_gigantic_page(page, addr, sz);
        might_sleep();
        for (i = 0; i < sz/PAGE_SIZE; i++) {
                cond_resched();
@@ -366,12 +381,32 @@ static void clear_huge_page(struct page *page,
        }
 }
+static void copy_gigantic_page(struct page *dst, struct page *src,
+                           unsigned long addr, struct vm_area_struct *vma)
+{
+        int i;
+        struct hstate *h = hstate_vma(vma);
+        struct page *dst_base = dst;
+        struct page *src_base = src;
+        might_sleep();
+        for (i = 0; i < pages_per_huge_page(h); ) {
+                cond_resched();
+                copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+                i++;
+                dst = mem_map_next(dst, dst_base, i);
+                src = mem_map_next(src, src_base, i);
+        }
+}
 static void copy_huge_page(struct page *dst, struct page *src,
                           unsigned long addr, struct vm_area_struct *vma)
 {
        int i;
        struct hstate *h = hstate_vma(vma);
+        if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
+                return copy_gigantic_page(dst, src, addr, vma);
        might_sleep();
        for (i = 0; i < pages_per_huge_page(h); i++) {
                cond_resched();
@@ -456,6 +491,8 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
+        VM_BUG_ON(h->order >= MAX_ORDER);
        h->nr_huge_pages--;
        h->nr_huge_pages_node[page_to_nid(page)]--;
        for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -970,6 +1007,14 @@ found:
        return 1;
 }
+static void prep_compound_huge_page(struct page *page, int order)
+{
+        if (unlikely(order > (MAX_ORDER - 1)))
+                prep_compound_gigantic_page(page, order);
+        else
+                prep_compound_page(page, order);
+}
 /* Put bootmem huge pages into the standard lists after mem_map is up */
 static void __init gather_bootmem_prealloc(void)
 {
@@ -980,7 +1025,7 @@ static void __init gather_bootmem_prealloc(void)
                struct hstate *h = m->hstate;
                __ClearPageReserved(page);
                WARN_ON(page_count(page) != 1);
-                prep_compound_page(page, h->order);
+                prep_compound_huge_page(page, h->order);
                prep_new_huge_page(h, page, page_to_nid(page));
        }
 }
@@ -1751,6 +1796,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                                struct page *page, unsigned long address)
 {
+        struct hstate *h = hstate_vma(vma);
        struct vm_area_struct *iter_vma;
        struct address_space *mapping;
        struct prio_tree_iter iter;
@@ -1760,7 +1806,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
         * vm_pgoff is in PAGE_SIZE units, hence the different calculation
         * from page cache lookup which is in HPAGE_SIZE units.
         */
-        address = address & huge_page_mask(hstate_vma(vma));
+        address = address & huge_page_mask(h);
        pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
                + (vma->vm_pgoff >> PAGE_SHIFT);
        mapping = (struct address_space *)page_private(page);
@@ -1779,7 +1825,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
                        unmap_hugepage_range(iter_vma,
-                                address, address + HPAGE_SIZE,
+                                address, address + huge_page_size(h),
                                page);
        }
@@ -2130,7 +2176,7 @@ same_page:
                        if (zeropage_ok)
                                pages[i] = ZERO_PAGE(0);
                        else
-                                pages[i] = page + pfn_offset;
+                                pages[i] = mem_map_offset(page, pfn_offset);
                        get_page(pages[i]);
                }
diff --git a/mm/internal.h b/mm/internal.h
index e4e728bdf324..13333bc2eb68 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -17,6 +17,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
                unsigned long floor, unsigned long ceiling);
 extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
 static inline void set_page_count(struct page *page, int v)
 {
@@ -176,6 +177,34 @@ static inline void free_page_mlock(struct page *page) { }
 #endif /* CONFIG_UNEVICTABLE_LRU */
 /*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'.  Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+        if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+                return pfn_to_page(page_to_pfn(base) + offset);
+        return base + offset;
+}
+/*
+ * Iterator over all subpages within the maximally aligned gigantic
+ * page 'base'.  Handle any discontiguity in the mem_map.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+                                                struct page *base, int offset)
+{
+        if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+                unsigned long pfn = page_to_pfn(base) + offset;
+                if (!pfn_valid(pfn))
+                        return NULL;
+                return pfn_to_page(pfn);
+        }
+        return iter + 1;
+}
+/*
 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
 * so all functions starting at paging_init should be marked __init
 * in those cases. SPARSEMEM, however, allows for memory hotplug,
diff --git a/mm/memory.c b/mm/memory.c
index 164951c47305..f01b7eed6e16 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -669,6 +669,16 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        if (is_vm_hugetlb_page(vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, vma);
+        if (unlikely(is_pfn_mapping(vma))) {
+                /*
+                 * We do not free on error cases below, as remove_vma
+                 * gets called on error from the higher-level routine.
+                 */
+                ret = track_pfn_vma_copy(vma);
+                if (ret)
+                        return ret;
+        }
        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
@@ -915,6 +925,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
                if (vma->vm_flags & VM_ACCOUNT)
                        *nr_accounted += (end - start) >> PAGE_SHIFT;
+                if (unlikely(is_pfn_mapping(vma)))
+                        untrack_pfn_vma(vma, 0, 0);
                while (start != end) {
                        if (!tlb_start_valid) {
                                tlb_start = start;
@@ -1430,6 +1443,7 @@ out:
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
 {
+        int ret;
        /*
         * Technically, architectures with pte_special can avoid all these
         * restrictions (same for remap_pfn_range).  However we would like
@@ -1444,7 +1458,15 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
-        return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+        if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE))
+                return -EINVAL;
+        ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+        if (ret)
+                untrack_pfn_vma(vma, pfn, PAGE_SIZE);
+        return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn);
@@ -1575,14 +1597,17 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         */
-        if (is_cow_mapping(vma->vm_flags)) {
+        if (addr == vma->vm_start && end == vma->vm_end)
-                if (addr != vma->vm_start || end != vma->vm_end)
-                        return -EINVAL;
                vma->vm_pgoff = pfn;
-        }
+        else if (is_cow_mapping(vma->vm_flags))
+                return -EINVAL;
        vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+        err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size));
+        if (err)
+                return -EINVAL;
        BUG_ON(addr >= end);
        pfn -= addr >> PAGE_SHIFT;
        pgd = pgd_offset(mm, addr);
@@ -1594,6 +1619,10 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
+        if (err)
+                untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size));
        return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
@@ -2865,9 +2894,9 @@ int in_gate_area_no_task(unsigned long addr)
 #endif  /* __HAVE_ARCH_GATE_AREA */
 #ifdef CONFIG_HAVE_IOREMAP_PROT
-static resource_size_t follow_phys(struct vm_area_struct *vma,
+int follow_phys(struct vm_area_struct *vma,
-                        unsigned long address, unsigned int flags,
+                unsigned long address, unsigned int flags,
-                        unsigned long *prot)
+                unsigned long *prot, resource_size_t *phys)
 {
        pgd_t *pgd;
        pud_t *pud;
@@ -2876,24 +2905,26 @@ static resource_size_t follow_phys(struct vm_area_struct *vma,
        spinlock_t *ptl;
        resource_size_t phys_addr = 0;
        struct mm_struct *mm = vma->vm_mm;
+        int ret = -EINVAL;
-        VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+                goto out;
        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-                goto no_page_table;
+                goto out;
        pud = pud_offset(pgd, address);
        if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-                goto no_page_table;
+                goto out;
        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-                goto no_page_table;
+                goto out;
        /* We cannot handle huge page PFN maps. Luckily they don't exist. */
        if (pmd_huge(*pmd))
-                goto no_page_table;
+                goto out;
        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!ptep)
@@ -2908,13 +2939,13 @@ static resource_size_t follow_phys(struct vm_area_struct *vma,
        phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
        *prot = pgprot_val(pte_pgprot(pte));
+        *phys = phys_addr;
+        ret = 0;
 unlock:
        pte_unmap_unlock(ptep, ptl);
 out:
-        return phys_addr;
+        return ret;
-no_page_table:
-        return 0;
 }
 int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
@@ -2925,12 +2956,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
        void *maddr;
        int offset = addr & (PAGE_SIZE-1);
-        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+        if (follow_phys(vma, addr, write, &prot, &phys_addr))
-                return -EINVAL;
-        phys_addr = follow_phys(vma, addr, write, &prot);
-        if (!phys_addr)
                return -EINVAL;
        maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6837a1014372..b17371185468 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -22,7 +22,6 @@
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
-#include <linux/cpuset.h>
 #include <linux/delay.h>
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
@@ -190,7 +189,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
                                        pgdat->node_start_pfn;
 }
-static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
 {
        struct pglist_data *pgdat = zone->zone_pgdat;
        int nr_pages = PAGES_PER_SECTION;
@@ -217,7 +216,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
        return 0;
 }
-static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
+static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn)
 {
        int nr_pages = PAGES_PER_SECTION;
        int ret;
@@ -274,7 +273,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
 * call this function after deciding the zone to which to
 * add the new pages.
 */
-int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
+int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
                 unsigned long nr_pages)
 {
        unsigned long i;
@@ -471,7 +470,8 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
 }
-int add_memory(int nid, u64 start, u64 size)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+int __ref add_memory(int nid, u64 start, u64 size)
 {
        pg_data_t *pgdat = NULL;
        int new_pgdat = 0;
@@ -498,8 +498,6 @@ int add_memory(int nid, u64 start, u64 size)
        /* we online node here. we can't roll back from here. */
        node_set_online(nid);
-        cpuset_track_online_nodes();
        if (new_pgdat) {
                ret = register_one_node(nid);
                /*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 36f42573a335..e9493b1c1117 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -489,12 +489,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
        int err;
        struct vm_area_struct *first, *vma, *prev;
-        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-                err = migrate_prep();
-                if (err)
-                        return ERR_PTR(err);
-        }
        first = find_vma(mm, start);
        if (!first)
@@ -809,9 +803,13 @@ int do_migrate_pages(struct mm_struct *mm,
        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 {
        int busy = 0;
-        int err = 0;
+        int err;
        nodemask_t tmp;
+        err = migrate_prep();
+        if (err)
+                return err;
        down_read(&mm->mmap_sem);
        err = migrate_vmas(mm, from_nodes, to_nodes, flags);
@@ -974,6 +972,12 @@ static long do_mbind(unsigned long start, unsigned long len,
                 start, start + len, mode, mode_flags,
                 nmask ? nodes_addr(*nmask)[0] : -1);
+        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+                err = migrate_prep();
+                if (err)
+                        return err;
+        }
        down_write(&mm->mmap_sem);
        vma = check_range(mm, start, end, nmask,
                          flags | MPOL_MF_INVERT, &pagelist);
diff --git a/mm/migrate.c b/mm/migrate.c
index 6602941bfab0..037b0967c1e3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -522,15 +522,12 @@ static int writeout(struct address_space *mapping, struct page *page)
        remove_migration_ptes(page, page);
        rc = mapping->a_ops->writepage(page, &wbc);
-        if (rc < 0)
-                /* I/O Error writing */
-                return -EIO;
        if (rc != AOP_WRITEPAGE_ACTIVATE)
                /* unlocked. Relock */
                lock_page(page);
-        return -EAGAIN;
+        return (rc < 0) ? -EIO : -EAGAIN;
 }
 /*
@@ -841,12 +838,12 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
        struct page_to_node *pp;
        LIST_HEAD(pagelist);
+        migrate_prep();
        down_read(&mm->mmap_sem);
        /*
         * Build a list of pages to migrate
         */
-        migrate_prep();
        for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
                struct vm_area_struct *vma;
                struct page *page;
@@ -990,25 +987,18 @@ out:
 /*
 * Determine the nodes of an array of pages and store it in an array of status.
 */
-static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
+static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
-                         const void __user * __user *pages,
+                                const void __user **pages, int *status)
-                         int __user *status)
 {
        unsigned long i;
-        int err;
        down_read(&mm->mmap_sem);
        for (i = 0; i < nr_pages; i++) {
-                const void __user *p;
+                unsigned long addr = (unsigned long)(*pages);
-                unsigned long addr;
                struct vm_area_struct *vma;
                struct page *page;
+                int err = -EFAULT;
-                err = -EFAULT;
-                if (get_user(p, pages+i))
-                        goto out;
-                addr = (unsigned long) p;
                vma = find_vma(mm, addr);
                if (!vma)
@@ -1027,12 +1017,52 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
                err = page_to_nid(page);
 set_status:
-                put_user(err, status+i);
+                *status = err;
+                pages++;
+                status++;
+        }
+        up_read(&mm->mmap_sem);
+}
+/*
+ * Determine the nodes of a user array of pages and store it in
+ * a user array of status.
+ */
+static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
+                         const void __user * __user *pages,
+                         int __user *status)
+{
+#define DO_PAGES_STAT_CHUNK_NR 16
+        const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR];
+        int chunk_status[DO_PAGES_STAT_CHUNK_NR];
+        unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR;
+        int err;
+        for (i = 0; i < nr_pages; i += chunk_nr) {
+                if (chunk_nr + i > nr_pages)
+                        chunk_nr = nr_pages - i;
+                err = copy_from_user(chunk_pages, &pages[i],
+                                     chunk_nr * sizeof(*chunk_pages));
+                if (err) {
+                        err = -EFAULT;
+                        goto out;
+                }
+                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
+                err = copy_to_user(&status[i], chunk_status,
+                                   chunk_nr * sizeof(*chunk_status));
+                if (err) {
+                        err = -EFAULT;
+                        goto out;
+                }
        }
        err = 0;
 out:
-        up_read(&mm->mmap_sem);
        return err;
 }
diff --git a/mm/mlock.c b/mm/mlock.c
index 008ea70b7afa..1ada366570cb 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -66,14 +66,10 @@ void __clear_page_mlock(struct page *page)
                putback_lru_page(page);
        } else {
                /*
-                 * Page not on the LRU yet.  Flush all pagevecs and retry.
+                 * We lost the race: the page has already moved to an evictable list.
                 */
-                lru_add_drain_all();
+                if (PageUnevictable(page))
-                if (!isolate_lru_page(page))
-                        putback_lru_page(page);
-                else if (PageUnevictable(page))
                        count_vm_event(UNEVICTABLE_PGSTRANDED);
        }
 }
@@ -166,7 +162,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
        unsigned long addr = start;
        struct page *pages[16]; /* 16 gives a reasonable batch */
        int nr_pages = (end - start) / PAGE_SIZE;
-        int ret;
+        int ret = 0;
        int gup_flags = 0;
        VM_BUG_ON(start & ~PAGE_MASK);
@@ -187,8 +183,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
        if (vma->vm_flags & VM_WRITE)
                gup_flags |= GUP_FLAGS_WRITE;
-        lru_add_drain_all();    /* push cached pages to LRU */
        while (nr_pages > 0) {
                int i;
@@ -251,8 +245,6 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
                ret = 0;
        }
-        lru_add_drain_all();    /* to update stats */
        return ret;     /* count entire vma as locked_vm */
 }
@@ -546,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
        if (!can_do_mlock())
                return -EPERM;
+        lru_add_drain_all();    /* flush pagevec */
        down_write(&current->mm->mmap_sem);
        len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
        start &= PAGE_MASK;
@@ -612,6 +606,8 @@ asmlinkage long sys_mlockall(int flags)
        if (!can_do_mlock())
                goto out;
+        lru_add_drain_all();    /* flush pagevec */
        down_write(&current->mm->mmap_sem);
        lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
diff --git a/mm/mmap.c b/mm/mmap.c
index 74f4d158022e..d4855a682ab6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -175,7 +175,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
        /* Don't let a single process grow too big:
           leave 3% of the size of this process for other processes */
-        allowed -= mm->total_vm / 32;
+        if (mm)
+                allowed -= mm->total_vm / 32;
        /*
         * cast `allowed' as a signed long because vm_committed_space
@@ -1703,7 +1704,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
        vma = find_vma_prev(mm, addr, &prev);
        if (vma && (vma->vm_start <= addr))
                return vma;
-        if (expand_stack(prev, addr))
+        if (!prev || expand_stack(prev, addr))
                return NULL;
        if (prev->vm_flags & VM_LOCKED) {
                if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
diff --git a/mm/nommu.c b/mm/nommu.c
index 2696b24f2bb3..7695dc850785 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1454,7 +1454,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
        /* Don't let a single process grow too big:
           leave 3% of the size of this process for other processes */
-        allowed -= current->mm->total_vm / 32;
+        if (mm)
+                allowed -= mm->total_vm / 32;
        /*
         * cast `allowed' as a signed long because vm_committed_space
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 64e5b4bcd964..a0a01902f551 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -38,7 +38,6 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
 * badness - calculate a numeric value for how bad this task has been
 * @p: task struct of which task we should calculate
 * @uptime: current uptime in seconds
- * @mem: target memory controller
 *
 * The formula used is relatively simple and documented inline in the
 * function. The main rationale is that we want to select a good task
@@ -295,6 +294,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
                        continue;
                if (mem && !task_in_mem_cgroup(p, mem))
                        continue;
+                if (!thread_group_leader(p))
+                        continue;
                task_lock(p);
                printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d     %3d %s\n",
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d0a240fbb8bf..d8ac01474563 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -263,24 +263,39 @@ void prep_compound_page(struct page *page, unsigned long order)
 {
        int i;
        int nr_pages = 1 << order;
+        set_compound_page_dtor(page, free_compound_page);
+        set_compound_order(page, order);
+        __SetPageHead(page);
+        for (i = 1; i < nr_pages; i++) {
+                struct page *p = page + i;
+                __SetPageTail(p);
+                p->first_page = page;
+        }
+}
+#ifdef CONFIG_HUGETLBFS
+void prep_compound_gigantic_page(struct page *page, unsigned long order)
+{
+        int i;
+        int nr_pages = 1 << order;
        struct page *p = page + 1;
        set_compound_page_dtor(page, free_compound_page);
        set_compound_order(page, order);
        __SetPageHead(page);
-        for (i = 1; i < nr_pages; i++, p++) {
+        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
-                if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
-                        p = pfn_to_page(page_to_pfn(page) + i);
                __SetPageTail(p);
                p->first_page = page;
        }
 }
+#endif
 static void destroy_compound_page(struct page *page, unsigned long order)
 {
        int i;
        int nr_pages = 1 << order;
-        struct page *p = page + 1;
        if (unlikely(compound_order(page) != order))
                bad_page(page);
@@ -288,9 +303,8 @@ static void destroy_compound_page(struct page *page, unsigned long order)
        if (unlikely(!PageHead(page)))
                        bad_page(page);
        __ClearPageHead(page);
-        for (i = 1; i < nr_pages; i++, p++) {
+        for (i = 1; i < nr_pages; i++) {
-                if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
+                struct page *p = page + i;
-                        p = pfn_to_page(page_to_pfn(page) + i);
                if (unlikely(!PageTail(p) |
                                (p->first_page != page)))
@@ -1547,6 +1561,10 @@ nofail_alloc:
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
+        /*
+         * The task's cpuset might have expanded its set of allowable nodes
+         */
+        cpuset_update_task_memory_state();
        p->flags |= PF_MEMALLOC;
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index f59d797dc5a9..ab27ff750519 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -21,7 +21,7 @@ static unsigned long total_usage;
 #if !defined(CONFIG_SPARSEMEM)
-void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 {
        pgdat->node_page_cgroup = NULL;
 }
@@ -49,6 +49,9 @@ static int __init alloc_node_page_cgroup(int nid)
        start_pfn = NODE_DATA(nid)->node_start_pfn;
        nr_pages = NODE_DATA(nid)->node_spanned_pages;
+        if (!nr_pages)
+                return 0;
        table_size = sizeof(struct page_cgroup) * nr_pages;
        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
@@ -97,7 +100,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
        return section->page_cgroup + pfn;
 }
-int __meminit init_section_page_cgroup(unsigned long pfn)
+/* __alloc_bootmem...() is protected by !slab_is_available() */
+int __init_refok init_section_page_cgroup(unsigned long pfn)
 {
        struct mem_section *section;
        struct page_cgroup *base, *pc;
@@ -106,19 +110,29 @@ int __meminit init_section_page_cgroup(unsigned long pfn)
        section = __pfn_to_section(pfn);
-        if (section->page_cgroup)
+        if (!section->page_cgroup) {
-                return 0;
+                nid = page_to_nid(pfn_to_page(pfn));
+                table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
-        nid = page_to_nid(pfn_to_page(pfn));
+                if (slab_is_available()) {
+                        base = kmalloc_node(table_size, GFP_KERNEL, nid);
-        table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
+                        if (!base)
-        if (slab_is_available()) {
+                                base = vmalloc_node(table_size, nid);
-                base = kmalloc_node(table_size, GFP_KERNEL, nid);
+                } else {
-                if (!base)
+                        base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
-                        base = vmalloc_node(table_size, nid);
+                                table_size,
-        } else {
-                base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
                                PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+                }
+        } else {
+                /*
+                 * We don't have to allocate page_cgroup again, but the
+                 * address of the memmap may have changed, so we have to
+                 * initialize it again.
+                 */
+                base = section->page_cgroup + pfn;
+                table_size = 0;
+                /* Check whether the address of the memmap has changed. */
+                if (base->page == pfn_to_page(pfn))
+                        return 0;
        }
        if (!base) {
@@ -158,14 +172,14 @@ void __free_page_cgroup(unsigned long pfn)
        }
 }
-int online_page_cgroup(unsigned long start_pfn,
+int __meminit online_page_cgroup(unsigned long start_pfn,
                        unsigned long nr_pages,
                        int nid)
 {
        unsigned long start, end, pfn;
        int fail = 0;
-        start = start_pfn & (PAGES_PER_SECTION - 1);
+        start = start_pfn & ~(PAGES_PER_SECTION - 1);
        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
        for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
@@ -183,12 +197,12 @@ int online_page_cgroup(unsigned long start_pfn,
        return -ENOMEM;
 }
-int offline_page_cgroup(unsigned long start_pfn,
+int __meminit offline_page_cgroup(unsigned long start_pfn,
                unsigned long nr_pages, int nid)
 {
        unsigned long start, end, pfn;
-        start = start_pfn & (PAGES_PER_SECTION - 1);
+        start = start_pfn & ~(PAGES_PER_SECTION - 1);
        end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
@@ -197,7 +211,7 @@ int offline_page_cgroup(unsigned long start_pfn,
 }
-static int page_cgroup_callback(struct notifier_block *self,
+static int __meminit page_cgroup_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
 {
        struct memory_notify *mn = arg;
@@ -207,18 +221,23 @@ static int page_cgroup_callback(struct notifier_block *self,
                ret = online_page_cgroup(mn->start_pfn,
                                   mn->nr_pages, mn->status_change_nid);
                break;
-        case MEM_CANCEL_ONLINE:
        case MEM_OFFLINE:
                offline_page_cgroup(mn->start_pfn,
                                mn->nr_pages, mn->status_change_nid);
                break;
+        case MEM_CANCEL_ONLINE:
        case MEM_GOING_OFFLINE:
                break;
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
                break;
        }
-        ret = notifier_from_errno(ret);
+        if (ret)
+                ret = notifier_from_errno(ret);
+        else
+                ret = NOTIFY_OK;
        return ret;
 }
@@ -248,7 +267,7 @@ void __init page_cgroup_init(void)
        " want\n");
 }
-void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
+void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
 {
        return;
 }
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index b70a7fec1ff6..5e0ffd967452 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -130,10 +130,11 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
                if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
                        break;
        }
-        if (pfn < end_pfn)
+        page = __first_valid_page(start_pfn, end_pfn - start_pfn);
+        if ((pfn < end_pfn) || !page)
                return -EBUSY;
        /* Check all pages are free or Marked as ISOLATED */
-        zone = page_zone(pfn_to_page(pfn));
+        zone = page_zone(page);
        spin_lock_irqsave(&zone->lock, flags);
        ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
        spin_unlock_irqrestore(&zone->lock, flags);
diff --git a/mm/shmem.c b/mm/shmem.c
index d38d7e61fcd0..0ed075215e5f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -161,8 +161,8 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
 */
 static inline int shmem_acct_size(unsigned long flags, loff_t size)
 {
-        return (flags & VM_ACCOUNT)?
+        return (flags & VM_ACCOUNT) ?
-                security_vm_enough_memory(VM_ACCT(size)): 0;
+                security_vm_enough_memory_kern(VM_ACCT(size)) : 0;
 }
 static inline void shmem_unacct_size(unsigned long flags, loff_t size)
@@ -179,8 +179,8 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
 */
 static inline int shmem_acct_block(unsigned long flags)
 {
-        return (flags & VM_ACCOUNT)?
+        return (flags & VM_ACCOUNT) ?
-                0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE));
+                0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE));
 }
 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
diff --git a/mm/slob.c b/mm/slob.c
index cb675d126791..bf7e8fc3aed8 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -535,7 +535,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
        struct kmem_cache *c;
        c = slob_alloc(sizeof(struct kmem_cache),
-                flags, ARCH_KMALLOC_MINALIGN, -1);
+                GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1);
        if (c) {
                c->name = name;
diff --git a/mm/slub.c b/mm/slub.c
index 7ad489af9561..a2cd47d89e0a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2931,8 +2931,10 @@ static int slab_memory_callback(struct notifier_block *self,
        case MEM_CANCEL_OFFLINE:
                break;
        }
+        if (ret)
-        ret = notifier_from_errno(ret);
+                ret = notifier_from_errno(ret);
+        else
+                ret = NOTIFY_OK;
        return ret;
 }
@@ -3595,7 +3597,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
        for (i = 0; i < t.count; i++) {
                struct location *l = &t.loc[i];
-                if (len > PAGE_SIZE - 100)
+                if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
                        break;
                len += sprintf(buf + len, "%7ld ", l->count);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a91b5f8fcaf6..a13ea6401ae7 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -64,7 +64,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
        unsigned long pfn = pte_pfn(*pte);
        int actual_node = early_pfn_to_nid(pfn);
-        if (actual_node != node)
+        if (node_distance(actual_node, node) > LOCAL_DISTANCE)
                printk(KERN_WARNING "[%lx-%lx] potential offnode "
                        "page_structs\n", start, end - 1);
 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 39db301b920d..083f5b63e7a8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -570,7 +570,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
 * set.  If this is <=0, then that means that the passed-in
 * map was not consumed and must be freed.
 */
-int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
                           int nr_pages)
 {
        unsigned long section_nr = pfn_to_section_nr(start_pfn);
diff --git a/mm/swap.c b/mm/swap.c
index 2152e48a7b8f..b135ec90cdeb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -299,7 +299,6 @@ void lru_add_drain(void)
        put_cpu();
 }
-#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
        lru_add_drain();
@@ -313,18 +312,6 @@ int lru_add_drain_all(void)
        return schedule_on_each_cpu(lru_add_drain_per_cpu);
 }
-#else
-/*
- * Returns 0 for success
- */
-int lru_add_drain_all(void)
-{
-        lru_add_drain();
-        return 0;
-}
-#endif
 /*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
@@ -445,6 +432,7 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);
+                int file;
                if (pagezone != zone) {
                        if (zone)
@@ -456,8 +444,12 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
                VM_BUG_ON(PageUnevictable(page));
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
-                if (is_active_lru(lru))
+                file = is_file_lru(lru);
+                zone->recent_scanned[file]++;
+                if (is_active_lru(lru)) {
                        SetPageActive(page);
+                        zone->recent_rotated[file]++;
+                }
                add_page_to_lru_list(zone, page, lru);
        }
        if (zone)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 90cb67a5417c..54a9f87e5162 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1462,6 +1462,15 @@ static int __init procswaps_init(void)
 __initcall(procswaps_init);
 #endif /* CONFIG_PROC_FS */
+#ifdef MAX_SWAPFILES_CHECK
+static int __init max_swapfiles_check(void)
+{
+        MAX_SWAPFILES_CHECK();
+        return 0;
+}
+late_initcall(max_swapfiles_check);
+#endif
 /*
 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
 *
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 036536945dd9..1ddb77ba3995 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -77,7 +77,6 @@ static void vunmap_page_range(unsigned long addr, unsigned long end)
        BUG_ON(addr >= end);
        pgd = pgd_offset_k(addr);
-        flush_cache_vunmap(addr, end);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
@@ -178,7 +177,7 @@ static int vmap_page_range(unsigned long addr, unsigned long end,
 static inline int is_vmalloc_or_module_addr(const void *x)
 {
        /*
-         * x86-64 and sparc64 put modules in a special place,
+         * ARM, x86-64 and sparc64 put modules in a special place,
         * and fall back on vmalloc() if that fails. Others
         * just put it in the vmalloc space.
         */
@@ -324,14 +323,14 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
        BUG_ON(size & ~PAGE_MASK);
-        addr = ALIGN(vstart, align);
        va = kmalloc_node(sizeof(struct vmap_area),
                        gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!va))
                return ERR_PTR(-ENOMEM);
 retry:
+        addr = ALIGN(vstart, align);
        spin_lock(&vmap_area_lock);
        /* XXX: could have a last_hole cache */
        n = vmap_area_root.rb_node;
@@ -362,7 +361,7 @@ retry:
                                goto found;
                }
-                while (addr + size >= first->va_start && addr + size <= vend) {
+                while (addr + size > first->va_start && addr + size <= vend) {
                        addr = ALIGN(first->va_end + PAGE_SIZE, align);
                        n = rb_next(&first->rb_node);
@@ -522,24 +521,45 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 }
 /*
+ * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
+ * is already purging.
+ */
+static void try_purge_vmap_area_lazy(void)
+{
+        unsigned long start = ULONG_MAX, end = 0;
+        __purge_vmap_area_lazy(&start, &end, 0, 0);
+}
+/*
 * Kick off a purge of the outstanding lazy areas.
 */
 static void purge_vmap_area_lazy(void)
 {
        unsigned long start = ULONG_MAX, end = 0;
-        __purge_vmap_area_lazy(&start, &end, 0, 0);
+        __purge_vmap_area_lazy(&start, &end, 1, 0);
 }
 /*
- * Free and unmap a vmap area
+ * Free and unmap a vmap area; the caller must ensure flush_cache_vunmap has
+ * already been called for the correct range.
 */
-static void free_unmap_vmap_area(struct vmap_area *va)
+static void free_unmap_vmap_area_noflush(struct vmap_area *va)
 {
        va->flags |= VM_LAZY_FREE;
        atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
        if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
-                purge_vmap_area_lazy();
+                try_purge_vmap_area_lazy();
+}
+/*
+ * Free and unmap a vmap area
+ */
+static void free_unmap_vmap_area(struct vmap_area *va)
+{
+        flush_cache_vunmap(va->va_start, va->va_end);
+        free_unmap_vmap_area_noflush(va);
 }
 static struct vmap_area *find_vmap_area(unsigned long addr)
@@ -592,6 +612,8 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
 #define VMAP_BLOCK_SIZE         (VMAP_BBMAP_BITS * PAGE_SIZE)
+static bool vmap_initialized __read_mostly = false;
 struct vmap_block_queue {
        spinlock_t lock;
        struct list_head free;
@@ -721,7 +743,7 @@ static void free_vmap_block(struct vmap_block *vb)
        spin_unlock(&vmap_block_tree_lock);
        BUG_ON(tmp != vb);
-        free_unmap_vmap_area(vb->va);
+        free_unmap_vmap_area_noflush(vb->va);
        call_rcu(&vb->rcu_head, rcu_free_vb);
 }
@@ -783,6 +805,9 @@ static void vb_free(const void *addr, unsigned long size)
        BUG_ON(size & ~PAGE_MASK);
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
+        flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
        order = get_order(size);
        offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
@@ -828,6 +853,9 @@ void vm_unmap_aliases(void)
        int cpu;
        int flush = 0;
+        if (unlikely(!vmap_initialized))
+                return;
        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vmap_block *vb;
@@ -897,7 +925,8 @@ EXPORT_SYMBOL(vm_unmap_ram);
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
- * @returns: a pointer to the address that has been mapped, or NULL on failure
+ *
+ * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
 void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
 {
@@ -941,6 +970,8 @@ void __init vmalloc_init(void)
                INIT_LIST_HEAD(&vbq->dirty);
                vbq->nr_dirty = 0;
        }
+        vmap_initialized = true;
 }
 void unmap_kernel_range(unsigned long addr, unsigned long size)
@@ -1686,7 +1717,7 @@ static int s_show(struct seq_file *m, void *p)
                v->addr, v->addr + v->size, v->size);
        if (v->caller) {
-                char buff[2 * KSYM_NAME_LEN];
+                char buff[KSYM_SYMBOL_LEN];
                seq_putc(m, ' ');
                sprint_symbol(buff, (unsigned long)v->caller);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3b5860294bb6..62e7f62fb559 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -623,6 +623,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 * Try to allocate it some swap space here.
                 */
                if (PageAnon(page) && !PageSwapCache(page)) {
+                        if (!(sc->gfp_mask & __GFP_IO))
+                                goto keep_locked;
                        switch (try_to_munlock(page)) {
                        case SWAP_FAIL:         /* shouldn't happen */
                        case SWAP_AGAIN:
@@ -634,6 +636,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        }
                        if (!add_to_swap(page, GFP_ATOMIC))
                                goto activate_locked;
+                        may_enter_fs = 1;
                }
 #endif /* CONFIG_SWAP */
@@ -1245,6 +1248,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                list_add(&page->lru, &l_inactive);
        }
+        spin_lock_irq(&zone->lru_lock);
        /*
         * Count referenced pages from currently used mappings as
         * rotated, even though they are moved to the inactive list.
@@ -1260,7 +1264,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        pgmoved = 0;
        lru = LRU_BASE + file * LRU_FILE;
-        spin_lock_irq(&zone->lru_lock);
        while (!list_empty(&l_inactive)) {
                page = lru_to_page(&l_inactive);
                prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -1386,9 +1389,9 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
        file_prio = 200 - sc->swappiness;
        /*
-         *                  anon       recent_rotated[0]
+         * The amount of pressure on anon vs file pages is inversely
-         * %anon = 100 * ----------- / ----------------- * IO cost
+         * proportional to the fraction of recently scanned pages on
-         *               anon + file      rotate_sum
+         * each list that were recently referenced and in active use.
         */
        ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
        ap /= zone->recent_rotated[0] + 1;
@@ -2368,39 +2371,6 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
        return 1;
 }
-static void show_page_path(struct page *page)
-{
-        char buf[256];
-        if (page_is_file_cache(page)) {
-                struct address_space *mapping = page->mapping;
-                struct dentry *dentry;
-                pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-                spin_lock(&mapping->i_mmap_lock);
-                dentry = d_find_alias(mapping->host);
-                printk(KERN_INFO "rescued: %s %lu\n",
-                       dentry_path(dentry, buf, 256), pgoff);
-                spin_unlock(&mapping->i_mmap_lock);
-        } else {
-#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU)
-                struct anon_vma *anon_vma;
-                struct vm_area_struct *vma;
-                anon_vma = page_lock_anon_vma(page);
-                if (!anon_vma)
-                        return;
-                list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-                        printk(KERN_INFO "rescued: anon %s\n",
-                               vma->vm_mm->owner->comm);
-                        break;
-                }
-                page_unlock_anon_vma(anon_vma);
-#endif
-        }
-}
 /**
 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
 * @page: page to check evictability and move to appropriate lru list
@@ -2421,8 +2391,6 @@ retry:
        if (page_evictable(page, NULL)) {
                enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
-                show_page_path(page);
                __dec_zone_state(zone, NR_UNEVICTABLE);
                list_move(&page->lru, &zone->lru[l].list);
                __inc_zone_state(zone, NR_INACTIVE_ANON + l);