Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c      2
-rw-r--r--  mm/mempolicy.c    2
-rw-r--r--  mm/mmap.c         6
-rw-r--r--  mm/nommu.c        2
-rw-r--r--  mm/page_alloc.c  13
-rw-r--r--  mm/slab.c        14
-rw-r--r--  mm/slub.c        15
-rw-r--r--  mm/sparse.c      14
-rw-r--r--  mm/vmscan.c      69
9 files changed, 113 insertions, 24 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d7ca59d66c59..de4cf458d6e1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -643,7 +643,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			spin_unlock(&mm->page_table_lock);
 			ret = hugetlb_fault(mm, vma, vaddr, 0);
 			spin_lock(&mm->page_table_lock);
-			if (!(ret & VM_FAULT_MAJOR))
+			if (!(ret & VM_FAULT_ERROR))
 				continue;
 
 			remainder = 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71b84b45154a..172abffeb2e3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -149,7 +149,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
 	   lower zones etc. Avoid empty zones because the memory allocator
 	   doesn't like them. If you implement node hot removal you
 	   have to fix that. */
-	k = policy_zone;
+	k = MAX_NR_ZONES - 1;
 	while (1) {
 		for_each_node_mask(nd, *nodes) {
 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
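
The mempolicy.c change starts the zone walk at MAX_NR_ZONES - 1 instead of policy_zone, so every zone index from the top down is considered, with empty zones skipped as the comment above asks. For a single node (ignoring the per-node iteration in bind_zonelist()), the walk has roughly this shape; struct zone and collect_zones() here are invented, simplified stand-ins, not the kernel's types:

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's per-node zone array; only the
 * "is this zone empty?" question matters for the illustration. */
#define MAX_NR_ZONES 4

struct zone { unsigned long present_pages; };

/* Walk one node's zones from the highest index down, keeping only the
 * populated ones. */
static int collect_zones(const struct zone *node_zones,
                         const struct zone **out)
{
    int n = 0;

    for (int k = MAX_NR_ZONES - 1; k >= 0; k--) {
        if (node_zones[k].present_pages)     /* avoid empty zones */
            out[n++] = &node_zones[k];
    }
    return n;
}

int main(void)
{
    /* A node with two populated lower zones and two empty upper zones. */
    struct zone zones[MAX_NR_ZONES] = { {100}, {200}, {0}, {0} };
    const struct zone *picked[MAX_NR_ZONES];

    printf("%d zones picked\n", collect_zones(zones, picked));
    return 0;
}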
diff --git a/mm/mmap.c b/mm/mmap.c
index b6537211b9cc..0d40e66c841b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -93,7 +93,7 @@ atomic_t vm_committed_space = ATOMIC_INIT(0);
  * Note this is a helper function intended to be used by LSMs which
  * wish to use this logic.
  */
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed;
 
@@ -166,7 +166,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
 
 	/* Don't let a single process grow too big:
 	   leave 3% of the size of this process for other processes */
-	allowed -= current->mm->total_vm / 32;
+	allowed -= mm->total_vm / 32;
 
 	/*
 	 * cast `allowed' as a signed long because vm_committed_space
@@ -2077,7 +2077,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 	if (__vma && __vma->vm_start < vma->vm_end)
 		return -ENOMEM;
 	if ((vma->vm_flags & VM_ACCOUNT) &&
-	     security_vm_enough_memory(vma_pages(vma)))
+	     security_vm_enough_memory_mm(mm, vma_pages(vma)))
 		return -ENOMEM;
 	vma_link(mm, vma, prev, rb_link, rb_parent);
 	return 0;
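
The __vm_enough_memory() hunks thread the mm_struct in explicitly instead of reaching for current->mm, and the overcommit heuristic keeps reserving about 3% of the process's own size by dividing total_vm by 32. A throwaway userspace sketch of just that arithmetic (apply_process_reserve() is a made-up name, not a kernel function):

#include <stdio.h>

/* The "leave 3% for other processes" step, pulled out on its own.
 * total_vm is in pages; dividing by 32 reserves roughly 3.1% of the
 * process's own size. */
static unsigned long apply_process_reserve(unsigned long allowed,
                                           unsigned long total_vm)
{
    return allowed - total_vm / 32;
}

int main(void)
{
    /* e.g. a 1,000,000-page process gives back 31,250 pages (~3.1%). */
    printf("%lu\n", apply_process_reserve(2000000UL, 1000000UL));
    return 0;
}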
diff --git a/mm/nommu.c b/mm/nommu.c
index 9eef6a398555..8ed0cb43118a 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1270,7 +1270,7 @@ EXPORT_SYMBOL(get_unmapped_area);
  * Note this is a helper function intended to be used by LSMs which
  * wish to use this logic.
  */
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3da85b81dabb..6427653023aa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1157,6 +1157,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
+	enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
 
 zonelist_scan:
 	/*
@@ -1166,6 +1167,18 @@ zonelist_scan:
 	z = zonelist->zones;
 
 	do {
+		/*
+		 * In NUMA, this could be a policy zonelist which contains
+		 * zones that may not be allowed by the current gfp_mask.
+		 * Check the zone is allowed by the current flags
+		 */
+		if (unlikely(alloc_should_filter_zonelist(zonelist))) {
+			if (highest_zoneidx == -1)
+				highest_zoneidx = gfp_zone(gfp_mask);
+			if (zone_idx(*z) > highest_zoneidx)
+				continue;
+		}
+
 		if (NUMA_BUILD && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
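
The new block in get_page_from_freelist() only runs for zonelists that need filtering, and it computes gfp_zone(gfp_mask) lazily into highest_zoneidx the first time a filtered zone is seen, skipping any zone whose index is above that limit. A compilable toy version of the same skip-too-high-zones loop, with zone indices as plain ints and all names invented:

#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, NR_ZONES };

/* Plays the role of gfp_zone(gfp_mask): the highest zone index the
 * caller's flags permit. */
static int highest_allowed_idx(void)
{
    return ZONE_NORMAL;          /* pretend the flags forbid highmem */
}

static void scan_zonelist(const int *zones, int nr, int needs_filtering)
{
    int highest_zoneidx = -1;    /* computed lazily, as in the hunk above */

    for (int i = 0; i < nr; i++) {
        if (needs_filtering) {
            if (highest_zoneidx == -1)
                highest_zoneidx = highest_allowed_idx();
            if (zones[i] > highest_zoneidx)
                continue;        /* zone not allowed by these flags */
        }
        printf("trying zone index %d\n", zones[i]);
    }
}

int main(void)
{
    const int policy_zonelist[] = { ZONE_HIGHMEM, ZONE_NORMAL, ZONE_DMA };

    scan_zonelist(policy_zonelist, 3, 1);
    return 0;
}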
diff --git a/mm/slab.c b/mm/slab.c
index a684778b2b41..6f6abef83a1a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -883,6 +883,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
  */
 
 static int use_alien_caches __read_mostly = 1;
+static int numa_platform __read_mostly = 1;
 static int __init noaliencache_setup(char *s)
 {
 	use_alien_caches = 0;
@@ -1399,8 +1400,10 @@ void __init kmem_cache_init(void)
 	int order;
 	int node;
 
-	if (num_possible_nodes() == 1)
+	if (num_possible_nodes() == 1) {
 		use_alien_caches = 0;
+		numa_platform = 0;
+	}
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
@@ -3558,7 +3561,14 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 	check_irq_off();
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
-	if (cache_free_alien(cachep, objp))
+	/*
+	 * Skip calling cache_free_alien() when the platform is not numa.
+	 * This will avoid cache misses that happen while accessing slabp (which
+	 * is per page memory reference) to get nodeid. Instead use a global
+	 * variable to skip the call, which is mostly likely to be present in
+	 * the cache.
+	 */
+	if (numa_platform && cache_free_alien(cachep, objp))
 		return;
 
 	if (likely(ac->avail < ac->limit)) {
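
As the added comment says, numa_platform exists so that on non-NUMA machines the free path never calls cache_free_alien(), and therefore never touches per-page slab metadata just to learn a node id; the && short-circuits on a single hot, read-mostly global instead. A userspace sketch of that short-circuit pattern (free_to_alien_cache() is an invented stand-in for cache_free_alien()):

#include <stdbool.h>
#include <stdio.h>

/* Set once at init time, read on every free. */
static bool numa_platform = true;

/* Pretend this is the expensive path: it has to inspect per-object
 * metadata to find the owning node, which is what the patch avoids on
 * single-node machines. */
static bool free_to_alien_cache(void *obj)
{
    printf("looked up node for %p\n", obj);
    return false;                /* object belongs to the local node */
}

static void cache_free(void *obj)
{
    /* && short-circuits: when !numa_platform the lookup never runs. */
    if (numa_platform && free_to_alien_cache(obj))
        return;
    printf("freed %p to the local array cache\n", obj);
}

int main(void)
{
    int x;

    numa_platform = false;       /* e.g. only one possible node */
    cache_free(&x);
    return 0;
}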
diff --git a/mm/slub.c b/mm/slub.c
index 69d02e3e439e..04151da399c6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1877,9 +1877,16 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
 
 	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
 
-	page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
+	page = new_slab(kmalloc_caches, gfpflags, node);
 
 	BUG_ON(!page);
+	if (page_to_nid(page) != node) {
+		printk(KERN_ERR "SLUB: Unable to allocate memory from "
+				"node %d\n", node);
+		printk(KERN_ERR "SLUB: Allocating a useless per node structure "
+				"in order to be able to continue\n");
+	}
+
 	n = page->freelist;
 	BUG_ON(!n);
 	page->freelist = get_freepointer(kmalloc_caches, n);
@@ -3112,7 +3119,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
 		unsigned long flags;
 		struct page *page;
 
-		if (!atomic_read(&n->nr_slabs))
+		if (!atomic_long_read(&n->nr_slabs))
 			continue;
 
 		spin_lock_irqsave(&n->list_lock, flags);
@@ -3247,7 +3254,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
 		}
 
 		if (flags & SO_FULL) {
-			int full_slabs = atomic_read(&n->nr_slabs)
+			int full_slabs = atomic_long_read(&n->nr_slabs)
 					- per_cpu[node]
 					- n->nr_partial;
 
@@ -3283,7 +3290,7 @@ static int any_slab_objects(struct kmem_cache *s)
 	for_each_node(node) {
 		struct kmem_cache_node *n = get_node(s, node);
 
-		if (n->nr_partial || atomic_read(&n->nr_slabs))
+		if (n->nr_partial || atomic_long_read(&n->nr_slabs))
 			return 1;
 	}
 	return 0;
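
Dropping GFP_THISNODE from the bootstrap new_slab() call means the page may come from a different node than requested; the added printks report that and the code carries on rather than dying. A rough userspace sketch of that degrade-gracefully shape, with every name invented; the atomic_read() to atomic_long_read() switches in the later hunks simply read the nr_slabs counter through the matching long accessor:

#include <stdio.h>
#include <stdlib.h>

/* Invented stand-in for an allocator that may satisfy a node-specific
 * request from another node when the wanted node has no memory. */
static void *alloc_on_node(int wanted, int *got)
{
    (void)wanted;
    *got = 0;                    /* pretend only node 0 has memory */
    return malloc(64);
}

static void *early_node_struct(int node)
{
    int got;
    void *p = alloc_on_node(node, &got);

    if (!p)
        abort();                 /* the sketch's analogue of BUG_ON(!page) */
    if (got != node)
        fprintf(stderr, "wanted node %d, got node %d; continuing anyway\n",
                node, got);
    return p;
}

int main(void)
{
    free(early_node_struct(1));  /* node 1 is memoryless in this toy */
    return 0;
}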
diff --git a/mm/sparse.c b/mm/sparse.c
index 3047bf06c1f3..239f5a720d38 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -41,6 +41,15 @@ int page_to_nid(struct page *page)
 	return section_to_node_table[page_to_section(page)];
 }
 EXPORT_SYMBOL(page_to_nid);
+
+static void set_section_nid(unsigned long section_nr, int nid)
+{
+	section_to_node_table[section_nr] = nid;
+}
+#else /* !NODE_NOT_IN_PAGE_FLAGS */
+static inline void set_section_nid(unsigned long section_nr, int nid)
+{
+}
 #endif
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
45 54
46#ifdef CONFIG_SPARSEMEM_EXTREME 55#ifdef CONFIG_SPARSEMEM_EXTREME
@@ -68,10 +77,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
 	struct mem_section *section;
 	int ret = 0;
 
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-	section_to_node_table[section_nr] = nid;
-#endif
-
 	if (mem_section[root])
 		return -EEXIST;
 
@@ -148,6 +153,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
 		struct mem_section *ms;
 
 		sparse_index_init(section, nid);
+		set_section_nid(section, nid);
 
 		ms = __nr_to_section(section);
 		if (!ms->section_mem_map)
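
The sparse.c change moves the section_to_node_table update out of sparse_index_init() and into a set_section_nid() helper that memory_present() calls unconditionally; the #ifdef is confined to the helper by pairing the real version with an empty inline stub. The same shape in a standalone sketch, where FEATURE_X is an invented stand-in for NODE_NOT_IN_PAGE_FLAGS:

#include <stdio.h>

/* One real helper when the feature is configured in, one empty inline
 * stub when it is not, so the caller never needs an #ifdef of its own. */
#ifdef FEATURE_X
static int table[128];

static void set_entry(unsigned long nr, int val)
{
    table[nr] = val;
}
#else
static inline void set_entry(unsigned long nr, int val)
{
    (void)nr;
    (void)val;
}
#endif

int main(void)
{
    set_entry(3, 42);            /* caller stays ifdef-free either way */
    puts("done");
    return 0;
}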
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d419e10e3daa..a6e65d024995 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -271,6 +271,12 @@ static void handle_write_error(struct address_space *mapping,
 	unlock_page(page);
 }
 
+/* Request for sync pageout. */
+enum pageout_io {
+	PAGEOUT_IO_ASYNC,
+	PAGEOUT_IO_SYNC,
+};
+
 /* possible outcome of pageout() */
 typedef enum {
 	/* failed to write page out, page is locked */
@@ -287,7 +293,8 @@ typedef enum {
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+						enum pageout_io sync_writeback)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -346,6 +353,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 			ClearPageReclaim(page);
 			return PAGE_ACTIVATE;
 		}
+
+		/*
+		 * Wait on writeback if requested to. This happens when
+		 * direct reclaiming a large contiguous area and the
+		 * first attempt to free a range of pages fails.
+		 */
+		if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+			wait_on_page_writeback(page);
+
 		if (!PageWriteback(page)) {
 			/* synchronous write or broken a_ops? */
 			ClearPageReclaim(page);
@@ -423,7 +439,8 @@ cannot_free:
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-					struct scan_control *sc)
+					struct scan_control *sc,
+					enum pageout_io sync_writeback)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
@@ -458,8 +475,23 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
 
-		if (PageWriteback(page))
-			goto keep_locked;
+		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+		if (PageWriteback(page)) {
+			/*
+			 * Synchronous reclaim is performed in two passes,
+			 * first an asynchronous pass over the list to
+			 * start parallel writeback, and a second synchronous
+			 * pass to wait for the IO to complete. Wait here
+			 * for any page for which writeback has already
+			 * started.
+			 */
+			if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+				wait_on_page_writeback(page);
+			else
+				goto keep_locked;
+		}
 
 		referenced = page_referenced(page, 1);
 		/* In active use or really unfreeable? Activate it. */
@@ -478,8 +510,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 #endif /* CONFIG_SWAP */
 
 		mapping = page_mapping(page);
-		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
-			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
 		/*
 		 * The page is mapped into the page tables of one or more
@@ -505,7 +535,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto keep_locked;
 
 			/* Page is dirty, try to write it out here */
-			switch(pageout(page, mapping)) {
+			switch (pageout(page, mapping, sync_writeback)) {
 			case PAGE_KEEP:
 				goto keep_locked;
 			case PAGE_ACTIVATE:
@@ -777,6 +807,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 					     (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
 					     ISOLATE_BOTH : ISOLATE_INACTIVE);
 		nr_active = clear_active_flags(&page_list);
+		__count_vm_events(PGDEACTIVATE, nr_active);
 
 		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
 		__mod_zone_page_state(zone, NR_INACTIVE,
@@ -785,7 +816,29 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		spin_unlock_irq(&zone->lru_lock);
 
 		nr_scanned += nr_scan;
-		nr_freed = shrink_page_list(&page_list, sc);
+		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+		/*
+		 * If we are direct reclaiming for contiguous pages and we do
+		 * not reclaim everything in the list, try again and wait
+		 * for IO to complete. This will stall high-order allocations
+		 * but that should be acceptable to the caller
+		 */
+		if (nr_freed < nr_taken && !current_is_kswapd() &&
+					sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+			congestion_wait(WRITE, HZ/10);
+
+			/*
+			 * The attempt at page out may have made some
+			 * of the pages active, mark them inactive again.
+			 */
+			nr_active = clear_active_flags(&page_list);
+			count_vm_events(PGDEACTIVATE, nr_active);
+
+			nr_freed += shrink_page_list(&page_list, sc,
+							PAGEOUT_IO_SYNC);
+		}
+
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
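
Taken together, the vmscan.c hunks make reclaim a two-pass affair: shrink_page_list() first runs with PAGEOUT_IO_ASYNC so writeback is only started, and a high-order direct reclaim that still fell short waits for congestion and repeats the list with PAGEOUT_IO_SYNC, where pages under writeback are waited on instead of skipped. Stripped of the MM details, the control flow is roughly the following; the types and names are toys, not kernel code:

#include <stdbool.h>
#include <stdio.h>

enum pass_mode { PASS_ASYNC, PASS_SYNC };

/* Toy item: "dirty" items need a slow, waited-on step to complete. */
struct item { bool dirty; bool done; };

/* The async pass skips anything still in flight; the sync pass waits
 * instead of skipping, mirroring PAGEOUT_IO_ASYNC/PAGEOUT_IO_SYNC. */
static int process_list(struct item *items, int n, enum pass_mode mode)
{
    int freed = 0;

    for (int i = 0; i < n; i++) {
        if (items[i].done)
            continue;
        if (items[i].dirty && mode == PASS_ASYNC)
            continue;            /* work started elsewhere; skip for now */
        items[i].dirty = false;  /* "wait" for the work to finish */
        items[i].done = true;
        freed++;
    }
    return freed;
}

int main(void)
{
    struct item list[] = { {true, false}, {false, false}, {true, false} };
    int want = 3;

    int freed = process_list(list, 3, PASS_ASYNC);
    if (freed < want) {
        /* the expensive high-order case: stall, then retry synchronously */
        freed += process_list(list, 3, PASS_SYNC);
    }
    printf("freed %d of %d\n", freed, want);
    return 0;
}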