author     Tony Luck <tony.luck@intel.com>    2005-10-20 13:41:44 -0400
committer  Tony Luck <tony.luck@intel.com>    2005-10-20 13:41:44 -0400
commit     9cec58dc138d6fcad9f447a19c8ff69f6540e667 (patch)
tree       4fe1cca94fdba8b705c87615bee06d3346f687ce /mm
parent     17e5ad6c0ce5a970e2830d0de8bdd60a2f077d38 (diff)
parent     ac9b9c667c2e1194e22ebe0a441ae1c37aaa9b90 (diff)
Update from upstream with manual merge of Yasunori Goto's changes to
swiotlb.c made in commit 281dd25cdc0d6903929b79183816d151ea626341, since
that file has been moved from arch/ia64/lib/swiotlb.c to lib/swiotlb.c.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig      |   4
-rw-r--r--  mm/bootmem.c    |  45
-rw-r--r--  mm/fremap.c     |   3
-rw-r--r--  mm/highmem.c    |   2
-rw-r--r--  mm/hugetlb.c    |  57
-rw-r--r--  mm/madvise.c    |  11
-rw-r--r--  mm/memory.c     |   4
-rw-r--r--  mm/mempolicy.c  |   8
-rw-r--r--  mm/mempool.c    |   6
-rw-r--r--  mm/mmap.c       |   2
-rw-r--r--  mm/mprotect.c   |   3
-rw-r--r--  mm/mremap.c     |   6
-rw-r--r--  mm/nommu.c      |   3
-rw-r--r--  mm/oom_kill.c   |   2
-rw-r--r--  mm/page_alloc.c |  12
-rw-r--r--  mm/page_io.c    |   2
-rw-r--r--  mm/shmem.c      |   3
-rw-r--r--  mm/slab.c       | 100
-rw-r--r--  mm/swap_state.c |   2
-rw-r--r--  mm/swapfile.c   |   1
-rw-r--r--  mm/vmalloc.c    |   4
-rw-r--r--  mm/vmscan.c     |  13
22 files changed, 170 insertions, 123 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 4e9937ac3529..391ffc54d136 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -29,7 +29,7 @@ config FLATMEM_MANUAL
 	  If unsure, choose this option (Flat Memory) over any other.
 
 config DISCONTIGMEM_MANUAL
-	bool "Discontigious Memory"
+	bool "Discontiguous Memory"
 	depends on ARCH_DISCONTIGMEM_ENABLE
 	help
 	  This option provides enhanced support for discontiguous
@@ -52,7 +52,7 @@ config SPARSEMEM_MANUAL
 	  memory hotplug systems. This is normal.
 
 	  For many other systems, this will be an alternative to
-	  "Discontigious Memory". This option provides some potential
+	  "Discontiguous Memory". This option provides some potential
 	  performance benefits, along with decreased code complexity,
 	  but it is newer, and more experimental.
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8ec4e4c2a179..a58699b6579e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -61,17 +61,9 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 {
 	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long mapsize = ((end - start)+7)/8;
-	static struct pglist_data *pgdat_last;
-
-	pgdat->pgdat_next = NULL;
-	/* Add new nodes last so that bootmem always starts
-	   searching in the first nodes, not the last ones */
-	if (pgdat_last)
-		pgdat_last->pgdat_next = pgdat;
-	else {
-		pgdat_list = pgdat;
-		pgdat_last = pgdat;
-	}
+
+	pgdat->pgdat_next = pgdat_list;
+	pgdat_list = pgdat;
 
 	mapsize = ALIGN(mapsize, sizeof(long));
 	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
@@ -162,10 +154,10 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
  */
 static void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
-		unsigned long align, unsigned long goal)
+		unsigned long align, unsigned long goal, unsigned long limit)
 {
 	unsigned long offset, remaining_size, areasize, preferred;
-	unsigned long i, start = 0, incr, eidx;
+	unsigned long i, start = 0, incr, eidx, end_pfn = bdata->node_low_pfn;
 	void *ret;
 
 	if(!size) {
@@ -174,7 +166,14 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	}
 	BUG_ON(align & (align-1));
 
-	eidx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
+	if (limit && bdata->node_boot_start >= limit)
+		return NULL;
+
+	limit >>=PAGE_SHIFT;
+	if (limit && end_pfn > limit)
+		end_pfn = limit;
+
+	eidx = end_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
 	offset = 0;
 	if (align &&
 	    (bdata->node_boot_start & (align - 1UL)) != 0)
@@ -186,11 +185,12 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	 * first, then we try to allocate lower pages.
 	 */
 	if (goal && (goal >= bdata->node_boot_start) &&
-	    ((goal >> PAGE_SHIFT) < bdata->node_low_pfn)) {
+	    ((goal >> PAGE_SHIFT) < end_pfn)) {
 		preferred = goal - bdata->node_boot_start;
 
 		if (bdata->last_success >= preferred)
-			preferred = bdata->last_success;
+			if (!limit || (limit && limit > bdata->last_success))
+				preferred = bdata->last_success;
 	} else
 		preferred = 0;
 
@@ -390,14 +390,15 @@ unsigned long __init free_all_bootmem (void)
 	return(free_all_bootmem_core(NODE_DATA(0)));
 }
 
-void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned long goal)
+void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal,
+				     unsigned long limit)
 {
 	pg_data_t *pgdat = pgdat_list;
 	void *ptr;
 
 	for_each_pgdat(pgdat)
 		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
-						align, goal)))
+						align, goal, limit)))
 			return(ptr);
 
 	/*
@@ -408,14 +409,16 @@ void * __init __alloc_bootmem (unsigned long size, unsigned long align, unsigned
 	return NULL;
 }
 
-void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal)
+
+void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align,
+					  unsigned long goal, unsigned long limit)
 {
 	void *ptr;
 
-	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal);
+	ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit);
 	if (ptr)
 		return (ptr);
 
-	return __alloc_bootmem(size, align, goal);
+	return __alloc_bootmem_limit(size, align, goal, limit);
 }
 
diff --git a/mm/fremap.c b/mm/fremap.c
index 3235fb77c133..ab23a0673c35 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -89,6 +89,9 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (!page->mapping || page->index >= size)
 		goto err_unlock;
+	err = -ENOMEM;
+	if (page_mapcount(page) > INT_MAX/2)
+		goto err_unlock;
 
 	zap_pte(mm, vma, addr, pte);
 
diff --git a/mm/highmem.c b/mm/highmem.c
index 400911599468..90e1861e2da0 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,7 +30,7 @@
 
 static mempool_t *page_pool, *isa_page_pool;
 
-static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data)
+static void *page_pool_alloc(gfp_t gfp_mask, void *data)
 {
 	unsigned int gfp = gfp_mask | (unsigned int) (long) data;
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 901ac523a1c3..61d380678030 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -274,21 +274,22 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 {
 	pte_t *src_pte, *dst_pte, entry;
 	struct page *ptepage;
-	unsigned long addr = vma->vm_start;
-	unsigned long end = vma->vm_end;
+	unsigned long addr;
 
-	while (addr < end) {
+	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
 			goto nomem;
+		spin_lock(&src->page_table_lock);
 		src_pte = huge_pte_offset(src, addr);
-		BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
+		if (src_pte && !pte_none(*src_pte)) {
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
 			add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
-		addr += HPAGE_SIZE;
+		}
+		spin_unlock(&src->page_table_lock);
 	}
 	return 0;
 
@@ -323,8 +324,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 		page = pte_page(pte);
 		put_page(page);
+		add_mm_counter(mm, rss, - (HPAGE_SIZE / PAGE_SIZE));
 	}
-	add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
 	flush_tlb_range(vma, start, end);
 }
 
@@ -393,6 +394,28 @@ out:
 	return ret;
 }
 
+/*
+ * On ia64 at least, it is possible to receive a hugetlb fault from a
+ * stale zero entry left in the TLB from earlier hardware prefetching.
+ * Low-level arch code should already have flushed the stale entry as
+ * part of its fault handling, but we do need to accept this minor fault
+ * and return successfully.  Whereas the "normal" case is that this is
+ * an access to a hugetlb page which has been truncated off since mmap.
+ */
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, int write_access)
+{
+	int ret = VM_FAULT_SIGBUS;
+	pte_t *pte;
+
+	spin_lock(&mm->page_table_lock);
+	pte = huge_pte_offset(mm, address);
+	if (pte && !pte_none(*pte))
+		ret = VM_FAULT_MINOR;
+	spin_unlock(&mm->page_table_lock);
+	return ret;
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i)
@@ -403,6 +426,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	BUG_ON(!is_vm_hugetlb_page(vma));
 
 	vpfn = vaddr/PAGE_SIZE;
+	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 
 		if (pages) {
@@ -415,8 +439,13 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * indexing below to work. */
 		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
 
-		/* hugetlb should be locked, and hence, prefaulted */
-		WARN_ON(!pte || pte_none(*pte));
+		/* the hugetlb file might have been truncated */
+		if (!pte || pte_none(*pte)) {
+			remainder = 0;
+			if (!i)
+				i = -EFAULT;
+			break;
+		}
 
 		page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
 
@@ -434,7 +463,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		--remainder;
 		++i;
 	}
-
+	spin_unlock(&mm->page_table_lock);
 	*length = remainder;
 	*position = vaddr;
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 4454936f87d1..20e075d1c64c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -83,6 +83,9 @@ static long madvise_willneed(struct vm_area_struct * vma,
 {
 	struct file *file = vma->vm_file;
 
+	if (!file)
+		return -EBADF;
+
 	if (file->f_mapping->a_ops->get_xip_page) {
 		/* no bad return value, but ignore advice */
 		return 0;
@@ -141,11 +144,7 @@ static long
 madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		unsigned long start, unsigned long end, int behavior)
 {
-	struct file *filp = vma->vm_file;
-	long error = -EBADF;
-
-	if (!filp)
-		goto out;
+	long error;
 
 	switch (behavior) {
 	case MADV_NORMAL:
@@ -166,8 +165,6 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		error = -EINVAL;
 		break;
 	}
-
-out:
 	return error;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index ae8161f1f459..1db40e935e55 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2045,8 +2045,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 
 	inc_page_state(pgfault);
 
-	if (is_vm_hugetlb_page(vma))
-		return VM_FAULT_SIGBUS;	/* mapping truncation does this. */
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		return hugetlb_fault(mm, vma, address, write_access);
 
 	/*
 	 * We need the page table lock to synchronize with kswapd
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9033f0859aa8..37af443eb094 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -687,7 +687,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 }
 
 /* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
+static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
 	int nd;
 
@@ -751,7 +751,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
 {
 	struct zonelist *zl;
 	struct page *page;
@@ -789,7 +789,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
  *	Should be called with the mm_sem of the vma hold.
  */
 struct page *
-alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
@@ -832,7 +832,7 @@ alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned l
  *	1) it's ok to take cpuset_sem (can WAIT), and
  *	2) allocating for current task (not interrupt).
  */
-struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
+struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
 	struct mempolicy *pol = current->mempolicy;
 
diff --git a/mm/mempool.c b/mm/mempool.c
index 65f2957b8d51..9e377ea700b2 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -112,7 +112,7 @@ EXPORT_SYMBOL(mempool_create_node);
  * while this function is running. mempool_alloc() & mempool_free()
  * might be called (eg. from IRQ contexts) while this function executes.
  */
-int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask)
+int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
 {
 	void *element;
 	void **new_elements;
@@ -200,7 +200,7 @@ EXPORT_SYMBOL(mempool_destroy);
  * *never* fails when called from process contexts. (it might
  * fail if called from an IRQ context.)
  */
-void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
+void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 {
 	void *element;
 	unsigned long flags;
@@ -276,7 +276,7 @@ EXPORT_SYMBOL(mempool_free);
 /*
  * A commonly used alloc and free fn.
  */
-void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data)
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
 {
 	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
 	return kmem_cache_alloc(mem, gfp_mask);
diff --git a/mm/mmap.c b/mm/mmap.c
index 8b8e05f07cdb..fa11d91242e8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1640,7 +1640,7 @@ static void unmap_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 /*
  * Get rid of page table information in the indicated region.
  *
- * Called with the page table lock held.
+ * Called with the mm semaphore held.
  */
 static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e9fbd013ad9a..57577f63b305 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -248,7 +248,8 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
 
 		newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
 
-		if ((newflags & ~(newflags >> 4)) & 0xf) {
+		/* newflags >> 4 shift VM_MAY% in place of VM_% */
+		if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) {
 			error = -EACCES;
 			goto out;
 		}
diff --git a/mm/mremap.c b/mm/mremap.c
index a32fed454bd7..f343fc73a8bd 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -141,10 +141,10 @@ move_one_page(struct vm_area_struct *vma, unsigned long old_addr,
 	if (dst) {
 		pte_t pte;
 		pte = ptep_clear_flush(vma, old_addr, src);
+
 		/* ZERO_PAGE can be dependant on virtual addr */
-		if (pfn_valid(pte_pfn(pte)) &&
-			pte_page(pte) == ZERO_PAGE(old_addr))
-			pte = pte_wrprotect(mk_pte(ZERO_PAGE(new_addr), new_vma->vm_page_prot));
+		pte = move_pte(pte, new_vma->vm_page_prot,
+						old_addr, new_addr);
 		set_pte_at(mm, new_addr, dst, pte);
 	} else
 		error = -ENOMEM;
diff --git a/mm/nommu.c b/mm/nommu.c
index 064d70442895..0ef241ae3763 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -157,8 +157,7 @@ void vfree(void *addr)
 	kfree(addr);
 }
 
-void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask,
-			pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
 	/*
 	 * kmalloc doesn't like __GFP_HIGHMEM for some reason
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ac3bf33e5370..d348b9035955 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -263,7 +263,7 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(unsigned int __nocast gfp_mask, int order)
+void out_of_memory(gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
 	task_t * p;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae2903339e71..cc1fe2672a31 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -671,7 +671,7 @@ void fastcall free_cold_page(struct page *page)
 	free_hot_cold_page(page, 1);
 }
 
-static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags)
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	int i;
 
@@ -686,7 +686,7 @@ static inline void prep_zero_page(struct page *page, int order, unsigned int __n
  * or two.
  */
 static struct page *
-buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
 	struct page *page = NULL;
@@ -761,7 +761,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 }
 
 static inline int
-should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
 {
 	if (!z->reclaim_pages)
 		return 0;
@@ -774,7 +774,7 @@ should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
  * This is the 'heart' of the zoned buddy allocator.
  */
 struct page * fastcall
-__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist)
 {
 	const int wait = gfp_mask & __GFP_WAIT;
@@ -977,7 +977,7 @@ EXPORT_SYMBOL(__alloc_pages);
 /*
  * Common helper functions.
  */
-fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)
+fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 {
 	struct page * page;
 	page = alloc_pages(gfp_mask, order);
@@ -988,7 +988,7 @@ fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned
 
 EXPORT_SYMBOL(__get_free_pages);
 
-fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
+fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
 {
 	struct page * page;
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 2e605a19ce57..330e00d6db00 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,7 +19,7 @@
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
-static struct bio *get_swap_bio(unsigned int __nocast gfp_flags, pgoff_t index,
+static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
 				struct page *page, bio_end_io_t end_io)
 {
 	struct bio *bio;
diff --git a/mm/shmem.c b/mm/shmem.c
index 1f7aeb210c7b..ea064d89cda9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -921,8 +921,7 @@ shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
 }
 
 static inline struct page *
-shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info,
-		 unsigned long idx)
+shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
 {
 	return alloc_page(gfp | __GFP_ZERO);
 }
diff --git a/mm/slab.c b/mm/slab.c
index 437d3388054b..d05c678bceb3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -308,12 +308,12 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
 #define	SIZE_L3 (1 + MAX_NUMNODES)
 
 /*
- * This function may be completely optimized away if
+ * This function must be completely optimized away if
  * a constant is passed to it. Mostly the same as
  * what is in linux/slab.h except it returns an
  * index.
  */
-static inline int index_of(const size_t size)
+static __always_inline int index_of(const size_t size)
 {
 	if (__builtin_constant_p(size)) {
 		int i = 0;
@@ -329,7 +329,8 @@ static inline int index_of(const size_t size)
 			extern void __bad_size(void);
 			__bad_size();
 		}
-	}
+	} else
+		BUG();
 	return 0;
 }
 
@@ -639,7 +640,7 @@ static enum {
 
 static DEFINE_PER_CPU(struct work_struct, reap_work);
 
-static void free_block(kmem_cache_t* cachep, void** objpp, int len);
+static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node);
 static void enable_cpucache (kmem_cache_t *cachep);
 static void cache_reap (void *unused);
 static int __node_shrink(kmem_cache_t *cachep, int node);
@@ -649,8 +650,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 	return cachep->array[smp_processor_id()];
 }
 
-static inline kmem_cache_t *__find_general_cachep(size_t size,
-						unsigned int __nocast gfpflags)
+static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
 {
 	struct cache_sizes *csizep = malloc_sizes;
 
@@ -674,8 +674,7 @@ static inline kmem_cache_t *__find_general_cachep(size_t size,
 	return csizep->cs_cachep;
 }
 
-kmem_cache_t *kmem_find_general_cachep(size_t size,
-		unsigned int __nocast gfpflags)
+kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
 {
 	return __find_general_cachep(size, gfpflags);
 }
@@ -804,7 +803,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
 
 	if (ac->avail) {
 		spin_lock(&rl3->list_lock);
-		free_block(cachep, ac->entry, ac->avail);
+		free_block(cachep, ac->entry, ac->avail, node);
 		ac->avail = 0;
 		spin_unlock(&rl3->list_lock);
 	}
@@ -925,7 +924,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		/* Free limit for this kmem_list3 */
 		l3->free_limit -= cachep->batchcount;
 		if (nc)
-			free_block(cachep, nc->entry, nc->avail);
+			free_block(cachep, nc->entry, nc->avail, node);
 
 		if (!cpus_empty(mask)) {
 			spin_unlock(&l3->list_lock);
@@ -934,7 +933,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 
 		if (l3->shared) {
 			free_block(cachep, l3->shared->entry,
-					l3->shared->avail);
+					l3->shared->avail, node);
 			kfree(l3->shared);
 			l3->shared = NULL;
 		}
@@ -1184,7 +1183,7 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
 	struct page *page;
 	void *addr;
@@ -1882,12 +1881,13 @@ static void do_drain(void *arg)
 {
 	kmem_cache_t *cachep = (kmem_cache_t*)arg;
 	struct array_cache *ac;
+	int node = numa_node_id();
 
 	check_irq_off();
 	ac = ac_data(cachep);
-	spin_lock(&cachep->nodelists[numa_node_id()]->list_lock);
-	free_block(cachep, ac->entry, ac->avail);
-	spin_unlock(&cachep->nodelists[numa_node_id()]->list_lock);
+	spin_lock(&cachep->nodelists[node]->list_lock);
+	free_block(cachep, ac->entry, ac->avail, node);
+	spin_unlock(&cachep->nodelists[node]->list_lock);
 	ac->avail = 0;
 }
 
@@ -2046,7 +2046,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
 
 /* Get the memory for a slab management obj. */
 static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
-			int colour_off, unsigned int __nocast local_flags)
+			int colour_off, gfp_t local_flags)
 {
 	struct slab *slabp;
 
@@ -2147,7 +2147,7 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
  * Grow (by 1) the number of slabs within a cache. This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
 	struct slab *slabp;
 	void *objp;
@@ -2354,7 +2354,7 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags)
+static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
@@ -2454,7 +2454,7 @@ alloc_done:
 }
 
 static inline void
-cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
+cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
 {
 	might_sleep_if(flags & __GFP_WAIT);
 #if DEBUG
@@ -2465,7 +2465,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
 #if DEBUG
 static void *
 cache_alloc_debugcheck_after(kmem_cache_t *cachep,
-			unsigned int __nocast flags, void *objp, void *caller)
+			gfp_t flags, void *objp, void *caller)
 {
 	if (!objp)
 		return objp;
@@ -2508,16 +2508,12 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
 #endif
 
-
-static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 {
-	unsigned long save_flags;
 	void* objp;
 	struct array_cache *ac;
 
-	cache_alloc_debugcheck_before(cachep, flags);
-
-	local_irq_save(save_flags);
+	check_irq_off();
 	ac = ac_data(cachep);
 	if (likely(ac->avail)) {
 		STATS_INC_ALLOCHIT(cachep);
@@ -2527,6 +2523,18 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl
 		STATS_INC_ALLOCMISS(cachep);
 		objp = cache_alloc_refill(cachep, flags);
 	}
+	return objp;
+}
+
+static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
+{
+	unsigned long save_flags;
+	void* objp;
+
+	cache_alloc_debugcheck_before(cachep, flags);
+
+	local_irq_save(save_flags);
+	objp = ____cache_alloc(cachep, flags);
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
 					__builtin_return_address(0));
@@ -2608,7 +2616,7 @@ done:
 /*
  * Caller needs to acquire correct kmem_list's list_lock
  */
-static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
+static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node)
 {
 	int i;
 	struct kmem_list3 *l3;
@@ -2617,14 +2625,12 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
 		void *objp = objpp[i];
 		struct slab *slabp;
 		unsigned int objnr;
-		int nodeid = 0;
 
 		slabp = GET_PAGE_SLAB(virt_to_page(objp));
-		nodeid = slabp->nodeid;
-		l3 = cachep->nodelists[nodeid];
+		l3 = cachep->nodelists[node];
 		list_del(&slabp->list);
 		objnr = (objp - slabp->s_mem) / cachep->objsize;
-		check_spinlock_acquired_node(cachep, nodeid);
+		check_spinlock_acquired_node(cachep, node);
 		check_slabp(cachep, slabp);
 
 
@@ -2664,13 +2670,14 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
+	int node = numa_node_id();
 
 	batchcount = ac->batchcount;
 #if DEBUG
 	BUG_ON(!batchcount || batchcount > ac->avail);
 #endif
 	check_irq_off();
-	l3 = cachep->nodelists[numa_node_id()];
+	l3 = cachep->nodelists[node];
 	spin_lock(&l3->list_lock);
 	if (l3->shared) {
 		struct array_cache *shared_array = l3->shared;
@@ -2686,7 +2693,7 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
 		}
 	}
 
-	free_block(cachep, ac->entry, batchcount);
+	free_block(cachep, ac->entry, batchcount, node);
 free_done:
 #if STATS
 	{
@@ -2751,7 +2758,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
 		} else {
 			spin_lock(&(cachep->nodelists[nodeid])->
 					list_lock);
-			free_block(cachep, &objp, 1);
+			free_block(cachep, &objp, 1, nodeid);
 			spin_unlock(&(cachep->nodelists[nodeid])->
 					list_lock);
 		}
@@ -2778,7 +2785,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
  * Allocate an object from this cache. The flags are only relevant
  * if the cache has no available objects.
  */
-void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 {
 	return __cache_alloc(cachep, flags);
 }
@@ -2839,12 +2846,12 @@ out:
  * New and improved: it will now make sure that the object gets
  * put on the correct node list so that there is no false sharing.
  */
-void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
 	unsigned long save_flags;
 	void *ptr;
 
-	if (nodeid == numa_node_id() || nodeid == -1)
+	if (nodeid == -1)
 		return __cache_alloc(cachep, flags);
 
 	if (unlikely(!cachep->nodelists[nodeid])) {
@@ -2855,7 +2862,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, i
 
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
-	ptr = __cache_alloc_node(cachep, flags, nodeid);
+	if (nodeid == numa_node_id())
+		ptr = ____cache_alloc(cachep, flags);
+	else
+		ptr = __cache_alloc_node(cachep, flags, nodeid);
 	local_irq_restore(save_flags);
 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
 
@@ -2863,7 +2873,7 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, i
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
+void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	kmem_cache_t *cachep;
 
@@ -2896,7 +2906,7 @@ EXPORT_SYMBOL(kmalloc_node);
  * platforms. For example, on i386, it means that the memory must come
  * from the first 16MB.
  */
-void *__kmalloc(size_t size, unsigned int __nocast flags)
+void *__kmalloc(size_t size, gfp_t flags)
 {
 	kmem_cache_t *cachep;
 
@@ -2985,7 +2995,7 @@ EXPORT_SYMBOL(kmem_cache_free);
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  */
-void *kzalloc(size_t size, unsigned int __nocast flags)
+void *kzalloc(size_t size, gfp_t flags)
 {
 	void *ret = kmalloc(size, flags);
 	if (ret)
@@ -3079,7 +3089,7 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
 
 			if ((nc = cachep->nodelists[node]->shared))
 				free_block(cachep, nc->entry,
-							nc->avail);
+							nc->avail, node);
 
 			l3->shared = new;
 			if (!cachep->nodelists[node]->alien) {
@@ -3160,7 +3170,7 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
 		if (!ccold)
 			continue;
 		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
-		free_block(cachep, ccold->entry, ccold->avail);
+		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
 		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
 		kfree(ccold);
 	}
@@ -3240,7 +3250,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
 		if (tofree > ac->avail) {
 			tofree = (ac->avail+1)/2;
 		}
-		free_block(cachep, ac->entry, tofree);
+		free_block(cachep, ac->entry, tofree, node);
 		ac->avail -= tofree;
 		memmove(ac->entry, &(ac->entry[tofree]),
 					sizeof(void*)*ac->avail);
@@ -3591,7 +3601,7 @@ unsigned int ksize(const void *objp)
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  */
-char *kstrdup(const char *s, unsigned int __nocast gfp)
+char *kstrdup(const char *s, gfp_t gfp)
 {
 	size_t len;
 	char *buf;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index adbc2b426c2f..132164f7d0a7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -68,7 +68,7 @@ void show_swap_cache_info(void)
  * but sets SwapCache flag and private instead of mapping and index.
  */
 static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
-		unsigned int __nocast gfp_mask)
+		gfp_t gfp_mask)
 {
 	int error;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0184f510aace..1dcaeda039f4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1381,6 +1381,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		error = bd_claim(bdev, sys_swapon);
 		if (error < 0) {
 			bdev = NULL;
+			error = -EINVAL;
 			goto bad_swap;
 		}
 		p->old_block_size = block_size(bdev);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 13c3d82968ae..1150229b6366 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -395,7 +395,7 @@ void *vmap(struct page **pages, unsigned int count,
 
 EXPORT_SYMBOL(vmap);
 
-void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot)
+void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
 {
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
@@ -446,7 +446,7 @@ fail:
  *	allocator with @gfp_mask flags. Map them into contiguous
  *	kernel virtual space, using a pagetable protection of @prot.
  */
-void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
 	struct vm_struct *area;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0ea71e887bb6..64f9570cff56 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -511,10 +511,11 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 	 * PageDirty _after_ making sure that the page is freeable and
 	 * not in use by anybody. (pagecache + us == 2)
 	 */
-	if (page_count(page) != 2 || PageDirty(page)) {
-		write_unlock_irq(&mapping->tree_lock);
-		goto keep_locked;
-	}
+	if (unlikely(page_count(page) != 2))
+		goto cannot_free;
+	smp_rmb();
+	if (unlikely(PageDirty(page)))
+		goto cannot_free;
 
 #ifdef CONFIG_SWAP
 	if (PageSwapCache(page)) {
@@ -538,6 +539,10 @@ free_it:
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
 
+cannot_free:
+		write_unlock_irq(&mapping->tree_lock);
+		goto keep_locked;
+
 activate_locked:
 		SetPageActive(page);
 		pgactivate++;