| author | Trond Myklebust <Trond.Myklebust@netapp.com> | 2006-07-05 13:13:03 -0400 |
|---|---|---|
| committer | Trond Myklebust <Trond.Myklebust@netapp.com> | 2006-07-05 13:13:03 -0400 |
| commit | 5e66dd6d66ffe758b39b6dcadf2330753ee1159b (patch) | |
| tree | a72cdcff4448e4af9425cc213ddf56ab23e697fe /mm | |
| parent | 026477c1141b67e98e3bd8bdedb7d4b88a3ecd09 (diff) | |
| parent | ca78f6baca863afe2e6a244a0fe94b3a70211d46 (diff) | |
Merge branch 'master' of /home/trondmy/kernel/linux-2.6/
Diffstat (limited to 'mm')
```
-rw-r--r--  mm/memory.c      |  2
-rw-r--r--  mm/mremap.c      |  2
-rw-r--r--  mm/oom_kill.c    |  8
-rw-r--r--  mm/page_alloc.c  | 22
-rw-r--r--  mm/slab.c        | 59
-rw-r--r--  mm/swap_state.c  |  2
-rw-r--r--  mm/vmalloc.c     |  2
-rw-r--r--  mm/vmscan.c      | 27
```
8 files changed, 93 insertions, 31 deletions
```diff
diff --git a/mm/memory.c b/mm/memory.c
index 7e2a4b1580e3..c1e14c9e67e4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -503,7 +503,7 @@ again:
                 return -ENOMEM;
         src_pte = pte_offset_map_nested(src_pmd, addr);
         src_ptl = pte_lockptr(src_mm, src_pmd);
-        spin_lock(src_ptl);
+        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 
         do {
                 /*
```
```diff
diff --git a/mm/mremap.c b/mm/mremap.c
index 1903bdf65e42..7c15cf3373ad 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -97,7 +97,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
         new_pte = pte_offset_map_nested(new_pmd, new_addr);
         new_ptl = pte_lockptr(mm, new_pmd);
         if (new_ptl != old_ptl)
-                spin_lock(new_ptl);
+                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 
         for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                                    new_pte++, new_addr += PAGE_SIZE) {
```
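Both hunks above replace a plain spin_lock() with spin_lock_nested(..., SINGLE_DEPTH_NESTING). This is the usual lockdep annotation for code that legitimately holds two locks of the same lock class at once, here the source and destination page-table locks in fork()'s copy_pte_range() and mremap()'s move_ptes(). A minimal sketch of the pattern (struct my_obj and move_between() are invented for illustration, not code from this merge):

```c
#include <linux/spinlock.h>

struct my_obj {
        spinlock_t lock;
        /* ... payload ... */
};

/*
 * Transfer state between two objects of the same type. Both locks belong
 * to the same lock class, so lockdep would normally report the second
 * acquisition as a possible recursive deadlock; spin_lock_nested() with
 * SINGLE_DEPTH_NESTING declares this one level of same-class nesting as
 * intentional.
 */
static void move_between(struct my_obj *dst, struct my_obj *src)
{
        spin_lock(&dst->lock);
        spin_lock_nested(&src->lock, SINGLE_DEPTH_NESTING);

        /* ... move data from src to dst ... */

        spin_unlock(&src->lock);
        spin_unlock(&dst->lock);
}
```

The annotation only silences lockdep; callers must still guarantee a consistent acquisition order (copy_pte_range() and move_ptes() do so by construction) to avoid a real ABBA deadlock.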
```diff
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d46ed0f1dc06..b9af136e5cfa 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -225,7 +225,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
  * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
  * we select a process with CAP_SYS_RAW_IO set).
  */
-static void __oom_kill_task(task_t *p, const char *message)
+static void __oom_kill_task(struct task_struct *p, const char *message)
 {
         if (p->pid == 1) {
                 WARN_ON(1);
@@ -255,10 +255,10 @@ static void __oom_kill_task(task_t *p, const char *message)
         force_sig(SIGKILL, p);
 }
 
-static int oom_kill_task(task_t *p, const char *message)
+static int oom_kill_task(struct task_struct *p, const char *message)
 {
         struct mm_struct *mm;
-        task_t * g, * q;
+        struct task_struct *g, *q;
 
         mm = p->mm;
 
@@ -316,7 +316,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
  */
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
-        task_t *p;
+        struct task_struct *p;
         unsigned long points = 0;
 
         if (printk_ratelimit()) {
```
```diff
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3e792a583f3b..54a4f5375bba 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2005,6 +2005,10 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 
                 zone->spanned_pages = size;
                 zone->present_pages = realsize;
+#ifdef CONFIG_NUMA
+                zone->min_unmapped_ratio = (realsize*sysctl_min_unmapped_ratio)
+                                                / 100;
+#endif
                 zone->name = zone_names[j];
                 spin_lock_init(&zone->lock);
                 spin_lock_init(&zone->lru_lock);
@@ -2298,6 +2302,24 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
         return 0;
 }
 
+#ifdef CONFIG_NUMA
+int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
+        struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+        struct zone *zone;
+        int rc;
+
+        rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+        if (rc)
+                return rc;
+
+        for_each_zone(zone)
+                zone->min_unmapped_ratio = (zone->present_pages *
+                                sysctl_min_unmapped_ratio) / 100;
+        return 0;
+}
+#endif
+
 /*
  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
  *      proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
```
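The new handler above re-derives each zone's min_unmapped_ratio (a page count) whenever the sysctl value (a percentage) is written, mirroring the calculation done at boot in free_area_init_core(). As a worked example with made-up numbers: a zone with 262,144 present pages (1 GiB of 4 KiB pages) and the default sysctl_min_unmapped_ratio of 1 gets 262144 * 1 / 100 = 2621 pages, roughly 10 MiB. A sketch of the arithmetic only (the helper name is invented):

```c
/* Illustrative only: percentage-of-zone threshold later used by zone_reclaim(). */
static unsigned long min_unmapped_pages(unsigned long present_pages,
                                        int min_unmapped_percent)
{
        return present_pages * min_unmapped_percent / 100;
}
```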
```diff
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1021,7 +1021,8 @@ static void drain_alien_cache(struct kmem_cache *cachep,
         }
 }
 
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp,
+                                   int nesting)
 {
         struct slab *slabp = virt_to_slab(objp);
         int nodeid = slabp->nodeid;
@@ -1039,7 +1040,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
         STATS_INC_NODEFREES(cachep);
         if (l3->alien && l3->alien[nodeid]) {
                 alien = l3->alien[nodeid];
-                spin_lock(&alien->lock);
+                spin_lock_nested(&alien->lock, nesting);
                 if (unlikely(alien->avail == alien->limit)) {
                         STATS_INC_ACOVERFLOW(cachep);
                         __drain_alien_cache(cachep, alien, nodeid);
@@ -1068,7 +1069,8 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
 {
 }
 
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp,
+                                   int nesting)
 {
         return 0;
 }
@@ -1272,6 +1274,11 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 
         local_irq_disable();
         memcpy(ptr, list, sizeof(struct kmem_list3));
+        /*
+         * Do not assume that spinlocks can be initialized via memcpy:
+         */
+        spin_lock_init(&ptr->list_lock);
+
         MAKE_ALL_LISTS(cachep, ptr, nodeid);
         cachep->nodelists[nodeid] = ptr;
         local_irq_enable();
@@ -1398,7 +1405,7 @@ void __init kmem_cache_init(void)
         }
         /* 4) Replace the bootstrap head arrays */
         {
-                void *ptr;
+                struct array_cache *ptr;
 
                 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 
@@ -1406,6 +1413,11 @@ void __init kmem_cache_init(void)
                 BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
                 memcpy(ptr, cpu_cache_get(&cache_cache),
                        sizeof(struct arraycache_init));
+                /*
+                 * Do not assume that spinlocks can be initialized via memcpy:
+                 */
+                spin_lock_init(&ptr->lock);
+
                 cache_cache.array[smp_processor_id()] = ptr;
                 local_irq_enable();
 
@@ -1416,6 +1428,11 @@ void __init kmem_cache_init(void)
                        != &initarray_generic.cache);
                 memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
                        sizeof(struct arraycache_init));
+                /*
+                 * Do not assume that spinlocks can be initialized via memcpy:
+                 */
+                spin_lock_init(&ptr->lock);
+
                 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
                     ptr;
                 local_irq_enable();
@@ -1743,6 +1760,8 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 }
 #endif
 
+static void __cache_free(struct kmem_cache *cachep, void *objp, int nesting);
+
 /**
  * slab_destroy - destroy and release all objects in a slab
  * @cachep: cache pointer being destroyed
@@ -1766,8 +1785,17 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
                 call_rcu(&slab_rcu->head, kmem_rcu_free);
         } else {
                 kmem_freepages(cachep, addr);
-                if (OFF_SLAB(cachep))
-                        kmem_cache_free(cachep->slabp_cache, slabp);
+                if (OFF_SLAB(cachep)) {
+                        unsigned long flags;
+
+                        /*
+                         * lockdep: we may nest inside an already held
+                         * ac->lock, so pass in a nesting flag:
+                         */
+                        local_irq_save(flags);
+                        __cache_free(cachep->slabp_cache, slabp, 1);
+                        local_irq_restore(flags);
+                }
         }
 }
 
@@ -3072,7 +3100,16 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
                 if (slabp->inuse == 0) {
                         if (l3->free_objects > l3->free_limit) {
                                 l3->free_objects -= cachep->num;
+                                /*
+                                 * It is safe to drop the lock. The slab is
+                                 * no longer linked to the cache. cachep
+                                 * cannot disappear - we are using it and
+                                 * all destruction of caches must be
+                                 * serialized properly by the user.
+                                 */
+                                spin_unlock(&l3->list_lock);
                                 slab_destroy(cachep, slabp);
+                                spin_lock(&l3->list_lock);
                         } else {
                                 list_add(&slabp->list, &l3->slabs_free);
                         }
@@ -3098,7 +3135,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 #endif
         check_irq_off();
         l3 = cachep->nodelists[node];
-        spin_lock(&l3->list_lock);
+        spin_lock_nested(&l3->list_lock, SINGLE_DEPTH_NESTING);
         if (l3->shared) {
                 struct array_cache *shared_array = l3->shared;
                 int max = shared_array->limit - shared_array->avail;
@@ -3141,14 +3178,14 @@ free_done:
  * Release an obj back to its cache. If the obj has a constructed state, it must
  * be in this state _before_ it is released. Called with disabled ints.
  */
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static void __cache_free(struct kmem_cache *cachep, void *objp, int nesting)
 {
         struct array_cache *ac = cpu_cache_get(cachep);
 
         check_irq_off();
         objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
-        if (cache_free_alien(cachep, objp))
+        if (cache_free_alien(cachep, objp, nesting))
                 return;
 
         if (likely(ac->avail < ac->limit)) {
@@ -3387,7 +3424,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
         BUG_ON(virt_to_cache(objp) != cachep);
 
         local_irq_save(flags);
-        __cache_free(cachep, objp);
+        __cache_free(cachep, objp, 0);
         local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kmem_cache_free);
@@ -3412,7 +3449,7 @@ void kfree(const void *objp)
         kfree_debugcheck(objp);
         c = virt_to_cache(objp);
         debug_check_no_locks_freed(objp, obj_size(c));
-        __cache_free(c, (void *)objp);
+        __cache_free(c, (void *)objp, 0);
         local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kfree);
```
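Two patterns recur in the slab changes above. First, the bootstrap code copies array_cache and kmem_list3 structures with memcpy() and now re-initializes the embedded spinlock afterwards: with lockdep enabled a spinlock carries per-instance debugging state that must not simply be byte-copied. A minimal sketch of that pattern (struct cache_node and relocate_node() are hypothetical names, not from this diff):

```c
#include <linux/spinlock.h>
#include <linux/string.h>

struct cache_node {
        spinlock_t list_lock;
        /* ... lists and counters ... */
};

/* Move a bootstrap cache_node into its final, dynamically allocated home. */
static void relocate_node(struct cache_node *dst, const struct cache_node *src)
{
        memcpy(dst, src, sizeof(*dst));
        /*
         * Do not assume that spinlocks can be initialized via memcpy:
         * give dst->list_lock its own, freshly initialized lock state.
         */
        spin_lock_init(&dst->list_lock);
}
```

Second, the free path grows an explicit nesting argument: slab_destroy() may free an off-slab descriptor through __cache_free() while a same-class array-cache or list lock is already held, so the inner acquisitions use spin_lock_nested() and free_block() drops l3->list_lock around slab_destroy() entirely.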
```diff
diff --git a/mm/swap_state.c b/mm/swap_state.c
index fccbd9bba77b..5f7cf2a4cb55 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -38,7 +38,7 @@ static struct backing_dev_info swap_backing_dev_info = {
 
 struct address_space swapper_space = {
         .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-        .tree_lock      = RW_LOCK_UNLOCKED,
+        .tree_lock      = __RW_LOCK_UNLOCKED(swapper_space.tree_lock),
         .a_ops          = &swap_aops,
         .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
         .backing_dev_info = &swap_backing_dev_info,
```
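The swapper_space change swaps the generic RW_LOCK_UNLOCKED initializer for __RW_LOCK_UNLOCKED(swapper_space.tree_lock). With lockdep, every statically initialized lock needs its own key, and the name-taking initializer provides one; the anonymous RW_LOCK_UNLOCKED would lump all such locks into a single class. A small sketch with invented names (for standalone locks the DEFINE_RWLOCK()/DEFINE_SPINLOCK() helpers are the usual spelling):

```c
#include <linux/spinlock.h>

/* Standalone lock: DEFINE_RWLOCK expands to the keyed initializer. */
static DEFINE_RWLOCK(example_lock);

/* Lock embedded in a statically initialized aggregate, as in swapper_space. */
struct example_space {
        rwlock_t tree_lock;
};

static struct example_space example_space = {
        .tree_lock = __RW_LOCK_UNLOCKED(example_space.tree_lock),
};
```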
```diff
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35f8553f893a..7b450798b458 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -330,6 +330,8 @@ void __vunmap(void *addr, int deallocate_pages)
                 return;
         }
 
+        debug_check_no_locks_freed(addr, area->size);
+
         if (deallocate_pages) {
                 int i;
 
```
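debug_check_no_locks_freed() is the lock-debugging hook that complains when memory being freed still contains a currently held lock; kfree() already calls it (see the mm/slab.c hunk above), and this hunk adds the same check to __vunmap() so vfree() is covered too. A hedged sketch of the kind of bug it catches (struct widget and broken_teardown() are invented for illustration):

```c
#include <linux/slab.h>
#include <linux/spinlock.h>

struct widget {
        spinlock_t lock;
        int value;
};

/*
 * Buggy teardown: the object is freed while its embedded lock is still
 * held. With lock debugging enabled, debug_check_no_locks_freed() in the
 * kfree()/vfree() paths reports this instead of leaving live lock state
 * inside freed memory.
 */
static void broken_teardown(struct widget *w)
{
        spin_lock(&w->lock);
        w->value = 0;
        kfree(w);       /* BUG: w->lock is held while w is freed */
        /* the matching spin_unlock() can no longer be done safely */
}
```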
```diff
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ff2ebe9458a3..5d4c4d02254d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1503,10 +1503,6 @@ module_init(kswapd_init)
  *
  * If non-zero call zone_reclaim when the number of free pages falls below
  * the watermarks.
- *
- * In the future we may add flags to the mode. However, the page allocator
- * should only have to check that zone_reclaim_mode != 0 before calling
- * zone_reclaim().
  */
 int zone_reclaim_mode __read_mostly;
 
@@ -1524,6 +1520,12 @@ int zone_reclaim_mode __read_mostly;
 #define ZONE_RECLAIM_PRIORITY 4
 
 /*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1590,18 +1592,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         int node_id;
 
         /*
-         * Do not reclaim if there are not enough reclaimable pages in this
-         * zone that would satify this allocations.
-         *
-         * All unmapped pagecache pages are reclaimable.
-         *
-         * Both counters may be temporarily off a bit so we use
-         * SWAP_CLUSTER_MAX as the boundary. It may also be good to
-         * leave a few frequently used unmapped pagecache pages around.
+         * Zone reclaim reclaims unmapped file backed pages.
+         *
+         * A small portion of unmapped file backed pages is needed for
+         * file I/O otherwise pages read by file I/O will be immediately
+         * thrown out if the zone is overallocated. So we do not reclaim
+         * if less than a specified percentage of the zone is used by
+         * unmapped file backed pages.
          */
         if (zone_page_state(zone, NR_FILE_PAGES) -
-            zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX)
+            zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_ratio)
                 return 0;
 
         /*
          * Avoid concurrent zone reclaims, do not reclaim in a zone that does
```
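The final hunk replaces the old fixed SWAP_CLUSTER_MAX cutoff with the per-zone threshold configured through the new sysctl: zone_reclaim() now bails out unless the zone's unmapped file-backed pagecache (NR_FILE_PAGES minus NR_FILE_MAPPED) exceeds zone->min_unmapped_ratio pages. A compact restatement of the gate (the helper name is invented for illustration, not part of the patch):

```c
#include <linux/mmzone.h>
#include <linux/vmstat.h>

/*
 * Illustrative helper: true when the zone holds enough unmapped
 * file-backed pages for zone_reclaim() to be worthwhile.
 */
static int zone_worth_reclaiming(struct zone *zone)
{
        unsigned long unmapped = zone_page_state(zone, NR_FILE_PAGES) -
                                 zone_page_state(zone, NR_FILE_MAPPED);

        return unmapped > zone->min_unmapped_ratio;
}
```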
