Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c        |   1
-rw-r--r--  mm/mempolicy.c      |  99
-rw-r--r--  mm/page-writeback.c |   7
-rw-r--r--  mm/page_alloc.c     |  17
-rw-r--r--  mm/rmap.c           |   2
-rw-r--r--  mm/slab.c           |  58
-rw-r--r--  mm/swap.c           |  26
-rw-r--r--  mm/swapfile.c       |  17
-rw-r--r--  mm/vmscan.c         | 146
9 files changed, 246 insertions, 127 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a965b6b35f26..44da3d476994 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  * ->private_lock (try_to_unmap_one)
  * ->tree_lock (try_to_unmap_one)
  * ->zone.lru_lock (follow_page->mark_page_accessed)
+ * ->zone.lru_lock (check_pte_range->isolate_lru_page)
  * ->private_lock (page_remove_rmap->set_page_dirty)
  * ->tree_lock (page_remove_rmap->set_page_dirty)
  * ->inode_lock (page_remove_rmap->set_page_dirty)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3171f884d245..73790188b0eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -185,8 +185,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 }
 
 static void gather_stats(struct page *, void *);
-static void migrate_page_add(struct vm_area_struct *vma,
-	struct page *page, struct list_head *pagelist, unsigned long flags);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+	unsigned long flags);
 
 /* Scan through pages checking if pages follow certain conditions. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -208,6 +208,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		page = vm_normal_page(vma, addr, *pte);
 		if (!page)
 			continue;
+		/*
+		 * The check for PageReserved here is important to avoid
+		 * handling zero pages and other pages that may have been
+		 * marked special by the system.
+		 *
+		 * If the PageReserved would not be checked here then f.e.
+		 * the location of the zero page could have an influence
+		 * on MPOL_MF_STRICT, zero pages would be counted for
+		 * the per node stats, and there would be useless attempts
+		 * to put zero pages on the migration list.
+		 */
 		if (PageReserved(page))
 			continue;
 		nid = page_to_nid(page);
@@ -216,11 +227,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 		if (flags & MPOL_MF_STATS)
 			gather_stats(page, private);
-		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-			spin_unlock(ptl);
-			migrate_page_add(vma, page, private, flags);
-			spin_lock(ptl);
-		}
+		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+			migrate_page_add(page, private, flags);
 		else
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -309,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
+	/* Clear the LRU lists so pages can be isolated */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+		lru_add_drain_all();
+
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
@@ -519,51 +531,15 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
  * page migration
  */
 
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
-		struct address_space *mapping)
-{
-	struct vm_area_struct *vma;
-	struct prio_tree_iter iter;
-	int rc = 1;
-
-	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
-		if (mm != vma->vm_mm) {
-			rc = 0;
-			goto out;
-		}
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
-		if (mm != vma->vm_mm) {
-			rc = 0;
-			goto out;
-		}
-out:
-	spin_unlock(&mapping->i_mmap_lock);
-	return rc;
-}
-
-/*
- * Add a page to be migrated to the pagelist
- */
-static void migrate_page_add(struct vm_area_struct *vma,
-		struct page *page, struct list_head *pagelist, unsigned long flags)
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+		unsigned long flags)
 {
 	/*
-	 * Avoid migrating a page that is shared by others and not writable.
+	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
-		mapping_writably_mapped(page->mapping) ||
-		single_mm_mapping(vma->vm_mm, page->mapping)) {
-		int rc = isolate_lru_page(page);
-
-		if (rc == 1)
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+		if (isolate_lru_page(page))
 			list_add(&page->lru, pagelist);
-		/*
-		 * If the isolate attempt was not successful then we just
-		 * encountered an unswappable page. Something must be wrong.
-		 */
-		WARN_ON(rc == 0);
 	}
 }
 
@@ -1000,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	return nid;
 }
 
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+	switch (policy->policy) {
+	case MPOL_INTERLEAVE:
+		return interleave_nodes(policy);
+
+	case MPOL_BIND:
+		/*
+		 * Follow bind policy behavior and start allocation at the
+		 * first node.
+		 */
+		return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+	case MPOL_PREFERRED:
+		if (policy->v.preferred_node >= 0)
+			return policy->v.preferred_node;
+		/* Fall through */
+
+	default:
+		return numa_node_id();
+	}
+}
+
 /* Do static interleaving for a VMA with known offset. */
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
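
The new slab_node() above reduces to a single dispatch on the policy mode: interleave rotates through the allowed nodes, bind starts at the first node of its zonelist, preferred returns the preferred node when one is set, and everything else falls back to the local node. Below is a minimal userspace sketch of that dispatch; the kernel structures (zonelists, the interleave cursor) are replaced with hypothetical stand-in fields, so it illustrates the logic rather than the kernel API.

/*
 * Userspace sketch of the node selection done by slab_node().
 * Stand-in types only; not the kernel's struct mempolicy.
 */
#include <stdio.h>

enum { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE };

struct mempolicy_stub {
	int policy;
	int preferred_node;	/* used by MPOL_PREFERRED */
	int first_bind_node;	/* stands in for v.zonelist->zones[0] */
	int next_interleave;	/* stands in for interleave_nodes() state */
};

static int local_node = 0;	/* stands in for numa_node_id() */

static int pick_slab_node(struct mempolicy_stub *pol)
{
	switch (pol->policy) {
	case MPOL_INTERLEAVE:
		return pol->next_interleave++;	/* round-robin stand-in */
	case MPOL_BIND:
		return pol->first_bind_node;	/* first node of the bind zonelist */
	case MPOL_PREFERRED:
		if (pol->preferred_node >= 0)
			return pol->preferred_node;
		/* fall through */
	default:
		return local_node;
	}
}

int main(void)
{
	struct mempolicy_stub interleave = { MPOL_INTERLEAVE, -1, 0, 2 };
	struct mempolicy_stub bind = { MPOL_BIND, -1, 1, 0 };

	printf("interleave -> node %d\n", pick_slab_node(&interleave));
	printf("bind       -> node %d\n", pick_slab_node(&bind));
	return 0;
}

In the kernel the chosen node feeds ____cache_alloc() in mm/slab.c (see the CONFIG_NUMA hunk further down), which redirects the allocation with __cache_alloc_node() when the policy points away from the local node.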
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5240e426c1f7..945559fb63d2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,7 @@
 static long ratelimit_pages = 32;
 
 static long total_pages;	/* The total number of pages in the machine. */
-static int dirty_exceeded;	/* Dirty mem may be over limit */
+static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
 
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
@@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping)
 		if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
 			break;
 
-		dirty_exceeded = 1;
+		if (!dirty_exceeded)
+			dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		blk_congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
+	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
 		dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
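
The two balance_dirty_pages() changes above serve one goal: dirty_exceeded is shared state consulted on the write-out path, so it gets its own cache line and is only stored to when its value actually changes, keeping the steady state a pure read. A minimal sketch of that test-before-store pattern, in plain C with the kernel annotations omitted:

#include <stdbool.h>

/* Shared flag: read frequently, written only on transitions. */
static bool dirty_exceeded;

static void note_over_limit(void)
{
	/* Store only on a transition, so the common case never dirties the line. */
	if (!dirty_exceeded)
		dirty_exceeded = 1;
}

static void note_under_limit(void)
{
	if (dirty_exceeded)
		dirty_exceeded = 0;
}

int main(void)
{
	note_over_limit();
	note_under_limit();
	return dirty_exceeded;	/* 0: back under the limit */
}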
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d1..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 				mark = (*z)->pages_high;
 			if (!zone_watermark_ok(*z, order, mark,
 				    classzone_idx, alloc_flags))
-				continue;
+				if (!zone_reclaim_mode ||
+					!zone_reclaim(*z, gfp_mask, order))
+					continue;
 		}
 
 		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	prev_node = local_node;
 	nodes_clear(used_mask);
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+		int distance = node_distance(local_node, node);
+
+		/*
+		 * If another node is sufficiently far away then it is better
+		 * to reclaim pages in a zone before going off node.
+		 */
+		if (distance > RECLAIM_DISTANCE)
+			zone_reclaim_mode = 1;
+
 		/*
 		 * We don't want to pressure a particular node.
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-		if (node_distance(local_node, node) !=
-			node_distance(local_node, prev_node))
+
+		if (distance != node_distance(local_node, prev_node))
 			node_load[node] += load;
 		prev_node = node;
 		load--;
diff --git a/mm/rmap.c b/mm/rmap.c
index dfbb89f99a15..d85a99d28c03 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -33,7 +33,7 @@
  * mapping->i_mmap_lock
  * anon_vma->lock
  * mm->page_table_lock or pte_lock
- * zone->lru_lock (in mark_page_accessed)
+ * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
  * swap_lock (in swap_duplicate, swap_info_get)
  * mmlist_lock (in mmput, drain_mmlist and others)
  * mapping->private_lock (in __set_page_dirty_buffers)
diff --git a/mm/slab.c b/mm/slab.c
index 9374293a3012..6f8495e2185b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
  * Further notes from the original documentation:
  *
  * 11 April '97. Started multi-threading - markhe
- * The global cache-chain is protected by the semaphore 'cache_chain_sem'.
+ * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
  * The sem is only needed when accessing/extending the cache-chain, which
  * can never happen inside an interrupt (kmem_cache_create(),
  * kmem_cache_shrink() and kmem_cache_reap()).
@@ -103,6 +103,8 @@
 #include <linux/rcupdate.h>
 #include <linux/string.h>
 #include <linux/nodemask.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -631,7 +633,7 @@ static kmem_cache_t cache_cache = {
 };
 
 /* Guard access to the cache-chain. */
-static struct semaphore cache_chain_sem;
+static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
 /*
@@ -772,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 }
 
 #ifdef CONFIG_NUMA
+static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
+
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
 	struct array_cache **ac_ptr;
@@ -857,7 +861,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 		/* we need to do this right in the beginning since
 		 * alloc_arraycache's are going to use this list.
 		 * kmalloc_node allows us to add the slab to the right
@@ -912,7 +916,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 				l3->shared = nc;
 			}
 		}
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 		break;
 	case CPU_ONLINE:
 		start_cpu_timer(cpu);
@@ -921,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 	case CPU_DEAD:
 		/* fall thru */
 	case CPU_UP_CANCELED:
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
@@ -973,13 +977,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			spin_unlock_irq(&cachep->spinlock);
 			kfree(nc);
 		}
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 		break;
 #endif
 	}
 	return NOTIFY_OK;
 bad:
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	return NOTIFY_BAD;
 }
985 989
@@ -1047,7 +1051,6 @@ void __init kmem_cache_init(void)
 	 */
 
 	/* 1) create the cache_cache */
-	init_MUTEX(&cache_chain_sem);
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
@@ -1168,10 +1171,10 @@ void __init kmem_cache_init(void)
 	/* 6) resize the head arrays to their final sizes */
 	{
 		kmem_cache_t *cachep;
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
 			enable_cpucache(cachep);
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 	}
 
 	/* Done! */
@@ -1590,7 +1593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG();
 	}
 
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
 		kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
@@ -1856,7 +1859,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 			name);
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2044,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t *cachep)
 	lock_cpu_hotplug();
 
 	/* Find the cache in the chain of caches. */
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 */
 	list_del(&cachep->next);
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 
 	if (__cache_shrink(cachep)) {
 		slab_error(cachep, "Can't free all objects");
-		down(&cache_chain_sem);
+		mutex_lock(&cache_chain_mutex);
 		list_add(&cachep->next, &cache_chain);
-		up(&cache_chain_sem);
+		mutex_unlock(&cache_chain_mutex);
 		unlock_cpu_hotplug();
 		return 1;
 	}
@@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 	void *objp;
 	struct array_cache *ac;
 
+#ifdef CONFIG_NUMA
+	if (unlikely(current->mempolicy && !in_interrupt())) {
+		int nid = slab_node(current->mempolicy);
+
+		if (nid != numa_node_id())
+			return __cache_alloc_node(cachep, flags, nid);
+	}
+#endif
+
 	check_irq_off();
 	ac = ac_data(cachep);
 	if (likely(ac->avail)) {
@@ -3314,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
  * - clear the per-cpu caches for this CPU.
  * - return freeable pages to the main free memory pool.
  *
- * If we cannot acquire the cache chain semaphore then just give up - we'll
+ * If we cannot acquire the cache chain mutex then just give up - we'll
  * try again on the next iteration.
  */
 static void cache_reap(void *unused)
@@ -3322,7 +3334,7 @@ static void cache_reap(void *unused)
 	struct list_head *walk;
 	struct kmem_list3 *l3;
 
-	if (down_trylock(&cache_chain_sem)) {
+	if (!mutex_trylock(&cache_chain_mutex)) {
 		/* Give up. Setup the next iteration. */
 		schedule_delayed_work(&__get_cpu_var(reap_work),
 			REAPTIMEOUT_CPUC);
@@ -3393,7 +3405,7 @@ static void cache_reap(void *unused)
 		cond_resched();
 	}
 	check_irq_on();
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	drain_remote_pages();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
@@ -3429,7 +3441,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 	loff_t n = *pos;
 	struct list_head *p;
 
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 	if (!n)
 		print_slabinfo_header(m);
 	p = cache_chain.next;
@@ -3451,7 +3463,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 
 static void s_stop(struct seq_file *m, void *p)
 {
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 }
 
 static int s_show(struct seq_file *m, void *p)
@@ -3603,7 +3615,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 		return -EINVAL;
 
 	/* Find the cache in the chain of caches. */
-	down(&cache_chain_sem);
+	mutex_lock(&cache_chain_mutex);
 	res = -EINVAL;
 	list_for_each(p, &cache_chain) {
 		kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
@@ -3620,7 +3632,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 			break;
 		}
 	}
-	up(&cache_chain_sem);
+	mutex_unlock(&cache_chain_mutex);
 	if (res >= 0)
 		res = count;
 	return res;
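
Every down()/up() pair on cache_chain_sem above becomes mutex_lock()/mutex_unlock() on cache_chain_mutex; the one place the return value matters is cache_reap(), because down_trylock() returns 0 on success while mutex_trylock() returns nonzero on success, hence the added '!'. A rough userspace analogue of the resulting locking pattern, written with pthreads rather than the kernel API (pthread_mutex_trylock() returns 0 on success, so the polarity differs yet again):

/*
 * Sketch only: walkers take the chain lock, the periodic reaper merely
 * trylocks and backs off if someone else holds it.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;

static void walk_chain(void)
{
	pthread_mutex_lock(&chain_lock);
	/* ... iterate the cache chain ... */
	pthread_mutex_unlock(&chain_lock);
}

static void reap(void)
{
	if (pthread_mutex_trylock(&chain_lock) != 0) {
		/* Contended: give up and try again on the next timer tick. */
		return;
	}
	/* ... drain per-CPU caches, free spare slabs ... */
	pthread_mutex_unlock(&chain_lock);
}

int main(void)
{
	walk_chain();
	reap();
	return 0;
}

The design point is unchanged by the conversion: the reaper must never sleep waiting for the chain lock, it simply gives up and reschedules itself.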
diff --git a/mm/swap.c b/mm/swap.c
index cbb48e721ab9..bc2442a7b0ee 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -174,6 +174,32 @@ void lru_add_drain(void)
 	put_cpu();
 }
 
+#ifdef CONFIG_NUMA
+static void lru_add_drain_per_cpu(void *dummy)
+{
+	lru_add_drain();
+}
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
+}
+
+#else
+
+/*
+ * Returns 0 for success
+ */
+int lru_add_drain_all(void)
+{
+	lru_add_drain();
+	return 0;
+}
+#endif
+
 /*
  * This path almost never happens for VM activity - pages are normally
  * freed via pagevecs.  But it gets used by networking.
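
lru_add_drain_all() exists because pages recently added to the LRU may still sit in per-CPU pagevecs, where isolate_lru_page() cannot see them; each pagevec belongs to one CPU, so the NUMA version runs lru_add_drain() on every CPU via schedule_on_each_cpu() rather than reaching into remote per-CPU state. A small userspace sketch of the idea, with stand-in types (PAGEVEC_SIZE and the array are illustrative, not the kernel's definitions):

#include <stdio.h>

#define NR_CPUS 4
#define PAGEVEC_SIZE 14

struct pagevec_stub {
	int nr;
	void *pages[PAGEVEC_SIZE];
};

static struct pagevec_stub lru_add_pvecs[NR_CPUS];	/* one per CPU */

/* What lru_add_drain() does, for the CPU it happens to run on. */
static void drain_local(int cpu)
{
	lru_add_pvecs[cpu].nr = 0;	/* pages moved onto the zone LRU lists */
}

/*
 * In the kernel this cannot be a plain loop: CPU i's pagevec must be
 * drained on CPU i, hence schedule_on_each_cpu(lru_add_drain_per_cpu).
 */
static void drain_all(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		drain_local(cpu);
}

int main(void)
{
	lru_add_pvecs[2].nr = 5;
	drain_all();
	printf("cpu2 pagevec now holds %d pages\n", lru_add_pvecs[2].nr);
	return 0;
}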
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 957fef43fa60..f1e69c30d203 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,7 @@
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
+#include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
 
@@ -46,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1};
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
-static DECLARE_MUTEX(swapon_sem);
+static DEFINE_MUTEX(swapon_mutex);
 
 /*
  * We need this because the bdev->unplug_fn can sleep and we cannot
  * hold swap_lock while calling the unplug_fn. And swap_lock
- * cannot be turned into a semaphore.
+ * cannot be turned into a mutex.
  */
 static DECLARE_RWSEM(swap_unplug_sem);
 
@@ -1161,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	up_write(&swap_unplug_sem);
 
 	destroy_swap_extents(p);
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	drain_mmlist();
 
@@ -1180,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	p->swap_map = NULL;
 	p->flags = 0;
 	spin_unlock(&swap_lock);
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
@@ -1209,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 	int i;
 	loff_t l = *pos;
 
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 
 	for (i = 0; i < nr_swapfiles; i++, ptr++) {
 		if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
@@ -1238,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
 
 static void swap_stop(struct seq_file *swap, void *v)
 {
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 }
 
 static int swap_show(struct seq_file *swap, void *v)
@@ -1540,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		goto bad_swap;
 	}
 
-	down(&swapon_sem);
+	mutex_lock(&swapon_mutex);
 	spin_lock(&swap_lock);
 	p->flags = SWP_ACTIVE;
 	nr_swap_pages += nr_good_pages;
@@ -1566,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 		swap_info[prev].next = p - swap_info;
 	}
 	spin_unlock(&swap_lock);
-	up(&swapon_sem);
+	mutex_unlock(&swapon_mutex);
 	error = 0;
 	goto out;
 bad_swap:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf903b2d198f..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -71,6 +71,9 @@ struct scan_control {
 
 	int may_writepage;
 
+	/* Can pages be swapped as part of reclaim? */
+	int may_swap;
+
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
 	 * In this context, it doesn't matter that we scan the
@@ -458,6 +461,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		 * Try to allocate it some swap space here.
 		 */
 		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!sc->may_swap)
+				goto keep_locked;
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
 		}
@@ -586,7 +591,7 @@ static inline void move_to_lru(struct page *page)
 }
 
 /*
- * Add isolated pages on the list back to the LRU
+ * Add isolated pages on the list back to the LRU.
  *
  * returns the number of pages put back.
  */
@@ -760,46 +765,33 @@ next:
 	return nr_failed + retry;
 }
 
-static void lru_add_drain_per_cpu(void *dummy)
-{
-	lru_add_drain();
-}
-
 /*
  * Isolate one page from the LRU lists and put it on the
- * indicated list. Do necessary cache draining if the
- * page is not on the LRU lists yet.
+ * indicated list with elevated refcount.
  *
  * Result:
  *  0 = page not on LRU list
  *  1 = page removed from LRU list and added to the specified list.
- * -ENOENT = page is being freed elsewhere.
  */
 int isolate_lru_page(struct page *page)
 {
-	int rc = 0;
-	struct zone *zone = page_zone(page);
+	int ret = 0;
 
-redo:
-	spin_lock_irq(&zone->lru_lock);
-	rc = __isolate_lru_page(page);
-	if (rc == 1) {
-		if (PageActive(page))
-			del_page_from_active_list(zone, page);
-		else
-			del_page_from_inactive_list(zone, page);
-	}
-	spin_unlock_irq(&zone->lru_lock);
-	if (rc == 0) {
-		/*
-		 * Maybe this page is still waiting for a cpu to drain it
-		 * from one of the lru lists?
-		 */
-		rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
-		if (rc == 0 && PageLRU(page))
-			goto redo;
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+		spin_lock_irq(&zone->lru_lock);
+		if (TestClearPageLRU(page)) {
+			ret = 1;
+			get_page(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+		}
+		spin_unlock_irq(&zone->lru_lock);
 	}
-	return rc;
+
+	return ret;
 }
 #endif
 
@@ -831,18 +823,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		switch (__isolate_lru_page(page)) {
-		case 1:
-			/* Succeeded to isolate page */
-			list_move(&page->lru, dst);
-			nr_taken++;
-			break;
-		case -ENOENT:
-			/* Not possible to isolate */
-			list_move(&page->lru, src);
-			break;
-		default:
+		if (!TestClearPageLRU(page))
 			BUG();
+		list_del(&page->lru);
+		if (get_page_testone(page)) {
+			/*
+			 * It is being freed elsewhere
+			 */
+			__put_page(page);
+			SetPageLRU(page);
+			list_add(&page->lru, src);
+			continue;
+		} else {
+			list_add(&page->lru, dst);
+			nr_taken++;
 		}
 	}
 
@@ -1177,6 +1171,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 
 	sc.gfp_mask = gfp_mask;
 	sc.may_writepage = 0;
+	sc.may_swap = 1;
 
 	inc_page_state(allocstall);
 
@@ -1279,6 +1274,7 @@ loop_again:
 	total_reclaimed = 0;
 	sc.gfp_mask = GFP_KERNEL;
 	sc.may_writepage = 0;
+	sc.may_swap = 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1576,3 +1572,71 @@ static int __init kswapd_init(void)
 }
 
 module_init(kswapd_init)
+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ *
+ * In the future we may add flags to the mode. However, the page allocator
+ * should only have to check that zone_reclaim_mode != 0 before calling
+ * zone_reclaim().
+ */
+int zone_reclaim_mode __read_mostly;
+
+/*
+ * Mininum time between zone reclaim scans
+ */
+#define ZONE_RECLAIM_INTERVAL HZ/2
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	int nr_pages = 1 << order;
+	struct task_struct *p = current;
+	struct reclaim_state reclaim_state;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = 0,
+		.may_swap = 0,
+		.nr_mapped = read_page_state(nr_mapped),
+		.nr_scanned = 0,
+		.nr_reclaimed = 0,
+		.priority = 0
+	};
+
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->zone_pgdat->node_id != numa_node_id() ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0)
+			return 0;
+
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+			return 0;
+
+	disable_swap_token();
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	cond_resched();
+	p->flags |= PF_MEMALLOC;
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+	shrink_zone(zone, &sc);
+	p->reclaim_state = NULL;
+	current->flags &= ~PF_MEMALLOC;
+
+	if (sc.nr_reclaimed == 0)
+		zone->last_unsuccessful_zone_reclaim = jiffies;
+
+	return sc.nr_reclaimed > nr_pages;
+}
+#endif
+
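
zone_reclaim() is deliberately throttled: it only runs for __GFP_WAIT allocations on the local node, skips zones marked all_unreclaimable or already being reclaimed, and after a failed attempt it leaves the zone alone for ZONE_RECLAIM_INTERVAL (half a second). A compact userspace sketch of that last rule, the time-based back-off, using a monotonic clock in place of jiffies (the struct and field names are stand-ins, not the kernel's):

#include <stdbool.h>
#include <time.h>

#define RECLAIM_INTERVAL_NS (500 * 1000 * 1000LL)	/* roughly HZ/2 */

struct zone_stub {
	long long last_unsuccessful_reclaim_ns;
};

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static bool may_try_reclaim(struct zone_stub *z)
{
	return now_ns() - z->last_unsuccessful_reclaim_ns >= RECLAIM_INTERVAL_NS;
}

static int do_zone_reclaim(struct zone_stub *z)
{
	int reclaimed;

	if (!may_try_reclaim(z))
		return 0;
	reclaimed = 0;		/* ... shrink_zone() would run here ... */
	if (reclaimed == 0)
		z->last_unsuccessful_reclaim_ns = now_ns();
	return reclaimed;
}

int main(void)
{
	struct zone_stub z = { 0 };

	do_zone_reclaim(&z);		/* fails, records the timestamp */
	return do_zone_reclaim(&z);	/* throttled: returns 0 immediately */
}

In the page allocator (see the get_page_from_freelist() hunk above) this keeps a zone that cannot be reclaimed from being scanned over and over before every off-node fallback.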