Diffstat (limited to 'mm')
-rw-r--r--   mm/filemap.c        |   1
-rw-r--r--   mm/mempolicy.c      |  99
-rw-r--r--   mm/page-writeback.c |   7
-rw-r--r--   mm/page_alloc.c     |  17
-rw-r--r--   mm/rmap.c           |   2
-rw-r--r--   mm/slab.c           |  58
-rw-r--r--   mm/swap.c           |  26
-rw-r--r--   mm/swapfile.c       |  17
-rw-r--r--   mm/vmscan.c         | 146
9 files changed, 246 insertions, 127 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a965b6b35f26..44da3d476994 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -94,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
94 | * ->private_lock (try_to_unmap_one) | 94 | * ->private_lock (try_to_unmap_one) |
95 | * ->tree_lock (try_to_unmap_one) | 95 | * ->tree_lock (try_to_unmap_one) |
96 | * ->zone.lru_lock (follow_page->mark_page_accessed) | 96 | * ->zone.lru_lock (follow_page->mark_page_accessed) |
97 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | ||
97 | * ->private_lock (page_remove_rmap->set_page_dirty) | 98 | * ->private_lock (page_remove_rmap->set_page_dirty) |
98 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 99 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
99 | * ->inode_lock (page_remove_rmap->set_page_dirty) | 100 | * ->inode_lock (page_remove_rmap->set_page_dirty) |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3171f884d245..73790188b0eb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -185,8 +185,8 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
185 | } | 185 | } |
186 | 186 | ||
187 | static void gather_stats(struct page *, void *); | 187 | static void gather_stats(struct page *, void *); |
188 | static void migrate_page_add(struct vm_area_struct *vma, | 188 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
189 | struct page *page, struct list_head *pagelist, unsigned long flags); | 189 | unsigned long flags); |
190 | 190 | ||
191 | /* Scan through pages checking if pages follow certain conditions. */ | 191 | /* Scan through pages checking if pages follow certain conditions. */ |
192 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 192 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
@@ -208,6 +208,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
208 | page = vm_normal_page(vma, addr, *pte); | 208 | page = vm_normal_page(vma, addr, *pte); |
209 | if (!page) | 209 | if (!page) |
210 | continue; | 210 | continue; |
211 | /* | ||
212 | * The check for PageReserved here is important to avoid | ||
213 | * handling zero pages and other pages that may have been | ||
214 | * marked special by the system. | ||
215 | * | ||
216 | * If the PageReserved would not be checked here then f.e. | ||
217 | * the location of the zero page could have an influence | ||
218 | * on MPOL_MF_STRICT, zero pages would be counted for | ||
219 | * the per node stats, and there would be useless attempts | ||
220 | * to put zero pages on the migration list. | ||
221 | */ | ||
211 | if (PageReserved(page)) | 222 | if (PageReserved(page)) |
212 | continue; | 223 | continue; |
213 | nid = page_to_nid(page); | 224 | nid = page_to_nid(page); |
@@ -216,11 +227,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
216 | 227 | ||
217 | if (flags & MPOL_MF_STATS) | 228 | if (flags & MPOL_MF_STATS) |
218 | gather_stats(page, private); | 229 | gather_stats(page, private); |
219 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 230 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
220 | spin_unlock(ptl); | 231 | migrate_page_add(page, private, flags); |
221 | migrate_page_add(vma, page, private, flags); | ||
222 | spin_lock(ptl); | ||
223 | } | ||
224 | else | 232 | else |
225 | break; | 233 | break; |
226 | } while (pte++, addr += PAGE_SIZE, addr != end); | 234 | } while (pte++, addr += PAGE_SIZE, addr != end); |
@@ -309,6 +317,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
309 | int err; | 317 | int err; |
310 | struct vm_area_struct *first, *vma, *prev; | 318 | struct vm_area_struct *first, *vma, *prev; |
311 | 319 | ||
320 | /* Clear the LRU lists so pages can be isolated */ | ||
321 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
322 | lru_add_drain_all(); | ||
323 | |||
312 | first = find_vma(mm, start); | 324 | first = find_vma(mm, start); |
313 | if (!first) | 325 | if (!first) |
314 | return ERR_PTR(-EFAULT); | 326 | return ERR_PTR(-EFAULT); |
@@ -519,51 +531,15 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
519 | * page migration | 531 | * page migration |
520 | */ | 532 | */ |
521 | 533 | ||
522 | /* Check if we are the only process mapping the page in question */ | 534 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
523 | static inline int single_mm_mapping(struct mm_struct *mm, | 535 | unsigned long flags) |
524 | struct address_space *mapping) | ||
525 | { | ||
526 | struct vm_area_struct *vma; | ||
527 | struct prio_tree_iter iter; | ||
528 | int rc = 1; | ||
529 | |||
530 | spin_lock(&mapping->i_mmap_lock); | ||
531 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
532 | if (mm != vma->vm_mm) { | ||
533 | rc = 0; | ||
534 | goto out; | ||
535 | } | ||
536 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
537 | if (mm != vma->vm_mm) { | ||
538 | rc = 0; | ||
539 | goto out; | ||
540 | } | ||
541 | out: | ||
542 | spin_unlock(&mapping->i_mmap_lock); | ||
543 | return rc; | ||
544 | } | ||
545 | |||
546 | /* | ||
547 | * Add a page to be migrated to the pagelist | ||
548 | */ | ||
549 | static void migrate_page_add(struct vm_area_struct *vma, | ||
550 | struct page *page, struct list_head *pagelist, unsigned long flags) | ||
551 | { | 536 | { |
552 | /* | 537 | /* |
553 | * Avoid migrating a page that is shared by others and not writable. | 538 | * Avoid migrating a page that is shared with others. |
554 | */ | 539 | */ |
555 | if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || | 540 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
556 | mapping_writably_mapped(page->mapping) || | 541 | if (isolate_lru_page(page)) |
557 | single_mm_mapping(vma->vm_mm, page->mapping)) { | ||
558 | int rc = isolate_lru_page(page); | ||
559 | |||
560 | if (rc == 1) | ||
561 | list_add(&page->lru, pagelist); | 542 | list_add(&page->lru, pagelist); |
562 | /* | ||
563 | * If the isolate attempt was not successful then we just | ||
564 | * encountered an unswappable page. Something must be wrong. | ||
565 | */ | ||
566 | WARN_ON(rc == 0); | ||
567 | } | 543 | } |
568 | } | 544 | } |
569 | 545 | ||
@@ -1000,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1000 | return nid; | 976 | return nid; |
1001 | } | 977 | } |
1002 | 978 | ||
979 | /* | ||
980 | * Depending on the memory policy provide a node from which to allocate the | ||
981 | * next slab entry. | ||
982 | */ | ||
983 | unsigned slab_node(struct mempolicy *policy) | ||
984 | { | ||
985 | switch (policy->policy) { | ||
986 | case MPOL_INTERLEAVE: | ||
987 | return interleave_nodes(policy); | ||
988 | |||
989 | case MPOL_BIND: | ||
990 | /* | ||
991 | * Follow bind policy behavior and start allocation at the | ||
992 | * first node. | ||
993 | */ | ||
994 | return policy->v.zonelist->zones[0]->zone_pgdat->node_id; | ||
995 | |||
996 | case MPOL_PREFERRED: | ||
997 | if (policy->v.preferred_node >= 0) | ||
998 | return policy->v.preferred_node; | ||
999 | /* Fall through */ | ||
1000 | |||
1001 | default: | ||
1002 | return numa_node_id(); | ||
1003 | } | ||
1004 | } | ||
1005 | |||
1003 | /* Do static interleaving for a VMA with known offset. */ | 1006 | /* Do static interleaving for a VMA with known offset. */ |
1004 | static unsigned offset_il_node(struct mempolicy *pol, | 1007 | static unsigned offset_il_node(struct mempolicy *pol, |
1005 | struct vm_area_struct *vma, unsigned long off) | 1008 | struct vm_area_struct *vma, unsigned long off) |
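The new slab_node() helper above picks an allocation node from the calling task's memory policy: MPOL_INTERLEAVE rotates through the allowed nodes, MPOL_BIND starts at the first bound node, MPOL_PREFERRED returns the preferred node when one is set, and everything else falls back to the local node. A rough userspace model of that dispatch follows; the struct, the round-robin cursor and local_node() are illustrative stand-ins, not kernel types.

#include <stdio.h>

enum mpol { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE };

struct mempolicy_model {
	enum mpol mode;
	int preferred_node;	/* -1 means "use the local node" */
	int bound_nodes[4];	/* stand-in for the MPOL_BIND zonelist */
	int nbound;
	int il_next;		/* round-robin cursor, like il_next */
};

static int local_node(void)
{
	return 0;		/* models numa_node_id() */
}

/* Models the dispatch in slab_node(): pick a node for the next slab. */
static int model_slab_node(struct mempolicy_model *pol)
{
	switch (pol->mode) {
	case MPOL_INTERLEAVE: {
		int nid = pol->bound_nodes[pol->il_next];

		pol->il_next = (pol->il_next + 1) % pol->nbound;
		return nid;
	}
	case MPOL_BIND:
		/* Start at the first node of the bound list. */
		return pol->bound_nodes[0];
	case MPOL_PREFERRED:
		if (pol->preferred_node >= 0)
			return pol->preferred_node;
		/* fall through */
	default:
		return local_node();
	}
}

int main(void)
{
	struct mempolicy_model il = {
		.mode = MPOL_INTERLEAVE,
		.bound_nodes = { 1, 2, 3 },
		.nbound = 3,
	};
	int i;

	for (i = 0; i < 5; i++)
		printf("interleave pick %d: node %d\n", i, model_slab_node(&il));
	return 0;
}

Run on its own, the interleave case prints nodes 1, 2, 3, 1, 2, mirroring how interleave_nodes() advances its cursor on each successive allocation.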
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5240e426c1f7..945559fb63d2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,7 @@ | |||
46 | static long ratelimit_pages = 32; | 46 | static long ratelimit_pages = 32; |
47 | 47 | ||
48 | static long total_pages; /* The total number of pages in the machine. */ | 48 | static long total_pages; /* The total number of pages in the machine. */ |
49 | static int dirty_exceeded; /* Dirty mem may be over limit */ | 49 | static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * When balance_dirty_pages decides that the caller needs to perform some | 52 | * When balance_dirty_pages decides that the caller needs to perform some |
@@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
212 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | 212 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) |
213 | break; | 213 | break; |
214 | 214 | ||
215 | dirty_exceeded = 1; | 215 | if (!dirty_exceeded) |
216 | dirty_exceeded = 1; | ||
216 | 217 | ||
217 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 218 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
218 | * Unstable writes are a feature of certain networked | 219 | * Unstable writes are a feature of certain networked |
@@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
234 | blk_congestion_wait(WRITE, HZ/10); | 235 | blk_congestion_wait(WRITE, HZ/10); |
235 | } | 236 | } |
236 | 237 | ||
237 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | 238 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) |
238 | dirty_exceeded = 0; | 239 | dirty_exceeded = 0; |
239 | 240 | ||
240 | if (writeback_in_progress(bdi)) | 241 | if (writeback_in_progress(bdi)) |
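These hunks stop balance_dirty_pages() from rewriting dirty_exceeded on every pass and give the flag its own cacheline, so a CPU that finds the flag already at the right value no longer dirties the shared cacheline again. A minimal C11 sketch of the same test-before-store idea; the flag name and limit are invented for the example.

#include <stdio.h>

/* Give the flag its own cacheline, like __cacheline_aligned_in_smp. */
static _Alignas(64) int over_limit;

/*
 * Models the balance_dirty_pages() change: only store to the shared
 * flag when its value actually has to change, so a flag that is
 * already set is not written again on every iteration.
 */
static void note_usage(long used, long limit)
{
	if (used > limit) {
		if (!over_limit)
			over_limit = 1;
	} else {
		if (over_limit)
			over_limit = 0;
	}
}

int main(void)
{
	note_usage(150, 100);
	printf("over_limit=%d\n", over_limit);
	note_usage(50, 100);
	printf("over_limit=%d\n", over_limit);
	return 0;
}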
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2e29743a8d1..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
878 | mark = (*z)->pages_high; | 878 | mark = (*z)->pages_high; |
879 | if (!zone_watermark_ok(*z, order, mark, | 879 | if (!zone_watermark_ok(*z, order, mark, |
880 | classzone_idx, alloc_flags)) | 880 | classzone_idx, alloc_flags)) |
881 | continue; | 881 | if (!zone_reclaim_mode || |
882 | !zone_reclaim(*z, gfp_mask, order)) | ||
883 | continue; | ||
882 | } | 884 | } |
883 | 885 | ||
884 | page = buffered_rmqueue(zonelist, *z, order, gfp_mask); | 886 | page = buffered_rmqueue(zonelist, *z, order, gfp_mask); |
@@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat) | |||
1595 | prev_node = local_node; | 1597 | prev_node = local_node; |
1596 | nodes_clear(used_mask); | 1598 | nodes_clear(used_mask); |
1597 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 1599 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
1600 | int distance = node_distance(local_node, node); | ||
1601 | |||
1602 | /* | ||
1603 | * If another node is sufficiently far away then it is better | ||
1604 | * to reclaim pages in a zone before going off node. | ||
1605 | */ | ||
1606 | if (distance > RECLAIM_DISTANCE) | ||
1607 | zone_reclaim_mode = 1; | ||
1608 | |||
1598 | /* | 1609 | /* |
1599 | * We don't want to pressure a particular node. | 1610 | * We don't want to pressure a particular node. |
1600 | * So adding penalty to the first node in same | 1611 | * So adding penalty to the first node in same |
1601 | * distance group to make it round-robin. | 1612 | * distance group to make it round-robin. |
1602 | */ | 1613 | */ |
1603 | if (node_distance(local_node, node) != | 1614 | |
1604 | node_distance(local_node, prev_node)) | 1615 | if (distance != node_distance(local_node, prev_node)) |
1605 | node_load[node] += load; | 1616 | node_load[node] += load; |
1606 | prev_node = node; | 1617 | prev_node = node; |
1607 | load--; | 1618 | load--; |
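build_zonelists() now switches zone_reclaim_mode on as soon as some node sits farther away than RECLAIM_DISTANCE, and get_page_from_freelist() then tries zone_reclaim() on a zone that is below its watermark before spilling over to the next zone. A compact model of those two decisions; the distance table, threshold and helpers are illustrative stand-ins for the kernel ones.

#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_DISTANCE 20	/* illustrative threshold */

/* Toy distance table: any remote node is "far" from node 0. */
static int node_distance(int a, int b)
{
	return a == b ? 10 : 40;
}

/* Decided once while building the zonelists: reclaim locally before
 * falling back to nodes farther away than RECLAIM_DISTANCE. */
static bool want_zone_reclaim(int local, int nnodes)
{
	int n;

	for (n = 0; n < nnodes; n++)
		if (node_distance(local, n) > RECLAIM_DISTANCE)
			return true;
	return false;
}

/* Models the allocator fast path: a zone under its watermark is only
 * skipped if zone reclaim is off or reclaim could not free anything. */
static bool use_zone(bool watermark_ok, bool reclaim_mode, bool (*reclaim)(void))
{
	if (watermark_ok)
		return true;
	return reclaim_mode && reclaim();
}

static bool fake_reclaim(void)
{
	return true;		/* pretend reclaim freed enough pages */
}

int main(void)
{
	bool mode = want_zone_reclaim(0, 2);

	printf("zone_reclaim_mode=%d\n", mode);
	printf("use low zone: %d\n", use_zone(false, mode, fake_reclaim));
	return 0;
}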
diff --git a/mm/rmap.c b/mm/rmap.c
@@ -33,7 +33,7 @@
33 | * mapping->i_mmap_lock | 33 | * mapping->i_mmap_lock |
34 | * anon_vma->lock | 34 | * anon_vma->lock |
35 | * mm->page_table_lock or pte_lock | 35 | * mm->page_table_lock or pte_lock |
36 | * zone->lru_lock (in mark_page_accessed) | 36 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
37 | * swap_lock (in swap_duplicate, swap_info_get) | 37 | * swap_lock (in swap_duplicate, swap_info_get) |
38 | * mmlist_lock (in mmput, drain_mmlist and others) | 38 | * mmlist_lock (in mmput, drain_mmlist and others) |
39 | * mapping->private_lock (in __set_page_dirty_buffers) | 39 | * mapping->private_lock (in __set_page_dirty_buffers) |
diff --git a/mm/slab.c b/mm/slab.c
@@ -68,7 +68,7 @@
68 | * Further notes from the original documentation: | 68 | * Further notes from the original documentation: |
69 | * | 69 | * |
70 | * 11 April '97. Started multi-threading - markhe | 70 | * 11 April '97. Started multi-threading - markhe |
71 | * The global cache-chain is protected by the semaphore 'cache_chain_sem'. | 71 | * The global cache-chain is protected by the mutex 'cache_chain_mutex'. |
72 | * The sem is only needed when accessing/extending the cache-chain, which | 72 | * The sem is only needed when accessing/extending the cache-chain, which |
73 | * can never happen inside an interrupt (kmem_cache_create(), | 73 | * can never happen inside an interrupt (kmem_cache_create(), |
74 | * kmem_cache_shrink() and kmem_cache_reap()). | 74 | * kmem_cache_shrink() and kmem_cache_reap()). |
@@ -103,6 +103,8 @@ | |||
103 | #include <linux/rcupdate.h> | 103 | #include <linux/rcupdate.h> |
104 | #include <linux/string.h> | 104 | #include <linux/string.h> |
105 | #include <linux/nodemask.h> | 105 | #include <linux/nodemask.h> |
106 | #include <linux/mempolicy.h> | ||
107 | #include <linux/mutex.h> | ||
106 | 108 | ||
107 | #include <asm/uaccess.h> | 109 | #include <asm/uaccess.h> |
108 | #include <asm/cacheflush.h> | 110 | #include <asm/cacheflush.h> |
@@ -631,7 +633,7 @@ static kmem_cache_t cache_cache = { | |||
631 | }; | 633 | }; |
632 | 634 | ||
633 | /* Guard access to the cache-chain. */ | 635 | /* Guard access to the cache-chain. */ |
634 | static struct semaphore cache_chain_sem; | 636 | static DEFINE_MUTEX(cache_chain_mutex); |
635 | static struct list_head cache_chain; | 637 | static struct list_head cache_chain; |
636 | 638 | ||
637 | /* | 639 | /* |
@@ -772,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
772 | } | 774 | } |
773 | 775 | ||
774 | #ifdef CONFIG_NUMA | 776 | #ifdef CONFIG_NUMA |
777 | static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int); | ||
778 | |||
775 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | 779 | static inline struct array_cache **alloc_alien_cache(int node, int limit) |
776 | { | 780 | { |
777 | struct array_cache **ac_ptr; | 781 | struct array_cache **ac_ptr; |
@@ -857,7 +861,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
857 | 861 | ||
858 | switch (action) { | 862 | switch (action) { |
859 | case CPU_UP_PREPARE: | 863 | case CPU_UP_PREPARE: |
860 | down(&cache_chain_sem); | 864 | mutex_lock(&cache_chain_mutex); |
861 | /* we need to do this right in the beginning since | 865 | /* we need to do this right in the beginning since |
862 | * alloc_arraycache's are going to use this list. | 866 | * alloc_arraycache's are going to use this list. |
863 | * kmalloc_node allows us to add the slab to the right | 867 | * kmalloc_node allows us to add the slab to the right |
@@ -912,7 +916,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
912 | l3->shared = nc; | 916 | l3->shared = nc; |
913 | } | 917 | } |
914 | } | 918 | } |
915 | up(&cache_chain_sem); | 919 | mutex_unlock(&cache_chain_mutex); |
916 | break; | 920 | break; |
917 | case CPU_ONLINE: | 921 | case CPU_ONLINE: |
918 | start_cpu_timer(cpu); | 922 | start_cpu_timer(cpu); |
@@ -921,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
921 | case CPU_DEAD: | 925 | case CPU_DEAD: |
922 | /* fall thru */ | 926 | /* fall thru */ |
923 | case CPU_UP_CANCELED: | 927 | case CPU_UP_CANCELED: |
924 | down(&cache_chain_sem); | 928 | mutex_lock(&cache_chain_mutex); |
925 | 929 | ||
926 | list_for_each_entry(cachep, &cache_chain, next) { | 930 | list_for_each_entry(cachep, &cache_chain, next) { |
927 | struct array_cache *nc; | 931 | struct array_cache *nc; |
@@ -973,13 +977,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
973 | spin_unlock_irq(&cachep->spinlock); | 977 | spin_unlock_irq(&cachep->spinlock); |
974 | kfree(nc); | 978 | kfree(nc); |
975 | } | 979 | } |
976 | up(&cache_chain_sem); | 980 | mutex_unlock(&cache_chain_mutex); |
977 | break; | 981 | break; |
978 | #endif | 982 | #endif |
979 | } | 983 | } |
980 | return NOTIFY_OK; | 984 | return NOTIFY_OK; |
981 | bad: | 985 | bad: |
982 | up(&cache_chain_sem); | 986 | mutex_unlock(&cache_chain_mutex); |
983 | return NOTIFY_BAD; | 987 | return NOTIFY_BAD; |
984 | } | 988 | } |
985 | 989 | ||
@@ -1047,7 +1051,6 @@ void __init kmem_cache_init(void) | |||
1047 | */ | 1051 | */ |
1048 | 1052 | ||
1049 | /* 1) create the cache_cache */ | 1053 | /* 1) create the cache_cache */ |
1050 | init_MUTEX(&cache_chain_sem); | ||
1051 | INIT_LIST_HEAD(&cache_chain); | 1054 | INIT_LIST_HEAD(&cache_chain); |
1052 | list_add(&cache_cache.next, &cache_chain); | 1055 | list_add(&cache_cache.next, &cache_chain); |
1053 | cache_cache.colour_off = cache_line_size(); | 1056 | cache_cache.colour_off = cache_line_size(); |
@@ -1168,10 +1171,10 @@ void __init kmem_cache_init(void) | |||
1168 | /* 6) resize the head arrays to their final sizes */ | 1171 | /* 6) resize the head arrays to their final sizes */ |
1169 | { | 1172 | { |
1170 | kmem_cache_t *cachep; | 1173 | kmem_cache_t *cachep; |
1171 | down(&cache_chain_sem); | 1174 | mutex_lock(&cache_chain_mutex); |
1172 | list_for_each_entry(cachep, &cache_chain, next) | 1175 | list_for_each_entry(cachep, &cache_chain, next) |
1173 | enable_cpucache(cachep); | 1176 | enable_cpucache(cachep); |
1174 | up(&cache_chain_sem); | 1177 | mutex_unlock(&cache_chain_mutex); |
1175 | } | 1178 | } |
1176 | 1179 | ||
1177 | /* Done! */ | 1180 | /* Done! */ |
@@ -1590,7 +1593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1590 | BUG(); | 1593 | BUG(); |
1591 | } | 1594 | } |
1592 | 1595 | ||
1593 | down(&cache_chain_sem); | 1596 | mutex_lock(&cache_chain_mutex); |
1594 | 1597 | ||
1595 | list_for_each(p, &cache_chain) { | 1598 | list_for_each(p, &cache_chain) { |
1596 | kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); | 1599 | kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); |
@@ -1856,7 +1859,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1856 | if (!cachep && (flags & SLAB_PANIC)) | 1859 | if (!cachep && (flags & SLAB_PANIC)) |
1857 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 1860 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
1858 | name); | 1861 | name); |
1859 | up(&cache_chain_sem); | 1862 | mutex_unlock(&cache_chain_mutex); |
1860 | return cachep; | 1863 | return cachep; |
1861 | } | 1864 | } |
1862 | EXPORT_SYMBOL(kmem_cache_create); | 1865 | EXPORT_SYMBOL(kmem_cache_create); |
@@ -2044,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t *cachep) | |||
2044 | lock_cpu_hotplug(); | 2047 | lock_cpu_hotplug(); |
2045 | 2048 | ||
2046 | /* Find the cache in the chain of caches. */ | 2049 | /* Find the cache in the chain of caches. */ |
2047 | down(&cache_chain_sem); | 2050 | mutex_lock(&cache_chain_mutex); |
2048 | /* | 2051 | /* |
2049 | * the chain is never empty, cache_cache is never destroyed | 2052 | * the chain is never empty, cache_cache is never destroyed |
2050 | */ | 2053 | */ |
2051 | list_del(&cachep->next); | 2054 | list_del(&cachep->next); |
2052 | up(&cache_chain_sem); | 2055 | mutex_unlock(&cache_chain_mutex); |
2053 | 2056 | ||
2054 | if (__cache_shrink(cachep)) { | 2057 | if (__cache_shrink(cachep)) { |
2055 | slab_error(cachep, "Can't free all objects"); | 2058 | slab_error(cachep, "Can't free all objects"); |
2056 | down(&cache_chain_sem); | 2059 | mutex_lock(&cache_chain_mutex); |
2057 | list_add(&cachep->next, &cache_chain); | 2060 | list_add(&cachep->next, &cache_chain); |
2058 | up(&cache_chain_sem); | 2061 | mutex_unlock(&cache_chain_mutex); |
2059 | unlock_cpu_hotplug(); | 2062 | unlock_cpu_hotplug(); |
2060 | return 1; | 2063 | return 1; |
2061 | } | 2064 | } |
@@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) | |||
2570 | void *objp; | 2573 | void *objp; |
2571 | struct array_cache *ac; | 2574 | struct array_cache *ac; |
2572 | 2575 | ||
2576 | #ifdef CONFIG_NUMA | ||
2577 | if (unlikely(current->mempolicy && !in_interrupt())) { | ||
2578 | int nid = slab_node(current->mempolicy); | ||
2579 | |||
2580 | if (nid != numa_node_id()) | ||
2581 | return __cache_alloc_node(cachep, flags, nid); | ||
2582 | } | ||
2583 | #endif | ||
2584 | |||
2573 | check_irq_off(); | 2585 | check_irq_off(); |
2574 | ac = ac_data(cachep); | 2586 | ac = ac_data(cachep); |
2575 | if (likely(ac->avail)) { | 2587 | if (likely(ac->avail)) { |
@@ -3314,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, | |||
3314 | * - clear the per-cpu caches for this CPU. | 3326 | * - clear the per-cpu caches for this CPU. |
3315 | * - return freeable pages to the main free memory pool. | 3327 | * - return freeable pages to the main free memory pool. |
3316 | * | 3328 | * |
3317 | * If we cannot acquire the cache chain semaphore then just give up - we'll | 3329 | * If we cannot acquire the cache chain mutex then just give up - we'll |
3318 | * try again on the next iteration. | 3330 | * try again on the next iteration. |
3319 | */ | 3331 | */ |
3320 | static void cache_reap(void *unused) | 3332 | static void cache_reap(void *unused) |
@@ -3322,7 +3334,7 @@ static void cache_reap(void *unused) | |||
3322 | struct list_head *walk; | 3334 | struct list_head *walk; |
3323 | struct kmem_list3 *l3; | 3335 | struct kmem_list3 *l3; |
3324 | 3336 | ||
3325 | if (down_trylock(&cache_chain_sem)) { | 3337 | if (!mutex_trylock(&cache_chain_mutex)) { |
3326 | /* Give up. Setup the next iteration. */ | 3338 | /* Give up. Setup the next iteration. */ |
3327 | schedule_delayed_work(&__get_cpu_var(reap_work), | 3339 | schedule_delayed_work(&__get_cpu_var(reap_work), |
3328 | REAPTIMEOUT_CPUC); | 3340 | REAPTIMEOUT_CPUC); |
@@ -3393,7 +3405,7 @@ static void cache_reap(void *unused) | |||
3393 | cond_resched(); | 3405 | cond_resched(); |
3394 | } | 3406 | } |
3395 | check_irq_on(); | 3407 | check_irq_on(); |
3396 | up(&cache_chain_sem); | 3408 | mutex_unlock(&cache_chain_mutex); |
3397 | drain_remote_pages(); | 3409 | drain_remote_pages(); |
3398 | /* Setup the next iteration */ | 3410 | /* Setup the next iteration */ |
3399 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3411 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); |
@@ -3429,7 +3441,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
3429 | loff_t n = *pos; | 3441 | loff_t n = *pos; |
3430 | struct list_head *p; | 3442 | struct list_head *p; |
3431 | 3443 | ||
3432 | down(&cache_chain_sem); | 3444 | mutex_lock(&cache_chain_mutex); |
3433 | if (!n) | 3445 | if (!n) |
3434 | print_slabinfo_header(m); | 3446 | print_slabinfo_header(m); |
3435 | p = cache_chain.next; | 3447 | p = cache_chain.next; |
@@ -3451,7 +3463,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
3451 | 3463 | ||
3452 | static void s_stop(struct seq_file *m, void *p) | 3464 | static void s_stop(struct seq_file *m, void *p) |
3453 | { | 3465 | { |
3454 | up(&cache_chain_sem); | 3466 | mutex_unlock(&cache_chain_mutex); |
3455 | } | 3467 | } |
3456 | 3468 | ||
3457 | static int s_show(struct seq_file *m, void *p) | 3469 | static int s_show(struct seq_file *m, void *p) |
@@ -3603,7 +3615,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3603 | return -EINVAL; | 3615 | return -EINVAL; |
3604 | 3616 | ||
3605 | /* Find the cache in the chain of caches. */ | 3617 | /* Find the cache in the chain of caches. */ |
3606 | down(&cache_chain_sem); | 3618 | mutex_lock(&cache_chain_mutex); |
3607 | res = -EINVAL; | 3619 | res = -EINVAL; |
3608 | list_for_each(p, &cache_chain) { | 3620 | list_for_each(p, &cache_chain) { |
3609 | kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); | 3621 | kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); |
@@ -3620,7 +3632,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3620 | break; | 3632 | break; |
3621 | } | 3633 | } |
3622 | } | 3634 | } |
3623 | up(&cache_chain_sem); | 3635 | mutex_unlock(&cache_chain_mutex); |
3624 | if (res >= 0) | 3636 | if (res >= 0) |
3625 | res = count; | 3637 | res = count; |
3626 | return res; | 3638 | return res; |
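Most of the slab changes are a mechanical conversion of cache_chain_sem from a semaphore to cache_chain_mutex; the behavioural subtlety is cache_reap(), which keeps its non-blocking character by using mutex_trylock() and deferring to the next timer tick when the chain is busy. A pthreads sketch of that try-or-defer pattern follows, with userspace stand-ins rather than kernel API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Models cache_reap(): periodic work that must not sleep waiting for
 * the chain lock.  If somebody else holds it, give up and let the
 * next scheduled run retry instead of blocking.
 */
static void reap(void)
{
	if (pthread_mutex_trylock(&chain_lock) != 0) {
		puts("chain busy, retry on the next interval");
		return;
	}
	puts("walking the cache chain");
	pthread_mutex_unlock(&chain_lock);
}

int main(void)
{
	reap();					/* uncontended: does the walk */
	pthread_mutex_lock(&chain_lock);
	reap();					/* contended: backs off */
	pthread_mutex_unlock(&chain_lock);
	return 0;
}

The other slab change worth noting here is the NUMA hook at the top of ____cache_alloc(), which hands the allocation to __cache_alloc_node() whenever slab_node() picks a node other than the local one.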
diff --git a/mm/swap.c b/mm/swap.c
@@ -174,6 +174,32 @@ void lru_add_drain(void)
174 | put_cpu(); | 174 | put_cpu(); |
175 | } | 175 | } |
176 | 176 | ||
177 | #ifdef CONFIG_NUMA | ||
178 | static void lru_add_drain_per_cpu(void *dummy) | ||
179 | { | ||
180 | lru_add_drain(); | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * Returns 0 for success | ||
185 | */ | ||
186 | int lru_add_drain_all(void) | ||
187 | { | ||
188 | return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); | ||
189 | } | ||
190 | |||
191 | #else | ||
192 | |||
193 | /* | ||
194 | * Returns 0 for success | ||
195 | */ | ||
196 | int lru_add_drain_all(void) | ||
197 | { | ||
198 | lru_add_drain(); | ||
199 | return 0; | ||
200 | } | ||
201 | #endif | ||
202 | |||
177 | /* | 203 | /* |
178 | * This path almost never happens for VM activity - pages are normally | 204 | * This path almost never happens for VM activity - pages are normally |
179 | * freed via pagevecs. But it gets used by networking. | 205 | * freed via pagevecs. But it gets used by networking. |
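lru_add_drain_all() gives callers such as check_range() a way to flush every CPU's pagevecs before isolating pages: on NUMA builds it runs lru_add_drain() on each CPU via schedule_on_each_cpu(), otherwise it drains only the local CPU, and it reports 0 for success either way. A toy model of the same idea with plain arrays in place of per-CPU data; all names are invented.

#include <stdio.h>

#define NCPUS 4

/* Stand-ins for the per-CPU pagevecs that lru_add_drain() flushes. */
static int pending[NCPUS] = { 3, 0, 5, 1 };
static int on_lru;

static void drain_cpu(int cpu)
{
	on_lru += pending[cpu];
	pending[cpu] = 0;
}

/*
 * Models lru_add_drain_all(): pages parked in any CPU's cache cannot
 * be isolated, so flush them all before scanning.  Returns 0 for
 * success, matching the kernel helper.
 */
static int drain_all(void)
{
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		drain_cpu(cpu);		/* kernel: schedule_on_each_cpu() */
	return 0;
}

int main(void)
{
	drain_all();
	printf("pages visible on the LRU: %d\n", on_lru);
	return 0;
}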
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 957fef43fa60..f1e69c30d203 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
26 | #include <linux/security.h> | 26 | #include <linux/security.h> |
27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
28 | #include <linux/mutex.h> | ||
28 | #include <linux/capability.h> | 29 | #include <linux/capability.h> |
29 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
30 | 31 | ||
@@ -46,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1}; | |||
46 | 47 | ||
47 | struct swap_info_struct swap_info[MAX_SWAPFILES]; | 48 | struct swap_info_struct swap_info[MAX_SWAPFILES]; |
48 | 49 | ||
49 | static DECLARE_MUTEX(swapon_sem); | 50 | static DEFINE_MUTEX(swapon_mutex); |
50 | 51 | ||
51 | /* | 52 | /* |
52 | * We need this because the bdev->unplug_fn can sleep and we cannot | 53 | * We need this because the bdev->unplug_fn can sleep and we cannot |
53 | * hold swap_lock while calling the unplug_fn. And swap_lock | 54 | * hold swap_lock while calling the unplug_fn. And swap_lock |
54 | * cannot be turned into a semaphore. | 55 | * cannot be turned into a mutex. |
55 | */ | 56 | */ |
56 | static DECLARE_RWSEM(swap_unplug_sem); | 57 | static DECLARE_RWSEM(swap_unplug_sem); |
57 | 58 | ||
@@ -1161,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1161 | up_write(&swap_unplug_sem); | 1162 | up_write(&swap_unplug_sem); |
1162 | 1163 | ||
1163 | destroy_swap_extents(p); | 1164 | destroy_swap_extents(p); |
1164 | down(&swapon_sem); | 1165 | mutex_lock(&swapon_mutex); |
1165 | spin_lock(&swap_lock); | 1166 | spin_lock(&swap_lock); |
1166 | drain_mmlist(); | 1167 | drain_mmlist(); |
1167 | 1168 | ||
@@ -1180,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
1180 | p->swap_map = NULL; | 1181 | p->swap_map = NULL; |
1181 | p->flags = 0; | 1182 | p->flags = 0; |
1182 | spin_unlock(&swap_lock); | 1183 | spin_unlock(&swap_lock); |
1183 | up(&swapon_sem); | 1184 | mutex_unlock(&swapon_mutex); |
1184 | vfree(swap_map); | 1185 | vfree(swap_map); |
1185 | inode = mapping->host; | 1186 | inode = mapping->host; |
1186 | if (S_ISBLK(inode->i_mode)) { | 1187 | if (S_ISBLK(inode->i_mode)) { |
@@ -1209,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
1209 | int i; | 1210 | int i; |
1210 | loff_t l = *pos; | 1211 | loff_t l = *pos; |
1211 | 1212 | ||
1212 | down(&swapon_sem); | 1213 | mutex_lock(&swapon_mutex); |
1213 | 1214 | ||
1214 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1215 | for (i = 0; i < nr_swapfiles; i++, ptr++) { |
1215 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1216 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) |
@@ -1238,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | |||
1238 | 1239 | ||
1239 | static void swap_stop(struct seq_file *swap, void *v) | 1240 | static void swap_stop(struct seq_file *swap, void *v) |
1240 | { | 1241 | { |
1241 | up(&swapon_sem); | 1242 | mutex_unlock(&swapon_mutex); |
1242 | } | 1243 | } |
1243 | 1244 | ||
1244 | static int swap_show(struct seq_file *swap, void *v) | 1245 | static int swap_show(struct seq_file *swap, void *v) |
@@ -1540,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1540 | goto bad_swap; | 1541 | goto bad_swap; |
1541 | } | 1542 | } |
1542 | 1543 | ||
1543 | down(&swapon_sem); | 1544 | mutex_lock(&swapon_mutex); |
1544 | spin_lock(&swap_lock); | 1545 | spin_lock(&swap_lock); |
1545 | p->flags = SWP_ACTIVE; | 1546 | p->flags = SWP_ACTIVE; |
1546 | nr_swap_pages += nr_good_pages; | 1547 | nr_swap_pages += nr_good_pages; |
@@ -1566,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1566 | swap_info[prev].next = p - swap_info; | 1567 | swap_info[prev].next = p - swap_info; |
1567 | } | 1568 | } |
1568 | spin_unlock(&swap_lock); | 1569 | spin_unlock(&swap_lock); |
1569 | up(&swapon_sem); | 1570 | mutex_unlock(&swapon_mutex); |
1570 | error = 0; | 1571 | error = 0; |
1571 | goto out; | 1572 | goto out; |
1572 | bad_swap: | 1573 | bad_swap: |
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf903b2d198f..2e34b61a70c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -71,6 +71,9 @@ struct scan_control { | |||
71 | 71 | ||
72 | int may_writepage; | 72 | int may_writepage; |
73 | 73 | ||
74 | /* Can pages be swapped as part of reclaim? */ | ||
75 | int may_swap; | ||
76 | |||
74 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | 77 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for |
75 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | 78 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. |
76 | * In this context, it doesn't matter that we scan the | 79 | * In this context, it doesn't matter that we scan the |
@@ -458,6 +461,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
458 | * Try to allocate it some swap space here. | 461 | * Try to allocate it some swap space here. |
459 | */ | 462 | */ |
460 | if (PageAnon(page) && !PageSwapCache(page)) { | 463 | if (PageAnon(page) && !PageSwapCache(page)) { |
464 | if (!sc->may_swap) | ||
465 | goto keep_locked; | ||
461 | if (!add_to_swap(page, GFP_ATOMIC)) | 466 | if (!add_to_swap(page, GFP_ATOMIC)) |
462 | goto activate_locked; | 467 | goto activate_locked; |
463 | } | 468 | } |
@@ -586,7 +591,7 @@ static inline void move_to_lru(struct page *page) | |||
586 | } | 591 | } |
587 | 592 | ||
588 | /* | 593 | /* |
589 | * Add isolated pages on the list back to the LRU | 594 | * Add isolated pages on the list back to the LRU. |
590 | * | 595 | * |
591 | * returns the number of pages put back. | 596 | * returns the number of pages put back. |
592 | */ | 597 | */ |
@@ -760,46 +765,33 @@ next: | |||
760 | return nr_failed + retry; | 765 | return nr_failed + retry; |
761 | } | 766 | } |
762 | 767 | ||
763 | static void lru_add_drain_per_cpu(void *dummy) | ||
764 | { | ||
765 | lru_add_drain(); | ||
766 | } | ||
767 | |||
768 | /* | 768 | /* |
769 | * Isolate one page from the LRU lists and put it on the | 769 | * Isolate one page from the LRU lists and put it on the |
770 | * indicated list. Do necessary cache draining if the | 770 | * indicated list with elevated refcount. |
771 | * page is not on the LRU lists yet. | ||
772 | * | 771 | * |
773 | * Result: | 772 | * Result: |
774 | * 0 = page not on LRU list | 773 | * 0 = page not on LRU list |
775 | * 1 = page removed from LRU list and added to the specified list. | 774 | * 1 = page removed from LRU list and added to the specified list. |
776 | * -ENOENT = page is being freed elsewhere. | ||
777 | */ | 775 | */ |
778 | int isolate_lru_page(struct page *page) | 776 | int isolate_lru_page(struct page *page) |
779 | { | 777 | { |
780 | int rc = 0; | 778 | int ret = 0; |
781 | struct zone *zone = page_zone(page); | ||
782 | 779 | ||
783 | redo: | 780 | if (PageLRU(page)) { |
784 | spin_lock_irq(&zone->lru_lock); | 781 | struct zone *zone = page_zone(page); |
785 | rc = __isolate_lru_page(page); | 782 | spin_lock_irq(&zone->lru_lock); |
786 | if (rc == 1) { | 783 | if (TestClearPageLRU(page)) { |
787 | if (PageActive(page)) | 784 | ret = 1; |
788 | del_page_from_active_list(zone, page); | 785 | get_page(page); |
789 | else | 786 | if (PageActive(page)) |
790 | del_page_from_inactive_list(zone, page); | 787 | del_page_from_active_list(zone, page); |
791 | } | 788 | else |
792 | spin_unlock_irq(&zone->lru_lock); | 789 | del_page_from_inactive_list(zone, page); |
793 | if (rc == 0) { | 790 | } |
794 | /* | 791 | spin_unlock_irq(&zone->lru_lock); |
795 | * Maybe this page is still waiting for a cpu to drain it | ||
796 | * from one of the lru lists? | ||
797 | */ | ||
798 | rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); | ||
799 | if (rc == 0 && PageLRU(page)) | ||
800 | goto redo; | ||
801 | } | 792 | } |
802 | return rc; | 793 | |
794 | return ret; | ||
803 | } | 795 | } |
804 | #endif | 796 | #endif |
805 | 797 | ||
@@ -831,18 +823,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | |||
831 | page = lru_to_page(src); | 823 | page = lru_to_page(src); |
832 | prefetchw_prev_lru_page(page, src, flags); | 824 | prefetchw_prev_lru_page(page, src, flags); |
833 | 825 | ||
834 | switch (__isolate_lru_page(page)) { | 826 | if (!TestClearPageLRU(page)) |
835 | case 1: | ||
836 | /* Succeeded to isolate page */ | ||
837 | list_move(&page->lru, dst); | ||
838 | nr_taken++; | ||
839 | break; | ||
840 | case -ENOENT: | ||
841 | /* Not possible to isolate */ | ||
842 | list_move(&page->lru, src); | ||
843 | break; | ||
844 | default: | ||
845 | BUG(); | 827 | BUG(); |
828 | list_del(&page->lru); | ||
829 | if (get_page_testone(page)) { | ||
830 | /* | ||
831 | * It is being freed elsewhere | ||
832 | */ | ||
833 | __put_page(page); | ||
834 | SetPageLRU(page); | ||
835 | list_add(&page->lru, src); | ||
836 | continue; | ||
837 | } else { | ||
838 | list_add(&page->lru, dst); | ||
839 | nr_taken++; | ||
846 | } | 840 | } |
847 | } | 841 | } |
848 | 842 | ||
@@ -1177,6 +1171,7 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1177 | 1171 | ||
1178 | sc.gfp_mask = gfp_mask; | 1172 | sc.gfp_mask = gfp_mask; |
1179 | sc.may_writepage = 0; | 1173 | sc.may_writepage = 0; |
1174 | sc.may_swap = 1; | ||
1180 | 1175 | ||
1181 | inc_page_state(allocstall); | 1176 | inc_page_state(allocstall); |
1182 | 1177 | ||
@@ -1279,6 +1274,7 @@ loop_again: | |||
1279 | total_reclaimed = 0; | 1274 | total_reclaimed = 0; |
1280 | sc.gfp_mask = GFP_KERNEL; | 1275 | sc.gfp_mask = GFP_KERNEL; |
1281 | sc.may_writepage = 0; | 1276 | sc.may_writepage = 0; |
1277 | sc.may_swap = 1; | ||
1282 | sc.nr_mapped = read_page_state(nr_mapped); | 1278 | sc.nr_mapped = read_page_state(nr_mapped); |
1283 | 1279 | ||
1284 | inc_page_state(pageoutrun); | 1280 | inc_page_state(pageoutrun); |
@@ -1576,3 +1572,71 @@ static int __init kswapd_init(void) | |||
1576 | } | 1572 | } |
1577 | 1573 | ||
1578 | module_init(kswapd_init) | 1574 | module_init(kswapd_init) |
1575 | |||
1576 | #ifdef CONFIG_NUMA | ||
1577 | /* | ||
1578 | * Zone reclaim mode | ||
1579 | * | ||
1580 | * If non-zero call zone_reclaim when the number of free pages falls below | ||
1581 | * the watermarks. | ||
1582 | * | ||
1583 | * In the future we may add flags to the mode. However, the page allocator | ||
1584 | * should only have to check that zone_reclaim_mode != 0 before calling | ||
1585 | * zone_reclaim(). | ||
1586 | */ | ||
1587 | int zone_reclaim_mode __read_mostly; | ||
1588 | |||
1589 | /* | ||
1590 | * Minimum time between zone reclaim scans | ||
1591 | */ | ||
1592 | #define ZONE_RECLAIM_INTERVAL HZ/2 | ||
1593 | /* | ||
1594 | * Try to free up some pages from this zone through reclaim. | ||
1595 | */ | ||
1596 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | ||
1597 | { | ||
1598 | int nr_pages = 1 << order; | ||
1599 | struct task_struct *p = current; | ||
1600 | struct reclaim_state reclaim_state; | ||
1601 | struct scan_control sc = { | ||
1602 | .gfp_mask = gfp_mask, | ||
1603 | .may_writepage = 0, | ||
1604 | .may_swap = 0, | ||
1605 | .nr_mapped = read_page_state(nr_mapped), | ||
1606 | .nr_scanned = 0, | ||
1607 | .nr_reclaimed = 0, | ||
1608 | .priority = 0 | ||
1609 | }; | ||
1610 | |||
1611 | if (!(gfp_mask & __GFP_WAIT) || | ||
1612 | zone->zone_pgdat->node_id != numa_node_id() || | ||
1613 | zone->all_unreclaimable || | ||
1614 | atomic_read(&zone->reclaim_in_progress) > 0) | ||
1615 | return 0; | ||
1616 | |||
1617 | if (time_before(jiffies, | ||
1618 | zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) | ||
1619 | return 0; | ||
1620 | |||
1621 | disable_swap_token(); | ||
1622 | |||
1623 | if (nr_pages > SWAP_CLUSTER_MAX) | ||
1624 | sc.swap_cluster_max = nr_pages; | ||
1625 | else | ||
1626 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
1627 | |||
1628 | cond_resched(); | ||
1629 | p->flags |= PF_MEMALLOC; | ||
1630 | reclaim_state.reclaimed_slab = 0; | ||
1631 | p->reclaim_state = &reclaim_state; | ||
1632 | shrink_zone(zone, &sc); | ||
1633 | p->reclaim_state = NULL; | ||
1634 | current->flags &= ~PF_MEMALLOC; | ||
1635 | |||
1636 | if (sc.nr_reclaimed == 0) | ||
1637 | zone->last_unsuccessful_zone_reclaim = jiffies; | ||
1638 | |||
1639 | return sc.nr_reclaimed > nr_pages; | ||
1640 | } | ||
1641 | #endif | ||
1642 | |||
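zone_reclaim() bails out early when the allocation cannot wait, the zone belongs to another node, reclaim is already running there, or the last attempt on this zone failed less than ZONE_RECLAIM_INTERVAL ago; only then does it run a swap-free (may_swap = 0) shrink_zone() under PF_MEMALLOC. A userspace sketch of that early-exit and back-off logic, using a monotonic clock where the kernel uses jiffies; the constants and names are illustrative.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define RECLAIM_INTERVAL_NS 500000000LL	/* roughly HZ/2 in the patch */

struct zone_model {
	long long last_failed_ns;	/* last unsuccessful reclaim */
	int reclaim_in_progress;
};

static long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/*
 * Mirrors the early exits in zone_reclaim(): skip the scan when it
 * cannot help, and rate-limit retries after an unsuccessful scan.
 */
static bool may_reclaim(struct zone_model *z, bool can_wait, bool local_zone)
{
	if (!can_wait || !local_zone || z->reclaim_in_progress)
		return false;
	if (now_ns() < z->last_failed_ns + RECLAIM_INTERVAL_NS)
		return false;		/* failed too recently, back off */
	return true;
}

static int do_reclaim(struct zone_model *z)
{
	int freed = 0;			/* pretend nothing could be freed */

	if (freed == 0)
		z->last_failed_ns = now_ns();
	return freed;
}

int main(void)
{
	struct zone_model z = { 0, 0 };

	if (may_reclaim(&z, true, true))
		do_reclaim(&z);
	/* An immediate retry is suppressed by the interval check. */
	printf("retry allowed: %d\n", may_reclaim(&z, true, true));
	return 0;
}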