aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig11
-rw-r--r--mm/Makefile3
-rw-r--r--mm/filemap.c37
-rw-r--r--mm/fremap.c27
-rw-r--r--mm/hugetlb.c44
-rw-r--r--mm/internal.h131
-rw-r--r--mm/memcontrol.c466
-rw-r--r--mm/memory.c125
-rw-r--r--mm/memory_hotplug.c19
-rw-r--r--mm/mempolicy.c11
-rw-r--r--mm/migrate.c274
-rw-r--r--mm/mlock.c443
-rw-r--r--mm/mmap.c81
-rw-r--r--mm/mremap.c8
-rw-r--r--mm/nommu.c44
-rw-r--r--mm/page-writeback.c8
-rw-r--r--mm/page_alloc.c121
-rw-r--r--mm/page_cgroup.c237
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c319
-rw-r--r--mm/shmem.c7
-rw-r--r--mm/swap.c172
-rw-r--r--mm/swap_state.c11
-rw-r--r--mm/swapfile.c27
-rw-r--r--mm/tiny-shmem.c1
-rw-r--r--mm/truncate.c4
-rw-r--r--mm/vmalloc.c994
-rw-r--r--mm/vmscan.c1026
-rw-r--r--mm/vmstat.c33
29 files changed, 3596 insertions, 1090 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1a501a4de95c..5b5790f8a816 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -209,5 +209,16 @@ config VIRT_TO_BUS
209 def_bool y 209 def_bool y
210 depends on !ARCH_NO_VIRT_TO_BUS 210 depends on !ARCH_NO_VIRT_TO_BUS
211 211
212config UNEVICTABLE_LRU
213 bool "Add LRU list to track non-evictable pages"
214 default y
215 depends on MMU
216 help
217 Keeps unevictable pages off of the active and inactive pageout
218 lists, so kswapd will not waste CPU time or have its balancing
219 algorithms thrown off by scanning these pages. Selecting this
220 will use one page flag and increase the code size a little.
221 Say Y unless you know what you are doing.
222
212config MMU_NOTIFIER 223config MMU_NOTIFIER
213 bool 224 bool
diff --git a/mm/Makefile b/mm/Makefile
index da4ccf015aea..c06b45a1ff5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
33obj-$(CONFIG_MIGRATION) += migrate.o 33obj-$(CONFIG_MIGRATION) += migrate.o
34obj-$(CONFIG_SMP) += allocpercpu.o 34obj-$(CONFIG_SMP) += allocpercpu.o
35obj-$(CONFIG_QUICKLIST) += quicklist.o 35obj-$(CONFIG_QUICKLIST) += quicklist.o
36obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o 36obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
37
diff --git a/mm/filemap.c b/mm/filemap.c
index 903bf316912a..ab8553658af3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */
36#include "internal.h" 37#include "internal.h"
37 38
38/* 39/*
@@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page)
115{ 116{
116 struct address_space *mapping = page->mapping; 117 struct address_space *mapping = page->mapping;
117 118
118 mem_cgroup_uncharge_cache_page(page);
119 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
120 page->mapping = NULL; 120 page->mapping = NULL;
121 mapping->nrpages--; 121 mapping->nrpages--;
122 __dec_zone_page_state(page, NR_FILE_PAGES); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
123 BUG_ON(page_mapped(page)); 123 BUG_ON(page_mapped(page));
124 mem_cgroup_uncharge_cache_page(page);
124 125
125 /* 126 /*
126 * Some filesystems seem to re-dirty the page even after 127 * Some filesystems seem to re-dirty the page even after
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
492int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 493int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
493 pgoff_t offset, gfp_t gfp_mask) 494 pgoff_t offset, gfp_t gfp_mask)
494{ 495{
495 int ret = add_to_page_cache(page, mapping, offset, gfp_mask); 496 int ret;
496 if (ret == 0) 497
497 lru_cache_add(page); 498 /*
499 * Splice_read and readahead add shmem/tmpfs pages into the page cache
500 * before shmem_readpage has a chance to mark them as SwapBacked: they
501 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
502 * (called in add_to_page_cache) needs to know where they're going too.
503 */
504 if (mapping_cap_swap_backed(mapping))
505 SetPageSwapBacked(page);
506
507 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
508 if (ret == 0) {
509 if (page_is_file_cache(page))
510 lru_cache_add_file(page);
511 else
512 lru_cache_add_active_anon(page);
513 }
498 return ret; 514 return ret;
499} 515}
500 516
@@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
557 * mechanism between PageLocked pages and PageWriteback pages is shared. 573 * mechanism between PageLocked pages and PageWriteback pages is shared.
558 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 574 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
559 * 575 *
560 * The first mb is necessary to safely close the critical section opened by the 576 * The mb is necessary to enforce ordering between the clear_bit and the read
561 * test_and_set_bit() to lock the page; the second mb is necessary to enforce 577 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
562 * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
563 * races with a parallel wait_on_page_locked()).
564 */ 578 */
565void unlock_page(struct page *page) 579void unlock_page(struct page *page)
566{ 580{
567 smp_mb__before_clear_bit(); 581 VM_BUG_ON(!PageLocked(page));
568 if (!test_and_clear_bit(PG_locked, &page->flags)) 582 clear_bit_unlock(PG_locked, &page->flags);
569 BUG(); 583 smp_mb__after_clear_bit();
570 smp_mb__after_clear_bit();
571 wake_up_page(page, PG_locked); 584 wake_up_page(page, PG_locked);
572} 585}
573EXPORT_SYMBOL(unlock_page); 586EXPORT_SYMBOL(unlock_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7881638e4a12..7d12ca70ef7b 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -21,6 +21,8 @@
21#include <asm/cacheflush.h> 21#include <asm/cacheflush.h>
22#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
23 23
24#include "internal.h"
25
24static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, 26static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
25 unsigned long addr, pte_t *ptep) 27 unsigned long addr, pte_t *ptep)
26{ 28{
@@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
215 spin_unlock(&mapping->i_mmap_lock); 217 spin_unlock(&mapping->i_mmap_lock);
216 } 218 }
217 219
220 if (vma->vm_flags & VM_LOCKED) {
221 /*
222 * drop PG_Mlocked flag for over-mapped range
223 */
224 unsigned int saved_flags = vma->vm_flags;
225 munlock_vma_pages_range(vma, start, start + size);
226 vma->vm_flags = saved_flags;
227 }
228
218 mmu_notifier_invalidate_range_start(mm, start, start + size); 229 mmu_notifier_invalidate_range_start(mm, start, start + size);
219 err = populate_range(mm, vma, start, size, pgoff); 230 err = populate_range(mm, vma, start, size, pgoff);
220 mmu_notifier_invalidate_range_end(mm, start, start + size); 231 mmu_notifier_invalidate_range_end(mm, start, start + size);
221 if (!err && !(flags & MAP_NONBLOCK)) { 232 if (!err && !(flags & MAP_NONBLOCK)) {
222 if (unlikely(has_write_lock)) { 233 if (vma->vm_flags & VM_LOCKED) {
223 downgrade_write(&mm->mmap_sem); 234 /*
224 has_write_lock = 0; 235 * might be mapping previously unmapped range of file
236 */
237 mlock_vma_pages_range(vma, start, start + size);
238 } else {
239 if (unlikely(has_write_lock)) {
240 downgrade_write(&mm->mmap_sem);
241 has_write_lock = 0;
242 }
243 make_pages_present(start, start+size);
225 } 244 }
226 make_pages_present(start, start+size);
227 } 245 }
228 246
229 /* 247 /*
@@ -240,4 +258,3 @@ out:
240 258
241 return err; 259 return err;
242} 260}
243
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 38633864a93e..ce8cbb29860b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -262,7 +262,7 @@ struct resv_map {
262 struct list_head regions; 262 struct list_head regions;
263}; 263};
264 264
265struct resv_map *resv_map_alloc(void) 265static struct resv_map *resv_map_alloc(void)
266{ 266{
267 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); 267 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
268 if (!resv_map) 268 if (!resv_map)
@@ -274,7 +274,7 @@ struct resv_map *resv_map_alloc(void)
274 return resv_map; 274 return resv_map;
275} 275}
276 276
277void resv_map_release(struct kref *ref) 277static void resv_map_release(struct kref *ref)
278{ 278{
279 struct resv_map *resv_map = container_of(ref, struct resv_map, refs); 279 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
280 280
@@ -289,7 +289,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
289 if (!(vma->vm_flags & VM_SHARED)) 289 if (!(vma->vm_flags & VM_SHARED))
290 return (struct resv_map *)(get_vma_private_data(vma) & 290 return (struct resv_map *)(get_vma_private_data(vma) &
291 ~HPAGE_RESV_MASK); 291 ~HPAGE_RESV_MASK);
292 return 0; 292 return NULL;
293} 293}
294 294
295static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) 295static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
@@ -1459,11 +1459,11 @@ int hugetlb_report_meminfo(char *buf)
1459{ 1459{
1460 struct hstate *h = &default_hstate; 1460 struct hstate *h = &default_hstate;
1461 return sprintf(buf, 1461 return sprintf(buf,
1462 "HugePages_Total: %5lu\n" 1462 "HugePages_Total: %5lu\n"
1463 "HugePages_Free: %5lu\n" 1463 "HugePages_Free: %5lu\n"
1464 "HugePages_Rsvd: %5lu\n" 1464 "HugePages_Rsvd: %5lu\n"
1465 "HugePages_Surp: %5lu\n" 1465 "HugePages_Surp: %5lu\n"
1466 "Hugepagesize: %5lu kB\n", 1466 "Hugepagesize: %8lu kB\n",
1467 h->nr_huge_pages, 1467 h->nr_huge_pages,
1468 h->free_huge_pages, 1468 h->free_huge_pages,
1469 h->resv_huge_pages, 1469 h->resv_huge_pages,
@@ -1747,10 +1747,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
1747 * from other VMAs and let the children be SIGKILLed if they are faulting the 1747 * from other VMAs and let the children be SIGKILLed if they are faulting the
1748 * same region. 1748 * same region.
1749 */ 1749 */
1750int unmap_ref_private(struct mm_struct *mm, 1750static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1751 struct vm_area_struct *vma, 1751 struct page *page, unsigned long address)
1752 struct page *page,
1753 unsigned long address)
1754{ 1752{
1755 struct vm_area_struct *iter_vma; 1753 struct vm_area_struct *iter_vma;
1756 struct address_space *mapping; 1754 struct address_space *mapping;
@@ -2073,6 +2071,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
2073 return NULL; 2071 return NULL;
2074} 2072}
2075 2073
2074static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
2075{
2076 if (!ptep || write || shared)
2077 return 0;
2078 else
2079 return huge_pte_none(huge_ptep_get(ptep));
2080}
2081
2076int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2082int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2077 struct page **pages, struct vm_area_struct **vmas, 2083 struct page **pages, struct vm_area_struct **vmas,
2078 unsigned long *position, int *length, int i, 2084 unsigned long *position, int *length, int i,
@@ -2082,6 +2088,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2082 unsigned long vaddr = *position; 2088 unsigned long vaddr = *position;
2083 int remainder = *length; 2089 int remainder = *length;
2084 struct hstate *h = hstate_vma(vma); 2090 struct hstate *h = hstate_vma(vma);
2091 int zeropage_ok = 0;
2092 int shared = vma->vm_flags & VM_SHARED;
2085 2093
2086 spin_lock(&mm->page_table_lock); 2094 spin_lock(&mm->page_table_lock);
2087 while (vaddr < vma->vm_end && remainder) { 2095 while (vaddr < vma->vm_end && remainder) {
@@ -2094,8 +2102,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2094 * first, for the page indexing below to work. 2102 * first, for the page indexing below to work.
2095 */ 2103 */
2096 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); 2104 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2105 if (huge_zeropage_ok(pte, write, shared))
2106 zeropage_ok = 1;
2097 2107
2098 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2108 if (!pte ||
2109 (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
2099 (write && !pte_write(huge_ptep_get(pte)))) { 2110 (write && !pte_write(huge_ptep_get(pte)))) {
2100 int ret; 2111 int ret;
2101 2112
@@ -2115,8 +2126,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
2115 page = pte_page(huge_ptep_get(pte)); 2126 page = pte_page(huge_ptep_get(pte));
2116same_page: 2127same_page:
2117 if (pages) { 2128 if (pages) {
2118 get_page(page); 2129 if (zeropage_ok)
2119 pages[i] = page + pfn_offset; 2130 pages[i] = ZERO_PAGE(0);
2131 else
2132 pages[i] = page + pfn_offset;
2133 get_page(pages[i]);
2120 } 2134 }
2121 2135
2122 if (vmas) 2136 if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 1f43f7416972..e4e728bdf324 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
39 atomic_dec(&page->_count); 39 atomic_dec(&page->_count);
40} 40}
41 41
42/*
43 * in mm/vmscan.c:
44 */
45extern int isolate_lru_page(struct page *page);
46extern void putback_lru_page(struct page *page);
47
48/*
49 * in mm/page_alloc.c
50 */
42extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
43 52
44/* 53/*
@@ -52,6 +61,120 @@ static inline unsigned long page_order(struct page *page)
52 return page_private(page); 61 return page_private(page);
53} 62}
54 63
64extern long mlock_vma_pages_range(struct vm_area_struct *vma,
65 unsigned long start, unsigned long end);
66extern void munlock_vma_pages_range(struct vm_area_struct *vma,
67 unsigned long start, unsigned long end);
68static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
69{
70 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
71}
72
73#ifdef CONFIG_UNEVICTABLE_LRU
74/*
75 * unevictable_migrate_page() called only from migrate_page_copy() to
76 * migrate unevictable flag to new page.
77 * Note that the old page has been isolated from the LRU lists at this
78 * point so we don't need to worry about LRU statistics.
79 */
80static inline void unevictable_migrate_page(struct page *new, struct page *old)
81{
82 if (TestClearPageUnevictable(old))
83 SetPageUnevictable(new);
84}
85#else
86static inline void unevictable_migrate_page(struct page *new, struct page *old)
87{
88}
89#endif
90
91#ifdef CONFIG_UNEVICTABLE_LRU
92/*
93 * Called only in fault path via page_evictable() for a new page
94 * to determine if it's being mapped into a LOCKED vma.
95 * If so, mark page as mlocked.
96 */
97static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
98{
99 VM_BUG_ON(PageLRU(page));
100
101 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
102 return 0;
103
104 if (!TestSetPageMlocked(page)) {
105 inc_zone_page_state(page, NR_MLOCK);
106 count_vm_event(UNEVICTABLE_PGMLOCKED);
107 }
108 return 1;
109}
110
111/*
112 * must be called with vma's mmap_sem held for read, and page locked.
113 */
114extern void mlock_vma_page(struct page *page);
115
116/*
117 * Clear the page's PageMlocked(). This can be useful in a situation where
118 * we want to unconditionally remove a page from the pagecache -- e.g.,
119 * on truncation or freeing.
120 *
121 * It is legal to call this function for any page, mlocked or not.
122 * If called for a page that is still mapped by mlocked vmas, all we do
123 * is revert to lazy LRU behaviour -- semantics are not broken.
124 */
125extern void __clear_page_mlock(struct page *page);
126static inline void clear_page_mlock(struct page *page)
127{
128 if (unlikely(TestClearPageMlocked(page)))
129 __clear_page_mlock(page);
130}
131
132/*
133 * mlock_migrate_page - called only from migrate_page_copy() to
134 * migrate the Mlocked page flag; update statistics.
135 */
136static inline void mlock_migrate_page(struct page *newpage, struct page *page)
137{
138 if (TestClearPageMlocked(page)) {
139 unsigned long flags;
140
141 local_irq_save(flags);
142 __dec_zone_page_state(page, NR_MLOCK);
143 SetPageMlocked(newpage);
144 __inc_zone_page_state(newpage, NR_MLOCK);
145 local_irq_restore(flags);
146 }
147}
148
149/*
150 * free_page_mlock() -- clean up attempts to free an mlocked() page.
151 * Page should not be on lru, so no need to fix that up.
152 * free_pages_check() will verify...
153 */
154static inline void free_page_mlock(struct page *page)
155{
156 if (unlikely(TestClearPageMlocked(page))) {
157 unsigned long flags;
158
159 local_irq_save(flags);
160 __dec_zone_page_state(page, NR_MLOCK);
161 __count_vm_event(UNEVICTABLE_MLOCKFREED);
162 local_irq_restore(flags);
163 }
164}
165
166#else /* CONFIG_UNEVICTABLE_LRU */
167static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
168{
169 return 0;
170}
171static inline void clear_page_mlock(struct page *page) { }
172static inline void mlock_vma_page(struct page *page) { }
173static inline void mlock_migrate_page(struct page *new, struct page *old) { }
174static inline void free_page_mlock(struct page *page) { }
175
176#endif /* CONFIG_UNEVICTABLE_LRU */
177
55/* 178/*
56 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, 179 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
57 * so all functions starting at paging_init should be marked __init 180 * so all functions starting at paging_init should be marked __init
@@ -120,4 +243,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
120} 243}
121#endif /* CONFIG_SPARSEMEM */ 244#endif /* CONFIG_SPARSEMEM */
122 245
246#define GUP_FLAGS_WRITE 0x1
247#define GUP_FLAGS_FORCE 0x2
248#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
249
250int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
251 unsigned long start, int len, int flags,
252 struct page **pages, struct vm_area_struct **vmas);
253
123#endif 254#endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 36896f3eb7f5..d4a92b63e98e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,11 +32,12 @@
32#include <linux/fs.h> 32#include <linux/fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/vmalloc.h> 34#include <linux/vmalloc.h>
35#include <linux/mm_inline.h>
36#include <linux/page_cgroup.h>
35 37
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37 39
38struct cgroup_subsys mem_cgroup_subsys __read_mostly; 40struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static struct kmem_cache *page_cgroup_cache __read_mostly;
40#define MEM_CGROUP_RECLAIM_RETRIES 5 41#define MEM_CGROUP_RECLAIM_RETRIES 5
41 42
42/* 43/*
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
65/* 66/*
66 * For accounting under irq disable, no need for increment preempt count. 67 * For accounting under irq disable, no need for increment preempt count.
67 */ 68 */
68static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, 69static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
69 enum mem_cgroup_stat_index idx, int val) 70 enum mem_cgroup_stat_index idx, int val)
70{ 71{
71 int cpu = smp_processor_id(); 72 stat->count[idx] += val;
72 stat->cpustat[cpu].count[idx] += val;
73} 73}
74 74
75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
85/* 85/*
86 * per-zone information in memory controller. 86 * per-zone information in memory controller.
87 */ 87 */
88
89enum mem_cgroup_zstat_index {
90 MEM_CGROUP_ZSTAT_ACTIVE,
91 MEM_CGROUP_ZSTAT_INACTIVE,
92
93 NR_MEM_CGROUP_ZSTAT,
94};
95
96struct mem_cgroup_per_zone { 88struct mem_cgroup_per_zone {
97 /* 89 /*
98 * spin_lock to protect the per cgroup LRU 90 * spin_lock to protect the per cgroup LRU
99 */ 91 */
100 spinlock_t lru_lock; 92 spinlock_t lru_lock;
101 struct list_head active_list; 93 struct list_head lists[NR_LRU_LISTS];
102 struct list_head inactive_list; 94 unsigned long count[NR_LRU_LISTS];
103 unsigned long count[NR_MEM_CGROUP_ZSTAT];
104}; 95};
105/* Macro for accessing counter */ 96/* Macro for accessing counter */
106#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 97#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -144,69 +135,52 @@ struct mem_cgroup {
144}; 135};
145static struct mem_cgroup init_mem_cgroup; 136static struct mem_cgroup init_mem_cgroup;
146 137
147/*
148 * We use the lower bit of the page->page_cgroup pointer as a bit spin
149 * lock. We need to ensure that page->page_cgroup is at least two
150 * byte aligned (based on comments from Nick Piggin). But since
151 * bit_spin_lock doesn't actually set that lock bit in a non-debug
152 * uniprocessor kernel, we should avoid setting it here too.
153 */
154#define PAGE_CGROUP_LOCK_BIT 0x0
155#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
156#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
157#else
158#define PAGE_CGROUP_LOCK 0x0
159#endif
160
161/*
162 * A page_cgroup page is associated with every page descriptor. The
163 * page_cgroup helps us identify information about the cgroup
164 */
165struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page;
168 struct mem_cgroup *mem_cgroup;
169 int flags;
170};
171#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
172#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
173
174static int page_cgroup_nid(struct page_cgroup *pc)
175{
176 return page_to_nid(pc->page);
177}
178
179static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
180{
181 return page_zonenum(pc->page);
182}
183
184enum charge_type { 138enum charge_type {
185 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 139 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
186 MEM_CGROUP_CHARGE_TYPE_MAPPED, 140 MEM_CGROUP_CHARGE_TYPE_MAPPED,
141 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
187 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 142 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
143 NR_CHARGE_TYPE,
144};
145
146/* only for here (for easy reading.) */
147#define PCGF_CACHE (1UL << PCG_CACHE)
148#define PCGF_USED (1UL << PCG_USED)
149#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
150#define PCGF_LOCK (1UL << PCG_LOCK)
151#define PCGF_FILE (1UL << PCG_FILE)
152static const unsigned long
153pcg_default_flags[NR_CHARGE_TYPE] = {
154 PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
155 PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
156 PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
157 0, /* FORCE */
188}; 158};
189 159
190/* 160/*
191 * Always modified under lru lock. Then, not necessary to preempt_disable() 161 * Always modified under lru lock. Then, not necessary to preempt_disable()
192 */ 162 */
193static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, 163static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
194 bool charge) 164 struct page_cgroup *pc,
165 bool charge)
195{ 166{
196 int val = (charge)? 1 : -1; 167 int val = (charge)? 1 : -1;
197 struct mem_cgroup_stat *stat = &mem->stat; 168 struct mem_cgroup_stat *stat = &mem->stat;
169 struct mem_cgroup_stat_cpu *cpustat;
198 170
199 VM_BUG_ON(!irqs_disabled()); 171 VM_BUG_ON(!irqs_disabled());
200 if (flags & PAGE_CGROUP_FLAG_CACHE) 172
201 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); 173 cpustat = &stat->cpustat[smp_processor_id()];
174 if (PageCgroupCache(pc))
175 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
202 else 176 else
203 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); 177 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
204 178
205 if (charge) 179 if (charge)
206 __mem_cgroup_stat_add_safe(stat, 180 __mem_cgroup_stat_add_safe(cpustat,
207 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 181 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
208 else 182 else
209 __mem_cgroup_stat_add_safe(stat, 183 __mem_cgroup_stat_add_safe(cpustat,
210 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 184 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
211} 185}
212 186
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
227} 201}
228 202
229static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 203static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
230 enum mem_cgroup_zstat_index idx) 204 enum lru_list idx)
231{ 205{
232 int nid, zid; 206 int nid, zid;
233 struct mem_cgroup_per_zone *mz; 207 struct mem_cgroup_per_zone *mz;
@@ -262,85 +236,77 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
262 struct mem_cgroup, css); 236 struct mem_cgroup, css);
263} 237}
264 238
265static inline int page_cgroup_locked(struct page *page)
266{
267 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
268}
269
270static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
271{
272 VM_BUG_ON(!page_cgroup_locked(page));
273 page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
274}
275
276struct page_cgroup *page_get_page_cgroup(struct page *page)
277{
278 return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
279}
280
281static void lock_page_cgroup(struct page *page)
282{
283 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
284}
285
286static int try_lock_page_cgroup(struct page *page)
287{
288 return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
289}
290
291static void unlock_page_cgroup(struct page *page)
292{
293 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
294}
295
296static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 239static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
297 struct page_cgroup *pc) 240 struct page_cgroup *pc)
298{ 241{
299 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 242 int lru = LRU_BASE;
243
244 if (PageCgroupUnevictable(pc))
245 lru = LRU_UNEVICTABLE;
246 else {
247 if (PageCgroupActive(pc))
248 lru += LRU_ACTIVE;
249 if (PageCgroupFile(pc))
250 lru += LRU_FILE;
251 }
300 252
301 if (from) 253 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
302 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
303 else
304 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
305 254
306 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 255 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
307 list_del(&pc->lru); 256 list_del(&pc->lru);
308} 257}
309 258
310static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 259static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
311 struct page_cgroup *pc) 260 struct page_cgroup *pc)
312{ 261{
313 int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 262 int lru = LRU_BASE;
314 263
315 if (!to) { 264 if (PageCgroupUnevictable(pc))
316 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 265 lru = LRU_UNEVICTABLE;
317 list_add(&pc->lru, &mz->inactive_list); 266 else {
318 } else { 267 if (PageCgroupActive(pc))
319 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 268 lru += LRU_ACTIVE;
320 list_add(&pc->lru, &mz->active_list); 269 if (PageCgroupFile(pc))
270 lru += LRU_FILE;
321 } 271 }
322 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); 272
273 MEM_CGROUP_ZSTAT(mz, lru) += 1;
274 list_add(&pc->lru, &mz->lists[lru]);
275
276 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
323} 277}
324 278
325static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) 279static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
326{ 280{
327 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
328 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 281 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
282 int active = PageCgroupActive(pc);
283 int file = PageCgroupFile(pc);
284 int unevictable = PageCgroupUnevictable(pc);
285 enum lru_list from = unevictable ? LRU_UNEVICTABLE :
286 (LRU_FILE * !!file + !!active);
329 287
330 if (from) 288 if (lru == from)
331 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; 289 return;
332 else
333 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
334 290
335 if (active) { 291 MEM_CGROUP_ZSTAT(mz, from) -= 1;
336 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 292 /*
337 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; 293 * However this is done under mz->lru_lock, another flags, which
338 list_move(&pc->lru, &mz->active_list); 294 * are not related to LRU, will be modified from out-of-lock.
295 * We have to use atomic set/clear flags.
296 */
297 if (is_unevictable_lru(lru)) {
298 ClearPageCgroupActive(pc);
299 SetPageCgroupUnevictable(pc);
339 } else { 300 } else {
340 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 301 if (is_active_lru(lru))
341 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; 302 SetPageCgroupActive(pc);
342 list_move(&pc->lru, &mz->inactive_list); 303 else
304 ClearPageCgroupActive(pc);
305 ClearPageCgroupUnevictable(pc);
343 } 306 }
307
308 MEM_CGROUP_ZSTAT(mz, lru) += 1;
309 list_move(&pc->lru, &mz->lists[lru]);
344} 310}
345 311
346int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 312int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -356,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
356/* 322/*
357 * This routine assumes that the appropriate zone's lru lock is already held 323 * This routine assumes that the appropriate zone's lru lock is already held
358 */ 324 */
359void mem_cgroup_move_lists(struct page *page, bool active) 325void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
360{ 326{
361 struct page_cgroup *pc; 327 struct page_cgroup *pc;
362 struct mem_cgroup_per_zone *mz; 328 struct mem_cgroup_per_zone *mz;
@@ -372,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
372 * safely get to page_cgroup without it, so just try_lock it: 338 * safely get to page_cgroup without it, so just try_lock it:
373 * mem_cgroup_isolate_pages allows for page left on wrong list. 339 * mem_cgroup_isolate_pages allows for page left on wrong list.
374 */ 340 */
375 if (!try_lock_page_cgroup(page)) 341 pc = lookup_page_cgroup(page);
342 if (!trylock_page_cgroup(pc))
376 return; 343 return;
377 344 if (pc && PageCgroupUsed(pc)) {
378 pc = page_get_page_cgroup(page);
379 if (pc) {
380 mz = page_cgroup_zoneinfo(pc); 345 mz = page_cgroup_zoneinfo(pc);
381 spin_lock_irqsave(&mz->lru_lock, flags); 346 spin_lock_irqsave(&mz->lru_lock, flags);
382 __mem_cgroup_move_lists(pc, active); 347 __mem_cgroup_move_lists(pc, lru);
383 spin_unlock_irqrestore(&mz->lru_lock, flags); 348 spin_unlock_irqrestore(&mz->lru_lock, flags);
384 } 349 }
385 unlock_page_cgroup(page); 350 unlock_page_cgroup(pc);
386} 351}
387 352
388/* 353/*
@@ -403,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
403} 368}
404 369
405/* 370/*
406 * This function is called from vmscan.c. In page reclaiming loop. balance
407 * between active and inactive list is calculated. For memory controller
408 * page reclaiming, we should use using mem_cgroup's imbalance rather than
409 * zone's global lru imbalance.
410 */
411long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
412{
413 unsigned long active, inactive;
414 /* active and inactive are the number of pages. 'long' is ok.*/
415 active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
416 inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
417 return (long) (active / (inactive + 1));
418}
419
420/*
421 * prev_priority control...this will be used in memory reclaim path. 371 * prev_priority control...this will be used in memory reclaim path.
422 */ 372 */
423int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 373int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -444,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
444 * (see include/linux/mmzone.h) 394 * (see include/linux/mmzone.h)
445 */ 395 */
446 396
447long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, 397long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
448 struct zone *zone, int priority) 398 int priority, enum lru_list lru)
449{ 399{
450 long nr_active; 400 long nr_pages;
451 int nid = zone->zone_pgdat->node_id; 401 int nid = zone->zone_pgdat->node_id;
452 int zid = zone_idx(zone); 402 int zid = zone_idx(zone);
453 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 403 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
454 404
455 nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); 405 nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
456 return (nr_active >> priority);
457}
458 406
459long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, 407 return (nr_pages >> priority);
460 struct zone *zone, int priority)
461{
462 long nr_inactive;
463 int nid = zone->zone_pgdat->node_id;
464 int zid = zone_idx(zone);
465 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
466
467 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
468 return (nr_inactive >> priority);
469} 408}
470 409
471unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 410unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -473,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
473 unsigned long *scanned, int order, 412 unsigned long *scanned, int order,
474 int mode, struct zone *z, 413 int mode, struct zone *z,
475 struct mem_cgroup *mem_cont, 414 struct mem_cgroup *mem_cont,
476 int active) 415 int active, int file)
477{ 416{
478 unsigned long nr_taken = 0; 417 unsigned long nr_taken = 0;
479 struct page *page; 418 struct page *page;
@@ -484,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
484 int nid = z->zone_pgdat->node_id; 423 int nid = z->zone_pgdat->node_id;
485 int zid = zone_idx(z); 424 int zid = zone_idx(z);
486 struct mem_cgroup_per_zone *mz; 425 struct mem_cgroup_per_zone *mz;
426 int lru = LRU_FILE * !!file + !!active;
487 427
488 BUG_ON(!mem_cont); 428 BUG_ON(!mem_cont);
489 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 429 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
490 if (active) 430 src = &mz->lists[lru];
491 src = &mz->active_list;
492 else
493 src = &mz->inactive_list;
494
495 431
496 spin_lock(&mz->lru_lock); 432 spin_lock(&mz->lru_lock);
497 scan = 0; 433 scan = 0;
498 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 434 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
499 if (scan >= nr_to_scan) 435 if (scan >= nr_to_scan)
500 break; 436 break;
437 if (unlikely(!PageCgroupUsed(pc)))
438 continue;
501 page = pc->page; 439 page = pc->page;
502 440
503 if (unlikely(!PageLRU(page))) 441 if (unlikely(!PageLRU(page)))
504 continue; 442 continue;
505 443
506 if (PageActive(page) && !active) { 444 /*
507 __mem_cgroup_move_lists(pc, true); 445 * TODO: play better with lumpy reclaim, grabbing anything.
508 continue; 446 */
509 } 447 if (PageUnevictable(page) ||
510 if (!PageActive(page) && active) { 448 (PageActive(page) && !active) ||
511 __mem_cgroup_move_lists(pc, false); 449 (!PageActive(page) && active)) {
450 __mem_cgroup_move_lists(pc, page_lru(page));
512 continue; 451 continue;
513 } 452 }
514 453
515 scan++; 454 scan++;
516 list_move(&pc->lru, &pc_list); 455 list_move(&pc->lru, &pc_list);
517 456
518 if (__isolate_lru_page(page, mode) == 0) { 457 if (__isolate_lru_page(page, mode, file) == 0) {
519 list_move(&page->lru, dst); 458 list_move(&page->lru, dst);
520 nr_taken++; 459 nr_taken++;
521 } 460 }
@@ -540,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
540{ 479{
541 struct mem_cgroup *mem; 480 struct mem_cgroup *mem;
542 struct page_cgroup *pc; 481 struct page_cgroup *pc;
543 unsigned long flags;
544 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 482 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
545 struct mem_cgroup_per_zone *mz; 483 struct mem_cgroup_per_zone *mz;
484 unsigned long flags;
546 485
547 pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); 486 pc = lookup_page_cgroup(page);
548 if (unlikely(pc == NULL)) 487 /* can happen at boot */
549 goto err; 488 if (unlikely(!pc))
550 489 return 0;
490 prefetchw(pc);
551 /* 491 /*
552 * We always charge the cgroup the mm_struct belongs to. 492 * We always charge the cgroup the mm_struct belongs to.
553 * The mm_struct's mem_cgroup changes on task migration if the 493 * The mm_struct's mem_cgroup changes on task migration if the
554 * thread group leader migrates. It's possible that mm is not 494 * thread group leader migrates. It's possible that mm is not
555 * set, if so charge the init_mm (happens for pagecache usage). 495 * set, if so charge the init_mm (happens for pagecache usage).
556 */ 496 */
497
557 if (likely(!memcg)) { 498 if (likely(!memcg)) {
558 rcu_read_lock(); 499 rcu_read_lock();
559 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 500 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
560 if (unlikely(!mem)) { 501 if (unlikely(!mem)) {
561 rcu_read_unlock(); 502 rcu_read_unlock();
562 kmem_cache_free(page_cgroup_cache, pc);
563 return 0; 503 return 0;
564 } 504 }
565 /* 505 /*
@@ -572,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
572 css_get(&memcg->css); 512 css_get(&memcg->css);
573 } 513 }
574 514
575 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 515 while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
576 if (!(gfp_mask & __GFP_WAIT)) 516 if (!(gfp_mask & __GFP_WAIT))
577 goto out; 517 goto out;
578 518
@@ -595,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
595 } 535 }
596 } 536 }
597 537
598 pc->mem_cgroup = mem;
599 pc->page = page;
600 /*
601 * If a page is accounted as a page cache, insert to inactive list.
602 * If anon, insert to active list.
603 */
604 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
605 pc->flags = PAGE_CGROUP_FLAG_CACHE;
606 else
607 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
608 538
609 lock_page_cgroup(page); 539 lock_page_cgroup(pc);
610 if (unlikely(page_get_page_cgroup(page))) { 540 if (unlikely(PageCgroupUsed(pc))) {
611 unlock_page_cgroup(page); 541 unlock_page_cgroup(pc);
612 res_counter_uncharge(&mem->res, PAGE_SIZE); 542 res_counter_uncharge(&mem->res, PAGE_SIZE);
613 css_put(&mem->css); 543 css_put(&mem->css);
614 kmem_cache_free(page_cgroup_cache, pc); 544
615 goto done; 545 goto done;
616 } 546 }
617 page_assign_page_cgroup(page, pc); 547 pc->mem_cgroup = mem;
548 /*
549 * If a page is accounted as a page cache, insert to inactive list.
550 * If anon, insert to active list.
551 */
552 pc->flags = pcg_default_flags[ctype];
618 553
619 mz = page_cgroup_zoneinfo(pc); 554 mz = page_cgroup_zoneinfo(pc);
555
620 spin_lock_irqsave(&mz->lru_lock, flags); 556 spin_lock_irqsave(&mz->lru_lock, flags);
621 __mem_cgroup_add_list(mz, pc); 557 __mem_cgroup_add_list(mz, pc);
622 spin_unlock_irqrestore(&mz->lru_lock, flags); 558 spin_unlock_irqrestore(&mz->lru_lock, flags);
559 unlock_page_cgroup(pc);
623 560
624 unlock_page_cgroup(page);
625done: 561done:
626 return 0; 562 return 0;
627out: 563out:
628 css_put(&mem->css); 564 css_put(&mem->css);
629 kmem_cache_free(page_cgroup_cache, pc);
630err:
631 return -ENOMEM; 565 return -ENOMEM;
632} 566}
633 567
@@ -635,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
635{ 569{
636 if (mem_cgroup_subsys.disabled) 570 if (mem_cgroup_subsys.disabled)
637 return 0; 571 return 0;
638 572 if (PageCompound(page))
573 return 0;
639 /* 574 /*
640 * If already mapped, we don't have to account. 575 * If already mapped, we don't have to account.
641 * If page cache, page->mapping has address_space. 576 * If page cache, page->mapping has address_space.
@@ -656,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
656{ 591{
657 if (mem_cgroup_subsys.disabled) 592 if (mem_cgroup_subsys.disabled)
658 return 0; 593 return 0;
659 594 if (PageCompound(page))
595 return 0;
660 /* 596 /*
661 * Corner case handling. This is called from add_to_page_cache() 597 * Corner case handling. This is called from add_to_page_cache()
662 * in usual. But some FS (shmem) precharges this page before calling it 598 * in usual. But some FS (shmem) precharges this page before calling it
@@ -669,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
669 if (!(gfp_mask & __GFP_WAIT)) { 605 if (!(gfp_mask & __GFP_WAIT)) {
670 struct page_cgroup *pc; 606 struct page_cgroup *pc;
671 607
672 lock_page_cgroup(page); 608
673 pc = page_get_page_cgroup(page); 609 pc = lookup_page_cgroup(page);
674 if (pc) { 610 if (!pc)
675 VM_BUG_ON(pc->page != page); 611 return 0;
676 VM_BUG_ON(!pc->mem_cgroup); 612 lock_page_cgroup(pc);
677 unlock_page_cgroup(page); 613 if (PageCgroupUsed(pc)) {
614 unlock_page_cgroup(pc);
678 return 0; 615 return 0;
679 } 616 }
680 unlock_page_cgroup(page); 617 unlock_page_cgroup(pc);
681 } 618 }
682 619
683 if (unlikely(!mm)) 620 if (unlikely(!mm))
684 mm = &init_mm; 621 mm = &init_mm;
685 622
686 return mem_cgroup_charge_common(page, mm, gfp_mask, 623 if (page_is_file_cache(page))
624 return mem_cgroup_charge_common(page, mm, gfp_mask,
687 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 625 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
626 else
627 return mem_cgroup_charge_common(page, mm, gfp_mask,
628 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
688} 629}
689 630
690/* 631/*
@@ -704,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
704 /* 645 /*
705 * Check if our page_cgroup is valid 646 * Check if our page_cgroup is valid
706 */ 647 */
707 lock_page_cgroup(page); 648 pc = lookup_page_cgroup(page);
708 pc = page_get_page_cgroup(page); 649 if (unlikely(!pc || !PageCgroupUsed(pc)))
709 if (unlikely(!pc)) 650 return;
710 goto unlock;
711
712 VM_BUG_ON(pc->page != page);
713 651
714 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 652 lock_page_cgroup(pc);
715 && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) 653 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
716 || page_mapped(page))) 654 || !PageCgroupUsed(pc)) {
717 goto unlock; 655 /* This happens at race in zap_pte_range() and do_swap_page()*/
656 unlock_page_cgroup(pc);
657 return;
658 }
659 ClearPageCgroupUsed(pc);
660 mem = pc->mem_cgroup;
718 661
719 mz = page_cgroup_zoneinfo(pc); 662 mz = page_cgroup_zoneinfo(pc);
720 spin_lock_irqsave(&mz->lru_lock, flags); 663 spin_lock_irqsave(&mz->lru_lock, flags);
721 __mem_cgroup_remove_list(mz, pc); 664 __mem_cgroup_remove_list(mz, pc);
722 spin_unlock_irqrestore(&mz->lru_lock, flags); 665 spin_unlock_irqrestore(&mz->lru_lock, flags);
666 unlock_page_cgroup(pc);
723 667
724 page_assign_page_cgroup(page, NULL);
725 unlock_page_cgroup(page);
726
727 mem = pc->mem_cgroup;
728 res_counter_uncharge(&mem->res, PAGE_SIZE); 668 res_counter_uncharge(&mem->res, PAGE_SIZE);
729 css_put(&mem->css); 669 css_put(&mem->css);
730 670
731 kmem_cache_free(page_cgroup_cache, pc);
732 return; 671 return;
733unlock:
734 unlock_page_cgroup(page);
735} 672}
736 673
737void mem_cgroup_uncharge_page(struct page *page) 674void mem_cgroup_uncharge_page(struct page *page)
738{ 675{
676 /* early check. */
677 if (page_mapped(page))
678 return;
679 if (page->mapping && !PageAnon(page))
680 return;
739 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 681 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
740} 682}
741 683
742void mem_cgroup_uncharge_cache_page(struct page *page) 684void mem_cgroup_uncharge_cache_page(struct page *page)
743{ 685{
744 VM_BUG_ON(page_mapped(page)); 686 VM_BUG_ON(page_mapped(page));
687 VM_BUG_ON(page->mapping);
745 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 688 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
746} 689}
747 690
@@ -758,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
758 if (mem_cgroup_subsys.disabled) 701 if (mem_cgroup_subsys.disabled)
759 return 0; 702 return 0;
760 703
761 lock_page_cgroup(page); 704 pc = lookup_page_cgroup(page);
762 pc = page_get_page_cgroup(page); 705 lock_page_cgroup(pc);
763 if (pc) { 706 if (PageCgroupUsed(pc)) {
764 mem = pc->mem_cgroup; 707 mem = pc->mem_cgroup;
765 css_get(&mem->css); 708 css_get(&mem->css);
766 if (pc->flags & PAGE_CGROUP_FLAG_CACHE) 709 if (PageCgroupCache(pc)) {
767 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 710 if (page_is_file_cache(page))
711 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
712 else
713 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
714 }
768 } 715 }
769 unlock_page_cgroup(page); 716 unlock_page_cgroup(pc);
770 if (mem) { 717 if (mem) {
771 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, 718 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
772 ctype, mem); 719 ctype, mem);
@@ -791,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
791 */ 738 */
792 if (!newpage->mapping) 739 if (!newpage->mapping)
793 __mem_cgroup_uncharge_common(newpage, 740 __mem_cgroup_uncharge_common(newpage,
794 MEM_CGROUP_CHARGE_TYPE_FORCE); 741 MEM_CGROUP_CHARGE_TYPE_FORCE);
795 else if (PageAnon(newpage)) 742 else if (PageAnon(newpage))
796 mem_cgroup_uncharge_page(newpage); 743 mem_cgroup_uncharge_page(newpage);
797} 744}
@@ -863,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
863#define FORCE_UNCHARGE_BATCH (128) 810#define FORCE_UNCHARGE_BATCH (128)
864static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, 811static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
865 struct mem_cgroup_per_zone *mz, 812 struct mem_cgroup_per_zone *mz,
866 int active) 813 enum lru_list lru)
867{ 814{
868 struct page_cgroup *pc; 815 struct page_cgroup *pc;
869 struct page *page; 816 struct page *page;
@@ -871,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
871 unsigned long flags; 818 unsigned long flags;
872 struct list_head *list; 819 struct list_head *list;
873 820
874 if (active) 821 list = &mz->lists[lru];
875 list = &mz->active_list;
876 else
877 list = &mz->inactive_list;
878 822
879 spin_lock_irqsave(&mz->lru_lock, flags); 823 spin_lock_irqsave(&mz->lru_lock, flags);
880 while (!list_empty(list)) { 824 while (!list_empty(list)) {
881 pc = list_entry(list->prev, struct page_cgroup, lru); 825 pc = list_entry(list->prev, struct page_cgroup, lru);
882 page = pc->page; 826 page = pc->page;
827 if (!PageCgroupUsed(pc))
828 break;
883 get_page(page); 829 get_page(page);
884 spin_unlock_irqrestore(&mz->lru_lock, flags); 830 spin_unlock_irqrestore(&mz->lru_lock, flags);
885 /* 831 /*
@@ -894,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
894 count = FORCE_UNCHARGE_BATCH; 840 count = FORCE_UNCHARGE_BATCH;
895 cond_resched(); 841 cond_resched();
896 } 842 }
897 } else 843 } else {
898 cond_resched(); 844 spin_lock_irqsave(&mz->lru_lock, flags);
845 break;
846 }
899 spin_lock_irqsave(&mz->lru_lock, flags); 847 spin_lock_irqsave(&mz->lru_lock, flags);
900 } 848 }
901 spin_unlock_irqrestore(&mz->lru_lock, flags); 849 spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -919,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
919 while (mem->res.usage > 0) { 867 while (mem->res.usage > 0) {
920 if (atomic_read(&mem->css.cgroup->count) > 0) 868 if (atomic_read(&mem->css.cgroup->count) > 0)
921 goto out; 869 goto out;
870 /* This is for making all *used* pages to be on LRU. */
871 lru_add_drain_all();
922 for_each_node_state(node, N_POSSIBLE) 872 for_each_node_state(node, N_POSSIBLE)
923 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 873 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
924 struct mem_cgroup_per_zone *mz; 874 struct mem_cgroup_per_zone *mz;
875 enum lru_list l;
925 mz = mem_cgroup_zoneinfo(mem, node, zid); 876 mz = mem_cgroup_zoneinfo(mem, node, zid);
926 /* drop all page_cgroup in active_list */ 877 for_each_lru(l)
927 mem_cgroup_force_empty_list(mem, mz, 1); 878 mem_cgroup_force_empty_list(mem, mz, l);
928 /* drop all page_cgroup in inactive_list */
929 mem_cgroup_force_empty_list(mem, mz, 0);
930 } 879 }
880 cond_resched();
931 } 881 }
932 ret = 0; 882 ret = 0;
933out: 883out:
@@ -1012,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1012 } 962 }
1013 /* showing # of active pages */ 963 /* showing # of active pages */
1014 { 964 {
1015 unsigned long active, inactive; 965 unsigned long active_anon, inactive_anon;
1016 966 unsigned long active_file, inactive_file;
1017 inactive = mem_cgroup_get_all_zonestat(mem_cont, 967 unsigned long unevictable;
1018 MEM_CGROUP_ZSTAT_INACTIVE); 968
1019 active = mem_cgroup_get_all_zonestat(mem_cont, 969 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1020 MEM_CGROUP_ZSTAT_ACTIVE); 970 LRU_INACTIVE_ANON);
1021 cb->fill(cb, "active", (active) * PAGE_SIZE); 971 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1022 cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); 972 LRU_ACTIVE_ANON);
973 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
974 LRU_INACTIVE_FILE);
975 active_file = mem_cgroup_get_all_zonestat(mem_cont,
976 LRU_ACTIVE_FILE);
977 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
978 LRU_UNEVICTABLE);
979
980 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
981 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
982 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
983 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
984 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
985
1023 } 986 }
1024 return 0; 987 return 0;
1025} 988}
@@ -1062,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1062{ 1025{
1063 struct mem_cgroup_per_node *pn; 1026 struct mem_cgroup_per_node *pn;
1064 struct mem_cgroup_per_zone *mz; 1027 struct mem_cgroup_per_zone *mz;
1028 enum lru_list l;
1065 int zone, tmp = node; 1029 int zone, tmp = node;
1066 /* 1030 /*
1067 * This routine is called against possible nodes. 1031 * This routine is called against possible nodes.
@@ -1082,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1082 1046
1083 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 1047 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1084 mz = &pn->zoneinfo[zone]; 1048 mz = &pn->zoneinfo[zone];
1085 INIT_LIST_HEAD(&mz->active_list);
1086 INIT_LIST_HEAD(&mz->inactive_list);
1087 spin_lock_init(&mz->lru_lock); 1049 spin_lock_init(&mz->lru_lock);
1050 for_each_lru(l)
1051 INIT_LIST_HEAD(&mz->lists[l]);
1088 } 1052 }
1089 return 0; 1053 return 0;
1090} 1054}
@@ -1124,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1124 int node; 1088 int node;
1125 1089
1126 if (unlikely((cont->parent) == NULL)) { 1090 if (unlikely((cont->parent) == NULL)) {
1091 page_cgroup_init();
1127 mem = &init_mem_cgroup; 1092 mem = &init_mem_cgroup;
1128 page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
1129 } else { 1093 } else {
1130 mem = mem_cgroup_alloc(); 1094 mem = mem_cgroup_alloc();
1131 if (!mem) 1095 if (!mem)
diff --git a/mm/memory.c b/mm/memory.c
index 1002f473f497..164951c47305 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1129,12 +1129,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1129 return !vma->vm_ops || !vma->vm_ops->fault; 1129 return !vma->vm_ops || !vma->vm_ops->fault;
1130} 1130}
1131 1131
1132int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1132
1133 unsigned long start, int len, int write, int force, 1133
1134int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1135 unsigned long start, int len, int flags,
1134 struct page **pages, struct vm_area_struct **vmas) 1136 struct page **pages, struct vm_area_struct **vmas)
1135{ 1137{
1136 int i; 1138 int i;
1137 unsigned int vm_flags; 1139 unsigned int vm_flags = 0;
1140 int write = !!(flags & GUP_FLAGS_WRITE);
1141 int force = !!(flags & GUP_FLAGS_FORCE);
1142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1138 1143
1139 if (len <= 0) 1144 if (len <= 0)
1140 return 0; 1145 return 0;
@@ -1158,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1158 pud_t *pud; 1163 pud_t *pud;
1159 pmd_t *pmd; 1164 pmd_t *pmd;
1160 pte_t *pte; 1165 pte_t *pte;
1161 if (write) /* user gate pages are read-only */ 1166
1167 /* user gate pages are read-only */
1168 if (!ignore && write)
1162 return i ? : -EFAULT; 1169 return i ? : -EFAULT;
1163 if (pg > TASK_SIZE) 1170 if (pg > TASK_SIZE)
1164 pgd = pgd_offset_k(pg); 1171 pgd = pgd_offset_k(pg);
@@ -1190,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1190 continue; 1197 continue;
1191 } 1198 }
1192 1199
1193 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1200 if (!vma ||
1194 || !(vm_flags & vma->vm_flags)) 1201 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1202 (!ignore && !(vm_flags & vma->vm_flags)))
1195 return i ? : -EFAULT; 1203 return i ? : -EFAULT;
1196 1204
1197 if (is_vm_hugetlb_page(vma)) { 1205 if (is_vm_hugetlb_page(vma)) {
@@ -1266,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1266 } while (len); 1274 } while (len);
1267 return i; 1275 return i;
1268} 1276}
1277
1278int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1279 unsigned long start, int len, int write, int force,
1280 struct page **pages, struct vm_area_struct **vmas)
1281{
1282 int flags = 0;
1283
1284 if (write)
1285 flags |= GUP_FLAGS_WRITE;
1286 if (force)
1287 flags |= GUP_FLAGS_FORCE;
1288
1289 return __get_user_pages(tsk, mm,
1290 start, len, flags,
1291 pages, vmas);
1292}
1293
1269EXPORT_SYMBOL(get_user_pages); 1294EXPORT_SYMBOL(get_user_pages);
1270 1295
1271pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1296pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -1296,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1296 pte_t *pte; 1321 pte_t *pte;
1297 spinlock_t *ptl; 1322 spinlock_t *ptl;
1298 1323
1299 retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
1300 if (retval)
1301 goto out;
1302
1303 retval = -EINVAL; 1324 retval = -EINVAL;
1304 if (PageAnon(page)) 1325 if (PageAnon(page))
1305 goto out_uncharge; 1326 goto out;
1306 retval = -ENOMEM; 1327 retval = -ENOMEM;
1307 flush_dcache_page(page); 1328 flush_dcache_page(page);
1308 pte = get_locked_pte(mm, addr, &ptl); 1329 pte = get_locked_pte(mm, addr, &ptl);
1309 if (!pte) 1330 if (!pte)
1310 goto out_uncharge; 1331 goto out;
1311 retval = -EBUSY; 1332 retval = -EBUSY;
1312 if (!pte_none(*pte)) 1333 if (!pte_none(*pte))
1313 goto out_unlock; 1334 goto out_unlock;
@@ -1323,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1323 return retval; 1344 return retval;
1324out_unlock: 1345out_unlock:
1325 pte_unmap_unlock(pte, ptl); 1346 pte_unmap_unlock(pte, ptl);
1326out_uncharge:
1327 mem_cgroup_uncharge_page(page);
1328out: 1347out:
1329 return retval; 1348 return retval;
1330} 1349}
@@ -1858,6 +1877,15 @@ gotten:
1858 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1877 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1859 if (!new_page) 1878 if (!new_page)
1860 goto oom; 1879 goto oom;
1880 /*
1881 * Don't let another task, with possibly unlocked vma,
1882 * keep the mlocked page.
1883 */
1884 if (vma->vm_flags & VM_LOCKED) {
1885 lock_page(old_page); /* for LRU manipulation */
1886 clear_page_mlock(old_page);
1887 unlock_page(old_page);
1888 }
1861 cow_user_page(new_page, old_page, address, vma); 1889 cow_user_page(new_page, old_page, address, vma);
1862 __SetPageUptodate(new_page); 1890 __SetPageUptodate(new_page);
1863 1891
@@ -1886,11 +1914,13 @@ gotten:
1886 * thread doing COW. 1914 * thread doing COW.
1887 */ 1915 */
1888 ptep_clear_flush_notify(vma, address, page_table); 1916 ptep_clear_flush_notify(vma, address, page_table);
1889 set_pte_at(mm, address, page_table, entry); 1917 SetPageSwapBacked(new_page);
1890 update_mmu_cache(vma, address, entry); 1918 lru_cache_add_active_or_unevictable(new_page, vma);
1891 lru_cache_add_active(new_page);
1892 page_add_new_anon_rmap(new_page, vma, address); 1919 page_add_new_anon_rmap(new_page, vma, address);
1893 1920
1921//TODO: is this safe? do_anonymous_page() does it this way.
1922 set_pte_at(mm, address, page_table, entry);
1923 update_mmu_cache(vma, address, entry);
1894 if (old_page) { 1924 if (old_page) {
1895 /* 1925 /*
1896 * Only after switching the pte to the new page may 1926 * Only after switching the pte to the new page may
@@ -2288,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2288 count_vm_event(PGMAJFAULT); 2318 count_vm_event(PGMAJFAULT);
2289 } 2319 }
2290 2320
2321 mark_page_accessed(page);
2322
2323 lock_page(page);
2324 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2325
2291 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2326 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2292 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2293 ret = VM_FAULT_OOM; 2327 ret = VM_FAULT_OOM;
2328 unlock_page(page);
2294 goto out; 2329 goto out;
2295 } 2330 }
2296 2331
2297 mark_page_accessed(page);
2298 lock_page(page);
2299 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2300
2301 /* 2332 /*
2302 * Back out if somebody else already faulted in this pte. 2333 * Back out if somebody else already faulted in this pte.
2303 */ 2334 */
@@ -2324,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2324 page_add_anon_rmap(page, vma, address); 2355 page_add_anon_rmap(page, vma, address);
2325 2356
2326 swap_free(entry); 2357 swap_free(entry);
2327 if (vm_swap_full()) 2358 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2328 remove_exclusive_swap_page(page); 2359 remove_exclusive_swap_page(page);
2329 unlock_page(page); 2360 unlock_page(page);
2330 2361
@@ -2382,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2382 if (!pte_none(*page_table)) 2413 if (!pte_none(*page_table))
2383 goto release; 2414 goto release;
2384 inc_mm_counter(mm, anon_rss); 2415 inc_mm_counter(mm, anon_rss);
2385 lru_cache_add_active(page); 2416 SetPageSwapBacked(page);
2417 lru_cache_add_active_or_unevictable(page, vma);
2386 page_add_new_anon_rmap(page, vma, address); 2418 page_add_new_anon_rmap(page, vma, address);
2387 set_pte_at(mm, address, page_table, entry); 2419 set_pte_at(mm, address, page_table, entry);
2388 2420
@@ -2423,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2423 struct page *page; 2455 struct page *page;
2424 pte_t entry; 2456 pte_t entry;
2425 int anon = 0; 2457 int anon = 0;
2458 int charged = 0;
2426 struct page *dirty_page = NULL; 2459 struct page *dirty_page = NULL;
2427 struct vm_fault vmf; 2460 struct vm_fault vmf;
2428 int ret; 2461 int ret;
@@ -2463,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2463 ret = VM_FAULT_OOM; 2496 ret = VM_FAULT_OOM;
2464 goto out; 2497 goto out;
2465 } 2498 }
2499 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2500 ret = VM_FAULT_OOM;
2501 page_cache_release(page);
2502 goto out;
2503 }
2504 charged = 1;
2505 /*
2506 * Don't let another task, with possibly unlocked vma,
2507 * keep the mlocked page.
2508 */
2509 if (vma->vm_flags & VM_LOCKED)
2510 clear_page_mlock(vmf.page);
2466 copy_user_highpage(page, vmf.page, address, vma); 2511 copy_user_highpage(page, vmf.page, address, vma);
2467 __SetPageUptodate(page); 2512 __SetPageUptodate(page);
2468 } else { 2513 } else {
@@ -2497,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2497 2542
2498 } 2543 }
2499 2544
2500 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2501 ret = VM_FAULT_OOM;
2502 goto out;
2503 }
2504
2505 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2545 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2506 2546
2507 /* 2547 /*
@@ -2520,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2520 entry = mk_pte(page, vma->vm_page_prot); 2560 entry = mk_pte(page, vma->vm_page_prot);
2521 if (flags & FAULT_FLAG_WRITE) 2561 if (flags & FAULT_FLAG_WRITE)
2522 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2562 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2523 set_pte_at(mm, address, page_table, entry);
2524 if (anon) { 2563 if (anon) {
2525 inc_mm_counter(mm, anon_rss); 2564 inc_mm_counter(mm, anon_rss);
2526 lru_cache_add_active(page); 2565 SetPageSwapBacked(page);
2527 page_add_new_anon_rmap(page, vma, address); 2566 lru_cache_add_active_or_unevictable(page, vma);
2567 page_add_new_anon_rmap(page, vma, address);
2528 } else { 2568 } else {
2529 inc_mm_counter(mm, file_rss); 2569 inc_mm_counter(mm, file_rss);
2530 page_add_file_rmap(page); 2570 page_add_file_rmap(page);
@@ -2533,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2533 get_page(dirty_page); 2573 get_page(dirty_page);
2534 } 2574 }
2535 } 2575 }
2576//TODO: is this safe? do_anonymous_page() does it this way.
2577 set_pte_at(mm, address, page_table, entry);
2536 2578
2537 /* no need to invalidate: a not-present page won't be cached */ 2579 /* no need to invalidate: a not-present page won't be cached */
2538 update_mmu_cache(vma, address, entry); 2580 update_mmu_cache(vma, address, entry);
2539 } else { 2581 } else {
2540 mem_cgroup_uncharge_page(page); 2582 if (charged)
2583 mem_cgroup_uncharge_page(page);
2541 if (anon) 2584 if (anon)
2542 page_cache_release(page); 2585 page_cache_release(page);
2543 else 2586 else
@@ -2772,19 +2815,9 @@ int make_pages_present(unsigned long addr, unsigned long end)
2772 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 2815 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2773 ret = get_user_pages(current, current->mm, addr, 2816 ret = get_user_pages(current, current->mm, addr,
2774 len, write, 0, NULL, NULL); 2817 len, write, 0, NULL, NULL);
2775 if (ret < 0) { 2818 if (ret < 0)
2776 /*
2777 SUS require strange return value to mlock
2778 - invalid addr generate to ENOMEM.
2779 - out of memory should generate EAGAIN.
2780 */
2781 if (ret == -EFAULT)
2782 ret = -ENOMEM;
2783 else if (ret == -ENOMEM)
2784 ret = -EAGAIN;
2785 return ret; 2819 return ret;
2786 } 2820 return ret == len ? 0 : -EFAULT;
2787 return ret == len ? 0 : -ENOMEM;
2788} 2821}
2789 2822
2790#if !defined(__HAVE_ARCH_GATE_AREA) 2823#if !defined(__HAVE_ARCH_GATE_AREA)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 89fee2dcb039..6837a1014372 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
26#include <linux/delay.h> 26#include <linux/delay.h>
27#include <linux/migrate.h> 27#include <linux/migrate.h>
28#include <linux/page-isolation.h> 28#include <linux/page-isolation.h>
29#include <linux/pfn.h>
29 30
30#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
31 32
@@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
323 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 324 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
324 BUG_ON(nr_pages % PAGES_PER_SECTION); 325 BUG_ON(nr_pages % PAGES_PER_SECTION);
325 326
326 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
327
328 sections_to_remove = nr_pages / PAGES_PER_SECTION; 327 sections_to_remove = nr_pages / PAGES_PER_SECTION;
329 for (i = 0; i < sections_to_remove; i++) { 328 for (i = 0; i < sections_to_remove; i++) {
330 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 329 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
330 release_mem_region(pfn << PAGE_SHIFT,
331 PAGES_PER_SECTION << PAGE_SHIFT);
331 ret = __remove_section(zone, __pfn_to_section(pfn)); 332 ret = __remove_section(zone, __pfn_to_section(pfn));
332 if (ret) 333 if (ret)
333 break; 334 break;
@@ -657,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
657 * We can skip free pages. And we can only deal with pages on 658 * We can skip free pages. And we can only deal with pages on
658 * LRU. 659 * LRU.
659 */ 660 */
660 ret = isolate_lru_page(page, &source); 661 ret = isolate_lru_page(page);
661 if (!ret) { /* Success */ 662 if (!ret) { /* Success */
663 list_add_tail(&page->lru, &source);
662 move_pages--; 664 move_pages--;
663 } else { 665 } else {
664 /* Because we don't have big zone->lock. we should 666 /* Because we don't have big zone->lock. we should
@@ -849,10 +851,19 @@ failed_removal:
849 851
850 return ret; 852 return ret;
851} 853}
854
855int remove_memory(u64 start, u64 size)
856{
857 unsigned long start_pfn, end_pfn;
858
859 start_pfn = PFN_DOWN(start);
860 end_pfn = start_pfn + PFN_DOWN(size);
861 return offline_pages(start_pfn, end_pfn, 120 * HZ);
862}
852#else 863#else
853int remove_memory(u64 start, u64 size) 864int remove_memory(u64 start, u64 size)
854{ 865{
855 return -EINVAL; 866 return -EINVAL;
856} 867}
857EXPORT_SYMBOL_GPL(remove_memory);
858#endif /* CONFIG_MEMORY_HOTREMOVE */ 868#endif /* CONFIG_MEMORY_HOTREMOVE */
869EXPORT_SYMBOL_GPL(remove_memory);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 83369058ec13..36f42573a335 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,8 @@
93#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 94#include <asm/uaccess.h>
95 95
96#include "internal.h"
97
96/* Internal flags */ 98/* Internal flags */
97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 99#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 100#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
@@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
762 /* 764 /*
763 * Avoid migrating a page that is shared with others. 765 * Avoid migrating a page that is shared with others.
764 */ 766 */
765 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) 767 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
766 isolate_lru_page(page, pagelist); 768 if (!isolate_lru_page(page)) {
769 list_add_tail(&page->lru, pagelist);
770 }
771 }
767} 772}
768 773
769static struct page *new_node_page(struct page *page, unsigned long node, int **x) 774static struct page *new_node_page(struct page *page, unsigned long node, int **x)
@@ -2197,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
2197 if (PageSwapCache(page)) 2202 if (PageSwapCache(page))
2198 md->swapcache++; 2203 md->swapcache++;
2199 2204
2200 if (PageActive(page)) 2205 if (PageActive(page) || PageUnevictable(page))
2201 md->active++; 2206 md->active++;
2202 2207
2203 if (PageWriteback(page)) 2208 if (PageWriteback(page))
diff --git a/mm/migrate.c b/mm/migrate.c
index 2a80136b23bb..6602941bfab0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,36 +37,6 @@
37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
38 38
39/* 39/*
40 * Isolate one page from the LRU lists. If successful put it onto
41 * the indicated list with elevated page count.
42 *
43 * Result:
44 * -EBUSY: page not on LRU list
45 * 0: page removed from LRU list and added to the specified list.
46 */
47int isolate_lru_page(struct page *page, struct list_head *pagelist)
48{
49 int ret = -EBUSY;
50
51 if (PageLRU(page)) {
52 struct zone *zone = page_zone(page);
53
54 spin_lock_irq(&zone->lru_lock);
55 if (PageLRU(page) && get_page_unless_zero(page)) {
56 ret = 0;
57 ClearPageLRU(page);
58 if (PageActive(page))
59 del_page_from_active_list(zone, page);
60 else
61 del_page_from_inactive_list(zone, page);
62 list_add_tail(&page->lru, pagelist);
63 }
64 spin_unlock_irq(&zone->lru_lock);
65 }
66 return ret;
67}
68
69/*
70 * migrate_prep() needs to be called before we start compiling a list of pages 40 * migrate_prep() needs to be called before we start compiling a list of pages
71 * to be migrated using isolate_lru_page(). 41 * to be migrated using isolate_lru_page().
72 */ 42 */
@@ -83,23 +53,9 @@ int migrate_prep(void)
83 return 0; 53 return 0;
84} 54}
85 55
86static inline void move_to_lru(struct page *page)
87{
88 if (PageActive(page)) {
89 /*
90 * lru_cache_add_active checks that
91 * the PG_active bit is off.
92 */
93 ClearPageActive(page);
94 lru_cache_add_active(page);
95 } else {
96 lru_cache_add(page);
97 }
98 put_page(page);
99}
100
101/* 56/*
102 * Add isolated pages on the list back to the LRU. 57 * Add isolated pages on the list back to the LRU under page lock
58 * to avoid leaking evictable pages back onto unevictable list.
103 * 59 *
104 * returns the number of pages put back. 60 * returns the number of pages put back.
105 */ 61 */
@@ -111,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
111 67
112 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
113 list_del(&page->lru); 69 list_del(&page->lru);
114 move_to_lru(page); 70 putback_lru_page(page);
115 count++; 71 count++;
116 } 72 }
117 return count; 73 return count;
@@ -374,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
374 __inc_zone_page_state(newpage, NR_FILE_PAGES); 330 __inc_zone_page_state(newpage, NR_FILE_PAGES);
375 331
376 spin_unlock_irq(&mapping->tree_lock); 332 spin_unlock_irq(&mapping->tree_lock);
377 if (!PageSwapCache(newpage))
378 mem_cgroup_uncharge_cache_page(page);
379 333
380 return 0; 334 return 0;
381} 335}
@@ -385,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping,
385 */ 339 */
386static void migrate_page_copy(struct page *newpage, struct page *page) 340static void migrate_page_copy(struct page *newpage, struct page *page)
387{ 341{
342 int anon;
343
388 copy_highpage(newpage, page); 344 copy_highpage(newpage, page);
389 345
390 if (PageError(page)) 346 if (PageError(page))
@@ -393,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
393 SetPageReferenced(newpage); 349 SetPageReferenced(newpage);
394 if (PageUptodate(page)) 350 if (PageUptodate(page))
395 SetPageUptodate(newpage); 351 SetPageUptodate(newpage);
396 if (PageActive(page)) 352 if (TestClearPageActive(page)) {
353 VM_BUG_ON(PageUnevictable(page));
397 SetPageActive(newpage); 354 SetPageActive(newpage);
355 } else
356 unevictable_migrate_page(newpage, page);
398 if (PageChecked(page)) 357 if (PageChecked(page))
399 SetPageChecked(newpage); 358 SetPageChecked(newpage);
400 if (PageMappedToDisk(page)) 359 if (PageMappedToDisk(page))
@@ -412,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
412 __set_page_dirty_nobuffers(newpage); 371 __set_page_dirty_nobuffers(newpage);
413 } 372 }
414 373
374 mlock_migrate_page(newpage, page);
375
415#ifdef CONFIG_SWAP 376#ifdef CONFIG_SWAP
416 ClearPageSwapCache(page); 377 ClearPageSwapCache(page);
417#endif 378#endif
418 ClearPageActive(page);
419 ClearPagePrivate(page); 379 ClearPagePrivate(page);
420 set_page_private(page, 0); 380 set_page_private(page, 0);
381 /* page->mapping contains a flag for PageAnon() */
382 anon = PageAnon(page);
421 page->mapping = NULL; 383 page->mapping = NULL;
422 384
385 if (!anon) /* This page was removed from radix-tree. */
386 mem_cgroup_uncharge_cache_page(page);
387
423 /* 388 /*
424 * If any waiters have accumulated on the new page then 389 * If any waiters have accumulated on the new page then
425 * wake them up. 390 * wake them up.
@@ -594,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping,
594 * 559 *
595 * The new page will have replaced the old page if this function 560 * The new page will have replaced the old page if this function
596 * is successful. 561 * is successful.
562 *
563 * Return value:
564 * < 0 - error code
565 * == 0 - success
597 */ 566 */
598static int move_to_new_page(struct page *newpage, struct page *page) 567static int move_to_new_page(struct page *newpage, struct page *page)
599{ 568{
@@ -611,6 +580,8 @@ static int move_to_new_page(struct page *newpage, struct page *page)
611 /* Prepare mapping for the new page.*/ 580 /* Prepare mapping for the new page.*/
612 newpage->index = page->index; 581 newpage->index = page->index;
613 newpage->mapping = page->mapping; 582 newpage->mapping = page->mapping;
583 if (PageSwapBacked(page))
584 SetPageSwapBacked(newpage);
614 585
615 mapping = page_mapping(page); 586 mapping = page_mapping(page);
616 if (!mapping) 587 if (!mapping)
@@ -654,9 +625,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
654 if (!newpage) 625 if (!newpage)
655 return -ENOMEM; 626 return -ENOMEM;
656 627
657 if (page_count(page) == 1) 628 if (page_count(page) == 1) {
658 /* page was freed from under us. So we are done. */ 629 /* page was freed from under us. So we are done. */
659 goto move_newpage; 630 goto move_newpage;
631 }
660 632
661 charge = mem_cgroup_prepare_migration(page, newpage); 633 charge = mem_cgroup_prepare_migration(page, newpage);
662 if (charge == -ENOMEM) { 634 if (charge == -ENOMEM) {
@@ -730,7 +702,6 @@ rcu_unlock:
730 rcu_read_unlock(); 702 rcu_read_unlock();
731 703
732unlock: 704unlock:
733
734 unlock_page(page); 705 unlock_page(page);
735 706
736 if (rc != -EAGAIN) { 707 if (rc != -EAGAIN) {
@@ -741,17 +712,19 @@ unlock:
741 * restored. 712 * restored.
742 */ 713 */
743 list_del(&page->lru); 714 list_del(&page->lru);
744 move_to_lru(page); 715 putback_lru_page(page);
745 } 716 }
746 717
747move_newpage: 718move_newpage:
748 if (!charge) 719 if (!charge)
749 mem_cgroup_end_migration(newpage); 720 mem_cgroup_end_migration(newpage);
721
750 /* 722 /*
751 * Move the new page to the LRU. If migration was not successful 723 * Move the new page to the LRU. If migration was not successful
752 * then this will free the page. 724 * then this will free the page.
753 */ 725 */
754 move_to_lru(newpage); 726 putback_lru_page(newpage);
727
755 if (result) { 728 if (result) {
756 if (rc) 729 if (rc)
757 *result = rc; 730 *result = rc;
@@ -858,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
858 * Move a set of pages as indicated in the pm array. The addr 831 * Move a set of pages as indicated in the pm array. The addr
859 * field must be set to the virtual address of the page to be moved 832 * field must be set to the virtual address of the page to be moved
860 * and the node number must contain a valid target node. 833 * and the node number must contain a valid target node.
834 * The pm array ends with node = MAX_NUMNODES.
861 */ 835 */
862static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, 836static int do_move_page_to_node_array(struct mm_struct *mm,
863 int migrate_all) 837 struct page_to_node *pm,
838 int migrate_all)
864{ 839{
865 int err; 840 int err;
866 struct page_to_node *pp; 841 struct page_to_node *pp;
@@ -914,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
914 !migrate_all) 889 !migrate_all)
915 goto put_and_set; 890 goto put_and_set;
916 891
917 err = isolate_lru_page(page, &pagelist); 892 err = isolate_lru_page(page);
893 if (!err)
894 list_add_tail(&page->lru, &pagelist);
918put_and_set: 895put_and_set:
919 /* 896 /*
920 * Either remove the duplicate refcount from 897 * Either remove the duplicate refcount from
@@ -926,36 +903,118 @@ set_status:
926 pp->status = err; 903 pp->status = err;
927 } 904 }
928 905
906 err = 0;
929 if (!list_empty(&pagelist)) 907 if (!list_empty(&pagelist))
930 err = migrate_pages(&pagelist, new_page_node, 908 err = migrate_pages(&pagelist, new_page_node,
931 (unsigned long)pm); 909 (unsigned long)pm);
932 else
933 err = -ENOENT;
934 910
935 up_read(&mm->mmap_sem); 911 up_read(&mm->mmap_sem);
936 return err; 912 return err;
937} 913}
938 914
939/* 915/*
940 * Determine the nodes of a list of pages. The addr in the pm array 916 * Migrate an array of page addresses onto an array of nodes and fill
941 * must have been set to the virtual address of which we want to determine 917 * the corresponding array of status.
942 * the node number.
943 */ 918 */
944static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) 919static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
920 unsigned long nr_pages,
921 const void __user * __user *pages,
922 const int __user *nodes,
923 int __user *status, int flags)
945{ 924{
925 struct page_to_node *pm = NULL;
926 nodemask_t task_nodes;
927 int err = 0;
928 int i;
929
930 task_nodes = cpuset_mems_allowed(task);
931
932 /* Limit nr_pages so that the multiplication may not overflow */
933 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
934 err = -E2BIG;
935 goto out;
936 }
937
938 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
939 if (!pm) {
940 err = -ENOMEM;
941 goto out;
942 }
943
944 /*
945 * Get parameters from user space and initialize the pm
946 * array. Return various errors if the user did something wrong.
947 */
948 for (i = 0; i < nr_pages; i++) {
949 const void __user *p;
950
951 err = -EFAULT;
952 if (get_user(p, pages + i))
953 goto out_pm;
954
955 pm[i].addr = (unsigned long)p;
956 if (nodes) {
957 int node;
958
959 if (get_user(node, nodes + i))
960 goto out_pm;
961
962 err = -ENODEV;
963 if (!node_state(node, N_HIGH_MEMORY))
964 goto out_pm;
965
966 err = -EACCES;
967 if (!node_isset(node, task_nodes))
968 goto out_pm;
969
970 pm[i].node = node;
971 } else
972 pm[i].node = 0; /* anything to not match MAX_NUMNODES */
973 }
974 /* End marker */
975 pm[nr_pages].node = MAX_NUMNODES;
976
977 err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
978 if (err >= 0)
979 /* Return status information */
980 for (i = 0; i < nr_pages; i++)
981 if (put_user(pm[i].status, status + i))
982 err = -EFAULT;
983
984out_pm:
985 vfree(pm);
986out:
987 return err;
988}
989
990/*
991 * Determine the nodes of an array of pages and store it in an array of status.
992 */
993static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
994 const void __user * __user *pages,
995 int __user *status)
996{
997 unsigned long i;
998 int err;
999
946 down_read(&mm->mmap_sem); 1000 down_read(&mm->mmap_sem);
947 1001
948 for ( ; pm->node != MAX_NUMNODES; pm++) { 1002 for (i = 0; i < nr_pages; i++) {
1003 const void __user *p;
1004 unsigned long addr;
949 struct vm_area_struct *vma; 1005 struct vm_area_struct *vma;
950 struct page *page; 1006 struct page *page;
951 int err;
952 1007
953 err = -EFAULT; 1008 err = -EFAULT;
954 vma = find_vma(mm, pm->addr); 1009 if (get_user(p, pages+i))
1010 goto out;
1011 addr = (unsigned long) p;
1012
1013 vma = find_vma(mm, addr);
955 if (!vma) 1014 if (!vma)
956 goto set_status; 1015 goto set_status;
957 1016
958 page = follow_page(vma, pm->addr, 0); 1017 page = follow_page(vma, addr, 0);
959 1018
960 err = PTR_ERR(page); 1019 err = PTR_ERR(page);
961 if (IS_ERR(page)) 1020 if (IS_ERR(page))
@@ -968,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
968 1027
969 err = page_to_nid(page); 1028 err = page_to_nid(page);
970set_status: 1029set_status:
971 pm->status = err; 1030 put_user(err, status+i);
972 } 1031 }
1032 err = 0;
973 1033
1034out:
974 up_read(&mm->mmap_sem); 1035 up_read(&mm->mmap_sem);
975 return 0; 1036 return err;
976} 1037}
977 1038
978/* 1039/*
@@ -984,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
984 const int __user *nodes, 1045 const int __user *nodes,
985 int __user *status, int flags) 1046 int __user *status, int flags)
986{ 1047{
987 int err = 0;
988 int i;
989 struct task_struct *task; 1048 struct task_struct *task;
990 nodemask_t task_nodes;
991 struct mm_struct *mm; 1049 struct mm_struct *mm;
992 struct page_to_node *pm = NULL; 1050 int err;
993 1051
994 /* Check flags */ 1052 /* Check flags */
995 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1053 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1021,75 +1079,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
1021 (current->uid != task->suid) && (current->uid != task->uid) && 1079 (current->uid != task->suid) && (current->uid != task->uid) &&
1022 !capable(CAP_SYS_NICE)) { 1080 !capable(CAP_SYS_NICE)) {
1023 err = -EPERM; 1081 err = -EPERM;
1024 goto out2; 1082 goto out;
1025 } 1083 }
1026 1084
1027 err = security_task_movememory(task); 1085 err = security_task_movememory(task);
1028 if (err) 1086 if (err)
1029 goto out2; 1087 goto out;
1030
1031
1032 task_nodes = cpuset_mems_allowed(task);
1033
1034 /* Limit nr_pages so that the multiplication may not overflow */
1035 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
1036 err = -E2BIG;
1037 goto out2;
1038 }
1039
1040 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
1041 if (!pm) {
1042 err = -ENOMEM;
1043 goto out2;
1044 }
1045
1046 /*
1047 * Get parameters from user space and initialize the pm
1048 * array. Return various errors if the user did something wrong.
1049 */
1050 for (i = 0; i < nr_pages; i++) {
1051 const void __user *p;
1052
1053 err = -EFAULT;
1054 if (get_user(p, pages + i))
1055 goto out;
1056
1057 pm[i].addr = (unsigned long)p;
1058 if (nodes) {
1059 int node;
1060
1061 if (get_user(node, nodes + i))
1062 goto out;
1063
1064 err = -ENODEV;
1065 if (!node_state(node, N_HIGH_MEMORY))
1066 goto out;
1067
1068 err = -EACCES;
1069 if (!node_isset(node, task_nodes))
1070 goto out;
1071 1088
1072 pm[i].node = node; 1089 if (nodes) {
1073 } else 1090 err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1074 pm[i].node = 0; /* anything to not match MAX_NUMNODES */ 1091 flags);
1092 } else {
1093 err = do_pages_stat(mm, nr_pages, pages, status);
1075 } 1094 }
1076 /* End marker */
1077 pm[nr_pages].node = MAX_NUMNODES;
1078
1079 if (nodes)
1080 err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
1081 else
1082 err = do_pages_stat(mm, pm);
1083
1084 if (err >= 0)
1085 /* Return status information */
1086 for (i = 0; i < nr_pages; i++)
1087 if (put_user(pm[i].status, status + i))
1088 err = -EFAULT;
1089 1095
1090out: 1096out:
1091 vfree(pm);
1092out2:
1093 mmput(mm); 1097 mmput(mm);
1094 return err; 1098 return err;
1095} 1099}
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93eff5c..008ea70b7afa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
8#include <linux/capability.h> 8#include <linux/capability.h>
9#include <linux/mman.h> 9#include <linux/mman.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/swapops.h>
13#include <linux/pagemap.h>
11#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
12#include <linux/syscalls.h> 15#include <linux/syscalls.h>
13#include <linux/sched.h> 16#include <linux/sched.h>
14#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rmap.h>
19#include <linux/mmzone.h>
20#include <linux/hugetlb.h>
21
22#include "internal.h"
15 23
16int can_do_mlock(void) 24int can_do_mlock(void)
17{ 25{
@@ -23,17 +31,381 @@ int can_do_mlock(void)
23} 31}
24EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
25 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate
38 * statistics.
39 *
40 * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
41 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
42 * The unevictable list is an LRU sibling list to the [in]active lists.
43 * PageUnevictable is set to indicate the unevictable state.
44 *
45 * When lazy mlocking via vmscan, it is important to ensure that the
46 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
47 * may have mlocked a page that is being munlocked. So lazy mlock must take
48 * the mmap_sem for read, and verify that the vma really is locked
49 * (see mm/rmap.c).
50 */
51
52/*
53 * LRU accounting for clear_page_mlock()
54 */
55void __clear_page_mlock(struct page *page)
56{
57 VM_BUG_ON(!PageLocked(page));
58
59 if (!page->mapping) { /* truncated ? */
60 return;
61 }
62
63 dec_zone_page_state(page, NR_MLOCK);
64 count_vm_event(UNEVICTABLE_PGCLEARED);
65 if (!isolate_lru_page(page)) {
66 putback_lru_page(page);
67 } else {
68 /*
69 * Page not on the LRU yet. Flush all pagevecs and retry.
70 */
71 lru_add_drain_all();
72 if (!isolate_lru_page(page))
73 putback_lru_page(page);
74 else if (PageUnevictable(page))
75 count_vm_event(UNEVICTABLE_PGSTRANDED);
76
77 }
78}
79
80/*
81 * Mark page as mlocked if not already.
82 * If page on LRU, isolate and putback to move to unevictable list.
83 */
84void mlock_vma_page(struct page *page)
85{
86 BUG_ON(!PageLocked(page));
87
88 if (!TestSetPageMlocked(page)) {
89 inc_zone_page_state(page, NR_MLOCK);
90 count_vm_event(UNEVICTABLE_PGMLOCKED);
91 if (!isolate_lru_page(page))
92 putback_lru_page(page);
93 }
94}
95
96/*
97 * called from munlock()/munmap() path with page supposedly on the LRU.
98 *
99 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
100 * [in try_to_munlock()] and then attempt to isolate the page. We must
101 * isolate the page to keep others from messing with its unevictable
102 * and mlocked state while trying to munlock. However, we pre-clear the
103 * mlocked state anyway as we might lose the isolation race and we might
104 * not get another chance to clear PageMlocked. If we successfully
105 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
106 * mapping the page, it will restore the PageMlocked state, unless the page
107 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
108 * perhaps redundantly.
109 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
110 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
111 * either of which will restore the PageMlocked state by calling
112 * mlock_vma_page() above, if it can grab the vma's mmap sem.
113 */
114static void munlock_vma_page(struct page *page)
115{
116 BUG_ON(!PageLocked(page));
117
118 if (TestClearPageMlocked(page)) {
119 dec_zone_page_state(page, NR_MLOCK);
120 if (!isolate_lru_page(page)) {
121 int ret = try_to_munlock(page);
122 /*
123 * did try_to_munlock() succeed or punt?
124 */
125 if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
126 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
127
128 putback_lru_page(page);
129 } else {
130 /*
131 * We lost the race. let try_to_unmap() deal
132 * with it. At least we get the page state and
133 * mlock stats right. However, page is still on
134 * the noreclaim list. We'll fix that up when
135 * the page is eventually freed or we scan the
136 * noreclaim list.
137 */
138 if (PageUnevictable(page))
139 count_vm_event(UNEVICTABLE_PGSTRANDED);
140 else
141 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
142 }
143 }
144}
145
146/**
147 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
148 * @vma: target vma
149 * @start: start address
150 * @end: end address
151 * @mlock: 0 indicate munlock, otherwise mlock.
152 *
153 * If @mlock == 0, unlock an mlocked range;
154 * else mlock the range of pages. This takes care of making the pages present,
155 * too.
156 *
157 * return 0 on success, negative error code on error.
158 *
159 * vma->vm_mm->mmap_sem must be held for at least read.
160 */
161static long __mlock_vma_pages_range(struct vm_area_struct *vma,
162 unsigned long start, unsigned long end,
163 int mlock)
164{
165 struct mm_struct *mm = vma->vm_mm;
166 unsigned long addr = start;
167 struct page *pages[16]; /* 16 gives a reasonable batch */
168 int nr_pages = (end - start) / PAGE_SIZE;
169 int ret;
170 int gup_flags = 0;
171
172 VM_BUG_ON(start & ~PAGE_MASK);
173 VM_BUG_ON(end & ~PAGE_MASK);
174 VM_BUG_ON(start < vma->vm_start);
175 VM_BUG_ON(end > vma->vm_end);
176 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
177 (atomic_read(&mm->mm_users) != 0));
178
179 /*
180 * mlock: don't page populate if page has PROT_NONE permission.
181 * munlock: always munlock the pages, even though
182 * they have PROT_NONE permission.
183 */
184 if (!mlock)
185 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
186
187 if (vma->vm_flags & VM_WRITE)
188 gup_flags |= GUP_FLAGS_WRITE;
189
190 lru_add_drain_all(); /* push cached pages to LRU */
191
192 while (nr_pages > 0) {
193 int i;
194
195 cond_resched();
196
197 /*
198 * get_user_pages makes pages present if we are
199 * setting mlock. and this extra reference count will
200 * disable migration of this page. However, page may
201 * still be truncated out from under us.
202 */
203 ret = __get_user_pages(current, mm, addr,
204 min_t(int, nr_pages, ARRAY_SIZE(pages)),
205 gup_flags, pages, NULL);
206 /*
207 * This can happen for, e.g., VM_NONLINEAR regions before
208 * a page has been allocated and mapped at a given offset,
209 * or for addresses that map beyond end of a file.
210 * We'll mlock the pages if/when they get faulted in.
211 */
212 if (ret < 0)
213 break;
214 if (ret == 0) {
215 /*
216 * We know the vma is there, so the only time
217 * we cannot get a single page should be an
218 * error (ret < 0) case.
219 */
220 WARN_ON(1);
221 break;
222 }
223
224 lru_add_drain(); /* push cached pages to LRU */
225
226 for (i = 0; i < ret; i++) {
227 struct page *page = pages[i];
228
229 lock_page(page);
230 /*
231 * Because we lock page here and migration is blocked
232 * by the elevated reference, we need only check for
233 * page truncation (file-cache only).
234 */
235 if (page->mapping) {
236 if (mlock)
237 mlock_vma_page(page);
238 else
239 munlock_vma_page(page);
240 }
241 unlock_page(page);
242 put_page(page); /* ref from get_user_pages() */
243
244 /*
245 * here we assume that get_user_pages() has given us
246 * a list of virtually contiguous pages.
247 */
248 addr += PAGE_SIZE; /* for next get_user_pages() */
249 nr_pages--;
250 }
251 ret = 0;
252 }
253
254 lru_add_drain_all(); /* to update stats */
255
256 return ret; /* count entire vma as locked_vm */
257}
258
259/*
260 * convert get_user_pages() return value to posix mlock() error
261 */
262static int __mlock_posix_error_return(long retval)
263{
264 if (retval == -EFAULT)
265 retval = -ENOMEM;
266 else if (retval == -ENOMEM)
267 retval = -EAGAIN;
268 return retval;
269}
270
271#else /* CONFIG_UNEVICTABLE_LRU */
272
273/*
274 * Just make pages present if VM_LOCKED. No-op if unlocking.
275 */
276static long __mlock_vma_pages_range(struct vm_area_struct *vma,
277 unsigned long start, unsigned long end,
278 int mlock)
279{
280 if (mlock && (vma->vm_flags & VM_LOCKED))
281 return make_pages_present(start, end);
282 return 0;
283}
284
285static inline int __mlock_posix_error_return(long retval)
286{
287 return 0;
288}
289
290#endif /* CONFIG_UNEVICTABLE_LRU */
291
292/**
293 * mlock_vma_pages_range() - mlock pages in specified vma range.
294 * @vma - the vma containing the specified address range
295 * @start - starting address in @vma to mlock
296 * @end - end address [+1] in @vma to mlock
297 *
298 * For mmap()/mremap()/expansion of mlocked vma.
299 *
300 * return 0 on success for "normal" vmas.
301 *
302 * return number of pages [> 0] to be removed from locked_vm on success
303 * of "special" vmas.
304 *
305 * return negative error if vma spanning @start-@end disappears while
306 * mmap semaphore is dropped. Unlikely?
307 */
308long mlock_vma_pages_range(struct vm_area_struct *vma,
309 unsigned long start, unsigned long end)
310{
311 struct mm_struct *mm = vma->vm_mm;
312 int nr_pages = (end - start) / PAGE_SIZE;
313 BUG_ON(!(vma->vm_flags & VM_LOCKED));
314
315 /*
316 * filter unlockable vmas
317 */
318 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
319 goto no_mlock;
320
321 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
322 is_vm_hugetlb_page(vma) ||
323 vma == get_gate_vma(current))) {
324 long error;
325 downgrade_write(&mm->mmap_sem);
326
327 error = __mlock_vma_pages_range(vma, start, end, 1);
328
329 up_read(&mm->mmap_sem);
330 /* vma can change or disappear */
331 down_write(&mm->mmap_sem);
332 vma = find_vma(mm, start);
333 /* non-NULL vma must contain @start, but need to check @end */
334 if (!vma || end > vma->vm_end)
335 return -ENOMEM;
336
337 return 0; /* hide other errors from mmap(), et al */
338 }
339
340 /*
341 * User mapped kernel pages or huge pages:
342 * make these pages present to populate the ptes, but
343 * fall thru' to reset VM_LOCKED--no need to unlock, and
344 * return nr_pages so these don't get counted against task's
345 * locked limit. huge pages are already counted against
346 * locked vm limit.
347 */
348 make_pages_present(start, end);
349
350no_mlock:
351 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
352 return nr_pages; /* error or pages NOT mlocked */
353}
354
355
356/*
357 * munlock_vma_pages_range() - munlock all pages in the vma range.
358 * @vma - vma containing range to be munlock()ed.
359 * @start - start address in @vma of the range
360 * @end - end of range in @vma.
361 *
362 * For mremap(), munmap() and exit().
363 *
364 * Called with @vma VM_LOCKED.
365 *
366 * Returns with VM_LOCKED cleared. Callers must be prepared to
367 * deal with this.
368 *
369 * We don't save and restore VM_LOCKED here because pages are
370 * still on lru. In unmap path, pages might be scanned by reclaim
371 * and re-mlocked by try_to_{munlock|unmap} before we unmap and
372 * free them. This will result in freeing mlocked pages.
373 */
374void munlock_vma_pages_range(struct vm_area_struct *vma,
375 unsigned long start, unsigned long end)
376{
377 vma->vm_flags &= ~VM_LOCKED;
378 __mlock_vma_pages_range(vma, start, end, 0);
379}
380
381/*
382 * mlock_fixup - handle mlock[all]/munlock[all] requests.
383 *
384 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
385 * munlock is a no-op. However, for some special vmas, we go ahead and
386 * populate the ptes via make_pages_present().
387 *
388 * For vmas that pass the filters, merge/split as appropriate.
389 */
26static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 390static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
27 unsigned long start, unsigned long end, unsigned int newflags) 391 unsigned long start, unsigned long end, unsigned int newflags)
28{ 392{
29 struct mm_struct * mm = vma->vm_mm; 393 struct mm_struct *mm = vma->vm_mm;
30 pgoff_t pgoff; 394 pgoff_t pgoff;
31 int pages; 395 int nr_pages;
32 int ret = 0; 396 int ret = 0;
33 397 int lock = newflags & VM_LOCKED;
34 if (newflags == vma->vm_flags) { 398
35 *prev = vma; 399 if (newflags == vma->vm_flags ||
36 goto out; 400 (vma->vm_flags & (VM_IO | VM_PFNMAP)))
401 goto out; /* don't set VM_LOCKED, don't count */
402
403 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
404 is_vm_hugetlb_page(vma) ||
405 vma == get_gate_vma(current)) {
406 if (lock)
407 make_pages_present(start, end);
408 goto out; /* don't set VM_LOCKED, don't count */
37 } 409 }
38 410
39 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 411 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
44 goto success; 416 goto success;
45 } 417 }
46 418
47 *prev = vma;
48
49 if (start != vma->vm_start) { 419 if (start != vma->vm_start) {
50 ret = split_vma(mm, vma, start, 1); 420 ret = split_vma(mm, vma, start, 1);
51 if (ret) 421 if (ret)
@@ -60,24 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
60 430
61success: 431success:
62 /* 432 /*
433 * Keep track of amount of locked VM.
434 */
435 nr_pages = (end - start) >> PAGE_SHIFT;
436 if (!lock)
437 nr_pages = -nr_pages;
438 mm->locked_vm += nr_pages;
439
440 /*
63 * vm_flags is protected by the mmap_sem held in write mode. 441 * vm_flags is protected by the mmap_sem held in write mode.
64 * It's okay if try_to_unmap_one unmaps a page just after we 442 * It's okay if try_to_unmap_one unmaps a page just after we
65 * set VM_LOCKED, make_pages_present below will bring it back. 443 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
66 */ 444 */
67 vma->vm_flags = newflags; 445 vma->vm_flags = newflags;
68 446
69 /* 447 if (lock) {
70 * Keep track of amount of locked VM. 448 /*
71 */ 449 * mmap_sem is currently held for write. Downgrade the write
72 pages = (end - start) >> PAGE_SHIFT; 450 * lock to a read lock so that other faults, mmap scans, ...
73 if (newflags & VM_LOCKED) { 451 * while we fault in all pages.
74 pages = -pages; 452 */
75 if (!(newflags & VM_IO)) 453 downgrade_write(&mm->mmap_sem);
76 ret = make_pages_present(start, end); 454
455 ret = __mlock_vma_pages_range(vma, start, end, 1);
456
457 /*
458 * Need to reacquire mmap sem in write mode, as our callers
459 * expect this. We have no support for atomically upgrading
460 * a sem to write, so we need to check for ranges while sem
461 * is unlocked.
462 */
463 up_read(&mm->mmap_sem);
464 /* vma can change or disappear */
465 down_write(&mm->mmap_sem);
466 *prev = find_vma(mm, start);
467 /* non-NULL *prev must contain @start, but need to check @end */
468 if (!(*prev) || end > (*prev)->vm_end)
469 ret = -ENOMEM;
470 else if (ret > 0) {
471 mm->locked_vm -= ret;
472 ret = 0;
473 } else
474 ret = __mlock_posix_error_return(ret); /* translate if needed */
475 } else {
476 /*
477 * TODO: for unlocking, pages will already be resident, so
478 * we don't need to wait for allocations/reclaim/pagein, ...
479 * However, unlocking a very large region can still take a
480 * while. Should we downgrade the semaphore for both lock
481 * AND unlock ?
482 */
483 __mlock_vma_pages_range(vma, start, end, 0);
77 } 484 }
78 485
79 mm->locked_vm -= pages;
80out: 486out:
487 *prev = vma;
81 return ret; 488 return ret;
82} 489}
83 490
diff --git a/mm/mmap.c b/mm/mmap.c
index e7a5a68a9c2e..74f4d158022e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -410,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
410 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 410 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
411} 411}
412 412
413static inline void __vma_link_file(struct vm_area_struct *vma) 413static void __vma_link_file(struct vm_area_struct *vma)
414{ 414{
415 struct file * file; 415 struct file * file;
416 416
@@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end);
662 * If the vma has a ->close operation then the driver probably needs to release 662 * If the vma has a ->close operation then the driver probably needs to release
663 * per-vma resources, so we don't attempt to merge those. 663 * per-vma resources, so we don't attempt to merge those.
664 */ 664 */
665#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
666
667static inline int is_mergeable_vma(struct vm_area_struct *vma, 665static inline int is_mergeable_vma(struct vm_area_struct *vma,
668 struct file *file, unsigned long vm_flags) 666 struct file *file, unsigned long vm_flags)
669{ 667{
@@ -972,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
972 return -EPERM; 970 return -EPERM;
973 vm_flags |= VM_LOCKED; 971 vm_flags |= VM_LOCKED;
974 } 972 }
973
975 /* mlock MCL_FUTURE? */ 974 /* mlock MCL_FUTURE? */
976 if (vm_flags & VM_LOCKED) { 975 if (vm_flags & VM_LOCKED) {
977 unsigned long locked, lock_limit; 976 unsigned long locked, lock_limit;
@@ -1139,10 +1138,12 @@ munmap_back:
1139 * The VM_SHARED test is necessary because shmem_zero_setup 1138 * The VM_SHARED test is necessary because shmem_zero_setup
1140 * will create the file object for a shared anonymous map below. 1139 * will create the file object for a shared anonymous map below.
1141 */ 1140 */
1142 if (!file && !(vm_flags & VM_SHARED) && 1141 if (!file && !(vm_flags & VM_SHARED)) {
1143 vma_merge(mm, prev, addr, addr + len, vm_flags, 1142 vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1144 NULL, NULL, pgoff, NULL)) 1143 NULL, NULL, pgoff, NULL);
1145 goto out; 1144 if (vma)
1145 goto out;
1146 }
1146 1147
1147 /* 1148 /*
1148 * Determine the object being mapped and call the appropriate 1149 * Determine the object being mapped and call the appropriate
@@ -1224,10 +1225,14 @@ out:
1224 mm->total_vm += len >> PAGE_SHIFT; 1225 mm->total_vm += len >> PAGE_SHIFT;
1225 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1226 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1226 if (vm_flags & VM_LOCKED) { 1227 if (vm_flags & VM_LOCKED) {
1227 mm->locked_vm += len >> PAGE_SHIFT; 1228 /*
1228 make_pages_present(addr, addr + len); 1229 * makes pages present; downgrades, drops, reacquires mmap_sem
1229 } 1230 */
1230 if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1231 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1232 if (nr_pages < 0)
1233 return nr_pages; /* vma gone! */
1234 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1235 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1231 make_pages_present(addr, addr + len); 1236 make_pages_present(addr, addr + len);
1232 return addr; 1237 return addr;
1233 1238
@@ -1586,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1586 * vma is the last one with address > vma->vm_end. Have to extend vma. 1591 * vma is the last one with address > vma->vm_end. Have to extend vma.
1587 */ 1592 */
1588#ifndef CONFIG_IA64 1593#ifndef CONFIG_IA64
1589static inline 1594static
1590#endif 1595#endif
1591int expand_upwards(struct vm_area_struct *vma, unsigned long address) 1596int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1592{ 1597{
@@ -1636,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1636/* 1641/*
1637 * vma is the first one with address < vma->vm_start. Have to extend vma. 1642 * vma is the first one with address < vma->vm_start. Have to extend vma.
1638 */ 1643 */
1639static inline int expand_downwards(struct vm_area_struct *vma, 1644static int expand_downwards(struct vm_area_struct *vma,
1640 unsigned long address) 1645 unsigned long address)
1641{ 1646{
1642 int error; 1647 int error;
@@ -1698,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
1698 vma = find_vma_prev(mm, addr, &prev); 1703 vma = find_vma_prev(mm, addr, &prev);
1699 if (vma && (vma->vm_start <= addr)) 1704 if (vma && (vma->vm_start <= addr))
1700 return vma; 1705 return vma;
1701 if (!prev || expand_stack(prev, addr)) 1706 if (expand_stack(prev, addr))
1702 return NULL; 1707 return NULL;
1703 if (prev->vm_flags & VM_LOCKED) 1708 if (prev->vm_flags & VM_LOCKED) {
1704 make_pages_present(addr, prev->vm_end); 1709 if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
1710 return NULL; /* vma gone! */
1711 }
1705 return prev; 1712 return prev;
1706} 1713}
1707#else 1714#else
@@ -1727,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1727 start = vma->vm_start; 1734 start = vma->vm_start;
1728 if (expand_stack(vma, addr)) 1735 if (expand_stack(vma, addr))
1729 return NULL; 1736 return NULL;
1730 if (vma->vm_flags & VM_LOCKED) 1737 if (vma->vm_flags & VM_LOCKED) {
1731 make_pages_present(addr, start); 1738 if (mlock_vma_pages_range(vma, addr, start) < 0)
1739 return NULL; /* vma gone! */
1740 }
1732 return vma; 1741 return vma;
1733} 1742}
1734#endif 1743#endif
@@ -1747,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1747 long nrpages = vma_pages(vma); 1756 long nrpages = vma_pages(vma);
1748 1757
1749 mm->total_vm -= nrpages; 1758 mm->total_vm -= nrpages;
1750 if (vma->vm_flags & VM_LOCKED)
1751 mm->locked_vm -= nrpages;
1752 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1759 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1753 vma = remove_vma(vma); 1760 vma = remove_vma(vma);
1754 } while (vma); 1761 } while (vma);
@@ -1914,6 +1921,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1914 vma = prev? prev->vm_next: mm->mmap; 1921 vma = prev? prev->vm_next: mm->mmap;
1915 1922
1916 /* 1923 /*
1924 * unlock any mlock()ed ranges before detaching vmas
1925 */
1926 if (mm->locked_vm) {
1927 struct vm_area_struct *tmp = vma;
1928 while (tmp && tmp->vm_start < end) {
1929 if (tmp->vm_flags & VM_LOCKED) {
1930 mm->locked_vm -= vma_pages(tmp);
1931 munlock_vma_pages_all(tmp);
1932 }
1933 tmp = tmp->vm_next;
1934 }
1935 }
1936
1937 /*
1917 * Remove the vma's, and unmap the actual pages 1938 * Remove the vma's, and unmap the actual pages
1918 */ 1939 */
1919 detach_vmas_to_be_unmapped(mm, vma, prev, end); 1940 detach_vmas_to_be_unmapped(mm, vma, prev, end);
@@ -2025,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2025 return -ENOMEM; 2046 return -ENOMEM;
2026 2047
2027 /* Can we just expand an old private anonymous mapping? */ 2048 /* Can we just expand an old private anonymous mapping? */
2028 if (vma_merge(mm, prev, addr, addr + len, flags, 2049 vma = vma_merge(mm, prev, addr, addr + len, flags,
2029 NULL, NULL, pgoff, NULL)) 2050 NULL, NULL, pgoff, NULL);
2051 if (vma)
2030 goto out; 2052 goto out;
2031 2053
2032 /* 2054 /*
@@ -2048,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2048out: 2070out:
2049 mm->total_vm += len >> PAGE_SHIFT; 2071 mm->total_vm += len >> PAGE_SHIFT;
2050 if (flags & VM_LOCKED) { 2072 if (flags & VM_LOCKED) {
2051 mm->locked_vm += len >> PAGE_SHIFT; 2073 if (!mlock_vma_pages_range(vma, addr, addr + len))
2052 make_pages_present(addr, addr + len); 2074 mm->locked_vm += (len >> PAGE_SHIFT);
2053 } 2075 }
2054 return addr; 2076 return addr;
2055} 2077}
@@ -2060,7 +2082,7 @@ EXPORT_SYMBOL(do_brk);
2060void exit_mmap(struct mm_struct *mm) 2082void exit_mmap(struct mm_struct *mm)
2061{ 2083{
2062 struct mmu_gather *tlb; 2084 struct mmu_gather *tlb;
2063 struct vm_area_struct *vma = mm->mmap; 2085 struct vm_area_struct *vma;
2064 unsigned long nr_accounted = 0; 2086 unsigned long nr_accounted = 0;
2065 unsigned long end; 2087 unsigned long end;
2066 2088
@@ -2068,6 +2090,15 @@ void exit_mmap(struct mm_struct *mm)
2068 arch_exit_mmap(mm); 2090 arch_exit_mmap(mm);
2069 mmu_notifier_release(mm); 2091 mmu_notifier_release(mm);
2070 2092
2093 if (mm->locked_vm) {
2094 vma = mm->mmap;
2095 while (vma) {
2096 if (vma->vm_flags & VM_LOCKED)
2097 munlock_vma_pages_all(vma);
2098 vma = vma->vm_next;
2099 }
2100 }
2101 vma = mm->mmap;
2071 lru_add_drain(); 2102 lru_add_drain();
2072 flush_cache_mm(mm); 2103 flush_cache_mm(mm);
2073 tlb = tlb_gather_mmu(mm, 1); 2104 tlb = tlb_gather_mmu(mm, 1);
diff --git a/mm/mremap.c b/mm/mremap.c
index 1a7743923c8c..58a2908f42f5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -24,6 +24,8 @@
24#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
25#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
26 26
27#include "internal.h"
28
27static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) 29static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
28{ 30{
29 pgd_t *pgd; 31 pgd_t *pgd;
@@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
238 if (vm_flags & VM_LOCKED) { 240 if (vm_flags & VM_LOCKED) {
239 mm->locked_vm += new_len >> PAGE_SHIFT; 241 mm->locked_vm += new_len >> PAGE_SHIFT;
240 if (new_len > old_len) 242 if (new_len > old_len)
241 make_pages_present(new_addr + old_len, 243 mlock_vma_pages_range(new_vma, new_addr + old_len,
242 new_addr + new_len); 244 new_addr + new_len);
243 } 245 }
244 246
245 return new_addr; 247 return new_addr;
@@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long addr,
379 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 381 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
380 if (vma->vm_flags & VM_LOCKED) { 382 if (vma->vm_flags & VM_LOCKED) {
381 mm->locked_vm += pages; 383 mm->locked_vm += pages;
382 make_pages_present(addr + old_len, 384 mlock_vma_pages_range(vma, addr + old_len,
383 addr + new_len); 385 addr + new_len);
384 } 386 }
385 ret = addr; 387 ret = addr;
diff --git a/mm/nommu.c b/mm/nommu.c
index ed75bc962fbe..2696b24f2bb3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -34,6 +34,8 @@
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36 36
37#include "internal.h"
38
37void *high_memory; 39void *high_memory;
38struct page *mem_map; 40struct page *mem_map;
39unsigned long max_mapnr; 41unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
128 return PAGE_SIZE << compound_order(page); 130 return PAGE_SIZE << compound_order(page);
129} 131}
130 132
131/* 133int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
132 * get a list of pages in an address range belonging to the specified process 134 unsigned long start, int len, int flags,
133 * and indicate the VMA that covers each page 135 struct page **pages, struct vm_area_struct **vmas)
134 * - this is potentially dodgy as we may end incrementing the page count of a
135 * slab page or a secondary page from a compound page
136 * - don't permit access to VMAs that don't support it, such as I/O mappings
137 */
138int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
139 unsigned long start, int len, int write, int force,
140 struct page **pages, struct vm_area_struct **vmas)
141{ 136{
142 struct vm_area_struct *vma; 137 struct vm_area_struct *vma;
143 unsigned long vm_flags; 138 unsigned long vm_flags;
144 int i; 139 int i;
140 int write = !!(flags & GUP_FLAGS_WRITE);
141 int force = !!(flags & GUP_FLAGS_FORCE);
142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
145 143
146 /* calculate required read or write permissions. 144 /* calculate required read or write permissions.
147 * - if 'force' is set, we only require the "MAY" flags. 145 * - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
156 154
157 /* protect what we can, including chardevs */ 155 /* protect what we can, including chardevs */
158 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 156 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
159 !(vm_flags & vma->vm_flags)) 157 (!ignore && !(vm_flags & vma->vm_flags)))
160 goto finish_or_fault; 158 goto finish_or_fault;
161 159
162 if (pages) { 160 if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
174finish_or_fault: 172finish_or_fault:
175 return i ? : -EFAULT; 173 return i ? : -EFAULT;
176} 174}
175
176
177/*
178 * get a list of pages in an address range belonging to the specified process
179 * and indicate the VMA that covers each page
180 * - this is potentially dodgy as we may end incrementing the page count of a
181 * slab page or a secondary page from a compound page
182 * - don't permit access to VMAs that don't support it, such as I/O mappings
183 */
184int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 unsigned long start, int len, int write, int force,
186 struct page **pages, struct vm_area_struct **vmas)
187{
188 int flags = 0;
189
190 if (write)
191 flags |= GUP_FLAGS_WRITE;
192 if (force)
193 flags |= GUP_FLAGS_FORCE;
194
195 return __get_user_pages(tsk, mm,
196 start, len, flags,
197 pages, vmas);
198}
177EXPORT_SYMBOL(get_user_pages); 199EXPORT_SYMBOL(get_user_pages);
178 200
179DEFINE_RWLOCK(vmlist_lock); 201DEFINE_RWLOCK(vmlist_lock);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b40f6d5f8fe9..2970e35fd03f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
329 struct zone *z = 329 struct zone *z =
330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
331 331
332 x += zone_page_state(z, NR_FREE_PAGES) 332 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
333 + zone_page_state(z, NR_INACTIVE)
334 + zone_page_state(z, NR_ACTIVE);
335 } 333 }
336 /* 334 /*
337 * Make sure that the number of highmem pages is never larger 335 * Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
355{ 353{
356 unsigned long x; 354 unsigned long x;
357 355
358 x = global_page_state(NR_FREE_PAGES) 356 x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
359 + global_page_state(NR_INACTIVE)
360 + global_page_state(NR_ACTIVE);
361 357
362 if (!vm_highmem_is_dirtyable) 358 if (!vm_highmem_is_dirtyable)
363 x -= highmem_dirtyable_memory(x); 359 x -= highmem_dirtyable_memory(x);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9eb9eb928285..d0a240fbb8bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -44,7 +44,7 @@
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/fault-inject.h> 45#include <linux/fault-inject.h>
46#include <linux/page-isolation.h> 46#include <linux/page-isolation.h>
47#include <linux/memcontrol.h> 47#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 48#include <linux/debugobjects.h>
49 49
50#include <asm/tlbflush.h> 50#include <asm/tlbflush.h>
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 void *pc = page_get_page_cgroup(page);
227
228 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
229 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
230 current->comm, page, (int)(2*sizeof(unsigned long)), 228 current->comm, page, (int)(2*sizeof(unsigned long)),
231 (unsigned long)page->flags, page->mapping, 229 (unsigned long)page->flags, page->mapping,
232 page_mapcount(page), page_count(page)); 230 page_mapcount(page), page_count(page));
233 if (pc) { 231
234 printk(KERN_EMERG "cgroup:%p\n", pc);
235 page_reset_bad_cgroup(page);
236 }
237 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
238 KERN_EMERG "Backtrace:\n"); 233 KERN_EMERG "Backtrace:\n");
239 dump_stack(); 234 dump_stack();
@@ -454,14 +449,16 @@ static inline void __free_one_page(struct page *page,
454 449
455static inline int free_pages_check(struct page *page) 450static inline int free_pages_check(struct page *page)
456{ 451{
452 free_page_mlock(page);
457 if (unlikely(page_mapcount(page) | 453 if (unlikely(page_mapcount(page) |
458 (page->mapping != NULL) | 454 (page->mapping != NULL) |
459 (page_get_page_cgroup(page) != NULL) |
460 (page_count(page) != 0) | 455 (page_count(page) != 0) |
461 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 456 (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
462 bad_page(page); 457 bad_page(page);
463 if (PageDirty(page)) 458 if (PageDirty(page))
464 __ClearPageDirty(page); 459 __ClearPageDirty(page);
460 if (PageSwapBacked(page))
461 __ClearPageSwapBacked(page);
465 /* 462 /*
466 * For now, we report if PG_reserved was found set, but do not 463 * For now, we report if PG_reserved was found set, but do not
467 * clear it, and do not free the page. But we shall soon need 464 * clear it, and do not free the page. But we shall soon need
@@ -600,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
600{ 597{
601 if (unlikely(page_mapcount(page) | 598 if (unlikely(page_mapcount(page) |
602 (page->mapping != NULL) | 599 (page->mapping != NULL) |
603 (page_get_page_cgroup(page) != NULL) |
604 (page_count(page) != 0) | 600 (page_count(page) != 0) |
605 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 601 (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
606 bad_page(page); 602 bad_page(page);
@@ -614,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
614 610
615 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 611 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
616 1 << PG_referenced | 1 << PG_arch_1 | 612 1 << PG_referenced | 1 << PG_arch_1 |
617 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 613 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
614#ifdef CONFIG_UNEVICTABLE_LRU
615 | 1 << PG_mlocked
616#endif
617 );
618 set_page_private(page, 0); 618 set_page_private(page, 0);
619 set_page_refcounted(page); 619 set_page_refcounted(page);
620 620
@@ -1862,10 +1862,21 @@ void show_free_areas(void)
1862 } 1862 }
1863 } 1863 }
1864 1864
1865 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1865 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1866 " inactive_file:%lu"
1867//TODO: check/adjust line lengths
1868#ifdef CONFIG_UNEVICTABLE_LRU
1869 " unevictable:%lu"
1870#endif
1871 " dirty:%lu writeback:%lu unstable:%lu\n"
1866 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1872 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1867 global_page_state(NR_ACTIVE), 1873 global_page_state(NR_ACTIVE_ANON),
1868 global_page_state(NR_INACTIVE), 1874 global_page_state(NR_ACTIVE_FILE),
1875 global_page_state(NR_INACTIVE_ANON),
1876 global_page_state(NR_INACTIVE_FILE),
1877#ifdef CONFIG_UNEVICTABLE_LRU
1878 global_page_state(NR_UNEVICTABLE),
1879#endif
1869 global_page_state(NR_FILE_DIRTY), 1880 global_page_state(NR_FILE_DIRTY),
1870 global_page_state(NR_WRITEBACK), 1881 global_page_state(NR_WRITEBACK),
1871 global_page_state(NR_UNSTABLE_NFS), 1882 global_page_state(NR_UNSTABLE_NFS),
@@ -1888,8 +1899,13 @@ void show_free_areas(void)
1888 " min:%lukB" 1899 " min:%lukB"
1889 " low:%lukB" 1900 " low:%lukB"
1890 " high:%lukB" 1901 " high:%lukB"
1891 " active:%lukB" 1902 " active_anon:%lukB"
1892 " inactive:%lukB" 1903 " inactive_anon:%lukB"
1904 " active_file:%lukB"
1905 " inactive_file:%lukB"
1906#ifdef CONFIG_UNEVICTABLE_LRU
1907 " unevictable:%lukB"
1908#endif
1893 " present:%lukB" 1909 " present:%lukB"
1894 " pages_scanned:%lu" 1910 " pages_scanned:%lu"
1895 " all_unreclaimable? %s" 1911 " all_unreclaimable? %s"
@@ -1899,8 +1915,13 @@ void show_free_areas(void)
1899 K(zone->pages_min), 1915 K(zone->pages_min),
1900 K(zone->pages_low), 1916 K(zone->pages_low),
1901 K(zone->pages_high), 1917 K(zone->pages_high),
1902 K(zone_page_state(zone, NR_ACTIVE)), 1918 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1903 K(zone_page_state(zone, NR_INACTIVE)), 1919 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1920 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1921 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1922#ifdef CONFIG_UNEVICTABLE_LRU
1923 K(zone_page_state(zone, NR_UNEVICTABLE)),
1924#endif
1904 K(zone->present_pages), 1925 K(zone->present_pages),
1905 zone->pages_scanned, 1926 zone->pages_scanned,
1906 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 1927 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -3410,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3410 pgdat->nr_zones = 0; 3431 pgdat->nr_zones = 0;
3411 init_waitqueue_head(&pgdat->kswapd_wait); 3432 init_waitqueue_head(&pgdat->kswapd_wait);
3412 pgdat->kswapd_max_order = 0; 3433 pgdat->kswapd_max_order = 0;
3434 pgdat_page_cgroup_init(pgdat);
3413 3435
3414 for (j = 0; j < MAX_NR_ZONES; j++) { 3436 for (j = 0; j < MAX_NR_ZONES; j++) {
3415 struct zone *zone = pgdat->node_zones + j; 3437 struct zone *zone = pgdat->node_zones + j;
3416 unsigned long size, realsize, memmap_pages; 3438 unsigned long size, realsize, memmap_pages;
3439 enum lru_list l;
3417 3440
3418 size = zone_spanned_pages_in_node(nid, j, zones_size); 3441 size = zone_spanned_pages_in_node(nid, j, zones_size);
3419 realsize = size - zone_absent_pages_in_node(nid, j, 3442 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -3428,8 +3451,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3428 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3451 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3429 if (realsize >= memmap_pages) { 3452 if (realsize >= memmap_pages) {
3430 realsize -= memmap_pages; 3453 realsize -= memmap_pages;
3431 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3454 printk(KERN_DEBUG
3432 "%s zone: %lu pages used for memmap\n", 3455 " %s zone: %lu pages used for memmap\n",
3433 zone_names[j], memmap_pages); 3456 zone_names[j], memmap_pages);
3434 } else 3457 } else
3435 printk(KERN_WARNING 3458 printk(KERN_WARNING
@@ -3439,8 +3462,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3439 /* Account for reserved pages */ 3462 /* Account for reserved pages */
3440 if (j == 0 && realsize > dma_reserve) { 3463 if (j == 0 && realsize > dma_reserve) {
3441 realsize -= dma_reserve; 3464 realsize -= dma_reserve;
3442 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3465 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
3443 "%s zone: %lu pages reserved\n",
3444 zone_names[0], dma_reserve); 3466 zone_names[0], dma_reserve);
3445 } 3467 }
3446 3468
@@ -3465,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3465 zone->prev_priority = DEF_PRIORITY; 3487 zone->prev_priority = DEF_PRIORITY;
3466 3488
3467 zone_pcp_init(zone); 3489 zone_pcp_init(zone);
3468 INIT_LIST_HEAD(&zone->active_list); 3490 for_each_lru(l) {
3469 INIT_LIST_HEAD(&zone->inactive_list); 3491 INIT_LIST_HEAD(&zone->lru[l].list);
3470 zone->nr_scan_active = 0; 3492 zone->lru[l].nr_scan = 0;
3471 zone->nr_scan_inactive = 0; 3493 }
3494 zone->recent_rotated[0] = 0;
3495 zone->recent_rotated[1] = 0;
3496 zone->recent_scanned[0] = 0;
3497 zone->recent_scanned[1] = 0;
3472 zap_zone_vm_stats(zone); 3498 zap_zone_vm_stats(zone);
3473 zone->flags = 0; 3499 zone->flags = 0;
3474 if (!size) 3500 if (!size)
@@ -4210,7 +4236,7 @@ void setup_per_zone_pages_min(void)
4210 for_each_zone(zone) { 4236 for_each_zone(zone) {
4211 u64 tmp; 4237 u64 tmp;
4212 4238
4213 spin_lock_irqsave(&zone->lru_lock, flags); 4239 spin_lock_irqsave(&zone->lock, flags);
4214 tmp = (u64)pages_min * zone->present_pages; 4240 tmp = (u64)pages_min * zone->present_pages;
4215 do_div(tmp, lowmem_pages); 4241 do_div(tmp, lowmem_pages);
4216 if (is_highmem(zone)) { 4242 if (is_highmem(zone)) {
@@ -4242,13 +4268,53 @@ void setup_per_zone_pages_min(void)
4242 zone->pages_low = zone->pages_min + (tmp >> 2); 4268 zone->pages_low = zone->pages_min + (tmp >> 2);
4243 zone->pages_high = zone->pages_min + (tmp >> 1); 4269 zone->pages_high = zone->pages_min + (tmp >> 1);
4244 setup_zone_migrate_reserve(zone); 4270 setup_zone_migrate_reserve(zone);
4245 spin_unlock_irqrestore(&zone->lru_lock, flags); 4271 spin_unlock_irqrestore(&zone->lock, flags);
4246 } 4272 }
4247 4273
4248 /* update totalreserve_pages */ 4274 /* update totalreserve_pages */
4249 calculate_totalreserve_pages(); 4275 calculate_totalreserve_pages();
4250} 4276}
4251 4277
4278/**
4279 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4280 *
4281 * The inactive anon list should be small enough that the VM never has to
4282 * do too much work, but large enough that each inactive page has a chance
4283 * to be referenced again before it is swapped out.
4284 *
4285 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
4286 * INACTIVE_ANON pages on this zone's LRU, maintained by the
4287 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
4288 * the anonymous pages are kept on the inactive list.
4289 *
4290 * total target max
4291 * memory ratio inactive anon
4292 * -------------------------------------
4293 * 10MB 1 5MB
4294 * 100MB 1 50MB
4295 * 1GB 3 250MB
4296 * 10GB 10 0.9GB
4297 * 100GB 31 3GB
4298 * 1TB 101 10GB
4299 * 10TB 320 32GB
4300 */
4301void setup_per_zone_inactive_ratio(void)
4302{
4303 struct zone *zone;
4304
4305 for_each_zone(zone) {
4306 unsigned int gb, ratio;
4307
4308 /* Zone size in gigabytes */
4309 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4310 ratio = int_sqrt(10 * gb);
4311 if (!ratio)
4312 ratio = 1;
4313
4314 zone->inactive_ratio = ratio;
4315 }
4316}
4317
4252/* 4318/*
4253 * Initialise min_free_kbytes. 4319 * Initialise min_free_kbytes.
4254 * 4320 *
@@ -4286,6 +4352,7 @@ static int __init init_per_zone_pages_min(void)
4286 min_free_kbytes = 65536; 4352 min_free_kbytes = 65536;
4287 setup_per_zone_pages_min(); 4353 setup_per_zone_pages_min();
4288 setup_per_zone_lowmem_reserve(); 4354 setup_per_zone_lowmem_reserve();
4355 setup_per_zone_inactive_ratio();
4289 return 0; 4356 return 0;
4290} 4357}
4291module_init(init_per_zone_pages_min) 4358module_init(init_per_zone_pages_min)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
new file mode 100644
index 000000000000..5d86550701f2
--- /dev/null
+++ b/mm/page_cgroup.c
@@ -0,0 +1,237 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/memory.h>
8
/*
 * Reset one page_cgroup entry: clear its flags, detach it from any
 * mem_cgroup and point it at the struct page for @pfn.
 */
9static void __meminit
10__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
11{
12 pc->flags = 0;
13 pc->mem_cgroup = NULL;
14 pc->page = pfn_to_page(pfn);
15}
16static unsigned long total_usage;
17
18#if !defined(CONFIG_SPARSEMEM)
19
20
/*
 * Non-sparsemem variant: mark the node as having no page_cgroup table yet.
 * lookup_page_cgroup() returns NULL for this node until
 * alloc_node_page_cgroup() installs a table.
 */
21void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
22{
23 pgdat->node_page_cgroup = NULL;
24}
25
/*
 * Non-sparsemem variant: return the page_cgroup entry for @page from the
 * per-node table, or NULL when the node's table has not been allocated.
 */
26struct page_cgroup *lookup_page_cgroup(struct page *page)
27{
28 unsigned long pfn = page_to_pfn(page);
29 unsigned long offset;
30 struct page_cgroup *base;
31
32 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
33 if (unlikely(!base))
34 return NULL;
35
 /* the table is indexed by pfn relative to the node's first pfn */
36 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
37 return base + offset;
38}
39
/*
 * Allocate and initialise the page_cgroup table for node @nid.
 *
 * The table has one entry per pfn in the node's spanned range
 * (node_spanned_pages entries starting at node_start_pfn) and comes from
 * the bootmem allocator. On success the table is published in
 * node_page_cgroup and its size is added to total_usage.
 *
 * Returns 0 on success, -ENOMEM if the bootmem allocation fails.
 */
40static int __init alloc_node_page_cgroup(int nid)
41{
42 struct page_cgroup *base, *pc;
43 unsigned long table_size;
44 unsigned long start_pfn, nr_pages, index;
45
46 start_pfn = NODE_DATA(nid)->node_start_pfn;
47 nr_pages = NODE_DATA(nid)->node_spanned_pages;
48
49 table_size = sizeof(struct page_cgroup) * nr_pages;
50
 /* the _nopanic allocator returns NULL on failure, checked just below */
51 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
52 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
53 if (!base)
54 return -ENOMEM;
55 for (index = 0; index < nr_pages; index++) {
56 pc = base + index;
57 __init_page_cgroup(pc, start_pfn + index);
58 }
 /* publish only after every entry has been initialised */
59 NODE_DATA(nid)->node_page_cgroup = base;
60 total_usage += table_size;
61 return 0;
62}
63
/*
 * Non-sparsemem variant: allocate a page_cgroup table for every online
 * node. Any allocation failure is fatal: a hint about the
 * cgroup_disable=memory boot option is printed and the kernel panics.
 */
64void __init page_cgroup_init(void)
65{
66
67 int nid, fail;
68
69 for_each_online_node(nid) {
70 fail = alloc_node_page_cgroup(nid);
71 if (fail)
72 goto fail;
73 }
74 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
75 printk(KERN_INFO "please try cgroup_disable=memory option if you"
76 " don't want\n");
77 return;
 /* the label shares its name with the local 'fail'; labels have their own namespace */
78fail:
79 printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
80 printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
81 panic("Out of memory");
82}
83
84#else /* CONFIG_SPARSEMEM */
85
/*
 * Sparsemem variant: return the page_cgroup entry for @page, or NULL when
 * the page's memory section has no page_cgroup table.
 *
 * section->page_cgroup holds the table base biased by the section's first
 * pfn (see init_section_page_cgroup()), so adding the page's pfn yields
 * its entry directly. The pointer is NULL before the section is
 * initialised and after __free_page_cgroup(); without the check below the
 * function would return a bogus non-NULL pointer in those cases, unlike
 * the non-sparsemem variant, which returns NULL.
 */
86struct page_cgroup *lookup_page_cgroup(struct page *page)
87{
88 unsigned long pfn = page_to_pfn(page);
89 struct mem_section *section = __pfn_to_section(pfn);
90
91 return section->page_cgroup ? section->page_cgroup + pfn : NULL;
92}
93
/*
 * Allocate and initialise the page_cgroup table for the memory section
 * containing @pfn.
 *
 * Entries are initialised for pfn .. pfn + PAGES_PER_SECTION - 1, and the
 * stored pointer is biased by @pfn; both visible callers pass a pfn that
 * is meant to be the first pfn of the section.
 *
 * Returns 0 on success or if the section already has a table, -ENOMEM if
 * neither kmalloc_node() nor vmalloc_node() can provide the memory.
 */
94int __meminit init_section_page_cgroup(unsigned long pfn)
95{
96 struct mem_section *section;
97 struct page_cgroup *base, *pc;
98 unsigned long table_size;
99 int nid, index;
100
101 section = __pfn_to_section(pfn);
102
 /* already set up: nothing to do */
103 if (section->page_cgroup)
104 return 0;
105
106 nid = page_to_nid(pfn_to_page(pfn));
107
108 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 /* try kmalloc first, fall back to vmalloc if that fails */
109 base = kmalloc_node(table_size, GFP_KERNEL, nid);
110 if (!base)
111 base = vmalloc_node(table_size, nid);
112
113 if (!base) {
114 printk(KERN_ERR "page cgroup allocation failure\n");
115 return -ENOMEM;
116 }
117
118 for (index = 0; index < PAGES_PER_SECTION; index++) {
119 pc = base + index;
120 __init_page_cgroup(pc, pfn + index);
121 }
122
123 section = __pfn_to_section(pfn);
 /* store the base biased by pfn so lookup can simply add the page's pfn */
124 section->page_cgroup = base - pfn;
125 total_usage += table_size;
126 return 0;
127}
128#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Free the page_cgroup table of the memory section containing @pfn, if it
 * has one, and clear the section's pointer.
 *
 * @pfn must be the same pfn that was passed to init_section_page_cgroup()
 * for this section: the stored pointer is biased by that value, and the
 * real allocation address is recovered by adding @pfn back.
 */
129void __free_page_cgroup(unsigned long pfn)
130{
131 struct mem_section *ms;
132 struct page_cgroup *base;
133
134 ms = __pfn_to_section(pfn);
135 if (!ms || !ms->page_cgroup)
136 return;
 /* undo the "base - pfn" bias applied when the table was installed */
137 base = ms->page_cgroup + pfn;
138 ms->page_cgroup = NULL;
 /* the table came from either vmalloc_node() or kmalloc_node() */
139 if (is_vmalloc_addr(base))
140 vfree(base);
141 else
142 kfree(base);
143}
144
/*
 * Set up page_cgroup tables for every present memory section overlapping
 * the pfn range [start_pfn, start_pfn + nr_pages).
 *
 * The range is widened to whole sections: start is rounded DOWN to a
 * section boundary (hence the inverted mask) and end is rounded up.
 * @nid is not used here.
 *
 * Returns 0 on success. If any section fails to initialise, every table
 * in the widened range is freed again and -ENOMEM is returned.
 */
145int online_page_cgroup(unsigned long start_pfn,
146 unsigned long nr_pages,
147 int nid)
148{
149 unsigned long start, end, pfn;
150 int fail = 0;
151
 /* round down to the first pfn of the section containing start_pfn */
152 start = start_pfn & ~(PAGES_PER_SECTION - 1);
153 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
154
155 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
156 if (!pfn_present(pfn))
157 continue;
158 fail = init_section_page_cgroup(pfn);
159 }
160 if (!fail)
161 return 0;
162
163 /* rollback */
 /* NOTE(review): this also frees tables that existed before this call */
164 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
165 __free_page_cgroup(pfn);
166
167 return -ENOMEM;
168}
169
/*
 * Free the page_cgroup tables of every memory section overlapping the pfn
 * range [start_pfn, start_pfn + nr_pages).
 *
 * The range is widened to whole sections: start is rounded DOWN to a
 * section boundary (hence the inverted mask) and end is rounded up.
 * @nid is not used here. Always returns 0.
 */
170int offline_page_cgroup(unsigned long start_pfn,
171 unsigned long nr_pages, int nid)
172{
173 unsigned long start, end, pfn;
174
 /* round down to the first pfn of the section containing start_pfn */
175 start = start_pfn & ~(PAGES_PER_SECTION - 1);
176 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
177
178 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
179 __free_page_cgroup(pfn);
180 return 0;
181
182}
183
/*
 * Memory hotplug notifier callback.
 *
 * MEM_GOING_ONLINE allocates page_cgroup tables for the range described
 * by the memory_notify argument; MEM_CANCEL_ONLINE and MEM_OFFLINE free
 * them. All other actions are ignored. The result is converted with
 * notifier_from_errno() before being returned.
 */
184static int page_cgroup_callback(struct notifier_block *self,
185 unsigned long action, void *arg)
186{
187 struct memory_notify *mn = arg;
188 int ret = 0;
189 switch (action) {
190 case MEM_GOING_ONLINE:
191 ret = online_page_cgroup(mn->start_pfn,
192 mn->nr_pages, mn->status_change_nid);
193 break;
194 case MEM_CANCEL_ONLINE:
195 case MEM_OFFLINE:
 /* return value ignored: offline_page_cgroup() always returns 0 */
196 offline_page_cgroup(mn->start_pfn,
197 mn->nr_pages, mn->status_change_nid);
198 break;
199 case MEM_GOING_OFFLINE:
200 break;
201 case MEM_ONLINE:
202 case MEM_CANCEL_OFFLINE:
203 break;
204 }
205 ret = notifier_from_errno(ret);
206 return ret;
207}
208
209#endif
210
/*
 * Sparsemem variant: allocate page_cgroup tables for every present memory
 * section below max_pfn. A failure is fatal (panic); on success the
 * memory hotplug notifier is registered.
 */
211void __init page_cgroup_init(void)
212{
213 unsigned long pfn;
214 int fail = 0;
215
 /* one call per section; the loop stops at the first failure */
216 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
217 if (!pfn_present(pfn))
218 continue;
219 fail = init_section_page_cgroup(pfn);
220 }
221 if (fail) {
222 printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
223 panic("Out of memory");
224 } else {
225 hotplug_memory_notifier(page_cgroup_callback, 0);
226 }
227 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
228 printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
229 " want\n");
230}
231
/*
 * Sparsemem variant: nothing to set up per node, because the page_cgroup
 * tables hang off the memory sections rather than off the pglist_data.
 */
232void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
233{
234 return;
235}
236
237#endif
diff --git a/mm/readahead.c b/mm/readahead.c
index 6cbd9a72fde2..bec83c15a78f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
229 */ 229 */
230unsigned long max_sane_readahead(unsigned long nr) 230unsigned long max_sane_readahead(unsigned long nr)
231{ 231{
232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) 232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
234} 234}
235 235
diff --git a/mm/rmap.c b/mm/rmap.c
index 0383acfcb068..10993942d6c9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,9 +53,47 @@
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
56struct kmem_cache *anon_vma_cachep; 56#include "internal.h"
57 57
58/* This must be called under the mmap_sem. */ 58static struct kmem_cache *anon_vma_cachep;
59
/*
 * Allocate an anon_vma from anon_vma_cachep with GFP_KERNEL; returns the
 * result of kmem_cache_alloc() unchanged.
 */
60static inline struct anon_vma *anon_vma_alloc(void)
61{
62 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
63}
64
/* Return @anon_vma to anon_vma_cachep. */
65static inline void anon_vma_free(struct anon_vma *anon_vma)
66{
67 kmem_cache_free(anon_vma_cachep, anon_vma);
68}
69
70/**
71 * anon_vma_prepare - attach an anon_vma to a memory region
72 * @vma: the memory region in question
73 *
74 * This makes sure the memory mapping described by 'vma' has
75 * an 'anon_vma' attached to it, so that we can associate the
76 * anonymous pages mapped into it with that anon_vma.
77 *
78 * The common case will be that we already have one, but
79 * if not we either need to find an adjacent mapping that we
80 * can re-use the anon_vma from (very common when the only
81 * reason for splitting a vma has been mprotect()), or we
82 * allocate a new one.
83 *
84 * Anon-vma allocations are very subtle, because we may have
85 * optimistically looked up an anon_vma in page_lock_anon_vma()
86 * and that may actually touch the spinlock even in the newly
87 * allocated vma (it depends on RCU to make sure that the
88 * anon_vma isn't actually destroyed).
89 *
90 * As a result, we need to do proper anon_vma locking even
91 * for the new allocation. At the same time, we do not want
92 * to do any locking for the common case of already having
93 * an anon_vma.
94 *
95 * This must be called with the mmap_sem held for reading.
96 */
59int anon_vma_prepare(struct vm_area_struct *vma) 97int anon_vma_prepare(struct vm_area_struct *vma)
60{ 98{
61 struct anon_vma *anon_vma = vma->anon_vma; 99 struct anon_vma *anon_vma = vma->anon_vma;
@@ -63,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma)
63 might_sleep(); 101 might_sleep();
64 if (unlikely(!anon_vma)) { 102 if (unlikely(!anon_vma)) {
65 struct mm_struct *mm = vma->vm_mm; 103 struct mm_struct *mm = vma->vm_mm;
66 struct anon_vma *allocated, *locked; 104 struct anon_vma *allocated;
67 105
68 anon_vma = find_mergeable_anon_vma(vma); 106 anon_vma = find_mergeable_anon_vma(vma);
69 if (anon_vma) { 107 allocated = NULL;
70 allocated = NULL; 108 if (!anon_vma) {
71 locked = anon_vma;
72 spin_lock(&locked->lock);
73 } else {
74 anon_vma = anon_vma_alloc(); 109 anon_vma = anon_vma_alloc();
75 if (unlikely(!anon_vma)) 110 if (unlikely(!anon_vma))
76 return -ENOMEM; 111 return -ENOMEM;
77 allocated = anon_vma; 112 allocated = anon_vma;
78 locked = NULL;
79 } 113 }
114 spin_lock(&anon_vma->lock);
80 115
81 /* page_table_lock to protect against threads */ 116 /* page_table_lock to protect against threads */
82 spin_lock(&mm->page_table_lock); 117 spin_lock(&mm->page_table_lock);
@@ -87,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
87 } 122 }
88 spin_unlock(&mm->page_table_lock); 123 spin_unlock(&mm->page_table_lock);
89 124
90 if (locked) 125 spin_unlock(&anon_vma->lock);
91 spin_unlock(&locked->lock);
92 if (unlikely(allocated)) 126 if (unlikely(allocated))
93 anon_vma_free(allocated); 127 anon_vma_free(allocated);
94 } 128 }
@@ -157,7 +191,7 @@ void __init anon_vma_init(void)
157 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
158 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 192 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
159 */ 193 */
160static struct anon_vma *page_lock_anon_vma(struct page *page) 194struct anon_vma *page_lock_anon_vma(struct page *page)
161{ 195{
162 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
163 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -177,7 +211,7 @@ out:
177 return NULL; 211 return NULL;
178} 212}
179 213
180static void page_unlock_anon_vma(struct anon_vma *anon_vma) 214void page_unlock_anon_vma(struct anon_vma *anon_vma)
181{ 215{
182 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
183 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -268,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
268 return NULL; 302 return NULL;
269} 303}
270 304
305/**
306 * page_mapped_in_vma - check whether a page is really mapped in a VMA
307 * @page: the page to test
308 * @vma: the VMA to test
309 *
310 * Returns 1 if the page is mapped into the page tables of the VMA, 0
311 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs.
313 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{
316 unsigned long address;
317 pte_t *pte;
318 spinlock_t *ptl;
319
320 address = vma_address(page, vma);
321 if (address == -EFAULT) /* out of vma range */
322 return 0;
 /* NOTE(review): meaning of the final argument (1) is not visible here */
323 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
324 if (!pte) /* the page is not in this mm */
325 return 0;
 /* only the existence of the mapping matters; release the pte and ptl */
326 pte_unmap_unlock(pte, ptl);
327
328 return 1;
329}
330
271/* 331/*
272 * Subfunctions of page_referenced: page_referenced_one called 332 * Subfunctions of page_referenced: page_referenced_one called
273 * repeatedly from either page_referenced_anon or page_referenced_file. 333 * repeatedly from either page_referenced_anon or page_referenced_file.
@@ -289,10 +349,17 @@ static int page_referenced_one(struct page *page,
289 if (!pte) 349 if (!pte)
290 goto out; 350 goto out;
291 351
352 /*
353 * Don't want to elevate referenced for mlocked page that gets this far,
354 * in order that it progresses to try_to_unmap and is moved to the
355 * unevictable list.
356 */
292 if (vma->vm_flags & VM_LOCKED) { 357 if (vma->vm_flags & VM_LOCKED) {
293 referenced++;
294 *mapcount = 1; /* break early from loop */ 358 *mapcount = 1; /* break early from loop */
295 } else if (ptep_clear_flush_young_notify(vma, address, pte)) 359 goto out_unmap;
360 }
361
362 if (ptep_clear_flush_young_notify(vma, address, pte))
296 referenced++; 363 referenced++;
297 364
298 /* Pretend the page is referenced if the task has the 365 /* Pretend the page is referenced if the task has the
@@ -301,6 +368,7 @@ static int page_referenced_one(struct page *page,
301 rwsem_is_locked(&mm->mmap_sem)) 368 rwsem_is_locked(&mm->mmap_sem))
302 referenced++; 369 referenced++;
303 370
371out_unmap:
304 (*mapcount)--; 372 (*mapcount)--;
305 pte_unmap_unlock(pte, ptl); 373 pte_unmap_unlock(pte, ptl);
306out: 374out:
@@ -390,11 +458,6 @@ static int page_referenced_file(struct page *page,
390 */ 458 */
391 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 459 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
392 continue; 460 continue;
393 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
394 == (VM_LOCKED|VM_MAYSHARE)) {
395 referenced++;
396 break;
397 }
398 referenced += page_referenced_one(page, vma, &mapcount); 461 referenced += page_referenced_one(page, vma, &mapcount);
399 if (!mapcount) 462 if (!mapcount)
400 break; 463 break;
@@ -674,8 +737,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
674 page_clear_dirty(page); 737 page_clear_dirty(page);
675 set_page_dirty(page); 738 set_page_dirty(page);
676 } 739 }
677 740 if (PageAnon(page))
678 mem_cgroup_uncharge_page(page); 741 mem_cgroup_uncharge_page(page);
679 __dec_zone_page_state(page, 742 __dec_zone_page_state(page,
680 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 743 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
681 /* 744 /*
@@ -717,11 +780,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
717 * If it's recently referenced (perhaps page_referenced 780 * If it's recently referenced (perhaps page_referenced
718 * skipped over this mm) then we should reactivate it. 781 * skipped over this mm) then we should reactivate it.
719 */ 782 */
720 if (!migration && ((vma->vm_flags & VM_LOCKED) || 783 if (!migration) {
721 (ptep_clear_flush_young_notify(vma, address, pte)))) { 784 if (vma->vm_flags & VM_LOCKED) {
722 ret = SWAP_FAIL; 785 ret = SWAP_MLOCK;
723 goto out_unmap; 786 goto out_unmap;
724 } 787 }
788 if (ptep_clear_flush_young_notify(vma, address, pte)) {
789 ret = SWAP_FAIL;
790 goto out_unmap;
791 }
792 }
725 793
726 /* Nuke the page table entry. */ 794 /* Nuke the page table entry. */
727 flush_cache_page(vma, address, page_to_pfn(page)); 795 flush_cache_page(vma, address, page_to_pfn(page));
@@ -802,12 +870,17 @@ out:
802 * For very sparsely populated VMAs this is a little inefficient - chances are 870 * For very sparsely populated VMAs this is a little inefficient - chances are
803 * there there won't be many ptes located within the scan cluster. In this case 871 * there there won't be many ptes located within the scan cluster. In this case
804 * maybe we could scan further - to the end of the pte page, perhaps. 872 * maybe we could scan further - to the end of the pte page, perhaps.
873 *
874 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
875 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
876 * rather than unmapping them. If we encounter the "check_page" that vmscan is
877 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
805 */ 878 */
806#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 879#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
807#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 880#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
808 881
809static void try_to_unmap_cluster(unsigned long cursor, 882static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
810 unsigned int *mapcount, struct vm_area_struct *vma) 883 struct vm_area_struct *vma, struct page *check_page)
811{ 884{
812 struct mm_struct *mm = vma->vm_mm; 885 struct mm_struct *mm = vma->vm_mm;
813 pgd_t *pgd; 886 pgd_t *pgd;
@@ -819,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
819 struct page *page; 892 struct page *page;
820 unsigned long address; 893 unsigned long address;
821 unsigned long end; 894 unsigned long end;
895 int ret = SWAP_AGAIN;
896 int locked_vma = 0;
822 897
823 address = (vma->vm_start + cursor) & CLUSTER_MASK; 898 address = (vma->vm_start + cursor) & CLUSTER_MASK;
824 end = address + CLUSTER_SIZE; 899 end = address + CLUSTER_SIZE;
@@ -829,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
829 904
830 pgd = pgd_offset(mm, address); 905 pgd = pgd_offset(mm, address);
831 if (!pgd_present(*pgd)) 906 if (!pgd_present(*pgd))
832 return; 907 return ret;
833 908
834 pud = pud_offset(pgd, address); 909 pud = pud_offset(pgd, address);
835 if (!pud_present(*pud)) 910 if (!pud_present(*pud))
836 return; 911 return ret;
837 912
838 pmd = pmd_offset(pud, address); 913 pmd = pmd_offset(pud, address);
839 if (!pmd_present(*pmd)) 914 if (!pmd_present(*pmd))
840 return; 915 return ret;
916
917 /*
918 * MLOCK_PAGES => feature is configured.
919 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
920 * keep the sem while scanning the cluster for mlocking pages.
921 */
922 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
923 locked_vma = (vma->vm_flags & VM_LOCKED);
924 if (!locked_vma)
925 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
926 }
841 927
842 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 928 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
843 929
@@ -850,6 +936,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
850 page = vm_normal_page(vma, address, *pte); 936 page = vm_normal_page(vma, address, *pte);
851 BUG_ON(!page || PageAnon(page)); 937 BUG_ON(!page || PageAnon(page));
852 938
939 if (locked_vma) {
940 mlock_vma_page(page); /* no-op if already mlocked */
941 if (page == check_page)
942 ret = SWAP_MLOCK;
943 continue; /* don't unmap */
944 }
945
853 if (ptep_clear_flush_young_notify(vma, address, pte)) 946 if (ptep_clear_flush_young_notify(vma, address, pte))
854 continue; 947 continue;
855 948
@@ -871,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
871 (*mapcount)--; 964 (*mapcount)--;
872 } 965 }
873 pte_unmap_unlock(pte - 1, ptl); 966 pte_unmap_unlock(pte - 1, ptl);
967 if (locked_vma)
968 up_read(&vma->vm_mm->mmap_sem);
969 return ret;
874} 970}
875 971
876static int try_to_unmap_anon(struct page *page, int migration) 972/*
973 * common handling for pages mapped in VM_LOCKED vmas
974 */
/*
 * Try to mlock @page on behalf of @vma.
 *
 * The mm's mmap_sem is taken for read with a trylock only, so this never
 * blocks. VM_LOCKED is re-checked under the semaphore before
 * mlock_vma_page() is called.
 *
 * Returns 1 if mlock_vma_page() was called, 0 if the semaphore could not
 * be taken or the vma is not VM_LOCKED.
 */
975static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
976{
977 int mlocked = 0;
978
979 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
980 if (vma->vm_flags & VM_LOCKED) {
981 mlock_vma_page(page);
982 mlocked++; /* really mlocked the page */
983 }
984 up_read(&vma->vm_mm->mmap_sem);
985 }
986 return mlocked;
987}
988
989/**
990 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
991 * rmap method
992 * @page: the page to unmap/unlock
993 * @unlock: request for unlock rather than unmap [unlikely]
994 * @migration: unmapping for migration - ignored if @unlock
995 *
996 * Find all the mappings of a page using the mapping pointer and the vma chains
997 * contained in the anon_vma struct it points to.
998 *
999 * This function is only called from try_to_unmap/try_to_munlock for
1000 * anonymous pages.
1001 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1002 * where the page was found will be held for write. So, we won't recheck
1003 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1004 * 'LOCKED.
1005 */
1006static int try_to_unmap_anon(struct page *page, int unlock, int migration)
877{ 1007{
878 struct anon_vma *anon_vma; 1008 struct anon_vma *anon_vma;
879 struct vm_area_struct *vma; 1009 struct vm_area_struct *vma;
1010 unsigned int mlocked = 0;
880 int ret = SWAP_AGAIN; 1011 int ret = SWAP_AGAIN;
881 1012
1013 if (MLOCK_PAGES && unlikely(unlock))
1014 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1015
882 anon_vma = page_lock_anon_vma(page); 1016 anon_vma = page_lock_anon_vma(page);
883 if (!anon_vma) 1017 if (!anon_vma)
884 return ret; 1018 return ret;
885 1019
886 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1020 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
887 ret = try_to_unmap_one(page, vma, migration); 1021 if (MLOCK_PAGES && unlikely(unlock)) {
888 if (ret == SWAP_FAIL || !page_mapped(page)) 1022 if (!((vma->vm_flags & VM_LOCKED) &&
889 break; 1023 page_mapped_in_vma(page, vma)))
1024 continue; /* must visit all unlocked vmas */
1025 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1026 } else {
1027 ret = try_to_unmap_one(page, vma, migration);
1028 if (ret == SWAP_FAIL || !page_mapped(page))
1029 break;
1030 }
1031 if (ret == SWAP_MLOCK) {
1032 mlocked = try_to_mlock_page(page, vma);
1033 if (mlocked)
1034 break; /* stop if actually mlocked page */
1035 }
890 } 1036 }
891 1037
892 page_unlock_anon_vma(anon_vma); 1038 page_unlock_anon_vma(anon_vma);
1039
1040 if (mlocked)
1041 ret = SWAP_MLOCK; /* actually mlocked the page */
1042 else if (ret == SWAP_MLOCK)
1043 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1044
893 return ret; 1045 return ret;
894} 1046}
895 1047
896/** 1048/**
897 * try_to_unmap_file - unmap file page using the object-based rmap method 1049 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
898 * @page: the page to unmap 1050 * @page: the page to unmap/unlock
899 * @migration: migration flag 1051 * @unlock: request for unlock rather than unmap [unlikely]
1052 * @migration: unmapping for migration - ignored if @unlock
900 * 1053 *
901 * Find all the mappings of a page using the mapping pointer and the vma chains 1054 * Find all the mappings of a page using the mapping pointer and the vma chains
902 * contained in the address_space struct it points to. 1055 * contained in the address_space struct it points to.
903 * 1056 *
904 * This function is only called from try_to_unmap for object-based pages. 1057 * This function is only called from try_to_unmap/try_to_munlock for
1058 * object-based pages.
1059 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1060 * where the page was found will be held for write. So, we won't recheck
1061 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1062 * VM_LOCKED.
905 */ 1063 */
906static int try_to_unmap_file(struct page *page, int migration) 1064static int try_to_unmap_file(struct page *page, int unlock, int migration)
907{ 1065{
908 struct address_space *mapping = page->mapping; 1066 struct address_space *mapping = page->mapping;
909 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1067 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -914,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration)
914 unsigned long max_nl_cursor = 0; 1072 unsigned long max_nl_cursor = 0;
915 unsigned long max_nl_size = 0; 1073 unsigned long max_nl_size = 0;
916 unsigned int mapcount; 1074 unsigned int mapcount;
1075 unsigned int mlocked = 0;
1076
1077 if (MLOCK_PAGES && unlikely(unlock))
1078 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
917 1079
918 spin_lock(&mapping->i_mmap_lock); 1080 spin_lock(&mapping->i_mmap_lock);
919 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1081 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
920 ret = try_to_unmap_one(page, vma, migration); 1082 if (MLOCK_PAGES && unlikely(unlock)) {
921 if (ret == SWAP_FAIL || !page_mapped(page)) 1083 if (!(vma->vm_flags & VM_LOCKED))
922 goto out; 1084 continue; /* must visit all vmas */
1085 ret = SWAP_MLOCK;
1086 } else {
1087 ret = try_to_unmap_one(page, vma, migration);
1088 if (ret == SWAP_FAIL || !page_mapped(page))
1089 goto out;
1090 }
1091 if (ret == SWAP_MLOCK) {
1092 mlocked = try_to_mlock_page(page, vma);
1093 if (mlocked)
1094 break; /* stop if actually mlocked page */
1095 }
923 } 1096 }
924 1097
1098 if (mlocked)
1099 goto out;
1100
925 if (list_empty(&mapping->i_mmap_nonlinear)) 1101 if (list_empty(&mapping->i_mmap_nonlinear))
926 goto out; 1102 goto out;
927 1103
928 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1104 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
929 shared.vm_set.list) { 1105 shared.vm_set.list) {
930 if ((vma->vm_flags & VM_LOCKED) && !migration) 1106 if (MLOCK_PAGES && unlikely(unlock)) {
1107 if (!(vma->vm_flags & VM_LOCKED))
1108 continue; /* must visit all vmas */
1109 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1110 goto out; /* no need to look further */
1111 }
1112 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
931 continue; 1113 continue;
932 cursor = (unsigned long) vma->vm_private_data; 1114 cursor = (unsigned long) vma->vm_private_data;
933 if (cursor > max_nl_cursor) 1115 if (cursor > max_nl_cursor)
@@ -937,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration)
937 max_nl_size = cursor; 1119 max_nl_size = cursor;
938 } 1120 }
939 1121
940 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 1122 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
941 ret = SWAP_FAIL; 1123 ret = SWAP_FAIL;
942 goto out; 1124 goto out;
943 } 1125 }
@@ -961,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration)
961 do { 1143 do {
962 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1144 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
963 shared.vm_set.list) { 1145 shared.vm_set.list) {
964 if ((vma->vm_flags & VM_LOCKED) && !migration) 1146 if (!MLOCK_PAGES && !migration &&
1147 (vma->vm_flags & VM_LOCKED))
965 continue; 1148 continue;
966 cursor = (unsigned long) vma->vm_private_data; 1149 cursor = (unsigned long) vma->vm_private_data;
967 while ( cursor < max_nl_cursor && 1150 while ( cursor < max_nl_cursor &&
968 cursor < vma->vm_end - vma->vm_start) { 1151 cursor < vma->vm_end - vma->vm_start) {
969 try_to_unmap_cluster(cursor, &mapcount, vma); 1152 ret = try_to_unmap_cluster(cursor, &mapcount,
1153 vma, page);
1154 if (ret == SWAP_MLOCK)
1155 mlocked = 2; /* to return below */
970 cursor += CLUSTER_SIZE; 1156 cursor += CLUSTER_SIZE;
971 vma->vm_private_data = (void *) cursor; 1157 vma->vm_private_data = (void *) cursor;
972 if ((int)mapcount <= 0) 1158 if ((int)mapcount <= 0)
@@ -987,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration)
987 vma->vm_private_data = NULL; 1173 vma->vm_private_data = NULL;
988out: 1174out:
989 spin_unlock(&mapping->i_mmap_lock); 1175 spin_unlock(&mapping->i_mmap_lock);
1176 if (mlocked)
1177 ret = SWAP_MLOCK; /* actually mlocked the page */
1178 else if (ret == SWAP_MLOCK)
1179 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
990 return ret; 1180 return ret;
991} 1181}
992 1182
@@ -1002,6 +1192,7 @@ out:
1002 * SWAP_SUCCESS - we succeeded in removing all mappings 1192 * SWAP_SUCCESS - we succeeded in removing all mappings
1003 * SWAP_AGAIN - we missed a mapping, try again later 1193 * SWAP_AGAIN - we missed a mapping, try again later
1004 * SWAP_FAIL - the page is unswappable 1194 * SWAP_FAIL - the page is unswappable
1195 * SWAP_MLOCK - page is mlocked.
1005 */ 1196 */
1006int try_to_unmap(struct page *page, int migration) 1197int try_to_unmap(struct page *page, int migration)
1007{ 1198{
@@ -1010,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration)
1010 BUG_ON(!PageLocked(page)); 1201 BUG_ON(!PageLocked(page));
1011 1202
1012 if (PageAnon(page)) 1203 if (PageAnon(page))
1013 ret = try_to_unmap_anon(page, migration); 1204 ret = try_to_unmap_anon(page, 0, migration);
1014 else 1205 else
1015 ret = try_to_unmap_file(page, migration); 1206 ret = try_to_unmap_file(page, 0, migration);
1016 1207 if (ret != SWAP_MLOCK && !page_mapped(page))
1017 if (!page_mapped(page))
1018 ret = SWAP_SUCCESS; 1208 ret = SWAP_SUCCESS;
1019 return ret; 1209 return ret;
1020} 1210}
1021 1211
1212#ifdef CONFIG_UNEVICTABLE_LRU
1213/**
1214 * try_to_munlock - try to munlock a page
1215 * @page: the page to be munlocked
1216 *
1217 * Called from munlock code. Checks all of the VMAs mapping the page
1218 * to make sure nobody else has this page mlocked. The page will be
1219 * returned with PG_mlocked cleared if no other vmas have it mlocked.
1220 *
1221 * Return values are:
1222 *
1223 * SWAP_SUCCESS - no vma's holding page mlocked.
1224 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1225 * SWAP_MLOCK - page is now mlocked.
1226 */
1227int try_to_munlock(struct page *page)
1228{
 /* the page must be locked and must not be on an LRU list */
1229 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1230
 /* reuse the unmap walkers with unlock = 1 and migration = 0 */
1231 if (PageAnon(page))
1232 return try_to_unmap_anon(page, 1, 0);
1233 else
1234 return try_to_unmap_file(page, 1, 0);
1235}
1236#endif
diff --git a/mm/shmem.c b/mm/shmem.c
index d87958a5f03e..d38d7e61fcd0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -199,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
199 199
200static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 200static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
201 .ra_pages = 0, /* No readahead */ 201 .ra_pages = 0, /* No readahead */
202 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 202 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
203 .unplug_io_fn = default_unplug_io_fn, 203 .unplug_io_fn = default_unplug_io_fn,
204}; 204};
205 205
@@ -1367,6 +1367,7 @@ repeat:
1367 error = -ENOMEM; 1367 error = -ENOMEM;
1368 goto failed; 1368 goto failed;
1369 } 1369 }
1370 SetPageSwapBacked(filepage);
1370 1371
1371 /* Precharge page while we can wait, compensate after */ 1372 /* Precharge page while we can wait, compensate after */
1372 error = mem_cgroup_cache_charge(filepage, current->mm, 1373 error = mem_cgroup_cache_charge(filepage, current->mm,
@@ -1476,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1476 if (!user_shm_lock(inode->i_size, user)) 1477 if (!user_shm_lock(inode->i_size, user))
1477 goto out_nomem; 1478 goto out_nomem;
1478 info->flags |= VM_LOCKED; 1479 info->flags |= VM_LOCKED;
1480 mapping_set_unevictable(file->f_mapping);
1479 } 1481 }
1480 if (!lock && (info->flags & VM_LOCKED) && user) { 1482 if (!lock && (info->flags & VM_LOCKED) && user) {
1481 user_shm_unlock(inode->i_size, user); 1483 user_shm_unlock(inode->i_size, user);
1482 info->flags &= ~VM_LOCKED; 1484 info->flags &= ~VM_LOCKED;
1485 mapping_clear_unevictable(file->f_mapping);
1486 scan_mapping_unevictable_pages(file->f_mapping);
1483 } 1487 }
1484 retval = 0; 1488 retval = 0;
1489
1485out_nomem: 1490out_nomem:
1486 spin_unlock(&info->lock); 1491 spin_unlock(&info->lock);
1487 return retval; 1492 return retval;
diff --git a/mm/swap.c b/mm/swap.c
index 9e0cb3118079..2152e48a7b8f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,11 +31,12 @@
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33 33
34#include "internal.h"
35
34/* How many pages do we try to swap or page in/out together? */ 36/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 37int page_cluster;
36 38
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs); 39static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 40static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
40 41
41/* 42/*
@@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec)
116 zone = pagezone; 117 zone = pagezone;
117 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
118 } 119 }
119 if (PageLRU(page) && !PageActive(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
120 list_move_tail(&page->lru, &zone->inactive_list); 121 int lru = page_is_file_cache(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list);
121 pgmoved++; 123 pgmoved++;
122 } 124 }
123 } 125 }
@@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
136void rotate_reclaimable_page(struct page *page) 138void rotate_reclaimable_page(struct page *page)
137{ 139{
138 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 140 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
139 PageLRU(page)) { 141 !PageUnevictable(page) && PageLRU(page)) {
140 struct pagevec *pvec; 142 struct pagevec *pvec;
141 unsigned long flags; 143 unsigned long flags;
142 144
@@ -157,12 +159,19 @@ void activate_page(struct page *page)
157 struct zone *zone = page_zone(page); 159 struct zone *zone = page_zone(page);
158 160
159 spin_lock_irq(&zone->lru_lock); 161 spin_lock_irq(&zone->lru_lock);
160 if (PageLRU(page) && !PageActive(page)) { 162 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
161 del_page_from_inactive_list(zone, page); 163 int file = page_is_file_cache(page);
164 int lru = LRU_BASE + file;
165 del_page_from_lru_list(zone, page, lru);
166
162 SetPageActive(page); 167 SetPageActive(page);
163 add_page_to_active_list(zone, page); 168 lru += LRU_ACTIVE;
169 add_page_to_lru_list(zone, page, lru);
164 __count_vm_event(PGACTIVATE); 170 __count_vm_event(PGACTIVATE);
165 mem_cgroup_move_lists(page, true); 171 mem_cgroup_move_lists(page, lru);
172
173 zone->recent_rotated[!!file]++;
174 zone->recent_scanned[!!file]++;
166 } 175 }
167 spin_unlock_irq(&zone->lru_lock); 176 spin_unlock_irq(&zone->lru_lock);
168} 177}
@@ -176,7 +185,8 @@ void activate_page(struct page *page)
176 */ 185 */
177void mark_page_accessed(struct page *page) 186void mark_page_accessed(struct page *page)
178{ 187{
179 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { 188 if (!PageActive(page) && !PageUnevictable(page) &&
189 PageReferenced(page) && PageLRU(page)) {
180 activate_page(page); 190 activate_page(page);
181 ClearPageReferenced(page); 191 ClearPageReferenced(page);
182 } else if (!PageReferenced(page)) { 192 } else if (!PageReferenced(page)) {
@@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page)
186 196
187EXPORT_SYMBOL(mark_page_accessed); 197EXPORT_SYMBOL(mark_page_accessed);
188 198
189/** 199void __lru_cache_add(struct page *page, enum lru_list lru)
190 * lru_cache_add: add a page to the page lists
191 * @page: the page to add
192 */
193void lru_cache_add(struct page *page)
194{ 200{
195 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 201 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
196 202
197 page_cache_get(page); 203 page_cache_get(page);
198 if (!pagevec_add(pvec, page)) 204 if (!pagevec_add(pvec, page))
199 __pagevec_lru_add(pvec); 205 ____pagevec_lru_add(pvec, lru);
200 put_cpu_var(lru_add_pvecs); 206 put_cpu_var(lru_add_pvecs);
201} 207}
202 208
203void lru_cache_add_active(struct page *page) 209/**
210 * lru_cache_add_lru - add a page to a page list
211 * @page: the page to be added to the LRU.
212 * @lru: the LRU list to which the page is added.
213 */
214void lru_cache_add_lru(struct page *page, enum lru_list lru)
204{ 215{
205 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); 216 if (PageActive(page)) {
217 VM_BUG_ON(PageUnevictable(page));
218 ClearPageActive(page);
219 } else if (PageUnevictable(page)) {
220 VM_BUG_ON(PageActive(page));
221 ClearPageUnevictable(page);
222 }
206 223
207 page_cache_get(page); 224 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
208 if (!pagevec_add(pvec, page)) 225 __lru_cache_add(page, lru);
209 __pagevec_lru_add_active(pvec); 226}
210 put_cpu_var(lru_add_active_pvecs); 227
228/**
229 * add_page_to_unevictable_list - add a page to the unevictable list
230 * @page: the page to be added to the unevictable list
231 *
232 * Add page directly to its zone's unevictable list. To avoid races with
233 * tasks that might be making the page evictable, through eg. munlock,
234 * munmap or exit, while it's not on the lru, we want to add the page
235 * while it's locked or otherwise "invisible" to other tasks. This is
236 * difficult to do when using the pagevec cache, so bypass that.
237 */
238void add_page_to_unevictable_list(struct page *page)
239{
240 struct zone *zone = page_zone(page);
241
242 spin_lock_irq(&zone->lru_lock);
243 SetPageUnevictable(page);
244 SetPageLRU(page);
245 add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
246 spin_unlock_irq(&zone->lru_lock);
247}
248
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto it's zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
211} 266}
212 267
213/* 268/*
@@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page)
217 */ 272 */
218static void drain_cpu_pagevecs(int cpu) 273static void drain_cpu_pagevecs(int cpu)
219{ 274{
275 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
220 struct pagevec *pvec; 276 struct pagevec *pvec;
277 int lru;
221 278
222 pvec = &per_cpu(lru_add_pvecs, cpu); 279 for_each_lru(lru) {
223 if (pagevec_count(pvec)) 280 pvec = &pvecs[lru - LRU_BASE];
224 __pagevec_lru_add(pvec); 281 if (pagevec_count(pvec))
225 282 ____pagevec_lru_add(pvec, lru);
226 pvec = &per_cpu(lru_add_active_pvecs, cpu); 283 }
227 if (pagevec_count(pvec))
228 __pagevec_lru_add_active(pvec);
229 284
230 pvec = &per_cpu(lru_rotate_pvecs, cpu); 285 pvec = &per_cpu(lru_rotate_pvecs, cpu);
231 if (pagevec_count(pvec)) { 286 if (pagevec_count(pvec)) {
@@ -244,7 +299,7 @@ void lru_add_drain(void)
244 put_cpu(); 299 put_cpu();
245} 300}
246 301
247#ifdef CONFIG_NUMA 302#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
248static void lru_add_drain_per_cpu(struct work_struct *dummy) 303static void lru_add_drain_per_cpu(struct work_struct *dummy)
249{ 304{
250 lru_add_drain(); 305 lru_add_drain();
@@ -308,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold)
308 363
309 if (PageLRU(page)) { 364 if (PageLRU(page)) {
310 struct zone *pagezone = page_zone(page); 365 struct zone *pagezone = page_zone(page);
366
311 if (pagezone != zone) { 367 if (pagezone != zone) {
312 if (zone) 368 if (zone)
313 spin_unlock_irqrestore(&zone->lru_lock, 369 spin_unlock_irqrestore(&zone->lru_lock,
@@ -380,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
380 * Add the passed pages to the LRU, then drop the caller's refcount 436 * Add the passed pages to the LRU, then drop the caller's refcount
381 * on them. Reinitialises the caller's pagevec. 437 * on them. Reinitialises the caller's pagevec.
382 */ 438 */
383void __pagevec_lru_add(struct pagevec *pvec) 439void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
384{ 440{
385 int i; 441 int i;
386 struct zone *zone = NULL; 442 struct zone *zone = NULL;
443 VM_BUG_ON(is_unevictable_lru(lru));
387 444
388 for (i = 0; i < pagevec_count(pvec); i++) { 445 for (i = 0; i < pagevec_count(pvec); i++) {
389 struct page *page = pvec->pages[i]; 446 struct page *page = pvec->pages[i];
@@ -395,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec)
395 zone = pagezone; 452 zone = pagezone;
396 spin_lock_irq(&zone->lru_lock); 453 spin_lock_irq(&zone->lru_lock);
397 } 454 }
455 VM_BUG_ON(PageActive(page));
456 VM_BUG_ON(PageUnevictable(page));
398 VM_BUG_ON(PageLRU(page)); 457 VM_BUG_ON(PageLRU(page));
399 SetPageLRU(page); 458 SetPageLRU(page);
400 add_page_to_inactive_list(zone, page); 459 if (is_active_lru(lru))
460 SetPageActive(page);
461 add_page_to_lru_list(zone, page, lru);
401 } 462 }
402 if (zone) 463 if (zone)
403 spin_unlock_irq(&zone->lru_lock); 464 spin_unlock_irq(&zone->lru_lock);
@@ -405,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec)
405 pagevec_reinit(pvec); 466 pagevec_reinit(pvec);
406} 467}
407 468
408EXPORT_SYMBOL(__pagevec_lru_add); 469EXPORT_SYMBOL(____pagevec_lru_add);
409 470
410void __pagevec_lru_add_active(struct pagevec *pvec) 471/*
472 * Try to drop buffers from the pages in a pagevec
473 */
474void pagevec_strip(struct pagevec *pvec)
411{ 475{
412 int i; 476 int i;
413 struct zone *zone = NULL;
414 477
415 for (i = 0; i < pagevec_count(pvec); i++) { 478 for (i = 0; i < pagevec_count(pvec); i++) {
416 struct page *page = pvec->pages[i]; 479 struct page *page = pvec->pages[i];
417 struct zone *pagezone = page_zone(page);
418 480
419 if (pagezone != zone) { 481 if (PagePrivate(page) && trylock_page(page)) {
420 if (zone) 482 if (PagePrivate(page))
421 spin_unlock_irq(&zone->lru_lock); 483 try_to_release_page(page, 0);
422 zone = pagezone; 484 unlock_page(page);
423 spin_lock_irq(&zone->lru_lock);
424 } 485 }
425 VM_BUG_ON(PageLRU(page));
426 SetPageLRU(page);
427 VM_BUG_ON(PageActive(page));
428 SetPageActive(page);
429 add_page_to_active_list(zone, page);
430 } 486 }
431 if (zone)
432 spin_unlock_irq(&zone->lru_lock);
433 release_pages(pvec->pages, pvec->nr, pvec->cold);
434 pagevec_reinit(pvec);
435} 487}
436 488
437/* 489/**
438 * Try to drop buffers from the pages in a pagevec 490 * pagevec_swap_free - try to free swap space from the pages in a pagevec
491 * @pvec: pagevec with swapcache pages to free the swap space of
492 *
493 * The caller needs to hold an extra reference to each page and
494 * not hold the page lock on the pages. This function uses a
495 * trylock on the page lock so it may not always free the swap
496 * space associated with a page.
439 */ 497 */
440void pagevec_strip(struct pagevec *pvec) 498void pagevec_swap_free(struct pagevec *pvec)
441{ 499{
442 int i; 500 int i;
443 501
444 for (i = 0; i < pagevec_count(pvec); i++) { 502 for (i = 0; i < pagevec_count(pvec); i++) {
445 struct page *page = pvec->pages[i]; 503 struct page *page = pvec->pages[i];
446 504
447 if (PagePrivate(page) && trylock_page(page)) { 505 if (PageSwapCache(page) && trylock_page(page)) {
448 if (PagePrivate(page)) 506 if (PageSwapCache(page))
449 try_to_release_page(page, 0); 507 remove_exclusive_swap_page_ref(page);
450 unlock_page(page); 508 unlock_page(page);
451 } 509 }
452 } 510 }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 797c3831cbec..3353c9029cef 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
33}; 33};
34 34
35static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37 .unplug_io_fn = swap_unplug_io_fn, 37 .unplug_io_fn = swap_unplug_io_fn,
38}; 38};
39 39
@@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
75 BUG_ON(!PageLocked(page)); 75 BUG_ON(!PageLocked(page));
76 BUG_ON(PageSwapCache(page)); 76 BUG_ON(PageSwapCache(page));
77 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
78 BUG_ON(!PageSwapBacked(page));
78 error = radix_tree_preload(gfp_mask); 79 error = radix_tree_preload(gfp_mask);
79 if (!error) { 80 if (!error) {
80 page_cache_get(page); 81 page_cache_get(page);
@@ -302,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
302 * re-using the just freed swap entry for an existing page. 303 * re-using the just freed swap entry for an existing page.
303 * May fail (-ENOMEM) if radix-tree node allocation failed. 304 * May fail (-ENOMEM) if radix-tree node allocation failed.
304 */ 305 */
305 set_page_locked(new_page); 306 __set_page_locked(new_page);
307 SetPageSwapBacked(new_page);
306 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 308 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
307 if (likely(!err)) { 309 if (likely(!err)) {
308 /* 310 /*
309 * Initiate read into locked page and return. 311 * Initiate read into locked page and return.
310 */ 312 */
311 lru_cache_add_active(new_page); 313 lru_cache_add_anon(new_page);
312 swap_readpage(NULL, new_page); 314 swap_readpage(NULL, new_page);
313 return new_page; 315 return new_page;
314 } 316 }
315 clear_page_locked(new_page); 317 ClearPageSwapBacked(new_page);
318 __clear_page_locked(new_page);
316 swap_free(entry); 319 swap_free(entry);
317 } while (err != -ENOMEM); 320 } while (err != -ENOMEM);
318 321
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1e330f2998fa..90cb67a5417c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page)
344 * Work out if there are any other processes sharing this 344 * Work out if there are any other processes sharing this
345 * swap cache page. Free it if you can. Return success. 345 * swap cache page. Free it if you can. Return success.
346 */ 346 */
347int remove_exclusive_swap_page(struct page *page) 347static int remove_exclusive_swap_page_count(struct page *page, int count)
348{ 348{
349 int retval; 349 int retval;
350 struct swap_info_struct * p; 350 struct swap_info_struct * p;
@@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page)
357 return 0; 357 return 0;
358 if (PageWriteback(page)) 358 if (PageWriteback(page))
359 return 0; 359 return 0;
360 if (page_count(page) != 2) /* 2: us + cache */ 360 if (page_count(page) != count) /* us + cache + ptes */
361 return 0; 361 return 0;
362 362
363 entry.val = page_private(page); 363 entry.val = page_private(page);
@@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page)
370 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
372 spin_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == count) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
375 SetPageDirty(page); 375 SetPageDirty(page);
376 retval = 1; 376 retval = 1;
@@ -388,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page)
388} 388}
389 389
390/* 390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the expected reference count to 2 for a page that is only in the
402 * swap cache, plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407}
408
409/*
391 * Free the swap entry like above, but also try to 410 * Free the swap entry like above, but also try to
392 * free the page cache entry if it is the last user. 411 * free the page cache entry if it is the last user.
393 */ 412 */
@@ -403,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry)
403 if (p) { 422 if (p) {
404 if (swap_entry_free(p, swp_offset(entry)) == 1) { 423 if (swap_entry_free(p, swp_offset(entry)) == 1) {
405 page = find_get_page(&swapper_space, entry.val); 424 page = find_get_page(&swapper_space, entry.val);
406 if (page && unlikely(!trylock_page(page))) { 425 if (page && !trylock_page(page)) {
407 page_cache_release(page); 426 page_cache_release(page);
408 page = NULL; 427 page = NULL;
409 } 428 }
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 8d7a27a6335c..3e67d575ee6e 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -95,6 +95,7 @@ put_dentry:
95put_memory: 95put_memory:
96 return ERR_PTR(error); 96 return ERR_PTR(error);
97} 97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
98 99
99/** 100/**
100 * shmem_zero_setup - setup a shared anonymous mapping 101 * shmem_zero_setup - setup a shared anonymous mapping
diff --git a/mm/truncate.c b/mm/truncate.c
index e83e4b114ef1..1229211104f8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -18,6 +18,7 @@
18#include <linux/task_io_accounting_ops.h> 18#include <linux/task_io_accounting_ops.h>
19#include <linux/buffer_head.h> /* grr. try_to_release_page, 19#include <linux/buffer_head.h> /* grr. try_to_release_page,
20 do_invalidatepage */ 20 do_invalidatepage */
21#include "internal.h"
21 22
22 23
23/** 24/**
@@ -103,6 +104,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
103 104
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 105 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 106
107 clear_page_mlock(page);
106 remove_from_page_cache(page); 108 remove_from_page_cache(page);
107 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
108 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
@@ -127,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
127 if (PagePrivate(page) && !try_to_release_page(page, 0)) 129 if (PagePrivate(page) && !try_to_release_page(page, 0))
128 return 0; 130 return 0;
129 131
132 clear_page_mlock(page);
130 ret = remove_mapping(mapping, page); 133 ret = remove_mapping(mapping, page);
131 134
132 return ret; 135 return ret;
@@ -352,6 +355,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
352 if (PageDirty(page)) 355 if (PageDirty(page))
353 goto failed; 356 goto failed;
354 357
358 clear_page_mlock(page);
355 BUG_ON(PagePrivate(page)); 359 BUG_ON(PagePrivate(page));
356 __remove_from_page_cache(page); 360 __remove_from_page_cache(page);
357 spin_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bba06c41fc59..65ae576030da 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,6 +8,7 @@
8 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Numa awareness, Christoph Lameter, SGI, June 2005
9 */ 9 */
10 10
11#include <linux/vmalloc.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/highmem.h> 14#include <linux/highmem.h>
@@ -16,18 +17,18 @@
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
17#include <linux/seq_file.h> 18#include <linux/seq_file.h>
18#include <linux/debugobjects.h> 19#include <linux/debugobjects.h>
19#include <linux/vmalloc.h>
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21#include <linux/list.h>
22#include <linux/rbtree.h>
23#include <linux/radix-tree.h>
24#include <linux/rcupdate.h>
21 25
26#include <asm/atomic.h>
22#include <asm/uaccess.h> 27#include <asm/uaccess.h>
23#include <asm/tlbflush.h> 28#include <asm/tlbflush.h>
24 29
25 30
26DEFINE_RWLOCK(vmlist_lock); 31/*** Page table manipulation functions ***/
27struct vm_struct *vmlist;
28
29static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
30 int node, void *caller);
31 32
32static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 33static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
33{ 34{
@@ -40,8 +41,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
40 } while (pte++, addr += PAGE_SIZE, addr != end); 41 } while (pte++, addr += PAGE_SIZE, addr != end);
41} 42}
42 43
43static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, 44static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
44 unsigned long end)
45{ 45{
46 pmd_t *pmd; 46 pmd_t *pmd;
47 unsigned long next; 47 unsigned long next;
@@ -55,8 +55,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
55 } while (pmd++, addr = next, addr != end); 55 } while (pmd++, addr = next, addr != end);
56} 56}
57 57
58static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, 58static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
59 unsigned long end)
60{ 59{
61 pud_t *pud; 60 pud_t *pud;
62 unsigned long next; 61 unsigned long next;
@@ -70,12 +69,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
70 } while (pud++, addr = next, addr != end); 69 } while (pud++, addr = next, addr != end);
71} 70}
72 71
73void unmap_kernel_range(unsigned long addr, unsigned long size) 72static void vunmap_page_range(unsigned long addr, unsigned long end)
74{ 73{
75 pgd_t *pgd; 74 pgd_t *pgd;
76 unsigned long next; 75 unsigned long next;
77 unsigned long start = addr;
78 unsigned long end = addr + size;
79 76
80 BUG_ON(addr >= end); 77 BUG_ON(addr >= end);
81 pgd = pgd_offset_k(addr); 78 pgd = pgd_offset_k(addr);
@@ -86,35 +83,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
86 continue; 83 continue;
87 vunmap_pud_range(pgd, addr, next); 84 vunmap_pud_range(pgd, addr, next);
88 } while (pgd++, addr = next, addr != end); 85 } while (pgd++, addr = next, addr != end);
89 flush_tlb_kernel_range(start, end);
90}
91
92static void unmap_vm_area(struct vm_struct *area)
93{
94 unmap_kernel_range((unsigned long)area->addr, area->size);
95} 86}
96 87
97static int vmap_pte_range(pmd_t *pmd, unsigned long addr, 88static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
98 unsigned long end, pgprot_t prot, struct page ***pages) 89 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
99{ 90{
100 pte_t *pte; 91 pte_t *pte;
101 92
93 /*
94 * nr is a running index into the array which helps higher level
95 * callers keep track of where we're up to.
96 */
97
102 pte = pte_alloc_kernel(pmd, addr); 98 pte = pte_alloc_kernel(pmd, addr);
103 if (!pte) 99 if (!pte)
104 return -ENOMEM; 100 return -ENOMEM;
105 do { 101 do {
106 struct page *page = **pages; 102 struct page *page = pages[*nr];
107 WARN_ON(!pte_none(*pte)); 103
108 if (!page) 104 if (WARN_ON(!pte_none(*pte)))
105 return -EBUSY;
106 if (WARN_ON(!page))
109 return -ENOMEM; 107 return -ENOMEM;
110 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 108 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
111 (*pages)++; 109 (*nr)++;
112 } while (pte++, addr += PAGE_SIZE, addr != end); 110 } while (pte++, addr += PAGE_SIZE, addr != end);
113 return 0; 111 return 0;
114} 112}
115 113
116static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, 114static int vmap_pmd_range(pud_t *pud, unsigned long addr,
117 unsigned long end, pgprot_t prot, struct page ***pages) 115 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
118{ 116{
119 pmd_t *pmd; 117 pmd_t *pmd;
120 unsigned long next; 118 unsigned long next;
@@ -124,14 +122,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
124 return -ENOMEM; 122 return -ENOMEM;
125 do { 123 do {
126 next = pmd_addr_end(addr, end); 124 next = pmd_addr_end(addr, end);
127 if (vmap_pte_range(pmd, addr, next, prot, pages)) 125 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
128 return -ENOMEM; 126 return -ENOMEM;
129 } while (pmd++, addr = next, addr != end); 127 } while (pmd++, addr = next, addr != end);
130 return 0; 128 return 0;
131} 129}
132 130
133static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, 131static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
134 unsigned long end, pgprot_t prot, struct page ***pages) 132 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
135{ 133{
136 pud_t *pud; 134 pud_t *pud;
137 unsigned long next; 135 unsigned long next;
@@ -141,57 +139,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
141 return -ENOMEM; 139 return -ENOMEM;
142 do { 140 do {
143 next = pud_addr_end(addr, end); 141 next = pud_addr_end(addr, end);
144 if (vmap_pmd_range(pud, addr, next, prot, pages)) 142 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
145 return -ENOMEM; 143 return -ENOMEM;
146 } while (pud++, addr = next, addr != end); 144 } while (pud++, addr = next, addr != end);
147 return 0; 145 return 0;
148} 146}
149 147
150int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 148/*
149 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
150 * will have pfns corresponding to the "pages" array.
151 *
152 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
153 */
154static int vmap_page_range(unsigned long addr, unsigned long end,
155 pgprot_t prot, struct page **pages)
151{ 156{
152 pgd_t *pgd; 157 pgd_t *pgd;
153 unsigned long next; 158 unsigned long next;
154 unsigned long addr = (unsigned long) area->addr; 159 int err = 0;
155 unsigned long end = addr + area->size - PAGE_SIZE; 160 int nr = 0;
156 int err;
157 161
158 BUG_ON(addr >= end); 162 BUG_ON(addr >= end);
159 pgd = pgd_offset_k(addr); 163 pgd = pgd_offset_k(addr);
160 do { 164 do {
161 next = pgd_addr_end(addr, end); 165 next = pgd_addr_end(addr, end);
162 err = vmap_pud_range(pgd, addr, next, prot, pages); 166 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
163 if (err) 167 if (err)
164 break; 168 break;
165 } while (pgd++, addr = next, addr != end); 169 } while (pgd++, addr = next, addr != end);
166 flush_cache_vmap((unsigned long) area->addr, end); 170 flush_cache_vmap(addr, end);
167 return err; 171
172 if (unlikely(err))
173 return err;
174 return nr;
175}
176
177static inline int is_vmalloc_or_module_addr(const void *x)
178{
179 /*
180 * x86-64 and sparc64 put modules in a special place,
181 * and fall back on vmalloc() if that fails. Others
182 * just put it in the vmalloc space.
183 */
184#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
185 unsigned long addr = (unsigned long)x;
186 if (addr >= MODULES_VADDR && addr < MODULES_END)
187 return 1;
188#endif
189 return is_vmalloc_addr(x);
168} 190}
169EXPORT_SYMBOL_GPL(map_vm_area);
170 191
171/* 192/*
172 * Map a vmalloc()-space virtual address to the physical page. 193 * Walk a vmap address to the struct page it maps.
173 */ 194 */
174struct page *vmalloc_to_page(const void *vmalloc_addr) 195struct page *vmalloc_to_page(const void *vmalloc_addr)
175{ 196{
176 unsigned long addr = (unsigned long) vmalloc_addr; 197 unsigned long addr = (unsigned long) vmalloc_addr;
177 struct page *page = NULL; 198 struct page *page = NULL;
178 pgd_t *pgd = pgd_offset_k(addr); 199 pgd_t *pgd = pgd_offset_k(addr);
179 pud_t *pud;
180 pmd_t *pmd;
181 pte_t *ptep, pte;
182 200
183 /* 201 /*
184 * XXX we might need to change this if we add VIRTUAL_BUG_ON for 202 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
185 * architectures that do not vmalloc module space 203 * architectures that do not vmalloc module space
186 */ 204 */
187 VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && 205 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
188 !is_module_address(addr));
189 206
190 if (!pgd_none(*pgd)) { 207 if (!pgd_none(*pgd)) {
191 pud = pud_offset(pgd, addr); 208 pud_t *pud = pud_offset(pgd, addr);
192 if (!pud_none(*pud)) { 209 if (!pud_none(*pud)) {
193 pmd = pmd_offset(pud, addr); 210 pmd_t *pmd = pmd_offset(pud, addr);
194 if (!pmd_none(*pmd)) { 211 if (!pmd_none(*pmd)) {
212 pte_t *ptep, pte;
213
195 ptep = pte_offset_map(pmd, addr); 214 ptep = pte_offset_map(pmd, addr);
196 pte = *ptep; 215 pte = *ptep;
197 if (pte_present(pte)) 216 if (pte_present(pte))
@@ -213,13 +232,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
213} 232}
214EXPORT_SYMBOL(vmalloc_to_pfn); 233EXPORT_SYMBOL(vmalloc_to_pfn);
215 234
216static struct vm_struct * 235
217__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, 236/*** Global kva allocator ***/
218 unsigned long end, int node, gfp_t gfp_mask, void *caller) 237
238#define VM_LAZY_FREE 0x01
239#define VM_LAZY_FREEING 0x02
240#define VM_VM_AREA 0x04
241
242struct vmap_area {
243 unsigned long va_start;
244 unsigned long va_end;
245 unsigned long flags;
246 struct rb_node rb_node; /* address sorted rbtree */
247 struct list_head list; /* address sorted list */
248 struct list_head purge_list; /* "lazy purge" list */
249 void *private;
250 struct rcu_head rcu_head;
251};
252
253static DEFINE_SPINLOCK(vmap_area_lock);
254static struct rb_root vmap_area_root = RB_ROOT;
255static LIST_HEAD(vmap_area_list);
256
257static struct vmap_area *__find_vmap_area(unsigned long addr)
219{ 258{
220 struct vm_struct **p, *tmp, *area; 259 struct rb_node *n = vmap_area_root.rb_node;
221 unsigned long align = 1; 260
261 while (n) {
262 struct vmap_area *va;
263
264 va = rb_entry(n, struct vmap_area, rb_node);
265 if (addr < va->va_start)
266 n = n->rb_left;
267 else if (addr > va->va_start)
268 n = n->rb_right;
269 else
270 return va;
271 }
272
273 return NULL;
274}
275
276static void __insert_vmap_area(struct vmap_area *va)
277{
278 struct rb_node **p = &vmap_area_root.rb_node;
279 struct rb_node *parent = NULL;
280 struct rb_node *tmp;
281
282 while (*p) {
283 struct vmap_area *tmp;
284
285 parent = *p;
286 tmp = rb_entry(parent, struct vmap_area, rb_node);
287 if (va->va_start < tmp->va_end)
288 p = &(*p)->rb_left;
289 else if (va->va_end > tmp->va_start)
290 p = &(*p)->rb_right;
291 else
292 BUG();
293 }
294
295 rb_link_node(&va->rb_node, parent, p);
296 rb_insert_color(&va->rb_node, &vmap_area_root);
297
298 /* address-sort this list so it is usable like the vmlist */
299 tmp = rb_prev(&va->rb_node);
300 if (tmp) {
301 struct vmap_area *prev;
302 prev = rb_entry(tmp, struct vmap_area, rb_node);
303 list_add_rcu(&va->list, &prev->list);
304 } else
305 list_add_rcu(&va->list, &vmap_area_list);
306}
307
308static void purge_vmap_area_lazy(void);
309
310/*
311 * Allocate a region of KVA of the specified size and alignment, within the
312 * vstart and vend.
313 */
314static struct vmap_area *alloc_vmap_area(unsigned long size,
315 unsigned long align,
316 unsigned long vstart, unsigned long vend,
317 int node, gfp_t gfp_mask)
318{
319 struct vmap_area *va;
320 struct rb_node *n;
222 unsigned long addr; 321 unsigned long addr;
322 int purged = 0;
323
324 BUG_ON(size & ~PAGE_MASK);
325
326 addr = ALIGN(vstart, align);
327
328 va = kmalloc_node(sizeof(struct vmap_area),
329 gfp_mask & GFP_RECLAIM_MASK, node);
330 if (unlikely(!va))
331 return ERR_PTR(-ENOMEM);
332
333retry:
334 spin_lock(&vmap_area_lock);
335 /* XXX: could have a last_hole cache */
336 n = vmap_area_root.rb_node;
337 if (n) {
338 struct vmap_area *first = NULL;
339
340 do {
341 struct vmap_area *tmp;
342 tmp = rb_entry(n, struct vmap_area, rb_node);
343 if (tmp->va_end >= addr) {
344 if (!first && tmp->va_start < addr + size)
345 first = tmp;
346 n = n->rb_left;
347 } else {
348 first = tmp;
349 n = n->rb_right;
350 }
351 } while (n);
352
353 if (!first)
354 goto found;
355
356 if (first->va_end < addr) {
357 n = rb_next(&first->rb_node);
358 if (n)
359 first = rb_entry(n, struct vmap_area, rb_node);
360 else
361 goto found;
362 }
363
364 while (addr + size >= first->va_start && addr + size <= vend) {
365 addr = ALIGN(first->va_end + PAGE_SIZE, align);
366
367 n = rb_next(&first->rb_node);
368 if (n)
369 first = rb_entry(n, struct vmap_area, rb_node);
370 else
371 goto found;
372 }
373 }
374found:
375 if (addr + size > vend) {
376 spin_unlock(&vmap_area_lock);
377 if (!purged) {
378 purge_vmap_area_lazy();
379 purged = 1;
380 goto retry;
381 }
382 if (printk_ratelimit())
383 printk(KERN_WARNING "vmap allocation failed: "
384 "use vmalloc=<size> to increase size.\n");
385 return ERR_PTR(-EBUSY);
386 }
387
388 BUG_ON(addr & (align-1));
389
390 va->va_start = addr;
391 va->va_end = addr + size;
392 va->flags = 0;
393 __insert_vmap_area(va);
394 spin_unlock(&vmap_area_lock);
395
396 return va;
397}
398
399static void rcu_free_va(struct rcu_head *head)
400{
401 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
402
403 kfree(va);
404}
405
406static void __free_vmap_area(struct vmap_area *va)
407{
408 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
409 rb_erase(&va->rb_node, &vmap_area_root);
410 RB_CLEAR_NODE(&va->rb_node);
411 list_del_rcu(&va->list);
412
413 call_rcu(&va->rcu_head, rcu_free_va);
414}
415
416/*
417 * Free a region of KVA allocated by alloc_vmap_area
418 */
419static void free_vmap_area(struct vmap_area *va)
420{
421 spin_lock(&vmap_area_lock);
422 __free_vmap_area(va);
423 spin_unlock(&vmap_area_lock);
424}
425
426/*
427 * Clear the pagetable entries of a given vmap_area
428 */
429static void unmap_vmap_area(struct vmap_area *va)
430{
431 vunmap_page_range(va->va_start, va->va_end);
432}
433
434/*
435 * lazy_max_pages is the maximum amount of virtual address space we gather up
436 * before attempting to purge with a TLB flush.
437 *
438 * There is a tradeoff here: a larger number will cover more kernel page tables
439 * and take slightly longer to purge, but it will linearly reduce the number of
440 * global TLB flushes that must be performed. It would seem natural to scale
441 * this number up linearly with the number of CPUs (because vmapping activity
442 * could also scale linearly with the number of CPUs), however it is likely
443 * that in practice, workloads might be constrained in other ways that mean
444 * vmap activity will not scale linearly with CPUs. Also, I want to be
445 * conservative and not introduce a big latency on huge systems, so go with
446 * a less aggressive log scale. It will still be an improvement over the old
447 * code, and it will be simple to change the scale factor if we find that it
448 * becomes a problem on bigger systems.
449 */
450static unsigned long lazy_max_pages(void)
451{
452 unsigned int log;
453
454 log = fls(num_online_cpus());
455
456 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
457}
458
459static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
460
461/*
462 * Purges all lazily-freed vmap areas.
463 *
464 * If sync is 0 then don't purge if there is already a purge in progress.
465 * If force_flush is 1, then flush kernel TLBs between *start and *end even
466 * if we found no lazy vmap areas to unmap (callers can use this to optimise
467 * their own TLB flushing).
468 * Returns with *start = min(*start, lowest purged address)
469 * *end = max(*end, highest purged address)
470 */
471static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
472 int sync, int force_flush)
473{
474 static DEFINE_SPINLOCK(purge_lock);
475 LIST_HEAD(valist);
476 struct vmap_area *va;
477 int nr = 0;
478
479 /*
480 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
481 * should not expect such behaviour. This just simplifies locking for
482 * the case that isn't actually used at the moment anyway.
483 */
484 if (!sync && !force_flush) {
485 if (!spin_trylock(&purge_lock))
486 return;
487 } else
488 spin_lock(&purge_lock);
489
490 rcu_read_lock();
491 list_for_each_entry_rcu(va, &vmap_area_list, list) {
492 if (va->flags & VM_LAZY_FREE) {
493 if (va->va_start < *start)
494 *start = va->va_start;
495 if (va->va_end > *end)
496 *end = va->va_end;
497 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
498 unmap_vmap_area(va);
499 list_add_tail(&va->purge_list, &valist);
500 va->flags |= VM_LAZY_FREEING;
501 va->flags &= ~VM_LAZY_FREE;
502 }
503 }
504 rcu_read_unlock();
505
506 if (nr) {
507 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
508 atomic_sub(nr, &vmap_lazy_nr);
509 }
510
511 if (nr || force_flush)
512 flush_tlb_kernel_range(*start, *end);
513
514 if (nr) {
515 spin_lock(&vmap_area_lock);
516 list_for_each_entry(va, &valist, purge_list)
517 __free_vmap_area(va);
518 spin_unlock(&vmap_area_lock);
519 }
520 spin_unlock(&purge_lock);
521}
522
523/*
524 * Kick off a purge of the outstanding lazy areas.
525 */
526static void purge_vmap_area_lazy(void)
527{
528 unsigned long start = ULONG_MAX, end = 0;
529
530 __purge_vmap_area_lazy(&start, &end, 0, 0);
531}
532
533/*
534 * Free and unmap a vmap area
535 */
536static void free_unmap_vmap_area(struct vmap_area *va)
537{
538 va->flags |= VM_LAZY_FREE;
539 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
540 if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
541 purge_vmap_area_lazy();
542}
543
544static struct vmap_area *find_vmap_area(unsigned long addr)
545{
546 struct vmap_area *va;
547
548 spin_lock(&vmap_area_lock);
549 va = __find_vmap_area(addr);
550 spin_unlock(&vmap_area_lock);
551
552 return va;
553}
554
/*
 * Lazily free the vmap area that starts at @addr. It is a BUG if no area
 * starts there.
 */
static void free_unmap_vmap_area_addr(unsigned long addr)
{
	struct vmap_area *va = find_vmap_area(addr);

	BUG_ON(!va);
	free_unmap_vmap_area(va);
}
563
564
565/*** Per cpu kva allocator ***/
566
/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 *
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
/* largest per-cpu allocation in pages: 256K with 4K pages on 64 bit */
#define VMAP_MAX_ALLOC		BITS_PER_LONG
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
/* pages per vmap block: VMALLOC_PAGES / NR_CPUS / 16, clamped to MIN..MAX */
#define VMAP_BBMAP_BITS		VMAP_MIN(VMAP_BBMAP_BITS_MAX,		\
					VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
						VMALLOC_PAGES / NR_CPUS / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)
593
594struct vmap_block_queue {
595 spinlock_t lock;
596 struct list_head free;
597 struct list_head dirty;
598 unsigned int nr_dirty;
599};
600
601struct vmap_block {
602 spinlock_t lock;
603 struct vmap_area *va;
604 struct vmap_block_queue *vbq;
605 unsigned long free, dirty;
606 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
607 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
608 union {
609 struct {
610 struct list_head free_list;
611 struct list_head dirty_list;
612 };
613 struct rcu_head rcu_head;
614 };
615};
616
617/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
618static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
619
620/*
621 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
622 * in the free path. Could get rid of this if we change the API to return a
623 * "cookie" from alloc, to be passed to free. But no big deal yet.
624 */
625static DEFINE_SPINLOCK(vmap_block_tree_lock);
626static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
627
628/*
629 * We should probably have a fallback mechanism to allocate virtual memory
630 * out of partially filled vmap blocks. However vmap block sizing should be
631 * fairly reasonable according to the vmalloc size, so it shouldn't be a
632 * big problem.
633 */
634
635static unsigned long addr_to_vb_idx(unsigned long addr)
636{
637 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
638 addr /= VMAP_BLOCK_SIZE;
639 return addr;
640}
641
642static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
643{
644 struct vmap_block_queue *vbq;
645 struct vmap_block *vb;
646 struct vmap_area *va;
647 unsigned long vb_idx;
648 int node, err;
649
650 node = numa_node_id();
651
652 vb = kmalloc_node(sizeof(struct vmap_block),
653 gfp_mask & GFP_RECLAIM_MASK, node);
654 if (unlikely(!vb))
655 return ERR_PTR(-ENOMEM);
656
657 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
658 VMALLOC_START, VMALLOC_END,
659 node, gfp_mask);
660 if (unlikely(IS_ERR(va))) {
661 kfree(vb);
662 return ERR_PTR(PTR_ERR(va));
663 }
664
665 err = radix_tree_preload(gfp_mask);
666 if (unlikely(err)) {
667 kfree(vb);
668 free_vmap_area(va);
669 return ERR_PTR(err);
670 }
671
672 spin_lock_init(&vb->lock);
673 vb->va = va;
674 vb->free = VMAP_BBMAP_BITS;
675 vb->dirty = 0;
676 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
677 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
678 INIT_LIST_HEAD(&vb->free_list);
679 INIT_LIST_HEAD(&vb->dirty_list);
680
681 vb_idx = addr_to_vb_idx(va->va_start);
682 spin_lock(&vmap_block_tree_lock);
683 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
684 spin_unlock(&vmap_block_tree_lock);
685 BUG_ON(err);
686 radix_tree_preload_end();
687
688 vbq = &get_cpu_var(vmap_block_queue);
689 vb->vbq = vbq;
690 spin_lock(&vbq->lock);
691 list_add(&vb->free_list, &vbq->free);
692 spin_unlock(&vbq->lock);
693 put_cpu_var(vmap_cpu_blocks);
694
695 return vb;
696}
697
698static void rcu_free_vb(struct rcu_head *head)
699{
700 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
701
702 kfree(vb);
703}
704
705static void free_vmap_block(struct vmap_block *vb)
706{
707 struct vmap_block *tmp;
708 unsigned long vb_idx;
709
710 spin_lock(&vb->vbq->lock);
711 if (!list_empty(&vb->free_list))
712 list_del(&vb->free_list);
713 if (!list_empty(&vb->dirty_list))
714 list_del(&vb->dirty_list);
715 spin_unlock(&vb->vbq->lock);
716
717 vb_idx = addr_to_vb_idx(vb->va->va_start);
718 spin_lock(&vmap_block_tree_lock);
719 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
720 spin_unlock(&vmap_block_tree_lock);
721 BUG_ON(tmp != vb);
722
723 free_unmap_vmap_area(vb->va);
724 call_rcu(&vb->rcu_head, rcu_free_vb);
725}
726
727static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
728{
729 struct vmap_block_queue *vbq;
730 struct vmap_block *vb;
731 unsigned long addr = 0;
732 unsigned int order;
733
734 BUG_ON(size & ~PAGE_MASK);
735 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
736 order = get_order(size);
737
738again:
739 rcu_read_lock();
740 vbq = &get_cpu_var(vmap_block_queue);
741 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
742 int i;
743
744 spin_lock(&vb->lock);
745 i = bitmap_find_free_region(vb->alloc_map,
746 VMAP_BBMAP_BITS, order);
747
748 if (i >= 0) {
749 addr = vb->va->va_start + (i << PAGE_SHIFT);
750 BUG_ON(addr_to_vb_idx(addr) !=
751 addr_to_vb_idx(vb->va->va_start));
752 vb->free -= 1UL << order;
753 if (vb->free == 0) {
754 spin_lock(&vbq->lock);
755 list_del_init(&vb->free_list);
756 spin_unlock(&vbq->lock);
757 }
758 spin_unlock(&vb->lock);
759 break;
760 }
761 spin_unlock(&vb->lock);
762 }
763 put_cpu_var(vmap_cpu_blocks);
764 rcu_read_unlock();
765
766 if (!addr) {
767 vb = new_vmap_block(gfp_mask);
768 if (IS_ERR(vb))
769 return vb;
770 goto again;
771 }
772
773 return (void *)addr;
774}
775
776static void vb_free(const void *addr, unsigned long size)
777{
778 unsigned long offset;
779 unsigned long vb_idx;
780 unsigned int order;
781 struct vmap_block *vb;
782
783 BUG_ON(size & ~PAGE_MASK);
784 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
785 order = get_order(size);
786
787 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
788
789 vb_idx = addr_to_vb_idx((unsigned long)addr);
790 rcu_read_lock();
791 vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
792 rcu_read_unlock();
793 BUG_ON(!vb);
794
795 spin_lock(&vb->lock);
796 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
797 if (!vb->dirty) {
798 spin_lock(&vb->vbq->lock);
799 list_add(&vb->dirty_list, &vb->vbq->dirty);
800 spin_unlock(&vb->vbq->lock);
801 }
802 vb->dirty += 1UL << order;
803 if (vb->dirty == VMAP_BBMAP_BITS) {
804 BUG_ON(vb->free || !list_empty(&vb->free_list));
805 spin_unlock(&vb->lock);
806 free_vmap_block(vb);
807 } else
808 spin_unlock(&vb->lock);
809}
810
811/**
812 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
813 *
814 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
815 * to amortize TLB flushing overheads. What this means is that any page you
816 * have now, may, in a former life, have been mapped into kernel virtual
817 * address by the vmap layer and so there might be some CPUs with TLB entries
818 * still referencing that page (additional to the regular 1:1 kernel mapping).
819 *
820 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
821 * be sure that none of the pages we have control over will have any aliases
822 * from the vmap layer.
823 */
824void vm_unmap_aliases(void)
825{
826 unsigned long start = ULONG_MAX, end = 0;
827 int cpu;
828 int flush = 0;
829
830 for_each_possible_cpu(cpu) {
831 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
832 struct vmap_block *vb;
833
834 rcu_read_lock();
835 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
836 int i;
837
838 spin_lock(&vb->lock);
839 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
840 while (i < VMAP_BBMAP_BITS) {
841 unsigned long s, e;
842 int j;
843 j = find_next_zero_bit(vb->dirty_map,
844 VMAP_BBMAP_BITS, i);
845
846 s = vb->va->va_start + (i << PAGE_SHIFT);
847 e = vb->va->va_start + (j << PAGE_SHIFT);
848 vunmap_page_range(s, e);
849 flush = 1;
850
851 if (s < start)
852 start = s;
853 if (e > end)
854 end = e;
855
856 i = j;
857 i = find_next_bit(vb->dirty_map,
858 VMAP_BBMAP_BITS, i);
859 }
860 spin_unlock(&vb->lock);
861 }
862 rcu_read_unlock();
863 }
864
865 __purge_vmap_area_lazy(&start, &end, 1, flush);
866}
867EXPORT_SYMBOL_GPL(vm_unmap_aliases);
868
869/**
870 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
871 * @mem: the pointer returned by vm_map_ram
872 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
873 */
874void vm_unmap_ram(const void *mem, unsigned int count)
875{
876 unsigned long size = count << PAGE_SHIFT;
877 unsigned long addr = (unsigned long)mem;
878
879 BUG_ON(!addr);
880 BUG_ON(addr < VMALLOC_START);
881 BUG_ON(addr > VMALLOC_END);
882 BUG_ON(addr & (PAGE_SIZE-1));
883
884 debug_check_no_locks_freed(mem, size);
885
886 if (likely(count <= VMAP_MAX_ALLOC))
887 vb_free(mem, size);
888 else
889 free_unmap_vmap_area_addr(addr);
890}
891EXPORT_SYMBOL(vm_unmap_ram);
892
893/**
894 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
895 * @pages: an array of pointers to the pages to be mapped
896 * @count: number of pages
897 * @node: prefer to allocate data structures on this node
898 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
899 * @returns: a pointer to the address that has been mapped, or NULL on failure
900 */
901void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
902{
903 unsigned long size = count << PAGE_SHIFT;
904 unsigned long addr;
905 void *mem;
906
907 if (likely(count <= VMAP_MAX_ALLOC)) {
908 mem = vb_alloc(size, GFP_KERNEL);
909 if (IS_ERR(mem))
910 return NULL;
911 addr = (unsigned long)mem;
912 } else {
913 struct vmap_area *va;
914 va = alloc_vmap_area(size, PAGE_SIZE,
915 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
916 if (IS_ERR(va))
917 return NULL;
918
919 addr = va->va_start;
920 mem = (void *)addr;
921 }
922 if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
923 vm_unmap_ram(mem, count);
924 return NULL;
925 }
926 return mem;
927}
928EXPORT_SYMBOL(vm_map_ram);
929
930void __init vmalloc_init(void)
931{
932 int i;
933
934 for_each_possible_cpu(i) {
935 struct vmap_block_queue *vbq;
936
937 vbq = &per_cpu(vmap_block_queue, i);
938 spin_lock_init(&vbq->lock);
939 INIT_LIST_HEAD(&vbq->free);
940 INIT_LIST_HEAD(&vbq->dirty);
941 vbq->nr_dirty = 0;
942 }
943}
944
/*
 * Clear the page table entries for [addr, addr + size) and flush the
 * kernel TLB for that range right away.
 */
void unmap_kernel_range(unsigned long addr, unsigned long size)
{
	const unsigned long stop = addr + size;

	vunmap_page_range(addr, stop);
	flush_tlb_kernel_range(addr, stop);
}
951
952int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
953{
954 unsigned long addr = (unsigned long)area->addr;
955 unsigned long end = addr + area->size - PAGE_SIZE;
956 int err;
957
958 err = vmap_page_range(addr, end, prot, *pages);
959 if (err > 0) {
960 *pages += err;
961 err = 0;
962 }
963
964 return err;
965}
966EXPORT_SYMBOL_GPL(map_vm_area);
967
968/*** Old vmalloc interfaces ***/
969DEFINE_RWLOCK(vmlist_lock);
970struct vm_struct *vmlist;
971
972static struct vm_struct *__get_vm_area_node(unsigned long size,
973 unsigned long flags, unsigned long start, unsigned long end,
974 int node, gfp_t gfp_mask, void *caller)
975{
976 static struct vmap_area *va;
977 struct vm_struct *area;
978 struct vm_struct *tmp, **p;
979 unsigned long align = 1;
223 980
224 BUG_ON(in_interrupt()); 981 BUG_ON(in_interrupt());
225 if (flags & VM_IOREMAP) { 982 if (flags & VM_IOREMAP) {
@@ -232,13 +989,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
232 989
233 align = 1ul << bit; 990 align = 1ul << bit;
234 } 991 }
235 addr = ALIGN(start, align); 992
236 size = PAGE_ALIGN(size); 993 size = PAGE_ALIGN(size);
237 if (unlikely(!size)) 994 if (unlikely(!size))
238 return NULL; 995 return NULL;
239 996
240 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 997 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
241
242 if (unlikely(!area)) 998 if (unlikely(!area))
243 return NULL; 999 return NULL;
244 1000
@@ -247,48 +1003,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
247 */ 1003 */
248 size += PAGE_SIZE; 1004 size += PAGE_SIZE;
249 1005
250 write_lock(&vmlist_lock); 1006 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
251 for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { 1007 if (IS_ERR(va)) {
252 if ((unsigned long)tmp->addr < addr) { 1008 kfree(area);
253 if((unsigned long)tmp->addr + tmp->size >= addr) 1009 return NULL;
254 addr = ALIGN(tmp->size +
255 (unsigned long)tmp->addr, align);
256 continue;
257 }
258 if ((size + addr) < addr)
259 goto out;
260 if (size + addr <= (unsigned long)tmp->addr)
261 goto found;
262 addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
263 if (addr > end - size)
264 goto out;
265 } 1010 }
266 if ((size + addr) < addr)
267 goto out;
268 if (addr > end - size)
269 goto out;
270
271found:
272 area->next = *p;
273 *p = area;
274 1011
275 area->flags = flags; 1012 area->flags = flags;
276 area->addr = (void *)addr; 1013 area->addr = (void *)va->va_start;
277 area->size = size; 1014 area->size = size;
278 area->pages = NULL; 1015 area->pages = NULL;
279 area->nr_pages = 0; 1016 area->nr_pages = 0;
280 area->phys_addr = 0; 1017 area->phys_addr = 0;
281 area->caller = caller; 1018 area->caller = caller;
1019 va->private = area;
1020 va->flags |= VM_VM_AREA;
1021
1022 write_lock(&vmlist_lock);
1023 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1024 if (tmp->addr >= area->addr)
1025 break;
1026 }
1027 area->next = *p;
1028 *p = area;
282 write_unlock(&vmlist_lock); 1029 write_unlock(&vmlist_lock);
283 1030
284 return area; 1031 return area;
285
286out:
287 write_unlock(&vmlist_lock);
288 kfree(area);
289 if (printk_ratelimit())
290 printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
291 return NULL;
292} 1032}
293 1033
294struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1034struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
@@ -328,39 +1068,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
328 gfp_mask, __builtin_return_address(0)); 1068 gfp_mask, __builtin_return_address(0));
329} 1069}
330 1070
331/* Caller must hold vmlist_lock */ 1071static struct vm_struct *find_vm_area(const void *addr)
332static struct vm_struct *__find_vm_area(const void *addr)
333{ 1072{
334 struct vm_struct *tmp; 1073 struct vmap_area *va;
335
336 for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
337 if (tmp->addr == addr)
338 break;
339 }
340
341 return tmp;
342}
343 1074
344/* Caller must hold vmlist_lock */ 1075 va = find_vmap_area((unsigned long)addr);
345static struct vm_struct *__remove_vm_area(const void *addr) 1076 if (va && va->flags & VM_VM_AREA)
346{ 1077 return va->private;
347 struct vm_struct **p, *tmp;
348 1078
349 for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
350 if (tmp->addr == addr)
351 goto found;
352 }
353 return NULL; 1079 return NULL;
354
355found:
356 unmap_vm_area(tmp);
357 *p = tmp->next;
358
359 /*
360 * Remove the guard page.
361 */
362 tmp->size -= PAGE_SIZE;
363 return tmp;
364} 1080}
365 1081
366/** 1082/**
@@ -373,11 +1089,24 @@ found:
373 */ 1089 */
374struct vm_struct *remove_vm_area(const void *addr) 1090struct vm_struct *remove_vm_area(const void *addr)
375{ 1091{
376 struct vm_struct *v; 1092 struct vmap_area *va;
377 write_lock(&vmlist_lock); 1093
378 v = __remove_vm_area(addr); 1094 va = find_vmap_area((unsigned long)addr);
379 write_unlock(&vmlist_lock); 1095 if (va && va->flags & VM_VM_AREA) {
380 return v; 1096 struct vm_struct *vm = va->private;
1097 struct vm_struct *tmp, **p;
1098 free_unmap_vmap_area(va);
1099 vm->size -= PAGE_SIZE;
1100
1101 write_lock(&vmlist_lock);
1102 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1103 ;
1104 *p = tmp->next;
1105 write_unlock(&vmlist_lock);
1106
1107 return vm;
1108 }
1109 return NULL;
381} 1110}
382 1111
383static void __vunmap(const void *addr, int deallocate_pages) 1112static void __vunmap(const void *addr, int deallocate_pages)
@@ -487,6 +1216,8 @@ void *vmap(struct page **pages, unsigned int count,
487} 1216}
488EXPORT_SYMBOL(vmap); 1217EXPORT_SYMBOL(vmap);
489 1218
1219static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1220 int node, void *caller);
490static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1221static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
491 pgprot_t prot, int node, void *caller) 1222 pgprot_t prot, int node, void *caller)
492{ 1223{
@@ -613,10 +1344,8 @@ void *vmalloc_user(unsigned long size)
613 1344
614 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1345 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
615 if (ret) { 1346 if (ret) {
616 write_lock(&vmlist_lock); 1347 area = find_vm_area(ret);
617 area = __find_vm_area(ret);
618 area->flags |= VM_USERMAP; 1348 area->flags |= VM_USERMAP;
619 write_unlock(&vmlist_lock);
620 } 1349 }
621 return ret; 1350 return ret;
622} 1351}
@@ -696,10 +1425,8 @@ void *vmalloc_32_user(unsigned long size)
696 1425
697 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1426 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
698 if (ret) { 1427 if (ret) {
699 write_lock(&vmlist_lock); 1428 area = find_vm_area(ret);
700 area = __find_vm_area(ret);
701 area->flags |= VM_USERMAP; 1429 area->flags |= VM_USERMAP;
702 write_unlock(&vmlist_lock);
703 } 1430 }
704 return ret; 1431 return ret;
705} 1432}
@@ -800,26 +1527,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
800 struct vm_struct *area; 1527 struct vm_struct *area;
801 unsigned long uaddr = vma->vm_start; 1528 unsigned long uaddr = vma->vm_start;
802 unsigned long usize = vma->vm_end - vma->vm_start; 1529 unsigned long usize = vma->vm_end - vma->vm_start;
803 int ret;
804 1530
805 if ((PAGE_SIZE-1) & (unsigned long)addr) 1531 if ((PAGE_SIZE-1) & (unsigned long)addr)
806 return -EINVAL; 1532 return -EINVAL;
807 1533
808 read_lock(&vmlist_lock); 1534 area = find_vm_area(addr);
809 area = __find_vm_area(addr);
810 if (!area) 1535 if (!area)
811 goto out_einval_locked; 1536 return -EINVAL;
812 1537
813 if (!(area->flags & VM_USERMAP)) 1538 if (!(area->flags & VM_USERMAP))
814 goto out_einval_locked; 1539 return -EINVAL;
815 1540
816 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 1541 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
817 goto out_einval_locked; 1542 return -EINVAL;
818 read_unlock(&vmlist_lock);
819 1543
820 addr += pgoff << PAGE_SHIFT; 1544 addr += pgoff << PAGE_SHIFT;
821 do { 1545 do {
822 struct page *page = vmalloc_to_page(addr); 1546 struct page *page = vmalloc_to_page(addr);
1547 int ret;
1548
823 ret = vm_insert_page(vma, uaddr, page); 1549 ret = vm_insert_page(vma, uaddr, page);
824 if (ret) 1550 if (ret)
825 return ret; 1551 return ret;
@@ -832,11 +1558,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
832 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 1558 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
833 vma->vm_flags |= VM_RESERVED; 1559 vma->vm_flags |= VM_RESERVED;
834 1560
835 return ret; 1561 return 0;
836
837out_einval_locked:
838 read_unlock(&vmlist_lock);
839 return -EINVAL;
840} 1562}
841EXPORT_SYMBOL(remap_vmalloc_range); 1563EXPORT_SYMBOL(remap_vmalloc_range);
842 1564
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ff1a58e7c10..3b5860294bb6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,7 @@
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
42 43
43#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
44#include <asm/div64.h> 45#include <asm/div64.h>
@@ -78,7 +79,7 @@ struct scan_control {
78 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 79 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
79 unsigned long *scanned, int order, int mode, 80 unsigned long *scanned, int order, int mode,
80 struct zone *z, struct mem_cgroup *mem_cont, 81 struct zone *z, struct mem_cgroup *mem_cont,
81 int active); 82 int active, int file);
82}; 83};
83 84
84#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 85#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -470,6 +471,85 @@ int remove_mapping(struct address_space *mapping, struct page *page)
470 return 0; 471 return 0;
471} 472}
472 473
/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
#ifdef CONFIG_UNEVICTABLE_LRU
void putback_lru_page(struct page *page)
{
	int lru;
	int active = !!TestClearPageActive(page);
	int was_unevictable = PageUnevictable(page);

	VM_BUG_ON(PageLRU(page));

	for (;;) {
		ClearPageUnevictable(page);

		if (page_evictable(page, NULL)) {
			/*
			 * For evictable pages, we can use the cache.
			 * In event of a race, worst case is we end up with an
			 * unevictable page on [in]active list.
			 * We know how to handle that.
			 */
			lru = active + page_is_file_cache(page);
			lru_cache_add_lru(page, lru);
		} else {
			/*
			 * Put unevictable pages directly on zone's unevictable
			 * list.
			 */
			lru = LRU_UNEVICTABLE;
			add_page_to_unevictable_list(page);
		}
		mem_cgroup_move_lists(page, lru);

		/*
		 * The page's status can change while we move it among the
		 * lru lists. An evictable page left on the unevictable list
		 * would never be freed, so check again after adding it.
		 */
		if (lru != LRU_UNEVICTABLE || !page_evictable(page, NULL))
			break;

		/*
		 * If isolation fails, someone else dropped this page from
		 * the LRU, so it will be freed or put back to the LRU again.
		 * There is nothing to do here.
		 */
		if (isolate_lru_page(page))
			break;

		/* isolated again: drop that reference and redo the putback */
		put_page(page);
	}

	if (was_unevictable && lru != LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGRESCUED);
	else if (!was_unevictable && lru == LRU_UNEVICTABLE)
		count_vm_event(UNEVICTABLE_PGCULLED);

	put_page(page);		/* drop ref from isolate */
}

#else /* CONFIG_UNEVICTABLE_LRU */

void putback_lru_page(struct page *page)
{
	int lru;

	VM_BUG_ON(PageLRU(page));

	/* no unevictable list: pick the list from the active and file bits */
	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
	lru_cache_add_lru(page, lru);
	mem_cgroup_move_lists(page, lru);
	put_page(page);
}
#endif /* CONFIG_UNEVICTABLE_LRU */
551
552
473/* 553/*
474 * shrink_page_list() returns the number of reclaimed pages 554 * shrink_page_list() returns the number of reclaimed pages
475 */ 555 */
@@ -503,6 +583,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
503 583
504 sc->nr_scanned++; 584 sc->nr_scanned++;
505 585
586 if (unlikely(!page_evictable(page, NULL)))
587 goto cull_mlocked;
588
506 if (!sc->may_swap && page_mapped(page)) 589 if (!sc->may_swap && page_mapped(page))
507 goto keep_locked; 590 goto keep_locked;
508 591
@@ -539,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
539 * Anonymous process memory has backing store? 622 * Anonymous process memory has backing store?
540 * Try to allocate it some swap space here. 623 * Try to allocate it some swap space here.
541 */ 624 */
542 if (PageAnon(page) && !PageSwapCache(page)) 625 if (PageAnon(page) && !PageSwapCache(page)) {
626 switch (try_to_munlock(page)) {
627 case SWAP_FAIL: /* shouldn't happen */
628 case SWAP_AGAIN:
629 goto keep_locked;
630 case SWAP_MLOCK:
631 goto cull_mlocked;
632 case SWAP_SUCCESS:
633 ; /* fall thru'; add to swap cache */
634 }
543 if (!add_to_swap(page, GFP_ATOMIC)) 635 if (!add_to_swap(page, GFP_ATOMIC))
544 goto activate_locked; 636 goto activate_locked;
637 }
545#endif /* CONFIG_SWAP */ 638#endif /* CONFIG_SWAP */
546 639
547 mapping = page_mapping(page); 640 mapping = page_mapping(page);
@@ -556,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
556 goto activate_locked; 649 goto activate_locked;
557 case SWAP_AGAIN: 650 case SWAP_AGAIN:
558 goto keep_locked; 651 goto keep_locked;
652 case SWAP_MLOCK:
653 goto cull_mlocked;
559 case SWAP_SUCCESS: 654 case SWAP_SUCCESS:
560 ; /* try to free the page below */ 655 ; /* try to free the page below */
561 } 656 }
@@ -602,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
602 * possible for a page to have PageDirty set, but it is actually 697 * possible for a page to have PageDirty set, but it is actually
603 * clean (all its buffers are clean). This happens if the 698 * clean (all its buffers are clean). This happens if the
604 * buffers were written out directly, with submit_bh(). ext3 699 * buffers were written out directly, with submit_bh(). ext3
605 * will do this, as well as the blockdev mapping. 700 * will do this, as well as the blockdev mapping.
606 * try_to_release_page() will discover that cleanness and will 701 * try_to_release_page() will discover that cleanness and will
607 * drop the buffers and mark the page clean - it can be freed. 702 * drop the buffers and mark the page clean - it can be freed.
608 * 703 *
@@ -637,7 +732,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
637 if (!mapping || !__remove_mapping(mapping, page)) 732 if (!mapping || !__remove_mapping(mapping, page))
638 goto keep_locked; 733 goto keep_locked;
639 734
640 unlock_page(page); 735 /*
736 * At this point, we have no other references and there is
737 * no way to pick any more up (removed from LRU, removed
738 * from pagecache). Can use non-atomic bitops now (and
739 * we obviously don't have to worry about waking up a process
741 * waiting on the page lock, because there are no references).
741 */
742 __clear_page_locked(page);
641free_it: 743free_it:
642 nr_reclaimed++; 744 nr_reclaimed++;
643 if (!pagevec_add(&freed_pvec, page)) { 745 if (!pagevec_add(&freed_pvec, page)) {
@@ -646,14 +748,23 @@ free_it:
646 } 748 }
647 continue; 749 continue;
648 750
751cull_mlocked:
752 unlock_page(page);
753 putback_lru_page(page);
754 continue;
755
649activate_locked: 756activate_locked:
757 /* Not a candidate for swapping, so reclaim swap space. */
758 if (PageSwapCache(page) && vm_swap_full())
759 remove_exclusive_swap_page_ref(page);
760 VM_BUG_ON(PageActive(page));
650 SetPageActive(page); 761 SetPageActive(page);
651 pgactivate++; 762 pgactivate++;
652keep_locked: 763keep_locked:
653 unlock_page(page); 764 unlock_page(page);
654keep: 765keep:
655 list_add(&page->lru, &ret_pages); 766 list_add(&page->lru, &ret_pages);
656 VM_BUG_ON(PageLRU(page)); 767 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
657 } 768 }
658 list_splice(&ret_pages, page_list); 769 list_splice(&ret_pages, page_list);
659 if (pagevec_count(&freed_pvec)) 770 if (pagevec_count(&freed_pvec))
@@ -677,7 +788,7 @@ keep:
677 * 788 *
678 * returns 0 on success, -ve errno on failure. 789 * returns 0 on success, -ve errno on failure.
679 */ 790 */
680int __isolate_lru_page(struct page *page, int mode) 791int __isolate_lru_page(struct page *page, int mode, int file)
681{ 792{
682 int ret = -EINVAL; 793 int ret = -EINVAL;
683 794
@@ -693,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode)
693 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 804 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
694 return ret; 805 return ret;
695 806
807 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
808 return ret;
809
810 /*
811 * When this function is being called for lumpy reclaim, we
812 * initially look into all LRU pages, active, inactive and
813 * unevictable; only give shrink_page_list evictable pages.
814 */
815 if (PageUnevictable(page))
816 return ret;
817
696 ret = -EBUSY; 818 ret = -EBUSY;
697 if (likely(get_page_unless_zero(page))) { 819 if (likely(get_page_unless_zero(page))) {
698 /* 820 /*
@@ -723,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode)
723 * @scanned: The number of pages that were scanned. 845 * @scanned: The number of pages that were scanned.
724 * @order: The caller's attempted allocation order 846 * @order: The caller's attempted allocation order
725 * @mode: One of the LRU isolation modes 847 * @mode: One of the LRU isolation modes
848 * @file: True [1] if isolating file [!anon] pages
726 * 849 *
727 * returns how many pages were moved onto *@dst. 850 * returns how many pages were moved onto *@dst.
728 */ 851 */
729static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 852static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
730 struct list_head *src, struct list_head *dst, 853 struct list_head *src, struct list_head *dst,
731 unsigned long *scanned, int order, int mode) 854 unsigned long *scanned, int order, int mode, int file)
732{ 855{
733 unsigned long nr_taken = 0; 856 unsigned long nr_taken = 0;
734 unsigned long scan; 857 unsigned long scan;
@@ -745,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
745 868
746 VM_BUG_ON(!PageLRU(page)); 869 VM_BUG_ON(!PageLRU(page));
747 870
748 switch (__isolate_lru_page(page, mode)) { 871 switch (__isolate_lru_page(page, mode, file)) {
749 case 0: 872 case 0:
750 list_move(&page->lru, dst); 873 list_move(&page->lru, dst);
751 nr_taken++; 874 nr_taken++;
@@ -788,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
788 break; 911 break;
789 912
790 cursor_page = pfn_to_page(pfn); 913 cursor_page = pfn_to_page(pfn);
914
791 /* Check that we have not crossed a zone boundary. */ 915 /* Check that we have not crossed a zone boundary. */
792 if (unlikely(page_zone_id(cursor_page) != zone_id)) 916 if (unlikely(page_zone_id(cursor_page) != zone_id))
793 continue; 917 continue;
794 switch (__isolate_lru_page(cursor_page, mode)) { 918 switch (__isolate_lru_page(cursor_page, mode, file)) {
795 case 0: 919 case 0:
796 list_move(&cursor_page->lru, dst); 920 list_move(&cursor_page->lru, dst);
797 nr_taken++; 921 nr_taken++;
@@ -802,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
802 /* else it is being freed elsewhere */ 926 /* else it is being freed elsewhere */
803 list_move(&cursor_page->lru, src); 927 list_move(&cursor_page->lru, src);
804 default: 928 default:
805 break; 929 break; /* ! on LRU or wrong list */
806 } 930 }
807 } 931 }
808 } 932 }
@@ -816,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr,
816 unsigned long *scanned, int order, 940 unsigned long *scanned, int order,
817 int mode, struct zone *z, 941 int mode, struct zone *z,
818 struct mem_cgroup *mem_cont, 942 struct mem_cgroup *mem_cont,
819 int active) 943 int active, int file)
820{ 944{
945 int lru = LRU_BASE;
821 if (active) 946 if (active)
822 return isolate_lru_pages(nr, &z->active_list, dst, 947 lru += LRU_ACTIVE;
823 scanned, order, mode); 948 if (file)
824 else 949 lru += LRU_FILE;
825 return isolate_lru_pages(nr, &z->inactive_list, dst, 950 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
826 scanned, order, mode); 951 mode, !!file);
827} 952}
828 953
829/* 954/*
830 * clear_active_flags() is a helper for shrink_active_list(), clearing 955 * clear_active_flags() is a helper for shrink_active_list(), clearing
831 * any active bits from the pages in the list. 956 * any active bits from the pages in the list.
832 */ 957 */
833static unsigned long clear_active_flags(struct list_head *page_list) 958static unsigned long clear_active_flags(struct list_head *page_list,
959 unsigned int *count)
834{ 960{
835 int nr_active = 0; 961 int nr_active = 0;
962 int lru;
836 struct page *page; 963 struct page *page;
837 964
838 list_for_each_entry(page, page_list, lru) 965 list_for_each_entry(page, page_list, lru) {
966 lru = page_is_file_cache(page);
839 if (PageActive(page)) { 967 if (PageActive(page)) {
968 lru += LRU_ACTIVE;
840 ClearPageActive(page); 969 ClearPageActive(page);
841 nr_active++; 970 nr_active++;
842 } 971 }
972 count[lru]++;
973 }
843 974
844 return nr_active; 975 return nr_active;
845} 976}
846 977
978/**
979 * isolate_lru_page - tries to isolate a page from its LRU list
980 * @page: page to isolate from its LRU list
981 *
982 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
983 * vmstat statistic corresponding to whatever LRU list the page was on.
984 *
985 * Returns 0 if the page was removed from an LRU list.
986 * Returns -EBUSY if the page was not on an LRU list (or get_page_unless_zero() failed).
987 *
988 * The returned page will have PageLRU() cleared. If it was found on
989 * the active list, it will have PageActive set. If it was found on
990 * the unevictable list, it will have the PageUnevictable bit set. That flag
991 * may need to be cleared by the caller before letting the page go.
992 *
993 * The vmstat statistic corresponding to the list on which the page was
994 * found will be decremented.
995 *
996 * Restrictions:
997 * (1) Must be called with an elevated refcount on the page. This is a
998 * fundamental difference from isolate_lru_pages (which is called
999 * without a stable reference).
1000 * (2) the lru_lock must not be held.
1001 * (3) interrupts must be enabled.
1002 */
1003int isolate_lru_page(struct page *page)
1004{
1005 int ret = -EBUSY;
1006
1007 if (PageLRU(page)) { /* unlocked pre-check; repeated under the lock below */
1008 struct zone *zone = page_zone(page);
1009
1010 spin_lock_irq(&zone->lru_lock);
1011 if (PageLRU(page) && get_page_unless_zero(page)) { /* re-check under lru_lock and take a ref */
1012 int lru = page_lru(page);
1013 ret = 0;
1014 ClearPageLRU(page);
1015
1016 del_page_from_lru_list(zone, page, lru);
1017 }
1018 spin_unlock_irq(&zone->lru_lock);
1019 }
1020 return ret;
1021}
1022
847/* 1023/*
848 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1024 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
849 * of reclaimed pages 1025 * of reclaimed pages
850 */ 1026 */
851static unsigned long shrink_inactive_list(unsigned long max_scan, 1027static unsigned long shrink_inactive_list(unsigned long max_scan,
852 struct zone *zone, struct scan_control *sc) 1028 struct zone *zone, struct scan_control *sc,
1029 int priority, int file)
853{ 1030{
854 LIST_HEAD(page_list); 1031 LIST_HEAD(page_list);
855 struct pagevec pvec; 1032 struct pagevec pvec;
@@ -866,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
866 unsigned long nr_scan; 1043 unsigned long nr_scan;
867 unsigned long nr_freed; 1044 unsigned long nr_freed;
868 unsigned long nr_active; 1045 unsigned long nr_active;
1046 unsigned int count[NR_LRU_LISTS] = { 0, };
1047 int mode = ISOLATE_INACTIVE;
1048
1049 /*
1050 * If we need a large contiguous chunk of memory, or have
1051 * trouble getting a small set of contiguous pages, we
1052 * will reclaim both active and inactive pages.
1053 *
1054 * We use the same threshold as pageout congestion_wait below.
1055 */
1056 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1057 mode = ISOLATE_BOTH;
1058 else if (sc->order && priority < DEF_PRIORITY - 2)
1059 mode = ISOLATE_BOTH;
869 1060
870 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1061 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
871 &page_list, &nr_scan, sc->order, 1062 &page_list, &nr_scan, sc->order, mode,
872 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 1063 zone, sc->mem_cgroup, 0, file);
873 ISOLATE_BOTH : ISOLATE_INACTIVE, 1064 nr_active = clear_active_flags(&page_list, count);
874 zone, sc->mem_cgroup, 0);
875 nr_active = clear_active_flags(&page_list);
876 __count_vm_events(PGDEACTIVATE, nr_active); 1065 __count_vm_events(PGDEACTIVATE, nr_active);
877 1066
878 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 1067 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
879 __mod_zone_page_state(zone, NR_INACTIVE, 1068 -count[LRU_ACTIVE_FILE]);
880 -(nr_taken - nr_active)); 1069 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
881 if (scan_global_lru(sc)) 1070 -count[LRU_INACTIVE_FILE]);
1071 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1072 -count[LRU_ACTIVE_ANON]);
1073 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1074 -count[LRU_INACTIVE_ANON]);
1075
1076 if (scan_global_lru(sc)) {
882 zone->pages_scanned += nr_scan; 1077 zone->pages_scanned += nr_scan;
1078 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1079 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1080 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1081 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1082 }
883 spin_unlock_irq(&zone->lru_lock); 1083 spin_unlock_irq(&zone->lru_lock);
884 1084
885 nr_scanned += nr_scan; 1085 nr_scanned += nr_scan;
@@ -899,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
899 * The attempt at page out may have made some 1099 * The attempt at page out may have made some
900 * of the pages active, mark them inactive again. 1100 * of the pages active, mark them inactive again.
901 */ 1101 */
902 nr_active = clear_active_flags(&page_list); 1102 nr_active = clear_active_flags(&page_list, count);
903 count_vm_events(PGDEACTIVATE, nr_active); 1103 count_vm_events(PGDEACTIVATE, nr_active);
904 1104
905 nr_freed += shrink_page_list(&page_list, sc, 1105 nr_freed += shrink_page_list(&page_list, sc,
@@ -924,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
924 * Put back any unfreeable pages. 1124 * Put back any unfreeable pages.
925 */ 1125 */
926 while (!list_empty(&page_list)) { 1126 while (!list_empty(&page_list)) {
1127 int lru;
927 page = lru_to_page(&page_list); 1128 page = lru_to_page(&page_list);
928 VM_BUG_ON(PageLRU(page)); 1129 VM_BUG_ON(PageLRU(page));
929 SetPageLRU(page);
930 list_del(&page->lru); 1130 list_del(&page->lru);
931 if (PageActive(page)) 1131 if (unlikely(!page_evictable(page, NULL))) {
932 add_page_to_active_list(zone, page); 1132 spin_unlock_irq(&zone->lru_lock);
933 else 1133 putback_lru_page(page);
934 add_page_to_inactive_list(zone, page); 1134 spin_lock_irq(&zone->lru_lock);
1135 continue;
1136 }
1137 SetPageLRU(page);
1138 lru = page_lru(page);
1139 add_page_to_lru_list(zone, page, lru);
1140 mem_cgroup_move_lists(page, lru);
1141 if (PageActive(page) && scan_global_lru(sc)) {
1142 int file = !!page_is_file_cache(page);
1143 zone->recent_rotated[file]++;
1144 }
935 if (!pagevec_add(&pvec, page)) { 1145 if (!pagevec_add(&pvec, page)) {
936 spin_unlock_irq(&zone->lru_lock); 1146 spin_unlock_irq(&zone->lru_lock);
937 __pagevec_release(&pvec); 1147 __pagevec_release(&pvec);
@@ -962,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
962 1172
963static inline int zone_is_near_oom(struct zone *zone) 1173static inline int zone_is_near_oom(struct zone *zone)
964{ 1174{
965 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) 1175 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
966 + zone_page_state(zone, NR_INACTIVE))*3;
967}
968
969/*
970 * Determine we should try to reclaim mapped pages.
971 * This is called only when sc->mem_cgroup is NULL.
972 */
973static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
974 int priority)
975{
976 long mapped_ratio;
977 long distress;
978 long swap_tendency;
979 long imbalance;
980 int reclaim_mapped = 0;
981 int prev_priority;
982
983 if (scan_global_lru(sc) && zone_is_near_oom(zone))
984 return 1;
985 /*
986 * `distress' is a measure of how much trouble we're having
987 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
988 */
989 if (scan_global_lru(sc))
990 prev_priority = zone->prev_priority;
991 else
992 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
993
994 distress = 100 >> min(prev_priority, priority);
995
996 /*
997 * The point of this algorithm is to decide when to start
998 * reclaiming mapped memory instead of just pagecache. Work out
999 * how much memory
1000 * is mapped.
1001 */
1002 if (scan_global_lru(sc))
1003 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
1004 global_page_state(NR_ANON_PAGES)) * 100) /
1005 vm_total_pages;
1006 else
1007 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
1008
1009 /*
1010 * Now decide how much we really want to unmap some pages. The
1011 * mapped ratio is downgraded - just because there's a lot of
1012 * mapped memory doesn't necessarily mean that page reclaim
1013 * isn't succeeding.
1014 *
1015 * The distress ratio is important - we don't want to start
1016 * going oom.
1017 *
1018 * A 100% value of vm_swappiness overrides this algorithm
1019 * altogether.
1020 */
1021 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
1022
1023 /*
1024 * If there's huge imbalance between active and inactive
1025 * (think active 100 times larger than inactive) we should
1026 * become more permissive, or the system will take too much
1027 * cpu before it start swapping during memory pressure.
1028 * Distress is about avoiding early-oom, this is about
1029 * making swappiness graceful despite setting it to low
1030 * values.
1031 *
1032 * Avoid div by zero with nr_inactive+1, and max resulting
1033 * value is vm_total_pages.
1034 */
1035 if (scan_global_lru(sc)) {
1036 imbalance = zone_page_state(zone, NR_ACTIVE);
1037 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1038 } else
1039 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1040
1041 /*
1042 * Reduce the effect of imbalance if swappiness is low,
1043 * this means for a swappiness very low, the imbalance
1044 * must be much higher than 100 for this logic to make
1045 * the difference.
1046 *
1047 * Max temporary value is vm_total_pages*100.
1048 */
1049 imbalance *= (vm_swappiness + 1);
1050 imbalance /= 100;
1051
1052 /*
1053 * If not much of the ram is mapped, makes the imbalance
1054 * less relevant, it's high priority we refill the inactive
1055 * list with mapped pages only in presence of high ratio of
1056 * mapped pages.
1057 *
1058 * Max temporary value is vm_total_pages*100.
1059 */
1060 imbalance *= mapped_ratio;
1061 imbalance /= 100;
1062
1063 /* apply imbalance feedback to swap_tendency */
1064 swap_tendency += imbalance;
1065
1066 /*
1067 * Now use this metric to decide whether to start moving mapped
1068 * memory onto the inactive list.
1069 */
1070 if (swap_tendency >= 100)
1071 reclaim_mapped = 1;
1072
1073 return reclaim_mapped;
1074} 1176}
1075 1177
1076/* 1178/*
@@ -1093,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
1093 1195
1094 1196
1095static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1197static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1096 struct scan_control *sc, int priority) 1198 struct scan_control *sc, int priority, int file)
1097{ 1199{
1098 unsigned long pgmoved; 1200 unsigned long pgmoved;
1099 int pgdeactivate = 0; 1201 int pgdeactivate = 0;
1100 unsigned long pgscanned; 1202 unsigned long pgscanned;
1101 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1203 LIST_HEAD(l_hold); /* The pages which were snipped off */
1102 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 1204 LIST_HEAD(l_inactive);
1103 LIST_HEAD(l_active); /* Pages to go onto the active_list */
1104 struct page *page; 1205 struct page *page;
1105 struct pagevec pvec; 1206 struct pagevec pvec;
1106 int reclaim_mapped = 0; 1207 enum lru_list lru;
1107
1108 if (sc->may_swap)
1109 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
1110 1208
1111 lru_add_drain(); 1209 lru_add_drain();
1112 spin_lock_irq(&zone->lru_lock); 1210 spin_lock_irq(&zone->lru_lock);
1113 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1211 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1114 ISOLATE_ACTIVE, zone, 1212 ISOLATE_ACTIVE, zone,
1115 sc->mem_cgroup, 1); 1213 sc->mem_cgroup, 1, file);
1116 /* 1214 /*
1117 * zone->pages_scanned is used for detect zone's oom 1215 * zone->pages_scanned is used for detect zone's oom
1118 * mem_cgroup remembers nr_scan by itself. 1216 * mem_cgroup remembers nr_scan by itself.
1119 */ 1217 */
1120 if (scan_global_lru(sc)) 1218 if (scan_global_lru(sc)) {
1121 zone->pages_scanned += pgscanned; 1219 zone->pages_scanned += pgscanned;
1220 zone->recent_scanned[!!file] += pgmoved;
1221 }
1122 1222
1123 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1223 if (file)
1224 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1225 else
1226 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1124 spin_unlock_irq(&zone->lru_lock); 1227 spin_unlock_irq(&zone->lru_lock);
1125 1228
1229 pgmoved = 0;
1126 while (!list_empty(&l_hold)) { 1230 while (!list_empty(&l_hold)) {
1127 cond_resched(); 1231 cond_resched();
1128 page = lru_to_page(&l_hold); 1232 page = lru_to_page(&l_hold);
1129 list_del(&page->lru); 1233 list_del(&page->lru);
1130 if (page_mapped(page)) { 1234
1131 if (!reclaim_mapped || 1235 if (unlikely(!page_evictable(page, NULL))) {
1132 (total_swap_pages == 0 && PageAnon(page)) || 1236 putback_lru_page(page);
1133 page_referenced(page, 0, sc->mem_cgroup)) { 1237 continue;
1134 list_add(&page->lru, &l_active);
1135 continue;
1136 }
1137 } 1238 }
1239
1240 /* page_referenced clears PageReferenced */
1241 if (page_mapping_inuse(page) &&
1242 page_referenced(page, 0, sc->mem_cgroup))
1243 pgmoved++;
1244
1138 list_add(&page->lru, &l_inactive); 1245 list_add(&page->lru, &l_inactive);
1139 } 1246 }
1140 1247
1248 /*
1249 * Count referenced pages from currently used mappings as
1250 * rotated, even though they are moved to the inactive list.
1251 * This helps balance scan pressure between file and anonymous
1252 * pages in get_scan_ratio.
1253 */
1254 zone->recent_rotated[!!file] += pgmoved;
1255
1256 /*
1257 * Move the pages to the [file or anon] inactive list.
1258 */
1141 pagevec_init(&pvec, 1); 1259 pagevec_init(&pvec, 1);
1260
1142 pgmoved = 0; 1261 pgmoved = 0;
1262 lru = LRU_BASE + file * LRU_FILE;
1143 spin_lock_irq(&zone->lru_lock); 1263 spin_lock_irq(&zone->lru_lock);
1144 while (!list_empty(&l_inactive)) { 1264 while (!list_empty(&l_inactive)) {
1145 page = lru_to_page(&l_inactive); 1265 page = lru_to_page(&l_inactive);
@@ -1149,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1149 VM_BUG_ON(!PageActive(page)); 1269 VM_BUG_ON(!PageActive(page));
1150 ClearPageActive(page); 1270 ClearPageActive(page);
1151 1271
1152 list_move(&page->lru, &zone->inactive_list); 1272 list_move(&page->lru, &zone->lru[lru].list);
1153 mem_cgroup_move_lists(page, false); 1273 mem_cgroup_move_lists(page, lru);
1154 pgmoved++; 1274 pgmoved++;
1155 if (!pagevec_add(&pvec, page)) { 1275 if (!pagevec_add(&pvec, page)) {
1156 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1276 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1157 spin_unlock_irq(&zone->lru_lock); 1277 spin_unlock_irq(&zone->lru_lock);
1158 pgdeactivate += pgmoved; 1278 pgdeactivate += pgmoved;
1159 pgmoved = 0; 1279 pgmoved = 0;
@@ -1163,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1163 spin_lock_irq(&zone->lru_lock); 1283 spin_lock_irq(&zone->lru_lock);
1164 } 1284 }
1165 } 1285 }
1166 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1286 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1167 pgdeactivate += pgmoved; 1287 pgdeactivate += pgmoved;
1168 if (buffer_heads_over_limit) { 1288 if (buffer_heads_over_limit) {
1169 spin_unlock_irq(&zone->lru_lock); 1289 spin_unlock_irq(&zone->lru_lock);
1170 pagevec_strip(&pvec); 1290 pagevec_strip(&pvec);
1171 spin_lock_irq(&zone->lru_lock); 1291 spin_lock_irq(&zone->lru_lock);
1172 } 1292 }
1173
1174 pgmoved = 0;
1175 while (!list_empty(&l_active)) {
1176 page = lru_to_page(&l_active);
1177 prefetchw_prev_lru_page(page, &l_active, flags);
1178 VM_BUG_ON(PageLRU(page));
1179 SetPageLRU(page);
1180 VM_BUG_ON(!PageActive(page));
1181
1182 list_move(&page->lru, &zone->active_list);
1183 mem_cgroup_move_lists(page, true);
1184 pgmoved++;
1185 if (!pagevec_add(&pvec, page)) {
1186 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1187 pgmoved = 0;
1188 spin_unlock_irq(&zone->lru_lock);
1189 __pagevec_release(&pvec);
1190 spin_lock_irq(&zone->lru_lock);
1191 }
1192 }
1193 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1194
1195 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1293 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1196 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1294 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1197 spin_unlock_irq(&zone->lru_lock); 1295 spin_unlock_irq(&zone->lru_lock);
1296 if (vm_swap_full())
1297 pagevec_swap_free(&pvec);
1198 1298
1199 pagevec_release(&pvec); 1299 pagevec_release(&pvec);
1200} 1300}
1201 1301
1302static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1303 struct zone *zone, struct scan_control *sc, int priority)
1304{
1305 int file = is_file_lru(lru); /* presumably non-zero for the file lists; passed on as the file/anon selector */
1306
1307 if (lru == LRU_ACTIVE_FILE) {
1308 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1309 return 0; /* shrink_active_list() returns nothing, so report 0 reclaimed */
1310 }
1311
1312 if (lru == LRU_ACTIVE_ANON &&
1313 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1314 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1315 return 0;
1316 }
1317 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); /* all other cases, including LRU_ACTIVE_ANON when the test above fails, scan the inactive list of that type */
1318}
1319
1320/*
1321 * Determine how aggressively the anon and file LRU lists should be
1322 * scanned. The relative value of each set of LRU lists is determined
1323 * by looking at the fraction of the scanned pages that were rotated
1324 * back onto the active list instead of being evicted.
1325 *
1326 * percent[0] is the pressure to put on ram/swap backed memory and
1327 * percent[1] the pressure on the file LRUs; they always sum to 100.
1328 */
1329static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1330 unsigned long *percent)
1331{
1332 unsigned long anon, file, free;
1333 unsigned long anon_prio, file_prio;
1334 unsigned long ap, fp;
1335
1336 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1337 zone_page_state(zone, NR_INACTIVE_ANON);
1338 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1339 zone_page_state(zone, NR_INACTIVE_FILE);
1340 free = zone_page_state(zone, NR_FREE_PAGES);
1341
1342 /* If we have no swap space, do not bother scanning anon pages. */
1343 if (nr_swap_pages <= 0) {
1344 percent[0] = 0;
1345 percent[1] = 100;
1346 return;
1347 }
1348
1349 /* If we have very few page cache pages, force-scan anon pages. */
1350 if (unlikely(file + free <= zone->pages_high)) {
1351 percent[0] = 100;
1352 percent[1] = 0;
1353 return;
1354 }
1355
1356 /*
1357 * OK, so we have swap space and a fair amount of page cache
1358 * pages. We use the recently rotated / recently scanned
1359 * ratios to determine how valuable each cache is.
1360 *
1361 * Because workloads change over time (and to avoid overflow)
1362 * we keep these statistics as a floating average, which ends
1363 * up weighing recent references more than old ones.
1364 *
1365 * anon in [0], file in [1]
1366 */
1367 if (unlikely(zone->recent_scanned[0] > anon / 4)) {
1368 spin_lock_irq(&zone->lru_lock);
1369 zone->recent_scanned[0] /= 2;
1370 zone->recent_rotated[0] /= 2;
1371 spin_unlock_irq(&zone->lru_lock);
1372 }
1373
1374 if (unlikely(zone->recent_scanned[1] > file / 4)) {
1375 spin_lock_irq(&zone->lru_lock);
1376 zone->recent_scanned[1] /= 2;
1377 zone->recent_rotated[1] /= 2;
1378 spin_unlock_irq(&zone->lru_lock);
1379 }
1380
1381 /*
1382 * With swappiness at 100, anonymous and file have the same priority.
1383 * This scanning priority is essentially the inverse of IO cost.
1384 */
1385 anon_prio = sc->swappiness;
1386 file_prio = 200 - sc->swappiness;
1387
1388 /*
1389 *                  anon       recent_rotated[0]
1390 * %anon = 100 * ----------- / ----------------- * IO cost
1391 *               anon + file      rotate_sum
1392 */
1393 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
1394 ap /= zone->recent_rotated[0] + 1; /* +1 keeps the divisor non-zero */
1395
1396 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
1397 fp /= zone->recent_rotated[1] + 1; /* +1 keeps the divisor non-zero */
1398
1399 /* Normalize to percentages */
1400 percent[0] = 100 * ap / (ap + fp + 1);
1401 percent[1] = 100 - percent[0];
1402}
1403
1404
1202/* 1405/*
1203 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1406 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1204 */ 1407 */
1205static unsigned long shrink_zone(int priority, struct zone *zone, 1408static unsigned long shrink_zone(int priority, struct zone *zone,
1206 struct scan_control *sc) 1409 struct scan_control *sc)
1207{ 1410{
1208 unsigned long nr_active; 1411 unsigned long nr[NR_LRU_LISTS];
1209 unsigned long nr_inactive;
1210 unsigned long nr_to_scan; 1412 unsigned long nr_to_scan;
1211 unsigned long nr_reclaimed = 0; 1413 unsigned long nr_reclaimed = 0;
1414 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1415 enum lru_list l;
1212 1416
1213 if (scan_global_lru(sc)) { 1417 get_scan_ratio(zone, sc, percent);
1214 /*
1215 * Add one to nr_to_scan just to make sure that the kernel
1216 * will slowly sift through the active list.
1217 */
1218 zone->nr_scan_active +=
1219 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
1220 nr_active = zone->nr_scan_active;
1221 zone->nr_scan_inactive +=
1222 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1223 nr_inactive = zone->nr_scan_inactive;
1224 if (nr_inactive >= sc->swap_cluster_max)
1225 zone->nr_scan_inactive = 0;
1226 else
1227 nr_inactive = 0;
1228
1229 if (nr_active >= sc->swap_cluster_max)
1230 zone->nr_scan_active = 0;
1231 else
1232 nr_active = 0;
1233 } else {
1234 /*
1235 * This reclaim occurs not because zone memory shortage but
1236 * because memory controller hits its limit.
1237 * Then, don't modify zone reclaim related data.
1238 */
1239 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
1240 zone, priority);
1241
1242 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
1243 zone, priority);
1244 }
1245 1418
1419 for_each_evictable_lru(l) {
1420 if (scan_global_lru(sc)) {
1421 int file = is_file_lru(l);
1422 int scan;
1246 1423
1247 while (nr_active || nr_inactive) { 1424 scan = zone_page_state(zone, NR_LRU_BASE + l);
1248 if (nr_active) { 1425 if (priority) {
1249 nr_to_scan = min(nr_active, 1426 scan >>= priority;
1250 (unsigned long)sc->swap_cluster_max); 1427 scan = (scan * percent[file]) / 100;
1251 nr_active -= nr_to_scan; 1428 }
1252 shrink_active_list(nr_to_scan, zone, sc, priority); 1429 zone->lru[l].nr_scan += scan;
1430 nr[l] = zone->lru[l].nr_scan;
1431 if (nr[l] >= sc->swap_cluster_max)
1432 zone->lru[l].nr_scan = 0;
1433 else
1434 nr[l] = 0;
1435 } else {
1436 /*
1437 * This reclaim occurs not because zone memory shortage
1438 * but because memory controller hits its limit.
1439 * Don't modify zone reclaim related data.
1440 */
1441 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1442 priority, l);
1253 } 1443 }
1444 }
1254 1445
1255 if (nr_inactive) { 1446 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1256 nr_to_scan = min(nr_inactive, 1447 nr[LRU_INACTIVE_FILE]) { /* NOTE(review): nr[LRU_ACTIVE_ANON] is not tested here, so leftover active-anon counts are dropped - confirm this is intended */
1448 for_each_evictable_lru(l) {
1449 if (nr[l]) {
1450 nr_to_scan = min(nr[l],
1257 (unsigned long)sc->swap_cluster_max); 1451 (unsigned long)sc->swap_cluster_max);
1258 nr_inactive -= nr_to_scan; 1452 nr[l] -= nr_to_scan;
1259 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 1453
1260 sc); 1454 nr_reclaimed += shrink_list(l, nr_to_scan,
1455 zone, sc, priority);
1456 }
1261 } 1457 }
1262 } 1458 }
1263 1459
1460 /*
1461 * Even if we did not try to evict anon pages at all, we want to
1462 * rebalance the anon lru active/inactive ratio.
1463 */
1464 if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
1465 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1466 else if (!scan_global_lru(sc)) /* NOTE(review): unreachable - !scan_global_lru(sc) already satisfies the first test; both arms make the same call */
1467 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1468
1264 throttle_vm_writeout(sc->gfp_mask); 1469 throttle_vm_writeout(sc->gfp_mask);
1265 return nr_reclaimed; 1470 return nr_reclaimed;
1266} 1471}
@@ -1321,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1321 1526
1322 return nr_reclaimed; 1527 return nr_reclaimed;
1323} 1528}
1324 1529
1325/* 1530/*
1326 * This is the main entry point to direct page reclaim. 1531 * This is the main entry point to direct page reclaim.
1327 * 1532 *
@@ -1364,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1364 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1569 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1365 continue; 1570 continue;
1366 1571
1367 lru_pages += zone_page_state(zone, NR_ACTIVE) 1572 lru_pages += zone_lru_pages(zone);
1368 + zone_page_state(zone, NR_INACTIVE);
1369 } 1573 }
1370 } 1574 }
1371 1575
@@ -1555,6 +1759,14 @@ loop_again:
1555 priority != DEF_PRIORITY) 1759 priority != DEF_PRIORITY)
1556 continue; 1760 continue;
1557 1761
1762 /*
1763 * Do some background aging of the anon list, to give
1764 * pages a chance to be referenced before reclaiming.
1765 */
1766 if (inactive_anon_is_low(zone))
1767 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1768 &sc, priority, 0);
1769
1558 if (!zone_watermark_ok(zone, order, zone->pages_high, 1770 if (!zone_watermark_ok(zone, order, zone->pages_high,
1559 0, 0)) { 1771 0, 0)) {
1560 end_zone = i; 1772 end_zone = i;
@@ -1567,8 +1779,7 @@ loop_again:
1567 for (i = 0; i <= end_zone; i++) { 1779 for (i = 0; i <= end_zone; i++) {
1568 struct zone *zone = pgdat->node_zones + i; 1780 struct zone *zone = pgdat->node_zones + i;
1569 1781
1570 lru_pages += zone_page_state(zone, NR_ACTIVE) 1782 lru_pages += zone_lru_pages(zone);
1571 + zone_page_state(zone, NR_INACTIVE);
1572 } 1783 }
1573 1784
1574 /* 1785 /*
@@ -1612,8 +1823,7 @@ loop_again:
1612 if (zone_is_all_unreclaimable(zone)) 1823 if (zone_is_all_unreclaimable(zone))
1613 continue; 1824 continue;
1614 if (nr_slab == 0 && zone->pages_scanned >= 1825 if (nr_slab == 0 && zone->pages_scanned >=
1615 (zone_page_state(zone, NR_ACTIVE) 1826 (zone_lru_pages(zone) * 6))
1616 + zone_page_state(zone, NR_INACTIVE)) * 6)
1617 zone_set_flag(zone, 1827 zone_set_flag(zone,
1618 ZONE_ALL_UNRECLAIMABLE); 1828 ZONE_ALL_UNRECLAIMABLE);
1619 /* 1829 /*
@@ -1667,7 +1877,7 @@ out:
1667 1877
1668/* 1878/*
1669 * The background pageout daemon, started as a kernel thread 1879 * The background pageout daemon, started as a kernel thread
1670 * from the init process. 1880 * from the init process.
1671 * 1881 *
1672 * This basically trickles out pages so that we have _some_ 1882 * This basically trickles out pages so that we have _some_
1673 * free memory available even if there is no other activity 1883 * free memory available even if there is no other activity
@@ -1761,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order)
1761 wake_up_interruptible(&pgdat->kswapd_wait); 1971 wake_up_interruptible(&pgdat->kswapd_wait);
1762} 1972}
1763 1973
1974unsigned long global_lru_pages(void)
1975{
1976 return global_page_state(NR_ACTIVE_ANON)
1977 + global_page_state(NR_ACTIVE_FILE)
1978 + global_page_state(NR_INACTIVE_ANON)
1979 + global_page_state(NR_INACTIVE_FILE);
1980}
1981
1764#ifdef CONFIG_PM 1982#ifdef CONFIG_PM
1765/* 1983/*
1766 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1984 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
@@ -1774,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1774{ 1992{
1775 struct zone *zone; 1993 struct zone *zone;
1776 unsigned long nr_to_scan, ret = 0; 1994 unsigned long nr_to_scan, ret = 0;
1995 enum lru_list l;
1777 1996
1778 for_each_zone(zone) { 1997 for_each_zone(zone) {
1779 1998
@@ -1783,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1783 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 2002 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
1784 continue; 2003 continue;
1785 2004
1786 /* For pass = 0 we don't shrink the active list */ 2005 for_each_evictable_lru(l) {
1787 if (pass > 0) { 2006 /* For pass = 0, we don't shrink the active list */
1788 zone->nr_scan_active += 2007 if (pass == 0 &&
1789 (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; 2008 (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
1790 if (zone->nr_scan_active >= nr_pages || pass > 3) { 2009 continue;
1791 zone->nr_scan_active = 0; 2010
2011 zone->lru[l].nr_scan +=
2012 (zone_page_state(zone, NR_LRU_BASE + l)
2013 >> prio) + 1;
2014 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
2015 zone->lru[l].nr_scan = 0;
1792 nr_to_scan = min(nr_pages, 2016 nr_to_scan = min(nr_pages,
1793 zone_page_state(zone, NR_ACTIVE)); 2017 zone_page_state(zone,
1794 shrink_active_list(nr_to_scan, zone, sc, prio); 2018 NR_LRU_BASE + l));
2019 ret += shrink_list(l, nr_to_scan, zone,
2020 sc, prio);
2021 if (ret >= nr_pages)
2022 return ret;
1795 } 2023 }
1796 } 2024 }
1797
1798 zone->nr_scan_inactive +=
1799 (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
1800 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1801 zone->nr_scan_inactive = 0;
1802 nr_to_scan = min(nr_pages,
1803 zone_page_state(zone, NR_INACTIVE));
1804 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1805 if (ret >= nr_pages)
1806 return ret;
1807 }
1808 } 2025 }
1809 2026
1810 return ret; 2027 return ret;
1811} 2028}
1812 2029
1813static unsigned long count_lru_pages(void)
1814{
1815 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
1816}
1817
1818/* 2030/*
1819 * Try to free `nr_pages' of memory, system-wide, and return the number of 2031 * Try to free `nr_pages' of memory, system-wide, and return the number of
1820 * freed pages. 2032 * freed pages.
@@ -1840,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1840 2052
1841 current->reclaim_state = &reclaim_state; 2053 current->reclaim_state = &reclaim_state;
1842 2054
1843 lru_pages = count_lru_pages(); 2055 lru_pages = global_lru_pages();
1844 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2056 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1845 /* If slab caches are huge, it's better to hit them first */ 2057 /* If slab caches are huge, it's better to hit them first */
1846 while (nr_slab >= lru_pages) { 2058 while (nr_slab >= lru_pages) {
@@ -1883,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1883 2095
1884 reclaim_state.reclaimed_slab = 0; 2096 reclaim_state.reclaimed_slab = 0;
1885 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2097 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1886 count_lru_pages()); 2098 global_lru_pages());
1887 ret += reclaim_state.reclaimed_slab; 2099 ret += reclaim_state.reclaimed_slab;
1888 if (ret >= nr_pages) 2100 if (ret >= nr_pages)
1889 goto out; 2101 goto out;
@@ -1900,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1900 if (!ret) { 2112 if (!ret) {
1901 do { 2113 do {
1902 reclaim_state.reclaimed_slab = 0; 2114 reclaim_state.reclaimed_slab = 0;
1903 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); 2115 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
1904 ret += reclaim_state.reclaimed_slab; 2116 ret += reclaim_state.reclaimed_slab;
1905 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 2117 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1906 } 2118 }
@@ -2128,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2128 return ret; 2340 return ret;
2129} 2341}
2130#endif 2342#endif
2343
2344#ifdef CONFIG_UNEVICTABLE_LRU
/*
 * page_evictable - test whether a page is evictable
 * @page: the page to test
 * @vma: the VMA in which the page is or will be mapped, may be NULL
 *
 * Decide whether @page belongs on the active/inactive lists (evictable)
 * or on the unevictable list.  The vma argument is !NULL when called from
 * the fault path to determine how to instantiate a new page.
 *
 * A page is reported as not evictable when:
 * (1) the page's mapping is marked unevictable, or
 * (2) the page is mlocked, or @vma is given and the page is part of an
 *     mlocked VMA.
 *
 * Returns 1 if the page is evictable, 0 otherwise.
 */
int page_evictable(struct page *page, struct vm_area_struct *vma)
{
	int evictable = 1;

	if (mapping_unevictable(page_mapping(page)))
		evictable = 0;
	else if (PageMlocked(page))
		evictable = 0;
	else if (vma && is_mlocked_vma(vma, page))
		evictable = 0;

	return evictable;
}
2370
2371static void show_page_path(struct page *page)
2372{
2373 char buf[256];
2374 if (page_is_file_cache(page)) {
2375 struct address_space *mapping = page->mapping;
2376 struct dentry *dentry;
2377 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
2378
2379 spin_lock(&mapping->i_mmap_lock);
2380 dentry = d_find_alias(mapping->host);
2381 printk(KERN_INFO "rescued: %s %lu\n",
2382 dentry_path(dentry, buf, 256), pgoff);
2383 spin_unlock(&mapping->i_mmap_lock);
2384 } else {
2385#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU)
2386 struct anon_vma *anon_vma;
2387 struct vm_area_struct *vma;
2388
2389 anon_vma = page_lock_anon_vma(page);
2390 if (!anon_vma)
2391 return;
2392
2393 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
2394 printk(KERN_INFO "rescued: anon %s\n",
2395 vma->vm_mm->owner->comm);
2396 break;
2397 }
2398 page_unlock_anon_vma(anon_vma);
2399#endif
2400 }
2401}
2402
2403
2404/**
2405 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
2406 * @page: page to check evictability and move to appropriate lru list
2407 * @zone: zone page is in
2408 *
2409 * Checks a page for evictability and moves the page to the appropriate
2410 * zone lru list.
2411 *
2412 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
2413 * have PageUnevictable set.
2414 */
2415static void check_move_unevictable_page(struct page *page, struct zone *zone)
2416{
2417 VM_BUG_ON(PageActive(page));
2418
2419retry:
2420 ClearPageUnevictable(page);
2421 if (page_evictable(page, NULL)) {
2422 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
2423
2424 show_page_path(page);
2425
2426 __dec_zone_state(zone, NR_UNEVICTABLE);
2427 list_move(&page->lru, &zone->lru[l].list);
2428 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2429 __count_vm_event(UNEVICTABLE_PGRESCUED);
2430 } else {
2431 /*
2432 * rotate unevictable list
2433 */
2434 SetPageUnevictable(page);
2435 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2436 if (page_evictable(page, NULL))
2437 goto retry;
2438 }
2439}
2440
2441/**
2442 * scan_mapping_unevictable_pages - scan an address space for evictable pages
2443 * @mapping: struct address_space to scan for evictable pages
2444 *
2445 * Scan all pages in mapping. Check unevictable pages for
2446 * evictability and move them to the appropriate zone lru list.
2447 */
2448void scan_mapping_unevictable_pages(struct address_space *mapping)
2449{
2450 pgoff_t next = 0;
2451 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
2452 PAGE_CACHE_SHIFT;
2453 struct zone *zone;
2454 struct pagevec pvec;
2455
2456 if (mapping->nrpages == 0)
2457 return;
2458
2459 pagevec_init(&pvec, 0);
2460 while (next < end &&
2461 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
2462 int i;
2463 int pg_scanned = 0;
2464
2465 zone = NULL;
2466
2467 for (i = 0; i < pagevec_count(&pvec); i++) {
2468 struct page *page = pvec.pages[i];
2469 pgoff_t page_index = page->index;
2470 struct zone *pagezone = page_zone(page);
2471
2472 pg_scanned++;
2473 if (page_index > next)
2474 next = page_index;
2475 next++;
2476
2477 if (pagezone != zone) {
2478 if (zone)
2479 spin_unlock_irq(&zone->lru_lock);
2480 zone = pagezone;
2481 spin_lock_irq(&zone->lru_lock);
2482 }
2483
2484 if (PageLRU(page) && PageUnevictable(page))
2485 check_move_unevictable_page(page, zone);
2486 }
2487 if (zone)
2488 spin_unlock_irq(&zone->lru_lock);
2489 pagevec_release(&pvec);
2490
2491 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
2492 }
2493
2494}
2495
2496/**
2497 * scan_zone_unevictable_pages - check unevictable list for evictable pages
2498 * @zone - zone of which to scan the unevictable list
2499 *
2500 * Scan @zone's unevictable LRU lists to check for pages that have become
2501 * evictable. Move those that have to @zone's inactive list where they
2502 * become candidates for reclaim, unless shrink_inactive_zone() decides
2503 * to reactivate them. Pages that are still unevictable are rotated
2504 * back onto @zone's unevictable list.
2505 */
2506#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2507void scan_zone_unevictable_pages(struct zone *zone)
2508{
2509 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2510 unsigned long scan;
2511 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
2512
2513 while (nr_to_scan > 0) {
2514 unsigned long batch_size = min(nr_to_scan,
2515 SCAN_UNEVICTABLE_BATCH_SIZE);
2516
2517 spin_lock_irq(&zone->lru_lock);
2518 for (scan = 0; scan < batch_size; scan++) {
2519 struct page *page = lru_to_page(l_unevictable);
2520
2521 if (!trylock_page(page))
2522 continue;
2523
2524 prefetchw_prev_lru_page(page, l_unevictable, flags);
2525
2526 if (likely(PageLRU(page) && PageUnevictable(page)))
2527 check_move_unevictable_page(page, zone);
2528
2529 unlock_page(page);
2530 }
2531 spin_unlock_irq(&zone->lru_lock);
2532
2533 nr_to_scan -= batch_size;
2534 }
2535}
2536
2537
2538/**
2539 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
2540 *
2541 * A really big hammer: scan all zones' unevictable LRU lists to check for
2542 * pages that have become evictable. Move those back to the zones'
2543 * inactive list where they become candidates for reclaim.
2544 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
2545 * and we add swap to the system. As such, it runs in the context of a task
2546 * that has possibly/probably made some previously unevictable pages
2547 * evictable.
2548 */
2549void scan_all_zones_unevictable_pages(void)
2550{
2551 struct zone *zone;
2552
2553 for_each_zone(zone) {
2554 scan_zone_unevictable_pages(zone);
2555 }
2556}
2557
2558/*
2559 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
2560 * all nodes' unevictable lists for evictable pages
2561 */
2562unsigned long scan_unevictable_pages;
2563
2564int scan_unevictable_handler(struct ctl_table *table, int write,
2565 struct file *file, void __user *buffer,
2566 size_t *length, loff_t *ppos)
2567{
2568 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2569
2570 if (write && *(unsigned long *)table->data)
2571 scan_all_zones_unevictable_pages();
2572
2573 scan_unevictable_pages = 0;
2574 return 0;
2575}
2576
2577/*
2578 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2579 * a specified node's per zone unevictable lists for evictable pages.
2580 */
2581
2582static ssize_t read_scan_unevictable_node(struct sys_device *dev,
2583 struct sysdev_attribute *attr,
2584 char *buf)
2585{
2586 return sprintf(buf, "0\n"); /* always zero; should fit... */
2587}
2588
2589static ssize_t write_scan_unevictable_node(struct sys_device *dev,
2590 struct sysdev_attribute *attr,
2591 const char *buf, size_t count)
2592{
2593 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
2594 struct zone *zone;
2595 unsigned long res;
2596 unsigned long req = strict_strtoul(buf, 10, &res);
2597
2598 if (!req)
2599 return 1; /* zero is no-op */
2600
2601 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2602 if (!populated_zone(zone))
2603 continue;
2604 scan_zone_unevictable_pages(zone);
2605 }
2606 return 1;
2607}
2608
2609
2610static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
2611 read_scan_unevictable_node,
2612 write_scan_unevictable_node);
2613
2614int scan_unevictable_register_node(struct node *node)
2615{
2616 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
2617}
2618
2619void scan_unevictable_unregister_node(struct node *node)
2620{
2621 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2622}
2623
2624#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d7826af2fb07..9343227c5c60 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -619,8 +619,14 @@ const struct seq_operations pagetypeinfo_op = {
619static const char * const vmstat_text[] = { 619static const char * const vmstat_text[] = {
620 /* Zoned VM counters */ 620 /* Zoned VM counters */
621 "nr_free_pages", 621 "nr_free_pages",
622 "nr_inactive", 622 "nr_inactive_anon",
623 "nr_active", 623 "nr_active_anon",
624 "nr_inactive_file",
625 "nr_active_file",
626#ifdef CONFIG_UNEVICTABLE_LRU
627 "nr_unevictable",
628 "nr_mlock",
629#endif
624 "nr_anon_pages", 630 "nr_anon_pages",
625 "nr_mapped", 631 "nr_mapped",
626 "nr_file_pages", 632 "nr_file_pages",
@@ -675,6 +681,16 @@ static const char * const vmstat_text[] = {
675 "htlb_buddy_alloc_success", 681 "htlb_buddy_alloc_success",
676 "htlb_buddy_alloc_fail", 682 "htlb_buddy_alloc_fail",
677#endif 683#endif
684#ifdef CONFIG_UNEVICTABLE_LRU
685 "unevictable_pgs_culled",
686 "unevictable_pgs_scanned",
687 "unevictable_pgs_rescued",
688 "unevictable_pgs_mlocked",
689 "unevictable_pgs_munlocked",
690 "unevictable_pgs_cleared",
691 "unevictable_pgs_stranded",
692 "unevictable_pgs_mlockfreed",
693#endif
678#endif 694#endif
679}; 695};
680 696
@@ -688,7 +704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
688 "\n min %lu" 704 "\n min %lu"
689 "\n low %lu" 705 "\n low %lu"
690 "\n high %lu" 706 "\n high %lu"
691 "\n scanned %lu (a: %lu i: %lu)" 707 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
692 "\n spanned %lu" 708 "\n spanned %lu"
693 "\n present %lu", 709 "\n present %lu",
694 zone_page_state(zone, NR_FREE_PAGES), 710 zone_page_state(zone, NR_FREE_PAGES),
@@ -696,7 +712,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
696 zone->pages_low, 712 zone->pages_low,
697 zone->pages_high, 713 zone->pages_high,
698 zone->pages_scanned, 714 zone->pages_scanned,
699 zone->nr_scan_active, zone->nr_scan_inactive, 715 zone->lru[LRU_ACTIVE_ANON].nr_scan,
716 zone->lru[LRU_INACTIVE_ANON].nr_scan,
717 zone->lru[LRU_ACTIVE_FILE].nr_scan,
718 zone->lru[LRU_INACTIVE_FILE].nr_scan,
700 zone->spanned_pages, 719 zone->spanned_pages,
701 zone->present_pages); 720 zone->present_pages);
702 721
@@ -733,10 +752,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
733 seq_printf(m, 752 seq_printf(m,
734 "\n all_unreclaimable: %u" 753 "\n all_unreclaimable: %u"
735 "\n prev_priority: %i" 754 "\n prev_priority: %i"
736 "\n start_pfn: %lu", 755 "\n start_pfn: %lu"
756 "\n inactive_ratio: %u",
737 zone_is_all_unreclaimable(zone), 757 zone_is_all_unreclaimable(zone),
738 zone->prev_priority, 758 zone->prev_priority,
739 zone->zone_start_pfn); 759 zone->zone_start_pfn,
760 zone->inactive_ratio);
740 seq_putc(m, '\n'); 761 seq_putc(m, '\n');
741} 762}
742 763