Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c      177
-rw-r--r--  mm/madvise.c      103
-rw-r--r--  mm/memory.c        57
-rw-r--r--  mm/mempolicy.c    110
-rw-r--r--  mm/mmap.c          57
-rw-r--r--  mm/msync.c          2
-rw-r--r--  mm/nommu.c          2
-rw-r--r--  mm/oom_kill.c       7
-rw-r--r--  mm/page_alloc.c   423
-rw-r--r--  mm/rmap.c          21
-rw-r--r--  mm/shmem.c        143
-rw-r--r--  mm/slab.c           1
-rw-r--r--  mm/swapfile.c      55
-rw-r--r--  mm/vmscan.c       103
14 files changed, 918 insertions, 343 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4eb5ae3fbe10..fbd1111ea119 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,10 +7,14 @@
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/hugetlb.h>
11#include <linux/sysctl.h> 10#include <linux/sysctl.h>
12#include <linux/highmem.h> 11#include <linux/highmem.h>
13#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h>
14#include <asm/page.h>
15#include <asm/pgtable.h>
16
17#include <linux/hugetlb.h>
14 18
15const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 19const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
16static unsigned long nr_huge_pages, free_huge_pages; 20static unsigned long nr_huge_pages, free_huge_pages;
@@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = {
249 .nopage = hugetlb_nopage, 253 .nopage = hugetlb_nopage,
250}; 254};
251 255
256static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
257{
258 pte_t entry;
259
260 if (vma->vm_flags & VM_WRITE) {
261 entry =
262 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
263 } else {
264 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
265 }
266 entry = pte_mkyoung(entry);
267 entry = pte_mkhuge(entry);
268
269 return entry;
270}
271
272int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
273 struct vm_area_struct *vma)
274{
275 pte_t *src_pte, *dst_pte, entry;
276 struct page *ptepage;
277 unsigned long addr = vma->vm_start;
278 unsigned long end = vma->vm_end;
279
280 while (addr < end) {
281 dst_pte = huge_pte_alloc(dst, addr);
282 if (!dst_pte)
283 goto nomem;
284 src_pte = huge_pte_offset(src, addr);
285 BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
286 entry = *src_pte;
287 ptepage = pte_page(entry);
288 get_page(ptepage);
289 add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
290 set_huge_pte_at(dst, addr, dst_pte, entry);
291 addr += HPAGE_SIZE;
292 }
293 return 0;
294
295nomem:
296 return -ENOMEM;
297}
298
299void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
300 unsigned long end)
301{
302 struct mm_struct *mm = vma->vm_mm;
303 unsigned long address;
304 pte_t pte;
305 struct page *page;
306
307 WARN_ON(!is_vm_hugetlb_page(vma));
308 BUG_ON(start & ~HPAGE_MASK);
309 BUG_ON(end & ~HPAGE_MASK);
310
311 for (address = start; address < end; address += HPAGE_SIZE) {
312 pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address));
313 if (pte_none(pte))
314 continue;
315 page = pte_page(pte);
316 put_page(page);
317 }
318 add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
319 flush_tlb_range(vma, start, end);
320}
321
252void zap_hugepage_range(struct vm_area_struct *vma, 322void zap_hugepage_range(struct vm_area_struct *vma,
253 unsigned long start, unsigned long length) 323 unsigned long start, unsigned long length)
254{ 324{
@@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma,
258 unmap_hugepage_range(vma, start, start + length); 328 unmap_hugepage_range(vma, start, start + length);
259 spin_unlock(&mm->page_table_lock); 329 spin_unlock(&mm->page_table_lock);
260} 330}
331
332int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
333{
334 struct mm_struct *mm = current->mm;
335 unsigned long addr;
336 int ret = 0;
337
338 WARN_ON(!is_vm_hugetlb_page(vma));
339 BUG_ON(vma->vm_start & ~HPAGE_MASK);
340 BUG_ON(vma->vm_end & ~HPAGE_MASK);
341
342 hugetlb_prefault_arch_hook(mm);
343
344 spin_lock(&mm->page_table_lock);
345 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
346 unsigned long idx;
347 pte_t *pte = huge_pte_alloc(mm, addr);
348 struct page *page;
349
350 if (!pte) {
351 ret = -ENOMEM;
352 goto out;
353 }
354 if (! pte_none(*pte))
355 hugetlb_clean_stale_pgtable(pte);
356
357 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
358 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
359 page = find_get_page(mapping, idx);
360 if (!page) {
361 /* charge the fs quota first */
362 if (hugetlb_get_quota(mapping)) {
363 ret = -ENOMEM;
364 goto out;
365 }
366 page = alloc_huge_page();
367 if (!page) {
368 hugetlb_put_quota(mapping);
369 ret = -ENOMEM;
370 goto out;
371 }
372 ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
373 if (! ret) {
374 unlock_page(page);
375 } else {
376 hugetlb_put_quota(mapping);
377 free_huge_page(page);
378 goto out;
379 }
380 }
381 add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
382 set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
383 }
384out:
385 spin_unlock(&mm->page_table_lock);
386 return ret;
387}
388
389int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
390 struct page **pages, struct vm_area_struct **vmas,
391 unsigned long *position, int *length, int i)
392{
393 unsigned long vpfn, vaddr = *position;
394 int remainder = *length;
395
396 BUG_ON(!is_vm_hugetlb_page(vma));
397
398 vpfn = vaddr/PAGE_SIZE;
399 while (vaddr < vma->vm_end && remainder) {
400
401 if (pages) {
402 pte_t *pte;
403 struct page *page;
404
405 /* Some archs (sparc64, sh*) have multiple
406 * pte_ts to each hugepage. We have to make
407 * sure we get the first, for the page
408 * indexing below to work. */
409 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
410
411 /* hugetlb should be locked, and hence, prefaulted */
412 WARN_ON(!pte || pte_none(*pte));
413
414 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
415
416 WARN_ON(!PageCompound(page));
417
418 get_page(page);
419 pages[i] = page;
420 }
421
422 if (vmas)
423 vmas[i] = vma;
424
425 vaddr += PAGE_SIZE;
426 ++vpfn;
427 --remainder;
428 ++i;
429 }
430
431 *length = remainder;
432 *position = vaddr;
433
434 return i;
435}
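
The hugetlb_prefault() loop above converts each faulting address into a page-cache index at huge-page granularity: the offset of the address within the VMA, in huge pages, plus the file offset of the mapping converted to the same units. A stand-alone sketch of that arithmetic (user-space C; PAGE_SHIFT 12 and HPAGE_SHIFT 21, i.e. 4 KB base pages and 2 MB huge pages, are assumptions here and are architecture-dependent, not taken from this diff):

#include <stdio.h>

#define PAGE_SHIFT  12   /* assumed: 4 KB base pages */
#define HPAGE_SHIFT 21   /* assumed: 2 MB huge pages */

/* Same index calculation as hugetlb_prefault(): VMA-relative offset in
 * huge-page units, plus the mapping's file offset in huge-page units. */
static unsigned long hugetlb_idx(unsigned long addr, unsigned long vm_start,
                                 unsigned long vm_pgoff)
{
    return ((addr - vm_start) >> HPAGE_SHIFT)
            + (vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
}

int main(void)
{
    unsigned long vm_start = 0x40000000UL;               /* arbitrary */
    unsigned long vm_pgoff = (4UL << 20) >> PAGE_SHIFT;  /* 4 MB into the file */
    unsigned long addr = vm_start + 2 * (1UL << HPAGE_SHIFT);

    printf("idx = %lu\n", hugetlb_idx(addr, vm_start, vm_pgoff));
    return 0;
}

This prints idx = 4: two huge pages into the VMA plus a two-huge-page file offset, which is the slot hugetlb_prefault() would look up with find_get_page().
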
diff --git a/mm/madvise.c b/mm/madvise.c
index 944b5e52d812..e3108054733c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -8,17 +8,47 @@
8#include <linux/mman.h> 8#include <linux/mman.h>
9#include <linux/pagemap.h> 9#include <linux/pagemap.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/mempolicy.h>
11#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
12 13
13/* 14/*
14 * We can potentially split a vm area into separate 15 * We can potentially split a vm area into separate
15 * areas, each area with its own behavior. 16 * areas, each area with its own behavior.
16 */ 17 */
17static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, 18static long madvise_behavior(struct vm_area_struct * vma,
18 unsigned long end, int behavior) 19 struct vm_area_struct **prev,
20 unsigned long start, unsigned long end, int behavior)
19{ 21{
20 struct mm_struct * mm = vma->vm_mm; 22 struct mm_struct * mm = vma->vm_mm;
21 int error = 0; 23 int error = 0;
24 pgoff_t pgoff;
25 int new_flags = vma->vm_flags & ~VM_READHINTMASK;
26
27 switch (behavior) {
28 case MADV_SEQUENTIAL:
29 new_flags |= VM_SEQ_READ;
30 break;
31 case MADV_RANDOM:
32 new_flags |= VM_RAND_READ;
33 break;
34 default:
35 break;
36 }
37
38 if (new_flags == vma->vm_flags) {
39 *prev = vma;
40 goto success;
41 }
42
43 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
44 *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
45 vma->vm_file, pgoff, vma_policy(vma));
46 if (*prev) {
47 vma = *prev;
48 goto success;
49 }
50
51 *prev = vma;
22 52
23 if (start != vma->vm_start) { 53 if (start != vma->vm_start) {
24 error = split_vma(mm, vma, start, 1); 54 error = split_vma(mm, vma, start, 1);
@@ -36,21 +66,12 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start,
36 * vm_flags is protected by the mmap_sem held in write mode. 66 * vm_flags is protected by the mmap_sem held in write mode.
37 */ 67 */
38 VM_ClearReadHint(vma); 68 VM_ClearReadHint(vma);
39 69 vma->vm_flags = new_flags;
40 switch (behavior) {
41 case MADV_SEQUENTIAL:
42 vma->vm_flags |= VM_SEQ_READ;
43 break;
44 case MADV_RANDOM:
45 vma->vm_flags |= VM_RAND_READ;
46 break;
47 default:
48 break;
49 }
50 70
51out: 71out:
52 if (error == -ENOMEM) 72 if (error == -ENOMEM)
53 error = -EAGAIN; 73 error = -EAGAIN;
74success:
54 return error; 75 return error;
55} 76}
56 77
@@ -58,6 +79,7 @@ out:
58 * Schedule all required I/O operations. Do not wait for completion. 79 * Schedule all required I/O operations. Do not wait for completion.
59 */ 80 */
60static long madvise_willneed(struct vm_area_struct * vma, 81static long madvise_willneed(struct vm_area_struct * vma,
82 struct vm_area_struct ** prev,
61 unsigned long start, unsigned long end) 83 unsigned long start, unsigned long end)
62{ 84{
63 struct file *file = vma->vm_file; 85 struct file *file = vma->vm_file;
@@ -65,6 +87,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
65 if (!file) 87 if (!file)
66 return -EBADF; 88 return -EBADF;
67 89
90 *prev = vma;
68 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 91 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
69 if (end > vma->vm_end) 92 if (end > vma->vm_end)
70 end = vma->vm_end; 93 end = vma->vm_end;
@@ -95,8 +118,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
95 * dirty pages is already available as msync(MS_INVALIDATE). 118 * dirty pages is already available as msync(MS_INVALIDATE).
96 */ 119 */
97static long madvise_dontneed(struct vm_area_struct * vma, 120static long madvise_dontneed(struct vm_area_struct * vma,
121 struct vm_area_struct ** prev,
98 unsigned long start, unsigned long end) 122 unsigned long start, unsigned long end)
99{ 123{
124 *prev = vma;
100 if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) 125 if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
101 return -EINVAL; 126 return -EINVAL;
102 127
@@ -111,8 +136,8 @@ static long madvise_dontneed(struct vm_area_struct * vma,
111 return 0; 136 return 0;
112} 137}
113 138
114static long madvise_vma(struct vm_area_struct * vma, unsigned long start, 139static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
115 unsigned long end, int behavior) 140 unsigned long start, unsigned long end, int behavior)
116{ 141{
117 long error = -EBADF; 142 long error = -EBADF;
118 143
@@ -120,15 +145,15 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
120 case MADV_NORMAL: 145 case MADV_NORMAL:
121 case MADV_SEQUENTIAL: 146 case MADV_SEQUENTIAL:
122 case MADV_RANDOM: 147 case MADV_RANDOM:
123 error = madvise_behavior(vma, start, end, behavior); 148 error = madvise_behavior(vma, prev, start, end, behavior);
124 break; 149 break;
125 150
126 case MADV_WILLNEED: 151 case MADV_WILLNEED:
127 error = madvise_willneed(vma, start, end); 152 error = madvise_willneed(vma, prev, start, end);
128 break; 153 break;
129 154
130 case MADV_DONTNEED: 155 case MADV_DONTNEED:
131 error = madvise_dontneed(vma, start, end); 156 error = madvise_dontneed(vma, prev, start, end);
132 break; 157 break;
133 158
134 default: 159 default:
@@ -175,8 +200,8 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
175 */ 200 */
176asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) 201asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
177{ 202{
178 unsigned long end; 203 unsigned long end, tmp;
179 struct vm_area_struct * vma; 204 struct vm_area_struct * vma, *prev;
180 int unmapped_error = 0; 205 int unmapped_error = 0;
181 int error = -EINVAL; 206 int error = -EINVAL;
182 size_t len; 207 size_t len;
@@ -202,40 +227,42 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
202 /* 227 /*
203 * If the interval [start,end) covers some unmapped address 228 * If the interval [start,end) covers some unmapped address
204 * ranges, just ignore them, but return -ENOMEM at the end. 229 * ranges, just ignore them, but return -ENOMEM at the end.
230 * - different from the way of handling in mlock etc.
205 */ 231 */
206 vma = find_vma(current->mm, start); 232 vma = find_vma_prev(current->mm, start, &prev);
233 if (!vma && prev)
234 vma = prev->vm_next;
207 for (;;) { 235 for (;;) {
208 /* Still start < end. */ 236 /* Still start < end. */
209 error = -ENOMEM; 237 error = -ENOMEM;
210 if (!vma) 238 if (!vma)
211 goto out; 239 goto out;
212 240
213 /* Here start < vma->vm_end. */ 241 /* Here start < (end|vma->vm_end). */
214 if (start < vma->vm_start) { 242 if (start < vma->vm_start) {
215 unmapped_error = -ENOMEM; 243 unmapped_error = -ENOMEM;
216 start = vma->vm_start; 244 start = vma->vm_start;
245 if (start >= end)
246 goto out;
217 } 247 }
218 248
219 /* Here vma->vm_start <= start < vma->vm_end. */ 249 /* Here vma->vm_start <= start < (end|vma->vm_end) */
220 if (end <= vma->vm_end) { 250 tmp = vma->vm_end;
221 if (start < end) { 251 if (end < tmp)
222 error = madvise_vma(vma, start, end, 252 tmp = end;
223 behavior);
224 if (error)
225 goto out;
226 }
227 error = unmapped_error;
228 goto out;
229 }
230 253
231 /* Here vma->vm_start <= start < vma->vm_end < end. */ 254 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
232 error = madvise_vma(vma, start, vma->vm_end, behavior); 255 error = madvise_vma(vma, &prev, start, tmp, behavior);
233 if (error) 256 if (error)
234 goto out; 257 goto out;
235 start = vma->vm_end; 258 start = tmp;
236 vma = vma->vm_next; 259 if (start < prev->vm_end)
260 start = prev->vm_end;
261 error = unmapped_error;
262 if (start >= end)
263 goto out;
264 vma = prev->vm_next;
237 } 265 }
238
239out: 266out:
240 up_write(&current->mm->mmap_sem); 267 up_write(&current->mm->mmap_sem);
241 return error; 268 return error;
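
The reworked sys_madvise() loop above tracks the previous VMA so that a merge performed inside madvise_behavior() cannot leave the walk holding a stale vma pointer; the user-visible interface is unchanged. A small, hypothetical test program exercising the three advice classes dispatched by madvise_vma() (plain user-space C, not taken from the kernel tree):

#define _DEFAULT_SOURCE          /* for MAP_ANONYMOUS */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 16 * 4096;
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(p, 0xaa, len);

    /* Read-ahead hints: these may split or merge VMAs, exactly the
     * cases madvise_behavior() now handles via vma_merge()/split_vma(). */
    if (madvise(p, len / 2, MADV_SEQUENTIAL))
        perror("MADV_SEQUENTIAL");
    if (madvise(p + len / 2, len / 2, MADV_RANDOM))
        perror("MADV_RANDOM");

    /* Discard the pages; anonymous memory reads back as zeroes. */
    if (madvise(p, len, MADV_DONTNEED))
        perror("MADV_DONTNEED");
    printf("first byte after MADV_DONTNEED: %#x\n", p[0]);

    munmap(p, len);
    return 0;
}
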
diff --git a/mm/memory.c b/mm/memory.c
index d209f745db7f..da91b7bf9986 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -840,23 +840,8 @@ check_user_page_readable(struct mm_struct *mm, unsigned long address)
840{ 840{
841 return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; 841 return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
842} 842}
843
844EXPORT_SYMBOL(check_user_page_readable); 843EXPORT_SYMBOL(check_user_page_readable);
845 844
846/*
847 * Given a physical address, is there a useful struct page pointing to
848 * it? This may become more complex in the future if we start dealing
849 * with IO-aperture pages for direct-IO.
850 */
851
852static inline struct page *get_page_map(struct page *page)
853{
854 if (!pfn_valid(page_to_pfn(page)))
855 return NULL;
856 return page;
857}
858
859
860static inline int 845static inline int
861untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, 846untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
862 unsigned long address) 847 unsigned long address)
@@ -887,7 +872,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
887 return 0; 872 return 0;
888} 873}
889 874
890
891int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 875int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
892 unsigned long start, int len, int write, int force, 876 unsigned long start, int len, int write, int force,
893 struct page **pages, struct vm_area_struct **vmas) 877 struct page **pages, struct vm_area_struct **vmas)
@@ -951,21 +935,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
951 } 935 }
952 spin_lock(&mm->page_table_lock); 936 spin_lock(&mm->page_table_lock);
953 do { 937 do {
954 struct page *map; 938 struct page *page;
955 int lookup_write = write; 939 int lookup_write = write;
956 940
957 cond_resched_lock(&mm->page_table_lock); 941 cond_resched_lock(&mm->page_table_lock);
958 while (!(map = follow_page(mm, start, lookup_write))) { 942 while (!(page = follow_page(mm, start, lookup_write))) {
959 /* 943 /*
960 * Shortcut for anonymous pages. We don't want 944 * Shortcut for anonymous pages. We don't want
961 * to force the creation of pages tables for 945 * to force the creation of pages tables for
962 * insanly big anonymously mapped areas that 946 * insanely big anonymously mapped areas that
963 * nobody touched so far. This is important 947 * nobody touched so far. This is important
964 * for doing a core dump for these mappings. 948 * for doing a core dump for these mappings.
965 */ 949 */
966 if (!lookup_write && 950 if (!lookup_write &&
967 untouched_anonymous_page(mm,vma,start)) { 951 untouched_anonymous_page(mm,vma,start)) {
968 map = ZERO_PAGE(start); 952 page = ZERO_PAGE(start);
969 break; 953 break;
970 } 954 }
971 spin_unlock(&mm->page_table_lock); 955 spin_unlock(&mm->page_table_lock);
@@ -994,30 +978,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
994 spin_lock(&mm->page_table_lock); 978 spin_lock(&mm->page_table_lock);
995 } 979 }
996 if (pages) { 980 if (pages) {
997 pages[i] = get_page_map(map); 981 pages[i] = page;
998 if (!pages[i]) { 982 flush_dcache_page(page);
999 spin_unlock(&mm->page_table_lock); 983 if (!PageReserved(page))
1000 while (i--) 984 page_cache_get(page);
1001 page_cache_release(pages[i]);
1002 i = -EFAULT;
1003 goto out;
1004 }
1005 flush_dcache_page(pages[i]);
1006 if (!PageReserved(pages[i]))
1007 page_cache_get(pages[i]);
1008 } 985 }
1009 if (vmas) 986 if (vmas)
1010 vmas[i] = vma; 987 vmas[i] = vma;
1011 i++; 988 i++;
1012 start += PAGE_SIZE; 989 start += PAGE_SIZE;
1013 len--; 990 len--;
1014 } while(len && start < vma->vm_end); 991 } while (len && start < vma->vm_end);
1015 spin_unlock(&mm->page_table_lock); 992 spin_unlock(&mm->page_table_lock);
1016 } while(len); 993 } while (len);
1017out:
1018 return i; 994 return i;
1019} 995}
1020
1021EXPORT_SYMBOL(get_user_pages); 996EXPORT_SYMBOL(get_user_pages);
1022 997
1023static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, 998static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
@@ -1264,7 +1239,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1264 } 1239 }
1265 old_page = pfn_to_page(pfn); 1240 old_page = pfn_to_page(pfn);
1266 1241
1267 if (!TestSetPageLocked(old_page)) { 1242 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1268 int reuse = can_share_swap_page(old_page); 1243 int reuse = can_share_swap_page(old_page);
1269 unlock_page(old_page); 1244 unlock_page(old_page);
1270 if (reuse) { 1245 if (reuse) {
@@ -1711,10 +1686,6 @@ static int do_swap_page(struct mm_struct * mm,
1711 } 1686 }
1712 1687
1713 /* The page isn't present yet, go ahead with the fault. */ 1688 /* The page isn't present yet, go ahead with the fault. */
1714
1715 swap_free(entry);
1716 if (vm_swap_full())
1717 remove_exclusive_swap_page(page);
1718 1689
1719 inc_mm_counter(mm, rss); 1690 inc_mm_counter(mm, rss);
1720 pte = mk_pte(page, vma->vm_page_prot); 1691 pte = mk_pte(page, vma->vm_page_prot);
@@ -1722,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm,
1722 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 1693 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
1723 write_access = 0; 1694 write_access = 0;
1724 } 1695 }
1725 unlock_page(page);
1726 1696
1727 flush_icache_page(vma, page); 1697 flush_icache_page(vma, page);
1728 set_pte_at(mm, address, page_table, pte); 1698 set_pte_at(mm, address, page_table, pte);
1729 page_add_anon_rmap(page, vma, address); 1699 page_add_anon_rmap(page, vma, address);
1730 1700
1701 swap_free(entry);
1702 if (vm_swap_full())
1703 remove_exclusive_swap_page(page);
1704 unlock_page(page);
1705
1731 if (write_access) { 1706 if (write_access) {
1732 if (do_wp_page(mm, vma, address, 1707 if (do_wp_page(mm, vma, address,
1733 page_table, pmd, pte) == VM_FAULT_OOM) 1708 page_table, pmd, pte) == VM_FAULT_OOM)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..cb41c31e7c87 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -238,46 +238,80 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
238} 238}
239 239
240/* Ensure all existing pages follow the policy. */ 240/* Ensure all existing pages follow the policy. */
241static int 241static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
242verify_pages(struct mm_struct *mm, 242 unsigned long addr, unsigned long end, unsigned long *nodes)
243 unsigned long addr, unsigned long end, unsigned long *nodes)
244{ 243{
245 while (addr < end) { 244 pte_t *orig_pte;
246 struct page *p; 245 pte_t *pte;
247 pte_t *pte; 246
248 pmd_t *pmd; 247 spin_lock(&mm->page_table_lock);
249 pud_t *pud; 248 orig_pte = pte = pte_offset_map(pmd, addr);
250 pgd_t *pgd; 249 do {
251 pgd = pgd_offset(mm, addr); 250 unsigned long pfn;
252 if (pgd_none(*pgd)) { 251 unsigned int nid;
253 unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK; 252
254 if (next > addr) 253 if (!pte_present(*pte))
255 break;
256 addr = next;
257 continue; 254 continue;
258 } 255 pfn = pte_pfn(*pte);
259 pud = pud_offset(pgd, addr); 256 if (!pfn_valid(pfn))
260 if (pud_none(*pud)) {
261 addr = (addr + PUD_SIZE) & PUD_MASK;
262 continue; 257 continue;
263 } 258 nid = pfn_to_nid(pfn);
264 pmd = pmd_offset(pud, addr); 259 if (!test_bit(nid, nodes))
265 if (pmd_none(*pmd)) { 260 break;
266 addr = (addr + PMD_SIZE) & PMD_MASK; 261 } while (pte++, addr += PAGE_SIZE, addr != end);
262 pte_unmap(orig_pte);
263 spin_unlock(&mm->page_table_lock);
264 return addr != end;
265}
266
267static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
268 unsigned long addr, unsigned long end, unsigned long *nodes)
269{
270 pmd_t *pmd;
271 unsigned long next;
272
273 pmd = pmd_offset(pud, addr);
274 do {
275 next = pmd_addr_end(addr, end);
276 if (pmd_none_or_clear_bad(pmd))
267 continue; 277 continue;
268 } 278 if (check_pte_range(mm, pmd, addr, next, nodes))
269 p = NULL; 279 return -EIO;
270 pte = pte_offset_map(pmd, addr); 280 } while (pmd++, addr = next, addr != end);
271 if (pte_present(*pte)) 281 return 0;
272 p = pte_page(*pte); 282}
273 pte_unmap(pte); 283
274 if (p) { 284static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
275 unsigned nid = page_to_nid(p); 285 unsigned long addr, unsigned long end, unsigned long *nodes)
276 if (!test_bit(nid, nodes)) 286{
277 return -EIO; 287 pud_t *pud;
278 } 288 unsigned long next;
279 addr += PAGE_SIZE; 289
280 } 290 pud = pud_offset(pgd, addr);
291 do {
292 next = pud_addr_end(addr, end);
293 if (pud_none_or_clear_bad(pud))
294 continue;
295 if (check_pmd_range(mm, pud, addr, next, nodes))
296 return -EIO;
297 } while (pud++, addr = next, addr != end);
298 return 0;
299}
300
301static inline int check_pgd_range(struct mm_struct *mm,
302 unsigned long addr, unsigned long end, unsigned long *nodes)
303{
304 pgd_t *pgd;
305 unsigned long next;
306
307 pgd = pgd_offset(mm, addr);
308 do {
309 next = pgd_addr_end(addr, end);
310 if (pgd_none_or_clear_bad(pgd))
311 continue;
312 if (check_pud_range(mm, pgd, addr, next, nodes))
313 return -EIO;
314 } while (pgd++, addr = next, addr != end);
281 return 0; 315 return 0;
282} 316}
283 317
@@ -299,7 +333,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
299 if (prev && prev->vm_end < vma->vm_start) 333 if (prev && prev->vm_end < vma->vm_start)
300 return ERR_PTR(-EFAULT); 334 return ERR_PTR(-EFAULT);
301 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 335 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
302 err = verify_pages(vma->vm_mm, 336 err = check_pgd_range(vma->vm_mm,
303 vma->vm_start, vma->vm_end, nodes); 337 vma->vm_start, vma->vm_end, nodes);
304 if (err) { 338 if (err) {
305 first = ERR_PTR(err); 339 first = ERR_PTR(err);
@@ -721,7 +755,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
721 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); 755 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
722 page = __alloc_pages(gfp, order, zl); 756 page = __alloc_pages(gfp, order, zl);
723 if (page && page_zone(page) == zl->zones[0]) { 757 if (page && page_zone(page) == zl->zones[0]) {
724 zl->zones[0]->pageset[get_cpu()].interleave_hit++; 758 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
725 put_cpu(); 759 put_cpu();
726 } 760 }
727 return page; 761 return page;
diff --git a/mm/mmap.c b/mm/mmap.c
index de54acd9942f..da3fa90a0aae 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1175 (!vma || addr + len <= vma->vm_start)) 1175 (!vma || addr + len <= vma->vm_start))
1176 return addr; 1176 return addr;
1177 } 1177 }
1178 start_addr = addr = mm->free_area_cache; 1178 if (len > mm->cached_hole_size) {
1179 start_addr = addr = mm->free_area_cache;
1180 } else {
1181 start_addr = addr = TASK_UNMAPPED_BASE;
1182 mm->cached_hole_size = 0;
1183 }
1179 1184
1180full_search: 1185full_search:
1181 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 1186 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
@@ -1186,7 +1191,9 @@ full_search:
1186 * some holes. 1191 * some holes.
1187 */ 1192 */
1188 if (start_addr != TASK_UNMAPPED_BASE) { 1193 if (start_addr != TASK_UNMAPPED_BASE) {
1189 start_addr = addr = TASK_UNMAPPED_BASE; 1194 addr = TASK_UNMAPPED_BASE;
1195 start_addr = addr;
1196 mm->cached_hole_size = 0;
1190 goto full_search; 1197 goto full_search;
1191 } 1198 }
1192 return -ENOMEM; 1199 return -ENOMEM;
@@ -1198,19 +1205,22 @@ full_search:
1198 mm->free_area_cache = addr + len; 1205 mm->free_area_cache = addr + len;
1199 return addr; 1206 return addr;
1200 } 1207 }
1208 if (addr + mm->cached_hole_size < vma->vm_start)
1209 mm->cached_hole_size = vma->vm_start - addr;
1201 addr = vma->vm_end; 1210 addr = vma->vm_end;
1202 } 1211 }
1203} 1212}
1204#endif 1213#endif
1205 1214
1206void arch_unmap_area(struct vm_area_struct *area) 1215void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1207{ 1216{
1208 /* 1217 /*
1209 * Is this a new hole at the lowest possible address? 1218 * Is this a new hole at the lowest possible address?
1210 */ 1219 */
1211 if (area->vm_start >= TASK_UNMAPPED_BASE && 1220 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) {
1212 area->vm_start < area->vm_mm->free_area_cache) 1221 mm->free_area_cache = addr;
1213 area->vm_mm->free_area_cache = area->vm_start; 1222 mm->cached_hole_size = ~0UL;
1223 }
1214} 1224}
1215 1225
1216/* 1226/*
@@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1240 return addr; 1250 return addr;
1241 } 1251 }
1242 1252
1253 /* check if free_area_cache is useful for us */
1254 if (len <= mm->cached_hole_size) {
1255 mm->cached_hole_size = 0;
1256 mm->free_area_cache = mm->mmap_base;
1257 }
1258
1243 /* either no address requested or can't fit in requested address hole */ 1259 /* either no address requested or can't fit in requested address hole */
1244 addr = mm->free_area_cache; 1260 addr = mm->free_area_cache;
1245 1261
@@ -1251,6 +1267,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1251 return (mm->free_area_cache = addr-len); 1267 return (mm->free_area_cache = addr-len);
1252 } 1268 }
1253 1269
1270 if (mm->mmap_base < len)
1271 goto bottomup;
1272
1254 addr = mm->mmap_base-len; 1273 addr = mm->mmap_base-len;
1255 1274
1256 do { 1275 do {
@@ -1264,38 +1283,45 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1264 /* remember the address as a hint for next time */ 1283 /* remember the address as a hint for next time */
1265 return (mm->free_area_cache = addr); 1284 return (mm->free_area_cache = addr);
1266 1285
1286 /* remember the largest hole we saw so far */
1287 if (addr + mm->cached_hole_size < vma->vm_start)
1288 mm->cached_hole_size = vma->vm_start - addr;
1289
1267 /* try just below the current vma->vm_start */ 1290 /* try just below the current vma->vm_start */
1268 addr = vma->vm_start-len; 1291 addr = vma->vm_start-len;
1269 } while (len < vma->vm_start); 1292 } while (len < vma->vm_start);
1270 1293
1294bottomup:
1271 /* 1295 /*
1272 * A failed mmap() very likely causes application failure, 1296 * A failed mmap() very likely causes application failure,
1273 * so fall back to the bottom-up function here. This scenario 1297 * so fall back to the bottom-up function here. This scenario
1274 * can happen with large stack limits and large mmap() 1298 * can happen with large stack limits and large mmap()
1275 * allocations. 1299 * allocations.
1276 */ 1300 */
1277 mm->free_area_cache = TASK_UNMAPPED_BASE; 1301 mm->cached_hole_size = ~0UL;
1302 mm->free_area_cache = TASK_UNMAPPED_BASE;
1278 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 1303 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
1279 /* 1304 /*
1280 * Restore the topdown base: 1305 * Restore the topdown base:
1281 */ 1306 */
1282 mm->free_area_cache = mm->mmap_base; 1307 mm->free_area_cache = mm->mmap_base;
1308 mm->cached_hole_size = ~0UL;
1283 1309
1284 return addr; 1310 return addr;
1285} 1311}
1286#endif 1312#endif
1287 1313
1288void arch_unmap_area_topdown(struct vm_area_struct *area) 1314void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1289{ 1315{
1290 /* 1316 /*
1291 * Is this a new hole at the highest possible address? 1317 * Is this a new hole at the highest possible address?
1292 */ 1318 */
1293 if (area->vm_end > area->vm_mm->free_area_cache) 1319 if (addr > mm->free_area_cache)
1294 area->vm_mm->free_area_cache = area->vm_end; 1320 mm->free_area_cache = addr;
1295 1321
1296 /* dont allow allocations above current base */ 1322 /* dont allow allocations above current base */
1297 if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base) 1323 if (mm->free_area_cache > mm->mmap_base)
1298 area->vm_mm->free_area_cache = area->vm_mm->mmap_base; 1324 mm->free_area_cache = mm->mmap_base;
1299} 1325}
1300 1326
1301unsigned long 1327unsigned long
@@ -1595,7 +1621,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
1595 if (area->vm_flags & VM_LOCKED) 1621 if (area->vm_flags & VM_LOCKED)
1596 area->vm_mm->locked_vm -= len >> PAGE_SHIFT; 1622 area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
1597 vm_stat_unaccount(area); 1623 vm_stat_unaccount(area);
1598 area->vm_mm->unmap_area(area);
1599 remove_vm_struct(area); 1624 remove_vm_struct(area);
1600} 1625}
1601 1626
@@ -1649,6 +1674,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1649{ 1674{
1650 struct vm_area_struct **insertion_point; 1675 struct vm_area_struct **insertion_point;
1651 struct vm_area_struct *tail_vma = NULL; 1676 struct vm_area_struct *tail_vma = NULL;
1677 unsigned long addr;
1652 1678
1653 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 1679 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1654 do { 1680 do {
@@ -1659,6 +1685,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1659 } while (vma && vma->vm_start < end); 1685 } while (vma && vma->vm_start < end);
1660 *insertion_point = vma; 1686 *insertion_point = vma;
1661 tail_vma->vm_next = NULL; 1687 tail_vma->vm_next = NULL;
1688 if (mm->unmap_area == arch_unmap_area)
1689 addr = prev ? prev->vm_end : mm->mmap_base;
1690 else
1691 addr = vma ? vma->vm_start : mm->mmap_base;
1692 mm->unmap_area(mm, addr);
1662 mm->mmap_cache = NULL; /* Kill the cache. */ 1693 mm->mmap_cache = NULL; /* Kill the cache. */
1663} 1694}
1664 1695
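
The cached_hole_size bookkeeping added above lets the bottom-up search skip a rescan from TASK_UNMAPPED_BASE when the request is larger than every hole already passed over, and restart from the base (recomputing the cache) when a lower hole might still fit. A simplified stand-alone model of that first-fit search, using a flat array of busy ranges instead of the VMA list (the constants and helper names are invented for illustration, not kernel code):

#include <stdio.h>

struct area { unsigned long start, end; };   /* sorted, busy [start, end) */

static const unsigned long base = 0x10000;   /* TASK_UNMAPPED_BASE stand-in */
static unsigned long free_area_cache = 0x10000;
static unsigned long cached_hole_size;       /* largest hole skipped so far */

static unsigned long get_area(const struct area *used, int n, unsigned long len)
{
    unsigned long addr;
    int i;

    if (len > cached_hole_size) {
        addr = free_area_cache;              /* no skipped hole can fit */
    } else {
        addr = base;                         /* a lower hole might fit */
        cached_hole_size = 0;
    }

    for (i = 0; i < n && used[i].end <= addr; i++)
        ;                                    /* skip busy areas below addr */
    for (; i < n; i++) {
        if (addr + len <= used[i].start)
            break;                           /* hole is big enough */
        if (used[i].start > addr && used[i].start - addr > cached_hole_size)
            cached_hole_size = used[i].start - addr;
        addr = used[i].end;
    }
    free_area_cache = addr + len;
    return addr;
}

int main(void)
{
    /* two busy ranges with a 0x8000-byte hole between them */
    struct area used[] = { { 0x10000, 0x20000 }, { 0x28000, 0x30000 } };

    printf("%#lx\n", get_area(used, 2, 0x4000));   /* 0x20000: first fit */
    printf("%#lx\n", get_area(used, 2, 0x4000));   /* 0x24000: resumes from cache */
    return 0;
}

The second request starts at free_area_cache rather than back at base, which is the point of the optimisation.
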
diff --git a/mm/msync.c b/mm/msync.c
index 090f426bca7d..d0f5a1bce7cb 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -34,6 +34,8 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
34 34
35 if (!pte_present(*pte)) 35 if (!pte_present(*pte))
36 continue; 36 continue;
37 if (!pte_maybe_dirty(*pte))
38 continue;
37 pfn = pte_pfn(*pte); 39 pfn = pte_pfn(*pte);
38 if (!pfn_valid(pfn)) 40 if (!pfn_valid(pfn))
39 continue; 41 continue;
diff --git a/mm/nommu.c b/mm/nommu.c
index c53e9c8f6b4a..ce74452c02d9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1067 return -ENOMEM; 1067 return -ENOMEM;
1068} 1068}
1069 1069
1070void arch_unmap_area(struct vm_area_struct *area) 1070void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1071{ 1071{
1072} 1072}
1073 1073
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4bbb1cb10495..59666d905f19 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -258,6 +258,10 @@ void out_of_memory(unsigned int __nocast gfp_mask)
258 struct mm_struct *mm = NULL; 258 struct mm_struct *mm = NULL;
259 task_t * p; 259 task_t * p;
260 260
261 printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
262 /* print memory stats */
263 show_mem();
264
261 read_lock(&tasklist_lock); 265 read_lock(&tasklist_lock);
262retry: 266retry:
263 p = select_bad_process(); 267 p = select_bad_process();
@@ -268,12 +272,9 @@ retry:
268 /* Found nothing?!?! Either we hang forever, or we panic. */ 272 /* Found nothing?!?! Either we hang forever, or we panic. */
269 if (!p) { 273 if (!p) {
270 read_unlock(&tasklist_lock); 274 read_unlock(&tasklist_lock);
271 show_free_areas();
272 panic("Out of memory and no killable processes...\n"); 275 panic("Out of memory and no killable processes...\n");
273 } 276 }
274 277
275 printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
276 show_free_areas();
277 mm = oom_kill_process(p); 278 mm = oom_kill_process(p);
278 if (!mm) 279 if (!mm)
279 goto retry; 280 goto retry;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..206920796f5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page)
105 printk(KERN_EMERG "Backtrace:\n"); 105 printk(KERN_EMERG "Backtrace:\n");
106 dump_stack(); 106 dump_stack();
107 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); 107 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
108 page->flags &= ~(1 << PG_private | 108 page->flags &= ~(1 << PG_lru |
109 1 << PG_private |
109 1 << PG_locked | 110 1 << PG_locked |
110 1 << PG_lru |
111 1 << PG_active | 111 1 << PG_active |
112 1 << PG_dirty | 112 1 << PG_dirty |
113 1 << PG_reclaim |
114 1 << PG_slab |
113 1 << PG_swapcache | 115 1 << PG_swapcache |
114 1 << PG_writeback); 116 1 << PG_writeback);
115 set_page_count(page, 0); 117 set_page_count(page, 0);
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order)
440 */ 442 */
441static void prep_new_page(struct page *page, int order) 443static void prep_new_page(struct page *page, int order)
442{ 444{
443 if (page->mapping || page_mapcount(page) || 445 if ( page_mapcount(page) ||
444 (page->flags & ( 446 page->mapping != NULL ||
447 page_count(page) != 0 ||
448 (page->flags & (
449 1 << PG_lru |
445 1 << PG_private | 450 1 << PG_private |
446 1 << PG_locked | 451 1 << PG_locked |
447 1 << PG_lru |
448 1 << PG_active | 452 1 << PG_active |
449 1 << PG_dirty | 453 1 << PG_dirty |
450 1 << PG_reclaim | 454 1 << PG_reclaim |
455 1 << PG_slab |
451 1 << PG_swapcache | 456 1 << PG_swapcache |
452 1 << PG_writeback ))) 457 1 << PG_writeback )))
453 bad_page(__FUNCTION__, page); 458 bad_page(__FUNCTION__, page);
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
511 return allocated; 516 return allocated;
512} 517}
513 518
519#ifdef CONFIG_NUMA
520/* Called from the slab reaper to drain remote pagesets */
521void drain_remote_pages(void)
522{
523 struct zone *zone;
524 int i;
525 unsigned long flags;
526
527 local_irq_save(flags);
528 for_each_zone(zone) {
529 struct per_cpu_pageset *pset;
530
531 /* Do not drain local pagesets */
532 if (zone->zone_pgdat->node_id == numa_node_id())
533 continue;
534
535 pset = zone->pageset[smp_processor_id()];
536 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
537 struct per_cpu_pages *pcp;
538
539 pcp = &pset->pcp[i];
540 if (pcp->count)
541 pcp->count -= free_pages_bulk(zone, pcp->count,
542 &pcp->list, 0);
543 }
544 }
545 local_irq_restore(flags);
546}
547#endif
548
514#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 549#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
515static void __drain_pages(unsigned int cpu) 550static void __drain_pages(unsigned int cpu)
516{ 551{
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu)
520 for_each_zone(zone) { 555 for_each_zone(zone) {
521 struct per_cpu_pageset *pset; 556 struct per_cpu_pageset *pset;
522 557
523 pset = &zone->pageset[cpu]; 558 pset = zone_pcp(zone, cpu);
524 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 559 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
525 struct per_cpu_pages *pcp; 560 struct per_cpu_pages *pcp;
526 561
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
583 618
584 local_irq_save(flags); 619 local_irq_save(flags);
585 cpu = smp_processor_id(); 620 cpu = smp_processor_id();
586 p = &z->pageset[cpu]; 621 p = zone_pcp(z,cpu);
587 if (pg == orig) { 622 if (pg == orig) {
588 z->pageset[cpu].numa_hit++; 623 p->numa_hit++;
589 } else { 624 } else {
590 p->numa_miss++; 625 p->numa_miss++;
591 zonelist->zones[0]->pageset[cpu].numa_foreign++; 626 zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
592 } 627 }
593 if (pg == NODE_DATA(numa_node_id())) 628 if (pg == NODE_DATA(numa_node_id()))
594 p->local_node++; 629 p->local_node++;
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
615 if (PageAnon(page)) 650 if (PageAnon(page))
616 page->mapping = NULL; 651 page->mapping = NULL;
617 free_pages_check(__FUNCTION__, page); 652 free_pages_check(__FUNCTION__, page);
618 pcp = &zone->pageset[get_cpu()].pcp[cold]; 653 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
619 local_irq_save(flags); 654 local_irq_save(flags);
620 if (pcp->count >= pcp->high)
621 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
622 list_add(&page->lru, &pcp->list); 655 list_add(&page->lru, &pcp->list);
623 pcp->count++; 656 pcp->count++;
657 if (pcp->count >= pcp->high)
658 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
624 local_irq_restore(flags); 659 local_irq_restore(flags);
625 put_cpu(); 660 put_cpu();
626} 661}
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
659 if (order == 0) { 694 if (order == 0) {
660 struct per_cpu_pages *pcp; 695 struct per_cpu_pages *pcp;
661 696
662 pcp = &zone->pageset[get_cpu()].pcp[cold]; 697 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
663 local_irq_save(flags); 698 local_irq_save(flags);
664 if (pcp->count <= pcp->low) 699 if (pcp->count <= pcp->low)
665 pcp->count += rmqueue_bulk(zone, 0, 700 pcp->count += rmqueue_bulk(zone, 0,
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
724 return 1; 759 return 1;
725} 760}
726 761
762static inline int
763should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
764{
765 if (!z->reclaim_pages)
766 return 0;
767 if (gfp_mask & __GFP_NORECLAIM)
768 return 0;
769 return 1;
770}
771
727/* 772/*
728 * This is the 'heart' of the zoned buddy allocator. 773 * This is the 'heart' of the zoned buddy allocator.
729 */ 774 */
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
760 805
761 classzone_idx = zone_idx(zones[0]); 806 classzone_idx = zone_idx(zones[0]);
762 807
763 restart: 808restart:
764 /* Go through the zonelist once, looking for a zone with enough free */ 809 /* Go through the zonelist once, looking for a zone with enough free */
765 for (i = 0; (z = zones[i]) != NULL; i++) { 810 for (i = 0; (z = zones[i]) != NULL; i++) {
766 811 int do_reclaim = should_reclaim_zone(z, gfp_mask);
767 if (!zone_watermark_ok(z, order, z->pages_low,
768 classzone_idx, 0, 0))
769 continue;
770 812
771 if (!cpuset_zone_allowed(z)) 813 if (!cpuset_zone_allowed(z))
772 continue; 814 continue;
773 815
816 /*
817 * If the zone is to attempt early page reclaim then this loop
818 * will try to reclaim pages and check the watermark a second
819 * time before giving up and falling back to the next zone.
820 */
821zone_reclaim_retry:
822 if (!zone_watermark_ok(z, order, z->pages_low,
823 classzone_idx, 0, 0)) {
824 if (!do_reclaim)
825 continue;
826 else {
827 zone_reclaim(z, gfp_mask, order);
828 /* Only try reclaim once */
829 do_reclaim = 0;
830 goto zone_reclaim_retry;
831 }
832 }
833
774 page = buffered_rmqueue(z, order, gfp_mask); 834 page = buffered_rmqueue(z, order, gfp_mask);
775 if (page) 835 if (page)
776 goto got_pg; 836 goto got_pg;
@@ -829,7 +889,7 @@ rebalance:
829 reclaim_state.reclaimed_slab = 0; 889 reclaim_state.reclaimed_slab = 0;
830 p->reclaim_state = &reclaim_state; 890 p->reclaim_state = &reclaim_state;
831 891
832 did_some_progress = try_to_free_pages(zones, gfp_mask, order); 892 did_some_progress = try_to_free_pages(zones, gfp_mask);
833 893
834 p->reclaim_state = NULL; 894 p->reclaim_state = NULL;
835 p->flags &= ~PF_MEMALLOC; 895 p->flags &= ~PF_MEMALLOC;
@@ -905,6 +965,7 @@ nopage:
905 " order:%d, mode:0x%x\n", 965 " order:%d, mode:0x%x\n",
906 p->comm, order, gfp_mask); 966 p->comm, order, gfp_mask);
907 dump_stack(); 967 dump_stack();
968 show_mem();
908 } 969 }
909 return NULL; 970 return NULL;
910got_pg: 971got_pg:
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret)
1114 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); 1175 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long));
1115} 1176}
1116 1177
1117unsigned long __read_page_state(unsigned offset) 1178unsigned long __read_page_state(unsigned long offset)
1118{ 1179{
1119 unsigned long ret = 0; 1180 unsigned long ret = 0;
1120 int cpu; 1181 int cpu;
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset)
1128 return ret; 1189 return ret;
1129} 1190}
1130 1191
1131void __mod_page_state(unsigned offset, unsigned long delta) 1192void __mod_page_state(unsigned long offset, unsigned long delta)
1132{ 1193{
1133 unsigned long flags; 1194 unsigned long flags;
1134 void* ptr; 1195 void* ptr;
@@ -1237,22 +1298,23 @@ void show_free_areas(void)
1237 if (!cpu_possible(cpu)) 1298 if (!cpu_possible(cpu))
1238 continue; 1299 continue;
1239 1300
1240 pageset = zone->pageset + cpu; 1301 pageset = zone_pcp(zone, cpu);
1241 1302
1242 for (temperature = 0; temperature < 2; temperature++) 1303 for (temperature = 0; temperature < 2; temperature++)
1243 printk("cpu %d %s: low %d, high %d, batch %d\n", 1304 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
1244 cpu, 1305 cpu,
1245 temperature ? "cold" : "hot", 1306 temperature ? "cold" : "hot",
1246 pageset->pcp[temperature].low, 1307 pageset->pcp[temperature].low,
1247 pageset->pcp[temperature].high, 1308 pageset->pcp[temperature].high,
1248 pageset->pcp[temperature].batch); 1309 pageset->pcp[temperature].batch,
1310 pageset->pcp[temperature].count);
1249 } 1311 }
1250 } 1312 }
1251 1313
1252 get_page_state(&ps); 1314 get_page_state(&ps);
1253 get_zone_counts(&active, &inactive, &free); 1315 get_zone_counts(&active, &inactive, &free);
1254 1316
1255 printk("\nFree pages: %11ukB (%ukB HighMem)\n", 1317 printk("Free pages: %11ukB (%ukB HighMem)\n",
1256 K(nr_free_pages()), 1318 K(nr_free_pages()),
1257 K(nr_free_highpages())); 1319 K(nr_free_highpages()));
1258 1320
@@ -1620,6 +1682,155 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1620 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1682 memmap_init_zone((size), (nid), (zone), (start_pfn))
1621#endif 1683#endif
1622 1684
1685static int __devinit zone_batchsize(struct zone *zone)
1686{
1687 int batch;
1688
1689 /*
1690 * The per-cpu-pages pools are set to around 1000th of the
1691 * size of the zone. But no more than 1/4 of a meg - there's
1692 * no point in going beyond the size of L2 cache.
1693 *
1694 * OK, so we don't know how big the cache is. So guess.
1695 */
1696 batch = zone->present_pages / 1024;
1697 if (batch * PAGE_SIZE > 256 * 1024)
1698 batch = (256 * 1024) / PAGE_SIZE;
1699 batch /= 4; /* We effectively *= 4 below */
1700 if (batch < 1)
1701 batch = 1;
1702
1703 /*
1704 * Clamp the batch to a 2^n - 1 value. Having a power
1705 * of 2 value was found to be more likely to have
1706 * suboptimal cache aliasing properties in some cases.
1707 *
1708 * For example if 2 tasks are alternately allocating
1709 * batches of pages, one task can end up with a lot
1710 * of pages of one half of the possible page colors
1711 * and the other with pages of the other colors.
1712 */
1713 batch = (1 << fls(batch + batch/2)) - 1;
1714 return batch;
1715}
1716
1717inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1718{
1719 struct per_cpu_pages *pcp;
1720
1721 pcp = &p->pcp[0]; /* hot */
1722 pcp->count = 0;
1723 pcp->low = 2 * batch;
1724 pcp->high = 6 * batch;
1725 pcp->batch = max(1UL, 1 * batch);
1726 INIT_LIST_HEAD(&pcp->list);
1727
1728 pcp = &p->pcp[1]; /* cold*/
1729 pcp->count = 0;
1730 pcp->low = 0;
1731 pcp->high = 2 * batch;
1732 pcp->batch = max(1UL, 1 * batch);
1733 INIT_LIST_HEAD(&pcp->list);
1734}
1735
1736#ifdef CONFIG_NUMA
1737/*
1738 * Boot pageset table. One per cpu which is going to be used for all
1739 * zones and all nodes. The parameters will be set in such a way
1740 * that an item put on a list will immediately be handed over to
1741 * the buddy list. This is safe since pageset manipulation is done
1742 * with interrupts disabled.
1743 *
1744 * Some NUMA counter updates may also be caught by the boot pagesets.
1745 * These will be discarded when bootup is complete.
1746 */
1747static struct per_cpu_pageset
1748 boot_pageset[NR_CPUS] __initdata;
1749
1750/*
1751 * Dynamically allocate memory for the
1752 * per cpu pageset array in struct zone.
1753 */
1754static int __devinit process_zones(int cpu)
1755{
1756 struct zone *zone, *dzone;
1757
1758 for_each_zone(zone) {
1759
1760 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
1761 GFP_KERNEL, cpu_to_node(cpu));
1762 if (!zone->pageset[cpu])
1763 goto bad;
1764
1765 setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
1766 }
1767
1768 return 0;
1769bad:
1770 for_each_zone(dzone) {
1771 if (dzone == zone)
1772 break;
1773 kfree(dzone->pageset[cpu]);
1774 dzone->pageset[cpu] = NULL;
1775 }
1776 return -ENOMEM;
1777}
1778
1779static inline void free_zone_pagesets(int cpu)
1780{
1781#ifdef CONFIG_NUMA
1782 struct zone *zone;
1783
1784 for_each_zone(zone) {
1785 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
1786
1787 zone_pcp(zone, cpu) = NULL;
1788 kfree(pset);
1789 }
1790#endif
1791}
1792
1793static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1794 unsigned long action,
1795 void *hcpu)
1796{
1797 int cpu = (long)hcpu;
1798 int ret = NOTIFY_OK;
1799
1800 switch (action) {
1801 case CPU_UP_PREPARE:
1802 if (process_zones(cpu))
1803 ret = NOTIFY_BAD;
1804 break;
1805#ifdef CONFIG_HOTPLUG_CPU
1806 case CPU_DEAD:
1807 free_zone_pagesets(cpu);
1808 break;
1809#endif
1810 default:
1811 break;
1812 }
1813 return ret;
1814}
1815
1816static struct notifier_block pageset_notifier =
1817 { &pageset_cpuup_callback, NULL, 0 };
1818
1819void __init setup_per_cpu_pageset()
1820{
1821 int err;
1822
1823 /* Initialize per_cpu_pageset for cpu 0.
1824 * A cpuup callback will do this for every cpu
1825 * as it comes online
1826 */
1827 err = process_zones(smp_processor_id());
1828 BUG_ON(err);
1829 register_cpu_notifier(&pageset_notifier);
1830}
1831
1832#endif
1833
1623/* 1834/*
1624 * Set up the zone data structures: 1835 * Set up the zone data structures:
1625 * - mark all pages reserved 1836 * - mark all pages reserved
@@ -1662,48 +1873,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1662 1873
1663 zone->temp_priority = zone->prev_priority = DEF_PRIORITY; 1874 zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
1664 1875
1665 /* 1876 batch = zone_batchsize(zone);
1666 * The per-cpu-pages pools are set to around 1000th of the
1667 * size of the zone. But no more than 1/4 of a meg - there's
1668 * no point in going beyond the size of L2 cache.
1669 *
1670 * OK, so we don't know how big the cache is. So guess.
1671 */
1672 batch = zone->present_pages / 1024;
1673 if (batch * PAGE_SIZE > 256 * 1024)
1674 batch = (256 * 1024) / PAGE_SIZE;
1675 batch /= 4; /* We effectively *= 4 below */
1676 if (batch < 1)
1677 batch = 1;
1678
1679 /*
1680 * Clamp the batch to a 2^n - 1 value. Having a power
1681 * of 2 value was found to be more likely to have
1682 * suboptimal cache aliasing properties in some cases.
1683 *
1684 * For example if 2 tasks are alternately allocating
1685 * batches of pages, one task can end up with a lot
1686 * of pages of one half of the possible page colors
1687 * and the other with pages of the other colors.
1688 */
1689 batch = (1 << fls(batch + batch/2)) - 1;
1690 1877
1691 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1878 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1692 struct per_cpu_pages *pcp; 1879#ifdef CONFIG_NUMA
1693 1880 /* Early boot. Slab allocator not functional yet */
1694 pcp = &zone->pageset[cpu].pcp[0]; /* hot */ 1881 zone->pageset[cpu] = &boot_pageset[cpu];
1695 pcp->count = 0; 1882 setup_pageset(&boot_pageset[cpu],0);
1696 pcp->low = 2 * batch; 1883#else
1697 pcp->high = 6 * batch; 1884 setup_pageset(zone_pcp(zone,cpu), batch);
1698 pcp->batch = 1 * batch; 1885#endif
1699 INIT_LIST_HEAD(&pcp->list);
1700
1701 pcp = &zone->pageset[cpu].pcp[1]; /* cold */
1702 pcp->count = 0;
1703 pcp->low = 0;
1704 pcp->high = 2 * batch;
1705 pcp->batch = 1 * batch;
1706 INIT_LIST_HEAD(&pcp->list);
1707 } 1886 }
1708 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 1887 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
1709 zone_names[j], realsize, batch); 1888 zone_names[j], realsize, batch);
@@ -1713,6 +1892,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1713 zone->nr_scan_inactive = 0; 1892 zone->nr_scan_inactive = 0;
1714 zone->nr_active = 0; 1893 zone->nr_active = 0;
1715 zone->nr_inactive = 0; 1894 zone->nr_inactive = 0;
1895 atomic_set(&zone->reclaim_in_progress, -1);
1716 if (!size) 1896 if (!size)
1717 continue; 1897 continue;
1718 1898
@@ -1853,6 +2033,115 @@ struct seq_operations fragmentation_op = {
1853 .show = frag_show, 2033 .show = frag_show,
1854}; 2034};
1855 2035
2036/*
2037 * Output information about zones in @pgdat.
2038 */
2039static int zoneinfo_show(struct seq_file *m, void *arg)
2040{
2041 pg_data_t *pgdat = arg;
2042 struct zone *zone;
2043 struct zone *node_zones = pgdat->node_zones;
2044 unsigned long flags;
2045
2046 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2047 int i;
2048
2049 if (!zone->present_pages)
2050 continue;
2051
2052 spin_lock_irqsave(&zone->lock, flags);
2053 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
2054 seq_printf(m,
2055 "\n pages free %lu"
2056 "\n min %lu"
2057 "\n low %lu"
2058 "\n high %lu"
2059 "\n active %lu"
2060 "\n inactive %lu"
2061 "\n scanned %lu (a: %lu i: %lu)"
2062 "\n spanned %lu"
2063 "\n present %lu",
2064 zone->free_pages,
2065 zone->pages_min,
2066 zone->pages_low,
2067 zone->pages_high,
2068 zone->nr_active,
2069 zone->nr_inactive,
2070 zone->pages_scanned,
2071 zone->nr_scan_active, zone->nr_scan_inactive,
2072 zone->spanned_pages,
2073 zone->present_pages);
2074 seq_printf(m,
2075 "\n protection: (%lu",
2076 zone->lowmem_reserve[0]);
2077 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
2078 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
2079 seq_printf(m,
2080 ")"
2081 "\n pagesets");
2082 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
2083 struct per_cpu_pageset *pageset;
2084 int j;
2085
2086 pageset = zone_pcp(zone, i);
2087 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2088 if (pageset->pcp[j].count)
2089 break;
2090 }
2091 if (j == ARRAY_SIZE(pageset->pcp))
2092 continue;
2093 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
2094 seq_printf(m,
2095 "\n cpu: %i pcp: %i"
2096 "\n count: %i"
2097 "\n low: %i"
2098 "\n high: %i"
2099 "\n batch: %i",
2100 i, j,
2101 pageset->pcp[j].count,
2102 pageset->pcp[j].low,
2103 pageset->pcp[j].high,
2104 pageset->pcp[j].batch);
2105 }
2106#ifdef CONFIG_NUMA
2107 seq_printf(m,
2108 "\n numa_hit: %lu"
2109 "\n numa_miss: %lu"
2110 "\n numa_foreign: %lu"
2111 "\n interleave_hit: %lu"
2112 "\n local_node: %lu"
2113 "\n other_node: %lu",
2114 pageset->numa_hit,
2115 pageset->numa_miss,
2116 pageset->numa_foreign,
2117 pageset->interleave_hit,
2118 pageset->local_node,
2119 pageset->other_node);
2120#endif
2121 }
2122 seq_printf(m,
2123 "\n all_unreclaimable: %u"
2124 "\n prev_priority: %i"
2125 "\n temp_priority: %i"
2126 "\n start_pfn: %lu",
2127 zone->all_unreclaimable,
2128 zone->prev_priority,
2129 zone->temp_priority,
2130 zone->zone_start_pfn);
2131 spin_unlock_irqrestore(&zone->lock, flags);
2132 seq_putc(m, '\n');
2133 }
2134 return 0;
2135}
2136
2137struct seq_operations zoneinfo_op = {
2138 .start = frag_start, /* iterate over all zones. The same as in
2139 * fragmentation. */
2140 .next = frag_next,
2141 .stop = frag_stop,
2142 .show = zoneinfo_show,
2143};
2144
1856static char *vmstat_text[] = { 2145static char *vmstat_text[] = {
1857 "nr_dirty", 2146 "nr_dirty",
1858 "nr_writeback", 2147 "nr_writeback",
@@ -2058,10 +2347,10 @@ static void setup_per_zone_pages_min(void)
2058 min_pages = 128; 2347 min_pages = 128;
2059 zone->pages_min = min_pages; 2348 zone->pages_min = min_pages;
2060 } else { 2349 } else {
2061 /* if it's a lowmem zone, reserve a number of pages 2350 /* if it's a lowmem zone, reserve a number of pages
2062 * proportionate to the zone's size. 2351 * proportionate to the zone's size.
2063 */ 2352 */
2064 zone->pages_min = (pages_min * zone->present_pages) / 2353 zone->pages_min = (pages_min * zone->present_pages) /
2065 lowmem_pages; 2354 lowmem_pages;
2066 } 2355 }
2067 2356
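
zone_batchsize(), factored out above, sizes the per-cpu page lists at roughly 1/1000th of the zone, caps the batch near 256 KB worth of pages, divides by four (setup_pageset() scales the list limits back up by 2x and 6x), and rounds to a 2^n - 1 value to avoid cache-aliasing artifacts. A user-space restatement of that arithmetic for a worked example (4 KB pages assumed; this is an illustration, not the kernel function itself):

#include <stdio.h>

/* fls() as the kernel defines it: position of the most significant set
 * bit counting from 1, or 0 when the argument is 0. */
static int fls(unsigned int x)
{
    int r = 0;

    while (x) {
        x >>= 1;
        r++;
    }
    return r;
}

static int zone_batchsize(unsigned long present_pages, unsigned long page_size)
{
    int batch;

    batch = present_pages / 1024;               /* ~1/1000th of the zone */
    if (batch * page_size > 256 * 1024)         /* no more than 1/4 MB   */
        batch = (256 * 1024) / page_size;
    batch /= 4;                                 /* pcp limits multiply it */
    if (batch < 1)
        batch = 1;
    return (1 << fls(batch + batch / 2)) - 1;   /* clamp to 2^n - 1      */
}

int main(void)
{
    /* 1 GB zone of 4 KB pages: 262144 -> 256 -> 64 -> 16 -> 31 */
    printf("batch = %d\n", zone_batchsize(262144, 4096));
    return 0;
}

For a 1 GB zone this prints batch = 31, so setup_pageset() above would let the hot per-cpu list grow to high = 6 * 31 = 186 pages before it is trimmed back.
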
diff --git a/mm/rmap.c b/mm/rmap.c
index 9827409eb7c7..89770bd25f31 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
539 goto out_unmap; 539 goto out_unmap;
540 } 540 }
541 541
542 /*
543 * Don't pull an anonymous page out from under get_user_pages.
544 * GUP carefully breaks COW and raises page count (while holding
545 * page_table_lock, as we have here) to make sure that the page
546 * cannot be freed. If we unmap that page here, a user write
547 * access to the virtual address will bring back the page, but
548 * its raised count will (ironically) be taken to mean it's not
549 * an exclusive swap page, do_wp_page will replace it by a copy
550 * page, and the user never get to see the data GUP was holding
551 * the original page for.
552 *
553 * This test is also useful for when swapoff (unuse_process) has
554 * to drop page lock: its reference to the page stops existing
555 * ptes from being unmapped, so swapoff can make progress.
556 */
557 if (PageSwapCache(page) &&
558 page_count(page) != page_mapcount(page) + 2) {
559 ret = SWAP_FAIL;
560 goto out_unmap;
561 }
562
563 /* Nuke the page table entry. */ 542 /* Nuke the page table entry. */
564 flush_cache_page(vma, address, page_to_pfn(page)); 543 flush_cache_page(vma, address, page_to_pfn(page));
565 pteval = ptep_clear_flush(vma, address, pte); 544 pteval = ptep_clear_flush(vma, address, pte);
diff --git a/mm/shmem.c b/mm/shmem.c
index 61574b81d979..e64fa726a790 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,8 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2004 Hugh Dickins. 9 * Copyright (C) 2002-2005 Hugh Dickins.
10 * Copyright (C) 2002-2004 VERITAS Software Corporation. 10 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 11 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 12 *
13 * Extended attribute support for tmpfs: 13 * Extended attribute support for tmpfs:
@@ -194,7 +194,7 @@ static DEFINE_SPINLOCK(shmem_swaplist_lock);
194static void shmem_free_blocks(struct inode *inode, long pages) 194static void shmem_free_blocks(struct inode *inode, long pages)
195{ 195{
196 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 196 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
197 if (sbinfo) { 197 if (sbinfo->max_blocks) {
198 spin_lock(&sbinfo->stat_lock); 198 spin_lock(&sbinfo->stat_lock);
199 sbinfo->free_blocks += pages; 199 sbinfo->free_blocks += pages;
200 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 200 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
@@ -357,7 +357,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
357 * page (and perhaps indirect index pages) yet to allocate: 357 * page (and perhaps indirect index pages) yet to allocate:
358 * a waste to allocate index if we cannot allocate data. 358 * a waste to allocate index if we cannot allocate data.
359 */ 359 */
360 if (sbinfo) { 360 if (sbinfo->max_blocks) {
361 spin_lock(&sbinfo->stat_lock); 361 spin_lock(&sbinfo->stat_lock);
362 if (sbinfo->free_blocks <= 1) { 362 if (sbinfo->free_blocks <= 1) {
363 spin_unlock(&sbinfo->stat_lock); 363 spin_unlock(&sbinfo->stat_lock);
@@ -677,8 +677,8 @@ static void shmem_delete_inode(struct inode *inode)
677 spin_unlock(&shmem_swaplist_lock); 677 spin_unlock(&shmem_swaplist_lock);
678 } 678 }
679 } 679 }
680 if (sbinfo) { 680 BUG_ON(inode->i_blocks);
681 BUG_ON(inode->i_blocks); 681 if (sbinfo->max_inodes) {
682 spin_lock(&sbinfo->stat_lock); 682 spin_lock(&sbinfo->stat_lock);
683 sbinfo->free_inodes++; 683 sbinfo->free_inodes++;
684 spin_unlock(&sbinfo->stat_lock); 684 spin_unlock(&sbinfo->stat_lock);
@@ -1080,7 +1080,7 @@ repeat:
1080 } else { 1080 } else {
1081 shmem_swp_unmap(entry); 1081 shmem_swp_unmap(entry);
1082 sbinfo = SHMEM_SB(inode->i_sb); 1082 sbinfo = SHMEM_SB(inode->i_sb);
1083 if (sbinfo) { 1083 if (sbinfo->max_blocks) {
1084 spin_lock(&sbinfo->stat_lock); 1084 spin_lock(&sbinfo->stat_lock);
1085 if (sbinfo->free_blocks == 0 || 1085 if (sbinfo->free_blocks == 0 ||
1086 shmem_acct_block(info->flags)) { 1086 shmem_acct_block(info->flags)) {
@@ -1269,7 +1269,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1269 struct shmem_inode_info *info; 1269 struct shmem_inode_info *info;
1270 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1270 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1271 1271
1272 if (sbinfo) { 1272 if (sbinfo->max_inodes) {
1273 spin_lock(&sbinfo->stat_lock); 1273 spin_lock(&sbinfo->stat_lock);
1274 if (!sbinfo->free_inodes) { 1274 if (!sbinfo->free_inodes) {
1275 spin_unlock(&sbinfo->stat_lock); 1275 spin_unlock(&sbinfo->stat_lock);
@@ -1319,7 +1319,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1319 mpol_shared_policy_init(&info->policy); 1319 mpol_shared_policy_init(&info->policy);
1320 break; 1320 break;
1321 } 1321 }
1322 } else if (sbinfo) { 1322 } else if (sbinfo->max_inodes) {
1323 spin_lock(&sbinfo->stat_lock); 1323 spin_lock(&sbinfo->stat_lock);
1324 sbinfo->free_inodes++; 1324 sbinfo->free_inodes++;
1325 spin_unlock(&sbinfo->stat_lock); 1325 spin_unlock(&sbinfo->stat_lock);
@@ -1328,31 +1328,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1328} 1328}
1329 1329
1330#ifdef CONFIG_TMPFS 1330#ifdef CONFIG_TMPFS
1331
1332static int shmem_set_size(struct shmem_sb_info *sbinfo,
1333 unsigned long max_blocks, unsigned long max_inodes)
1334{
1335 int error;
1336 unsigned long blocks, inodes;
1337
1338 spin_lock(&sbinfo->stat_lock);
1339 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
1340 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
1341 error = -EINVAL;
1342 if (max_blocks < blocks)
1343 goto out;
1344 if (max_inodes < inodes)
1345 goto out;
1346 error = 0;
1347 sbinfo->max_blocks = max_blocks;
1348 sbinfo->free_blocks = max_blocks - blocks;
1349 sbinfo->max_inodes = max_inodes;
1350 sbinfo->free_inodes = max_inodes - inodes;
1351out:
1352 spin_unlock(&sbinfo->stat_lock);
1353 return error;
1354}
1355
1356static struct inode_operations shmem_symlink_inode_operations; 1331static struct inode_operations shmem_symlink_inode_operations;
1357static struct inode_operations shmem_symlink_inline_operations; 1332static struct inode_operations shmem_symlink_inline_operations;
1358 1333
@@ -1607,15 +1582,17 @@ static int shmem_statfs(struct super_block *sb, struct kstatfs *buf)
1607 buf->f_type = TMPFS_MAGIC; 1582 buf->f_type = TMPFS_MAGIC;
1608 buf->f_bsize = PAGE_CACHE_SIZE; 1583 buf->f_bsize = PAGE_CACHE_SIZE;
1609 buf->f_namelen = NAME_MAX; 1584 buf->f_namelen = NAME_MAX;
1610 if (sbinfo) { 1585 spin_lock(&sbinfo->stat_lock);
1611 spin_lock(&sbinfo->stat_lock); 1586 if (sbinfo->max_blocks) {
1612 buf->f_blocks = sbinfo->max_blocks; 1587 buf->f_blocks = sbinfo->max_blocks;
1613 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 1588 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1589 }
1590 if (sbinfo->max_inodes) {
1614 buf->f_files = sbinfo->max_inodes; 1591 buf->f_files = sbinfo->max_inodes;
1615 buf->f_ffree = sbinfo->free_inodes; 1592 buf->f_ffree = sbinfo->free_inodes;
1616 spin_unlock(&sbinfo->stat_lock);
1617 } 1593 }
1618 /* else leave those fields 0 like simple_statfs */ 1594 /* else leave those fields 0 like simple_statfs */
1595 spin_unlock(&sbinfo->stat_lock);
1619 return 0; 1596 return 0;
1620} 1597}
1621 1598
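
The statfs change above widens the stat_lock critical section so the block and inode counters are read under one lock acquisition, giving a single consistent snapshot even when only one of the two limits is set (unlimited fields stay zero, as with simple_statfs). A userspace sketch of that locking shape; the pthread mutex and stats struct are stand-ins for the kernel spinlock and shmem_sb_info, not the real API.

/* One lock acquisition covers both counter pairs, as in the hunk above.
 * Compile with -pthread on older toolchains. */
#include <pthread.h>
#include <stdio.h>

struct stats {
	pthread_mutex_t lock;
	unsigned long max_blocks, free_blocks;
	unsigned long max_inodes, free_inodes;
};

struct report { unsigned long f_blocks, f_bfree, f_files, f_ffree; };

static void fill_report(struct stats *s, struct report *r)
{
	pthread_mutex_lock(&s->lock);
	if (s->max_blocks) {            /* 0 means "unlimited": leave fields 0 */
		r->f_blocks = s->max_blocks;
		r->f_bfree = s->free_blocks;
	}
	if (s->max_inodes) {
		r->f_files = s->max_inodes;
		r->f_ffree = s->free_inodes;
	}
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct stats s = { .lock = PTHREAD_MUTEX_INITIALIZER,
			   .max_blocks = 100, .free_blocks = 60,
			   .max_inodes = 50,  .free_inodes = 45 };
	struct report r = { 0 };

	fill_report(&s, &r);
	printf("blocks %lu/%lu inodes %lu/%lu\n",
	       r.f_bfree, r.f_blocks, r.f_ffree, r.f_files);
	return 0;
}
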
@@ -1672,7 +1649,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1672 * but each new link needs a new dentry, pinning lowmem, and 1649 * but each new link needs a new dentry, pinning lowmem, and
1673 * tmpfs dentries cannot be pruned until they are unlinked. 1650 * tmpfs dentries cannot be pruned until they are unlinked.
1674 */ 1651 */
1675 if (sbinfo) { 1652 if (sbinfo->max_inodes) {
1676 spin_lock(&sbinfo->stat_lock); 1653 spin_lock(&sbinfo->stat_lock);
1677 if (!sbinfo->free_inodes) { 1654 if (!sbinfo->free_inodes) {
1678 spin_unlock(&sbinfo->stat_lock); 1655 spin_unlock(&sbinfo->stat_lock);
@@ -1697,7 +1674,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1697 1674
1698 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { 1675 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) {
1699 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1676 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1700 if (sbinfo) { 1677 if (sbinfo->max_inodes) {
1701 spin_lock(&sbinfo->stat_lock); 1678 spin_lock(&sbinfo->stat_lock);
1702 sbinfo->free_inodes++; 1679 sbinfo->free_inodes++;
1703 spin_unlock(&sbinfo->stat_lock); 1680 spin_unlock(&sbinfo->stat_lock);
@@ -1921,22 +1898,42 @@ bad_val:
1921static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) 1898static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1922{ 1899{
1923 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1900 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1924 unsigned long max_blocks = 0; 1901 unsigned long max_blocks = sbinfo->max_blocks;
1925 unsigned long max_inodes = 0; 1902 unsigned long max_inodes = sbinfo->max_inodes;
1903 unsigned long blocks;
1904 unsigned long inodes;
1905 int error = -EINVAL;
1906
1907 if (shmem_parse_options(data, NULL, NULL, NULL,
1908 &max_blocks, &max_inodes))
1909 return error;
1926 1910
1927 if (sbinfo) { 1911 spin_lock(&sbinfo->stat_lock);
1928 max_blocks = sbinfo->max_blocks; 1912 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
1929 max_inodes = sbinfo->max_inodes; 1913 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
1930 } 1914 if (max_blocks < blocks)
1931 if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes)) 1915 goto out;
1932 return -EINVAL; 1916 if (max_inodes < inodes)
1933 /* Keep it simple: disallow limited <-> unlimited remount */ 1917 goto out;
1934 if ((max_blocks || max_inodes) == !sbinfo) 1918 /*
1935 return -EINVAL; 1919 * Those tests also disallow limited->unlimited while any are in
1936 /* But allow the pointless unlimited -> unlimited remount */ 1920 * use, so i_blocks will always be zero when max_blocks is zero;
1937 if (!sbinfo) 1921 * but we must separately disallow unlimited->limited, because
1938 return 0; 1922 * in that case we have no record of how much is already in use.
1939 return shmem_set_size(sbinfo, max_blocks, max_inodes); 1923 */
1924 if (max_blocks && !sbinfo->max_blocks)
1925 goto out;
1926 if (max_inodes && !sbinfo->max_inodes)
1927 goto out;
1928
1929 error = 0;
1930 sbinfo->max_blocks = max_blocks;
1931 sbinfo->free_blocks = max_blocks - blocks;
1932 sbinfo->max_inodes = max_inodes;
1933 sbinfo->free_inodes = max_inodes - inodes;
1934out:
1935 spin_unlock(&sbinfo->stat_lock);
1936 return error;
1940} 1937}
1941#endif 1938#endif
1942 1939
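
The remount path above inlines what shmem_set_size() used to do and adds the two refusals spelled out in the new comment: a limit may not drop below current usage, and an unlimited (zero) limit may not later become finite, because usage was never tracked. A standalone sketch of those checks follows; the struct and function names are stand-ins, not the kernel's shmem_sb_info or entry points.

/* Model of the remount validation added above: reject shrinking a limit
 * below current usage, and reject unlimited (0) -> limited transitions. */
#include <stdio.h>

struct limits {
	unsigned long max_blocks, free_blocks;
	unsigned long max_inodes, free_inodes;
};

static int remount(struct limits *l, unsigned long new_blocks,
		   unsigned long new_inodes)
{
	unsigned long used_blocks = l->max_blocks - l->free_blocks;
	unsigned long used_inodes = l->max_inodes - l->free_inodes;

	if (new_blocks < used_blocks || new_inodes < used_inodes)
		return -1;              /* would shrink below usage */
	if ((new_blocks && !l->max_blocks) || (new_inodes && !l->max_inodes))
		return -1;              /* unlimited -> limited: no usage record */

	l->max_blocks = new_blocks;
	l->free_blocks = new_blocks - used_blocks;
	l->max_inodes = new_inodes;
	l->free_inodes = new_inodes - used_inodes;
	return 0;
}

int main(void)
{
	struct limits l = { 100, 40, 50, 45 };          /* 60 blocks, 5 inodes in use */

	printf("shrink below usage: %d\n", remount(&l, 50, 50));   /* rejected */
	printf("valid resize:       %d\n", remount(&l, 200, 50));  /* accepted */
	return 0;
}

Note that limited-to-unlimited while anything is in use is already caught by the first test, since nonzero usage always exceeds a new limit of zero; that is what the hunk's comment means by "those tests also disallow limited->unlimited while any are in use".
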
@@ -1961,11 +1958,11 @@ static int shmem_fill_super(struct super_block *sb,
1961 uid_t uid = current->fsuid; 1958 uid_t uid = current->fsuid;
1962 gid_t gid = current->fsgid; 1959 gid_t gid = current->fsgid;
1963 int err = -ENOMEM; 1960 int err = -ENOMEM;
1964 1961 struct shmem_sb_info *sbinfo;
1965#ifdef CONFIG_TMPFS
1966 unsigned long blocks = 0; 1962 unsigned long blocks = 0;
1967 unsigned long inodes = 0; 1963 unsigned long inodes = 0;
1968 1964
1965#ifdef CONFIG_TMPFS
1969 /* 1966 /*
1970 * Per default we only allow half of the physical ram per 1967 * Per default we only allow half of the physical ram per
1971 * tmpfs instance, limiting inodes to one per page of lowmem; 1968 * tmpfs instance, limiting inodes to one per page of lowmem;
@@ -1976,34 +1973,34 @@ static int shmem_fill_super(struct super_block *sb,
1976 inodes = totalram_pages - totalhigh_pages; 1973 inodes = totalram_pages - totalhigh_pages;
1977 if (inodes > blocks) 1974 if (inodes > blocks)
1978 inodes = blocks; 1975 inodes = blocks;
1979 1976 if (shmem_parse_options(data, &mode, &uid, &gid,
1980 if (shmem_parse_options(data, &mode, 1977 &blocks, &inodes))
1981 &uid, &gid, &blocks, &inodes))
1982 return -EINVAL; 1978 return -EINVAL;
1983 } 1979 }
1984
1985 if (blocks || inodes) {
1986 struct shmem_sb_info *sbinfo;
1987 sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL);
1988 if (!sbinfo)
1989 return -ENOMEM;
1990 sb->s_fs_info = sbinfo;
1991 spin_lock_init(&sbinfo->stat_lock);
1992 sbinfo->max_blocks = blocks;
1993 sbinfo->free_blocks = blocks;
1994 sbinfo->max_inodes = inodes;
1995 sbinfo->free_inodes = inodes;
1996 }
1997 sb->s_xattr = shmem_xattr_handlers;
1998#else 1980#else
1999 sb->s_flags |= MS_NOUSER; 1981 sb->s_flags |= MS_NOUSER;
2000#endif 1982#endif
2001 1983
1984 /* Round up to L1_CACHE_BYTES to resist false sharing */
1985 sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
1986 L1_CACHE_BYTES), GFP_KERNEL);
1987 if (!sbinfo)
1988 return -ENOMEM;
1989
1990 spin_lock_init(&sbinfo->stat_lock);
1991 sbinfo->max_blocks = blocks;
1992 sbinfo->free_blocks = blocks;
1993 sbinfo->max_inodes = inodes;
1994 sbinfo->free_inodes = inodes;
1995
1996 sb->s_fs_info = sbinfo;
2002 sb->s_maxbytes = SHMEM_MAX_BYTES; 1997 sb->s_maxbytes = SHMEM_MAX_BYTES;
2003 sb->s_blocksize = PAGE_CACHE_SIZE; 1998 sb->s_blocksize = PAGE_CACHE_SIZE;
2004 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1999 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2005 sb->s_magic = TMPFS_MAGIC; 2000 sb->s_magic = TMPFS_MAGIC;
2006 sb->s_op = &shmem_ops; 2001 sb->s_op = &shmem_ops;
2002 sb->s_xattr = shmem_xattr_handlers;
2003
2007 inode = shmem_get_inode(sb, S_IFDIR | mode, 0); 2004 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
2008 if (!inode) 2005 if (!inode)
2009 goto failed; 2006 goto failed;
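
With this hunk tmpfs always allocates a superblock info structure, padding the request up to a cache line "to resist false sharing"; the earlier if (sbinfo) tests throughout the file therefore become tests of max_blocks / max_inodes, with zero meaning unlimited. A small sketch of the rounding idiom under an assumed 64-byte line size; the kernel uses L1_CACHE_BYTES and kmalloc, and the struct here is a stand-in.

/* Round a small, frequently written allocation up to a cache line so other
 * allocations cannot share the line.  CACHE_LINE is an assumed value. */
#include <stdio.h>
#include <stdlib.h>

#define CACHE_LINE 64

struct sb_limits {
	unsigned long max_blocks, free_blocks;
	unsigned long max_inodes, free_inodes;
};

int main(void)
{
	size_t want = sizeof(struct sb_limits);
	size_t alloc = want > CACHE_LINE ? want : CACHE_LINE;
	struct sb_limits *info = malloc(alloc);

	if (!info)
		return 1;
	printf("struct %zu bytes, allocated %zu\n", want, alloc);
	free(info);
	return 0;
}
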
diff --git a/mm/slab.c b/mm/slab.c
index c78d343b3c5f..93cbbbb39f42 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2851,6 +2851,7 @@ next:
2851 } 2851 }
2852 check_irq_on(); 2852 check_irq_on();
2853 up(&cache_chain_sem); 2853 up(&cache_chain_sem);
2854 drain_remote_pages();
2854 /* Setup the next iteration */ 2855 /* Setup the next iteration */
2855 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); 2856 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
2856} 2857}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index da48405cd9a3..60cd24a55204 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry)
276} 276}
277 277
278/* 278/*
279 * Check if we're the only user of a swap page, 279 * How many references to page are currently swapped out?
280 * when the page is locked.
281 */ 280 */
282static int exclusive_swap_page(struct page *page) 281static inline int page_swapcount(struct page *page)
283{ 282{
284 int retval = 0; 283 int count = 0;
285 struct swap_info_struct * p; 284 struct swap_info_struct *p;
286 swp_entry_t entry; 285 swp_entry_t entry;
287 286
288 entry.val = page->private; 287 entry.val = page->private;
289 p = swap_info_get(entry); 288 p = swap_info_get(entry);
290 if (p) { 289 if (p) {
291 /* Is the only swap cache user the cache itself? */ 290 /* Subtract the 1 for the swap cache itself */
292 if (p->swap_map[swp_offset(entry)] == 1) { 291 count = p->swap_map[swp_offset(entry)] - 1;
293 /* Recheck the page count with the swapcache lock held.. */
294 write_lock_irq(&swapper_space.tree_lock);
295 if (page_count(page) == 2)
296 retval = 1;
297 write_unlock_irq(&swapper_space.tree_lock);
298 }
299 swap_info_put(p); 292 swap_info_put(p);
300 } 293 }
301 return retval; 294 return count;
302} 295}
303 296
304/* 297/*
305 * We can use this swap cache entry directly 298 * We can use this swap cache entry directly
306 * if there are no other references to it. 299 * if there are no other references to it.
307 *
308 * Here "exclusive_swap_page()" does the real
309 * work, but we opportunistically check whether
310 * we need to get all the locks first..
311 */ 300 */
312int can_share_swap_page(struct page *page) 301int can_share_swap_page(struct page *page)
313{ 302{
314 int retval = 0; 303 int count;
315 304
316 if (!PageLocked(page)) 305 BUG_ON(!PageLocked(page));
317 BUG(); 306 count = page_mapcount(page);
318 switch (page_count(page)) { 307 if (count <= 1 && PageSwapCache(page))
319 case 3: 308 count += page_swapcount(page);
320 if (!PagePrivate(page)) 309 return count == 1;
321 break;
322 /* Fallthrough */
323 case 2:
324 if (!PageSwapCache(page))
325 break;
326 retval = exclusive_swap_page(page);
327 break;
328 case 1:
329 if (PageReserved(page))
330 break;
331 retval = 1;
332 }
333 return retval;
334} 310}
335 311
336/* 312/*
@@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm,
529 505
530 if (!down_read_trylock(&mm->mmap_sem)) { 506 if (!down_read_trylock(&mm->mmap_sem)) {
531 /* 507 /*
532 * Our reference to the page stops try_to_unmap_one from 508 * Activate page so shrink_cache is unlikely to unmap its
533 * unmapping its ptes, so swapoff can make progress. 509 * ptes while lock is dropped, so swapoff can make progress.
534 */ 510 */
511 activate_page(page);
535 unlock_page(page); 512 unlock_page(page);
536 down_read(&mm->mmap_sem); 513 down_read(&mm->mmap_sem);
537 lock_page(page); 514 lock_page(page);
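
The swapfile.c changes above replace the old page_count()-based exclusivity test with arithmetic on counts that name their owners: a locked page's swap entry is private to the current mapping when its page-table mapcount, plus its swapped-out count (the swap_map entry minus the swap cache's own slot), equals one. A userspace model of that rule; the types and counters are illustrative, not the kernel's.

/* Model of the new can_share_swap_page() rule: the entry is exclusive when
 * mappings plus swapped-out references add up to exactly one user. */
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	int mapcount;       /* page-table mappings of the in-memory page */
	int swap_map;       /* swap_map[] counter; includes the swap cache itself */
	bool in_swapcache;
};

static bool can_share(const struct fake_page *p)
{
	int count = p->mapcount;

	if (count <= 1 && p->in_swapcache)
		count += p->swap_map - 1;   /* subtract the swap cache's reference */
	return count == 1;
}

int main(void)
{
	struct fake_page exclusive = { .mapcount = 1, .swap_map = 1, .in_swapcache = true };
	struct fake_page shared    = { .mapcount = 1, .swap_map = 2, .in_swapcache = true };

	printf("exclusive: %d\n", can_share(&exclusive));  /* 1 */
	printf("shared:    %d\n", can_share(&shared));     /* 0 */
	return 0;
}
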
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 269eded9b459..4b8e62a19370 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -74,6 +74,9 @@ struct scan_control {
74 74
75 int may_writepage; 75 int may_writepage;
76 76
77 /* Can pages be swapped as part of reclaim? */
78 int may_swap;
79
77 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 80 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
78 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 81 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
79 * In this context, it doesn't matter that we scan the 82 * In this context, it doesn't matter that we scan the
@@ -180,17 +183,20 @@ EXPORT_SYMBOL(remove_shrinker);
180 * `lru_pages' represents the number of on-LRU pages in all the zones which 183 * `lru_pages' represents the number of on-LRU pages in all the zones which
181 * are eligible for the caller's allocation attempt. It is used for balancing 184 * are eligible for the caller's allocation attempt. It is used for balancing
182 * slab reclaim versus page reclaim. 185 * slab reclaim versus page reclaim.
186 *
187 * Returns the number of slab objects which we shrunk.
183 */ 188 */
184static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, 189static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
185 unsigned long lru_pages) 190 unsigned long lru_pages)
186{ 191{
187 struct shrinker *shrinker; 192 struct shrinker *shrinker;
193 int ret = 0;
188 194
189 if (scanned == 0) 195 if (scanned == 0)
190 scanned = SWAP_CLUSTER_MAX; 196 scanned = SWAP_CLUSTER_MAX;
191 197
192 if (!down_read_trylock(&shrinker_rwsem)) 198 if (!down_read_trylock(&shrinker_rwsem))
193 return 0; 199 return 1; /* Assume we'll be able to shrink next time */
194 200
195 list_for_each_entry(shrinker, &shrinker_list, list) { 201 list_for_each_entry(shrinker, &shrinker_list, list) {
196 unsigned long long delta; 202 unsigned long long delta;
@@ -209,10 +215,14 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
209 while (total_scan >= SHRINK_BATCH) { 215 while (total_scan >= SHRINK_BATCH) {
210 long this_scan = SHRINK_BATCH; 216 long this_scan = SHRINK_BATCH;
211 int shrink_ret; 217 int shrink_ret;
218 int nr_before;
212 219
220 nr_before = (*shrinker->shrinker)(0, gfp_mask);
213 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); 221 shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
214 if (shrink_ret == -1) 222 if (shrink_ret == -1)
215 break; 223 break;
224 if (shrink_ret < nr_before)
225 ret += nr_before - shrink_ret;
216 mod_page_state(slabs_scanned, this_scan); 226 mod_page_state(slabs_scanned, this_scan);
217 total_scan -= this_scan; 227 total_scan -= this_scan;
218 228
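
shrink_slab() now samples each cache's object count before and after every batch and returns the total shrinkage, so balance_pgdat() can distinguish "slab is still giving memory back" from "nothing left to reclaim" before declaring a zone all_unreclaimable. A toy model of that accounting; the callback over a plain int stands in for the real ->shrinker interface.

/* Sample the object count before and after each batch and sum the delta,
 * as the hunk above does around (*shrinker->shrinker)(). */
#include <stdio.h>

#define SHRINK_BATCH 128

/* nr_to_scan == 0 means "just report how many objects you hold". */
static int toy_shrinker(int nr_to_scan, int *pool)
{
	if (nr_to_scan) {
		int freed = nr_to_scan / 2;   /* pretend half the scanned objects free */
		*pool = *pool > freed ? *pool - freed : 0;
	}
	return *pool;
}

int main(void)
{
	int pool = 1000, total_scan = 3 * SHRINK_BATCH, shrunk = 0;

	while (total_scan >= SHRINK_BATCH) {
		int nr_before = toy_shrinker(0, &pool);
		int nr_after  = toy_shrinker(SHRINK_BATCH, &pool);

		if (nr_after < nr_before)
			shrunk += nr_before - nr_after;
		total_scan -= SHRINK_BATCH;
	}
	printf("objects shrunk: %d (pool now %d)\n", shrunk, pool);
	return 0;
}
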
@@ -222,7 +232,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
222 shrinker->nr += total_scan; 232 shrinker->nr += total_scan;
223 } 233 }
224 up_read(&shrinker_rwsem); 234 up_read(&shrinker_rwsem);
225 return 0; 235 return ret;
226} 236}
227 237
228/* Called without lock on whether page is mapped, so answer is unstable */ 238/* Called without lock on whether page is mapped, so answer is unstable */
@@ -407,7 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
407 * Anonymous process memory has backing store? 417 * Anonymous process memory has backing store?
408 * Try to allocate it some swap space here. 418 * Try to allocate it some swap space here.
409 */ 419 */
410 if (PageAnon(page) && !PageSwapCache(page)) { 420 if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
411 if (!add_to_swap(page)) 421 if (!add_to_swap(page))
412 goto activate_locked; 422 goto activate_locked;
413 } 423 }
@@ -890,7 +900,9 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
890 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) 900 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
891 continue; /* Let kswapd poll it */ 901 continue; /* Let kswapd poll it */
892 902
903 atomic_inc(&zone->reclaim_in_progress);
893 shrink_zone(zone, sc); 904 shrink_zone(zone, sc);
905 atomic_dec(&zone->reclaim_in_progress);
894 } 906 }
895} 907}
896 908
@@ -907,8 +919,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
907 * holds filesystem locks which prevent writeout this might not work, and the 919 * holds filesystem locks which prevent writeout this might not work, and the
908 * allocation attempt will fail. 920 * allocation attempt will fail.
909 */ 921 */
910int try_to_free_pages(struct zone **zones, 922int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
911 unsigned int gfp_mask, unsigned int order)
912{ 923{
913 int priority; 924 int priority;
914 int ret = 0; 925 int ret = 0;
@@ -920,6 +931,7 @@ int try_to_free_pages(struct zone **zones,
920 931
921 sc.gfp_mask = gfp_mask; 932 sc.gfp_mask = gfp_mask;
922 sc.may_writepage = 0; 933 sc.may_writepage = 0;
934 sc.may_swap = 1;
923 935
924 inc_page_state(allocstall); 936 inc_page_state(allocstall);
925 937
@@ -1020,6 +1032,7 @@ loop_again:
1020 total_reclaimed = 0; 1032 total_reclaimed = 0;
1021 sc.gfp_mask = GFP_KERNEL; 1033 sc.gfp_mask = GFP_KERNEL;
1022 sc.may_writepage = 0; 1034 sc.may_writepage = 0;
1035 sc.may_swap = 1;
1023 sc.nr_mapped = read_page_state(nr_mapped); 1036 sc.nr_mapped = read_page_state(nr_mapped);
1024 1037
1025 inc_page_state(pageoutrun); 1038 inc_page_state(pageoutrun);
@@ -1079,6 +1092,7 @@ scan:
1079 */ 1092 */
1080 for (i = 0; i <= end_zone; i++) { 1093 for (i = 0; i <= end_zone; i++) {
1081 struct zone *zone = pgdat->node_zones + i; 1094 struct zone *zone = pgdat->node_zones + i;
1095 int nr_slab;
1082 1096
1083 if (zone->present_pages == 0) 1097 if (zone->present_pages == 0)
1084 continue; 1098 continue;
@@ -1098,16 +1112,19 @@ scan:
1098 sc.nr_reclaimed = 0; 1112 sc.nr_reclaimed = 0;
1099 sc.priority = priority; 1113 sc.priority = priority;
1100 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; 1114 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1115 atomic_inc(&zone->reclaim_in_progress);
1101 shrink_zone(zone, &sc); 1116 shrink_zone(zone, &sc);
1117 atomic_dec(&zone->reclaim_in_progress);
1102 reclaim_state->reclaimed_slab = 0; 1118 reclaim_state->reclaimed_slab = 0;
1103 shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); 1119 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1120 lru_pages);
1104 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 1121 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1105 total_reclaimed += sc.nr_reclaimed; 1122 total_reclaimed += sc.nr_reclaimed;
1106 total_scanned += sc.nr_scanned; 1123 total_scanned += sc.nr_scanned;
1107 if (zone->all_unreclaimable) 1124 if (zone->all_unreclaimable)
1108 continue; 1125 continue;
1109 if (zone->pages_scanned >= (zone->nr_active + 1126 if (nr_slab == 0 && zone->pages_scanned >=
1110 zone->nr_inactive) * 4) 1127 (zone->nr_active + zone->nr_inactive) * 4)
1111 zone->all_unreclaimable = 1; 1128 zone->all_unreclaimable = 1;
1112 /* 1129 /*
1113 * If we've done a decent amount of scanning and 1130 * If we've done a decent amount of scanning and
@@ -1309,3 +1326,73 @@ static int __init kswapd_init(void)
1309} 1326}
1310 1327
1311module_init(kswapd_init) 1328module_init(kswapd_init)
1329
1330
1331/*
1332 * Try to free up some pages from this zone through reclaim.
1333 */
1334int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
1335{
1336 struct scan_control sc;
1337 int nr_pages = 1 << order;
1338 int total_reclaimed = 0;
1339
1340 /* The reclaim may sleep, so don't do it if sleep isn't allowed */
1341 if (!(gfp_mask & __GFP_WAIT))
1342 return 0;
1343 if (zone->all_unreclaimable)
1344 return 0;
1345
1346 sc.gfp_mask = gfp_mask;
1347 sc.may_writepage = 0;
1348 sc.may_swap = 0;
1349 sc.nr_mapped = read_page_state(nr_mapped);
1350 sc.nr_scanned = 0;
1351 sc.nr_reclaimed = 0;
1352 /* scan at the highest priority */
1353 sc.priority = 0;
1354
1355 if (nr_pages > SWAP_CLUSTER_MAX)
1356 sc.swap_cluster_max = nr_pages;
1357 else
1358 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1359
1360 /* Don't reclaim the zone if there are other reclaimers active */
1361 if (!atomic_inc_and_test(&zone->reclaim_in_progress))
1362 goto out;
1363
1364 shrink_zone(zone, &sc);
1365 total_reclaimed = sc.nr_reclaimed;
1366
1367 out:
1368 atomic_dec(&zone->reclaim_in_progress);
1369 return total_reclaimed;
1370}
1371
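
zone_reclaim() above only runs when the allocation may sleep, scans at the highest priority with may_swap disabled, and gates itself with atomic_inc_and_test() on reclaim_in_progress so only one caller reclaims a given zone at a time; note that both the success path and the contended path fall through to the atomic_dec(). A userspace sketch of that guard using C11 atomics, assuming the counter idles at -1 so the first increment lands on zero; the bias is an assumption of this sketch, not shown in the hunk.

/* "One reclaimer per zone" guard: atomic_inc_and_test() succeeds only when
 * the increment lands on zero.  Userspace model, not the kernel API. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int reclaim_in_progress = -1;   /* assumed idle bias */

static bool try_enter_reclaim(void)
{
	/* previous value + 1 == 0 mirrors atomic_inc_and_test() */
	return atomic_fetch_add(&reclaim_in_progress, 1) + 1 == 0;
}

static void leave_reclaim(void)
{
	atomic_fetch_sub(&reclaim_in_progress, 1);
}

int main(void)
{
	if (try_enter_reclaim())
		printf("reclaiming this zone\n");
	else
		printf("someone else is already reclaiming\n");

	/* both paths undo their increment, as zone_reclaim() does at out: */
	leave_reclaim();
	return 0;
}
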
1372asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1373 unsigned int state)
1374{
1375 struct zone *z;
1376 int i;
1377
1378 if (node >= MAX_NUMNODES || !node_online(node))
1379 return -EINVAL;
1380
1381 /* This will break if we ever add more zones */
1382 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1383 return -EINVAL;
1384
1385 for (i = 0; i < MAX_NR_ZONES; i++) {
1386 if (!(zone & 1<<i))
1387 continue;
1388
1389 z = &NODE_DATA(node)->node_zones[i];
1390
1391 if (state)
1392 z->reclaim_pages = 1;
1393 else
1394 z->reclaim_pages = 0;
1395 }
1396
1397 return 0;
1398}
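
sys_set_zone_reclaim() treats its zone argument as a bitmask indexed by zone number and flips reclaim_pages on every selected zone of the chosen node, which is why the validity test hard-codes the three zones of this era and carries the "will break if we ever add more zones" warning. A small model of that bitmask walk; the flag array stands in for the pgdat's node_zones, and the request value is an example.

/* Walk a zone bitmask and set a per-zone flag, as the syscall above does. */
#include <stdio.h>

enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

static const char *zone_name[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };

int main(void)
{
	int reclaim_pages[MAX_NR_ZONES] = { 0 };
	unsigned int mask = 1 << ZONE_NORMAL | 1 << ZONE_HIGHMEM;  /* example request */

	for (int i = 0; i < MAX_NR_ZONES; i++) {
		if (!(mask & 1u << i))
			continue;
		reclaim_pages[i] = 1;          /* nonzero state enables zone reclaim */
	}
	for (int i = 0; i < MAX_NR_ZONES; i++)
		printf("%-8s reclaim_pages=%d\n", zone_name[i], reclaim_pages[i]);
	return 0;
}
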