Diffstat (limited to 'mm')

 mm/hugetlb.c    | 177
 mm/madvise.c    | 103
 mm/memory.c     |  57
 mm/mempolicy.c  | 110
 mm/mmap.c       |  57
 mm/msync.c      |   2
 mm/nommu.c      |   2
 mm/oom_kill.c   |   7
 mm/page_alloc.c | 423
 mm/rmap.c       |  21
 mm/shmem.c      | 143
 mm/slab.c       |   1
 mm/swapfile.c   |  55
 mm/vmscan.c     | 103

 14 files changed, 918 insertions, 343 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4eb5ae3fbe10..fbd1111ea119 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,10 +7,14 @@ | |||
7 | #include <linux/init.h> | 7 | #include <linux/init.h> |
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/hugetlb.h> | ||
11 | #include <linux/sysctl.h> | 10 | #include <linux/sysctl.h> |
12 | #include <linux/highmem.h> | 11 | #include <linux/highmem.h> |
13 | #include <linux/nodemask.h> | 12 | #include <linux/nodemask.h> |
13 | #include <linux/pagemap.h> | ||
14 | #include <asm/page.h> | ||
15 | #include <asm/pgtable.h> | ||
16 | |||
17 | #include <linux/hugetlb.h> | ||
14 | 18 | ||
15 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 19 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
16 | static unsigned long nr_huge_pages, free_huge_pages; | 20 | static unsigned long nr_huge_pages, free_huge_pages; |
@@ -249,6 +253,72 @@ struct vm_operations_struct hugetlb_vm_ops = { | |||
249 | .nopage = hugetlb_nopage, | 253 | .nopage = hugetlb_nopage, |
250 | }; | 254 | }; |
251 | 255 | ||
256 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) | ||
257 | { | ||
258 | pte_t entry; | ||
259 | |||
260 | if (vma->vm_flags & VM_WRITE) { | ||
261 | entry = | ||
262 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); | ||
263 | } else { | ||
264 | entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); | ||
265 | } | ||
266 | entry = pte_mkyoung(entry); | ||
267 | entry = pte_mkhuge(entry); | ||
268 | |||
269 | return entry; | ||
270 | } | ||
271 | |||
272 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | ||
273 | struct vm_area_struct *vma) | ||
274 | { | ||
275 | pte_t *src_pte, *dst_pte, entry; | ||
276 | struct page *ptepage; | ||
277 | unsigned long addr = vma->vm_start; | ||
278 | unsigned long end = vma->vm_end; | ||
279 | |||
280 | while (addr < end) { | ||
281 | dst_pte = huge_pte_alloc(dst, addr); | ||
282 | if (!dst_pte) | ||
283 | goto nomem; | ||
284 | src_pte = huge_pte_offset(src, addr); | ||
285 | BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */ | ||
286 | entry = *src_pte; | ||
287 | ptepage = pte_page(entry); | ||
288 | get_page(ptepage); | ||
289 | add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE); | ||
290 | set_huge_pte_at(dst, addr, dst_pte, entry); | ||
291 | addr += HPAGE_SIZE; | ||
292 | } | ||
293 | return 0; | ||
294 | |||
295 | nomem: | ||
296 | return -ENOMEM; | ||
297 | } | ||
298 | |||
299 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | ||
300 | unsigned long end) | ||
301 | { | ||
302 | struct mm_struct *mm = vma->vm_mm; | ||
303 | unsigned long address; | ||
304 | pte_t pte; | ||
305 | struct page *page; | ||
306 | |||
307 | WARN_ON(!is_vm_hugetlb_page(vma)); | ||
308 | BUG_ON(start & ~HPAGE_MASK); | ||
309 | BUG_ON(end & ~HPAGE_MASK); | ||
310 | |||
311 | for (address = start; address < end; address += HPAGE_SIZE) { | ||
312 | pte = huge_ptep_get_and_clear(mm, address, huge_pte_offset(mm, address)); | ||
313 | if (pte_none(pte)) | ||
314 | continue; | ||
315 | page = pte_page(pte); | ||
316 | put_page(page); | ||
317 | } | ||
318 | add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT)); | ||
319 | flush_tlb_range(vma, start, end); | ||
320 | } | ||
321 | |||
252 | void zap_hugepage_range(struct vm_area_struct *vma, | 322 | void zap_hugepage_range(struct vm_area_struct *vma, |
253 | unsigned long start, unsigned long length) | 323 | unsigned long start, unsigned long length) |
254 | { | 324 | { |
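A side note on the bookkeeping above: copy_hugetlb_page_range() credits rss in small-page units once per huge mapping, while unmap_hugepage_range() debits the whole range in one go. The snippet below is an illustrative userspace check (not kernel code) that the two conventions agree for an HPAGE_MASK-aligned range with every huge pte populated; the 2MB/4KB sizes are assumptions for an x86-style configuration.

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT   12UL
#define PAGE_SIZE    (1UL << PAGE_SHIFT)
#define HPAGE_SHIFT  21UL
#define HPAGE_SIZE   (1UL << HPAGE_SHIFT)

int main(void)
{
	unsigned long start = 0, end = 4 * HPAGE_SIZE;	/* four huge pages */
	unsigned long addr, credit = 0;

	for (addr = start; addr < end; addr += HPAGE_SIZE)
		credit += HPAGE_SIZE / PAGE_SIZE;	/* copy-side add_mm_counter() */

	/* unmap-side debit: the whole range in small-page units */
	unsigned long debit = (end - start) >> PAGE_SHIFT;

	assert(credit == debit);
	printf("rss delta for the range: %lu small pages\n", credit);
	return 0;
}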
@@ -258,3 +328,108 @@ void zap_hugepage_range(struct vm_area_struct *vma, | |||
258 | unmap_hugepage_range(vma, start, start + length); | 328 | unmap_hugepage_range(vma, start, start + length); |
259 | spin_unlock(&mm->page_table_lock); | 329 | spin_unlock(&mm->page_table_lock); |
260 | } | 330 | } |
331 | |||
332 | int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma) | ||
333 | { | ||
334 | struct mm_struct *mm = current->mm; | ||
335 | unsigned long addr; | ||
336 | int ret = 0; | ||
337 | |||
338 | WARN_ON(!is_vm_hugetlb_page(vma)); | ||
339 | BUG_ON(vma->vm_start & ~HPAGE_MASK); | ||
340 | BUG_ON(vma->vm_end & ~HPAGE_MASK); | ||
341 | |||
342 | hugetlb_prefault_arch_hook(mm); | ||
343 | |||
344 | spin_lock(&mm->page_table_lock); | ||
345 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | ||
346 | unsigned long idx; | ||
347 | pte_t *pte = huge_pte_alloc(mm, addr); | ||
348 | struct page *page; | ||
349 | |||
350 | if (!pte) { | ||
351 | ret = -ENOMEM; | ||
352 | goto out; | ||
353 | } | ||
354 | if (! pte_none(*pte)) | ||
355 | hugetlb_clean_stale_pgtable(pte); | ||
356 | |||
357 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
358 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
359 | page = find_get_page(mapping, idx); | ||
360 | if (!page) { | ||
361 | /* charge the fs quota first */ | ||
362 | if (hugetlb_get_quota(mapping)) { | ||
363 | ret = -ENOMEM; | ||
364 | goto out; | ||
365 | } | ||
366 | page = alloc_huge_page(); | ||
367 | if (!page) { | ||
368 | hugetlb_put_quota(mapping); | ||
369 | ret = -ENOMEM; | ||
370 | goto out; | ||
371 | } | ||
372 | ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); | ||
373 | if (! ret) { | ||
374 | unlock_page(page); | ||
375 | } else { | ||
376 | hugetlb_put_quota(mapping); | ||
377 | free_huge_page(page); | ||
378 | goto out; | ||
379 | } | ||
380 | } | ||
381 | add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE); | ||
382 | set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); | ||
383 | } | ||
384 | out: | ||
385 | spin_unlock(&mm->page_table_lock); | ||
386 | return ret; | ||
387 | } | ||
388 | |||
389 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
390 | struct page **pages, struct vm_area_struct **vmas, | ||
391 | unsigned long *position, int *length, int i) | ||
392 | { | ||
393 | unsigned long vpfn, vaddr = *position; | ||
394 | int remainder = *length; | ||
395 | |||
396 | BUG_ON(!is_vm_hugetlb_page(vma)); | ||
397 | |||
398 | vpfn = vaddr/PAGE_SIZE; | ||
399 | while (vaddr < vma->vm_end && remainder) { | ||
400 | |||
401 | if (pages) { | ||
402 | pte_t *pte; | ||
403 | struct page *page; | ||
404 | |||
405 | /* Some archs (sparc64, sh*) have multiple | ||
406 | * pte_ts to each hugepage. We have to make | ||
407 | * sure we get the first, for the page | ||
408 | * indexing below to work. */ | ||
409 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); | ||
410 | |||
411 | /* hugetlb should be locked, and hence, prefaulted */ | ||
412 | WARN_ON(!pte || pte_none(*pte)); | ||
413 | |||
414 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | ||
415 | |||
416 | WARN_ON(!PageCompound(page)); | ||
417 | |||
418 | get_page(page); | ||
419 | pages[i] = page; | ||
420 | } | ||
421 | |||
422 | if (vmas) | ||
423 | vmas[i] = vma; | ||
424 | |||
425 | vaddr += PAGE_SIZE; | ||
426 | ++vpfn; | ||
427 | --remainder; | ||
428 | ++i; | ||
429 | } | ||
430 | |||
431 | *length = remainder; | ||
432 | *position = vaddr; | ||
433 | |||
434 | return i; | ||
435 | } | ||
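The index computation in hugetlb_prefault() is easy to misread: huge pages sit in the mapping's page cache at HPAGE_SIZE granularity, so vm_pgoff (kept in small-page units) has to be rescaled before it can be added to the huge-page offset within the vma. A small standalone rerun of that arithmetic, with 2MB/4KB sizes assumed for illustration:

#include <stdio.h>

#define PAGE_SHIFT   12UL
#define HPAGE_SHIFT  21UL

/* mirrors the idx calculation in hugetlb_prefault() */
static unsigned long huge_idx(unsigned long addr, unsigned long vm_start,
			      unsigned long vm_pgoff)
{
	return ((addr - vm_start) >> HPAGE_SHIFT) +
	       (vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
}

int main(void)
{
	/* vma mapped at 0x40000000, file offset 0, third huge page faulting */
	unsigned long addr = 0x40000000UL + 2 * (1UL << HPAGE_SHIFT);

	printf("page cache idx = %lu\n", huge_idx(addr, 0x40000000UL, 0));
	return 0;
}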
diff --git a/mm/madvise.c b/mm/madvise.c
index 944b5e52d812..e3108054733c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -8,17 +8,47 @@ | |||
8 | #include <linux/mman.h> | 8 | #include <linux/mman.h> |
9 | #include <linux/pagemap.h> | 9 | #include <linux/pagemap.h> |
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/mempolicy.h> | ||
11 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
12 | 13 | ||
13 | /* | 14 | /* |
14 | * We can potentially split a vm area into separate | 15 | * We can potentially split a vm area into separate |
15 | * areas, each area with its own behavior. | 16 | * areas, each area with its own behavior. |
16 | */ | 17 | */ |
17 | static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, | 18 | static long madvise_behavior(struct vm_area_struct * vma, |
18 | unsigned long end, int behavior) | 19 | struct vm_area_struct **prev, |
20 | unsigned long start, unsigned long end, int behavior) | ||
19 | { | 21 | { |
20 | struct mm_struct * mm = vma->vm_mm; | 22 | struct mm_struct * mm = vma->vm_mm; |
21 | int error = 0; | 23 | int error = 0; |
24 | pgoff_t pgoff; | ||
25 | int new_flags = vma->vm_flags & ~VM_READHINTMASK; | ||
26 | |||
27 | switch (behavior) { | ||
28 | case MADV_SEQUENTIAL: | ||
29 | new_flags |= VM_SEQ_READ; | ||
30 | break; | ||
31 | case MADV_RANDOM: | ||
32 | new_flags |= VM_RAND_READ; | ||
33 | break; | ||
34 | default: | ||
35 | break; | ||
36 | } | ||
37 | |||
38 | if (new_flags == vma->vm_flags) { | ||
39 | *prev = vma; | ||
40 | goto success; | ||
41 | } | ||
42 | |||
43 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | ||
44 | *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, | ||
45 | vma->vm_file, pgoff, vma_policy(vma)); | ||
46 | if (*prev) { | ||
47 | vma = *prev; | ||
48 | goto success; | ||
49 | } | ||
50 | |||
51 | *prev = vma; | ||
22 | 52 | ||
23 | if (start != vma->vm_start) { | 53 | if (start != vma->vm_start) { |
24 | error = split_vma(mm, vma, start, 1); | 54 | error = split_vma(mm, vma, start, 1); |
@@ -36,21 +66,12 @@ static long madvise_behavior(struct vm_area_struct * vma, unsigned long start, | |||
36 | * vm_flags is protected by the mmap_sem held in write mode. | 66 | * vm_flags is protected by the mmap_sem held in write mode. |
37 | */ | 67 | */ |
38 | VM_ClearReadHint(vma); | 68 | VM_ClearReadHint(vma); |
39 | 69 | vma->vm_flags = new_flags; | |
40 | switch (behavior) { | ||
41 | case MADV_SEQUENTIAL: | ||
42 | vma->vm_flags |= VM_SEQ_READ; | ||
43 | break; | ||
44 | case MADV_RANDOM: | ||
45 | vma->vm_flags |= VM_RAND_READ; | ||
46 | break; | ||
47 | default: | ||
48 | break; | ||
49 | } | ||
50 | 70 | ||
51 | out: | 71 | out: |
52 | if (error == -ENOMEM) | 72 | if (error == -ENOMEM) |
53 | error = -EAGAIN; | 73 | error = -EAGAIN; |
74 | success: | ||
54 | return error; | 75 | return error; |
55 | } | 76 | } |
56 | 77 | ||
@@ -58,6 +79,7 @@ out: | |||
58 | * Schedule all required I/O operations. Do not wait for completion. | 79 | * Schedule all required I/O operations. Do not wait for completion. |
59 | */ | 80 | */ |
60 | static long madvise_willneed(struct vm_area_struct * vma, | 81 | static long madvise_willneed(struct vm_area_struct * vma, |
82 | struct vm_area_struct ** prev, | ||
61 | unsigned long start, unsigned long end) | 83 | unsigned long start, unsigned long end) |
62 | { | 84 | { |
63 | struct file *file = vma->vm_file; | 85 | struct file *file = vma->vm_file; |
@@ -65,6 +87,7 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
65 | if (!file) | 87 | if (!file) |
66 | return -EBADF; | 88 | return -EBADF; |
67 | 89 | ||
90 | *prev = vma; | ||
68 | start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 91 | start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
69 | if (end > vma->vm_end) | 92 | if (end > vma->vm_end) |
70 | end = vma->vm_end; | 93 | end = vma->vm_end; |
@@ -95,8 +118,10 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
95 | * dirty pages is already available as msync(MS_INVALIDATE). | 118 | * dirty pages is already available as msync(MS_INVALIDATE). |
96 | */ | 119 | */ |
97 | static long madvise_dontneed(struct vm_area_struct * vma, | 120 | static long madvise_dontneed(struct vm_area_struct * vma, |
121 | struct vm_area_struct ** prev, | ||
98 | unsigned long start, unsigned long end) | 122 | unsigned long start, unsigned long end) |
99 | { | 123 | { |
124 | *prev = vma; | ||
100 | if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) | 125 | if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) |
101 | return -EINVAL; | 126 | return -EINVAL; |
102 | 127 | ||
@@ -111,8 +136,8 @@ static long madvise_dontneed(struct vm_area_struct * vma, | |||
111 | return 0; | 136 | return 0; |
112 | } | 137 | } |
113 | 138 | ||
114 | static long madvise_vma(struct vm_area_struct * vma, unsigned long start, | 139 | static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, |
115 | unsigned long end, int behavior) | 140 | unsigned long start, unsigned long end, int behavior) |
116 | { | 141 | { |
117 | long error = -EBADF; | 142 | long error = -EBADF; |
118 | 143 | ||
@@ -120,15 +145,15 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start, | |||
120 | case MADV_NORMAL: | 145 | case MADV_NORMAL: |
121 | case MADV_SEQUENTIAL: | 146 | case MADV_SEQUENTIAL: |
122 | case MADV_RANDOM: | 147 | case MADV_RANDOM: |
123 | error = madvise_behavior(vma, start, end, behavior); | 148 | error = madvise_behavior(vma, prev, start, end, behavior); |
124 | break; | 149 | break; |
125 | 150 | ||
126 | case MADV_WILLNEED: | 151 | case MADV_WILLNEED: |
127 | error = madvise_willneed(vma, start, end); | 152 | error = madvise_willneed(vma, prev, start, end); |
128 | break; | 153 | break; |
129 | 154 | ||
130 | case MADV_DONTNEED: | 155 | case MADV_DONTNEED: |
131 | error = madvise_dontneed(vma, start, end); | 156 | error = madvise_dontneed(vma, prev, start, end); |
132 | break; | 157 | break; |
133 | 158 | ||
134 | default: | 159 | default: |
@@ -175,8 +200,8 @@ static long madvise_vma(struct vm_area_struct * vma, unsigned long start, | |||
175 | */ | 200 | */ |
176 | asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) | 201 | asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) |
177 | { | 202 | { |
178 | unsigned long end; | 203 | unsigned long end, tmp; |
179 | struct vm_area_struct * vma; | 204 | struct vm_area_struct * vma, *prev; |
180 | int unmapped_error = 0; | 205 | int unmapped_error = 0; |
181 | int error = -EINVAL; | 206 | int error = -EINVAL; |
182 | size_t len; | 207 | size_t len; |
@@ -202,40 +227,42 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) | |||
202 | /* | 227 | /* |
203 | * If the interval [start,end) covers some unmapped address | 228 | * If the interval [start,end) covers some unmapped address |
204 | * ranges, just ignore them, but return -ENOMEM at the end. | 229 | * ranges, just ignore them, but return -ENOMEM at the end. |
230 | * - different from the way of handling in mlock etc. | ||
205 | */ | 231 | */ |
206 | vma = find_vma(current->mm, start); | 232 | vma = find_vma_prev(current->mm, start, &prev); |
233 | if (!vma && prev) | ||
234 | vma = prev->vm_next; | ||
207 | for (;;) { | 235 | for (;;) { |
208 | /* Still start < end. */ | 236 | /* Still start < end. */ |
209 | error = -ENOMEM; | 237 | error = -ENOMEM; |
210 | if (!vma) | 238 | if (!vma) |
211 | goto out; | 239 | goto out; |
212 | 240 | ||
213 | /* Here start < vma->vm_end. */ | 241 | /* Here start < (end|vma->vm_end). */ |
214 | if (start < vma->vm_start) { | 242 | if (start < vma->vm_start) { |
215 | unmapped_error = -ENOMEM; | 243 | unmapped_error = -ENOMEM; |
216 | start = vma->vm_start; | 244 | start = vma->vm_start; |
245 | if (start >= end) | ||
246 | goto out; | ||
217 | } | 247 | } |
218 | 248 | ||
219 | /* Here vma->vm_start <= start < vma->vm_end. */ | 249 | /* Here vma->vm_start <= start < (end|vma->vm_end) */ |
220 | if (end <= vma->vm_end) { | 250 | tmp = vma->vm_end; |
221 | if (start < end) { | 251 | if (end < tmp) |
222 | error = madvise_vma(vma, start, end, | 252 | tmp = end; |
223 | behavior); | ||
224 | if (error) | ||
225 | goto out; | ||
226 | } | ||
227 | error = unmapped_error; | ||
228 | goto out; | ||
229 | } | ||
230 | 253 | ||
231 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | 254 | /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ |
232 | error = madvise_vma(vma, start, vma->vm_end, behavior); | 255 | error = madvise_vma(vma, &prev, start, tmp, behavior); |
233 | if (error) | 256 | if (error) |
234 | goto out; | 257 | goto out; |
235 | start = vma->vm_end; | 258 | start = tmp; |
236 | vma = vma->vm_next; | 259 | if (start < prev->vm_end) |
260 | start = prev->vm_end; | ||
261 | error = unmapped_error; | ||
262 | if (start >= end) | ||
263 | goto out; | ||
264 | vma = prev->vm_next; | ||
237 | } | 265 | } |
238 | |||
239 | out: | 266 | out: |
240 | up_write(¤t->mm->mmap_sem); | 267 | up_write(¤t->mm->mmap_sem); |
241 | return error; | 268 | return error; |
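The reworked sys_madvise() loop is the subtle part of this file: each pass is clamped to min(end, vma->vm_end), the per-vma handler reports through *prev whichever vma it ended up on (vma_merge() or split_vma() may have changed the layout), and the walk resumes from prev->vm_next. Below is a hedged userspace sketch of that shape, with apply_advice() as a hypothetical stand-in for madvise_vma():

#include <errno.h>
#include <stddef.h>

struct vma { unsigned long vm_start, vm_end; struct vma *vm_next; };

/* hypothetical stand-in for madvise_vma(); may point *prev at a merged vma */
static long apply_advice(struct vma *vma, struct vma **prev,
			 unsigned long start, unsigned long end, int behavior)
{
	(void)start; (void)end; (void)behavior;
	*prev = vma;
	return 0;
}

long walk_range(struct vma *vma, struct vma *prev,
		unsigned long start, unsigned long end, int behavior)
{
	long unmapped_error = 0, error;
	unsigned long tmp;

	for (;;) {
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {		/* unmapped gap */
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				return unmapped_error;
		}
		tmp = end < vma->vm_end ? end : vma->vm_end;
		error = apply_advice(vma, &prev, start, tmp, behavior);
		if (error)
			return error;
		start = tmp;
		if (start < prev->vm_end)		/* prev may have grown */
			start = prev->vm_end;
		if (start >= end)
			return unmapped_error;
		vma = prev->vm_next;
	}
}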
diff --git a/mm/memory.c b/mm/memory.c
index d209f745db7f..da91b7bf9986 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -840,23 +840,8 @@ check_user_page_readable(struct mm_struct *mm, unsigned long address) | |||
840 | { | 840 | { |
841 | return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; | 841 | return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; |
842 | } | 842 | } |
843 | |||
844 | EXPORT_SYMBOL(check_user_page_readable); | 843 | EXPORT_SYMBOL(check_user_page_readable); |
845 | 844 | ||
846 | /* | ||
847 | * Given a physical address, is there a useful struct page pointing to | ||
848 | * it? This may become more complex in the future if we start dealing | ||
849 | * with IO-aperture pages for direct-IO. | ||
850 | */ | ||
851 | |||
852 | static inline struct page *get_page_map(struct page *page) | ||
853 | { | ||
854 | if (!pfn_valid(page_to_pfn(page))) | ||
855 | return NULL; | ||
856 | return page; | ||
857 | } | ||
858 | |||
859 | |||
860 | static inline int | 845 | static inline int |
861 | untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, | 846 | untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, |
862 | unsigned long address) | 847 | unsigned long address) |
@@ -887,7 +872,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, | |||
887 | return 0; | 872 | return 0; |
888 | } | 873 | } |
889 | 874 | ||
890 | |||
891 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 875 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
892 | unsigned long start, int len, int write, int force, | 876 | unsigned long start, int len, int write, int force, |
893 | struct page **pages, struct vm_area_struct **vmas) | 877 | struct page **pages, struct vm_area_struct **vmas) |
@@ -951,21 +935,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
951 | } | 935 | } |
952 | spin_lock(&mm->page_table_lock); | 936 | spin_lock(&mm->page_table_lock); |
953 | do { | 937 | do { |
954 | struct page *map; | 938 | struct page *page; |
955 | int lookup_write = write; | 939 | int lookup_write = write; |
956 | 940 | ||
957 | cond_resched_lock(&mm->page_table_lock); | 941 | cond_resched_lock(&mm->page_table_lock); |
958 | while (!(map = follow_page(mm, start, lookup_write))) { | 942 | while (!(page = follow_page(mm, start, lookup_write))) { |
959 | /* | 943 | /* |
960 | * Shortcut for anonymous pages. We don't want | 944 | * Shortcut for anonymous pages. We don't want |
961 | * to force the creation of pages tables for | 945 | * to force the creation of pages tables for |
962 | * insanly big anonymously mapped areas that | 946 | * insanely big anonymously mapped areas that |
963 | * nobody touched so far. This is important | 947 | * nobody touched so far. This is important |
964 | * for doing a core dump for these mappings. | 948 | * for doing a core dump for these mappings. |
965 | */ | 949 | */ |
966 | if (!lookup_write && | 950 | if (!lookup_write && |
967 | untouched_anonymous_page(mm,vma,start)) { | 951 | untouched_anonymous_page(mm,vma,start)) { |
968 | map = ZERO_PAGE(start); | 952 | page = ZERO_PAGE(start); |
969 | break; | 953 | break; |
970 | } | 954 | } |
971 | spin_unlock(&mm->page_table_lock); | 955 | spin_unlock(&mm->page_table_lock); |
@@ -994,30 +978,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
994 | spin_lock(&mm->page_table_lock); | 978 | spin_lock(&mm->page_table_lock); |
995 | } | 979 | } |
996 | if (pages) { | 980 | if (pages) { |
997 | pages[i] = get_page_map(map); | 981 | pages[i] = page; |
998 | if (!pages[i]) { | 982 | flush_dcache_page(page); |
999 | spin_unlock(&mm->page_table_lock); | 983 | if (!PageReserved(page)) |
1000 | while (i--) | 984 | page_cache_get(page); |
1001 | page_cache_release(pages[i]); | ||
1002 | i = -EFAULT; | ||
1003 | goto out; | ||
1004 | } | ||
1005 | flush_dcache_page(pages[i]); | ||
1006 | if (!PageReserved(pages[i])) | ||
1007 | page_cache_get(pages[i]); | ||
1008 | } | 985 | } |
1009 | if (vmas) | 986 | if (vmas) |
1010 | vmas[i] = vma; | 987 | vmas[i] = vma; |
1011 | i++; | 988 | i++; |
1012 | start += PAGE_SIZE; | 989 | start += PAGE_SIZE; |
1013 | len--; | 990 | len--; |
1014 | } while(len && start < vma->vm_end); | 991 | } while (len && start < vma->vm_end); |
1015 | spin_unlock(&mm->page_table_lock); | 992 | spin_unlock(&mm->page_table_lock); |
1016 | } while(len); | 993 | } while (len); |
1017 | out: | ||
1018 | return i; | 994 | return i; |
1019 | } | 995 | } |
1020 | |||
1021 | EXPORT_SYMBOL(get_user_pages); | 996 | EXPORT_SYMBOL(get_user_pages); |
1022 | 997 | ||
1023 | static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | 998 | static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, |
@@ -1264,7 +1239,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, | |||
1264 | } | 1239 | } |
1265 | old_page = pfn_to_page(pfn); | 1240 | old_page = pfn_to_page(pfn); |
1266 | 1241 | ||
1267 | if (!TestSetPageLocked(old_page)) { | 1242 | if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { |
1268 | int reuse = can_share_swap_page(old_page); | 1243 | int reuse = can_share_swap_page(old_page); |
1269 | unlock_page(old_page); | 1244 | unlock_page(old_page); |
1270 | if (reuse) { | 1245 | if (reuse) { |
@@ -1711,10 +1686,6 @@ static int do_swap_page(struct mm_struct * mm, | |||
1711 | } | 1686 | } |
1712 | 1687 | ||
1713 | /* The page isn't present yet, go ahead with the fault. */ | 1688 | /* The page isn't present yet, go ahead with the fault. */ |
1714 | |||
1715 | swap_free(entry); | ||
1716 | if (vm_swap_full()) | ||
1717 | remove_exclusive_swap_page(page); | ||
1718 | 1689 | ||
1719 | inc_mm_counter(mm, rss); | 1690 | inc_mm_counter(mm, rss); |
1720 | pte = mk_pte(page, vma->vm_page_prot); | 1691 | pte = mk_pte(page, vma->vm_page_prot); |
@@ -1722,12 +1693,16 @@ static int do_swap_page(struct mm_struct * mm, | |||
1722 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 1693 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
1723 | write_access = 0; | 1694 | write_access = 0; |
1724 | } | 1695 | } |
1725 | unlock_page(page); | ||
1726 | 1696 | ||
1727 | flush_icache_page(vma, page); | 1697 | flush_icache_page(vma, page); |
1728 | set_pte_at(mm, address, page_table, pte); | 1698 | set_pte_at(mm, address, page_table, pte); |
1729 | page_add_anon_rmap(page, vma, address); | 1699 | page_add_anon_rmap(page, vma, address); |
1730 | 1700 | ||
1701 | swap_free(entry); | ||
1702 | if (vm_swap_full()) | ||
1703 | remove_exclusive_swap_page(page); | ||
1704 | unlock_page(page); | ||
1705 | |||
1731 | if (write_access) { | 1706 | if (write_access) { |
1732 | if (do_wp_page(mm, vma, address, | 1707 | if (do_wp_page(mm, vma, address, |
1733 | page_table, pmd, pte) == VM_FAULT_OOM) | 1708 | page_table, pmd, pte) == VM_FAULT_OOM) |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 08c41da429cf..cb41c31e7c87 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -238,46 +238,80 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes) | |||
238 | } | 238 | } |
239 | 239 | ||
240 | /* Ensure all existing pages follow the policy. */ | 240 | /* Ensure all existing pages follow the policy. */ |
241 | static int | 241 | static int check_pte_range(struct mm_struct *mm, pmd_t *pmd, |
242 | verify_pages(struct mm_struct *mm, | 242 | unsigned long addr, unsigned long end, unsigned long *nodes) |
243 | unsigned long addr, unsigned long end, unsigned long *nodes) | ||
244 | { | 243 | { |
245 | while (addr < end) { | 244 | pte_t *orig_pte; |
246 | struct page *p; | 245 | pte_t *pte; |
247 | pte_t *pte; | 246 | |
248 | pmd_t *pmd; | 247 | spin_lock(&mm->page_table_lock); |
249 | pud_t *pud; | 248 | orig_pte = pte = pte_offset_map(pmd, addr); |
250 | pgd_t *pgd; | 249 | do { |
251 | pgd = pgd_offset(mm, addr); | 250 | unsigned long pfn; |
252 | if (pgd_none(*pgd)) { | 251 | unsigned int nid; |
253 | unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK; | 252 | |
254 | if (next > addr) | 253 | if (!pte_present(*pte)) |
255 | break; | ||
256 | addr = next; | ||
257 | continue; | 254 | continue; |
258 | } | 255 | pfn = pte_pfn(*pte); |
259 | pud = pud_offset(pgd, addr); | 256 | if (!pfn_valid(pfn)) |
260 | if (pud_none(*pud)) { | ||
261 | addr = (addr + PUD_SIZE) & PUD_MASK; | ||
262 | continue; | 257 | continue; |
263 | } | 258 | nid = pfn_to_nid(pfn); |
264 | pmd = pmd_offset(pud, addr); | 259 | if (!test_bit(nid, nodes)) |
265 | if (pmd_none(*pmd)) { | 260 | break; |
266 | addr = (addr + PMD_SIZE) & PMD_MASK; | 261 | } while (pte++, addr += PAGE_SIZE, addr != end); |
262 | pte_unmap(orig_pte); | ||
263 | spin_unlock(&mm->page_table_lock); | ||
264 | return addr != end; | ||
265 | } | ||
266 | |||
267 | static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud, | ||
268 | unsigned long addr, unsigned long end, unsigned long *nodes) | ||
269 | { | ||
270 | pmd_t *pmd; | ||
271 | unsigned long next; | ||
272 | |||
273 | pmd = pmd_offset(pud, addr); | ||
274 | do { | ||
275 | next = pmd_addr_end(addr, end); | ||
276 | if (pmd_none_or_clear_bad(pmd)) | ||
267 | continue; | 277 | continue; |
268 | } | 278 | if (check_pte_range(mm, pmd, addr, next, nodes)) |
269 | p = NULL; | 279 | return -EIO; |
270 | pte = pte_offset_map(pmd, addr); | 280 | } while (pmd++, addr = next, addr != end); |
271 | if (pte_present(*pte)) | 281 | return 0; |
272 | p = pte_page(*pte); | 282 | } |
273 | pte_unmap(pte); | 283 | |
274 | if (p) { | 284 | static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd, |
275 | unsigned nid = page_to_nid(p); | 285 | unsigned long addr, unsigned long end, unsigned long *nodes) |
276 | if (!test_bit(nid, nodes)) | 286 | { |
277 | return -EIO; | 287 | pud_t *pud; |
278 | } | 288 | unsigned long next; |
279 | addr += PAGE_SIZE; | 289 | |
280 | } | 290 | pud = pud_offset(pgd, addr); |
291 | do { | ||
292 | next = pud_addr_end(addr, end); | ||
293 | if (pud_none_or_clear_bad(pud)) | ||
294 | continue; | ||
295 | if (check_pmd_range(mm, pud, addr, next, nodes)) | ||
296 | return -EIO; | ||
297 | } while (pud++, addr = next, addr != end); | ||
298 | return 0; | ||
299 | } | ||
300 | |||
301 | static inline int check_pgd_range(struct mm_struct *mm, | ||
302 | unsigned long addr, unsigned long end, unsigned long *nodes) | ||
303 | { | ||
304 | pgd_t *pgd; | ||
305 | unsigned long next; | ||
306 | |||
307 | pgd = pgd_offset(mm, addr); | ||
308 | do { | ||
309 | next = pgd_addr_end(addr, end); | ||
310 | if (pgd_none_or_clear_bad(pgd)) | ||
311 | continue; | ||
312 | if (check_pud_range(mm, pgd, addr, next, nodes)) | ||
313 | return -EIO; | ||
314 | } while (pgd++, addr = next, addr != end); | ||
281 | return 0; | 315 | return 0; |
282 | } | 316 | } |
283 | 317 | ||
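The check_pte/pmd/pud/pgd_range() helpers replace the old flat verify_pages() walk with the standard four-level pattern: each level bounds its sub-walk with p?d_addr_end(), so a lower level never steps across an upper-level entry. The clamping is the only non-obvious piece; here is a minimal userspace rerun of it (the 2MB PMD size is an illustrative assumption):

#include <stdio.h>

#define PMD_SHIFT 21UL
#define PMD_SIZE  (1UL << PMD_SHIFT)
#define PMD_MASK  (~(PMD_SIZE - 1))

/* same clamping idea as the kernel's pmd_addr_end(addr, end) */
static unsigned long pmd_addr_end_demo(unsigned long addr, unsigned long end)
{
	unsigned long boundary = (addr + PMD_SIZE) & PMD_MASK;

	return (boundary - 1 < end - 1) ? boundary : end;
}

int main(void)
{
	unsigned long addr = 0x1ff000, end = 0x600000, next;

	do {
		next = pmd_addr_end_demo(addr, end);
		printf("pmd-level step covers [%#lx, %#lx)\n", addr, next);
	} while (addr = next, addr != end);
	return 0;
}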
@@ -299,7 +333,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
299 | if (prev && prev->vm_end < vma->vm_start) | 333 | if (prev && prev->vm_end < vma->vm_start) |
300 | return ERR_PTR(-EFAULT); | 334 | return ERR_PTR(-EFAULT); |
301 | if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { | 335 | if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { |
302 | err = verify_pages(vma->vm_mm, | 336 | err = check_pgd_range(vma->vm_mm, |
303 | vma->vm_start, vma->vm_end, nodes); | 337 | vma->vm_start, vma->vm_end, nodes); |
304 | if (err) { | 338 | if (err) { |
305 | first = ERR_PTR(err); | 339 | first = ERR_PTR(err); |
@@ -721,7 +755,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or | |||
721 | zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); | 755 | zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK); |
722 | page = __alloc_pages(gfp, order, zl); | 756 | page = __alloc_pages(gfp, order, zl); |
723 | if (page && page_zone(page) == zl->zones[0]) { | 757 | if (page && page_zone(page) == zl->zones[0]) { |
724 | zl->zones[0]->pageset[get_cpu()].interleave_hit++; | 758 | zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; |
725 | put_cpu(); | 759 | put_cpu(); |
726 | } | 760 | } |
727 | return page; | 761 | return page; |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1175,7 +1175,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1175 | (!vma || addr + len <= vma->vm_start)) | 1175 | (!vma || addr + len <= vma->vm_start)) |
1176 | return addr; | 1176 | return addr; |
1177 | } | 1177 | } |
1178 | start_addr = addr = mm->free_area_cache; | 1178 | if (len > mm->cached_hole_size) { |
1179 | start_addr = addr = mm->free_area_cache; | ||
1180 | } else { | ||
1181 | start_addr = addr = TASK_UNMAPPED_BASE; | ||
1182 | mm->cached_hole_size = 0; | ||
1183 | } | ||
1179 | 1184 | ||
1180 | full_search: | 1185 | full_search: |
1181 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | 1186 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { |
@@ -1186,7 +1191,9 @@ full_search: | |||
1186 | * some holes. | 1191 | * some holes. |
1187 | */ | 1192 | */ |
1188 | if (start_addr != TASK_UNMAPPED_BASE) { | 1193 | if (start_addr != TASK_UNMAPPED_BASE) { |
1189 | start_addr = addr = TASK_UNMAPPED_BASE; | 1194 | addr = TASK_UNMAPPED_BASE; |
1195 | start_addr = addr; | ||
1196 | mm->cached_hole_size = 0; | ||
1190 | goto full_search; | 1197 | goto full_search; |
1191 | } | 1198 | } |
1192 | return -ENOMEM; | 1199 | return -ENOMEM; |
@@ -1198,19 +1205,22 @@ full_search: | |||
1198 | mm->free_area_cache = addr + len; | 1205 | mm->free_area_cache = addr + len; |
1199 | return addr; | 1206 | return addr; |
1200 | } | 1207 | } |
1208 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
1209 | mm->cached_hole_size = vma->vm_start - addr; | ||
1201 | addr = vma->vm_end; | 1210 | addr = vma->vm_end; |
1202 | } | 1211 | } |
1203 | } | 1212 | } |
1204 | #endif | 1213 | #endif |
1205 | 1214 | ||
1206 | void arch_unmap_area(struct vm_area_struct *area) | 1215 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) |
1207 | { | 1216 | { |
1208 | /* | 1217 | /* |
1209 | * Is this a new hole at the lowest possible address? | 1218 | * Is this a new hole at the lowest possible address? |
1210 | */ | 1219 | */ |
1211 | if (area->vm_start >= TASK_UNMAPPED_BASE && | 1220 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { |
1212 | area->vm_start < area->vm_mm->free_area_cache) | 1221 | mm->free_area_cache = addr; |
1213 | area->vm_mm->free_area_cache = area->vm_start; | 1222 | mm->cached_hole_size = ~0UL; |
1223 | } | ||
1214 | } | 1224 | } |
1215 | 1225 | ||
1216 | /* | 1226 | /* |
@@ -1240,6 +1250,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1240 | return addr; | 1250 | return addr; |
1241 | } | 1251 | } |
1242 | 1252 | ||
1253 | /* check if free_area_cache is useful for us */ | ||
1254 | if (len <= mm->cached_hole_size) { | ||
1255 | mm->cached_hole_size = 0; | ||
1256 | mm->free_area_cache = mm->mmap_base; | ||
1257 | } | ||
1258 | |||
1243 | /* either no address requested or can't fit in requested address hole */ | 1259 | /* either no address requested or can't fit in requested address hole */ |
1244 | addr = mm->free_area_cache; | 1260 | addr = mm->free_area_cache; |
1245 | 1261 | ||
@@ -1251,6 +1267,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1251 | return (mm->free_area_cache = addr-len); | 1267 | return (mm->free_area_cache = addr-len); |
1252 | } | 1268 | } |
1253 | 1269 | ||
1270 | if (mm->mmap_base < len) | ||
1271 | goto bottomup; | ||
1272 | |||
1254 | addr = mm->mmap_base-len; | 1273 | addr = mm->mmap_base-len; |
1255 | 1274 | ||
1256 | do { | 1275 | do { |
@@ -1264,38 +1283,45 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1264 | /* remember the address as a hint for next time */ | 1283 | /* remember the address as a hint for next time */ |
1265 | return (mm->free_area_cache = addr); | 1284 | return (mm->free_area_cache = addr); |
1266 | 1285 | ||
1286 | /* remember the largest hole we saw so far */ | ||
1287 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
1288 | mm->cached_hole_size = vma->vm_start - addr; | ||
1289 | |||
1267 | /* try just below the current vma->vm_start */ | 1290 | /* try just below the current vma->vm_start */ |
1268 | addr = vma->vm_start-len; | 1291 | addr = vma->vm_start-len; |
1269 | } while (len < vma->vm_start); | 1292 | } while (len < vma->vm_start); |
1270 | 1293 | ||
1294 | bottomup: | ||
1271 | /* | 1295 | /* |
1272 | * A failed mmap() very likely causes application failure, | 1296 | * A failed mmap() very likely causes application failure, |
1273 | * so fall back to the bottom-up function here. This scenario | 1297 | * so fall back to the bottom-up function here. This scenario |
1274 | * can happen with large stack limits and large mmap() | 1298 | * can happen with large stack limits and large mmap() |
1275 | * allocations. | 1299 | * allocations. |
1276 | */ | 1300 | */ |
1277 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 1301 | mm->cached_hole_size = ~0UL; |
1302 | mm->free_area_cache = TASK_UNMAPPED_BASE; | ||
1278 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); | 1303 | addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); |
1279 | /* | 1304 | /* |
1280 | * Restore the topdown base: | 1305 | * Restore the topdown base: |
1281 | */ | 1306 | */ |
1282 | mm->free_area_cache = mm->mmap_base; | 1307 | mm->free_area_cache = mm->mmap_base; |
1308 | mm->cached_hole_size = ~0UL; | ||
1283 | 1309 | ||
1284 | return addr; | 1310 | return addr; |
1285 | } | 1311 | } |
1286 | #endif | 1312 | #endif |
1287 | 1313 | ||
1288 | void arch_unmap_area_topdown(struct vm_area_struct *area) | 1314 | void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) |
1289 | { | 1315 | { |
1290 | /* | 1316 | /* |
1291 | * Is this a new hole at the highest possible address? | 1317 | * Is this a new hole at the highest possible address? |
1292 | */ | 1318 | */ |
1293 | if (area->vm_end > area->vm_mm->free_area_cache) | 1319 | if (addr > mm->free_area_cache) |
1294 | area->vm_mm->free_area_cache = area->vm_end; | 1320 | mm->free_area_cache = addr; |
1295 | 1321 | ||
1296 | /* dont allow allocations above current base */ | 1322 | /* dont allow allocations above current base */ |
1297 | if (area->vm_mm->free_area_cache > area->vm_mm->mmap_base) | 1323 | if (mm->free_area_cache > mm->mmap_base) |
1298 | area->vm_mm->free_area_cache = area->vm_mm->mmap_base; | 1324 | mm->free_area_cache = mm->mmap_base; |
1299 | } | 1325 | } |
1300 | 1326 | ||
1301 | unsigned long | 1327 | unsigned long |
@@ -1595,7 +1621,6 @@ static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area) | |||
1595 | if (area->vm_flags & VM_LOCKED) | 1621 | if (area->vm_flags & VM_LOCKED) |
1596 | area->vm_mm->locked_vm -= len >> PAGE_SHIFT; | 1622 | area->vm_mm->locked_vm -= len >> PAGE_SHIFT; |
1597 | vm_stat_unaccount(area); | 1623 | vm_stat_unaccount(area); |
1598 | area->vm_mm->unmap_area(area); | ||
1599 | remove_vm_struct(area); | 1624 | remove_vm_struct(area); |
1600 | } | 1625 | } |
1601 | 1626 | ||
@@ -1649,6 +1674,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1649 | { | 1674 | { |
1650 | struct vm_area_struct **insertion_point; | 1675 | struct vm_area_struct **insertion_point; |
1651 | struct vm_area_struct *tail_vma = NULL; | 1676 | struct vm_area_struct *tail_vma = NULL; |
1677 | unsigned long addr; | ||
1652 | 1678 | ||
1653 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); | 1679 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); |
1654 | do { | 1680 | do { |
@@ -1659,6 +1685,11 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1659 | } while (vma && vma->vm_start < end); | 1685 | } while (vma && vma->vm_start < end); |
1660 | *insertion_point = vma; | 1686 | *insertion_point = vma; |
1661 | tail_vma->vm_next = NULL; | 1687 | tail_vma->vm_next = NULL; |
1688 | if (mm->unmap_area == arch_unmap_area) | ||
1689 | addr = prev ? prev->vm_end : mm->mmap_base; | ||
1690 | else | ||
1691 | addr = vma ? vma->vm_start : mm->mmap_base; | ||
1692 | mm->unmap_area(mm, addr); | ||
1662 | mm->mmap_cache = NULL; /* Kill the cache. */ | 1693 | mm->mmap_cache = NULL; /* Kill the cache. */ |
1663 | } | 1694 | } |
1664 | 1695 | ||
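The new free_area_cache/cached_hole_size pair is a pure heuristic: cached_hole_size remembers the largest gap that previous scans skipped below free_area_cache. A request larger than every skipped gap can safely resume at the cache; a smaller one must restart from the base, because one of the skipped gaps might fit. A hedged sketch of just that decision (field names mirror mm_struct, the base address is an assumption):

#define TASK_UNMAPPED_BASE 0x40000000UL	/* illustrative value */

struct area_cache {
	unsigned long free_area_cache;
	unsigned long cached_hole_size;
};

unsigned long search_start(struct area_cache *mm, unsigned long len)
{
	if (len > mm->cached_hole_size)
		return mm->free_area_cache;	/* no skipped hole can fit */

	mm->cached_hole_size = 0;		/* re-learned during this scan */
	return TASK_UNMAPPED_BASE;
}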
diff --git a/mm/msync.c b/mm/msync.c
index 090f426bca7d..d0f5a1bce7cb 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -34,6 +34,8 @@ static void sync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
34 | 34 | ||
35 | if (!pte_present(*pte)) | 35 | if (!pte_present(*pte)) |
36 | continue; | 36 | continue; |
37 | if (!pte_maybe_dirty(*pte)) | ||
38 | continue; | ||
37 | pfn = pte_pfn(*pte); | 39 | pfn = pte_pfn(*pte); |
38 | if (!pfn_valid(pfn)) | 40 | if (!pfn_valid(pfn)) |
39 | continue; | 41 | continue; |
diff --git a/mm/nommu.c b/mm/nommu.c
index c53e9c8f6b4a..ce74452c02d9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1067,7 +1067,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, | |||
1067 | return -ENOMEM; | 1067 | return -ENOMEM; |
1068 | } | 1068 | } |
1069 | 1069 | ||
1070 | void arch_unmap_area(struct vm_area_struct *area) | 1070 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) |
1071 | { | 1071 | { |
1072 | } | 1072 | } |
1073 | 1073 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4bbb1cb10495..59666d905f19 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -258,6 +258,10 @@ void out_of_memory(unsigned int __nocast gfp_mask) | |||
258 | struct mm_struct *mm = NULL; | 258 | struct mm_struct *mm = NULL; |
259 | task_t * p; | 259 | task_t * p; |
260 | 260 | ||
261 | printk("oom-killer: gfp_mask=0x%x\n", gfp_mask); | ||
262 | /* print memory stats */ | ||
263 | show_mem(); | ||
264 | |||
261 | read_lock(&tasklist_lock); | 265 | read_lock(&tasklist_lock); |
262 | retry: | 266 | retry: |
263 | p = select_bad_process(); | 267 | p = select_bad_process(); |
@@ -268,12 +272,9 @@ retry: | |||
268 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 272 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
269 | if (!p) { | 273 | if (!p) { |
270 | read_unlock(&tasklist_lock); | 274 | read_unlock(&tasklist_lock); |
271 | show_free_areas(); | ||
272 | panic("Out of memory and no killable processes...\n"); | 275 | panic("Out of memory and no killable processes...\n"); |
273 | } | 276 | } |
274 | 277 | ||
275 | printk("oom-killer: gfp_mask=0x%x\n", gfp_mask); | ||
276 | show_free_areas(); | ||
277 | mm = oom_kill_process(p); | 278 | mm = oom_kill_process(p); |
278 | if (!mm) | 279 | if (!mm) |
279 | goto retry; | 280 | goto retry; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1061b1962f8..206920796f5f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page) | |||
105 | printk(KERN_EMERG "Backtrace:\n"); | 105 | printk(KERN_EMERG "Backtrace:\n"); |
106 | dump_stack(); | 106 | dump_stack(); |
107 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); | 107 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); |
108 | page->flags &= ~(1 << PG_private | | 108 | page->flags &= ~(1 << PG_lru | |
109 | 1 << PG_private | | ||
109 | 1 << PG_locked | | 110 | 1 << PG_locked | |
110 | 1 << PG_lru | | ||
111 | 1 << PG_active | | 111 | 1 << PG_active | |
112 | 1 << PG_dirty | | 112 | 1 << PG_dirty | |
113 | 1 << PG_reclaim | | ||
114 | 1 << PG_slab | | ||
113 | 1 << PG_swapcache | | 115 | 1 << PG_swapcache | |
114 | 1 << PG_writeback); | 116 | 1 << PG_writeback); |
115 | set_page_count(page, 0); | 117 | set_page_count(page, 0); |
@@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order) | |||
440 | */ | 442 | */ |
441 | static void prep_new_page(struct page *page, int order) | 443 | static void prep_new_page(struct page *page, int order) |
442 | { | 444 | { |
443 | if (page->mapping || page_mapcount(page) || | 445 | if ( page_mapcount(page) || |
444 | (page->flags & ( | 446 | page->mapping != NULL || |
447 | page_count(page) != 0 || | ||
448 | (page->flags & ( | ||
449 | 1 << PG_lru | | ||
445 | 1 << PG_private | | 450 | 1 << PG_private | |
446 | 1 << PG_locked | | 451 | 1 << PG_locked | |
447 | 1 << PG_lru | | ||
448 | 1 << PG_active | | 452 | 1 << PG_active | |
449 | 1 << PG_dirty | | 453 | 1 << PG_dirty | |
450 | 1 << PG_reclaim | | 454 | 1 << PG_reclaim | |
455 | 1 << PG_slab | | ||
451 | 1 << PG_swapcache | | 456 | 1 << PG_swapcache | |
452 | 1 << PG_writeback ))) | 457 | 1 << PG_writeback ))) |
453 | bad_page(__FUNCTION__, page); | 458 | bad_page(__FUNCTION__, page); |
@@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
511 | return allocated; | 516 | return allocated; |
512 | } | 517 | } |
513 | 518 | ||
519 | #ifdef CONFIG_NUMA | ||
520 | /* Called from the slab reaper to drain remote pagesets */ | ||
521 | void drain_remote_pages(void) | ||
522 | { | ||
523 | struct zone *zone; | ||
524 | int i; | ||
525 | unsigned long flags; | ||
526 | |||
527 | local_irq_save(flags); | ||
528 | for_each_zone(zone) { | ||
529 | struct per_cpu_pageset *pset; | ||
530 | |||
531 | /* Do not drain local pagesets */ | ||
532 | if (zone->zone_pgdat->node_id == numa_node_id()) | ||
533 | continue; | ||
534 | |||
535 | pset = zone->pageset[smp_processor_id()]; | ||
536 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | ||
537 | struct per_cpu_pages *pcp; | ||
538 | |||
539 | pcp = &pset->pcp[i]; | ||
540 | if (pcp->count) | ||
541 | pcp->count -= free_pages_bulk(zone, pcp->count, | ||
542 | &pcp->list, 0); | ||
543 | } | ||
544 | } | ||
545 | local_irq_restore(flags); | ||
546 | } | ||
547 | #endif | ||
548 | |||
514 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | 549 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) |
515 | static void __drain_pages(unsigned int cpu) | 550 | static void __drain_pages(unsigned int cpu) |
516 | { | 551 | { |
@@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu) | |||
520 | for_each_zone(zone) { | 555 | for_each_zone(zone) { |
521 | struct per_cpu_pageset *pset; | 556 | struct per_cpu_pageset *pset; |
522 | 557 | ||
523 | pset = &zone->pageset[cpu]; | 558 | pset = zone_pcp(zone, cpu); |
524 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 559 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
525 | struct per_cpu_pages *pcp; | 560 | struct per_cpu_pages *pcp; |
526 | 561 | ||
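Most of the mechanical churn in this file is the switch from indexing an embedded zone->pageset[] array to going through the zone_pcp() accessor, because under NUMA the pagesets are now kmalloc_node()ed per CPU (see process_zones() further down) and can be drained remotely by drain_remote_pages(). The sketch below shows the assumed shape of that accessor for orientation only; the real definitions live in include/linux/mmzone.h.

/* assumed shape only, for orientation while reading the diff */
#ifdef CONFIG_NUMA
#define zone_pcp(z, cpu)	((z)->pageset[(cpu)])	/* array of pointers */
#else
#define zone_pcp(z, cpu)	(&(z)->pageset[(cpu)])	/* embedded array */
#endif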
@@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
583 | 618 | ||
584 | local_irq_save(flags); | 619 | local_irq_save(flags); |
585 | cpu = smp_processor_id(); | 620 | cpu = smp_processor_id(); |
586 | p = &z->pageset[cpu]; | 621 | p = zone_pcp(z,cpu); |
587 | if (pg == orig) { | 622 | if (pg == orig) { |
588 | z->pageset[cpu].numa_hit++; | 623 | p->numa_hit++; |
589 | } else { | 624 | } else { |
590 | p->numa_miss++; | 625 | p->numa_miss++; |
591 | zonelist->zones[0]->pageset[cpu].numa_foreign++; | 626 | zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; |
592 | } | 627 | } |
593 | if (pg == NODE_DATA(numa_node_id())) | 628 | if (pg == NODE_DATA(numa_node_id())) |
594 | p->local_node++; | 629 | p->local_node++; |
@@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
615 | if (PageAnon(page)) | 650 | if (PageAnon(page)) |
616 | page->mapping = NULL; | 651 | page->mapping = NULL; |
617 | free_pages_check(__FUNCTION__, page); | 652 | free_pages_check(__FUNCTION__, page); |
618 | pcp = &zone->pageset[get_cpu()].pcp[cold]; | 653 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
619 | local_irq_save(flags); | 654 | local_irq_save(flags); |
620 | if (pcp->count >= pcp->high) | ||
621 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | ||
622 | list_add(&page->lru, &pcp->list); | 655 | list_add(&page->lru, &pcp->list); |
623 | pcp->count++; | 656 | pcp->count++; |
657 | if (pcp->count >= pcp->high) | ||
658 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | ||
624 | local_irq_restore(flags); | 659 | local_irq_restore(flags); |
625 | put_cpu(); | 660 | put_cpu(); |
626 | } | 661 | } |
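The reordering in free_hot_cold_page() looks cosmetic but is not: trimming the per-cpu list after list_add() means the list can never end the call at or above pcp->high, and the page that was just freed is itself eligible for the bulk hand-back. A toy model of the bounded list (plain C, illustrative thresholds):

#include <stdio.h>

#define HIGH  6
#define BATCH 3

/* models the new order: add first, then trim once the high mark is reached */
static int put_page_on_pcp(int count)
{
	count++;			/* list_add(&page->lru, &pcp->list) */
	if (count >= HIGH)
		count -= BATCH;		/* free_pages_bulk(zone, BATCH, ...) */
	return count;
}

int main(void)
{
	int count = 0, i;

	for (i = 0; i < 20; i++)
		count = put_page_on_pcp(count);
	printf("pcp count after 20 frees: %d (always below %d)\n", count, HIGH);
	return 0;
}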
@@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) | |||
659 | if (order == 0) { | 694 | if (order == 0) { |
660 | struct per_cpu_pages *pcp; | 695 | struct per_cpu_pages *pcp; |
661 | 696 | ||
662 | pcp = &zone->pageset[get_cpu()].pcp[cold]; | 697 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
663 | local_irq_save(flags); | 698 | local_irq_save(flags); |
664 | if (pcp->count <= pcp->low) | 699 | if (pcp->count <= pcp->low) |
665 | pcp->count += rmqueue_bulk(zone, 0, | 700 | pcp->count += rmqueue_bulk(zone, 0, |
@@ -724,6 +759,16 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
724 | return 1; | 759 | return 1; |
725 | } | 760 | } |
726 | 761 | ||
762 | static inline int | ||
763 | should_reclaim_zone(struct zone *z, unsigned int gfp_mask) | ||
764 | { | ||
765 | if (!z->reclaim_pages) | ||
766 | return 0; | ||
767 | if (gfp_mask & __GFP_NORECLAIM) | ||
768 | return 0; | ||
769 | return 1; | ||
770 | } | ||
771 | |||
727 | /* | 772 | /* |
728 | * This is the 'heart' of the zoned buddy allocator. | 773 | * This is the 'heart' of the zoned buddy allocator. |
729 | */ | 774 | */ |
@@ -760,17 +805,32 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, | |||
760 | 805 | ||
761 | classzone_idx = zone_idx(zones[0]); | 806 | classzone_idx = zone_idx(zones[0]); |
762 | 807 | ||
763 | restart: | 808 | restart: |
764 | /* Go through the zonelist once, looking for a zone with enough free */ | 809 | /* Go through the zonelist once, looking for a zone with enough free */ |
765 | for (i = 0; (z = zones[i]) != NULL; i++) { | 810 | for (i = 0; (z = zones[i]) != NULL; i++) { |
766 | 811 | int do_reclaim = should_reclaim_zone(z, gfp_mask); | |
767 | if (!zone_watermark_ok(z, order, z->pages_low, | ||
768 | classzone_idx, 0, 0)) | ||
769 | continue; | ||
770 | 812 | ||
771 | if (!cpuset_zone_allowed(z)) | 813 | if (!cpuset_zone_allowed(z)) |
772 | continue; | 814 | continue; |
773 | 815 | ||
816 | /* | ||
817 | * If the zone is to attempt early page reclaim then this loop | ||
818 | * will try to reclaim pages and check the watermark a second | ||
819 | * time before giving up and falling back to the next zone. | ||
820 | */ | ||
821 | zone_reclaim_retry: | ||
822 | if (!zone_watermark_ok(z, order, z->pages_low, | ||
823 | classzone_idx, 0, 0)) { | ||
824 | if (!do_reclaim) | ||
825 | continue; | ||
826 | else { | ||
827 | zone_reclaim(z, gfp_mask, order); | ||
828 | /* Only try reclaim once */ | ||
829 | do_reclaim = 0; | ||
830 | goto zone_reclaim_retry; | ||
831 | } | ||
832 | } | ||
833 | |||
774 | page = buffered_rmqueue(z, order, gfp_mask); | 834 | page = buffered_rmqueue(z, order, gfp_mask); |
775 | if (page) | 835 | if (page) |
776 | goto got_pg; | 836 | goto got_pg; |
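The zone_reclaim_retry label implements a strict retry-once policy: a zone that is below its low watermark and eligible per should_reclaim_zone() gets exactly one zone_reclaim() attempt before __alloc_pages() falls back to the next zone. A self-contained sketch of that control flow, with stub stand-ins for the watermark and reclaim calls:

#include <stdbool.h>

struct zone_ctl { bool reclaim_pages; long free, low_mark; };

/* stand-ins for should_reclaim_zone() / zone_watermark_ok() / zone_reclaim() */
static bool should_reclaim(struct zone_ctl *z)	{ return z->reclaim_pages; }
static bool watermark_ok(struct zone_ctl *z, long need)
{
	return z->free - need >= z->low_mark;
}
static void try_reclaim(struct zone_ctl *z)	{ z->free += 8; /* pretend */ }

static bool zone_usable(struct zone_ctl *z, long need)
{
	bool do_reclaim = should_reclaim(z);

	while (!watermark_ok(z, need)) {
		if (!do_reclaim)
			return false;		/* fall back to the next zone */
		try_reclaim(z);
		do_reclaim = false;		/* only one reclaim attempt */
	}
	return true;
}

int main(void)
{
	struct zone_ctl z = { .reclaim_pages = true, .free = 10, .low_mark = 8 };

	return zone_usable(&z, 4) ? 0 : 1;
}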
@@ -829,7 +889,7 @@ rebalance: | |||
829 | reclaim_state.reclaimed_slab = 0; | 889 | reclaim_state.reclaimed_slab = 0; |
830 | p->reclaim_state = &reclaim_state; | 890 | p->reclaim_state = &reclaim_state; |
831 | 891 | ||
832 | did_some_progress = try_to_free_pages(zones, gfp_mask, order); | 892 | did_some_progress = try_to_free_pages(zones, gfp_mask); |
833 | 893 | ||
834 | p->reclaim_state = NULL; | 894 | p->reclaim_state = NULL; |
835 | p->flags &= ~PF_MEMALLOC; | 895 | p->flags &= ~PF_MEMALLOC; |
@@ -905,6 +965,7 @@ nopage: | |||
905 | " order:%d, mode:0x%x\n", | 965 | " order:%d, mode:0x%x\n", |
906 | p->comm, order, gfp_mask); | 966 | p->comm, order, gfp_mask); |
907 | dump_stack(); | 967 | dump_stack(); |
968 | show_mem(); | ||
908 | } | 969 | } |
909 | return NULL; | 970 | return NULL; |
910 | got_pg: | 971 | got_pg: |
@@ -1114,7 +1175,7 @@ void get_full_page_state(struct page_state *ret) | |||
1114 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); | 1175 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); |
1115 | } | 1176 | } |
1116 | 1177 | ||
1117 | unsigned long __read_page_state(unsigned offset) | 1178 | unsigned long __read_page_state(unsigned long offset) |
1118 | { | 1179 | { |
1119 | unsigned long ret = 0; | 1180 | unsigned long ret = 0; |
1120 | int cpu; | 1181 | int cpu; |
@@ -1128,7 +1189,7 @@ unsigned long __read_page_state(unsigned offset) | |||
1128 | return ret; | 1189 | return ret; |
1129 | } | 1190 | } |
1130 | 1191 | ||
1131 | void __mod_page_state(unsigned offset, unsigned long delta) | 1192 | void __mod_page_state(unsigned long offset, unsigned long delta) |
1132 | { | 1193 | { |
1133 | unsigned long flags; | 1194 | unsigned long flags; |
1134 | void* ptr; | 1195 | void* ptr; |
@@ -1237,22 +1298,23 @@ void show_free_areas(void) | |||
1237 | if (!cpu_possible(cpu)) | 1298 | if (!cpu_possible(cpu)) |
1238 | continue; | 1299 | continue; |
1239 | 1300 | ||
1240 | pageset = zone->pageset + cpu; | 1301 | pageset = zone_pcp(zone, cpu); |
1241 | 1302 | ||
1242 | for (temperature = 0; temperature < 2; temperature++) | 1303 | for (temperature = 0; temperature < 2; temperature++) |
1243 | printk("cpu %d %s: low %d, high %d, batch %d\n", | 1304 | printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", |
1244 | cpu, | 1305 | cpu, |
1245 | temperature ? "cold" : "hot", | 1306 | temperature ? "cold" : "hot", |
1246 | pageset->pcp[temperature].low, | 1307 | pageset->pcp[temperature].low, |
1247 | pageset->pcp[temperature].high, | 1308 | pageset->pcp[temperature].high, |
1248 | pageset->pcp[temperature].batch); | 1309 | pageset->pcp[temperature].batch, |
1310 | pageset->pcp[temperature].count); | ||
1249 | } | 1311 | } |
1250 | } | 1312 | } |
1251 | 1313 | ||
1252 | get_page_state(&ps); | 1314 | get_page_state(&ps); |
1253 | get_zone_counts(&active, &inactive, &free); | 1315 | get_zone_counts(&active, &inactive, &free); |
1254 | 1316 | ||
1255 | printk("\nFree pages: %11ukB (%ukB HighMem)\n", | 1317 | printk("Free pages: %11ukB (%ukB HighMem)\n", |
1256 | K(nr_free_pages()), | 1318 | K(nr_free_pages()), |
1257 | K(nr_free_highpages())); | 1319 | K(nr_free_highpages())); |
1258 | 1320 | ||
@@ -1620,6 +1682,155 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1620 | memmap_init_zone((size), (nid), (zone), (start_pfn)) | 1682 | memmap_init_zone((size), (nid), (zone), (start_pfn)) |
1621 | #endif | 1683 | #endif |
1622 | 1684 | ||
1685 | static int __devinit zone_batchsize(struct zone *zone) | ||
1686 | { | ||
1687 | int batch; | ||
1688 | |||
1689 | /* | ||
1690 | * The per-cpu-pages pools are set to around 1000th of the | ||
1691 | * size of the zone. But no more than 1/4 of a meg - there's | ||
1692 | * no point in going beyond the size of L2 cache. | ||
1693 | * | ||
1694 | * OK, so we don't know how big the cache is. So guess. | ||
1695 | */ | ||
1696 | batch = zone->present_pages / 1024; | ||
1697 | if (batch * PAGE_SIZE > 256 * 1024) | ||
1698 | batch = (256 * 1024) / PAGE_SIZE; | ||
1699 | batch /= 4; /* We effectively *= 4 below */ | ||
1700 | if (batch < 1) | ||
1701 | batch = 1; | ||
1702 | |||
1703 | /* | ||
1704 | * Clamp the batch to a 2^n - 1 value. Having a power | ||
1705 | * of 2 value was found to be more likely to have | ||
1706 | * suboptimal cache aliasing properties in some cases. | ||
1707 | * | ||
1708 | * For example if 2 tasks are alternately allocating | ||
1709 | * batches of pages, one task can end up with a lot | ||
1710 | * of pages of one half of the possible page colors | ||
1711 | * and the other with pages of the other colors. | ||
1712 | */ | ||
1713 | batch = (1 << fls(batch + batch/2)) - 1; | ||
1714 | return batch; | ||
1715 | } | ||
1716 | |||
1717 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | ||
1718 | { | ||
1719 | struct per_cpu_pages *pcp; | ||
1720 | |||
1721 | pcp = &p->pcp[0]; /* hot */ | ||
1722 | pcp->count = 0; | ||
1723 | pcp->low = 2 * batch; | ||
1724 | pcp->high = 6 * batch; | ||
1725 | pcp->batch = max(1UL, 1 * batch); | ||
1726 | INIT_LIST_HEAD(&pcp->list); | ||
1727 | |||
1728 | pcp = &p->pcp[1]; /* cold*/ | ||
1729 | pcp->count = 0; | ||
1730 | pcp->low = 0; | ||
1731 | pcp->high = 2 * batch; | ||
1732 | pcp->batch = max(1UL, 1 * batch); | ||
1733 | INIT_LIST_HEAD(&pcp->list); | ||
1734 | } | ||
1735 | |||
1736 | #ifdef CONFIG_NUMA | ||
1737 | /* | ||
1738 | * Boot pageset table. One per cpu which is going to be used for all | ||
1739 | * zones and all nodes. The parameters will be set in such a way | ||
1740 | * that an item put on a list will immediately be handed over to | ||
1741 | * the buddy list. This is safe since pageset manipulation is done | ||
1742 | * with interrupts disabled. | ||
1743 | * | ||
1744 | * Some NUMA counter updates may also be caught by the boot pagesets. | ||
1745 | * These will be discarded when bootup is complete. | ||
1746 | */ | ||
1747 | static struct per_cpu_pageset | ||
1748 | boot_pageset[NR_CPUS] __initdata; | ||
1749 | |||
1750 | /* | ||
1751 | * Dynamically allocate memory for the | ||
1752 | * per cpu pageset array in struct zone. | ||
1753 | */ | ||
1754 | static int __devinit process_zones(int cpu) | ||
1755 | { | ||
1756 | struct zone *zone, *dzone; | ||
1757 | |||
1758 | for_each_zone(zone) { | ||
1759 | |||
1760 | zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), | ||
1761 | GFP_KERNEL, cpu_to_node(cpu)); | ||
1762 | if (!zone->pageset[cpu]) | ||
1763 | goto bad; | ||
1764 | |||
1765 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); | ||
1766 | } | ||
1767 | |||
1768 | return 0; | ||
1769 | bad: | ||
1770 | for_each_zone(dzone) { | ||
1771 | if (dzone == zone) | ||
1772 | break; | ||
1773 | kfree(dzone->pageset[cpu]); | ||
1774 | dzone->pageset[cpu] = NULL; | ||
1775 | } | ||
1776 | return -ENOMEM; | ||
1777 | } | ||
1778 | |||
1779 | static inline void free_zone_pagesets(int cpu) | ||
1780 | { | ||
1781 | #ifdef CONFIG_NUMA | ||
1782 | struct zone *zone; | ||
1783 | |||
1784 | for_each_zone(zone) { | ||
1785 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | ||
1786 | |||
1787 | zone_pcp(zone, cpu) = NULL; | ||
1788 | kfree(pset); | ||
1789 | } | ||
1790 | #endif | ||
1791 | } | ||
1792 | |||
1793 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | ||
1794 | unsigned long action, | ||
1795 | void *hcpu) | ||
1796 | { | ||
1797 | int cpu = (long)hcpu; | ||
1798 | int ret = NOTIFY_OK; | ||
1799 | |||
1800 | switch (action) { | ||
1801 | case CPU_UP_PREPARE: | ||
1802 | if (process_zones(cpu)) | ||
1803 | ret = NOTIFY_BAD; | ||
1804 | break; | ||
1805 | #ifdef CONFIG_HOTPLUG_CPU | ||
1806 | case CPU_DEAD: | ||
1807 | free_zone_pagesets(cpu); | ||
1808 | break; | ||
1809 | #endif | ||
1810 | default: | ||
1811 | break; | ||
1812 | } | ||
1813 | return ret; | ||
1814 | } | ||
1815 | |||
1816 | static struct notifier_block pageset_notifier = | ||
1817 | { &pageset_cpuup_callback, NULL, 0 }; | ||
1818 | |||
1819 | void __init setup_per_cpu_pageset() | ||
1820 | { | ||
1821 | int err; | ||
1822 | |||
1823 | /* Initialize per_cpu_pageset for cpu 0. | ||
1824 | * A cpuup callback will do this for every cpu | ||
1825 | * as it comes online | ||
1826 | */ | ||
1827 | err = process_zones(smp_processor_id()); | ||
1828 | BUG_ON(err); | ||
1829 | register_cpu_notifier(&pageset_notifier); | ||
1830 | } | ||
1831 | |||
1832 | #endif | ||
1833 | |||
1623 | /* | 1834 | /* |
1624 | * Set up the zone data structures: | 1835 | * Set up the zone data structures: |
1625 | * - mark all pages reserved | 1836 | * - mark all pages reserved |
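zone_batchsize(), factored out of free_area_init_core() by this hunk, sizes the per-cpu batches at roughly 1/1024th of the zone, caps that at 256KB worth of pages, quarters it (the pcp high marks multiply it back up), and finally rounds to 2^n - 1 to avoid power-of-two cache aliasing between CPUs pulling batches. A userspace rerun of the arithmetic, assuming 4KB pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL

static int fls_demo(unsigned long x)		/* minimal fls() stand-in */
{
	int bit = 0;

	while (x) { bit++; x >>= 1; }
	return bit;
}

static int zone_batchsize_demo(unsigned long present_pages)
{
	int batch = present_pages / 1024;

	if ((unsigned long)batch * PAGE_SIZE > 256 * 1024)
		batch = (256 * 1024) / PAGE_SIZE;
	batch /= 4;				/* the pcp highs scale it back up */
	if (batch < 1)
		batch = 1;
	return (1 << fls_demo(batch + batch / 2)) - 1;	/* clamp to 2^n - 1 */
}

int main(void)
{
	printf("1GB zone  -> batch %d\n", zone_batchsize_demo(262144));
	printf("64MB zone -> batch %d\n", zone_batchsize_demo(16384));
	return 0;
}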
@@ -1662,48 +1873,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
1662 | 1873 | ||
1663 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; | 1874 | zone->temp_priority = zone->prev_priority = DEF_PRIORITY; |
1664 | 1875 | ||
1665 | /* | 1876 | batch = zone_batchsize(zone); |
1666 | * The per-cpu-pages pools are set to around 1000th of the | ||
1667 | * size of the zone. But no more than 1/4 of a meg - there's | ||
1668 | * no point in going beyond the size of L2 cache. | ||
1669 | * | ||
1670 | * OK, so we don't know how big the cache is. So guess. | ||
1671 | */ | ||
1672 | batch = zone->present_pages / 1024; | ||
1673 | if (batch * PAGE_SIZE > 256 * 1024) | ||
1674 | batch = (256 * 1024) / PAGE_SIZE; | ||
1675 | batch /= 4; /* We effectively *= 4 below */ | ||
1676 | if (batch < 1) | ||
1677 | batch = 1; | ||
1678 | |||
1679 | /* | ||
1680 | * Clamp the batch to a 2^n - 1 value. Having a power | ||
1681 | * of 2 value was found to be more likely to have | ||
1682 | * suboptimal cache aliasing properties in some cases. | ||
1683 | * | ||
1684 | * For example if 2 tasks are alternately allocating | ||
1685 | * batches of pages, one task can end up with a lot | ||
1686 | * of pages of one half of the possible page colors | ||
1687 | * and the other with pages of the other colors. | ||
1688 | */ | ||
1689 | batch = (1 << fls(batch + batch/2)) - 1; | ||
1690 | 1877 | ||
1691 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 1878 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
1692 | struct per_cpu_pages *pcp; | 1879 | #ifdef CONFIG_NUMA |
1693 | 1880 | /* Early boot. Slab allocator not functional yet */ | |
1694 | pcp = &zone->pageset[cpu].pcp[0]; /* hot */ | 1881 | zone->pageset[cpu] = &boot_pageset[cpu]; |
1695 | pcp->count = 0; | 1882 | setup_pageset(&boot_pageset[cpu],0); |
1696 | pcp->low = 2 * batch; | 1883 | #else |
1697 | pcp->high = 6 * batch; | 1884 | setup_pageset(zone_pcp(zone,cpu), batch); |
1698 | pcp->batch = 1 * batch; | 1885 | #endif |
1699 | INIT_LIST_HEAD(&pcp->list); | ||
1700 | |||
1701 | pcp = &zone->pageset[cpu].pcp[1]; /* cold */ | ||
1702 | pcp->count = 0; | ||
1703 | pcp->low = 0; | ||
1704 | pcp->high = 2 * batch; | ||
1705 | pcp->batch = 1 * batch; | ||
1706 | INIT_LIST_HEAD(&pcp->list); | ||
1707 | } | 1886 | } |
1708 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 1887 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", |
1709 | zone_names[j], realsize, batch); | 1888 | zone_names[j], realsize, batch); |
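
The open-coded batch heuristic removed on the left is presumably what the new zone_batchsize() helper now encapsulates, so that free_area_init_core() and process_zones() share one computation. A sketch reassembled from the deleted lines:

static int zone_batchsize(struct zone *zone)
{
	int batch;

	/* Roughly 1/1024th of the zone, but no more than 256KB worth of pages */
	batch = zone->present_pages / 1024;
	if (batch * PAGE_SIZE > 256 * 1024)
		batch = (256 * 1024) / PAGE_SIZE;
	batch /= 4;		/* the pcp watermarks effectively scale this back up */
	if (batch < 1)
		batch = 1;

	/* Clamp to 2^n - 1: power-of-2 batches showed cache aliasing problems */
	batch = (1 << fls(batch + batch/2)) - 1;

	return batch;
}
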
@@ -1713,6 +1892,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
1713 | zone->nr_scan_inactive = 0; | 1892 | zone->nr_scan_inactive = 0; |
1714 | zone->nr_active = 0; | 1893 | zone->nr_active = 0; |
1715 | zone->nr_inactive = 0; | 1894 | zone->nr_inactive = 0; |
1895 | atomic_set(&zone->reclaim_in_progress, -1); | ||
1716 | if (!size) | 1896 | if (!size) |
1717 | continue; | 1897 | continue; |
1718 | 1898 | ||
@@ -1853,6 +2033,115 @@ struct seq_operations fragmentation_op = { | |||
1853 | .show = frag_show, | 2033 | .show = frag_show, |
1854 | }; | 2034 | }; |
1855 | 2035 | ||
2036 | /* | ||
2037 | * Output information about zones in @pgdat. | ||
2038 | */ | ||
2039 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
2040 | { | ||
2041 | pg_data_t *pgdat = arg; | ||
2042 | struct zone *zone; | ||
2043 | struct zone *node_zones = pgdat->node_zones; | ||
2044 | unsigned long flags; | ||
2045 | |||
2046 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | ||
2047 | int i; | ||
2048 | |||
2049 | if (!zone->present_pages) | ||
2050 | continue; | ||
2051 | |||
2052 | spin_lock_irqsave(&zone->lock, flags); | ||
2053 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | ||
2054 | seq_printf(m, | ||
2055 | "\n pages free %lu" | ||
2056 | "\n min %lu" | ||
2057 | "\n low %lu" | ||
2058 | "\n high %lu" | ||
2059 | "\n active %lu" | ||
2060 | "\n inactive %lu" | ||
2061 | "\n scanned %lu (a: %lu i: %lu)" | ||
2062 | "\n spanned %lu" | ||
2063 | "\n present %lu", | ||
2064 | zone->free_pages, | ||
2065 | zone->pages_min, | ||
2066 | zone->pages_low, | ||
2067 | zone->pages_high, | ||
2068 | zone->nr_active, | ||
2069 | zone->nr_inactive, | ||
2070 | zone->pages_scanned, | ||
2071 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
2072 | zone->spanned_pages, | ||
2073 | zone->present_pages); | ||
2074 | seq_printf(m, | ||
2075 | "\n protection: (%lu", | ||
2076 | zone->lowmem_reserve[0]); | ||
2077 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | ||
2078 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | ||
2079 | seq_printf(m, | ||
2080 | ")" | ||
2081 | "\n pagesets"); | ||
2082 | for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { | ||
2083 | struct per_cpu_pageset *pageset; | ||
2084 | int j; | ||
2085 | |||
2086 | pageset = zone_pcp(zone, i); | ||
2087 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
2088 | if (pageset->pcp[j].count) | ||
2089 | break; | ||
2090 | } | ||
2091 | if (j == ARRAY_SIZE(pageset->pcp)) | ||
2092 | continue; | ||
2093 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
2094 | seq_printf(m, | ||
2095 | "\n cpu: %i pcp: %i" | ||
2096 | "\n count: %i" | ||
2097 | "\n low: %i" | ||
2098 | "\n high: %i" | ||
2099 | "\n batch: %i", | ||
2100 | i, j, | ||
2101 | pageset->pcp[j].count, | ||
2102 | pageset->pcp[j].low, | ||
2103 | pageset->pcp[j].high, | ||
2104 | pageset->pcp[j].batch); | ||
2105 | } | ||
2106 | #ifdef CONFIG_NUMA | ||
2107 | seq_printf(m, | ||
2108 | "\n numa_hit: %lu" | ||
2109 | "\n numa_miss: %lu" | ||
2110 | "\n numa_foreign: %lu" | ||
2111 | "\n interleave_hit: %lu" | ||
2112 | "\n local_node: %lu" | ||
2113 | "\n other_node: %lu", | ||
2114 | pageset->numa_hit, | ||
2115 | pageset->numa_miss, | ||
2116 | pageset->numa_foreign, | ||
2117 | pageset->interleave_hit, | ||
2118 | pageset->local_node, | ||
2119 | pageset->other_node); | ||
2120 | #endif | ||
2121 | } | ||
2122 | seq_printf(m, | ||
2123 | "\n all_unreclaimable: %u" | ||
2124 | "\n prev_priority: %i" | ||
2125 | "\n temp_priority: %i" | ||
2126 | "\n start_pfn: %lu", | ||
2127 | zone->all_unreclaimable, | ||
2128 | zone->prev_priority, | ||
2129 | zone->temp_priority, | ||
2130 | zone->zone_start_pfn); | ||
2131 | spin_unlock_irqrestore(&zone->lock, flags); | ||
2132 | seq_putc(m, '\n'); | ||
2133 | } | ||
2134 | return 0; | ||
2135 | } | ||
2136 | |||
2137 | struct seq_operations zoneinfo_op = { | ||
2138 | .start = frag_start, /* iterate over all zones, using | ||
2139 | * the same iterator as fragmentation_op */ | ||
2140 | .next = frag_next, | ||
2141 | .stop = frag_stop, | ||
2142 | .show = zoneinfo_show, | ||
2143 | }; | ||
2144 | |||
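
zoneinfo_op reuses the fragmentation iterator and only supplies a new ->show method. Hooking it up to a /proc file is not part of this hunk; a hedged sketch of the usual seq_file glue it would sit behind (function and variable names here are illustrative):

#include <linux/fs.h>
#include <linux/seq_file.h>

static int zoneinfo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &zoneinfo_op);
}

static struct file_operations proc_zoneinfo_ops = {
	.open		= zoneinfo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
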
1856 | static char *vmstat_text[] = { | 2145 | static char *vmstat_text[] = { |
1857 | "nr_dirty", | 2146 | "nr_dirty", |
1858 | "nr_writeback", | 2147 | "nr_writeback", |
@@ -2058,10 +2347,10 @@ static void setup_per_zone_pages_min(void) | |||
2058 | min_pages = 128; | 2347 | min_pages = 128; |
2059 | zone->pages_min = min_pages; | 2348 | zone->pages_min = min_pages; |
2060 | } else { | 2349 | } else { |
2061 | /* if it's a lowmem zone, reserve a number of pages | 2350 | /* if it's a lowmem zone, reserve a number of pages |
2062 | * proportionate to the zone's size. | 2351 | * proportionate to the zone's size. |
2063 | */ | 2352 | */ |
2064 | zone->pages_min = (pages_min * zone->present_pages) / | 2353 | zone->pages_min = (pages_min * zone->present_pages) / |
2065 | lowmem_pages; | 2354 | lowmem_pages; |
2066 | } | 2355 | } |
2067 | 2356 | ||
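
In the lowmem branch each zone receives a slice of the global pages_min proportional to its share of all lowmem pages. A toy calculation with illustrative numbers:

#include <stdio.h>

int main(void)
{
	unsigned long pages_min = 1024;		/* illustrative global minimum */
	unsigned long lowmem_pages = 225280;	/* all lowmem pages in the system */
	unsigned long present_pages = 204800;	/* this zone's pages */

	/* Same integer arithmetic as the kernel expression above */
	printf("zone pages_min = %lu\n",
	       (pages_min * present_pages) / lowmem_pages);
	return 0;
}
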
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -539,27 +539,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
539 | goto out_unmap; | 539 | goto out_unmap; |
540 | } | 540 | } |
541 | 541 | ||
542 | /* | ||
543 | * Don't pull an anonymous page out from under get_user_pages. | ||
544 | * GUP carefully breaks COW and raises page count (while holding | ||
545 | * page_table_lock, as we have here) to make sure that the page | ||
546 | * cannot be freed. If we unmap that page here, a user write | ||
547 | * access to the virtual address will bring back the page, but | ||
548 | * its raised count will (ironically) be taken to mean it's not | ||
549 | * an exclusive swap page, do_wp_page will replace it by a copy | ||
550 | * page, and the user never get to see the data GUP was holding | ||
551 | * the original page for. | ||
552 | * | ||
553 | * This test is also useful for when swapoff (unuse_process) has | ||
554 | * to drop page lock: its reference to the page stops existing | ||
555 | * ptes from being unmapped, so swapoff can make progress. | ||
556 | */ | ||
557 | if (PageSwapCache(page) && | ||
558 | page_count(page) != page_mapcount(page) + 2) { | ||
559 | ret = SWAP_FAIL; | ||
560 | goto out_unmap; | ||
561 | } | ||
562 | |||
563 | /* Nuke the page table entry. */ | 542 | /* Nuke the page table entry. */ |
564 | flush_cache_page(vma, address, page_to_pfn(page)); | 543 | flush_cache_page(vma, address, page_to_pfn(page)); |
565 | pteval = ptep_clear_flush(vma, address, pte); | 544 | pteval = ptep_clear_flush(vma, address, pte); |
diff --git a/mm/shmem.c b/mm/shmem.c index 61574b81d979..e64fa726a790 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -6,8 +6,8 @@ | |||
6 | * 2000-2001 Christoph Rohland | 6 | * 2000-2001 Christoph Rohland |
7 | * 2000-2001 SAP AG | 7 | * 2000-2001 SAP AG |
8 | * 2002 Red Hat Inc. | 8 | * 2002 Red Hat Inc. |
9 | * Copyright (C) 2002-2004 Hugh Dickins. | 9 | * Copyright (C) 2002-2005 Hugh Dickins. |
10 | * Copyright (C) 2002-2004 VERITAS Software Corporation. | 10 | * Copyright (C) 2002-2005 VERITAS Software Corporation. |
11 | * Copyright (C) 2004 Andi Kleen, SuSE Labs | 11 | * Copyright (C) 2004 Andi Kleen, SuSE Labs |
12 | * | 12 | * |
13 | * Extended attribute support for tmpfs: | 13 | * Extended attribute support for tmpfs: |
@@ -194,7 +194,7 @@ static DEFINE_SPINLOCK(shmem_swaplist_lock); | |||
194 | static void shmem_free_blocks(struct inode *inode, long pages) | 194 | static void shmem_free_blocks(struct inode *inode, long pages) |
195 | { | 195 | { |
196 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 196 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); |
197 | if (sbinfo) { | 197 | if (sbinfo->max_blocks) { |
198 | spin_lock(&sbinfo->stat_lock); | 198 | spin_lock(&sbinfo->stat_lock); |
199 | sbinfo->free_blocks += pages; | 199 | sbinfo->free_blocks += pages; |
200 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; | 200 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; |
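
The "if (sbinfo)" tests being replaced throughout this file reflect a new invariant: every tmpfs superblock now owns an sbinfo, and a max_blocks (or max_inodes) of zero means unlimited, so the stat_lock accounting is simply skipped. A one-line predicate capturing the convention, illustrative only and not part of the patch:

static inline int shmem_blocks_limited(struct shmem_sb_info *sbinfo)
{
	return sbinfo->max_blocks != 0;	/* 0 means unlimited: no accounting */
}
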
@@ -357,7 +357,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long | |||
357 | * page (and perhaps indirect index pages) yet to allocate: | 357 | * page (and perhaps indirect index pages) yet to allocate: |
358 | * a waste to allocate index if we cannot allocate data. | 358 | * a waste to allocate index if we cannot allocate data. |
359 | */ | 359 | */ |
360 | if (sbinfo) { | 360 | if (sbinfo->max_blocks) { |
361 | spin_lock(&sbinfo->stat_lock); | 361 | spin_lock(&sbinfo->stat_lock); |
362 | if (sbinfo->free_blocks <= 1) { | 362 | if (sbinfo->free_blocks <= 1) { |
363 | spin_unlock(&sbinfo->stat_lock); | 363 | spin_unlock(&sbinfo->stat_lock); |
@@ -677,8 +677,8 @@ static void shmem_delete_inode(struct inode *inode) | |||
677 | spin_unlock(&shmem_swaplist_lock); | 677 | spin_unlock(&shmem_swaplist_lock); |
678 | } | 678 | } |
679 | } | 679 | } |
680 | if (sbinfo) { | 680 | BUG_ON(inode->i_blocks); |
681 | BUG_ON(inode->i_blocks); | 681 | if (sbinfo->max_inodes) { |
682 | spin_lock(&sbinfo->stat_lock); | 682 | spin_lock(&sbinfo->stat_lock); |
683 | sbinfo->free_inodes++; | 683 | sbinfo->free_inodes++; |
684 | spin_unlock(&sbinfo->stat_lock); | 684 | spin_unlock(&sbinfo->stat_lock); |
@@ -1080,7 +1080,7 @@ repeat: | |||
1080 | } else { | 1080 | } else { |
1081 | shmem_swp_unmap(entry); | 1081 | shmem_swp_unmap(entry); |
1082 | sbinfo = SHMEM_SB(inode->i_sb); | 1082 | sbinfo = SHMEM_SB(inode->i_sb); |
1083 | if (sbinfo) { | 1083 | if (sbinfo->max_blocks) { |
1084 | spin_lock(&sbinfo->stat_lock); | 1084 | spin_lock(&sbinfo->stat_lock); |
1085 | if (sbinfo->free_blocks == 0 || | 1085 | if (sbinfo->free_blocks == 0 || |
1086 | shmem_acct_block(info->flags)) { | 1086 | shmem_acct_block(info->flags)) { |
@@ -1269,7 +1269,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1269 | struct shmem_inode_info *info; | 1269 | struct shmem_inode_info *info; |
1270 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 1270 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
1271 | 1271 | ||
1272 | if (sbinfo) { | 1272 | if (sbinfo->max_inodes) { |
1273 | spin_lock(&sbinfo->stat_lock); | 1273 | spin_lock(&sbinfo->stat_lock); |
1274 | if (!sbinfo->free_inodes) { | 1274 | if (!sbinfo->free_inodes) { |
1275 | spin_unlock(&sbinfo->stat_lock); | 1275 | spin_unlock(&sbinfo->stat_lock); |
@@ -1319,7 +1319,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1319 | mpol_shared_policy_init(&info->policy); | 1319 | mpol_shared_policy_init(&info->policy); |
1320 | break; | 1320 | break; |
1321 | } | 1321 | } |
1322 | } else if (sbinfo) { | 1322 | } else if (sbinfo->max_inodes) { |
1323 | spin_lock(&sbinfo->stat_lock); | 1323 | spin_lock(&sbinfo->stat_lock); |
1324 | sbinfo->free_inodes++; | 1324 | sbinfo->free_inodes++; |
1325 | spin_unlock(&sbinfo->stat_lock); | 1325 | spin_unlock(&sbinfo->stat_lock); |
@@ -1328,31 +1328,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
1328 | } | 1328 | } |
1329 | 1329 | ||
1330 | #ifdef CONFIG_TMPFS | 1330 | #ifdef CONFIG_TMPFS |
1331 | |||
1332 | static int shmem_set_size(struct shmem_sb_info *sbinfo, | ||
1333 | unsigned long max_blocks, unsigned long max_inodes) | ||
1334 | { | ||
1335 | int error; | ||
1336 | unsigned long blocks, inodes; | ||
1337 | |||
1338 | spin_lock(&sbinfo->stat_lock); | ||
1339 | blocks = sbinfo->max_blocks - sbinfo->free_blocks; | ||
1340 | inodes = sbinfo->max_inodes - sbinfo->free_inodes; | ||
1341 | error = -EINVAL; | ||
1342 | if (max_blocks < blocks) | ||
1343 | goto out; | ||
1344 | if (max_inodes < inodes) | ||
1345 | goto out; | ||
1346 | error = 0; | ||
1347 | sbinfo->max_blocks = max_blocks; | ||
1348 | sbinfo->free_blocks = max_blocks - blocks; | ||
1349 | sbinfo->max_inodes = max_inodes; | ||
1350 | sbinfo->free_inodes = max_inodes - inodes; | ||
1351 | out: | ||
1352 | spin_unlock(&sbinfo->stat_lock); | ||
1353 | return error; | ||
1354 | } | ||
1355 | |||
1356 | static struct inode_operations shmem_symlink_inode_operations; | 1331 | static struct inode_operations shmem_symlink_inode_operations; |
1357 | static struct inode_operations shmem_symlink_inline_operations; | 1332 | static struct inode_operations shmem_symlink_inline_operations; |
1358 | 1333 | ||
@@ -1607,15 +1582,17 @@ static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) | |||
1607 | buf->f_type = TMPFS_MAGIC; | 1582 | buf->f_type = TMPFS_MAGIC; |
1608 | buf->f_bsize = PAGE_CACHE_SIZE; | 1583 | buf->f_bsize = PAGE_CACHE_SIZE; |
1609 | buf->f_namelen = NAME_MAX; | 1584 | buf->f_namelen = NAME_MAX; |
1610 | if (sbinfo) { | 1585 | spin_lock(&sbinfo->stat_lock); |
1611 | spin_lock(&sbinfo->stat_lock); | 1586 | if (sbinfo->max_blocks) { |
1612 | buf->f_blocks = sbinfo->max_blocks; | 1587 | buf->f_blocks = sbinfo->max_blocks; |
1613 | buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; | 1588 | buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; |
1589 | } | ||
1590 | if (sbinfo->max_inodes) { | ||
1614 | buf->f_files = sbinfo->max_inodes; | 1591 | buf->f_files = sbinfo->max_inodes; |
1615 | buf->f_ffree = sbinfo->free_inodes; | 1592 | buf->f_ffree = sbinfo->free_inodes; |
1616 | spin_unlock(&sbinfo->stat_lock); | ||
1617 | } | 1593 | } |
1618 | /* else leave those fields 0 like simple_statfs */ | 1594 | /* else leave those fields 0 like simple_statfs */ |
1595 | spin_unlock(&sbinfo->stat_lock); | ||
1619 | return 0; | 1596 | return 0; |
1620 | } | 1597 | } |
1621 | 1598 | ||
@@ -1672,7 +1649,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr | |||
1672 | * but each new link needs a new dentry, pinning lowmem, and | 1649 | * but each new link needs a new dentry, pinning lowmem, and |
1673 | * tmpfs dentries cannot be pruned until they are unlinked. | 1650 | * tmpfs dentries cannot be pruned until they are unlinked. |
1674 | */ | 1651 | */ |
1675 | if (sbinfo) { | 1652 | if (sbinfo->max_inodes) { |
1676 | spin_lock(&sbinfo->stat_lock); | 1653 | spin_lock(&sbinfo->stat_lock); |
1677 | if (!sbinfo->free_inodes) { | 1654 | if (!sbinfo->free_inodes) { |
1678 | spin_unlock(&sbinfo->stat_lock); | 1655 | spin_unlock(&sbinfo->stat_lock); |
@@ -1697,7 +1674,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) | |||
1697 | 1674 | ||
1698 | if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { | 1675 | if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { |
1699 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 1676 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); |
1700 | if (sbinfo) { | 1677 | if (sbinfo->max_inodes) { |
1701 | spin_lock(&sbinfo->stat_lock); | 1678 | spin_lock(&sbinfo->stat_lock); |
1702 | sbinfo->free_inodes++; | 1679 | sbinfo->free_inodes++; |
1703 | spin_unlock(&sbinfo->stat_lock); | 1680 | spin_unlock(&sbinfo->stat_lock); |
@@ -1921,22 +1898,42 @@ bad_val: | |||
1921 | static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | 1898 | static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) |
1922 | { | 1899 | { |
1923 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 1900 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
1924 | unsigned long max_blocks = 0; | 1901 | unsigned long max_blocks = sbinfo->max_blocks; |
1925 | unsigned long max_inodes = 0; | 1902 | unsigned long max_inodes = sbinfo->max_inodes; |
1903 | unsigned long blocks; | ||
1904 | unsigned long inodes; | ||
1905 | int error = -EINVAL; | ||
1906 | |||
1907 | if (shmem_parse_options(data, NULL, NULL, NULL, | ||
1908 | &max_blocks, &max_inodes)) | ||
1909 | return error; | ||
1926 | 1910 | ||
1927 | if (sbinfo) { | 1911 | spin_lock(&sbinfo->stat_lock); |
1928 | max_blocks = sbinfo->max_blocks; | 1912 | blocks = sbinfo->max_blocks - sbinfo->free_blocks; |
1929 | max_inodes = sbinfo->max_inodes; | 1913 | inodes = sbinfo->max_inodes - sbinfo->free_inodes; |
1930 | } | 1914 | if (max_blocks < blocks) |
1931 | if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes)) | 1915 | goto out; |
1932 | return -EINVAL; | 1916 | if (max_inodes < inodes) |
1933 | /* Keep it simple: disallow limited <-> unlimited remount */ | 1917 | goto out; |
1934 | if ((max_blocks || max_inodes) == !sbinfo) | 1918 | /* |
1935 | return -EINVAL; | 1919 | * Those tests also disallow limited->unlimited while any are in |
1936 | /* But allow the pointless unlimited -> unlimited remount */ | 1920 | * use, so i_blocks will always be zero when max_blocks is zero; |
1937 | if (!sbinfo) | 1921 | * but we must separately disallow unlimited->limited, because |
1938 | return 0; | 1922 | * in that case we have no record of how much is already in use. |
1939 | return shmem_set_size(sbinfo, max_blocks, max_inodes); | 1923 | */ |
1924 | if (max_blocks && !sbinfo->max_blocks) | ||
1925 | goto out; | ||
1926 | if (max_inodes && !sbinfo->max_inodes) | ||
1927 | goto out; | ||
1928 | |||
1929 | error = 0; | ||
1930 | sbinfo->max_blocks = max_blocks; | ||
1931 | sbinfo->free_blocks = max_blocks - blocks; | ||
1932 | sbinfo->max_inodes = max_inodes; | ||
1933 | sbinfo->free_inodes = max_inodes - inodes; | ||
1934 | out: | ||
1935 | spin_unlock(&sbinfo->stat_lock); | ||
1936 | return error; | ||
1940 | } | 1937 | } |
1941 | #endif | 1938 | #endif |
1942 | 1939 | ||
@@ -1961,11 +1958,11 @@ static int shmem_fill_super(struct super_block *sb, | |||
1961 | uid_t uid = current->fsuid; | 1958 | uid_t uid = current->fsuid; |
1962 | gid_t gid = current->fsgid; | 1959 | gid_t gid = current->fsgid; |
1963 | int err = -ENOMEM; | 1960 | int err = -ENOMEM; |
1964 | 1961 | struct shmem_sb_info *sbinfo; | |
1965 | #ifdef CONFIG_TMPFS | ||
1966 | unsigned long blocks = 0; | 1962 | unsigned long blocks = 0; |
1967 | unsigned long inodes = 0; | 1963 | unsigned long inodes = 0; |
1968 | 1964 | ||
1965 | #ifdef CONFIG_TMPFS | ||
1969 | /* | 1966 | /* |
1970 | * Per default we only allow half of the physical ram per | 1967 | * Per default we only allow half of the physical ram per |
1971 | * tmpfs instance, limiting inodes to one per page of lowmem; | 1968 | * tmpfs instance, limiting inodes to one per page of lowmem; |
@@ -1976,34 +1973,34 @@ static int shmem_fill_super(struct super_block *sb, | |||
1976 | inodes = totalram_pages - totalhigh_pages; | 1973 | inodes = totalram_pages - totalhigh_pages; |
1977 | if (inodes > blocks) | 1974 | if (inodes > blocks) |
1978 | inodes = blocks; | 1975 | inodes = blocks; |
1979 | 1976 | if (shmem_parse_options(data, &mode, &uid, &gid, | |
1980 | if (shmem_parse_options(data, &mode, | 1977 | &blocks, &inodes)) |
1981 | &uid, &gid, &blocks, &inodes)) | ||
1982 | return -EINVAL; | 1978 | return -EINVAL; |
1983 | } | 1979 | } |
1984 | |||
1985 | if (blocks || inodes) { | ||
1986 | struct shmem_sb_info *sbinfo; | ||
1987 | sbinfo = kmalloc(sizeof(struct shmem_sb_info), GFP_KERNEL); | ||
1988 | if (!sbinfo) | ||
1989 | return -ENOMEM; | ||
1990 | sb->s_fs_info = sbinfo; | ||
1991 | spin_lock_init(&sbinfo->stat_lock); | ||
1992 | sbinfo->max_blocks = blocks; | ||
1993 | sbinfo->free_blocks = blocks; | ||
1994 | sbinfo->max_inodes = inodes; | ||
1995 | sbinfo->free_inodes = inodes; | ||
1996 | } | ||
1997 | sb->s_xattr = shmem_xattr_handlers; | ||
1998 | #else | 1980 | #else |
1999 | sb->s_flags |= MS_NOUSER; | 1981 | sb->s_flags |= MS_NOUSER; |
2000 | #endif | 1982 | #endif |
2001 | 1983 | ||
1984 | /* Round up to L1_CACHE_BYTES to resist false sharing */ | ||
1985 | sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), | ||
1986 | L1_CACHE_BYTES), GFP_KERNEL); | ||
1987 | if (!sbinfo) | ||
1988 | return -ENOMEM; | ||
1989 | |||
1990 | spin_lock_init(&sbinfo->stat_lock); | ||
1991 | sbinfo->max_blocks = blocks; | ||
1992 | sbinfo->free_blocks = blocks; | ||
1993 | sbinfo->max_inodes = inodes; | ||
1994 | sbinfo->free_inodes = inodes; | ||
1995 | |||
1996 | sb->s_fs_info = sbinfo; | ||
2002 | sb->s_maxbytes = SHMEM_MAX_BYTES; | 1997 | sb->s_maxbytes = SHMEM_MAX_BYTES; |
2003 | sb->s_blocksize = PAGE_CACHE_SIZE; | 1998 | sb->s_blocksize = PAGE_CACHE_SIZE; |
2004 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | 1999 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; |
2005 | sb->s_magic = TMPFS_MAGIC; | 2000 | sb->s_magic = TMPFS_MAGIC; |
2006 | sb->s_op = &shmem_ops; | 2001 | sb->s_op = &shmem_ops; |
2002 | sb->s_xattr = shmem_xattr_handlers; | ||
2003 | |||
2007 | inode = shmem_get_inode(sb, S_IFDIR | mode, 0); | 2004 | inode = shmem_get_inode(sb, S_IFDIR | mode, 0); |
2008 | if (!inode) | 2005 | if (!inode) |
2009 | goto failed; | 2006 | goto failed; |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -2851,6 +2851,7 @@ next: | |||
2851 | } | 2851 | } |
2852 | check_irq_on(); | 2852 | check_irq_on(); |
2853 | up(&cache_chain_sem); | 2853 | up(&cache_chain_sem); |
2854 | drain_remote_pages(); | ||
2854 | /* Setup the next iteration */ | 2855 | /* Setup the next iteration */ |
2855 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); | 2856 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id()); |
2856 | } | 2857 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index da48405cd9a3..60cd24a55204 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -276,61 +276,37 @@ void swap_free(swp_entry_t entry) | |||
276 | } | 276 | } |
277 | 277 | ||
278 | /* | 278 | /* |
279 | * Check if we're the only user of a swap page, | 279 | * How many references to page are currently swapped out? |
280 | * when the page is locked. | ||
281 | */ | 280 | */ |
282 | static int exclusive_swap_page(struct page *page) | 281 | static inline int page_swapcount(struct page *page) |
283 | { | 282 | { |
284 | int retval = 0; | 283 | int count = 0; |
285 | struct swap_info_struct * p; | 284 | struct swap_info_struct *p; |
286 | swp_entry_t entry; | 285 | swp_entry_t entry; |
287 | 286 | ||
288 | entry.val = page->private; | 287 | entry.val = page->private; |
289 | p = swap_info_get(entry); | 288 | p = swap_info_get(entry); |
290 | if (p) { | 289 | if (p) { |
291 | /* Is the only swap cache user the cache itself? */ | 290 | /* Subtract the 1 for the swap cache itself */ |
292 | if (p->swap_map[swp_offset(entry)] == 1) { | 291 | count = p->swap_map[swp_offset(entry)] - 1; |
293 | /* Recheck the page count with the swapcache lock held.. */ | ||
294 | write_lock_irq(&swapper_space.tree_lock); | ||
295 | if (page_count(page) == 2) | ||
296 | retval = 1; | ||
297 | write_unlock_irq(&swapper_space.tree_lock); | ||
298 | } | ||
299 | swap_info_put(p); | 292 | swap_info_put(p); |
300 | } | 293 | } |
301 | return retval; | 294 | return count; |
302 | } | 295 | } |
303 | 296 | ||
304 | /* | 297 | /* |
305 | * We can use this swap cache entry directly | 298 | * We can use this swap cache entry directly |
306 | * if there are no other references to it. | 299 | * if there are no other references to it. |
307 | * | ||
308 | * Here "exclusive_swap_page()" does the real | ||
309 | * work, but we opportunistically check whether | ||
310 | * we need to get all the locks first.. | ||
311 | */ | 300 | */ |
312 | int can_share_swap_page(struct page *page) | 301 | int can_share_swap_page(struct page *page) |
313 | { | 302 | { |
314 | int retval = 0; | 303 | int count; |
315 | 304 | ||
316 | if (!PageLocked(page)) | 305 | BUG_ON(!PageLocked(page)); |
317 | BUG(); | 306 | count = page_mapcount(page); |
318 | switch (page_count(page)) { | 307 | if (count <= 1 && PageSwapCache(page)) |
319 | case 3: | 308 | count += page_swapcount(page); |
320 | if (!PagePrivate(page)) | 309 | return count == 1; |
321 | break; | ||
322 | /* Fallthrough */ | ||
323 | case 2: | ||
324 | if (!PageSwapCache(page)) | ||
325 | break; | ||
326 | retval = exclusive_swap_page(page); | ||
327 | break; | ||
328 | case 1: | ||
329 | if (PageReserved(page)) | ||
330 | break; | ||
331 | retval = 1; | ||
332 | } | ||
333 | return retval; | ||
334 | } | 310 | } |
335 | 311 | ||
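
can_share_swap_page() now answers the question directly: count the pte mappings, and if the page sits in swap cache add the swap references reported by page_swapcount() (the cache's own reference is already subtracted); the page is exclusive only when that total is exactly one. A standalone model of the decision, illustrative rather than kernel code:

#include <assert.h>

/* Model of the new test: the page is exclusively ours iff its pte mapping
 * count plus its swap references (excluding the swap cache's own reference)
 * total exactly one. */
static int can_share(int mapcount, int swapcount_excl_cache, int in_swap_cache)
{
	int count = mapcount;

	if (count <= 1 && in_swap_cache)
		count += swapcount_excl_cache;
	return count == 1;
}

int main(void)
{
	assert(can_share(1, 0, 1));	/* mapped once, swap entry held only by the cache */
	assert(!can_share(1, 1, 1));	/* another reference still holds the swap entry */
	assert(!can_share(2, 0, 0));	/* mapped by two address spaces */
	return 0;
}
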
336 | /* | 312 | /* |
@@ -529,9 +505,10 @@ static int unuse_mm(struct mm_struct *mm, | |||
529 | 505 | ||
530 | if (!down_read_trylock(&mm->mmap_sem)) { | 506 | if (!down_read_trylock(&mm->mmap_sem)) { |
531 | /* | 507 | /* |
532 | * Our reference to the page stops try_to_unmap_one from | 508 | * Activate page so shrink_cache is unlikely to unmap its |
533 | * unmapping its ptes, so swapoff can make progress. | 509 | * ptes while lock is dropped, so swapoff can make progress. |
534 | */ | 510 | */ |
511 | activate_page(page); | ||
535 | unlock_page(page); | 512 | unlock_page(page); |
536 | down_read(&mm->mmap_sem); | 513 | down_read(&mm->mmap_sem); |
537 | lock_page(page); | 514 | lock_page(page); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 269eded9b459..4b8e62a19370 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -74,6 +74,9 @@ struct scan_control { | |||
74 | 74 | ||
75 | int may_writepage; | 75 | int may_writepage; |
76 | 76 | ||
77 | /* Can pages be swapped as part of reclaim? */ | ||
78 | int may_swap; | ||
79 | |||
77 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | 80 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for |
78 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | 81 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. |
79 | * In this context, it doesn't matter that we scan the | 82 | * In this context, it doesn't matter that we scan the |
@@ -180,17 +183,20 @@ EXPORT_SYMBOL(remove_shrinker); | |||
180 | * `lru_pages' represents the number of on-LRU pages in all the zones which | 183 | * `lru_pages' represents the number of on-LRU pages in all the zones which |
181 | * are eligible for the caller's allocation attempt. It is used for balancing | 184 | * are eligible for the caller's allocation attempt. It is used for balancing |
182 | * slab reclaim versus page reclaim. | 185 | * slab reclaim versus page reclaim. |
186 | * | ||
187 | * Returns the number of slab objects which we shrunk. | ||
183 | */ | 188 | */ |
184 | static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, | 189 | static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, |
185 | unsigned long lru_pages) | 190 | unsigned long lru_pages) |
186 | { | 191 | { |
187 | struct shrinker *shrinker; | 192 | struct shrinker *shrinker; |
193 | int ret = 0; | ||
188 | 194 | ||
189 | if (scanned == 0) | 195 | if (scanned == 0) |
190 | scanned = SWAP_CLUSTER_MAX; | 196 | scanned = SWAP_CLUSTER_MAX; |
191 | 197 | ||
192 | if (!down_read_trylock(&shrinker_rwsem)) | 198 | if (!down_read_trylock(&shrinker_rwsem)) |
193 | return 0; | 199 | return 1; /* Assume we'll be able to shrink next time */ |
194 | 200 | ||
195 | list_for_each_entry(shrinker, &shrinker_list, list) { | 201 | list_for_each_entry(shrinker, &shrinker_list, list) { |
196 | unsigned long long delta; | 202 | unsigned long long delta; |
@@ -209,10 +215,14 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, | |||
209 | while (total_scan >= SHRINK_BATCH) { | 215 | while (total_scan >= SHRINK_BATCH) { |
210 | long this_scan = SHRINK_BATCH; | 216 | long this_scan = SHRINK_BATCH; |
211 | int shrink_ret; | 217 | int shrink_ret; |
218 | int nr_before; | ||
212 | 219 | ||
220 | nr_before = (*shrinker->shrinker)(0, gfp_mask); | ||
213 | shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); | 221 | shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); |
214 | if (shrink_ret == -1) | 222 | if (shrink_ret == -1) |
215 | break; | 223 | break; |
224 | if (shrink_ret < nr_before) | ||
225 | ret += nr_before - shrink_ret; | ||
216 | mod_page_state(slabs_scanned, this_scan); | 226 | mod_page_state(slabs_scanned, this_scan); |
217 | total_scan -= this_scan; | 227 | total_scan -= this_scan; |
218 | 228 | ||
@@ -222,7 +232,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, | |||
222 | shrinker->nr += total_scan; | 232 | shrinker->nr += total_scan; |
223 | } | 233 | } |
224 | up_read(&shrinker_rwsem); | 234 | up_read(&shrinker_rwsem); |
225 | return 0; | 235 | return ret; |
226 | } | 236 | } |
227 | 237 | ||
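
The nr_before sampling added here leans on the established shrinker convention: calling the callback with nr_to_scan == 0 merely asks for the current object count, a real scan returns the remaining count, and -1 means the cache cannot shrink under this gfp_mask. An illustrative callback following that contract (the my_cache_* helpers are made up):

static int my_cache_shrink(int nr_to_scan, unsigned int gfp_mask)
{
	if (!nr_to_scan)
		return my_cache_object_count();	/* query only, no work done */

	if (!(gfp_mask & __GFP_FS))
		return -1;			/* cannot make progress in this context */

	my_cache_free_some(nr_to_scan);		/* scan and free up to nr_to_scan objects */
	return my_cache_object_count();		/* remaining objects, compared against nr_before */
}
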
228 | /* Called without lock on whether page is mapped, so answer is unstable */ | 238 | /* Called without lock on whether page is mapped, so answer is unstable */ |
@@ -407,7 +417,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
407 | * Anonymous process memory has backing store? | 417 | * Anonymous process memory has backing store? |
408 | * Try to allocate it some swap space here. | 418 | * Try to allocate it some swap space here. |
409 | */ | 419 | */ |
410 | if (PageAnon(page) && !PageSwapCache(page)) { | 420 | if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) { |
411 | if (!add_to_swap(page)) | 421 | if (!add_to_swap(page)) |
412 | goto activate_locked; | 422 | goto activate_locked; |
413 | } | 423 | } |
@@ -890,7 +900,9 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
890 | if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) | 900 | if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) |
891 | continue; /* Let kswapd poll it */ | 901 | continue; /* Let kswapd poll it */ |
892 | 902 | ||
903 | atomic_inc(&zone->reclaim_in_progress); | ||
893 | shrink_zone(zone, sc); | 904 | shrink_zone(zone, sc); |
905 | atomic_dec(&zone->reclaim_in_progress); | ||
894 | } | 906 | } |
895 | } | 907 | } |
896 | 908 | ||
@@ -907,8 +919,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
907 | * holds filesystem locks which prevent writeout this might not work, and the | 919 | * holds filesystem locks which prevent writeout this might not work, and the |
908 | * allocation attempt will fail. | 920 | * allocation attempt will fail. |
909 | */ | 921 | */ |
910 | int try_to_free_pages(struct zone **zones, | 922 | int try_to_free_pages(struct zone **zones, unsigned int gfp_mask) |
911 | unsigned int gfp_mask, unsigned int order) | ||
912 | { | 923 | { |
913 | int priority; | 924 | int priority; |
914 | int ret = 0; | 925 | int ret = 0; |
@@ -920,6 +931,7 @@ int try_to_free_pages(struct zone **zones, | |||
920 | 931 | ||
921 | sc.gfp_mask = gfp_mask; | 932 | sc.gfp_mask = gfp_mask; |
922 | sc.may_writepage = 0; | 933 | sc.may_writepage = 0; |
934 | sc.may_swap = 1; | ||
923 | 935 | ||
924 | inc_page_state(allocstall); | 936 | inc_page_state(allocstall); |
925 | 937 | ||
@@ -1020,6 +1032,7 @@ loop_again: | |||
1020 | total_reclaimed = 0; | 1032 | total_reclaimed = 0; |
1021 | sc.gfp_mask = GFP_KERNEL; | 1033 | sc.gfp_mask = GFP_KERNEL; |
1022 | sc.may_writepage = 0; | 1034 | sc.may_writepage = 0; |
1035 | sc.may_swap = 1; | ||
1023 | sc.nr_mapped = read_page_state(nr_mapped); | 1036 | sc.nr_mapped = read_page_state(nr_mapped); |
1024 | 1037 | ||
1025 | inc_page_state(pageoutrun); | 1038 | inc_page_state(pageoutrun); |
@@ -1079,6 +1092,7 @@ scan: | |||
1079 | */ | 1092 | */ |
1080 | for (i = 0; i <= end_zone; i++) { | 1093 | for (i = 0; i <= end_zone; i++) { |
1081 | struct zone *zone = pgdat->node_zones + i; | 1094 | struct zone *zone = pgdat->node_zones + i; |
1095 | int nr_slab; | ||
1082 | 1096 | ||
1083 | if (zone->present_pages == 0) | 1097 | if (zone->present_pages == 0) |
1084 | continue; | 1098 | continue; |
@@ -1098,16 +1112,19 @@ scan: | |||
1098 | sc.nr_reclaimed = 0; | 1112 | sc.nr_reclaimed = 0; |
1099 | sc.priority = priority; | 1113 | sc.priority = priority; |
1100 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; | 1114 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; |
1115 | atomic_inc(&zone->reclaim_in_progress); | ||
1101 | shrink_zone(zone, &sc); | 1116 | shrink_zone(zone, &sc); |
1117 | atomic_dec(&zone->reclaim_in_progress); | ||
1102 | reclaim_state->reclaimed_slab = 0; | 1118 | reclaim_state->reclaimed_slab = 0; |
1103 | shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); | 1119 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1120 | lru_pages); | ||
1104 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 1121 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
1105 | total_reclaimed += sc.nr_reclaimed; | 1122 | total_reclaimed += sc.nr_reclaimed; |
1106 | total_scanned += sc.nr_scanned; | 1123 | total_scanned += sc.nr_scanned; |
1107 | if (zone->all_unreclaimable) | 1124 | if (zone->all_unreclaimable) |
1108 | continue; | 1125 | continue; |
1109 | if (zone->pages_scanned >= (zone->nr_active + | 1126 | if (nr_slab == 0 && zone->pages_scanned >= |
1110 | zone->nr_inactive) * 4) | 1127 | (zone->nr_active + zone->nr_inactive) * 4) |
1111 | zone->all_unreclaimable = 1; | 1128 | zone->all_unreclaimable = 1; |
1112 | /* | 1129 | /* |
1113 | * If we've done a decent amount of scanning and | 1130 | * If we've done a decent amount of scanning and |
@@ -1309,3 +1326,73 @@ static int __init kswapd_init(void) | |||
1309 | } | 1326 | } |
1310 | 1327 | ||
1311 | module_init(kswapd_init) | 1328 | module_init(kswapd_init) |
1329 | |||
1330 | |||
1331 | /* | ||
1332 | * Try to free up some pages from this zone through reclaim. | ||
1333 | */ | ||
1334 | int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) | ||
1335 | { | ||
1336 | struct scan_control sc; | ||
1337 | int nr_pages = 1 << order; | ||
1338 | int total_reclaimed = 0; | ||
1339 | |||
1340 | /* The reclaim may sleep, so don't do it if sleep isn't allowed */ | ||
1341 | if (!(gfp_mask & __GFP_WAIT)) | ||
1342 | return 0; | ||
1343 | if (zone->all_unreclaimable) | ||
1344 | return 0; | ||
1345 | |||
1346 | sc.gfp_mask = gfp_mask; | ||
1347 | sc.may_writepage = 0; | ||
1348 | sc.may_swap = 0; | ||
1349 | sc.nr_mapped = read_page_state(nr_mapped); | ||
1350 | sc.nr_scanned = 0; | ||
1351 | sc.nr_reclaimed = 0; | ||
1352 | /* scan at the highest priority */ | ||
1353 | sc.priority = 0; | ||
1354 | |||
1355 | if (nr_pages > SWAP_CLUSTER_MAX) | ||
1356 | sc.swap_cluster_max = nr_pages; | ||
1357 | else | ||
1358 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
1359 | |||
1360 | /* Don't reclaim the zone if there are other reclaimers active */ | ||
1361 | if (!atomic_inc_and_test(&zone->reclaim_in_progress)) | ||
1362 | goto out; | ||
1363 | |||
1364 | shrink_zone(zone, &sc); | ||
1365 | total_reclaimed = sc.nr_reclaimed; | ||
1366 | |||
1367 | out: | ||
1368 | atomic_dec(&zone->reclaim_in_progress); | ||
1369 | return total_reclaimed; | ||
1370 | } | ||
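
zone_reclaim() only runs when it is the sole reclaimer in the zone: the counter starts at -1 (see the free_area_init_core hunk earlier in this patch), kswapd and try_to_free_pages bracket shrink_zone() with plain inc/dec, so atomic_inc_and_test() reaches zero only for the first, uncontended caller, and the out: path decrements unconditionally. A small userspace analogue of that gate, using C11 atomics purely for illustration:

#include <stdatomic.h>
#include <stdbool.h>

/* Starts at -1, mirroring atomic_set(&zone->reclaim_in_progress, -1). */
static atomic_int reclaim_in_progress = ATOMIC_VAR_INIT(-1);

/* True only when no kswapd, direct reclaim or other zone_reclaim() call
 * is active in the zone, i.e. the counter was still at -1. */
static bool try_enter_zone_reclaim(void)
{
	return atomic_fetch_add(&reclaim_in_progress, 1) == -1;
}

/* Always called on exit, mirroring the "out:" label above. */
static void leave_zone_reclaim(void)
{
	atomic_fetch_sub(&reclaim_in_progress, 1);
}
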
1371 | |||
1372 | asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, | ||
1373 | unsigned int state) | ||
1374 | { | ||
1375 | struct zone *z; | ||
1376 | int i; | ||
1377 | |||
1378 | if (node >= MAX_NUMNODES || !node_online(node)) | ||
1379 | return -EINVAL; | ||
1380 | |||
1381 | /* This will break if we ever add more zones */ | ||
1382 | if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM))) | ||
1383 | return -EINVAL; | ||
1384 | |||
1385 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1386 | if (!(zone & 1<<i)) | ||
1387 | continue; | ||
1388 | |||
1389 | z = &NODE_DATA(node)->node_zones[i]; | ||
1390 | |||
1391 | if (state) | ||
1392 | z->reclaim_pages = 1; | ||
1393 | else | ||
1394 | z->reclaim_pages = 0; | ||
1395 | } | ||
1396 | |||
1397 | return 0; | ||
1398 | } | ||
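
A hedged userspace sketch of exercising the new syscall. The __NR_set_zone_reclaim constant is assumed to be provided by the architecture's unistd headers (it is not defined in this hunk), and the zone argument is the same 1<<ZONE_* bitmask validated above:

#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_set_zone_reclaim
#error "set_zone_reclaim syscall number not defined for this architecture"
#endif

int main(void)
{
	/* Enable per-zone reclaim for ZONE_NORMAL (bit 1) on node 0. */
	long ret = syscall(__NR_set_zone_reclaim, 0, 1 << 1, 1);

	return ret ? 1 : 0;
}
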