Diffstat (limited to 'mm')
-rw-r--r--   mm/backing-dev.c    |  47
-rw-r--r--   mm/filemap.c        |  15
-rw-r--r--   mm/fremap.c         |  26
-rw-r--r--   mm/mmap.c           |   3
-rw-r--r--   mm/nommu.c          |   1
-rw-r--r--   mm/oom_kill.c       | 107
-rw-r--r--   mm/page-writeback.c | 300
-rw-r--r--   mm/page_alloc.c     |  23
-rw-r--r--   mm/readahead.c      |   6
-rw-r--r--   mm/rmap.c           |   4
-rw-r--r--   mm/shmem.c          |  20
-rw-r--r--   mm/slab.c           |  14
-rw-r--r--   mm/slob.c           |   6
-rw-r--r--   mm/slub.c           |  30
-rw-r--r--   mm/swap.c           |   5
-rw-r--r--   mm/tiny-shmem.c     |  19
-rw-r--r--   mm/truncate.c       |   3
-rw-r--r--   mm/vmscan.c         |  40
-rw-r--r--   mm/vmstat.c         |   2
19 files changed, 484 insertions, 187 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a2811f9dc..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -5,6 +5,41 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 
+int bdi_init(struct backing_dev_info *bdi)
+{
+	int i, j;
+	int err;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		if (err)
+			goto err;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+
+	if (err) {
+err:
+		for (j = 0; j < i; j++)
+			percpu_counter_destroy(&bdi->bdi_stat[i]);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	int i;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
 static wait_queue_head_t congestion_wqh[2] = {
 	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 	__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -55,15 +90,3 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
-/**
- * congestion_end - wake up sleepers on a congested backing_dev_info
- * @rw: READ or WRITE
- */
-void congestion_end(int rw)
-{
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	if (waitqueue_active(wqh))
-		wake_up(wqh);
-}
-EXPORT_SYMBOL(congestion_end);
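The bdi_init()/bdi_destroy() pair added above follows the usual partial-initialization cleanup pattern: if setting up one of the per-CPU counters fails, only the counters that were already brought up are torn down before the error is returned. A minimal userspace sketch of that pattern, assuming illustrative names (counter_init, stats_init) that are not part of the kernel API:

#include <stdio.h>
#include <stdlib.h>

#define NR_ITEMS 4

struct counter { long *buf; };

static int counter_init(struct counter *c)
{
	c->buf = calloc(1, sizeof(long));
	return c->buf ? 0 : -1;
}

static void counter_destroy(struct counter *c)
{
	free(c->buf);
	c->buf = NULL;
}

static int stats_init(struct counter stat[NR_ITEMS])
{
	int i, err = 0;

	for (i = 0; i < NR_ITEMS; i++) {
		err = counter_init(&stat[i]);
		if (err)
			break;
	}
	if (err) {
		/* tear down only what was successfully initialized */
		while (--i >= 0)
			counter_destroy(&stat[i]);
	}
	return err;
}

int main(void)
{
	struct counter stat[NR_ITEMS];

	if (stats_init(stat) == 0)
		printf("all %d counters initialized\n", NR_ITEMS);
	return 0;
}

The kernel version additionally initializes the per-BDI completion proportion (prop_local_init_percpu) and funnels its failure through the same error label.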
diff --git a/mm/filemap.c b/mm/filemap.c
index c6049e947cd9..79f24a969cb4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
64 | * ->swap_lock (exclusive_swap_page, others) | 64 | * ->swap_lock (exclusive_swap_page, others) |
65 | * ->mapping->tree_lock | 65 | * ->mapping->tree_lock |
66 | * ->zone.lock | ||
66 | * | 67 | * |
67 | * ->i_mutex | 68 | * ->i_mutex |
68 | * ->i_mmap_lock (truncate->unmap_mapping_range) | 69 | * ->i_mmap_lock (truncate->unmap_mapping_range) |
@@ -1626,12 +1627,18 @@ int __remove_suid(struct dentry *dentry, int kill) | |||
1626 | 1627 | ||
1627 | int remove_suid(struct dentry *dentry) | 1628 | int remove_suid(struct dentry *dentry) |
1628 | { | 1629 | { |
1629 | int kill = should_remove_suid(dentry); | 1630 | int killsuid = should_remove_suid(dentry); |
1631 | int killpriv = security_inode_need_killpriv(dentry); | ||
1632 | int error = 0; | ||
1630 | 1633 | ||
1631 | if (unlikely(kill)) | 1634 | if (killpriv < 0) |
1632 | return __remove_suid(dentry, kill); | 1635 | return killpriv; |
1636 | if (killpriv) | ||
1637 | error = security_inode_killpriv(dentry); | ||
1638 | if (!error && killsuid) | ||
1639 | error = __remove_suid(dentry, killsuid); | ||
1633 | 1640 | ||
1634 | return 0; | 1641 | return error; |
1635 | } | 1642 | } |
1636 | EXPORT_SYMBOL(remove_suid); | 1643 | EXPORT_SYMBOL(remove_suid); |
1637 | 1644 | ||
diff --git a/mm/fremap.c b/mm/fremap.c
index 95bcb5641c72..14bd3bf7826e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,7 +5,7 @@ | |||
5 | * | 5 | * |
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 |
7 | */ | 7 | */ |
8 | 8 | #include <linux/backing-dev.h> | |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 10 | #include <linux/swap.h> |
11 | #include <linux/file.h> | 11 | #include <linux/file.h> |
@@ -97,26 +97,28 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | |||
97 | 97 | ||
98 | } | 98 | } |
99 | 99 | ||
100 | /*** | 100 | /** |
101 | * sys_remap_file_pages - remap arbitrary pages of a shared backing store | 101 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma |
102 | * file within an existing vma. | ||
103 | * @start: start of the remapped virtual memory range | 102 | * @start: start of the remapped virtual memory range |
104 | * @size: size of the remapped virtual memory range | 103 | * @size: size of the remapped virtual memory range |
105 | * @prot: new protection bits of the range | 104 | * @prot: new protection bits of the range (see NOTE) |
106 | * @pgoff: to be mapped page of the backing store file | 105 | * @pgoff: to-be-mapped page of the backing store file |
107 | * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. | 106 | * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. |
108 | * | 107 | * |
109 | * this syscall works purely via pagetables, so it's the most efficient | 108 | * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma |
109 | * (shared backing store file). | ||
110 | * | ||
111 | * This syscall works purely via pagetables, so it's the most efficient | ||
110 | * way to map the same (large) file into a given virtual window. Unlike | 112 | * way to map the same (large) file into a given virtual window. Unlike |
111 | * mmap()/mremap() it does not create any new vmas. The new mappings are | 113 | * mmap()/mremap() it does not create any new vmas. The new mappings are |
112 | * also safe across swapout. | 114 | * also safe across swapout. |
113 | * | 115 | * |
114 | * NOTE: the 'prot' parameter right now is ignored, and the vma's default | 116 | * NOTE: the 'prot' parameter right now is ignored (but must be zero), |
115 | * protection is used. Arbitrary protections might be implemented in the | 117 | * and the vma's default protection is used. Arbitrary protections |
116 | * future. | 118 | * might be implemented in the future. |
117 | */ | 119 | */ |
118 | asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | 120 | asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, |
119 | unsigned long __prot, unsigned long pgoff, unsigned long flags) | 121 | unsigned long prot, unsigned long pgoff, unsigned long flags) |
120 | { | 122 | { |
121 | struct mm_struct *mm = current->mm; | 123 | struct mm_struct *mm = current->mm; |
122 | struct address_space *mapping; | 124 | struct address_space *mapping; |
@@ -125,7 +127,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | |||
125 | int err = -EINVAL; | 127 | int err = -EINVAL; |
126 | int has_write_lock = 0; | 128 | int has_write_lock = 0; |
127 | 129 | ||
128 | if (__prot) | 130 | if (prot) |
129 | return err; | 131 | return err; |
130 | /* | 132 | /* |
131 | * Sanitize the syscall parameters: | 133 | * Sanitize the syscall parameters: |
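For reference, the syscall documented above is normally driven from user space along these lines. This is a hedged usage sketch: the file name and page offsets are made up, it relies on the glibc remap_file_pages() wrapper, and it respects the requirement spelled out in the updated kerneldoc that prot be zero.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	int fd = open("datafile", O_RDWR);
	if (fd < 0)
		return 1;

	/* Map four pages of the file; the vma must be MAP_SHARED. */
	char *win = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (win == MAP_FAILED)
		return 1;

	/*
	 * Make the first page of the window show file page 3 instead of
	 * file page 0; prot must be 0, flags may be 0 or MAP_NONBLOCK.
	 */
	if (remap_file_pages(win, pg, 0, 3, 0) != 0)
		perror("remap_file_pages");

	munmap(win, 4 * pg);
	close(fd);
	return 0;
}

Because the remapping is done purely in the page tables, no new vma is created for the rearranged window.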
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -7,6 +7,7 @@
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/slab.h> | 9 | #include <linux/slab.h> |
10 | #include <linux/backing-dev.h> | ||
10 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
11 | #include <linux/shm.h> | 12 | #include <linux/shm.h> |
12 | #include <linux/mman.h> | 13 | #include <linux/mman.h> |
@@ -180,8 +181,6 @@ error: | |||
180 | return -ENOMEM; | 181 | return -ENOMEM; |
181 | } | 182 | } |
182 | 183 | ||
183 | EXPORT_SYMBOL(__vm_enough_memory); | ||
184 | |||
185 | /* | 184 | /* |
186 | * Requires inode->i_mapping->i_mmap_lock | 185 | * Requires inode->i_mapping->i_mmap_lock |
187 | */ | 186 | */ |
diff --git a/mm/nommu.c b/mm/nommu.c
index 8ed0cb43118a..42fb84e9e815 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,7 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | |||
44 | int heap_stack_gap = 0; | 44 | int heap_stack_gap = 0; |
45 | 45 | ||
46 | EXPORT_SYMBOL(mem_map); | 46 | EXPORT_SYMBOL(mem_map); |
47 | EXPORT_SYMBOL(__vm_enough_memory); | ||
48 | EXPORT_SYMBOL(num_physpages); | 47 | EXPORT_SYMBOL(num_physpages); |
49 | 48 | ||
50 | /* list of shareable VMAs */ | 49 | /* list of shareable VMAs */ |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 41b4e362221d..a64decb5b13f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -27,6 +27,8 @@ | |||
27 | #include <linux/notifier.h> | 27 | #include <linux/notifier.h> |
28 | 28 | ||
29 | int sysctl_panic_on_oom; | 29 | int sysctl_panic_on_oom; |
30 | int sysctl_oom_kill_allocating_task; | ||
31 | static DEFINE_SPINLOCK(zone_scan_mutex); | ||
30 | /* #define DEBUG */ | 32 | /* #define DEBUG */ |
31 | 33 | ||
32 | /** | 34 | /** |
@@ -141,7 +143,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
141 | * because p may have allocated or otherwise mapped memory on | 143 | * because p may have allocated or otherwise mapped memory on |
142 | * this node before. However it will be less likely. | 144 | * this node before. However it will be less likely. |
143 | */ | 145 | */ |
144 | if (!cpuset_excl_nodes_overlap(p)) | 146 | if (!cpuset_mems_allowed_intersects(current, p)) |
145 | points /= 8; | 147 | points /= 8; |
146 | 148 | ||
147 | /* | 149 | /* |
@@ -164,16 +166,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
164 | } | 166 | } |
165 | 167 | ||
166 | /* | 168 | /* |
167 | * Types of limitations to the nodes from which allocations may occur | ||
168 | */ | ||
169 | #define CONSTRAINT_NONE 1 | ||
170 | #define CONSTRAINT_MEMORY_POLICY 2 | ||
171 | #define CONSTRAINT_CPUSET 3 | ||
172 | |||
173 | /* | ||
174 | * Determine the type of allocation constraint. | 169 | * Determine the type of allocation constraint. |
175 | */ | 170 | */ |
176 | static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | 171 | static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, |
172 | gfp_t gfp_mask) | ||
177 | { | 173 | { |
178 | #ifdef CONFIG_NUMA | 174 | #ifdef CONFIG_NUMA |
179 | struct zone **z; | 175 | struct zone **z; |
@@ -337,12 +333,20 @@ static int oom_kill_task(struct task_struct *p) | |||
337 | return 0; | 333 | return 0; |
338 | } | 334 | } |
339 | 335 | ||
340 | static int oom_kill_process(struct task_struct *p, unsigned long points, | 336 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
341 | const char *message) | 337 | unsigned long points, const char *message) |
342 | { | 338 | { |
343 | struct task_struct *c; | 339 | struct task_struct *c; |
344 | struct list_head *tsk; | 340 | struct list_head *tsk; |
345 | 341 | ||
342 | if (printk_ratelimit()) { | ||
343 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
344 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
345 | current->comm, gfp_mask, order, current->oomkilladj); | ||
346 | dump_stack(); | ||
347 | show_mem(); | ||
348 | } | ||
349 | |||
346 | /* | 350 | /* |
347 | * If the task is already exiting, don't alarm the sysadmin or kill | 351 | * If the task is already exiting, don't alarm the sysadmin or kill |
348 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 352 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
@@ -380,6 +384,57 @@ int unregister_oom_notifier(struct notifier_block *nb) | |||
380 | } | 384 | } |
381 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); | 385 | EXPORT_SYMBOL_GPL(unregister_oom_notifier); |
382 | 386 | ||
387 | /* | ||
388 | * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero | ||
389 | * if a parallel OOM killing is already taking place that includes a zone in | ||
390 | * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. | ||
391 | */ | ||
392 | int try_set_zone_oom(struct zonelist *zonelist) | ||
393 | { | ||
394 | struct zone **z; | ||
395 | int ret = 1; | ||
396 | |||
397 | z = zonelist->zones; | ||
398 | |||
399 | spin_lock(&zone_scan_mutex); | ||
400 | do { | ||
401 | if (zone_is_oom_locked(*z)) { | ||
402 | ret = 0; | ||
403 | goto out; | ||
404 | } | ||
405 | } while (*(++z) != NULL); | ||
406 | |||
407 | /* | ||
408 | * Lock each zone in the zonelist under zone_scan_mutex so a parallel | ||
409 | * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. | ||
410 | */ | ||
411 | z = zonelist->zones; | ||
412 | do { | ||
413 | zone_set_flag(*z, ZONE_OOM_LOCKED); | ||
414 | } while (*(++z) != NULL); | ||
415 | out: | ||
416 | spin_unlock(&zone_scan_mutex); | ||
417 | return ret; | ||
418 | } | ||
419 | |||
420 | /* | ||
421 | * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed | ||
422 | * allocation attempts with zonelists containing them may now recall the OOM | ||
423 | * killer, if necessary. | ||
424 | */ | ||
425 | void clear_zonelist_oom(struct zonelist *zonelist) | ||
426 | { | ||
427 | struct zone **z; | ||
428 | |||
429 | z = zonelist->zones; | ||
430 | |||
431 | spin_lock(&zone_scan_mutex); | ||
432 | do { | ||
433 | zone_clear_flag(*z, ZONE_OOM_LOCKED); | ||
434 | } while (*(++z) != NULL); | ||
435 | spin_unlock(&zone_scan_mutex); | ||
436 | } | ||
437 | |||
383 | /** | 438 | /** |
384 | * out_of_memory - kill the "best" process when we run out of memory | 439 | * out_of_memory - kill the "best" process when we run out of memory |
385 | * | 440 | * |
@@ -393,21 +448,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
393 | struct task_struct *p; | 448 | struct task_struct *p; |
394 | unsigned long points = 0; | 449 | unsigned long points = 0; |
395 | unsigned long freed = 0; | 450 | unsigned long freed = 0; |
396 | int constraint; | 451 | enum oom_constraint constraint; |
397 | 452 | ||
398 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | 453 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); |
399 | if (freed > 0) | 454 | if (freed > 0) |
400 | /* Got some memory back in the last second. */ | 455 | /* Got some memory back in the last second. */ |
401 | return; | 456 | return; |
402 | 457 | ||
403 | if (printk_ratelimit()) { | ||
404 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
405 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
406 | current->comm, gfp_mask, order, current->oomkilladj); | ||
407 | dump_stack(); | ||
408 | show_mem(); | ||
409 | } | ||
410 | |||
411 | if (sysctl_panic_on_oom == 2) | 458 | if (sysctl_panic_on_oom == 2) |
412 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); | 459 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); |
413 | 460 | ||
@@ -416,23 +463,24 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
416 | * NUMA) that may require different handling. | 463 | * NUMA) that may require different handling. |
417 | */ | 464 | */ |
418 | constraint = constrained_alloc(zonelist, gfp_mask); | 465 | constraint = constrained_alloc(zonelist, gfp_mask); |
419 | cpuset_lock(); | ||
420 | read_lock(&tasklist_lock); | 466 | read_lock(&tasklist_lock); |
421 | 467 | ||
422 | switch (constraint) { | 468 | switch (constraint) { |
423 | case CONSTRAINT_MEMORY_POLICY: | 469 | case CONSTRAINT_MEMORY_POLICY: |
424 | oom_kill_process(current, points, | 470 | oom_kill_process(current, gfp_mask, order, points, |
425 | "No available memory (MPOL_BIND)"); | 471 | "No available memory (MPOL_BIND)"); |
426 | break; | 472 | break; |
427 | 473 | ||
428 | case CONSTRAINT_CPUSET: | ||
429 | oom_kill_process(current, points, | ||
430 | "No available memory in cpuset"); | ||
431 | break; | ||
432 | |||
433 | case CONSTRAINT_NONE: | 474 | case CONSTRAINT_NONE: |
434 | if (sysctl_panic_on_oom) | 475 | if (sysctl_panic_on_oom) |
435 | panic("out of memory. panic_on_oom is selected\n"); | 476 | panic("out of memory. panic_on_oom is selected\n"); |
477 | /* Fall-through */ | ||
478 | case CONSTRAINT_CPUSET: | ||
479 | if (sysctl_oom_kill_allocating_task) { | ||
480 | oom_kill_process(current, gfp_mask, order, points, | ||
481 | "Out of memory (oom_kill_allocating_task)"); | ||
482 | break; | ||
483 | } | ||
436 | retry: | 484 | retry: |
437 | /* | 485 | /* |
438 | * Rambo mode: Shoot down a process and hope it solves whatever | 486 | * Rambo mode: Shoot down a process and hope it solves whatever |
@@ -446,11 +494,11 @@ retry: | |||
446 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 494 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
447 | if (!p) { | 495 | if (!p) { |
448 | read_unlock(&tasklist_lock); | 496 | read_unlock(&tasklist_lock); |
449 | cpuset_unlock(); | ||
450 | panic("Out of memory and no killable processes...\n"); | 497 | panic("Out of memory and no killable processes...\n"); |
451 | } | 498 | } |
452 | 499 | ||
453 | if (oom_kill_process(p, points, "Out of memory")) | 500 | if (oom_kill_process(p, gfp_mask, order, points,
501 | "Out of memory")) | ||
454 | goto retry; | 502 | goto retry; |
455 | 503 | ||
456 | break; | 504 | break; |
@@ -458,7 +506,6 @@ retry: | |||
458 | 506 | ||
459 | out: | 507 | out: |
460 | read_unlock(&tasklist_lock); | 508 | read_unlock(&tasklist_lock); |
461 | cpuset_unlock(); | ||
462 | 509 | ||
463 | /* | 510 | /* |
464 | * Give "p" a good chance of killing itself before we | 511 | * Give "p" a good chance of killing itself before we |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d821321326e3..7845462064f4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@ | |||
2 | * mm/page-writeback.c | 2 | * mm/page-writeback.c |
3 | * | 3 | * |
4 | * Copyright (C) 2002, Linus Torvalds. | 4 | * Copyright (C) 2002, Linus Torvalds. |
5 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
5 | * | 6 | * |
6 | * Contains functions related to writing back dirty pages at the | 7 | * Contains functions related to writing back dirty pages at the |
7 | * address_space level. | 8 | * address_space level. |
@@ -36,7 +37,7 @@ | |||
36 | 37 | ||
37 | /* | 38 | /* |
38 | * The maximum number of pages to writeout in a single bdflush/kupdate | 39 | * The maximum number of pages to writeout in a single bdflush/kupdate |
39 | * operation. We do this so we don't hold I_LOCK against an inode for | 40 | * operation. We do this so we don't hold I_SYNC against an inode for |
40 | * enormous amounts of time, which would block a userspace task which has | 41 | * enormous amounts of time, which would block a userspace task which has |
41 | * been forced to throttle against that inode. Also, the code reevaluates | 42 | * been forced to throttle against that inode. Also, the code reevaluates |
42 | * the dirty each time it has written this many pages. | 43 | * the dirty each time it has written this many pages. |
@@ -49,8 +50,6 @@ | |||
49 | */ | 50 | */ |
50 | static long ratelimit_pages = 32; | 51 | static long ratelimit_pages = 32; |
51 | 52 | ||
52 | static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */ | ||
53 | |||
54 | /* | 53 | /* |
55 | * When balance_dirty_pages decides that the caller needs to perform some | 54 | * When balance_dirty_pages decides that the caller needs to perform some |
56 | * non-background writeback, this is how many pages it will attempt to write. | 55 | * non-background writeback, this is how many pages it will attempt to write. |
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode); | |||
103 | static void background_writeout(unsigned long _min_pages); | 102 | static void background_writeout(unsigned long _min_pages); |
104 | 103 | ||
105 | /* | 104 | /* |
105 | * Scale the writeback cache size proportional to the relative writeout speeds. | ||
106 | * | ||
107 | * We do this by keeping a floating proportion between BDIs, based on page | ||
108 | * writeback completions [end_page_writeback()]. Those devices that write out | ||
109 | * pages fastest will get the larger share, while the slower will get a smaller | ||
110 | * share. | ||
111 | * | ||
112 | * We use page writeout completions because we are interested in getting rid of | ||
113 | * dirty pages. Having them written out is the primary goal. | ||
114 | * | ||
115 | * We introduce a concept of time, a period over which we measure these events, | ||
116 | * because demand can/will vary over time. The length of this period itself is | ||
117 | * measured in page writeback completions. | ||
118 | * | ||
119 | */ | ||
120 | static struct prop_descriptor vm_completions; | ||
121 | static struct prop_descriptor vm_dirties; | ||
122 | |||
123 | static unsigned long determine_dirtyable_memory(void); | ||
124 | |||
125 | /* | ||
126 | * couple the period to the dirty_ratio: | ||
127 | * | ||
128 | * period/2 ~ roundup_pow_of_two(dirty limit) | ||
129 | */ | ||
130 | static int calc_period_shift(void) | ||
131 | { | ||
132 | unsigned long dirty_total; | ||
133 | |||
134 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; | ||
135 | return 2 + ilog2(dirty_total - 1); | ||
136 | } | ||
137 | |||
138 | /* | ||
139 | * update the period when the dirty ratio changes. | ||
140 | */ | ||
141 | int dirty_ratio_handler(struct ctl_table *table, int write, | ||
142 | struct file *filp, void __user *buffer, size_t *lenp, | ||
143 | loff_t *ppos) | ||
144 | { | ||
145 | int old_ratio = vm_dirty_ratio; | ||
146 | int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
147 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | ||
148 | int shift = calc_period_shift(); | ||
149 | prop_change_shift(&vm_completions, shift); | ||
150 | prop_change_shift(&vm_dirties, shift); | ||
151 | } | ||
152 | return ret; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * Increment the BDI's writeout completion count and the global writeout | ||
157 | * completion count. Called from test_clear_page_writeback(). | ||
158 | */ | ||
159 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | ||
160 | { | ||
161 | __prop_inc_percpu(&vm_completions, &bdi->completions); | ||
162 | } | ||
163 | |||
164 | static inline void task_dirty_inc(struct task_struct *tsk) | ||
165 | { | ||
166 | prop_inc_single(&vm_dirties, &tsk->dirties); | ||
167 | } | ||
168 | |||
169 | /* | ||
170 | * Obtain an accurate fraction of the BDI's portion. | ||
171 | */ | ||
172 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | ||
173 | long *numerator, long *denominator) | ||
174 | { | ||
175 | if (bdi_cap_writeback_dirty(bdi)) { | ||
176 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
177 | numerator, denominator); | ||
178 | } else { | ||
179 | *numerator = 0; | ||
180 | *denominator = 1; | ||
181 | } | ||
182 | } | ||
183 | |||
184 | /* | ||
185 | * Clip the earned share of dirty pages to that which is actually available. | ||
186 | * This avoids exceeding the total dirty_limit when the floating averages | ||
187 | * fluctuate too quickly. | ||
188 | */ | ||
189 | static void | ||
190 | clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) | ||
191 | { | ||
192 | long avail_dirty; | ||
193 | |||
194 | avail_dirty = dirty - | ||
195 | (global_page_state(NR_FILE_DIRTY) + | ||
196 | global_page_state(NR_WRITEBACK) + | ||
197 | global_page_state(NR_UNSTABLE_NFS)); | ||
198 | |||
199 | if (avail_dirty < 0) | ||
200 | avail_dirty = 0; | ||
201 | |||
202 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | ||
203 | bdi_stat(bdi, BDI_WRITEBACK); | ||
204 | |||
205 | *pbdi_dirty = min(*pbdi_dirty, avail_dirty); | ||
206 | } | ||
207 | |||
208 | static inline void task_dirties_fraction(struct task_struct *tsk, | ||
209 | long *numerator, long *denominator) | ||
210 | { | ||
211 | prop_fraction_single(&vm_dirties, &tsk->dirties, | ||
212 | numerator, denominator); | ||
213 | } | ||
214 | |||
215 | /* | ||
216 | * scale the dirty limit | ||
217 | * | ||
218 | * task specific dirty limit: | ||
219 | * | ||
220 | * dirty -= (dirty/8) * p_{t} | ||
221 | */ | ||
222 | void task_dirty_limit(struct task_struct *tsk, long *pdirty) | ||
223 | { | ||
224 | long numerator, denominator; | ||
225 | long dirty = *pdirty; | ||
226 | u64 inv = dirty >> 3; | ||
227 | |||
228 | task_dirties_fraction(tsk, &numerator, &denominator); | ||
229 | inv *= numerator; | ||
230 | do_div(inv, denominator); | ||
231 | |||
232 | dirty -= inv; | ||
233 | if (dirty < *pdirty/2) | ||
234 | dirty = *pdirty/2; | ||
235 | |||
236 | *pdirty = dirty; | ||
237 | } | ||
238 | |||
239 | /* | ||
106 | * Work out the current dirty-memory clamping and background writeout | 240 | * Work out the current dirty-memory clamping and background writeout |
107 | * thresholds. | 241 | * thresholds. |
108 | * | 242 | * |
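The hunk above introduces the floating-proportion machinery: calc_period_shift() couples the measurement period to the dirty limit, and task_dirty_limit() lowers a heavy dirtier's threshold by up to one eighth of the limit, never below half of it. A standalone sketch of that arithmetic follows; the figures are made up, and the kernel works on page counts and per-CPU floating proportions rather than the plain fractions used here.

#include <stdio.h>

static int ilog2(unsigned long v)	/* position of the highest set bit */
{
	int r = -1;
	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

/* period/2 ~ roundup_pow_of_two(dirty limit), as in calc_period_shift() */
static int calc_period_shift(unsigned long dirtyable_pages, int vm_dirty_ratio)
{
	unsigned long dirty_total = dirtyable_pages * vm_dirty_ratio / 100;
	return 2 + ilog2(dirty_total - 1);
}

/* dirty -= (dirty/8) * p_task, clipped to at least half, as in task_dirty_limit() */
static long task_dirty_limit(long dirty, long numerator, long denominator)
{
	long inv = (dirty >> 3) * numerator / denominator;
	long task_dirty = dirty - inv;

	if (task_dirty < dirty / 2)
		task_dirty = dirty / 2;
	return task_dirty;
}

int main(void)
{
	unsigned long dirtyable = 1UL << 20;	/* ~4GB of dirtyable memory, in pages */

	printf("period shift: %d\n", calc_period_shift(dirtyable, 10));
	/* a task responsible for 3/4 of recent dirtying gets a lower limit */
	printf("task limit: %ld of %d pages\n",
	       task_dirty_limit(100000, 3, 4), 100000);
	return 0;
}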
@@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void) | |||
158 | } | 292 | } |
159 | 293 | ||
160 | static void | 294 | static void |
161 | get_dirty_limits(long *pbackground, long *pdirty, | 295 | get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, |
162 | struct address_space *mapping) | 296 | struct backing_dev_info *bdi) |
163 | { | 297 | { |
164 | int background_ratio; /* Percentages */ | 298 | int background_ratio; /* Percentages */ |
165 | int dirty_ratio; | 299 | int dirty_ratio; |
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
193 | } | 327 | } |
194 | *pbackground = background; | 328 | *pbackground = background; |
195 | *pdirty = dirty; | 329 | *pdirty = dirty; |
330 | |||
331 | if (bdi) { | ||
332 | u64 bdi_dirty = dirty; | ||
333 | long numerator, denominator; | ||
334 | |||
335 | /* | ||
336 | * Calculate this BDI's share of the dirty ratio. | ||
337 | */ | ||
338 | bdi_writeout_fraction(bdi, &numerator, &denominator); | ||
339 | |||
340 | bdi_dirty *= numerator; | ||
341 | do_div(bdi_dirty, denominator); | ||
342 | |||
343 | *pbdi_dirty = bdi_dirty; | ||
344 | clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); | ||
345 | task_dirty_limit(current, pbdi_dirty); | ||
346 | } | ||
196 | } | 347 | } |
197 | 348 | ||
198 | /* | 349 | /* |
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty, | |||
204 | */ | 355 | */ |
205 | static void balance_dirty_pages(struct address_space *mapping) | 356 | static void balance_dirty_pages(struct address_space *mapping) |
206 | { | 357 | { |
207 | long nr_reclaimable; | 358 | long bdi_nr_reclaimable; |
359 | long bdi_nr_writeback; | ||
208 | long background_thresh; | 360 | long background_thresh; |
209 | long dirty_thresh; | 361 | long dirty_thresh; |
362 | long bdi_thresh; | ||
210 | unsigned long pages_written = 0; | 363 | unsigned long pages_written = 0; |
211 | unsigned long write_chunk = sync_writeback_pages(); | 364 | unsigned long write_chunk = sync_writeback_pages(); |
212 | 365 | ||
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
221 | .range_cyclic = 1, | 374 | .range_cyclic = 1, |
222 | }; | 375 | }; |
223 | 376 | ||
224 | get_dirty_limits(&background_thresh, &dirty_thresh, mapping); | 377 | get_dirty_limits(&background_thresh, &dirty_thresh, |
225 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 378 | &bdi_thresh, bdi); |
226 | global_page_state(NR_UNSTABLE_NFS); | 379 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
227 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= | 380 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); |
228 | dirty_thresh) | 381 | if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) |
229 | break; | 382 | break; |
230 | 383 | ||
231 | if (!dirty_exceeded) | 384 | if (!bdi->dirty_exceeded) |
232 | dirty_exceeded = 1; | 385 | bdi->dirty_exceeded = 1; |
233 | 386 | ||
234 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 387 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
235 | * Unstable writes are a feature of certain networked | 388 | * Unstable writes are a feature of certain networked |
@@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
237 | * written to the server's write cache, but has not yet | 390 | * written to the server's write cache, but has not yet |
238 | * been flushed to permanent storage. | 391 | * been flushed to permanent storage. |
239 | */ | 392 | */ |
240 | if (nr_reclaimable) { | 393 | if (bdi_nr_reclaimable) { |
241 | writeback_inodes(&wbc); | 394 | writeback_inodes(&wbc); |
242 | get_dirty_limits(&background_thresh, | ||
243 | &dirty_thresh, mapping); | ||
244 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | ||
245 | global_page_state(NR_UNSTABLE_NFS); | ||
246 | if (nr_reclaimable + | ||
247 | global_page_state(NR_WRITEBACK) | ||
248 | <= dirty_thresh) | ||
249 | break; | ||
250 | pages_written += write_chunk - wbc.nr_to_write; | 395 | pages_written += write_chunk - wbc.nr_to_write; |
251 | if (pages_written >= write_chunk) | 396 | get_dirty_limits(&background_thresh, &dirty_thresh, |
252 | break; /* We've done our duty */ | 397 | &bdi_thresh, bdi); |
398 | } | ||
399 | |||
400 | /* | ||
401 | * In order to avoid the stacked BDI deadlock we need | ||
402 | * to ensure we accurately count the 'dirty' pages when | ||
403 | * the threshold is low. | ||
404 | * | ||
405 | * Otherwise it would be possible to get thresh+n pages | ||
406 | * reported dirty, even though there are thresh-m pages | ||
407 | * actually dirty; with m+n sitting in the percpu | ||
408 | * deltas. | ||
409 | */ | ||
410 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | ||
411 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | ||
412 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
413 | } else if (bdi_nr_reclaimable) { | ||
414 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | ||
415 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | ||
253 | } | 416 | } |
417 | |||
418 | if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) | ||
419 | break; | ||
420 | if (pages_written >= write_chunk) | ||
421 | break; /* We've done our duty */ | ||
422 | |||
254 | congestion_wait(WRITE, HZ/10); | 423 | congestion_wait(WRITE, HZ/10); |
255 | } | 424 | } |
256 | 425 | ||
257 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) | 426 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && |
258 | <= dirty_thresh && dirty_exceeded) | 427 | bdi->dirty_exceeded) |
259 | dirty_exceeded = 0; | 428 | bdi->dirty_exceeded = 0; |
260 | 429 | ||
261 | if (writeback_in_progress(bdi)) | 430 | if (writeback_in_progress(bdi)) |
262 | return; /* pdflush is already working this queue */ | 431 | return; /* pdflush is already working this queue */ |
@@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
270 | * background_thresh, to keep the amount of dirty memory low. | 439 | * background_thresh, to keep the amount of dirty memory low. |
271 | */ | 440 | */ |
272 | if ((laptop_mode && pages_written) || | 441 | if ((laptop_mode && pages_written) || |
273 | (!laptop_mode && (nr_reclaimable > background_thresh))) | 442 | (!laptop_mode && (global_page_state(NR_FILE_DIRTY) |
443 | + global_page_state(NR_UNSTABLE_NFS) | ||
444 | > background_thresh))) | ||
274 | pdflush_operation(background_writeout, 0); | 445 | pdflush_operation(background_writeout, 0); |
275 | } | 446 | } |
276 | 447 | ||
@@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
306 | unsigned long *p; | 477 | unsigned long *p; |
307 | 478 | ||
308 | ratelimit = ratelimit_pages; | 479 | ratelimit = ratelimit_pages; |
309 | if (dirty_exceeded) | 480 | if (mapping->backing_dev_info->dirty_exceeded) |
310 | ratelimit = 8; | 481 | ratelimit = 8; |
311 | 482 | ||
312 | /* | 483 | /* |
@@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
331 | long background_thresh; | 502 | long background_thresh; |
332 | long dirty_thresh; | 503 | long dirty_thresh; |
333 | 504 | ||
334 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) { | ||
335 | /* | ||
336 | * The caller might hold locks which can prevent IO completion | ||
337 | * or progress in the filesystem. So we cannot just sit here | ||
338 | * waiting for IO to complete. | ||
339 | */ | ||
340 | congestion_wait(WRITE, HZ/10); | ||
341 | return; | ||
342 | } | ||
343 | |||
344 | for ( ; ; ) { | 505 | for ( ; ; ) { |
345 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); | 506 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
346 | 507 | ||
347 | /* | 508 | /* |
348 | * Boost the allowable dirty threshold a bit for page | 509 | * Boost the allowable dirty threshold a bit for page |
@@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
354 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 515 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
355 | break; | 516 | break; |
356 | congestion_wait(WRITE, HZ/10); | 517 | congestion_wait(WRITE, HZ/10); |
518 | |||
519 | /* | ||
520 | * The caller might hold locks which can prevent IO completion | ||
521 | * or progress in the filesystem. So we cannot just sit here | ||
522 | * waiting for IO to complete. | ||
523 | */ | ||
524 | if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) | ||
525 | break; | ||
357 | } | 526 | } |
358 | } | 527 | } |
359 | 528 | ||
@@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages) | |||
377 | long background_thresh; | 546 | long background_thresh; |
378 | long dirty_thresh; | 547 | long dirty_thresh; |
379 | 548 | ||
380 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); | 549 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); |
381 | if (global_page_state(NR_FILE_DIRTY) + | 550 | if (global_page_state(NR_FILE_DIRTY) + |
382 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | 551 | global_page_state(NR_UNSTABLE_NFS) < background_thresh |
383 | && min_pages <= 0) | 552 | && min_pages <= 0) |
384 | break; | 553 | break; |
554 | wbc.more_io = 0; | ||
385 | wbc.encountered_congestion = 0; | 555 | wbc.encountered_congestion = 0; |
386 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 556 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
387 | wbc.pages_skipped = 0; | 557 | wbc.pages_skipped = 0; |
@@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages) | |||
389 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | 559 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; |
390 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | 560 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { |
391 | /* Wrote less than expected */ | 561 | /* Wrote less than expected */ |
392 | congestion_wait(WRITE, HZ/10); | 562 | if (wbc.encountered_congestion || wbc.more_io) |
393 | if (!wbc.encountered_congestion) | 563 | congestion_wait(WRITE, HZ/10); |
564 | else | ||
394 | break; | 565 | break; |
395 | } | 566 | } |
396 | } | 567 | } |
@@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg) | |||
455 | global_page_state(NR_UNSTABLE_NFS) + | 626 | global_page_state(NR_UNSTABLE_NFS) + |
456 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 627 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
457 | while (nr_to_write > 0) { | 628 | while (nr_to_write > 0) { |
629 | wbc.more_io = 0; | ||
458 | wbc.encountered_congestion = 0; | 630 | wbc.encountered_congestion = 0; |
459 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | 631 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; |
460 | writeback_inodes(&wbc); | 632 | writeback_inodes(&wbc); |
461 | if (wbc.nr_to_write > 0) { | 633 | if (wbc.nr_to_write > 0) { |
462 | if (wbc.encountered_congestion) | 634 | if (wbc.encountered_congestion || wbc.more_io) |
463 | congestion_wait(WRITE, HZ/10); | 635 | congestion_wait(WRITE, HZ/10); |
464 | else | 636 | else |
465 | break; /* All the old data is written */ | 637 | break; /* All the old data is written */ |
@@ -580,9 +752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { | |||
580 | */ | 752 | */ |
581 | void __init page_writeback_init(void) | 753 | void __init page_writeback_init(void) |
582 | { | 754 | { |
755 | int shift; | ||
756 | |||
583 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); | 757 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); |
584 | writeback_set_ratelimit(); | 758 | writeback_set_ratelimit(); |
585 | register_cpu_notifier(&ratelimit_nb); | 759 | register_cpu_notifier(&ratelimit_nb); |
760 | |||
761 | shift = calc_period_shift(); | ||
762 | prop_descriptor_init(&vm_completions, shift); | ||
763 | prop_descriptor_init(&vm_dirties, shift); | ||
586 | } | 764 | } |
587 | 765 | ||
588 | /** | 766 | /** |
@@ -672,8 +850,10 @@ retry: | |||
672 | 850 | ||
673 | ret = (*writepage)(page, wbc, data); | 851 | ret = (*writepage)(page, wbc, data); |
674 | 852 | ||
675 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) | 853 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { |
676 | unlock_page(page); | 854 | unlock_page(page); |
855 | ret = 0; | ||
856 | } | ||
677 | if (ret || (--(wbc->nr_to_write) <= 0)) | 857 | if (ret || (--(wbc->nr_to_write) <= 0)) |
678 | done = 1; | 858 | done = 1; |
679 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 859 | if (wbc->nonblocking && bdi_write_congested(bdi)) { |
@@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
827 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); | 1007 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); |
828 | if (mapping_cap_account_dirty(mapping)) { | 1008 | if (mapping_cap_account_dirty(mapping)) { |
829 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 1009 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
1010 | __inc_bdi_stat(mapping->backing_dev_info, | ||
1011 | BDI_RECLAIMABLE); | ||
830 | task_io_account_write(PAGE_CACHE_SIZE); | 1012 | task_io_account_write(PAGE_CACHE_SIZE); |
831 | } | 1013 | } |
832 | radix_tree_tag_set(&mapping->page_tree, | 1014 | radix_tree_tag_set(&mapping->page_tree, |
@@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); | |||
859 | * If the mapping doesn't provide a set_page_dirty a_op, then | 1041 | * If the mapping doesn't provide a set_page_dirty a_op, then |
860 | * just fall through and assume that it wants buffer_heads. | 1042 | * just fall through and assume that it wants buffer_heads. |
861 | */ | 1043 | */ |
862 | int fastcall set_page_dirty(struct page *page) | 1044 | static int __set_page_dirty(struct page *page) |
863 | { | 1045 | { |
864 | struct address_space *mapping = page_mapping(page); | 1046 | struct address_space *mapping = page_mapping(page); |
865 | 1047 | ||
@@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page) | |||
877 | } | 1059 | } |
878 | return 0; | 1060 | return 0; |
879 | } | 1061 | } |
1062 | |||
1063 | int fastcall set_page_dirty(struct page *page) | ||
1064 | { | ||
1065 | int ret = __set_page_dirty(page); | ||
1066 | if (ret) | ||
1067 | task_dirty_inc(current); | ||
1068 | return ret; | ||
1069 | } | ||
880 | EXPORT_SYMBOL(set_page_dirty); | 1070 | EXPORT_SYMBOL(set_page_dirty); |
881 | 1071 | ||
882 | /* | 1072 | /* |
@@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page) | |||
961 | */ | 1151 | */ |
962 | if (TestClearPageDirty(page)) { | 1152 | if (TestClearPageDirty(page)) { |
963 | dec_zone_page_state(page, NR_FILE_DIRTY); | 1153 | dec_zone_page_state(page, NR_FILE_DIRTY); |
1154 | dec_bdi_stat(mapping->backing_dev_info, | ||
1155 | BDI_RECLAIMABLE); | ||
964 | return 1; | 1156 | return 1; |
965 | } | 1157 | } |
966 | return 0; | 1158 | return 0; |
@@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page) | |||
975 | int ret; | 1167 | int ret; |
976 | 1168 | ||
977 | if (mapping) { | 1169 | if (mapping) { |
1170 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
978 | unsigned long flags; | 1171 | unsigned long flags; |
979 | 1172 | ||
980 | write_lock_irqsave(&mapping->tree_lock, flags); | 1173 | write_lock_irqsave(&mapping->tree_lock, flags); |
981 | ret = TestClearPageWriteback(page); | 1174 | ret = TestClearPageWriteback(page); |
982 | if (ret) | 1175 | if (ret) { |
983 | radix_tree_tag_clear(&mapping->page_tree, | 1176 | radix_tree_tag_clear(&mapping->page_tree, |
984 | page_index(page), | 1177 | page_index(page), |
985 | PAGECACHE_TAG_WRITEBACK); | 1178 | PAGECACHE_TAG_WRITEBACK); |
1179 | if (bdi_cap_writeback_dirty(bdi)) { | ||
1180 | __dec_bdi_stat(bdi, BDI_WRITEBACK); | ||
1181 | __bdi_writeout_inc(bdi); | ||
1182 | } | ||
1183 | } | ||
986 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1184 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
987 | } else { | 1185 | } else { |
988 | ret = TestClearPageWriteback(page); | 1186 | ret = TestClearPageWriteback(page); |
@@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page) | |||
998 | int ret; | 1196 | int ret; |
999 | 1197 | ||
1000 | if (mapping) { | 1198 | if (mapping) { |
1199 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
1001 | unsigned long flags; | 1200 | unsigned long flags; |
1002 | 1201 | ||
1003 | write_lock_irqsave(&mapping->tree_lock, flags); | 1202 | write_lock_irqsave(&mapping->tree_lock, flags); |
1004 | ret = TestSetPageWriteback(page); | 1203 | ret = TestSetPageWriteback(page); |
1005 | if (!ret) | 1204 | if (!ret) { |
1006 | radix_tree_tag_set(&mapping->page_tree, | 1205 | radix_tree_tag_set(&mapping->page_tree, |
1007 | page_index(page), | 1206 | page_index(page), |
1008 | PAGECACHE_TAG_WRITEBACK); | 1207 | PAGECACHE_TAG_WRITEBACK); |
1208 | if (bdi_cap_writeback_dirty(bdi)) | ||
1209 | __inc_bdi_stat(bdi, BDI_WRITEBACK); | ||
1210 | } | ||
1009 | if (!PageDirty(page)) | 1211 | if (!PageDirty(page)) |
1010 | radix_tree_tag_clear(&mapping->page_tree, | 1212 | radix_tree_tag_clear(&mapping->page_tree, |
1011 | page_index(page), | 1213 | page_index(page), |
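Taken together, the page-writeback.c changes give each backing device its own dirty threshold: get_dirty_limits() scales the global limit by the device's share of recent writeout completions, and clip_bdi_dirty_limit() caps the result at what is still globally available. A simplified sketch of that calculation, using made-up page counts and plain fractions in place of the kernel's per-CPU proportion counters:

#include <stdio.h>

static long min_long(long a, long b) { return a < b ? a : b; }

static long bdi_dirty_limit(long global_dirty_thresh,
			    long completions_num, long completions_den,
			    long global_dirty_pages, long bdi_dirty_pages)
{
	/* this device's share of the global limit (bdi_writeout_fraction) */
	long bdi_thresh = global_dirty_thresh * completions_num / completions_den;

	/* clip_bdi_dirty_limit(): global headroom plus what this device already holds */
	long avail = global_dirty_thresh - global_dirty_pages;
	if (avail < 0)
		avail = 0;
	avail += bdi_dirty_pages;

	return min_long(bdi_thresh, avail);
}

int main(void)
{
	/* a device doing 1/5 of recent writeout, global limit of 120000 pages */
	printf("bdi threshold: %ld pages\n",
	       bdi_dirty_limit(120000, 1, 5, 100000, 8000));
	return 0;
}

balance_dirty_pages() then throttles against this per-device figure (plus the task correction above) instead of the single global threshold, which is what removes the old shared dirty_exceeded flag.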
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d315e1127dc9..43f757fcf30f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/pagevec.h> | 27 | #include <linux/pagevec.h> |
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/oom.h> | ||
30 | #include <linux/notifier.h> | 31 | #include <linux/notifier.h> |
31 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
32 | #include <linux/sysctl.h> | 33 | #include <linux/sysctl.h> |
@@ -489,7 +490,7 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
489 | struct list_head *list, int order) | 490 | struct list_head *list, int order) |
490 | { | 491 | { |
491 | spin_lock(&zone->lock); | 492 | spin_lock(&zone->lock); |
492 | zone->all_unreclaimable = 0; | 493 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
493 | zone->pages_scanned = 0; | 494 | zone->pages_scanned = 0; |
494 | while (count--) { | 495 | while (count--) { |
495 | struct page *page; | 496 | struct page *page; |
@@ -506,7 +507,7 @@ static void free_pages_bulk(struct zone *zone, int count, | |||
506 | static void free_one_page(struct zone *zone, struct page *page, int order) | 507 | static void free_one_page(struct zone *zone, struct page *page, int order) |
507 | { | 508 | { |
508 | spin_lock(&zone->lock); | 509 | spin_lock(&zone->lock); |
509 | zone->all_unreclaimable = 0; | 510 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
510 | zone->pages_scanned = 0; | 511 | zone->pages_scanned = 0; |
511 | __free_one_page(page, zone, order); | 512 | __free_one_page(page, zone, order); |
512 | spin_unlock(&zone->lock); | 513 | spin_unlock(&zone->lock); |
@@ -1586,6 +1587,11 @@ nofail_alloc: | |||
1586 | if (page) | 1587 | if (page) |
1587 | goto got_pg; | 1588 | goto got_pg; |
1588 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 1589 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1590 | if (!try_set_zone_oom(zonelist)) { | ||
1591 | schedule_timeout_uninterruptible(1); | ||
1592 | goto restart; | ||
1593 | } | ||
1594 | |||
1589 | /* | 1595 | /* |
1590 | * Go through the zonelist yet one more time, keep | 1596 | * Go through the zonelist yet one more time, keep |
1591 | * very high watermark here, this is only to catch | 1597 | * very high watermark here, this is only to catch |
@@ -1594,14 +1600,19 @@ nofail_alloc: | |||
1594 | */ | 1600 | */ |
1595 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1601 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
1596 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); | 1602 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); |
1597 | if (page) | 1603 | if (page) { |
1604 | clear_zonelist_oom(zonelist); | ||
1598 | goto got_pg; | 1605 | goto got_pg; |
1606 | } | ||
1599 | 1607 | ||
1600 | /* The OOM killer will not help higher order allocs so fail */ | 1608 | /* The OOM killer will not help higher order allocs so fail */ |
1601 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 1609 | if (order > PAGE_ALLOC_COSTLY_ORDER) { |
1610 | clear_zonelist_oom(zonelist); | ||
1602 | goto nopage; | 1611 | goto nopage; |
1612 | } | ||
1603 | 1613 | ||
1604 | out_of_memory(zonelist, gfp_mask, order); | 1614 | out_of_memory(zonelist, gfp_mask, order); |
1615 | clear_zonelist_oom(zonelist); | ||
1605 | goto restart; | 1616 | goto restart; |
1606 | } | 1617 | } |
1607 | 1618 | ||
@@ -1850,7 +1861,7 @@ void show_free_areas(void) | |||
1850 | K(zone_page_state(zone, NR_INACTIVE)), | 1861 | K(zone_page_state(zone, NR_INACTIVE)), |
1851 | K(zone->present_pages), | 1862 | K(zone->present_pages), |
1852 | zone->pages_scanned, | 1863 | zone->pages_scanned, |
1853 | (zone->all_unreclaimable ? "yes" : "no") | 1864 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
1854 | ); | 1865 | ); |
1855 | printk("lowmem_reserve[]:"); | 1866 | printk("lowmem_reserve[]:"); |
1856 | for (i = 0; i < MAX_NR_ZONES; i++) | 1867 | for (i = 0; i < MAX_NR_ZONES; i++) |
@@ -3371,7 +3382,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
3371 | zone->nr_scan_active = 0; | 3382 | zone->nr_scan_active = 0; |
3372 | zone->nr_scan_inactive = 0; | 3383 | zone->nr_scan_inactive = 0; |
3373 | zap_zone_vm_stats(zone); | 3384 | zap_zone_vm_stats(zone); |
3374 | atomic_set(&zone->reclaim_in_progress, 0); | 3385 | zone->flags = 0; |
3375 | if (!size) | 3386 | if (!size) |
3376 | continue; | 3387 | continue; |
3377 | 3388 | ||
diff --git a/mm/readahead.c b/mm/readahead.c
index 229788884010..c9c50ca1ec38 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -233,6 +233,12 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); | 233 | + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); |
234 | } | 234 | } |
235 | 235 | ||
236 | static int __init readahead_init(void) | ||
237 | { | ||
238 | return bdi_init(&default_backing_dev_info); | ||
239 | } | ||
240 | subsys_initcall(readahead_init); | ||
241 | |||
236 | /* | 242 | /* |
237 | * Submit IO for the read-ahead request in file_ra_state. | 243 | * Submit IO for the read-ahead request in file_ra_state. |
238 | */ | 244 | */ |
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,7 @@
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 38 | * within inode_lock in __sync_single_inode) |
39 | * zone->lock (within radix tree node alloc) | ||
39 | */ | 40 | */ |
40 | 41 | ||
41 | #include <linux/mm.h> | 42 | #include <linux/mm.h> |
@@ -137,8 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
137 | anon_vma_free(anon_vma); | 138 | anon_vma_free(anon_vma); |
138 | } | 139 | } |
139 | 140 | ||
140 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, | 141 | static void anon_vma_ctor(struct kmem_cache *cachep, void *data) |
141 | unsigned long flags) | ||
142 | { | 142 | { |
143 | struct anon_vma *anon_vma = data; | 143 | struct anon_vma *anon_vma = data; |
144 | 144 | ||
diff --git a/mm/shmem.c b/mm/shmem.c
index 8a82342a8595..289dbb0a6fd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2328,8 +2328,7 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2328 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2328 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2329 | } | 2329 | } |
2330 | 2330 | ||
2331 | static void init_once(void *foo, struct kmem_cache *cachep, | 2331 | static void init_once(struct kmem_cache *cachep, void *foo) |
2332 | unsigned long flags) | ||
2333 | { | 2332 | { |
2334 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2333 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2335 | 2334 | ||
@@ -2344,9 +2343,7 @@ static int init_inodecache(void) | |||
2344 | { | 2343 | { |
2345 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | 2344 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", |
2346 | sizeof(struct shmem_inode_info), | 2345 | sizeof(struct shmem_inode_info), |
2347 | 0, 0, init_once); | 2346 | 0, SLAB_PANIC, init_once); |
2348 | if (shmem_inode_cachep == NULL) | ||
2349 | return -ENOMEM; | ||
2350 | return 0; | 2347 | return 0; |
2351 | } | 2348 | } |
2352 | 2349 | ||
@@ -2464,6 +2461,10 @@ static int __init init_tmpfs(void) | |||
2464 | { | 2461 | { |
2465 | int error; | 2462 | int error; |
2466 | 2463 | ||
2464 | error = bdi_init(&shmem_backing_dev_info); | ||
2465 | if (error) | ||
2466 | goto out4; | ||
2467 | |||
2467 | error = init_inodecache(); | 2468 | error = init_inodecache(); |
2468 | if (error) | 2469 | if (error) |
2469 | goto out3; | 2470 | goto out3; |
@@ -2488,6 +2489,8 @@ out1: | |||
2488 | out2: | 2489 | out2: |
2489 | destroy_inodecache(); | 2490 | destroy_inodecache(); |
2490 | out3: | 2491 | out3: |
2492 | bdi_destroy(&shmem_backing_dev_info); | ||
2493 | out4: | ||
2491 | shm_mnt = ERR_PTR(error); | 2494 | shm_mnt = ERR_PTR(error); |
2492 | return error; | 2495 | return error; |
2493 | } | 2496 | } |
@@ -2540,11 +2543,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
2540 | d_instantiate(dentry, inode); | 2543 | d_instantiate(dentry, inode); |
2541 | inode->i_size = size; | 2544 | inode->i_size = size; |
2542 | inode->i_nlink = 0; /* It is unlinked */ | 2545 | inode->i_nlink = 0; /* It is unlinked */ |
2543 | file->f_path.mnt = mntget(shm_mnt); | 2546 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, |
2544 | file->f_path.dentry = dentry; | 2547 | &shmem_file_operations); |
2545 | file->f_mapping = inode->i_mapping; | ||
2546 | file->f_op = &shmem_file_operations; | ||
2547 | file->f_mode = FMODE_WRITE | FMODE_READ; | ||
2548 | return file; | 2548 | return file; |
2549 | 2549 | ||
2550 | close_file: | 2550 | close_file: |
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -267,11 +267,10 @@ struct array_cache {
267 | unsigned int batchcount; | 267 | unsigned int batchcount; |
268 | unsigned int touched; | 268 | unsigned int touched; |
269 | spinlock_t lock; | 269 | spinlock_t lock; |
270 | void *entry[0]; /* | 270 | void *entry[]; /* |
271 | * Must have this definition in here for the proper | 271 | * Must have this definition in here for the proper |
272 | * alignment of array_cache. Also simplifies accessing | 272 | * alignment of array_cache. Also simplifies accessing |
273 | * the entries. | 273 | * the entries. |
274 | * [0] is for gcc 2.95. It should really be []. | ||
275 | */ | 274 | */ |
276 | }; | 275 | }; |
277 | 276 | ||
@@ -408,7 +407,7 @@ struct kmem_cache { | |||
408 | unsigned int dflags; /* dynamic flags */ | 407 | unsigned int dflags; /* dynamic flags */ |
409 | 408 | ||
410 | /* constructor func */ | 409 | /* constructor func */ |
411 | void (*ctor) (void *, struct kmem_cache *, unsigned long); | 410 | void (*ctor)(struct kmem_cache *, void *); |
412 | 411 | ||
413 | /* 5) cache creation/removal */ | 412 | /* 5) cache creation/removal */ |
414 | const char *name; | 413 | const char *name; |
@@ -2129,7 +2128,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
2129 | struct kmem_cache * | 2128 | struct kmem_cache * |
2130 | kmem_cache_create (const char *name, size_t size, size_t align, | 2129 | kmem_cache_create (const char *name, size_t size, size_t align, |
2131 | unsigned long flags, | 2130 | unsigned long flags, |
2132 | void (*ctor)(void*, struct kmem_cache *, unsigned long)) | 2131 | void (*ctor)(struct kmem_cache *, void *)) |
2133 | { | 2132 | { |
2134 | size_t left_over, slab_size, ralign; | 2133 | size_t left_over, slab_size, ralign; |
2135 | struct kmem_cache *cachep = NULL, *pc; | 2134 | struct kmem_cache *cachep = NULL, *pc; |
@@ -2636,8 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2636 | * They must also be threaded. | 2635 | * They must also be threaded. |
2637 | */ | 2636 | */ |
2638 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2637 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2639 | cachep->ctor(objp + obj_offset(cachep), cachep, | 2638 | cachep->ctor(cachep, objp + obj_offset(cachep)); |
2640 | 0); | ||
2641 | 2639 | ||
2642 | if (cachep->flags & SLAB_RED_ZONE) { | 2640 | if (cachep->flags & SLAB_RED_ZONE) { |
2643 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2641 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
@@ -2653,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2653 | cachep->buffer_size / PAGE_SIZE, 0); | 2651 | cachep->buffer_size / PAGE_SIZE, 0); |
2654 | #else | 2652 | #else |
2655 | if (cachep->ctor) | 2653 | if (cachep->ctor) |
2656 | cachep->ctor(objp, cachep, 0); | 2654 | cachep->ctor(cachep, objp); |
2657 | #endif | 2655 | #endif |
2658 | slab_bufctl(slabp)[i] = i + 1; | 2656 | slab_bufctl(slabp)[i] = i + 1; |
2659 | } | 2657 | } |
@@ -3078,7 +3076,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3078 | #endif | 3076 | #endif |
3079 | objp += obj_offset(cachep); | 3077 | objp += obj_offset(cachep); |
3080 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3078 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3081 | cachep->ctor(objp, cachep, 0); | 3079 | cachep->ctor(cachep, objp); |
3082 | #if ARCH_SLAB_MINALIGN | 3080 | #if ARCH_SLAB_MINALIGN |
3083 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3081 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { |
3084 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3082 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
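The slab.c hunks above, together with the matching slob.c and slub.c hunks below, converge on a new constructor convention: a ctor now has the prototype void ctor(struct kmem_cache *, void *), taking the cache first and the object second, and no longer receives a flags word. A userspace toy cache, not the kernel allocator, showing a constructor written against that convention; all names here (toy_cache, toy_cache_alloc) are illustrative:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_cache {
	const char *name;
	size_t size;
	void (*ctor)(struct toy_cache *, void *);	/* new-style signature */
};

static void *toy_cache_alloc(struct toy_cache *c)
{
	void *obj = malloc(c->size);

	if (obj && c->ctor)
		c->ctor(c, obj);	/* cache first, object second */
	return obj;
}

struct anon_vma { int lock; void *head; };

static void anon_vma_ctor(struct toy_cache *c, void *data)
{
	struct anon_vma *av = data;

	memset(av, 0, sizeof(*av));
	printf("constructed a '%s' object\n", c->name);
}

int main(void)
{
	struct toy_cache cache = {
		.name = "anon_vma", .size = sizeof(struct anon_vma),
		.ctor = anon_vma_ctor,
	};
	void *obj = toy_cache_alloc(&cache);

	free(obj);
	return 0;
}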
diff --git a/mm/slob.c b/mm/slob.c
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -499,12 +499,12 @@ struct kmem_cache {
499 | unsigned int size, align; | 499 | unsigned int size, align; |
500 | unsigned long flags; | 500 | unsigned long flags; |
501 | const char *name; | 501 | const char *name; |
502 | void (*ctor)(void *, struct kmem_cache *, unsigned long); | 502 | void (*ctor)(struct kmem_cache *, void *); |
503 | }; | 503 | }; |
504 | 504 | ||
505 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 505 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
506 | size_t align, unsigned long flags, | 506 | size_t align, unsigned long flags, |
507 | void (*ctor)(void*, struct kmem_cache *, unsigned long)) | 507 | void (*ctor)(struct kmem_cache *, void *)) |
508 | { | 508 | { |
509 | struct kmem_cache *c; | 509 | struct kmem_cache *c; |
510 | 510 | ||
@@ -548,7 +548,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
548 | b = slob_new_page(flags, get_order(c->size), node); | 548 | b = slob_new_page(flags, get_order(c->size), node); |
549 | 549 | ||
550 | if (c->ctor) | 550 | if (c->ctor) |
551 | c->ctor(b, c, 0); | 551 | c->ctor(c, b); |
552 | 552 | ||
553 | return b; | 553 | return b; |
554 | } | 554 | } |
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -980,7 +980,7 @@ __setup("slub_debug", setup_slub_debug);
980 | 980 | ||
981 | static unsigned long kmem_cache_flags(unsigned long objsize, | 981 | static unsigned long kmem_cache_flags(unsigned long objsize, |
982 | unsigned long flags, const char *name, | 982 | unsigned long flags, const char *name, |
983 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 983 | void (*ctor)(struct kmem_cache *, void *)) |
984 | { | 984 | { |
985 | /* | 985 | /* |
986 | * The page->offset field is only 16 bit wide. This is an offset | 986 | * The page->offset field is only 16 bit wide. This is an offset |
@@ -1027,7 +1027,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1027 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1027 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} |
1028 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1028 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1029 | unsigned long flags, const char *name, | 1029 | unsigned long flags, const char *name, |
1030 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 1030 | void (*ctor)(struct kmem_cache *, void *)) |
1031 | { | 1031 | { |
1032 | return flags; | 1032 | return flags; |
1033 | } | 1033 | } |
@@ -1071,7 +1071,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
1071 | { | 1071 | { |
1072 | setup_object_debug(s, page, object); | 1072 | setup_object_debug(s, page, object); |
1073 | if (unlikely(s->ctor)) | 1073 | if (unlikely(s->ctor)) |
1074 | s->ctor(object, s, 0); | 1074 | s->ctor(s, object); |
1075 | } | 1075 | } |
1076 | 1076 | ||
1077 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1077 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1085,9 +1085,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1085 | 1085 | ||
1086 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 1086 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
1087 | 1087 | ||
1088 | if (flags & __GFP_WAIT) | ||
1089 | local_irq_enable(); | ||
1090 | |||
1091 | page = allocate_slab(s, | 1088 | page = allocate_slab(s, |
1092 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | 1089 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); |
1093 | if (!page) | 1090 | if (!page) |
@@ -1120,8 +1117,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1120 | page->freelist = start; | 1117 | page->freelist = start; |
1121 | page->inuse = 0; | 1118 | page->inuse = 0; |
1122 | out: | 1119 | out: |
1123 | if (flags & __GFP_WAIT) | ||
1124 | local_irq_disable(); | ||
1125 | return page; | 1120 | return page; |
1126 | } | 1121 | } |
1127 | 1122 | ||
@@ -1505,7 +1500,14 @@ new_slab: | |||
1505 | goto load_freelist; | 1500 | goto load_freelist; |
1506 | } | 1501 | } |
1507 | 1502 | ||
1503 | if (gfpflags & __GFP_WAIT) | ||
1504 | local_irq_enable(); | ||
1505 | |||
1508 | new = new_slab(s, gfpflags, node); | 1506 | new = new_slab(s, gfpflags, node); |
1507 | |||
1508 | if (gfpflags & __GFP_WAIT) | ||
1509 | local_irq_disable(); | ||
1510 | |||
1509 | if (new) { | 1511 | if (new) { |
1510 | c = get_cpu_slab(s, smp_processor_id()); | 1512 | c = get_cpu_slab(s, smp_processor_id()); |
1511 | if (c->page) { | 1513 | if (c->page) { |
@@ -2039,12 +2041,6 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, | |||
2039 | init_kmem_cache_node(n); | 2041 | init_kmem_cache_node(n); |
2040 | atomic_long_inc(&n->nr_slabs); | 2042 | atomic_long_inc(&n->nr_slabs); |
2041 | add_partial(n, page); | 2043 | add_partial(n, page); |
2042 | |||
2043 | /* | ||
2044 | * new_slab() disables interupts. If we do not reenable interrupts here | ||
2045 | * then bootup would continue with interrupts disabled. | ||
2046 | */ | ||
2047 | local_irq_enable(); | ||
2048 | return n; | 2044 | return n; |
2049 | } | 2045 | } |
2050 | 2046 | ||
@@ -2215,7 +2211,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
2215 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 2211 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
2216 | const char *name, size_t size, | 2212 | const char *name, size_t size, |
2217 | size_t align, unsigned long flags, | 2213 | size_t align, unsigned long flags, |
2218 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2214 | void (*ctor)(struct kmem_cache *, void *)) |
2219 | { | 2215 | { |
2220 | memset(s, 0, kmem_size); | 2216 | memset(s, 0, kmem_size); |
2221 | s->name = name; | 2217 | s->name = name; |
@@ -2805,7 +2801,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
2805 | 2801 | ||
2806 | static struct kmem_cache *find_mergeable(size_t size, | 2802 | static struct kmem_cache *find_mergeable(size_t size, |
2807 | size_t align, unsigned long flags, const char *name, | 2803 | size_t align, unsigned long flags, const char *name, |
2808 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2804 | void (*ctor)(struct kmem_cache *, void *)) |
2809 | { | 2805 | { |
2810 | struct kmem_cache *s; | 2806 | struct kmem_cache *s; |
2811 | 2807 | ||
@@ -2846,7 +2842,7 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
2846 | 2842 | ||
2847 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 2843 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
2848 | size_t align, unsigned long flags, | 2844 | size_t align, unsigned long flags, |
2849 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) | 2845 | void (*ctor)(struct kmem_cache *, void *)) |
2850 | { | 2846 | { |
2851 | struct kmem_cache *s; | 2847 | struct kmem_cache *s; |
2852 | 2848 | ||
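find_mergeable() and kmem_cache_create() above now carry the same two-argument ctor type. As far as I can tell, SLUB's merge logic declines to alias caches that have a constructor, which is why the ctor takes part in the merge decision at all; a hedged sketch with illustrative cache names:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/string.h>

static void buf_ctor(struct kmem_cache *s, void *obj)
{
        memset(obj, 0, 128);
}

static int __init merge_demo_init(void)
{
        struct kmem_cache *plain, *ctored;

        /* No ctor: a candidate for SLUB's cache merging. */
        plain = kmem_cache_create("plain-128", 128, 0, 0, NULL);

        /* Has a ctor: never merged, so constructed state is preserved. */
        ctored = kmem_cache_create("ctored-128", 128, 0, 0, buf_ctor);

        if (!plain || !ctored)
                return -ENOMEM;
        return 0;
}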
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
29 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/backing-dev.h> | ||
31 | 32 | ||
32 | /* How many pages do we try to swap or page in/out together? */ | 33 | /* How many pages do we try to swap or page in/out together? */ |
33 | int page_cluster; | 34 | int page_cluster; |
@@ -547,6 +548,10 @@ void __init swap_setup(void) | |||
547 | { | 548 | { |
548 | unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); | 549 | unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); |
549 | 550 | ||
551 | #ifdef CONFIG_SWAP | ||
552 | bdi_init(swapper_space.backing_dev_info); | ||
553 | #endif | ||
554 | |||
550 | /* Use a smaller cluster for small-memory machines */ | 555 | /* Use a smaller cluster for small-memory machines */ |
551 | if (megs < 16) | 556 | if (megs < 16) |
552 | page_cluster = 2; | 557 | page_cluster = 2; |
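swap_setup() now runs bdi_init() on swapper_space's backing_dev_info so the per-BDI counters behind it exist before they are used. The same init/teardown pairing applies to any embedded backing_dev_info; a sketch with an illustrative driver structure (mydev is not from this patch):

#include <linux/backing-dev.h>

struct mydev {
        struct backing_dev_info bdi;
};

static int mydev_setup(struct mydev *dev)
{
        /* Allocates the percpu BDI counters; pair with bdi_destroy(). */
        return bdi_init(&dev->bdi);
}

static void mydev_teardown(struct mydev *dev)
{
        bdi_destroy(&dev->bdi);
}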
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index 8803471593fd..d436a9c82db7 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
66 | if (!dentry) | 66 | if (!dentry) |
67 | goto put_memory; | 67 | goto put_memory; |
68 | 68 | ||
69 | error = -ENFILE; | ||
70 | file = get_empty_filp(); | ||
71 | if (!file) | ||
72 | goto put_dentry; | ||
73 | |||
74 | error = -ENOSPC; | 69 | error = -ENOSPC; |
75 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); | 70 | inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); |
76 | if (!inode) | 71 | if (!inode) |
77 | goto close_file; | 72 | goto put_dentry; |
78 | 73 | ||
79 | d_instantiate(dentry, inode); | 74 | d_instantiate(dentry, inode); |
80 | inode->i_nlink = 0; /* It is unlinked */ | 75 | error = -ENFILE; |
76 | file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
77 | &ramfs_file_operations); | ||
78 | if (!file) | ||
79 | goto put_dentry; | ||
81 | 80 | ||
82 | file->f_path.mnt = mntget(shm_mnt); | 81 | inode->i_nlink = 0; /* It is unlinked */ |
83 | file->f_path.dentry = dentry; | ||
84 | file->f_mapping = inode->i_mapping; | ||
85 | file->f_op = &ramfs_file_operations; | ||
86 | file->f_mode = FMODE_WRITE | FMODE_READ; | ||
87 | 82 | ||
88 | /* notify everyone as to the change of file size */ | 83 | /* notify everyone as to the change of file size */ |
89 | error = do_truncate(dentry, size, 0, file); | 84 | error = do_truncate(dentry, size, 0, file); |
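The tiny-shmem hunk above drops get_empty_filp() plus hand-assembled struct file fields in favour of a single alloc_file() call, and only clears i_nlink once the file exists. A hedged sketch of the same conversion with illustrative names (open_internal_file, my_mnt, my_dentry, my_fops):

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>

static struct file *open_internal_file(struct vfsmount *my_mnt,
                                       struct dentry *my_dentry,
                                       const struct file_operations *my_fops)
{
        struct file *file;

        /*
         * Previously: get_empty_filp(), then f_path.mnt, f_path.dentry,
         * f_mapping, f_op and f_mode assigned by hand.  alloc_file()
         * bundles that setup into one call.
         */
        file = alloc_file(my_mnt, my_dentry, FMODE_READ | FMODE_WRITE, my_fops);
        if (!file)
                return ERR_PTR(-ENFILE);

        return file;
}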
diff --git a/mm/truncate.c b/mm/truncate.c index 5cdfbc1a59fd..cadc15653dde 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
11 | #include <linux/backing-dev.h> | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
12 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) | |||
72 | struct address_space *mapping = page->mapping; | 73 | struct address_space *mapping = page->mapping; |
73 | if (mapping && mapping_cap_account_dirty(mapping)) { | 74 | if (mapping && mapping_cap_account_dirty(mapping)) { |
74 | dec_zone_page_state(page, NR_FILE_DIRTY); | 75 | dec_zone_page_state(page, NR_FILE_DIRTY); |
76 | dec_bdi_stat(mapping->backing_dev_info, | ||
77 | BDI_RECLAIMABLE); | ||
75 | if (account_size) | 78 | if (account_size) |
76 | task_io_account_cancelled_write(account_size); | 79 | task_io_account_cancelled_write(account_size); |
77 | } | 80 | } |
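cancel_dirty_page() now drops the per-BDI reclaimable count alongside the zone dirty count, keeping the two in lock-step. A sketch of both directions of that bookkeeping; account_dirtied() and account_cancelled() are illustrative helpers, not kernel functions:

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmstat.h>

static void account_dirtied(struct page *page, struct address_space *mapping)
{
        if (mapping_cap_account_dirty(mapping)) {
                inc_zone_page_state(page, NR_FILE_DIRTY);
                inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
        }
}

static void account_cancelled(struct page *page, struct address_space *mapping)
{
        if (mapping_cap_account_dirty(mapping)) {
                dec_zone_page_state(page, NR_FILE_DIRTY);
                dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
        }
}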
diff --git a/mm/vmscan.c b/mm/vmscan.c index bbd194630c5b..e1471385d001 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1108,8 +1108,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1108 | unsigned long nr_to_scan; | 1108 | unsigned long nr_to_scan; |
1109 | unsigned long nr_reclaimed = 0; | 1109 | unsigned long nr_reclaimed = 0; |
1110 | 1110 | ||
1111 | atomic_inc(&zone->reclaim_in_progress); | ||
1112 | |||
1113 | /* | 1111 | /* |
1114 | * Add one to `nr_to_scan' just to make sure that the kernel will | 1112 | * Add one to `nr_to_scan' just to make sure that the kernel will |
1115 | * slowly sift through the active list. | 1113 | * slowly sift through the active list. |
@@ -1148,8 +1146,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1148 | } | 1146 | } |
1149 | 1147 | ||
1150 | throttle_vm_writeout(sc->gfp_mask); | 1148 | throttle_vm_writeout(sc->gfp_mask); |
1151 | |||
1152 | atomic_dec(&zone->reclaim_in_progress); | ||
1153 | return nr_reclaimed; | 1149 | return nr_reclaimed; |
1154 | } | 1150 | } |
1155 | 1151 | ||
@@ -1187,7 +1183,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
1187 | 1183 | ||
1188 | note_zone_scanning_priority(zone, priority); | 1184 | note_zone_scanning_priority(zone, priority); |
1189 | 1185 | ||
1190 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1186 | if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) |
1191 | continue; /* Let kswapd poll it */ | 1187 | continue; /* Let kswapd poll it */ |
1192 | 1188 | ||
1193 | sc->all_unreclaimable = 0; | 1189 | sc->all_unreclaimable = 0; |
@@ -1368,7 +1364,8 @@ loop_again: | |||
1368 | if (!populated_zone(zone)) | 1364 | if (!populated_zone(zone)) |
1369 | continue; | 1365 | continue; |
1370 | 1366 | ||
1371 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1367 | if (zone_is_all_unreclaimable(zone) && |
1368 | priority != DEF_PRIORITY) | ||
1372 | continue; | 1369 | continue; |
1373 | 1370 | ||
1374 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1371 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
@@ -1403,7 +1400,8 @@ loop_again: | |||
1403 | if (!populated_zone(zone)) | 1400 | if (!populated_zone(zone)) |
1404 | continue; | 1401 | continue; |
1405 | 1402 | ||
1406 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1403 | if (zone_is_all_unreclaimable(zone) && |
1404 | priority != DEF_PRIORITY) | ||
1407 | continue; | 1405 | continue; |
1408 | 1406 | ||
1409 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1407 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
@@ -1424,12 +1422,13 @@ loop_again: | |||
1424 | lru_pages); | 1422 | lru_pages); |
1425 | nr_reclaimed += reclaim_state->reclaimed_slab; | 1423 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1426 | total_scanned += sc.nr_scanned; | 1424 | total_scanned += sc.nr_scanned; |
1427 | if (zone->all_unreclaimable) | 1425 | if (zone_is_all_unreclaimable(zone)) |
1428 | continue; | 1426 | continue; |
1429 | if (nr_slab == 0 && zone->pages_scanned >= | 1427 | if (nr_slab == 0 && zone->pages_scanned >= |
1430 | (zone_page_state(zone, NR_ACTIVE) | 1428 | (zone_page_state(zone, NR_ACTIVE) |
1431 | + zone_page_state(zone, NR_INACTIVE)) * 6) | 1429 | + zone_page_state(zone, NR_INACTIVE)) * 6) |
1432 | zone->all_unreclaimable = 1; | 1430 | zone_set_flag(zone, |
1431 | ZONE_ALL_UNRECLAIMABLE); | ||
1433 | /* | 1432 | /* |
1434 | * If we've done a decent amount of scanning and | 1433 | * If we've done a decent amount of scanning and |
1435 | * the reclaim ratio is low, start doing writepage | 1434 | * the reclaim ratio is low, start doing writepage |
@@ -1595,7 +1594,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, | |||
1595 | if (!populated_zone(zone)) | 1594 | if (!populated_zone(zone)) |
1596 | continue; | 1595 | continue; |
1597 | 1596 | ||
1598 | if (zone->all_unreclaimable && prio != DEF_PRIORITY) | 1597 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) |
1599 | continue; | 1598 | continue; |
1600 | 1599 | ||
1601 | /* For pass = 0 we don't shrink the active list */ | 1600 | /* For pass = 0 we don't shrink the active list */ |
@@ -1897,6 +1896,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1897 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1896 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
1898 | { | 1897 | { |
1899 | int node_id; | 1898 | int node_id; |
1899 | int ret; | ||
1900 | 1900 | ||
1901 | /* | 1901 | /* |
1902 | * Zone reclaim reclaims unmapped file backed pages and | 1902 | * Zone reclaim reclaims unmapped file backed pages and |
@@ -1914,15 +1914,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1914 | <= zone->min_slab_pages) | 1914 | <= zone->min_slab_pages) |
1915 | return 0; | 1915 | return 0; |
1916 | 1916 | ||
1917 | if (zone_is_all_unreclaimable(zone)) | ||
1918 | return 0; | ||
1919 | |||
1917 | /* | 1920 | /* |
1918 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | 1921 | * Do not scan if the allocation should not be delayed. |
1919 | * not have reclaimable pages and if we should not delay the allocation | ||
1920 | * then do not scan. | ||
1921 | */ | 1922 | */ |
1922 | if (!(gfp_mask & __GFP_WAIT) || | 1923 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
1923 | zone->all_unreclaimable || | ||
1924 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
1925 | (current->flags & PF_MEMALLOC)) | ||
1926 | return 0; | 1924 | return 0; |
1927 | 1925 | ||
1928 | /* | 1926 | /* |
@@ -1934,6 +1932,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1934 | node_id = zone_to_nid(zone); | 1932 | node_id = zone_to_nid(zone); |
1935 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 1933 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
1936 | return 0; | 1934 | return 0; |
1937 | return __zone_reclaim(zone, gfp_mask, order); | 1935 | |
1936 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | ||
1937 | return 0; | ||
1938 | ret = __zone_reclaim(zone, gfp_mask, order); | ||
1939 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | ||
1940 | |||
1941 | return ret; | ||
1938 | } | 1942 | } |
1939 | #endif | 1943 | #endif |
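The zone_reclaim() hunk above replaces the old reclaim_in_progress atomic and the all_unreclaimable byte with flag bits: ZONE_RECLAIM_LOCKED acts as a test-and-set try-lock around __zone_reclaim(), cleared when the scan finishes. The same shape in isolation, with an illustrative scan_target structure:

#include <linux/bitops.h>

enum {
        SCAN_IN_PROGRESS,       /* plays the role of ZONE_RECLAIM_LOCKED */
};

struct scan_target {
        unsigned long flags;
};

static int try_scan(struct scan_target *t, int (*do_scan)(struct scan_target *))
{
        int ret;

        /* Only one scanner at a time; concurrent callers back off with 0. */
        if (test_and_set_bit(SCAN_IN_PROGRESS, &t->flags))
                return 0;

        ret = do_scan(t);

        clear_bit(SCAN_IN_PROGRESS, &t->flags);
        return ret;
}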
diff --git a/mm/vmstat.c b/mm/vmstat.c index 3b5e9043e7db..4651bf153f35 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -704,7 +704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
704 | "\n all_unreclaimable: %u" | 704 | "\n all_unreclaimable: %u" |
705 | "\n prev_priority: %i" | 705 | "\n prev_priority: %i" |
706 | "\n start_pfn: %lu", | 706 | "\n start_pfn: %lu", |
707 | zone->all_unreclaimable, | 707 | zone_is_all_unreclaimable(zone), |
708 | zone->prev_priority, | 708 | zone->prev_priority, |
709 | zone->zone_start_pfn); | 709 | zone->zone_start_pfn); |
710 | seq_putc(m, '\n'); | 710 | seq_putc(m, '\n'); |
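zoneinfo_show_print() and the vmscan call sites above all go through the new zone flag helpers. A hedged sketch of how such helpers are plausibly built on bitops over a flags word in struct zone; the in-tree definitions live in include/linux/mmzone.h and may differ, so the sketch uses my_-prefixed names:

#include <linux/bitops.h>
#include <linux/mmzone.h>

static inline void my_zone_set_flag(struct zone *zone, unsigned int flag)
{
        set_bit(flag, &zone->flags);
}

static inline int my_zone_test_and_set_flag(struct zone *zone, unsigned int flag)
{
        return test_and_set_bit(flag, &zone->flags);
}

static inline void my_zone_clear_flag(struct zone *zone, unsigned int flag)
{
        clear_bit(flag, &zone->flags);
}

static inline int my_zone_is_all_unreclaimable(const struct zone *zone)
{
        return test_bit(ZONE_ALL_UNRECLAIMABLE, &zone->flags);
}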