Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c    |  47
-rw-r--r--  mm/filemap.c        |  15
-rw-r--r--  mm/fremap.c         |  26
-rw-r--r--  mm/mmap.c           |   3
-rw-r--r--  mm/nommu.c          |   1
-rw-r--r--  mm/oom_kill.c       | 107
-rw-r--r--  mm/page-writeback.c | 300
-rw-r--r--  mm/page_alloc.c     |  23
-rw-r--r--  mm/readahead.c      |   6
-rw-r--r--  mm/rmap.c           |   4
-rw-r--r--  mm/shmem.c          |  20
-rw-r--r--  mm/slab.c           |  14
-rw-r--r--  mm/slob.c           |   6
-rw-r--r--  mm/slub.c           |  30
-rw-r--r--  mm/swap.c           |   5
-rw-r--r--  mm/tiny-shmem.c     |  19
-rw-r--r--  mm/truncate.c       |   3
-rw-r--r--  mm/vmscan.c         |  40
-rw-r--r--  mm/vmstat.c         |   2
19 files changed, 484 insertions(+), 187 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a2811f9dc..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -5,6 +5,41 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 
+int bdi_init(struct backing_dev_info *bdi)
+{
+	int i, j;
+	int err;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
+		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		if (err)
+			goto err;
+	}
+
+	bdi->dirty_exceeded = 0;
+	err = prop_local_init_percpu(&bdi->completions);
+
+	if (err) {
+err:
+		for (j = 0; j < i; j++)
+			percpu_counter_destroy(&bdi->bdi_stat[i]);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL(bdi_init);
+
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+	int i;
+
+	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
+		percpu_counter_destroy(&bdi->bdi_stat[i]);
+
+	prop_local_destroy_percpu(&bdi->completions);
+}
+EXPORT_SYMBOL(bdi_destroy);
+
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -55,15 +90,3 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
-/**
- * congestion_end - wake up sleepers on a congested backing_dev_info
- * @rw: READ or WRITE
- */
-void congestion_end(int rw)
-{
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	if (waitqueue_active(wqh))
-		wake_up(wqh);
-}
-EXPORT_SYMBOL(congestion_end);
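The backing-dev.c hunk above adds bdi_init()/bdi_destroy() as the setup and teardown pair for a struct backing_dev_info. A minimal usage sketch follows; the foo_device driver, its extra fields and its registration path are hypothetical, only the two helpers and their signatures come from this patch.

#include <linux/backing-dev.h>

struct foo_device {
	struct backing_dev_info bdi;	/* embedded BDI, as mappings/block devices do */
	/* ... other per-device state (hypothetical) ... */
};

static int foo_setup(struct foo_device *foo)
{
	int err;

	err = bdi_init(&foo->bdi);	/* sets up the per-cpu BDI counters */
	if (err)
		return err;		/* bdi_init() already unwound its own state */
	/* ... register the device ... */
	return 0;
}

static void foo_teardown(struct foo_device *foo)
{
	bdi_destroy(&foo->bdi);		/* must run before the structure is freed */
}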
diff --git a/mm/filemap.c b/mm/filemap.c
index c6049e947cd9..79f24a969cb4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  * ->private_lock (__free_pte->__set_page_dirty_buffers)
  * ->swap_lock (exclusive_swap_page, others)
  * ->mapping->tree_lock
+ * ->zone.lock
  *
  * ->i_mutex
  * ->i_mmap_lock (truncate->unmap_mapping_range)
@@ -1626,12 +1627,18 @@ int __remove_suid(struct dentry *dentry, int kill)
 
 int remove_suid(struct dentry *dentry)
 {
-	int kill = should_remove_suid(dentry);
+	int killsuid = should_remove_suid(dentry);
+	int killpriv = security_inode_need_killpriv(dentry);
+	int error = 0;
 
-	if (unlikely(kill))
-		return __remove_suid(dentry, kill);
+	if (killpriv < 0)
+		return killpriv;
+	if (killpriv)
+		error = security_inode_killpriv(dentry);
+	if (!error && killsuid)
+		error = __remove_suid(dentry, killsuid);
 
-	return 0;
+	return error;
 }
 EXPORT_SYMBOL(remove_suid);
 
diff --git a/mm/fremap.c b/mm/fremap.c
index 95bcb5641c72..14bd3bf7826e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,7 +5,7 @@
  *
  * started by Ingo Molnar, Copyright (C) 2002, 2003
  */
-
+#include <linux/backing-dev.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/file.h>
@@ -97,26 +97,28 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
 
 }
 
-/***
- * sys_remap_file_pages - remap arbitrary pages of a shared backing store
- * file within an existing vma.
+/**
+ * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
  * @start: start of the remapped virtual memory range
  * @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range
- * @pgoff: to be mapped page of the backing store file
+ * @prot: new protection bits of the range (see NOTE)
+ * @pgoff: to-be-mapped page of the backing store file
  * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
  *
- * this syscall works purely via pagetables, so it's the most efficient
+ * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
+ * (shared backing store file).
+ *
+ * This syscall works purely via pagetables, so it's the most efficient
  * way to map the same (large) file into a given virtual window. Unlike
  * mmap()/mremap() it does not create any new vmas. The new mappings are
  * also safe across swapout.
  *
- * NOTE: the 'prot' parameter right now is ignored, and the vma's default
- * protection is used. Arbitrary protections might be implemented in the
- * future.
+ * NOTE: the 'prot' parameter right now is ignored (but must be zero),
+ * and the vma's default protection is used. Arbitrary protections
+ * might be implemented in the future.
  */
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
-	unsigned long __prot, unsigned long pgoff, unsigned long flags)
+	unsigned long prot, unsigned long pgoff, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
 	struct address_space *mapping;
@@ -125,7 +127,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 	int err = -EINVAL;
 	int has_write_lock = 0;
 
-	if (__prot)
+	if (prot)
 		return err;
 	/*
 	 * Sanitize the syscall parameters:
diff --git a/mm/mmap.c b/mm/mmap.c
index 0d40e66c841b..4275e81e25ba 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <linux/mm.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
@@ -180,8 +181,6 @@ error:
 	return -ENOMEM;
 }
 
-EXPORT_SYMBOL(__vm_enough_memory);
-
 /*
  * Requires inode->i_mapping->i_mmap_lock
  */
diff --git a/mm/nommu.c b/mm/nommu.c
index 8ed0cb43118a..42fb84e9e815 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,7 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int heap_stack_gap = 0;
 
 EXPORT_SYMBOL(mem_map);
-EXPORT_SYMBOL(__vm_enough_memory);
 EXPORT_SYMBOL(num_physpages);
 
 /* list of shareable VMAs */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 41b4e362221d..a64decb5b13f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -27,6 +27,8 @@
 #include <linux/notifier.h>
 
 int sysctl_panic_on_oom;
+int sysctl_oom_kill_allocating_task;
+static DEFINE_SPINLOCK(zone_scan_mutex);
 /* #define DEBUG */
 
 /**
@@ -141,7 +143,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 	 * because p may have allocated or otherwise mapped memory on
 	 * this node before. However it will be less likely.
 	 */
-	if (!cpuset_excl_nodes_overlap(p))
+	if (!cpuset_mems_allowed_intersects(current, p))
 		points /= 8;
 
 	/*
@@ -164,16 +166,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 }
 
 /*
- * Types of limitations to the nodes from which allocations may occur
- */
-#define CONSTRAINT_NONE 1
-#define CONSTRAINT_MEMORY_POLICY 2
-#define CONSTRAINT_CPUSET 3
-
-/*
  * Determine the type of allocation constraint.
  */
-static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+						gfp_t gfp_mask)
 {
 #ifdef CONFIG_NUMA
 	struct zone **z;
@@ -337,12 +333,20 @@ static int oom_kill_task(struct task_struct *p)
 	return 0;
 }
 
-static int oom_kill_process(struct task_struct *p, unsigned long points,
-		const char *message)
+static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
+			unsigned long points, const char *message)
 {
 	struct task_struct *c;
 	struct list_head *tsk;
 
+	if (printk_ratelimit()) {
+		printk(KERN_WARNING "%s invoked oom-killer: "
+			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
+			current->comm, gfp_mask, order, current->oomkilladj);
+		dump_stack();
+		show_mem();
+	}
+
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
@@ -380,6 +384,57 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
+/*
+ * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
+ * if a parallel OOM killing is already taking place that includes a zone in
+ * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
+ */
+int try_set_zone_oom(struct zonelist *zonelist)
+{
+	struct zone **z;
+	int ret = 1;
+
+	z = zonelist->zones;
+
+	spin_lock(&zone_scan_mutex);
+	do {
+		if (zone_is_oom_locked(*z)) {
+			ret = 0;
+			goto out;
+		}
+	} while (*(++z) != NULL);
+
+	/*
+	 * Lock each zone in the zonelist under zone_scan_mutex so a parallel
+	 * invocation of try_set_zone_oom() doesn't succeed when it shouldn't.
+	 */
+	z = zonelist->zones;
+	do {
+		zone_set_flag(*z, ZONE_OOM_LOCKED);
+	} while (*(++z) != NULL);
+out:
+	spin_unlock(&zone_scan_mutex);
+	return ret;
+}
+
+/*
+ * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
+ * allocation attempts with zonelists containing them may now recall the OOM
+ * killer, if necessary.
+ */
+void clear_zonelist_oom(struct zonelist *zonelist)
+{
+	struct zone **z;
+
+	z = zonelist->zones;
+
+	spin_lock(&zone_scan_mutex);
+	do {
+		zone_clear_flag(*z, ZONE_OOM_LOCKED);
+	} while (*(++z) != NULL);
+	spin_unlock(&zone_scan_mutex);
+}
+
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  *
@@ -393,21 +448,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 	struct task_struct *p;
 	unsigned long points = 0;
 	unsigned long freed = 0;
-	int constraint;
+	enum oom_constraint constraint;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
 		/* Got some memory back in the last second. */
 		return;
 
-	if (printk_ratelimit()) {
-		printk(KERN_WARNING "%s invoked oom-killer: "
-			"gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
-			current->comm, gfp_mask, order, current->oomkilladj);
-		dump_stack();
-		show_mem();
-	}
-
 	if (sysctl_panic_on_oom == 2)
 		panic("out of memory. Compulsory panic_on_oom is selected.\n");
 
@@ -416,23 +463,24 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 	 * NUMA) that may require different handling.
 	 */
 	constraint = constrained_alloc(zonelist, gfp_mask);
-	cpuset_lock();
 	read_lock(&tasklist_lock);
 
 	switch (constraint) {
 	case CONSTRAINT_MEMORY_POLICY:
-		oom_kill_process(current, points,
+		oom_kill_process(current, gfp_mask, order, points,
 				"No available memory (MPOL_BIND)");
 		break;
 
-	case CONSTRAINT_CPUSET:
-		oom_kill_process(current, points,
-				"No available memory in cpuset");
-		break;
-
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom)
 			panic("out of memory. panic_on_oom is selected\n");
+		/* Fall-through */
+	case CONSTRAINT_CPUSET:
+		if (sysctl_oom_kill_allocating_task) {
+			oom_kill_process(current, gfp_mask, order, points,
+				"Out of memory (oom_kill_allocating_task)");
+			break;
+		}
retry:
 	/*
 	 * Rambo mode: Shoot down a process and hope it solves whatever
@@ -446,11 +494,11 @@ retry:
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
-		cpuset_unlock();
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, points, "Out of memory"))
+	if (oom_kill_process(p, points, gfp_mask, order,
+				"Out of memory"))
 		goto retry;
 
 		break;
@@ -458,7 +506,6 @@ retry:
 
out:
 	read_unlock(&tasklist_lock);
-	cpuset_unlock();
 
 	/*
 	 * Give "p" a good chance of killing itself before we
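The two helpers added earlier in this file's diff serialize OOM killing per zonelist: try_set_zone_oom() flags every zone ZONE_OOM_LOCKED under zone_scan_mutex and fails if any zone is already flagged, and clear_zonelist_oom() drops the flags afterwards. A condensed sketch of the caller-side pattern, taken from the mm/page_alloc.c hunk further down (the surrounding retry labels belong to that allocator path, nothing new is introduced here):

	/* Condensed from the __alloc_pages() changes in mm/page_alloc.c below. */
	if (!try_set_zone_oom(zonelist)) {
		/* Another task is already OOM-killing in one of these zones. */
		schedule_timeout_uninterruptible(1);
		goto restart;
	}

	/* ... last-chance allocation attempt with a very high watermark ... */

	out_of_memory(zonelist, gfp_mask, order);
	clear_zonelist_oom(zonelist);
	goto restart;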
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d821321326e3..7845462064f4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
  * mm/page-writeback.c
  *
  * Copyright (C) 2002, Linus Torvalds.
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  * Contains functions related to writing back dirty pages at the
  * address_space level.
@@ -36,7 +37,7 @@
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_LOCK against an inode for
+ * operation. We do this so we don't hold I_SYNC against an inode for
  * enormous amounts of time, which would block a userspace task which has
  * been forced to throttle against that inode. Also, the code reevaluates
  * the dirty each time it has written this many pages.
@@ -49,8 +50,6 @@
  */
 static long ratelimit_pages = 32;
 
-static int dirty_exceeded __cacheline_aligned_in_smp;	/* Dirty mem may be over limit */
-
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode);
 static void background_writeout(unsigned long _min_pages);
 
 /*
+ * Scale the writeback cache size proportional to the relative writeout speeds.
+ *
+ * We do this by keeping a floating proportion between BDIs, based on page
+ * writeback completions [end_page_writeback()]. Those devices that write out
+ * pages fastest will get the larger share, while the slower will get a smaller
+ * share.
+ *
+ * We use page writeout completions because we are interested in getting rid of
+ * dirty pages. Having them written out is the primary goal.
+ *
+ * We introduce a concept of time, a period over which we measure these events,
+ * because demand can/will vary over time. The length of this period itself is
+ * measured in page writeback completions.
+ *
+ */
+static struct prop_descriptor vm_completions;
+static struct prop_descriptor vm_dirties;
+
+static unsigned long determine_dirtyable_memory(void);
+
+/*
+ * couple the period to the dirty_ratio:
+ *
+ *   period/2 ~ roundup_pow_of_two(dirty limit)
+ */
+static int calc_period_shift(void)
+{
+	unsigned long dirty_total;
+
+	dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+	return 2 + ilog2(dirty_total - 1);
+}
+
+/*
+ * update the period when the dirty ratio changes.
+ */
+int dirty_ratio_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int old_ratio = vm_dirty_ratio;
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+		int shift = calc_period_shift();
+		prop_change_shift(&vm_completions, shift);
+		prop_change_shift(&vm_dirties, shift);
+	}
+	return ret;
+}
+
+/*
+ * Increment the BDI's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+{
+	__prop_inc_percpu(&vm_completions, &bdi->completions);
+}
+
+static inline void task_dirty_inc(struct task_struct *tsk)
+{
+	prop_inc_single(&vm_dirties, &tsk->dirties);
+}
+
+/*
+ * Obtain an accurate fraction of the BDI's portion.
+ */
+static void bdi_writeout_fraction(struct backing_dev_info *bdi,
+		long *numerator, long *denominator)
+{
+	if (bdi_cap_writeback_dirty(bdi)) {
+		prop_fraction_percpu(&vm_completions, &bdi->completions,
+				numerator, denominator);
+	} else {
+		*numerator = 0;
+		*denominator = 1;
+	}
+}
+
+/*
+ * Clip the earned share of dirty pages to that which is actually available.
+ * This avoids exceeding the total dirty_limit when the floating averages
+ * fluctuate too quickly.
+ */
+static void
+clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
+{
+	long avail_dirty;
+
+	avail_dirty = dirty -
+		(global_page_state(NR_FILE_DIRTY) +
+		 global_page_state(NR_WRITEBACK) +
+		 global_page_state(NR_UNSTABLE_NFS));
+
+	if (avail_dirty < 0)
+		avail_dirty = 0;
+
+	avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
+		bdi_stat(bdi, BDI_WRITEBACK);
+
+	*pbdi_dirty = min(*pbdi_dirty, avail_dirty);
+}
+
+static inline void task_dirties_fraction(struct task_struct *tsk,
+		long *numerator, long *denominator)
+{
+	prop_fraction_single(&vm_dirties, &tsk->dirties,
+				numerator, denominator);
+}
+
+/*
+ * scale the dirty limit
+ *
+ * task specific dirty limit:
+ *
+ *   dirty -= (dirty/8) * p_{t}
+ */
+void task_dirty_limit(struct task_struct *tsk, long *pdirty)
+{
+	long numerator, denominator;
+	long dirty = *pdirty;
+	u64 inv = dirty >> 3;
+
+	task_dirties_fraction(tsk, &numerator, &denominator);
+	inv *= numerator;
+	do_div(inv, denominator);
+
+	dirty -= inv;
+	if (dirty < *pdirty/2)
+		dirty = *pdirty/2;
+
+	*pdirty = dirty;
+}
+
+/*
  * Work out the current dirty-memory clamping and background writeout
  * thresholds.
  *
@@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void)
 }
 
 static void
-get_dirty_limits(long *pbackground, long *pdirty,
-		struct address_space *mapping)
+get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		struct backing_dev_info *bdi)
 {
 	int background_ratio;		/* Percentages */
 	int dirty_ratio;
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty,
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+
+	if (bdi) {
+		u64 bdi_dirty = dirty;
+		long numerator, denominator;
+
+		/*
+		 * Calculate this BDI's share of the dirty ratio.
+		 */
+		bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+		bdi_dirty *= numerator;
+		do_div(bdi_dirty, denominator);
+
+		*pbdi_dirty = bdi_dirty;
+		clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
+		task_dirty_limit(current, pbdi_dirty);
+	}
 }
 
 /*
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
-	long nr_reclaimable;
+	long bdi_nr_reclaimable;
+	long bdi_nr_writeback;
 	long background_thresh;
 	long dirty_thresh;
+	long bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long write_chunk = sync_writeback_pages();
 
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping)
 			.range_cyclic	= 1,
 		};
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, mapping);
-		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-			global_page_state(NR_UNSTABLE_NFS);
-		if (nr_reclaimable + global_page_state(NR_WRITEBACK) <=
-			dirty_thresh)
+		get_dirty_limits(&background_thresh, &dirty_thresh,
+				&bdi_thresh, bdi);
+		bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+		bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
 			break;
 
-		if (!dirty_exceeded)
-			dirty_exceeded = 1;
+		if (!bdi->dirty_exceeded)
+			bdi->dirty_exceeded = 1;
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
@@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping)
 		 * written to the server's write cache, but has not yet
 		 * been flushed to permanent storage.
 		 */
-		if (nr_reclaimable) {
+		if (bdi_nr_reclaimable) {
 			writeback_inodes(&wbc);
-			get_dirty_limits(&background_thresh,
-					&dirty_thresh, mapping);
-			nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-				global_page_state(NR_UNSTABLE_NFS);
-			if (nr_reclaimable +
-				global_page_state(NR_WRITEBACK)
-					<= dirty_thresh)
-				break;
 			pages_written += write_chunk - wbc.nr_to_write;
-			if (pages_written >= write_chunk)
-				break;		/* We've done our duty */
+			get_dirty_limits(&background_thresh, &dirty_thresh,
+					&bdi_thresh, bdi);
+		}
+
+		/*
+		 * In order to avoid the stacked BDI deadlock we need
+		 * to ensure we accurately count the 'dirty' pages when
+		 * the threshold is low.
+		 *
+		 * Otherwise it would be possible to get thresh+n pages
+		 * reported dirty, even though there are thresh-m pages
+		 * actually dirty; with m+n sitting in the percpu
+		 * deltas.
+		 */
+		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+		} else if (bdi_nr_reclaimable) {
+			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
+			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
 		}
+
+		if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+			break;
+		if (pages_written >= write_chunk)
+			break;		/* We've done our duty */
+
 		congestion_wait(WRITE, HZ/10);
 	}
 
-	if (nr_reclaimable + global_page_state(NR_WRITEBACK)
-		<= dirty_thresh && dirty_exceeded)
-		dirty_exceeded = 0;
+	if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
+			bdi->dirty_exceeded)
+		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
 		return;		/* pdflush is already working this queue */
@@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && (nr_reclaimable > background_thresh)))
+	    (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
+					+ global_page_state(NR_UNSTABLE_NFS)
+					> background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
-	if (dirty_exceeded)
+	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
 
 	/*
@@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 	long background_thresh;
 	long dirty_thresh;
 
-	if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
-		/*
-		 * The caller might hold locks which can prevent IO completion
-		 * or progress in the filesystem. So we cannot just sit here
-		 * waiting for IO to complete.
-		 */
-		congestion_wait(WRITE, HZ/10);
-		return;
-	}
-
 	for ( ; ; ) {
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 
 		/*
 		 * Boost the allowable dirty threshold a bit for page
@@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 			global_page_state(NR_WRITEBACK) <= dirty_thresh)
 				break;
 		congestion_wait(WRITE, HZ/10);
+
+		/*
+		 * The caller might hold locks which can prevent IO completion
+		 * or progress in the filesystem. So we cannot just sit here
+		 * waiting for IO to complete.
+		 */
+		if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
+			break;
 	}
 }
 
@@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages)
 		long background_thresh;
 		long dirty_thresh;
 
-		get_dirty_limits(&background_thresh, &dirty_thresh, NULL);
+		get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
 		if (global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) < background_thresh
 				&& min_pages <= 0)
 			break;
+		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
@@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages)
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
-			congestion_wait(WRITE, HZ/10);
-			if (!wbc.encountered_congestion)
+			if (wbc.encountered_congestion || wbc.more_io)
+				congestion_wait(WRITE, HZ/10);
+			else
 				break;
 		}
 	}
@@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg)
 		global_page_state(NR_UNSTABLE_NFS) +
 		(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 	while (nr_to_write > 0) {
+		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		writeback_inodes(&wbc);
 		if (wbc.nr_to_write > 0) {
-			if (wbc.encountered_congestion)
+			if (wbc.encountered_congestion || wbc.more_io)
 				congestion_wait(WRITE, HZ/10);
 			else
 				break;	/* All the old data is written */
@@ -580,9 +752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
  */
 void __init page_writeback_init(void)
 {
+	int shift;
+
 	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
+
+	shift = calc_period_shift();
+	prop_descriptor_init(&vm_completions, shift);
+	prop_descriptor_init(&vm_dirties, shift);
 }
 
 /**
@@ -672,8 +850,10 @@ retry:
 
 			ret = (*writepage)(page, wbc, data);
 
-			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE))
+			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
 				unlock_page(page);
+				ret = 0;
+			}
 			if (ret || (--(wbc->nr_to_write) <= 0))
 				done = 1;
 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
@@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page)
 			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
 			if (mapping_cap_account_dirty(mapping)) {
 				__inc_zone_page_state(page, NR_FILE_DIRTY);
+				__inc_bdi_stat(mapping->backing_dev_info,
+						BDI_RECLAIMABLE);
 				task_io_account_write(PAGE_CACHE_SIZE);
 			}
 			radix_tree_tag_set(&mapping->page_tree,
@@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
  */
-int fastcall set_page_dirty(struct page *page)
+static int __set_page_dirty(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
 
@@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page)
 	}
 	return 0;
 }
+
+int fastcall set_page_dirty(struct page *page)
+{
+	int ret = __set_page_dirty(page);
+	if (ret)
+		task_dirty_inc(current);
+	return ret;
+}
 EXPORT_SYMBOL(set_page_dirty);
 
 /*
@@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page)
 		 */
 		if (TestClearPageDirty(page)) {
 			dec_zone_page_state(page, NR_FILE_DIRTY);
+			dec_bdi_stat(mapping->backing_dev_info,
+					BDI_RECLAIMABLE);
 			return 1;
 		}
 	return 0;
@@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page)
 	int ret;
 
 	if (mapping) {
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestClearPageWriteback(page);
-		if (ret)
+		if (ret) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_writeback_dirty(bdi)) {
+				__dec_bdi_stat(bdi, BDI_WRITEBACK);
+				__bdi_writeout_inc(bdi);
+			}
+		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestClearPageWriteback(page);
@@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page)
 	int ret;
 
 	if (mapping) {
+		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
 
 		write_lock_irqsave(&mapping->tree_lock, flags);
 		ret = TestSetPageWriteback(page);
-		if (!ret)
+		if (!ret) {
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
+			if (bdi_cap_writeback_dirty(bdi))
+				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+		}
 		if (!PageDirty(page))
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
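task_dirty_limit(), added in the mm/page-writeback.c diff above, lowers a task's effective dirty threshold by up to one eighth of the per-BDI limit in proportion to how much of the recent dirtying that task did (p_{t} = numerator/denominator from the vm_dirties proportion), and never lets it drop below half the original limit. A small userspace model of just that arithmetic, with made-up numbers purely for illustration:

#include <stdio.h>

/* Userspace model of task_dirty_limit(): dirty -= (dirty/8) * p_{t},
 * where p_{t} = numerator/denominator, floored at half the original limit. */
static long task_dirty_limit_model(long dirty, long numerator, long denominator)
{
	long limit = dirty;
	unsigned long long inv = dirty >> 3;	/* dirty/8 */

	inv *= numerator;
	inv /= denominator;			/* (dirty/8) * p_{t} */

	limit -= inv;
	if (limit < dirty / 2)			/* never below half the limit */
		limit = dirty / 2;
	return limit;
}

int main(void)
{
	/* A heavy dirtier (p = 3/4) is throttled earlier than a light one (p = 1/100). */
	printf("%ld\n", task_dirty_limit_model(1000, 3, 4));	/* 1000 - 93 = 907 */
	printf("%ld\n", task_dirty_limit_model(1000, 1, 100));	/* 1000 - 1  = 999 */
	return 0;
}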
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d315e1127dc9..43f757fcf30f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -27,6 +27,7 @@
 #include <linux/pagevec.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <linux/oom.h>
 #include <linux/notifier.h>
 #include <linux/topology.h>
 #include <linux/sysctl.h>
@@ -489,7 +490,7 @@ static void free_pages_bulk(struct zone *zone, int count,
 					struct list_head *list, int order)
 {
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
+	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
 	zone->pages_scanned = 0;
 	while (count--) {
 		struct page *page;
@@ -506,7 +507,7 @@ static void free_pages_bulk(struct zone *zone, int count,
 static void free_one_page(struct zone *zone, struct page *page, int order)
 {
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
+	zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
 	zone->pages_scanned = 0;
 	__free_one_page(page, zone, order);
 	spin_unlock(&zone->lock);
@@ -1586,6 +1587,11 @@ nofail_alloc:
 			if (page)
 				goto got_pg;
 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+		if (!try_set_zone_oom(zonelist)) {
+			schedule_timeout_uninterruptible(1);
+			goto restart;
+		}
+
 		/*
 		 * Go through the zonelist yet one more time, keep
 		 * very high watermark here, this is only to catch
@@ -1594,14 +1600,19 @@ nofail_alloc:
 		 */
 		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
 				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-		if (page)
+		if (page) {
+			clear_zonelist_oom(zonelist);
 			goto got_pg;
+		}
 
 		/* The OOM killer will not help higher order allocs so fail */
-		if (order > PAGE_ALLOC_COSTLY_ORDER)
+		if (order > PAGE_ALLOC_COSTLY_ORDER) {
+			clear_zonelist_oom(zonelist);
 			goto nopage;
+		}
 
 		out_of_memory(zonelist, gfp_mask, order);
+		clear_zonelist_oom(zonelist);
 		goto restart;
 	}
 
@@ -1850,7 +1861,7 @@ void show_free_areas(void)
 			K(zone_page_state(zone, NR_INACTIVE)),
 			K(zone->present_pages),
 			zone->pages_scanned,
-			(zone->all_unreclaimable ? "yes" : "no")
+			(zone_is_all_unreclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
@@ -3371,7 +3382,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
 		zone->nr_scan_active = 0;
 		zone->nr_scan_inactive = 0;
 		zap_zone_vm_stats(zone);
-		atomic_set(&zone->reclaim_in_progress, 0);
+		zone->flags = 0;
 		if (!size)
 			continue;
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 229788884010..c9c50ca1ec38 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -233,6 +233,12 @@ unsigned long max_sane_readahead(unsigned long nr)
 		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
 }
 
+static int __init readahead_init(void)
+{
+	return bdi_init(&default_backing_dev_info);
+}
+subsys_initcall(readahead_init);
+
 /*
  * Submit IO for the read-ahead request in file_ra_state.
  */
diff --git a/mm/rmap.c b/mm/rmap.c
index 2b9f413c9c00..8990f909492f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,7 @@
  * mapping->tree_lock (widely used, in set_page_dirty,
  * in arch-dependent flush_dcache_mmap_lock,
  * within inode_lock in __sync_single_inode)
+ * zone->lock (within radix tree node alloc)
  */
 
 #include <linux/mm.h>
@@ -137,8 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		anon_vma_free(anon_vma);
 }
 
-static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
-			  unsigned long flags)
+static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
 {
 	struct anon_vma *anon_vma = data;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 8a82342a8595..289dbb0a6fd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2328,8 +2328,7 @@ static void shmem_destroy_inode(struct inode *inode)
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
-static void init_once(void *foo, struct kmem_cache *cachep,
-		      unsigned long flags)
+static void init_once(struct kmem_cache *cachep, void *foo)
 {
 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
 
@@ -2344,9 +2343,7 @@ static int init_inodecache(void)
 {
 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
 				sizeof(struct shmem_inode_info),
-				0, 0, init_once);
-	if (shmem_inode_cachep == NULL)
-		return -ENOMEM;
+				0, SLAB_PANIC, init_once);
 	return 0;
 }
 
@@ -2464,6 +2461,10 @@ static int __init init_tmpfs(void)
 {
 	int error;
 
+	error = bdi_init(&shmem_backing_dev_info);
+	if (error)
+		goto out4;
+
 	error = init_inodecache();
 	if (error)
 		goto out3;
@@ -2488,6 +2489,8 @@ out1:
out2:
 	destroy_inodecache();
out3:
+	bdi_destroy(&shmem_backing_dev_info);
+out4:
 	shm_mnt = ERR_PTR(error);
 	return error;
 }
@@ -2540,11 +2543,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;	/* It is unlinked */
-	file->f_path.mnt = mntget(shm_mnt);
-	file->f_path.dentry = dentry;
-	file->f_mapping = inode->i_mapping;
-	file->f_op = &shmem_file_operations;
-	file->f_mode = FMODE_WRITE | FMODE_READ;
+	init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+		  &shmem_file_operations);
 	return file;
 
close_file:
diff --git a/mm/slab.c b/mm/slab.c
index e34bcb87a6ee..3ce9bc024d67 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -267,11 +267,10 @@ struct array_cache {
 	unsigned int batchcount;
 	unsigned int touched;
 	spinlock_t lock;
-	void *entry[0];	/*
+	void *entry[];	/*
 			 * Must have this definition in here for the proper
 			 * alignment of array_cache. Also simplifies accessing
 			 * the entries.
-			 * [0] is for gcc 2.95. It should really be [].
 			 */
 };
 
@@ -408,7 +407,7 @@ struct kmem_cache {
 	unsigned int dflags;		/* dynamic flags */
 
 	/* constructor func */
-	void (*ctor) (void *, struct kmem_cache *, unsigned long);
+	void (*ctor)(struct kmem_cache *, void *);
 
/* 5) cache creation/removal */
 	const char *name;
@@ -2129,7 +2128,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
 struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
 	unsigned long flags,
-	void (*ctor)(void*, struct kmem_cache *, unsigned long))
+	void (*ctor)(struct kmem_cache *, void *))
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
@@ -2636,8 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 		 * They must also be threaded.
 		 */
 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
-			cachep->ctor(objp + obj_offset(cachep), cachep,
-					0);
+			cachep->ctor(cachep, objp + obj_offset(cachep));
 
 		if (cachep->flags & SLAB_RED_ZONE) {
 			if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2653,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 					cachep->buffer_size / PAGE_SIZE, 0);
 #else
 		if (cachep->ctor)
-			cachep->ctor(objp, cachep, 0);
+			cachep->ctor(cachep, objp);
 #endif
 		slab_bufctl(slabp)[i] = i + 1;
 	}
@@ -3078,7 +3076,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 #endif
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON)
-		cachep->ctor(objp, cachep, 0);
+		cachep->ctor(cachep, objp);
 #if ARCH_SLAB_MINALIGN
 	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
 		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
diff --git a/mm/slob.c b/mm/slob.c
index de5d5563a46c..5bc2ceb692ec 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -499,12 +499,12 @@ struct kmem_cache {
 	unsigned int size, align;
 	unsigned long flags;
 	const char *name;
-	void (*ctor)(void *, struct kmem_cache *, unsigned long);
+	void (*ctor)(struct kmem_cache *, void *);
 };
 
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 	size_t align, unsigned long flags,
-	void (*ctor)(void*, struct kmem_cache *, unsigned long))
+	void (*ctor)(struct kmem_cache *, void *))
 {
 	struct kmem_cache *c;
 
@@ -548,7 +548,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 		b = slob_new_page(flags, get_order(c->size), node);
 
 	if (c->ctor)
-		c->ctor(b, c, 0);
+		c->ctor(c, b);
 
 	return b;
 }
diff --git a/mm/slub.c b/mm/slub.c
index f426f9bc644b..e29a42988c78 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -980,7 +980,7 @@ __setup("slub_debug", setup_slub_debug);
 
 static unsigned long kmem_cache_flags(unsigned long objsize,
 	unsigned long flags, const char *name,
-	void (*ctor)(void *, struct kmem_cache *, unsigned long))
+	void (*ctor)(struct kmem_cache *, void *))
 {
 	/*
 	 * The page->offset field is only 16 bit wide. This is an offset
@@ -1027,7 +1027,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
 static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
 static inline unsigned long kmem_cache_flags(unsigned long objsize,
 	unsigned long flags, const char *name,
-	void (*ctor)(void *, struct kmem_cache *, unsigned long))
+	void (*ctor)(struct kmem_cache *, void *))
 {
 	return flags;
 }
@@ -1071,7 +1071,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 {
 	setup_object_debug(s, page, object);
 	if (unlikely(s->ctor))
-		s->ctor(object, s, 0);
+		s->ctor(s, object);
 }
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1085,9 +1085,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 
 	BUG_ON(flags & GFP_SLAB_BUG_MASK);
 
-	if (flags & __GFP_WAIT)
-		local_irq_enable();
-
 	page = allocate_slab(s,
 		flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
 	if (!page)
@@ -1120,8 +1117,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	page->freelist = start;
 	page->inuse = 0;
out:
-	if (flags & __GFP_WAIT)
-		local_irq_disable();
 	return page;
 }
 
@@ -1505,7 +1500,14 @@ new_slab:
 		goto load_freelist;
 	}
 
+	if (gfpflags & __GFP_WAIT)
+		local_irq_enable();
+
 	new = new_slab(s, gfpflags, node);
+
+	if (gfpflags & __GFP_WAIT)
+		local_irq_disable();
+
 	if (new) {
 		c = get_cpu_slab(s, smp_processor_id());
 		if (c->page) {
@@ -2039,12 +2041,6 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
 	init_kmem_cache_node(n);
 	atomic_long_inc(&n->nr_slabs);
 	add_partial(n, page);
-
-	/*
-	 * new_slab() disables interupts. If we do not reenable interrupts here
-	 * then bootup would continue with interrupts disabled.
-	 */
-	local_irq_enable();
 	return n;
 }
 
@@ -2215,7 +2211,7 @@ static int calculate_sizes(struct kmem_cache *s)
 static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
 		const char *name, size_t size,
 		size_t align, unsigned long flags,
-		void (*ctor)(void *, struct kmem_cache *, unsigned long))
+		void (*ctor)(struct kmem_cache *, void *))
 {
 	memset(s, 0, kmem_size);
 	s->name = name;
@@ -2805,7 +2801,7 @@ static int slab_unmergeable(struct kmem_cache *s)
 
 static struct kmem_cache *find_mergeable(size_t size,
 		size_t align, unsigned long flags, const char *name,
-		void (*ctor)(void *, struct kmem_cache *, unsigned long))
+		void (*ctor)(struct kmem_cache *, void *))
 {
 	struct kmem_cache *s;
 
@@ -2846,7 +2842,7 @@ static struct kmem_cache *find_mergeable(size_t size,
 
 struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 		size_t align, unsigned long flags,
-		void (*ctor)(void *, struct kmem_cache *, unsigned long))
+		void (*ctor)(struct kmem_cache *, void *))
 {
 	struct kmem_cache *s;
 
diff --git a/mm/swap.c b/mm/swap.c
index d034b2128d2b..a65eff8a517a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -28,6 +28,7 @@
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/backing-dev.h>
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -547,6 +548,10 @@ void __init swap_setup(void)
 {
 	unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
 
+#ifdef CONFIG_SWAP
+	bdi_init(swapper_space.backing_dev_info);
+#endif
+
 	/* Use a smaller cluster for small-memory machines */
 	if (megs < 16)
 		page_cluster = 2;
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 8803471593fd..d436a9c82db7 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 	if (!dentry)
 		goto put_memory;
 
-	error = -ENFILE;
-	file = get_empty_filp();
-	if (!file)
-		goto put_dentry;
-
 	error = -ENOSPC;
 	inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
 	if (!inode)
-		goto close_file;
+		goto put_dentry;
 
 	d_instantiate(dentry, inode);
-	inode->i_nlink = 0;	/* It is unlinked */
+	error = -ENFILE;
+	file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
+			&ramfs_file_operations);
+	if (!file)
+		goto put_dentry;
 
-	file->f_path.mnt = mntget(shm_mnt);
-	file->f_path.dentry = dentry;
-	file->f_mapping = inode->i_mapping;
-	file->f_op = &ramfs_file_operations;
-	file->f_mode = FMODE_WRITE | FMODE_READ;
+	inode->i_nlink = 0;	/* It is unlinked */
 
 	/* notify everyone as to the change of file size */
 	error = do_truncate(dentry, size, 0, file);
diff --git a/mm/truncate.c b/mm/truncate.c
index 5cdfbc1a59fd..cadc15653dde 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/kernel.h>
+#include <linux/backing-dev.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/module.h>
@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
 	struct address_space *mapping = page->mapping;
 	if (mapping && mapping_cap_account_dirty(mapping)) {
 		dec_zone_page_state(page, NR_FILE_DIRTY);
+		dec_bdi_stat(mapping->backing_dev_info,
+				BDI_RECLAIMABLE);
 		if (account_size)
 			task_io_account_cancelled_write(account_size);
 	}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bbd194630c5b..e1471385d001 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1108,8 +1108,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
 
-	atomic_inc(&zone->reclaim_in_progress);
-
 	/*
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
 	 * slowly sift through the active list.
@@ -1148,8 +1146,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	}
 
 	throttle_vm_writeout(sc->gfp_mask);
-
-	atomic_dec(&zone->reclaim_in_progress);
 	return nr_reclaimed;
 }
 
@@ -1187,7 +1183,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 
 		note_zone_scanning_priority(zone, priority);
 
-		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+		if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
 		sc->all_unreclaimable = 0;
@@ -1368,7 +1364,8 @@ loop_again:
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (zone_is_all_unreclaimable(zone) &&
+			    priority != DEF_PRIORITY)
 				continue;
 
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1403,7 +1400,8 @@ loop_again:
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (zone_is_all_unreclaimable(zone) &&
+			    priority != DEF_PRIORITY)
 				continue;
 
 			if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1424,12 +1422,13 @@ loop_again:
 						lru_pages);
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
-			if (zone->all_unreclaimable)
+			if (zone_is_all_unreclaimable(zone))
 				continue;
 			if (nr_slab == 0 && zone->pages_scanned >=
 				(zone_page_state(zone, NR_ACTIVE)
 				+ zone_page_state(zone, NR_INACTIVE)) * 6)
-					zone->all_unreclaimable = 1;
+					zone_set_flag(zone,
+						      ZONE_ALL_UNRECLAIMABLE);
 			/*
 			 * If we've done a decent amount of scanning and
 			 * the reclaim ratio is low, start doing writepage
@@ -1595,7 +1594,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 		if (!populated_zone(zone))
 			continue;
 
-		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
 		/* For pass = 0 we don't shrink the active list */
@@ -1897,6 +1896,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
 	int node_id;
+	int ret;
 
 	/*
 	 * Zone reclaim reclaims unmapped file backed pages and
@@ -1914,15 +1914,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 			<= zone->min_slab_pages)
 		return 0;
 
+	if (zone_is_all_unreclaimable(zone))
+		return 0;
+
 	/*
-	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
-	 * not have reclaimable pages and if we should not delay the allocation
-	 * then do not scan.
+	 * Do not scan if the allocation should not be delayed.
 	 */
-	if (!(gfp_mask & __GFP_WAIT) ||
-			zone->all_unreclaimable ||
-			atomic_read(&zone->reclaim_in_progress) > 0 ||
-			(current->flags & PF_MEMALLOC))
+	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
 		return 0;
 
 	/*
@@ -1934,6 +1932,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	node_id = zone_to_nid(zone);
 	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
 		return 0;
-	return __zone_reclaim(zone, gfp_mask, order);
+
+	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+		return 0;
+	ret = __zone_reclaim(zone, gfp_mask, order);
+	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+
+	return ret;
 }
 #endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 3b5e9043e7db..4651bf153f35 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -704,7 +704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n all_unreclaimable: %u"
 		   "\n prev_priority: %i"
 		   "\n start_pfn: %lu",
-		   zone->all_unreclaimable,
+		   zone_is_all_unreclaimable(zone),
 		   zone->prev_priority,
 		   zone->zone_start_pfn);
 	seq_putc(m, '\n');