author     Paul Mackerras <paulus@samba.org>   2006-02-23 22:05:47 -0500
committer  Paul Mackerras <paulus@samba.org>   2006-02-23 22:05:47 -0500
commit     a00428f5b149e36b8225b2a0812742a6dfb07b8c (patch)
tree       a78869cd67cf78a0eb091fb0ea5d397734bd6738 /mm
parent     774fee58c465ea1c7e9775e347ec307bcf2deeb3 (diff)
parent     fb5c594c2acc441f0d2d8f457484a0e0e9285db3 (diff)
Merge ../powerpc-merge
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c       4
-rw-r--r--  mm/madvise.c      21
-rw-r--r--  mm/memory.c       10
-rw-r--r--  mm/mempolicy.c    22
-rw-r--r--  mm/nommu.c         2
-rw-r--r--  mm/oom_kill.c    123
-rw-r--r--  mm/page_alloc.c   46
-rw-r--r--  mm/shmem.c        81
-rw-r--r--  mm/slab.c         10
-rw-r--r--  mm/swap.c          2
-rw-r--r--  mm/vmscan.c      106
11 files changed, 306 insertions, 121 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67f29516662a..508707704d2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 
 	INIT_LIST_HEAD(&page->lru);
-	page[1].mapping = NULL;
+	page[1].lru.next = NULL;	/* reset dtor */
 
 	spin_lock(&hugetlb_lock);
 	enqueue_huge_page(page);
@@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_count(page, 1);
-	page[1].mapping = (void *)free_huge_page;
+	page[1].lru.next = (void *)free_huge_page;	/* set dtor */
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299a..af3d573b0141 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	struct mm_struct * mm = vma->vm_mm;
 	int error = 0;
 	pgoff_t pgoff;
-	int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+	int new_flags = vma->vm_flags;
 
 	switch (behavior) {
+	case MADV_NORMAL:
+		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+		break;
 	case MADV_SEQUENTIAL:
-		new_flags |= VM_SEQ_READ;
+		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
 		break;
 	case MADV_RANDOM:
-		new_flags |= VM_RAND_READ;
+		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
 		break;
-	default:
+	case MADV_DONTFORK:
+		new_flags |= VM_DONTCOPY;
+		break;
+	case MADV_DOFORK:
+		new_flags &= ~VM_DONTCOPY;
 		break;
 	}
 
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	long error;
 
 	switch (behavior) {
+	case MADV_DOFORK:
+		if (vma->vm_flags & VM_IO) {
+			error = -EINVAL;
+			break;
+		}
+	case MADV_DONTFORK:
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
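
The new MADV_DONTFORK/MADV_DOFORK advice is visible to userspace through madvise(2). A minimal sketch of how it might be exercised (assumes a libc whose <sys/mman.h> already defines the two constants; the buffer size and usage are illustrative only):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    size_t len = 4 * 4096;

    /* Anonymous buffer standing in for something like a DMA target. */
    char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(buf, 0, len);

    /* VM_DONTCOPY: keep this range out of any child created by fork(). */
    if (madvise(buf, len, MADV_DONTFORK) != 0)
        perror("madvise(MADV_DONTFORK)");

    if (fork() == 0)
        _exit(0);        /* in the child the range is simply absent */
    wait(NULL);

    /* Make the range inheritable again. */
    if (madvise(buf, len, MADV_DOFORK) != 0)
        perror("madvise(MADV_DOFORK)");

    munmap(buf, len);
    return 0;
}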
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8a..9abc6008544b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
 
+int randomize_va_space __read_mostly = 1;
+
+static int __init disable_randmaps(char *s)
+{
+	randomize_va_space = 0;
+	return 0;
+}
+__setup("norandmaps", disable_randmaps);
+
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none.  Usually (but
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b75..880831bd3003 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
+
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	int num, max, nd, k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes)
-		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+	/* First put in the highest zones from all nodes, then all the next
+	   lower zones etc. Avoid empty zones because the memory allocator
+	   doesn't like them. If you implement node hot removal you
+	   have to fix that. */
+	for (k = policy_zone; k >= 0; k--) {
+		for_each_node_mask(nd, *nodes) {
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				zl->zones[num++] = z;
+		}
+	}
 	zl->zones[num] = NULL;
 	return zl;
 }
@@ -577,7 +587,7 @@ redo:
 		}
 		list_add(&page->lru, &newlist);
 		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE);
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
 			break;
 	}
 	err = migrate_pages(pagelist, &newlist, &moved, &failed);
@@ -798,6 +808,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
+	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+		return -EINVAL;
 
 	nlongs = BITS_TO_LONGS(maxnode);
 	if ((maxnode % BITS_PER_LONG) == 0)
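
For context, the nodemask that get_nodes() copies in (and that bind_zonelist() later turns into a zonelist) comes from the set_mempolicy(2)/mbind(2) system calls. A rough userspace sketch, assuming the libnuma header <numaif.h> and -lnuma are available and that node 0 is online:

#include <stdio.h>
#include <numaif.h>          /* set_mempolicy(), MPOL_* (libnuma, link with -lnuma) */

int main(void)
{
    /* One bit per node; maxnode tells the kernel how many bits to look at. */
    unsigned long nodemask = 1UL << 0;               /* node 0 only (illustrative) */
    unsigned long maxnode = 8 * sizeof(nodemask);

    /* Restrict this task's page allocations to the nodes in the mask. */
    if (set_mempolicy(MPOL_BIND, &nodemask, maxnode) != 0) {
        perror("set_mempolicy(MPOL_BIND)");
        return 1;
    }

    /* ... allocate and touch memory here; it now comes from node 0 ... */

    /* Back to the default local-allocation policy. */
    if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
        perror("set_mempolicy(MPOL_DEFAULT)");
        return 1;
    }
    return 0;
}

Passing a wildly oversized maxnode is what the new PAGE_SIZE*BITS_PER_BYTE bound in get_nodes() rejects with -EINVAL.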
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d68232..99d21020ec9d 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc);
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(vmalloc_to_page);
 EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
 
 /*
  * Handle all mappings that got truncated by a "truncate()"
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b05ab8f2a562..8123fad5a485 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 
 	/*
 	 * Processes which fork a lot of child processes are likely
-	 * a good choice. We add the vmsize of the children if they
+	 * a good choice. We add half the vmsize of the children if they
 	 * have an own mm. This prevents forking servers to flood the
-	 * machine with an endless amount of children
+	 * machine with an endless amount of children. In case a single
+	 * child is eating the vast majority of memory, adding only half
+	 * to the parents will make the child our kill candidate of choice.
 	 */
 	list_for_each(tsk, &p->children) {
 		struct task_struct *chld;
 		chld = list_entry(tsk, struct task_struct, sibling);
 		if (chld->mm != p->mm && chld->mm)
-			points += chld->mm->total_vm;
+			points += chld->mm->total_vm/2 + 1;
 	}
 
 	/*
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 }
 
 /*
+ * Types of limitations to the nodes from which allocations may occur
+ */
+#define CONSTRAINT_NONE 1
+#define CONSTRAINT_MEMORY_POLICY 2
+#define CONSTRAINT_CPUSET 3
+
+/*
+ * Determine the type of allocation constraint.
+ */
+static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NUMA
+	struct zone **z;
+	nodemask_t nodes = node_online_map;
+
+	for (z = zonelist->zones; *z; z++)
+		if (cpuset_zone_allowed(*z, gfp_mask))
+			node_clear((*z)->zone_pgdat->node_id,
+					nodes);
+		else
+			return CONSTRAINT_CPUSET;
+
+	if (!nodes_empty(nodes))
+		return CONSTRAINT_MEMORY_POLICY;
+#endif
+
+	return CONSTRAINT_NONE;
+}
+
+/*
  * Simple selection loop. We chose the process with the highest
  * number of 'points'. We expect the caller will lock the tasklist.
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct * select_bad_process(void)
+static struct task_struct *select_bad_process(unsigned long *ppoints)
 {
-	unsigned long maxpoints = 0;
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	struct timespec uptime;
+	*ppoints = 0;
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p) {
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void)
 			return p;
 
 		points = badness(p, uptime.tv_sec);
-		if (points > maxpoints || !chosen) {
+		if (points > *ppoints || !chosen) {
 			chosen = p;
-			maxpoints = points;
+			*ppoints = points;
 		}
 	} while_each_thread(g, p);
 	return chosen;
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void)
  * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
  * we select a process with CAP_SYS_RAW_IO set).
  */
-static void __oom_kill_task(task_t *p)
+static void __oom_kill_task(task_t *p, const char *message)
 {
 	if (p->pid == 1) {
 		WARN_ON(1);
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p)
 		return;
 	}
 	task_unlock(p);
-	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
-			p->pid, p->comm);
+	printk(KERN_ERR "%s: Killed process %d (%s).\n",
+			message, p->pid, p->comm);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p)
 	force_sig(SIGKILL, p);
 }
 
-static struct mm_struct *oom_kill_task(task_t *p)
+static struct mm_struct *oom_kill_task(task_t *p, const char *message)
 {
 	struct mm_struct *mm = get_task_mm(p);
 	task_t * g, * q;
@@ -224,35 +256,38 @@ static struct mm_struct *oom_kill_task(task_t *p)
 		return NULL;
 	}
 
-	__oom_kill_task(p);
+	__oom_kill_task(p, message);
 	/*
 	 * kill all processes that share the ->mm (i.e. all threads),
 	 * but are in a different thread group
 	 */
 	do_each_thread(g, q)
 		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
+			__oom_kill_task(q, message);
 	while_each_thread(g, q);
 
 	return mm;
 }
 
-static struct mm_struct *oom_kill_process(struct task_struct *p)
+static struct mm_struct *oom_kill_process(struct task_struct *p,
+				unsigned long points, const char *message)
 {
 	struct mm_struct *mm;
 	struct task_struct *c;
 	struct list_head *tsk;
 
+	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
+		"children.\n", p->pid, p->comm, points);
 	/* Try to kill a child first */
 	list_for_each(tsk, &p->children) {
 		c = list_entry(tsk, struct task_struct, sibling);
 		if (c->mm == p->mm)
 			continue;
-		mm = oom_kill_task(c);
+		mm = oom_kill_task(c, message);
 		if (mm)
 			return mm;
 	}
-	return oom_kill_task(p);
+	return oom_kill_task(p, message);
 }
 
 /**
@@ -263,10 +298,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
-	task_t * p;
+	task_t *p;
+	unsigned long points;
 
 	if (printk_ratelimit()) {
 		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -277,25 +313,48 @@ void out_of_memory(gfp_t gfp_mask, int order)
 
 	cpuset_lock();
 	read_lock(&tasklist_lock);
+
+	/*
+	 * Check if there were limitations on the allocation (only relevant for
+	 * NUMA) that may require different handling.
+	 */
+	switch (constrained_alloc(zonelist, gfp_mask)) {
+	case CONSTRAINT_MEMORY_POLICY:
+		mm = oom_kill_process(current, points,
+				"No available memory (MPOL_BIND)");
+		break;
+
+	case CONSTRAINT_CPUSET:
+		mm = oom_kill_process(current, points,
+				"No available memory in cpuset");
+		break;
+
+	case CONSTRAINT_NONE:
 retry:
-	p = select_bad_process();
+	/*
+	 * Rambo mode: Shoot down a process and hope it solves whatever
+	 * issues we may have.
+	 */
+	p = select_bad_process(&points);
 
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
 		cpuset_unlock();
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	mm = oom_kill_process(p);
+	mm = oom_kill_process(p, points, "Out of memory");
 	if (!mm)
 		goto retry;
+
+	break;
+	}
 
- out:
-	read_unlock(&tasklist_lock);
+out:
 	cpuset_unlock();
 	if (mm)
 		mmput(mm);
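
The classification that constrained_alloc() performs can be mocked up outside the kernel to make the ordering clear: a zone disallowed by the cpuset wins immediately (CONSTRAINT_CPUSET); otherwise, if some online node never appeared in the zonelist, the caller's memory policy must have excluded it (CONSTRAINT_MEMORY_POLICY); otherwise the allocation was unconstrained. A toy sketch in plain C -- the struct and the bitmask of online nodes are invented for this illustration:

#include <stdio.h>

enum constraint { CONSTRAINT_NONE = 1, CONSTRAINT_MEMORY_POLICY, CONSTRAINT_CPUSET };

/* Stand-in for a zone: which node it lives on and whether the cpuset allows it. */
struct toy_zone { int nid; int cpuset_allows; };

/* zones[] ends with nid == -1; online_mask has one bit per online node. */
static enum constraint classify(const struct toy_zone *z, unsigned online_mask)
{
    unsigned unreached = online_mask;

    for (; z->nid >= 0; z++) {
        if (!z->cpuset_allows)
            return CONSTRAINT_CPUSET;          /* cpuset is the limiting factor */
        unreached &= ~(1u << z->nid);          /* this node's memory was offered */
    }
    if (unreached)                             /* an online node was never offered */
        return CONSTRAINT_MEMORY_POLICY;
    return CONSTRAINT_NONE;
}

int main(void)
{
    struct toy_zone only_node0[] = { { 0, 1 }, { -1, 0 } };
    struct toy_zone both_nodes[] = { { 0, 1 }, { 1, 1 }, { -1, 0 } };

    /* Two online nodes (bits 0 and 1), zonelist restricted to node 0: policy. */
    printf("%d\n", classify(only_node0, 0x3));   /* prints 2 (MEMORY_POLICY) */
    /* Everything online is reachable and allowed: unconstrained. */
    printf("%d\n", classify(both_nodes, 0x3));   /* prints 1 (NONE) */
    return 0;
}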
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde04ff4be31..791690d7d3fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
 int percpu_pagelist_fraction;
 
 static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
 
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
  * All pages have PG_compound set.  All pages have their ->private pointing at
  * the head page (even the head page has this).
  *
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present.  This usage means that zero-order pages
- * may not be compound.
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function.  Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
  */
+
+static void free_compound_page(struct page *page)
+{
+	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
+
 static void prep_compound_page(struct page *page, unsigned long order)
 {
 	int i;
 	int nr_pages = 1 << order;
 
-	page[1].mapping = NULL;
-	page[1].index = order;
+	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
+	page[1].lru.prev = (void *)order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	if (unlikely(page[1].index != order))
+	if (unlikely((unsigned long)page[1].lru.prev != order))
 		bad_page(page);
 
 	for (i = 0; i < nr_pages; i++) {
@@ -1011,7 +1015,7 @@ rebalance:
 		if (page)
 			goto got_pg;
 
-		out_of_memory(gfp_mask, order);
+		out_of_memory(zonelist, gfp_mask, order);
 		goto restart;
 	}
 
@@ -1537,29 +1541,29 @@ static int __initdata node_load[MAX_NUMNODES];
  */
 static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
 {
-	int i, n, val;
+	int n, val;
 	int min_val = INT_MAX;
 	int best_node = -1;
 
-	for_each_online_node(i) {
-		cpumask_t tmp;
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
 
-		/* Start from local node */
-		n = (node+i) % num_online_nodes();
+	for_each_online_node(n) {
+		cpumask_t tmp;
 
 		/* Don't want a node to appear more than once */
 		if (node_isset(n, *used_node_mask))
 			continue;
 
-		/* Use the local node if we haven't already */
-		if (!node_isset(node, *used_node_mask)) {
-			best_node = node;
-			break;
-		}
-
 		/* Use the distance array to find the distance */
 		val = node_distance(node, n);
 
+		/* Penalize nodes under us ("prefer the next node") */
+		val += (n < node);
+
 		/* Give preference to headless and unused nodes */
 		tmp = node_to_cpumask(n);
 		if (!cpus_empty(tmp))
diff --git a/mm/shmem.c b/mm/shmem.c
index f7ac7b812f92..7c455fbaff7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
 #include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
+#include <linux/ctype.h>
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 #include <asm/pgtable.h>
@@ -874,6 +875,51 @@ redirty:
 }
 
 #ifdef CONFIG_NUMA
+static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+	char *nodelist = strchr(value, ':');
+	int err = 1;
+
+	if (nodelist) {
+		/* NUL-terminate policy string */
+		*nodelist++ = '\0';
+		if (nodelist_parse(nodelist, *policy_nodes))
+			goto out;
+	}
+	if (!strcmp(value, "default")) {
+		*policy = MPOL_DEFAULT;
+		/* Don't allow a nodelist */
+		if (!nodelist)
+			err = 0;
+	} else if (!strcmp(value, "prefer")) {
+		*policy = MPOL_PREFERRED;
+		/* Insist on a nodelist of one node only */
+		if (nodelist) {
+			char *rest = nodelist;
+			while (isdigit(*rest))
+				rest++;
+			if (!*rest)
+				err = 0;
+		}
+	} else if (!strcmp(value, "bind")) {
+		*policy = MPOL_BIND;
+		/* Insist on a nodelist */
+		if (nodelist)
+			err = 0;
+	} else if (!strcmp(value, "interleave")) {
+		*policy = MPOL_INTERLEAVE;
+		/* Default to nodes online if no nodelist */
+		if (!nodelist)
+			*policy_nodes = node_online_map;
+		err = 0;
+	}
+out:
+	/* Restore string for error message */
+	if (nodelist)
+		*--nodelist = ':';
+	return err;
+}
+
 static struct page *shmem_swapin_async(struct shared_policy *p,
 			swp_entry_t entry, unsigned long idx)
 {
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
 	return page;
 }
 #else
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+	return 1;
+}
+
 static inline struct page *
 shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
 {
@@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 {
 	char *this_char, *value, *rest;
 
-	while ((this_char = strsep(&options, ",")) != NULL) {
+	while (options != NULL) {
+		this_char = options;
+		for (;;) {
+			/*
+			 * NUL-terminate this option: unfortunately,
+			 * mount options form a comma-separated list,
+			 * but mpol's nodelist may also contain commas.
+			 */
+			options = strchr(options, ',');
+			if (options == NULL)
+				break;
+			options++;
+			if (!isdigit(*options)) {
+				options[-1] = '\0';
+				break;
+			}
+		}
 		if (!*this_char)
 			continue;
 		if ((value = strchr(this_char,'=')) != NULL) {
@@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 			if (*rest)
 				goto bad_val;
 		} else if (!strcmp(this_char,"mpol")) {
-			if (!strcmp(value,"default"))
-				*policy = MPOL_DEFAULT;
-			else if (!strcmp(value,"preferred"))
-				*policy = MPOL_PREFERRED;
-			else if (!strcmp(value,"bind"))
-				*policy = MPOL_BIND;
-			else if (!strcmp(value,"interleave"))
-				*policy = MPOL_INTERLEAVE;
-			else
+			if (shmem_parse_mpol(value,policy,policy_nodes))
 				goto bad_val;
-		} else if (!strcmp(this_char,"mpol_nodelist")) {
-			nodelist_parse(value, *policy_nodes);
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
 			this_char);
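
The option that shmem_parse_mpol() handles has the form mpol=<policy>[:<nodelist>], with policy one of default, prefer, bind or interleave, which is exactly why the option splitter above must not cut a nodelist such as 0,2 at its comma. A hedged example of setting it from C via mount(2) (needs root, a CONFIG_NUMA kernel, and an existing mount point; the path and node numbers are illustrative):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    /*
     * "size=64m" is an ordinary tmpfs option; "mpol=interleave:0-1" asks
     * shmem to interleave its page allocations across nodes 0 and 1.
     * "mpol=prefer:0" or "mpol=bind:0,2" would select the other policies.
     */
    const char *opts = "size=64m,mpol=interleave:0-1";

    if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0, opts) != 0) {
        perror("mount(tmpfs)");
        return 1;
    }
    return 0;
}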
diff --git a/mm/slab.c b/mm/slab.c
index d66c2b0d9715..add05d808a4a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1717,6 +1717,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG();
 	}
 
+	/*
+	 * Prevent CPUs from coming and going.
+	 * lock_cpu_hotplug() nests outside cache_chain_mutex
+	 */
+	lock_cpu_hotplug();
+
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
@@ -1918,8 +1924,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->dtor = dtor;
 	cachep->name = name;
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
 
 	if (g_cpucache_up == FULL) {
 		enable_cpucache(cachep);
@@ -1978,12 +1982,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-	unlock_cpu_hotplug();
 oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 				name);
 	mutex_unlock(&cache_chain_mutex);
+	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
diff --git a/mm/swap.c b/mm/swap.c
index 76247424dea1..cce3dda59c59 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -40,7 +40,7 @@ static void put_compound_page(struct page *page)
 	if (put_page_testzero(page)) {
 		void (*dtor)(struct page *page);
 
-		dtor = (void (*)(struct page *))page[1].mapping;
+		dtor = (void (*)(struct page *))page[1].lru.next;
 		(*dtor)(page);
 	}
 }
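
Taken together with the mm/hugetlb.c and mm/page_alloc.c hunks above, this is the whole of the new convention: the first tail page's lru.next holds the destructor that put_compound_page() invokes on the last put, and lru.prev holds the allocation order. A minimal userspace analogy of the same trick -- reusing spare pointer fields of the second element as a destructor slot -- with a made-up struct for illustration:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for struct page: two spare pointers (like list_head lru) plus a refcount. */
struct toy_page {
    void *lru_next;                 /* on tail page 1: the destructor */
    void *lru_prev;                 /* on tail page 1: the allocation order */
    int refcount;
};

typedef void (*toy_dtor)(struct toy_page *head);

static void free_compound(struct toy_page *head)
{
    unsigned long order = (unsigned long)head[1].lru_prev;
    printf("freeing compound 'page' of order %lu\n", order);
    free(head);
}

/* order must be >= 1: zero-order pages may not be compound (there is no tail page 1). */
static struct toy_page *alloc_compound(unsigned long order)
{
    struct toy_page *head = calloc(1UL << order, sizeof(*head));
    head->refcount = 1;
    head[1].lru_next = (void *)free_compound;   /* "set dtor" */
    head[1].lru_prev = (void *)order;           /* remember the order */
    return head;
}

static void put_compound(struct toy_page *head)
{
    if (--head->refcount == 0) {
        toy_dtor dtor = (toy_dtor)head[1].lru_next;   /* as in put_compound_page() */
        dtor(head);
    }
}

int main(void)
{
    struct toy_page *p = alloc_compound(2);     /* four "pages" */
    put_compound(p);                            /* last reference -> free_compound() */
    return 0;
}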
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd06..1838c15ca4fd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		BUG_ON(PageActive(page));
 
 		sc->nr_scanned++;
+
+		if (!sc->may_swap && page_mapped(page))
+			goto keep_locked;
+
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 
 	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
 			goto unlock_retry;
 
 	if (PageDirty(page)) {
@@ -839,7 +843,7 @@ EXPORT_SYMBOL(migrate_page);
  * pages are swapped out.
  *
  * The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because to has become empty
  * or no retryable pages exist anymore.
  *
  * Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +932,21 @@ redo:
 		goto unlock_both;
 
 	if (mapping->a_ops->migratepage) {
+		/*
+		 * Most pages have a mapping and most filesystems
+		 * should provide a migration function. Anonymous
+		 * pages are part of swap space which also has its
+		 * own migration function. This is the most common
+		 * path for page migration.
+		 */
 		rc = mapping->a_ops->migratepage(newpage, page);
 		goto unlock_both;
 	}
 
 	/*
-	 * Trigger writeout if page is dirty
+	 * Default handling if a filesystem does not provide
+	 * a migration function. We can only migrate clean
+	 * pages so try to write out any dirty pages first.
 	 */
 	if (PageDirty(page)) {
 		switch (pageout(page, mapping)) {
@@ -949,9 +962,10 @@ redo:
 			; /* try to migrate the page below */
 		}
 	}
+
 	/*
-	 * If we have no buffer or can release the buffer
-	 * then do a simple migration.
+	 * Buffers are managed in a filesystem specific way.
+	 * We must have no buffers or drop them.
 	 */
 	if (!page_has_buffers(page) ||
 			try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +980,11 @@ redo:
 	 * swap them out.
 	 */
 	if (pass > 4) {
+		/*
+		 * Persistently unable to drop buffers..... As a
+		 * measure of last resort we fall back to
+		 * swap_page().
+		 */
 		unlock_page(newpage);
 		newpage = NULL;
 		rc = swap_page(page);
@@ -1176,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	struct page *page;
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
+
+	if (unlikely(sc->may_swap)) {
+		long mapped_ratio;
+		long distress;
+		long swap_tendency;
+
+		/*
+		 * `distress' is a measure of how much trouble we're having
+		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+		 */
+		distress = 100 >> zone->prev_priority;
+
+		/*
+		 * The point of this algorithm is to decide when to start
+		 * reclaiming mapped memory instead of just pagecache. Work out
+		 * how much memory
+		 * is mapped.
+		 */
+		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+		/*
+		 * Now decide how much we really want to unmap some pages. The
+		 * mapped ratio is downgraded - just because there's a lot of
+		 * mapped memory doesn't necessarily mean that page reclaim
+		 * isn't succeeding.
+		 *
+		 * The distress ratio is important - we don't want to start
+		 * going oom.
+		 *
+		 * A 100% value of vm_swappiness overrides this algorithm
+		 * altogether.
+		 */
+		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+		/*
+		 * Now use this metric to decide whether to start moving mapped
+		 * memory onto the inactive list.
+		 */
+		if (swap_tendency >= 100)
+			reclaim_mapped = 1;
+	}
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 
-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache.  Work out how much memory
-	 * is mapped.
-	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
-	/*
-	 * Now decide how much we really want to unmap some pages.  The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1595,9 +1621,7 @@ scan:
 			sc.nr_reclaimed = 0;
 			sc.priority = priority;
 			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-			atomic_inc(&zone->reclaim_in_progress);
 			shrink_zone(zone, &sc);
-			atomic_dec(&zone->reclaim_in_progress);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
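
As a quick worked example of the heuristic that now runs only when sc->may_swap is set: with the default vm_swappiness of 60, a zone at prev_priority 6 gives distress = 100 >> 6 = 1, and 70% of memory mapped gives mapped_ratio = 70, so swap_tendency = 70/2 + 1 + 60 = 96 and mapped pages are still left alone; raise vm_swappiness to 70, or let distress climb as reclaim keeps struggling, and the threshold of 100 is crossed and reclaim_mapped is set.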