Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile          |    3
-rw-r--r--  mm/allocpercpu.c     |    2
-rw-r--r--  mm/bootmem.c         |   27
-rw-r--r--  mm/dmapool.c         |  500
-rw-r--r--  mm/fadvise.c         |   16
-rw-r--r--  mm/filemap.c         |   27
-rw-r--r--  mm/filemap_xip.c     |    2
-rw-r--r--  mm/fremap.c          |    5
-rw-r--r--  mm/highmem.c         |    4
-rw-r--r--  mm/hugetlb.c         |    2
-rw-r--r--  mm/internal.h        |    4
-rw-r--r--  mm/memcontrol.c      | 1192
-rw-r--r--  mm/memory.c          |  256
-rw-r--r--  mm/memory_hotplug.c  |    6
-rw-r--r--  mm/migrate.c         |   54
-rw-r--r--  mm/mmap.c            |   10
-rw-r--r--  mm/nommu.c           |   53
-rw-r--r--  mm/oom_kill.c        |   90
-rw-r--r--  mm/page-writeback.c  |   24
-rw-r--r--  mm/page_alloc.c      |  162
-rw-r--r--  mm/page_io.c         |    2
-rw-r--r--  mm/pagewalk.c        |  131
-rw-r--r--  mm/rmap.c            |   53
-rw-r--r--  mm/shmem.c           |  517
-rw-r--r--  mm/slob.c            |   51
-rw-r--r--  mm/slub.c            |  182
-rw-r--r--  mm/sparse.c          |   12
-rw-r--r--  mm/swap.c            |   12
-rw-r--r--  mm/swap_state.c      |  153
-rw-r--r--  mm/swapfile.c        |  150
-rw-r--r--  mm/tiny-shmem.c      |   12
-rw-r--r--  mm/truncate.c        |   10
-rw-r--r--  mm/vmalloc.c         |   74
-rw-r--r--  mm/vmscan.c          |  495
-rw-r--r--  mm/vmstat.c          |   61
35 files changed, 3308 insertions, 1046 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 5c0b0ea7572d..9f117bab5322 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,8 +13,10 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 		   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 		   page_isolation.o $(mmu-y)
 
+obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA)	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
@@ -30,4 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
 
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 00b02623f008..7e58322b7134 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask);
  */
 void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
 {
-	void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
+	void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp);
 	void *__pdata = __percpu_disguise(pdata);
 
 	if (unlikely(!pdata))
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 00a96970b237..f6ff4337b424 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -111,11 +111,12 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
  * might be used for boot-time allocations - or it might get added
  * to the free page pool later on.
  */
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
-					unsigned long size)
+static int __init reserve_bootmem_core(bootmem_data_t *bdata,
+			unsigned long addr, unsigned long size, int flags)
 {
 	unsigned long sidx, eidx;
 	unsigned long i;
+	int ret;
 
 	/*
 	 * round up, partially reserved pages are considered
@@ -133,7 +134,20 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
 #ifdef CONFIG_DEBUG_BOOTMEM
 		printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
 #endif
+		if (flags & BOOTMEM_EXCLUSIVE) {
+			ret = -EBUSY;
+			goto err;
+		}
 	}
+
+	return 0;
+
+err:
+	/* unreserve memory we accidentally reserved */
+	for (i--; i >= sidx; i--)
+		clear_bit(i, bdata->node_bootmem_map);
+
+	return ret;
 }
 
 static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
@@ -374,9 +388,9 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
 }
 
 void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
-				 unsigned long size)
+				 unsigned long size, int flags)
 {
-	reserve_bootmem_core(pgdat->bdata, physaddr, size);
+	reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
 }
 
 void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
@@ -398,9 +412,10 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
 }
 
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-void __init reserve_bootmem(unsigned long addr, unsigned long size)
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+			    int flags)
 {
-	reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+	return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags);
 }
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
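A minimal caller sketch for the reworked interface above: reserve_bootmem() now takes a flags argument and returns an error instead of void, so a caller passing BOOTMEM_EXCLUSIVE can detect that part of the range was already reserved. The base and size variables below are placeholders, not taken from this patch:

	if (reserve_bootmem(base, size, BOOTMEM_EXCLUSIVE) < 0) {
		/* some page in [base, base + size) was already reserved (-EBUSY) */
		printk(KERN_INFO "exclusive bootmem reservation failed\n");
		return;
	}
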
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 000000000000..34aaac451a96
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,500 @@
1/*
2 * DMA Pool allocator
3 *
4 * Copyright 2001 David Brownell
5 * Copyright 2007 Intel Corporation
6 * Author: Matthew Wilcox <willy@linux.intel.com>
7 *
8 * This software may be redistributed and/or modified under the terms of
9 * the GNU General Public License ("GPL") version 2 as published by the
10 * Free Software Foundation.
11 *
12 * This allocator returns small blocks of a given size which are DMA-able by
13 * the given device. It uses the dma_alloc_coherent page allocator to get
14 * new pages, then splits them up into blocks of the required size.
15 * Many older drivers still have their own code to do this.
16 *
17 * The current design of this allocator is fairly simple. The pool is
18 * represented by the 'struct dma_pool' which keeps a doubly-linked list of
19 * allocated pages. Each page in the page_list is split into blocks of at
20 * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
21 * list of free blocks within the page. Used blocks aren't tracked, but we
22 * keep a count of how many are currently allocated from each page.
23 */
24
25#include <linux/device.h>
26#include <linux/dma-mapping.h>
27#include <linux/dmapool.h>
28#include <linux/kernel.h>
29#include <linux/list.h>
30#include <linux/module.h>
31#include <linux/mutex.h>
32#include <linux/poison.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/spinlock.h>
36#include <linux/string.h>
37#include <linux/types.h>
38#include <linux/wait.h>
39
40struct dma_pool { /* the pool */
41 struct list_head page_list;
42 spinlock_t lock;
43 size_t size;
44 struct device *dev;
45 size_t allocation;
46 size_t boundary;
47 char name[32];
48 wait_queue_head_t waitq;
49 struct list_head pools;
50};
51
52struct dma_page { /* cacheable header for 'allocation' bytes */
53 struct list_head page_list;
54 void *vaddr;
55 dma_addr_t dma;
56 unsigned int in_use;
57 unsigned int offset;
58};
59
60#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
61
62static DEFINE_MUTEX(pools_lock);
63
64static ssize_t
65show_pools(struct device *dev, struct device_attribute *attr, char *buf)
66{
67 unsigned temp;
68 unsigned size;
69 char *next;
70 struct dma_page *page;
71 struct dma_pool *pool;
72
73 next = buf;
74 size = PAGE_SIZE;
75
76 temp = scnprintf(next, size, "poolinfo - 0.1\n");
77 size -= temp;
78 next += temp;
79
80 mutex_lock(&pools_lock);
81 list_for_each_entry(pool, &dev->dma_pools, pools) {
82 unsigned pages = 0;
83 unsigned blocks = 0;
84
85 list_for_each_entry(page, &pool->page_list, page_list) {
86 pages++;
87 blocks += page->in_use;
88 }
89
90 /* per-pool info, no real statistics yet */
91 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
92 pool->name, blocks,
93 pages * (pool->allocation / pool->size),
94 pool->size, pages);
95 size -= temp;
96 next += temp;
97 }
98 mutex_unlock(&pools_lock);
99
100 return PAGE_SIZE - size;
101}
102
103static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
104
105/**
106 * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
107 * @name: name of pool, for diagnostics
108 * @dev: device that will be doing the DMA
109 * @size: size of the blocks in this pool.
110 * @align: alignment requirement for blocks; must be a power of two
111 * @boundary: returned blocks won't cross this power of two boundary
112 * Context: !in_interrupt()
113 *
114 * Returns a dma allocation pool with the requested characteristics, or
115 * null if one can't be created. Given one of these pools, dma_pool_alloc()
116 * may be used to allocate memory. Such memory will all have "consistent"
117 * DMA mappings, accessible by the device and its driver without using
118 * cache flushing primitives. The actual size of blocks allocated may be
119 * larger than requested because of alignment.
120 *
121 * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
122 * cross that size boundary. This is useful for devices which have
123 * addressing restrictions on individual DMA transfers, such as not crossing
124 * boundaries of 4KBytes.
125 */
126struct dma_pool *dma_pool_create(const char *name, struct device *dev,
127 size_t size, size_t align, size_t boundary)
128{
129 struct dma_pool *retval;
130 size_t allocation;
131
132 if (align == 0) {
133 align = 1;
134 } else if (align & (align - 1)) {
135 return NULL;
136 }
137
138 if (size == 0) {
139 return NULL;
140 } else if (size < 4) {
141 size = 4;
142 }
143
144 if ((size % align) != 0)
145 size = ALIGN(size, align);
146
147 allocation = max_t(size_t, size, PAGE_SIZE);
148
149 if (!boundary) {
150 boundary = allocation;
151 } else if ((boundary < size) || (boundary & (boundary - 1))) {
152 return NULL;
153 }
154
155 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
156 if (!retval)
157 return retval;
158
159 strlcpy(retval->name, name, sizeof(retval->name));
160
161 retval->dev = dev;
162
163 INIT_LIST_HEAD(&retval->page_list);
164 spin_lock_init(&retval->lock);
165 retval->size = size;
166 retval->boundary = boundary;
167 retval->allocation = allocation;
168 init_waitqueue_head(&retval->waitq);
169
170 if (dev) {
171 int ret;
172
173 mutex_lock(&pools_lock);
174 if (list_empty(&dev->dma_pools))
175 ret = device_create_file(dev, &dev_attr_pools);
176 else
177 ret = 0;
178 /* note: not currently insisting "name" be unique */
179 if (!ret)
180 list_add(&retval->pools, &dev->dma_pools);
181 else {
182 kfree(retval);
183 retval = NULL;
184 }
185 mutex_unlock(&pools_lock);
186 } else
187 INIT_LIST_HEAD(&retval->pools);
188
189 return retval;
190}
191EXPORT_SYMBOL(dma_pool_create);
192
193static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
194{
195 unsigned int offset = 0;
196 unsigned int next_boundary = pool->boundary;
197
198 do {
199 unsigned int next = offset + pool->size;
200 if (unlikely((next + pool->size) >= next_boundary)) {
201 next = next_boundary;
202 next_boundary += pool->boundary;
203 }
204 *(int *)(page->vaddr + offset) = next;
205 offset = next;
206 } while (offset < pool->allocation);
207}
208
209static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
210{
211 struct dma_page *page;
212
213 page = kmalloc(sizeof(*page), mem_flags);
214 if (!page)
215 return NULL;
216 page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
217 &page->dma, mem_flags);
218 if (page->vaddr) {
219#ifdef CONFIG_DEBUG_SLAB
220 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
221#endif
222 pool_initialise_page(pool, page);
223 list_add(&page->page_list, &pool->page_list);
224 page->in_use = 0;
225 page->offset = 0;
226 } else {
227 kfree(page);
228 page = NULL;
229 }
230 return page;
231}
232
233static inline int is_page_busy(struct dma_page *page)
234{
235 return page->in_use != 0;
236}
237
238static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
239{
240 dma_addr_t dma = page->dma;
241
242#ifdef CONFIG_DEBUG_SLAB
243 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
244#endif
245 dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
246 list_del(&page->page_list);
247 kfree(page);
248}
249
250/**
251 * dma_pool_destroy - destroys a pool of dma memory blocks.
252 * @pool: dma pool that will be destroyed
253 * Context: !in_interrupt()
254 *
255 * Caller guarantees that no more memory from the pool is in use,
256 * and that nothing will try to use the pool after this call.
257 */
258void dma_pool_destroy(struct dma_pool *pool)
259{
260 mutex_lock(&pools_lock);
261 list_del(&pool->pools);
262 if (pool->dev && list_empty(&pool->dev->dma_pools))
263 device_remove_file(pool->dev, &dev_attr_pools);
264 mutex_unlock(&pools_lock);
265
266 while (!list_empty(&pool->page_list)) {
267 struct dma_page *page;
268 page = list_entry(pool->page_list.next,
269 struct dma_page, page_list);
270 if (is_page_busy(page)) {
271 if (pool->dev)
272 dev_err(pool->dev,
273 "dma_pool_destroy %s, %p busy\n",
274 pool->name, page->vaddr);
275 else
276 printk(KERN_ERR
277 "dma_pool_destroy %s, %p busy\n",
278 pool->name, page->vaddr);
279 /* leak the still-in-use consistent memory */
280 list_del(&page->page_list);
281 kfree(page);
282 } else
283 pool_free_page(pool, page);
284 }
285
286 kfree(pool);
287}
288EXPORT_SYMBOL(dma_pool_destroy);
289
290/**
291 * dma_pool_alloc - get a block of consistent memory
292 * @pool: dma pool that will produce the block
293 * @mem_flags: GFP_* bitmask
294 * @handle: pointer to dma address of block
295 *
296 * This returns the kernel virtual address of a currently unused block,
297 * and reports its dma address through the handle.
298 * If such a memory block can't be allocated, %NULL is returned.
299 */
300void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
301 dma_addr_t *handle)
302{
303 unsigned long flags;
304 struct dma_page *page;
305 size_t offset;
306 void *retval;
307
308 spin_lock_irqsave(&pool->lock, flags);
309 restart:
310 list_for_each_entry(page, &pool->page_list, page_list) {
311 if (page->offset < pool->allocation)
312 goto ready;
313 }
314 page = pool_alloc_page(pool, GFP_ATOMIC);
315 if (!page) {
316 if (mem_flags & __GFP_WAIT) {
317 DECLARE_WAITQUEUE(wait, current);
318
319 __set_current_state(TASK_INTERRUPTIBLE);
320 __add_wait_queue(&pool->waitq, &wait);
321 spin_unlock_irqrestore(&pool->lock, flags);
322
323 schedule_timeout(POOL_TIMEOUT_JIFFIES);
324
325 spin_lock_irqsave(&pool->lock, flags);
326 __remove_wait_queue(&pool->waitq, &wait);
327 goto restart;
328 }
329 retval = NULL;
330 goto done;
331 }
332
333 ready:
334 page->in_use++;
335 offset = page->offset;
336 page->offset = *(int *)(page->vaddr + offset);
337 retval = offset + page->vaddr;
338 *handle = offset + page->dma;
339#ifdef CONFIG_DEBUG_SLAB
340 memset(retval, POOL_POISON_ALLOCATED, pool->size);
341#endif
342 done:
343 spin_unlock_irqrestore(&pool->lock, flags);
344 return retval;
345}
346EXPORT_SYMBOL(dma_pool_alloc);
347
348static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
349{
350 unsigned long flags;
351 struct dma_page *page;
352
353 spin_lock_irqsave(&pool->lock, flags);
354 list_for_each_entry(page, &pool->page_list, page_list) {
355 if (dma < page->dma)
356 continue;
357 if (dma < (page->dma + pool->allocation))
358 goto done;
359 }
360 page = NULL;
361 done:
362 spin_unlock_irqrestore(&pool->lock, flags);
363 return page;
364}
365
366/**
367 * dma_pool_free - put block back into dma pool
368 * @pool: the dma pool holding the block
369 * @vaddr: virtual address of block
370 * @dma: dma address of block
371 *
372 * Caller promises neither device nor driver will again touch this block
373 * unless it is first re-allocated.
374 */
375void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
376{
377 struct dma_page *page;
378 unsigned long flags;
379 unsigned int offset;
380
381 page = pool_find_page(pool, dma);
382 if (!page) {
383 if (pool->dev)
384 dev_err(pool->dev,
385 "dma_pool_free %s, %p/%lx (bad dma)\n",
386 pool->name, vaddr, (unsigned long)dma);
387 else
388 printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
389 pool->name, vaddr, (unsigned long)dma);
390 return;
391 }
392
393 offset = vaddr - page->vaddr;
394#ifdef CONFIG_DEBUG_SLAB
395 if ((dma - page->dma) != offset) {
396 if (pool->dev)
397 dev_err(pool->dev,
398 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
399 pool->name, vaddr, (unsigned long long)dma);
400 else
401 printk(KERN_ERR
402 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
403 pool->name, vaddr, (unsigned long long)dma);
404 return;
405 }
406 {
407 unsigned int chain = page->offset;
408 while (chain < pool->allocation) {
409 if (chain != offset) {
410 chain = *(int *)(page->vaddr + chain);
411 continue;
412 }
413 if (pool->dev)
414 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
415 "already free\n", pool->name,
416 (unsigned long long)dma);
417 else
418 printk(KERN_ERR "dma_pool_free %s, dma %Lx "
419 "already free\n", pool->name,
420 (unsigned long long)dma);
421 return;
422 }
423 }
424 memset(vaddr, POOL_POISON_FREED, pool->size);
425#endif
426
427 spin_lock_irqsave(&pool->lock, flags);
428 page->in_use--;
429 *(int *)vaddr = page->offset;
430 page->offset = offset;
431 if (waitqueue_active(&pool->waitq))
432 wake_up_locked(&pool->waitq);
433 /*
434 * Resist a temptation to do
435 * if (!is_page_busy(page)) pool_free_page(pool, page);
436 * Better have a few empty pages hang around.
437 */
438 spin_unlock_irqrestore(&pool->lock, flags);
439}
440EXPORT_SYMBOL(dma_pool_free);
441
442/*
443 * Managed DMA pool
444 */
445static void dmam_pool_release(struct device *dev, void *res)
446{
447 struct dma_pool *pool = *(struct dma_pool **)res;
448
449 dma_pool_destroy(pool);
450}
451
452static int dmam_pool_match(struct device *dev, void *res, void *match_data)
453{
454 return *(struct dma_pool **)res == match_data;
455}
456
457/**
458 * dmam_pool_create - Managed dma_pool_create()
459 * @name: name of pool, for diagnostics
460 * @dev: device that will be doing the DMA
461 * @size: size of the blocks in this pool.
462 * @align: alignment requirement for blocks; must be a power of two
463 * @allocation: returned blocks won't cross this boundary (or zero)
464 *
465 * Managed dma_pool_create(). DMA pool created with this function is
466 * automatically destroyed on driver detach.
467 */
468struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
469 size_t size, size_t align, size_t allocation)
470{
471 struct dma_pool **ptr, *pool;
472
473 ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
474 if (!ptr)
475 return NULL;
476
477 pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
478 if (pool)
479 devres_add(dev, ptr);
480 else
481 devres_free(ptr);
482
483 return pool;
484}
485EXPORT_SYMBOL(dmam_pool_create);
486
487/**
488 * dmam_pool_destroy - Managed dma_pool_destroy()
489 * @pool: dma pool that will be destroyed
490 *
491 * Managed dma_pool_destroy().
492 */
493void dmam_pool_destroy(struct dma_pool *pool)
494{
495 struct device *dev = pool->dev;
496
497 dma_pool_destroy(pool);
498 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
499}
500EXPORT_SYMBOL(dmam_pool_destroy);
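
The comments above describe the dma_pool API now provided from mm/dmapool.c. A brief usage sketch against the signatures introduced in this file; the pool name, device pointer and block geometry are illustrative only, not part of the patch:

	struct dma_pool *pool;
	dma_addr_t dma;
	void *vaddr;

	/* 64-byte blocks, 16-byte aligned, never crossing a 4 KiB boundary */
	pool = dma_pool_create("example", &pdev->dev, 64, 16, 4096);
	if (!pool)
		return -ENOMEM;

	vaddr = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (vaddr) {
		/* hand "dma" to the device, use "vaddr" from the CPU */
		dma_pool_free(pool, vaddr, dma);
	}

	dma_pool_destroy(pool);	/* caller must have freed every block first */

Drivers that want the pool torn down automatically on detach can call dmam_pool_create() with the same argument list instead.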
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0df4c899e979..3c0f1e99f5e4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,9 +49,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		goto out;
 	}
 
-	if (mapping->a_ops->get_xip_page)
-		/* no bad return value, but ignore advice */
+	if (mapping->a_ops->get_xip_page) {
+		switch (advice) {
+		case POSIX_FADV_NORMAL:
+		case POSIX_FADV_RANDOM:
+		case POSIX_FADV_SEQUENTIAL:
+		case POSIX_FADV_WILLNEED:
+		case POSIX_FADV_NOREUSE:
+		case POSIX_FADV_DONTNEED:
+			/* no bad return value, but ignore advice */
+			break;
+		default:
+			ret = -EINVAL;
+		}
 		goto out;
+	}
 
 	/* Careful about overflows. Len == 0 means "as much as possible" */
 	endbyte = offset + len;
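From user space, the visible effect of the hunk above is that an fadvise call on an XIP-backed file now rejects unknown advice values instead of silently accepting them; the recognised POSIX_FADV_* values are still accepted (and ignored). A hedged illustration, assuming fd refers to such a file and <fcntl.h> with _XOPEN_SOURCE >= 600 is available:

	int ok  = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED); /* returns 0, advice ignored */
	int bad = posix_fadvise(fd, 0, 0, 42);                  /* bogus advice: now returns EINVAL */
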
diff --git a/mm/filemap.c b/mm/filemap.c
index 76bea88cbebc..5357fcc4643b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -65,7 +66,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
  *  ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *    ->swap_lock		(exclusive_swap_page, others)
  *      ->mapping->tree_lock
- *        ->zone.lock
  *
  *  ->i_mutex
  *    ->i_mmap_lock		(truncate->unmap_mapping_range)
@@ -119,6 +119,7 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	mem_cgroup_uncharge_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -459,8 +460,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 		pgoff_t offset, gfp_t gfp_mask)
 {
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	int error = mem_cgroup_cache_charge(page, current->mm,
+					gfp_mask & ~__GFP_HIGHMEM);
+	if (error)
+		goto out;
 
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
@@ -471,10 +476,14 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 			page->index = offset;
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
+		} else
+			mem_cgroup_uncharge_page(page);
+
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
-	}
+	} else
+		mem_cgroup_uncharge_page(page);
+out:
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
@@ -528,7 +537,7 @@ static inline void wake_up_page(struct page *page, int bit)
 	__wake_up_bit(page_waitqueue(page), &page->flags, bit);
 }
 
-void fastcall wait_on_page_bit(struct page *page, int bit_nr)
+void wait_on_page_bit(struct page *page, int bit_nr)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
@@ -552,7 +561,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * the clear_bit and the read of the waitqueue (to avoid SMP races with a
  * parallel wait_on_page_locked()).
  */
-void fastcall unlock_page(struct page *page)
+void unlock_page(struct page *page)
 {
 	smp_mb__before_clear_bit();
 	if (!TestClearPageLocked(page))
@@ -586,7 +595,7 @@ EXPORT_SYMBOL(end_page_writeback);
  * chances are that on the second loop, the block layer's plug list is empty,
  * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
-void fastcall __lock_page(struct page *page)
+void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
@@ -607,7 +616,7 @@ int fastcall __lock_page_killable(struct page *page)
  * Variant of lock_page that does not require the caller to hold a reference
  * on the page's mapping.
  */
-void fastcall __lock_page_nosync(struct page *page)
+void __lock_page_nosync(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
@@ -1277,7 +1286,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
  * This adds the requested page to the page cache if it isn't already there,
  * and schedules an I/O to read in its contents from disk.
  */
-static int fastcall page_cache_read(struct file * file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset)
 {
 	struct address_space *mapping = file->f_mapping;
 	struct page *page;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index f874ae818ad3..0420a0292b03 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -431,7 +431,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
 	else
 		return PTR_ERR(page);
 	}
-	zero_user_page(page, offset, length, KM_USER0);
+	zero_user(page, offset, length);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index 14bd3bf7826e..69a37c2bdf81 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -190,10 +190,13 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 	 */
 	if (mapping_cap_account_dirty(mapping)) {
 		unsigned long addr;
+		struct file *file = vma->vm_file;
 
 		flags &= MAP_NONBLOCK;
-		addr = mmap_region(vma->vm_file, start, size,
+		get_file(file);
+		addr = mmap_region(file, start, size,
 			flags, vma->vm_flags, pgoff, 1);
+		fput(file);
 		if (IS_ERR_VALUE(addr)) {
 			err = addr;
 		} else {
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a967bc35152..35d47733cde4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -163,7 +163,7 @@ start:
 	return vaddr;
 }
 
-void fastcall *kmap_high(struct page *page)
+void *kmap_high(struct page *page)
 {
 	unsigned long vaddr;
 
@@ -185,7 +185,7 @@ void fastcall *kmap_high(struct page *page)
 
 EXPORT_SYMBOL(kmap_high);
 
-void fastcall kunmap_high(struct page *page)
+void kunmap_high(struct page *page)
 {
 	unsigned long vaddr;
 	unsigned long nr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db861d8b6c28..1a5642074e34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -813,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	spin_unlock(&mm->page_table_lock);
 	copy_huge_page(new_page, old_page, address, vma);
+	__SetPageUptodate(new_page);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -858,6 +859,7 @@ retry:
 		goto out;
 	}
 	clear_huge_page(page, address);
+	__SetPageUptodate(page);
 
 	if (vma->vm_flags & VM_SHARED) {
 		int err;
diff --git a/mm/internal.h b/mm/internal.h
index 953f941ea867..5a9a6200e034 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v)
  */
 static inline void set_page_refcounted(struct page *page)
 {
-	VM_BUG_ON(PageCompound(page) && PageTail(page));
+	VM_BUG_ON(PageTail(page));
 	VM_BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
@@ -34,7 +34,7 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
-extern void fastcall __init __free_pages_bootmem(struct page *page,
+extern void __init __free_pages_bootmem(struct page *page,
 						unsigned int order);
 
 /*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
new file mode 100644
index 000000000000..5c2c702af617
--- /dev/null
+++ b/mm/memcontrol.c
@@ -0,0 +1,1192 @@
1/* memcontrol.c - Memory Controller
2 *
3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 *
6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 */
19
20#include <linux/res_counter.h>
21#include <linux/memcontrol.h>
22#include <linux/cgroup.h>
23#include <linux/mm.h>
24#include <linux/smp.h>
25#include <linux/page-flags.h>
26#include <linux/backing-dev.h>
27#include <linux/bit_spinlock.h>
28#include <linux/rcupdate.h>
29#include <linux/swap.h>
30#include <linux/spinlock.h>
31#include <linux/fs.h>
32#include <linux/seq_file.h>
33
34#include <asm/uaccess.h>
35
36struct cgroup_subsys mem_cgroup_subsys;
37static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
38
39/*
40 * Statistics for memory cgroup.
41 */
42enum mem_cgroup_stat_index {
43 /*
44 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
45 */
46 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
47 MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */
48
49 MEM_CGROUP_STAT_NSTATS,
50};
51
52struct mem_cgroup_stat_cpu {
53 s64 count[MEM_CGROUP_STAT_NSTATS];
54} ____cacheline_aligned_in_smp;
55
56struct mem_cgroup_stat {
57 struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
58};
59
60/*
61 * For accounting under irq disable, no need for increment preempt count.
62 */
63static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
64 enum mem_cgroup_stat_index idx, int val)
65{
66 int cpu = smp_processor_id();
67 stat->cpustat[cpu].count[idx] += val;
68}
69
70static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
71 enum mem_cgroup_stat_index idx)
72{
73 int cpu;
74 s64 ret = 0;
75 for_each_possible_cpu(cpu)
76 ret += stat->cpustat[cpu].count[idx];
77 return ret;
78}
79
80/*
81 * per-zone information in memory controller.
82 */
83
84enum mem_cgroup_zstat_index {
85 MEM_CGROUP_ZSTAT_ACTIVE,
86 MEM_CGROUP_ZSTAT_INACTIVE,
87
88 NR_MEM_CGROUP_ZSTAT,
89};
90
91struct mem_cgroup_per_zone {
92 /*
93 * spin_lock to protect the per cgroup LRU
94 */
95 spinlock_t lru_lock;
96 struct list_head active_list;
97 struct list_head inactive_list;
98 unsigned long count[NR_MEM_CGROUP_ZSTAT];
99};
100/* Macro for accessing counter */
101#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
102
103struct mem_cgroup_per_node {
104 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
105};
106
107struct mem_cgroup_lru_info {
108 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
109};
110
111/*
112 * The memory controller data structure. The memory controller controls both
113 * page cache and RSS per cgroup. We would eventually like to provide
114 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
115 * to help the administrator determine what knobs to tune.
116 *
117 * TODO: Add a water mark for the memory controller. Reclaim will begin when
118 * we hit the water mark. May be even add a low water mark, such that
119 * no reclaim occurs from a cgroup at it's low water mark, this is
120 * a feature that will be implemented much later in the future.
121 */
122struct mem_cgroup {
123 struct cgroup_subsys_state css;
124 /*
125 * the counter to account for memory usage
126 */
127 struct res_counter res;
128 /*
129 * Per cgroup active and inactive list, similar to the
130 * per zone LRU lists.
131 */
132 struct mem_cgroup_lru_info info;
133
134 int prev_priority; /* for recording reclaim priority */
135 /*
136 * statistics.
137 */
138 struct mem_cgroup_stat stat;
139};
140
141/*
142 * We use the lower bit of the page->page_cgroup pointer as a bit spin
143 * lock. We need to ensure that page->page_cgroup is atleast two
144 * byte aligned (based on comments from Nick Piggin)
145 */
146#define PAGE_CGROUP_LOCK_BIT 0x0
147#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
148
149/*
150 * A page_cgroup page is associated with every page descriptor. The
151 * page_cgroup helps us identify information about the cgroup
152 */
153struct page_cgroup {
154 struct list_head lru; /* per cgroup LRU list */
155 struct page *page;
156 struct mem_cgroup *mem_cgroup;
157 atomic_t ref_cnt; /* Helpful when pages move b/w */
158 /* mapped and cached states */
159 int flags;
160};
161#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
162#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
163
164static inline int page_cgroup_nid(struct page_cgroup *pc)
165{
166 return page_to_nid(pc->page);
167}
168
169static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
170{
171 return page_zonenum(pc->page);
172}
173
174enum {
175 MEM_CGROUP_TYPE_UNSPEC = 0,
176 MEM_CGROUP_TYPE_MAPPED,
177 MEM_CGROUP_TYPE_CACHED,
178 MEM_CGROUP_TYPE_ALL,
179 MEM_CGROUP_TYPE_MAX,
180};
181
182enum charge_type {
183 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
184 MEM_CGROUP_CHARGE_TYPE_MAPPED,
185};
186
187
188/*
189 * Always modified under lru lock. Then, not necessary to preempt_disable()
190 */
191static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
192 bool charge)
193{
194 int val = (charge)? 1 : -1;
195 struct mem_cgroup_stat *stat = &mem->stat;
196 VM_BUG_ON(!irqs_disabled());
197
198 if (flags & PAGE_CGROUP_FLAG_CACHE)
199 __mem_cgroup_stat_add_safe(stat,
200 MEM_CGROUP_STAT_CACHE, val);
201 else
202 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
203}
204
205static inline struct mem_cgroup_per_zone *
206mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
207{
208 BUG_ON(!mem->info.nodeinfo[nid]);
209 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
210}
211
212static inline struct mem_cgroup_per_zone *
213page_cgroup_zoneinfo(struct page_cgroup *pc)
214{
215 struct mem_cgroup *mem = pc->mem_cgroup;
216 int nid = page_cgroup_nid(pc);
217 int zid = page_cgroup_zid(pc);
218
219 return mem_cgroup_zoneinfo(mem, nid, zid);
220}
221
222static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
223 enum mem_cgroup_zstat_index idx)
224{
225 int nid, zid;
226 struct mem_cgroup_per_zone *mz;
227 u64 total = 0;
228
229 for_each_online_node(nid)
230 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
231 mz = mem_cgroup_zoneinfo(mem, nid, zid);
232 total += MEM_CGROUP_ZSTAT(mz, idx);
233 }
234 return total;
235}
236
237static struct mem_cgroup init_mem_cgroup;
238
239static inline
240struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
241{
242 return container_of(cgroup_subsys_state(cont,
243 mem_cgroup_subsys_id), struct mem_cgroup,
244 css);
245}
246
247static inline
248struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
249{
250 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
251 struct mem_cgroup, css);
252}
253
254void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
255{
256 struct mem_cgroup *mem;
257
258 mem = mem_cgroup_from_task(p);
259 css_get(&mem->css);
260 mm->mem_cgroup = mem;
261}
262
263void mm_free_cgroup(struct mm_struct *mm)
264{
265 css_put(&mm->mem_cgroup->css);
266}
267
268static inline int page_cgroup_locked(struct page *page)
269{
270 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
271 &page->page_cgroup);
272}
273
274void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
275{
276 int locked;
277
278 /*
279 * While resetting the page_cgroup we might not hold the
280 * page_cgroup lock. free_hot_cold_page() is an example
281 * of such a scenario
282 */
283 if (pc)
284 VM_BUG_ON(!page_cgroup_locked(page));
285 locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
286 page->page_cgroup = ((unsigned long)pc | locked);
287}
288
289struct page_cgroup *page_get_page_cgroup(struct page *page)
290{
291 return (struct page_cgroup *)
292 (page->page_cgroup & ~PAGE_CGROUP_LOCK);
293}
294
295static void __always_inline lock_page_cgroup(struct page *page)
296{
297 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
298 VM_BUG_ON(!page_cgroup_locked(page));
299}
300
301static void __always_inline unlock_page_cgroup(struct page *page)
302{
303 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
304}
305
306/*
307 * Tie new page_cgroup to struct page under lock_page_cgroup()
308 * This can fail if the page has been tied to a page_cgroup.
309 * If success, returns 0.
310 */
311static int page_cgroup_assign_new_page_cgroup(struct page *page,
312 struct page_cgroup *pc)
313{
314 int ret = 0;
315
316 lock_page_cgroup(page);
317 if (!page_get_page_cgroup(page))
318 page_assign_page_cgroup(page, pc);
319 else /* A page is tied to other pc. */
320 ret = 1;
321 unlock_page_cgroup(page);
322 return ret;
323}
324
325/*
326 * Clear page->page_cgroup member under lock_page_cgroup().
327 * If given "pc" value is different from one page->page_cgroup,
328 * page->cgroup is not cleared.
329 * Returns a value of page->page_cgroup at lock taken.
330 * A can can detect failure of clearing by following
331 * clear_page_cgroup(page, pc) == pc
332 */
333
334static struct page_cgroup *clear_page_cgroup(struct page *page,
335 struct page_cgroup *pc)
336{
337 struct page_cgroup *ret;
338 /* lock and clear */
339 lock_page_cgroup(page);
340 ret = page_get_page_cgroup(page);
341 if (likely(ret == pc))
342 page_assign_page_cgroup(page, NULL);
343 unlock_page_cgroup(page);
344 return ret;
345}
346
347static void __mem_cgroup_remove_list(struct page_cgroup *pc)
348{
349 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
350 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
351
352 if (from)
353 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
354 else
355 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
356
357 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
358 list_del_init(&pc->lru);
359}
360
361static void __mem_cgroup_add_list(struct page_cgroup *pc)
362{
363 int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
364 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
365
366 if (!to) {
367 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
368 list_add(&pc->lru, &mz->inactive_list);
369 } else {
370 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
371 list_add(&pc->lru, &mz->active_list);
372 }
373 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
374}
375
376static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
377{
378 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
379 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
380
381 if (from)
382 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
383 else
384 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
385
386 if (active) {
387 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
388 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
389 list_move(&pc->lru, &mz->active_list);
390 } else {
391 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
392 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
393 list_move(&pc->lru, &mz->inactive_list);
394 }
395}
396
397int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
398{
399 int ret;
400
401 task_lock(task);
402 ret = task->mm && mm_cgroup(task->mm) == mem;
403 task_unlock(task);
404 return ret;
405}
406
407/*
408 * This routine assumes that the appropriate zone's lru lock is already held
409 */
410void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
411{
412 struct mem_cgroup_per_zone *mz;
413 unsigned long flags;
414
415 if (!pc)
416 return;
417
418 mz = page_cgroup_zoneinfo(pc);
419 spin_lock_irqsave(&mz->lru_lock, flags);
420 __mem_cgroup_move_lists(pc, active);
421 spin_unlock_irqrestore(&mz->lru_lock, flags);
422}
423
424/*
425 * Calculate mapped_ratio under memory controller. This will be used in
426 * vmscan.c for deteremining we have to reclaim mapped pages.
427 */
428int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
429{
430 long total, rss;
431
432 /*
433 * usage is recorded in bytes. But, here, we assume the number of
434 * physical pages can be represented by "long" on any arch.
435 */
436 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
437 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
438 return (int)((rss * 100L) / total);
439}
440/*
441 * This function is called from vmscan.c. In page reclaiming loop. balance
442 * between active and inactive list is calculated. For memory controller
443 * page reclaiming, we should use using mem_cgroup's imbalance rather than
444 * zone's global lru imbalance.
445 */
446long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
447{
448 unsigned long active, inactive;
449 /* active and inactive are the number of pages. 'long' is ok.*/
450 active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
451 inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
452 return (long) (active / (inactive + 1));
453}
454
455/*
456 * prev_priority control...this will be used in memory reclaim path.
457 */
458int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
459{
460 return mem->prev_priority;
461}
462
463void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
464{
465 if (priority < mem->prev_priority)
466 mem->prev_priority = priority;
467}
468
469void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
470{
471 mem->prev_priority = priority;
472}
473
474/*
475 * Calculate # of pages to be scanned in this priority/zone.
476 * See also vmscan.c
477 *
478 * priority starts from "DEF_PRIORITY" and decremented in each loop.
479 * (see include/linux/mmzone.h)
480 */
481
482long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
483 struct zone *zone, int priority)
484{
485 long nr_active;
486 int nid = zone->zone_pgdat->node_id;
487 int zid = zone_idx(zone);
488 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
489
490 nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
491 return (nr_active >> priority);
492}
493
494long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
495 struct zone *zone, int priority)
496{
497 long nr_inactive;
498 int nid = zone->zone_pgdat->node_id;
499 int zid = zone_idx(zone);
500 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
501
502 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
503
504 return (nr_inactive >> priority);
505}
506
507unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
508 struct list_head *dst,
509 unsigned long *scanned, int order,
510 int mode, struct zone *z,
511 struct mem_cgroup *mem_cont,
512 int active)
513{
514 unsigned long nr_taken = 0;
515 struct page *page;
516 unsigned long scan;
517 LIST_HEAD(pc_list);
518 struct list_head *src;
519 struct page_cgroup *pc, *tmp;
520 int nid = z->zone_pgdat->node_id;
521 int zid = zone_idx(z);
522 struct mem_cgroup_per_zone *mz;
523
524 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
525 if (active)
526 src = &mz->active_list;
527 else
528 src = &mz->inactive_list;
529
530
531 spin_lock(&mz->lru_lock);
532 scan = 0;
533 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
534 if (scan >= nr_to_scan)
535 break;
536 page = pc->page;
537 VM_BUG_ON(!pc);
538
539 if (unlikely(!PageLRU(page)))
540 continue;
541
542 if (PageActive(page) && !active) {
543 __mem_cgroup_move_lists(pc, true);
544 continue;
545 }
546 if (!PageActive(page) && active) {
547 __mem_cgroup_move_lists(pc, false);
548 continue;
549 }
550
551 scan++;
552 list_move(&pc->lru, &pc_list);
553
554 if (__isolate_lru_page(page, mode) == 0) {
555 list_move(&page->lru, dst);
556 nr_taken++;
557 }
558 }
559
560 list_splice(&pc_list, src);
561 spin_unlock(&mz->lru_lock);
562
563 *scanned = scan;
564 return nr_taken;
565}
566
567/*
568 * Charge the memory controller for page usage.
569 * Return
570 * 0 if the charge was successful
571 * < 0 if the cgroup is over its limit
572 */
573static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
574 gfp_t gfp_mask, enum charge_type ctype)
575{
576 struct mem_cgroup *mem;
577 struct page_cgroup *pc;
578 unsigned long flags;
579 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
580 struct mem_cgroup_per_zone *mz;
581
582 /*
583 * Should page_cgroup's go to their own slab?
584 * One could optimize the performance of the charging routine
585 * by saving a bit in the page_flags and using it as a lock
586 * to see if the cgroup page already has a page_cgroup associated
587 * with it
588 */
589retry:
590 if (page) {
591 lock_page_cgroup(page);
592 pc = page_get_page_cgroup(page);
593 /*
594 * The page_cgroup exists and
595 * the page has already been accounted.
596 */
597 if (pc) {
598 if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
599 /* this page is under being uncharged ? */
600 unlock_page_cgroup(page);
601 cpu_relax();
602 goto retry;
603 } else {
604 unlock_page_cgroup(page);
605 goto done;
606 }
607 }
608 unlock_page_cgroup(page);
609 }
610
611 pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
612 if (pc == NULL)
613 goto err;
614
615 /*
616 * We always charge the cgroup the mm_struct belongs to.
617 * The mm_struct's mem_cgroup changes on task migration if the
618 * thread group leader migrates. It's possible that mm is not
619 * set, if so charge the init_mm (happens for pagecache usage).
620 */
621 if (!mm)
622 mm = &init_mm;
623
624 rcu_read_lock();
625 mem = rcu_dereference(mm->mem_cgroup);
626 /*
627 * For every charge from the cgroup, increment reference
628 * count
629 */
630 css_get(&mem->css);
631 rcu_read_unlock();
632
633 /*
634 * If we created the page_cgroup, we should free it on exceeding
635 * the cgroup limit.
636 */
637 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
638 if (!(gfp_mask & __GFP_WAIT))
639 goto out;
640
641 if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
642 continue;
643
644 /*
645 * try_to_free_mem_cgroup_pages() might not give us a full
646 * picture of reclaim. Some pages are reclaimed and might be
647 * moved to swap cache or just unmapped from the cgroup.
648 * Check the limit again to see if the reclaim reduced the
649 * current usage of the cgroup before giving up
650 */
651 if (res_counter_check_under_limit(&mem->res))
652 continue;
653
654 if (!nr_retries--) {
655 mem_cgroup_out_of_memory(mem, gfp_mask);
656 goto out;
657 }
658 congestion_wait(WRITE, HZ/10);
659 }
660
661 atomic_set(&pc->ref_cnt, 1);
662 pc->mem_cgroup = mem;
663 pc->page = page;
664 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
665 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
666 pc->flags |= PAGE_CGROUP_FLAG_CACHE;
667
668 if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
669 /*
670 * Another charge has been added to this page already.
671 * We take lock_page_cgroup(page) again and read
672 * page->cgroup, increment refcnt.... just retry is OK.
673 */
674 res_counter_uncharge(&mem->res, PAGE_SIZE);
675 css_put(&mem->css);
676 kfree(pc);
677 if (!page)
678 goto done;
679 goto retry;
680 }
681
682 mz = page_cgroup_zoneinfo(pc);
683 spin_lock_irqsave(&mz->lru_lock, flags);
684 /* Update statistics vector */
685 __mem_cgroup_add_list(pc);
686 spin_unlock_irqrestore(&mz->lru_lock, flags);
687
688done:
689 return 0;
690out:
691 css_put(&mem->css);
692 kfree(pc);
693err:
694 return -ENOMEM;
695}
696
697int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
698 gfp_t gfp_mask)
699{
700 return mem_cgroup_charge_common(page, mm, gfp_mask,
701 MEM_CGROUP_CHARGE_TYPE_MAPPED);
702}
703
704/*
705 * See if the cached pages should be charged at all?
706 */
707int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
708 gfp_t gfp_mask)
709{
710 int ret = 0;
711 if (!mm)
712 mm = &init_mm;
713
714 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
715 MEM_CGROUP_CHARGE_TYPE_CACHE);
716 return ret;
717}
718
719/*
720 * Uncharging is always a welcome operation, we never complain, simply
721 * uncharge. This routine should be called with lock_page_cgroup held
722 */
723void mem_cgroup_uncharge(struct page_cgroup *pc)
724{
725 struct mem_cgroup *mem;
726 struct mem_cgroup_per_zone *mz;
727 struct page *page;
728 unsigned long flags;
729
730 /*
731 * Check if our page_cgroup is valid
732 */
733 if (!pc)
734 return;
735
736 if (atomic_dec_and_test(&pc->ref_cnt)) {
737 page = pc->page;
738 mz = page_cgroup_zoneinfo(pc);
739 /*
740 * get page->cgroup and clear it under lock.
741 * force_empty can drop page->cgroup without checking refcnt.
742 */
743 unlock_page_cgroup(page);
744 if (clear_page_cgroup(page, pc) == pc) {
745 mem = pc->mem_cgroup;
746 css_put(&mem->css);
747 res_counter_uncharge(&mem->res, PAGE_SIZE);
748 spin_lock_irqsave(&mz->lru_lock, flags);
749 __mem_cgroup_remove_list(pc);
750 spin_unlock_irqrestore(&mz->lru_lock, flags);
751 kfree(pc);
752 }
753 lock_page_cgroup(page);
754 }
755}
756
757void mem_cgroup_uncharge_page(struct page *page)
758{
759 lock_page_cgroup(page);
760 mem_cgroup_uncharge(page_get_page_cgroup(page));
761 unlock_page_cgroup(page);
762}
763
764/*
765 * Returns non-zero if a page (under migration) has valid page_cgroup member.
766 * Refcnt of page_cgroup is incremented.
767 */
768
769int mem_cgroup_prepare_migration(struct page *page)
770{
771 struct page_cgroup *pc;
772 int ret = 0;
773 lock_page_cgroup(page);
774 pc = page_get_page_cgroup(page);
775 if (pc && atomic_inc_not_zero(&pc->ref_cnt))
776 ret = 1;
777 unlock_page_cgroup(page);
778 return ret;
779}
780
781void mem_cgroup_end_migration(struct page *page)
782{
783 struct page_cgroup *pc;
784
785 lock_page_cgroup(page);
786 pc = page_get_page_cgroup(page);
787 mem_cgroup_uncharge(pc);
788 unlock_page_cgroup(page);
789}
790/*
791 * We know both *page* and *newpage* are now not-on-LRU and Pg_locked.
792 * And no race with uncharge() routines because page_cgroup for *page*
793 * has extra one reference by mem_cgroup_prepare_migration.
794 */
795
796void mem_cgroup_page_migration(struct page *page, struct page *newpage)
797{
798 struct page_cgroup *pc;
799 struct mem_cgroup *mem;
800 unsigned long flags;
801 struct mem_cgroup_per_zone *mz;
802retry:
803 pc = page_get_page_cgroup(page);
804 if (!pc)
805 return;
806 mem = pc->mem_cgroup;
807 mz = page_cgroup_zoneinfo(pc);
808 if (clear_page_cgroup(page, pc) != pc)
809 goto retry;
810 spin_lock_irqsave(&mz->lru_lock, flags);
811
812 __mem_cgroup_remove_list(pc);
813 spin_unlock_irqrestore(&mz->lru_lock, flags);
814
815 pc->page = newpage;
816 lock_page_cgroup(newpage);
817 page_assign_page_cgroup(newpage, pc);
818 unlock_page_cgroup(newpage);
819
820 mz = page_cgroup_zoneinfo(pc);
821 spin_lock_irqsave(&mz->lru_lock, flags);
822 __mem_cgroup_add_list(pc);
823 spin_unlock_irqrestore(&mz->lru_lock, flags);
824 return;
825}
826
827/*
828 * This routine traverse page_cgroup in given list and drop them all.
829 * This routine ignores page_cgroup->ref_cnt.
830 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
831 */
832#define FORCE_UNCHARGE_BATCH (128)
833static void
834mem_cgroup_force_empty_list(struct mem_cgroup *mem,
835 struct mem_cgroup_per_zone *mz,
836 int active)
837{
838 struct page_cgroup *pc;
839 struct page *page;
840 int count;
841 unsigned long flags;
842 struct list_head *list;
843
844 if (active)
845 list = &mz->active_list;
846 else
847 list = &mz->inactive_list;
848
849 if (list_empty(list))
850 return;
851retry:
852 count = FORCE_UNCHARGE_BATCH;
853 spin_lock_irqsave(&mz->lru_lock, flags);
854
855 while (--count && !list_empty(list)) {
856 pc = list_entry(list->prev, struct page_cgroup, lru);
857 page = pc->page;
858 /* Avoid race with charge */
859 atomic_set(&pc->ref_cnt, 0);
860 if (clear_page_cgroup(page, pc) == pc) {
861 css_put(&mem->css);
862 res_counter_uncharge(&mem->res, PAGE_SIZE);
863 __mem_cgroup_remove_list(pc);
864 kfree(pc);
865 } else /* being uncharged ? ...do relax */
866 break;
867 }
868 spin_unlock_irqrestore(&mz->lru_lock, flags);
869 if (!list_empty(list)) {
870 cond_resched();
871 goto retry;
872 }
873 return;
874}
875
876/*
877 * make mem_cgroup's charge to be 0 if there is no task.
878 * This enables deleting this mem_cgroup.
879 */
880
881int mem_cgroup_force_empty(struct mem_cgroup *mem)
882{
883 int ret = -EBUSY;
884 int node, zid;
885 css_get(&mem->css);
886 /*
887 * page reclaim code (kswapd etc..) will move pages between
888 * active_list <-> inactive_list while we don't take a lock.
889 * So, we have to do loop here until all lists are empty.
890 */
891 while (mem->res.usage > 0) {
892 if (atomic_read(&mem->css.cgroup->count) > 0)
893 goto out;
894 for_each_node_state(node, N_POSSIBLE)
895 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
896 struct mem_cgroup_per_zone *mz;
897 mz = mem_cgroup_zoneinfo(mem, node, zid);
898 /* drop all page_cgroup in active_list */
899 mem_cgroup_force_empty_list(mem, mz, 1);
900 /* drop all page_cgroup in inactive_list */
901 mem_cgroup_force_empty_list(mem, mz, 0);
902 }
903 }
904 ret = 0;
905out:
906 css_put(&mem->css);
907 return ret;
908}
909
910
911
912int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
913{
914 *tmp = memparse(buf, &buf);
915 if (*buf != '\0')
916 return -EINVAL;
917
918 /*
919 * Round up the value to the closest page size
920 */
921 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
922 return 0;
923}
924
925static ssize_t mem_cgroup_read(struct cgroup *cont,
926 struct cftype *cft, struct file *file,
927 char __user *userbuf, size_t nbytes, loff_t *ppos)
928{
929 return res_counter_read(&mem_cgroup_from_cont(cont)->res,
930 cft->private, userbuf, nbytes, ppos,
931 NULL);
932}
933
934static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
935 struct file *file, const char __user *userbuf,
936 size_t nbytes, loff_t *ppos)
937{
938 return res_counter_write(&mem_cgroup_from_cont(cont)->res,
939 cft->private, userbuf, nbytes, ppos,
940 mem_cgroup_write_strategy);
941}
942
943static ssize_t mem_force_empty_write(struct cgroup *cont,
944 struct cftype *cft, struct file *file,
945 const char __user *userbuf,
946 size_t nbytes, loff_t *ppos)
947{
948 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
949 int ret;
950 ret = mem_cgroup_force_empty(mem);
951 if (!ret)
952 ret = nbytes;
953 return ret;
954}
955
956/*
957 * Note: This should be removed once cgroup supports write-only files.
958 */
959
960static ssize_t mem_force_empty_read(struct cgroup *cont,
961 struct cftype *cft,
962 struct file *file, char __user *userbuf,
963 size_t nbytes, loff_t *ppos)
964{
965 return -EINVAL;
966}
967
968
969static const struct mem_cgroup_stat_desc {
970 const char *msg;
971 u64 unit;
972} mem_cgroup_stat_desc[] = {
973 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
974 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
975};
976
977static int mem_control_stat_show(struct seq_file *m, void *arg)
978{
979 struct cgroup *cont = m->private;
980 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
981 struct mem_cgroup_stat *stat = &mem_cont->stat;
982 int i;
983
984 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
985 s64 val;
986
987 val = mem_cgroup_read_stat(stat, i);
988 val *= mem_cgroup_stat_desc[i].unit;
989 seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
990 (long long)val);
991 }
992 /* showing # of active pages */
993 {
994 unsigned long active, inactive;
995
996 inactive = mem_cgroup_get_all_zonestat(mem_cont,
997 MEM_CGROUP_ZSTAT_INACTIVE);
998 active = mem_cgroup_get_all_zonestat(mem_cont,
999 MEM_CGROUP_ZSTAT_ACTIVE);
1000 seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
1001 seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
1002 }
1003 return 0;
1004}
1005
1006static const struct file_operations mem_control_stat_file_operations = {
1007 .read = seq_read,
1008 .llseek = seq_lseek,
1009 .release = single_release,
1010};
1011
1012static int mem_control_stat_open(struct inode *unused, struct file *file)
1013{
1014 /* XXX __d_cont */
1015 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
1016
1017 file->f_op = &mem_control_stat_file_operations;
1018 return single_open(file, mem_control_stat_show, cont);
1019}
1020
1021
1022
1023static struct cftype mem_cgroup_files[] = {
1024 {
1025 .name = "usage_in_bytes",
1026 .private = RES_USAGE,
1027 .read = mem_cgroup_read,
1028 },
1029 {
1030 .name = "limit_in_bytes",
1031 .private = RES_LIMIT,
1032 .write = mem_cgroup_write,
1033 .read = mem_cgroup_read,
1034 },
1035 {
1036 .name = "failcnt",
1037 .private = RES_FAILCNT,
1038 .read = mem_cgroup_read,
1039 },
1040 {
1041 .name = "force_empty",
1042 .write = mem_force_empty_write,
1043 .read = mem_force_empty_read,
1044 },
1045 {
1046 .name = "stat",
1047 .open = mem_control_stat_open,
1048 },
1049};
1050
1051static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1052{
1053 struct mem_cgroup_per_node *pn;
1054 struct mem_cgroup_per_zone *mz;
1055 int zone;
1056 /*
1057 * This routine is called for each possible node.
1058 * But it's a BUG to call kmalloc() against an offline node.
1059 *
1060 * TODO: this routine can waste a lot of memory for nodes which will
1061 * never be onlined. It would be better to use a memory hotplug
1062 * callback function.
1063 */
1064 if (node_state(node, N_HIGH_MEMORY))
1065 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
1066 else
1067 pn = kmalloc(sizeof(*pn), GFP_KERNEL);
1068 if (!pn)
1069 return 1;
1070
1071 mem->info.nodeinfo[node] = pn;
1072 memset(pn, 0, sizeof(*pn));
1073
1074 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1075 mz = &pn->zoneinfo[zone];
1076 INIT_LIST_HEAD(&mz->active_list);
1077 INIT_LIST_HEAD(&mz->inactive_list);
1078 spin_lock_init(&mz->lru_lock);
1079 }
1080 return 0;
1081}
1082
1083static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1084{
1085 kfree(mem->info.nodeinfo[node]);
1086}
1087
1088
1089static struct mem_cgroup init_mem_cgroup;
1090
1091static struct cgroup_subsys_state *
1092mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1093{
1094 struct mem_cgroup *mem;
1095 int node;
1096
1097 if (unlikely((cont->parent) == NULL)) {
1098 mem = &init_mem_cgroup;
1099 init_mm.mem_cgroup = mem;
1100 } else
1101 mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
1102
1103 if (mem == NULL)
1104 return NULL;
1105
1106 res_counter_init(&mem->res);
1107
1108 memset(&mem->info, 0, sizeof(mem->info));
1109
1110 for_each_node_state(node, N_POSSIBLE)
1111 if (alloc_mem_cgroup_per_zone_info(mem, node))
1112 goto free_out;
1113
1114 return &mem->css;
1115free_out:
1116 for_each_node_state(node, N_POSSIBLE)
1117 free_mem_cgroup_per_zone_info(mem, node);
1118 if (cont->parent != NULL)
1119 kfree(mem);
1120 return NULL;
1121}
1122
1123static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1124 struct cgroup *cont)
1125{
1126 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1127 mem_cgroup_force_empty(mem);
1128}
1129
1130static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1131 struct cgroup *cont)
1132{
1133 int node;
1134 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1135
1136 for_each_node_state(node, N_POSSIBLE)
1137 free_mem_cgroup_per_zone_info(mem, node);
1138
1139 kfree(mem_cgroup_from_cont(cont));
1140}
1141
1142static int mem_cgroup_populate(struct cgroup_subsys *ss,
1143 struct cgroup *cont)
1144{
1145 return cgroup_add_files(cont, ss, mem_cgroup_files,
1146 ARRAY_SIZE(mem_cgroup_files));
1147}
1148
1149static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1150 struct cgroup *cont,
1151 struct cgroup *old_cont,
1152 struct task_struct *p)
1153{
1154 struct mm_struct *mm;
1155 struct mem_cgroup *mem, *old_mem;
1156
1157 mm = get_task_mm(p);
1158 if (mm == NULL)
1159 return;
1160
1161 mem = mem_cgroup_from_cont(cont);
1162 old_mem = mem_cgroup_from_cont(old_cont);
1163
1164 if (mem == old_mem)
1165 goto out;
1166
1167 /*
1168 * Only thread group leaders are allowed to migrate, the mm_struct is
1169 * in effect owned by the leader
1170 */
1171 if (p->tgid != p->pid)
1172 goto out;
1173
1174 css_get(&mem->css);
1175 rcu_assign_pointer(mm->mem_cgroup, mem);
1176 css_put(&old_mem->css);
1177
1178out:
1179 mmput(mm);
1180 return;
1181}
1182
1183struct cgroup_subsys mem_cgroup_subsys = {
1184 .name = "memory",
1185 .subsys_id = mem_cgroup_subsys_id,
1186 .create = mem_cgroup_create,
1187 .pre_destroy = mem_cgroup_pre_destroy,
1188 .destroy = mem_cgroup_destroy,
1189 .populate = mem_cgroup_populate,
1190 .attach = mem_cgroup_move_task,
1191 .early_init = 0,
1192};
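
For orientation on how the cftype table above is used: once the "memory" subsystem is mounted, each entry appears as a file named memory.<name> in the cgroup directory, read and written through the res_counter helpers. A minimal userspace sketch, assuming a group already exists at /cgroups/demo (the mount point, the 64M limit and the thin error handling are illustrative):

#include <stdio.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return;
	}
	fputs(val, f);
	fclose(f);
}

static void dump_file(const char *path)
{
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	printf("%s:\n", path);
	while (fgets(line, sizeof(line), f))
		printf("  %s", line);
	fclose(f);
}

int main(void)
{
	const char *base = "/cgroups/demo";	/* hypothetical mount point + group */
	char path[256];

	/* RES_LIMIT: the value goes through memparse() and is rounded up to a page. */
	snprintf(path, sizeof(path), "%s/memory.limit_in_bytes", base);
	write_str(path, "64M");

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", base);
	dump_file(path);			/* RES_USAGE */
	snprintf(path, sizeof(path), "%s/memory.failcnt", base);
	dump_file(path);			/* RES_FAILCNT */
	snprintf(path, sizeof(path), "%s/memory.stat", base);
	dump_file(path);			/* cache, rss, active, inactive */

	/*
	 * Any write triggers mem_cgroup_force_empty(), which only succeeds
	 * once no task is attached to the group.
	 */
	snprintf(path, sizeof(path), "%s/memory.force_empty", base);
	write_str(path, "1");
	return 0;
}
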
diff --git a/mm/memory.c b/mm/memory.c
index d902d0e25edc..153a54b2013c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
50#include <linux/delayacct.h> 50#include <linux/delayacct.h>
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h>
53 54
54#include <asm/pgalloc.h> 55#include <asm/pgalloc.h>
55#include <asm/uaccess.h> 56#include <asm/uaccess.h>
@@ -82,7 +83,18 @@ void * high_memory;
82EXPORT_SYMBOL(num_physpages); 83EXPORT_SYMBOL(num_physpages);
83EXPORT_SYMBOL(high_memory); 84EXPORT_SYMBOL(high_memory);
84 85
85int randomize_va_space __read_mostly = 1; 86/*
87 * Randomize the address space (stacks, mmaps, brk, etc.).
88 *
89 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
90 * as ancient (libc5 based) binaries can segfault. )
91 */
92int randomize_va_space __read_mostly =
93#ifdef CONFIG_COMPAT_BRK
94 1;
95#else
96 2;
97#endif
86 98
87static int __init disable_randmaps(char *s) 99static int __init disable_randmaps(char *s)
88{ 100{
@@ -305,7 +317,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
305 spin_lock(&mm->page_table_lock); 317 spin_lock(&mm->page_table_lock);
306 if (pmd_present(*pmd)) { /* Another has populated it */ 318 if (pmd_present(*pmd)) { /* Another has populated it */
307 pte_lock_deinit(new); 319 pte_lock_deinit(new);
308 pte_free(new); 320 pte_free(mm, new);
309 } else { 321 } else {
310 mm->nr_ptes++; 322 mm->nr_ptes++;
311 inc_zone_page_state(new, NR_PAGETABLE); 323 inc_zone_page_state(new, NR_PAGETABLE);
@@ -323,7 +335,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
323 335
324 spin_lock(&init_mm.page_table_lock); 336 spin_lock(&init_mm.page_table_lock);
325 if (pmd_present(*pmd)) /* Another has populated it */ 337 if (pmd_present(*pmd)) /* Another has populated it */
326 pte_free_kernel(new); 338 pte_free_kernel(&init_mm, new);
327 else 339 else
328 pmd_populate_kernel(&init_mm, pmd, new); 340 pmd_populate_kernel(&init_mm, pmd, new);
329 spin_unlock(&init_mm.page_table_lock); 341 spin_unlock(&init_mm.page_table_lock);
@@ -1109,7 +1121,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1109} 1121}
1110EXPORT_SYMBOL(get_user_pages); 1122EXPORT_SYMBOL(get_user_pages);
1111 1123
1112pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) 1124pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1125 spinlock_t **ptl)
1113{ 1126{
1114 pgd_t * pgd = pgd_offset(mm, addr); 1127 pgd_t * pgd = pgd_offset(mm, addr);
1115 pud_t * pud = pud_alloc(mm, pgd, addr); 1128 pud_t * pud = pud_alloc(mm, pgd, addr);
@@ -1132,16 +1145,20 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
1132{ 1145{
1133 int retval; 1146 int retval;
1134 pte_t *pte; 1147 pte_t *pte;
1135 spinlock_t *ptl; 1148 spinlock_t *ptl;
1149
1150 retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
1151 if (retval)
1152 goto out;
1136 1153
1137 retval = -EINVAL; 1154 retval = -EINVAL;
1138 if (PageAnon(page)) 1155 if (PageAnon(page))
1139 goto out; 1156 goto out_uncharge;
1140 retval = -ENOMEM; 1157 retval = -ENOMEM;
1141 flush_dcache_page(page); 1158 flush_dcache_page(page);
1142 pte = get_locked_pte(mm, addr, &ptl); 1159 pte = get_locked_pte(mm, addr, &ptl);
1143 if (!pte) 1160 if (!pte)
1144 goto out; 1161 goto out_uncharge;
1145 retval = -EBUSY; 1162 retval = -EBUSY;
1146 if (!pte_none(*pte)) 1163 if (!pte_none(*pte))
1147 goto out_unlock; 1164 goto out_unlock;
@@ -1153,8 +1170,12 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
1153 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1170 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1154 1171
1155 retval = 0; 1172 retval = 0;
1173 pte_unmap_unlock(pte, ptl);
1174 return retval;
1156out_unlock: 1175out_unlock:
1157 pte_unmap_unlock(pte, ptl); 1176 pte_unmap_unlock(pte, ptl);
1177out_uncharge:
1178 mem_cgroup_uncharge_page(page);
1158out: 1179out:
1159 return retval; 1180 return retval;
1160} 1181}
@@ -1517,10 +1538,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1517 memset(kaddr, 0, PAGE_SIZE); 1538 memset(kaddr, 0, PAGE_SIZE);
1518 kunmap_atomic(kaddr, KM_USER0); 1539 kunmap_atomic(kaddr, KM_USER0);
1519 flush_dcache_page(dst); 1540 flush_dcache_page(dst);
1520 return; 1541 } else
1521 1542 copy_user_highpage(dst, src, va, vma);
1522 }
1523 copy_user_highpage(dst, src, va, vma);
1524} 1543}
1525 1544
1526/* 1545/*
@@ -1629,6 +1648,10 @@ gotten:
1629 if (!new_page) 1648 if (!new_page)
1630 goto oom; 1649 goto oom;
1631 cow_user_page(new_page, old_page, address, vma); 1650 cow_user_page(new_page, old_page, address, vma);
1651 __SetPageUptodate(new_page);
1652
1653 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
1654 goto oom_free_new;
1632 1655
1633 /* 1656 /*
1634 * Re-check the pte - we dropped the lock 1657 * Re-check the pte - we dropped the lock
@@ -1661,7 +1684,9 @@ gotten:
1661 /* Free the old page.. */ 1684 /* Free the old page.. */
1662 new_page = old_page; 1685 new_page = old_page;
1663 ret |= VM_FAULT_WRITE; 1686 ret |= VM_FAULT_WRITE;
1664 } 1687 } else
1688 mem_cgroup_uncharge_page(new_page);
1689
1665 if (new_page) 1690 if (new_page)
1666 page_cache_release(new_page); 1691 page_cache_release(new_page);
1667 if (old_page) 1692 if (old_page)
@@ -1685,6 +1710,8 @@ unlock:
1685 put_page(dirty_page); 1710 put_page(dirty_page);
1686 } 1711 }
1687 return ret; 1712 return ret;
1713oom_free_new:
1714 __free_page(new_page);
1688oom: 1715oom:
1689 if (old_page) 1716 if (old_page)
1690 page_cache_release(old_page); 1717 page_cache_release(old_page);
@@ -1909,50 +1936,49 @@ EXPORT_SYMBOL(unmap_mapping_range);
1909 */ 1936 */
1910int vmtruncate(struct inode * inode, loff_t offset) 1937int vmtruncate(struct inode * inode, loff_t offset)
1911{ 1938{
1912 struct address_space *mapping = inode->i_mapping; 1939 if (inode->i_size < offset) {
1913 unsigned long limit; 1940 unsigned long limit;
1914 1941
1915 if (inode->i_size < offset) 1942 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1916 goto do_expand; 1943 if (limit != RLIM_INFINITY && offset > limit)
1917 /* 1944 goto out_sig;
1918 * truncation of in-use swapfiles is disallowed - it would cause 1945 if (offset > inode->i_sb->s_maxbytes)
1919 * subsequent swapout to scribble on the now-freed blocks. 1946 goto out_big;
1920 */ 1947 i_size_write(inode, offset);
1921 if (IS_SWAPFILE(inode)) 1948 } else {
1922 goto out_busy; 1949 struct address_space *mapping = inode->i_mapping;
1923 i_size_write(inode, offset); 1950
1951 /*
1952 * truncation of in-use swapfiles is disallowed - it would
1953 * cause subsequent swapout to scribble on the now-freed
1954 * blocks.
1955 */
1956 if (IS_SWAPFILE(inode))
1957 return -ETXTBSY;
1958 i_size_write(inode, offset);
1959
1960 /*
1961 * unmap_mapping_range is called twice, first simply for
1962 * efficiency so that truncate_inode_pages does fewer
1963 * single-page unmaps. However after this first call, and
1964 * before truncate_inode_pages finishes, it is possible for
1965 * private pages to be COWed, which remain after
1966 * truncate_inode_pages finishes, hence the second
1967 * unmap_mapping_range call must be made for correctness.
1968 */
1969 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1970 truncate_inode_pages(mapping, offset);
1971 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1972 }
1924 1973
1925 /*
1926 * unmap_mapping_range is called twice, first simply for efficiency
1927 * so that truncate_inode_pages does fewer single-page unmaps. However
1928 * after this first call, and before truncate_inode_pages finishes,
1929 * it is possible for private pages to be COWed, which remain after
1930 * truncate_inode_pages finishes, hence the second unmap_mapping_range
1931 * call must be made for correctness.
1932 */
1933 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1934 truncate_inode_pages(mapping, offset);
1935 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
1936 goto out_truncate;
1937
1938do_expand:
1939 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
1940 if (limit != RLIM_INFINITY && offset > limit)
1941 goto out_sig;
1942 if (offset > inode->i_sb->s_maxbytes)
1943 goto out_big;
1944 i_size_write(inode, offset);
1945
1946out_truncate:
1947 if (inode->i_op && inode->i_op->truncate) 1974 if (inode->i_op && inode->i_op->truncate)
1948 inode->i_op->truncate(inode); 1975 inode->i_op->truncate(inode);
1949 return 0; 1976 return 0;
1977
1950out_sig: 1978out_sig:
1951 send_sig(SIGXFSZ, current, 0); 1979 send_sig(SIGXFSZ, current, 0);
1952out_big: 1980out_big:
1953 return -EFBIG; 1981 return -EFBIG;
1954out_busy:
1955 return -ETXTBSY;
1956} 1982}
1957EXPORT_SYMBOL(vmtruncate); 1983EXPORT_SYMBOL(vmtruncate);
1958 1984
@@ -1980,67 +2006,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1980 return 0; 2006 return 0;
1981} 2007}
1982 2008
1983/**
1984 * swapin_readahead - swap in pages in hope we need them soon
1985 * @entry: swap entry of this memory
1986 * @addr: address to start
1987 * @vma: user vma this addresses belong to
1988 *
1989 * Primitive swap readahead code. We simply read an aligned block of
1990 * (1 << page_cluster) entries in the swap area. This method is chosen
1991 * because it doesn't cost us any seek time. We also make sure to queue
1992 * the 'original' request together with the readahead ones...
1993 *
1994 * This has been extended to use the NUMA policies from the mm triggering
1995 * the readahead.
1996 *
1997 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
1998 */
1999void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
2000{
2001#ifdef CONFIG_NUMA
2002 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
2003#endif
2004 int i, num;
2005 struct page *new_page;
2006 unsigned long offset;
2007
2008 /*
2009 * Get the number of handles we should do readahead io to.
2010 */
2011 num = valid_swaphandles(entry, &offset);
2012 for (i = 0; i < num; offset++, i++) {
2013 /* Ok, do the async read-ahead now */
2014 new_page = read_swap_cache_async(swp_entry(swp_type(entry),
2015 offset), vma, addr);
2016 if (!new_page)
2017 break;
2018 page_cache_release(new_page);
2019#ifdef CONFIG_NUMA
2020 /*
2021 * Find the next applicable VMA for the NUMA policy.
2022 */
2023 addr += PAGE_SIZE;
2024 if (addr == 0)
2025 vma = NULL;
2026 if (vma) {
2027 if (addr >= vma->vm_end) {
2028 vma = next_vma;
2029 next_vma = vma ? vma->vm_next : NULL;
2030 }
2031 if (vma && addr < vma->vm_start)
2032 vma = NULL;
2033 } else {
2034 if (next_vma && addr >= next_vma->vm_start) {
2035 vma = next_vma;
2036 next_vma = vma->vm_next;
2037 }
2038 }
2039#endif
2040 }
2041 lru_add_drain(); /* Push any new pages onto the LRU now */
2042}
2043
2044/* 2009/*
2045 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2010 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2046 * but allow concurrent faults), and pte mapped but not yet locked. 2011 * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2068,8 +2033,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2068 page = lookup_swap_cache(entry); 2033 page = lookup_swap_cache(entry);
2069 if (!page) { 2034 if (!page) {
2070 grab_swap_token(); /* Contend for token _before_ read-in */ 2035 grab_swap_token(); /* Contend for token _before_ read-in */
2071 swapin_readahead(entry, address, vma); 2036 page = swapin_readahead(entry,
2072 page = read_swap_cache_async(entry, vma, address); 2037 GFP_HIGHUSER_MOVABLE, vma, address);
2073 if (!page) { 2038 if (!page) {
2074 /* 2039 /*
2075 * Back out if somebody else faulted in this pte 2040 * Back out if somebody else faulted in this pte
@@ -2087,6 +2052,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2087 count_vm_event(PGMAJFAULT); 2052 count_vm_event(PGMAJFAULT);
2088 } 2053 }
2089 2054
2055 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2056 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2057 ret = VM_FAULT_OOM;
2058 goto out;
2059 }
2060
2090 mark_page_accessed(page); 2061 mark_page_accessed(page);
2091 lock_page(page); 2062 lock_page(page);
2092 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2063 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2124,8 +2095,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2124 if (write_access) { 2095 if (write_access) {
2125 /* XXX: We could OR the do_wp_page code with this one? */ 2096 /* XXX: We could OR the do_wp_page code with this one? */
2126 if (do_wp_page(mm, vma, address, 2097 if (do_wp_page(mm, vma, address,
2127 page_table, pmd, ptl, pte) & VM_FAULT_OOM) 2098 page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
2099 mem_cgroup_uncharge_page(page);
2128 ret = VM_FAULT_OOM; 2100 ret = VM_FAULT_OOM;
2101 }
2129 goto out; 2102 goto out;
2130 } 2103 }
2131 2104
@@ -2136,6 +2109,7 @@ unlock:
2136out: 2109out:
2137 return ret; 2110 return ret;
2138out_nomap: 2111out_nomap:
2112 mem_cgroup_uncharge_page(page);
2139 pte_unmap_unlock(page_table, ptl); 2113 pte_unmap_unlock(page_table, ptl);
2140 unlock_page(page); 2114 unlock_page(page);
2141 page_cache_release(page); 2115 page_cache_release(page);
@@ -2163,6 +2137,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2163 page = alloc_zeroed_user_highpage_movable(vma, address); 2137 page = alloc_zeroed_user_highpage_movable(vma, address);
2164 if (!page) 2138 if (!page)
2165 goto oom; 2139 goto oom;
2140 __SetPageUptodate(page);
2141
2142 if (mem_cgroup_charge(page, mm, GFP_KERNEL))
2143 goto oom_free_page;
2166 2144
2167 entry = mk_pte(page, vma->vm_page_prot); 2145 entry = mk_pte(page, vma->vm_page_prot);
2168 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2146 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2181,8 +2159,11 @@ unlock:
2181 pte_unmap_unlock(page_table, ptl); 2159 pte_unmap_unlock(page_table, ptl);
2182 return 0; 2160 return 0;
2183release: 2161release:
2162 mem_cgroup_uncharge_page(page);
2184 page_cache_release(page); 2163 page_cache_release(page);
2185 goto unlock; 2164 goto unlock;
2165oom_free_page:
2166 __free_page(page);
2186oom: 2167oom:
2187 return VM_FAULT_OOM; 2168 return VM_FAULT_OOM;
2188} 2169}
@@ -2263,6 +2244,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2263 goto out; 2244 goto out;
2264 } 2245 }
2265 copy_user_highpage(page, vmf.page, address, vma); 2246 copy_user_highpage(page, vmf.page, address, vma);
2247 __SetPageUptodate(page);
2266 } else { 2248 } else {
2267 /* 2249 /*
2268 * If the page will be shareable, see if the backing 2250 * If the page will be shareable, see if the backing
@@ -2295,6 +2277,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2295 2277
2296 } 2278 }
2297 2279
2280 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2281 ret = VM_FAULT_OOM;
2282 goto out;
2283 }
2284
2298 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2285 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2299 2286
2300 /* 2287 /*
@@ -2330,6 +2317,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2330 /* no need to invalidate: a not-present page won't be cached */ 2317 /* no need to invalidate: a not-present page won't be cached */
2331 update_mmu_cache(vma, address, entry); 2318 update_mmu_cache(vma, address, entry);
2332 } else { 2319 } else {
2320 mem_cgroup_uncharge_page(page);
2333 if (anon) 2321 if (anon)
2334 page_cache_release(page); 2322 page_cache_release(page);
2335 else 2323 else
@@ -2563,7 +2551,7 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2563 2551
2564 spin_lock(&mm->page_table_lock); 2552 spin_lock(&mm->page_table_lock);
2565 if (pgd_present(*pgd)) /* Another has populated it */ 2553 if (pgd_present(*pgd)) /* Another has populated it */
2566 pud_free(new); 2554 pud_free(mm, new);
2567 else 2555 else
2568 pgd_populate(mm, pgd, new); 2556 pgd_populate(mm, pgd, new);
2569 spin_unlock(&mm->page_table_lock); 2557 spin_unlock(&mm->page_table_lock);
@@ -2585,12 +2573,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2585 spin_lock(&mm->page_table_lock); 2573 spin_lock(&mm->page_table_lock);
2586#ifndef __ARCH_HAS_4LEVEL_HACK 2574#ifndef __ARCH_HAS_4LEVEL_HACK
2587 if (pud_present(*pud)) /* Another has populated it */ 2575 if (pud_present(*pud)) /* Another has populated it */
2588 pmd_free(new); 2576 pmd_free(mm, new);
2589 else 2577 else
2590 pud_populate(mm, pud, new); 2578 pud_populate(mm, pud, new);
2591#else 2579#else
2592 if (pgd_present(*pud)) /* Another has populated it */ 2580 if (pgd_present(*pud)) /* Another has populated it */
2593 pmd_free(new); 2581 pmd_free(mm, new);
2594 else 2582 else
2595 pgd_populate(mm, pud, new); 2583 pgd_populate(mm, pud, new);
2596#endif /* __ARCH_HAS_4LEVEL_HACK */ 2584#endif /* __ARCH_HAS_4LEVEL_HACK */
@@ -2618,46 +2606,6 @@ int make_pages_present(unsigned long addr, unsigned long end)
2618 return ret == len ? 0 : -1; 2606 return ret == len ? 0 : -1;
2619} 2607}
2620 2608
2621/*
2622 * Map a vmalloc()-space virtual address to the physical page.
2623 */
2624struct page * vmalloc_to_page(void * vmalloc_addr)
2625{
2626 unsigned long addr = (unsigned long) vmalloc_addr;
2627 struct page *page = NULL;
2628 pgd_t *pgd = pgd_offset_k(addr);
2629 pud_t *pud;
2630 pmd_t *pmd;
2631 pte_t *ptep, pte;
2632
2633 if (!pgd_none(*pgd)) {
2634 pud = pud_offset(pgd, addr);
2635 if (!pud_none(*pud)) {
2636 pmd = pmd_offset(pud, addr);
2637 if (!pmd_none(*pmd)) {
2638 ptep = pte_offset_map(pmd, addr);
2639 pte = *ptep;
2640 if (pte_present(pte))
2641 page = pte_page(pte);
2642 pte_unmap(ptep);
2643 }
2644 }
2645 }
2646 return page;
2647}
2648
2649EXPORT_SYMBOL(vmalloc_to_page);
2650
2651/*
2652 * Map a vmalloc()-space virtual address to the physical page frame number.
2653 */
2654unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2655{
2656 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
2657}
2658
2659EXPORT_SYMBOL(vmalloc_to_pfn);
2660
2661#if !defined(__HAVE_ARCH_GATE_AREA) 2609#if !defined(__HAVE_ARCH_GATE_AREA)
2662 2610
2663#if defined(AT_SYSINFO_EHDR) 2611#if defined(AT_SYSINFO_EHDR)
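
A recurring shape in the memory.c hunks above: mem_cgroup_charge() is called up front, and every later failure path has to fall through an uncharge before returning. Reduced to its skeleton (demo_insert() and demo_map_page() are illustrative stand-ins, not kernel functions):

#include <linux/gfp.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Hypothetical mapping step standing in for get_locked_pte()/set_pte_at(). */
static int demo_map_page(struct mm_struct *mm, struct page *page)
{
	return 0;
}

static int demo_insert(struct mm_struct *mm, struct page *page)
{
	int ret;

	ret = mem_cgroup_charge(page, mm, GFP_KERNEL);
	if (ret)
		goto out;			/* nothing charged, nothing to undo */

	ret = demo_map_page(mm, page);
	if (ret)
		goto out_uncharge;		/* any later failure undoes the charge */
	return 0;

out_uncharge:
	mem_cgroup_uncharge_page(page);
out:
	return ret;
}
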
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9512a544d044..7469c503580d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -481,8 +481,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
481 return offlined; 481 return offlined;
482} 482}
483 483
484extern void drain_all_local_pages(void);
485
486int offline_pages(unsigned long start_pfn, 484int offline_pages(unsigned long start_pfn,
487 unsigned long end_pfn, unsigned long timeout) 485 unsigned long end_pfn, unsigned long timeout)
488{ 486{
@@ -540,7 +538,7 @@ repeat:
540 lru_add_drain_all(); 538 lru_add_drain_all();
541 flush_scheduled_work(); 539 flush_scheduled_work();
542 cond_resched(); 540 cond_resched();
543 drain_all_local_pages(); 541 drain_all_pages();
544 } 542 }
545 543
546 pfn = scan_lru_pages(start_pfn, end_pfn); 544 pfn = scan_lru_pages(start_pfn, end_pfn);
@@ -563,7 +561,7 @@ repeat:
563 flush_scheduled_work(); 561 flush_scheduled_work();
564 yield(); 562 yield();
565 /* drain pcp pages, this is synchronous. */ 563 /* drain pcp pages, this is synchronous. */
566 drain_all_local_pages(); 564 drain_all_pages();
567 /* check again */ 565 /* check again */
568 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 566 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
569 if (offlined_pages < 0) { 567 if (offlined_pages < 0) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a207e8d17ea..a73504ff5ab9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -29,6 +29,7 @@
29#include <linux/mempolicy.h> 29#include <linux/mempolicy.h>
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h>
32 33
33#include "internal.h" 34#include "internal.h"
34 35
@@ -115,11 +116,6 @@ int putback_lru_pages(struct list_head *l)
115 return count; 116 return count;
116} 117}
117 118
118static inline int is_swap_pte(pte_t pte)
119{
120 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
121}
122
123/* 119/*
124 * Restore a potential migration pte to a working pte entry 120 * Restore a potential migration pte to a working pte entry
125 */ 121 */
@@ -157,6 +153,11 @@ static void remove_migration_pte(struct vm_area_struct *vma,
157 return; 153 return;
158 } 154 }
159 155
156 if (mem_cgroup_charge(new, mm, GFP_KERNEL)) {
157 pte_unmap(ptep);
158 return;
159 }
160
160 ptl = pte_lockptr(mm, pmd); 161 ptl = pte_lockptr(mm, pmd);
161 spin_lock(ptl); 162 spin_lock(ptl);
162 pte = *ptep; 163 pte = *ptep;
@@ -592,9 +593,10 @@ static int move_to_new_page(struct page *newpage, struct page *page)
592 else 593 else
593 rc = fallback_migrate_page(mapping, newpage, page); 594 rc = fallback_migrate_page(mapping, newpage, page);
594 595
595 if (!rc) 596 if (!rc) {
597 mem_cgroup_page_migration(page, newpage);
596 remove_migration_ptes(page, newpage); 598 remove_migration_ptes(page, newpage);
597 else 599 } else
598 newpage->mapping = NULL; 600 newpage->mapping = NULL;
599 601
600 unlock_page(newpage); 602 unlock_page(newpage);
@@ -613,6 +615,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
613 int *result = NULL; 615 int *result = NULL;
614 struct page *newpage = get_new_page(page, private, &result); 616 struct page *newpage = get_new_page(page, private, &result);
615 int rcu_locked = 0; 617 int rcu_locked = 0;
618 int charge = 0;
616 619
617 if (!newpage) 620 if (!newpage)
618 return -ENOMEM; 621 return -ENOMEM;
@@ -645,23 +648,46 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
645 rcu_read_lock(); 648 rcu_read_lock();
646 rcu_locked = 1; 649 rcu_locked = 1;
647 } 650 }
651
648 /* 652 /*
649 * This is a corner case handling. 653 * Corner case handling:
650 * When a new swap-cache is read into, it is linked to LRU 654 * 1. When a new swap-cache page is read into, it is added to the LRU
651 * and treated as swapcache but has no rmap yet. 655 * and treated as swapcache but it has no rmap yet.
652 * Calling try_to_unmap() against a page->mapping==NULL page is 656 * Calling try_to_unmap() against a page->mapping==NULL page will
653 * BUG. So handle it here. 657 * trigger a BUG. So handle it here.
658 * 2. An orphaned page (see truncate_complete_page) might have
659 * fs-private metadata. The page can be picked up due to memory
660 * offlining. Everywhere else except page reclaim, the page is
661 * invisible to the vm, so the page can not be migrated. So try to
662 * free the metadata, so the page can be freed.
654 */ 663 */
655 if (!page->mapping) 664 if (!page->mapping) {
665 if (!PageAnon(page) && PagePrivate(page)) {
666 /*
667 * Go direct to try_to_free_buffers() here because
668 * a) that's what try_to_release_page() would do anyway
669 * b) we may be under rcu_read_lock() here, so we can't
670 * use GFP_KERNEL which is what try_to_release_page()
671 * needs to be effective.
672 */
673 try_to_free_buffers(page);
674 }
656 goto rcu_unlock; 675 goto rcu_unlock;
676 }
677
678 charge = mem_cgroup_prepare_migration(page);
657 /* Establish migration ptes or remove ptes */ 679 /* Establish migration ptes or remove ptes */
658 try_to_unmap(page, 1); 680 try_to_unmap(page, 1);
659 681
660 if (!page_mapped(page)) 682 if (!page_mapped(page))
661 rc = move_to_new_page(newpage, page); 683 rc = move_to_new_page(newpage, page);
662 684
663 if (rc) 685 if (rc) {
664 remove_migration_ptes(page, page); 686 remove_migration_ptes(page, page);
687 if (charge)
688 mem_cgroup_end_migration(page);
689 } else if (charge)
690 mem_cgroup_end_migration(newpage);
665rcu_unlock: 691rcu_unlock:
666 if (rcu_locked) 692 if (rcu_locked)
667 rcu_read_unlock(); 693 rcu_read_unlock();
diff --git a/mm/mmap.c b/mm/mmap.c
index d2b6d44962b7..ad6e4eaf34f8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -36,6 +36,10 @@
36#define arch_mmap_check(addr, len, flags) (0) 36#define arch_mmap_check(addr, len, flags) (0)
37#endif 37#endif
38 38
39#ifndef arch_rebalance_pgtables
40#define arch_rebalance_pgtables(addr, len) (addr)
41#endif
42
39static void unmap_region(struct mm_struct *mm, 43static void unmap_region(struct mm_struct *mm,
40 struct vm_area_struct *vma, struct vm_area_struct *prev, 44 struct vm_area_struct *vma, struct vm_area_struct *prev,
41 unsigned long start, unsigned long end); 45 unsigned long start, unsigned long end);
@@ -241,7 +245,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
241 245
242 down_write(&mm->mmap_sem); 246 down_write(&mm->mmap_sem);
243 247
244 if (brk < mm->end_code) 248 if (brk < mm->start_brk)
245 goto out; 249 goto out;
246 250
247 /* 251 /*
@@ -1424,7 +1428,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1424 if (addr & ~PAGE_MASK) 1428 if (addr & ~PAGE_MASK)
1425 return -EINVAL; 1429 return -EINVAL;
1426 1430
1427 return addr; 1431 return arch_rebalance_pgtables(addr, len);
1428} 1432}
1429 1433
1430EXPORT_SYMBOL(get_unmapped_area); 1434EXPORT_SYMBOL(get_unmapped_area);
@@ -2216,7 +2220,7 @@ int install_special_mapping(struct mm_struct *mm,
2216 vma->vm_start = addr; 2220 vma->vm_start = addr;
2217 vma->vm_end = addr + len; 2221 vma->vm_end = addr + len;
2218 2222
2219 vma->vm_flags = vm_flags | mm->def_flags; 2223 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
2220 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2224 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2221 2225
2222 vma->vm_ops = &special_mapping_vmops; 2226 vma->vm_ops = &special_mapping_vmops;
diff --git a/mm/nommu.c b/mm/nommu.c
index b989cb928a7c..5d8ae086f74e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,6 +10,7 @@
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org>
13 */ 14 */
14 15
15#include <linux/module.h> 16#include <linux/module.h>
@@ -167,7 +168,7 @@ EXPORT_SYMBOL(get_user_pages);
167DEFINE_RWLOCK(vmlist_lock); 168DEFINE_RWLOCK(vmlist_lock);
168struct vm_struct *vmlist; 169struct vm_struct *vmlist;
169 170
170void vfree(void *addr) 171void vfree(const void *addr)
171{ 172{
172 kfree(addr); 173 kfree(addr);
173} 174}
@@ -183,13 +184,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
183} 184}
184EXPORT_SYMBOL(__vmalloc); 185EXPORT_SYMBOL(__vmalloc);
185 186
186struct page * vmalloc_to_page(void *addr) 187void *vmalloc_user(unsigned long size)
188{
189 void *ret;
190
191 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
192 PAGE_KERNEL);
193 if (ret) {
194 struct vm_area_struct *vma;
195
196 down_write(&current->mm->mmap_sem);
197 vma = find_vma(current->mm, (unsigned long)ret);
198 if (vma)
199 vma->vm_flags |= VM_USERMAP;
200 up_write(&current->mm->mmap_sem);
201 }
202
203 return ret;
204}
205EXPORT_SYMBOL(vmalloc_user);
206
207struct page *vmalloc_to_page(const void *addr)
187{ 208{
188 return virt_to_page(addr); 209 return virt_to_page(addr);
189} 210}
190EXPORT_SYMBOL(vmalloc_to_page); 211EXPORT_SYMBOL(vmalloc_to_page);
191 212
192unsigned long vmalloc_to_pfn(void *addr) 213unsigned long vmalloc_to_pfn(const void *addr)
193{ 214{
194 return page_to_pfn(virt_to_page(addr)); 215 return page_to_pfn(virt_to_page(addr));
195} 216}
@@ -253,10 +274,17 @@ EXPORT_SYMBOL(vmalloc_32);
253 * 274 *
254 * The resulting memory area is 32bit addressable and zeroed so it can be 275 * The resulting memory area is 32bit addressable and zeroed so it can be
255 * mapped to userspace without leaking data. 276 * mapped to userspace without leaking data.
277 *
278 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
279 * remap_vmalloc_range() are permissible.
256 */ 280 */
257void *vmalloc_32_user(unsigned long size) 281void *vmalloc_32_user(unsigned long size)
258{ 282{
259 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 283 /*
284 * We'll have to sort out the ZONE_DMA bits for 64-bit,
285 * but for now this can simply use vmalloc_user() directly.
286 */
287 return vmalloc_user(size);
260} 288}
261EXPORT_SYMBOL(vmalloc_32_user); 289EXPORT_SYMBOL(vmalloc_32_user);
262 290
@@ -267,7 +295,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_
267} 295}
268EXPORT_SYMBOL(vmap); 296EXPORT_SYMBOL(vmap);
269 297
270void vunmap(void *addr) 298void vunmap(const void *addr)
271{ 299{
272 BUG(); 300 BUG();
273} 301}
@@ -1216,6 +1244,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1216} 1244}
1217EXPORT_SYMBOL(remap_pfn_range); 1245EXPORT_SYMBOL(remap_pfn_range);
1218 1246
1247int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1248 unsigned long pgoff)
1249{
1250 unsigned int size = vma->vm_end - vma->vm_start;
1251
1252 if (!(vma->vm_flags & VM_USERMAP))
1253 return -EINVAL;
1254
1255 vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1256 vma->vm_end = vma->vm_start + size;
1257
1258 return 0;
1259}
1260EXPORT_SYMBOL(remap_vmalloc_range);
1261
1219void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1262void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1220{ 1263{
1221} 1264}
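
The vmalloc_user()/remap_vmalloc_range() pairing added above is meant for drivers that expose a vmalloc'd buffer to userspace through mmap: vmalloc_user() zeroes the buffer and flags it VM_USERMAP, which is what remap_vmalloc_range() checks before remapping. A minimal sketch of such an mmap handler (demo_buf, demo_mmap and the surrounding driver are hypothetical; a real driver would allocate at open or probe time):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

#define DEMO_BUF_SIZE	(64 * 1024)

static void *demo_buf;

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!demo_buf)
		demo_buf = vmalloc_user(DEMO_BUF_SIZE);
	if (!demo_buf)
		return -ENOMEM;

	if (vma->vm_end - vma->vm_start > DEMO_BUF_SIZE)
		return -EINVAL;

	/* Requires the VM_USERMAP flag set up by vmalloc_user()/vmalloc_32_user(). */
	return remap_vmalloc_range(vma, demo_buf, vma->vm_pgoff);
}

static const struct file_operations demo_fops = {
	.mmap	= demo_mmap,
};
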
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 96473b482099..4194b9db0104 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -25,9 +25,11 @@
25#include <linux/cpuset.h> 25#include <linux/cpuset.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28#include <linux/memcontrol.h>
28 29
29int sysctl_panic_on_oom; 30int sysctl_panic_on_oom;
30int sysctl_oom_kill_allocating_task; 31int sysctl_oom_kill_allocating_task;
32int sysctl_oom_dump_tasks;
31static DEFINE_SPINLOCK(zone_scan_mutex); 33static DEFINE_SPINLOCK(zone_scan_mutex);
32/* #define DEBUG */ 34/* #define DEBUG */
33 35
@@ -50,7 +52,8 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
50 * of least surprise ... (be careful when you change it) 52 * of least surprise ... (be careful when you change it)
51 */ 53 */
52 54
53unsigned long badness(struct task_struct *p, unsigned long uptime) 55unsigned long badness(struct task_struct *p, unsigned long uptime,
56 struct mem_cgroup *mem)
54{ 57{
55 unsigned long points, cpu_time, run_time, s; 58 unsigned long points, cpu_time, run_time, s;
56 struct mm_struct *mm; 59 struct mm_struct *mm;
@@ -125,8 +128,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
125 * Superuser processes are usually more important, so we make it 128 * Superuser processes are usually more important, so we make it
126 * less likely that we kill those. 129 * less likely that we kill those.
127 */ 130 */
128 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || 131 if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
129 p->uid == 0 || p->euid == 0)
130 points /= 4; 132 points /= 4;
131 133
132 /* 134 /*
@@ -135,7 +137,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
135 * tend to only have this flag set on applications they think 137 * tend to only have this flag set on applications they think
136 * of as important. 138 * of as important.
137 */ 139 */
138 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) 140 if (__capable(p, CAP_SYS_RAWIO))
139 points /= 4; 141 points /= 4;
140 142
141 /* 143 /*
@@ -194,7 +196,8 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
194 * 196 *
195 * (not docbooked, we don't want this one cluttering up the manual) 197 * (not docbooked, we don't want this one cluttering up the manual)
196 */ 198 */
197static struct task_struct *select_bad_process(unsigned long *ppoints) 199static struct task_struct *select_bad_process(unsigned long *ppoints,
200 struct mem_cgroup *mem)
198{ 201{
199 struct task_struct *g, *p; 202 struct task_struct *g, *p;
200 struct task_struct *chosen = NULL; 203 struct task_struct *chosen = NULL;
@@ -214,6 +217,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
214 /* skip the init task */ 217 /* skip the init task */
215 if (is_global_init(p)) 218 if (is_global_init(p))
216 continue; 219 continue;
220 if (mem && !task_in_mem_cgroup(p, mem))
221 continue;
217 222
218 /* 223 /*
219 * This task already has access to memory reserves and is 224 * This task already has access to memory reserves and is
@@ -248,7 +253,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
248 if (p->oomkilladj == OOM_DISABLE) 253 if (p->oomkilladj == OOM_DISABLE)
249 continue; 254 continue;
250 255
251 points = badness(p, uptime.tv_sec); 256 points = badness(p, uptime.tv_sec, mem);
252 if (points > *ppoints || !chosen) { 257 if (points > *ppoints || !chosen) {
253 chosen = p; 258 chosen = p;
254 *ppoints = points; 259 *ppoints = points;
@@ -259,6 +264,41 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
259} 264}
260 265
261/** 266/**
267 * Dumps the current memory state of all system tasks, excluding kernel threads.
268 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
269 * score, and name.
270 *
271 * If mem is non-NULL, only tasks that are members of that mem_cgroup are
272 * shown.
273 *
274 * Call with tasklist_lock read-locked.
275 */
276static void dump_tasks(const struct mem_cgroup *mem)
277{
278 struct task_struct *g, *p;
279
280 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
281 "name\n");
282 do_each_thread(g, p) {
283 /*
284 * total_vm and rss sizes do not exist for tasks with a
285 * detached mm so there's no need to report them.
286 */
287 if (!p->mm)
288 continue;
289 if (mem && !task_in_mem_cgroup(p, mem))
290 continue;
291
292 task_lock(p);
293 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
294 p->pid, p->uid, p->tgid, p->mm->total_vm,
295 get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj,
296 p->comm);
297 task_unlock(p);
298 } while_each_thread(g, p);
299}
300
301/**
262 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO 302 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
263 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO 303 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
264 * set. 304 * set.
@@ -335,7 +375,8 @@ static int oom_kill_task(struct task_struct *p)
335} 375}
336 376
337static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 377static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
338 unsigned long points, const char *message) 378 unsigned long points, struct mem_cgroup *mem,
379 const char *message)
339{ 380{
340 struct task_struct *c; 381 struct task_struct *c;
341 382
@@ -345,6 +386,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
345 current->comm, gfp_mask, order, current->oomkilladj); 386 current->comm, gfp_mask, order, current->oomkilladj);
346 dump_stack(); 387 dump_stack();
347 show_mem(); 388 show_mem();
389 if (sysctl_oom_dump_tasks)
390 dump_tasks(mem);
348 } 391 }
349 392
350 /* 393 /*
@@ -369,6 +412,31 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
369 return oom_kill_task(p); 412 return oom_kill_task(p);
370} 413}
371 414
415#ifdef CONFIG_CGROUP_MEM_CONT
416void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
417{
418 unsigned long points = 0;
419 struct task_struct *p;
420
421 cgroup_lock();
422 rcu_read_lock();
423retry:
424 p = select_bad_process(&points, mem);
425 if (PTR_ERR(p) == -1UL)
426 goto out;
427
428 if (!p)
429 p = current;
430
431 if (oom_kill_process(p, gfp_mask, 0, points, mem,
432 "Memory cgroup out of memory"))
433 goto retry;
434out:
435 rcu_read_unlock();
436 cgroup_unlock();
437}
438#endif
439
372static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 440static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
373 441
374int register_oom_notifier(struct notifier_block *nb) 442int register_oom_notifier(struct notifier_block *nb)
@@ -466,7 +534,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
466 534
467 switch (constraint) { 535 switch (constraint) {
468 case CONSTRAINT_MEMORY_POLICY: 536 case CONSTRAINT_MEMORY_POLICY:
469 oom_kill_process(current, gfp_mask, order, points, 537 oom_kill_process(current, gfp_mask, order, points, NULL,
470 "No available memory (MPOL_BIND)"); 538 "No available memory (MPOL_BIND)");
471 break; 539 break;
472 540
@@ -476,7 +544,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
476 /* Fall-through */ 544 /* Fall-through */
477 case CONSTRAINT_CPUSET: 545 case CONSTRAINT_CPUSET:
478 if (sysctl_oom_kill_allocating_task) { 546 if (sysctl_oom_kill_allocating_task) {
479 oom_kill_process(current, gfp_mask, order, points, 547 oom_kill_process(current, gfp_mask, order, points, NULL,
480 "Out of memory (oom_kill_allocating_task)"); 548 "Out of memory (oom_kill_allocating_task)");
481 break; 549 break;
482 } 550 }
@@ -485,7 +553,7 @@ retry:
485 * Rambo mode: Shoot down a process and hope it solves whatever 553 * Rambo mode: Shoot down a process and hope it solves whatever
486 * issues we may have. 554 * issues we may have.
487 */ 555 */
488 p = select_bad_process(&points); 556 p = select_bad_process(&points, NULL);
489 557
490 if (PTR_ERR(p) == -1UL) 558 if (PTR_ERR(p) == -1UL)
491 goto out; 559 goto out;
@@ -496,7 +564,7 @@ retry:
496 panic("Out of memory and no killable processes...\n"); 564 panic("Out of memory and no killable processes...\n");
497 } 565 }
498 566
499 if (oom_kill_process(p, gfp_mask, order, points, 567 if (oom_kill_process(p, gfp_mask, order, points, NULL,
500 "Out of memory")) 568 "Out of memory"))
501 goto retry; 569 goto retry;
502 570
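
The new sysctl_oom_dump_tasks switch is wired up as a vm.* sysctl outside this diff; assuming the usual naming it shows up as /proc/sys/vm/oom_dump_tasks. A small userspace sketch that turns it on and reads it back:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/vm/oom_dump_tasks";	/* assumed sysctl path */
	FILE *f;
	int val = -1;

	f = fopen(path, "w");
	if (f) {
		fputs("1\n", f);	/* dump the per-task table on every OOM kill */
		fclose(f);
	} else {
		perror(path);
	}

	f = fopen(path, "r");
	if (f) {
		if (fscanf(f, "%d", &val) == 1)
			printf("oom_dump_tasks = %d\n", val);
		fclose(f);
	}
	return 0;
}
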
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3d3848fa6324..5e00f1772c20 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
69int dirty_background_ratio = 5; 69int dirty_background_ratio = 5;
70 70
71/* 71/*
72 * free highmem will not be subtracted from the total free memory
73 * for calculating free ratios if vm_highmem_is_dirtyable is true
74 */
75int vm_highmem_is_dirtyable;
76
77/*
72 * The generator of dirty data starts writeback at this percentage 78 * The generator of dirty data starts writeback at this percentage
73 */ 79 */
74int vm_dirty_ratio = 10; 80int vm_dirty_ratio = 10;
@@ -219,7 +225,7 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
219 * 225 *
220 * dirty -= (dirty/8) * p_{t} 226 * dirty -= (dirty/8) * p_{t}
221 */ 227 */
222void task_dirty_limit(struct task_struct *tsk, long *pdirty) 228static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
223{ 229{
224 long numerator, denominator; 230 long numerator, denominator;
225 long dirty = *pdirty; 231 long dirty = *pdirty;
@@ -287,7 +293,10 @@ static unsigned long determine_dirtyable_memory(void)
287 x = global_page_state(NR_FREE_PAGES) 293 x = global_page_state(NR_FREE_PAGES)
288 + global_page_state(NR_INACTIVE) 294 + global_page_state(NR_INACTIVE)
289 + global_page_state(NR_ACTIVE); 295 + global_page_state(NR_ACTIVE);
290 x -= highmem_dirtyable_memory(x); 296
297 if (!vm_highmem_is_dirtyable)
298 x -= highmem_dirtyable_memory(x);
299
291 return x + 1; /* Ensure that we never return 0 */ 300 return x + 1; /* Ensure that we never return 0 */
292} 301}
293 302
@@ -558,6 +567,7 @@ static void background_writeout(unsigned long _min_pages)
558 global_page_state(NR_UNSTABLE_NFS) < background_thresh 567 global_page_state(NR_UNSTABLE_NFS) < background_thresh
559 && min_pages <= 0) 568 && min_pages <= 0)
560 break; 569 break;
570 wbc.more_io = 0;
561 wbc.encountered_congestion = 0; 571 wbc.encountered_congestion = 0;
562 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 572 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
563 wbc.pages_skipped = 0; 573 wbc.pages_skipped = 0;
@@ -565,8 +575,9 @@ static void background_writeout(unsigned long _min_pages)
565 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 575 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
566 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 576 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
567 /* Wrote less than expected */ 577 /* Wrote less than expected */
568 congestion_wait(WRITE, HZ/10); 578 if (wbc.encountered_congestion || wbc.more_io)
569 if (!wbc.encountered_congestion) 579 congestion_wait(WRITE, HZ/10);
580 else
570 break; 581 break;
571 } 582 }
572 } 583 }
@@ -631,11 +642,12 @@ static void wb_kupdate(unsigned long arg)
631 global_page_state(NR_UNSTABLE_NFS) + 642 global_page_state(NR_UNSTABLE_NFS) +
632 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 643 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
633 while (nr_to_write > 0) { 644 while (nr_to_write > 0) {
645 wbc.more_io = 0;
634 wbc.encountered_congestion = 0; 646 wbc.encountered_congestion = 0;
635 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 647 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
636 writeback_inodes(&wbc); 648 writeback_inodes(&wbc);
637 if (wbc.nr_to_write > 0) { 649 if (wbc.nr_to_write > 0) {
638 if (wbc.encountered_congestion) 650 if (wbc.encountered_congestion || wbc.more_io)
639 congestion_wait(WRITE, HZ/10); 651 congestion_wait(WRITE, HZ/10);
640 else 652 else
641 break; /* All the old data is written */ 653 break; /* All the old data is written */
@@ -1064,7 +1076,7 @@ static int __set_page_dirty(struct page *page)
1064 return 0; 1076 return 0;
1065} 1077}
1066 1078
1067int fastcall set_page_dirty(struct page *page) 1079int set_page_dirty(struct page *page)
1068{ 1080{
1069 int ret = __set_page_dirty(page); 1081 int ret = __set_page_dirty(page);
1070 if (ret) 1082 if (ret)
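
determine_dirtyable_memory() above now counts free + inactive + active pages and only subtracts the highmem share when vm_highmem_is_dirtyable is left at 0. A rough userspace approximation of the same sum, reading /proc/vmstat (the counter names nr_free_pages, nr_inactive and nr_active are an assumption tied to this kernel generation, and the highmem correction is skipped):

#include <stdio.h>
#include <string.h>

static unsigned long vmstat_read(const char *key)
{
	char name[64];
	unsigned long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 0;
	while (fscanf(f, "%63s %lu", name, &val) == 2) {
		if (!strcmp(name, key)) {
			fclose(f);
			return val;
		}
	}
	fclose(f);
	return 0;
}

int main(void)
{
	unsigned long dirtyable = vmstat_read("nr_free_pages") +
				  vmstat_read("nr_inactive") +
				  vmstat_read("nr_active") + 1;

	/* The kernel would further subtract highmem here unless
	 * vm.highmem_is_dirtyable is set. */
	printf("approx. dirtyable pages: %lu\n", dirtyable);
	return 0;
}
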
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b2838c24e582..26a54a17dc9f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
43#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
44#include <linux/fault-inject.h> 44#include <linux/fault-inject.h>
45#include <linux/page-isolation.h> 45#include <linux/page-isolation.h>
46#include <linux/memcontrol.h>
46 47
47#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
48#include <asm/div64.h> 49#include <asm/div64.h>
@@ -537,7 +538,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
537/* 538/*
538 * permit the bootmem allocator to evade page validation on high-order frees 539 * permit the bootmem allocator to evade page validation on high-order frees
539 */ 540 */
540void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) 541void __init __free_pages_bootmem(struct page *page, unsigned int order)
541{ 542{
542 if (order == 0) { 543 if (order == 0) {
543 __ClearPageReserved(page); 544 __ClearPageReserved(page);
@@ -890,31 +891,51 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
890} 891}
891#endif 892#endif
892 893
893static void __drain_pages(unsigned int cpu) 894/*
895 * Drain pages of the indicated processor.
896 *
897 * The processor must either be the current processor and the
898 * thread pinned to the current processor or a processor that
899 * is not online.
900 */
901static void drain_pages(unsigned int cpu)
894{ 902{
895 unsigned long flags; 903 unsigned long flags;
896 struct zone *zone; 904 struct zone *zone;
897 int i;
898 905
899 for_each_zone(zone) { 906 for_each_zone(zone) {
900 struct per_cpu_pageset *pset; 907 struct per_cpu_pageset *pset;
908 struct per_cpu_pages *pcp;
901 909
902 if (!populated_zone(zone)) 910 if (!populated_zone(zone))
903 continue; 911 continue;
904 912
905 pset = zone_pcp(zone, cpu); 913 pset = zone_pcp(zone, cpu);
906 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 914
907 struct per_cpu_pages *pcp; 915 pcp = &pset->pcp;
908 916 local_irq_save(flags);
909 pcp = &pset->pcp[i]; 917 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
910 local_irq_save(flags); 918 pcp->count = 0;
911 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 919 local_irq_restore(flags);
912 pcp->count = 0;
913 local_irq_restore(flags);
914 }
915 } 920 }
916} 921}
917 922
923/*
924 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
925 */
926void drain_local_pages(void *arg)
927{
928 drain_pages(smp_processor_id());
929}
930
931/*
932 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
933 */
934void drain_all_pages(void)
935{
936 on_each_cpu(drain_local_pages, NULL, 0, 1);
937}
938
918#ifdef CONFIG_HIBERNATION 939#ifdef CONFIG_HIBERNATION
919 940
920void mark_free_pages(struct zone *zone) 941void mark_free_pages(struct zone *zone)
@@ -952,40 +973,9 @@ void mark_free_pages(struct zone *zone)
952#endif /* CONFIG_PM */ 973#endif /* CONFIG_PM */
953 974
954/* 975/*
955 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
956 */
957void drain_local_pages(void)
958{
959 unsigned long flags;
960
961 local_irq_save(flags);
962 __drain_pages(smp_processor_id());
963 local_irq_restore(flags);
964}
965
966void smp_drain_local_pages(void *arg)
967{
968 drain_local_pages();
969}
970
971/*
972 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
973 */
974void drain_all_local_pages(void)
975{
976 unsigned long flags;
977
978 local_irq_save(flags);
979 __drain_pages(smp_processor_id());
980 local_irq_restore(flags);
981
982 smp_call_function(smp_drain_local_pages, NULL, 0, 1);
983}
984
985/*
986 * Free a 0-order page 976 * Free a 0-order page
987 */ 977 */
988static void fastcall free_hot_cold_page(struct page *page, int cold) 978static void free_hot_cold_page(struct page *page, int cold)
989{ 979{
990 struct zone *zone = page_zone(page); 980 struct zone *zone = page_zone(page);
991 struct per_cpu_pages *pcp; 981 struct per_cpu_pages *pcp;
@@ -998,13 +988,17 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
998 988
999 if (!PageHighMem(page)) 989 if (!PageHighMem(page))
1000 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 990 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
991 VM_BUG_ON(page_get_page_cgroup(page));
1001 arch_free_page(page, 0); 992 arch_free_page(page, 0);
1002 kernel_map_pages(page, 1, 0); 993 kernel_map_pages(page, 1, 0);
1003 994
1004 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 995 pcp = &zone_pcp(zone, get_cpu())->pcp;
1005 local_irq_save(flags); 996 local_irq_save(flags);
1006 __count_vm_event(PGFREE); 997 __count_vm_event(PGFREE);
1007 list_add(&page->lru, &pcp->list); 998 if (cold)
999 list_add_tail(&page->lru, &pcp->list);
1000 else
1001 list_add(&page->lru, &pcp->list);
1008 set_page_private(page, get_pageblock_migratetype(page)); 1002 set_page_private(page, get_pageblock_migratetype(page));
1009 pcp->count++; 1003 pcp->count++;
1010 if (pcp->count >= pcp->high) { 1004 if (pcp->count >= pcp->high) {
@@ -1015,12 +1009,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
1015 put_cpu(); 1009 put_cpu();
1016} 1010}
1017 1011
1018void fastcall free_hot_page(struct page *page) 1012void free_hot_page(struct page *page)
1019{ 1013{
1020 free_hot_cold_page(page, 0); 1014 free_hot_cold_page(page, 0);
1021} 1015}
1022 1016
1023void fastcall free_cold_page(struct page *page) 1017void free_cold_page(struct page *page)
1024{ 1018{
1025 free_hot_cold_page(page, 1); 1019 free_hot_cold_page(page, 1);
1026} 1020}
@@ -1062,7 +1056,7 @@ again:
1062 if (likely(order == 0)) { 1056 if (likely(order == 0)) {
1063 struct per_cpu_pages *pcp; 1057 struct per_cpu_pages *pcp;
1064 1058
1065 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 1059 pcp = &zone_pcp(zone, cpu)->pcp;
1066 local_irq_save(flags); 1060 local_irq_save(flags);
1067 if (!pcp->count) { 1061 if (!pcp->count) {
1068 pcp->count = rmqueue_bulk(zone, 0, 1062 pcp->count = rmqueue_bulk(zone, 0,
@@ -1072,9 +1066,15 @@ again:
1072 } 1066 }
1073 1067
1074 /* Find a page of the appropriate migrate type */ 1068 /* Find a page of the appropriate migrate type */
1075 list_for_each_entry(page, &pcp->list, lru) 1069 if (cold) {
1076 if (page_private(page) == migratetype) 1070 list_for_each_entry_reverse(page, &pcp->list, lru)
1077 break; 1071 if (page_private(page) == migratetype)
1072 break;
1073 } else {
1074 list_for_each_entry(page, &pcp->list, lru)
1075 if (page_private(page) == migratetype)
1076 break;
1077 }
1078 1078
1079 /* Allocate more to the pcp list if necessary */ 1079 /* Allocate more to the pcp list if necessary */
1080 if (unlikely(&page->lru == &pcp->list)) { 1080 if (unlikely(&page->lru == &pcp->list)) {
@@ -1569,7 +1569,7 @@ nofail_alloc:
1569 cond_resched(); 1569 cond_resched();
1570 1570
1571 if (order != 0) 1571 if (order != 0)
1572 drain_all_local_pages(); 1572 drain_all_pages();
1573 1573
1574 if (likely(did_some_progress)) { 1574 if (likely(did_some_progress)) {
1575 page = get_page_from_freelist(gfp_mask, order, 1575 page = get_page_from_freelist(gfp_mask, order,
@@ -1643,7 +1643,7 @@ EXPORT_SYMBOL(__alloc_pages);
1643/* 1643/*
1644 * Common helper functions. 1644 * Common helper functions.
1645 */ 1645 */
1646fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1646unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1647{ 1647{
1648 struct page * page; 1648 struct page * page;
1649 page = alloc_pages(gfp_mask, order); 1649 page = alloc_pages(gfp_mask, order);
@@ -1654,7 +1654,7 @@ fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1654 1654
1655EXPORT_SYMBOL(__get_free_pages); 1655EXPORT_SYMBOL(__get_free_pages);
1656 1656
1657fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1657unsigned long get_zeroed_page(gfp_t gfp_mask)
1658{ 1658{
1659 struct page * page; 1659 struct page * page;
1660 1660
@@ -1680,7 +1680,7 @@ void __pagevec_free(struct pagevec *pvec)
1680 free_hot_cold_page(pvec->pages[i], pvec->cold); 1680 free_hot_cold_page(pvec->pages[i], pvec->cold);
1681} 1681}
1682 1682
1683fastcall void __free_pages(struct page *page, unsigned int order) 1683void __free_pages(struct page *page, unsigned int order)
1684{ 1684{
1685 if (put_page_testzero(page)) { 1685 if (put_page_testzero(page)) {
1686 if (order == 0) 1686 if (order == 0)
@@ -1692,7 +1692,7 @@ fastcall void __free_pages(struct page *page, unsigned int order)
1692 1692
1693EXPORT_SYMBOL(__free_pages); 1693EXPORT_SYMBOL(__free_pages);
1694 1694
1695fastcall void free_pages(unsigned long addr, unsigned int order) 1695void free_pages(unsigned long addr, unsigned int order)
1696{ 1696{
1697 if (addr != 0) { 1697 if (addr != 0) {
1698 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1698 VM_BUG_ON(!virt_addr_valid((void *)addr));
@@ -1801,12 +1801,9 @@ void show_free_areas(void)
1801 1801
1802 pageset = zone_pcp(zone, cpu); 1802 pageset = zone_pcp(zone, cpu);
1803 1803
1804 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " 1804 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
1805 "Cold: hi:%5d, btch:%4d usd:%4d\n", 1805 cpu, pageset->pcp.high,
1806 cpu, pageset->pcp[0].high, 1806 pageset->pcp.batch, pageset->pcp.count);
1807 pageset->pcp[0].batch, pageset->pcp[0].count,
1808 pageset->pcp[1].high, pageset->pcp[1].batch,
1809 pageset->pcp[1].count);
1810 } 1807 }
1811 } 1808 }
1812 1809
@@ -1879,6 +1876,8 @@ void show_free_areas(void)
1879 printk("= %lukB\n", K(total)); 1876 printk("= %lukB\n", K(total));
1880 } 1877 }
1881 1878
1879 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
1880
1882 show_swap_cache_info(); 1881 show_swap_cache_info();
1883} 1882}
1884 1883
@@ -2528,6 +2527,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2528 set_page_links(page, zone, nid, pfn); 2527 set_page_links(page, zone, nid, pfn);
2529 init_page_count(page); 2528 init_page_count(page);
2530 reset_page_mapcount(page); 2529 reset_page_mapcount(page);
2530 page_assign_page_cgroup(page, NULL);
2531 SetPageReserved(page); 2531 SetPageReserved(page);
2532 2532
2533 /* 2533 /*
@@ -2551,8 +2551,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2551 } 2551 }
2552} 2552}
2553 2553
2554static void __meminit zone_init_free_lists(struct pglist_data *pgdat, 2554static void __meminit zone_init_free_lists(struct zone *zone)
2555 struct zone *zone, unsigned long size)
2556{ 2555{
2557 int order, t; 2556 int order, t;
2558 for_each_migratetype_order(order, t) { 2557 for_each_migratetype_order(order, t) {
@@ -2604,17 +2603,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2604 2603
2605 memset(p, 0, sizeof(*p)); 2604 memset(p, 0, sizeof(*p));
2606 2605
2607 pcp = &p->pcp[0]; /* hot */ 2606 pcp = &p->pcp;
2608 pcp->count = 0; 2607 pcp->count = 0;
2609 pcp->high = 6 * batch; 2608 pcp->high = 6 * batch;
2610 pcp->batch = max(1UL, 1 * batch); 2609 pcp->batch = max(1UL, 1 * batch);
2611 INIT_LIST_HEAD(&pcp->list); 2610 INIT_LIST_HEAD(&pcp->list);
2612
2613 pcp = &p->pcp[1]; /* cold*/
2614 pcp->count = 0;
2615 pcp->high = 2 * batch;
2616 pcp->batch = max(1UL, batch/2);
2617 INIT_LIST_HEAD(&pcp->list);
2618} 2611}
2619 2612
2620/* 2613/*
@@ -2627,7 +2620,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2627{ 2620{
2628 struct per_cpu_pages *pcp; 2621 struct per_cpu_pages *pcp;
2629 2622
2630 pcp = &p->pcp[0]; /* hot list */ 2623 pcp = &p->pcp;
2631 pcp->high = high; 2624 pcp->high = high;
2632 pcp->batch = max(1UL, high/4); 2625 pcp->batch = max(1UL, high/4);
2633 if ((high/4) > (PAGE_SHIFT * 8)) 2626 if ((high/4) > (PAGE_SHIFT * 8))
@@ -2831,7 +2824,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2831 2824
2832 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2825 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2833 2826
2834 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2827 zone_init_free_lists(zone);
2835 2828
2836 return 0; 2829 return 0;
2837} 2830}
@@ -3978,10 +3971,23 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
3978 int cpu = (unsigned long)hcpu; 3971 int cpu = (unsigned long)hcpu;
3979 3972
3980 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3973 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3981 local_irq_disable(); 3974 drain_pages(cpu);
3982 __drain_pages(cpu); 3975
3976 /*
3977 * Spill the event counters of the dead processor
3978 * into the current processors event counters.
3979 * This artificially elevates the count of the current
3980 * processor.
3981 */
3983 vm_events_fold_cpu(cpu); 3982 vm_events_fold_cpu(cpu);
3984 local_irq_enable(); 3983
3984 /*
3985 * Zero the differential counters of the dead processor
3986 * so that the vm statistics are consistent.
3987 *
3988 * This is only okay since the processor is dead and cannot
3989 * race with what we are doing.
3990 */
3985 refresh_cpu_vm_stats(cpu); 3991 refresh_cpu_vm_stats(cpu);
3986 } 3992 }
3987 return NOTIFY_OK; 3993 return NOTIFY_OK;
@@ -4480,7 +4486,7 @@ int set_migratetype_isolate(struct page *page)
4480out: 4486out:
4481 spin_unlock_irqrestore(&zone->lock, flags); 4487 spin_unlock_irqrestore(&zone->lock, flags);
4482 if (!ret) 4488 if (!ret)
4483 drain_all_local_pages(); 4489 drain_all_pages();
4484 return ret; 4490 return ret;
4485} 4491}
4486 4492
diff --git a/mm/page_io.c b/mm/page_io.c
index 3b97f6850273..065c4480eaf0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page)
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 BUG_ON(!PageLocked(page));
129 ClearPageUptodate(page); 129 BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
new file mode 100644
index 000000000000..b4f27d22da91
--- /dev/null
+++ b/mm/pagewalk.c
@@ -0,0 +1,131 @@
1#include <linux/mm.h>
2#include <linux/highmem.h>
3#include <linux/sched.h>
4
5static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
6 const struct mm_walk *walk, void *private)
7{
8 pte_t *pte;
9 int err = 0;
10
11 pte = pte_offset_map(pmd, addr);
12 do {
13 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
14 if (err)
15 break;
16 } while (pte++, addr += PAGE_SIZE, addr != end);
17
18 pte_unmap(pte);
19 return err;
20}
21
22static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
23 const struct mm_walk *walk, void *private)
24{
25 pmd_t *pmd;
26 unsigned long next;
27 int err = 0;
28
29 pmd = pmd_offset(pud, addr);
30 do {
31 next = pmd_addr_end(addr, end);
32 if (pmd_none_or_clear_bad(pmd)) {
33 if (walk->pte_hole)
34 err = walk->pte_hole(addr, next, private);
35 if (err)
36 break;
37 continue;
38 }
39 if (walk->pmd_entry)
40 err = walk->pmd_entry(pmd, addr, next, private);
41 if (!err && walk->pte_entry)
42 err = walk_pte_range(pmd, addr, next, walk, private);
43 if (err)
44 break;
45 } while (pmd++, addr = next, addr != end);
46
47 return err;
48}
49
50static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
51 const struct mm_walk *walk, void *private)
52{
53 pud_t *pud;
54 unsigned long next;
55 int err = 0;
56
57 pud = pud_offset(pgd, addr);
58 do {
59 next = pud_addr_end(addr, end);
60 if (pud_none_or_clear_bad(pud)) {
61 if (walk->pte_hole)
62 err = walk->pte_hole(addr, next, private);
63 if (err)
64 break;
65 continue;
66 }
67 if (walk->pud_entry)
68 err = walk->pud_entry(pud, addr, next, private);
69 if (!err && (walk->pmd_entry || walk->pte_entry))
70 err = walk_pmd_range(pud, addr, next, walk, private);
71 if (err)
72 break;
73 } while (pud++, addr = next, addr != end);
74
75 return err;
76}
77
78/**
79 * walk_page_range - walk a memory map's page tables with a callback
80 * @mm - memory map to walk
81 * @addr - starting address
82 * @end - ending address
83 * @walk - set of callbacks to invoke for each level of the tree
84 * @private - private data passed to the callback function
85 *
86 * Recursively walk the page table for the memory area in a VMA,
87 * calling supplied callbacks. Callbacks are called in-order (first
88 * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
89 * etc.). If lower-level callbacks are omitted, walking depth is reduced.
90 *
91 * Each callback receives an entry pointer, the start and end of the
92 * associated range, and a caller-supplied private data pointer.
93 *
94 * No locks are taken, but the bottom level iterator will map PTE
95 * directories from highmem if necessary.
96 *
97 * If any callback returns a non-zero value, the walk is aborted and
98 * the return value is propagated back to the caller. Otherwise 0 is returned.
99 */
100int walk_page_range(const struct mm_struct *mm,
101 unsigned long addr, unsigned long end,
102 const struct mm_walk *walk, void *private)
103{
104 pgd_t *pgd;
105 unsigned long next;
106 int err = 0;
107
108 if (addr >= end)
109 return err;
110
111 pgd = pgd_offset(mm, addr);
112 do {
113 next = pgd_addr_end(addr, end);
114 if (pgd_none_or_clear_bad(pgd)) {
115 if (walk->pte_hole)
116 err = walk->pte_hole(addr, next, private);
117 if (err)
118 break;
119 continue;
120 }
121 if (walk->pgd_entry)
122 err = walk->pgd_entry(pgd, addr, next, private);
123 if (!err &&
124 (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
125 err = walk_pud_range(pgd, addr, next, walk, private);
126 if (err)
127 break;
128 } while (pgd++, addr = next, addr != end);
129
130 return err;
131}
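[Editor's note] walk_page_range() above descends the page-table tree level by level, calls whichever per-level callback is supplied, reports unmapped ranges through pte_hole, and aborts the walk as soon as any callback returns non-zero. As a rough userspace analogue of that shape (not kernel code; the table layout and every name below are invented), a two-level table walk with entry and hole callbacks looks like this:

#include <stdio.h>
#include <stddef.h>

#define DIR_ENTRIES  4
#define LEAF_ENTRIES 4

/* Invented callback set, loosely shaped like the mm_walk callbacks above. */
struct table_walk {
	int (*leaf_entry)(long value, size_t index, void *private);
	int (*hole)(size_t first, size_t last, void *private);
};

/* Walk dir[] (a directory of leaf tables); NULL directory slots are holes. */
static int walk_table(long *dir[DIR_ENTRIES], const struct table_walk *walk,
		      void *private)
{
	int err = 0;
	size_t d, l;

	for (d = 0; d < DIR_ENTRIES && !err; d++) {
		size_t base = d * LEAF_ENTRIES;

		if (!dir[d]) {			/* hole: no leaf table here */
			if (walk->hole)
				err = walk->hole(base, base + LEAF_ENTRIES - 1,
						 private);
			continue;
		}
		for (l = 0; l < LEAF_ENTRIES && !err; l++)
			err = walk->leaf_entry(dir[d][l], base + l, private);
	}
	return err;	/* first non-zero callback result, or 0 on completion */
}

static int print_entry(long value, size_t index, void *private)
{
	(void)private;
	printf("entry %zu = %ld\n", index, value);
	return 0;	/* returning non-zero here would abort the walk */
}

static int print_hole(size_t first, size_t last, void *private)
{
	(void)private;
	printf("hole  %zu..%zu\n", first, last);
	return 0;
}

int main(void)
{
	long leaf0[LEAF_ENTRIES] = { 1, 2, 3, 4 };
	long leaf2[LEAF_ENTRIES] = { 9, 8, 7, 6 };
	long *dir[DIR_ENTRIES] = { leaf0, NULL, leaf2, NULL };
	struct table_walk walk = { .leaf_entry = print_entry, .hole = print_hole };

	return walk_table(dir, &walk, NULL);
}

As in the kernel version, omitting a lower-level callback simply stops the descent early, and error propagation is uniform: whatever the first failing callback returns is what the top-level caller sees.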
diff --git a/mm/rmap.c b/mm/rmap.c
index dbc2ca2057a5..a0e92a263d12 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,7 +36,6 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 * zone->lock (within radix tree node alloc)
40 */ 39 */
41 40
42#include <linux/mm.h> 41#include <linux/mm.h>
@@ -49,6 +48,7 @@
49#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
50#include <linux/module.h> 49#include <linux/module.h>
51#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h>
52 52
53#include <asm/tlbflush.h> 53#include <asm/tlbflush.h>
54 54
@@ -284,7 +284,10 @@ static int page_referenced_one(struct page *page,
284 if (!pte) 284 if (!pte)
285 goto out; 285 goto out;
286 286
287 if (ptep_clear_flush_young(vma, address, pte)) 287 if (vma->vm_flags & VM_LOCKED) {
288 referenced++;
289 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte))
288 referenced++; 291 referenced++;
289 292
290 /* Pretend the page is referenced if the task has the 293 /* Pretend the page is referenced if the task has the
@@ -299,7 +302,8 @@ out:
299 return referenced; 302 return referenced;
300} 303}
301 304
302static int page_referenced_anon(struct page *page) 305static int page_referenced_anon(struct page *page,
306 struct mem_cgroup *mem_cont)
303{ 307{
304 unsigned int mapcount; 308 unsigned int mapcount;
305 struct anon_vma *anon_vma; 309 struct anon_vma *anon_vma;
@@ -312,6 +316,13 @@ static int page_referenced_anon(struct page *page)
312 316
313 mapcount = page_mapcount(page); 317 mapcount = page_mapcount(page);
314 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 318 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
319 /*
320 * If we are reclaiming on behalf of a cgroup, skip
321 * counting on behalf of references from different
322 * cgroups
323 */
324 if (mem_cont && (mm_cgroup(vma->vm_mm) != mem_cont))
325 continue;
315 referenced += page_referenced_one(page, vma, &mapcount); 326 referenced += page_referenced_one(page, vma, &mapcount);
316 if (!mapcount) 327 if (!mapcount)
317 break; 328 break;
@@ -332,7 +343,8 @@ static int page_referenced_anon(struct page *page)
332 * 343 *
333 * This function is only called from page_referenced for object-based pages. 344 * This function is only called from page_referenced for object-based pages.
334 */ 345 */
335static int page_referenced_file(struct page *page) 346static int page_referenced_file(struct page *page,
347 struct mem_cgroup *mem_cont)
336{ 348{
337 unsigned int mapcount; 349 unsigned int mapcount;
338 struct address_space *mapping = page->mapping; 350 struct address_space *mapping = page->mapping;
@@ -365,6 +377,13 @@ static int page_referenced_file(struct page *page)
365 mapcount = page_mapcount(page); 377 mapcount = page_mapcount(page);
366 378
367 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 379 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
380 /*
381 * If we are reclaiming on behalf of a cgroup, skip
382 * counting on behalf of references from different
383 * cgroups
384 */
385 if (mem_cont && (mm_cgroup(vma->vm_mm) != mem_cont))
386 continue;
368 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) 387 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
369 == (VM_LOCKED|VM_MAYSHARE)) { 388 == (VM_LOCKED|VM_MAYSHARE)) {
370 referenced++; 389 referenced++;
@@ -387,7 +406,8 @@ static int page_referenced_file(struct page *page)
387 * Quick test_and_clear_referenced for all mappings to a page, 406 * Quick test_and_clear_referenced for all mappings to a page,
388 * returns the number of ptes which referenced the page. 407 * returns the number of ptes which referenced the page.
389 */ 408 */
390int page_referenced(struct page *page, int is_locked) 409int page_referenced(struct page *page, int is_locked,
410 struct mem_cgroup *mem_cont)
391{ 411{
392 int referenced = 0; 412 int referenced = 0;
393 413
@@ -399,14 +419,15 @@ int page_referenced(struct page *page, int is_locked)
399 419
400 if (page_mapped(page) && page->mapping) { 420 if (page_mapped(page) && page->mapping) {
401 if (PageAnon(page)) 421 if (PageAnon(page))
402 referenced += page_referenced_anon(page); 422 referenced += page_referenced_anon(page, mem_cont);
403 else if (is_locked) 423 else if (is_locked)
404 referenced += page_referenced_file(page); 424 referenced += page_referenced_file(page, mem_cont);
405 else if (TestSetPageLocked(page)) 425 else if (TestSetPageLocked(page))
406 referenced++; 426 referenced++;
407 else { 427 else {
408 if (page->mapping) 428 if (page->mapping)
409 referenced += page_referenced_file(page); 429 referenced +=
430 page_referenced_file(page, mem_cont);
410 unlock_page(page); 431 unlock_page(page);
411 } 432 }
412 } 433 }
@@ -552,8 +573,14 @@ void page_add_anon_rmap(struct page *page,
552 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 573 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
553 if (atomic_inc_and_test(&page->_mapcount)) 574 if (atomic_inc_and_test(&page->_mapcount))
554 __page_set_anon_rmap(page, vma, address); 575 __page_set_anon_rmap(page, vma, address);
555 else 576 else {
556 __page_check_anon_rmap(page, vma, address); 577 __page_check_anon_rmap(page, vma, address);
578 /*
579 * We unconditionally charged during prepare, we uncharge here
580 * This takes care of balancing the reference counts
581 */
582 mem_cgroup_uncharge_page(page);
583 }
557} 584}
558 585
559/* 586/*
@@ -584,6 +611,12 @@ void page_add_file_rmap(struct page *page)
584{ 611{
585 if (atomic_inc_and_test(&page->_mapcount)) 612 if (atomic_inc_and_test(&page->_mapcount))
586 __inc_zone_page_state(page, NR_FILE_MAPPED); 613 __inc_zone_page_state(page, NR_FILE_MAPPED);
614 else
615 /*
616 * We unconditionally charged during prepare, we uncharge here
617 * This takes care of balancing the reference counts
618 */
619 mem_cgroup_uncharge_page(page);
587} 620}
588 621
589#ifdef CONFIG_DEBUG_VM 622#ifdef CONFIG_DEBUG_VM
@@ -644,6 +677,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
644 page_clear_dirty(page); 677 page_clear_dirty(page);
645 set_page_dirty(page); 678 set_page_dirty(page);
646 } 679 }
680 mem_cgroup_uncharge_page(page);
681
647 __dec_zone_page_state(page, 682 __dec_zone_page_state(page,
648 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 683 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
649 } 684 }
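[Editor's note] The "unconditionally charged during prepare, we uncharge here" comments above describe a balancing pattern: the caller charges the page to its cgroup before taking locks, and the rmap side gives the charge back whenever the page turns out to be mapped already, so each page is accounted once rather than once per mapping. A minimal userspace sketch of that bookkeeping, with invented names standing in for the memcontrol API:

#include <assert.h>
#include <stdio.h>

/* Toy stand-ins for a cgroup charge counter and a page mapcount. */
static long cgroup_charge;

struct toy_page {
	int mapcount;	/* -1 means unmapped, matching _mapcount semantics */
};

static void charge(void)   { cgroup_charge++; }	/* "charge during prepare" */
static void uncharge(void) { cgroup_charge--; }

/* Mirrors the shape of page_add_file_rmap(): only the first mapping keeps the charge. */
static void add_rmap(struct toy_page *page)
{
	if (++page->mapcount == 0) {
		/* first mapping: the prepare-time charge is kept */
	} else {
		/* already mapped and already charged once: give it back */
		uncharge();
	}
}

int main(void)
{
	struct toy_page page = { .mapcount = -1 };

	charge(); add_rmap(&page);	/* first mapping */
	charge(); add_rmap(&page);	/* second mapping of the same page */
	assert(cgroup_charge == 1);	/* one charge per page, not per mapping */
	printf("charge = %ld, mapcount = %d\n", cgroup_charge, page.mapcount);
	return 0;
}

The page_remove_rmap() hunk is the other half of the same ledger: the charge is dropped when the last mapping goes away.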
diff --git a/mm/shmem.c b/mm/shmem.c
index 51b3d6ccddab..85bed948fafc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,11 +78,10 @@
78 78
79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
80enum sgp_type { 80enum sgp_type {
81 SGP_QUICK, /* don't try more than file page cache lookup */
82 SGP_READ, /* don't exceed i_size, don't allocate page */ 81 SGP_READ, /* don't exceed i_size, don't allocate page */
83 SGP_CACHE, /* don't exceed i_size, may allocate page */ 82 SGP_CACHE, /* don't exceed i_size, may allocate page */
83 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
84 SGP_WRITE, /* may exceed i_size, may allocate page */ 84 SGP_WRITE, /* may exceed i_size, may allocate page */
85 SGP_FAULT, /* same as SGP_CACHE, return with page locked */
86}; 85};
87 86
88static int shmem_getpage(struct inode *inode, unsigned long idx, 87static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -194,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
194}; 193};
195 194
196static LIST_HEAD(shmem_swaplist); 195static LIST_HEAD(shmem_swaplist);
197static DEFINE_SPINLOCK(shmem_swaplist_lock); 196static DEFINE_MUTEX(shmem_swaplist_mutex);
198 197
199static void shmem_free_blocks(struct inode *inode, long pages) 198static void shmem_free_blocks(struct inode *inode, long pages)
200{ 199{
@@ -207,6 +206,31 @@ static void shmem_free_blocks(struct inode *inode, long pages)
207 } 206 }
208} 207}
209 208
209static int shmem_reserve_inode(struct super_block *sb)
210{
211 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
212 if (sbinfo->max_inodes) {
213 spin_lock(&sbinfo->stat_lock);
214 if (!sbinfo->free_inodes) {
215 spin_unlock(&sbinfo->stat_lock);
216 return -ENOSPC;
217 }
218 sbinfo->free_inodes--;
219 spin_unlock(&sbinfo->stat_lock);
220 }
221 return 0;
222}
223
224static void shmem_free_inode(struct super_block *sb)
225{
226 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
227 if (sbinfo->max_inodes) {
228 spin_lock(&sbinfo->stat_lock);
229 sbinfo->free_inodes++;
230 spin_unlock(&sbinfo->stat_lock);
231 }
232}
233
210/* 234/*
211 * shmem_recalc_inode - recalculate the size of an inode 235 * shmem_recalc_inode - recalculate the size of an inode
212 * 236 *
@@ -731,6 +755,8 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
731 (void) shmem_getpage(inode, 755 (void) shmem_getpage(inode,
732 attr->ia_size>>PAGE_CACHE_SHIFT, 756 attr->ia_size>>PAGE_CACHE_SHIFT,
733 &page, SGP_READ, NULL); 757 &page, SGP_READ, NULL);
758 if (page)
759 unlock_page(page);
734 } 760 }
735 /* 761 /*
736 * Reset SHMEM_PAGEIN flag so that shmem_truncate can 762 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
@@ -762,7 +788,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
762 788
763static void shmem_delete_inode(struct inode *inode) 789static void shmem_delete_inode(struct inode *inode)
764{ 790{
765 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
766 struct shmem_inode_info *info = SHMEM_I(inode); 791 struct shmem_inode_info *info = SHMEM_I(inode);
767 792
768 if (inode->i_op->truncate == shmem_truncate) { 793 if (inode->i_op->truncate == shmem_truncate) {
@@ -771,17 +796,13 @@ static void shmem_delete_inode(struct inode *inode)
771 inode->i_size = 0; 796 inode->i_size = 0;
772 shmem_truncate(inode); 797 shmem_truncate(inode);
773 if (!list_empty(&info->swaplist)) { 798 if (!list_empty(&info->swaplist)) {
774 spin_lock(&shmem_swaplist_lock); 799 mutex_lock(&shmem_swaplist_mutex);
775 list_del_init(&info->swaplist); 800 list_del_init(&info->swaplist);
776 spin_unlock(&shmem_swaplist_lock); 801 mutex_unlock(&shmem_swaplist_mutex);
777 } 802 }
778 } 803 }
779 BUG_ON(inode->i_blocks); 804 BUG_ON(inode->i_blocks);
780 if (sbinfo->max_inodes) { 805 shmem_free_inode(inode->i_sb);
781 spin_lock(&sbinfo->stat_lock);
782 sbinfo->free_inodes++;
783 spin_unlock(&sbinfo->stat_lock);
784 }
785 clear_inode(inode); 806 clear_inode(inode);
786} 807}
787 808
@@ -807,19 +828,22 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
807 struct page *subdir; 828 struct page *subdir;
808 swp_entry_t *ptr; 829 swp_entry_t *ptr;
809 int offset; 830 int offset;
831 int error;
810 832
811 idx = 0; 833 idx = 0;
812 ptr = info->i_direct; 834 ptr = info->i_direct;
813 spin_lock(&info->lock); 835 spin_lock(&info->lock);
836 if (!info->swapped) {
837 list_del_init(&info->swaplist);
838 goto lost2;
839 }
814 limit = info->next_index; 840 limit = info->next_index;
815 size = limit; 841 size = limit;
816 if (size > SHMEM_NR_DIRECT) 842 if (size > SHMEM_NR_DIRECT)
817 size = SHMEM_NR_DIRECT; 843 size = SHMEM_NR_DIRECT;
818 offset = shmem_find_swp(entry, ptr, ptr+size); 844 offset = shmem_find_swp(entry, ptr, ptr+size);
819 if (offset >= 0) { 845 if (offset >= 0)
820 shmem_swp_balance_unmap();
821 goto found; 846 goto found;
822 }
823 if (!info->i_indirect) 847 if (!info->i_indirect)
824 goto lost2; 848 goto lost2;
825 849
@@ -829,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
829 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { 853 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
830 if (unlikely(idx == stage)) { 854 if (unlikely(idx == stage)) {
831 shmem_dir_unmap(dir-1); 855 shmem_dir_unmap(dir-1);
856 if (cond_resched_lock(&info->lock)) {
857 /* check it has not been truncated */
858 if (limit > info->next_index) {
859 limit = info->next_index;
860 if (idx >= limit)
861 goto lost2;
862 }
863 }
832 dir = shmem_dir_map(info->i_indirect) + 864 dir = shmem_dir_map(info->i_indirect) +
833 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; 865 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
834 while (!*dir) { 866 while (!*dir) {
@@ -849,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
849 if (size > ENTRIES_PER_PAGE) 881 if (size > ENTRIES_PER_PAGE)
850 size = ENTRIES_PER_PAGE; 882 size = ENTRIES_PER_PAGE;
851 offset = shmem_find_swp(entry, ptr, ptr+size); 883 offset = shmem_find_swp(entry, ptr, ptr+size);
884 shmem_swp_unmap(ptr);
852 if (offset >= 0) { 885 if (offset >= 0) {
853 shmem_dir_unmap(dir); 886 shmem_dir_unmap(dir);
854 goto found; 887 goto found;
855 } 888 }
856 shmem_swp_unmap(ptr);
857 } 889 }
858 } 890 }
859lost1: 891lost1:
@@ -863,19 +895,69 @@ lost2:
863 return 0; 895 return 0;
864found: 896found:
865 idx += offset; 897 idx += offset;
866 inode = &info->vfs_inode; 898 inode = igrab(&info->vfs_inode);
867 if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
868 info->flags |= SHMEM_PAGEIN;
869 shmem_swp_set(info, ptr + offset, 0);
870 }
871 shmem_swp_unmap(ptr);
872 spin_unlock(&info->lock); 899 spin_unlock(&info->lock);
900
873 /* 901 /*
874 * Decrement swap count even when the entry is left behind: 902 * Move _head_ to start search for next from here.
875 * try_to_unuse will skip over mms, then reincrement count. 903 * But be careful: shmem_delete_inode checks list_empty without taking
904 * mutex, and there's an instant in list_move_tail when info->swaplist
905 * would appear empty, if it were the only one on shmem_swaplist. We
906 * could avoid doing it if inode NULL; or use this minor optimization.
876 */ 907 */
877 swap_free(entry); 908 if (shmem_swaplist.next != &info->swaplist)
878 return 1; 909 list_move_tail(&shmem_swaplist, &info->swaplist);
910 mutex_unlock(&shmem_swaplist_mutex);
911
912 error = 1;
913 if (!inode)
914 goto out;
915 /* Precharge page while we can wait, compensate afterwards */
916 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
917 if (error)
918 goto out;
919 error = radix_tree_preload(GFP_KERNEL);
920 if (error)
921 goto uncharge;
922 error = 1;
923
924 spin_lock(&info->lock);
925 ptr = shmem_swp_entry(info, idx, NULL);
926 if (ptr && ptr->val == entry.val)
927 error = add_to_page_cache(page, inode->i_mapping,
928 idx, GFP_NOWAIT);
929 if (error == -EEXIST) {
930 struct page *filepage = find_get_page(inode->i_mapping, idx);
931 error = 1;
932 if (filepage) {
933 /*
934 * There might be a more uptodate page coming down
935 * from a stacked writepage: forget our swappage if so.
936 */
937 if (PageUptodate(filepage))
938 error = 0;
939 page_cache_release(filepage);
940 }
941 }
942 if (!error) {
943 delete_from_swap_cache(page);
944 set_page_dirty(page);
945 info->flags |= SHMEM_PAGEIN;
946 shmem_swp_set(info, ptr, 0);
947 swap_free(entry);
948 error = 1; /* not an error, but entry was found */
949 }
950 if (ptr)
951 shmem_swp_unmap(ptr);
952 spin_unlock(&info->lock);
953 radix_tree_preload_end();
954uncharge:
955 mem_cgroup_uncharge_page(page);
956out:
957 unlock_page(page);
958 page_cache_release(page);
959 iput(inode); /* allows for NULL */
960 return error;
879} 961}
880 962
881/* 963/*
@@ -887,20 +969,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
887 struct shmem_inode_info *info; 969 struct shmem_inode_info *info;
888 int found = 0; 970 int found = 0;
889 971
890 spin_lock(&shmem_swaplist_lock); 972 mutex_lock(&shmem_swaplist_mutex);
891 list_for_each_safe(p, next, &shmem_swaplist) { 973 list_for_each_safe(p, next, &shmem_swaplist) {
892 info = list_entry(p, struct shmem_inode_info, swaplist); 974 info = list_entry(p, struct shmem_inode_info, swaplist);
893 if (!info->swapped) 975 found = shmem_unuse_inode(info, entry, page);
894 list_del_init(&info->swaplist); 976 cond_resched();
895 else if (shmem_unuse_inode(info, entry, page)) { 977 if (found)
896 /* move head to start search for next from here */ 978 goto out;
897 list_move_tail(&shmem_swaplist, &info->swaplist);
898 found = 1;
899 break;
900 }
901 } 979 }
902 spin_unlock(&shmem_swaplist_lock); 980 mutex_unlock(&shmem_swaplist_mutex);
903 return found; 981out: return found; /* 0 or 1 or -ENOMEM */
904} 982}
905 983
906/* 984/*
@@ -915,54 +993,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
915 struct inode *inode; 993 struct inode *inode;
916 994
917 BUG_ON(!PageLocked(page)); 995 BUG_ON(!PageLocked(page));
918 /*
919 * shmem_backing_dev_info's capabilities prevent regular writeback or
920 * sync from ever calling shmem_writepage; but a stacking filesystem
921 * may use the ->writepage of its underlying filesystem, in which case
922 * we want to do nothing when that underlying filesystem is tmpfs
923 * (writing out to swap is useful as a response to memory pressure, but
924 * of no use to stabilize the data) - just redirty the page, unlock it
925 * and claim success in this case. AOP_WRITEPAGE_ACTIVATE, and the
926 * page_mapped check below, must be avoided unless we're in reclaim.
927 */
928 if (!wbc->for_reclaim) {
929 set_page_dirty(page);
930 unlock_page(page);
931 return 0;
932 }
933 BUG_ON(page_mapped(page));
934
935 mapping = page->mapping; 996 mapping = page->mapping;
936 index = page->index; 997 index = page->index;
937 inode = mapping->host; 998 inode = mapping->host;
938 info = SHMEM_I(inode); 999 info = SHMEM_I(inode);
939 if (info->flags & VM_LOCKED) 1000 if (info->flags & VM_LOCKED)
940 goto redirty; 1001 goto redirty;
941 swap = get_swap_page(); 1002 if (!total_swap_pages)
942 if (!swap.val)
943 goto redirty; 1003 goto redirty;
944 1004
1005 /*
1006 * shmem_backing_dev_info's capabilities prevent regular writeback or
1007 * sync from ever calling shmem_writepage; but a stacking filesystem
1008 * may use the ->writepage of its underlying filesystem, in which case
1009 * tmpfs should write out to swap only in response to memory pressure,
1010 * and not for pdflush or sync. However, in those cases, we do still
1011 * want to check if there's a redundant swappage to be discarded.
1012 */
1013 if (wbc->for_reclaim)
1014 swap = get_swap_page();
1015 else
1016 swap.val = 0;
1017
945 spin_lock(&info->lock); 1018 spin_lock(&info->lock);
946 shmem_recalc_inode(inode);
947 if (index >= info->next_index) { 1019 if (index >= info->next_index) {
948 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1020 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
949 goto unlock; 1021 goto unlock;
950 } 1022 }
951 entry = shmem_swp_entry(info, index, NULL); 1023 entry = shmem_swp_entry(info, index, NULL);
952 BUG_ON(!entry); 1024 if (entry->val) {
953 BUG_ON(entry->val); 1025 /*
1026 * The more uptodate page coming down from a stacked
1027 * writepage should replace our old swappage.
1028 */
1029 free_swap_and_cache(*entry);
1030 shmem_swp_set(info, entry, 0);
1031 }
1032 shmem_recalc_inode(inode);
954 1033
955 if (move_to_swap_cache(page, swap) == 0) { 1034 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1035 remove_from_page_cache(page);
956 shmem_swp_set(info, entry, swap.val); 1036 shmem_swp_set(info, entry, swap.val);
957 shmem_swp_unmap(entry); 1037 shmem_swp_unmap(entry);
1038 if (list_empty(&info->swaplist))
1039 inode = igrab(inode);
1040 else
1041 inode = NULL;
958 spin_unlock(&info->lock); 1042 spin_unlock(&info->lock);
959 if (list_empty(&info->swaplist)) { 1043 swap_duplicate(swap);
960 spin_lock(&shmem_swaplist_lock); 1044 BUG_ON(page_mapped(page));
1045 page_cache_release(page); /* pagecache ref */
1046 set_page_dirty(page);
1047 unlock_page(page);
1048 if (inode) {
1049 mutex_lock(&shmem_swaplist_mutex);
961 /* move instead of add in case we're racing */ 1050 /* move instead of add in case we're racing */
962 list_move_tail(&info->swaplist, &shmem_swaplist); 1051 list_move_tail(&info->swaplist, &shmem_swaplist);
963 spin_unlock(&shmem_swaplist_lock); 1052 mutex_unlock(&shmem_swaplist_mutex);
1053 iput(inode);
964 } 1054 }
965 unlock_page(page);
966 return 0; 1055 return 0;
967 } 1056 }
968 1057
@@ -972,7 +1061,10 @@ unlock:
972 swap_free(swap); 1061 swap_free(swap);
973redirty: 1062redirty:
974 set_page_dirty(page); 1063 set_page_dirty(page);
975 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ 1064 if (wbc->for_reclaim)
1065 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
1066 unlock_page(page);
1067 return 0;
976} 1068}
977 1069
978#ifdef CONFIG_NUMA 1070#ifdef CONFIG_NUMA
@@ -1025,53 +1117,33 @@ out:
1025 return err; 1117 return err;
1026} 1118}
1027 1119
1028static struct page *shmem_swapin_async(struct shared_policy *p, 1120static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1029 swp_entry_t entry, unsigned long idx) 1121 struct shmem_inode_info *info, unsigned long idx)
1030{ 1122{
1031 struct page *page;
1032 struct vm_area_struct pvma; 1123 struct vm_area_struct pvma;
1124 struct page *page;
1033 1125
1034 /* Create a pseudo vma that just contains the policy */ 1126 /* Create a pseudo vma that just contains the policy */
1035 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1127 pvma.vm_start = 0;
1036 pvma.vm_end = PAGE_SIZE;
1037 pvma.vm_pgoff = idx; 1128 pvma.vm_pgoff = idx;
1038 pvma.vm_policy = mpol_shared_policy_lookup(p, idx); 1129 pvma.vm_ops = NULL;
1039 page = read_swap_cache_async(entry, &pvma, 0); 1130 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1131 page = swapin_readahead(entry, gfp, &pvma, 0);
1040 mpol_free(pvma.vm_policy); 1132 mpol_free(pvma.vm_policy);
1041 return page; 1133 return page;
1042} 1134}
1043 1135
1044static struct page *shmem_swapin(struct shmem_inode_info *info, 1136static struct page *shmem_alloc_page(gfp_t gfp,
1045 swp_entry_t entry, unsigned long idx) 1137 struct shmem_inode_info *info, unsigned long idx)
1046{
1047 struct shared_policy *p = &info->policy;
1048 int i, num;
1049 struct page *page;
1050 unsigned long offset;
1051
1052 num = valid_swaphandles(entry, &offset);
1053 for (i = 0; i < num; offset++, i++) {
1054 page = shmem_swapin_async(p,
1055 swp_entry(swp_type(entry), offset), idx);
1056 if (!page)
1057 break;
1058 page_cache_release(page);
1059 }
1060 lru_add_drain(); /* Push any new pages onto the LRU now */
1061 return shmem_swapin_async(p, entry, idx);
1062}
1063
1064static struct page *
1065shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
1066 unsigned long idx)
1067{ 1138{
1068 struct vm_area_struct pvma; 1139 struct vm_area_struct pvma;
1069 struct page *page; 1140 struct page *page;
1070 1141
1071 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1142 /* Create a pseudo vma that just contains the policy */
1072 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 1143 pvma.vm_start = 0;
1073 pvma.vm_pgoff = idx; 1144 pvma.vm_pgoff = idx;
1074 pvma.vm_end = PAGE_SIZE; 1145 pvma.vm_ops = NULL;
1146 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1075 page = alloc_page_vma(gfp, &pvma, 0); 1147 page = alloc_page_vma(gfp, &pvma, 0);
1076 mpol_free(pvma.vm_policy); 1148 mpol_free(pvma.vm_policy);
1077 return page; 1149 return page;
@@ -1083,15 +1155,14 @@ static inline int shmem_parse_mpol(char *value, int *policy,
1083 return 1; 1155 return 1;
1084} 1156}
1085 1157
1086static inline struct page * 1158static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1087shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) 1159 struct shmem_inode_info *info, unsigned long idx)
1088{ 1160{
1089 swapin_readahead(entry, 0, NULL); 1161 return swapin_readahead(entry, gfp, NULL, 0);
1090 return read_swap_cache_async(entry, NULL, 0);
1091} 1162}
1092 1163
1093static inline struct page * 1164static inline struct page *shmem_alloc_page(gfp_t gfp,
1094shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx) 1165 struct shmem_inode_info *info, unsigned long idx)
1095{ 1166{
1096 return alloc_page(gfp); 1167 return alloc_page(gfp);
1097} 1168}
@@ -1114,6 +1185,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1114 struct page *swappage; 1185 struct page *swappage;
1115 swp_entry_t *entry; 1186 swp_entry_t *entry;
1116 swp_entry_t swap; 1187 swp_entry_t swap;
1188 gfp_t gfp;
1117 int error; 1189 int error;
1118 1190
1119 if (idx >= SHMEM_MAX_INDEX) 1191 if (idx >= SHMEM_MAX_INDEX)
@@ -1126,7 +1198,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1126 * Normally, filepage is NULL on entry, and either found 1198 * Normally, filepage is NULL on entry, and either found
1127 * uptodate immediately, or allocated and zeroed, or read 1199 * uptodate immediately, or allocated and zeroed, or read
1128 * in under swappage, which is then assigned to filepage. 1200 * in under swappage, which is then assigned to filepage.
1129 * But shmem_readpage and shmem_write_begin pass in a locked 1201 * But shmem_readpage (required for splice) passes in a locked
1130 * filepage, which may be found not uptodate by other callers 1202 * filepage, which may be found not uptodate by other callers
1131 * too, and may need to be copied from the swappage read in. 1203 * too, and may need to be copied from the swappage read in.
1132 */ 1204 */
@@ -1136,8 +1208,17 @@ repeat:
1136 if (filepage && PageUptodate(filepage)) 1208 if (filepage && PageUptodate(filepage))
1137 goto done; 1209 goto done;
1138 error = 0; 1210 error = 0;
1139 if (sgp == SGP_QUICK) 1211 gfp = mapping_gfp_mask(mapping);
1140 goto failed; 1212 if (!filepage) {
1213 /*
1214 * Try to preload while we can wait, to not make a habit of
1215 * draining atomic reserves; but don't latch on to this cpu.
1216 */
1217 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
1218 if (error)
1219 goto failed;
1220 radix_tree_preload_end();
1221 }
1141 1222
1142 spin_lock(&info->lock); 1223 spin_lock(&info->lock);
1143 shmem_recalc_inode(inode); 1224 shmem_recalc_inode(inode);
@@ -1160,7 +1241,7 @@ repeat:
1160 *type |= VM_FAULT_MAJOR; 1241 *type |= VM_FAULT_MAJOR;
1161 } 1242 }
1162 spin_unlock(&info->lock); 1243 spin_unlock(&info->lock);
1163 swappage = shmem_swapin(info, swap, idx); 1244 swappage = shmem_swapin(swap, gfp, info, idx);
1164 if (!swappage) { 1245 if (!swappage) {
1165 spin_lock(&info->lock); 1246 spin_lock(&info->lock);
1166 entry = shmem_swp_alloc(info, idx, sgp); 1247 entry = shmem_swp_alloc(info, idx, sgp);
@@ -1218,13 +1299,15 @@ repeat:
1218 SetPageUptodate(filepage); 1299 SetPageUptodate(filepage);
1219 set_page_dirty(filepage); 1300 set_page_dirty(filepage);
1220 swap_free(swap); 1301 swap_free(swap);
1221 } else if (!(error = move_from_swap_cache( 1302 } else if (!(error = add_to_page_cache(
1222 swappage, idx, mapping))) { 1303 swappage, mapping, idx, GFP_NOWAIT))) {
1223 info->flags |= SHMEM_PAGEIN; 1304 info->flags |= SHMEM_PAGEIN;
1224 shmem_swp_set(info, entry, 0); 1305 shmem_swp_set(info, entry, 0);
1225 shmem_swp_unmap(entry); 1306 shmem_swp_unmap(entry);
1307 delete_from_swap_cache(swappage);
1226 spin_unlock(&info->lock); 1308 spin_unlock(&info->lock);
1227 filepage = swappage; 1309 filepage = swappage;
1310 set_page_dirty(filepage);
1228 swap_free(swap); 1311 swap_free(swap);
1229 } else { 1312 } else {
1230 shmem_swp_unmap(entry); 1313 shmem_swp_unmap(entry);
@@ -1232,8 +1315,11 @@ repeat:
1232 unlock_page(swappage); 1315 unlock_page(swappage);
1233 page_cache_release(swappage); 1316 page_cache_release(swappage);
1234 if (error == -ENOMEM) { 1317 if (error == -ENOMEM) {
1235 /* let kswapd refresh zone for GFP_ATOMICs */ 1318 /* allow reclaim from this memory cgroup */
1236 congestion_wait(WRITE, HZ/50); 1319 error = mem_cgroup_cache_charge(NULL,
1320 current->mm, gfp & ~__GFP_HIGHMEM);
1321 if (error)
1322 goto failed;
1237 } 1323 }
1238 goto repeat; 1324 goto repeat;
1239 } 1325 }
@@ -1272,9 +1358,7 @@ repeat:
1272 1358
1273 if (!filepage) { 1359 if (!filepage) {
1274 spin_unlock(&info->lock); 1360 spin_unlock(&info->lock);
1275 filepage = shmem_alloc_page(mapping_gfp_mask(mapping), 1361 filepage = shmem_alloc_page(gfp, info, idx);
1276 info,
1277 idx);
1278 if (!filepage) { 1362 if (!filepage) {
1279 shmem_unacct_blocks(info->flags, 1); 1363 shmem_unacct_blocks(info->flags, 1);
1280 shmem_free_blocks(inode, 1); 1364 shmem_free_blocks(inode, 1);
@@ -1282,6 +1366,17 @@ repeat:
1282 goto failed; 1366 goto failed;
1283 } 1367 }
1284 1368
1369 /* Precharge page while we can wait, compensate after */
1370 error = mem_cgroup_cache_charge(filepage, current->mm,
1371 gfp & ~__GFP_HIGHMEM);
1372 if (error) {
1373 page_cache_release(filepage);
1374 shmem_unacct_blocks(info->flags, 1);
1375 shmem_free_blocks(inode, 1);
1376 filepage = NULL;
1377 goto failed;
1378 }
1379
1285 spin_lock(&info->lock); 1380 spin_lock(&info->lock);
1286 entry = shmem_swp_alloc(info, idx, sgp); 1381 entry = shmem_swp_alloc(info, idx, sgp);
1287 if (IS_ERR(entry)) 1382 if (IS_ERR(entry))
@@ -1291,8 +1386,9 @@ repeat:
1291 shmem_swp_unmap(entry); 1386 shmem_swp_unmap(entry);
1292 } 1387 }
1293 if (error || swap.val || 0 != add_to_page_cache_lru( 1388 if (error || swap.val || 0 != add_to_page_cache_lru(
1294 filepage, mapping, idx, GFP_ATOMIC)) { 1389 filepage, mapping, idx, GFP_NOWAIT)) {
1295 spin_unlock(&info->lock); 1390 spin_unlock(&info->lock);
1391 mem_cgroup_uncharge_page(filepage);
1296 page_cache_release(filepage); 1392 page_cache_release(filepage);
1297 shmem_unacct_blocks(info->flags, 1); 1393 shmem_unacct_blocks(info->flags, 1);
1298 shmem_free_blocks(inode, 1); 1394 shmem_free_blocks(inode, 1);
@@ -1301,6 +1397,7 @@ repeat:
1301 goto failed; 1397 goto failed;
1302 goto repeat; 1398 goto repeat;
1303 } 1399 }
1400 mem_cgroup_uncharge_page(filepage);
1304 info->flags |= SHMEM_PAGEIN; 1401 info->flags |= SHMEM_PAGEIN;
1305 } 1402 }
1306 1403
@@ -1309,14 +1406,11 @@ repeat:
1309 clear_highpage(filepage); 1406 clear_highpage(filepage);
1310 flush_dcache_page(filepage); 1407 flush_dcache_page(filepage);
1311 SetPageUptodate(filepage); 1408 SetPageUptodate(filepage);
1409 if (sgp == SGP_DIRTY)
1410 set_page_dirty(filepage);
1312 } 1411 }
1313done: 1412done:
1314 if (*pagep != filepage) { 1413 *pagep = filepage;
1315 *pagep = filepage;
1316 if (sgp != SGP_FAULT)
1317 unlock_page(filepage);
1318
1319 }
1320 return 0; 1414 return 0;
1321 1415
1322failed: 1416failed:
@@ -1336,7 +1430,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1336 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 1430 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1337 return VM_FAULT_SIGBUS; 1431 return VM_FAULT_SIGBUS;
1338 1432
1339 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret); 1433 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1340 if (error) 1434 if (error)
1341 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1435 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1342 1436
@@ -1399,15 +1493,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1399 struct shmem_inode_info *info; 1493 struct shmem_inode_info *info;
1400 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1494 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1401 1495
1402 if (sbinfo->max_inodes) { 1496 if (shmem_reserve_inode(sb))
1403 spin_lock(&sbinfo->stat_lock); 1497 return NULL;
1404 if (!sbinfo->free_inodes) {
1405 spin_unlock(&sbinfo->stat_lock);
1406 return NULL;
1407 }
1408 sbinfo->free_inodes--;
1409 spin_unlock(&sbinfo->stat_lock);
1410 }
1411 1498
1412 inode = new_inode(sb); 1499 inode = new_inode(sb);
1413 if (inode) { 1500 if (inode) {
@@ -1451,11 +1538,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1451 NULL); 1538 NULL);
1452 break; 1539 break;
1453 } 1540 }
1454 } else if (sbinfo->max_inodes) { 1541 } else
1455 spin_lock(&sbinfo->stat_lock); 1542 shmem_free_inode(sb);
1456 sbinfo->free_inodes++;
1457 spin_unlock(&sbinfo->stat_lock);
1458 }
1459 return inode; 1543 return inode;
1460} 1544}
1461 1545
@@ -1494,123 +1578,30 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1494{ 1578{
1495 struct inode *inode = mapping->host; 1579 struct inode *inode = mapping->host;
1496 1580
1581 if (pos + copied > inode->i_size)
1582 i_size_write(inode, pos + copied);
1583
1584 unlock_page(page);
1497 set_page_dirty(page); 1585 set_page_dirty(page);
1498 page_cache_release(page); 1586 page_cache_release(page);
1499 1587
1500 if (pos+copied > inode->i_size)
1501 i_size_write(inode, pos+copied);
1502
1503 return copied; 1588 return copied;
1504} 1589}
1505 1590
1506static ssize_t
1507shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1508{
1509 struct inode *inode = file->f_path.dentry->d_inode;
1510 loff_t pos;
1511 unsigned long written;
1512 ssize_t err;
1513
1514 if ((ssize_t) count < 0)
1515 return -EINVAL;
1516
1517 if (!access_ok(VERIFY_READ, buf, count))
1518 return -EFAULT;
1519
1520 mutex_lock(&inode->i_mutex);
1521
1522 pos = *ppos;
1523 written = 0;
1524
1525 err = generic_write_checks(file, &pos, &count, 0);
1526 if (err || !count)
1527 goto out;
1528
1529 err = remove_suid(file->f_path.dentry);
1530 if (err)
1531 goto out;
1532
1533 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1534
1535 do {
1536 struct page *page = NULL;
1537 unsigned long bytes, index, offset;
1538 char *kaddr;
1539 int left;
1540
1541 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1542 index = pos >> PAGE_CACHE_SHIFT;
1543 bytes = PAGE_CACHE_SIZE - offset;
1544 if (bytes > count)
1545 bytes = count;
1546
1547 /*
1548 * We don't hold page lock across copy from user -
1549 * what would it guard against? - so no deadlock here.
1550 * But it still may be a good idea to prefault below.
1551 */
1552
1553 err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1554 if (err)
1555 break;
1556
1557 left = bytes;
1558 if (PageHighMem(page)) {
1559 volatile unsigned char dummy;
1560 __get_user(dummy, buf);
1561 __get_user(dummy, buf + bytes - 1);
1562
1563 kaddr = kmap_atomic(page, KM_USER0);
1564 left = __copy_from_user_inatomic(kaddr + offset,
1565 buf, bytes);
1566 kunmap_atomic(kaddr, KM_USER0);
1567 }
1568 if (left) {
1569 kaddr = kmap(page);
1570 left = __copy_from_user(kaddr + offset, buf, bytes);
1571 kunmap(page);
1572 }
1573
1574 written += bytes;
1575 count -= bytes;
1576 pos += bytes;
1577 buf += bytes;
1578 if (pos > inode->i_size)
1579 i_size_write(inode, pos);
1580
1581 flush_dcache_page(page);
1582 set_page_dirty(page);
1583 mark_page_accessed(page);
1584 page_cache_release(page);
1585
1586 if (left) {
1587 pos -= left;
1588 written -= left;
1589 err = -EFAULT;
1590 break;
1591 }
1592
1593 /*
1594 * Our dirty pages are not counted in nr_dirty,
1595 * and we do not attempt to balance dirty pages.
1596 */
1597
1598 cond_resched();
1599 } while (count);
1600
1601 *ppos = pos;
1602 if (written)
1603 err = written;
1604out:
1605 mutex_unlock(&inode->i_mutex);
1606 return err;
1607}
1608
1609static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1591static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1610{ 1592{
1611 struct inode *inode = filp->f_path.dentry->d_inode; 1593 struct inode *inode = filp->f_path.dentry->d_inode;
1612 struct address_space *mapping = inode->i_mapping; 1594 struct address_space *mapping = inode->i_mapping;
1613 unsigned long index, offset; 1595 unsigned long index, offset;
1596 enum sgp_type sgp = SGP_READ;
1597
1598 /*
1599 * Might this read be for a stacking filesystem? Then when reading
1600 * holes of a sparse file, we actually need to allocate those pages,
1601 * and even mark them dirty, so it cannot exceed the max_blocks limit.
1602 */
1603 if (segment_eq(get_fs(), KERNEL_DS))
1604 sgp = SGP_DIRTY;
1614 1605
1615 index = *ppos >> PAGE_CACHE_SHIFT; 1606 index = *ppos >> PAGE_CACHE_SHIFT;
1616 offset = *ppos & ~PAGE_CACHE_MASK; 1607 offset = *ppos & ~PAGE_CACHE_MASK;
@@ -1629,12 +1620,14 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1629 break; 1620 break;
1630 } 1621 }
1631 1622
1632 desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); 1623 desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
1633 if (desc->error) { 1624 if (desc->error) {
1634 if (desc->error == -EINVAL) 1625 if (desc->error == -EINVAL)
1635 desc->error = 0; 1626 desc->error = 0;
1636 break; 1627 break;
1637 } 1628 }
1629 if (page)
1630 unlock_page(page);
1638 1631
1639 /* 1632 /*
1640 * We must evaluate after, since reads (unlike writes) 1633 * We must evaluate after, since reads (unlike writes)
@@ -1798,22 +1791,16 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1798static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 1791static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1799{ 1792{
1800 struct inode *inode = old_dentry->d_inode; 1793 struct inode *inode = old_dentry->d_inode;
1801 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1794 int ret;
1802 1795
1803 /* 1796 /*
1804 * No ordinary (disk based) filesystem counts links as inodes; 1797 * No ordinary (disk based) filesystem counts links as inodes;
1805 * but each new link needs a new dentry, pinning lowmem, and 1798 * but each new link needs a new dentry, pinning lowmem, and
1806 * tmpfs dentries cannot be pruned until they are unlinked. 1799 * tmpfs dentries cannot be pruned until they are unlinked.
1807 */ 1800 */
1808 if (sbinfo->max_inodes) { 1801 ret = shmem_reserve_inode(inode->i_sb);
1809 spin_lock(&sbinfo->stat_lock); 1802 if (ret)
1810 if (!sbinfo->free_inodes) { 1803 goto out;
1811 spin_unlock(&sbinfo->stat_lock);
1812 return -ENOSPC;
1813 }
1814 sbinfo->free_inodes--;
1815 spin_unlock(&sbinfo->stat_lock);
1816 }
1817 1804
1818 dir->i_size += BOGO_DIRENT_SIZE; 1805 dir->i_size += BOGO_DIRENT_SIZE;
1819 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1806 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1821,21 +1808,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1821 atomic_inc(&inode->i_count); /* New dentry reference */ 1808 atomic_inc(&inode->i_count); /* New dentry reference */
1822 dget(dentry); /* Extra pinning count for the created dentry */ 1809 dget(dentry); /* Extra pinning count for the created dentry */
1823 d_instantiate(dentry, inode); 1810 d_instantiate(dentry, inode);
1824 return 0; 1811out:
1812 return ret;
1825} 1813}
1826 1814
1827static int shmem_unlink(struct inode *dir, struct dentry *dentry) 1815static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1828{ 1816{
1829 struct inode *inode = dentry->d_inode; 1817 struct inode *inode = dentry->d_inode;
1830 1818
1831 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { 1819 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
1832 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1820 shmem_free_inode(inode->i_sb);
1833 if (sbinfo->max_inodes) {
1834 spin_lock(&sbinfo->stat_lock);
1835 sbinfo->free_inodes++;
1836 spin_unlock(&sbinfo->stat_lock);
1837 }
1838 }
1839 1821
1840 dir->i_size -= BOGO_DIRENT_SIZE; 1822 dir->i_size -= BOGO_DIRENT_SIZE;
1841 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1823 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1924,6 +1906,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1924 iput(inode); 1906 iput(inode);
1925 return error; 1907 return error;
1926 } 1908 }
1909 unlock_page(page);
1927 inode->i_op = &shmem_symlink_inode_operations; 1910 inode->i_op = &shmem_symlink_inode_operations;
1928 kaddr = kmap_atomic(page, KM_USER0); 1911 kaddr = kmap_atomic(page, KM_USER0);
1929 memcpy(kaddr, symname, len); 1912 memcpy(kaddr, symname, len);
@@ -1951,6 +1934,8 @@ static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1951 struct page *page = NULL; 1934 struct page *page = NULL;
1952 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1935 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1953 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1936 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1937 if (page)
1938 unlock_page(page);
1954 return page; 1939 return page;
1955} 1940}
1956 1941
@@ -1996,8 +1981,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name,
1996{ 1981{
1997 if (strcmp(name, "") == 0) 1982 if (strcmp(name, "") == 0)
1998 return -EINVAL; 1983 return -EINVAL;
1999 return security_inode_getsecurity(inode, name, buffer, size, 1984 return xattr_getsecurity(inode, name, buffer, size);
2000 -EOPNOTSUPP);
2001} 1985}
2002 1986
2003static int shmem_xattr_security_set(struct inode *inode, const char *name, 1987static int shmem_xattr_security_set(struct inode *inode, const char *name,
@@ -2138,7 +2122,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
2138 } 2122 }
2139 if (*rest) 2123 if (*rest)
2140 goto bad_val; 2124 goto bad_val;
2141 *blocks = size >> PAGE_CACHE_SHIFT; 2125 *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2142 } else if (!strcmp(this_char,"nr_blocks")) { 2126 } else if (!strcmp(this_char,"nr_blocks")) {
2143 *blocks = memparse(value,&rest); 2127 *blocks = memparse(value,&rest);
2144 if (*rest) 2128 if (*rest)
@@ -2375,7 +2359,8 @@ static const struct file_operations shmem_file_operations = {
2375#ifdef CONFIG_TMPFS 2359#ifdef CONFIG_TMPFS
2376 .llseek = generic_file_llseek, 2360 .llseek = generic_file_llseek,
2377 .read = shmem_file_read, 2361 .read = shmem_file_read,
2378 .write = shmem_file_write, 2362 .write = do_sync_write,
2363 .aio_write = generic_file_aio_write,
2379 .fsync = simple_sync_file, 2364 .fsync = simple_sync_file,
2380 .splice_read = generic_file_splice_read, 2365 .splice_read = generic_file_splice_read,
2381 .splice_write = generic_file_splice_write, 2366 .splice_write = generic_file_splice_write,
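[Editor's note] Both shmem_getpage() and the reworked shmem_unuse_inode() above follow the same allocation discipline: preload the radix tree while the code may still sleep, then perform the actual add_to_page_cache() with GFP_NOWAIT under info->lock. The general shape — allocate before taking the lock, only link under the lock, discard the preallocation if it goes unused — in a self-contained userspace sketch (illustrative names only, not the radix-tree API):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int key;
	struct node *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *list_head;

/* Insert key unless it already exists; never allocates while holding the lock. */
static int insert_preloaded(int key)
{
	/* "preload" while we can still sleep and fail gracefully */
	struct node *new = malloc(sizeof(*new));
	struct node *n;
	int err = 0;

	if (!new)
		return -1;			/* -ENOMEM analogue */
	new->key = key;

	pthread_mutex_lock(&list_lock);		/* spin_lock(&info->lock) analogue */
	for (n = list_head; n; n = n->next)
		if (n->key == key) {
			err = 1;		/* -EEXIST analogue */
			break;
		}
	if (!err) {
		new->next = list_head;		/* link only, no allocation here */
		list_head = new;
	}
	pthread_mutex_unlock(&list_lock);

	if (err)
		free(new);			/* preload unused: give it back */
	return err;
}

int main(void)
{
	printf("%d %d\n", insert_preloaded(42), insert_preloaded(42));
	return 0;	/* prints "0 1": first insert succeeds, duplicate is rejected */
}

The -EEXIST branch in shmem_unuse_inode() plays the same role as the duplicate check here: finding the entry already present is treated as a benign race, not a failure, and the precharged resources are simply returned.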
diff --git a/mm/slob.c b/mm/slob.c
index 773a7aa80ab5..e2c3c0ec5463 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -12,10 +12,17 @@
12 * allocator is as little as 2 bytes, however typically most architectures 12 * allocator is as little as 2 bytes, however typically most architectures
13 * will require 4 bytes on 32-bit and 8 bytes on 64-bit. 13 * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
14 * 14 *
15 * The slob heap is a linked list of pages from alloc_pages(), and 15 * The slob heap is a set of linked list of pages from alloc_pages(),
16 * within each page, there is a singly-linked list of free blocks (slob_t). 16 * and within each page, there is a singly-linked list of free blocks
17 * The heap is grown on demand and allocation from the heap is currently 17 * (slob_t). The heap is grown on demand. To reduce fragmentation,
18 * first-fit. 18 * heap pages are segregated into three lists, with objects less than
19 * 256 bytes, objects less than 1024 bytes, and all other objects.
20 *
21 * Allocation from heap involves first searching for a page with
22 * sufficient free blocks (using a next-fit-like approach) followed by
23 * a first-fit scan of the page. Deallocation inserts objects back
24 * into the free list in address order, so this is effectively an
25 * address-ordered first fit.
19 * 26 *
20 * Above this is an implementation of kmalloc/kfree. Blocks returned 27 * Above this is an implementation of kmalloc/kfree. Blocks returned
21 * from kmalloc are prepended with a 4-byte header with the kmalloc size. 28 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
@@ -110,9 +117,13 @@ static inline void free_slob_page(struct slob_page *sp)
110} 117}
111 118
112/* 119/*
113 * All (partially) free slob pages go on this list. 120 * All partially free slob pages go on these lists.
114 */ 121 */
115static LIST_HEAD(free_slob_pages); 122#define SLOB_BREAK1 256
123#define SLOB_BREAK2 1024
124static LIST_HEAD(free_slob_small);
125static LIST_HEAD(free_slob_medium);
126static LIST_HEAD(free_slob_large);
116 127
117/* 128/*
118 * slob_page: True for all slob pages (false for bigblock pages) 129 * slob_page: True for all slob pages (false for bigblock pages)
@@ -140,9 +151,9 @@ static inline int slob_page_free(struct slob_page *sp)
140 return test_bit(PG_private, &sp->flags); 151 return test_bit(PG_private, &sp->flags);
141} 152}
142 153
143static inline void set_slob_page_free(struct slob_page *sp) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
144{ 155{
145 list_add(&sp->list, &free_slob_pages); 156 list_add(&sp->list, list);
146 __set_bit(PG_private, &sp->flags); 157 __set_bit(PG_private, &sp->flags);
147} 158}
148 159
@@ -294,12 +305,20 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
294{ 305{
295 struct slob_page *sp; 306 struct slob_page *sp;
296 struct list_head *prev; 307 struct list_head *prev;
308 struct list_head *slob_list;
297 slob_t *b = NULL; 309 slob_t *b = NULL;
298 unsigned long flags; 310 unsigned long flags;
299 311
312 if (size < SLOB_BREAK1)
313 slob_list = &free_slob_small;
314 else if (size < SLOB_BREAK2)
315 slob_list = &free_slob_medium;
316 else
317 slob_list = &free_slob_large;
318
300 spin_lock_irqsave(&slob_lock, flags); 319 spin_lock_irqsave(&slob_lock, flags);
301 /* Iterate through each partially free page, try to find room */ 320 /* Iterate through each partially free page, try to find room */
302 list_for_each_entry(sp, &free_slob_pages, list) { 321 list_for_each_entry(sp, slob_list, list) {
303#ifdef CONFIG_NUMA 322#ifdef CONFIG_NUMA
304 /* 323 /*
305 * If there's a node specification, search for a partial 324 * If there's a node specification, search for a partial
@@ -321,9 +340,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
321 /* Improve fragment distribution and reduce our average 340 /* Improve fragment distribution and reduce our average
322 * search time by starting our next search here. (see 341 * search time by starting our next search here. (see
323 * Knuth vol 1, sec 2.5, pg 449) */ 342 * Knuth vol 1, sec 2.5, pg 449) */
324 if (prev != free_slob_pages.prev && 343 if (prev != slob_list->prev &&
325 free_slob_pages.next != prev->next) 344 slob_list->next != prev->next)
326 list_move_tail(&free_slob_pages, prev->next); 345 list_move_tail(slob_list, prev->next);
327 break; 346 break;
328 } 347 }
329 spin_unlock_irqrestore(&slob_lock, flags); 348 spin_unlock_irqrestore(&slob_lock, flags);
@@ -341,7 +360,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
341 sp->free = b; 360 sp->free = b;
342 INIT_LIST_HEAD(&sp->list); 361 INIT_LIST_HEAD(&sp->list);
343 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 362 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
344 set_slob_page_free(sp); 363 set_slob_page_free(sp, slob_list);
345 b = slob_page_alloc(sp, size, align); 364 b = slob_page_alloc(sp, size, align);
346 BUG_ON(!b); 365 BUG_ON(!b);
347 spin_unlock_irqrestore(&slob_lock, flags); 366 spin_unlock_irqrestore(&slob_lock, flags);
@@ -387,7 +406,7 @@ static void slob_free(void *block, int size)
387 set_slob(b, units, 406 set_slob(b, units,
388 (void *)((unsigned long)(b + 407 (void *)((unsigned long)(b +
389 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 408 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
390 set_slob_page_free(sp); 409 set_slob_page_free(sp, &free_slob_small);
391 goto out; 410 goto out;
392 } 411 }
393 412
@@ -398,6 +417,10 @@ static void slob_free(void *block, int size)
398 sp->units += units; 417 sp->units += units;
399 418
400 if (b < sp->free) { 419 if (b < sp->free) {
420 if (b + units == sp->free) {
421 units += slob_units(sp->free);
422 sp->free = slob_next(sp->free);
423 }
401 set_slob(b, units, sp->free); 424 set_slob(b, units, sp->free);
402 sp->free = b; 425 sp->free = b;
403 } else { 426 } else {
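
The four lines added to slob_free() above merge a freed block with the free block that immediately follows it. A simplified model of that forward merge, with sizes in abstract units rather than the kernel's slob_t encoding:

#include <stddef.h>

/* Toy model: if a freed block ends exactly where the next free block
 * starts, combine them into one larger free block instead of leaving two
 * adjacent fragments on the page. */
struct toy_block { size_t start; size_t units; };

static int toy_try_merge(struct toy_block *freed, const struct toy_block *next_free)
{
	if (freed->start + freed->units == next_free->start) {
		freed->units += next_free->units;	/* one contiguous free block */
		return 1;				/* caller drops next_free */
	}
	return 0;
}
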
diff --git a/mm/slub.c b/mm/slub.c
index 5cc4b7dddb50..3f056677fa8f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -247,7 +247,10 @@ static void sysfs_slab_remove(struct kmem_cache *);
247static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 247static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
248static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 248static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
249 { return 0; } 249 { return 0; }
250static inline void sysfs_slab_remove(struct kmem_cache *s) {} 250static inline void sysfs_slab_remove(struct kmem_cache *s)
251{
252 kfree(s);
253}
251#endif 254#endif
252 255
253/******************************************************************** 256/********************************************************************
@@ -354,22 +357,22 @@ static void print_section(char *text, u8 *addr, unsigned int length)
354 printk(KERN_ERR "%8s 0x%p: ", text, addr + i); 357 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
355 newline = 0; 358 newline = 0;
356 } 359 }
357 printk(" %02x", addr[i]); 360 printk(KERN_CONT " %02x", addr[i]);
358 offset = i % 16; 361 offset = i % 16;
359 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; 362 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
360 if (offset == 15) { 363 if (offset == 15) {
361 printk(" %s\n",ascii); 364 printk(KERN_CONT " %s\n", ascii);
362 newline = 1; 365 newline = 1;
363 } 366 }
364 } 367 }
365 if (!newline) { 368 if (!newline) {
366 i %= 16; 369 i %= 16;
367 while (i < 16) { 370 while (i < 16) {
368 printk(" "); 371 printk(KERN_CONT " ");
369 ascii[i] = ' '; 372 ascii[i] = ' ';
370 i++; 373 i++;
371 } 374 }
372 printk(" %s\n", ascii); 375 printk(KERN_CONT " %s\n", ascii);
373 } 376 }
374} 377}
375 378
@@ -529,7 +532,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
529 532
530 if (s->flags & __OBJECT_POISON) { 533 if (s->flags & __OBJECT_POISON) {
531 memset(p, POISON_FREE, s->objsize - 1); 534 memset(p, POISON_FREE, s->objsize - 1);
532 p[s->objsize -1] = POISON_END; 535 p[s->objsize - 1] = POISON_END;
533 } 536 }
534 537
535 if (s->flags & SLAB_RED_ZONE) 538 if (s->flags & SLAB_RED_ZONE)
@@ -558,7 +561,7 @@ static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
558 561
559static int check_bytes_and_report(struct kmem_cache *s, struct page *page, 562static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
560 u8 *object, char *what, 563 u8 *object, char *what,
561 u8* start, unsigned int value, unsigned int bytes) 564 u8 *start, unsigned int value, unsigned int bytes)
562{ 565{
563 u8 *fault; 566 u8 *fault;
564 u8 *end; 567 u8 *end;
@@ -692,7 +695,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
692 (!check_bytes_and_report(s, page, p, "Poison", p, 695 (!check_bytes_and_report(s, page, p, "Poison", p,
693 POISON_FREE, s->objsize - 1) || 696 POISON_FREE, s->objsize - 1) ||
694 !check_bytes_and_report(s, page, p, "Poison", 697 !check_bytes_and_report(s, page, p, "Poison",
695 p + s->objsize -1, POISON_END, 1))) 698 p + s->objsize - 1, POISON_END, 1)))
696 return 0; 699 return 0;
697 /* 700 /*
698 * check_pad_bytes cleans up on its own. 701 * check_pad_bytes cleans up on its own.
@@ -900,8 +903,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
900 "SLUB <none>: no slab for object 0x%p.\n", 903 "SLUB <none>: no slab for object 0x%p.\n",
901 object); 904 object);
902 dump_stack(); 905 dump_stack();
903 } 906 } else
904 else
905 object_err(s, page, object, 907 object_err(s, page, object,
906 "page slab pointer corrupt."); 908 "page slab pointer corrupt.");
907 goto fail; 909 goto fail;
@@ -947,7 +949,7 @@ static int __init setup_slub_debug(char *str)
947 /* 949 /*
948 * Determine which debug features should be switched on 950 * Determine which debug features should be switched on
949 */ 951 */
950 for ( ;*str && *str != ','; str++) { 952 for (; *str && *str != ','; str++) {
951 switch (tolower(*str)) { 953 switch (tolower(*str)) {
952 case 'f': 954 case 'f':
953 slub_debug |= SLAB_DEBUG_FREE; 955 slub_debug |= SLAB_DEBUG_FREE;
@@ -966,7 +968,7 @@ static int __init setup_slub_debug(char *str)
966 break; 968 break;
967 default: 969 default:
968 printk(KERN_ERR "slub_debug option '%c' " 970 printk(KERN_ERR "slub_debug option '%c' "
969 "unknown. skipped\n",*str); 971 "unknown. skipped\n", *str);
970 } 972 }
971 } 973 }
972 974
@@ -1039,7 +1041,7 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize,
1039 */ 1041 */
1040static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1042static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1041{ 1043{
1042 struct page * page; 1044 struct page *page;
1043 int pages = 1 << s->order; 1045 int pages = 1 << s->order;
1044 1046
1045 if (s->order) 1047 if (s->order)
@@ -1135,7 +1137,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1135 mod_zone_page_state(page_zone(page), 1137 mod_zone_page_state(page_zone(page),
1136 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1138 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
1137 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1139 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1138 - pages); 1140 -pages);
1139 1141
1140 __free_pages(page, s->order); 1142 __free_pages(page, s->order);
1141} 1143}
@@ -1195,19 +1197,15 @@ static __always_inline int slab_trylock(struct page *page)
1195/* 1197/*
1196 * Management of partially allocated slabs 1198 * Management of partially allocated slabs
1197 */ 1199 */
1198static void add_partial_tail(struct kmem_cache_node *n, struct page *page) 1200static void add_partial(struct kmem_cache_node *n,
1199{ 1201 struct page *page, int tail)
1200 spin_lock(&n->list_lock);
1201 n->nr_partial++;
1202 list_add_tail(&page->lru, &n->partial);
1203 spin_unlock(&n->list_lock);
1204}
1205
1206static void add_partial(struct kmem_cache_node *n, struct page *page)
1207{ 1202{
1208 spin_lock(&n->list_lock); 1203 spin_lock(&n->list_lock);
1209 n->nr_partial++; 1204 n->nr_partial++;
1210 list_add(&page->lru, &n->partial); 1205 if (tail)
1206 list_add_tail(&page->lru, &n->partial);
1207 else
1208 list_add(&page->lru, &n->partial);
1211 spin_unlock(&n->list_lock); 1209 spin_unlock(&n->list_lock);
1212} 1210}
1213 1211
@@ -1292,7 +1290,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1292 * expensive if we do it every time we are trying to find a slab 1290 * expensive if we do it every time we are trying to find a slab
1293 * with available objects. 1291 * with available objects.
1294 */ 1292 */
1295 if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) 1293 if (!s->remote_node_defrag_ratio ||
1294 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1296 return NULL; 1295 return NULL;
1297 1296
1298 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 1297 zonelist = &NODE_DATA(slab_node(current->mempolicy))
@@ -1335,7 +1334,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1335 * 1334 *
1336 * On exit the slab lock will have been dropped. 1335 * On exit the slab lock will have been dropped.
1337 */ 1336 */
1338static void unfreeze_slab(struct kmem_cache *s, struct page *page) 1337static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1339{ 1338{
1340 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1339 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1341 1340
@@ -1343,7 +1342,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1343 if (page->inuse) { 1342 if (page->inuse) {
1344 1343
1345 if (page->freelist) 1344 if (page->freelist)
1346 add_partial(n, page); 1345 add_partial(n, page, tail);
1347 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1346 else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
1348 add_full(n, page); 1347 add_full(n, page);
1349 slab_unlock(page); 1348 slab_unlock(page);
@@ -1358,7 +1357,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1358 * partial list stays small. kmem_cache_shrink can 1357 * partial list stays small. kmem_cache_shrink can
1359 * reclaim empty slabs from the partial list. 1358 * reclaim empty slabs from the partial list.
1360 */ 1359 */
1361 add_partial_tail(n, page); 1360 add_partial(n, page, 1);
1362 slab_unlock(page); 1361 slab_unlock(page);
1363 } else { 1362 } else {
1364 slab_unlock(page); 1363 slab_unlock(page);
@@ -1373,6 +1372,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1373static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1372static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1374{ 1373{
1375 struct page *page = c->page; 1374 struct page *page = c->page;
1375 int tail = 1;
1376 /* 1376 /*
1377 * Merge cpu freelist into freelist. Typically we get here 1377 * Merge cpu freelist into freelist. Typically we get here
1378 * because both freelists are empty. So this is unlikely 1378 * because both freelists are empty. So this is unlikely
@@ -1381,6 +1381,8 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1381 while (unlikely(c->freelist)) { 1381 while (unlikely(c->freelist)) {
1382 void **object; 1382 void **object;
1383 1383
1384 tail = 0; /* Hot objects. Put the slab first */
1385
1384 /* Retrieve object from cpu_freelist */ 1386 /* Retrieve object from cpu_freelist */
1385 object = c->freelist; 1387 object = c->freelist;
1386 c->freelist = c->freelist[c->offset]; 1388 c->freelist = c->freelist[c->offset];
@@ -1391,7 +1393,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1391 page->inuse--; 1393 page->inuse--;
1392 } 1394 }
1393 c->page = NULL; 1395 c->page = NULL;
1394 unfreeze_slab(s, page); 1396 unfreeze_slab(s, page, tail);
1395} 1397}
1396 1398
1397static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1399static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
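
Taken together, the add_partial()/unfreeze_slab()/deactivate_slab() changes above encode one heuristic: a deactivated slab that still had objects on its cpu freelist is presumed cache-hot and queued at the head of the partial list, while a fully drained slab goes to the tail. A toy sketch of that placement using a plain sentinel-headed circular list (not the kernel's struct list_head):

/* head is a circular sentinel: when empty, head->next == head->prev == head. */
struct toy_slab { struct toy_slab *prev, *next; };

static void toy_add_partial(struct toy_slab *head, struct toy_slab *sp, int tail)
{
	struct toy_slab *pos = tail ? head->prev : head;	/* last node or sentinel */

	/* Insert sp right after pos: after the sentinel = list head (hot slab),
	 * after the last node = list tail (cold slab). */
	sp->next = pos->next;
	sp->prev = pos;
	pos->next->prev = sp;
	pos->next = sp;
}
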
@@ -1539,7 +1541,7 @@ debug:
1539 * 1541 *
1540 * Otherwise we can simply pick the next object from the lockless free list. 1542 * Otherwise we can simply pick the next object from the lockless free list.
1541 */ 1543 */
1542static void __always_inline *slab_alloc(struct kmem_cache *s, 1544static __always_inline void *slab_alloc(struct kmem_cache *s,
1543 gfp_t gfpflags, int node, void *addr) 1545 gfp_t gfpflags, int node, void *addr)
1544{ 1546{
1545 void **object; 1547 void **object;
@@ -1613,7 +1615,7 @@ checks_ok:
1613 * then add it. 1615 * then add it.
1614 */ 1616 */
1615 if (unlikely(!prior)) 1617 if (unlikely(!prior))
1616 add_partial_tail(get_node(s, page_to_nid(page)), page); 1618 add_partial(get_node(s, page_to_nid(page)), page, 1);
1617 1619
1618out_unlock: 1620out_unlock:
1619 slab_unlock(page); 1621 slab_unlock(page);
@@ -1647,7 +1649,7 @@ debug:
1647 * If fastpath is not possible then fall back to __slab_free where we deal 1649 * If fastpath is not possible then fall back to __slab_free where we deal
1648 * with all sorts of special processing. 1650 * with all sorts of special processing.
1649 */ 1651 */
1650static void __always_inline slab_free(struct kmem_cache *s, 1652static __always_inline void slab_free(struct kmem_cache *s,
1651 struct page *page, void *x, void *addr) 1653 struct page *page, void *x, void *addr)
1652{ 1654{
1653 void **object = (void *)x; 1655 void **object = (void *)x;
@@ -1997,6 +1999,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
1997{ 1999{
1998 struct page *page; 2000 struct page *page;
1999 struct kmem_cache_node *n; 2001 struct kmem_cache_node *n;
2002 unsigned long flags;
2000 2003
2001 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2004 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
2002 2005
@@ -2021,7 +2024,14 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2021#endif 2024#endif
2022 init_kmem_cache_node(n); 2025 init_kmem_cache_node(n);
2023 atomic_long_inc(&n->nr_slabs); 2026 atomic_long_inc(&n->nr_slabs);
2024 add_partial(n, page); 2027 /*
2028 * lockdep requires consistent irq usage for each lock
2029 * so even though there cannot be a race this early in
2030 * the boot sequence, we still disable irqs.
2031 */
2032 local_irq_save(flags);
2033 add_partial(n, page, 0);
2034 local_irq_restore(flags);
2025 return n; 2035 return n;
2026} 2036}
2027 2037
@@ -2206,7 +2216,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2206 2216
2207 s->refcount = 1; 2217 s->refcount = 1;
2208#ifdef CONFIG_NUMA 2218#ifdef CONFIG_NUMA
2209 s->defrag_ratio = 100; 2219 s->remote_node_defrag_ratio = 100;
2210#endif 2220#endif
2211 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2221 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2212 goto error; 2222 goto error;
@@ -2228,7 +2238,7 @@ error:
2228 */ 2238 */
2229int kmem_ptr_validate(struct kmem_cache *s, const void *object) 2239int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2230{ 2240{
2231 struct page * page; 2241 struct page *page;
2232 2242
2233 page = get_object_page(object); 2243 page = get_object_page(object);
2234 2244
@@ -2322,7 +2332,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
2322 if (kmem_cache_close(s)) 2332 if (kmem_cache_close(s))
2323 WARN_ON(1); 2333 WARN_ON(1);
2324 sysfs_slab_remove(s); 2334 sysfs_slab_remove(s);
2325 kfree(s);
2326 } else 2335 } else
2327 up_write(&slub_lock); 2336 up_write(&slub_lock);
2328} 2337}
@@ -2341,7 +2350,7 @@ static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
2341 2350
2342static int __init setup_slub_min_order(char *str) 2351static int __init setup_slub_min_order(char *str)
2343{ 2352{
2344 get_option (&str, &slub_min_order); 2353 get_option(&str, &slub_min_order);
2345 2354
2346 return 1; 2355 return 1;
2347} 2356}
@@ -2350,7 +2359,7 @@ __setup("slub_min_order=", setup_slub_min_order);
2350 2359
2351static int __init setup_slub_max_order(char *str) 2360static int __init setup_slub_max_order(char *str)
2352{ 2361{
2353 get_option (&str, &slub_max_order); 2362 get_option(&str, &slub_max_order);
2354 2363
2355 return 1; 2364 return 1;
2356} 2365}
@@ -2359,7 +2368,7 @@ __setup("slub_max_order=", setup_slub_max_order);
2359 2368
2360static int __init setup_slub_min_objects(char *str) 2369static int __init setup_slub_min_objects(char *str)
2361{ 2370{
2362 get_option (&str, &slub_min_objects); 2371 get_option(&str, &slub_min_objects);
2363 2372
2364 return 1; 2373 return 1;
2365} 2374}
@@ -2605,6 +2614,19 @@ void kfree(const void *x)
2605} 2614}
2606EXPORT_SYMBOL(kfree); 2615EXPORT_SYMBOL(kfree);
2607 2616
2617static unsigned long count_partial(struct kmem_cache_node *n)
2618{
2619 unsigned long flags;
2620 unsigned long x = 0;
2621 struct page *page;
2622
2623 spin_lock_irqsave(&n->list_lock, flags);
2624 list_for_each_entry(page, &n->partial, lru)
2625 x += page->inuse;
2626 spin_unlock_irqrestore(&n->list_lock, flags);
2627 return x;
2628}
2629
2608/* 2630/*
2609 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 2631 * kmem_cache_shrink removes empty slabs from the partial lists and sorts
2610 * the remaining slabs by the number of items in use. The slabs with the 2632 * the remaining slabs by the number of items in use. The slabs with the
@@ -2931,7 +2953,7 @@ static struct kmem_cache *find_mergeable(size_t size,
2931 * Check if alignment is compatible. 2953 * Check if alignment is compatible.
2932 * Courtesy of Adrian Drzewiecki 2954 * Courtesy of Adrian Drzewiecki
2933 */ 2955 */
2934 if ((s->size & ~(align -1)) != s->size) 2956 if ((s->size & ~(align - 1)) != s->size)
2935 continue; 2957 continue;
2936 2958
2937 if (s->size - size >= sizeof(void *)) 2959 if (s->size - size >= sizeof(void *))
@@ -3040,8 +3062,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3040 return NOTIFY_OK; 3062 return NOTIFY_OK;
3041} 3063}
3042 3064
3043static struct notifier_block __cpuinitdata slab_notifier = 3065static struct notifier_block __cpuinitdata slab_notifier = {
3044 { &slab_cpuup_callback, NULL, 0 }; 3066 &slab_cpuup_callback, NULL, 0
3067};
3045 3068
3046#endif 3069#endif
3047 3070
@@ -3076,19 +3099,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3076 return slab_alloc(s, gfpflags, node, caller); 3099 return slab_alloc(s, gfpflags, node, caller);
3077} 3100}
3078 3101
3079static unsigned long count_partial(struct kmem_cache_node *n)
3080{
3081 unsigned long flags;
3082 unsigned long x = 0;
3083 struct page *page;
3084
3085 spin_lock_irqsave(&n->list_lock, flags);
3086 list_for_each_entry(page, &n->partial, lru)
3087 x += page->inuse;
3088 spin_unlock_irqrestore(&n->list_lock, flags);
3089 return x;
3090}
3091
3092#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) 3102#if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
3093static int validate_slab(struct kmem_cache *s, struct page *page, 3103static int validate_slab(struct kmem_cache *s, struct page *page,
3094 unsigned long *map) 3104 unsigned long *map)
@@ -3390,7 +3400,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
3390static int list_locations(struct kmem_cache *s, char *buf, 3400static int list_locations(struct kmem_cache *s, char *buf,
3391 enum track_item alloc) 3401 enum track_item alloc)
3392{ 3402{
3393 int n = 0; 3403 int len = 0;
3394 unsigned long i; 3404 unsigned long i;
3395 struct loc_track t = { 0, 0, NULL }; 3405 struct loc_track t = { 0, 0, NULL };
3396 int node; 3406 int node;
@@ -3421,54 +3431,54 @@ static int list_locations(struct kmem_cache *s, char *buf,
3421 for (i = 0; i < t.count; i++) { 3431 for (i = 0; i < t.count; i++) {
3422 struct location *l = &t.loc[i]; 3432 struct location *l = &t.loc[i];
3423 3433
3424 if (n > PAGE_SIZE - 100) 3434 if (len > PAGE_SIZE - 100)
3425 break; 3435 break;
3426 n += sprintf(buf + n, "%7ld ", l->count); 3436 len += sprintf(buf + len, "%7ld ", l->count);
3427 3437
3428 if (l->addr) 3438 if (l->addr)
3429 n += sprint_symbol(buf + n, (unsigned long)l->addr); 3439 len += sprint_symbol(buf + len, (unsigned long)l->addr);
3430 else 3440 else
3431 n += sprintf(buf + n, "<not-available>"); 3441 len += sprintf(buf + len, "<not-available>");
3432 3442
3433 if (l->sum_time != l->min_time) { 3443 if (l->sum_time != l->min_time) {
3434 unsigned long remainder; 3444 unsigned long remainder;
3435 3445
3436 n += sprintf(buf + n, " age=%ld/%ld/%ld", 3446 len += sprintf(buf + len, " age=%ld/%ld/%ld",
3437 l->min_time, 3447 l->min_time,
3438 div_long_long_rem(l->sum_time, l->count, &remainder), 3448 div_long_long_rem(l->sum_time, l->count, &remainder),
3439 l->max_time); 3449 l->max_time);
3440 } else 3450 } else
3441 n += sprintf(buf + n, " age=%ld", 3451 len += sprintf(buf + len, " age=%ld",
3442 l->min_time); 3452 l->min_time);
3443 3453
3444 if (l->min_pid != l->max_pid) 3454 if (l->min_pid != l->max_pid)
3445 n += sprintf(buf + n, " pid=%ld-%ld", 3455 len += sprintf(buf + len, " pid=%ld-%ld",
3446 l->min_pid, l->max_pid); 3456 l->min_pid, l->max_pid);
3447 else 3457 else
3448 n += sprintf(buf + n, " pid=%ld", 3458 len += sprintf(buf + len, " pid=%ld",
3449 l->min_pid); 3459 l->min_pid);
3450 3460
3451 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && 3461 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
3452 n < PAGE_SIZE - 60) { 3462 len < PAGE_SIZE - 60) {
3453 n += sprintf(buf + n, " cpus="); 3463 len += sprintf(buf + len, " cpus=");
3454 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3464 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3455 l->cpus); 3465 l->cpus);
3456 } 3466 }
3457 3467
3458 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3468 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
3459 n < PAGE_SIZE - 60) { 3469 len < PAGE_SIZE - 60) {
3460 n += sprintf(buf + n, " nodes="); 3470 len += sprintf(buf + len, " nodes=");
3461 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3471 len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3462 l->nodes); 3472 l->nodes);
3463 } 3473 }
3464 3474
3465 n += sprintf(buf + n, "\n"); 3475 len += sprintf(buf + len, "\n");
3466 } 3476 }
3467 3477
3468 free_loc_track(&t); 3478 free_loc_track(&t);
3469 if (!t.count) 3479 if (!t.count)
3470 n += sprintf(buf, "No data\n"); 3480 len += sprintf(buf, "No data\n");
3471 return n; 3481 return len;
3472} 3482}
3473 3483
3474enum slab_stat_type { 3484enum slab_stat_type {
@@ -3498,7 +3508,6 @@ static unsigned long slab_objects(struct kmem_cache *s,
3498 3508
3499 for_each_possible_cpu(cpu) { 3509 for_each_possible_cpu(cpu) {
3500 struct page *page; 3510 struct page *page;
3501 int node;
3502 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); 3511 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3503 3512
3504 if (!c) 3513 if (!c)
@@ -3510,8 +3519,6 @@ static unsigned long slab_objects(struct kmem_cache *s,
3510 continue; 3519 continue;
3511 if (page) { 3520 if (page) {
3512 if (flags & SO_CPU) { 3521 if (flags & SO_CPU) {
3513 int x = 0;
3514
3515 if (flags & SO_OBJECTS) 3522 if (flags & SO_OBJECTS)
3516 x = page->inuse; 3523 x = page->inuse;
3517 else 3524 else
@@ -3848,24 +3855,24 @@ static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
3848SLAB_ATTR_RO(free_calls); 3855SLAB_ATTR_RO(free_calls);
3849 3856
3850#ifdef CONFIG_NUMA 3857#ifdef CONFIG_NUMA
3851static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf) 3858static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
3852{ 3859{
3853 return sprintf(buf, "%d\n", s->defrag_ratio / 10); 3860 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10);
3854} 3861}
3855 3862
3856static ssize_t defrag_ratio_store(struct kmem_cache *s, 3863static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
3857 const char *buf, size_t length) 3864 const char *buf, size_t length)
3858{ 3865{
3859 int n = simple_strtoul(buf, NULL, 10); 3866 int n = simple_strtoul(buf, NULL, 10);
3860 3867
3861 if (n < 100) 3868 if (n < 100)
3862 s->defrag_ratio = n * 10; 3869 s->remote_node_defrag_ratio = n * 10;
3863 return length; 3870 return length;
3864} 3871}
3865SLAB_ATTR(defrag_ratio); 3872SLAB_ATTR(remote_node_defrag_ratio);
3866#endif 3873#endif
3867 3874
3868static struct attribute * slab_attrs[] = { 3875static struct attribute *slab_attrs[] = {
3869 &slab_size_attr.attr, 3876 &slab_size_attr.attr,
3870 &object_size_attr.attr, 3877 &object_size_attr.attr,
3871 &objs_per_slab_attr.attr, 3878 &objs_per_slab_attr.attr,
@@ -3893,7 +3900,7 @@ static struct attribute * slab_attrs[] = {
3893 &cache_dma_attr.attr, 3900 &cache_dma_attr.attr,
3894#endif 3901#endif
3895#ifdef CONFIG_NUMA 3902#ifdef CONFIG_NUMA
3896 &defrag_ratio_attr.attr, 3903 &remote_node_defrag_ratio_attr.attr,
3897#endif 3904#endif
3898 NULL 3905 NULL
3899}; 3906};
@@ -3940,6 +3947,13 @@ static ssize_t slab_attr_store(struct kobject *kobj,
3940 return err; 3947 return err;
3941} 3948}
3942 3949
3950static void kmem_cache_release(struct kobject *kobj)
3951{
3952 struct kmem_cache *s = to_slab(kobj);
3953
3954 kfree(s);
3955}
3956
3943static struct sysfs_ops slab_sysfs_ops = { 3957static struct sysfs_ops slab_sysfs_ops = {
3944 .show = slab_attr_show, 3958 .show = slab_attr_show,
3945 .store = slab_attr_store, 3959 .store = slab_attr_store,
@@ -3947,6 +3961,7 @@ static struct sysfs_ops slab_sysfs_ops = {
3947 3961
3948static struct kobj_type slab_ktype = { 3962static struct kobj_type slab_ktype = {
3949 .sysfs_ops = &slab_sysfs_ops, 3963 .sysfs_ops = &slab_sysfs_ops,
3964 .release = kmem_cache_release
3950}; 3965};
3951 3966
3952static int uevent_filter(struct kset *kset, struct kobject *kobj) 3967static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -4048,6 +4063,7 @@ static void sysfs_slab_remove(struct kmem_cache *s)
4048{ 4063{
4049 kobject_uevent(&s->kobj, KOBJ_REMOVE); 4064 kobject_uevent(&s->kobj, KOBJ_REMOVE);
4050 kobject_del(&s->kobj); 4065 kobject_del(&s->kobj);
4066 kobject_put(&s->kobj);
4051} 4067}
4052 4068
4053/* 4069/*
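
The net effect of the slub sysfs changes above: kmem_cache_destroy() no longer calls kfree(s) directly; sysfs_slab_remove() now drops the last kobject reference, and the new kmem_cache_release() callback frees the cache when that reference count reaches zero. A minimal userspace model of that handover, with hypothetical toy_* names:

#include <stdlib.h>

struct toy_cache {
	int refcount;				/* stands in for the kobject refcount */
	void (*release)(struct toy_cache *s);	/* stands in for kobj_type.release */
};

static void toy_release(struct toy_cache *s)
{
	free(s);				/* the kfree(s) now lives here */
}

static void toy_put(struct toy_cache *s)
{
	if (--s->refcount == 0)
		s->release(s);			/* last reference frees the object */
}
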
diff --git a/mm/sparse.c b/mm/sparse.c
index a2183cb5d524..f6a43c09c322 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -237,7 +237,7 @@ static unsigned long *__kmalloc_section_usemap(void)
237} 237}
238#endif /* CONFIG_MEMORY_HOTPLUG */ 238#endif /* CONFIG_MEMORY_HOTPLUG */
239 239
240static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) 240static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
241{ 241{
242 unsigned long *usemap; 242 unsigned long *usemap;
243 struct mem_section *ms = __nr_to_section(pnum); 243 struct mem_section *ms = __nr_to_section(pnum);
@@ -353,17 +353,9 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
353 return __kmalloc_section_memmap(nr_pages); 353 return __kmalloc_section_memmap(nr_pages);
354} 354}
355 355
356static int vaddr_in_vmalloc_area(void *addr)
357{
358 if (addr >= (void *)VMALLOC_START &&
359 addr < (void *)VMALLOC_END)
360 return 1;
361 return 0;
362}
363
364static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 356static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
365{ 357{
366 if (vaddr_in_vmalloc_area(memmap)) 358 if (is_vmalloc_addr(memmap))
367 vfree(memmap); 359 vfree(memmap);
368 else 360 else
369 free_pages((unsigned long)memmap, 361 free_pages((unsigned long)memmap,
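
For context, is_vmalloc_addr() is simply a range test against the vmalloc virtual window, which is why the open-coded helper above could be dropped. A userspace stand-in with placeholder bounds (the real VMALLOC_START/VMALLOC_END are per-architecture):

/* Placeholder bounds; the kernel uses the architecture's VMALLOC_START/END. */
#define TOY_VMALLOC_START 0xc0000000UL
#define TOY_VMALLOC_END   0xe0000000UL

static int toy_is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)x;

	return addr >= TOY_VMALLOC_START && addr < TOY_VMALLOC_END;
}
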
diff --git a/mm/swap.c b/mm/swap.c
index 9ac88323d237..710a20bb9749 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,6 +29,7 @@
29#include <linux/cpu.h> 29#include <linux/cpu.h>
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h>
32 33
33/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
34int page_cluster; 35int page_cluster;
@@ -41,7 +42,7 @@ static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
41 * This path almost never happens for VM activity - pages are normally 42 * This path almost never happens for VM activity - pages are normally
42 * freed via pagevecs. But it gets used by networking. 43 * freed via pagevecs. But it gets used by networking.
43 */ 44 */
44static void fastcall __page_cache_release(struct page *page) 45static void __page_cache_release(struct page *page)
45{ 46{
46 if (PageLRU(page)) { 47 if (PageLRU(page)) {
47 unsigned long flags; 48 unsigned long flags;
@@ -165,7 +166,7 @@ int rotate_reclaimable_page(struct page *page)
165/* 166/*
166 * FIXME: speed this up? 167 * FIXME: speed this up?
167 */ 168 */
168void fastcall activate_page(struct page *page) 169void activate_page(struct page *page)
169{ 170{
170 struct zone *zone = page_zone(page); 171 struct zone *zone = page_zone(page);
171 172
@@ -175,6 +176,7 @@ void fastcall activate_page(struct page *page)
175 SetPageActive(page); 176 SetPageActive(page);
176 add_page_to_active_list(zone, page); 177 add_page_to_active_list(zone, page);
177 __count_vm_event(PGACTIVATE); 178 __count_vm_event(PGACTIVATE);
179 mem_cgroup_move_lists(page_get_page_cgroup(page), true);
178 } 180 }
179 spin_unlock_irq(&zone->lru_lock); 181 spin_unlock_irq(&zone->lru_lock);
180} 182}
@@ -186,7 +188,7 @@ void fastcall activate_page(struct page *page)
186 * inactive,referenced -> active,unreferenced 188 * inactive,referenced -> active,unreferenced
187 * active,unreferenced -> active,referenced 189 * active,unreferenced -> active,referenced
188 */ 190 */
189void fastcall mark_page_accessed(struct page *page) 191void mark_page_accessed(struct page *page)
190{ 192{
191 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { 193 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
192 activate_page(page); 194 activate_page(page);
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mark_page_accessed);
202 * lru_cache_add: add a page to the page lists 204 * lru_cache_add: add a page to the page lists
203 * @page: the page to add 205 * @page: the page to add
204 */ 206 */
205void fastcall lru_cache_add(struct page *page) 207void lru_cache_add(struct page *page)
206{ 208{
207 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 209 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
208 210
@@ -212,7 +214,7 @@ void fastcall lru_cache_add(struct page *page)
212 put_cpu_var(lru_add_pvecs); 214 put_cpu_var(lru_add_pvecs);
213} 215}
214 216
215void fastcall lru_cache_add_active(struct page *page) 217void lru_cache_add_active(struct page *page)
216{ 218{
217 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); 219 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
218 220
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b52635601dfe..ec42f01a8d02 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/swapops.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/pagemap.h> 15#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
@@ -51,26 +52,22 @@ static struct {
51 unsigned long del_total; 52 unsigned long del_total;
52 unsigned long find_success; 53 unsigned long find_success;
53 unsigned long find_total; 54 unsigned long find_total;
54 unsigned long noent_race;
55 unsigned long exist_race;
56} swap_cache_info; 55} swap_cache_info;
57 56
58void show_swap_cache_info(void) 57void show_swap_cache_info(void)
59{ 58{
60 printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", 59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
61 swap_cache_info.add_total, swap_cache_info.del_total, 60 swap_cache_info.add_total, swap_cache_info.del_total,
62 swap_cache_info.find_success, swap_cache_info.find_total, 61 swap_cache_info.find_success, swap_cache_info.find_total);
63 swap_cache_info.noent_race, swap_cache_info.exist_race);
64 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
65 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
66} 64}
67 65
68/* 66/*
69 * __add_to_swap_cache resembles add_to_page_cache on swapper_space, 67 * add_to_swap_cache resembles add_to_page_cache on swapper_space,
70 * but sets SwapCache flag and private instead of mapping and index. 68 * but sets SwapCache flag and private instead of mapping and index.
71 */ 69 */
72static int __add_to_swap_cache(struct page *page, swp_entry_t entry, 70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
73 gfp_t gfp_mask)
74{ 71{
75 int error; 72 int error;
76 73
@@ -88,6 +85,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
88 set_page_private(page, entry.val); 85 set_page_private(page, entry.val);
89 total_swapcache_pages++; 86 total_swapcache_pages++;
90 __inc_zone_page_state(page, NR_FILE_PAGES); 87 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total);
91 } 89 }
92 write_unlock_irq(&swapper_space.tree_lock); 90 write_unlock_irq(&swapper_space.tree_lock);
93 radix_tree_preload_end(); 91 radix_tree_preload_end();
@@ -95,31 +93,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
95 return error; 93 return error;
96} 94}
97 95
98static int add_to_swap_cache(struct page *page, swp_entry_t entry)
99{
100 int error;
101
102 BUG_ON(PageLocked(page));
103 if (!swap_duplicate(entry)) {
104 INC_CACHE_INFO(noent_race);
105 return -ENOENT;
106 }
107 SetPageLocked(page);
108 error = __add_to_swap_cache(page, entry, GFP_KERNEL);
109 /*
110 * Anon pages are already on the LRU, we don't run lru_cache_add here.
111 */
112 if (error) {
113 ClearPageLocked(page);
114 swap_free(entry);
115 if (error == -EEXIST)
116 INC_CACHE_INFO(exist_race);
117 return error;
118 }
119 INC_CACHE_INFO(add_total);
120 return 0;
121}
122
123/* 96/*
124 * This must be called only on pages that have 97 * This must be called only on pages that have
125 * been verified to be in the swap cache. 98 * been verified to be in the swap cache.
@@ -152,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
152 int err; 125 int err;
153 126
154 BUG_ON(!PageLocked(page)); 127 BUG_ON(!PageLocked(page));
128 BUG_ON(!PageUptodate(page));
155 129
156 for (;;) { 130 for (;;) {
157 entry = get_swap_page(); 131 entry = get_swap_page();
@@ -169,18 +143,15 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
169 /* 143 /*
170 * Add it to the swap cache and mark it dirty 144 * Add it to the swap cache and mark it dirty
171 */ 145 */
172 err = __add_to_swap_cache(page, entry, 146 err = add_to_swap_cache(page, entry,
173 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 147 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
174 148
175 switch (err) { 149 switch (err) {
176 case 0: /* Success */ 150 case 0: /* Success */
177 SetPageUptodate(page);
178 SetPageDirty(page); 151 SetPageDirty(page);
179 INC_CACHE_INFO(add_total);
180 return 1; 152 return 1;
181 case -EEXIST: 153 case -EEXIST:
182 /* Raced with "speculative" read_swap_cache_async */ 154 /* Raced with "speculative" read_swap_cache_async */
183 INC_CACHE_INFO(exist_race);
184 swap_free(entry); 155 swap_free(entry);
185 continue; 156 continue;
186 default: 157 default:
@@ -211,40 +182,6 @@ void delete_from_swap_cache(struct page *page)
211 page_cache_release(page); 182 page_cache_release(page);
212} 183}
213 184
214/*
215 * Strange swizzling function only for use by shmem_writepage
216 */
217int move_to_swap_cache(struct page *page, swp_entry_t entry)
218{
219 int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
220 if (!err) {
221 remove_from_page_cache(page);
222 page_cache_release(page); /* pagecache ref */
223 if (!swap_duplicate(entry))
224 BUG();
225 SetPageDirty(page);
226 INC_CACHE_INFO(add_total);
227 } else if (err == -EEXIST)
228 INC_CACHE_INFO(exist_race);
229 return err;
230}
231
232/*
233 * Strange swizzling function for shmem_getpage (and shmem_unuse)
234 */
235int move_from_swap_cache(struct page *page, unsigned long index,
236 struct address_space *mapping)
237{
238 int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
239 if (!err) {
240 delete_from_swap_cache(page);
241 /* shift page from clean_pages to dirty_pages list */
242 ClearPageDirty(page);
243 set_page_dirty(page);
244 }
245 return err;
246}
247
248/* 185/*
249 * If we are the only user, then try to free up the swap cache. 186 * If we are the only user, then try to free up the swap cache.
250 * 187 *
@@ -317,7 +254,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
317 * A failure return means that either the page allocation failed or that 254 * A failure return means that either the page allocation failed or that
318 * the swap entry is no longer in use. 255 * the swap entry is no longer in use.
319 */ 256 */
320struct page *read_swap_cache_async(swp_entry_t entry, 257struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
321 struct vm_area_struct *vma, unsigned long addr) 258 struct vm_area_struct *vma, unsigned long addr)
322{ 259{
323 struct page *found_page, *new_page = NULL; 260 struct page *found_page, *new_page = NULL;
@@ -337,23 +274,27 @@ struct page *read_swap_cache_async(swp_entry_t entry,
337 * Get a new page to read into from swap. 274 * Get a new page to read into from swap.
338 */ 275 */
339 if (!new_page) { 276 if (!new_page) {
340 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 277 new_page = alloc_page_vma(gfp_mask, vma, addr);
341 vma, addr);
342 if (!new_page) 278 if (!new_page)
343 break; /* Out of memory */ 279 break; /* Out of memory */
344 } 280 }
345 281
346 /* 282 /*
283 * Swap entry may have been freed since our caller observed it.
284 */
285 if (!swap_duplicate(entry))
286 break;
287
288 /*
347 * Associate the page with swap entry in the swap cache. 289 * Associate the page with swap entry in the swap cache.
348 * May fail (-ENOENT) if swap entry has been freed since 290 * May fail (-EEXIST) if there is already a page associated
349 * our caller observed it. May fail (-EEXIST) if there 291 * with this entry in the swap cache: added by a racing
350 * is already a page associated with this entry in the 292 * read_swap_cache_async, or add_to_swap or shmem_writepage
351 * swap cache: added by a racing read_swap_cache_async, 293 * re-using the just freed swap entry for an existing page.
352 * or by try_to_swap_out (or shmem_writepage) re-using
353 * the just freed swap entry for an existing page.
354 * May fail (-ENOMEM) if radix-tree node allocation failed. 294 * May fail (-ENOMEM) if radix-tree node allocation failed.
355 */ 295 */
356 err = add_to_swap_cache(new_page, entry); 296 SetPageLocked(new_page);
297 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
357 if (!err) { 298 if (!err) {
358 /* 299 /*
359 * Initiate read into locked page and return. 300 * Initiate read into locked page and return.
@@ -362,9 +303,57 @@ struct page *read_swap_cache_async(swp_entry_t entry,
362 swap_readpage(NULL, new_page); 303 swap_readpage(NULL, new_page);
363 return new_page; 304 return new_page;
364 } 305 }
365 } while (err != -ENOENT && err != -ENOMEM); 306 ClearPageLocked(new_page);
307 swap_free(entry);
308 } while (err != -ENOMEM);
366 309
367 if (new_page) 310 if (new_page)
368 page_cache_release(new_page); 311 page_cache_release(new_page);
369 return found_page; 312 return found_page;
370} 313}
314
315/**
316 * swapin_readahead - swap in pages in hope we need them soon
317 * @entry: swap entry of this memory
318 * @vma: user vma this address belongs to
319 * @addr: target address for mempolicy
320 *
321 * Returns the struct page for entry and addr, after queueing swapin.
322 *
323 * Primitive swap readahead code. We simply read an aligned block of
324 * (1 << page_cluster) entries in the swap area. This method is chosen
325 * because it doesn't cost us any seek time. We also make sure to queue
326 * the 'original' request together with the readahead ones...
327 *
328 * This has been extended to use the NUMA policies from the mm triggering
329 * the readahead.
330 *
331 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
332 */
333struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
334 struct vm_area_struct *vma, unsigned long addr)
335{
336 int nr_pages;
337 struct page *page;
338 unsigned long offset;
339 unsigned long end_offset;
340
341 /*
342 * Get starting offset for readaround, and number of pages to read.
343 * Adjust starting address by readbehind (for NUMA interleave case)?
344 * No, it's very unlikely that swap layout would follow vma layout,
345 * more likely that neighbouring swap pages came from the same node:
346 * so use the same "addr" to choose the same node for each swap read.
347 */
348 nr_pages = valid_swaphandles(entry, &offset);
349 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
350 /* Ok, do the async read-ahead now */
351 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
352 gfp_mask, vma, addr);
353 if (!page)
354 break;
355 page_cache_release(page);
356 }
357 lru_add_drain(); /* Push any new pages onto the LRU now */
358 return read_swap_cache_async(entry, gfp_mask, vma, addr);
359}
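
The readahead window described in the comment block above is plain arithmetic: round the target swap offset down to a (1 << page_cluster)-aligned block, skip offset 0 (the swap header), and queue the whole block. A self-contained sketch of that window computation:

#include <stdio.h>

/* Model of the swap readahead window: align the target down to a
 * (1 << page_cluster)-sized block, never including the swap header page. */
static void readahead_window(unsigned long target, int page_cluster,
			     unsigned long *start, unsigned long *end)
{
	unsigned long base = (target >> page_cluster) << page_cluster;

	*start = base ? base : 1;	/* offset 0 is the swap header */
	*end = base + (1UL << page_cluster);
}

int main(void)
{
	unsigned long start, end;

	readahead_window(1037, 3, &start, &end);	  /* 8-page cluster */
	printf("read offsets [%lu, %lu)\n", start, end);  /* [1032, 1040) */
	return 0;
}
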
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648e1360..02ccab5ad9d9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
27#include <linux/mutex.h> 27#include <linux/mutex.h>
28#include <linux/capability.h> 28#include <linux/capability.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/memcontrol.h>
30 31
31#include <asm/pgtable.h> 32#include <asm/pgtable.h>
32#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
@@ -506,9 +507,24 @@ unsigned int count_swap_pages(int type, int free)
506 * just let do_wp_page work it out if a write is requested later - to 507 * just let do_wp_page work it out if a write is requested later - to
507 * force COW, vm_page_prot omits write permission from any private vma. 508 * force COW, vm_page_prot omits write permission from any private vma.
508 */ 509 */
509static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, 510static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
510 unsigned long addr, swp_entry_t entry, struct page *page) 511 unsigned long addr, swp_entry_t entry, struct page *page)
511{ 512{
513 spinlock_t *ptl;
514 pte_t *pte;
515 int ret = 1;
516
517 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
518 ret = -ENOMEM;
519
520 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
521 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
522 if (ret > 0)
523 mem_cgroup_uncharge_page(page);
524 ret = 0;
525 goto out;
526 }
527
512 inc_mm_counter(vma->vm_mm, anon_rss); 528 inc_mm_counter(vma->vm_mm, anon_rss);
513 get_page(page); 529 get_page(page);
514 set_pte_at(vma->vm_mm, addr, pte, 530 set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +536,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
520 * immediately swapped out again after swapon. 536 * immediately swapped out again after swapon.
521 */ 537 */
522 activate_page(page); 538 activate_page(page);
539out:
540 pte_unmap_unlock(pte, ptl);
541 return ret;
523} 542}
524 543
525static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 544static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -528,23 +547,34 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
528{ 547{
529 pte_t swp_pte = swp_entry_to_pte(entry); 548 pte_t swp_pte = swp_entry_to_pte(entry);
530 pte_t *pte; 549 pte_t *pte;
531 spinlock_t *ptl; 550 int ret = 0;
532 int found = 0;
533 551
534 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 552 /*
553 * We don't actually need pte lock while scanning for swp_pte: since
554 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
555 * page table while we're scanning; though it could get zapped, and on
556 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
557 * of unmatched parts which look like swp_pte, so unuse_pte must
558 * recheck under pte lock. Scanning without pte lock lets it be
559 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
560 */
561 pte = pte_offset_map(pmd, addr);
535 do { 562 do {
536 /* 563 /*
537 * swapoff spends a _lot_ of time in this loop! 564 * swapoff spends a _lot_ of time in this loop!
538 * Test inline before going to call unuse_pte. 565 * Test inline before going to call unuse_pte.
539 */ 566 */
540 if (unlikely(pte_same(*pte, swp_pte))) { 567 if (unlikely(pte_same(*pte, swp_pte))) {
541 unuse_pte(vma, pte++, addr, entry, page); 568 pte_unmap(pte);
542 found = 1; 569 ret = unuse_pte(vma, pmd, addr, entry, page);
543 break; 570 if (ret)
571 goto out;
572 pte = pte_offset_map(pmd, addr);
544 } 573 }
545 } while (pte++, addr += PAGE_SIZE, addr != end); 574 } while (pte++, addr += PAGE_SIZE, addr != end);
546 pte_unmap_unlock(pte - 1, ptl); 575 pte_unmap(pte - 1);
547 return found; 576out:
577 return ret;
548} 578}
549 579
550static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, 580static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
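
The new comment in unuse_pte_range() describes a common pattern: scan optimistically without the page-table lock, then have unuse_pte() recheck the entry under the lock before acting, since the unlocked read may be stale or (on 32-bit PAE) torn. A hedged userspace model of the same pattern using a plain mutex:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Optimistic pre-check without the lock; the real update is only performed
 * after rechecking the value under the lock. */
static bool claim_slot(volatile int *slot, int expected)
{
	bool claimed = false;

	if (*slot != expected)		/* unlocked scan: cheap, may be stale */
		return false;

	pthread_mutex_lock(&table_lock);
	if (*slot == expected) {	/* recheck under the lock */
		*slot = 0;		/* perform the real update */
		claimed = true;
	}
	pthread_mutex_unlock(&table_lock);
	return claimed;
}
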
@@ -553,14 +583,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
553{ 583{
554 pmd_t *pmd; 584 pmd_t *pmd;
555 unsigned long next; 585 unsigned long next;
586 int ret;
556 587
557 pmd = pmd_offset(pud, addr); 588 pmd = pmd_offset(pud, addr);
558 do { 589 do {
559 next = pmd_addr_end(addr, end); 590 next = pmd_addr_end(addr, end);
560 if (pmd_none_or_clear_bad(pmd)) 591 if (pmd_none_or_clear_bad(pmd))
561 continue; 592 continue;
562 if (unuse_pte_range(vma, pmd, addr, next, entry, page)) 593 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
563 return 1; 594 if (ret)
595 return ret;
564 } while (pmd++, addr = next, addr != end); 596 } while (pmd++, addr = next, addr != end);
565 return 0; 597 return 0;
566} 598}
@@ -571,14 +603,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
571{ 603{
572 pud_t *pud; 604 pud_t *pud;
573 unsigned long next; 605 unsigned long next;
606 int ret;
574 607
575 pud = pud_offset(pgd, addr); 608 pud = pud_offset(pgd, addr);
576 do { 609 do {
577 next = pud_addr_end(addr, end); 610 next = pud_addr_end(addr, end);
578 if (pud_none_or_clear_bad(pud)) 611 if (pud_none_or_clear_bad(pud))
579 continue; 612 continue;
580 if (unuse_pmd_range(vma, pud, addr, next, entry, page)) 613 ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
581 return 1; 614 if (ret)
615 return ret;
582 } while (pud++, addr = next, addr != end); 616 } while (pud++, addr = next, addr != end);
583 return 0; 617 return 0;
584} 618}
@@ -588,6 +622,7 @@ static int unuse_vma(struct vm_area_struct *vma,
588{ 622{
589 pgd_t *pgd; 623 pgd_t *pgd;
590 unsigned long addr, end, next; 624 unsigned long addr, end, next;
625 int ret;
591 626
592 if (page->mapping) { 627 if (page->mapping) {
593 addr = page_address_in_vma(page, vma); 628 addr = page_address_in_vma(page, vma);
@@ -605,8 +640,9 @@ static int unuse_vma(struct vm_area_struct *vma,
605 next = pgd_addr_end(addr, end); 640 next = pgd_addr_end(addr, end);
606 if (pgd_none_or_clear_bad(pgd)) 641 if (pgd_none_or_clear_bad(pgd))
607 continue; 642 continue;
608 if (unuse_pud_range(vma, pgd, addr, next, entry, page)) 643 ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
609 return 1; 644 if (ret)
645 return ret;
610 } while (pgd++, addr = next, addr != end); 646 } while (pgd++, addr = next, addr != end);
611 return 0; 647 return 0;
612} 648}
@@ -615,6 +651,7 @@ static int unuse_mm(struct mm_struct *mm,
615 swp_entry_t entry, struct page *page) 651 swp_entry_t entry, struct page *page)
616{ 652{
617 struct vm_area_struct *vma; 653 struct vm_area_struct *vma;
654 int ret = 0;
618 655
619 if (!down_read_trylock(&mm->mmap_sem)) { 656 if (!down_read_trylock(&mm->mmap_sem)) {
620 /* 657 /*
@@ -627,15 +664,11 @@ static int unuse_mm(struct mm_struct *mm,
627 lock_page(page); 664 lock_page(page);
628 } 665 }
629 for (vma = mm->mmap; vma; vma = vma->vm_next) { 666 for (vma = mm->mmap; vma; vma = vma->vm_next) {
630 if (vma->anon_vma && unuse_vma(vma, entry, page)) 667 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
631 break; 668 break;
632 } 669 }
633 up_read(&mm->mmap_sem); 670 up_read(&mm->mmap_sem);
634 /* 671 return (ret < 0)? ret: 0;
635 * Currently unuse_mm cannot fail, but leave error handling
636 * at call sites for now, since we change it from time to time.
637 */
638 return 0;
639} 672}
640 673
641/* 674/*
@@ -730,7 +763,8 @@ static int try_to_unuse(unsigned int type)
730 */ 763 */
731 swap_map = &si->swap_map[i]; 764 swap_map = &si->swap_map[i];
732 entry = swp_entry(type, i); 765 entry = swp_entry(type, i);
733 page = read_swap_cache_async(entry, NULL, 0); 766 page = read_swap_cache_async(entry,
767 GFP_HIGHUSER_MOVABLE, NULL, 0);
734 if (!page) { 768 if (!page) {
735 /* 769 /*
736 * Either swap_duplicate() failed because entry 770 * Either swap_duplicate() failed because entry
@@ -789,7 +823,7 @@ static int try_to_unuse(unsigned int type)
789 atomic_inc(&new_start_mm->mm_users); 823 atomic_inc(&new_start_mm->mm_users);
790 atomic_inc(&prev_mm->mm_users); 824 atomic_inc(&prev_mm->mm_users);
791 spin_lock(&mmlist_lock); 825 spin_lock(&mmlist_lock);
792 while (*swap_map > 1 && !retval && 826 while (*swap_map > 1 && !retval && !shmem &&
793 (p = p->next) != &start_mm->mmlist) { 827 (p = p->next) != &start_mm->mmlist) {
794 mm = list_entry(p, struct mm_struct, mmlist); 828 mm = list_entry(p, struct mm_struct, mmlist);
795 if (!atomic_inc_not_zero(&mm->mm_users)) 829 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -821,6 +855,13 @@ static int try_to_unuse(unsigned int type)
821 mmput(start_mm); 855 mmput(start_mm);
822 start_mm = new_start_mm; 856 start_mm = new_start_mm;
823 } 857 }
858 if (shmem) {
859 /* page has already been unlocked and released */
860 if (shmem > 0)
861 continue;
862 retval = shmem;
863 break;
864 }
824 if (retval) { 865 if (retval) {
825 unlock_page(page); 866 unlock_page(page);
826 page_cache_release(page); 867 page_cache_release(page);
@@ -859,12 +900,6 @@ static int try_to_unuse(unsigned int type)
859 * read from disk into another page. Splitting into two 900 * read from disk into another page. Splitting into two
860 * pages would be incorrect if swap supported "shared 901 * pages would be incorrect if swap supported "shared
861 * private" pages, but they are handled by tmpfs files. 902 * private" pages, but they are handled by tmpfs files.
862 *
863 * Note shmem_unuse already deleted a swappage from
864 * the swap cache, unless the move to filepage failed:
865 * in which case it left swappage in cache, lowered its
866 * swap count to pass quickly through the loops above,
867 * and now we must reincrement count to try again later.
868 */ 903 */
869 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 904 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
870 struct writeback_control wbc = { 905 struct writeback_control wbc = {
@@ -875,12 +910,8 @@ static int try_to_unuse(unsigned int type)
875 lock_page(page); 910 lock_page(page);
876 wait_on_page_writeback(page); 911 wait_on_page_writeback(page);
877 } 912 }
878 if (PageSwapCache(page)) { 913 if (PageSwapCache(page))
879 if (shmem) 914 delete_from_swap_cache(page);
880 swap_duplicate(entry);
881 else
882 delete_from_swap_cache(page);
883 }
884 915
885 /* 916 /*
886 * So we could skip searching mms once swap count went 917 * So we could skip searching mms once swap count went
@@ -1768,31 +1799,48 @@ get_swap_info_struct(unsigned type)
1768 */ 1799 */
1769int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1800int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1770{ 1801{
1802 struct swap_info_struct *si;
1771 int our_page_cluster = page_cluster; 1803 int our_page_cluster = page_cluster;
1772 int ret = 0, i = 1 << our_page_cluster; 1804 pgoff_t target, toff;
1773 unsigned long toff; 1805 pgoff_t base, end;
1774 struct swap_info_struct *swapdev = swp_type(entry) + swap_info; 1806 int nr_pages = 0;
1775 1807
1776 if (!our_page_cluster) /* no readahead */ 1808 if (!our_page_cluster) /* no readahead */
1777 return 0; 1809 return 0;
1778 toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; 1810
1779 if (!toff) /* first page is swap header */ 1811 si = &swap_info[swp_type(entry)];
1780 toff++, i--; 1812 target = swp_offset(entry);
1781 *offset = toff; 1813 base = (target >> our_page_cluster) << our_page_cluster;
1814 end = base + (1 << our_page_cluster);
1815 if (!base) /* first page is swap header */
1816 base++;
1782 1817
1783 spin_lock(&swap_lock); 1818 spin_lock(&swap_lock);
1784 do { 1819 if (end > si->max) /* don't go beyond end of map */
1785 /* Don't read-ahead past the end of the swap area */ 1820 end = si->max;
1786 if (toff >= swapdev->max) 1821
1822 /* Count contiguous allocated slots above our target */
1823 for (toff = target; ++toff < end; nr_pages++) {
1824 /* Don't read in free or bad pages */
1825 if (!si->swap_map[toff])
1787 break; 1826 break;
1827 if (si->swap_map[toff] == SWAP_MAP_BAD)
1828 break;
1829 }
1830 /* Count contiguous allocated slots below our target */
1831 for (toff = target; --toff >= base; nr_pages++) {
1788 /* Don't read in free or bad pages */ 1832 /* Don't read in free or bad pages */
1789 if (!swapdev->swap_map[toff]) 1833 if (!si->swap_map[toff])
1790 break; 1834 break;
1791 if (swapdev->swap_map[toff] == SWAP_MAP_BAD) 1835 if (si->swap_map[toff] == SWAP_MAP_BAD)
1792 break; 1836 break;
1793 toff++; 1837 }
1794 ret++;
1795 } while (--i);
1796 spin_unlock(&swap_lock); 1838 spin_unlock(&swap_lock);
1797 return ret; 1839
1840 /*
1841 * Indicate starting offset, and return number of pages to get:
1842 * if only 1, say 0, since there's then no readahead to be done.
1843 */
1844 *offset = ++toff;
1845 return nr_pages? ++nr_pages: 0;
1798} 1846}
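
The rewritten valid_swaphandles() above no longer blindly reads the whole cluster; it counts how many contiguous allocated slots sit above and below the target inside the aligned window. A standalone sketch of that counting (base must be >= 1, i.e. past the swap header; TOY_SWAP_MAP_BAD stands in for SWAP_MAP_BAD):

#include <stddef.h>

#define TOY_SWAP_MAP_BAD 0x7f		/* stand-in for SWAP_MAP_BAD */

static int toy_count_readahead(const unsigned char *swap_map,
			       size_t target, size_t base, size_t end,
			       size_t *start)
{
	size_t toff;
	int nr_pages = 0;

	/* contiguous allocated slots above the target */
	for (toff = target; ++toff < end; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == TOY_SWAP_MAP_BAD)
			break;
	/* contiguous allocated slots below the target (base >= 1) */
	for (toff = target; toff-- > base; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == TOY_SWAP_MAP_BAD)
			break;

	*start = toff + 1;			/* first offset worth reading */
	return nr_pages ? nr_pages + 1 : 0;	/* +1 for the target slot itself */
}
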
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index d436a9c82db7..702083638c16 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -121,18 +121,6 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
121 return 0; 121 return 0;
122} 122}
123 123
124#if 0
125int shmem_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128#ifndef CONFIG_MMU
129 return ramfs_nommu_mmap(file, vma);
130#else
131 return 0;
132#endif
133}
134#endif /* 0 */
135
136#ifndef CONFIG_MMU 124#ifndef CONFIG_MMU
137unsigned long shmem_get_unmapped_area(struct file *file, 125unsigned long shmem_get_unmapped_area(struct file *file,
138 unsigned long addr, 126 unsigned long addr,
diff --git a/mm/truncate.c b/mm/truncate.c
index cadc15653dde..c35c49e54fb6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -21,7 +21,7 @@
21 21
22 22
23/** 23/**
24 * do_invalidatepage - invalidate part of all of a page 24 * do_invalidatepage - invalidate part or all of a page
25 * @page: the page which is affected 25 * @page: the page which is affected
26 * @offset: the index of the truncation point 26 * @offset: the index of the truncation point
27 * 27 *
@@ -48,7 +48,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
48 48
49static inline void truncate_partial_page(struct page *page, unsigned partial) 49static inline void truncate_partial_page(struct page *page, unsigned partial)
50{ 50{
51 zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); 51 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
52 if (PagePrivate(page)) 52 if (PagePrivate(page))
53 do_invalidatepage(page, partial); 53 do_invalidatepage(page, partial);
54} 54}
@@ -84,7 +84,7 @@ EXPORT_SYMBOL(cancel_dirty_page);
84 84
85/* 85/*
86 * If truncate cannot remove the fs-private metadata from the page, the page 86 * If truncate cannot remove the fs-private metadata from the page, the page
87 * becomes anonymous. It will be left on the LRU and may even be mapped into 87 * becomes orphaned. It will be left on the LRU and may even be mapped into
88 * user pagetables if we're racing with filemap_fault(). 88 * user pagetables if we're racing with filemap_fault().
89 * 89 *
90 * We need to bale out if page->mapping is no longer equal to the original 90 * We need to bale out if page->mapping is no longer equal to the original
@@ -98,11 +98,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
98 if (page->mapping != mapping) 98 if (page->mapping != mapping)
99 return; 99 return;
100 100
101 cancel_dirty_page(page, PAGE_CACHE_SIZE);
102
103 if (PagePrivate(page)) 101 if (PagePrivate(page))
104 do_invalidatepage(page, 0); 102 do_invalidatepage(page, 0);
105 103
104 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105
106 remove_from_page_cache(page); 106 remove_from_page_cache(page);
107 ClearPageUptodate(page); 107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 108 ClearPageMappedToDisk(page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af77e171e339..0536dde139d1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -166,6 +166,44 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
166} 166}
167EXPORT_SYMBOL_GPL(map_vm_area); 167EXPORT_SYMBOL_GPL(map_vm_area);
168 168
169/*
170 * Map a vmalloc()-space virtual address to the physical page.
171 */
172struct page *vmalloc_to_page(const void *vmalloc_addr)
173{
174 unsigned long addr = (unsigned long) vmalloc_addr;
175 struct page *page = NULL;
176 pgd_t *pgd = pgd_offset_k(addr);
177 pud_t *pud;
178 pmd_t *pmd;
179 pte_t *ptep, pte;
180
181 if (!pgd_none(*pgd)) {
182 pud = pud_offset(pgd, addr);
183 if (!pud_none(*pud)) {
184 pmd = pmd_offset(pud, addr);
185 if (!pmd_none(*pmd)) {
186 ptep = pte_offset_map(pmd, addr);
187 pte = *ptep;
188 if (pte_present(pte))
189 page = pte_page(pte);
190 pte_unmap(ptep);
191 }
192 }
193 }
194 return page;
195}
196EXPORT_SYMBOL(vmalloc_to_page);
197
198/*
199 * Map a vmalloc()-space virtual address to the physical page frame number.
200 */
201unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
202{
203 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
204}
205EXPORT_SYMBOL(vmalloc_to_pfn);
206
169static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, 207static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
170 unsigned long start, unsigned long end, 208 unsigned long start, unsigned long end,
171 int node, gfp_t gfp_mask) 209 int node, gfp_t gfp_mask)
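
The vmalloc_to_page()/vmalloc_to_pfn() helpers added above walk the kernel page tables (pgd -> pud -> pmd -> pte) to find the page backing a vmalloc address. A minimal kernel-style sketch of how a caller might use the two exports; the buffer and its size are purely illustrative, not taken from this patch:

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void dump_vmalloc_backing(void)
{
	/* Illustrative four-page buffer; any vmalloc()'ed region works. */
	void *buf = vmalloc(4 * PAGE_SIZE);
	unsigned long off;

	if (!buf)
		return;

	/* Each PAGE_SIZE chunk of a vmalloc area has its own backing page. */
	for (off = 0; off < 4 * PAGE_SIZE; off += PAGE_SIZE)
		printk(KERN_DEBUG "offset %lu -> pfn %lu\n",
		       off, vmalloc_to_pfn(buf + off));

	vfree(buf);
}
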
@@ -216,6 +254,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl
216 if (addr > end - size) 254 if (addr > end - size)
217 goto out; 255 goto out;
218 } 256 }
257 if ((size + addr) < addr)
258 goto out;
259 if (addr > end - size)
260 goto out;
219 261
220found: 262found:
221 area->next = *p; 263 area->next = *p;
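
The two checks added after the loop guard against the candidate range overflowing the address space: if size + addr wraps past the top of the unsigned range, the sum compares less than addr. A small standalone C illustration of the idiom (the numbers are arbitrary):

#include <stdio.h>

int main(void)
{
	unsigned long end  = ~0UL;		/* top of the address space */
	unsigned long addr = end - 4096 + 1;	/* the very last page */
	unsigned long size = 2 * 4096;		/* two pages: cannot fit */

	if ((size + addr) < addr)
		printf("size + addr wrapped: reject this placement\n");
	else if (addr > end - size)
		printf("range would run past the end: reject\n");
	else
		printf("range [%#lx, %#lx) fits\n", addr, addr + size);
	return 0;
}
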
@@ -268,7 +310,7 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
268} 310}
269 311
270/* Caller must hold vmlist_lock */ 312/* Caller must hold vmlist_lock */
271static struct vm_struct *__find_vm_area(void *addr) 313static struct vm_struct *__find_vm_area(const void *addr)
272{ 314{
273 struct vm_struct *tmp; 315 struct vm_struct *tmp;
274 316
@@ -281,7 +323,7 @@ static struct vm_struct *__find_vm_area(void *addr)
281} 323}
282 324
283/* Caller must hold vmlist_lock */ 325/* Caller must hold vmlist_lock */
284static struct vm_struct *__remove_vm_area(void *addr) 326static struct vm_struct *__remove_vm_area(const void *addr)
285{ 327{
286 struct vm_struct **p, *tmp; 328 struct vm_struct **p, *tmp;
287 329
@@ -310,7 +352,7 @@ found:
310 * This function returns the found VM area, but using it is NOT safe 352 * This function returns the found VM area, but using it is NOT safe
311 * on SMP machines, except for its size or flags. 353 * on SMP machines, except for its size or flags.
312 */ 354 */
313struct vm_struct *remove_vm_area(void *addr) 355struct vm_struct *remove_vm_area(const void *addr)
314{ 356{
315 struct vm_struct *v; 357 struct vm_struct *v;
316 write_lock(&vmlist_lock); 358 write_lock(&vmlist_lock);
@@ -319,7 +361,7 @@ struct vm_struct *remove_vm_area(void *addr)
319 return v; 361 return v;
320} 362}
321 363
322static void __vunmap(void *addr, int deallocate_pages) 364static void __vunmap(const void *addr, int deallocate_pages)
323{ 365{
324 struct vm_struct *area; 366 struct vm_struct *area;
325 367
@@ -346,8 +388,10 @@ static void __vunmap(void *addr, int deallocate_pages)
346 int i; 388 int i;
347 389
348 for (i = 0; i < area->nr_pages; i++) { 390 for (i = 0; i < area->nr_pages; i++) {
349 BUG_ON(!area->pages[i]); 391 struct page *page = area->pages[i];
350 __free_page(area->pages[i]); 392
393 BUG_ON(!page);
394 __free_page(page);
351 } 395 }
352 396
353 if (area->flags & VM_VPAGES) 397 if (area->flags & VM_VPAGES)
@@ -370,7 +414,7 @@ static void __vunmap(void *addr, int deallocate_pages)
370 * 414 *
371 * Must not be called in interrupt context. 415 * Must not be called in interrupt context.
372 */ 416 */
373void vfree(void *addr) 417void vfree(const void *addr)
374{ 418{
375 BUG_ON(in_interrupt()); 419 BUG_ON(in_interrupt());
376 __vunmap(addr, 1); 420 __vunmap(addr, 1);
@@ -386,7 +430,7 @@ EXPORT_SYMBOL(vfree);
386 * 430 *
387 * Must not be called in interrupt context. 431 * Must not be called in interrupt context.
388 */ 432 */
389void vunmap(void *addr) 433void vunmap(const void *addr)
390{ 434{
391 BUG_ON(in_interrupt()); 435 BUG_ON(in_interrupt());
392 __vunmap(addr, 0); 436 __vunmap(addr, 0);
@@ -423,8 +467,8 @@ void *vmap(struct page **pages, unsigned int count,
423} 467}
424EXPORT_SYMBOL(vmap); 468EXPORT_SYMBOL(vmap);
425 469
426void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 470static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
427 pgprot_t prot, int node) 471 pgprot_t prot, int node)
428{ 472{
429 struct page **pages; 473 struct page **pages;
430 unsigned int nr_pages, array_size, i; 474 unsigned int nr_pages, array_size, i;
@@ -451,15 +495,19 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
451 } 495 }
452 496
453 for (i = 0; i < area->nr_pages; i++) { 497 for (i = 0; i < area->nr_pages; i++) {
498 struct page *page;
499
454 if (node < 0) 500 if (node < 0)
455 area->pages[i] = alloc_page(gfp_mask); 501 page = alloc_page(gfp_mask);
456 else 502 else
457 area->pages[i] = alloc_pages_node(node, gfp_mask, 0); 503 page = alloc_pages_node(node, gfp_mask, 0);
458 if (unlikely(!area->pages[i])) { 504
505 if (unlikely(!page)) {
459 /* Successfully allocated i pages, free them in __vunmap() */ 506 /* Successfully allocated i pages, free them in __vunmap() */
460 area->nr_pages = i; 507 area->nr_pages = i;
461 goto fail; 508 goto fail;
462 } 509 }
510 area->pages[i] = page;
463 } 511 }
464 512
465 if (map_vm_area(area, prot, &pages)) 513 if (map_vm_area(area, prot, &pages))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5a9597e3bbc..a26dabd62fed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h>
40 41
41#include <asm/tlbflush.h> 42#include <asm/tlbflush.h>
42#include <asm/div64.h> 43#include <asm/div64.h>
@@ -68,6 +69,22 @@ struct scan_control {
68 int all_unreclaimable; 69 int all_unreclaimable;
69 70
70 int order; 71 int order;
72
73 /*
74 * Pages that have (or should have) IO pending. If we run into
75 * a lot of these, we're better off waiting a little for IO to
76 * finish rather than scanning more pages in the VM.
77 */
78 int nr_io_pages;
79
80 /* Which cgroup do we reclaim from */
81 struct mem_cgroup *mem_cgroup;
82
83 /* Pluggable isolate pages callback */
84 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
85 unsigned long *scanned, int order, int mode,
86 struct zone *z, struct mem_cgroup *mem_cont,
87 int active);
71}; 88};
72 89
73#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 90#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
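
struct scan_control now records which cgroup (if any) is being reclaimed and which page-isolation routine to use, so the same shrink_* code can serve both the global LRU and a per-cgroup LRU. A sketch, inside vmscan.c, of how a caller fills in the new fields; my_isolate_pages and example_scan_setup are hypothetical names used only for illustration (the real global path uses isolate_pages_global(), added further down):

/* Hypothetical callback with the signature scan_control expects. */
static unsigned long my_isolate_pages(unsigned long nr, struct list_head *dst,
				      unsigned long *scanned, int order,
				      int mode, struct zone *z,
				      struct mem_cgroup *mem_cont, int active)
{
	/* A real implementation moves up to 'nr' pages from an LRU onto
	 * 'dst' and reports how many were looked at through 'scanned'. */
	*scanned = 0;
	return 0;
}

static void example_scan_setup(void)
{
	struct scan_control sc = {
		.gfp_mask	  = GFP_KERNEL,
		.may_writepage	  = !laptop_mode,
		.may_swap	  = 1,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.swappiness	  = vm_swappiness,
		.order		  = 0,
		.mem_cgroup	  = NULL,		/* NULL: scan the global LRU */
		.isolate_pages	  = my_isolate_pages,	/* normally isolate_pages_global */
	};

	/* 'sc' would then be handed to the shrink_*() machinery below. */
	(void)sc;
}
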
@@ -109,6 +126,12 @@ long vm_total_pages; /* The total number of pages which the VM controls */
109static LIST_HEAD(shrinker_list); 126static LIST_HEAD(shrinker_list);
110static DECLARE_RWSEM(shrinker_rwsem); 127static DECLARE_RWSEM(shrinker_rwsem);
111 128
129#ifdef CONFIG_CGROUP_MEM_CONT
130#define scan_global_lru(sc) (!(sc)->mem_cgroup)
131#else
132#define scan_global_lru(sc) (1)
133#endif
134
112/* 135/*
113 * Add a shrinker callback to be called from the vm 136 * Add a shrinker callback to be called from the vm
114 */ 137 */
@@ -489,11 +512,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
489 */ 512 */
490 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 513 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
491 wait_on_page_writeback(page); 514 wait_on_page_writeback(page);
492 else 515 else {
516 sc->nr_io_pages++;
493 goto keep_locked; 517 goto keep_locked;
518 }
494 } 519 }
495 520
496 referenced = page_referenced(page, 1); 521 referenced = page_referenced(page, 1, sc->mem_cgroup);
497 /* In active use or really unfreeable? Activate it. */ 522 /* In active use or really unfreeable? Activate it. */
498 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && 523 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
499 referenced && page_mapping_inuse(page)) 524 referenced && page_mapping_inuse(page))
@@ -529,8 +554,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
529 if (PageDirty(page)) { 554 if (PageDirty(page)) {
530 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) 555 if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
531 goto keep_locked; 556 goto keep_locked;
532 if (!may_enter_fs) 557 if (!may_enter_fs) {
558 sc->nr_io_pages++;
533 goto keep_locked; 559 goto keep_locked;
560 }
534 if (!sc->may_writepage) 561 if (!sc->may_writepage)
535 goto keep_locked; 562 goto keep_locked;
536 563
@@ -541,8 +568,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
541 case PAGE_ACTIVATE: 568 case PAGE_ACTIVATE:
542 goto activate_locked; 569 goto activate_locked;
543 case PAGE_SUCCESS: 570 case PAGE_SUCCESS:
544 if (PageWriteback(page) || PageDirty(page)) 571 if (PageWriteback(page) || PageDirty(page)) {
572 sc->nr_io_pages++;
545 goto keep; 573 goto keep;
574 }
546 /* 575 /*
547 * A synchronous write - probably a ramdisk. Go 576 * A synchronous write - probably a ramdisk. Go
548 * ahead and try to reclaim the page. 577 * ahead and try to reclaim the page.
@@ -626,7 +655,7 @@ keep:
626 * 655 *
627 * returns 0 on success, -ve errno on failure. 656 * returns 0 on success, -ve errno on failure.
628 */ 657 */
629static int __isolate_lru_page(struct page *page, int mode) 658int __isolate_lru_page(struct page *page, int mode)
630{ 659{
631 int ret = -EINVAL; 660 int ret = -EINVAL;
632 661
@@ -760,6 +789,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
760 return nr_taken; 789 return nr_taken;
761} 790}
762 791
792static unsigned long isolate_pages_global(unsigned long nr,
793 struct list_head *dst,
794 unsigned long *scanned, int order,
795 int mode, struct zone *z,
796 struct mem_cgroup *mem_cont,
797 int active)
798{
799 if (active)
800 return isolate_lru_pages(nr, &z->active_list, dst,
801 scanned, order, mode);
802 else
803 return isolate_lru_pages(nr, &z->inactive_list, dst,
804 scanned, order, mode);
805}
806
763/* 807/*
764 * clear_active_flags() is a helper for shrink_active_list(), clearing 808 * clear_active_flags() is a helper for shrink_active_list(), clearing
765 * any active bits from the pages in the list. 809 * any active bits from the pages in the list.
@@ -801,18 +845,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
801 unsigned long nr_freed; 845 unsigned long nr_freed;
802 unsigned long nr_active; 846 unsigned long nr_active;
803 847
804 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 848 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
805 &zone->inactive_list,
806 &page_list, &nr_scan, sc->order, 849 &page_list, &nr_scan, sc->order,
807 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 850 (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
808 ISOLATE_BOTH : ISOLATE_INACTIVE); 851 ISOLATE_BOTH : ISOLATE_INACTIVE,
852 zone, sc->mem_cgroup, 0);
809 nr_active = clear_active_flags(&page_list); 853 nr_active = clear_active_flags(&page_list);
810 __count_vm_events(PGDEACTIVATE, nr_active); 854 __count_vm_events(PGDEACTIVATE, nr_active);
811 855
812 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 856 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
813 __mod_zone_page_state(zone, NR_INACTIVE, 857 __mod_zone_page_state(zone, NR_INACTIVE,
814 -(nr_taken - nr_active)); 858 -(nr_taken - nr_active));
815 zone->pages_scanned += nr_scan; 859 if (scan_global_lru(sc))
860 zone->pages_scanned += nr_scan;
816 spin_unlock_irq(&zone->lru_lock); 861 spin_unlock_irq(&zone->lru_lock);
817 862
818 nr_scanned += nr_scan; 863 nr_scanned += nr_scan;
@@ -844,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
844 if (current_is_kswapd()) { 889 if (current_is_kswapd()) {
845 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 890 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
846 __count_vm_events(KSWAPD_STEAL, nr_freed); 891 __count_vm_events(KSWAPD_STEAL, nr_freed);
847 } else 892 } else if (scan_global_lru(sc))
848 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 893 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
894
849 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 895 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
850 896
851 if (nr_taken == 0) 897 if (nr_taken == 0)
@@ -899,6 +945,113 @@ static inline int zone_is_near_oom(struct zone *zone)
899} 945}
900 946
901/* 947/*
 948 * Determine whether we should try to reclaim mapped pages.
949 * This is called only when sc->mem_cgroup is NULL.
950 */
951static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
952 int priority)
953{
954 long mapped_ratio;
955 long distress;
956 long swap_tendency;
957 long imbalance;
958 int reclaim_mapped = 0;
959 int prev_priority;
960
961 if (scan_global_lru(sc) && zone_is_near_oom(zone))
962 return 1;
963 /*
964 * `distress' is a measure of how much trouble we're having
965 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
966 */
967 if (scan_global_lru(sc))
968 prev_priority = zone->prev_priority;
969 else
970 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
971
972 distress = 100 >> min(prev_priority, priority);
973
974 /*
975 * The point of this algorithm is to decide when to start
976 * reclaiming mapped memory instead of just pagecache. Work out
977 * how much memory
978 * is mapped.
979 */
980 if (scan_global_lru(sc))
981 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
982 global_page_state(NR_ANON_PAGES)) * 100) /
983 vm_total_pages;
984 else
985 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
986
987 /*
988 * Now decide how much we really want to unmap some pages. The
989 * mapped ratio is downgraded - just because there's a lot of
990 * mapped memory doesn't necessarily mean that page reclaim
991 * isn't succeeding.
992 *
993 * The distress ratio is important - we don't want to start
994 * going oom.
995 *
996 * A 100% value of vm_swappiness overrides this algorithm
997 * altogether.
998 */
999 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
1000
1001 /*
1002 * If there's huge imbalance between active and inactive
1003 * (think active 100 times larger than inactive) we should
1004 * become more permissive, or the system will take too much
1005 * cpu before it start swapping during memory pressure.
1006 * Distress is about avoiding early-oom, this is about
1007 * making swappiness graceful despite setting it to low
1008 * values.
1009 *
1010 * Avoid div by zero with nr_inactive+1, and max resulting
1011 * value is vm_total_pages.
1012 */
1013 if (scan_global_lru(sc)) {
1014 imbalance = zone_page_state(zone, NR_ACTIVE);
1015 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1016 } else
1017 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1018
1019 /*
1020 * Reduce the effect of imbalance if swappiness is low,
1021 * this means for a swappiness very low, the imbalance
1022 * must be much higher than 100 for this logic to make
1023 * the difference.
1024 *
1025 * Max temporary value is vm_total_pages*100.
1026 */
1027 imbalance *= (vm_swappiness + 1);
1028 imbalance /= 100;
1029
1030 /*
1031 * If not much of the ram is mapped, makes the imbalance
1032 * less relevant, it's high priority we refill the inactive
1033 * list with mapped pages only in presence of high ratio of
1034 * mapped pages.
1035 *
1036 * Max temporary value is vm_total_pages*100.
1037 */
1038 imbalance *= mapped_ratio;
1039 imbalance /= 100;
1040
1041 /* apply imbalance feedback to swap_tendency */
1042 swap_tendency += imbalance;
1043
1044 /*
1045 * Now use this metric to decide whether to start moving mapped
1046 * memory onto the inactive list.
1047 */
1048 if (swap_tendency >= 100)
1049 reclaim_mapped = 1;
1050
1051 return reclaim_mapped;
1052}
1053
1054/*
902 * This moves pages from the active list to the inactive list. 1055 * This moves pages from the active list to the inactive list.
903 * 1056 *
904 * We move them the other way if the page is referenced by one or more 1057 * We move them the other way if the page is referenced by one or more
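
A worked example of the heuristic above, using made-up but plausible numbers: with 40% of RAM mapped (mapped_ratio = 40), the default vm_swappiness of 60, and reclaim at priority 10 with prev_priority 12, distress is 100 >> 10 = 0 and swap_tendency = 40/2 + 0 + 60 = 80, below the threshold of 100, so only unmapped pagecache is reclaimed. If the active list grows to 100 times the inactive list, imbalance starts at 100, is scaled to 100 * 61 / 100 = 61 and then to 61 * 40 / 100 = 24, lifting the total to 104 and enabling reclaim_mapped. Deepening pressure has the same effect: at priority 1, distress becomes 50 and the sum crosses 100 even with balanced lists.
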
@@ -915,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone)
915 * The downside is that we have to touch page->_count against each page. 1068 * The downside is that we have to touch page->_count against each page.
916 * But we had to alter page->flags anyway. 1069 * But we had to alter page->flags anyway.
917 */ 1070 */
1071
1072
918static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1073static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
919 struct scan_control *sc, int priority) 1074 struct scan_control *sc, int priority)
920{ 1075{
@@ -928,99 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
928 struct pagevec pvec; 1083 struct pagevec pvec;
929 int reclaim_mapped = 0; 1084 int reclaim_mapped = 0;
930 1085
931 if (sc->may_swap) { 1086 if (sc->may_swap)
932 long mapped_ratio; 1087 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
933 long distress;
934 long swap_tendency;
935 long imbalance;
936
937 if (zone_is_near_oom(zone))
938 goto force_reclaim_mapped;
939
940 /*
941 * `distress' is a measure of how much trouble we're having
942 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
943 */
944 distress = 100 >> min(zone->prev_priority, priority);
945
946 /*
947 * The point of this algorithm is to decide when to start
948 * reclaiming mapped memory instead of just pagecache. Work out
949 * how much memory
950 * is mapped.
951 */
952 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
953 global_page_state(NR_ANON_PAGES)) * 100) /
954 vm_total_pages;
955
956 /*
957 * Now decide how much we really want to unmap some pages. The
958 * mapped ratio is downgraded - just because there's a lot of
959 * mapped memory doesn't necessarily mean that page reclaim
960 * isn't succeeding.
961 *
962 * The distress ratio is important - we don't want to start
963 * going oom.
964 *
965 * A 100% value of vm_swappiness overrides this algorithm
966 * altogether.
967 */
968 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
969
970 /*
971 * If there's huge imbalance between active and inactive
972 * (think active 100 times larger than inactive) we should
973 * become more permissive, or the system will take too much
974 * cpu before it start swapping during memory pressure.
975 * Distress is about avoiding early-oom, this is about
976 * making swappiness graceful despite setting it to low
977 * values.
978 *
979 * Avoid div by zero with nr_inactive+1, and max resulting
980 * value is vm_total_pages.
981 */
982 imbalance = zone_page_state(zone, NR_ACTIVE);
983 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
984
985 /*
986 * Reduce the effect of imbalance if swappiness is low,
987 * this means for a swappiness very low, the imbalance
988 * must be much higher than 100 for this logic to make
989 * the difference.
990 *
991 * Max temporary value is vm_total_pages*100.
992 */
993 imbalance *= (vm_swappiness + 1);
994 imbalance /= 100;
995
996 /*
997 * If not much of the ram is mapped, makes the imbalance
998 * less relevant, it's high priority we refill the inactive
999 * list with mapped pages only in presence of high ratio of
1000 * mapped pages.
1001 *
1002 * Max temporary value is vm_total_pages*100.
1003 */
1004 imbalance *= mapped_ratio;
1005 imbalance /= 100;
1006
1007 /* apply imbalance feedback to swap_tendency */
1008 swap_tendency += imbalance;
1009
1010 /*
1011 * Now use this metric to decide whether to start moving mapped
1012 * memory onto the inactive list.
1013 */
1014 if (swap_tendency >= 100)
1015force_reclaim_mapped:
1016 reclaim_mapped = 1;
1017 }
1018 1088
1019 lru_add_drain(); 1089 lru_add_drain();
1020 spin_lock_irq(&zone->lru_lock); 1090 spin_lock_irq(&zone->lru_lock);
1021 pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, 1091 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1022 &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); 1092 ISOLATE_ACTIVE, zone,
1023 zone->pages_scanned += pgscanned; 1093 sc->mem_cgroup, 1);
1094 /*
 1095 * zone->pages_scanned is used to detect a zone's OOM state;
 1096 * mem_cgroup keeps track of nr_scan by itself.
1097 */
1098 if (scan_global_lru(sc))
1099 zone->pages_scanned += pgscanned;
1100
1024 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1101 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
1025 spin_unlock_irq(&zone->lru_lock); 1102 spin_unlock_irq(&zone->lru_lock);
1026 1103
@@ -1031,7 +1108,7 @@ force_reclaim_mapped:
1031 if (page_mapped(page)) { 1108 if (page_mapped(page)) {
1032 if (!reclaim_mapped || 1109 if (!reclaim_mapped ||
1033 (total_swap_pages == 0 && PageAnon(page)) || 1110 (total_swap_pages == 0 && PageAnon(page)) ||
1034 page_referenced(page, 0)) { 1111 page_referenced(page, 0, sc->mem_cgroup)) {
1035 list_add(&page->lru, &l_active); 1112 list_add(&page->lru, &l_active);
1036 continue; 1113 continue;
1037 } 1114 }
@@ -1051,6 +1128,7 @@ force_reclaim_mapped:
1051 ClearPageActive(page); 1128 ClearPageActive(page);
1052 1129
1053 list_move(&page->lru, &zone->inactive_list); 1130 list_move(&page->lru, &zone->inactive_list);
1131 mem_cgroup_move_lists(page_get_page_cgroup(page), false);
1054 pgmoved++; 1132 pgmoved++;
1055 if (!pagevec_add(&pvec, page)) { 1133 if (!pagevec_add(&pvec, page)) {
1056 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1134 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1157,7 @@ force_reclaim_mapped:
1079 SetPageLRU(page); 1157 SetPageLRU(page);
1080 VM_BUG_ON(!PageActive(page)); 1158 VM_BUG_ON(!PageActive(page));
1081 list_move(&page->lru, &zone->active_list); 1159 list_move(&page->lru, &zone->active_list);
1160 mem_cgroup_move_lists(page_get_page_cgroup(page), true);
1082 pgmoved++; 1161 pgmoved++;
1083 if (!pagevec_add(&pvec, page)) { 1162 if (!pagevec_add(&pvec, page)) {
1084 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); 1163 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1108,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1108 unsigned long nr_to_scan; 1187 unsigned long nr_to_scan;
1109 unsigned long nr_reclaimed = 0; 1188 unsigned long nr_reclaimed = 0;
1110 1189
1111 /* 1190 if (scan_global_lru(sc)) {
1112 * Add one to `nr_to_scan' just to make sure that the kernel will 1191 /*
1113 * slowly sift through the active list. 1192 * Add one to nr_to_scan just to make sure that the kernel
1114 */ 1193 * will slowly sift through the active list.
1115 zone->nr_scan_active += 1194 */
1116 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; 1195 zone->nr_scan_active +=
1117 nr_active = zone->nr_scan_active; 1196 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
1118 if (nr_active >= sc->swap_cluster_max) 1197 nr_active = zone->nr_scan_active;
1119 zone->nr_scan_active = 0; 1198 zone->nr_scan_inactive +=
1120 else 1199 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1121 nr_active = 0; 1200 nr_inactive = zone->nr_scan_inactive;
1201 if (nr_inactive >= sc->swap_cluster_max)
1202 zone->nr_scan_inactive = 0;
1203 else
1204 nr_inactive = 0;
1205
1206 if (nr_active >= sc->swap_cluster_max)
1207 zone->nr_scan_active = 0;
1208 else
1209 nr_active = 0;
1210 } else {
1211 /*
 1212 * This reclaim runs not because of a zone memory shortage but
 1213 * because the memory controller has hit its limit.
 1214 * In that case, don't modify the zone's reclaim-related data.
1215 */
1216 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
1217 zone, priority);
1218
1219 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
1220 zone, priority);
1221 }
1122 1222
1123 zone->nr_scan_inactive +=
1124 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1125 nr_inactive = zone->nr_scan_inactive;
1126 if (nr_inactive >= sc->swap_cluster_max)
1127 zone->nr_scan_inactive = 0;
1128 else
1129 nr_inactive = 0;
1130 1223
1131 while (nr_active || nr_inactive) { 1224 while (nr_active || nr_inactive) {
1132 if (nr_active) { 1225 if (nr_active) {
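
To make the global branch concrete with a hypothetical figure: a zone holding 2,000,000 active pages scanned at priority 12 adds (2000000 >> 12) + 1 = 489 to zone->nr_scan_active on this pass; since that already exceeds sc->swap_cluster_max (SWAP_CLUSTER_MAX, normally 32), the accumulator is reset and those 489 pages become this pass's active-list target, while a small or lightly pressured zone keeps accumulating across passes until it reaches the batch size. The memory-controller branch instead asks mem_cgroup_calc_reclaim_active()/mem_cgroup_calc_reclaim_inactive() for its targets, so the zone's own counters stay untouched.
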
@@ -1171,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1171 unsigned long nr_reclaimed = 0; 1264 unsigned long nr_reclaimed = 0;
1172 int i; 1265 int i;
1173 1266
1267
1174 sc->all_unreclaimable = 1; 1268 sc->all_unreclaimable = 1;
1175 for (i = 0; zones[i] != NULL; i++) { 1269 for (i = 0; zones[i] != NULL; i++) {
1176 struct zone *zone = zones[i]; 1270 struct zone *zone = zones[i];
1177 1271
1178 if (!populated_zone(zone)) 1272 if (!populated_zone(zone))
1179 continue; 1273 continue;
1274 /*
 1275 * Make sure that memory controller reclaiming has only a small
 1276 * influence on the global LRU.
1277 */
1278 if (scan_global_lru(sc)) {
1279 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1280 continue;
1281 note_zone_scanning_priority(zone, priority);
1180 1282
1181 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1283 if (zone_is_all_unreclaimable(zone) &&
1182 continue; 1284 priority != DEF_PRIORITY)
1183 1285 continue; /* Let kswapd poll it */
1184 note_zone_scanning_priority(zone, priority); 1286 sc->all_unreclaimable = 0;
1185 1287 } else {
1186 if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) 1288 /*
1187 continue; /* Let kswapd poll it */ 1289 * Ignore cpuset limitation here. We just want to reduce
1188 1290 * # of used pages by us regardless of memory shortage.
1189 sc->all_unreclaimable = 0; 1291 */
1292 sc->all_unreclaimable = 0;
1293 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1294 priority);
1295 }
1190 1296
1191 nr_reclaimed += shrink_zone(priority, zone, sc); 1297 nr_reclaimed += shrink_zone(priority, zone, sc);
1192 } 1298 }
1299
1193 return nr_reclaimed; 1300 return nr_reclaimed;
1194} 1301}
1195 1302
@@ -1206,7 +1313,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1206 * holds filesystem locks which prevent writeout this might not work, and the 1313 * holds filesystem locks which prevent writeout this might not work, and the
1207 * allocation attempt will fail. 1314 * allocation attempt will fail.
1208 */ 1315 */
1209unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) 1316static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1317 struct scan_control *sc)
1210{ 1318{
1211 int priority; 1319 int priority;
1212 int ret = 0; 1320 int ret = 0;
@@ -1215,39 +1323,43 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1215 struct reclaim_state *reclaim_state = current->reclaim_state; 1323 struct reclaim_state *reclaim_state = current->reclaim_state;
1216 unsigned long lru_pages = 0; 1324 unsigned long lru_pages = 0;
1217 int i; 1325 int i;
1218 struct scan_control sc = {
1219 .gfp_mask = gfp_mask,
1220 .may_writepage = !laptop_mode,
1221 .swap_cluster_max = SWAP_CLUSTER_MAX,
1222 .may_swap = 1,
1223 .swappiness = vm_swappiness,
1224 .order = order,
1225 };
1226
1227 count_vm_event(ALLOCSTALL);
1228 1326
1229 for (i = 0; zones[i] != NULL; i++) { 1327 if (scan_global_lru(sc))
1230 struct zone *zone = zones[i]; 1328 count_vm_event(ALLOCSTALL);
1329 /*
1330 * mem_cgroup will not do shrink_slab.
1331 */
1332 if (scan_global_lru(sc)) {
1333 for (i = 0; zones[i] != NULL; i++) {
1334 struct zone *zone = zones[i];
1231 1335
1232 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1336 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1233 continue; 1337 continue;
1234 1338
1235 lru_pages += zone_page_state(zone, NR_ACTIVE) 1339 lru_pages += zone_page_state(zone, NR_ACTIVE)
1236 + zone_page_state(zone, NR_INACTIVE); 1340 + zone_page_state(zone, NR_INACTIVE);
1341 }
1237 } 1342 }
1238 1343
1239 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1344 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1240 sc.nr_scanned = 0; 1345 sc->nr_scanned = 0;
1346 sc->nr_io_pages = 0;
1241 if (!priority) 1347 if (!priority)
1242 disable_swap_token(); 1348 disable_swap_token();
1243 nr_reclaimed += shrink_zones(priority, zones, &sc); 1349 nr_reclaimed += shrink_zones(priority, zones, sc);
1244 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 1350 /*
1245 if (reclaim_state) { 1351 * Don't shrink slabs when reclaiming memory from
1246 nr_reclaimed += reclaim_state->reclaimed_slab; 1352 * over limit cgroups
1247 reclaim_state->reclaimed_slab = 0; 1353 */
1354 if (scan_global_lru(sc)) {
1355 shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
1356 if (reclaim_state) {
1357 nr_reclaimed += reclaim_state->reclaimed_slab;
1358 reclaim_state->reclaimed_slab = 0;
1359 }
1248 } 1360 }
1249 total_scanned += sc.nr_scanned; 1361 total_scanned += sc->nr_scanned;
1250 if (nr_reclaimed >= sc.swap_cluster_max) { 1362 if (nr_reclaimed >= sc->swap_cluster_max) {
1251 ret = 1; 1363 ret = 1;
1252 goto out; 1364 goto out;
1253 } 1365 }
@@ -1259,18 +1371,19 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1259 * that's undesirable in laptop mode, where we *want* lumpy 1371 * that's undesirable in laptop mode, where we *want* lumpy
1260 * writeout. So in laptop mode, write out the whole world. 1372 * writeout. So in laptop mode, write out the whole world.
1261 */ 1373 */
1262 if (total_scanned > sc.swap_cluster_max + 1374 if (total_scanned > sc->swap_cluster_max +
1263 sc.swap_cluster_max / 2) { 1375 sc->swap_cluster_max / 2) {
1264 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1376 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1265 sc.may_writepage = 1; 1377 sc->may_writepage = 1;
1266 } 1378 }
1267 1379
1268 /* Take a nap, wait for some writeback to complete */ 1380 /* Take a nap, wait for some writeback to complete */
1269 if (sc.nr_scanned && priority < DEF_PRIORITY - 2) 1381 if (sc->nr_scanned && priority < DEF_PRIORITY - 2 &&
1382 sc->nr_io_pages > sc->swap_cluster_max)
1270 congestion_wait(WRITE, HZ/10); 1383 congestion_wait(WRITE, HZ/10);
1271 } 1384 }
1272 /* top priority shrink_caches still had more to do? don't OOM, then */ 1385 /* top priority shrink_caches still had more to do? don't OOM, then */
1273 if (!sc.all_unreclaimable) 1386 if (!sc->all_unreclaimable && scan_global_lru(sc))
1274 ret = 1; 1387 ret = 1;
1275out: 1388out:
1276 /* 1389 /*
@@ -1282,17 +1395,63 @@ out:
1282 */ 1395 */
1283 if (priority < 0) 1396 if (priority < 0)
1284 priority = 0; 1397 priority = 0;
1285 for (i = 0; zones[i] != NULL; i++) {
1286 struct zone *zone = zones[i];
1287 1398
1288 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1399 if (scan_global_lru(sc)) {
1289 continue; 1400 for (i = 0; zones[i] != NULL; i++) {
1401 struct zone *zone = zones[i];
1402
1403 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1404 continue;
1405
1406 zone->prev_priority = priority;
1407 }
1408 } else
1409 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1290 1410
1291 zone->prev_priority = priority;
1292 }
1293 return ret; 1411 return ret;
1294} 1412}
1295 1413
1414unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1415{
1416 struct scan_control sc = {
1417 .gfp_mask = gfp_mask,
1418 .may_writepage = !laptop_mode,
1419 .swap_cluster_max = SWAP_CLUSTER_MAX,
1420 .may_swap = 1,
1421 .swappiness = vm_swappiness,
1422 .order = order,
1423 .mem_cgroup = NULL,
1424 .isolate_pages = isolate_pages_global,
1425 };
1426
1427 return do_try_to_free_pages(zones, gfp_mask, &sc);
1428}
1429
1430#ifdef CONFIG_CGROUP_MEM_CONT
1431
1432unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1433 gfp_t gfp_mask)
1434{
1435 struct scan_control sc = {
1436 .gfp_mask = gfp_mask,
1437 .may_writepage = !laptop_mode,
1438 .may_swap = 1,
1439 .swap_cluster_max = SWAP_CLUSTER_MAX,
1440 .swappiness = vm_swappiness,
1441 .order = 0,
1442 .mem_cgroup = mem_cont,
1443 .isolate_pages = mem_cgroup_isolate_pages,
1444 };
1445 struct zone **zones;
1446 int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
1447
1448 zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones;
1449 if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
1450 return 1;
1451 return 0;
1452}
1453#endif
1454
1296/* 1455/*
1297 * For kswapd, balance_pgdat() will work across all this node's zones until 1456 * For kswapd, balance_pgdat() will work across all this node's zones until
1298 * they are all at pages_high. 1457 * they are all at pages_high.
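
try_to_free_mem_cgroup_pages() is intended to be called by the memory controller when charging a page would push a cgroup over its limit. A hedged sketch of that calling pattern, in the style of the memcontrol.c code in this series; the function name, the retry bound, and the surrounding error handling are simplified for illustration, not the exact controller code:

/* Simplified sketch: charge one page to 'mem', reclaiming from the
 * cgroup's own LRU while the resource counter is at its limit.
 */
static int example_charge(struct mem_cgroup *mem, gfp_t gfp_mask)
{
	int retries = 5;	/* arbitrary bound for the sketch */

	while (res_counter_charge(&mem->res, PAGE_SIZE)) {
		if (!(gfp_mask & __GFP_WAIT))
			return -ENOMEM;
		/* Reclaim only from this cgroup; the global LRU and the
		 * slab caches are left alone (see scan_global_lru()). */
		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			continue;
		if (!--retries)
			return -ENOMEM;
	}
	return 0;
}
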
@@ -1328,6 +1487,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1328 .swap_cluster_max = SWAP_CLUSTER_MAX, 1487 .swap_cluster_max = SWAP_CLUSTER_MAX,
1329 .swappiness = vm_swappiness, 1488 .swappiness = vm_swappiness,
1330 .order = order, 1489 .order = order,
1490 .mem_cgroup = NULL,
1491 .isolate_pages = isolate_pages_global,
1331 }; 1492 };
1332 /* 1493 /*
1333 * temp_priority is used to remember the scanning priority at which 1494 * temp_priority is used to remember the scanning priority at which
@@ -1352,6 +1513,7 @@ loop_again:
1352 if (!priority) 1513 if (!priority)
1353 disable_swap_token(); 1514 disable_swap_token();
1354 1515
1516 sc.nr_io_pages = 0;
1355 all_zones_ok = 1; 1517 all_zones_ok = 1;
1356 1518
1357 /* 1519 /*
@@ -1444,7 +1606,8 @@ loop_again:
1444 * OK, kswapd is getting into trouble. Take a nap, then take 1606 * OK, kswapd is getting into trouble. Take a nap, then take
1445 * another pass across the zones. 1607 * another pass across the zones.
1446 */ 1608 */
1447 if (total_scanned && priority < DEF_PRIORITY - 2) 1609 if (total_scanned && priority < DEF_PRIORITY - 2 &&
1610 sc.nr_io_pages > sc.swap_cluster_max)
1448 congestion_wait(WRITE, HZ/10); 1611 congestion_wait(WRITE, HZ/10);
1449 1612
1450 /* 1613 /*
@@ -1649,6 +1812,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1649 .swap_cluster_max = nr_pages, 1812 .swap_cluster_max = nr_pages,
1650 .may_writepage = 1, 1813 .may_writepage = 1,
1651 .swappiness = vm_swappiness, 1814 .swappiness = vm_swappiness,
1815 .isolate_pages = isolate_pages_global,
1652 }; 1816 };
1653 1817
1654 current->reclaim_state = &reclaim_state; 1818 current->reclaim_state = &reclaim_state;
@@ -1834,6 +1998,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1834 SWAP_CLUSTER_MAX), 1998 SWAP_CLUSTER_MAX),
1835 .gfp_mask = gfp_mask, 1999 .gfp_mask = gfp_mask,
1836 .swappiness = vm_swappiness, 2000 .swappiness = vm_swappiness,
2001 .isolate_pages = isolate_pages_global,
1837 }; 2002 };
1838 unsigned long slab_reclaimable; 2003 unsigned long slab_reclaimable;
1839 2004
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e8d846f57774..422d960ffcd8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -21,21 +21,14 @@ EXPORT_PER_CPU_SYMBOL(vm_event_states);
21 21
22static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) 22static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
23{ 23{
24 int cpu = 0; 24 int cpu;
25 int i; 25 int i;
26 26
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 28
29 cpu = first_cpu(*cpumask); 29 for_each_cpu_mask(cpu, *cpumask) {
30 while (cpu < NR_CPUS) {
31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
32 31
33 cpu = next_cpu(cpu, *cpumask);
34
35 if (cpu < NR_CPUS)
36 prefetch(&per_cpu(vm_event_states, cpu));
37
38
39 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
40 ret[i] += this->event[i]; 33 ret[i] += this->event[i];
41 } 34 }
@@ -284,6 +277,10 @@ EXPORT_SYMBOL(dec_zone_page_state);
284/* 277/*
285 * Update the zone counters for one cpu. 278 * Update the zone counters for one cpu.
286 * 279 *
280 * The cpu specified must be either the current cpu or a processor that
281 * is not online. If it is the current cpu then the execution thread must
282 * be pinned to the current cpu.
283 *
287 * Note that refresh_cpu_vm_stats strives to only access 284 * Note that refresh_cpu_vm_stats strives to only access
288 * node local memory. The per cpu pagesets on remote zones are placed 285 * node local memory. The per cpu pagesets on remote zones are placed
289 * in the memory local to the processor using that pageset. So the 286 * in the memory local to the processor using that pageset. So the
@@ -299,7 +296,7 @@ void refresh_cpu_vm_stats(int cpu)
299{ 296{
300 struct zone *zone; 297 struct zone *zone;
301 int i; 298 int i;
302 unsigned long flags; 299 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
303 300
304 for_each_zone(zone) { 301 for_each_zone(zone) {
305 struct per_cpu_pageset *p; 302 struct per_cpu_pageset *p;
@@ -311,15 +308,19 @@ void refresh_cpu_vm_stats(int cpu)
311 308
312 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 309 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
313 if (p->vm_stat_diff[i]) { 310 if (p->vm_stat_diff[i]) {
311 unsigned long flags;
312 int v;
313
314 local_irq_save(flags); 314 local_irq_save(flags);
315 zone_page_state_add(p->vm_stat_diff[i], 315 v = p->vm_stat_diff[i];
316 zone, i);
317 p->vm_stat_diff[i] = 0; 316 p->vm_stat_diff[i] = 0;
317 local_irq_restore(flags);
318 atomic_long_add(v, &zone->vm_stat[i]);
319 global_diff[i] += v;
318#ifdef CONFIG_NUMA 320#ifdef CONFIG_NUMA
319 /* 3 seconds idle till flush */ 321 /* 3 seconds idle till flush */
320 p->expire = 3; 322 p->expire = 3;
321#endif 323#endif
322 local_irq_restore(flags);
323 } 324 }
324#ifdef CONFIG_NUMA 325#ifdef CONFIG_NUMA
325 /* 326 /*
@@ -329,7 +330,7 @@ void refresh_cpu_vm_stats(int cpu)
329 * Check if there are pages remaining in this pageset 330 * Check if there are pages remaining in this pageset
330 * if not then there is nothing to expire. 331 * if not then there is nothing to expire.
331 */ 332 */
332 if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) 333 if (!p->expire || !p->pcp.count)
333 continue; 334 continue;
334 335
335 /* 336 /*
@@ -344,13 +345,14 @@ void refresh_cpu_vm_stats(int cpu)
344 if (p->expire) 345 if (p->expire)
345 continue; 346 continue;
346 347
347 if (p->pcp[0].count) 348 if (p->pcp.count)
348 drain_zone_pages(zone, p->pcp + 0); 349 drain_zone_pages(zone, &p->pcp);
349
350 if (p->pcp[1].count)
351 drain_zone_pages(zone, p->pcp + 1);
352#endif 350#endif
353 } 351 }
352
353 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
354 if (global_diff[i])
355 atomic_long_add(global_diff[i], &vm_stat[i]);
354} 356}
355 357
356#endif 358#endif
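
The reworked refresh_cpu_vm_stats() disables interrupts only long enough to read and clear the per-cpu delta, applies it to the zone counter immediately, and folds it into a local global_diff[] so that the global vm_stat[] array is touched just once per item at the end. A standalone C sketch of that folding pattern (all names here are illustrative, not kernel symbols):

#include <stdio.h>

#define NR_ITEMS   3
#define NR_SOURCES 4

static long source_counter[NR_SOURCES][NR_ITEMS];
static long source_diff[NR_SOURCES][NR_ITEMS];
static long global_counter[NR_ITEMS];

static void fold_diffs(void)
{
	long global_diff[NR_ITEMS] = { 0, };
	int s, i;

	for (s = 0; s < NR_SOURCES; s++) {
		for (i = 0; i < NR_ITEMS; i++) {
			long v = source_diff[s][i];

			if (!v)
				continue;
			/* In the kernel, this read-and-clear is the only
			 * part done with interrupts disabled. */
			source_diff[s][i] = 0;
			source_counter[s][i] += v;	/* per-source counter */
			global_diff[i] += v;		/* defer global part  */
		}
	}

	for (i = 0; i < NR_ITEMS; i++)
		if (global_diff[i])
			global_counter[i] += global_diff[i];
}

int main(void)
{
	source_diff[0][1] = 5;
	source_diff[3][1] = -2;
	fold_diffs();
	printf("item 1 global total: %ld\n", global_counter[1]);
	return 0;
}
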
@@ -681,20 +683,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
681 "\n pagesets"); 683 "\n pagesets");
682 for_each_online_cpu(i) { 684 for_each_online_cpu(i) {
683 struct per_cpu_pageset *pageset; 685 struct per_cpu_pageset *pageset;
684 int j;
685 686
686 pageset = zone_pcp(zone, i); 687 pageset = zone_pcp(zone, i);
687 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 688 seq_printf(m,
688 seq_printf(m, 689 "\n cpu: %i"
689 "\n cpu: %i pcp: %i" 690 "\n count: %i"
690 "\n count: %i" 691 "\n high: %i"
691 "\n high: %i" 692 "\n batch: %i",
692 "\n batch: %i", 693 i,
693 i, j, 694 pageset->pcp.count,
694 pageset->pcp[j].count, 695 pageset->pcp.high,
695 pageset->pcp[j].high, 696 pageset->pcp.batch);
696 pageset->pcp[j].batch);
697 }
698#ifdef CONFIG_SMP 697#ifdef CONFIG_SMP
699 seq_printf(m, "\n vm stats threshold: %d", 698 seq_printf(m, "\n vm stats threshold: %d",
700 pageset->stat_threshold); 699 pageset->stat_threshold);