author     Steve French <sfrench@us.ibm.com>   2008-02-06 11:04:00 -0500
committer  Steve French <sfrench@us.ibm.com>   2008-02-06 11:04:00 -0500
commit     f315ccb3e679f271583f2a4f463ad9b65665b751 (patch)
tree       44eb52102587d7b0bb592464cef6ec04bcac8b90 /mm
parent     ead03e30b050d6dda769e7e9b071c5fa720bf8d2 (diff)
parent     551e4fb2465b87de9d4aa1669b27d624435443bb (diff)
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile          |    2
-rw-r--r--  mm/dmapool.c         |  500
-rw-r--r--  mm/fadvise.c         |   16
-rw-r--r--  mm/filemap.c         |   11
-rw-r--r--  mm/filemap_xip.c     |    2
-rw-r--r--  mm/fremap.c          |    5
-rw-r--r--  mm/highmem.c         |    4
-rw-r--r--  mm/hugetlb.c         |    2
-rw-r--r--  mm/internal.h        |    4
-rw-r--r--  mm/memory.c          |  196
-rw-r--r--  mm/memory_hotplug.c  |    6
-rw-r--r--  mm/migrate.c         |   35
-rw-r--r--  mm/mmap.c            |    6
-rw-r--r--  mm/nommu.c           |   53
-rw-r--r--  mm/oom_kill.c        |    5
-rw-r--r--  mm/page-writeback.c  |   24
-rw-r--r--  mm/page_alloc.c      |  159
-rw-r--r--  mm/page_io.c         |    2
-rw-r--r--  mm/pagewalk.c        |  131
-rw-r--r--  mm/rmap.c            |    6
-rw-r--r--  mm/shmem.c           |  495
-rw-r--r--  mm/slob.c            |   51
-rw-r--r--  mm/sparse.c          |   12
-rw-r--r--  mm/swap.c            |   10
-rw-r--r--  mm/swap_state.c      |  153
-rw-r--r--  mm/swapfile.c        |  113
-rw-r--r--  mm/tiny-shmem.c      |   12
-rw-r--r--  mm/truncate.c        |    8
-rw-r--r--  mm/vmalloc.c         |   74
-rw-r--r--  mm/vmstat.c          |   61
30 files changed, 1409 insertions(+), 749 deletions(-)
diff --git a/mm/Makefile b/mm/Makefile
index 5c0b0ea7572d..4af5dff37277 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -13,8 +13,10 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 		   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 		   page_isolation.o $(mmu-y)
 
+obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
diff --git a/mm/dmapool.c b/mm/dmapool.c
new file mode 100644
index 000000000000..34aaac451a96
--- /dev/null
+++ b/mm/dmapool.c
@@ -0,0 +1,500 @@
1/*
2 * DMA Pool allocator
3 *
4 * Copyright 2001 David Brownell
5 * Copyright 2007 Intel Corporation
6 * Author: Matthew Wilcox <willy@linux.intel.com>
7 *
8 * This software may be redistributed and/or modified under the terms of
9 * the GNU General Public License ("GPL") version 2 as published by the
10 * Free Software Foundation.
11 *
12 * This allocator returns small blocks of a given size which are DMA-able by
13 * the given device. It uses the dma_alloc_coherent page allocator to get
14 * new pages, then splits them up into blocks of the required size.
15 * Many older drivers still have their own code to do this.
16 *
17 * The current design of this allocator is fairly simple. The pool is
18 * represented by the 'struct dma_pool' which keeps a doubly-linked list of
19 * allocated pages. Each page in the page_list is split into blocks of at
20 * least 'size' bytes. Free blocks are tracked in an unsorted singly-linked
21 * list of free blocks within the page. Used blocks aren't tracked, but we
22 * keep a count of how many are currently allocated from each page.
23 */
24
25#include <linux/device.h>
26#include <linux/dma-mapping.h>
27#include <linux/dmapool.h>
28#include <linux/kernel.h>
29#include <linux/list.h>
30#include <linux/module.h>
31#include <linux/mutex.h>
32#include <linux/poison.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/spinlock.h>
36#include <linux/string.h>
37#include <linux/types.h>
38#include <linux/wait.h>
39
40struct dma_pool { /* the pool */
41 struct list_head page_list;
42 spinlock_t lock;
43 size_t size;
44 struct device *dev;
45 size_t allocation;
46 size_t boundary;
47 char name[32];
48 wait_queue_head_t waitq;
49 struct list_head pools;
50};
51
52struct dma_page { /* cacheable header for 'allocation' bytes */
53 struct list_head page_list;
54 void *vaddr;
55 dma_addr_t dma;
56 unsigned int in_use;
57 unsigned int offset;
58};
59
60#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
61
62static DEFINE_MUTEX(pools_lock);
63
64static ssize_t
65show_pools(struct device *dev, struct device_attribute *attr, char *buf)
66{
67 unsigned temp;
68 unsigned size;
69 char *next;
70 struct dma_page *page;
71 struct dma_pool *pool;
72
73 next = buf;
74 size = PAGE_SIZE;
75
76 temp = scnprintf(next, size, "poolinfo - 0.1\n");
77 size -= temp;
78 next += temp;
79
80 mutex_lock(&pools_lock);
81 list_for_each_entry(pool, &dev->dma_pools, pools) {
82 unsigned pages = 0;
83 unsigned blocks = 0;
84
85 list_for_each_entry(page, &pool->page_list, page_list) {
86 pages++;
87 blocks += page->in_use;
88 }
89
90 /* per-pool info, no real statistics yet */
91 temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n",
92 pool->name, blocks,
93 pages * (pool->allocation / pool->size),
94 pool->size, pages);
95 size -= temp;
96 next += temp;
97 }
98 mutex_unlock(&pools_lock);
99
100 return PAGE_SIZE - size;
101}
102
103static DEVICE_ATTR(pools, S_IRUGO, show_pools, NULL);
104
105/**
106 * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
107 * @name: name of pool, for diagnostics
108 * @dev: device that will be doing the DMA
109 * @size: size of the blocks in this pool.
110 * @align: alignment requirement for blocks; must be a power of two
111 * @boundary: returned blocks won't cross this power of two boundary
112 * Context: !in_interrupt()
113 *
114 * Returns a dma allocation pool with the requested characteristics, or
115 * null if one can't be created. Given one of these pools, dma_pool_alloc()
116 * may be used to allocate memory. Such memory will all have "consistent"
117 * DMA mappings, accessible by the device and its driver without using
118 * cache flushing primitives. The actual size of blocks allocated may be
119 * larger than requested because of alignment.
120 *
121 * If @boundary is nonzero, objects returned from dma_pool_alloc() won't
122 * cross that size boundary. This is useful for devices which have
123 * addressing restrictions on individual DMA transfers, such as not crossing
124 * boundaries of 4KBytes.
125 */
126struct dma_pool *dma_pool_create(const char *name, struct device *dev,
127 size_t size, size_t align, size_t boundary)
128{
129 struct dma_pool *retval;
130 size_t allocation;
131
132 if (align == 0) {
133 align = 1;
134 } else if (align & (align - 1)) {
135 return NULL;
136 }
137
138 if (size == 0) {
139 return NULL;
140 } else if (size < 4) {
141 size = 4;
142 }
143
144 if ((size % align) != 0)
145 size = ALIGN(size, align);
146
147 allocation = max_t(size_t, size, PAGE_SIZE);
148
149 if (!boundary) {
150 boundary = allocation;
151 } else if ((boundary < size) || (boundary & (boundary - 1))) {
152 return NULL;
153 }
154
155 retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
156 if (!retval)
157 return retval;
158
159 strlcpy(retval->name, name, sizeof(retval->name));
160
161 retval->dev = dev;
162
163 INIT_LIST_HEAD(&retval->page_list);
164 spin_lock_init(&retval->lock);
165 retval->size = size;
166 retval->boundary = boundary;
167 retval->allocation = allocation;
168 init_waitqueue_head(&retval->waitq);
169
170 if (dev) {
171 int ret;
172
173 mutex_lock(&pools_lock);
174 if (list_empty(&dev->dma_pools))
175 ret = device_create_file(dev, &dev_attr_pools);
176 else
177 ret = 0;
178 /* note: not currently insisting "name" be unique */
179 if (!ret)
180 list_add(&retval->pools, &dev->dma_pools);
181 else {
182 kfree(retval);
183 retval = NULL;
184 }
185 mutex_unlock(&pools_lock);
186 } else
187 INIT_LIST_HEAD(&retval->pools);
188
189 return retval;
190}
191EXPORT_SYMBOL(dma_pool_create);
192
193static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
194{
195 unsigned int offset = 0;
196 unsigned int next_boundary = pool->boundary;
197
198 do {
199 unsigned int next = offset + pool->size;
200 if (unlikely((next + pool->size) >= next_boundary)) {
201 next = next_boundary;
202 next_boundary += pool->boundary;
203 }
204 *(int *)(page->vaddr + offset) = next;
205 offset = next;
206 } while (offset < pool->allocation);
207}
208
209static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
210{
211 struct dma_page *page;
212
213 page = kmalloc(sizeof(*page), mem_flags);
214 if (!page)
215 return NULL;
216 page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
217 &page->dma, mem_flags);
218 if (page->vaddr) {
219#ifdef CONFIG_DEBUG_SLAB
220 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
221#endif
222 pool_initialise_page(pool, page);
223 list_add(&page->page_list, &pool->page_list);
224 page->in_use = 0;
225 page->offset = 0;
226 } else {
227 kfree(page);
228 page = NULL;
229 }
230 return page;
231}
232
233static inline int is_page_busy(struct dma_page *page)
234{
235 return page->in_use != 0;
236}
237
238static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
239{
240 dma_addr_t dma = page->dma;
241
242#ifdef CONFIG_DEBUG_SLAB
243 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
244#endif
245 dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
246 list_del(&page->page_list);
247 kfree(page);
248}
249
250/**
251 * dma_pool_destroy - destroys a pool of dma memory blocks.
252 * @pool: dma pool that will be destroyed
253 * Context: !in_interrupt()
254 *
255 * Caller guarantees that no more memory from the pool is in use,
256 * and that nothing will try to use the pool after this call.
257 */
258void dma_pool_destroy(struct dma_pool *pool)
259{
260 mutex_lock(&pools_lock);
261 list_del(&pool->pools);
262 if (pool->dev && list_empty(&pool->dev->dma_pools))
263 device_remove_file(pool->dev, &dev_attr_pools);
264 mutex_unlock(&pools_lock);
265
266 while (!list_empty(&pool->page_list)) {
267 struct dma_page *page;
268 page = list_entry(pool->page_list.next,
269 struct dma_page, page_list);
270 if (is_page_busy(page)) {
271 if (pool->dev)
272 dev_err(pool->dev,
273 "dma_pool_destroy %s, %p busy\n",
274 pool->name, page->vaddr);
275 else
276 printk(KERN_ERR
277 "dma_pool_destroy %s, %p busy\n",
278 pool->name, page->vaddr);
279 /* leak the still-in-use consistent memory */
280 list_del(&page->page_list);
281 kfree(page);
282 } else
283 pool_free_page(pool, page);
284 }
285
286 kfree(pool);
287}
288EXPORT_SYMBOL(dma_pool_destroy);
289
290/**
291 * dma_pool_alloc - get a block of consistent memory
292 * @pool: dma pool that will produce the block
293 * @mem_flags: GFP_* bitmask
294 * @handle: pointer to dma address of block
295 *
296 * This returns the kernel virtual address of a currently unused block,
297 * and reports its dma address through the handle.
298 * If such a memory block can't be allocated, %NULL is returned.
299 */
300void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
301 dma_addr_t *handle)
302{
303 unsigned long flags;
304 struct dma_page *page;
305 size_t offset;
306 void *retval;
307
308 spin_lock_irqsave(&pool->lock, flags);
309 restart:
310 list_for_each_entry(page, &pool->page_list, page_list) {
311 if (page->offset < pool->allocation)
312 goto ready;
313 }
314 page = pool_alloc_page(pool, GFP_ATOMIC);
315 if (!page) {
316 if (mem_flags & __GFP_WAIT) {
317 DECLARE_WAITQUEUE(wait, current);
318
319 __set_current_state(TASK_INTERRUPTIBLE);
320 __add_wait_queue(&pool->waitq, &wait);
321 spin_unlock_irqrestore(&pool->lock, flags);
322
323 schedule_timeout(POOL_TIMEOUT_JIFFIES);
324
325 spin_lock_irqsave(&pool->lock, flags);
326 __remove_wait_queue(&pool->waitq, &wait);
327 goto restart;
328 }
329 retval = NULL;
330 goto done;
331 }
332
333 ready:
334 page->in_use++;
335 offset = page->offset;
336 page->offset = *(int *)(page->vaddr + offset);
337 retval = offset + page->vaddr;
338 *handle = offset + page->dma;
339#ifdef CONFIG_DEBUG_SLAB
340 memset(retval, POOL_POISON_ALLOCATED, pool->size);
341#endif
342 done:
343 spin_unlock_irqrestore(&pool->lock, flags);
344 return retval;
345}
346EXPORT_SYMBOL(dma_pool_alloc);
347
348static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
349{
350 unsigned long flags;
351 struct dma_page *page;
352
353 spin_lock_irqsave(&pool->lock, flags);
354 list_for_each_entry(page, &pool->page_list, page_list) {
355 if (dma < page->dma)
356 continue;
357 if (dma < (page->dma + pool->allocation))
358 goto done;
359 }
360 page = NULL;
361 done:
362 spin_unlock_irqrestore(&pool->lock, flags);
363 return page;
364}
365
366/**
367 * dma_pool_free - put block back into dma pool
368 * @pool: the dma pool holding the block
369 * @vaddr: virtual address of block
370 * @dma: dma address of block
371 *
372 * Caller promises neither device nor driver will again touch this block
373 * unless it is first re-allocated.
374 */
375void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
376{
377 struct dma_page *page;
378 unsigned long flags;
379 unsigned int offset;
380
381 page = pool_find_page(pool, dma);
382 if (!page) {
383 if (pool->dev)
384 dev_err(pool->dev,
385 "dma_pool_free %s, %p/%lx (bad dma)\n",
386 pool->name, vaddr, (unsigned long)dma);
387 else
388 printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n",
389 pool->name, vaddr, (unsigned long)dma);
390 return;
391 }
392
393 offset = vaddr - page->vaddr;
394#ifdef CONFIG_DEBUG_SLAB
395 if ((dma - page->dma) != offset) {
396 if (pool->dev)
397 dev_err(pool->dev,
398 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
399 pool->name, vaddr, (unsigned long long)dma);
400 else
401 printk(KERN_ERR
402 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
403 pool->name, vaddr, (unsigned long long)dma);
404 return;
405 }
406 {
407 unsigned int chain = page->offset;
408 while (chain < pool->allocation) {
409 if (chain != offset) {
410 chain = *(int *)(page->vaddr + chain);
411 continue;
412 }
413 if (pool->dev)
414 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
415 "already free\n", pool->name,
416 (unsigned long long)dma);
417 else
418 printk(KERN_ERR "dma_pool_free %s, dma %Lx "
419 "already free\n", pool->name,
420 (unsigned long long)dma);
421 return;
422 }
423 }
424 memset(vaddr, POOL_POISON_FREED, pool->size);
425#endif
426
427 spin_lock_irqsave(&pool->lock, flags);
428 page->in_use--;
429 *(int *)vaddr = page->offset;
430 page->offset = offset;
431 if (waitqueue_active(&pool->waitq))
432 wake_up_locked(&pool->waitq);
433 /*
434 * Resist a temptation to do
435 * if (!is_page_busy(page)) pool_free_page(pool, page);
436 * Better have a few empty pages hang around.
437 */
438 spin_unlock_irqrestore(&pool->lock, flags);
439}
440EXPORT_SYMBOL(dma_pool_free);
441
442/*
443 * Managed DMA pool
444 */
445static void dmam_pool_release(struct device *dev, void *res)
446{
447 struct dma_pool *pool = *(struct dma_pool **)res;
448
449 dma_pool_destroy(pool);
450}
451
452static int dmam_pool_match(struct device *dev, void *res, void *match_data)
453{
454 return *(struct dma_pool **)res == match_data;
455}
456
457/**
458 * dmam_pool_create - Managed dma_pool_create()
459 * @name: name of pool, for diagnostics
460 * @dev: device that will be doing the DMA
461 * @size: size of the blocks in this pool.
462 * @align: alignment requirement for blocks; must be a power of two
463 * @allocation: returned blocks won't cross this boundary (or zero)
464 *
465 * Managed dma_pool_create(). DMA pool created with this function is
466 * automatically destroyed on driver detach.
467 */
468struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
469 size_t size, size_t align, size_t allocation)
470{
471 struct dma_pool **ptr, *pool;
472
473 ptr = devres_alloc(dmam_pool_release, sizeof(*ptr), GFP_KERNEL);
474 if (!ptr)
475 return NULL;
476
477 pool = *ptr = dma_pool_create(name, dev, size, align, allocation);
478 if (pool)
479 devres_add(dev, ptr);
480 else
481 devres_free(ptr);
482
483 return pool;
484}
485EXPORT_SYMBOL(dmam_pool_create);
486
487/**
488 * dmam_pool_destroy - Managed dma_pool_destroy()
489 * @pool: dma pool that will be destroyed
490 *
491 * Managed dma_pool_destroy().
492 */
493void dmam_pool_destroy(struct dma_pool *pool)
494{
495 struct device *dev = pool->dev;
496
497 dma_pool_destroy(pool);
498 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
499}
500EXPORT_SYMBOL(dmam_pool_destroy);
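
[Editor's note: the following is not part of the commit. It is a minimal, hypothetical sketch of the interface the new mm/dmapool.c exports above (dma_pool_create / dma_pool_alloc / dma_pool_free / dma_pool_destroy); the device pointer, pool name and sizes are illustrative placeholders.]

/* Sketch only: "mydev", the pool name and the sizes are illustrative. */
#include <linux/dmapool.h>

static int example_dma_pool_use(struct device *mydev)
{
	struct dma_pool *pool;
	dma_addr_t dma;
	void *block;

	/* 64-byte blocks, 16-byte aligned, never crossing a 4KB boundary */
	pool = dma_pool_create("example", mydev, 64, 16, 4096);
	if (!pool)
		return -ENOMEM;

	/* get one block: CPU uses 'block', the device is given 'dma' */
	block = dma_pool_alloc(pool, GFP_KERNEL, &dma);
	if (!block) {
		dma_pool_destroy(pool);
		return -ENOMEM;
	}

	/* ... program the hardware with 'dma', touch 'block' from the CPU ... */

	dma_pool_free(pool, block, dma);
	dma_pool_destroy(pool);
	return 0;
}
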
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 0df4c899e979..3c0f1e99f5e4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,9 +49,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 		goto out;
 	}
 
-	if (mapping->a_ops->get_xip_page)
-		/* no bad return value, but ignore advice */
+	if (mapping->a_ops->get_xip_page) {
+		switch (advice) {
+		case POSIX_FADV_NORMAL:
+		case POSIX_FADV_RANDOM:
+		case POSIX_FADV_SEQUENTIAL:
+		case POSIX_FADV_WILLNEED:
+		case POSIX_FADV_NOREUSE:
+		case POSIX_FADV_DONTNEED:
+			/* no bad return value, but ignore advice */
+			break;
+		default:
+			ret = -EINVAL;
+		}
 		goto out;
+	}
 
 	/* Careful about overflows. Len == 0 means "as much as possible" */
 	endbyte = offset + len;
diff --git a/mm/filemap.c b/mm/filemap.c
index 76bea88cbebc..81fb9bff0d4f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -65,7 +65,6 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
65 * ->private_lock (__free_pte->__set_page_dirty_buffers) 65 * ->private_lock (__free_pte->__set_page_dirty_buffers)
66 * ->swap_lock (exclusive_swap_page, others) 66 * ->swap_lock (exclusive_swap_page, others)
67 * ->mapping->tree_lock 67 * ->mapping->tree_lock
68 * ->zone.lock
69 * 68 *
70 * ->i_mutex 69 * ->i_mutex
71 * ->i_mmap_lock (truncate->unmap_mapping_range) 70 * ->i_mmap_lock (truncate->unmap_mapping_range)
@@ -528,7 +527,7 @@ static inline void wake_up_page(struct page *page, int bit)
528 __wake_up_bit(page_waitqueue(page), &page->flags, bit); 527 __wake_up_bit(page_waitqueue(page), &page->flags, bit);
529} 528}
530 529
531void fastcall wait_on_page_bit(struct page *page, int bit_nr) 530void wait_on_page_bit(struct page *page, int bit_nr)
532{ 531{
533 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 532 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
534 533
@@ -552,7 +551,7 @@ EXPORT_SYMBOL(wait_on_page_bit);
552 * the clear_bit and the read of the waitqueue (to avoid SMP races with a 551 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
553 * parallel wait_on_page_locked()). 552 * parallel wait_on_page_locked()).
554 */ 553 */
555void fastcall unlock_page(struct page *page) 554void unlock_page(struct page *page)
556{ 555{
557 smp_mb__before_clear_bit(); 556 smp_mb__before_clear_bit();
558 if (!TestClearPageLocked(page)) 557 if (!TestClearPageLocked(page))
@@ -586,7 +585,7 @@ EXPORT_SYMBOL(end_page_writeback);
586 * chances are that on the second loop, the block layer's plug list is empty, 585 * chances are that on the second loop, the block layer's plug list is empty,
587 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. 586 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
588 */ 587 */
589void fastcall __lock_page(struct page *page) 588void __lock_page(struct page *page)
590{ 589{
591 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 590 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
592 591
@@ -607,7 +606,7 @@ int fastcall __lock_page_killable(struct page *page)
607 * Variant of lock_page that does not require the caller to hold a reference 606 * Variant of lock_page that does not require the caller to hold a reference
608 * on the page's mapping. 607 * on the page's mapping.
609 */ 608 */
610void fastcall __lock_page_nosync(struct page *page) 609void __lock_page_nosync(struct page *page)
611{ 610{
612 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 611 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
613 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, 612 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
@@ -1277,7 +1276,7 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1277 * This adds the requested page to the page cache if it isn't already there, 1276 * This adds the requested page to the page cache if it isn't already there,
1278 * and schedules an I/O to read in its contents from disk. 1277 * and schedules an I/O to read in its contents from disk.
1279 */ 1278 */
1280static int fastcall page_cache_read(struct file * file, pgoff_t offset) 1279static int page_cache_read(struct file *file, pgoff_t offset)
1281{ 1280{
1282 struct address_space *mapping = file->f_mapping; 1281 struct address_space *mapping = file->f_mapping;
1283 struct page *page; 1282 struct page *page;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index f874ae818ad3..0420a0292b03 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -431,7 +431,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
431 else 431 else
432 return PTR_ERR(page); 432 return PTR_ERR(page);
433 } 433 }
434 zero_user_page(page, offset, length, KM_USER0); 434 zero_user(page, offset, length);
435 return 0; 435 return 0;
436} 436}
437EXPORT_SYMBOL_GPL(xip_truncate_page); 437EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
index 14bd3bf7826e..69a37c2bdf81 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -190,10 +190,13 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
190 */ 190 */
191 if (mapping_cap_account_dirty(mapping)) { 191 if (mapping_cap_account_dirty(mapping)) {
192 unsigned long addr; 192 unsigned long addr;
193 struct file *file = vma->vm_file;
193 194
194 flags &= MAP_NONBLOCK; 195 flags &= MAP_NONBLOCK;
195 addr = mmap_region(vma->vm_file, start, size, 196 get_file(file);
197 addr = mmap_region(file, start, size,
196 flags, vma->vm_flags, pgoff, 1); 198 flags, vma->vm_flags, pgoff, 1);
199 fput(file);
197 if (IS_ERR_VALUE(addr)) { 200 if (IS_ERR_VALUE(addr)) {
198 err = addr; 201 err = addr;
199 } else { 202 } else {
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a967bc35152..35d47733cde4 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -163,7 +163,7 @@ start:
163 return vaddr; 163 return vaddr;
164} 164}
165 165
166void fastcall *kmap_high(struct page *page) 166void *kmap_high(struct page *page)
167{ 167{
168 unsigned long vaddr; 168 unsigned long vaddr;
169 169
@@ -185,7 +185,7 @@ void fastcall *kmap_high(struct page *page)
185 185
186EXPORT_SYMBOL(kmap_high); 186EXPORT_SYMBOL(kmap_high);
187 187
188void fastcall kunmap_high(struct page *page) 188void kunmap_high(struct page *page)
189{ 189{
190 unsigned long vaddr; 190 unsigned long vaddr;
191 unsigned long nr; 191 unsigned long nr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index db861d8b6c28..1a5642074e34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -813,6 +813,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
813 813
814 spin_unlock(&mm->page_table_lock); 814 spin_unlock(&mm->page_table_lock);
815 copy_huge_page(new_page, old_page, address, vma); 815 copy_huge_page(new_page, old_page, address, vma);
816 __SetPageUptodate(new_page);
816 spin_lock(&mm->page_table_lock); 817 spin_lock(&mm->page_table_lock);
817 818
818 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 819 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -858,6 +859,7 @@ retry:
858 goto out; 859 goto out;
859 } 860 }
860 clear_huge_page(page, address); 861 clear_huge_page(page, address);
862 __SetPageUptodate(page);
861 863
862 if (vma->vm_flags & VM_SHARED) { 864 if (vma->vm_flags & VM_SHARED) {
863 int err; 865 int err;
diff --git a/mm/internal.h b/mm/internal.h
index 953f941ea867..5a9a6200e034 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -24,7 +24,7 @@ static inline void set_page_count(struct page *page, int v)
24 */ 24 */
25static inline void set_page_refcounted(struct page *page) 25static inline void set_page_refcounted(struct page *page)
26{ 26{
27 VM_BUG_ON(PageCompound(page) && PageTail(page)); 27 VM_BUG_ON(PageTail(page));
28 VM_BUG_ON(atomic_read(&page->_count)); 28 VM_BUG_ON(atomic_read(&page->_count));
29 set_page_count(page, 1); 29 set_page_count(page, 1);
30} 30}
@@ -34,7 +34,7 @@ static inline void __put_page(struct page *page)
34 atomic_dec(&page->_count); 34 atomic_dec(&page->_count);
35} 35}
36 36
37extern void fastcall __init __free_pages_bootmem(struct page *page, 37extern void __init __free_pages_bootmem(struct page *page,
38 unsigned int order); 38 unsigned int order);
39 39
40/* 40/*
diff --git a/mm/memory.c b/mm/memory.c
index d902d0e25edc..7bb70728bb52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,7 +305,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
305 spin_lock(&mm->page_table_lock); 305 spin_lock(&mm->page_table_lock);
306 if (pmd_present(*pmd)) { /* Another has populated it */ 306 if (pmd_present(*pmd)) { /* Another has populated it */
307 pte_lock_deinit(new); 307 pte_lock_deinit(new);
308 pte_free(new); 308 pte_free(mm, new);
309 } else { 309 } else {
310 mm->nr_ptes++; 310 mm->nr_ptes++;
311 inc_zone_page_state(new, NR_PAGETABLE); 311 inc_zone_page_state(new, NR_PAGETABLE);
@@ -323,7 +323,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
323 323
324 spin_lock(&init_mm.page_table_lock); 324 spin_lock(&init_mm.page_table_lock);
325 if (pmd_present(*pmd)) /* Another has populated it */ 325 if (pmd_present(*pmd)) /* Another has populated it */
326 pte_free_kernel(new); 326 pte_free_kernel(&init_mm, new);
327 else 327 else
328 pmd_populate_kernel(&init_mm, pmd, new); 328 pmd_populate_kernel(&init_mm, pmd, new);
329 spin_unlock(&init_mm.page_table_lock); 329 spin_unlock(&init_mm.page_table_lock);
@@ -1109,7 +1109,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1109} 1109}
1110EXPORT_SYMBOL(get_user_pages); 1110EXPORT_SYMBOL(get_user_pages);
1111 1111
1112pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) 1112pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1113 spinlock_t **ptl)
1113{ 1114{
1114 pgd_t * pgd = pgd_offset(mm, addr); 1115 pgd_t * pgd = pgd_offset(mm, addr);
1115 pud_t * pud = pud_alloc(mm, pgd, addr); 1116 pud_t * pud = pud_alloc(mm, pgd, addr);
@@ -1517,10 +1518,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
1517 memset(kaddr, 0, PAGE_SIZE); 1518 memset(kaddr, 0, PAGE_SIZE);
1518 kunmap_atomic(kaddr, KM_USER0); 1519 kunmap_atomic(kaddr, KM_USER0);
1519 flush_dcache_page(dst); 1520 flush_dcache_page(dst);
1520 return; 1521 } else
1521 1522 copy_user_highpage(dst, src, va, vma);
1522 }
1523 copy_user_highpage(dst, src, va, vma);
1524} 1523}
1525 1524
1526/* 1525/*
@@ -1629,6 +1628,7 @@ gotten:
1629 if (!new_page) 1628 if (!new_page)
1630 goto oom; 1629 goto oom;
1631 cow_user_page(new_page, old_page, address, vma); 1630 cow_user_page(new_page, old_page, address, vma);
1631 __SetPageUptodate(new_page);
1632 1632
1633 /* 1633 /*
1634 * Re-check the pte - we dropped the lock 1634 * Re-check the pte - we dropped the lock
@@ -1909,50 +1909,49 @@ EXPORT_SYMBOL(unmap_mapping_range);
  */
 int vmtruncate(struct inode * inode, loff_t offset)
 {
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long limit;
-
-	if (inode->i_size < offset)
-		goto do_expand;
-	/*
-	 * truncation of in-use swapfiles is disallowed - it would cause
-	 * subsequent swapout to scribble on the now-freed blocks.
-	 */
-	if (IS_SWAPFILE(inode))
-		goto out_busy;
-	i_size_write(inode, offset);
+	if (inode->i_size < offset) {
+		unsigned long limit;
+
+		limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+		if (limit != RLIM_INFINITY && offset > limit)
+			goto out_sig;
+		if (offset > inode->i_sb->s_maxbytes)
+			goto out_big;
+		i_size_write(inode, offset);
+	} else {
+		struct address_space *mapping = inode->i_mapping;
+
+		/*
+		 * truncation of in-use swapfiles is disallowed - it would
+		 * cause subsequent swapout to scribble on the now-freed
+		 * blocks.
+		 */
+		if (IS_SWAPFILE(inode))
+			return -ETXTBSY;
+		i_size_write(inode, offset);
+
+		/*
+		 * unmap_mapping_range is called twice, first simply for
+		 * efficiency so that truncate_inode_pages does fewer
+		 * single-page unmaps. However after this first call, and
+		 * before truncate_inode_pages finishes, it is possible for
+		 * private pages to be COWed, which remain after
+		 * truncate_inode_pages finishes, hence the second
+		 * unmap_mapping_range call must be made for correctness.
+		 */
+		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+		truncate_inode_pages(mapping, offset);
+		unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
+	}
 
-	/*
-	 * unmap_mapping_range is called twice, first simply for efficiency
-	 * so that truncate_inode_pages does fewer single-page unmaps. However
-	 * after this first call, and before truncate_inode_pages finishes,
-	 * it is possible for private pages to be COWed, which remain after
-	 * truncate_inode_pages finishes, hence the second unmap_mapping_range
-	 * call must be made for correctness.
-	 */
-	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	truncate_inode_pages(mapping, offset);
-	unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
-	goto out_truncate;
-
-do_expand:
-	limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
-	if (limit != RLIM_INFINITY && offset > limit)
-		goto out_sig;
-	if (offset > inode->i_sb->s_maxbytes)
-		goto out_big;
-	i_size_write(inode, offset);
-
-out_truncate:
 	if (inode->i_op && inode->i_op->truncate)
 		inode->i_op->truncate(inode);
 	return 0;
+
 out_sig:
 	send_sig(SIGXFSZ, current, 0);
 out_big:
 	return -EFBIG;
-out_busy:
-	return -ETXTBSY;
 }
 EXPORT_SYMBOL(vmtruncate);
 
@@ -1980,67 +1979,6 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1980 return 0; 1979 return 0;
1981} 1980}
1982 1981
1983/**
1984 * swapin_readahead - swap in pages in hope we need them soon
1985 * @entry: swap entry of this memory
1986 * @addr: address to start
1987 * @vma: user vma this addresses belong to
1988 *
1989 * Primitive swap readahead code. We simply read an aligned block of
1990 * (1 << page_cluster) entries in the swap area. This method is chosen
1991 * because it doesn't cost us any seek time. We also make sure to queue
1992 * the 'original' request together with the readahead ones...
1993 *
1994 * This has been extended to use the NUMA policies from the mm triggering
1995 * the readahead.
1996 *
1997 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
1998 */
1999void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struct *vma)
2000{
2001#ifdef CONFIG_NUMA
2002 struct vm_area_struct *next_vma = vma ? vma->vm_next : NULL;
2003#endif
2004 int i, num;
2005 struct page *new_page;
2006 unsigned long offset;
2007
2008 /*
2009 * Get the number of handles we should do readahead io to.
2010 */
2011 num = valid_swaphandles(entry, &offset);
2012 for (i = 0; i < num; offset++, i++) {
2013 /* Ok, do the async read-ahead now */
2014 new_page = read_swap_cache_async(swp_entry(swp_type(entry),
2015 offset), vma, addr);
2016 if (!new_page)
2017 break;
2018 page_cache_release(new_page);
2019#ifdef CONFIG_NUMA
2020 /*
2021 * Find the next applicable VMA for the NUMA policy.
2022 */
2023 addr += PAGE_SIZE;
2024 if (addr == 0)
2025 vma = NULL;
2026 if (vma) {
2027 if (addr >= vma->vm_end) {
2028 vma = next_vma;
2029 next_vma = vma ? vma->vm_next : NULL;
2030 }
2031 if (vma && addr < vma->vm_start)
2032 vma = NULL;
2033 } else {
2034 if (next_vma && addr >= next_vma->vm_start) {
2035 vma = next_vma;
2036 next_vma = vma->vm_next;
2037 }
2038 }
2039#endif
2040 }
2041 lru_add_drain(); /* Push any new pages onto the LRU now */
2042}
2043
2044/* 1982/*
2045 * We enter with non-exclusive mmap_sem (to exclude vma changes, 1983 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2046 * but allow concurrent faults), and pte mapped but not yet locked. 1984 * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2068,8 +2006,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2068 page = lookup_swap_cache(entry); 2006 page = lookup_swap_cache(entry);
2069 if (!page) { 2007 if (!page) {
2070 grab_swap_token(); /* Contend for token _before_ read-in */ 2008 grab_swap_token(); /* Contend for token _before_ read-in */
2071 swapin_readahead(entry, address, vma); 2009 page = swapin_readahead(entry,
2072 page = read_swap_cache_async(entry, vma, address); 2010 GFP_HIGHUSER_MOVABLE, vma, address);
2073 if (!page) { 2011 if (!page) {
2074 /* 2012 /*
2075 * Back out if somebody else faulted in this pte 2013 * Back out if somebody else faulted in this pte
@@ -2163,6 +2101,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2163 page = alloc_zeroed_user_highpage_movable(vma, address); 2101 page = alloc_zeroed_user_highpage_movable(vma, address);
2164 if (!page) 2102 if (!page)
2165 goto oom; 2103 goto oom;
2104 __SetPageUptodate(page);
2166 2105
2167 entry = mk_pte(page, vma->vm_page_prot); 2106 entry = mk_pte(page, vma->vm_page_prot);
2168 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2107 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2263,6 +2202,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2263 goto out; 2202 goto out;
2264 } 2203 }
2265 copy_user_highpage(page, vmf.page, address, vma); 2204 copy_user_highpage(page, vmf.page, address, vma);
2205 __SetPageUptodate(page);
2266 } else { 2206 } else {
2267 /* 2207 /*
2268 * If the page will be shareable, see if the backing 2208 * If the page will be shareable, see if the backing
@@ -2563,7 +2503,7 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2563 2503
2564 spin_lock(&mm->page_table_lock); 2504 spin_lock(&mm->page_table_lock);
2565 if (pgd_present(*pgd)) /* Another has populated it */ 2505 if (pgd_present(*pgd)) /* Another has populated it */
2566 pud_free(new); 2506 pud_free(mm, new);
2567 else 2507 else
2568 pgd_populate(mm, pgd, new); 2508 pgd_populate(mm, pgd, new);
2569 spin_unlock(&mm->page_table_lock); 2509 spin_unlock(&mm->page_table_lock);
@@ -2585,12 +2525,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2585 spin_lock(&mm->page_table_lock); 2525 spin_lock(&mm->page_table_lock);
2586#ifndef __ARCH_HAS_4LEVEL_HACK 2526#ifndef __ARCH_HAS_4LEVEL_HACK
2587 if (pud_present(*pud)) /* Another has populated it */ 2527 if (pud_present(*pud)) /* Another has populated it */
2588 pmd_free(new); 2528 pmd_free(mm, new);
2589 else 2529 else
2590 pud_populate(mm, pud, new); 2530 pud_populate(mm, pud, new);
2591#else 2531#else
2592 if (pgd_present(*pud)) /* Another has populated it */ 2532 if (pgd_present(*pud)) /* Another has populated it */
2593 pmd_free(new); 2533 pmd_free(mm, new);
2594 else 2534 else
2595 pgd_populate(mm, pud, new); 2535 pgd_populate(mm, pud, new);
2596#endif /* __ARCH_HAS_4LEVEL_HACK */ 2536#endif /* __ARCH_HAS_4LEVEL_HACK */
@@ -2618,46 +2558,6 @@ int make_pages_present(unsigned long addr, unsigned long end)
2618 return ret == len ? 0 : -1; 2558 return ret == len ? 0 : -1;
2619} 2559}
2620 2560
2621/*
2622 * Map a vmalloc()-space virtual address to the physical page.
2623 */
2624struct page * vmalloc_to_page(void * vmalloc_addr)
2625{
2626 unsigned long addr = (unsigned long) vmalloc_addr;
2627 struct page *page = NULL;
2628 pgd_t *pgd = pgd_offset_k(addr);
2629 pud_t *pud;
2630 pmd_t *pmd;
2631 pte_t *ptep, pte;
2632
2633 if (!pgd_none(*pgd)) {
2634 pud = pud_offset(pgd, addr);
2635 if (!pud_none(*pud)) {
2636 pmd = pmd_offset(pud, addr);
2637 if (!pmd_none(*pmd)) {
2638 ptep = pte_offset_map(pmd, addr);
2639 pte = *ptep;
2640 if (pte_present(pte))
2641 page = pte_page(pte);
2642 pte_unmap(ptep);
2643 }
2644 }
2645 }
2646 return page;
2647}
2648
2649EXPORT_SYMBOL(vmalloc_to_page);
2650
2651/*
2652 * Map a vmalloc()-space virtual address to the physical page frame number.
2653 */
2654unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2655{
2656 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
2657}
2658
2659EXPORT_SYMBOL(vmalloc_to_pfn);
2660
2661#if !defined(__HAVE_ARCH_GATE_AREA) 2561#if !defined(__HAVE_ARCH_GATE_AREA)
2662 2562
2663#if defined(AT_SYSINFO_EHDR) 2563#if defined(AT_SYSINFO_EHDR)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9512a544d044..7469c503580d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -481,8 +481,6 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
481 return offlined; 481 return offlined;
482} 482}
483 483
484extern void drain_all_local_pages(void);
485
486int offline_pages(unsigned long start_pfn, 484int offline_pages(unsigned long start_pfn,
487 unsigned long end_pfn, unsigned long timeout) 485 unsigned long end_pfn, unsigned long timeout)
488{ 486{
@@ -540,7 +538,7 @@ repeat:
540 lru_add_drain_all(); 538 lru_add_drain_all();
541 flush_scheduled_work(); 539 flush_scheduled_work();
542 cond_resched(); 540 cond_resched();
543 drain_all_local_pages(); 541 drain_all_pages();
544 } 542 }
545 543
546 pfn = scan_lru_pages(start_pfn, end_pfn); 544 pfn = scan_lru_pages(start_pfn, end_pfn);
@@ -563,7 +561,7 @@ repeat:
563 flush_scheduled_work(); 561 flush_scheduled_work();
564 yield(); 562 yield();
565 /* drain pcp pages , this is synchrouns. */ 563 /* drain pcp pages , this is synchrouns. */
566 drain_all_local_pages(); 564 drain_all_pages();
567 /* check again */ 565 /* check again */
568 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 566 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
569 if (offlined_pages < 0) { 567 if (offlined_pages < 0) {
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a207e8d17ea..857a987e3690 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -115,11 +115,6 @@ int putback_lru_pages(struct list_head *l)
115 return count; 115 return count;
116} 116}
117 117
118static inline int is_swap_pte(pte_t pte)
119{
120 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
121}
122
123/* 118/*
124 * Restore a potential migration pte to a working pte entry 119 * Restore a potential migration pte to a working pte entry
125 */ 120 */
@@ -645,15 +640,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
645 rcu_read_lock(); 640 rcu_read_lock();
646 rcu_locked = 1; 641 rcu_locked = 1;
647 } 642 }
643
648 /* 644 /*
649 * This is a corner case handling. 645 * Corner case handling:
650 * When a new swap-cache is read into, it is linked to LRU 646 * 1. When a new swap-cache page is read into, it is added to the LRU
651 * and treated as swapcache but has no rmap yet. 647 * and treated as swapcache but it has no rmap yet.
652 * Calling try_to_unmap() against a page->mapping==NULL page is 648 * Calling try_to_unmap() against a page->mapping==NULL page will
653 * BUG. So handle it here. 649 * trigger a BUG. So handle it here.
650 * 2. An orphaned page (see truncate_complete_page) might have
651 * fs-private metadata. The page can be picked up due to memory
652 * offlining. Everywhere else except page reclaim, the page is
653 * invisible to the vm, so the page can not be migrated. So try to
654 * free the metadata, so the page can be freed.
654 */ 655 */
655 if (!page->mapping) 656 if (!page->mapping) {
657 if (!PageAnon(page) && PagePrivate(page)) {
658 /*
659 * Go direct to try_to_free_buffers() here because
660 * a) that's what try_to_release_page() would do anyway
661 * b) we may be under rcu_read_lock() here, so we can't
662 * use GFP_KERNEL which is what try_to_release_page()
663 * needs to be effective.
664 */
665 try_to_free_buffers(page);
666 }
656 goto rcu_unlock; 667 goto rcu_unlock;
668 }
669
657 /* Establish migration ptes or remove ptes */ 670 /* Establish migration ptes or remove ptes */
658 try_to_unmap(page, 1); 671 try_to_unmap(page, 1);
659 672
diff --git a/mm/mmap.c b/mm/mmap.c
index 8295577a83b2..bb4c963cc534 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -36,6 +36,10 @@
36#define arch_mmap_check(addr, len, flags) (0) 36#define arch_mmap_check(addr, len, flags) (0)
37#endif 37#endif
38 38
39#ifndef arch_rebalance_pgtables
40#define arch_rebalance_pgtables(addr, len) (addr)
41#endif
42
39static void unmap_region(struct mm_struct *mm, 43static void unmap_region(struct mm_struct *mm,
40 struct vm_area_struct *vma, struct vm_area_struct *prev, 44 struct vm_area_struct *vma, struct vm_area_struct *prev,
41 unsigned long start, unsigned long end); 45 unsigned long start, unsigned long end);
@@ -1424,7 +1428,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1424 if (addr & ~PAGE_MASK) 1428 if (addr & ~PAGE_MASK)
1425 return -EINVAL; 1429 return -EINVAL;
1426 1430
1427 return addr; 1431 return arch_rebalance_pgtables(addr, len);
1428} 1432}
1429 1433
1430EXPORT_SYMBOL(get_unmapped_area); 1434EXPORT_SYMBOL(get_unmapped_area);
diff --git a/mm/nommu.c b/mm/nommu.c
index b989cb928a7c..5d8ae086f74e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,6 +10,7 @@
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org>
13 */ 14 */
14 15
15#include <linux/module.h> 16#include <linux/module.h>
@@ -167,7 +168,7 @@ EXPORT_SYMBOL(get_user_pages);
167DEFINE_RWLOCK(vmlist_lock); 168DEFINE_RWLOCK(vmlist_lock);
168struct vm_struct *vmlist; 169struct vm_struct *vmlist;
169 170
170void vfree(void *addr) 171void vfree(const void *addr)
171{ 172{
172 kfree(addr); 173 kfree(addr);
173} 174}
@@ -183,13 +184,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
183} 184}
184EXPORT_SYMBOL(__vmalloc); 185EXPORT_SYMBOL(__vmalloc);
185 186
186struct page * vmalloc_to_page(void *addr) 187void *vmalloc_user(unsigned long size)
188{
189 void *ret;
190
191 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
192 PAGE_KERNEL);
193 if (ret) {
194 struct vm_area_struct *vma;
195
196 down_write(&current->mm->mmap_sem);
197 vma = find_vma(current->mm, (unsigned long)ret);
198 if (vma)
199 vma->vm_flags |= VM_USERMAP;
200 up_write(&current->mm->mmap_sem);
201 }
202
203 return ret;
204}
205EXPORT_SYMBOL(vmalloc_user);
206
207struct page *vmalloc_to_page(const void *addr)
187{ 208{
188 return virt_to_page(addr); 209 return virt_to_page(addr);
189} 210}
190EXPORT_SYMBOL(vmalloc_to_page); 211EXPORT_SYMBOL(vmalloc_to_page);
191 212
192unsigned long vmalloc_to_pfn(void *addr) 213unsigned long vmalloc_to_pfn(const void *addr)
193{ 214{
194 return page_to_pfn(virt_to_page(addr)); 215 return page_to_pfn(virt_to_page(addr));
195} 216}
@@ -253,10 +274,17 @@ EXPORT_SYMBOL(vmalloc_32);
253 * 274 *
254 * The resulting memory area is 32bit addressable and zeroed so it can be 275 * The resulting memory area is 32bit addressable and zeroed so it can be
255 * mapped to userspace without leaking data. 276 * mapped to userspace without leaking data.
277 *
278 * VM_USERMAP is set on the corresponding VMA so that subsequent calls to
279 * remap_vmalloc_range() are permissible.
256 */ 280 */
257void *vmalloc_32_user(unsigned long size) 281void *vmalloc_32_user(unsigned long size)
258{ 282{
259 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); 283 /*
284 * We'll have to sort out the ZONE_DMA bits for 64-bit,
285 * but for now this can simply use vmalloc_user() directly.
286 */
287 return vmalloc_user(size);
260} 288}
261EXPORT_SYMBOL(vmalloc_32_user); 289EXPORT_SYMBOL(vmalloc_32_user);
262 290
@@ -267,7 +295,7 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_
267} 295}
268EXPORT_SYMBOL(vmap); 296EXPORT_SYMBOL(vmap);
269 297
270void vunmap(void *addr) 298void vunmap(const void *addr)
271{ 299{
272 BUG(); 300 BUG();
273} 301}
@@ -1216,6 +1244,21 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
1216} 1244}
1217EXPORT_SYMBOL(remap_pfn_range); 1245EXPORT_SYMBOL(remap_pfn_range);
1218 1246
1247int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1248 unsigned long pgoff)
1249{
1250 unsigned int size = vma->vm_end - vma->vm_start;
1251
1252 if (!(vma->vm_flags & VM_USERMAP))
1253 return -EINVAL;
1254
1255 vma->vm_start = (unsigned long)(addr + (pgoff << PAGE_SHIFT));
1256 vma->vm_end = vma->vm_start + size;
1257
1258 return 0;
1259}
1260EXPORT_SYMBOL(remap_vmalloc_range);
1261
1219void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) 1262void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1220{ 1263{
1221} 1264}
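
[Editor's note: not part of the commit. A hedged sketch of how a nommu driver might pair the vmalloc_user() added above with remap_vmalloc_range(); the buffer, function names and size are hypothetical.]

/* Hypothetical driver fragment; demo_buf, demo_mmap and the size are
 * illustrative only.
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_buf;

static int demo_alloc(void)
{
	/* vmalloc_user() zeroes the buffer and marks its VMA VM_USERMAP */
	demo_buf = vmalloc_user(4 * PAGE_SIZE);
	return demo_buf ? 0 : -ENOMEM;
}

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* only VMAs flagged VM_USERMAP may be remapped (see the check above) */
	return remap_vmalloc_range(vma, demo_buf, 0);
}
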
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 96473b482099..c1850bf991cd 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -125,8 +125,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
125 * Superuser processes are usually more important, so we make it 125 * Superuser processes are usually more important, so we make it
126 * less likely that we kill those. 126 * less likely that we kill those.
127 */ 127 */
128 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || 128 if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
129 p->uid == 0 || p->euid == 0)
130 points /= 4; 129 points /= 4;
131 130
132 /* 131 /*
@@ -135,7 +134,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
135 * tend to only have this flag set on applications they think 134 * tend to only have this flag set on applications they think
136 * of as important. 135 * of as important.
137 */ 136 */
138 if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) 137 if (__capable(p, CAP_SYS_RAWIO))
139 points /= 4; 138 points /= 4;
140 139
141 /* 140 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3d3848fa6324..5e00f1772c20 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
69int dirty_background_ratio = 5; 69int dirty_background_ratio = 5;
70 70
71/* 71/*
72 * free highmem will not be subtracted from the total free memory
73 * for calculating free ratios if vm_highmem_is_dirtyable is true
74 */
75int vm_highmem_is_dirtyable;
76
77/*
72 * The generator of dirty data starts writeback at this percentage 78 * The generator of dirty data starts writeback at this percentage
73 */ 79 */
74int vm_dirty_ratio = 10; 80int vm_dirty_ratio = 10;
@@ -219,7 +225,7 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
219 * 225 *
220 * dirty -= (dirty/8) * p_{t} 226 * dirty -= (dirty/8) * p_{t}
221 */ 227 */
222void task_dirty_limit(struct task_struct *tsk, long *pdirty) 228static void task_dirty_limit(struct task_struct *tsk, long *pdirty)
223{ 229{
224 long numerator, denominator; 230 long numerator, denominator;
225 long dirty = *pdirty; 231 long dirty = *pdirty;
@@ -287,7 +293,10 @@ static unsigned long determine_dirtyable_memory(void)
287 x = global_page_state(NR_FREE_PAGES) 293 x = global_page_state(NR_FREE_PAGES)
288 + global_page_state(NR_INACTIVE) 294 + global_page_state(NR_INACTIVE)
289 + global_page_state(NR_ACTIVE); 295 + global_page_state(NR_ACTIVE);
290 x -= highmem_dirtyable_memory(x); 296
297 if (!vm_highmem_is_dirtyable)
298 x -= highmem_dirtyable_memory(x);
299
291 return x + 1; /* Ensure that we never return 0 */ 300 return x + 1; /* Ensure that we never return 0 */
292} 301}
293 302
@@ -558,6 +567,7 @@ static void background_writeout(unsigned long _min_pages)
558 global_page_state(NR_UNSTABLE_NFS) < background_thresh 567 global_page_state(NR_UNSTABLE_NFS) < background_thresh
559 && min_pages <= 0) 568 && min_pages <= 0)
560 break; 569 break;
570 wbc.more_io = 0;
561 wbc.encountered_congestion = 0; 571 wbc.encountered_congestion = 0;
562 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 572 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
563 wbc.pages_skipped = 0; 573 wbc.pages_skipped = 0;
@@ -565,8 +575,9 @@ static void background_writeout(unsigned long _min_pages)
565 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 575 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
566 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 576 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
567 /* Wrote less than expected */ 577 /* Wrote less than expected */
568 congestion_wait(WRITE, HZ/10); 578 if (wbc.encountered_congestion || wbc.more_io)
569 if (!wbc.encountered_congestion) 579 congestion_wait(WRITE, HZ/10);
580 else
570 break; 581 break;
571 } 582 }
572 } 583 }
@@ -631,11 +642,12 @@ static void wb_kupdate(unsigned long arg)
631 global_page_state(NR_UNSTABLE_NFS) + 642 global_page_state(NR_UNSTABLE_NFS) +
632 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 643 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
633 while (nr_to_write > 0) { 644 while (nr_to_write > 0) {
645 wbc.more_io = 0;
634 wbc.encountered_congestion = 0; 646 wbc.encountered_congestion = 0;
635 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 647 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
636 writeback_inodes(&wbc); 648 writeback_inodes(&wbc);
637 if (wbc.nr_to_write > 0) { 649 if (wbc.nr_to_write > 0) {
638 if (wbc.encountered_congestion) 650 if (wbc.encountered_congestion || wbc.more_io)
639 congestion_wait(WRITE, HZ/10); 651 congestion_wait(WRITE, HZ/10);
640 else 652 else
641 break; /* All the old data is written */ 653 break; /* All the old data is written */
@@ -1064,7 +1076,7 @@ static int __set_page_dirty(struct page *page)
1064 return 0; 1076 return 0;
1065} 1077}
1066 1078
1067int fastcall set_page_dirty(struct page *page) 1079int set_page_dirty(struct page *page)
1068{ 1080{
1069 int ret = __set_page_dirty(page); 1081 int ret = __set_page_dirty(page);
1070 if (ret) 1082 if (ret)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b2838c24e582..37576b822f06 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -537,7 +537,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
537/* 537/*
538 * permit the bootmem allocator to evade page validation on high-order frees 538 * permit the bootmem allocator to evade page validation on high-order frees
539 */ 539 */
540void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) 540void __init __free_pages_bootmem(struct page *page, unsigned int order)
541{ 541{
542 if (order == 0) { 542 if (order == 0) {
543 __ClearPageReserved(page); 543 __ClearPageReserved(page);
@@ -890,31 +890,51 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
890} 890}
891#endif 891#endif
892 892
893static void __drain_pages(unsigned int cpu) 893/*
894 * Drain pages of the indicated processor.
895 *
896 * The processor must either be the current processor and the
897 * thread pinned to the current processor or a processor that
898 * is not online.
899 */
900static void drain_pages(unsigned int cpu)
894{ 901{
895 unsigned long flags; 902 unsigned long flags;
896 struct zone *zone; 903 struct zone *zone;
897 int i;
898 904
899 for_each_zone(zone) { 905 for_each_zone(zone) {
900 struct per_cpu_pageset *pset; 906 struct per_cpu_pageset *pset;
907 struct per_cpu_pages *pcp;
901 908
902 if (!populated_zone(zone)) 909 if (!populated_zone(zone))
903 continue; 910 continue;
904 911
905 pset = zone_pcp(zone, cpu); 912 pset = zone_pcp(zone, cpu);
906 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 913
907 struct per_cpu_pages *pcp; 914 pcp = &pset->pcp;
908 915 local_irq_save(flags);
909 pcp = &pset->pcp[i]; 916 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
910 local_irq_save(flags); 917 pcp->count = 0;
911 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 918 local_irq_restore(flags);
912 pcp->count = 0;
913 local_irq_restore(flags);
914 }
915 } 919 }
916} 920}
917 921
922/*
923 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
924 */
925void drain_local_pages(void *arg)
926{
927 drain_pages(smp_processor_id());
928}
929
930/*
931 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
932 */
933void drain_all_pages(void)
934{
935 on_each_cpu(drain_local_pages, NULL, 0, 1);
936}
937
918#ifdef CONFIG_HIBERNATION 938#ifdef CONFIG_HIBERNATION
919 939
920void mark_free_pages(struct zone *zone) 940void mark_free_pages(struct zone *zone)
@@ -952,40 +972,9 @@ void mark_free_pages(struct zone *zone)
952#endif /* CONFIG_PM */ 972#endif /* CONFIG_PM */
953 973
954/* 974/*
955 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
956 */
957void drain_local_pages(void)
958{
959 unsigned long flags;
960
961 local_irq_save(flags);
962 __drain_pages(smp_processor_id());
963 local_irq_restore(flags);
964}
965
966void smp_drain_local_pages(void *arg)
967{
968 drain_local_pages();
969}
970
971/*
972 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
973 */
974void drain_all_local_pages(void)
975{
976 unsigned long flags;
977
978 local_irq_save(flags);
979 __drain_pages(smp_processor_id());
980 local_irq_restore(flags);
981
982 smp_call_function(smp_drain_local_pages, NULL, 0, 1);
983}
984
985/*
986 * Free a 0-order page 975 * Free a 0-order page
987 */ 976 */
988static void fastcall free_hot_cold_page(struct page *page, int cold) 977static void free_hot_cold_page(struct page *page, int cold)
989{ 978{
990 struct zone *zone = page_zone(page); 979 struct zone *zone = page_zone(page);
991 struct per_cpu_pages *pcp; 980 struct per_cpu_pages *pcp;
@@ -1001,10 +990,13 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
1001 arch_free_page(page, 0); 990 arch_free_page(page, 0);
1002 kernel_map_pages(page, 1, 0); 991 kernel_map_pages(page, 1, 0);
1003 992
1004 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 993 pcp = &zone_pcp(zone, get_cpu())->pcp;
1005 local_irq_save(flags); 994 local_irq_save(flags);
1006 __count_vm_event(PGFREE); 995 __count_vm_event(PGFREE);
1007 list_add(&page->lru, &pcp->list); 996 if (cold)
997 list_add_tail(&page->lru, &pcp->list);
998 else
999 list_add(&page->lru, &pcp->list);
1008 set_page_private(page, get_pageblock_migratetype(page)); 1000 set_page_private(page, get_pageblock_migratetype(page));
1009 pcp->count++; 1001 pcp->count++;
1010 if (pcp->count >= pcp->high) { 1002 if (pcp->count >= pcp->high) {
@@ -1015,12 +1007,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
1015 put_cpu(); 1007 put_cpu();
1016} 1008}
1017 1009
1018void fastcall free_hot_page(struct page *page) 1010void free_hot_page(struct page *page)
1019{ 1011{
1020 free_hot_cold_page(page, 0); 1012 free_hot_cold_page(page, 0);
1021} 1013}
1022 1014
1023void fastcall free_cold_page(struct page *page) 1015void free_cold_page(struct page *page)
1024{ 1016{
1025 free_hot_cold_page(page, 1); 1017 free_hot_cold_page(page, 1);
1026} 1018}
@@ -1062,7 +1054,7 @@ again:
1062 if (likely(order == 0)) { 1054 if (likely(order == 0)) {
1063 struct per_cpu_pages *pcp; 1055 struct per_cpu_pages *pcp;
1064 1056
1065 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 1057 pcp = &zone_pcp(zone, cpu)->pcp;
1066 local_irq_save(flags); 1058 local_irq_save(flags);
1067 if (!pcp->count) { 1059 if (!pcp->count) {
1068 pcp->count = rmqueue_bulk(zone, 0, 1060 pcp->count = rmqueue_bulk(zone, 0,
@@ -1072,9 +1064,15 @@ again:
1072 } 1064 }
1073 1065
1074 /* Find a page of the appropriate migrate type */ 1066 /* Find a page of the appropriate migrate type */
1075 list_for_each_entry(page, &pcp->list, lru) 1067 if (cold) {
1076 if (page_private(page) == migratetype) 1068 list_for_each_entry_reverse(page, &pcp->list, lru)
1077 break; 1069 if (page_private(page) == migratetype)
1070 break;
1071 } else {
1072 list_for_each_entry(page, &pcp->list, lru)
1073 if (page_private(page) == migratetype)
1074 break;
1075 }
1078 1076
1079 /* Allocate more to the pcp list if necessary */ 1077 /* Allocate more to the pcp list if necessary */
1080 if (unlikely(&page->lru == &pcp->list)) { 1078 if (unlikely(&page->lru == &pcp->list)) {
@@ -1569,7 +1567,7 @@ nofail_alloc:
1569 cond_resched(); 1567 cond_resched();
1570 1568
1571 if (order != 0) 1569 if (order != 0)
1572 drain_all_local_pages(); 1570 drain_all_pages();
1573 1571
1574 if (likely(did_some_progress)) { 1572 if (likely(did_some_progress)) {
1575 page = get_page_from_freelist(gfp_mask, order, 1573 page = get_page_from_freelist(gfp_mask, order,
@@ -1643,7 +1641,7 @@ EXPORT_SYMBOL(__alloc_pages);
1643/* 1641/*
1644 * Common helper functions. 1642 * Common helper functions.
1645 */ 1643 */
1646fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1644unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1647{ 1645{
1648 struct page * page; 1646 struct page * page;
1649 page = alloc_pages(gfp_mask, order); 1647 page = alloc_pages(gfp_mask, order);
@@ -1654,7 +1652,7 @@ fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1654 1652
1655EXPORT_SYMBOL(__get_free_pages); 1653EXPORT_SYMBOL(__get_free_pages);
1656 1654
1657fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1655unsigned long get_zeroed_page(gfp_t gfp_mask)
1658{ 1656{
1659 struct page * page; 1657 struct page * page;
1660 1658
@@ -1680,7 +1678,7 @@ void __pagevec_free(struct pagevec *pvec)
1680 free_hot_cold_page(pvec->pages[i], pvec->cold); 1678 free_hot_cold_page(pvec->pages[i], pvec->cold);
1681} 1679}
1682 1680
1683fastcall void __free_pages(struct page *page, unsigned int order) 1681void __free_pages(struct page *page, unsigned int order)
1684{ 1682{
1685 if (put_page_testzero(page)) { 1683 if (put_page_testzero(page)) {
1686 if (order == 0) 1684 if (order == 0)
@@ -1692,7 +1690,7 @@ fastcall void __free_pages(struct page *page, unsigned int order)
1692 1690
1693EXPORT_SYMBOL(__free_pages); 1691EXPORT_SYMBOL(__free_pages);
1694 1692
1695fastcall void free_pages(unsigned long addr, unsigned int order) 1693void free_pages(unsigned long addr, unsigned int order)
1696{ 1694{
1697 if (addr != 0) { 1695 if (addr != 0) {
1698 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1696 VM_BUG_ON(!virt_addr_valid((void *)addr));
@@ -1801,12 +1799,9 @@ void show_free_areas(void)
1801 1799
1802 pageset = zone_pcp(zone, cpu); 1800 pageset = zone_pcp(zone, cpu);
1803 1801
1804 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " 1802 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
1805 "Cold: hi:%5d, btch:%4d usd:%4d\n", 1803 cpu, pageset->pcp.high,
1806 cpu, pageset->pcp[0].high, 1804 pageset->pcp.batch, pageset->pcp.count);
1807 pageset->pcp[0].batch, pageset->pcp[0].count,
1808 pageset->pcp[1].high, pageset->pcp[1].batch,
1809 pageset->pcp[1].count);
1810 } 1805 }
1811 } 1806 }
1812 1807
@@ -1879,6 +1874,8 @@ void show_free_areas(void)
1879 printk("= %lukB\n", K(total)); 1874 printk("= %lukB\n", K(total));
1880 } 1875 }
1881 1876
1877 printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES));
1878
1882 show_swap_cache_info(); 1879 show_swap_cache_info();
1883} 1880}
1884 1881
@@ -2551,8 +2548,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2551 } 2548 }
2552} 2549}
2553 2550
2554static void __meminit zone_init_free_lists(struct pglist_data *pgdat, 2551static void __meminit zone_init_free_lists(struct zone *zone)
2555 struct zone *zone, unsigned long size)
2556{ 2552{
2557 int order, t; 2553 int order, t;
2558 for_each_migratetype_order(order, t) { 2554 for_each_migratetype_order(order, t) {
@@ -2604,17 +2600,11 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2604 2600
2605 memset(p, 0, sizeof(*p)); 2601 memset(p, 0, sizeof(*p));
2606 2602
2607 pcp = &p->pcp[0]; /* hot */ 2603 pcp = &p->pcp;
2608 pcp->count = 0; 2604 pcp->count = 0;
2609 pcp->high = 6 * batch; 2605 pcp->high = 6 * batch;
2610 pcp->batch = max(1UL, 1 * batch); 2606 pcp->batch = max(1UL, 1 * batch);
2611 INIT_LIST_HEAD(&pcp->list); 2607 INIT_LIST_HEAD(&pcp->list);
2612
2613 pcp = &p->pcp[1]; /* cold*/
2614 pcp->count = 0;
2615 pcp->high = 2 * batch;
2616 pcp->batch = max(1UL, batch/2);
2617 INIT_LIST_HEAD(&pcp->list);
2618} 2608}
2619 2609
2620/* 2610/*
@@ -2627,7 +2617,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2627{ 2617{
2628 struct per_cpu_pages *pcp; 2618 struct per_cpu_pages *pcp;
2629 2619
2630 pcp = &p->pcp[0]; /* hot list */ 2620 pcp = &p->pcp;
2631 pcp->high = high; 2621 pcp->high = high;
2632 pcp->batch = max(1UL, high/4); 2622 pcp->batch = max(1UL, high/4);
2633 if ((high/4) > (PAGE_SHIFT * 8)) 2623 if ((high/4) > (PAGE_SHIFT * 8))
@@ -2831,7 +2821,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2831 2821
2832 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2822 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2833 2823
2834 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2824 zone_init_free_lists(zone);
2835 2825
2836 return 0; 2826 return 0;
2837} 2827}
@@ -3978,10 +3968,23 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
3978 int cpu = (unsigned long)hcpu; 3968 int cpu = (unsigned long)hcpu;
3979 3969
3980 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3970 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3981 local_irq_disable(); 3971 drain_pages(cpu);
3982 __drain_pages(cpu); 3972
3973 /*
3974 * Spill the event counters of the dead processor
3975 * into the current processors event counters.
3976 * This artificially elevates the count of the current
3977 * processor.
3978 */
3983 vm_events_fold_cpu(cpu); 3979 vm_events_fold_cpu(cpu);
3984 local_irq_enable(); 3980
3981 /*
3982 * Zero the differential counters of the dead processor
3983 * so that the vm statistics are consistent.
3984 *
3985 * This is only okay since the processor is dead and cannot
3986 * race with what we are doing.
3987 */
3985 refresh_cpu_vm_stats(cpu); 3988 refresh_cpu_vm_stats(cpu);
3986 } 3989 }
3987 return NOTIFY_OK; 3990 return NOTIFY_OK;
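Note: the notifier hunk above explains that when a CPU goes offline its pending event counts are folded into the current processor and its differential counters are zeroed. The sketch below is a purely illustrative user-space model of that bookkeeping; all names and array shapes are hypothetical, and in the kernel this is done by vm_events_fold_cpu() and refresh_cpu_vm_stats().

#include <stdio.h>

#define NR_CPUS   4
#define NR_ITEMS  2

static unsigned long vm_events[NR_CPUS][NR_ITEMS];	/* per-cpu event counts */
static long vm_diff[NR_CPUS][NR_ITEMS];			/* per-cpu stat differentials */
static long vm_global[NR_ITEMS];			/* global counters */

static void fold_dead_cpu(int dead, int curr)
{
	for (int i = 0; i < NR_ITEMS; i++) {
		/* events: move the dead CPU's counts onto the current CPU */
		vm_events[curr][i] += vm_events[dead][i];
		vm_events[dead][i] = 0;

		/* stats: flush the dead CPU's differentials into the globals */
		vm_global[i] += vm_diff[dead][i];
		vm_diff[dead][i] = 0;
	}
}

int main(void)
{
	vm_events[2][0] = 7;	/* CPU 2 dies with 7 unfolded events */
	vm_diff[2][1] = -3;	/* and a -3 differential on some counter */

	fold_dead_cpu(2, 0);
	printf("cpu0 events[0]=%lu, global[1]=%ld\n",
	       vm_events[0][0], vm_global[1]);		/* 7 and -3 */
	return 0;
}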
@@ -4480,7 +4483,7 @@ int set_migratetype_isolate(struct page *page)
4480out: 4483out:
4481 spin_unlock_irqrestore(&zone->lock, flags); 4484 spin_unlock_irqrestore(&zone->lock, flags);
4482 if (!ret) 4485 if (!ret)
4483 drain_all_local_pages(); 4486 drain_all_pages();
4484 return ret; 4487 return ret;
4485} 4488}
4486 4489
diff --git a/mm/page_io.c b/mm/page_io.c
index 3b97f6850273..065c4480eaf0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -126,7 +126,7 @@ int swap_readpage(struct file *file, struct page *page)
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 BUG_ON(!PageLocked(page));
129 ClearPageUptodate(page); 129 BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
new file mode 100644
index 000000000000..b4f27d22da91
--- /dev/null
+++ b/mm/pagewalk.c
@@ -0,0 +1,131 @@
1#include <linux/mm.h>
2#include <linux/highmem.h>
3#include <linux/sched.h>
4
5static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
6 const struct mm_walk *walk, void *private)
7{
8 pte_t *pte;
9 int err = 0;
10
11 pte = pte_offset_map(pmd, addr);
12 do {
13 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
14 if (err)
15 break;
16 } while (pte++, addr += PAGE_SIZE, addr != end);
17
18 pte_unmap(pte);
19 return err;
20}
21
22static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
23 const struct mm_walk *walk, void *private)
24{
25 pmd_t *pmd;
26 unsigned long next;
27 int err = 0;
28
29 pmd = pmd_offset(pud, addr);
30 do {
31 next = pmd_addr_end(addr, end);
32 if (pmd_none_or_clear_bad(pmd)) {
33 if (walk->pte_hole)
34 err = walk->pte_hole(addr, next, private);
35 if (err)
36 break;
37 continue;
38 }
39 if (walk->pmd_entry)
40 err = walk->pmd_entry(pmd, addr, next, private);
41 if (!err && walk->pte_entry)
42 err = walk_pte_range(pmd, addr, next, walk, private);
43 if (err)
44 break;
45 } while (pmd++, addr = next, addr != end);
46
47 return err;
48}
49
50static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
51 const struct mm_walk *walk, void *private)
52{
53 pud_t *pud;
54 unsigned long next;
55 int err = 0;
56
57 pud = pud_offset(pgd, addr);
58 do {
59 next = pud_addr_end(addr, end);
60 if (pud_none_or_clear_bad(pud)) {
61 if (walk->pte_hole)
62 err = walk->pte_hole(addr, next, private);
63 if (err)
64 break;
65 continue;
66 }
67 if (walk->pud_entry)
68 err = walk->pud_entry(pud, addr, next, private);
69 if (!err && (walk->pmd_entry || walk->pte_entry))
70 err = walk_pmd_range(pud, addr, next, walk, private);
71 if (err)
72 break;
73 } while (pud++, addr = next, addr != end);
74
75 return err;
76}
77
78/**
79 * walk_page_range - walk a memory map's page tables with a callback
80 * @mm - memory map to walk
81 * @addr - starting address
82 * @end - ending address
83 * @walk - set of callbacks to invoke for each level of the tree
84 * @private - private data passed to the callback function
85 *
86 * Recursively walk the page table for the memory area in a VMA,
87 * calling supplied callbacks. Callbacks are called in-order (first
88 * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
89 * etc.). If lower-level callbacks are omitted, walking depth is reduced.
90 *
91 * Each callback receives an entry pointer, the start and end of the
92 * associated range, and a caller-supplied private data pointer.
93 *
94 * No locks are taken, but the bottom level iterator will map PTE
95 * directories from highmem if necessary.
96 *
97 * If any callback returns a non-zero value, the walk is aborted and
98 * the return value is propagated back to the caller. Otherwise 0 is returned.
99 */
100int walk_page_range(const struct mm_struct *mm,
101 unsigned long addr, unsigned long end,
102 const struct mm_walk *walk, void *private)
103{
104 pgd_t *pgd;
105 unsigned long next;
106 int err = 0;
107
108 if (addr >= end)
109 return err;
110
111 pgd = pgd_offset(mm, addr);
112 do {
113 next = pgd_addr_end(addr, end);
114 if (pgd_none_or_clear_bad(pgd)) {
115 if (walk->pte_hole)
116 err = walk->pte_hole(addr, next, private);
117 if (err)
118 break;
119 continue;
120 }
121 if (walk->pgd_entry)
122 err = walk->pgd_entry(pgd, addr, next, private);
123 if (!err &&
124 (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
125 err = walk_pud_range(pgd, addr, next, walk, private);
126 if (err)
127 break;
128 } while (pgd++, addr = next, addr != end);
129
130 return err;
131}
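Note: the new pagewalk.c above exposes walk_page_range() driven by a struct mm_walk of per-level callbacks. The fragment below is a hypothetical caller, not part of the patch; it assumes struct mm_walk declares exactly the pgd/pud/pmd/pte/pte_hole members referenced above (its declaration lives elsewhere in this series) and that the caller arranges any locking its callbacks need, since walk_page_range() itself takes none.

/* Hypothetical user of the new API: count present PTEs in [start, end).
 * Only .pte_entry is set, so the walk descends to PTE level and skips
 * holes silently (no .pte_hole callback). */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, void *private)
{
	unsigned long *count = private;

	if (pte_present(*pte))
		(*count)++;
	return 0;			/* non-zero would abort the walk */
}

static const struct mm_walk count_walk = {
	.pte_entry = count_pte_entry,
};

static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long count = 0;

	walk_page_range(mm, start, end, &count_walk, &count);
	return count;
}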
diff --git a/mm/rmap.c b/mm/rmap.c
index dbc2ca2057a5..57ad276900c9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,7 +36,6 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 * zone->lock (within radix tree node alloc)
40 */ 39 */
41 40
42#include <linux/mm.h> 41#include <linux/mm.h>
@@ -284,7 +283,10 @@ static int page_referenced_one(struct page *page,
284 if (!pte) 283 if (!pte)
285 goto out; 284 goto out;
286 285
287 if (ptep_clear_flush_young(vma, address, pte)) 286 if (vma->vm_flags & VM_LOCKED) {
287 referenced++;
288 *mapcount = 1; /* break early from loop */
289 } else if (ptep_clear_flush_young(vma, address, pte))
288 referenced++; 290 referenced++;
289 291
290 /* Pretend the page is referenced if the task has the 292 /* Pretend the page is referenced if the task has the
diff --git a/mm/shmem.c b/mm/shmem.c
index 51b3d6ccddab..0f246c44a574 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -78,11 +78,10 @@
78 78
79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 79/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
80enum sgp_type { 80enum sgp_type {
81 SGP_QUICK, /* don't try more than file page cache lookup */
82 SGP_READ, /* don't exceed i_size, don't allocate page */ 81 SGP_READ, /* don't exceed i_size, don't allocate page */
83 SGP_CACHE, /* don't exceed i_size, may allocate page */ 82 SGP_CACHE, /* don't exceed i_size, may allocate page */
83 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
84 SGP_WRITE, /* may exceed i_size, may allocate page */ 84 SGP_WRITE, /* may exceed i_size, may allocate page */
85 SGP_FAULT, /* same as SGP_CACHE, return with page locked */
86}; 85};
87 86
88static int shmem_getpage(struct inode *inode, unsigned long idx, 87static int shmem_getpage(struct inode *inode, unsigned long idx,
@@ -194,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
194}; 193};
195 194
196static LIST_HEAD(shmem_swaplist); 195static LIST_HEAD(shmem_swaplist);
197static DEFINE_SPINLOCK(shmem_swaplist_lock); 196static DEFINE_MUTEX(shmem_swaplist_mutex);
198 197
199static void shmem_free_blocks(struct inode *inode, long pages) 198static void shmem_free_blocks(struct inode *inode, long pages)
200{ 199{
@@ -207,6 +206,31 @@ static void shmem_free_blocks(struct inode *inode, long pages)
207 } 206 }
208} 207}
209 208
209static int shmem_reserve_inode(struct super_block *sb)
210{
211 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
212 if (sbinfo->max_inodes) {
213 spin_lock(&sbinfo->stat_lock);
214 if (!sbinfo->free_inodes) {
215 spin_unlock(&sbinfo->stat_lock);
216 return -ENOSPC;
217 }
218 sbinfo->free_inodes--;
219 spin_unlock(&sbinfo->stat_lock);
220 }
221 return 0;
222}
223
224static void shmem_free_inode(struct super_block *sb)
225{
226 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
227 if (sbinfo->max_inodes) {
228 spin_lock(&sbinfo->stat_lock);
229 sbinfo->free_inodes++;
230 spin_unlock(&sbinfo->stat_lock);
231 }
232}
233
210/* 234/*
211 * shmem_recalc_inode - recalculate the size of an inode 235 * shmem_recalc_inode - recalculate the size of an inode
212 * 236 *
@@ -731,6 +755,8 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
731 (void) shmem_getpage(inode, 755 (void) shmem_getpage(inode,
732 attr->ia_size>>PAGE_CACHE_SHIFT, 756 attr->ia_size>>PAGE_CACHE_SHIFT,
733 &page, SGP_READ, NULL); 757 &page, SGP_READ, NULL);
758 if (page)
759 unlock_page(page);
734 } 760 }
735 /* 761 /*
736 * Reset SHMEM_PAGEIN flag so that shmem_truncate can 762 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
@@ -762,7 +788,6 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
762 788
763static void shmem_delete_inode(struct inode *inode) 789static void shmem_delete_inode(struct inode *inode)
764{ 790{
765 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
766 struct shmem_inode_info *info = SHMEM_I(inode); 791 struct shmem_inode_info *info = SHMEM_I(inode);
767 792
768 if (inode->i_op->truncate == shmem_truncate) { 793 if (inode->i_op->truncate == shmem_truncate) {
@@ -771,17 +796,13 @@ static void shmem_delete_inode(struct inode *inode)
771 inode->i_size = 0; 796 inode->i_size = 0;
772 shmem_truncate(inode); 797 shmem_truncate(inode);
773 if (!list_empty(&info->swaplist)) { 798 if (!list_empty(&info->swaplist)) {
774 spin_lock(&shmem_swaplist_lock); 799 mutex_lock(&shmem_swaplist_mutex);
775 list_del_init(&info->swaplist); 800 list_del_init(&info->swaplist);
776 spin_unlock(&shmem_swaplist_lock); 801 mutex_unlock(&shmem_swaplist_mutex);
777 } 802 }
778 } 803 }
779 BUG_ON(inode->i_blocks); 804 BUG_ON(inode->i_blocks);
780 if (sbinfo->max_inodes) { 805 shmem_free_inode(inode->i_sb);
781 spin_lock(&sbinfo->stat_lock);
782 sbinfo->free_inodes++;
783 spin_unlock(&sbinfo->stat_lock);
784 }
785 clear_inode(inode); 806 clear_inode(inode);
786} 807}
787 808
@@ -807,19 +828,22 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
807 struct page *subdir; 828 struct page *subdir;
808 swp_entry_t *ptr; 829 swp_entry_t *ptr;
809 int offset; 830 int offset;
831 int error;
810 832
811 idx = 0; 833 idx = 0;
812 ptr = info->i_direct; 834 ptr = info->i_direct;
813 spin_lock(&info->lock); 835 spin_lock(&info->lock);
836 if (!info->swapped) {
837 list_del_init(&info->swaplist);
838 goto lost2;
839 }
814 limit = info->next_index; 840 limit = info->next_index;
815 size = limit; 841 size = limit;
816 if (size > SHMEM_NR_DIRECT) 842 if (size > SHMEM_NR_DIRECT)
817 size = SHMEM_NR_DIRECT; 843 size = SHMEM_NR_DIRECT;
818 offset = shmem_find_swp(entry, ptr, ptr+size); 844 offset = shmem_find_swp(entry, ptr, ptr+size);
819 if (offset >= 0) { 845 if (offset >= 0)
820 shmem_swp_balance_unmap();
821 goto found; 846 goto found;
822 }
823 if (!info->i_indirect) 847 if (!info->i_indirect)
824 goto lost2; 848 goto lost2;
825 849
@@ -829,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
829 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { 853 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
830 if (unlikely(idx == stage)) { 854 if (unlikely(idx == stage)) {
831 shmem_dir_unmap(dir-1); 855 shmem_dir_unmap(dir-1);
856 if (cond_resched_lock(&info->lock)) {
857 /* check it has not been truncated */
858 if (limit > info->next_index) {
859 limit = info->next_index;
860 if (idx >= limit)
861 goto lost2;
862 }
863 }
832 dir = shmem_dir_map(info->i_indirect) + 864 dir = shmem_dir_map(info->i_indirect) +
833 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; 865 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
834 while (!*dir) { 866 while (!*dir) {
@@ -849,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
849 if (size > ENTRIES_PER_PAGE) 881 if (size > ENTRIES_PER_PAGE)
850 size = ENTRIES_PER_PAGE; 882 size = ENTRIES_PER_PAGE;
851 offset = shmem_find_swp(entry, ptr, ptr+size); 883 offset = shmem_find_swp(entry, ptr, ptr+size);
884 shmem_swp_unmap(ptr);
852 if (offset >= 0) { 885 if (offset >= 0) {
853 shmem_dir_unmap(dir); 886 shmem_dir_unmap(dir);
854 goto found; 887 goto found;
855 } 888 }
856 shmem_swp_unmap(ptr);
857 } 889 }
858 } 890 }
859lost1: 891lost1:
@@ -863,19 +895,63 @@ lost2:
863 return 0; 895 return 0;
864found: 896found:
865 idx += offset; 897 idx += offset;
866 inode = &info->vfs_inode; 898 inode = igrab(&info->vfs_inode);
867 if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
868 info->flags |= SHMEM_PAGEIN;
869 shmem_swp_set(info, ptr + offset, 0);
870 }
871 shmem_swp_unmap(ptr);
872 spin_unlock(&info->lock); 899 spin_unlock(&info->lock);
900
873 /* 901 /*
874 * Decrement swap count even when the entry is left behind: 902 * Move _head_ to start search for next from here.
875 * try_to_unuse will skip over mms, then reincrement count. 903 * But be careful: shmem_delete_inode checks list_empty without taking
904 * mutex, and there's an instant in list_move_tail when info->swaplist
905 * would appear empty, if it were the only one on shmem_swaplist. We
906 * could avoid doing it if inode NULL; or use this minor optimization.
876 */ 907 */
877 swap_free(entry); 908 if (shmem_swaplist.next != &info->swaplist)
878 return 1; 909 list_move_tail(&shmem_swaplist, &info->swaplist);
910 mutex_unlock(&shmem_swaplist_mutex);
911
912 error = 1;
913 if (!inode)
914 goto out;
915 error = radix_tree_preload(GFP_KERNEL);
916 if (error)
917 goto out;
918 error = 1;
919
920 spin_lock(&info->lock);
921 ptr = shmem_swp_entry(info, idx, NULL);
922 if (ptr && ptr->val == entry.val)
923 error = add_to_page_cache(page, inode->i_mapping,
924 idx, GFP_NOWAIT);
925 if (error == -EEXIST) {
926 struct page *filepage = find_get_page(inode->i_mapping, idx);
927 error = 1;
928 if (filepage) {
929 /*
930 * There might be a more uptodate page coming down
931 * from a stacked writepage: forget our swappage if so.
932 */
933 if (PageUptodate(filepage))
934 error = 0;
935 page_cache_release(filepage);
936 }
937 }
938 if (!error) {
939 delete_from_swap_cache(page);
940 set_page_dirty(page);
941 info->flags |= SHMEM_PAGEIN;
942 shmem_swp_set(info, ptr, 0);
943 swap_free(entry);
944 error = 1; /* not an error, but entry was found */
945 }
946 if (ptr)
947 shmem_swp_unmap(ptr);
948 spin_unlock(&info->lock);
949 radix_tree_preload_end();
950out:
951 unlock_page(page);
952 page_cache_release(page);
953 iput(inode); /* allows for NULL */
954 return error;
879} 955}
880 956
881/* 957/*
@@ -887,20 +963,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
887 struct shmem_inode_info *info; 963 struct shmem_inode_info *info;
888 int found = 0; 964 int found = 0;
889 965
890 spin_lock(&shmem_swaplist_lock); 966 mutex_lock(&shmem_swaplist_mutex);
891 list_for_each_safe(p, next, &shmem_swaplist) { 967 list_for_each_safe(p, next, &shmem_swaplist) {
892 info = list_entry(p, struct shmem_inode_info, swaplist); 968 info = list_entry(p, struct shmem_inode_info, swaplist);
893 if (!info->swapped) 969 found = shmem_unuse_inode(info, entry, page);
894 list_del_init(&info->swaplist); 970 cond_resched();
895 else if (shmem_unuse_inode(info, entry, page)) { 971 if (found)
896 /* move head to start search for next from here */ 972 goto out;
897 list_move_tail(&shmem_swaplist, &info->swaplist);
898 found = 1;
899 break;
900 }
901 } 973 }
902 spin_unlock(&shmem_swaplist_lock); 974 mutex_unlock(&shmem_swaplist_mutex);
903 return found; 975out: return found; /* 0 or 1 or -ENOMEM */
904} 976}
905 977
906/* 978/*
@@ -915,54 +987,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
915 struct inode *inode; 987 struct inode *inode;
916 988
917 BUG_ON(!PageLocked(page)); 989 BUG_ON(!PageLocked(page));
918 /*
919 * shmem_backing_dev_info's capabilities prevent regular writeback or
920 * sync from ever calling shmem_writepage; but a stacking filesystem
921 * may use the ->writepage of its underlying filesystem, in which case
922 * we want to do nothing when that underlying filesystem is tmpfs
923 * (writing out to swap is useful as a response to memory pressure, but
924 * of no use to stabilize the data) - just redirty the page, unlock it
925 * and claim success in this case. AOP_WRITEPAGE_ACTIVATE, and the
926 * page_mapped check below, must be avoided unless we're in reclaim.
927 */
928 if (!wbc->for_reclaim) {
929 set_page_dirty(page);
930 unlock_page(page);
931 return 0;
932 }
933 BUG_ON(page_mapped(page));
934
935 mapping = page->mapping; 990 mapping = page->mapping;
936 index = page->index; 991 index = page->index;
937 inode = mapping->host; 992 inode = mapping->host;
938 info = SHMEM_I(inode); 993 info = SHMEM_I(inode);
939 if (info->flags & VM_LOCKED) 994 if (info->flags & VM_LOCKED)
940 goto redirty; 995 goto redirty;
941 swap = get_swap_page(); 996 if (!total_swap_pages)
942 if (!swap.val)
943 goto redirty; 997 goto redirty;
944 998
999 /*
1000 * shmem_backing_dev_info's capabilities prevent regular writeback or
1001 * sync from ever calling shmem_writepage; but a stacking filesystem
1002 * may use the ->writepage of its underlying filesystem, in which case
1003 * tmpfs should write out to swap only in response to memory pressure,
1004 * and not for pdflush or sync. However, in those cases, we do still
1005 * want to check if there's a redundant swappage to be discarded.
1006 */
1007 if (wbc->for_reclaim)
1008 swap = get_swap_page();
1009 else
1010 swap.val = 0;
1011
945 spin_lock(&info->lock); 1012 spin_lock(&info->lock);
946 shmem_recalc_inode(inode);
947 if (index >= info->next_index) { 1013 if (index >= info->next_index) {
948 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1014 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
949 goto unlock; 1015 goto unlock;
950 } 1016 }
951 entry = shmem_swp_entry(info, index, NULL); 1017 entry = shmem_swp_entry(info, index, NULL);
952 BUG_ON(!entry); 1018 if (entry->val) {
953 BUG_ON(entry->val); 1019 /*
1020 * The more uptodate page coming down from a stacked
1021 * writepage should replace our old swappage.
1022 */
1023 free_swap_and_cache(*entry);
1024 shmem_swp_set(info, entry, 0);
1025 }
1026 shmem_recalc_inode(inode);
954 1027
955 if (move_to_swap_cache(page, swap) == 0) { 1028 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1029 remove_from_page_cache(page);
956 shmem_swp_set(info, entry, swap.val); 1030 shmem_swp_set(info, entry, swap.val);
957 shmem_swp_unmap(entry); 1031 shmem_swp_unmap(entry);
1032 if (list_empty(&info->swaplist))
1033 inode = igrab(inode);
1034 else
1035 inode = NULL;
958 spin_unlock(&info->lock); 1036 spin_unlock(&info->lock);
959 if (list_empty(&info->swaplist)) { 1037 swap_duplicate(swap);
960 spin_lock(&shmem_swaplist_lock); 1038 BUG_ON(page_mapped(page));
1039 page_cache_release(page); /* pagecache ref */
1040 set_page_dirty(page);
1041 unlock_page(page);
1042 if (inode) {
1043 mutex_lock(&shmem_swaplist_mutex);
961 /* move instead of add in case we're racing */ 1044 /* move instead of add in case we're racing */
962 list_move_tail(&info->swaplist, &shmem_swaplist); 1045 list_move_tail(&info->swaplist, &shmem_swaplist);
963 spin_unlock(&shmem_swaplist_lock); 1046 mutex_unlock(&shmem_swaplist_mutex);
1047 iput(inode);
964 } 1048 }
965 unlock_page(page);
966 return 0; 1049 return 0;
967 } 1050 }
968 1051
@@ -972,7 +1055,10 @@ unlock:
972 swap_free(swap); 1055 swap_free(swap);
973redirty: 1056redirty:
974 set_page_dirty(page); 1057 set_page_dirty(page);
975 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ 1058 if (wbc->for_reclaim)
1059 return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
1060 unlock_page(page);
1061 return 0;
976} 1062}
977 1063
978#ifdef CONFIG_NUMA 1064#ifdef CONFIG_NUMA
@@ -1025,53 +1111,33 @@ out:
1025 return err; 1111 return err;
1026} 1112}
1027 1113
1028static struct page *shmem_swapin_async(struct shared_policy *p, 1114static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1029 swp_entry_t entry, unsigned long idx) 1115 struct shmem_inode_info *info, unsigned long idx)
1030{ 1116{
1031 struct page *page;
1032 struct vm_area_struct pvma; 1117 struct vm_area_struct pvma;
1118 struct page *page;
1033 1119
1034 /* Create a pseudo vma that just contains the policy */ 1120 /* Create a pseudo vma that just contains the policy */
1035 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1121 pvma.vm_start = 0;
1036 pvma.vm_end = PAGE_SIZE;
1037 pvma.vm_pgoff = idx; 1122 pvma.vm_pgoff = idx;
1038 pvma.vm_policy = mpol_shared_policy_lookup(p, idx); 1123 pvma.vm_ops = NULL;
1039 page = read_swap_cache_async(entry, &pvma, 0); 1124 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1125 page = swapin_readahead(entry, gfp, &pvma, 0);
1040 mpol_free(pvma.vm_policy); 1126 mpol_free(pvma.vm_policy);
1041 return page; 1127 return page;
1042} 1128}
1043 1129
1044static struct page *shmem_swapin(struct shmem_inode_info *info, 1130static struct page *shmem_alloc_page(gfp_t gfp,
1045 swp_entry_t entry, unsigned long idx) 1131 struct shmem_inode_info *info, unsigned long idx)
1046{
1047 struct shared_policy *p = &info->policy;
1048 int i, num;
1049 struct page *page;
1050 unsigned long offset;
1051
1052 num = valid_swaphandles(entry, &offset);
1053 for (i = 0; i < num; offset++, i++) {
1054 page = shmem_swapin_async(p,
1055 swp_entry(swp_type(entry), offset), idx);
1056 if (!page)
1057 break;
1058 page_cache_release(page);
1059 }
1060 lru_add_drain(); /* Push any new pages onto the LRU now */
1061 return shmem_swapin_async(p, entry, idx);
1062}
1063
1064static struct page *
1065shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
1066 unsigned long idx)
1067{ 1132{
1068 struct vm_area_struct pvma; 1133 struct vm_area_struct pvma;
1069 struct page *page; 1134 struct page *page;
1070 1135
1071 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1136 /* Create a pseudo vma that just contains the policy */
1072 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 1137 pvma.vm_start = 0;
1073 pvma.vm_pgoff = idx; 1138 pvma.vm_pgoff = idx;
1074 pvma.vm_end = PAGE_SIZE; 1139 pvma.vm_ops = NULL;
1140 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1075 page = alloc_page_vma(gfp, &pvma, 0); 1141 page = alloc_page_vma(gfp, &pvma, 0);
1076 mpol_free(pvma.vm_policy); 1142 mpol_free(pvma.vm_policy);
1077 return page; 1143 return page;
@@ -1083,15 +1149,14 @@ static inline int shmem_parse_mpol(char *value, int *policy,
1083 return 1; 1149 return 1;
1084} 1150}
1085 1151
1086static inline struct page * 1152static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1087shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) 1153 struct shmem_inode_info *info, unsigned long idx)
1088{ 1154{
1089 swapin_readahead(entry, 0, NULL); 1155 return swapin_readahead(entry, gfp, NULL, 0);
1090 return read_swap_cache_async(entry, NULL, 0);
1091} 1156}
1092 1157
1093static inline struct page * 1158static inline struct page *shmem_alloc_page(gfp_t gfp,
1094shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx) 1159 struct shmem_inode_info *info, unsigned long idx)
1095{ 1160{
1096 return alloc_page(gfp); 1161 return alloc_page(gfp);
1097} 1162}
@@ -1114,6 +1179,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1114 struct page *swappage; 1179 struct page *swappage;
1115 swp_entry_t *entry; 1180 swp_entry_t *entry;
1116 swp_entry_t swap; 1181 swp_entry_t swap;
1182 gfp_t gfp;
1117 int error; 1183 int error;
1118 1184
1119 if (idx >= SHMEM_MAX_INDEX) 1185 if (idx >= SHMEM_MAX_INDEX)
@@ -1126,7 +1192,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1126 * Normally, filepage is NULL on entry, and either found 1192 * Normally, filepage is NULL on entry, and either found
1127 * uptodate immediately, or allocated and zeroed, or read 1193 * uptodate immediately, or allocated and zeroed, or read
1128 * in under swappage, which is then assigned to filepage. 1194 * in under swappage, which is then assigned to filepage.
1129 * But shmem_readpage and shmem_write_begin pass in a locked 1195 * But shmem_readpage (required for splice) passes in a locked
1130 * filepage, which may be found not uptodate by other callers 1196 * filepage, which may be found not uptodate by other callers
1131 * too, and may need to be copied from the swappage read in. 1197 * too, and may need to be copied from the swappage read in.
1132 */ 1198 */
@@ -1136,8 +1202,17 @@ repeat:
1136 if (filepage && PageUptodate(filepage)) 1202 if (filepage && PageUptodate(filepage))
1137 goto done; 1203 goto done;
1138 error = 0; 1204 error = 0;
1139 if (sgp == SGP_QUICK) 1205 gfp = mapping_gfp_mask(mapping);
1140 goto failed; 1206 if (!filepage) {
1207 /*
1208 * Try to preload while we can wait, to not make a habit of
1209 * draining atomic reserves; but don't latch on to this cpu.
1210 */
1211 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
1212 if (error)
1213 goto failed;
1214 radix_tree_preload_end();
1215 }
1141 1216
1142 spin_lock(&info->lock); 1217 spin_lock(&info->lock);
1143 shmem_recalc_inode(inode); 1218 shmem_recalc_inode(inode);
@@ -1160,7 +1235,7 @@ repeat:
1160 *type |= VM_FAULT_MAJOR; 1235 *type |= VM_FAULT_MAJOR;
1161 } 1236 }
1162 spin_unlock(&info->lock); 1237 spin_unlock(&info->lock);
1163 swappage = shmem_swapin(info, swap, idx); 1238 swappage = shmem_swapin(swap, gfp, info, idx);
1164 if (!swappage) { 1239 if (!swappage) {
1165 spin_lock(&info->lock); 1240 spin_lock(&info->lock);
1166 entry = shmem_swp_alloc(info, idx, sgp); 1241 entry = shmem_swp_alloc(info, idx, sgp);
@@ -1218,23 +1293,21 @@ repeat:
1218 SetPageUptodate(filepage); 1293 SetPageUptodate(filepage);
1219 set_page_dirty(filepage); 1294 set_page_dirty(filepage);
1220 swap_free(swap); 1295 swap_free(swap);
1221 } else if (!(error = move_from_swap_cache( 1296 } else if (!(error = add_to_page_cache(
1222 swappage, idx, mapping))) { 1297 swappage, mapping, idx, GFP_NOWAIT))) {
1223 info->flags |= SHMEM_PAGEIN; 1298 info->flags |= SHMEM_PAGEIN;
1224 shmem_swp_set(info, entry, 0); 1299 shmem_swp_set(info, entry, 0);
1225 shmem_swp_unmap(entry); 1300 shmem_swp_unmap(entry);
1301 delete_from_swap_cache(swappage);
1226 spin_unlock(&info->lock); 1302 spin_unlock(&info->lock);
1227 filepage = swappage; 1303 filepage = swappage;
1304 set_page_dirty(filepage);
1228 swap_free(swap); 1305 swap_free(swap);
1229 } else { 1306 } else {
1230 shmem_swp_unmap(entry); 1307 shmem_swp_unmap(entry);
1231 spin_unlock(&info->lock); 1308 spin_unlock(&info->lock);
1232 unlock_page(swappage); 1309 unlock_page(swappage);
1233 page_cache_release(swappage); 1310 page_cache_release(swappage);
1234 if (error == -ENOMEM) {
1235 /* let kswapd refresh zone for GFP_ATOMICs */
1236 congestion_wait(WRITE, HZ/50);
1237 }
1238 goto repeat; 1311 goto repeat;
1239 } 1312 }
1240 } else if (sgp == SGP_READ && !filepage) { 1313 } else if (sgp == SGP_READ && !filepage) {
@@ -1272,9 +1345,7 @@ repeat:
1272 1345
1273 if (!filepage) { 1346 if (!filepage) {
1274 spin_unlock(&info->lock); 1347 spin_unlock(&info->lock);
1275 filepage = shmem_alloc_page(mapping_gfp_mask(mapping), 1348 filepage = shmem_alloc_page(gfp, info, idx);
1276 info,
1277 idx);
1278 if (!filepage) { 1349 if (!filepage) {
1279 shmem_unacct_blocks(info->flags, 1); 1350 shmem_unacct_blocks(info->flags, 1);
1280 shmem_free_blocks(inode, 1); 1351 shmem_free_blocks(inode, 1);
@@ -1291,7 +1362,7 @@ repeat:
1291 shmem_swp_unmap(entry); 1362 shmem_swp_unmap(entry);
1292 } 1363 }
1293 if (error || swap.val || 0 != add_to_page_cache_lru( 1364 if (error || swap.val || 0 != add_to_page_cache_lru(
1294 filepage, mapping, idx, GFP_ATOMIC)) { 1365 filepage, mapping, idx, GFP_NOWAIT)) {
1295 spin_unlock(&info->lock); 1366 spin_unlock(&info->lock);
1296 page_cache_release(filepage); 1367 page_cache_release(filepage);
1297 shmem_unacct_blocks(info->flags, 1); 1368 shmem_unacct_blocks(info->flags, 1);
@@ -1309,14 +1380,11 @@ repeat:
1309 clear_highpage(filepage); 1380 clear_highpage(filepage);
1310 flush_dcache_page(filepage); 1381 flush_dcache_page(filepage);
1311 SetPageUptodate(filepage); 1382 SetPageUptodate(filepage);
1383 if (sgp == SGP_DIRTY)
1384 set_page_dirty(filepage);
1312 } 1385 }
1313done: 1386done:
1314 if (*pagep != filepage) { 1387 *pagep = filepage;
1315 *pagep = filepage;
1316 if (sgp != SGP_FAULT)
1317 unlock_page(filepage);
1318
1319 }
1320 return 0; 1388 return 0;
1321 1389
1322failed: 1390failed:
@@ -1336,7 +1404,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1336 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 1404 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1337 return VM_FAULT_SIGBUS; 1405 return VM_FAULT_SIGBUS;
1338 1406
1339 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_FAULT, &ret); 1407 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1340 if (error) 1408 if (error)
1341 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1409 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1342 1410
@@ -1399,15 +1467,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1399 struct shmem_inode_info *info; 1467 struct shmem_inode_info *info;
1400 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1468 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1401 1469
1402 if (sbinfo->max_inodes) { 1470 if (shmem_reserve_inode(sb))
1403 spin_lock(&sbinfo->stat_lock); 1471 return NULL;
1404 if (!sbinfo->free_inodes) {
1405 spin_unlock(&sbinfo->stat_lock);
1406 return NULL;
1407 }
1408 sbinfo->free_inodes--;
1409 spin_unlock(&sbinfo->stat_lock);
1410 }
1411 1472
1412 inode = new_inode(sb); 1473 inode = new_inode(sb);
1413 if (inode) { 1474 if (inode) {
@@ -1451,11 +1512,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1451 NULL); 1512 NULL);
1452 break; 1513 break;
1453 } 1514 }
1454 } else if (sbinfo->max_inodes) { 1515 } else
1455 spin_lock(&sbinfo->stat_lock); 1516 shmem_free_inode(sb);
1456 sbinfo->free_inodes++;
1457 spin_unlock(&sbinfo->stat_lock);
1458 }
1459 return inode; 1517 return inode;
1460} 1518}
1461 1519
@@ -1494,123 +1552,30 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1494{ 1552{
1495 struct inode *inode = mapping->host; 1553 struct inode *inode = mapping->host;
1496 1554
1555 if (pos + copied > inode->i_size)
1556 i_size_write(inode, pos + copied);
1557
1558 unlock_page(page);
1497 set_page_dirty(page); 1559 set_page_dirty(page);
1498 page_cache_release(page); 1560 page_cache_release(page);
1499 1561
1500 if (pos+copied > inode->i_size)
1501 i_size_write(inode, pos+copied);
1502
1503 return copied; 1562 return copied;
1504} 1563}
1505 1564
1506static ssize_t
1507shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
1508{
1509 struct inode *inode = file->f_path.dentry->d_inode;
1510 loff_t pos;
1511 unsigned long written;
1512 ssize_t err;
1513
1514 if ((ssize_t) count < 0)
1515 return -EINVAL;
1516
1517 if (!access_ok(VERIFY_READ, buf, count))
1518 return -EFAULT;
1519
1520 mutex_lock(&inode->i_mutex);
1521
1522 pos = *ppos;
1523 written = 0;
1524
1525 err = generic_write_checks(file, &pos, &count, 0);
1526 if (err || !count)
1527 goto out;
1528
1529 err = remove_suid(file->f_path.dentry);
1530 if (err)
1531 goto out;
1532
1533 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1534
1535 do {
1536 struct page *page = NULL;
1537 unsigned long bytes, index, offset;
1538 char *kaddr;
1539 int left;
1540
1541 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1542 index = pos >> PAGE_CACHE_SHIFT;
1543 bytes = PAGE_CACHE_SIZE - offset;
1544 if (bytes > count)
1545 bytes = count;
1546
1547 /*
1548 * We don't hold page lock across copy from user -
1549 * what would it guard against? - so no deadlock here.
1550 * But it still may be a good idea to prefault below.
1551 */
1552
1553 err = shmem_getpage(inode, index, &page, SGP_WRITE, NULL);
1554 if (err)
1555 break;
1556
1557 left = bytes;
1558 if (PageHighMem(page)) {
1559 volatile unsigned char dummy;
1560 __get_user(dummy, buf);
1561 __get_user(dummy, buf + bytes - 1);
1562
1563 kaddr = kmap_atomic(page, KM_USER0);
1564 left = __copy_from_user_inatomic(kaddr + offset,
1565 buf, bytes);
1566 kunmap_atomic(kaddr, KM_USER0);
1567 }
1568 if (left) {
1569 kaddr = kmap(page);
1570 left = __copy_from_user(kaddr + offset, buf, bytes);
1571 kunmap(page);
1572 }
1573
1574 written += bytes;
1575 count -= bytes;
1576 pos += bytes;
1577 buf += bytes;
1578 if (pos > inode->i_size)
1579 i_size_write(inode, pos);
1580
1581 flush_dcache_page(page);
1582 set_page_dirty(page);
1583 mark_page_accessed(page);
1584 page_cache_release(page);
1585
1586 if (left) {
1587 pos -= left;
1588 written -= left;
1589 err = -EFAULT;
1590 break;
1591 }
1592
1593 /*
1594 * Our dirty pages are not counted in nr_dirty,
1595 * and we do not attempt to balance dirty pages.
1596 */
1597
1598 cond_resched();
1599 } while (count);
1600
1601 *ppos = pos;
1602 if (written)
1603 err = written;
1604out:
1605 mutex_unlock(&inode->i_mutex);
1606 return err;
1607}
1608
1609static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) 1565static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor)
1610{ 1566{
1611 struct inode *inode = filp->f_path.dentry->d_inode; 1567 struct inode *inode = filp->f_path.dentry->d_inode;
1612 struct address_space *mapping = inode->i_mapping; 1568 struct address_space *mapping = inode->i_mapping;
1613 unsigned long index, offset; 1569 unsigned long index, offset;
1570 enum sgp_type sgp = SGP_READ;
1571
1572 /*
1573 * Might this read be for a stacking filesystem? Then when reading
1574 * holes of a sparse file, we actually need to allocate those pages,
1575 * and even mark them dirty, so it cannot exceed the max_blocks limit.
1576 */
1577 if (segment_eq(get_fs(), KERNEL_DS))
1578 sgp = SGP_DIRTY;
1614 1579
1615 index = *ppos >> PAGE_CACHE_SHIFT; 1580 index = *ppos >> PAGE_CACHE_SHIFT;
1616 offset = *ppos & ~PAGE_CACHE_MASK; 1581 offset = *ppos & ~PAGE_CACHE_MASK;
@@ -1629,12 +1594,14 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1629 break; 1594 break;
1630 } 1595 }
1631 1596
1632 desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL); 1597 desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
1633 if (desc->error) { 1598 if (desc->error) {
1634 if (desc->error == -EINVAL) 1599 if (desc->error == -EINVAL)
1635 desc->error = 0; 1600 desc->error = 0;
1636 break; 1601 break;
1637 } 1602 }
1603 if (page)
1604 unlock_page(page);
1638 1605
1639 /* 1606 /*
1640 * We must evaluate after, since reads (unlike writes) 1607 * We must evaluate after, since reads (unlike writes)
@@ -1798,22 +1765,16 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, int mode,
1798static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 1765static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1799{ 1766{
1800 struct inode *inode = old_dentry->d_inode; 1767 struct inode *inode = old_dentry->d_inode;
1801 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1768 int ret;
1802 1769
1803 /* 1770 /*
1804 * No ordinary (disk based) filesystem counts links as inodes; 1771 * No ordinary (disk based) filesystem counts links as inodes;
1805 * but each new link needs a new dentry, pinning lowmem, and 1772 * but each new link needs a new dentry, pinning lowmem, and
1806 * tmpfs dentries cannot be pruned until they are unlinked. 1773 * tmpfs dentries cannot be pruned until they are unlinked.
1807 */ 1774 */
1808 if (sbinfo->max_inodes) { 1775 ret = shmem_reserve_inode(inode->i_sb);
1809 spin_lock(&sbinfo->stat_lock); 1776 if (ret)
1810 if (!sbinfo->free_inodes) { 1777 goto out;
1811 spin_unlock(&sbinfo->stat_lock);
1812 return -ENOSPC;
1813 }
1814 sbinfo->free_inodes--;
1815 spin_unlock(&sbinfo->stat_lock);
1816 }
1817 1778
1818 dir->i_size += BOGO_DIRENT_SIZE; 1779 dir->i_size += BOGO_DIRENT_SIZE;
1819 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1780 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1821,21 +1782,16 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1821 atomic_inc(&inode->i_count); /* New dentry reference */ 1782 atomic_inc(&inode->i_count); /* New dentry reference */
1822 dget(dentry); /* Extra pinning count for the created dentry */ 1783 dget(dentry); /* Extra pinning count for the created dentry */
1823 d_instantiate(dentry, inode); 1784 d_instantiate(dentry, inode);
1824 return 0; 1785out:
1786 return ret;
1825} 1787}
1826 1788
1827static int shmem_unlink(struct inode *dir, struct dentry *dentry) 1789static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1828{ 1790{
1829 struct inode *inode = dentry->d_inode; 1791 struct inode *inode = dentry->d_inode;
1830 1792
1831 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) { 1793 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
1832 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 1794 shmem_free_inode(inode->i_sb);
1833 if (sbinfo->max_inodes) {
1834 spin_lock(&sbinfo->stat_lock);
1835 sbinfo->free_inodes++;
1836 spin_unlock(&sbinfo->stat_lock);
1837 }
1838 }
1839 1795
1840 dir->i_size -= BOGO_DIRENT_SIZE; 1796 dir->i_size -= BOGO_DIRENT_SIZE;
1841 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1797 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1924,6 +1880,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1924 iput(inode); 1880 iput(inode);
1925 return error; 1881 return error;
1926 } 1882 }
1883 unlock_page(page);
1927 inode->i_op = &shmem_symlink_inode_operations; 1884 inode->i_op = &shmem_symlink_inode_operations;
1928 kaddr = kmap_atomic(page, KM_USER0); 1885 kaddr = kmap_atomic(page, KM_USER0);
1929 memcpy(kaddr, symname, len); 1886 memcpy(kaddr, symname, len);
@@ -1951,6 +1908,8 @@ static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1951 struct page *page = NULL; 1908 struct page *page = NULL;
1952 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1909 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
1953 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1910 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
1911 if (page)
1912 unlock_page(page);
1954 return page; 1913 return page;
1955} 1914}
1956 1915
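Note: several hunks above add an unlock_page() after calls to shmem_getpage(), because with this series shmem_getpage() returns its page locked for every sgp type (the SGP_FAULT special case is gone). The helper below is a hypothetical illustration of the resulting caller pattern, not code from this patch; it relies only on the shmem_getpage() signature shown earlier in this file.

/* Hypothetical caller: the returned page is locked and referenced, so the
 * caller must unlock_page() and eventually page_cache_release() it.
 * SGP_READ may legitimately return *pagep == NULL for a hole. */
static int shmem_probe_page(struct inode *inode, unsigned long idx)
{
	struct page *page = NULL;
	int error;

	error = shmem_getpage(inode, idx, &page, SGP_READ, NULL);
	if (error)
		return error;
	if (page) {
		unlock_page(page);
		/* ... read or copy from the page here ... */
		page_cache_release(page);
	}
	return 0;
}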
@@ -1996,8 +1955,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name,
1996{ 1955{
1997 if (strcmp(name, "") == 0) 1956 if (strcmp(name, "") == 0)
1998 return -EINVAL; 1957 return -EINVAL;
1999 return security_inode_getsecurity(inode, name, buffer, size, 1958 return xattr_getsecurity(inode, name, buffer, size);
2000 -EOPNOTSUPP);
2001} 1959}
2002 1960
2003static int shmem_xattr_security_set(struct inode *inode, const char *name, 1961static int shmem_xattr_security_set(struct inode *inode, const char *name,
@@ -2138,7 +2096,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
2138 } 2096 }
2139 if (*rest) 2097 if (*rest)
2140 goto bad_val; 2098 goto bad_val;
2141 *blocks = size >> PAGE_CACHE_SHIFT; 2099 *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
2142 } else if (!strcmp(this_char,"nr_blocks")) { 2100 } else if (!strcmp(this_char,"nr_blocks")) {
2143 *blocks = memparse(value,&rest); 2101 *blocks = memparse(value,&rest);
2144 if (*rest) 2102 if (*rest)
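Note: the size= parsing hunk above switches from a right shift, which silently rounded the requested size down to whole pages, to DIV_ROUND_UP, so a partial page now costs a full block. A tiny stand-alone check of the arithmetic, assuming a 4096-byte PAGE_CACHE_SIZE (the constants below are local stand-ins, not the kernel's):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12				/* assume 4 KiB pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long size = (1UL << 20) + 1;	/* "size=" of 1 MiB plus one byte */

	printf("old: %lu blocks\n", size >> PAGE_CACHE_SHIFT);		  /* 256 */
	printf("new: %lu blocks\n", DIV_ROUND_UP(size, PAGE_CACHE_SIZE)); /* 257 */
	return 0;
}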
@@ -2375,7 +2333,8 @@ static const struct file_operations shmem_file_operations = {
2375#ifdef CONFIG_TMPFS 2333#ifdef CONFIG_TMPFS
2376 .llseek = generic_file_llseek, 2334 .llseek = generic_file_llseek,
2377 .read = shmem_file_read, 2335 .read = shmem_file_read,
2378 .write = shmem_file_write, 2336 .write = do_sync_write,
2337 .aio_write = generic_file_aio_write,
2379 .fsync = simple_sync_file, 2338 .fsync = simple_sync_file,
2380 .splice_read = generic_file_splice_read, 2339 .splice_read = generic_file_splice_read,
2381 .splice_write = generic_file_splice_write, 2340 .splice_write = generic_file_splice_write,
diff --git a/mm/slob.c b/mm/slob.c
index 773a7aa80ab5..e2c3c0ec5463 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -12,10 +12,17 @@
12 * allocator is as little as 2 bytes, however typically most architectures 12 * allocator is as little as 2 bytes, however typically most architectures
13 * will require 4 bytes on 32-bit and 8 bytes on 64-bit. 13 * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
14 * 14 *
15 * The slob heap is a linked list of pages from alloc_pages(), and 15 * The slob heap is a set of linked list of pages from alloc_pages(),
16 * within each page, there is a singly-linked list of free blocks (slob_t). 16 * and within each page, there is a singly-linked list of free blocks
17 * The heap is grown on demand and allocation from the heap is currently 17 * (slob_t). The heap is grown on demand. To reduce fragmentation,
18 * first-fit. 18 * heap pages are segregated into three lists, with objects less than
19 * 256 bytes, objects less than 1024 bytes, and all other objects.
20 *
21 * Allocation from heap involves first searching for a page with
22 * sufficient free blocks (using a next-fit-like approach) followed by
23 * a first-fit scan of the page. Deallocation inserts objects back
24 * into the free list in address order, so this is effectively an
25 * address-ordered first fit.
19 * 26 *
20 * Above this is an implementation of kmalloc/kfree. Blocks returned 27 * Above this is an implementation of kmalloc/kfree. Blocks returned
21 * from kmalloc are prepended with a 4-byte header with the kmalloc size. 28 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
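Note: the rewritten header comment describes SLOB's new segregated free lists: pages are kept on one of three lists chosen purely by the requested size against two break points. The user-space sketch below shows just that size-class selection; the enum and function names are made up, while the break values 256 and 1024 are the SLOB_BREAK1/SLOB_BREAK2 constants introduced later in this patch.

#include <stdio.h>

#define SLOB_BREAK1 256
#define SLOB_BREAK2 1024

enum slob_list { SLOB_SMALL, SLOB_MEDIUM, SLOB_LARGE };

/* Mirrors the list selection added to slob_alloc() below. */
static enum slob_list slob_list_for(size_t size)
{
	if (size < SLOB_BREAK1)
		return SLOB_SMALL;
	else if (size < SLOB_BREAK2)
		return SLOB_MEDIUM;
	return SLOB_LARGE;
}

int main(void)
{
	const size_t sizes[] = { 8, 255, 256, 1023, 1024, 4000 };
	static const char *names[] = { "small", "medium", "large" };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%4zu bytes -> %s list\n", sizes[i],
		       names[slob_list_for(sizes[i])]);
	return 0;
}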
@@ -110,9 +117,13 @@ static inline void free_slob_page(struct slob_page *sp)
110} 117}
111 118
112/* 119/*
113 * All (partially) free slob pages go on this list. 120 * All partially free slob pages go on these lists.
114 */ 121 */
115static LIST_HEAD(free_slob_pages); 122#define SLOB_BREAK1 256
123#define SLOB_BREAK2 1024
124static LIST_HEAD(free_slob_small);
125static LIST_HEAD(free_slob_medium);
126static LIST_HEAD(free_slob_large);
116 127
117/* 128/*
118 * slob_page: True for all slob pages (false for bigblock pages) 129 * slob_page: True for all slob pages (false for bigblock pages)
@@ -140,9 +151,9 @@ static inline int slob_page_free(struct slob_page *sp)
140 return test_bit(PG_private, &sp->flags); 151 return test_bit(PG_private, &sp->flags);
141} 152}
142 153
143static inline void set_slob_page_free(struct slob_page *sp) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
144{ 155{
145 list_add(&sp->list, &free_slob_pages); 156 list_add(&sp->list, list);
146 __set_bit(PG_private, &sp->flags); 157 __set_bit(PG_private, &sp->flags);
147} 158}
148 159
@@ -294,12 +305,20 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
294{ 305{
295 struct slob_page *sp; 306 struct slob_page *sp;
296 struct list_head *prev; 307 struct list_head *prev;
308 struct list_head *slob_list;
297 slob_t *b = NULL; 309 slob_t *b = NULL;
298 unsigned long flags; 310 unsigned long flags;
299 311
312 if (size < SLOB_BREAK1)
313 slob_list = &free_slob_small;
314 else if (size < SLOB_BREAK2)
315 slob_list = &free_slob_medium;
316 else
317 slob_list = &free_slob_large;
318
300 spin_lock_irqsave(&slob_lock, flags); 319 spin_lock_irqsave(&slob_lock, flags);
301 /* Iterate through each partially free page, try to find room */ 320 /* Iterate through each partially free page, try to find room */
302 list_for_each_entry(sp, &free_slob_pages, list) { 321 list_for_each_entry(sp, slob_list, list) {
303#ifdef CONFIG_NUMA 322#ifdef CONFIG_NUMA
304 /* 323 /*
305 * If there's a node specification, search for a partial 324 * If there's a node specification, search for a partial
@@ -321,9 +340,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
321 /* Improve fragment distribution and reduce our average 340 /* Improve fragment distribution and reduce our average
322 * search time by starting our next search here. (see 341 * search time by starting our next search here. (see
323 * Knuth vol 1, sec 2.5, pg 449) */ 342 * Knuth vol 1, sec 2.5, pg 449) */
324 if (prev != free_slob_pages.prev && 343 if (prev != slob_list->prev &&
325 free_slob_pages.next != prev->next) 344 slob_list->next != prev->next)
326 list_move_tail(&free_slob_pages, prev->next); 345 list_move_tail(slob_list, prev->next);
327 break; 346 break;
328 } 347 }
329 spin_unlock_irqrestore(&slob_lock, flags); 348 spin_unlock_irqrestore(&slob_lock, flags);
@@ -341,7 +360,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
341 sp->free = b; 360 sp->free = b;
342 INIT_LIST_HEAD(&sp->list); 361 INIT_LIST_HEAD(&sp->list);
343 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 362 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
344 set_slob_page_free(sp); 363 set_slob_page_free(sp, slob_list);
345 b = slob_page_alloc(sp, size, align); 364 b = slob_page_alloc(sp, size, align);
346 BUG_ON(!b); 365 BUG_ON(!b);
347 spin_unlock_irqrestore(&slob_lock, flags); 366 spin_unlock_irqrestore(&slob_lock, flags);
@@ -387,7 +406,7 @@ static void slob_free(void *block, int size)
387 set_slob(b, units, 406 set_slob(b, units,
388 (void *)((unsigned long)(b + 407 (void *)((unsigned long)(b +
389 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 408 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
390 set_slob_page_free(sp); 409 set_slob_page_free(sp, &free_slob_small);
391 goto out; 410 goto out;
392 } 411 }
393 412
@@ -398,6 +417,10 @@ static void slob_free(void *block, int size)
398 sp->units += units; 417 sp->units += units;
399 418
400 if (b < sp->free) { 419 if (b < sp->free) {
420 if (b + units == sp->free) {
421 units += slob_units(sp->free);
422 sp->free = slob_next(sp->free);
423 }
401 set_slob(b, units, sp->free); 424 set_slob(b, units, sp->free);
402 sp->free = b; 425 sp->free = b;
403 } else { 426 } else {
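Note: the last slob.c hunk adds forward coalescing on free: if the block being freed ends exactly where the current head of the page's free list begins, the two are merged before the freed block becomes the new head. The toy model below illustrates only that merge; it represents blocks as (offset, units) pairs with invented names, whereas real SLOB encodes sizes and next offsets in slob_t units inside the page and keeps the full list.

#include <stdio.h>

/* Toy free-block descriptor: offset and size, both in SLOB units. */
struct blk {
	int off;
	int units;
};

/* When the freed block is immediately followed by the current first free
 * block, absorb that block before making the freed one the new head. */
static struct blk free_head(struct blk freed, struct blk old_head, int *absorbed)
{
	*absorbed = 0;
	if (freed.off + freed.units == old_head.off) {
		freed.units += old_head.units;
		*absorbed = 1;		/* old head is no longer a separate block */
	}
	return freed;			/* freed block becomes the new list head */
}

int main(void)
{
	struct blk old_head = { .off = 40, .units = 8 };
	struct blk freed = { .off = 32, .units = 8 };
	int absorbed;
	struct blk head = free_head(freed, old_head, &absorbed);

	printf("new head: off=%d units=%d absorbed=%d\n",
	       head.off, head.units, absorbed);	/* off=32 units=16 absorbed=1 */
	return 0;
}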
diff --git a/mm/sparse.c b/mm/sparse.c
index a2183cb5d524..f6a43c09c322 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -237,7 +237,7 @@ static unsigned long *__kmalloc_section_usemap(void)
237} 237}
238#endif /* CONFIG_MEMORY_HOTPLUG */ 238#endif /* CONFIG_MEMORY_HOTPLUG */
239 239
240static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) 240static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
241{ 241{
242 unsigned long *usemap; 242 unsigned long *usemap;
243 struct mem_section *ms = __nr_to_section(pnum); 243 struct mem_section *ms = __nr_to_section(pnum);
@@ -353,17 +353,9 @@ static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
353 return __kmalloc_section_memmap(nr_pages); 353 return __kmalloc_section_memmap(nr_pages);
354} 354}
355 355
356static int vaddr_in_vmalloc_area(void *addr)
357{
358 if (addr >= (void *)VMALLOC_START &&
359 addr < (void *)VMALLOC_END)
360 return 1;
361 return 0;
362}
363
364static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) 356static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
365{ 357{
366 if (vaddr_in_vmalloc_area(memmap)) 358 if (is_vmalloc_addr(memmap))
367 vfree(memmap); 359 vfree(memmap);
368 else 360 else
369 free_pages((unsigned long)memmap, 361 free_pages((unsigned long)memmap,
diff --git a/mm/swap.c b/mm/swap.c
index 9ac88323d237..57b7e25a939c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -41,7 +41,7 @@ static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
41 * This path almost never happens for VM activity - pages are normally 41 * This path almost never happens for VM activity - pages are normally
42 * freed via pagevecs. But it gets used by networking. 42 * freed via pagevecs. But it gets used by networking.
43 */ 43 */
44static void fastcall __page_cache_release(struct page *page) 44static void __page_cache_release(struct page *page)
45{ 45{
46 if (PageLRU(page)) { 46 if (PageLRU(page)) {
47 unsigned long flags; 47 unsigned long flags;
@@ -165,7 +165,7 @@ int rotate_reclaimable_page(struct page *page)
165/* 165/*
166 * FIXME: speed this up? 166 * FIXME: speed this up?
167 */ 167 */
168void fastcall activate_page(struct page *page) 168void activate_page(struct page *page)
169{ 169{
170 struct zone *zone = page_zone(page); 170 struct zone *zone = page_zone(page);
171 171
@@ -186,7 +186,7 @@ void fastcall activate_page(struct page *page)
186 * inactive,referenced -> active,unreferenced 186 * inactive,referenced -> active,unreferenced
187 * active,unreferenced -> active,referenced 187 * active,unreferenced -> active,referenced
188 */ 188 */
189void fastcall mark_page_accessed(struct page *page) 189void mark_page_accessed(struct page *page)
190{ 190{
191 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { 191 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
192 activate_page(page); 192 activate_page(page);
@@ -202,7 +202,7 @@ EXPORT_SYMBOL(mark_page_accessed);
202 * lru_cache_add: add a page to the page lists 202 * lru_cache_add: add a page to the page lists
203 * @page: the page to add 203 * @page: the page to add
204 */ 204 */
205void fastcall lru_cache_add(struct page *page) 205void lru_cache_add(struct page *page)
206{ 206{
207 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 207 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
208 208
@@ -212,7 +212,7 @@ void fastcall lru_cache_add(struct page *page)
212 put_cpu_var(lru_add_pvecs); 212 put_cpu_var(lru_add_pvecs);
213} 213}
214 214
215void fastcall lru_cache_add_active(struct page *page) 215void lru_cache_add_active(struct page *page)
216{ 216{
217 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); 217 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
218 218
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b52635601dfe..ec42f01a8d02 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -10,6 +10,7 @@
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/swapops.h>
13#include <linux/init.h> 14#include <linux/init.h>
14#include <linux/pagemap.h> 15#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
@@ -51,26 +52,22 @@ static struct {
51 unsigned long del_total; 52 unsigned long del_total;
52 unsigned long find_success; 53 unsigned long find_success;
53 unsigned long find_total; 54 unsigned long find_total;
54 unsigned long noent_race;
55 unsigned long exist_race;
56} swap_cache_info; 55} swap_cache_info;
57 56
58void show_swap_cache_info(void) 57void show_swap_cache_info(void)
59{ 58{
60 printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n", 59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n",
61 swap_cache_info.add_total, swap_cache_info.del_total, 60 swap_cache_info.add_total, swap_cache_info.del_total,
62 swap_cache_info.find_success, swap_cache_info.find_total, 61 swap_cache_info.find_success, swap_cache_info.find_total);
63 swap_cache_info.noent_race, swap_cache_info.exist_race);
64 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
65 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
66} 64}
67 65
68/* 66/*
69 * __add_to_swap_cache resembles add_to_page_cache on swapper_space, 67 * add_to_swap_cache resembles add_to_page_cache on swapper_space,
70 * but sets SwapCache flag and private instead of mapping and index. 68 * but sets SwapCache flag and private instead of mapping and index.
71 */ 69 */
72static int __add_to_swap_cache(struct page *page, swp_entry_t entry, 70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
73 gfp_t gfp_mask)
74{ 71{
75 int error; 72 int error;
76 73
@@ -88,6 +85,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
88 set_page_private(page, entry.val); 85 set_page_private(page, entry.val);
89 total_swapcache_pages++; 86 total_swapcache_pages++;
90 __inc_zone_page_state(page, NR_FILE_PAGES); 87 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total);
91 } 89 }
92 write_unlock_irq(&swapper_space.tree_lock); 90 write_unlock_irq(&swapper_space.tree_lock);
93 radix_tree_preload_end(); 91 radix_tree_preload_end();
@@ -95,31 +93,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
95 return error; 93 return error;
96} 94}
97 95
98static int add_to_swap_cache(struct page *page, swp_entry_t entry)
99{
100 int error;
101
102 BUG_ON(PageLocked(page));
103 if (!swap_duplicate(entry)) {
104 INC_CACHE_INFO(noent_race);
105 return -ENOENT;
106 }
107 SetPageLocked(page);
108 error = __add_to_swap_cache(page, entry, GFP_KERNEL);
109 /*
110 * Anon pages are already on the LRU, we don't run lru_cache_add here.
111 */
112 if (error) {
113 ClearPageLocked(page);
114 swap_free(entry);
115 if (error == -EEXIST)
116 INC_CACHE_INFO(exist_race);
117 return error;
118 }
119 INC_CACHE_INFO(add_total);
120 return 0;
121}
122
123/* 96/*
124 * This must be called only on pages that have 97 * This must be called only on pages that have
125 * been verified to be in the swap cache. 98 * been verified to be in the swap cache.
@@ -152,6 +125,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
152 int err; 125 int err;
153 126
154 BUG_ON(!PageLocked(page)); 127 BUG_ON(!PageLocked(page));
128 BUG_ON(!PageUptodate(page));
155 129
156 for (;;) { 130 for (;;) {
157 entry = get_swap_page(); 131 entry = get_swap_page();
@@ -169,18 +143,15 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
169 /* 143 /*
170 * Add it to the swap cache and mark it dirty 144 * Add it to the swap cache and mark it dirty
171 */ 145 */
172 err = __add_to_swap_cache(page, entry, 146 err = add_to_swap_cache(page, entry,
173 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 147 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
174 148
175 switch (err) { 149 switch (err) {
176 case 0: /* Success */ 150 case 0: /* Success */
177 SetPageUptodate(page);
178 SetPageDirty(page); 151 SetPageDirty(page);
179 INC_CACHE_INFO(add_total);
180 return 1; 152 return 1;
181 case -EEXIST: 153 case -EEXIST:
182 /* Raced with "speculative" read_swap_cache_async */ 154 /* Raced with "speculative" read_swap_cache_async */
183 INC_CACHE_INFO(exist_race);
184 swap_free(entry); 155 swap_free(entry);
185 continue; 156 continue;
186 default: 157 default:
@@ -211,40 +182,6 @@ void delete_from_swap_cache(struct page *page)
211 page_cache_release(page); 182 page_cache_release(page);
212} 183}
213 184
214/*
215 * Strange swizzling function only for use by shmem_writepage
216 */
217int move_to_swap_cache(struct page *page, swp_entry_t entry)
218{
219 int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
220 if (!err) {
221 remove_from_page_cache(page);
222 page_cache_release(page); /* pagecache ref */
223 if (!swap_duplicate(entry))
224 BUG();
225 SetPageDirty(page);
226 INC_CACHE_INFO(add_total);
227 } else if (err == -EEXIST)
228 INC_CACHE_INFO(exist_race);
229 return err;
230}
231
232/*
233 * Strange swizzling function for shmem_getpage (and shmem_unuse)
234 */
235int move_from_swap_cache(struct page *page, unsigned long index,
236 struct address_space *mapping)
237{
238 int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
239 if (!err) {
240 delete_from_swap_cache(page);
241 /* shift page from clean_pages to dirty_pages list */
242 ClearPageDirty(page);
243 set_page_dirty(page);
244 }
245 return err;
246}
247
248/* 185/*
249 * If we are the only user, then try to free up the swap cache. 186 * If we are the only user, then try to free up the swap cache.
250 * 187 *
@@ -317,7 +254,7 @@ struct page * lookup_swap_cache(swp_entry_t entry)
317 * A failure return means that either the page allocation failed or that 254 * A failure return means that either the page allocation failed or that
318 * the swap entry is no longer in use. 255 * the swap entry is no longer in use.
319 */ 256 */
320struct page *read_swap_cache_async(swp_entry_t entry, 257struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
321 struct vm_area_struct *vma, unsigned long addr) 258 struct vm_area_struct *vma, unsigned long addr)
322{ 259{
323 struct page *found_page, *new_page = NULL; 260 struct page *found_page, *new_page = NULL;
@@ -337,23 +274,27 @@ struct page *read_swap_cache_async(swp_entry_t entry,
337 * Get a new page to read into from swap. 274 * Get a new page to read into from swap.
338 */ 275 */
339 if (!new_page) { 276 if (!new_page) {
340 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, 277 new_page = alloc_page_vma(gfp_mask, vma, addr);
341 vma, addr);
342 if (!new_page) 278 if (!new_page)
343 break; /* Out of memory */ 279 break; /* Out of memory */
344 } 280 }
345 281
346 /* 282 /*
283 * Swap entry may have been freed since our caller observed it.
284 */
285 if (!swap_duplicate(entry))
286 break;
287
288 /*
347 * Associate the page with swap entry in the swap cache. 289 * Associate the page with swap entry in the swap cache.
348 * May fail (-ENOENT) if swap entry has been freed since 290 * May fail (-EEXIST) if there is already a page associated
349 * our caller observed it. May fail (-EEXIST) if there 291 * with this entry in the swap cache: added by a racing
350 * is already a page associated with this entry in the 292 * read_swap_cache_async, or add_to_swap or shmem_writepage
351 * swap cache: added by a racing read_swap_cache_async, 293 * re-using the just freed swap entry for an existing page.
352 * or by try_to_swap_out (or shmem_writepage) re-using
353 * the just freed swap entry for an existing page.
354 * May fail (-ENOMEM) if radix-tree node allocation failed. 294 * May fail (-ENOMEM) if radix-tree node allocation failed.
355 */ 295 */
356 err = add_to_swap_cache(new_page, entry); 296 SetPageLocked(new_page);
297 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
357 if (!err) { 298 if (!err) {
358 /* 299 /*
359 * Initiate read into locked page and return. 300 * Initiate read into locked page and return.
@@ -362,9 +303,57 @@ struct page *read_swap_cache_async(swp_entry_t entry,
362 swap_readpage(NULL, new_page); 303 swap_readpage(NULL, new_page);
363 return new_page; 304 return new_page;
364 } 305 }
365 } while (err != -ENOENT && err != -ENOMEM); 306 ClearPageLocked(new_page);
307 swap_free(entry);
308 } while (err != -ENOMEM);
366 309
367 if (new_page) 310 if (new_page)
368 page_cache_release(new_page); 311 page_cache_release(new_page);
369 return found_page; 312 return found_page;
370} 313}
314
315/**
316 * swapin_readahead - swap in pages in hope we need them soon
317 * @entry: swap entry of this memory
318 * @vma: user vma this address belongs to
319 * @addr: target address for mempolicy
320 *
321 * Returns the struct page for entry and addr, after queueing swapin.
322 *
323 * Primitive swap readahead code. We simply read an aligned block of
324 * (1 << page_cluster) entries in the swap area. This method is chosen
325 * because it doesn't cost us any seek time. We also make sure to queue
326 * the 'original' request together with the readahead ones...
327 *
328 * This has been extended to use the NUMA policies from the mm triggering
329 * the readahead.
330 *
331 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
332 */
333struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
334 struct vm_area_struct *vma, unsigned long addr)
335{
336 int nr_pages;
337 struct page *page;
338 unsigned long offset;
339 unsigned long end_offset;
340
341 /*
342 * Get starting offset for readaround, and number of pages to read.
343 * Adjust starting address by readbehind (for NUMA interleave case)?
344 * No, it's very unlikely that swap layout would follow vma layout,
345 * more likely that neighbouring swap pages came from the same node:
346 * so use the same "addr" to choose the same node for each swap read.
347 */
348 nr_pages = valid_swaphandles(entry, &offset);
349 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
350 /* Ok, do the async read-ahead now */
351 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
352 gfp_mask, vma, addr);
353 if (!page)
354 break;
355 page_cache_release(page);
356 }
357 lru_add_drain(); /* Push any new pages onto the LRU now */
358 return read_swap_cache_async(entry, gfp_mask, vma, addr);
359}
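
The swapin_readahead() comment above describes reading an aligned block of (1 << page_cluster) entries around the faulting entry. Below is a minimal userspace model of just that window arithmetic; the page_cluster value and target offset are illustrative, not taken from the patch, and in the kernel the window is further trimmed by valid_swaphandles() (see the swapfile.c hunk below), which stops at free or bad slots.

/* Userspace model of the swap readahead window arithmetic. */
#include <stdio.h>

int main(void)
{
	unsigned long page_cluster = 3;          /* assumed: 2^3 = 8-page cluster */
	unsigned long target = 37;               /* assumed faulting swap offset */

	/* Align the window to a (1 << page_cluster) boundary, as the comment describes. */
	unsigned long base = (target >> page_cluster) << page_cluster;
	unsigned long end = base + (1UL << page_cluster);

	if (base == 0)
		base++;                          /* slot 0 holds the swap header */

	printf("readahead window for offset %lu: [%lu, %lu)\n", target, base, end);
	for (unsigned long off = base; off < end; off++)
		printf("  would read swap offset %lu%s\n",
		       off, off == target ? " (target)" : "");
	return 0;
}
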
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f071648e1360..eade24da9310 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -506,9 +506,19 @@ unsigned int count_swap_pages(int type, int free)
506 * just let do_wp_page work it out if a write is requested later - to 506 * just let do_wp_page work it out if a write is requested later - to
507 * force COW, vm_page_prot omits write permission from any private vma. 507 * force COW, vm_page_prot omits write permission from any private vma.
508 */ 508 */
509static void unuse_pte(struct vm_area_struct *vma, pte_t *pte, 509static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
510 unsigned long addr, swp_entry_t entry, struct page *page) 510 unsigned long addr, swp_entry_t entry, struct page *page)
511{ 511{
512 spinlock_t *ptl;
513 pte_t *pte;
514 int found = 1;
515
516 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
517 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
518 found = 0;
519 goto out;
520 }
521
512 inc_mm_counter(vma->vm_mm, anon_rss); 522 inc_mm_counter(vma->vm_mm, anon_rss);
513 get_page(page); 523 get_page(page);
514 set_pte_at(vma->vm_mm, addr, pte, 524 set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +530,9 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
520 * immediately swapped out again after swapon. 530 * immediately swapped out again after swapon.
521 */ 531 */
522 activate_page(page); 532 activate_page(page);
533out:
534 pte_unmap_unlock(pte, ptl);
535 return found;
523} 536}
524 537
525static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 538static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -528,22 +541,33 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
528{ 541{
529 pte_t swp_pte = swp_entry_to_pte(entry); 542 pte_t swp_pte = swp_entry_to_pte(entry);
530 pte_t *pte; 543 pte_t *pte;
531 spinlock_t *ptl;
532 int found = 0; 544 int found = 0;
533 545
534 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 546 /*
547 * We don't actually need pte lock while scanning for swp_pte: since
548 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
549 * page table while we're scanning; though it could get zapped, and on
550 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
551 * of unmatched parts which look like swp_pte, so unuse_pte must
552 * recheck under pte lock. Scanning without pte lock lets it be
553 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
554 */
555 pte = pte_offset_map(pmd, addr);
535 do { 556 do {
536 /* 557 /*
537 * swapoff spends a _lot_ of time in this loop! 558 * swapoff spends a _lot_ of time in this loop!
538 * Test inline before going to call unuse_pte. 559 * Test inline before going to call unuse_pte.
539 */ 560 */
540 if (unlikely(pte_same(*pte, swp_pte))) { 561 if (unlikely(pte_same(*pte, swp_pte))) {
541 unuse_pte(vma, pte++, addr, entry, page); 562 pte_unmap(pte);
542 found = 1; 563 found = unuse_pte(vma, pmd, addr, entry, page);
543 break; 564 if (found)
565 goto out;
566 pte = pte_offset_map(pmd, addr);
544 } 567 }
545 } while (pte++, addr += PAGE_SIZE, addr != end); 568 } while (pte++, addr += PAGE_SIZE, addr != end);
546 pte_unmap_unlock(pte - 1, ptl); 569 pte_unmap(pte - 1);
570out:
547 return found; 571 return found;
548} 572}
549 573
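
The comment added to unuse_pte_range() above describes a scan-without-lock pattern: look for the candidate pte with no lock held, then have unuse_pte() retake the pte lock and recheck pte_same() before modifying anything. A minimal userspace sketch of that pattern follows, with a pthread mutex standing in for the pte lock and a plain array standing in for the page table; every name here is illustrative, not a kernel API.

/* Userspace sketch: lockless scan, recheck under lock before modifying. */
#include <pthread.h>
#include <stdio.h>

#define NSLOTS 16

static long table[NSLOTS];                  /* stands in for one pte page */
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Recheck under the lock; only replace if the slot still holds 'match'. */
static int replace_slot(int idx, long match, long newval)
{
	int found = 1;

	pthread_mutex_lock(&table_lock);
	if (table[idx] != match) {              /* raced: value changed under us */
		found = 0;
		goto out;
	}
	table[idx] = newval;
out:
	pthread_mutex_unlock(&table_lock);
	return found;
}

int main(void)
{
	long match = 42, newval = 99;

	table[7] = match;

	/* Scan without the lock: cheap and preemptible, but only a hint. */
	for (int i = 0; i < NSLOTS; i++) {
		if (table[i] != match)
			continue;
		if (replace_slot(i, match, newval))
			printf("slot %d replaced under lock\n", i);
		else
			printf("slot %d changed before we locked, keep scanning\n", i);
	}
	return 0;
}
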
@@ -730,7 +754,8 @@ static int try_to_unuse(unsigned int type)
730 */ 754 */
731 swap_map = &si->swap_map[i]; 755 swap_map = &si->swap_map[i];
732 entry = swp_entry(type, i); 756 entry = swp_entry(type, i);
733 page = read_swap_cache_async(entry, NULL, 0); 757 page = read_swap_cache_async(entry,
758 GFP_HIGHUSER_MOVABLE, NULL, 0);
734 if (!page) { 759 if (!page) {
735 /* 760 /*
736 * Either swap_duplicate() failed because entry 761 * Either swap_duplicate() failed because entry
@@ -789,7 +814,7 @@ static int try_to_unuse(unsigned int type)
789 atomic_inc(&new_start_mm->mm_users); 814 atomic_inc(&new_start_mm->mm_users);
790 atomic_inc(&prev_mm->mm_users); 815 atomic_inc(&prev_mm->mm_users);
791 spin_lock(&mmlist_lock); 816 spin_lock(&mmlist_lock);
792 while (*swap_map > 1 && !retval && 817 while (*swap_map > 1 && !retval && !shmem &&
793 (p = p->next) != &start_mm->mmlist) { 818 (p = p->next) != &start_mm->mmlist) {
794 mm = list_entry(p, struct mm_struct, mmlist); 819 mm = list_entry(p, struct mm_struct, mmlist);
795 if (!atomic_inc_not_zero(&mm->mm_users)) 820 if (!atomic_inc_not_zero(&mm->mm_users))
@@ -821,6 +846,13 @@ static int try_to_unuse(unsigned int type)
821 mmput(start_mm); 846 mmput(start_mm);
822 start_mm = new_start_mm; 847 start_mm = new_start_mm;
823 } 848 }
849 if (shmem) {
850 /* page has already been unlocked and released */
851 if (shmem > 0)
852 continue;
853 retval = shmem;
854 break;
855 }
824 if (retval) { 856 if (retval) {
825 unlock_page(page); 857 unlock_page(page);
826 page_cache_release(page); 858 page_cache_release(page);
@@ -859,12 +891,6 @@ static int try_to_unuse(unsigned int type)
859 * read from disk into another page. Splitting into two 891 * read from disk into another page. Splitting into two
860 * pages would be incorrect if swap supported "shared 892 * pages would be incorrect if swap supported "shared
861 * private" pages, but they are handled by tmpfs files. 893 * private" pages, but they are handled by tmpfs files.
862 *
863 * Note shmem_unuse already deleted a swappage from
864 * the swap cache, unless the move to filepage failed:
865 * in which case it left swappage in cache, lowered its
866 * swap count to pass quickly through the loops above,
867 * and now we must reincrement count to try again later.
868 */ 894 */
869 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { 895 if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
870 struct writeback_control wbc = { 896 struct writeback_control wbc = {
@@ -875,12 +901,8 @@ static int try_to_unuse(unsigned int type)
875 lock_page(page); 901 lock_page(page);
876 wait_on_page_writeback(page); 902 wait_on_page_writeback(page);
877 } 903 }
878 if (PageSwapCache(page)) { 904 if (PageSwapCache(page))
879 if (shmem) 905 delete_from_swap_cache(page);
880 swap_duplicate(entry);
881 else
882 delete_from_swap_cache(page);
883 }
884 906
885 /* 907 /*
886 * So we could skip searching mms once swap count went 908 * So we could skip searching mms once swap count went
@@ -1768,31 +1790,48 @@ get_swap_info_struct(unsigned type)
1768 */ 1790 */
1769int valid_swaphandles(swp_entry_t entry, unsigned long *offset) 1791int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
1770{ 1792{
1793 struct swap_info_struct *si;
1771 int our_page_cluster = page_cluster; 1794 int our_page_cluster = page_cluster;
1772 int ret = 0, i = 1 << our_page_cluster; 1795 pgoff_t target, toff;
1773 unsigned long toff; 1796 pgoff_t base, end;
1774 struct swap_info_struct *swapdev = swp_type(entry) + swap_info; 1797 int nr_pages = 0;
1775 1798
1776 if (!our_page_cluster) /* no readahead */ 1799 if (!our_page_cluster) /* no readahead */
1777 return 0; 1800 return 0;
1778 toff = (swp_offset(entry) >> our_page_cluster) << our_page_cluster; 1801
1779 if (!toff) /* first page is swap header */ 1802 si = &swap_info[swp_type(entry)];
1780 toff++, i--; 1803 target = swp_offset(entry);
1781 *offset = toff; 1804 base = (target >> our_page_cluster) << our_page_cluster;
1805 end = base + (1 << our_page_cluster);
1806 if (!base) /* first page is swap header */
1807 base++;
1782 1808
1783 spin_lock(&swap_lock); 1809 spin_lock(&swap_lock);
1784 do { 1810 if (end > si->max) /* don't go beyond end of map */
1785 /* Don't read-ahead past the end of the swap area */ 1811 end = si->max;
1786 if (toff >= swapdev->max) 1812
1813 /* Count contiguous allocated slots above our target */
1814 for (toff = target; ++toff < end; nr_pages++) {
1815 /* Don't read in free or bad pages */
1816 if (!si->swap_map[toff])
1817 break;
1818 if (si->swap_map[toff] == SWAP_MAP_BAD)
1787 break; 1819 break;
1820 }
1821 /* Count contiguous allocated slots below our target */
1822 for (toff = target; --toff >= base; nr_pages++) {
1788 /* Don't read in free or bad pages */ 1823 /* Don't read in free or bad pages */
1789 if (!swapdev->swap_map[toff]) 1824 if (!si->swap_map[toff])
1790 break; 1825 break;
1791 if (swapdev->swap_map[toff] == SWAP_MAP_BAD) 1826 if (si->swap_map[toff] == SWAP_MAP_BAD)
1792 break; 1827 break;
1793 toff++; 1828 }
1794 ret++;
1795 } while (--i);
1796 spin_unlock(&swap_lock); 1829 spin_unlock(&swap_lock);
1797 return ret; 1830
1831 /*
1832 * Indicate starting offset, and return number of pages to get:
1833 * if only 1, say 0, since there's then no readahead to be done.
1834 */
1835 *offset = ++toff;
1836 return nr_pages? ++nr_pages: 0;
1798} 1837}
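
The rewritten valid_swaphandles() above no longer returns a fixed-size cluster: it counts contiguous allocated slots above and below the target inside the aligned window and reports the lowest offset plus the count. Below is a small userspace model of that counting; the swap_map contents, its size, and the SWAP_MAP_BAD marker are invented purely for illustration.

/* Userspace model of counting contiguous allocated swap slots around a target. */
#include <stdio.h>

#define SWAP_MAP_BAD 0x7fff
#define MAX_SLOTS    32

static unsigned short swap_map[MAX_SLOTS];  /* 0 = free, SWAP_MAP_BAD = bad, else in use */

static int readahead_window(unsigned long target, int page_cluster,
			    unsigned long *offset)
{
	unsigned long base = (target >> page_cluster) << page_cluster;
	unsigned long end = base + (1UL << page_cluster);
	unsigned long toff;
	int nr_pages = 0;

	if (!base)
		base++;                         /* first slot is the swap header */
	if (end > MAX_SLOTS)
		end = MAX_SLOTS;                /* don't go beyond end of map */

	/* Count contiguous allocated slots above the target... */
	for (toff = target; ++toff < end; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == SWAP_MAP_BAD)
			break;
	/* ...and below it. */
	for (toff = target; --toff >= base; nr_pages++)
		if (!swap_map[toff] || swap_map[toff] == SWAP_MAP_BAD)
			break;

	/* Report the lowest offset; a window of 1 means no readahead at all. */
	*offset = ++toff;
	return nr_pages ? ++nr_pages : 0;
}

int main(void)
{
	unsigned long start;
	int n;

	for (int i = 8; i <= 13; i++)           /* pretend slots 8..13 are allocated */
		swap_map[i] = 1;
	swap_map[11] = SWAP_MAP_BAD;            /* ...except a bad one in the middle */

	n = readahead_window(10, 3, &start);
	printf("read %d page(s) starting at offset %lu\n", n, start);
	return 0;
}
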
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index d436a9c82db7..702083638c16 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -121,18 +121,6 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
121 return 0; 121 return 0;
122} 122}
123 123
124#if 0
125int shmem_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128#ifndef CONFIG_MMU
129 return ramfs_nommu_mmap(file, vma);
130#else
131 return 0;
132#endif
133}
134#endif /* 0 */
135
136#ifndef CONFIG_MMU 124#ifndef CONFIG_MMU
137unsigned long shmem_get_unmapped_area(struct file *file, 125unsigned long shmem_get_unmapped_area(struct file *file,
138 unsigned long addr, 126 unsigned long addr,
diff --git a/mm/truncate.c b/mm/truncate.c
index c3123b08ff6d..c35c49e54fb6 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -48,7 +48,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
48 48
49static inline void truncate_partial_page(struct page *page, unsigned partial) 49static inline void truncate_partial_page(struct page *page, unsigned partial)
50{ 50{
51 zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); 51 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
52 if (PagePrivate(page)) 52 if (PagePrivate(page))
53 do_invalidatepage(page, partial); 53 do_invalidatepage(page, partial);
54} 54}
@@ -84,7 +84,7 @@ EXPORT_SYMBOL(cancel_dirty_page);
84 84
85/* 85/*
86 * If truncate cannot remove the fs-private metadata from the page, the page 86 * If truncate cannot remove the fs-private metadata from the page, the page
87 * becomes anonymous. It will be left on the LRU and may even be mapped into 87 * becomes orphaned. It will be left on the LRU and may even be mapped into
88 * user pagetables if we're racing with filemap_fault(). 88 * user pagetables if we're racing with filemap_fault().
89 * 89 *
90 * We need to bale out if page->mapping is no longer equal to the original 90 * We need to bale out if page->mapping is no longer equal to the original
@@ -98,11 +98,11 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
98 if (page->mapping != mapping) 98 if (page->mapping != mapping)
99 return; 99 return;
100 100
101 cancel_dirty_page(page, PAGE_CACHE_SIZE);
102
103 if (PagePrivate(page)) 101 if (PagePrivate(page))
104 do_invalidatepage(page, 0); 102 do_invalidatepage(page, 0);
105 103
104 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105
106 remove_from_page_cache(page); 106 remove_from_page_cache(page);
107 ClearPageUptodate(page); 107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 108 ClearPageMappedToDisk(page);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af77e171e339..0536dde139d1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -166,6 +166,44 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
166} 166}
167EXPORT_SYMBOL_GPL(map_vm_area); 167EXPORT_SYMBOL_GPL(map_vm_area);
168 168
169/*
170 * Map a vmalloc()-space virtual address to the physical page.
171 */
172struct page *vmalloc_to_page(const void *vmalloc_addr)
173{
174 unsigned long addr = (unsigned long) vmalloc_addr;
175 struct page *page = NULL;
176 pgd_t *pgd = pgd_offset_k(addr);
177 pud_t *pud;
178 pmd_t *pmd;
179 pte_t *ptep, pte;
180
181 if (!pgd_none(*pgd)) {
182 pud = pud_offset(pgd, addr);
183 if (!pud_none(*pud)) {
184 pmd = pmd_offset(pud, addr);
185 if (!pmd_none(*pmd)) {
186 ptep = pte_offset_map(pmd, addr);
187 pte = *ptep;
188 if (pte_present(pte))
189 page = pte_page(pte);
190 pte_unmap(ptep);
191 }
192 }
193 }
194 return page;
195}
196EXPORT_SYMBOL(vmalloc_to_page);
197
198/*
199 * Map a vmalloc()-space virtual address to the physical page frame number.
200 */
201unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
202{
203 return page_to_pfn(vmalloc_to_page(vmalloc_addr));
204}
205EXPORT_SYMBOL(vmalloc_to_pfn);
206
169static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, 207static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags,
170 unsigned long start, unsigned long end, 208 unsigned long start, unsigned long end,
171 int node, gfp_t gfp_mask) 209 int node, gfp_t gfp_mask)
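
vmalloc_to_page() added above walks the kernel page tables (pgd -> pud -> pmd -> pte) to find the struct page backing one vmalloc address, and takes a const-qualified pointer. The following is a hedged, kernel-style sketch of how a caller might enumerate the pages behind a vmalloc'd buffer; it is not part of the patch, is only buildable inside a module, and the helper name is invented.

/* Sketch: enumerate the physical pages backing a vmalloc'd buffer. */
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kernel.h>

static void dump_vmalloc_pages(const void *buf, size_t size)
{
	size_t off;

	for (off = 0; off < size; off += PAGE_SIZE) {
		struct page *page = vmalloc_to_page(buf + off);

		if (!page)              /* not a vmalloc address, or a hole */
			continue;
		printk(KERN_DEBUG "vmalloc %p+%zu -> pfn %lu\n",
		       buf, off, page_to_pfn(page));
	}
}
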
@@ -216,6 +254,10 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl
216 if (addr > end - size) 254 if (addr > end - size)
217 goto out; 255 goto out;
218 } 256 }
257 if ((size + addr) < addr)
258 goto out;
259 if (addr > end - size)
260 goto out;
219 261
220found: 262found:
221 area->next = *p; 263 area->next = *p;
@@ -268,7 +310,7 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
268} 310}
269 311
270/* Caller must hold vmlist_lock */ 312/* Caller must hold vmlist_lock */
271static struct vm_struct *__find_vm_area(void *addr) 313static struct vm_struct *__find_vm_area(const void *addr)
272{ 314{
273 struct vm_struct *tmp; 315 struct vm_struct *tmp;
274 316
@@ -281,7 +323,7 @@ static struct vm_struct *__find_vm_area(void *addr)
281} 323}
282 324
283/* Caller must hold vmlist_lock */ 325/* Caller must hold vmlist_lock */
284static struct vm_struct *__remove_vm_area(void *addr) 326static struct vm_struct *__remove_vm_area(const void *addr)
285{ 327{
286 struct vm_struct **p, *tmp; 328 struct vm_struct **p, *tmp;
287 329
@@ -310,7 +352,7 @@ found:
310 * This function returns the found VM area, but using it is NOT safe 352 * This function returns the found VM area, but using it is NOT safe
311 * on SMP machines, except for its size or flags. 353 * on SMP machines, except for its size or flags.
312 */ 354 */
313struct vm_struct *remove_vm_area(void *addr) 355struct vm_struct *remove_vm_area(const void *addr)
314{ 356{
315 struct vm_struct *v; 357 struct vm_struct *v;
316 write_lock(&vmlist_lock); 358 write_lock(&vmlist_lock);
@@ -319,7 +361,7 @@ struct vm_struct *remove_vm_area(void *addr)
319 return v; 361 return v;
320} 362}
321 363
322static void __vunmap(void *addr, int deallocate_pages) 364static void __vunmap(const void *addr, int deallocate_pages)
323{ 365{
324 struct vm_struct *area; 366 struct vm_struct *area;
325 367
@@ -346,8 +388,10 @@ static void __vunmap(void *addr, int deallocate_pages)
346 int i; 388 int i;
347 389
348 for (i = 0; i < area->nr_pages; i++) { 390 for (i = 0; i < area->nr_pages; i++) {
349 BUG_ON(!area->pages[i]); 391 struct page *page = area->pages[i];
350 __free_page(area->pages[i]); 392
393 BUG_ON(!page);
394 __free_page(page);
351 } 395 }
352 396
353 if (area->flags & VM_VPAGES) 397 if (area->flags & VM_VPAGES)
@@ -370,7 +414,7 @@ static void __vunmap(void *addr, int deallocate_pages)
370 * 414 *
371 * Must not be called in interrupt context. 415 * Must not be called in interrupt context.
372 */ 416 */
373void vfree(void *addr) 417void vfree(const void *addr)
374{ 418{
375 BUG_ON(in_interrupt()); 419 BUG_ON(in_interrupt());
376 __vunmap(addr, 1); 420 __vunmap(addr, 1);
@@ -386,7 +430,7 @@ EXPORT_SYMBOL(vfree);
386 * 430 *
387 * Must not be called in interrupt context. 431 * Must not be called in interrupt context.
388 */ 432 */
389void vunmap(void *addr) 433void vunmap(const void *addr)
390{ 434{
391 BUG_ON(in_interrupt()); 435 BUG_ON(in_interrupt());
392 __vunmap(addr, 0); 436 __vunmap(addr, 0);
@@ -423,8 +467,8 @@ void *vmap(struct page **pages, unsigned int count,
423} 467}
424EXPORT_SYMBOL(vmap); 468EXPORT_SYMBOL(vmap);
425 469
426void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 470static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
427 pgprot_t prot, int node) 471 pgprot_t prot, int node)
428{ 472{
429 struct page **pages; 473 struct page **pages;
430 unsigned int nr_pages, array_size, i; 474 unsigned int nr_pages, array_size, i;
@@ -451,15 +495,19 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
451 } 495 }
452 496
453 for (i = 0; i < area->nr_pages; i++) { 497 for (i = 0; i < area->nr_pages; i++) {
498 struct page *page;
499
454 if (node < 0) 500 if (node < 0)
455 area->pages[i] = alloc_page(gfp_mask); 501 page = alloc_page(gfp_mask);
456 else 502 else
457 area->pages[i] = alloc_pages_node(node, gfp_mask, 0); 503 page = alloc_pages_node(node, gfp_mask, 0);
458 if (unlikely(!area->pages[i])) { 504
505 if (unlikely(!page)) {
459 /* Successfully allocated i pages, free them in __vunmap() */ 506 /* Successfully allocated i pages, free them in __vunmap() */
460 area->nr_pages = i; 507 area->nr_pages = i;
461 goto fail; 508 goto fail;
462 } 509 }
510 area->pages[i] = page;
463 } 511 }
464 512
465 if (map_vm_area(area, prot, &pages)) 513 if (map_vm_area(area, prot, &pages))
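
The __vmalloc_area_node() hunk above allocates the backing pages one at a time and, on a mid-loop failure, records how many succeeded (area->nr_pages = i) so that __vunmap() frees exactly those. Below is a userspace model of that partial-failure bookkeeping, with malloc standing in for the page allocator; the struct and function names are illustrative only.

/* Userspace model: allocate N chunks, remember how many succeeded, clean up. */
#include <stdio.h>
#include <stdlib.h>

struct area {
	unsigned int nr_chunks;   /* how many slots are actually populated */
	void **chunks;
};

static int area_populate(struct area *area, unsigned int want, size_t chunk_size)
{
	unsigned int i;

	area->chunks = calloc(want, sizeof(*area->chunks));
	if (!area->chunks)
		return -1;

	for (i = 0; i < want; i++) {
		void *chunk = malloc(chunk_size);

		if (!chunk) {
			/* Successfully allocated i chunks; free only those later. */
			area->nr_chunks = i;
			return -1;
		}
		area->chunks[i] = chunk;
	}
	area->nr_chunks = want;
	return 0;
}

static void area_free(struct area *area)
{
	for (unsigned int i = 0; i < area->nr_chunks; i++)
		free(area->chunks[i]);
	free(area->chunks);
}

int main(void)
{
	struct area a = { 0 };

	if (area_populate(&a, 8, 4096) != 0)
		printf("partial allocation: %u chunk(s)\n", a.nr_chunks);
	else
		printf("all %u chunk(s) allocated\n", a.nr_chunks);
	area_free(&a);
	return 0;
}
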
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e8d846f57774..422d960ffcd8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -21,21 +21,14 @@ EXPORT_PER_CPU_SYMBOL(vm_event_states);
21 21
22static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) 22static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
23{ 23{
24 int cpu = 0; 24 int cpu;
25 int i; 25 int i;
26 26
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 28
29 cpu = first_cpu(*cpumask); 29 for_each_cpu_mask(cpu, *cpumask) {
30 while (cpu < NR_CPUS) {
31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
32 31
33 cpu = next_cpu(cpu, *cpumask);
34
35 if (cpu < NR_CPUS)
36 prefetch(&per_cpu(vm_event_states, cpu));
37
38
39 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
40 ret[i] += this->event[i]; 33 ret[i] += this->event[i];
41 } 34 }
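
sum_vm_events() above now simply iterates the cpumask with for_each_cpu_mask() and accumulates each CPU's event array, instead of hand-rolling first_cpu()/next_cpu() with a prefetch. A userspace model of that accumulation over per-"CPU" arrays follows; the CPU count, event count, and values are made up.

/* Userspace model: sum per-CPU event counters into one result array. */
#include <stdio.h>

#define NCPUS   4
#define NEVENTS 3

static unsigned long per_cpu_events[NCPUS][NEVENTS] = {
	{ 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 }, { 10, 11, 12 },
};

static void sum_events(unsigned long *ret, const int *mask)
{
	for (int i = 0; i < NEVENTS; i++)
		ret[i] = 0;

	for (int cpu = 0; cpu < NCPUS; cpu++) {
		if (!mask[cpu])                 /* skip CPUs not in the mask */
			continue;
		for (int i = 0; i < NEVENTS; i++)
			ret[i] += per_cpu_events[cpu][i];
	}
}

int main(void)
{
	int online[NCPUS] = { 1, 1, 0, 1 };     /* pretend CPU 2 is offline */
	unsigned long totals[NEVENTS];

	sum_events(totals, online);
	for (int i = 0; i < NEVENTS; i++)
		printf("event %d: %lu\n", i, totals[i]);
	return 0;
}
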
@@ -284,6 +277,10 @@ EXPORT_SYMBOL(dec_zone_page_state);
284/* 277/*
285 * Update the zone counters for one cpu. 278 * Update the zone counters for one cpu.
286 * 279 *
280 * The cpu specified must be either the current cpu or a processor that
281 * is not online. If it is the current cpu then the execution thread must
282 * be pinned to the current cpu.
283 *
287 * Note that refresh_cpu_vm_stats strives to only access 284 * Note that refresh_cpu_vm_stats strives to only access
288 * node local memory. The per cpu pagesets on remote zones are placed 285 * node local memory. The per cpu pagesets on remote zones are placed
289 * in the memory local to the processor using that pageset. So the 286 * in the memory local to the processor using that pageset. So the
@@ -299,7 +296,7 @@ void refresh_cpu_vm_stats(int cpu)
299{ 296{
300 struct zone *zone; 297 struct zone *zone;
301 int i; 298 int i;
302 unsigned long flags; 299 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
303 300
304 for_each_zone(zone) { 301 for_each_zone(zone) {
305 struct per_cpu_pageset *p; 302 struct per_cpu_pageset *p;
@@ -311,15 +308,19 @@ void refresh_cpu_vm_stats(int cpu)
311 308
312 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 309 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
313 if (p->vm_stat_diff[i]) { 310 if (p->vm_stat_diff[i]) {
311 unsigned long flags;
312 int v;
313
314 local_irq_save(flags); 314 local_irq_save(flags);
315 zone_page_state_add(p->vm_stat_diff[i], 315 v = p->vm_stat_diff[i];
316 zone, i);
317 p->vm_stat_diff[i] = 0; 316 p->vm_stat_diff[i] = 0;
317 local_irq_restore(flags);
318 atomic_long_add(v, &zone->vm_stat[i]);
319 global_diff[i] += v;
318#ifdef CONFIG_NUMA 320#ifdef CONFIG_NUMA
319 /* 3 seconds idle till flush */ 321 /* 3 seconds idle till flush */
320 p->expire = 3; 322 p->expire = 3;
321#endif 323#endif
322 local_irq_restore(flags);
323 } 324 }
324#ifdef CONFIG_NUMA 325#ifdef CONFIG_NUMA
325 /* 326 /*
@@ -329,7 +330,7 @@ void refresh_cpu_vm_stats(int cpu)
329 * Check if there are pages remaining in this pageset 330 * Check if there are pages remaining in this pageset
330 * if not then there is nothing to expire. 331 * if not then there is nothing to expire.
331 */ 332 */
332 if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) 333 if (!p->expire || !p->pcp.count)
333 continue; 334 continue;
334 335
335 /* 336 /*
@@ -344,13 +345,14 @@ void refresh_cpu_vm_stats(int cpu)
344 if (p->expire) 345 if (p->expire)
345 continue; 346 continue;
346 347
347 if (p->pcp[0].count) 348 if (p->pcp.count)
348 drain_zone_pages(zone, p->pcp + 0); 349 drain_zone_pages(zone, &p->pcp);
349
350 if (p->pcp[1].count)
351 drain_zone_pages(zone, p->pcp + 1);
352#endif 350#endif
353 } 351 }
352
353 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
354 if (global_diff[i])
355 atomic_long_add(global_diff[i], &vm_stat[i]);
354} 356}
355 357
356#endif 358#endif
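
The refresh_cpu_vm_stats() hunk above changes the folding strategy: each per-CPU delta is moved into its zone counter with interrupts disabled only around the read-and-clear, and the same delta is accumulated into a local global_diff[] so the global counters are updated once at the end rather than once per zone. Below is a userspace model of that two-level folding; plain longs stand in for the kernel's atomic counters, and the zone/item counts and deltas are invented.

/* Userspace model: fold per-CPU deltas into per-zone and global counters. */
#include <stdio.h>

#define NZONES 2
#define NITEMS 3

static long zone_stat[NZONES][NITEMS];      /* stands in for zone->vm_stat[] */
static long vm_stat[NITEMS];                /* stands in for the global vm_stat[] */
static long cpu_diff[NZONES][NITEMS] = {    /* one CPU's pending deltas */
	{ 5, 0, -2 },
	{ 0, 7, 1 },
};

static void refresh_cpu_stats(void)
{
	long global_diff[NITEMS] = { 0 };

	for (int zone = 0; zone < NZONES; zone++) {
		for (int i = 0; i < NITEMS; i++) {
			long v = cpu_diff[zone][i];

			if (!v)
				continue;
			cpu_diff[zone][i] = 0;  /* read-and-clear the per-CPU delta */
			zone_stat[zone][i] += v;
			global_diff[i] += v;    /* defer the global update */
		}
	}

	/* One pass over the global counters instead of one update per zone. */
	for (int i = 0; i < NITEMS; i++)
		if (global_diff[i])
			vm_stat[i] += global_diff[i];
}

int main(void)
{
	refresh_cpu_stats();
	for (int i = 0; i < NITEMS; i++)
		printf("global item %d = %ld\n", i, vm_stat[i]);
	return 0;
}
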
@@ -681,20 +683,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
681 "\n pagesets"); 683 "\n pagesets");
682 for_each_online_cpu(i) { 684 for_each_online_cpu(i) {
683 struct per_cpu_pageset *pageset; 685 struct per_cpu_pageset *pageset;
684 int j;
685 686
686 pageset = zone_pcp(zone, i); 687 pageset = zone_pcp(zone, i);
687 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 688 seq_printf(m,
688 seq_printf(m, 689 "\n cpu: %i"
689 "\n cpu: %i pcp: %i" 690 "\n count: %i"
690 "\n count: %i" 691 "\n high: %i"
691 "\n high: %i" 692 "\n batch: %i",
692 "\n batch: %i", 693 i,
693 i, j, 694 pageset->pcp.count,
694 pageset->pcp[j].count, 695 pageset->pcp.high,
695 pageset->pcp[j].high, 696 pageset->pcp.batch);
696 pageset->pcp[j].batch);
697 }
698#ifdef CONFIG_SMP 697#ifdef CONFIG_SMP
699 seq_printf(m, "\n vm stats threshold: %d", 698 seq_printf(m, "\n vm stats threshold: %d",
700 pageset->stat_threshold); 699 pageset->stat_threshold);