author    Dave Kleikamp <shaggy@austin.ibm.com>    2006-01-24 15:34:47 -0500
committer Dave Kleikamp <shaggy@austin.ibm.com>    2006-01-24 15:34:47 -0500
commit    0a0fc0ddbe732779366ab6b1b879f62195e65967 (patch)
tree      7b42490a676cf39ae0691b6859ecf7fd410f229b /mm
parent    4d5dbd0945d9e0833dd7964a3d6ee33157f7cc7a (diff)
parent    3ee68c4af3fd7228c1be63254b9f884614f9ebb2 (diff)
Merge with /home/shaggy/git/linus-clean/
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   15
-rw-r--r--  mm/Makefile           |    6
-rw-r--r--  mm/bootmem.c          |   60
-rw-r--r--  mm/fadvise.c          |    5
-rw-r--r--  mm/filemap.c          |  162
-rw-r--r--  mm/filemap_xip.c      |    8
-rw-r--r--  mm/fremap.c           |   52
-rw-r--r--  mm/hugetlb.c          |  200
-rw-r--r--  mm/internal.h         |   21
-rw-r--r--  mm/madvise.c          |   37
-rw-r--r--  mm/memory.c           |  438
-rw-r--r--  mm/memory_hotplug.c   |    3
-rw-r--r--  mm/mempolicy.c        |  708
-rw-r--r--  mm/mlock.c            |    1
-rw-r--r--  mm/mmap.c             |   16
-rw-r--r--  mm/mprotect.c         |    8
-rw-r--r--  mm/mremap.c           |    3
-rw-r--r--  mm/msync.c            |   14
-rw-r--r--  mm/nommu.c            |    9
-rw-r--r--  mm/oom_kill.c         |    8
-rw-r--r--  mm/page-writeback.c   |   18
-rw-r--r--  mm/page_alloc.c       |  788
-rw-r--r--  mm/pdflush.c          |    2
-rw-r--r--  mm/readahead.c        |   15
-rw-r--r--  mm/rmap.c             |  134
-rw-r--r--  mm/shmem.c            |   81
-rw-r--r--  mm/slab.c             | 1247
-rw-r--r--  mm/slob.c             |  385
-rw-r--r--  mm/sparse.c           |    4
-rw-r--r--  mm/swap.c             |   58
-rw-r--r--  mm/swap_state.c       |    8
-rw-r--r--  mm/swapfile.c         |   60
-rw-r--r--  mm/thrash.c           |   10
-rw-r--r--  mm/tiny-shmem.c       |   29
-rw-r--r--  mm/truncate.c         |   51
-rw-r--r--  mm/util.c             |   39
-rw-r--r--  mm/vmscan.c           |  485
37 files changed, 3479 insertions, 1709 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ae9ce6b73e..a9cb80ae64 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
11 11
12config FLATMEM_MANUAL 12config FLATMEM_MANUAL
13 bool "Flat Memory" 13 bool "Flat Memory"
14 depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE 14 depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
15 help 15 help
16 This option allows you to change some of the ways that 16 This option allows you to change some of the ways that
17 Linux manages its memory internally. Most users will 17 Linux manages its memory internally. Most users will
@@ -125,12 +125,17 @@ comment "Memory hotplug is currently incompatible with Software Suspend"
125# space can be handled with less contention: split it at this NR_CPUS. 125# space can be handled with less contention: split it at this NR_CPUS.
126# Default to 4 for wider testing, though 8 might be more appropriate. 126# Default to 4 for wider testing, though 8 might be more appropriate.
127# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. 127# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
128# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. 128# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
129# ARM26 and SPARC32 and PPC64 may use one page for multiple page tables.
130# 129#
131config SPLIT_PTLOCK_CPUS 130config SPLIT_PTLOCK_CPUS
132 int 131 int
133 default "4096" if ARM && !CPU_CACHE_VIPT 132 default "4096" if ARM && !CPU_CACHE_VIPT
134 default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT 133 default "4096" if PARISC && !PA20
135 default "4096" if ARM26 || SPARC32 || PPC64
136 default "4" 134 default "4"
135
136#
137# support for page migration
138#
139config MIGRATION
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP
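
The SPLIT_PTLOCK_CPUS knob in the Kconfig hunk above decides when page-table locking switches from the single mm-wide page_table_lock to one spinlock per page-table page. A rough sketch of how such a threshold is typically consumed follows; the helper name and the per-page lock field are assumptions for illustration, not the kernel's actual definitions.

/* hypothetical sketch, not the kernel's real macros */
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
/* each page-table page carries its own spinlock (assumed field name) */
#define example_pte_lockptr(mm, pmd)    (&pmd_page(*(pmd))->ptl)
#else
/* all page tables in the mm share one lock */
#define example_pte_lockptr(mm, pmd)    (&(mm)->page_table_lock)
#endif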
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f..9aa03fa1dc 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o slab.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o $(mmu-y) 13 prio_tree.o util.o $(mmu-y)
14 14
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 18obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o
21obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
22obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8c567177d..35c32290f7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -204,6 +204,8 @@ restart_scan:
204 unsigned long j; 204 unsigned long j;
205 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); 205 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
206 i = ALIGN(i, incr); 206 i = ALIGN(i, incr);
207 if (i >= eidx)
208 break;
207 if (test_bit(i, bdata->node_bootmem_map)) 209 if (test_bit(i, bdata->node_bootmem_map))
208 continue; 210 continue;
209 for (j = i + 1; j < i + areasize; ++j) { 211 for (j = i + 1; j < i + areasize; ++j) {
@@ -294,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
294 unsigned long v = ~map[i / BITS_PER_LONG]; 296 unsigned long v = ~map[i / BITS_PER_LONG];
295 297
296 if (gofast && v == ~0UL) { 298 if (gofast && v == ~0UL) {
297 int j, order; 299 int order;
298 300
299 page = pfn_to_page(pfn); 301 page = pfn_to_page(pfn);
300 count += BITS_PER_LONG; 302 count += BITS_PER_LONG;
301 __ClearPageReserved(page);
302 order = ffs(BITS_PER_LONG) - 1; 303 order = ffs(BITS_PER_LONG) - 1;
303 set_page_refs(page, order); 304 __free_pages_bootmem(page, order);
304 for (j = 1; j < BITS_PER_LONG; j++) {
305 if (j + 16 < BITS_PER_LONG)
306 prefetchw(page + j + 16);
307 __ClearPageReserved(page + j);
308 set_page_count(page + j, 0);
309 }
310 __free_pages(page, order);
311 i += BITS_PER_LONG; 305 i += BITS_PER_LONG;
312 page += BITS_PER_LONG; 306 page += BITS_PER_LONG;
313 } else if (v) { 307 } else if (v) {
@@ -317,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
317 for (m = 1; m && i < idx; m<<=1, page++, i++) { 311 for (m = 1; m && i < idx; m<<=1, page++, i++) {
318 if (v & m) { 312 if (v & m) {
319 count++; 313 count++;
320 __ClearPageReserved(page); 314 __free_pages_bootmem(page, 0);
321 set_page_refs(page, 0);
322 __free_page(page);
323 } 315 }
324 } 316 }
325 } else { 317 } else {
@@ -337,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
337 count = 0; 329 count = 0;
338 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 330 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
339 count++; 331 count++;
340 __ClearPageReserved(page); 332 __free_pages_bootmem(page, 0);
341 set_page_count(page, 1);
342 __free_page(page);
343 } 333 }
344 total += count; 334 total += count;
345 bdata->node_bootmem_map = NULL; 335 bdata->node_bootmem_map = NULL;
@@ -391,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
391 return(free_all_bootmem_core(NODE_DATA(0))); 381 return(free_all_bootmem_core(NODE_DATA(0)));
392} 382}
393 383
394void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, 384void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
395 unsigned long limit)
396{ 385{
397 pg_data_t *pgdat = pgdat_list; 386 pg_data_t *pgdat = pgdat_list;
398 void *ptr; 387 void *ptr;
399 388
400 for_each_pgdat(pgdat) 389 for_each_pgdat(pgdat)
401 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 390 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
402 align, goal, limit))) 391 align, goal, 0)))
403 return(ptr); 392 return(ptr);
404 393
405 /* 394 /*
@@ -411,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
411} 400}
412 401
413 402
414void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, 403void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
415 unsigned long goal, unsigned long limit) 404 unsigned long goal)
416{ 405{
417 void *ptr; 406 void *ptr;
418 407
419 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); 408 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
420 if (ptr) 409 if (ptr)
421 return (ptr); 410 return (ptr);
422 411
423 return __alloc_bootmem_limit(size, align, goal, limit); 412 return __alloc_bootmem(size, align, goal);
413}
414
415#define LOW32LIMIT 0xffffffff
416
417void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
418{
419 pg_data_t *pgdat = pgdat_list;
420 void *ptr;
421
422 for_each_pgdat(pgdat)
423 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
424 align, goal, LOW32LIMIT)))
425 return(ptr);
426
427 /*
428 * Whoops, we cannot satisfy the allocation request.
429 */
430 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
431 panic("Out of low memory");
432 return NULL;
424} 433}
425 434
435void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
436 unsigned long align, unsigned long goal)
437{
438 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
439}
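
The new __alloc_bootmem_low()/__alloc_bootmem_low_node() variants above cap allocations at LOW32LIMIT (4GB). A minimal, hedged usage sketch with a made-up wrapper name:

/* hedged sketch: an early-boot allocation that must sit below 4GB,
 * e.g. a buffer for a device limited to 32-bit DMA addressing */
void * __init example_alloc_dma32_buffer(unsigned long bytes)
{
        /* goal 0 = no preferred address; panics instead of returning NULL */
        return __alloc_bootmem_low(bytes, PAGE_SIZE, 0);
}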
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5..d257c89e77 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
37 if (!file) 37 if (!file)
38 return -EBADF; 38 return -EBADF;
39 39
40 if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
41 ret = -ESPIPE;
42 goto out;
43 }
44
40 mapping = file->f_mapping; 45 mapping = file->f_mapping;
41 if (!mapping || len < 0) { 46 if (!mapping || len < 0) {
42 ret = -EINVAL; 47 ret = -EINVAL;
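
The fadvise hunk above rejects advice on FIFOs with -ESPIPE, matching POSIX. A small userspace check of that behaviour (assumes a kernel with this change; note that posix_fadvise() reports the error as its return value rather than through errno):

#define _POSIX_C_SOURCE 200112L
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int pfd[2];
        int err;

        if (pipe(pfd))
                return 1;
        /* With the check above, advising a pipe now yields ESPIPE. */
        err = posix_fadvise(pfd[0], 0, 0, POSIX_FADV_DONTNEED);
        printf("posix_fadvise on a pipe: %s\n", err ? strerror(err) : "no error");
        close(pfd[0]);
        close(pfd[1]);
        return 0;
}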
diff --git a/mm/filemap.c b/mm/filemap.c
index 5d6e4c2000..44da3d4769 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/aio.h> 17#include <linux/aio.h>
18#include <linux/capability.h>
18#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
19#include <linux/mm.h> 20#include <linux/mm.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
@@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
61 * ->swap_lock (exclusive_swap_page, others) 62 * ->swap_lock (exclusive_swap_page, others)
62 * ->mapping->tree_lock 63 * ->mapping->tree_lock
63 * 64 *
64 * ->i_sem 65 * ->i_mutex
65 * ->i_mmap_lock (truncate->unmap_mapping_range) 66 * ->i_mmap_lock (truncate->unmap_mapping_range)
66 * 67 *
67 * ->mmap_sem 68 * ->mmap_sem
@@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
73 * ->lock_page (access_process_vm) 74 * ->lock_page (access_process_vm)
74 * 75 *
75 * ->mmap_sem 76 * ->mmap_sem
76 * ->i_sem (msync) 77 * ->i_mutex (msync)
77 * 78 *
78 * ->i_sem 79 * ->i_mutex
79 * ->i_alloc_sem (various) 80 * ->i_alloc_sem (various)
80 * 81 *
81 * ->inode_lock 82 * ->inode_lock
@@ -93,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
93 * ->private_lock (try_to_unmap_one) 94 * ->private_lock (try_to_unmap_one)
94 * ->tree_lock (try_to_unmap_one) 95 * ->tree_lock (try_to_unmap_one)
95 * ->zone.lru_lock (follow_page->mark_page_accessed) 96 * ->zone.lru_lock (follow_page->mark_page_accessed)
97 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
96 * ->private_lock (page_remove_rmap->set_page_dirty) 98 * ->private_lock (page_remove_rmap->set_page_dirty)
97 * ->tree_lock (page_remove_rmap->set_page_dirty) 99 * ->tree_lock (page_remove_rmap->set_page_dirty)
98 * ->inode_lock (page_remove_rmap->set_page_dirty) 100 * ->inode_lock (page_remove_rmap->set_page_dirty)
@@ -134,7 +136,7 @@ static int sync_page(void *word)
134 struct address_space *mapping; 136 struct address_space *mapping;
135 struct page *page; 137 struct page *page;
136 138
137 page = container_of((page_flags_t *)word, struct page, flags); 139 page = container_of((unsigned long *)word, struct page, flags);
138 140
139 /* 141 /*
140 * page_mapping() is being called without PG_locked held. 142 * page_mapping() is being called without PG_locked held.
@@ -276,11 +278,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
276 * integrity" operation. It waits upon in-flight writeout before starting and 278 * integrity" operation. It waits upon in-flight writeout before starting and
277 * waiting upon new writeout. If there was an IO error, return it. 279 * waiting upon new writeout. If there was an IO error, return it.
278 * 280 *
279 * We need to re-take i_sem during the generic_osync_inode list walk because 281 * We need to re-take i_mutex during the generic_osync_inode list walk because
280 * it is otherwise livelockable. 282 * it is otherwise livelockable.
281 */ 283 */
282int sync_page_range(struct inode *inode, struct address_space *mapping, 284int sync_page_range(struct inode *inode, struct address_space *mapping,
283 loff_t pos, size_t count) 285 loff_t pos, loff_t count)
284{ 286{
285 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 287 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
286 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 288 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -290,9 +292,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
290 return 0; 292 return 0;
291 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); 293 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
292 if (ret == 0) { 294 if (ret == 0) {
293 down(&inode->i_sem); 295 mutex_lock(&inode->i_mutex);
294 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); 296 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
295 up(&inode->i_sem); 297 mutex_unlock(&inode->i_mutex);
296 } 298 }
297 if (ret == 0) 299 if (ret == 0)
298 ret = wait_on_page_writeback_range(mapping, start, end); 300 ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,13 +303,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
301EXPORT_SYMBOL(sync_page_range); 303EXPORT_SYMBOL(sync_page_range);
302 304
303/* 305/*
304 * Note: Holding i_sem across sync_page_range_nolock is not a good idea 306 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
305 * as it forces O_SYNC writers to different parts of the same file 307 * as it forces O_SYNC writers to different parts of the same file
306 * to be serialised right until io completion. 308 * to be serialised right until io completion.
307 */ 309 */
308static int sync_page_range_nolock(struct inode *inode, 310int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
309 struct address_space *mapping, 311 loff_t pos, loff_t count)
310 loff_t pos, size_t count)
311{ 312{
312 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 313 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
313 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 314 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +323,7 @@ static int sync_page_range_nolock(struct inode *inode,
322 ret = wait_on_page_writeback_range(mapping, start, end); 323 ret = wait_on_page_writeback_range(mapping, start, end);
323 return ret; 324 return ret;
324} 325}
326EXPORT_SYMBOL(sync_page_range_nolock);
325 327
326/** 328/**
327 * filemap_fdatawait - walk the list of under-writeback pages of the given 329 * filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -343,30 +345,44 @@ EXPORT_SYMBOL(filemap_fdatawait);
343 345
344int filemap_write_and_wait(struct address_space *mapping) 346int filemap_write_and_wait(struct address_space *mapping)
345{ 347{
346 int retval = 0; 348 int err = 0;
347 349
348 if (mapping->nrpages) { 350 if (mapping->nrpages) {
349 retval = filemap_fdatawrite(mapping); 351 err = filemap_fdatawrite(mapping);
350 if (retval == 0) 352 /*
351 retval = filemap_fdatawait(mapping); 353 * Even if the above returned error, the pages may be
354 * written partially (e.g. -ENOSPC), so we wait for it.
355 * But the -EIO is special case, it may indicate the worst
356 * thing (e.g. bug) happened, so we avoid waiting for it.
357 */
358 if (err != -EIO) {
359 int err2 = filemap_fdatawait(mapping);
360 if (!err)
361 err = err2;
362 }
352 } 363 }
353 return retval; 364 return err;
354} 365}
366EXPORT_SYMBOL(filemap_write_and_wait);
355 367
356int filemap_write_and_wait_range(struct address_space *mapping, 368int filemap_write_and_wait_range(struct address_space *mapping,
357 loff_t lstart, loff_t lend) 369 loff_t lstart, loff_t lend)
358{ 370{
359 int retval = 0; 371 int err = 0;
360 372
361 if (mapping->nrpages) { 373 if (mapping->nrpages) {
362 retval = __filemap_fdatawrite_range(mapping, lstart, lend, 374 err = __filemap_fdatawrite_range(mapping, lstart, lend,
363 WB_SYNC_ALL); 375 WB_SYNC_ALL);
364 if (retval == 0) 376 /* See comment of filemap_write_and_wait() */
365 retval = wait_on_page_writeback_range(mapping, 377 if (err != -EIO) {
366 lstart >> PAGE_CACHE_SHIFT, 378 int err2 = wait_on_page_writeback_range(mapping,
367 lend >> PAGE_CACHE_SHIFT); 379 lstart >> PAGE_CACHE_SHIFT,
380 lend >> PAGE_CACHE_SHIFT);
381 if (!err)
382 err = err2;
383 }
368 } 384 }
369 return retval; 385 return err;
370} 386}
371 387
372/* 388/*
@@ -555,11 +571,12 @@ repeat:
555 page_cache_get(page); 571 page_cache_get(page);
556 if (TestSetPageLocked(page)) { 572 if (TestSetPageLocked(page)) {
557 read_unlock_irq(&mapping->tree_lock); 573 read_unlock_irq(&mapping->tree_lock);
558 lock_page(page); 574 __lock_page(page);
559 read_lock_irq(&mapping->tree_lock); 575 read_lock_irq(&mapping->tree_lock);
560 576
561 /* Has the page been truncated while we slept? */ 577 /* Has the page been truncated while we slept? */
562 if (page->mapping != mapping || page->index != offset) { 578 if (unlikely(page->mapping != mapping ||
579 page->index != offset)) {
563 unlock_page(page); 580 unlock_page(page);
564 page_cache_release(page); 581 page_cache_release(page);
565 goto repeat; 582 goto repeat;
@@ -831,8 +848,13 @@ readpage:
831 /* Start the actual read. The read will unlock the page. */ 848 /* Start the actual read. The read will unlock the page. */
832 error = mapping->a_ops->readpage(filp, page); 849 error = mapping->a_ops->readpage(filp, page);
833 850
834 if (unlikely(error)) 851 if (unlikely(error)) {
852 if (error == AOP_TRUNCATED_PAGE) {
853 page_cache_release(page);
854 goto find_page;
855 }
835 goto readpage_error; 856 goto readpage_error;
857 }
836 858
837 if (!PageUptodate(page)) { 859 if (!PageUptodate(page)) {
838 lock_page(page); 860 lock_page(page);
@@ -1152,26 +1174,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152{ 1174{
1153 struct address_space *mapping = file->f_mapping; 1175 struct address_space *mapping = file->f_mapping;
1154 struct page *page; 1176 struct page *page;
1155 int error; 1177 int ret;
1156 1178
1157 page = page_cache_alloc_cold(mapping); 1179 do {
1158 if (!page) 1180 page = page_cache_alloc_cold(mapping);
1159 return -ENOMEM; 1181 if (!page)
1182 return -ENOMEM;
1183
1184 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1185 if (ret == 0)
1186 ret = mapping->a_ops->readpage(file, page);
1187 else if (ret == -EEXIST)
1188 ret = 0; /* losing race to add is OK */
1160 1189
1161 error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1162 if (!error) {
1163 error = mapping->a_ops->readpage(file, page);
1164 page_cache_release(page); 1190 page_cache_release(page);
1165 return error;
1166 }
1167 1191
1168 /* 1192 } while (ret == AOP_TRUNCATED_PAGE);
1169 * We arrive here in the unlikely event that someone 1193
1170 * raced with us and added our page to the cache first 1194 return ret;
1171 * or we are out of memory for radix-tree nodes.
1172 */
1173 page_cache_release(page);
1174 return error == -EEXIST ? 0 : error;
1175} 1195}
1176 1196
1177#define MMAP_LOTSAMISS (100) 1197#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1351,14 @@ page_not_uptodate:
1331 goto success; 1351 goto success;
1332 } 1352 }
1333 1353
1334 if (!mapping->a_ops->readpage(file, page)) { 1354 error = mapping->a_ops->readpage(file, page);
1355 if (!error) {
1335 wait_on_page_locked(page); 1356 wait_on_page_locked(page);
1336 if (PageUptodate(page)) 1357 if (PageUptodate(page))
1337 goto success; 1358 goto success;
1359 } else if (error == AOP_TRUNCATED_PAGE) {
1360 page_cache_release(page);
1361 goto retry_find;
1338 } 1362 }
1339 1363
1340 /* 1364 /*
@@ -1358,10 +1382,14 @@ page_not_uptodate:
1358 goto success; 1382 goto success;
1359 } 1383 }
1360 ClearPageError(page); 1384 ClearPageError(page);
1361 if (!mapping->a_ops->readpage(file, page)) { 1385 error = mapping->a_ops->readpage(file, page);
1386 if (!error) {
1362 wait_on_page_locked(page); 1387 wait_on_page_locked(page);
1363 if (PageUptodate(page)) 1388 if (PageUptodate(page))
1364 goto success; 1389 goto success;
1390 } else if (error == AOP_TRUNCATED_PAGE) {
1391 page_cache_release(page);
1392 goto retry_find;
1365 } 1393 }
1366 1394
1367 /* 1395 /*
@@ -1444,10 +1472,14 @@ page_not_uptodate:
1444 goto success; 1472 goto success;
1445 } 1473 }
1446 1474
1447 if (!mapping->a_ops->readpage(file, page)) { 1475 error = mapping->a_ops->readpage(file, page);
1476 if (!error) {
1448 wait_on_page_locked(page); 1477 wait_on_page_locked(page);
1449 if (PageUptodate(page)) 1478 if (PageUptodate(page))
1450 goto success; 1479 goto success;
1480 } else if (error == AOP_TRUNCATED_PAGE) {
1481 page_cache_release(page);
1482 goto retry_find;
1451 } 1483 }
1452 1484
1453 /* 1485 /*
@@ -1470,10 +1502,14 @@ page_not_uptodate:
1470 } 1502 }
1471 1503
1472 ClearPageError(page); 1504 ClearPageError(page);
1473 if (!mapping->a_ops->readpage(file, page)) { 1505 error = mapping->a_ops->readpage(file, page);
1506 if (!error) {
1474 wait_on_page_locked(page); 1507 wait_on_page_locked(page);
1475 if (PageUptodate(page)) 1508 if (PageUptodate(page))
1476 goto success; 1509 goto success;
1510 } else if (error == AOP_TRUNCATED_PAGE) {
1511 page_cache_release(page);
1512 goto retry_find;
1477 } 1513 }
1478 1514
1479 /* 1515 /*
@@ -1858,7 +1894,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1858 /* 1894 /*
1859 * Sync the fs metadata but not the minor inode changes and 1895 * Sync the fs metadata but not the minor inode changes and
1860 * of course not the data as we did direct DMA for the IO. 1896 * of course not the data as we did direct DMA for the IO.
1861 * i_sem is held, which protects generic_osync_inode() from 1897 * i_mutex is held, which protects generic_osync_inode() from
1862 * livelocking. 1898 * livelocking.
1863 */ 1899 */
1864 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 1900 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -1934,12 +1970,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1934 status = a_ops->prepare_write(file, page, offset, offset+bytes); 1970 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1935 if (unlikely(status)) { 1971 if (unlikely(status)) {
1936 loff_t isize = i_size_read(inode); 1972 loff_t isize = i_size_read(inode);
1973
1974 if (status != AOP_TRUNCATED_PAGE)
1975 unlock_page(page);
1976 page_cache_release(page);
1977 if (status == AOP_TRUNCATED_PAGE)
1978 continue;
1937 /* 1979 /*
1938 * prepare_write() may have instantiated a few blocks 1980 * prepare_write() may have instantiated a few blocks
1939 * outside i_size. Trim these off again. 1981 * outside i_size. Trim these off again.
1940 */ 1982 */
1941 unlock_page(page);
1942 page_cache_release(page);
1943 if (pos + bytes > isize) 1983 if (pos + bytes > isize)
1944 vmtruncate(inode, isize); 1984 vmtruncate(inode, isize);
1945 break; 1985 break;
@@ -1952,6 +1992,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1952 cur_iov, iov_base, bytes); 1992 cur_iov, iov_base, bytes);
1953 flush_dcache_page(page); 1993 flush_dcache_page(page);
1954 status = a_ops->commit_write(file, page, offset, offset+bytes); 1994 status = a_ops->commit_write(file, page, offset, offset+bytes);
1995 if (status == AOP_TRUNCATED_PAGE) {
1996 page_cache_release(page);
1997 continue;
1998 }
1955 if (likely(copied > 0)) { 1999 if (likely(copied > 0)) {
1956 if (!status) 2000 if (!status)
1957 status = copied; 2001 status = copied;
@@ -2066,7 +2110,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2066 if (err) 2110 if (err)
2067 goto out; 2111 goto out;
2068 2112
2069 inode_update_time(inode, 1); 2113 file_update_time(file);
2070 2114
2071 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2115 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2072 if (unlikely(file->f_flags & O_DIRECT)) { 2116 if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2153,10 +2197,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2153 2197
2154 BUG_ON(iocb->ki_pos != pos); 2198 BUG_ON(iocb->ki_pos != pos);
2155 2199
2156 down(&inode->i_sem); 2200 mutex_lock(&inode->i_mutex);
2157 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, 2201 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2158 &iocb->ki_pos); 2202 &iocb->ki_pos);
2159 up(&inode->i_sem); 2203 mutex_unlock(&inode->i_mutex);
2160 2204
2161 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2205 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2162 ssize_t err; 2206 ssize_t err;
@@ -2178,9 +2222,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
2178 struct iovec local_iov = { .iov_base = (void __user *)buf, 2222 struct iovec local_iov = { .iov_base = (void __user *)buf,
2179 .iov_len = count }; 2223 .iov_len = count };
2180 2224
2181 down(&inode->i_sem); 2225 mutex_lock(&inode->i_mutex);
2182 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); 2226 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2183 up(&inode->i_sem); 2227 mutex_unlock(&inode->i_mutex);
2184 2228
2185 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2229 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2186 ssize_t err; 2230 ssize_t err;
@@ -2214,9 +2258,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2214 struct inode *inode = mapping->host; 2258 struct inode *inode = mapping->host;
2215 ssize_t ret; 2259 ssize_t ret;
2216 2260
2217 down(&inode->i_sem); 2261 mutex_lock(&inode->i_mutex);
2218 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); 2262 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2219 up(&inode->i_sem); 2263 mutex_unlock(&inode->i_mutex);
2220 2264
2221 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2265 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2222 int err; 2266 int err;
@@ -2230,7 +2274,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2230EXPORT_SYMBOL(generic_file_writev); 2274EXPORT_SYMBOL(generic_file_writev);
2231 2275
2232/* 2276/*
2233 * Called under i_sem for writes to S_ISREG files. Returns -EIO if something 2277 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2234 * went wrong during pagecache shootdown. 2278 * went wrong during pagecache shootdown.
2235 */ 2279 */
2236static ssize_t 2280static ssize_t
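
Several hunks above teach filemap.c callers to handle the new AOP_TRUNCATED_PAGE return from ->readpage/->prepare_write/->commit_write: drop the page and retry the lookup. A hedged caller-side sketch of that retry loop, using a made-up helper name (real callers also test PageUptodate, omitted here):

/* hedged sketch of the AOP_TRUNCATED_PAGE protocol, not kernel code */
static int example_read_index(struct file *filp, struct address_space *mapping,
                              pgoff_t index)
{
        struct page *page;
        int err;

again:
        page = find_or_create_page(mapping, index, GFP_KERNEL);
        if (!page)
                return -ENOMEM;
        err = mapping->a_ops->readpage(filp, page);     /* unlocks the page */
        page_cache_release(page);
        if (err == AOP_TRUNCATED_PAGE)
                goto again;     /* ->readpage dropped the lock and asked us to retry */
        return err;
}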
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9cf687e4a2..b960ac8e59 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
338 *ppos = pos; 338 *ppos = pos;
339 /* 339 /*
340 * No need to use i_size_read() here, the i_size 340 * No need to use i_size_read() here, the i_size
341 * cannot change under us because we hold i_sem. 341 * cannot change under us because we hold i_mutex.
342 */ 342 */
343 if (pos > inode->i_size) { 343 if (pos > inode->i_size) {
344 i_size_write(inode, pos); 344 i_size_write(inode, pos);
@@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
358 loff_t pos; 358 loff_t pos;
359 ssize_t ret; 359 ssize_t ret;
360 360
361 down(&inode->i_sem); 361 mutex_lock(&inode->i_mutex);
362 362
363 if (!access_ok(VERIFY_READ, buf, len)) { 363 if (!access_ok(VERIFY_READ, buf, len)) {
364 ret=-EFAULT; 364 ret=-EFAULT;
@@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
383 if (ret) 383 if (ret)
384 goto out_backing; 384 goto out_backing;
385 385
386 inode_update_time(inode, 1); 386 file_update_time(filp);
387 387
388 ret = __xip_file_write (filp, buf, count, pos, ppos); 388 ret = __xip_file_write (filp, buf, count, pos, ppos);
389 389
390 out_backing: 390 out_backing:
391 current->backing_dev_info = NULL; 391 current->backing_dev_info = NULL;
392 out_up: 392 out_up:
393 up(&inode->i_sem); 393 mutex_unlock(&inode->i_mutex);
394 return ret; 394 return ret;
395} 395}
396EXPORT_SYMBOL_GPL(xip_file_write); 396EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/fremap.c b/mm/fremap.c
index d862be3bc3..9f381e58bf 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
27 struct page *page = NULL; 27 struct page *page = NULL;
28 28
29 if (pte_present(pte)) { 29 if (pte_present(pte)) {
30 unsigned long pfn = pte_pfn(pte); 30 flush_cache_page(vma, addr, pte_pfn(pte));
31 flush_cache_page(vma, addr, pfn);
32 pte = ptep_clear_flush(vma, addr, ptep); 31 pte = ptep_clear_flush(vma, addr, ptep);
33 if (unlikely(!pfn_valid(pfn))) { 32 page = vm_normal_page(vma, addr, pte);
34 print_bad_pte(vma, pte, addr); 33 if (page) {
35 goto out; 34 if (pte_dirty(pte))
35 set_page_dirty(page);
36 page_remove_rmap(page);
37 page_cache_release(page);
36 } 38 }
37 page = pfn_to_page(pfn);
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
42 } else { 39 } else {
43 if (!pte_file(pte)) 40 if (!pte_file(pte))
44 free_swap_and_cache(pte_to_swp_entry(pte)); 41 free_swap_and_cache(pte_to_swp_entry(pte));
45 pte_clear(mm, addr, ptep); 42 pte_clear(mm, addr, ptep);
46 } 43 }
47out:
48 return !!page; 44 return !!page;
49} 45}
50 46
@@ -59,22 +55,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
59 pgoff_t size; 55 pgoff_t size;
60 int err = -ENOMEM; 56 int err = -ENOMEM;
61 pte_t *pte; 57 pte_t *pte;
62 pmd_t *pmd;
63 pud_t *pud;
64 pgd_t *pgd;
65 pte_t pte_val; 58 pte_t pte_val;
66 spinlock_t *ptl; 59 spinlock_t *ptl;
67 60
68 BUG_ON(vma->vm_flags & VM_RESERVED); 61 pte = get_locked_pte(mm, addr, &ptl);
69
70 pgd = pgd_offset(mm, addr);
71 pud = pud_alloc(mm, pgd, addr);
72 if (!pud)
73 goto out;
74 pmd = pmd_alloc(mm, pud, addr);
75 if (!pmd)
76 goto out;
77 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
78 if (!pte) 62 if (!pte)
79 goto out; 63 goto out;
80 64
@@ -116,22 +100,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
116{ 100{
117 int err = -ENOMEM; 101 int err = -ENOMEM;
118 pte_t *pte; 102 pte_t *pte;
119 pmd_t *pmd;
120 pud_t *pud;
121 pgd_t *pgd;
122 pte_t pte_val; 103 pte_t pte_val;
123 spinlock_t *ptl; 104 spinlock_t *ptl;
124 105
125 BUG_ON(vma->vm_flags & VM_RESERVED); 106 pte = get_locked_pte(mm, addr, &ptl);
126
127 pgd = pgd_offset(mm, addr);
128 pud = pud_alloc(mm, pgd, addr);
129 if (!pud)
130 goto out;
131 pmd = pmd_alloc(mm, pud, addr);
132 if (!pmd)
133 goto out;
134 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
135 if (!pte) 107 if (!pte)
136 goto out; 108 goto out;
137 109
@@ -204,12 +176,10 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
204 * Make sure the vma is shared, that it supports prefaulting, 176 * Make sure the vma is shared, that it supports prefaulting,
205 * and that the remapped range is valid and fully within 177 * and that the remapped range is valid and fully within
206 * the single existing vma. vm_private_data is used as a 178 * the single existing vma. vm_private_data is used as a
207 * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED 179 * swapout cursor in a VM_NONLINEAR vma.
208 * or VM_LOCKED, but VM_LOCKED could be revoked later on).
209 */ 180 */
210 if (vma && (vma->vm_flags & VM_SHARED) && 181 if (vma && (vma->vm_flags & VM_SHARED) &&
211 (!vma->vm_private_data || 182 (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
212 (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
213 vma->vm_ops && vma->vm_ops->populate && 183 vma->vm_ops && vma->vm_ops->populate &&
214 end > start && start >= vma->vm_start && 184 end > start && start >= vma->vm_start &&
215 end <= vma->vm_end) { 185 end <= vma->vm_end) {
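
Both fremap hunks replace the open-coded pgd/pud/pmd walk with get_locked_pte(). A minimal sketch of the resulting calling pattern, wrapped in a hypothetical function:

/* hedged sketch of the get_locked_pte() calling convention */
static int example_touch_pte(struct mm_struct *mm, unsigned long addr)
{
        spinlock_t *ptl;
        pte_t *pte = get_locked_pte(mm, addr, &ptl); /* allocates pud/pmd/pte as needed */

        if (!pte)
                return -ENOMEM;
        /* ... inspect or install a pte here, under the pte lock ... */
        pte_unmap_unlock(pte, ptl);
        return 0;
}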
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 728e9bda12..b21d78c941 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,9 @@
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15#include <linux/cpuset.h>
16
14#include <asm/page.h> 17#include <asm/page.h>
15#include <asm/pgtable.h> 18#include <asm/pgtable.h>
16 19
@@ -22,6 +25,10 @@ unsigned long max_huge_pages;
22static struct list_head hugepage_freelists[MAX_NUMNODES]; 25static struct list_head hugepage_freelists[MAX_NUMNODES];
23static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 26static unsigned int nr_huge_pages_node[MAX_NUMNODES];
24static unsigned int free_huge_pages_node[MAX_NUMNODES]; 27static unsigned int free_huge_pages_node[MAX_NUMNODES];
28
29/*
30 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
31 */
25static DEFINE_SPINLOCK(hugetlb_lock); 32static DEFINE_SPINLOCK(hugetlb_lock);
26 33
27static void enqueue_huge_page(struct page *page) 34static void enqueue_huge_page(struct page *page)
@@ -32,18 +39,22 @@ static void enqueue_huge_page(struct page *page)
32 free_huge_pages_node[nid]++; 39 free_huge_pages_node[nid]++;
33} 40}
34 41
35static struct page *dequeue_huge_page(void) 42static struct page *dequeue_huge_page(struct vm_area_struct *vma,
43 unsigned long address)
36{ 44{
37 int nid = numa_node_id(); 45 int nid = numa_node_id();
38 struct page *page = NULL; 46 struct page *page = NULL;
47 struct zonelist *zonelist = huge_zonelist(vma, address);
48 struct zone **z;
39 49
40 if (list_empty(&hugepage_freelists[nid])) { 50 for (z = zonelist->zones; *z; z++) {
41 for (nid = 0; nid < MAX_NUMNODES; ++nid) 51 nid = (*z)->zone_pgdat->node_id;
42 if (!list_empty(&hugepage_freelists[nid])) 52 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
43 break; 53 !list_empty(&hugepage_freelists[nid]))
54 break;
44 } 55 }
45 if (nid >= 0 && nid < MAX_NUMNODES && 56
46 !list_empty(&hugepage_freelists[nid])) { 57 if (*z) {
47 page = list_entry(hugepage_freelists[nid].next, 58 page = list_entry(hugepage_freelists[nid].next,
48 struct page, lru); 59 struct page, lru);
49 list_del(&page->lru); 60 list_del(&page->lru);
@@ -61,8 +72,10 @@ static struct page *alloc_fresh_huge_page(void)
61 HUGETLB_PAGE_ORDER); 72 HUGETLB_PAGE_ORDER);
62 nid = (nid + 1) % num_online_nodes(); 73 nid = (nid + 1) % num_online_nodes();
63 if (page) { 74 if (page) {
75 spin_lock(&hugetlb_lock);
64 nr_huge_pages++; 76 nr_huge_pages++;
65 nr_huge_pages_node[page_to_nid(page)]++; 77 nr_huge_pages_node[page_to_nid(page)]++;
78 spin_unlock(&hugetlb_lock);
66 } 79 }
67 return page; 80 return page;
68} 81}
@@ -79,13 +92,13 @@ void free_huge_page(struct page *page)
79 spin_unlock(&hugetlb_lock); 92 spin_unlock(&hugetlb_lock);
80} 93}
81 94
82struct page *alloc_huge_page(void) 95struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
83{ 96{
84 struct page *page; 97 struct page *page;
85 int i; 98 int i;
86 99
87 spin_lock(&hugetlb_lock); 100 spin_lock(&hugetlb_lock);
88 page = dequeue_huge_page(); 101 page = dequeue_huge_page(vma, addr);
89 if (!page) { 102 if (!page) {
90 spin_unlock(&hugetlb_lock); 103 spin_unlock(&hugetlb_lock);
91 return NULL; 104 return NULL;
@@ -188,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
188 spin_lock(&hugetlb_lock); 201 spin_lock(&hugetlb_lock);
189 try_to_free_low(count); 202 try_to_free_low(count);
190 while (count < nr_huge_pages) { 203 while (count < nr_huge_pages) {
191 struct page *page = dequeue_huge_page(); 204 struct page *page = dequeue_huge_page(NULL, 0);
192 if (!page) 205 if (!page)
193 break; 206 break;
194 update_and_free_page(page); 207 update_and_free_page(page);
@@ -255,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
255 .nopage = hugetlb_nopage, 268 .nopage = hugetlb_nopage,
256}; 269};
257 270
258static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) 271static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
272 int writable)
259{ 273{
260 pte_t entry; 274 pte_t entry;
261 275
262 if (vma->vm_flags & VM_WRITE) { 276 if (writable) {
263 entry = 277 entry =
264 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 278 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
265 } else { 279 } else {
@@ -271,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
271 return entry; 285 return entry;
272} 286}
273 287
288static void set_huge_ptep_writable(struct vm_area_struct *vma,
289 unsigned long address, pte_t *ptep)
290{
291 pte_t entry;
292
293 entry = pte_mkwrite(pte_mkdirty(*ptep));
294 ptep_set_access_flags(vma, address, ptep, entry, 1);
295 update_mmu_cache(vma, address, entry);
296 lazy_mmu_prot_update(entry);
297}
298
299
274int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 300int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
275 struct vm_area_struct *vma) 301 struct vm_area_struct *vma)
276{ 302{
277 pte_t *src_pte, *dst_pte, entry; 303 pte_t *src_pte, *dst_pte, entry;
278 struct page *ptepage; 304 struct page *ptepage;
279 unsigned long addr; 305 unsigned long addr;
306 int cow;
307
308 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
280 309
281 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 310 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
282 src_pte = huge_pte_offset(src, addr); 311 src_pte = huge_pte_offset(src, addr);
@@ -288,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
288 spin_lock(&dst->page_table_lock); 317 spin_lock(&dst->page_table_lock);
289 spin_lock(&src->page_table_lock); 318 spin_lock(&src->page_table_lock);
290 if (!pte_none(*src_pte)) { 319 if (!pte_none(*src_pte)) {
320 if (cow)
321 ptep_set_wrprotect(src, addr, src_pte);
291 entry = *src_pte; 322 entry = *src_pte;
292 ptepage = pte_page(entry); 323 ptepage = pte_page(entry);
293 get_page(ptepage); 324 get_page(ptepage);
@@ -339,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
339 flush_tlb_range(vma, start, end); 370 flush_tlb_range(vma, start, end);
340} 371}
341 372
342static struct page *find_lock_huge_page(struct address_space *mapping, 373static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
343 unsigned long idx) 374 unsigned long address, pte_t *ptep, pte_t pte)
344{ 375{
345 struct page *page; 376 struct page *old_page, *new_page;
346 int err; 377 int i, avoidcopy;
347 struct inode *inode = mapping->host;
348 unsigned long size;
349 378
350retry: 379 old_page = pte_page(pte);
351 page = find_lock_page(mapping, idx);
352 if (page)
353 goto out;
354 380
355 /* Check to make sure the mapping hasn't been truncated */ 381 /* If no-one else is actually using this page, avoid the copy
356 size = i_size_read(inode) >> HPAGE_SHIFT; 382 * and just make the page writable */
357 if (idx >= size) 383 avoidcopy = (page_count(old_page) == 1);
358 goto out; 384 if (avoidcopy) {
385 set_huge_ptep_writable(vma, address, ptep);
386 return VM_FAULT_MINOR;
387 }
359 388
360 if (hugetlb_get_quota(mapping)) 389 page_cache_get(old_page);
361 goto out; 390 new_page = alloc_huge_page(vma, address);
362 page = alloc_huge_page(); 391
363 if (!page) { 392 if (!new_page) {
364 hugetlb_put_quota(mapping); 393 page_cache_release(old_page);
365 goto out; 394
395 /* Logically this is OOM, not a SIGBUS, but an OOM
396 * could cause the kernel to go killing other
397 * processes which won't help the hugepage situation
398 * at all (?) */
399 return VM_FAULT_SIGBUS;
366 } 400 }
367 401
368 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 402 spin_unlock(&mm->page_table_lock);
369 if (err) { 403 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
370 put_page(page); 404 copy_user_highpage(new_page + i, old_page + i,
371 hugetlb_put_quota(mapping); 405 address + i*PAGE_SIZE);
372 if (err == -EEXIST) 406 spin_lock(&mm->page_table_lock);
373 goto retry; 407
374 page = NULL; 408 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
409 if (likely(pte_same(*ptep, pte))) {
410 /* Break COW */
411 set_huge_pte_at(mm, address, ptep,
412 make_huge_pte(vma, new_page, 1));
413 /* Make the old page be freed below */
414 new_page = old_page;
375 } 415 }
376out: 416 page_cache_release(new_page);
377 return page; 417 page_cache_release(old_page);
418 return VM_FAULT_MINOR;
378} 419}
379 420
380int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 421int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
381 unsigned long address, int write_access) 422 unsigned long address, pte_t *ptep, int write_access)
382{ 423{
383 int ret = VM_FAULT_SIGBUS; 424 int ret = VM_FAULT_SIGBUS;
384 unsigned long idx; 425 unsigned long idx;
385 unsigned long size; 426 unsigned long size;
386 pte_t *pte;
387 struct page *page; 427 struct page *page;
388 struct address_space *mapping; 428 struct address_space *mapping;
389 429 pte_t new_pte;
390 pte = huge_pte_alloc(mm, address);
391 if (!pte)
392 goto out;
393 430
394 mapping = vma->vm_file->f_mapping; 431 mapping = vma->vm_file->f_mapping;
395 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 432 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -399,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
399 * Use page lock to guard against racing truncation 436 * Use page lock to guard against racing truncation
400 * before we get page_table_lock. 437 * before we get page_table_lock.
401 */ 438 */
402 page = find_lock_huge_page(mapping, idx); 439retry:
403 if (!page) 440 page = find_lock_page(mapping, idx);
404 goto out; 441 if (!page) {
442 if (hugetlb_get_quota(mapping))
443 goto out;
444 page = alloc_huge_page(vma, address);
445 if (!page) {
446 hugetlb_put_quota(mapping);
447 goto out;
448 }
449
450 if (vma->vm_flags & VM_SHARED) {
451 int err;
452
453 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
454 if (err) {
455 put_page(page);
456 hugetlb_put_quota(mapping);
457 if (err == -EEXIST)
458 goto retry;
459 goto out;
460 }
461 } else
462 lock_page(page);
463 }
405 464
406 spin_lock(&mm->page_table_lock); 465 spin_lock(&mm->page_table_lock);
407 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 466 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -409,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
409 goto backout; 468 goto backout;
410 469
411 ret = VM_FAULT_MINOR; 470 ret = VM_FAULT_MINOR;
412 if (!pte_none(*pte)) 471 if (!pte_none(*ptep))
413 goto backout; 472 goto backout;
414 473
415 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 474 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
416 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); 475 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
476 && (vma->vm_flags & VM_SHARED)));
477 set_huge_pte_at(mm, address, ptep, new_pte);
478
479 if (write_access && !(vma->vm_flags & VM_SHARED)) {
480 /* Optimization, do the COW without a second fault */
481 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
482 }
483
417 spin_unlock(&mm->page_table_lock); 484 spin_unlock(&mm->page_table_lock);
418 unlock_page(page); 485 unlock_page(page);
419out: 486out:
@@ -427,6 +494,33 @@ backout:
427 goto out; 494 goto out;
428} 495}
429 496
497int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
498 unsigned long address, int write_access)
499{
500 pte_t *ptep;
501 pte_t entry;
502 int ret;
503
504 ptep = huge_pte_alloc(mm, address);
505 if (!ptep)
506 return VM_FAULT_OOM;
507
508 entry = *ptep;
509 if (pte_none(entry))
510 return hugetlb_no_page(mm, vma, address, ptep, write_access);
511
512 ret = VM_FAULT_MINOR;
513
514 spin_lock(&mm->page_table_lock);
515 /* Check for a racing update before calling hugetlb_cow */
516 if (likely(pte_same(entry, *ptep)))
517 if (write_access && !pte_write(entry))
518 ret = hugetlb_cow(mm, vma, address, ptep, entry);
519 spin_unlock(&mm->page_table_lock);
520
521 return ret;
522}
523
430int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 524int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
431 struct page **pages, struct vm_area_struct **vmas, 525 struct page **pages, struct vm_area_struct **vmas,
432 unsigned long *position, int *length, int i) 526 unsigned long *position, int *length, int i)
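
The hugetlb changes above add copy-on-write for private huge-page mappings (hugetlb_cow() plus the write-protect on fork). A hedged userspace demonstration; the /mnt/huge mount point and the 2MB huge-page size are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE (2UL * 1024 * 1024)       /* assumed huge-page size */

int main(void)
{
        /* path is an assumption: any file on a mounted hugetlbfs will do */
        int fd = open("/mnt/huge/cowtest", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0)
                return 1;
        p = mmap(NULL, HPAGE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        /* write fault: with the COW support merged here, the process gets
         * its own private copy of the huge page */
        p[0] = 'x';
        printf("private hugepage write ok: %c\n", p[0]);
        munmap(p, HPAGE);
        close(fd);
        return 0;
}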
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb..17256bb2f4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12/* page_alloc.c */ 12static inline void set_page_refs(struct page *page, int order)
13extern void set_page_refs(struct page *page, int order); 13{
14#ifdef CONFIG_MMU
15 set_page_count(page, 1);
16#else
17 int i;
18
19 /*
20 * We need to reference all the pages for this order, otherwise if
21 * anyone accesses one of the pages with (get/put) it will be freed.
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27}
28
29extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 17aaf3e164..ae0ae3ea29 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
126 unsigned long start, unsigned long end) 126 unsigned long start, unsigned long end)
127{ 127{
128 *prev = vma; 128 *prev = vma;
129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) 129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
130 return -EINVAL; 130 return -EINVAL;
131 131
132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) { 132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
140 return 0; 140 return 0;
141} 141}
142 142
143/*
144 * Application wants to free up the pages and associated backing store.
145 * This is effectively punching a hole into the middle of a file.
146 *
147 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
148 * Other filesystems return -ENOSYS.
149 */
150static long madvise_remove(struct vm_area_struct *vma,
151 unsigned long start, unsigned long end)
152{
153 struct address_space *mapping;
154 loff_t offset, endoff;
155
156 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
157 return -EINVAL;
158
159 if (!vma->vm_file || !vma->vm_file->f_mapping
160 || !vma->vm_file->f_mapping->host) {
161 return -EINVAL;
162 }
163
164 mapping = vma->vm_file->f_mapping;
165
166 offset = (loff_t)(start - vma->vm_start)
167 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
168 endoff = (loff_t)(end - vma->vm_start - 1)
169 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
170 return vmtruncate_range(mapping->host, offset, endoff);
171}
172
143static long 173static long
144madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 174madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
145 unsigned long start, unsigned long end, int behavior) 175 unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
152 case MADV_RANDOM: 182 case MADV_RANDOM:
153 error = madvise_behavior(vma, prev, start, end, behavior); 183 error = madvise_behavior(vma, prev, start, end, behavior);
154 break; 184 break;
185 case MADV_REMOVE:
186 error = madvise_remove(vma, start, end);
187 break;
155 188
156 case MADV_WILLNEED: 189 case MADV_WILLNEED:
157 error = madvise_willneed(vma, prev, start, end); 190 error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
190 * some pages ahead. 223 * some pages ahead.
191 * MADV_DONTNEED - the application is finished with the given range, 224 * MADV_DONTNEED - the application is finished with the given range,
192 * so the kernel can free resources associated with it. 225 * so the kernel can free resources associated with it.
226 * MADV_REMOVE - the application wants to free up the given range of
227 * pages and associated backing store.
193 * 228 *
194 * return values: 229 * return values:
195 * zero - success 230 * zero - success
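
madvise_remove() above wires up MADV_REMOVE, which punches a hole in the backing store of a shared mapping (only shmfs/tmpfs at this point, other filesystems get -ENOSYS). A small userspace example; the /dev/shm path is an assumption:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4 * 4096;
        int fd = open("/dev/shm/madv_remove_demo", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0 || ftruncate(fd, len))
                return 1;
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        memset(p, 'x', len);                      /* populate the backing pages */
        if (madvise(p + 4096, 4096, MADV_REMOVE)) /* punch a one-page hole */
                perror("madvise(MADV_REMOVE)");
        printf("byte after hole punch: %d\n", p[4096]); /* hole reads back as 0 */
        munmap(p, len);
        close(fd);
        unlink("/dev/shm/madv_remove_demo");
        return 0;
}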
diff --git a/mm/memory.c b/mm/memory.c
index 0f60baf6f6..7a11ddd506 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
333} 333}
334 334
335/* 335/*
336 * This function is called to print an error when a pte in a 336 * This function is called to print an error when a bad pte
337 * !VM_RESERVED region is found pointing to an invalid pfn (which 337 * is found. For example, we might have a PFN-mapped pte in
338 * is an error. 338 * a region that doesn't allow it.
339 * 339 *
340 * The calling function must still handle the error. 340 * The calling function must still handle the error.
341 */ 341 */
@@ -349,6 +349,66 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
349 dump_stack(); 349 dump_stack();
350} 350}
351 351
352static inline int is_cow_mapping(unsigned int flags)
353{
354 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
355}
356
357/*
358 * This function gets the "struct page" associated with a pte.
359 *
360 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
361 * will have each page table entry just pointing to a raw page frame
362 * number, and as far as the VM layer is concerned, those do not have
363 * pages associated with them - even if the PFN might point to memory
364 * that otherwise is perfectly fine and has a "struct page".
365 *
366 * The way we recognize those mappings is through the rules set up
367 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
368 * and the vm_pgoff will point to the first PFN mapped: thus every
369 * page that is a raw mapping will always honor the rule
370 *
371 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
372 *
373 * and if that isn't true, the page has been COW'ed (in which case it
374 * _does_ have a "struct page" associated with it even if it is in a
375 * VM_PFNMAP range).
376 */
377struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
378{
379 unsigned long pfn = pte_pfn(pte);
380
381 if (vma->vm_flags & VM_PFNMAP) {
382 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
383 if (pfn == vma->vm_pgoff + off)
384 return NULL;
385 if (!is_cow_mapping(vma->vm_flags))
386 return NULL;
387 }
388
389 /*
390 * Add some anal sanity checks for now. Eventually,
391 * we should just do "return pfn_to_page(pfn)", but
392 * in the meantime we check that we get a valid pfn,
393 * and that the resulting page looks ok.
394 *
395 * Remove this test eventually!
396 */
397 if (unlikely(!pfn_valid(pfn))) {
398 print_bad_pte(vma, pte, addr);
399 return NULL;
400 }
401
402 /*
403 * NOTE! We still have PageReserved() pages in the page
404 * tables.
405 *
406 * The PAGE_ZERO() pages and various VDSO mappings can
407 * cause them to exist.
408 */
409 return pfn_to_page(pfn);
410}
411
352/* 412/*
353 * copy one vm_area from one task to the other. Assumes the page tables 413 * copy one vm_area from one task to the other. Assumes the page tables
354 * already present in the new task to be cleared in the whole range 414 * already present in the new task to be cleared in the whole range
@@ -363,7 +423,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
363 unsigned long vm_flags = vma->vm_flags; 423 unsigned long vm_flags = vma->vm_flags;
364 pte_t pte = *src_pte; 424 pte_t pte = *src_pte;
365 struct page *page; 425 struct page *page;
366 unsigned long pfn;
367 426
368 /* pte contains position in swap or file, so copy. */ 427 /* pte contains position in swap or file, so copy. */
369 if (unlikely(!pte_present(pte))) { 428 if (unlikely(!pte_present(pte))) {
@@ -381,28 +440,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
381 goto out_set_pte; 440 goto out_set_pte;
382 } 441 }
383 442
384 /* If the region is VM_RESERVED, the mapping is not
385 * mapped via rmap - duplicate the pte as is.
386 */
387 if (vm_flags & VM_RESERVED)
388 goto out_set_pte;
389
390 pfn = pte_pfn(pte);
391 /* If the pte points outside of valid memory but
392 * the region is not VM_RESERVED, we have a problem.
393 */
394 if (unlikely(!pfn_valid(pfn))) {
395 print_bad_pte(vma, pte, addr);
396 goto out_set_pte; /* try to do something sane */
397 }
398
399 page = pfn_to_page(pfn);
400
401 /* 443 /*
402 * If it's a COW mapping, write protect it both 444 * If it's a COW mapping, write protect it both
403 * in the parent and the child 445 * in the parent and the child
404 */ 446 */
405 if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { 447 if (is_cow_mapping(vm_flags)) {
406 ptep_set_wrprotect(src_mm, addr, src_pte); 448 ptep_set_wrprotect(src_mm, addr, src_pte);
407 pte = *src_pte; 449 pte = *src_pte;
408 } 450 }
@@ -414,9 +456,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
414 if (vm_flags & VM_SHARED) 456 if (vm_flags & VM_SHARED)
415 pte = pte_mkclean(pte); 457 pte = pte_mkclean(pte);
416 pte = pte_mkold(pte); 458 pte = pte_mkold(pte);
417 get_page(page); 459
418 page_dup_rmap(page); 460 page = vm_normal_page(vma, addr, pte);
419 rss[!!PageAnon(page)]++; 461 if (page) {
462 get_page(page);
463 page_dup_rmap(page);
464 rss[!!PageAnon(page)]++;
465 }
420 466
421out_set_pte: 467out_set_pte:
422 set_pte_at(dst_mm, addr, dst_pte, pte); 468 set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -528,7 +574,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
528 * readonly mappings. The tradeoff is that copy_page_range is more 574 * readonly mappings. The tradeoff is that copy_page_range is more
529 * efficient than faulting. 575 * efficient than faulting.
530 */ 576 */
531 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) { 577 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
532 if (!vma->anon_vma) 578 if (!vma->anon_vma)
533 return 0; 579 return 0;
534 } 580 }
@@ -549,10 +595,10 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
549 return 0; 595 return 0;
550} 596}
551 597
552static void zap_pte_range(struct mmu_gather *tlb, 598static unsigned long zap_pte_range(struct mmu_gather *tlb,
553 struct vm_area_struct *vma, pmd_t *pmd, 599 struct vm_area_struct *vma, pmd_t *pmd,
554 unsigned long addr, unsigned long end, 600 unsigned long addr, unsigned long end,
555 struct zap_details *details) 601 long *zap_work, struct zap_details *details)
556{ 602{
557 struct mm_struct *mm = tlb->mm; 603 struct mm_struct *mm = tlb->mm;
558 pte_t *pte; 604 pte_t *pte;
@@ -563,17 +609,16 @@ static void zap_pte_range(struct mmu_gather *tlb,
563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 609 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
564 do { 610 do {
565 pte_t ptent = *pte; 611 pte_t ptent = *pte;
566 if (pte_none(ptent)) 612 if (pte_none(ptent)) {
613 (*zap_work)--;
567 continue; 614 continue;
615 }
568 if (pte_present(ptent)) { 616 if (pte_present(ptent)) {
569 struct page *page = NULL; 617 struct page *page;
570 if (!(vma->vm_flags & VM_RESERVED)) { 618
571 unsigned long pfn = pte_pfn(ptent); 619 (*zap_work) -= PAGE_SIZE;
572 if (unlikely(!pfn_valid(pfn))) 620
573 print_bad_pte(vma, ptent, addr); 621 page = vm_normal_page(vma, addr, ptent);
574 else
575 page = pfn_to_page(pfn);
576 }
577 if (unlikely(details) && page) { 622 if (unlikely(details) && page) {
578 /* 623 /*
579 * unmap_shared_mapping_pages() wants to 624 * unmap_shared_mapping_pages() wants to
@@ -624,16 +669,18 @@ static void zap_pte_range(struct mmu_gather *tlb,
624 if (!pte_file(ptent)) 669 if (!pte_file(ptent))
625 free_swap_and_cache(pte_to_swp_entry(ptent)); 670 free_swap_and_cache(pte_to_swp_entry(ptent));
626 pte_clear_full(mm, addr, pte, tlb->fullmm); 671 pte_clear_full(mm, addr, pte, tlb->fullmm);
627 } while (pte++, addr += PAGE_SIZE, addr != end); 672 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
628 673
629 add_mm_rss(mm, file_rss, anon_rss); 674 add_mm_rss(mm, file_rss, anon_rss);
630 pte_unmap_unlock(pte - 1, ptl); 675 pte_unmap_unlock(pte - 1, ptl);
676
677 return addr;
631} 678}
632 679
633static inline void zap_pmd_range(struct mmu_gather *tlb, 680static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
634 struct vm_area_struct *vma, pud_t *pud, 681 struct vm_area_struct *vma, pud_t *pud,
635 unsigned long addr, unsigned long end, 682 unsigned long addr, unsigned long end,
636 struct zap_details *details) 683 long *zap_work, struct zap_details *details)
637{ 684{
638 pmd_t *pmd; 685 pmd_t *pmd;
639 unsigned long next; 686 unsigned long next;
@@ -641,16 +688,21 @@ static inline void zap_pmd_range(struct mmu_gather *tlb,
641 pmd = pmd_offset(pud, addr); 688 pmd = pmd_offset(pud, addr);
642 do { 689 do {
643 next = pmd_addr_end(addr, end); 690 next = pmd_addr_end(addr, end);
644 if (pmd_none_or_clear_bad(pmd)) 691 if (pmd_none_or_clear_bad(pmd)) {
692 (*zap_work)--;
645 continue; 693 continue;
646 zap_pte_range(tlb, vma, pmd, addr, next, details); 694 }
647 } while (pmd++, addr = next, addr != end); 695 next = zap_pte_range(tlb, vma, pmd, addr, next,
696 zap_work, details);
697 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
698
699 return addr;
648} 700}
649 701
650static inline void zap_pud_range(struct mmu_gather *tlb, 702static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
651 struct vm_area_struct *vma, pgd_t *pgd, 703 struct vm_area_struct *vma, pgd_t *pgd,
652 unsigned long addr, unsigned long end, 704 unsigned long addr, unsigned long end,
653 struct zap_details *details) 705 long *zap_work, struct zap_details *details)
654{ 706{
655 pud_t *pud; 707 pud_t *pud;
656 unsigned long next; 708 unsigned long next;
@@ -658,15 +710,21 @@ static inline void zap_pud_range(struct mmu_gather *tlb,
658 pud = pud_offset(pgd, addr); 710 pud = pud_offset(pgd, addr);
659 do { 711 do {
660 next = pud_addr_end(addr, end); 712 next = pud_addr_end(addr, end);
661 if (pud_none_or_clear_bad(pud)) 713 if (pud_none_or_clear_bad(pud)) {
714 (*zap_work)--;
662 continue; 715 continue;
663 zap_pmd_range(tlb, vma, pud, addr, next, details); 716 }
664 } while (pud++, addr = next, addr != end); 717 next = zap_pmd_range(tlb, vma, pud, addr, next,
718 zap_work, details);
719 } while (pud++, addr = next, (addr != end && *zap_work > 0));
720
721 return addr;
665} 722}
666 723
667static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 724static unsigned long unmap_page_range(struct mmu_gather *tlb,
725 struct vm_area_struct *vma,
668 unsigned long addr, unsigned long end, 726 unsigned long addr, unsigned long end,
669 struct zap_details *details) 727 long *zap_work, struct zap_details *details)
670{ 728{
671 pgd_t *pgd; 729 pgd_t *pgd;
672 unsigned long next; 730 unsigned long next;
@@ -679,11 +737,16 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
679 pgd = pgd_offset(vma->vm_mm, addr); 737 pgd = pgd_offset(vma->vm_mm, addr);
680 do { 738 do {
681 next = pgd_addr_end(addr, end); 739 next = pgd_addr_end(addr, end);
682 if (pgd_none_or_clear_bad(pgd)) 740 if (pgd_none_or_clear_bad(pgd)) {
741 (*zap_work)--;
683 continue; 742 continue;
684 zap_pud_range(tlb, vma, pgd, addr, next, details); 743 }
685 } while (pgd++, addr = next, addr != end); 744 next = zap_pud_range(tlb, vma, pgd, addr, next,
745 zap_work, details);
746 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
686 tlb_end_vma(tlb, vma); 747 tlb_end_vma(tlb, vma);
748
749 return addr;
687} 750}
688 751
689#ifdef CONFIG_PREEMPT 752#ifdef CONFIG_PREEMPT
@@ -724,7 +787,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
724 unsigned long end_addr, unsigned long *nr_accounted, 787 unsigned long end_addr, unsigned long *nr_accounted,
725 struct zap_details *details) 788 struct zap_details *details)
726{ 789{
727 unsigned long zap_bytes = ZAP_BLOCK_SIZE; 790 long zap_work = ZAP_BLOCK_SIZE;
728 unsigned long tlb_start = 0; /* For tlb_finish_mmu */ 791 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
729 int tlb_start_valid = 0; 792 int tlb_start_valid = 0;
730 unsigned long start = start_addr; 793 unsigned long start = start_addr;
@@ -745,27 +808,25 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
745 *nr_accounted += (end - start) >> PAGE_SHIFT; 808 *nr_accounted += (end - start) >> PAGE_SHIFT;
746 809
747 while (start != end) { 810 while (start != end) {
748 unsigned long block;
749
750 if (!tlb_start_valid) { 811 if (!tlb_start_valid) {
751 tlb_start = start; 812 tlb_start = start;
752 tlb_start_valid = 1; 813 tlb_start_valid = 1;
753 } 814 }
754 815
755 if (is_vm_hugetlb_page(vma)) { 816 if (unlikely(is_vm_hugetlb_page(vma))) {
756 block = end - start;
757 unmap_hugepage_range(vma, start, end); 817 unmap_hugepage_range(vma, start, end);
758 } else { 818 zap_work -= (end - start) /
759 block = min(zap_bytes, end - start); 819 (HPAGE_SIZE / PAGE_SIZE);
760 unmap_page_range(*tlbp, vma, start, 820 start = end;
761 start + block, details); 821 } else
822 start = unmap_page_range(*tlbp, vma,
823 start, end, &zap_work, details);
824
825 if (zap_work > 0) {
826 BUG_ON(start != end);
827 break;
762 } 828 }
763 829
764 start += block;
765 zap_bytes -= block;
766 if ((long)zap_bytes > 0)
767 continue;
768
769 tlb_finish_mmu(*tlbp, tlb_start, start); 830 tlb_finish_mmu(*tlbp, tlb_start, start);
770 831
771 if (need_resched() || 832 if (need_resched() ||
@@ -779,7 +840,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
779 840
780 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); 841 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
781 tlb_start_valid = 0; 842 tlb_start_valid = 0;
782 zap_bytes = ZAP_BLOCK_SIZE; 843 zap_work = ZAP_BLOCK_SIZE;
783 } 844 }
784 } 845 }
785out: 846out:
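The old zap_bytes byte counter becomes a signed zap_work budget: a present pte is charged PAGE_SIZE, an empty slot or skipped page table only 1, so sparse address ranges no longer hold off the TLB flush and reschedule point for a full block's worth of latency. A minimal sketch of that accounting pattern, with the actual teardown elided and the function name invented:

static unsigned long zap_budget_sketch(pte_t *pte, unsigned long addr,
                                       unsigned long end, long *zap_work)
{
        do {
                if (pte_none(*pte))
                        (*zap_work)--;          /* cheap: nothing to tear down */
                else
                        *zap_work -= PAGE_SIZE; /* expensive: a real page */
                /* ... actual pte teardown elided ... */
        } while (pte++, addr += PAGE_SIZE, addr != end && *zap_work > 0);

        return addr;    /* caller resumes from here once the budget refills */
}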
@@ -813,7 +874,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
813/* 874/*
814 * Do a quick page-table lookup for a single page. 875 * Do a quick page-table lookup for a single page.
815 */ 876 */
816struct page *follow_page(struct mm_struct *mm, unsigned long address, 877struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
817 unsigned int flags) 878 unsigned int flags)
818{ 879{
819 pgd_t *pgd; 880 pgd_t *pgd;
@@ -821,8 +882,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
821 pmd_t *pmd; 882 pmd_t *pmd;
822 pte_t *ptep, pte; 883 pte_t *ptep, pte;
823 spinlock_t *ptl; 884 spinlock_t *ptl;
824 unsigned long pfn;
825 struct page *page; 885 struct page *page;
886 struct mm_struct *mm = vma->vm_mm;
826 887
827 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 888 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
828 if (!IS_ERR(page)) { 889 if (!IS_ERR(page)) {
@@ -858,11 +919,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
858 goto unlock; 919 goto unlock;
859 if ((flags & FOLL_WRITE) && !pte_write(pte)) 920 if ((flags & FOLL_WRITE) && !pte_write(pte))
860 goto unlock; 921 goto unlock;
861 pfn = pte_pfn(pte); 922 page = vm_normal_page(vma, address, pte);
862 if (!pfn_valid(pfn)) 923 if (unlikely(!page))
863 goto unlock; 924 goto unlock;
864 925
865 page = pfn_to_page(pfn);
866 if (flags & FOLL_GET) 926 if (flags & FOLL_GET)
867 get_page(page); 927 get_page(page);
868 if (flags & FOLL_TOUCH) { 928 if (flags & FOLL_TOUCH) {
@@ -935,8 +995,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
935 return i ? : -EFAULT; 995 return i ? : -EFAULT;
936 } 996 }
937 if (pages) { 997 if (pages) {
938 pages[i] = pte_page(*pte); 998 struct page *page = vm_normal_page(gate_vma, start, *pte);
939 get_page(pages[i]); 999 pages[i] = page;
1000 if (page)
1001 get_page(page);
940 } 1002 }
941 pte_unmap(pte); 1003 pte_unmap(pte);
942 if (vmas) 1004 if (vmas)
@@ -947,7 +1009,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
947 continue; 1009 continue;
948 } 1010 }
949 1011
950 if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) 1012 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
951 || !(vm_flags & vma->vm_flags)) 1013 || !(vm_flags & vma->vm_flags))
952 return i ? : -EFAULT; 1014 return i ? : -EFAULT;
953 1015
@@ -971,7 +1033,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
971 foll_flags |= FOLL_WRITE; 1033 foll_flags |= FOLL_WRITE;
972 1034
973 cond_resched(); 1035 cond_resched();
974 while (!(page = follow_page(mm, start, foll_flags))) { 1036 while (!(page = follow_page(vma, start, foll_flags))) {
975 int ret; 1037 int ret;
976 ret = __handle_mm_fault(mm, vma, start, 1038 ret = __handle_mm_fault(mm, vma, start,
977 foll_flags & FOLL_WRITE); 1039 foll_flags & FOLL_WRITE);
@@ -1091,6 +1153,86 @@ int zeromap_page_range(struct vm_area_struct *vma,
1091 return err; 1153 return err;
1092} 1154}
1093 1155
1156pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1157{
1158 pgd_t * pgd = pgd_offset(mm, addr);
1159 pud_t * pud = pud_alloc(mm, pgd, addr);
1160 if (pud) {
1161 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1162 if (pmd)
1163 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1164 }
1165 return NULL;
1166}
1167
1168/*
1169 * This is the old fallback for page remapping.
1170 *
1171 * For historical reasons, it only allows reserved pages. Only
1172 * old drivers should use this, and they needed to mark their
1173 * pages reserved for the old functions anyway.
1174 */
1175static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
1176{
1177 int retval;
1178 pte_t *pte;
1179 spinlock_t *ptl;
1180
1181 retval = -EINVAL;
1182 if (PageAnon(page))
1183 goto out;
1184 retval = -ENOMEM;
1185 flush_dcache_page(page);
1186 pte = get_locked_pte(mm, addr, &ptl);
1187 if (!pte)
1188 goto out;
1189 retval = -EBUSY;
1190 if (!pte_none(*pte))
1191 goto out_unlock;
1192
1193 /* Ok, finally just insert the thing.. */
1194 get_page(page);
1195 inc_mm_counter(mm, file_rss);
1196 page_add_file_rmap(page);
1197 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1198
1199 retval = 0;
1200out_unlock:
1201 pte_unmap_unlock(pte, ptl);
1202out:
1203 return retval;
1204}
1205
1206/*
1207 * This allows drivers to insert individual pages they've allocated
1208 * into a user vma.
1209 *
1210 * The page has to be a nice clean _individual_ kernel allocation.
1211 * If you allocate a compound page, you need to have marked it as
1212 * such (__GFP_COMP), or manually just split the page up yourself
1213 * (which is mainly an issue of doing "set_page_count(page, 1)" for
1214 * each sub-page, and then freeing them one by one when you free
1215 * them rather than freeing it as a compound page).
1216 *
1217 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1218 * took an arbitrary page protection parameter. This doesn't allow
1219 * that. Your vma protection will have to be set up correctly, which
1220 * means that if you want a shared writable mapping, you'd better
1221 * ask for a shared writable mapping!
1222 *
1223 * The page does not need to be reserved.
1224 */
1225int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
1226{
1227 if (addr < vma->vm_start || addr >= vma->vm_end)
1228 return -EFAULT;
1229 if (!page_count(page))
1230 return -EINVAL;
1231 vma->vm_flags |= VM_INSERTPAGE;
1232 return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
1233}
1234EXPORT_SYMBOL(vm_insert_page);
1235
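A hypothetical driver using the new call; only vm_insert_page() itself comes from this patch, the mydrv_* names and the single preallocated page are invented for illustration. The point of the interface is that the page no longer has to be marked reserved before mapping it into userspace:

static struct page *mydrv_page; /* allocated elsewhere with alloc_page() */

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (vma->vm_end - vma->vm_start != PAGE_SIZE)
                return -EINVAL;

        /* no SetPageReserved()/remap_pfn_range() dance needed any more */
        return vm_insert_page(vma, vma->vm_start, mydrv_page);
}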
1094/* 1236/*
1095 * maps a range of physical memory into the requested pages. the old 1237 * maps a range of physical memory into the requested pages. the old
1096 * mappings are removed. any references to nonexistent pages results 1238 * mappings are removed. any references to nonexistent pages results
@@ -1170,10 +1312,26 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1170 * rest of the world about it: 1312 * rest of the world about it:
1171 * VM_IO tells people not to look at these pages 1313 * VM_IO tells people not to look at these pages
1172 * (accesses can have side effects). 1314 * (accesses can have side effects).
1173 * VM_RESERVED tells the core MM not to "manage" these pages 1315 * VM_RESERVED is specified all over the place, because
1174 * (e.g. refcount, mapcount, try to swap them out). 1316 * in 2.4 it kept swapout's vma scan off this vma; but
1317 * in 2.6 the LRU scan won't even find its pages, so this
1318 * flag means no more than count its pages in reserved_vm,
1319 * and omit it from core dump, even when VM_IO turned off.
1320 * VM_PFNMAP tells the core MM that the base pages are just
1321 * raw PFN mappings, and do not have a "struct page" associated
1322 * with them.
1323 *
1324 * There's a horrible special case to handle copy-on-write
1325 * behaviour that some programs depend on. We mark the "original"
1326 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1175 */ 1327 */
1176 vma->vm_flags |= VM_IO | VM_RESERVED; 1328 if (is_cow_mapping(vma->vm_flags)) {
1329 if (addr != vma->vm_start || end != vma->vm_end)
1330 return -EINVAL;
1331 vma->vm_pgoff = pfn;
1332 }
1333
1334 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1177 1335
1178 BUG_ON(addr >= end); 1336 BUG_ON(addr >= end);
1179 pfn -= addr >> PAGE_SHIFT; 1337 pfn -= addr >> PAGE_SHIFT;
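With vm_pgoff recording the base pfn of a COW-able PFN mapping, the rest of the mm can distinguish an original linearly-mapped pte from a COWed anonymous copy without consulting any struct page. The real test lives in vm_normal_page() earlier in this patch; the helper below is only an assumption-laden sketch of the idea, with an invented name:

static inline int pte_matches_linear_pfnmap(struct vm_area_struct *vma,
                                            unsigned long addr, pte_t pte)
{
        unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

        /* true: still the driver's raw pfn; false: probably a COWed copy */
        return (vma->vm_flags & VM_PFNMAP) &&
                pte_pfn(pte) == vma->vm_pgoff + off;
}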
@@ -1228,6 +1386,33 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1228 return pte; 1386 return pte;
1229} 1387}
1230 1388
1389static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
1390{
1391 /*
1392 * If the source page was a PFN mapping, we don't have
1393 * a "struct page" for it. We do a best-effort copy by
1394 * just copying from the original user address. If that
1395 * fails, we just zero-fill it. Live with it.
1396 */
1397 if (unlikely(!src)) {
1398 void *kaddr = kmap_atomic(dst, KM_USER0);
1399 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1400
1401 /*
1402 * This really shouldn't fail, because the page is there
1403 * in the page tables. But it might just be unreadable,
1404 * in which case we just give up and fill the result with
1405 * zeroes.
1406 */
1407 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1408 memset(kaddr, 0, PAGE_SIZE);
1409 kunmap_atomic(kaddr, KM_USER0);
1410 return;
1411
1412 }
1413 copy_user_highpage(dst, src, va);
1414}
1415
1231/* 1416/*
1232 * This routine handles present pages, when users try to write 1417 * This routine handles present pages, when users try to write
1233 * to a shared page. It is done by copying the page to a new address 1418 * to a shared page. It is done by copying the page to a new address
@@ -1251,27 +1436,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1251 spinlock_t *ptl, pte_t orig_pte) 1436 spinlock_t *ptl, pte_t orig_pte)
1252{ 1437{
1253 struct page *old_page, *new_page; 1438 struct page *old_page, *new_page;
1254 unsigned long pfn = pte_pfn(orig_pte);
1255 pte_t entry; 1439 pte_t entry;
1256 int ret = VM_FAULT_MINOR; 1440 int ret = VM_FAULT_MINOR;
1257 1441
1258 BUG_ON(vma->vm_flags & VM_RESERVED); 1442 old_page = vm_normal_page(vma, address, orig_pte);
1259 1443 if (!old_page)
1260 if (unlikely(!pfn_valid(pfn))) { 1444 goto gotten;
1261 /*
1262 * Page table corrupted: show pte and kill process.
1263 */
1264 print_bad_pte(vma, orig_pte, address);
1265 ret = VM_FAULT_OOM;
1266 goto unlock;
1267 }
1268 old_page = pfn_to_page(pfn);
1269 1445
1270 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { 1446 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1271 int reuse = can_share_swap_page(old_page); 1447 int reuse = can_share_swap_page(old_page);
1272 unlock_page(old_page); 1448 unlock_page(old_page);
1273 if (reuse) { 1449 if (reuse) {
1274 flush_cache_page(vma, address, pfn); 1450 flush_cache_page(vma, address, pte_pfn(orig_pte));
1275 entry = pte_mkyoung(orig_pte); 1451 entry = pte_mkyoung(orig_pte);
1276 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1452 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1277 ptep_set_access_flags(vma, address, page_table, entry, 1); 1453 ptep_set_access_flags(vma, address, page_table, entry, 1);
@@ -1286,6 +1462,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1286 * Ok, we need to copy. Oh, well.. 1462 * Ok, we need to copy. Oh, well..
1287 */ 1463 */
1288 page_cache_get(old_page); 1464 page_cache_get(old_page);
1465gotten:
1289 pte_unmap_unlock(page_table, ptl); 1466 pte_unmap_unlock(page_table, ptl);
1290 1467
1291 if (unlikely(anon_vma_prepare(vma))) 1468 if (unlikely(anon_vma_prepare(vma)))
@@ -1298,7 +1475,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1298 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); 1475 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1299 if (!new_page) 1476 if (!new_page)
1300 goto oom; 1477 goto oom;
1301 copy_user_highpage(new_page, old_page, address); 1478 cow_user_page(new_page, old_page, address);
1302 } 1479 }
1303 1480
1304 /* 1481 /*
@@ -1306,31 +1483,37 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1306 */ 1483 */
1307 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 1484 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1308 if (likely(pte_same(*page_table, orig_pte))) { 1485 if (likely(pte_same(*page_table, orig_pte))) {
1309 page_remove_rmap(old_page); 1486 if (old_page) {
1310 if (!PageAnon(old_page)) { 1487 page_remove_rmap(old_page);
1488 if (!PageAnon(old_page)) {
1489 dec_mm_counter(mm, file_rss);
1490 inc_mm_counter(mm, anon_rss);
1491 }
1492 } else
1311 inc_mm_counter(mm, anon_rss); 1493 inc_mm_counter(mm, anon_rss);
1312 dec_mm_counter(mm, file_rss); 1494 flush_cache_page(vma, address, pte_pfn(orig_pte));
1313 }
1314 flush_cache_page(vma, address, pfn);
1315 entry = mk_pte(new_page, vma->vm_page_prot); 1495 entry = mk_pte(new_page, vma->vm_page_prot);
1316 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1496 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1317 ptep_establish(vma, address, page_table, entry); 1497 ptep_establish(vma, address, page_table, entry);
1318 update_mmu_cache(vma, address, entry); 1498 update_mmu_cache(vma, address, entry);
1319 lazy_mmu_prot_update(entry); 1499 lazy_mmu_prot_update(entry);
1320 lru_cache_add_active(new_page); 1500 lru_cache_add_active(new_page);
1321 page_add_anon_rmap(new_page, vma, address); 1501 page_add_new_anon_rmap(new_page, vma, address);
1322 1502
1323 /* Free the old page.. */ 1503 /* Free the old page.. */
1324 new_page = old_page; 1504 new_page = old_page;
1325 ret |= VM_FAULT_WRITE; 1505 ret |= VM_FAULT_WRITE;
1326 } 1506 }
1327 page_cache_release(new_page); 1507 if (new_page)
1328 page_cache_release(old_page); 1508 page_cache_release(new_page);
1509 if (old_page)
1510 page_cache_release(old_page);
1329unlock: 1511unlock:
1330 pte_unmap_unlock(page_table, ptl); 1512 pte_unmap_unlock(page_table, ptl);
1331 return ret; 1513 return ret;
1332oom: 1514oom:
1333 page_cache_release(old_page); 1515 if (old_page)
1516 page_cache_release(old_page);
1334 return VM_FAULT_OOM; 1517 return VM_FAULT_OOM;
1335} 1518}
1336 1519
@@ -1587,9 +1770,32 @@ out_big:
1587out_busy: 1770out_busy:
1588 return -ETXTBSY; 1771 return -ETXTBSY;
1589} 1772}
1590
1591EXPORT_SYMBOL(vmtruncate); 1773EXPORT_SYMBOL(vmtruncate);
1592 1774
1775int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1776{
1777 struct address_space *mapping = inode->i_mapping;
1778
1779 /*
1780 * If the underlying filesystem is not going to provide
1781 * a way to truncate a range of blocks (punch a hole) -
1782 * we should return failure right now.
1783 */
1784 if (!inode->i_op || !inode->i_op->truncate_range)
1785 return -ENOSYS;
1786
1787 mutex_lock(&inode->i_mutex);
1788 down_write(&inode->i_alloc_sem);
1789 unmap_mapping_range(mapping, offset, (end - offset), 1);
1790 truncate_inode_pages_range(mapping, offset, end);
1791 inode->i_op->truncate_range(inode, offset, end);
1792 up_write(&inode->i_alloc_sem);
1793 mutex_unlock(&inode->i_mutex);
1794
1795 return 0;
1796}
1797EXPORT_SYMBOL(vmtruncate_range);
1798
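vmtruncate_range() only clears the page cache and page tables over the hole; releasing the underlying blocks is delegated to the new ->truncate_range inode operation, which is called with i_mutex and i_alloc_sem held. A hypothetical filesystem would wire it up roughly as below; the myfs_* names are placeholders, with shmem/tmpfs the likely first real user:

/* Free the backing blocks covering [start, end]; filesystem-specific. */
static void myfs_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
        /* punch the hole in the on-disk or swap-backed data here */
}

static struct inode_operations myfs_file_inode_operations = {
        .truncate_range = myfs_truncate_range,
};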
1593/* 1799/*
1594 * Primitive swap readahead code. We simply read an aligned block of 1800 * Primitive swap readahead code. We simply read an aligned block of
1595 * (1 << page_cluster) entries in the swap area. This method is chosen 1801 * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1771,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1771 goto release; 1977 goto release;
1772 inc_mm_counter(mm, anon_rss); 1978 inc_mm_counter(mm, anon_rss);
1773 lru_cache_add_active(page); 1979 lru_cache_add_active(page);
1774 SetPageReferenced(page); 1980 page_add_new_anon_rmap(page, vma, address);
1775 page_add_anon_rmap(page, vma, address);
1776 } else { 1981 } else {
1777 /* Map the ZERO_PAGE - vm_page_prot is readonly */ 1982 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1778 page = ZERO_PAGE(address); 1983 page = ZERO_PAGE(address);
@@ -1828,6 +2033,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1828 int anon = 0; 2033 int anon = 0;
1829 2034
1830 pte_unmap(page_table); 2035 pte_unmap(page_table);
2036 BUG_ON(vma->vm_flags & VM_PFNMAP);
1831 2037
1832 if (vma->vm_file) { 2038 if (vma->vm_file) {
1833 mapping = vma->vm_file->f_mapping; 2039 mapping = vma->vm_file->f_mapping;
@@ -1902,8 +2108,8 @@ retry:
1902 if (anon) { 2108 if (anon) {
1903 inc_mm_counter(mm, anon_rss); 2109 inc_mm_counter(mm, anon_rss);
1904 lru_cache_add_active(new_page); 2110 lru_cache_add_active(new_page);
1905 page_add_anon_rmap(new_page, vma, address); 2111 page_add_new_anon_rmap(new_page, vma, address);
1906 } else if (!(vma->vm_flags & VM_RESERVED)) { 2112 } else {
1907 inc_mm_counter(mm, file_rss); 2113 inc_mm_counter(mm, file_rss);
1908 page_add_file_rmap(new_page); 2114 page_add_file_rmap(new_page);
1909 } 2115 }
@@ -2061,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2061 return handle_pte_fault(mm, vma, address, pte, pmd, write_access); 2267 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2062} 2268}
2063 2269
2270EXPORT_SYMBOL_GPL(__handle_mm_fault);
2271
2064#ifndef __PAGETABLE_PUD_FOLDED 2272#ifndef __PAGETABLE_PUD_FOLDED
2065/* 2273/*
2066 * Allocate page upper directory. 2274 * Allocate page upper directory.
@@ -2080,6 +2288,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2080 spin_unlock(&mm->page_table_lock); 2288 spin_unlock(&mm->page_table_lock);
2081 return 0; 2289 return 0;
2082} 2290}
2291#else
2292/* Workaround for gcc 2.96 */
2293int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2294{
2295 return 0;
2296}
2083#endif /* __PAGETABLE_PUD_FOLDED */ 2297#endif /* __PAGETABLE_PUD_FOLDED */
2084 2298
2085#ifndef __PAGETABLE_PMD_FOLDED 2299#ifndef __PAGETABLE_PMD_FOLDED
@@ -2108,6 +2322,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2108 spin_unlock(&mm->page_table_lock); 2322 spin_unlock(&mm->page_table_lock);
2109 return 0; 2323 return 0;
2110} 2324}
2325#else
2326/* Workaround for gcc 2.96 */
2327int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2328{
2329 return 0;
2330}
2111#endif /* __PAGETABLE_PMD_FOLDED */ 2331#endif /* __PAGETABLE_PMD_FOLDED */
2112 2332
2113int make_pages_present(unsigned long addr, unsigned long end) 2333int make_pages_present(unsigned long addr, unsigned long end)
@@ -2182,7 +2402,7 @@ static int __init gate_vma_init(void)
2182 gate_vma.vm_start = FIXADDR_USER_START; 2402 gate_vma.vm_start = FIXADDR_USER_START;
2183 gate_vma.vm_end = FIXADDR_USER_END; 2403 gate_vma.vm_end = FIXADDR_USER_END;
2184 gate_vma.vm_page_prot = PAGE_READONLY; 2404 gate_vma.vm_page_prot = PAGE_READONLY;
2185 gate_vma.vm_flags = VM_RESERVED; 2405 gate_vma.vm_flags = 0;
2186 return 0; 2406 return 0;
2187} 2407}
2188__initcall(gate_vma_init); 2408__initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431a64f021..a918f77f02 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages); 42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{ 44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION; 45 int nr_pages = PAGES_PER_SECTION;
47 int ret; 46 int ret;
48 47
@@ -104,7 +103,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
104 pgdat->node_start_pfn = start_pfn; 103 pgdat->node_start_pfn = start_pfn;
105 104
106 if (end_pfn > old_pgdat_end_pfn) 105 if (end_pfn > old_pgdat_end_pfn)
107 pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; 106 pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn;
108} 107}
109 108
110int online_pages(unsigned long pfn, unsigned long nr_pages) 109int online_pages(unsigned long pfn, unsigned long nr_pages)
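The one-line change above fixes the arithmetic used when growing a node's span. With illustrative numbers: a node starting at pfn 0x10000 and currently spanning 0x8000 pages (so ending at 0x18000) that is grown to end_pfn 0x20000 must record

        node_spanned_pages = end_pfn - node_start_pfn = 0x20000 - 0x10000 = 0x10000 pages,

whereas the old expression end_pfn - node_spanned_pages = 0x20000 - 0x8000 = 0x18000 would have inflated the span.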
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5abc57c2b8..73790188b0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
83#include <linux/init.h> 83#include <linux/init.h>
84#include <linux/compat.h> 84#include <linux/compat.h>
85#include <linux/mempolicy.h> 85#include <linux/mempolicy.h>
86#include <linux/swap.h>
87#include <linux/seq_file.h>
88#include <linux/proc_fs.h>
89
86#include <asm/tlbflush.h> 90#include <asm/tlbflush.h>
87#include <asm/uaccess.h> 91#include <asm/uaccess.h>
88 92
93/* Internal flags */
94#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97
89static kmem_cache_t *policy_cache; 98static kmem_cache_t *policy_cache;
90static kmem_cache_t *sn_cache; 99static kmem_cache_t *sn_cache;
91 100
@@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache;
93 102
94/* Highest zone. An specific allocation for a zone below that is not 103/* Highest zone. An specific allocation for a zone below that is not
95 policied. */ 104 policied. */
96static int policy_zone; 105int policy_zone = ZONE_DMA;
97 106
98struct mempolicy default_policy = { 107struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */ 108 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 if (!zl) 140 if (!zl)
132 return NULL; 141 return NULL;
133 num = 0; 142 num = 0;
134 for_each_node_mask(nd, *nodes) { 143 for_each_node_mask(nd, *nodes)
135 int k; 144 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL; 145 zl->zones[num] = NULL;
146 return zl; 146 return zl;
147} 147}
@@ -161,6 +161,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
161 switch (mode) { 161 switch (mode) {
162 case MPOL_INTERLEAVE: 162 case MPOL_INTERLEAVE:
163 policy->v.nodes = *nodes; 163 policy->v.nodes = *nodes;
164 if (nodes_weight(*nodes) == 0) {
165 kmem_cache_free(policy_cache, policy);
166 return ERR_PTR(-EINVAL);
167 }
164 break; 168 break;
165 case MPOL_PREFERRED: 169 case MPOL_PREFERRED:
166 policy->v.preferred_node = first_node(*nodes); 170 policy->v.preferred_node = first_node(*nodes);
@@ -176,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
176 break; 180 break;
177 } 181 }
178 policy->policy = mode; 182 policy->policy = mode;
183 policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
179 return policy; 184 return policy;
180} 185}
181 186
182/* Ensure all existing pages follow the policy. */ 187static void gather_stats(struct page *, void *);
188static void migrate_page_add(struct page *page, struct list_head *pagelist,
189 unsigned long flags);
190
191/* Scan through pages checking if pages follow certain conditions. */
183static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 192static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 unsigned long addr, unsigned long end, nodemask_t *nodes) 193 unsigned long addr, unsigned long end,
194 const nodemask_t *nodes, unsigned long flags,
195 void *private)
185{ 196{
186 pte_t *orig_pte; 197 pte_t *orig_pte;
187 pte_t *pte; 198 pte_t *pte;
@@ -189,18 +200,36 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
189 200
190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 201 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191 do { 202 do {
192 unsigned long pfn; 203 struct page *page;
193 unsigned int nid; 204 unsigned int nid;
194 205
195 if (!pte_present(*pte)) 206 if (!pte_present(*pte))
196 continue; 207 continue;
197 pfn = pte_pfn(*pte); 208 page = vm_normal_page(vma, addr, *pte);
198 if (!pfn_valid(pfn)) { 209 if (!page)
199 print_bad_pte(vma, *pte, addr);
200 continue; 210 continue;
201 } 211 /*
202 nid = pfn_to_nid(pfn); 212 * The check for PageReserved here is important to avoid
203 if (!node_isset(nid, *nodes)) 213 * handling zero pages and other pages that may have been
214 * marked special by the system.
215 *
216 * If PageReserved were not checked here then, e.g.,
217 * the location of the zero page could have an influence
218 * on MPOL_MF_STRICT, zero pages would be counted for
219 * the per node stats, and there would be useless attempts
220 * to put zero pages on the migration list.
221 */
222 if (PageReserved(page))
223 continue;
224 nid = page_to_nid(page);
225 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
226 continue;
227
228 if (flags & MPOL_MF_STATS)
229 gather_stats(page, private);
230 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
231 migrate_page_add(page, private, flags);
232 else
204 break; 233 break;
205 } while (pte++, addr += PAGE_SIZE, addr != end); 234 } while (pte++, addr += PAGE_SIZE, addr != end);
206 pte_unmap_unlock(orig_pte, ptl); 235 pte_unmap_unlock(orig_pte, ptl);
@@ -208,7 +237,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
208} 237}
209 238
210static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 239static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
211 unsigned long addr, unsigned long end, nodemask_t *nodes) 240 unsigned long addr, unsigned long end,
241 const nodemask_t *nodes, unsigned long flags,
242 void *private)
212{ 243{
213 pmd_t *pmd; 244 pmd_t *pmd;
214 unsigned long next; 245 unsigned long next;
@@ -218,14 +249,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
218 next = pmd_addr_end(addr, end); 249 next = pmd_addr_end(addr, end);
219 if (pmd_none_or_clear_bad(pmd)) 250 if (pmd_none_or_clear_bad(pmd))
220 continue; 251 continue;
221 if (check_pte_range(vma, pmd, addr, next, nodes)) 252 if (check_pte_range(vma, pmd, addr, next, nodes,
253 flags, private))
222 return -EIO; 254 return -EIO;
223 } while (pmd++, addr = next, addr != end); 255 } while (pmd++, addr = next, addr != end);
224 return 0; 256 return 0;
225} 257}
226 258
227static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 259static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
228 unsigned long addr, unsigned long end, nodemask_t *nodes) 260 unsigned long addr, unsigned long end,
261 const nodemask_t *nodes, unsigned long flags,
262 void *private)
229{ 263{
230 pud_t *pud; 264 pud_t *pud;
231 unsigned long next; 265 unsigned long next;
@@ -235,14 +269,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
235 next = pud_addr_end(addr, end); 269 next = pud_addr_end(addr, end);
236 if (pud_none_or_clear_bad(pud)) 270 if (pud_none_or_clear_bad(pud))
237 continue; 271 continue;
238 if (check_pmd_range(vma, pud, addr, next, nodes)) 272 if (check_pmd_range(vma, pud, addr, next, nodes,
273 flags, private))
239 return -EIO; 274 return -EIO;
240 } while (pud++, addr = next, addr != end); 275 } while (pud++, addr = next, addr != end);
241 return 0; 276 return 0;
242} 277}
243 278
244static inline int check_pgd_range(struct vm_area_struct *vma, 279static inline int check_pgd_range(struct vm_area_struct *vma,
245 unsigned long addr, unsigned long end, nodemask_t *nodes) 280 unsigned long addr, unsigned long end,
281 const nodemask_t *nodes, unsigned long flags,
282 void *private)
246{ 283{
247 pgd_t *pgd; 284 pgd_t *pgd;
248 unsigned long next; 285 unsigned long next;
@@ -252,38 +289,61 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
252 next = pgd_addr_end(addr, end); 289 next = pgd_addr_end(addr, end);
253 if (pgd_none_or_clear_bad(pgd)) 290 if (pgd_none_or_clear_bad(pgd))
254 continue; 291 continue;
255 if (check_pud_range(vma, pgd, addr, next, nodes)) 292 if (check_pud_range(vma, pgd, addr, next, nodes,
293 flags, private))
256 return -EIO; 294 return -EIO;
257 } while (pgd++, addr = next, addr != end); 295 } while (pgd++, addr = next, addr != end);
258 return 0; 296 return 0;
259} 297}
260 298
261/* Step 1: check the range */ 299/* Check if a vma is migratable */
300static inline int vma_migratable(struct vm_area_struct *vma)
301{
302 if (vma->vm_flags & (
303 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
304 return 0;
305 return 1;
306}
307
308/*
309 * Check if all pages in a range are on a set of nodes.
310 * If pagelist != NULL then isolate pages from the LRU and
311 * put them on the pagelist.
312 */
262static struct vm_area_struct * 313static struct vm_area_struct *
263check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 314check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
264 nodemask_t *nodes, unsigned long flags) 315 const nodemask_t *nodes, unsigned long flags, void *private)
265{ 316{
266 int err; 317 int err;
267 struct vm_area_struct *first, *vma, *prev; 318 struct vm_area_struct *first, *vma, *prev;
268 319
320 /* Clear the LRU lists so pages can be isolated */
321 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
322 lru_add_drain_all();
323
269 first = find_vma(mm, start); 324 first = find_vma(mm, start);
270 if (!first) 325 if (!first)
271 return ERR_PTR(-EFAULT); 326 return ERR_PTR(-EFAULT);
272 if (first->vm_flags & VM_RESERVED)
273 return ERR_PTR(-EACCES);
274 prev = NULL; 327 prev = NULL;
275 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 328 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
276 if (!vma->vm_next && vma->vm_end < end) 329 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
277 return ERR_PTR(-EFAULT); 330 if (!vma->vm_next && vma->vm_end < end)
278 if (prev && prev->vm_end < vma->vm_start) 331 return ERR_PTR(-EFAULT);
279 return ERR_PTR(-EFAULT); 332 if (prev && prev->vm_end < vma->vm_start)
280 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 333 return ERR_PTR(-EFAULT);
334 }
335 if (!is_vm_hugetlb_page(vma) &&
336 ((flags & MPOL_MF_STRICT) ||
337 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
338 vma_migratable(vma)))) {
281 unsigned long endvma = vma->vm_end; 339 unsigned long endvma = vma->vm_end;
340
282 if (endvma > end) 341 if (endvma > end)
283 endvma = end; 342 endvma = end;
284 if (vma->vm_start > start) 343 if (vma->vm_start > start)
285 start = vma->vm_start; 344 start = vma->vm_start;
286 err = check_pgd_range(vma, start, endvma, nodes); 345 err = check_pgd_range(vma, start, endvma, nodes,
346 flags, private);
287 if (err) { 347 if (err) {
288 first = ERR_PTR(err); 348 first = ERR_PTR(err);
289 break; 349 break;
@@ -342,51 +402,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
342 if (!nodes) 402 if (!nodes)
343 return 0; 403 return 0;
344 404
345 /* Update current mems_allowed */ 405 cpuset_update_task_memory_state();
346 cpuset_update_current_mems_allowed(); 406 if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
347 /* Ignore nodes not set in current->mems_allowed */
348 cpuset_restrict_to_mems_allowed(nodes->bits);
349 return mpol_check_policy(mode, nodes);
350}
351
352long do_mbind(unsigned long start, unsigned long len,
353 unsigned long mode, nodemask_t *nmask, unsigned long flags)
354{
355 struct vm_area_struct *vma;
356 struct mm_struct *mm = current->mm;
357 struct mempolicy *new;
358 unsigned long end;
359 int err;
360
361 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
362 return -EINVAL;
363 if (start & ~PAGE_MASK)
364 return -EINVAL;
365 if (mode == MPOL_DEFAULT)
366 flags &= ~MPOL_MF_STRICT;
367 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
368 end = start + len;
369 if (end < start)
370 return -EINVAL;
371 if (end == start)
372 return 0;
373 if (mpol_check_policy(mode, nmask))
374 return -EINVAL; 407 return -EINVAL;
375 new = mpol_new(mode, nmask); 408 return mpol_check_policy(mode, nodes);
376 if (IS_ERR(new))
377 return PTR_ERR(new);
378
379 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
380 mode,nodes_addr(nodes)[0]);
381
382 down_write(&mm->mmap_sem);
383 vma = check_range(mm, start, end, nmask, flags);
384 err = PTR_ERR(vma);
385 if (!IS_ERR(vma))
386 err = mbind_range(vma, start, end, new);
387 up_write(&mm->mmap_sem);
388 mpol_free(new);
389 return err;
390} 409}
391 410
392/* Set the process memory policy */ 411/* Set the process memory policy */
@@ -457,7 +476,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
457 struct vm_area_struct *vma = NULL; 476 struct vm_area_struct *vma = NULL;
458 struct mempolicy *pol = current->mempolicy; 477 struct mempolicy *pol = current->mempolicy;
459 478
460 cpuset_update_current_mems_allowed(); 479 cpuset_update_task_memory_state();
461 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 480 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
462 return -EINVAL; 481 return -EINVAL;
463 if (flags & MPOL_F_ADDR) { 482 if (flags & MPOL_F_ADDR) {
@@ -509,11 +528,141 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
509} 528}
510 529
511/* 530/*
531 * page migration
532 */
533
534static void migrate_page_add(struct page *page, struct list_head *pagelist,
535 unsigned long flags)
536{
537 /*
538 * Avoid migrating a page that is shared with others.
539 */
540 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
541 if (isolate_lru_page(page))
542 list_add(&page->lru, pagelist);
543 }
544}
545
546static int swap_pages(struct list_head *pagelist)
547{
548 LIST_HEAD(moved);
549 LIST_HEAD(failed);
550 int n;
551
552 n = migrate_pages(pagelist, NULL, &moved, &failed);
553 putback_lru_pages(&failed);
554 putback_lru_pages(&moved);
555
556 return n;
557}
558
559/*
560 * For now migrate_pages simply swaps out the pages from nodes that are in
561 * the source set but not in the target set. In the future, we would
562 * want a function that moves pages between the two nodesets in such
563 * a way as to preserve the physical layout as much as possible.
564 *
565 * Returns the number of pages that could not be moved.
566 */
567int do_migrate_pages(struct mm_struct *mm,
568 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
569{
570 LIST_HEAD(pagelist);
571 int count = 0;
572 nodemask_t nodes;
573
574 nodes_andnot(nodes, *from_nodes, *to_nodes);
575
576 down_read(&mm->mmap_sem);
577 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
578 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
579
580 if (!list_empty(&pagelist)) {
581 count = swap_pages(&pagelist);
582 putback_lru_pages(&pagelist);
583 }
584
585 up_read(&mm->mmap_sem);
586 return count;
587}
588
589long do_mbind(unsigned long start, unsigned long len,
590 unsigned long mode, nodemask_t *nmask, unsigned long flags)
591{
592 struct vm_area_struct *vma;
593 struct mm_struct *mm = current->mm;
594 struct mempolicy *new;
595 unsigned long end;
596 int err;
597 LIST_HEAD(pagelist);
598
599 if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
600 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
601 || mode > MPOL_MAX)
602 return -EINVAL;
603 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
604 return -EPERM;
605
606 if (start & ~PAGE_MASK)
607 return -EINVAL;
608
609 if (mode == MPOL_DEFAULT)
610 flags &= ~MPOL_MF_STRICT;
611
612 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
613 end = start + len;
614
615 if (end < start)
616 return -EINVAL;
617 if (end == start)
618 return 0;
619
620 if (mpol_check_policy(mode, nmask))
621 return -EINVAL;
622
623 new = mpol_new(mode, nmask);
624 if (IS_ERR(new))
625 return PTR_ERR(new);
626
627 /*
628 * If we are using the default policy then operation
629 * on discontinuous address spaces is okay after all
630 */
631 if (!new)
632 flags |= MPOL_MF_DISCONTIG_OK;
633
634 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
635 mode,nodes_addr(nodes)[0]);
636
637 down_write(&mm->mmap_sem);
638 vma = check_range(mm, start, end, nmask,
639 flags | MPOL_MF_INVERT, &pagelist);
640
641 err = PTR_ERR(vma);
642 if (!IS_ERR(vma)) {
643 int nr_failed = 0;
644
645 err = mbind_range(vma, start, end, new);
646 if (!list_empty(&pagelist))
647 nr_failed = swap_pages(&pagelist);
648
649 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
650 err = -EIO;
651 }
652 if (!list_empty(&pagelist))
653 putback_lru_pages(&pagelist);
654
655 up_write(&mm->mmap_sem);
656 mpol_free(new);
657 return err;
658}
659
660/*
512 * User space interface with variable sized bitmaps for nodelists. 661 * User space interface with variable sized bitmaps for nodelists.
513 */ 662 */
514 663
515/* Copy a node mask from user space. */ 664/* Copy a node mask from user space. */
516static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, 665static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
517 unsigned long maxnode) 666 unsigned long maxnode)
518{ 667{
519 unsigned long k; 668 unsigned long k;
@@ -602,6 +751,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
602 return do_set_mempolicy(mode, &nodes); 751 return do_set_mempolicy(mode, &nodes);
603} 752}
604 753
754asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
755 const unsigned long __user *old_nodes,
756 const unsigned long __user *new_nodes)
757{
758 struct mm_struct *mm;
759 struct task_struct *task;
760 nodemask_t old;
761 nodemask_t new;
762 nodemask_t task_nodes;
763 int err;
764
765 err = get_nodes(&old, old_nodes, maxnode);
766 if (err)
767 return err;
768
769 err = get_nodes(&new, new_nodes, maxnode);
770 if (err)
771 return err;
772
773 /* Find the mm_struct */
774 read_lock(&tasklist_lock);
775 task = pid ? find_task_by_pid(pid) : current;
776 if (!task) {
777 read_unlock(&tasklist_lock);
778 return -ESRCH;
779 }
780 mm = get_task_mm(task);
781 read_unlock(&tasklist_lock);
782
783 if (!mm)
784 return -EINVAL;
785
786 /*
787 * Check if this process has the right to modify the specified
788 * process. The right exists if the process has administrative
789 * capabilities, superuser privileges or the same
790 * userid as the target process.
791 */
792 if ((current->euid != task->suid) && (current->euid != task->uid) &&
793 (current->uid != task->suid) && (current->uid != task->uid) &&
794 !capable(CAP_SYS_ADMIN)) {
795 err = -EPERM;
796 goto out;
797 }
798
799 task_nodes = cpuset_mems_allowed(task);
800 /* Is the user allowed to access the target nodes? */
801 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
802 err = -EPERM;
803 goto out;
804 }
805
806 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
807out:
808 mmput(mm);
809 return err;
810}
811
812
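For completeness, a sketch of how userspace could reach the new syscall before any libnuma wrapper exists. Everything here is illustrative: __NR_migrate_pages must come from the architecture's headers, the node numbers are made up, and the maxnode value is an assumption about how many bits of the masks the kernel should read.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
        unsigned long old_nodes = 1UL << 0;     /* migrate away from node 0 */
        unsigned long new_nodes = 1UL << 1;     /* ... onto node 1 */
        long ret;

        ret = syscall(__NR_migrate_pages, getpid(),
                      8 * sizeof(unsigned long) + 1,    /* bits in the masks */
                      &old_nodes, &new_nodes);
        if (ret < 0)
                perror("migrate_pages");
        else
                printf("pages that could not be moved: %ld\n", ret);
        return 0;
}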
605/* Retrieve NUMA policy */ 813/* Retrieve NUMA policy */
606asmlinkage long sys_get_mempolicy(int __user *policy, 814asmlinkage long sys_get_mempolicy(int __user *policy,
607 unsigned long __user *nmask, 815 unsigned long __user *nmask,
@@ -708,8 +916,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
708#endif 916#endif
709 917
710/* Return effective policy for a VMA */ 918/* Return effective policy for a VMA */
711struct mempolicy * 919static struct mempolicy * get_vma_policy(struct task_struct *task,
712get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) 920 struct vm_area_struct *vma, unsigned long addr)
713{ 921{
714 struct mempolicy *pol = task->mempolicy; 922 struct mempolicy *pol = task->mempolicy;
715 923
@@ -768,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
768 return nid; 976 return nid;
769} 977}
770 978
979/*
980 * Depending on the memory policy provide a node from which to allocate the
981 * next slab entry.
982 */
983unsigned slab_node(struct mempolicy *policy)
984{
985 switch (policy->policy) {
986 case MPOL_INTERLEAVE:
987 return interleave_nodes(policy);
988
989 case MPOL_BIND:
990 /*
991 * Follow bind policy behavior and start allocation at the
992 * first node.
993 */
994 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
995
996 case MPOL_PREFERRED:
997 if (policy->v.preferred_node >= 0)
998 return policy->v.preferred_node;
999 /* Fall through */
1000
1001 default:
1002 return numa_node_id();
1003 }
1004}
1005
771/* Do static interleaving for a VMA with known offset. */ 1006/* Do static interleaving for a VMA with known offset. */
772static unsigned offset_il_node(struct mempolicy *pol, 1007static unsigned offset_il_node(struct mempolicy *pol,
773 struct vm_area_struct *vma, unsigned long off) 1008 struct vm_area_struct *vma, unsigned long off)
@@ -785,6 +1020,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
785 return nid; 1020 return nid;
786} 1021}
787 1022
1023/* Determine a node number for interleave */
1024static inline unsigned interleave_nid(struct mempolicy *pol,
1025 struct vm_area_struct *vma, unsigned long addr, int shift)
1026{
1027 if (vma) {
1028 unsigned long off;
1029
1030 off = vma->vm_pgoff;
1031 off += (addr - vma->vm_start) >> shift;
1032 return offset_il_node(pol, vma, off);
1033 } else
1034 return interleave_nodes(pol);
1035}
1036
1037/* Return a zonelist suitable for a huge page allocation. */
1038struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1039{
1040 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1041
1042 if (pol->policy == MPOL_INTERLEAVE) {
1043 unsigned nid;
1044
1045 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1046 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1047 }
1048 return zonelist_policy(GFP_HIGHUSER, pol);
1049}
1050
788/* Allocate a page in interleaved policy. 1051/* Allocate a page in interleaved policy.
789 Own path because it needs to do special accounting. */ 1052 Own path because it needs to do special accounting. */
790static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1053static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -829,19 +1092,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
829{ 1092{
830 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1093 struct mempolicy *pol = get_vma_policy(current, vma, addr);
831 1094
832 cpuset_update_current_mems_allowed(); 1095 cpuset_update_task_memory_state();
833 1096
834 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 1097 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835 unsigned nid; 1098 unsigned nid;
836 if (vma) { 1099
837 unsigned long off; 1100 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
838 off = vma->vm_pgoff;
839 off += (addr - vma->vm_start) >> PAGE_SHIFT;
840 nid = offset_il_node(pol, vma, off);
841 } else {
842 /* fall back to process interleaving */
843 nid = interleave_nodes(pol);
844 }
845 return alloc_page_interleave(gfp, 0, nid); 1101 return alloc_page_interleave(gfp, 0, nid);
846 } 1102 }
847 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 1103 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -862,7 +1118,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
862 * interrupt context and apply the current process NUMA policy. 1118 * interrupt context and apply the current process NUMA policy.
863 * Returns NULL when no page can be allocated. 1119 * Returns NULL when no page can be allocated.
864 * 1120 *
865 * Don't call cpuset_update_current_mems_allowed() unless 1121 * Don't call cpuset_update_task_memory_state() unless
866 * 1) it's ok to take cpuset_sem (can WAIT), and 1122 * 1) it's ok to take cpuset_sem (can WAIT), and
867 * 2) allocating for current task (not interrupt). 1123 * 2) allocating for current task (not interrupt).
868 */ 1124 */
@@ -871,7 +1127,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
871 struct mempolicy *pol = current->mempolicy; 1127 struct mempolicy *pol = current->mempolicy;
872 1128
873 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1129 if ((gfp & __GFP_WAIT) && !in_interrupt())
874 cpuset_update_current_mems_allowed(); 1130 cpuset_update_task_memory_state();
875 if (!pol || in_interrupt()) 1131 if (!pol || in_interrupt())
876 pol = &default_policy; 1132 pol = &default_policy;
877 if (pol->policy == MPOL_INTERLEAVE) 1133 if (pol->policy == MPOL_INTERLEAVE)
@@ -880,6 +1136,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
880} 1136}
881EXPORT_SYMBOL(alloc_pages_current); 1137EXPORT_SYMBOL(alloc_pages_current);
882 1138
1139/*
1140 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1141 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1142 * with the mems_allowed returned by cpuset_mems_allowed(). This
1143 * keeps mempolicies cpuset relative after its cpuset moves. See
1144 * further kernel/cpuset.c update_nodemask().
1145 */
1146void *cpuset_being_rebound;
1147
883/* Slow path of a mempolicy copy */ 1148/* Slow path of a mempolicy copy */
884struct mempolicy *__mpol_copy(struct mempolicy *old) 1149struct mempolicy *__mpol_copy(struct mempolicy *old)
885{ 1150{
@@ -887,6 +1152,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
887 1152
888 if (!new) 1153 if (!new)
889 return ERR_PTR(-ENOMEM); 1154 return ERR_PTR(-ENOMEM);
1155 if (current_cpuset_is_being_rebound()) {
1156 nodemask_t mems = cpuset_mems_allowed(current);
1157 mpol_rebind_policy(old, &mems);
1158 }
890 *new = *old; 1159 *new = *old;
891 atomic_set(&new->refcnt, 1); 1160 atomic_set(&new->refcnt, 1);
892 if (new->policy == MPOL_BIND) { 1161 if (new->policy == MPOL_BIND) {
@@ -940,54 +1209,6 @@ void __mpol_free(struct mempolicy *p)
940} 1209}
941 1210
942/* 1211/*
943 * Hugetlb policy. Same as above, just works with node numbers instead of
944 * zonelists.
945 */
946
947/* Find first node suitable for an allocation */
948int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
949{
950 struct mempolicy *pol = get_vma_policy(current, vma, addr);
951
952 switch (pol->policy) {
953 case MPOL_DEFAULT:
954 return numa_node_id();
955 case MPOL_BIND:
956 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
957 case MPOL_INTERLEAVE:
958 return interleave_nodes(pol);
959 case MPOL_PREFERRED:
960 return pol->v.preferred_node >= 0 ?
961 pol->v.preferred_node : numa_node_id();
962 }
963 BUG();
964 return 0;
965}
966
967/* Find secondary valid nodes for an allocation */
968int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
969{
970 struct mempolicy *pol = get_vma_policy(current, vma, addr);
971
972 switch (pol->policy) {
973 case MPOL_PREFERRED:
974 case MPOL_DEFAULT:
975 case MPOL_INTERLEAVE:
976 return 1;
977 case MPOL_BIND: {
978 struct zone **z;
979 for (z = pol->v.zonelist->zones; *z; z++)
980 if ((*z)->zone_pgdat->node_id == nid)
981 return 1;
982 return 0;
983 }
984 default:
985 BUG();
986 return 0;
987 }
988}
989
990/*
991 * Shared memory backing store policy support. 1212 * Shared memory backing store policy support.
992 * 1213 *
993 * Remember policies even when nobody has shared memory mapped. 1214 * Remember policies even when nobody has shared memory mapped.
@@ -1141,6 +1362,30 @@ restart:
1141 return 0; 1362 return 0;
1142} 1363}
1143 1364
1365void mpol_shared_policy_init(struct shared_policy *info, int policy,
1366 nodemask_t *policy_nodes)
1367{
1368 info->root = RB_ROOT;
1369 spin_lock_init(&info->lock);
1370
1371 if (policy != MPOL_DEFAULT) {
1372 struct mempolicy *newpol;
1373
1374 /* Falls back to MPOL_DEFAULT on any error */
1375 newpol = mpol_new(policy, policy_nodes);
1376 if (!IS_ERR(newpol)) {
1377 /* Create pseudo-vma that contains just the policy */
1378 struct vm_area_struct pvma;
1379
1380 memset(&pvma, 0, sizeof(struct vm_area_struct));
1381 /* Policy covers entire file */
1382 pvma.vm_end = TASK_SIZE;
1383 mpol_set_shared_policy(info, &pvma, newpol);
1384 mpol_free(newpol);
1385 }
1386 }
1387}
1388
1144int mpol_set_shared_policy(struct shared_policy *info, 1389int mpol_set_shared_policy(struct shared_policy *info,
1145 struct vm_area_struct *vma, struct mempolicy *npol) 1390 struct vm_area_struct *vma, struct mempolicy *npol)
1146{ 1391{
@@ -1209,25 +1454,31 @@ void numa_default_policy(void)
1209} 1454}
1210 1455
1211/* Migrate a policy to a different set of nodes */ 1456/* Migrate a policy to a different set of nodes */
1212static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, 1457void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1213 const nodemask_t *new)
1214{ 1458{
1459 nodemask_t *mpolmask;
1215 nodemask_t tmp; 1460 nodemask_t tmp;
1216 1461
1217 if (!pol) 1462 if (!pol)
1218 return; 1463 return;
1464 mpolmask = &pol->cpuset_mems_allowed;
1465 if (nodes_equal(*mpolmask, *newmask))
1466 return;
1219 1467
1220 switch (pol->policy) { 1468 switch (pol->policy) {
1221 case MPOL_DEFAULT: 1469 case MPOL_DEFAULT:
1222 break; 1470 break;
1223 case MPOL_INTERLEAVE: 1471 case MPOL_INTERLEAVE:
1224 nodes_remap(tmp, pol->v.nodes, *old, *new); 1472 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1225 pol->v.nodes = tmp; 1473 pol->v.nodes = tmp;
1226 current->il_next = node_remap(current->il_next, *old, *new); 1474 *mpolmask = *newmask;
1475 current->il_next = node_remap(current->il_next,
1476 *mpolmask, *newmask);
1227 break; 1477 break;
1228 case MPOL_PREFERRED: 1478 case MPOL_PREFERRED:
1229 pol->v.preferred_node = node_remap(pol->v.preferred_node, 1479 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1230 *old, *new); 1480 *mpolmask, *newmask);
1481 *mpolmask = *newmask;
1231 break; 1482 break;
1232 case MPOL_BIND: { 1483 case MPOL_BIND: {
1233 nodemask_t nodes; 1484 nodemask_t nodes;
@@ -1237,7 +1488,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1237 nodes_clear(nodes); 1488 nodes_clear(nodes);
1238 for (z = pol->v.zonelist->zones; *z; z++) 1489 for (z = pol->v.zonelist->zones; *z; z++)
1239 node_set((*z)->zone_pgdat->node_id, nodes); 1490 node_set((*z)->zone_pgdat->node_id, nodes);
1240 nodes_remap(tmp, nodes, *old, *new); 1491 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1241 nodes = tmp; 1492 nodes = tmp;
1242 1493
1243 zonelist = bind_zonelist(&nodes); 1494 zonelist = bind_zonelist(&nodes);
@@ -1252,6 +1503,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1252 kfree(pol->v.zonelist); 1503 kfree(pol->v.zonelist);
1253 pol->v.zonelist = zonelist; 1504 pol->v.zonelist = zonelist;
1254 } 1505 }
1506 *mpolmask = *newmask;
1255 break; 1507 break;
1256 } 1508 }
1257 default: 1509 default:
@@ -1261,12 +1513,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1261} 1513}
1262 1514
1263/* 1515/*
1264 * Someone moved this task to different nodes. Fixup mempolicies. 1516 * Wrapper for mpol_rebind_policy() that just requires task
1517 * pointer, and updates task mempolicy.
1518 */
1519
1520void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1521{
1522 mpol_rebind_policy(tsk->mempolicy, new);
1523}
1524
1525/*
1526 * Rebind each vma in mm to new nodemask.
1265 * 1527 *
1266 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, 1528 * Call holding a reference to mm. Takes mm->mmap_sem during call.
1267 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1268 */ 1529 */
1269void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) 1530
1531void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1270{ 1532{
1271 rebind_policy(current->mempolicy, old, new); 1533 struct vm_area_struct *vma;
1534
1535 down_write(&mm->mmap_sem);
1536 for (vma = mm->mmap; vma; vma = vma->vm_next)
1537 mpol_rebind_policy(vma->vm_policy, new);
1538 up_write(&mm->mmap_sem);
1272} 1539}
1540
1541/*
1542 * Display pages allocated per node and memory policy via /proc.
1543 */
1544
1545static const char *policy_types[] = { "default", "prefer", "bind",
1546 "interleave" };
1547
1548/*
1549 * Convert a mempolicy into a string.
1550 * Returns the number of characters in buffer (if positive)
1551 * or an error (negative)
1552 */
1553static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1554{
1555 char *p = buffer;
1556 int l;
1557 nodemask_t nodes;
1558 int mode = pol ? pol->policy : MPOL_DEFAULT;
1559
1560 switch (mode) {
1561 case MPOL_DEFAULT:
1562 nodes_clear(nodes);
1563 break;
1564
1565 case MPOL_PREFERRED:
1566 nodes_clear(nodes);
1567 node_set(pol->v.preferred_node, nodes);
1568 break;
1569
1570 case MPOL_BIND:
1571 get_zonemask(pol, &nodes);
1572 break;
1573
1574 case MPOL_INTERLEAVE:
1575 nodes = pol->v.nodes;
1576 break;
1577
1578 default:
1579 BUG();
1580 return -EFAULT;
1581 }
1582
1583 l = strlen(policy_types[mode]);
1584 if (buffer + maxlen < p + l + 1)
1585 return -ENOSPC;
1586
1587 strcpy(p, policy_types[mode]);
1588 p += l;
1589
1590 if (!nodes_empty(nodes)) {
1591 if (buffer + maxlen < p + 2)
1592 return -ENOSPC;
1593 *p++ = '=';
1594 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1595 }
1596 return p - buffer;
1597}
1598
1599struct numa_maps {
1600 unsigned long pages;
1601 unsigned long anon;
1602 unsigned long mapped;
1603 unsigned long mapcount_max;
1604 unsigned long node[MAX_NUMNODES];
1605};
1606
1607static void gather_stats(struct page *page, void *private)
1608{
1609 struct numa_maps *md = private;
1610 int count = page_mapcount(page);
1611
1612 if (count)
1613 md->mapped++;
1614
1615 if (count > md->mapcount_max)
1616 md->mapcount_max = count;
1617
1618 md->pages++;
1619
1620 if (PageAnon(page))
1621 md->anon++;
1622
1623 md->node[page_to_nid(page)]++;
1624 cond_resched();
1625}
1626
1627int show_numa_map(struct seq_file *m, void *v)
1628{
1629 struct task_struct *task = m->private;
1630 struct vm_area_struct *vma = v;
1631 struct numa_maps *md;
1632 int n;
1633 char buffer[50];
1634
1635 if (!vma->vm_mm)
1636 return 0;
1637
1638 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1639 if (!md)
1640 return 0;
1641
1642 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1643 &node_online_map, MPOL_MF_STATS, md);
1644
1645 if (md->pages) {
1646 mpol_to_str(buffer, sizeof(buffer),
1647 get_vma_policy(task, vma, vma->vm_start));
1648
1649 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1650 vma->vm_start, buffer, md->pages,
1651 md->mapped, md->mapcount_max);
1652
1653 if (md->anon)
1654 seq_printf(m," anon=%lu",md->anon);
1655
1656 for_each_online_node(n)
1657 if (md->node[n])
1658 seq_printf(m, " N%d=%lu", n, md->node[n]);
1659
1660 seq_putc(m, '\n');
1661 }
1662 kfree(md);
1663
1664 if (m->count < m->size)
1665 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1666 return 0;
1667}
1668
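
The mpol_to_str() helper added above produces strings such as "interleave=0-3" for the /proc numa_maps output. A minimal userspace sketch of the same formatting logic, assuming a plain unsigned long in place of nodemask_t and a simple comma-separated node list where the kernel's nodelist_scnprintf() would compress ranges:

#include <stdio.h>

static const char * const policy_types[] =
        { "default", "prefer", "bind", "interleave" };

/* Build "policy=n1,n2,..."; returns the length, or -1 if buf is too small. */
static int policy_to_str(char *buf, size_t maxlen, int mode, unsigned long nodes)
{
        size_t off = snprintf(buf, maxlen, "%s", policy_types[mode]);
        int bit, first = 1;

        for (bit = 0; nodes && off < maxlen; bit++, nodes >>= 1) {
                if (!(nodes & 1UL))
                        continue;
                off += snprintf(buf + off, maxlen - off, "%c%d",
                                first ? '=' : ',', bit);
                first = 0;
        }
        return off < maxlen ? (int)off : -1;    /* -ENOSPC in the kernel version */
}

int main(void)
{
        char buf[64];

        if (policy_to_str(buf, sizeof(buf), 3, 0x0bUL) > 0)
                printf("%s\n", buf);            /* prints: interleave=0,1,3 */
        return 0;
}
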
diff --git a/mm/mlock.c b/mm/mlock.c
index 4ae3a46ff7..b90c59573a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -5,6 +5,7 @@
5 * (C) Copyright 2002 Christoph Hellwig 5 * (C) Copyright 2002 Christoph Hellwig
6 */ 6 */
7 7
8#include <linux/capability.h>
8#include <linux/mman.h> 9#include <linux/mman.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 6c997b1596..47556d2b3e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,6 +13,7 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/capability.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/file.h> 18#include <linux/file.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
@@ -611,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end);
611 * If the vma has a ->close operation then the driver probably needs to release 612 * If the vma has a ->close operation then the driver probably needs to release
612 * per-vma resources, so we don't attempt to merge those. 613 * per-vma resources, so we don't attempt to merge those.
613 */ 614 */
614#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) 615#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
615 616
616static inline int is_mergeable_vma(struct vm_area_struct *vma, 617static inline int is_mergeable_vma(struct vm_area_struct *vma,
617 struct file *file, unsigned long vm_flags) 618 struct file *file, unsigned long vm_flags)
@@ -1076,17 +1077,6 @@ munmap_back:
1076 error = file->f_op->mmap(file, vma); 1077 error = file->f_op->mmap(file, vma);
1077 if (error) 1078 if (error)
1078 goto unmap_and_free_vma; 1079 goto unmap_and_free_vma;
1079 if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
1080 == (VM_WRITE | VM_RESERVED)) {
1081 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
1082 "PROT_WRITE mmap of VM_RESERVED memory, which "
1083 "is deprecated. Please report this to "
1084 "linux-kernel@vger.kernel.org\n",current->comm);
1085 if (vma->vm_ops && vma->vm_ops->close)
1086 vma->vm_ops->close(vma);
1087 error = -EACCES;
1088 goto unmap_and_free_vma;
1089 }
1090 } else if (vm_flags & VM_SHARED) { 1080 } else if (vm_flags & VM_SHARED) {
1091 error = shmem_zero_setup(vma); 1081 error = shmem_zero_setup(vma);
1092 if (error) 1082 if (error)
@@ -1501,7 +1491,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1501 * PA-RISC uses this for its stack; IA64 for its Register Backing Store. 1491 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
1502 * vma is the last one with address > vma->vm_end. Have to extend vma. 1492 * vma is the last one with address > vma->vm_end. Have to extend vma.
1503 */ 1493 */
1504#ifdef CONFIG_STACK_GROWSUP 1494#ifndef CONFIG_IA64
1505static inline 1495static inline
1506#endif 1496#endif
1507int expand_upwards(struct vm_area_struct *vma, unsigned long address) 1497int expand_upwards(struct vm_area_struct *vma, unsigned long address)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 17a2b52b75..653b8571c1 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,14 +124,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
124 * a MAP_NORESERVE private mapping to writable will now reserve. 124 * a MAP_NORESERVE private mapping to writable will now reserve.
125 */ 125 */
126 if (newflags & VM_WRITE) { 126 if (newflags & VM_WRITE) {
127 if (oldflags & VM_RESERVED) {
128 BUG_ON(oldflags & VM_WRITE);
129 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
130 "PROT_WRITE mprotect of VM_RESERVED memory, "
131 "which is deprecated. Please report this to "
132 "linux-kernel@vger.kernel.org\n",current->comm);
133 return -EACCES;
134 }
135 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
136 charged = nrpages; 128 charged = nrpages;
137 if (security_vm_enough_memory(charged)) 129 if (security_vm_enough_memory(charged))
diff --git a/mm/mremap.c b/mm/mremap.c
index b535438c36..1903bdf65e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -13,6 +13,7 @@
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/mman.h> 14#include <linux/mman.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/capability.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17#include <linux/highmem.h> 18#include <linux/highmem.h>
18#include <linux/security.h> 19#include <linux/security.h>
@@ -323,7 +324,7 @@ unsigned long do_mremap(unsigned long addr,
323 /* We can't remap across vm area boundaries */ 324 /* We can't remap across vm area boundaries */
324 if (old_len > vma->vm_end - addr) 325 if (old_len > vma->vm_end - addr)
325 goto out; 326 goto out;
326 if (vma->vm_flags & VM_DONTEXPAND) { 327 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
327 if (new_len > old_len) 328 if (new_len > old_len)
328 goto out; 329 goto out;
329 } 330 }
diff --git a/mm/msync.c b/mm/msync.c
index 0e040e9c39..3563a56e1a 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
27again: 27again:
28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
29 do { 29 do {
30 unsigned long pfn;
31 struct page *page; 30 struct page *page;
32 31
33 if (progress >= 64) { 32 if (progress >= 64) {
@@ -40,13 +39,9 @@ again:
40 continue; 39 continue;
41 if (!pte_maybe_dirty(*pte)) 40 if (!pte_maybe_dirty(*pte))
42 continue; 41 continue;
43 pfn = pte_pfn(*pte); 42 page = vm_normal_page(vma, addr, *pte);
44 if (unlikely(!pfn_valid(pfn))) { 43 if (!page)
45 print_bad_pte(vma, *pte, addr);
46 continue; 44 continue;
47 }
48 page = pfn_to_page(pfn);
49
50 if (ptep_clear_flush_dirty(vma, addr, pte) || 45 if (ptep_clear_flush_dirty(vma, addr, pte) ||
51 page_test_and_clear_dirty(page)) 46 page_test_and_clear_dirty(page))
52 set_page_dirty(page); 47 set_page_dirty(page);
@@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma,
97 /* For hugepages we can't go walking the page table normally, 92 /* For hugepages we can't go walking the page table normally,
98 * but that's ok, hugetlbfs is memory based, so we don't need 93 * but that's ok, hugetlbfs is memory based, so we don't need
99 * to do anything more on an msync(). 94 * to do anything more on an msync().
100 * Can't do anything with VM_RESERVED regions either.
101 */ 95 */
102 if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) 96 if (vma->vm_flags & VM_HUGETLB)
103 return; 97 return;
104 98
105 BUG_ON(addr >= end); 99 BUG_ON(addr >= end);
@@ -143,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma,
143 ret = filemap_fdatawrite(mapping); 137 ret = filemap_fdatawrite(mapping);
144 if (file->f_op && file->f_op->fsync) { 138 if (file->f_op && file->f_op->fsync) {
145 /* 139 /*
146 * We don't take i_sem here because mmap_sem 140 * We don't take i_mutex here because mmap_sem
147 * is already held. 141 * is already held.
148 */ 142 */
149 err = file->f_op->fsync(file,file->f_dentry,1); 143 err = file->f_op->fsync(file,file->f_dentry,1);
diff --git a/mm/nommu.c b/mm/nommu.c
index 6deb6ab3d6..c10262d682 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1045 1045
1046EXPORT_SYMBOL(find_vma); 1046EXPORT_SYMBOL(find_vma);
1047 1047
1048struct page *follow_page(struct mm_struct *mm, unsigned long address, 1048struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1049 unsigned int foll_flags) 1049 unsigned int foll_flags)
1050{ 1050{
1051 return NULL; 1051 return NULL;
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
1177{ 1177{
1178 return 0; 1178 return 0;
1179} 1179}
1180
1181struct page *filemap_nopage(struct vm_area_struct *area,
1182 unsigned long address, int *type)
1183{
1184 BUG();
1185 return NULL;
1186}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d348b90359..14bd4ec795 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -274,6 +274,7 @@ void out_of_memory(gfp_t gfp_mask, int order)
274 show_mem(); 274 show_mem();
275 } 275 }
276 276
277 cpuset_lock();
277 read_lock(&tasklist_lock); 278 read_lock(&tasklist_lock);
278retry: 279retry:
279 p = select_bad_process(); 280 p = select_bad_process();
@@ -284,6 +285,7 @@ retry:
284 /* Found nothing?!?! Either we hang forever, or we panic. */ 285 /* Found nothing?!?! Either we hang forever, or we panic. */
285 if (!p) { 286 if (!p) {
286 read_unlock(&tasklist_lock); 287 read_unlock(&tasklist_lock);
288 cpuset_unlock();
287 panic("Out of memory and no killable processes...\n"); 289 panic("Out of memory and no killable processes...\n");
288 } 290 }
289 291
@@ -293,12 +295,14 @@ retry:
293 295
294 out: 296 out:
295 read_unlock(&tasklist_lock); 297 read_unlock(&tasklist_lock);
298 cpuset_unlock();
296 if (mm) 299 if (mm)
297 mmput(mm); 300 mmput(mm);
298 301
299 /* 302 /*
300 * Give "p" a good chance of killing itself before we 303 * Give "p" a good chance of killing itself before we
301 * retry to allocate memory. 304 * retry to allocate memory unless "p" is current
302 */ 305 */
303 schedule_timeout_interruptible(1); 306 if (!test_thread_flag(TIF_MEMDIE))
307 schedule_timeout_interruptible(1);
304} 308}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 74138c9a22..945559fb63 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,7 @@
46static long ratelimit_pages = 32; 46static long ratelimit_pages = 32;
47 47
48static long total_pages; /* The total number of pages in the machine. */ 48static long total_pages; /* The total number of pages in the machine. */
49static int dirty_exceeded; /* Dirty mem may be over limit */ 49static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
50 50
51/* 51/*
52 * When balance_dirty_pages decides that the caller needs to perform some 52 * When balance_dirty_pages decides that the caller needs to perform some
@@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping)
212 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) 212 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
213 break; 213 break;
214 214
215 dirty_exceeded = 1; 215 if (!dirty_exceeded)
216 dirty_exceeded = 1;
216 217
217 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 218 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
218 * Unstable writes are a feature of certain networked 219 * Unstable writes are a feature of certain networked
@@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping)
234 blk_congestion_wait(WRITE, HZ/10); 235 blk_congestion_wait(WRITE, HZ/10);
235 } 236 }
236 237
237 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) 238 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
238 dirty_exceeded = 0; 239 dirty_exceeded = 0;
239 240
240 if (writeback_in_progress(bdi)) 241 if (writeback_in_progress(bdi))
@@ -550,11 +551,17 @@ void __init page_writeback_init(void)
550 551
551int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 552int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
552{ 553{
554 int ret;
555
553 if (wbc->nr_to_write <= 0) 556 if (wbc->nr_to_write <= 0)
554 return 0; 557 return 0;
558 wbc->for_writepages = 1;
555 if (mapping->a_ops->writepages) 559 if (mapping->a_ops->writepages)
556 return mapping->a_ops->writepages(mapping, wbc); 560 ret = mapping->a_ops->writepages(mapping, wbc);
557 return generic_writepages(mapping, wbc); 561 else
562 ret = generic_writepages(mapping, wbc);
563 wbc->for_writepages = 0;
564 return ret;
558} 565}
559 566
560/** 567/**
@@ -750,6 +757,7 @@ int clear_page_dirty_for_io(struct page *page)
750 } 757 }
751 return TestClearPageDirty(page); 758 return TestClearPageDirty(page);
752} 759}
760EXPORT_SYMBOL(clear_page_dirty_for_io);
753 761
754int test_clear_page_writeback(struct page *page) 762int test_clear_page_writeback(struct page *page)
755{ 763{
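
The do_writepages() change above brackets the dispatch with wbc->for_writepages so a ->writepages implementation can tell a writeback pass from a single writepage call. A stand-alone sketch of that shape (the struct and callback here are stand-ins, not the kernel types):

#include <stdio.h>

struct writeback_control {
        long nr_to_write;
        int for_writepages;
};

typedef int (*writepages_fn)(struct writeback_control *wbc);

static int generic_writepages_stub(struct writeback_control *wbc)
{
        printf("generic path, for_writepages=%d\n", wbc->for_writepages);
        return 0;
}

/* Same control flow as the new do_writepages(): flag, dispatch, unflag. */
static int do_writepages_sketch(struct writeback_control *wbc, writepages_fn op)
{
        int ret;

        if (wbc->nr_to_write <= 0)
                return 0;
        wbc->for_writepages = 1;
        ret = op ? op(wbc) : generic_writepages_stub(wbc);
        wbc->for_writepages = 0;
        return ret;
}

int main(void)
{
        struct writeback_control wbc = { .nr_to_write = 16, .for_writepages = 0 };

        return do_writepages_sketch(&wbc, NULL);
}
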
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 987225bdd6..df54e2fc8e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly;
52unsigned long totalram_pages __read_mostly; 53unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction;
57
58static void fastcall free_hot_cold_page(struct page *page, int cold);
55 59
56/* 60/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 61 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -60,8 +64,11 @@ long nr_swap_pages;
60 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 64 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
61 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 65 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
62 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 66 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
67 *
68 * TBD: should special case ZONE_DMA32 machines here - in those we normally
69 * don't need any ZONE_NORMAL reservation
63 */ 70 */
64int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 71int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
65 72
66EXPORT_SYMBOL(totalram_pages); 73EXPORT_SYMBOL(totalram_pages);
67 74
@@ -72,12 +79,13 @@ EXPORT_SYMBOL(totalram_pages);
72struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 79struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
73EXPORT_SYMBOL(zone_table); 80EXPORT_SYMBOL(zone_table);
74 81
75static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 82static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
76int min_free_kbytes = 1024; 83int min_free_kbytes = 1024;
77 84
78unsigned long __initdata nr_kernel_pages; 85unsigned long __initdata nr_kernel_pages;
79unsigned long __initdata nr_all_pages; 86unsigned long __initdata nr_all_pages;
80 87
88#ifdef CONFIG_DEBUG_VM
81static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 89static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
82{ 90{
83 int ret = 0; 91 int ret = 0;
@@ -119,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page)
119 return 0; 127 return 0;
120} 128}
121 129
122static void bad_page(const char *function, struct page *page) 130#else
131static inline int bad_range(struct zone *zone, struct page *page)
132{
133 return 0;
134}
135#endif
136
137static void bad_page(struct page *page)
123{ 138{
124 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 139 printk(KERN_EMERG "Bad page state in process '%s'\n"
125 function, current->comm, page); 140 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
126 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 141 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
127 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, 142 KERN_EMERG "Backtrace:\n",
128 page->mapping, page_mapcount(page), page_count(page)); 143 current->comm, page, (int)(2*sizeof(unsigned long)),
129 printk(KERN_EMERG "Backtrace:\n"); 144 (unsigned long)page->flags, page->mapping,
145 page_mapcount(page), page_count(page));
130 dump_stack(); 146 dump_stack();
131 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
132 page->flags &= ~(1 << PG_lru | 147 page->flags &= ~(1 << PG_lru |
133 1 << PG_private | 148 1 << PG_private |
134 1 << PG_locked | 149 1 << PG_locked |
@@ -137,18 +152,13 @@ static void bad_page(const char *function, struct page *page)
137 1 << PG_reclaim | 152 1 << PG_reclaim |
138 1 << PG_slab | 153 1 << PG_slab |
139 1 << PG_swapcache | 154 1 << PG_swapcache |
140 1 << PG_writeback | 155 1 << PG_writeback );
141 1 << PG_reserved );
142 set_page_count(page, 0); 156 set_page_count(page, 0);
143 reset_page_mapcount(page); 157 reset_page_mapcount(page);
144 page->mapping = NULL; 158 page->mapping = NULL;
145 add_taint(TAINT_BAD_PAGE); 159 add_taint(TAINT_BAD_PAGE);
146} 160}
147 161
148#ifndef CONFIG_HUGETLB_PAGE
149#define prep_compound_page(page, order) do { } while (0)
150#define destroy_compound_page(page, order) do { } while (0)
151#else
152/* 162/*
153 * Higher-order pages are called "compound pages". They are structured thusly: 163 * Higher-order pages are called "compound pages". They are structured thusly:
154 * 164 *
@@ -186,23 +196,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
186 int i; 196 int i;
187 int nr_pages = 1 << order; 197 int nr_pages = 1 << order;
188 198
189 if (!PageCompound(page)) 199 if (unlikely(page[1].index != order))
190 return; 200 bad_page(page);
191
192 if (page[1].index != order)
193 bad_page(__FUNCTION__, page);
194 201
195 for (i = 0; i < nr_pages; i++) { 202 for (i = 0; i < nr_pages; i++) {
196 struct page *p = page + i; 203 struct page *p = page + i;
197 204
198 if (!PageCompound(p)) 205 if (unlikely(!PageCompound(p) |
199 bad_page(__FUNCTION__, page); 206 (page_private(p) != (unsigned long)page)))
200 if (page_private(p) != (unsigned long)page) 207 bad_page(page);
201 bad_page(__FUNCTION__, page);
202 ClearPageCompound(p); 208 ClearPageCompound(p);
203 } 209 }
204} 210}
205#endif /* CONFIG_HUGETLB_PAGE */
206 211
207/* 212/*
208 * function for dealing with page's order in buddy system. 213 * function for dealing with page's order in buddy system.
@@ -258,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
258/* 263/*
259 * This function checks whether a page is free && is the buddy 264 * This function checks whether a page is free && is the buddy
260 * we can do coalesce a page and its buddy if 265 * we can do coalesce a page and its buddy if
261 * (a) the buddy is free && 266 * (a) the buddy is not in a hole &&
262 * (b) the buddy is on the buddy system && 267 * (b) the buddy is free &&
263 * (c) a page and its buddy have the same order. 268 * (c) the buddy is on the buddy system &&
269 * (d) a page and its buddy have the same order.
264 * for recording page's order, we use page_private(page) and PG_private. 270 * for recording page's order, we use page_private(page) and PG_private.
265 * 271 *
266 */ 272 */
267static inline int page_is_buddy(struct page *page, int order) 273static inline int page_is_buddy(struct page *page, int order)
268{ 274{
275#ifdef CONFIG_HOLES_IN_ZONE
276 if (!pfn_valid(page_to_pfn(page)))
277 return 0;
278#endif
279
269 if (PagePrivate(page) && 280 if (PagePrivate(page) &&
270 (page_order(page) == order) && 281 (page_order(page) == order) &&
271 page_count(page) == 0) 282 page_count(page) == 0)
@@ -297,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order)
297 * -- wli 308 * -- wli
298 */ 309 */
299 310
300static inline void __free_pages_bulk (struct page *page, 311static inline void __free_one_page(struct page *page,
301 struct zone *zone, unsigned int order) 312 struct zone *zone, unsigned int order)
302{ 313{
303 unsigned long page_idx; 314 unsigned long page_idx;
304 int order_size = 1 << order; 315 int order_size = 1 << order;
305 316
306 if (unlikely(order)) 317 if (unlikely(PageCompound(page)))
307 destroy_compound_page(page, order); 318 destroy_compound_page(page, order);
308 319
309 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 320 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -317,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
317 struct free_area *area; 328 struct free_area *area;
318 struct page *buddy; 329 struct page *buddy;
319 330
320 combined_idx = __find_combined_index(page_idx, order);
321 buddy = __page_find_buddy(page, page_idx, order); 331 buddy = __page_find_buddy(page, page_idx, order);
322
323 if (bad_range(zone, buddy))
324 break;
325 if (!page_is_buddy(buddy, order)) 332 if (!page_is_buddy(buddy, order))
326 break; /* Move the buddy up one level. */ 333 break; /* Move the buddy up one level. */
334
327 list_del(&buddy->lru); 335 list_del(&buddy->lru);
328 area = zone->free_area + order; 336 area = zone->free_area + order;
329 area->nr_free--; 337 area->nr_free--;
330 rmv_page_order(buddy); 338 rmv_page_order(buddy);
339 combined_idx = __find_combined_index(page_idx, order);
331 page = page + (combined_idx - page_idx); 340 page = page + (combined_idx - page_idx);
332 page_idx = combined_idx; 341 page_idx = combined_idx;
333 order++; 342 order++;
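
The coalescing loop above leans on two index identities hidden in __page_find_buddy() and __find_combined_index(): at order o, a block's buddy differs from it only in bit o of the page index, and the merged order o+1 block starts at the lower of the two. A small stand-alone illustration of that arithmetic:

#include <stdio.h>

/* index of the buddy of the block starting at page_idx, at a given order */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

/* start of the order+1 block formed by merging a block with its buddy */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        /* order-2 block at page 12: its buddy starts at 8, the merged block at 8 */
        printf("buddy=%lu combined=%lu\n",
               buddy_index(12, 2), combined_index(12, 2));
        return 0;
}
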
@@ -337,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page,
337 zone->free_area[order].nr_free++; 346 zone->free_area[order].nr_free++;
338} 347}
339 348
340static inline void free_pages_check(const char *function, struct page *page) 349static inline int free_pages_check(struct page *page)
341{ 350{
342 if ( page_mapcount(page) || 351 if (unlikely(page_mapcount(page) |
343 page->mapping != NULL || 352 (page->mapping != NULL) |
344 page_count(page) != 0 || 353 (page_count(page) != 0) |
345 (page->flags & ( 354 (page->flags & (
346 1 << PG_lru | 355 1 << PG_lru |
347 1 << PG_private | 356 1 << PG_private |
@@ -351,10 +360,16 @@ static inline void free_pages_check(const char *function, struct page *page)
351 1 << PG_slab | 360 1 << PG_slab |
352 1 << PG_swapcache | 361 1 << PG_swapcache |
353 1 << PG_writeback | 362 1 << PG_writeback |
354 1 << PG_reserved ))) 363 1 << PG_reserved ))))
355 bad_page(function, page); 364 bad_page(page);
356 if (PageDirty(page)) 365 if (PageDirty(page))
357 __ClearPageDirty(page); 366 __ClearPageDirty(page);
367 /*
368 * For now, we report if PG_reserved was found set, but do not
369 * clear it, and do not free the page. But we shall soon need
370 * to do more, for when the ZERO_PAGE count wraps negative.
371 */
372 return PageReserved(page);
358} 373}
359 374
360/* 375/*
@@ -368,48 +383,90 @@ static inline void free_pages_check(const char *function, struct page *page)
368 * And clear the zone's pages_scanned counter, to hold off the "all pages are 383 * And clear the zone's pages_scanned counter, to hold off the "all pages are
369 * pinned" detection logic. 384 * pinned" detection logic.
370 */ 385 */
371static int 386static void free_pages_bulk(struct zone *zone, int count,
372free_pages_bulk(struct zone *zone, int count, 387 struct list_head *list, int order)
373 struct list_head *list, unsigned int order)
374{ 388{
375 unsigned long flags; 389 spin_lock(&zone->lock);
376 struct page *page = NULL;
377 int ret = 0;
378
379 spin_lock_irqsave(&zone->lock, flags);
380 zone->all_unreclaimable = 0; 390 zone->all_unreclaimable = 0;
381 zone->pages_scanned = 0; 391 zone->pages_scanned = 0;
382 while (!list_empty(list) && count--) { 392 while (count--) {
393 struct page *page;
394
395 BUG_ON(list_empty(list));
383 page = list_entry(list->prev, struct page, lru); 396 page = list_entry(list->prev, struct page, lru);
384 /* have to delete it as __free_pages_bulk list manipulates */ 397 /* have to delete it as __free_one_page list manipulates */
385 list_del(&page->lru); 398 list_del(&page->lru);
386 __free_pages_bulk(page, zone, order); 399 __free_one_page(page, zone, order);
387 ret++;
388 } 400 }
389 spin_unlock_irqrestore(&zone->lock, flags); 401 spin_unlock(&zone->lock);
390 return ret;
391} 402}
392 403
393void __free_pages_ok(struct page *page, unsigned int order) 404static void free_one_page(struct zone *zone, struct page *page, int order)
394{ 405{
395 LIST_HEAD(list); 406 LIST_HEAD(list);
407 list_add(&page->lru, &list);
408 free_pages_bulk(zone, 1, &list, order);
409}
410
411static void __free_pages_ok(struct page *page, unsigned int order)
412{
413 unsigned long flags;
396 int i; 414 int i;
415 int reserved = 0;
397 416
398 arch_free_page(page, order); 417 arch_free_page(page, order);
399 418 if (!PageHighMem(page))
400 mod_page_state(pgfree, 1 << order); 419 mutex_debug_check_no_locks_freed(page_address(page),
420 PAGE_SIZE<<order);
401 421
402#ifndef CONFIG_MMU 422#ifndef CONFIG_MMU
403 if (order > 0) 423 for (i = 1 ; i < (1 << order) ; ++i)
404 for (i = 1 ; i < (1 << order) ; ++i) 424 __put_page(page + i);
405 __put_page(page + i);
406#endif 425#endif
407 426
408 for (i = 0 ; i < (1 << order) ; ++i) 427 for (i = 0 ; i < (1 << order) ; ++i)
409 free_pages_check(__FUNCTION__, page + i); 428 reserved += free_pages_check(page + i);
410 list_add(&page->lru, &list); 429 if (reserved)
411 kernel_map_pages(page, 1<<order, 0); 430 return;
412 free_pages_bulk(page_zone(page), 1, &list, order); 431
432 kernel_map_pages(page, 1 << order, 0);
433 local_irq_save(flags);
434 __mod_page_state(pgfree, 1 << order);
435 free_one_page(page_zone(page), page, order);
436 local_irq_restore(flags);
437}
438
439/*
440 * permit the bootmem allocator to evade page validation on high-order frees
441 */
442void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
443{
444 if (order == 0) {
445 __ClearPageReserved(page);
446 set_page_count(page, 0);
447
448 free_hot_cold_page(page, 0);
449 } else {
450 LIST_HEAD(list);
451 int loop;
452
453 for (loop = 0; loop < BITS_PER_LONG; loop++) {
454 struct page *p = &page[loop];
455
456 if (loop + 16 < BITS_PER_LONG)
457 prefetchw(p + 16);
458 __ClearPageReserved(p);
459 set_page_count(p, 0);
460 }
461
462 arch_free_page(page, order);
463
464 mod_page_state(pgfree, 1 << order);
465
466 list_add(&page->lru, &list);
467 kernel_map_pages(page, 1 << order, 0);
468 free_pages_bulk(page_zone(page), 1, &list, order);
469 }
413} 470}
414 471
415 472
@@ -427,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
427 * 484 *
428 * -- wli 485 * -- wli
429 */ 486 */
430static inline struct page * 487static inline void expand(struct zone *zone, struct page *page,
431expand(struct zone *zone, struct page *page,
432 int low, int high, struct free_area *area) 488 int low, int high, struct free_area *area)
433{ 489{
434 unsigned long size = 1 << high; 490 unsigned long size = 1 << high;
@@ -442,34 +498,16 @@ expand(struct zone *zone, struct page *page,
442 area->nr_free++; 498 area->nr_free++;
443 set_page_order(&page[size], high); 499 set_page_order(&page[size], high);
444 } 500 }
445 return page;
446}
447
448void set_page_refs(struct page *page, int order)
449{
450#ifdef CONFIG_MMU
451 set_page_count(page, 1);
452#else
453 int i;
454
455 /*
456 * We need to reference all the pages for this order, otherwise if
457 * anyone accesses one of the pages with (get/put) it will be freed.
458 * - eg: access_process_vm()
459 */
460 for (i = 0; i < (1 << order); i++)
461 set_page_count(page + i, 1);
462#endif /* CONFIG_MMU */
463} 501}
464 502
465/* 503/*
466 * This page is about to be returned from the page allocator 504 * This page is about to be returned from the page allocator
467 */ 505 */
468static void prep_new_page(struct page *page, int order) 506static int prep_new_page(struct page *page, int order)
469{ 507{
470 if ( page_mapcount(page) || 508 if (unlikely(page_mapcount(page) |
471 page->mapping != NULL || 509 (page->mapping != NULL) |
472 page_count(page) != 0 || 510 (page_count(page) != 0) |
473 (page->flags & ( 511 (page->flags & (
474 1 << PG_lru | 512 1 << PG_lru |
475 1 << PG_private | 513 1 << PG_private |
@@ -480,8 +518,15 @@ static void prep_new_page(struct page *page, int order)
480 1 << PG_slab | 518 1 << PG_slab |
481 1 << PG_swapcache | 519 1 << PG_swapcache |
482 1 << PG_writeback | 520 1 << PG_writeback |
483 1 << PG_reserved ))) 521 1 << PG_reserved ))))
484 bad_page(__FUNCTION__, page); 522 bad_page(page);
523
524 /*
525 * For now, we report if PG_reserved was found set, but do not
526 * clear it, and do not allocate the page: as a safety net.
527 */
528 if (PageReserved(page))
529 return 1;
485 530
486 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 531 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
487 1 << PG_referenced | 1 << PG_arch_1 | 532 1 << PG_referenced | 1 << PG_arch_1 |
@@ -489,6 +534,7 @@ static void prep_new_page(struct page *page, int order)
489 set_page_private(page, 0); 534 set_page_private(page, 0);
490 set_page_refs(page, order); 535 set_page_refs(page, order);
491 kernel_map_pages(page, 1 << order, 1); 536 kernel_map_pages(page, 1 << order, 1);
537 return 0;
492} 538}
493 539
494/* 540/*
@@ -511,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
511 rmv_page_order(page); 557 rmv_page_order(page);
512 area->nr_free--; 558 area->nr_free--;
513 zone->free_pages -= 1UL << order; 559 zone->free_pages -= 1UL << order;
514 return expand(zone, page, order, current_order, area); 560 expand(zone, page, order, current_order, area);
561 return page;
515 } 562 }
516 563
517 return NULL; 564 return NULL;
@@ -525,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525static int rmqueue_bulk(struct zone *zone, unsigned int order, 572static int rmqueue_bulk(struct zone *zone, unsigned int order,
526 unsigned long count, struct list_head *list) 573 unsigned long count, struct list_head *list)
527{ 574{
528 unsigned long flags;
529 int i; 575 int i;
530 int allocated = 0;
531 struct page *page;
532 576
533 spin_lock_irqsave(&zone->lock, flags); 577 spin_lock(&zone->lock);
534 for (i = 0; i < count; ++i) { 578 for (i = 0; i < count; ++i) {
535 page = __rmqueue(zone, order); 579 struct page *page = __rmqueue(zone, order);
536 if (page == NULL) 580 if (unlikely(page == NULL))
537 break; 581 break;
538 allocated++;
539 list_add_tail(&page->lru, list); 582 list_add_tail(&page->lru, list);
540 } 583 }
541 spin_unlock_irqrestore(&zone->lock, flags); 584 spin_unlock(&zone->lock);
542 return allocated; 585 return i;
543} 586}
544 587
545#ifdef CONFIG_NUMA 588#ifdef CONFIG_NUMA
@@ -558,14 +601,13 @@ void drain_remote_pages(void)
558 if (zone->zone_pgdat->node_id == numa_node_id()) 601 if (zone->zone_pgdat->node_id == numa_node_id())
559 continue; 602 continue;
560 603
561 pset = zone->pageset[smp_processor_id()]; 604 pset = zone_pcp(zone, smp_processor_id());
562 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 605 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
563 struct per_cpu_pages *pcp; 606 struct per_cpu_pages *pcp;
564 607
565 pcp = &pset->pcp[i]; 608 pcp = &pset->pcp[i];
566 if (pcp->count) 609 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
567 pcp->count -= free_pages_bulk(zone, pcp->count, 610 pcp->count = 0;
568 &pcp->list, 0);
569 } 611 }
570 } 612 }
571 local_irq_restore(flags); 613 local_irq_restore(flags);
@@ -575,6 +617,7 @@ void drain_remote_pages(void)
575#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 617#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
576static void __drain_pages(unsigned int cpu) 618static void __drain_pages(unsigned int cpu)
577{ 619{
620 unsigned long flags;
578 struct zone *zone; 621 struct zone *zone;
579 int i; 622 int i;
580 623
@@ -586,8 +629,10 @@ static void __drain_pages(unsigned int cpu)
586 struct per_cpu_pages *pcp; 629 struct per_cpu_pages *pcp;
587 630
588 pcp = &pset->pcp[i]; 631 pcp = &pset->pcp[i];
589 pcp->count -= free_pages_bulk(zone, pcp->count, 632 local_irq_save(flags);
590 &pcp->list, 0); 633 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
634 pcp->count = 0;
635 local_irq_restore(flags);
591 } 636 }
592 } 637 }
593} 638}
@@ -633,18 +678,14 @@ void drain_local_pages(void)
633} 678}
634#endif /* CONFIG_PM */ 679#endif /* CONFIG_PM */
635 680
636static void zone_statistics(struct zonelist *zonelist, struct zone *z) 681static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
637{ 682{
638#ifdef CONFIG_NUMA 683#ifdef CONFIG_NUMA
639 unsigned long flags;
640 int cpu;
641 pg_data_t *pg = z->zone_pgdat; 684 pg_data_t *pg = z->zone_pgdat;
642 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 685 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
643 struct per_cpu_pageset *p; 686 struct per_cpu_pageset *p;
644 687
645 local_irq_save(flags); 688 p = zone_pcp(z, cpu);
646 cpu = smp_processor_id();
647 p = zone_pcp(z,cpu);
648 if (pg == orig) { 689 if (pg == orig) {
649 p->numa_hit++; 690 p->numa_hit++;
650 } else { 691 } else {
@@ -655,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
655 p->local_node++; 696 p->local_node++;
656 else 697 else
657 p->other_node++; 698 p->other_node++;
658 local_irq_restore(flags);
659#endif 699#endif
660} 700}
661 701
662/* 702/*
663 * Free a 0-order page 703 * Free a 0-order page
664 */ 704 */
665static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
666static void fastcall free_hot_cold_page(struct page *page, int cold) 705static void fastcall free_hot_cold_page(struct page *page, int cold)
667{ 706{
668 struct zone *zone = page_zone(page); 707 struct zone *zone = page_zone(page);
@@ -671,17 +710,22 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
671 710
672 arch_free_page(page, 0); 711 arch_free_page(page, 0);
673 712
674 kernel_map_pages(page, 1, 0);
675 inc_page_state(pgfree);
676 if (PageAnon(page)) 713 if (PageAnon(page))
677 page->mapping = NULL; 714 page->mapping = NULL;
678 free_pages_check(__FUNCTION__, page); 715 if (free_pages_check(page))
716 return;
717
718 kernel_map_pages(page, 1, 0);
719
679 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 720 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
680 local_irq_save(flags); 721 local_irq_save(flags);
722 __inc_page_state(pgfree);
681 list_add(&page->lru, &pcp->list); 723 list_add(&page->lru, &pcp->list);
682 pcp->count++; 724 pcp->count++;
683 if (pcp->count >= pcp->high) 725 if (pcp->count >= pcp->high) {
684 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 726 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
727 pcp->count -= pcp->batch;
728 }
685 local_irq_restore(flags); 729 local_irq_restore(flags);
686 put_cpu(); 730 put_cpu();
687} 731}
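
free_hot_cold_page() above and the reworked buffered_rmqueue() in the next hunk form a pair: order-0 frees push onto a per-cpu list and spill pcp->batch pages back to the buddy lists once pcp->high is reached, while order-0 allocations pop from that list and refill it with one batched rmqueue_bulk() call when it runs dry. A toy single-CPU model of that batching (ints stand in for pages, a small array for the list):

#include <stdio.h>

#define PCP_HIGH  8
#define PCP_BATCH 4

static int pcp[PCP_HIGH];               /* per-cpu cache of free "pages" */
static int pcp_count;
static int buddy_free = 64;             /* pages left in the buddy lists */

static void free_page_toy(int page)
{
        pcp[pcp_count++] = page;
        if (pcp_count >= PCP_HIGH) {    /* spill a batch back to the buddy */
                pcp_count -= PCP_BATCH;
                buddy_free += PCP_BATCH;
        }
}

static int alloc_page_toy(void)
{
        int i, n;

        if (!pcp_count) {               /* refill with one batched request */
                n = buddy_free < PCP_BATCH ? buddy_free : PCP_BATCH;
                for (i = 0; i < n; i++)
                        pcp[pcp_count++] = 1000 + --buddy_free;
                if (!pcp_count)
                        return -1;      /* the "failed:" path */
        }
        return pcp[--pcp_count];
}

int main(void)
{
        int a = alloc_page_toy();
        int b = alloc_page_toy();

        printf("%d %d\n", a, b);
        free_page_toy(42);
        printf("%d\n", alloc_page_toy());       /* 42 comes straight back */
        return 0;
}
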
@@ -710,64 +754,82 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
710 * we cheat by calling it from here, in the order > 0 path. Saves a branch 754 * we cheat by calling it from here, in the order > 0 path. Saves a branch
711 * or two. 755 * or two.
712 */ 756 */
713static struct page * 757static struct page *buffered_rmqueue(struct zonelist *zonelist,
714buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 758 struct zone *zone, int order, gfp_t gfp_flags)
715{ 759{
716 unsigned long flags; 760 unsigned long flags;
717 struct page *page = NULL; 761 struct page *page;
718 int cold = !!(gfp_flags & __GFP_COLD); 762 int cold = !!(gfp_flags & __GFP_COLD);
763 int cpu;
719 764
720 if (order == 0) { 765again:
766 cpu = get_cpu();
767 if (likely(order == 0)) {
721 struct per_cpu_pages *pcp; 768 struct per_cpu_pages *pcp;
722 769
723 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 770 pcp = &zone_pcp(zone, cpu)->pcp[cold];
724 local_irq_save(flags); 771 local_irq_save(flags);
725 if (pcp->count <= pcp->low) 772 if (!pcp->count) {
726 pcp->count += rmqueue_bulk(zone, 0, 773 pcp->count += rmqueue_bulk(zone, 0,
727 pcp->batch, &pcp->list); 774 pcp->batch, &pcp->list);
728 if (pcp->count) { 775 if (unlikely(!pcp->count))
729 page = list_entry(pcp->list.next, struct page, lru); 776 goto failed;
730 list_del(&page->lru);
731 pcp->count--;
732 } 777 }
733 local_irq_restore(flags); 778 page = list_entry(pcp->list.next, struct page, lru);
734 put_cpu(); 779 list_del(&page->lru);
735 } 780 pcp->count--;
736 781 } else {
737 if (page == NULL) {
738 spin_lock_irqsave(&zone->lock, flags); 782 spin_lock_irqsave(&zone->lock, flags);
739 page = __rmqueue(zone, order); 783 page = __rmqueue(zone, order);
740 spin_unlock_irqrestore(&zone->lock, flags); 784 spin_unlock(&zone->lock);
785 if (!page)
786 goto failed;
741 } 787 }
742 788
743 if (page != NULL) { 789 __mod_page_state_zone(zone, pgalloc, 1 << order);
744 BUG_ON(bad_range(zone, page)); 790 zone_statistics(zonelist, zone, cpu);
745 mod_page_state_zone(zone, pgalloc, 1 << order); 791 local_irq_restore(flags);
746 prep_new_page(page, order); 792 put_cpu();
793
794 BUG_ON(bad_range(zone, page));
795 if (prep_new_page(page, order))
796 goto again;
747 797
748 if (gfp_flags & __GFP_ZERO) 798 if (gfp_flags & __GFP_ZERO)
749 prep_zero_page(page, order, gfp_flags); 799 prep_zero_page(page, order, gfp_flags);
750 800
751 if (order && (gfp_flags & __GFP_COMP)) 801 if (order && (gfp_flags & __GFP_COMP))
752 prep_compound_page(page, order); 802 prep_compound_page(page, order);
753 }
754 return page; 803 return page;
804
805failed:
806 local_irq_restore(flags);
807 put_cpu();
808 return NULL;
755} 809}
756 810
811#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
812#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
813#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
814#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
815#define ALLOC_HARDER 0x10 /* try to alloc harder */
816#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
817#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
818
757/* 819/*
758 * Return 1 if free pages are above 'mark'. This takes into account the order 820 * Return 1 if free pages are above 'mark'. This takes into account the order
759 * of the allocation. 821 * of the allocation.
760 */ 822 */
761int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 823int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
762 int classzone_idx, int can_try_harder, gfp_t gfp_high) 824 int classzone_idx, int alloc_flags)
763{ 825{
764 /* free_pages may go negative - that's OK */ 826 /* free_pages may go negative - that's OK */
765 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 827 long min = mark, free_pages = z->free_pages - (1 << order) + 1;
766 int o; 828 int o;
767 829
768 if (gfp_high) 830 if (alloc_flags & ALLOC_HIGH)
769 min -= min / 2; 831 min -= min / 2;
770 if (can_try_harder) 832 if (alloc_flags & ALLOC_HARDER)
771 min -= min / 4; 833 min -= min / 4;
772 834
773 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 835 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -785,14 +847,48 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
785 return 1; 847 return 1;
786} 848}
787 849
788static inline int 850/*
789should_reclaim_zone(struct zone *z, gfp_t gfp_mask) 851 * get_page_from_freelist goes through the zonelist trying to allocate
852 * a page.
853 */
854static struct page *
855get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
856 struct zonelist *zonelist, int alloc_flags)
790{ 857{
791 if (!z->reclaim_pages) 858 struct zone **z = zonelist->zones;
792 return 0; 859 struct page *page = NULL;
793 if (gfp_mask & __GFP_NORECLAIM) 860 int classzone_idx = zone_idx(*z);
794 return 0; 861
795 return 1; 862 /*
863 * Go through the zonelist once, looking for a zone with enough free.
864 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
865 */
866 do {
867 if ((alloc_flags & ALLOC_CPUSET) &&
868 !cpuset_zone_allowed(*z, gfp_mask))
869 continue;
870
871 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
872 unsigned long mark;
873 if (alloc_flags & ALLOC_WMARK_MIN)
874 mark = (*z)->pages_min;
875 else if (alloc_flags & ALLOC_WMARK_LOW)
876 mark = (*z)->pages_low;
877 else
878 mark = (*z)->pages_high;
879 if (!zone_watermark_ok(*z, order, mark,
880 classzone_idx, alloc_flags))
881 if (!zone_reclaim_mode ||
882 !zone_reclaim(*z, gfp_mask, order))
883 continue;
884 }
885
886 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
887 if (page) {
888 break;
889 }
890 } while (*(++z) != NULL);
891 return page;
796} 892}
797 893
798/* 894/*
@@ -803,105 +899,76 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
803 struct zonelist *zonelist) 899 struct zonelist *zonelist)
804{ 900{
805 const gfp_t wait = gfp_mask & __GFP_WAIT; 901 const gfp_t wait = gfp_mask & __GFP_WAIT;
806 struct zone **zones, *z; 902 struct zone **z;
807 struct page *page; 903 struct page *page;
808 struct reclaim_state reclaim_state; 904 struct reclaim_state reclaim_state;
809 struct task_struct *p = current; 905 struct task_struct *p = current;
810 int i;
811 int classzone_idx;
812 int do_retry; 906 int do_retry;
813 int can_try_harder; 907 int alloc_flags;
814 int did_some_progress; 908 int did_some_progress;
815 909
816 might_sleep_if(wait); 910 might_sleep_if(wait);
817 911
818 /* 912restart:
819 * The caller may dip into page reserves a bit more if the caller 913 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
820 * cannot run direct reclaim, or is the caller has realtime scheduling
821 * policy
822 */
823 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
824
825 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
826 914
827 if (unlikely(zones[0] == NULL)) { 915 if (unlikely(*z == NULL)) {
828 /* Should this ever happen?? */ 916 /* Should this ever happen?? */
829 return NULL; 917 return NULL;
830 } 918 }
831 919
832 classzone_idx = zone_idx(zones[0]); 920 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
921 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
922 if (page)
923 goto got_pg;
924
925 do {
926 wakeup_kswapd(*z, order);
927 } while (*(++z));
833 928
834restart:
835 /* 929 /*
836 * Go through the zonelist once, looking for a zone with enough free. 930 * OK, we're below the kswapd watermark and have kicked background
837 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 931 * reclaim. Now things get more complex, so set up alloc_flags according
932 * to how we want to proceed.
933 *
934 * The caller may dip into page reserves a bit more if the caller
935 * cannot run direct reclaim, or if the caller has realtime scheduling
936 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
937 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
838 */ 938 */
839 for (i = 0; (z = zones[i]) != NULL; i++) { 939 alloc_flags = ALLOC_WMARK_MIN;
840 int do_reclaim = should_reclaim_zone(z, gfp_mask); 940 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
841 941 alloc_flags |= ALLOC_HARDER;
842 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 942 if (gfp_mask & __GFP_HIGH)
843 continue; 943 alloc_flags |= ALLOC_HIGH;
844 944 alloc_flags |= ALLOC_CPUSET;
845 /*
846 * If the zone is to attempt early page reclaim then this loop
847 * will try to reclaim pages and check the watermark a second
848 * time before giving up and falling back to the next zone.
849 */
850zone_reclaim_retry:
851 if (!zone_watermark_ok(z, order, z->pages_low,
852 classzone_idx, 0, 0)) {
853 if (!do_reclaim)
854 continue;
855 else {
856 zone_reclaim(z, gfp_mask, order);
857 /* Only try reclaim once */
858 do_reclaim = 0;
859 goto zone_reclaim_retry;
860 }
861 }
862
863 page = buffered_rmqueue(z, order, gfp_mask);
864 if (page)
865 goto got_pg;
866 }
867
868 for (i = 0; (z = zones[i]) != NULL; i++)
869 wakeup_kswapd(z, order);
870 945
871 /* 946 /*
872 * Go through the zonelist again. Let __GFP_HIGH and allocations 947 * Go through the zonelist again. Let __GFP_HIGH and allocations
873 * coming from realtime tasks to go deeper into reserves 948 * coming from realtime tasks go deeper into reserves.
874 * 949 *
875 * This is the last chance, in general, before the goto nopage. 950 * This is the last chance, in general, before the goto nopage.
876 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 951 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
877 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 952 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
878 */ 953 */
879 for (i = 0; (z = zones[i]) != NULL; i++) { 954 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
880 if (!zone_watermark_ok(z, order, z->pages_min, 955 if (page)
881 classzone_idx, can_try_harder, 956 goto got_pg;
882 gfp_mask & __GFP_HIGH))
883 continue;
884
885 if (wait && !cpuset_zone_allowed(z, gfp_mask))
886 continue;
887
888 page = buffered_rmqueue(z, order, gfp_mask);
889 if (page)
890 goto got_pg;
891 }
892 957
893 /* This allocation should allow future memory freeing. */ 958 /* This allocation should allow future memory freeing. */
894 959
895 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 960 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
896 && !in_interrupt()) { 961 && !in_interrupt()) {
897 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 962 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
963nofail_alloc:
898 /* go through the zonelist yet again, ignoring mins */ 964 /* go through the zonelist yet again, ignoring mins */
899 for (i = 0; (z = zones[i]) != NULL; i++) { 965 page = get_page_from_freelist(gfp_mask, order,
900 if (!cpuset_zone_allowed(z, gfp_mask)) 966 zonelist, ALLOC_NO_WATERMARKS);
901 continue; 967 if (page)
902 page = buffered_rmqueue(z, order, gfp_mask); 968 goto got_pg;
903 if (page) 969 if (gfp_mask & __GFP_NOFAIL) {
904 goto got_pg; 970 blk_congestion_wait(WRITE, HZ/50);
971 goto nofail_alloc;
905 } 972 }
906 } 973 }
907 goto nopage; 974 goto nopage;
@@ -915,11 +982,12 @@ rebalance:
915 cond_resched(); 982 cond_resched();
916 983
917 /* We now go into synchronous reclaim */ 984 /* We now go into synchronous reclaim */
985 cpuset_memory_pressure_bump();
918 p->flags |= PF_MEMALLOC; 986 p->flags |= PF_MEMALLOC;
919 reclaim_state.reclaimed_slab = 0; 987 reclaim_state.reclaimed_slab = 0;
920 p->reclaim_state = &reclaim_state; 988 p->reclaim_state = &reclaim_state;
921 989
922 did_some_progress = try_to_free_pages(zones, gfp_mask); 990 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
923 991
924 p->reclaim_state = NULL; 992 p->reclaim_state = NULL;
925 p->flags &= ~PF_MEMALLOC; 993 p->flags &= ~PF_MEMALLOC;
@@ -927,19 +995,10 @@ rebalance:
927 cond_resched(); 995 cond_resched();
928 996
929 if (likely(did_some_progress)) { 997 if (likely(did_some_progress)) {
930 for (i = 0; (z = zones[i]) != NULL; i++) { 998 page = get_page_from_freelist(gfp_mask, order,
931 if (!zone_watermark_ok(z, order, z->pages_min, 999 zonelist, alloc_flags);
932 classzone_idx, can_try_harder, 1000 if (page)
933 gfp_mask & __GFP_HIGH)) 1001 goto got_pg;
934 continue;
935
936 if (!cpuset_zone_allowed(z, gfp_mask))
937 continue;
938
939 page = buffered_rmqueue(z, order, gfp_mask);
940 if (page)
941 goto got_pg;
942 }
943 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1002 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
944 /* 1003 /*
945 * Go through the zonelist yet one more time, keep 1004 * Go through the zonelist yet one more time, keep
@@ -947,18 +1006,10 @@ rebalance:
947 * a parallel oom killing, we must fail if we're still 1006 * a parallel oom killing, we must fail if we're still
948 * under heavy pressure. 1007 * under heavy pressure.
949 */ 1008 */
950 for (i = 0; (z = zones[i]) != NULL; i++) { 1009 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
951 if (!zone_watermark_ok(z, order, z->pages_high, 1010 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
952 classzone_idx, 0, 0)) 1011 if (page)
953 continue; 1012 goto got_pg;
954
955 if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
956 continue;
957
958 page = buffered_rmqueue(z, order, gfp_mask);
959 if (page)
960 goto got_pg;
961 }
962 1013
963 out_of_memory(gfp_mask, order); 1014 out_of_memory(gfp_mask, order);
964 goto restart; 1015 goto restart;
@@ -991,9 +1042,7 @@ nopage:
991 dump_stack(); 1042 dump_stack();
992 show_mem(); 1043 show_mem();
993 } 1044 }
994 return NULL;
995got_pg: 1045got_pg:
996 zone_statistics(zonelist, z);
997 return page; 1046 return page;
998} 1047}
999 1048
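
Every allocation path above now funnels its watermark decision through zone_watermark_ok(), whose arithmetic is easy to miss across the hunks: ALLOC_HIGH halves the mark, ALLOC_HARDER shaves another quarter off, and the per-order loop (elided between the hunks) keeps discounting blocks below the requested order while halving the mark at each step. A stand-alone sketch with the zone fields passed in as plain parameters:

#include <stdio.h>
#include <stdbool.h>

#define ALLOC_HARDER 0x10               /* mirrors the new alloc_flags */
#define ALLOC_HIGH   0x20

static bool watermark_ok(long free_pages, const long nr_free_at[],
                         int order, long mark, long lowmem_reserve, int flags)
{
        long min = mark;
        int o;

        free_pages -= (1L << order) - 1;        /* the request itself */
        if (flags & ALLOC_HIGH)
                min -= min / 2;
        if (flags & ALLOC_HARDER)
                min -= min / 4;

        if (free_pages <= min + lowmem_reserve)
                return false;
        for (o = 0; o < order; o++) {
                /* at order o+1, free blocks of order o no longer help ... */
                free_pages -= nr_free_at[o] << o;
                min >>= 1;                      /* ... but less is required */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        /* free blocks per order: 100 at order 0, 10 at order 1, 4 at order 2 */
        const long nr_free[] = { 100, 10, 4 };
        long total = 100 + 10 * 2 + 4 * 4;      /* 136 free pages in the zone */

        printf("normal request: %d\n", watermark_ok(total, nr_free, 2, 128, 0, 0));
        printf("atomic request: %d\n", watermark_ok(total, nr_free, 2, 128, 0,
                                                    ALLOC_HIGH | ALLOC_HARDER));
        return 0;
}

With the same 136 free pages, the plain order-2 request fails at the order-0 step while the ALLOC_HIGH|ALLOC_HARDER one gets through, which is the extra depth into reserves that the __alloc_pages() comments above describe.
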
@@ -1160,7 +1209,7 @@ EXPORT_SYMBOL(nr_pagecache);
1160DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1209DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1161#endif 1210#endif
1162 1211
1163void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1212static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1164{ 1213{
1165 int cpu = 0; 1214 int cpu = 0;
1166 1215
@@ -1213,7 +1262,7 @@ void get_full_page_state(struct page_state *ret)
1213 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1262 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1214} 1263}
1215 1264
1216unsigned long __read_page_state(unsigned long offset) 1265unsigned long read_page_state_offset(unsigned long offset)
1217{ 1266{
1218 unsigned long ret = 0; 1267 unsigned long ret = 0;
1219 int cpu; 1268 int cpu;
@@ -1227,18 +1276,26 @@ unsigned long __read_page_state(unsigned long offset)
1227 return ret; 1276 return ret;
1228} 1277}
1229 1278
1230void __mod_page_state(unsigned long offset, unsigned long delta) 1279void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1280{
1281 void *ptr;
1282
1283 ptr = &__get_cpu_var(page_states);
1284 *(unsigned long *)(ptr + offset) += delta;
1285}
1286EXPORT_SYMBOL(__mod_page_state_offset);
1287
1288void mod_page_state_offset(unsigned long offset, unsigned long delta)
1231{ 1289{
1232 unsigned long flags; 1290 unsigned long flags;
1233 void* ptr; 1291 void *ptr;
1234 1292
1235 local_irq_save(flags); 1293 local_irq_save(flags);
1236 ptr = &__get_cpu_var(page_states); 1294 ptr = &__get_cpu_var(page_states);
1237 *(unsigned long*)(ptr + offset) += delta; 1295 *(unsigned long *)(ptr + offset) += delta;
1238 local_irq_restore(flags); 1296 local_irq_restore(flags);
1239} 1297}
1240 1298EXPORT_SYMBOL(mod_page_state_offset);
1241EXPORT_SYMBOL(__mod_page_state);
1242 1299
1243void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1300void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1244 unsigned long *free, struct pglist_data *pgdat) 1301 unsigned long *free, struct pglist_data *pgdat)
@@ -1324,7 +1381,7 @@ void show_free_areas(void)
1324 show_node(zone); 1381 show_node(zone);
1325 printk("%s per-cpu:", zone->name); 1382 printk("%s per-cpu:", zone->name);
1326 1383
1327 if (!zone->present_pages) { 1384 if (!populated_zone(zone)) {
1328 printk(" empty\n"); 1385 printk(" empty\n");
1329 continue; 1386 continue;
1330 } else 1387 } else
@@ -1336,10 +1393,9 @@ void show_free_areas(void)
1336 pageset = zone_pcp(zone, cpu); 1393 pageset = zone_pcp(zone, cpu);
1337 1394
1338 for (temperature = 0; temperature < 2; temperature++) 1395 for (temperature = 0; temperature < 2; temperature++)
1339 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1396 printk("cpu %d %s: high %d, batch %d used:%d\n",
1340 cpu, 1397 cpu,
1341 temperature ? "cold" : "hot", 1398 temperature ? "cold" : "hot",
1342 pageset->pcp[temperature].low,
1343 pageset->pcp[temperature].high, 1399 pageset->pcp[temperature].high,
1344 pageset->pcp[temperature].batch, 1400 pageset->pcp[temperature].batch,
1345 pageset->pcp[temperature].count); 1401 pageset->pcp[temperature].count);
@@ -1402,7 +1458,7 @@ void show_free_areas(void)
1402 1458
1403 show_node(zone); 1459 show_node(zone);
1404 printk("%s: ", zone->name); 1460 printk("%s: ", zone->name);
1405 if (!zone->present_pages) { 1461 if (!populated_zone(zone)) {
1406 printk("empty\n"); 1462 printk("empty\n");
1407 continue; 1463 continue;
1408 } 1464 }
@@ -1422,32 +1478,29 @@ void show_free_areas(void)
1422 1478
1423/* 1479/*
1424 * Builds allocation fallback zone lists. 1480 * Builds allocation fallback zone lists.
1481 *
1482 * Add all populated zones of a node to the zonelist.
1425 */ 1483 */
1426static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1484static int __init build_zonelists_node(pg_data_t *pgdat,
1427{ 1485 struct zonelist *zonelist, int nr_zones, int zone_type)
1428 switch (k) { 1486{
1429 struct zone *zone; 1487 struct zone *zone;
1430 default: 1488
1431 BUG(); 1489 BUG_ON(zone_type > ZONE_HIGHMEM);
1432 case ZONE_HIGHMEM: 1490
1433 zone = pgdat->node_zones + ZONE_HIGHMEM; 1491 do {
1434 if (zone->present_pages) { 1492 zone = pgdat->node_zones + zone_type;
1493 if (populated_zone(zone)) {
1435#ifndef CONFIG_HIGHMEM 1494#ifndef CONFIG_HIGHMEM
1436 BUG(); 1495 BUG_ON(zone_type > ZONE_NORMAL);
1437#endif 1496#endif
1438 zonelist->zones[j++] = zone; 1497 zonelist->zones[nr_zones++] = zone;
1498 check_highest_zone(zone_type);
1439 } 1499 }
1440 case ZONE_NORMAL: 1500 zone_type--;
1441 zone = pgdat->node_zones + ZONE_NORMAL;
1442 if (zone->present_pages)
1443 zonelist->zones[j++] = zone;
1444 case ZONE_DMA:
1445 zone = pgdat->node_zones + ZONE_DMA;
1446 if (zone->present_pages)
1447 zonelist->zones[j++] = zone;
1448 }
1449 1501
1450 return j; 1502 } while (zone_type >= 0);
1503 return nr_zones;
1451} 1504}
1452 1505
1453static inline int highest_zone(int zone_bits) 1506static inline int highest_zone(int zone_bits)
@@ -1455,6 +1508,8 @@ static inline int highest_zone(int zone_bits)
1455 int res = ZONE_NORMAL; 1508 int res = ZONE_NORMAL;
1456 if (zone_bits & (__force int)__GFP_HIGHMEM) 1509 if (zone_bits & (__force int)__GFP_HIGHMEM)
1457 res = ZONE_HIGHMEM; 1510 res = ZONE_HIGHMEM;
1511 if (zone_bits & (__force int)__GFP_DMA32)
1512 res = ZONE_DMA32;
1458 if (zone_bits & (__force int)__GFP_DMA) 1513 if (zone_bits & (__force int)__GFP_DMA)
1459 res = ZONE_DMA; 1514 res = ZONE_DMA;
1460 return res; 1515 return res;
@@ -1542,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
1542 prev_node = local_node; 1597 prev_node = local_node;
1543 nodes_clear(used_mask); 1598 nodes_clear(used_mask);
1544 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1599 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1600 int distance = node_distance(local_node, node);
1601
1602 /*
1603 * If another node is sufficiently far away then it is better
1604 * to reclaim pages in a zone before going off node.
1605 */
1606 if (distance > RECLAIM_DISTANCE)
1607 zone_reclaim_mode = 1;
1608
1545 /* 1609 /*
1546 * We don't want to pressure a particular node. 1610 * We don't want to pressure a particular node.
1547 * So adding penalty to the first node in same 1611 * So adding penalty to the first node in same
1548 * distance group to make it round-robin. 1612 * distance group to make it round-robin.
1549 */ 1613 */
1550 if (node_distance(local_node, node) != 1614
1551 node_distance(local_node, prev_node)) 1615 if (distance != node_distance(local_node, prev_node))
1552 node_load[node] += load; 1616 node_load[node] += load;
1553 prev_node = node; 1617 prev_node = node;
1554 load--; 1618 load--;
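The new distance check in the hunk above turns on zone_reclaim_mode as soon as any other node sits farther away than RECLAIM_DISTANCE, so a zone is reclaimed locally before allocations spill to a distant node. A toy model of that decision; the distance table and the RECLAIM_DISTANCE value below are invented for the example.

#include <stdio.h>

#define RECLAIM_DISTANCE 20              /* assumed threshold, for illustration only */

static int node_distance(int a, int b)   /* toy SLIT-style distance table */
{
	static const int d[2][2] = { {10, 30}, {30, 10} };
	return d[a][b];
}

int main(void)
{
	int zone_reclaim_mode = 0, local_node = 0, node;

	for (node = 0; node < 2; node++)
		if (node_distance(local_node, node) > RECLAIM_DISTANCE)
			zone_reclaim_mode = 1;   /* prefer local reclaim over remote allocation */

	printf("zone_reclaim_mode = %d\n", zone_reclaim_mode);   /* prints 1 here */
	return 0;
}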
@@ -1682,18 +1746,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1682 * up by free_all_bootmem() once the early boot process is 1746 * up by free_all_bootmem() once the early boot process is
1683 * done. Non-atomic initialization, single-pass. 1747 * done. Non-atomic initialization, single-pass.
1684 */ 1748 */
1685void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1749void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1686 unsigned long start_pfn) 1750 unsigned long start_pfn)
1687{ 1751{
1688 struct page *page; 1752 struct page *page;
1689 unsigned long end_pfn = start_pfn + size; 1753 unsigned long end_pfn = start_pfn + size;
1690 unsigned long pfn; 1754 unsigned long pfn;
1691 1755
1692 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1756 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1693 if (!early_pfn_valid(pfn)) 1757 if (!early_pfn_valid(pfn))
1694 continue; 1758 continue;
1695 if (!early_pfn_in_nid(pfn, nid))
1696 continue;
1697 page = pfn_to_page(pfn); 1759 page = pfn_to_page(pfn);
1698 set_page_links(page, zone, nid, pfn); 1760 set_page_links(page, zone, nid, pfn);
1699 set_page_count(page, 1); 1761 set_page_count(page, 1);
@@ -1737,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1737 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1799 memmap_init_zone((size), (nid), (zone), (start_pfn))
1738#endif 1800#endif
1739 1801
1740static int __devinit zone_batchsize(struct zone *zone) 1802static int __meminit zone_batchsize(struct zone *zone)
1741{ 1803{
1742 int batch; 1804 int batch;
1743 1805
@@ -1755,16 +1817,16 @@ static int __devinit zone_batchsize(struct zone *zone)
1755 batch = 1; 1817 batch = 1;
1756 1818
1757 /* 1819 /*
1758 * We will be trying to allcoate bigger chunks of contiguous 1820 * Clamp the batch to a 2^n - 1 value. Having a power
1759 * memory of the order of fls(batch). This should result in 1821 * of 2 value was found to be more likely to have
1760 * better cache coloring. 1822 * suboptimal cache aliasing properties in some cases.
1761 * 1823 *
1762 * A sanity check also to ensure that batch is still in limits. 1824 * For example if 2 tasks are alternately allocating
1825 * batches of pages, one task can end up with a lot
1826 * of pages of one half of the possible page colors
1827 * and the other with pages of the other colors.
1763 */ 1828 */
1764 batch = (1 << fls(batch + batch/2)); 1829 batch = (1 << (fls(batch + batch/2)-1)) - 1;
1765
1766 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
1767 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
1768 1830
1769 return batch; 1831 return batch;
1770} 1832}
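A small worked example of the new clamp in zone_batchsize(): the candidate batch is rounded to a power of two and then reduced by one, so the per-cpu batch ends up at 2^n - 1 and two CPUs allocating in lockstep no longer split the page colours evenly between them. fls() is reimplemented here so the sketch builds on its own; the input values are arbitrary.

#include <stdio.h>

static int fls(unsigned int x)           /* highest set bit, 1-based, as in the kernel */
{
	int r = 0;
	while (x) { r++; x >>= 1; }
	return r;
}

int main(void)
{
	unsigned int batch;

	for (batch = 4; batch <= 32; batch *= 2) {
		unsigned int clamped = (1u << (fls(batch + batch / 2) - 1)) - 1;
		printf("batch %2u -> %2u\n", batch, clamped);
	}
	/* prints 4 -> 3, 8 -> 7, 16 -> 15, 32 -> 31 */
	return 0;
}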
@@ -1777,19 +1839,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1777 1839
1778 pcp = &p->pcp[0]; /* hot */ 1840 pcp = &p->pcp[0]; /* hot */
1779 pcp->count = 0; 1841 pcp->count = 0;
1780 pcp->low = 0;
1781 pcp->high = 6 * batch; 1842 pcp->high = 6 * batch;
1782 pcp->batch = max(1UL, 1 * batch); 1843 pcp->batch = max(1UL, 1 * batch);
1783 INIT_LIST_HEAD(&pcp->list); 1844 INIT_LIST_HEAD(&pcp->list);
1784 1845
1785 pcp = &p->pcp[1]; /* cold*/ 1846 pcp = &p->pcp[1]; /* cold*/
1786 pcp->count = 0; 1847 pcp->count = 0;
1787 pcp->low = 0;
1788 pcp->high = 2 * batch; 1848 pcp->high = 2 * batch;
1789 pcp->batch = max(1UL, batch/2); 1849 pcp->batch = max(1UL, batch/2);
1790 INIT_LIST_HEAD(&pcp->list); 1850 INIT_LIST_HEAD(&pcp->list);
1791} 1851}
1792 1852
1853/*
1854 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1855 * to the value high for the pageset p.
1856 */
1857
1858static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1859 unsigned long high)
1860{
1861 struct per_cpu_pages *pcp;
1862
1863 pcp = &p->pcp[0]; /* hot list */
1864 pcp->high = high;
1865 pcp->batch = max(1UL, high/4);
1866 if ((high/4) > (PAGE_SHIFT * 8))
1867 pcp->batch = PAGE_SHIFT * 8;
1868}
1869
1870
1793#ifdef CONFIG_NUMA 1871#ifdef CONFIG_NUMA
1794/* 1872/*
1795 * Boot pageset table. One per cpu which is going to be used for all 1873 * Boot pageset table. One per cpu which is going to be used for all
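For the default sizing in setup_pageset() above: with pcp->low gone, the hot list now drains when it reaches 6 * batch pages and the cold list at 2 * batch, with the cold list refilling in half-sized batches. A standalone sketch with a toy pageset struct; the field names mirror the hunk, everything else is invented.

#include <stdio.h>

struct toy_pcp { unsigned long count, high, batch; };

static void toy_setup_pageset(struct toy_pcp pcp[2], unsigned long batch)
{
	pcp[0].count = 0;                                /* hot */
	pcp[0].high  = 6 * batch;
	pcp[0].batch = batch > 1 ? batch : 1;
	pcp[1].count = 0;                                /* cold */
	pcp[1].high  = 2 * batch;
	pcp[1].batch = batch / 2 > 1 ? batch / 2 : 1;
}

int main(void)
{
	struct toy_pcp pcp[2];

	toy_setup_pageset(pcp, 31);   /* e.g. the 2^n - 1 batch from the earlier sketch */
	printf("hot: high %lu batch %lu, cold: high %lu batch %lu\n",
	       pcp[0].high, pcp[0].batch, pcp[1].high, pcp[1].batch);
	return 0;
}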
@@ -1815,18 +1893,22 @@ static struct per_cpu_pageset
1815 * Dynamically allocate memory for the 1893 * Dynamically allocate memory for the
1816 * per cpu pageset array in struct zone. 1894 * per cpu pageset array in struct zone.
1817 */ 1895 */
1818static int __devinit process_zones(int cpu) 1896static int __meminit process_zones(int cpu)
1819{ 1897{
1820 struct zone *zone, *dzone; 1898 struct zone *zone, *dzone;
1821 1899
1822 for_each_zone(zone) { 1900 for_each_zone(zone) {
1823 1901
1824 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1902 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1825 GFP_KERNEL, cpu_to_node(cpu)); 1903 GFP_KERNEL, cpu_to_node(cpu));
1826 if (!zone->pageset[cpu]) 1904 if (!zone_pcp(zone, cpu))
1827 goto bad; 1905 goto bad;
1828 1906
1829 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1907 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
1908
1909 if (percpu_pagelist_fraction)
1910 setup_pagelist_highmark(zone_pcp(zone, cpu),
1911 (zone->present_pages / percpu_pagelist_fraction));
1830 } 1912 }
1831 1913
1832 return 0; 1914 return 0;
@@ -1834,15 +1916,14 @@ bad:
1834 for_each_zone(dzone) { 1916 for_each_zone(dzone) {
1835 if (dzone == zone) 1917 if (dzone == zone)
1836 break; 1918 break;
1837 kfree(dzone->pageset[cpu]); 1919 kfree(zone_pcp(dzone, cpu));
1838 dzone->pageset[cpu] = NULL; 1920 zone_pcp(dzone, cpu) = NULL;
1839 } 1921 }
1840 return -ENOMEM; 1922 return -ENOMEM;
1841} 1923}
1842 1924
1843static inline void free_zone_pagesets(int cpu) 1925static inline void free_zone_pagesets(int cpu)
1844{ 1926{
1845#ifdef CONFIG_NUMA
1846 struct zone *zone; 1927 struct zone *zone;
1847 1928
1848 for_each_zone(zone) { 1929 for_each_zone(zone) {
@@ -1851,10 +1932,9 @@ static inline void free_zone_pagesets(int cpu)
1851 zone_pcp(zone, cpu) = NULL; 1932 zone_pcp(zone, cpu) = NULL;
1852 kfree(pset); 1933 kfree(pset);
1853 } 1934 }
1854#endif
1855} 1935}
1856 1936
1857static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1937static int __meminit pageset_cpuup_callback(struct notifier_block *nfb,
1858 unsigned long action, 1938 unsigned long action,
1859 void *hcpu) 1939 void *hcpu)
1860{ 1940{
@@ -1866,11 +1946,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1866 if (process_zones(cpu)) 1946 if (process_zones(cpu))
1867 ret = NOTIFY_BAD; 1947 ret = NOTIFY_BAD;
1868 break; 1948 break;
1869#ifdef CONFIG_HOTPLUG_CPU 1949 case CPU_UP_CANCELED:
1870 case CPU_DEAD: 1950 case CPU_DEAD:
1871 free_zone_pagesets(cpu); 1951 free_zone_pagesets(cpu);
1872 break; 1952 break;
1873#endif
1874 default: 1953 default:
1875 break; 1954 break;
1876 } 1955 }
@@ -1880,7 +1959,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1880static struct notifier_block pageset_notifier = 1959static struct notifier_block pageset_notifier =
1881 { &pageset_cpuup_callback, NULL, 0 }; 1960 { &pageset_cpuup_callback, NULL, 0 };
1882 1961
1883void __init setup_per_cpu_pageset() 1962void __init setup_per_cpu_pageset(void)
1884{ 1963{
1885 int err; 1964 int err;
1886 1965
@@ -1895,7 +1974,7 @@ void __init setup_per_cpu_pageset()
1895 1974
1896#endif 1975#endif
1897 1976
1898static __devinit 1977static __meminit
1899void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1978void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1900{ 1979{
1901 int i; 1980 int i;
@@ -1915,7 +1994,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1915 init_waitqueue_head(zone->wait_table + i); 1994 init_waitqueue_head(zone->wait_table + i);
1916} 1995}
1917 1996
1918static __devinit void zone_pcp_init(struct zone *zone) 1997static __meminit void zone_pcp_init(struct zone *zone)
1919{ 1998{
1920 int cpu; 1999 int cpu;
1921 unsigned long batch = zone_batchsize(zone); 2000 unsigned long batch = zone_batchsize(zone);
@@ -1923,7 +2002,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1923 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2002 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1924#ifdef CONFIG_NUMA 2003#ifdef CONFIG_NUMA
1925 /* Early boot. Slab allocator not functional yet */ 2004 /* Early boot. Slab allocator not functional yet */
1926 zone->pageset[cpu] = &boot_pageset[cpu]; 2005 zone_pcp(zone, cpu) = &boot_pageset[cpu];
1927 setup_pageset(&boot_pageset[cpu],0); 2006 setup_pageset(&boot_pageset[cpu],0);
1928#else 2007#else
1929 setup_pageset(zone_pcp(zone,cpu), batch); 2008 setup_pageset(zone_pcp(zone,cpu), batch);
@@ -1933,7 +2012,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1933 zone->name, zone->present_pages, batch); 2012 zone->name, zone->present_pages, batch);
1934} 2013}
1935 2014
1936static __devinit void init_currently_empty_zone(struct zone *zone, 2015static __meminit void init_currently_empty_zone(struct zone *zone,
1937 unsigned long zone_start_pfn, unsigned long size) 2016 unsigned long zone_start_pfn, unsigned long size)
1938{ 2017{
1939 struct pglist_data *pgdat = zone->zone_pgdat; 2018 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -1975,7 +2054,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1975 if (zholes_size) 2054 if (zholes_size)
1976 realsize -= zholes_size[j]; 2055 realsize -= zholes_size[j];
1977 2056
1978 if (j == ZONE_DMA || j == ZONE_NORMAL) 2057 if (j < ZONE_HIGHMEM)
1979 nr_kernel_pages += realsize; 2058 nr_kernel_pages += realsize;
1980 nr_all_pages += realsize; 2059 nr_all_pages += realsize;
1981 2060
@@ -2100,7 +2179,7 @@ static int frag_show(struct seq_file *m, void *arg)
2100 int order; 2179 int order;
2101 2180
2102 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2181 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2103 if (!zone->present_pages) 2182 if (!populated_zone(zone))
2104 continue; 2183 continue;
2105 2184
2106 spin_lock_irqsave(&zone->lock, flags); 2185 spin_lock_irqsave(&zone->lock, flags);
@@ -2133,7 +2212,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2133 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2212 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2134 int i; 2213 int i;
2135 2214
2136 if (!zone->present_pages) 2215 if (!populated_zone(zone))
2137 continue; 2216 continue;
2138 2217
2139 spin_lock_irqsave(&zone->lock, flags); 2218 spin_lock_irqsave(&zone->lock, flags);
@@ -2166,7 +2245,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2166 seq_printf(m, 2245 seq_printf(m,
2167 ")" 2246 ")"
2168 "\n pagesets"); 2247 "\n pagesets");
2169 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2248 for_each_online_cpu(i) {
2170 struct per_cpu_pageset *pageset; 2249 struct per_cpu_pageset *pageset;
2171 int j; 2250 int j;
2172 2251
@@ -2181,12 +2260,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2181 seq_printf(m, 2260 seq_printf(m,
2182 "\n cpu: %i pcp: %i" 2261 "\n cpu: %i pcp: %i"
2183 "\n count: %i" 2262 "\n count: %i"
2184 "\n low: %i"
2185 "\n high: %i" 2263 "\n high: %i"
2186 "\n batch: %i", 2264 "\n batch: %i",
2187 i, j, 2265 i, j,
2188 pageset->pcp[j].count, 2266 pageset->pcp[j].count,
2189 pageset->pcp[j].low,
2190 pageset->pcp[j].high, 2267 pageset->pcp[j].high,
2191 pageset->pcp[j].batch); 2268 pageset->pcp[j].batch);
2192 } 2269 }
@@ -2241,32 +2318,40 @@ static char *vmstat_text[] = {
2241 "pgpgout", 2318 "pgpgout",
2242 "pswpin", 2319 "pswpin",
2243 "pswpout", 2320 "pswpout",
2244 "pgalloc_high",
2245 2321
2322 "pgalloc_high",
2246 "pgalloc_normal", 2323 "pgalloc_normal",
2324 "pgalloc_dma32",
2247 "pgalloc_dma", 2325 "pgalloc_dma",
2326
2248 "pgfree", 2327 "pgfree",
2249 "pgactivate", 2328 "pgactivate",
2250 "pgdeactivate", 2329 "pgdeactivate",
2251 2330
2252 "pgfault", 2331 "pgfault",
2253 "pgmajfault", 2332 "pgmajfault",
2333
2254 "pgrefill_high", 2334 "pgrefill_high",
2255 "pgrefill_normal", 2335 "pgrefill_normal",
2336 "pgrefill_dma32",
2256 "pgrefill_dma", 2337 "pgrefill_dma",
2257 2338
2258 "pgsteal_high", 2339 "pgsteal_high",
2259 "pgsteal_normal", 2340 "pgsteal_normal",
2341 "pgsteal_dma32",
2260 "pgsteal_dma", 2342 "pgsteal_dma",
2343
2261 "pgscan_kswapd_high", 2344 "pgscan_kswapd_high",
2262 "pgscan_kswapd_normal", 2345 "pgscan_kswapd_normal",
2263 2346 "pgscan_kswapd_dma32",
2264 "pgscan_kswapd_dma", 2347 "pgscan_kswapd_dma",
2348
2265 "pgscan_direct_high", 2349 "pgscan_direct_high",
2266 "pgscan_direct_normal", 2350 "pgscan_direct_normal",
2351 "pgscan_direct_dma32",
2267 "pgscan_direct_dma", 2352 "pgscan_direct_dma",
2268 "pginodesteal",
2269 2353
2354 "pginodesteal",
2270 "slabs_scanned", 2355 "slabs_scanned",
2271 "kswapd_steal", 2356 "kswapd_steal",
2272 "kswapd_inodesteal", 2357 "kswapd_inodesteal",
@@ -2417,13 +2502,18 @@ void setup_per_zone_pages_min(void)
2417 } 2502 }
2418 2503
2419 for_each_zone(zone) { 2504 for_each_zone(zone) {
2505 unsigned long tmp;
2420 spin_lock_irqsave(&zone->lru_lock, flags); 2506 spin_lock_irqsave(&zone->lru_lock, flags);
2507 tmp = (pages_min * zone->present_pages) / lowmem_pages;
2421 if (is_highmem(zone)) { 2508 if (is_highmem(zone)) {
2422 /* 2509 /*
2423 * Often, highmem doesn't need to reserve any pages. 2510 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
2424 * But the pages_min/low/high values are also used for 2511 * need highmem pages, so cap pages_min to a small
2425 * batching up page reclaim activity so we need a 2512 * value here.
2426 * decent value here. 2513 *
2514 * The (pages_high-pages_low) and (pages_low-pages_min)
2515 * deltas controls asynch page reclaim, and so should
2515 * deltas control asynch page reclaim, and so should
2516 * not be capped for highmem.
2427 */ 2517 */
2428 int min_pages; 2518 int min_pages;
2429 2519
@@ -2434,19 +2524,15 @@ void setup_per_zone_pages_min(void)
2434 min_pages = 128; 2524 min_pages = 128;
2435 zone->pages_min = min_pages; 2525 zone->pages_min = min_pages;
2436 } else { 2526 } else {
2437 /* if it's a lowmem zone, reserve a number of pages 2527 /*
2528 * If it's a lowmem zone, reserve a number of pages
2438 * proportionate to the zone's size. 2529 * proportionate to the zone's size.
2439 */ 2530 */
2440 zone->pages_min = (pages_min * zone->present_pages) / 2531 zone->pages_min = tmp;
2441 lowmem_pages;
2442 } 2532 }
2443 2533
2444 /* 2534 zone->pages_low = zone->pages_min + tmp / 4;
2445 * When interpreting these watermarks, just keep in mind that: 2535 zone->pages_high = zone->pages_min + tmp / 2;
2446 * zone->pages_min == (zone->pages_min * 4) / 4;
2447 */
2448 zone->pages_low = (zone->pages_min * 5) / 4;
2449 zone->pages_high = (zone->pages_min * 6) / 4;
2450 spin_unlock_irqrestore(&zone->lru_lock, flags); 2536 spin_unlock_irqrestore(&zone->lru_lock, flags);
2451 } 2537 }
2452} 2538}
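An arithmetic sketch of the new watermark spacing above, with made-up numbers: tmp is the zone's proportional share of pages_min, and the low and high marks now sit tmp/4 and tmp/2 above pages_min. For highmem only pages_min itself is capped; the deltas that drive background reclaim are not.

#include <stdio.h>

int main(void)
{
	unsigned long pages_min    = 1024;      /* assumed global minimum, in pages */
	unsigned long lowmem_pages = 262144;    /* toy total of lowmem pages */
	unsigned long present      = 131072;    /* toy zone size */

	unsigned long tmp       = pages_min * present / lowmem_pages;  /* = 512 */
	unsigned long zone_min  = tmp;
	unsigned long zone_low  = zone_min + tmp / 4;                   /* = 640 */
	unsigned long zone_high = zone_min + tmp / 2;                   /* = 768 */

	printf("min %lu low %lu high %lu\n", zone_min, zone_low, zone_high);
	return 0;
}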
@@ -2522,6 +2608,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2522 return 0; 2608 return 0;
2523} 2609}
2524 2610
2611/*
2612 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
2613 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
2614 * can have before it gets flushed back to buddy allocator.
2614 * can have before it gets flushed back to the buddy allocator.
2615 */
2616
2617int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2618 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2619{
2620 struct zone *zone;
2621 unsigned int cpu;
2622 int ret;
2623
2624 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2625 if (!write || (ret == -EINVAL))
2626 return ret;
2627 for_each_zone(zone) {
2628 for_each_online_cpu(cpu) {
2629 unsigned long high;
2630 high = zone->present_pages / percpu_pagelist_fraction;
2631 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2632 }
2633 }
2634 return 0;
2635}
2636
2525__initdata int hashdist = HASHDIST_DEFAULT; 2637__initdata int hashdist = HASHDIST_DEFAULT;
2526 2638
2527#ifdef CONFIG_NUMA 2639#ifdef CONFIG_NUMA
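Rough model of what writing a value through the new percpu_pagelist_fraction handler does for one zone: the hot pcp high mark becomes present_pages / fraction, and setup_pagelist_highmark() then derives the batch as high / 4 capped at PAGE_SHIFT * 8. The zone size and fractions below are invented; with a zone this large the cap is what binds.

#include <stdio.h>

#define PAGE_SHIFT 12                     /* assumed 4K pages, so the cap is 96 */

int main(void)
{
	unsigned long present_pages = 262144;   /* toy zone: 1 GiB of 4K pages */
	unsigned long fraction;

	for (fraction = 8; fraction <= 64; fraction *= 2) {
		unsigned long high  = present_pages / fraction;
		unsigned long batch = high / 4 > PAGE_SHIFT * 8 ? PAGE_SHIFT * 8 : high / 4;
		printf("fraction %2lu -> pcp high %6lu pages, batch %lu\n",
		       fraction, high, batch ? batch : 1);
	}
	return 0;
}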
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c4..c4b6d0afd7 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
90 90
91static int __pdflush(struct pdflush_work *my_work) 91static int __pdflush(struct pdflush_work *my_work)
92{ 92{
93 current->flags |= PF_FLUSHER; 93 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
94 my_work->fn = NULL; 94 my_work->fn = NULL;
95 my_work->who = current; 95 my_work->who = current;
96 INIT_LIST_HEAD(&my_work->list); 96 INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87..8d6eeaaa62 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
158{ 158{
159 unsigned page_idx; 159 unsigned page_idx;
160 struct pagevec lru_pvec; 160 struct pagevec lru_pvec;
161 int ret = 0; 161 int ret;
162 162
163 if (mapping->a_ops->readpages) { 163 if (mapping->a_ops->readpages) {
164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
171 list_del(&page->lru); 171 list_del(&page->lru);
172 if (!add_to_page_cache(page, mapping, 172 if (!add_to_page_cache(page, mapping,
173 page->index, GFP_KERNEL)) { 173 page->index, GFP_KERNEL)) {
174 mapping->a_ops->readpage(filp, page); 174 ret = mapping->a_ops->readpage(filp, page);
175 if (!pagevec_add(&lru_pvec, page)) 175 if (ret != AOP_TRUNCATED_PAGE) {
176 __pagevec_lru_add(&lru_pvec); 176 if (!pagevec_add(&lru_pvec, page))
177 } else { 177 __pagevec_lru_add(&lru_pvec);
178 page_cache_release(page); 178 continue;
179 } /* else fall through to release */
179 } 180 }
181 page_cache_release(page);
180 } 182 }
181 pagevec_lru_add(&lru_pvec); 183 pagevec_lru_add(&lru_pvec);
184 ret = 0;
182out: 185out:
183 return ret; 186 return ret;
184} 187}
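A minimal sketch of the new error path in read_pages() above: when ->readpage() reports AOP_TRUNCATED_PAGE the page is released instead of being queued on the LRU pagevec, and the overall return value is reset to 0 afterwards. Types and helpers below are placeholders, not the kernel's.

#include <stdio.h>

enum aop_ret { AOP_OK = 0, AOP_TRUNCATED_PAGE = 1 };

struct toy_page { int id; };

static enum aop_ret toy_readpage(struct toy_page *p)
{
	return p->id == 2 ? AOP_TRUNCATED_PAGE : AOP_OK;  /* pretend page 2 raced with truncate */
}

int main(void)
{
	struct toy_page pages[3] = { {1}, {2}, {3} };
	int i;

	for (i = 0; i < 3; i++) {
		if (toy_readpage(&pages[i]) != AOP_TRUNCATED_PAGE) {
			printf("page %d queued for LRU\n", pages[i].id);
			continue;
		}
		/* fall through to release, mirroring the new comment in the hunk */
		printf("page %d released (truncated under us)\n", pages[i].id);
	}
	return 0;
}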
diff --git a/mm/rmap.c b/mm/rmap.c
index 914d04b98b..d85a99d28c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,20 +20,20 @@
20/* 20/*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_sem (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem 24 * inode->i_alloc_sem
25 * 25 *
26 * When a page fault occurs in writing from user to file, down_read 26 * When a page fault occurs in writing from user to file, down_read
27 * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within 27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
28 * down_read of mmap_sem; i_sem and down_write of mmap_sem are never 28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
29 * taken together; in truncation, i_sem is taken outermost. 29 * taken together; in truncation, i_mutex is taken outermost.
30 * 30 *
31 * mm->mmap_sem 31 * mm->mmap_sem
32 * page->flags PG_locked (lock_page) 32 * page->flags PG_locked (lock_page)
33 * mapping->i_mmap_lock 33 * mapping->i_mmap_lock
34 * anon_vma->lock 34 * anon_vma->lock
35 * mm->page_table_lock or pte_lock 35 * mm->page_table_lock or pte_lock
36 * zone->lru_lock (in mark_page_accessed) 36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
37 * swap_lock (in swap_duplicate, swap_info_get) 37 * swap_lock (in swap_duplicate, swap_info_get)
38 * mmlist_lock (in mmput, drain_mmlist and others) 38 * mmlist_lock (in mmput, drain_mmlist and others)
39 * mapping->private_lock (in __set_page_dirty_buffers) 39 * mapping->private_lock (in __set_page_dirty_buffers)
@@ -225,7 +225,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
225 225
226/* 226/*
227 * At what user virtual address is page expected in vma? checking that the 227 * At what user virtual address is page expected in vma? checking that the
228 * page matches the vma: currently only used by unuse_process, on anon pages. 228 * page matches the vma: currently only used on anon pages, by unuse_vma;
229 */ 229 */
230unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 230unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
231{ 231{
@@ -234,7 +234,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
234 (void *)page->mapping - PAGE_MAPPING_ANON) 234 (void *)page->mapping - PAGE_MAPPING_ANON)
235 return -EFAULT; 235 return -EFAULT;
236 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 236 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
237 if (vma->vm_file->f_mapping != page->mapping) 237 if (!vma->vm_file ||
238 vma->vm_file->f_mapping != page->mapping)
238 return -EFAULT; 239 return -EFAULT;
239 } else 240 } else
240 return -EFAULT; 241 return -EFAULT;
@@ -289,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
289 * repeatedly from either page_referenced_anon or page_referenced_file. 290 * repeatedly from either page_referenced_anon or page_referenced_file.
290 */ 291 */
291static int page_referenced_one(struct page *page, 292static int page_referenced_one(struct page *page,
292 struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) 293 struct vm_area_struct *vma, unsigned int *mapcount)
293{ 294{
294 struct mm_struct *mm = vma->vm_mm; 295 struct mm_struct *mm = vma->vm_mm;
295 unsigned long address; 296 unsigned long address;
@@ -310,7 +311,7 @@ static int page_referenced_one(struct page *page,
310 311
311 /* Pretend the page is referenced if the task has the 312 /* Pretend the page is referenced if the task has the
312 swap token and is in the middle of a page fault. */ 313 swap token and is in the middle of a page fault. */
313 if (mm != current->mm && !ignore_token && has_swap_token(mm) && 314 if (mm != current->mm && has_swap_token(mm) &&
314 rwsem_is_locked(&mm->mmap_sem)) 315 rwsem_is_locked(&mm->mmap_sem))
315 referenced++; 316 referenced++;
316 317
@@ -320,7 +321,7 @@ out:
320 return referenced; 321 return referenced;
321} 322}
322 323
323static int page_referenced_anon(struct page *page, int ignore_token) 324static int page_referenced_anon(struct page *page)
324{ 325{
325 unsigned int mapcount; 326 unsigned int mapcount;
326 struct anon_vma *anon_vma; 327 struct anon_vma *anon_vma;
@@ -333,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
333 334
334 mapcount = page_mapcount(page); 335 mapcount = page_mapcount(page);
335 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 336 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
336 referenced += page_referenced_one(page, vma, &mapcount, 337 referenced += page_referenced_one(page, vma, &mapcount);
337 ignore_token);
338 if (!mapcount) 338 if (!mapcount)
339 break; 339 break;
340 } 340 }
@@ -353,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
353 * 353 *
354 * This function is only called from page_referenced for object-based pages. 354 * This function is only called from page_referenced for object-based pages.
355 */ 355 */
356static int page_referenced_file(struct page *page, int ignore_token) 356static int page_referenced_file(struct page *page)
357{ 357{
358 unsigned int mapcount; 358 unsigned int mapcount;
359 struct address_space *mapping = page->mapping; 359 struct address_space *mapping = page->mapping;
@@ -391,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token)
391 referenced++; 391 referenced++;
392 break; 392 break;
393 } 393 }
394 referenced += page_referenced_one(page, vma, &mapcount, 394 referenced += page_referenced_one(page, vma, &mapcount);
395 ignore_token);
396 if (!mapcount) 395 if (!mapcount)
397 break; 396 break;
398 } 397 }
@@ -409,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token)
409 * Quick test_and_clear_referenced for all mappings to a page, 408 * Quick test_and_clear_referenced for all mappings to a page,
410 * returns the number of ptes which referenced the page. 409 * returns the number of ptes which referenced the page.
411 */ 410 */
412int page_referenced(struct page *page, int is_locked, int ignore_token) 411int page_referenced(struct page *page, int is_locked)
413{ 412{
414 int referenced = 0; 413 int referenced = 0;
415 414
416 if (!swap_token_default_timeout)
417 ignore_token = 1;
418
419 if (page_test_and_clear_young(page)) 415 if (page_test_and_clear_young(page))
420 referenced++; 416 referenced++;
421 417
@@ -424,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
424 420
425 if (page_mapped(page) && page->mapping) { 421 if (page_mapped(page) && page->mapping) {
426 if (PageAnon(page)) 422 if (PageAnon(page))
427 referenced += page_referenced_anon(page, ignore_token); 423 referenced += page_referenced_anon(page);
428 else if (is_locked) 424 else if (is_locked)
429 referenced += page_referenced_file(page, ignore_token); 425 referenced += page_referenced_file(page);
430 else if (TestSetPageLocked(page)) 426 else if (TestSetPageLocked(page))
431 referenced++; 427 referenced++;
432 else { 428 else {
433 if (page->mapping) 429 if (page->mapping)
434 referenced += page_referenced_file(page, 430 referenced += page_referenced_file(page);
435 ignore_token);
436 unlock_page(page); 431 unlock_page(page);
437 } 432 }
438 } 433 }
@@ -440,6 +435,30 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
440} 435}
441 436
442/** 437/**
438 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped
442 */
443static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address)
445{
446 struct anon_vma *anon_vma = vma->anon_vma;
447
448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma;
451
452 page->index = linear_page_index(vma, address);
453
454 /*
455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt.
457 */
458 __inc_page_state(nr_mapped);
459}
460
461/**
443 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
444 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
445 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
@@ -450,20 +469,27 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
450void page_add_anon_rmap(struct page *page, 469void page_add_anon_rmap(struct page *page,
451 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
452{ 471{
453 if (atomic_inc_and_test(&page->_mapcount)) { 472 if (atomic_inc_and_test(&page->_mapcount))
454 struct anon_vma *anon_vma = vma->anon_vma; 473 __page_set_anon_rmap(page, vma, address);
455
456 BUG_ON(!anon_vma);
457 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
458 page->mapping = (struct address_space *) anon_vma;
459
460 page->index = linear_page_index(vma, address);
461
462 inc_page_state(nr_mapped);
463 }
464 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
465} 475}
466 476
477/*
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped
482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed.
485 */
486void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address)
488{
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address);
491}
492
467/** 493/**
468 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
469 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
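A toy model of why the new page_add_new_anon_rmap() above can skip the inc-and-test: _mapcount starts at -1, and for a page nobody else can see yet it is safe to set it straight to 0 (one mapping) without an atomic read-modify-write. The struct and the plain increment below stand in for the kernel's atomic helpers.

#include <stdio.h>

struct toy_page { int mapcount; /* kernel stores mapping count minus one */ };

static void add_anon_rmap(struct toy_page *p)
{
	if (++p->mapcount == 0)          /* atomic_inc_and_test() analogue */
		printf("first mapping: set up anon rmap\n");
}

static void add_new_anon_rmap(struct toy_page *p)
{
	p->mapcount = 0;                 /* brand-new page: no concurrent mappers */
	printf("first mapping (new page): set up anon rmap\n");
}

int main(void)
{
	struct toy_page existing = { .mapcount = -1 }, fresh = { .mapcount = -1 };

	add_anon_rmap(&existing);
	add_new_anon_rmap(&fresh);
	return 0;
}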
@@ -476,7 +502,7 @@ void page_add_file_rmap(struct page *page)
476 BUG_ON(!pfn_valid(page_to_pfn(page))); 502 BUG_ON(!pfn_valid(page_to_pfn(page)));
477 503
478 if (atomic_inc_and_test(&page->_mapcount)) 504 if (atomic_inc_and_test(&page->_mapcount))
479 inc_page_state(nr_mapped); 505 __inc_page_state(nr_mapped);
480} 506}
481 507
482/** 508/**
@@ -488,6 +514,13 @@ void page_add_file_rmap(struct page *page)
488void page_remove_rmap(struct page *page) 514void page_remove_rmap(struct page *page)
489{ 515{
490 if (atomic_add_negative(-1, &page->_mapcount)) { 516 if (atomic_add_negative(-1, &page->_mapcount)) {
517 if (page_mapcount(page) < 0) {
518 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
519 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
520 printk (KERN_EMERG " page->count = %x\n", page_count(page));
521 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
522 }
523
491 BUG_ON(page_mapcount(page) < 0); 524 BUG_ON(page_mapcount(page) < 0);
492 /* 525 /*
493 * It would be tidy to reset the PageAnon mapping here, 526 * It would be tidy to reset the PageAnon mapping here,
@@ -500,7 +533,7 @@ void page_remove_rmap(struct page *page)
500 */ 533 */
501 if (page_test_and_clear_dirty(page)) 534 if (page_test_and_clear_dirty(page))
502 set_page_dirty(page); 535 set_page_dirty(page);
503 dec_page_state(nr_mapped); 536 __dec_page_state(nr_mapped);
504 } 537 }
505} 538}
506 539
@@ -529,10 +562,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
529 * If the page is mlock()d, we cannot swap it out. 562 * If the page is mlock()d, we cannot swap it out.
530 * If it's recently referenced (perhaps page_referenced 563 * If it's recently referenced (perhaps page_referenced
531 * skipped over this mm) then we should reactivate it. 564 * skipped over this mm) then we should reactivate it.
532 *
533 * Pages belonging to VM_RESERVED regions should not happen here.
534 */ 565 */
535 if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || 566 if ((vma->vm_flags & VM_LOCKED) ||
536 ptep_clear_flush_young(vma, address, pte)) { 567 ptep_clear_flush_young(vma, address, pte)) {
537 ret = SWAP_FAIL; 568 ret = SWAP_FAIL;
538 goto out_unmap; 569 goto out_unmap;
@@ -613,7 +644,6 @@ static void try_to_unmap_cluster(unsigned long cursor,
613 struct page *page; 644 struct page *page;
614 unsigned long address; 645 unsigned long address;
615 unsigned long end; 646 unsigned long end;
616 unsigned long pfn;
617 647
618 address = (vma->vm_start + cursor) & CLUSTER_MASK; 648 address = (vma->vm_start + cursor) & CLUSTER_MASK;
619 end = address + CLUSTER_SIZE; 649 end = address + CLUSTER_SIZE;
@@ -642,21 +672,14 @@ static void try_to_unmap_cluster(unsigned long cursor,
642 for (; address < end; pte++, address += PAGE_SIZE) { 672 for (; address < end; pte++, address += PAGE_SIZE) {
643 if (!pte_present(*pte)) 673 if (!pte_present(*pte))
644 continue; 674 continue;
645 675 page = vm_normal_page(vma, address, *pte);
646 pfn = pte_pfn(*pte); 676 BUG_ON(!page || PageAnon(page));
647 if (unlikely(!pfn_valid(pfn))) {
648 print_bad_pte(vma, *pte, address);
649 continue;
650 }
651
652 page = pfn_to_page(pfn);
653 BUG_ON(PageAnon(page));
654 677
655 if (ptep_clear_flush_young(vma, address, pte)) 678 if (ptep_clear_flush_young(vma, address, pte))
656 continue; 679 continue;
657 680
658 /* Nuke the page table entry. */ 681 /* Nuke the page table entry. */
659 flush_cache_page(vma, address, pfn); 682 flush_cache_page(vma, address, pte_pfn(*pte));
660 pteval = ptep_clear_flush(vma, address, pte); 683 pteval = ptep_clear_flush(vma, address, pte);
661 684
662 /* If nonlinear, store the file page offset in the pte. */ 685 /* If nonlinear, store the file page offset in the pte. */
@@ -727,7 +750,7 @@ static int try_to_unmap_file(struct page *page)
727 750
728 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 751 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
729 shared.vm_set.list) { 752 shared.vm_set.list) {
730 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) 753 if (vma->vm_flags & VM_LOCKED)
731 continue; 754 continue;
732 cursor = (unsigned long) vma->vm_private_data; 755 cursor = (unsigned long) vma->vm_private_data;
733 if (cursor > max_nl_cursor) 756 if (cursor > max_nl_cursor)
@@ -761,7 +784,7 @@ static int try_to_unmap_file(struct page *page)
761 do { 784 do {
762 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 785 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
763 shared.vm_set.list) { 786 shared.vm_set.list) {
764 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) 787 if (vma->vm_flags & VM_LOCKED)
765 continue; 788 continue;
766 cursor = (unsigned long) vma->vm_private_data; 789 cursor = (unsigned long) vma->vm_private_data;
767 while ( cursor < max_nl_cursor && 790 while ( cursor < max_nl_cursor &&
@@ -783,11 +806,8 @@ static int try_to_unmap_file(struct page *page)
783 * in locked vmas). Reset cursor on all unreserved nonlinear 806 * in locked vmas). Reset cursor on all unreserved nonlinear
784 * vmas, now forgetting on which ones it had fallen behind. 807 * vmas, now forgetting on which ones it had fallen behind.
785 */ 808 */
786 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 809 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
787 shared.vm_set.list) { 810 vma->vm_private_data = NULL;
788 if (!(vma->vm_flags & VM_RESERVED))
789 vma->vm_private_data = NULL;
790 }
791out: 811out:
792 spin_unlock(&mapping->i_mmap_lock); 812 spin_unlock(&mapping->i_mmap_lock);
793 return ret; 813 return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61..ce501bce1c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
457 } while (next); 457 } while (next);
458} 458}
459 459
460static void shmem_truncate(struct inode *inode) 460static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
461{ 461{
462 struct shmem_inode_info *info = SHMEM_I(inode); 462 struct shmem_inode_info *info = SHMEM_I(inode);
463 unsigned long idx; 463 unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
475 long nr_swaps_freed = 0; 475 long nr_swaps_freed = 0;
476 int offset; 476 int offset;
477 int freed; 477 int freed;
478 int punch_hole = 0;
478 479
479 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 480 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
480 idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 481 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
481 if (idx >= info->next_index) 482 if (idx >= info->next_index)
482 return; 483 return;
483 484
484 spin_lock(&info->lock); 485 spin_lock(&info->lock);
485 info->flags |= SHMEM_TRUNCATE; 486 info->flags |= SHMEM_TRUNCATE;
486 limit = info->next_index; 487 if (likely(end == (loff_t) -1)) {
487 info->next_index = idx; 488 limit = info->next_index;
489 info->next_index = idx;
490 } else {
491 limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
492 if (limit > info->next_index)
493 limit = info->next_index;
494 punch_hole = 1;
495 }
496
488 topdir = info->i_indirect; 497 topdir = info->i_indirect;
489 if (topdir && idx <= SHMEM_NR_DIRECT) { 498 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
490 info->i_indirect = NULL; 499 info->i_indirect = NULL;
491 nr_pages_to_free++; 500 nr_pages_to_free++;
492 list_add(&topdir->lru, &pages_to_free); 501 list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
573 set_page_private(subdir, page_private(subdir) - freed); 582 set_page_private(subdir, page_private(subdir) - freed);
574 if (offset) 583 if (offset)
575 spin_unlock(&info->lock); 584 spin_unlock(&info->lock);
576 BUG_ON(page_private(subdir) > offset); 585 if (!punch_hole)
586 BUG_ON(page_private(subdir) > offset);
577 } 587 }
578 if (offset) 588 if (offset)
579 offset = 0; 589 offset = 0;
580 else if (subdir) { 590 else if (subdir && !page_private(subdir)) {
581 dir[diroff] = NULL; 591 dir[diroff] = NULL;
582 nr_pages_to_free++; 592 nr_pages_to_free++;
583 list_add(&subdir->lru, &pages_to_free); 593 list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
594 * Also, though shmem_getpage checks i_size before adding to 604 * Also, though shmem_getpage checks i_size before adding to
595 * cache, no recheck after: so fix the narrow window there too. 605 * cache, no recheck after: so fix the narrow window there too.
596 */ 606 */
597 truncate_inode_pages(inode->i_mapping, inode->i_size); 607 truncate_inode_pages_range(inode->i_mapping, start, end);
598 } 608 }
599 609
600 spin_lock(&info->lock); 610 spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
614 } 624 }
615} 625}
616 626
627static void shmem_truncate(struct inode *inode)
628{
629 shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
630}
631
617static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 632static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
618{ 633{
619 struct inode *inode = dentry->d_inode; 634 struct inode *inode = dentry->d_inode;
@@ -855,7 +870,7 @@ unlock:
855 swap_free(swap); 870 swap_free(swap);
856redirty: 871redirty:
857 set_page_dirty(page); 872 set_page_dirty(page);
858 return WRITEPAGE_ACTIVATE; /* Return with the page locked */ 873 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
859} 874}
860 875
861#ifdef CONFIG_NUMA 876#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
1255 return retval; 1270 return retval;
1256} 1271}
1257 1272
1258static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1273int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259{ 1274{
1260 file_accessed(file); 1275 file_accessed(file);
1261 vma->vm_ops = &shmem_vm_ops; 1276 vma->vm_ops = &shmem_vm_ops;
@@ -1301,7 +1316,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1301 case S_IFREG: 1316 case S_IFREG:
1302 inode->i_op = &shmem_inode_operations; 1317 inode->i_op = &shmem_inode_operations;
1303 inode->i_fop = &shmem_file_operations; 1318 inode->i_fop = &shmem_file_operations;
1304 mpol_shared_policy_init(&info->policy); 1319 mpol_shared_policy_init(&info->policy, sbinfo->policy,
1320 &sbinfo->policy_nodes);
1305 break; 1321 break;
1306 case S_IFDIR: 1322 case S_IFDIR:
1307 inode->i_nlink++; 1323 inode->i_nlink++;
@@ -1315,7 +1331,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1315 * Must not load anything in the rbtree, 1331 * Must not load anything in the rbtree,
1316 * mpol_free_shared_policy will not be called. 1332 * mpol_free_shared_policy will not be called.
1317 */ 1333 */
1318 mpol_shared_policy_init(&info->policy); 1334 mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
1335 NULL);
1319 break; 1336 break;
1320 } 1337 }
1321 } else if (sbinfo->max_inodes) { 1338 } else if (sbinfo->max_inodes) {
@@ -1355,7 +1372,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1355 if (!access_ok(VERIFY_READ, buf, count)) 1372 if (!access_ok(VERIFY_READ, buf, count))
1356 return -EFAULT; 1373 return -EFAULT;
1357 1374
1358 down(&inode->i_sem); 1375 mutex_lock(&inode->i_mutex);
1359 1376
1360 pos = *ppos; 1377 pos = *ppos;
1361 written = 0; 1378 written = 0;
@@ -1440,7 +1457,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1440 if (written) 1457 if (written)
1441 err = written; 1458 err = written;
1442out: 1459out:
1443 up(&inode->i_sem); 1460 mutex_unlock(&inode->i_mutex);
1444 return err; 1461 return err;
1445} 1462}
1446 1463
@@ -1476,7 +1493,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1476 1493
1477 /* 1494 /*
1478 * We must evaluate after, since reads (unlike writes) 1495 * We must evaluate after, since reads (unlike writes)
1479 * are called without i_sem protection against truncate 1496 * are called without i_mutex protection against truncate
1480 */ 1497 */
1481 nr = PAGE_CACHE_SIZE; 1498 nr = PAGE_CACHE_SIZE;
1482 i_size = i_size_read(inode); 1499 i_size = i_size_read(inode);
@@ -1828,7 +1845,9 @@ static struct inode_operations shmem_symlink_inode_operations = {
1828 .put_link = shmem_put_link, 1845 .put_link = shmem_put_link,
1829}; 1846};
1830 1847
1831static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) 1848static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1849 gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1850 int *policy, nodemask_t *policy_nodes)
1832{ 1851{
1833 char *this_char, *value, *rest; 1852 char *this_char, *value, *rest;
1834 1853
@@ -1882,6 +1901,19 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
1882 *gid = simple_strtoul(value,&rest,0); 1901 *gid = simple_strtoul(value,&rest,0);
1883 if (*rest) 1902 if (*rest)
1884 goto bad_val; 1903 goto bad_val;
1904 } else if (!strcmp(this_char,"mpol")) {
1905 if (!strcmp(value,"default"))
1906 *policy = MPOL_DEFAULT;
1907 else if (!strcmp(value,"preferred"))
1908 *policy = MPOL_PREFERRED;
1909 else if (!strcmp(value,"bind"))
1910 *policy = MPOL_BIND;
1911 else if (!strcmp(value,"interleave"))
1912 *policy = MPOL_INTERLEAVE;
1913 else
1914 goto bad_val;
1915 } else if (!strcmp(this_char,"mpol_nodelist")) {
1916 nodelist_parse(value, *policy_nodes);
1885 } else { 1917 } else {
1886 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 1918 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1887 this_char); 1919 this_char);
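With the option parsing added above, a tmpfs NUMA policy can be requested at mount time; assuming the usual mount(8) syntax, something along the lines of "mount -t tmpfs -o mpol=interleave,mpol_nodelist=0-1 tmpfs /mnt" would ask for page interleaving across nodes 0 and 1. The option names here follow the strings matched in the hunk; the node range format is whatever nodelist_parse() accepts, and the exact command line is an assumption for illustration.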
@@ -1902,12 +1934,14 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1902 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1934 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1903 unsigned long max_blocks = sbinfo->max_blocks; 1935 unsigned long max_blocks = sbinfo->max_blocks;
1904 unsigned long max_inodes = sbinfo->max_inodes; 1936 unsigned long max_inodes = sbinfo->max_inodes;
1937 int policy = sbinfo->policy;
1938 nodemask_t policy_nodes = sbinfo->policy_nodes;
1905 unsigned long blocks; 1939 unsigned long blocks;
1906 unsigned long inodes; 1940 unsigned long inodes;
1907 int error = -EINVAL; 1941 int error = -EINVAL;
1908 1942
1909 if (shmem_parse_options(data, NULL, NULL, NULL, 1943 if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
1910 &max_blocks, &max_inodes)) 1944 &max_inodes, &policy, &policy_nodes))
1911 return error; 1945 return error;
1912 1946
1913 spin_lock(&sbinfo->stat_lock); 1947 spin_lock(&sbinfo->stat_lock);
@@ -1933,6 +1967,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1933 sbinfo->free_blocks = max_blocks - blocks; 1967 sbinfo->free_blocks = max_blocks - blocks;
1934 sbinfo->max_inodes = max_inodes; 1968 sbinfo->max_inodes = max_inodes;
1935 sbinfo->free_inodes = max_inodes - inodes; 1969 sbinfo->free_inodes = max_inodes - inodes;
1970 sbinfo->policy = policy;
1971 sbinfo->policy_nodes = policy_nodes;
1936out: 1972out:
1937 spin_unlock(&sbinfo->stat_lock); 1973 spin_unlock(&sbinfo->stat_lock);
1938 return error; 1974 return error;
@@ -1957,6 +1993,8 @@ static int shmem_fill_super(struct super_block *sb,
1957 struct shmem_sb_info *sbinfo; 1993 struct shmem_sb_info *sbinfo;
1958 unsigned long blocks = 0; 1994 unsigned long blocks = 0;
1959 unsigned long inodes = 0; 1995 unsigned long inodes = 0;
1996 int policy = MPOL_DEFAULT;
1997 nodemask_t policy_nodes = node_online_map;
1960 1998
1961#ifdef CONFIG_TMPFS 1999#ifdef CONFIG_TMPFS
1962 /* 2000 /*
@@ -1969,8 +2007,8 @@ static int shmem_fill_super(struct super_block *sb,
1969 inodes = totalram_pages - totalhigh_pages; 2007 inodes = totalram_pages - totalhigh_pages;
1970 if (inodes > blocks) 2008 if (inodes > blocks)
1971 inodes = blocks; 2009 inodes = blocks;
1972 if (shmem_parse_options(data, &mode, &uid, &gid, 2010 if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
1973 &blocks, &inodes)) 2011 &inodes, &policy, &policy_nodes))
1974 return -EINVAL; 2012 return -EINVAL;
1975 } 2013 }
1976#else 2014#else
@@ -1988,6 +2026,8 @@ static int shmem_fill_super(struct super_block *sb,
1988 sbinfo->free_blocks = blocks; 2026 sbinfo->free_blocks = blocks;
1989 sbinfo->max_inodes = inodes; 2027 sbinfo->max_inodes = inodes;
1990 sbinfo->free_inodes = inodes; 2028 sbinfo->free_inodes = inodes;
2029 sbinfo->policy = policy;
2030 sbinfo->policy_nodes = policy_nodes;
1991 2031
1992 sb->s_fs_info = sbinfo; 2032 sb->s_fs_info = sbinfo;
1993 sb->s_maxbytes = SHMEM_MAX_BYTES; 2033 sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2083,6 +2123,7 @@ static struct file_operations shmem_file_operations = {
2083static struct inode_operations shmem_inode_operations = { 2123static struct inode_operations shmem_inode_operations = {
2084 .truncate = shmem_truncate, 2124 .truncate = shmem_truncate,
2085 .setattr = shmem_notify_change, 2125 .setattr = shmem_notify_change,
2126 .truncate_range = shmem_truncate_range,
2086}; 2127};
2087 2128
2088static struct inode_operations shmem_dir_inode_operations = { 2129static struct inode_operations shmem_dir_inode_operations = {
diff --git a/mm/slab.c b/mm/slab.c
index e291f5e1af..6f8495e218 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the semaphore 'cache_chain_sem'. 71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
@@ -103,6 +103,8 @@
103#include <linux/rcupdate.h> 103#include <linux/rcupdate.h>
104#include <linux/string.h> 104#include <linux/string.h>
105#include <linux/nodemask.h> 105#include <linux/nodemask.h>
106#include <linux/mempolicy.h>
107#include <linux/mutex.h>
106 108
107#include <asm/uaccess.h> 109#include <asm/uaccess.h>
108#include <asm/cacheflush.h> 110#include <asm/cacheflush.h>
@@ -130,7 +132,6 @@
130#define FORCED_DEBUG 0 132#define FORCED_DEBUG 0
131#endif 133#endif
132 134
133
134/* Shouldn't this be in a header file somewhere? */ 135/* Shouldn't this be in a header file somewhere? */
135#define BYTES_PER_WORD sizeof(void *) 136#define BYTES_PER_WORD sizeof(void *)
136 137
@@ -217,12 +218,12 @@ static unsigned long offslab_limit;
217 * Slabs are chained into three list: fully used, partial, fully free slabs. 218 * Slabs are chained into three list: fully used, partial, fully free slabs.
218 */ 219 */
219struct slab { 220struct slab {
220 struct list_head list; 221 struct list_head list;
221 unsigned long colouroff; 222 unsigned long colouroff;
222 void *s_mem; /* including colour offset */ 223 void *s_mem; /* including colour offset */
223 unsigned int inuse; /* num of objs active in slab */ 224 unsigned int inuse; /* num of objs active in slab */
224 kmem_bufctl_t free; 225 kmem_bufctl_t free;
225 unsigned short nodeid; 226 unsigned short nodeid;
226}; 227};
227 228
228/* 229/*
@@ -242,9 +243,9 @@ struct slab {
242 * We assume struct slab_rcu can overlay struct slab when destroying. 243 * We assume struct slab_rcu can overlay struct slab when destroying.
243 */ 244 */
244struct slab_rcu { 245struct slab_rcu {
245 struct rcu_head head; 246 struct rcu_head head;
246 kmem_cache_t *cachep; 247 kmem_cache_t *cachep;
247 void *addr; 248 void *addr;
248}; 249};
249 250
250/* 251/*
@@ -279,23 +280,23 @@ struct array_cache {
279#define BOOT_CPUCACHE_ENTRIES 1 280#define BOOT_CPUCACHE_ENTRIES 1
280struct arraycache_init { 281struct arraycache_init {
281 struct array_cache cache; 282 struct array_cache cache;
282 void * entries[BOOT_CPUCACHE_ENTRIES]; 283 void *entries[BOOT_CPUCACHE_ENTRIES];
283}; 284};
284 285
285/* 286/*
286 * The slab lists for all objects. 287 * The slab lists for all objects.
287 */ 288 */
288struct kmem_list3 { 289struct kmem_list3 {
289 struct list_head slabs_partial; /* partial list first, better asm code */ 290 struct list_head slabs_partial; /* partial list first, better asm code */
290 struct list_head slabs_full; 291 struct list_head slabs_full;
291 struct list_head slabs_free; 292 struct list_head slabs_free;
292 unsigned long free_objects; 293 unsigned long free_objects;
293 unsigned long next_reap; 294 unsigned long next_reap;
294 int free_touched; 295 int free_touched;
295 unsigned int free_limit; 296 unsigned int free_limit;
296 spinlock_t list_lock; 297 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */ 298 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */ 299 struct array_cache **alien; /* on other nodes */
299}; 300};
300 301
301/* 302/*
@@ -367,63 +368,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
367 * 368 *
368 * manages a cache. 369 * manages a cache.
369 */ 370 */
370 371
371struct kmem_cache { 372struct kmem_cache {
372/* 1) per-cpu data, touched during every alloc/free */ 373/* 1) per-cpu data, touched during every alloc/free */
373 struct array_cache *array[NR_CPUS]; 374 struct array_cache *array[NR_CPUS];
374 unsigned int batchcount; 375 unsigned int batchcount;
375 unsigned int limit; 376 unsigned int limit;
376 unsigned int shared; 377 unsigned int shared;
377 unsigned int objsize; 378 unsigned int objsize;
378/* 2) touched by every alloc & free from the backend */ 379/* 2) touched by every alloc & free from the backend */
379 struct kmem_list3 *nodelists[MAX_NUMNODES]; 380 struct kmem_list3 *nodelists[MAX_NUMNODES];
380 unsigned int flags; /* constant flags */ 381 unsigned int flags; /* constant flags */
381 unsigned int num; /* # of objs per slab */ 382 unsigned int num; /* # of objs per slab */
382 spinlock_t spinlock; 383 spinlock_t spinlock;
383 384
384/* 3) cache_grow/shrink */ 385/* 3) cache_grow/shrink */
385 /* order of pgs per slab (2^n) */ 386 /* order of pgs per slab (2^n) */
386 unsigned int gfporder; 387 unsigned int gfporder;
387 388
388 /* force GFP flags, e.g. GFP_DMA */ 389 /* force GFP flags, e.g. GFP_DMA */
389 gfp_t gfpflags; 390 gfp_t gfpflags;
390 391
391 size_t colour; /* cache colouring range */ 392 size_t colour; /* cache colouring range */
392 unsigned int colour_off; /* colour offset */ 393 unsigned int colour_off; /* colour offset */
393 unsigned int colour_next; /* cache colouring */ 394 unsigned int colour_next; /* cache colouring */
394 kmem_cache_t *slabp_cache; 395 kmem_cache_t *slabp_cache;
395 unsigned int slab_size; 396 unsigned int slab_size;
396 unsigned int dflags; /* dynamic flags */ 397 unsigned int dflags; /* dynamic flags */
397 398
398 /* constructor func */ 399 /* constructor func */
399 void (*ctor)(void *, kmem_cache_t *, unsigned long); 400 void (*ctor) (void *, kmem_cache_t *, unsigned long);
400 401
401 /* de-constructor func */ 402 /* de-constructor func */
402 void (*dtor)(void *, kmem_cache_t *, unsigned long); 403 void (*dtor) (void *, kmem_cache_t *, unsigned long);
403 404
404/* 4) cache creation/removal */ 405/* 4) cache creation/removal */
405 const char *name; 406 const char *name;
406 struct list_head next; 407 struct list_head next;
407 408
408/* 5) statistics */ 409/* 5) statistics */
409#if STATS 410#if STATS
410 unsigned long num_active; 411 unsigned long num_active;
411 unsigned long num_allocations; 412 unsigned long num_allocations;
412 unsigned long high_mark; 413 unsigned long high_mark;
413 unsigned long grown; 414 unsigned long grown;
414 unsigned long reaped; 415 unsigned long reaped;
415 unsigned long errors; 416 unsigned long errors;
416 unsigned long max_freeable; 417 unsigned long max_freeable;
417 unsigned long node_allocs; 418 unsigned long node_allocs;
418 unsigned long node_frees; 419 unsigned long node_frees;
419 atomic_t allochit; 420 atomic_t allochit;
420 atomic_t allocmiss; 421 atomic_t allocmiss;
421 atomic_t freehit; 422 atomic_t freehit;
422 atomic_t freemiss; 423 atomic_t freemiss;
423#endif 424#endif
424#if DEBUG 425#if DEBUG
425 int dbghead; 426 int dbghead;
426 int reallen; 427 int reallen;
427#endif 428#endif
428}; 429};
429 430
@@ -434,7 +435,7 @@ struct kmem_cache {
434/* Optimization question: fewer reaps means less 435/* Optimization question: fewer reaps means less
435 * probability for unnessary cpucache drain/refill cycles. 436 * probability for unnessary cpucache drain/refill cycles.
436 * 437 *
437 * OTHO the cpuarrays can contain lots of objects, 438 * OTOH the cpuarrays can contain lots of objects,
438 * which could lock up otherwise freeable slabs. 439 * which could lock up otherwise freeable slabs.
439 */ 440 */
440#define REAPTIMEOUT_CPUC (2*HZ) 441#define REAPTIMEOUT_CPUC (2*HZ)
@@ -523,14 +524,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
523{ 524{
524 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 525 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
525 if (cachep->flags & SLAB_STORE_USER) 526 if (cachep->flags & SLAB_STORE_USER)
526 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); 527 return (unsigned long *)(objp + cachep->objsize -
527 return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); 528 2 * BYTES_PER_WORD);
529 return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
528} 530}
529 531
530static void **dbg_userword(kmem_cache_t *cachep, void *objp) 532static void **dbg_userword(kmem_cache_t *cachep, void *objp)
531{ 533{
532 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 534 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
533 return (void**)(objp+cachep->objsize-BYTES_PER_WORD); 535 return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
534} 536}
535 537
536#else 538#else
@@ -565,14 +567,29 @@ static void **dbg_userword(kmem_cache_t *cachep, void *objp)
565#define BREAK_GFP_ORDER_LO 0 567#define BREAK_GFP_ORDER_LO 0
566static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 568static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
567 569
568/* Macros for storing/retrieving the cachep and or slab from the 570/* Functions for storing/retrieving the cachep and or slab from the
569 * global 'mem_map'. These are used to find the slab an obj belongs to. 571 * global 'mem_map'. These are used to find the slab an obj belongs to.
570 * With kfree(), these are used to find the cache which an obj belongs to. 572 * With kfree(), these are used to find the cache which an obj belongs to.
571 */ 573 */
572#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) 574static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
573#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) 575{
574#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) 576 page->lru.next = (struct list_head *)cache;
575#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) 577}
578
579static inline struct kmem_cache *page_get_cache(struct page *page)
580{
581 return (struct kmem_cache *)page->lru.next;
582}
583
584static inline void page_set_slab(struct page *page, struct slab *slab)
585{
586 page->lru.prev = (struct list_head *)slab;
587}
588
589static inline struct slab *page_get_slab(struct page *page)
590{
591 return (struct slab *)page->lru.prev;
592}
576 593
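An aside on the hunk above: the SET/GET_PAGE_* macros become typed inline helpers. What they encode is that a page owned by the slab allocator does not use its lru list_head, so its two pointers are reused to record which cache and which slab the page backs. A minimal sketch, not part of the patch, of how kfree() can then recover the owning cache from a bare pointer using the helpers just introduced:

	/* Illustrative only -- condensed from the kfree() path in this file. */
	static kmem_cache_t *owning_cache(const void *obj)
	{
		struct page *page = virt_to_page(obj);	/* page backing the object */

		return page_get_cache(page);	/* cachep stashed in page->lru.next */
	}
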
577/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 594/* These are the default caches for kmalloc. Custom caches can have other sizes. */
578struct cache_sizes malloc_sizes[] = { 595struct cache_sizes malloc_sizes[] = {
@@ -592,31 +609,31 @@ struct cache_names {
592static struct cache_names __initdata cache_names[] = { 609static struct cache_names __initdata cache_names[] = {
593#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 610#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
594#include <linux/kmalloc_sizes.h> 611#include <linux/kmalloc_sizes.h>
595 { NULL, } 612 {NULL,}
596#undef CACHE 613#undef CACHE
597}; 614};
598 615
599static struct arraycache_init initarray_cache __initdata = 616static struct arraycache_init initarray_cache __initdata =
600 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 617 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
601static struct arraycache_init initarray_generic = 618static struct arraycache_init initarray_generic =
602 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 619 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
603 620
604/* internal cache of cache description objs */ 621/* internal cache of cache description objs */
605static kmem_cache_t cache_cache = { 622static kmem_cache_t cache_cache = {
606 .batchcount = 1, 623 .batchcount = 1,
607 .limit = BOOT_CPUCACHE_ENTRIES, 624 .limit = BOOT_CPUCACHE_ENTRIES,
608 .shared = 1, 625 .shared = 1,
609 .objsize = sizeof(kmem_cache_t), 626 .objsize = sizeof(kmem_cache_t),
610 .flags = SLAB_NO_REAP, 627 .flags = SLAB_NO_REAP,
611 .spinlock = SPIN_LOCK_UNLOCKED, 628 .spinlock = SPIN_LOCK_UNLOCKED,
612 .name = "kmem_cache", 629 .name = "kmem_cache",
613#if DEBUG 630#if DEBUG
614 .reallen = sizeof(kmem_cache_t), 631 .reallen = sizeof(kmem_cache_t),
615#endif 632#endif
616}; 633};
617 634
618/* Guard access to the cache-chain. */ 635/* Guard access to the cache-chain. */
619static struct semaphore cache_chain_sem; 636static DEFINE_MUTEX(cache_chain_mutex);
620static struct list_head cache_chain; 637static struct list_head cache_chain;
621 638
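An aside on the locking change above: cache_chain_sem, a semaphore used only for mutual exclusion, becomes cache_chain_mutex, so the down()/up() pairs throughout this file turn into mutex_lock()/mutex_unlock(). The cache-chain walks keep the same shape; a minimal sketch, assuming cachep is a kmem_cache_t pointer:

	mutex_lock(&cache_chain_mutex);
	list_for_each_entry(cachep, &cache_chain, next)
		enable_cpucache(cachep);	/* or any other per-cache work */
	mutex_unlock(&cache_chain_mutex);
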
622/* 639/*
@@ -640,9 +657,9 @@ static enum {
640 657
641static DEFINE_PER_CPU(struct work_struct, reap_work); 658static DEFINE_PER_CPU(struct work_struct, reap_work);
642 659
643static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); 660static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
644static void enable_cpucache (kmem_cache_t *cachep); 661static void enable_cpucache(kmem_cache_t *cachep);
645static void cache_reap (void *unused); 662static void cache_reap(void *unused);
646static int __node_shrink(kmem_cache_t *cachep, int node); 663static int __node_shrink(kmem_cache_t *cachep, int node);
647 664
648static inline struct array_cache *ac_data(kmem_cache_t *cachep) 665static inline struct array_cache *ac_data(kmem_cache_t *cachep)
@@ -656,9 +673,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
656 673
657#if DEBUG 674#if DEBUG
658 /* This happens if someone tries to call 675 /* This happens if someone tries to call
659 * kmem_cache_create(), or __kmalloc(), before 676 * kmem_cache_create(), or __kmalloc(), before
660 * the generic caches are initialized. 677 * the generic caches are initialized.
661 */ 678 */
662 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 679 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
663#endif 680#endif
664 while (size > csizep->cs_size) 681 while (size > csizep->cs_size)
@@ -682,10 +699,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
682 699
683/* Cal the num objs, wastage, and bytes left over for a given slab size. */ 700/* Cal the num objs, wastage, and bytes left over for a given slab size. */
684static void cache_estimate(unsigned long gfporder, size_t size, size_t align, 701static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
685 int flags, size_t *left_over, unsigned int *num) 702 int flags, size_t *left_over, unsigned int *num)
686{ 703{
687 int i; 704 int i;
688 size_t wastage = PAGE_SIZE<<gfporder; 705 size_t wastage = PAGE_SIZE << gfporder;
689 size_t extra = 0; 706 size_t extra = 0;
690 size_t base = 0; 707 size_t base = 0;
691 708
@@ -694,7 +711,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
694 extra = sizeof(kmem_bufctl_t); 711 extra = sizeof(kmem_bufctl_t);
695 } 712 }
696 i = 0; 713 i = 0;
697 while (i*size + ALIGN(base+i*extra, align) <= wastage) 714 while (i * size + ALIGN(base + i * extra, align) <= wastage)
698 i++; 715 i++;
699 if (i > 0) 716 if (i > 0)
700 i--; 717 i--;
@@ -703,8 +720,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
703 i = SLAB_LIMIT; 720 i = SLAB_LIMIT;
704 721
705 *num = i; 722 *num = i;
706 wastage -= i*size; 723 wastage -= i * size;
707 wastage -= ALIGN(base+i*extra, align); 724 wastage -= ALIGN(base + i * extra, align);
708 *left_over = wastage; 725 *left_over = wastage;
709} 726}
710 727
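To make cache_estimate() concrete, here is a standalone model of its arithmetic; the sizes and alignment below are assumptions chosen for illustration, not values taken from the patch. With 4096-byte pages, 128-byte objects and on-slab management (a few bytes for struct slab plus one kmem_bufctl_t per object), the loop finds the largest object count whose payload plus aligned management area still fits in the slab:

	#include <stdio.h>

	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		unsigned long wastage = 4096;	/* PAGE_SIZE << gfporder, order 0 */
		unsigned long size = 128;	/* object size */
		unsigned long align = 32;	/* assumed alignment */
		unsigned long base = 24;	/* assumed sizeof(struct slab) */
		unsigned long extra = 4;	/* assumed sizeof(kmem_bufctl_t) */
		unsigned long i = 0;

		while (i * size + ALIGN(base + i * extra, align) <= wastage)
			i++;
		if (i > 0)
			i--;
		printf("objects per slab: %lu, left over: %lu\n",
		       i, wastage - i * size - ALIGN(base + i * extra, align));
		return 0;
	}

With these numbers the model prints 30 objects and 96 bytes left over; the leftover bytes later become colour space.
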
@@ -713,7 +730,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
713static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 730static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
714{ 731{
715 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 732 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
716 function, cachep->name, msg); 733 function, cachep->name, msg);
717 dump_stack(); 734 dump_stack();
718} 735}
719 736
@@ -740,9 +757,9 @@ static void __devinit start_cpu_timer(int cpu)
740} 757}
741 758
742static struct array_cache *alloc_arraycache(int node, int entries, 759static struct array_cache *alloc_arraycache(int node, int entries,
743 int batchcount) 760 int batchcount)
744{ 761{
745 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 762 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
746 struct array_cache *nc = NULL; 763 struct array_cache *nc = NULL;
747 764
748 nc = kmalloc_node(memsize, GFP_KERNEL, node); 765 nc = kmalloc_node(memsize, GFP_KERNEL, node);
@@ -757,10 +774,12 @@ static struct array_cache *alloc_arraycache(int node, int entries,
757} 774}
758 775
759#ifdef CONFIG_NUMA 776#ifdef CONFIG_NUMA
777static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
778
760static inline struct array_cache **alloc_alien_cache(int node, int limit) 779static inline struct array_cache **alloc_alien_cache(int node, int limit)
761{ 780{
762 struct array_cache **ac_ptr; 781 struct array_cache **ac_ptr;
763 int memsize = sizeof(void*)*MAX_NUMNODES; 782 int memsize = sizeof(void *) * MAX_NUMNODES;
764 int i; 783 int i;
765 784
766 if (limit > 1) 785 if (limit > 1)
@@ -774,7 +793,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
774 } 793 }
775 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 794 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
776 if (!ac_ptr[i]) { 795 if (!ac_ptr[i]) {
777 for (i--; i <=0; i--) 796 for (i--; i <= 0; i--)
778 kfree(ac_ptr[i]); 797 kfree(ac_ptr[i]);
779 kfree(ac_ptr); 798 kfree(ac_ptr);
780 return NULL; 799 return NULL;
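One observation on the error path just above: on both sides of the diff the rollback loop reads for (i--; i <= 0; i--), so the condition looks inverted for an unwind of the entries already allocated; only the spacing changed in this patch. A conventional unwind of a partially built array, offered as a hedged sketch rather than as what the kernel code does, would count down from the failing index:

	while (i-- > 0)
		kfree(ac_ptr[i]);	/* free everything allocated before the failure */
	kfree(ac_ptr);
	return NULL;
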
@@ -792,12 +811,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
792 return; 811 return;
793 812
794 for_each_node(i) 813 for_each_node(i)
795 kfree(ac_ptr[i]); 814 kfree(ac_ptr[i]);
796 815
797 kfree(ac_ptr); 816 kfree(ac_ptr);
798} 817}
799 818
800static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) 819static inline void __drain_alien_cache(kmem_cache_t *cachep,
820 struct array_cache *ac, int node)
801{ 821{
802 struct kmem_list3 *rl3 = cachep->nodelists[node]; 822 struct kmem_list3 *rl3 = cachep->nodelists[node];
803 823
@@ -811,7 +831,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
811 831
812static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 832static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
813{ 833{
814 int i=0; 834 int i = 0;
815 struct array_cache *ac; 835 struct array_cache *ac;
816 unsigned long flags; 836 unsigned long flags;
817 837
@@ -831,18 +851,17 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
831#endif 851#endif
832 852
833static int __devinit cpuup_callback(struct notifier_block *nfb, 853static int __devinit cpuup_callback(struct notifier_block *nfb,
834 unsigned long action, void *hcpu) 854 unsigned long action, void *hcpu)
835{ 855{
836 long cpu = (long)hcpu; 856 long cpu = (long)hcpu;
837 kmem_cache_t* cachep; 857 kmem_cache_t *cachep;
838 struct kmem_list3 *l3 = NULL; 858 struct kmem_list3 *l3 = NULL;
839 int node = cpu_to_node(cpu); 859 int node = cpu_to_node(cpu);
840 int memsize = sizeof(struct kmem_list3); 860 int memsize = sizeof(struct kmem_list3);
841 struct array_cache *nc = NULL;
842 861
843 switch (action) { 862 switch (action) {
844 case CPU_UP_PREPARE: 863 case CPU_UP_PREPARE:
845 down(&cache_chain_sem); 864 mutex_lock(&cache_chain_mutex);
846 /* we need to do this right in the beginning since 865 /* we need to do this right in the beginning since
847 * alloc_arraycache's are going to use this list. 866 * alloc_arraycache's are going to use this list.
848 * kmalloc_node allows us to add the slab to the right 867 * kmalloc_node allows us to add the slab to the right
@@ -856,27 +875,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
856 */ 875 */
857 if (!cachep->nodelists[node]) { 876 if (!cachep->nodelists[node]) {
858 if (!(l3 = kmalloc_node(memsize, 877 if (!(l3 = kmalloc_node(memsize,
859 GFP_KERNEL, node))) 878 GFP_KERNEL, node)))
860 goto bad; 879 goto bad;
861 kmem_list3_init(l3); 880 kmem_list3_init(l3);
862 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 881 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
863 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 882 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
864 883
865 cachep->nodelists[node] = l3; 884 cachep->nodelists[node] = l3;
866 } 885 }
867 886
868 spin_lock_irq(&cachep->nodelists[node]->list_lock); 887 spin_lock_irq(&cachep->nodelists[node]->list_lock);
869 cachep->nodelists[node]->free_limit = 888 cachep->nodelists[node]->free_limit =
870 (1 + nr_cpus_node(node)) * 889 (1 + nr_cpus_node(node)) *
871 cachep->batchcount + cachep->num; 890 cachep->batchcount + cachep->num;
872 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 891 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
873 } 892 }
874 893
875 /* Now we can go ahead with allocating the shared array's 894 /* Now we can go ahead with allocating the shared array's
876 & array cache's */ 895 & array cache's */
877 list_for_each_entry(cachep, &cache_chain, next) { 896 list_for_each_entry(cachep, &cache_chain, next) {
897 struct array_cache *nc;
898
878 nc = alloc_arraycache(node, cachep->limit, 899 nc = alloc_arraycache(node, cachep->limit,
879 cachep->batchcount); 900 cachep->batchcount);
880 if (!nc) 901 if (!nc)
881 goto bad; 902 goto bad;
882 cachep->array[cpu] = nc; 903 cachep->array[cpu] = nc;
@@ -885,16 +906,17 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
885 BUG_ON(!l3); 906 BUG_ON(!l3);
886 if (!l3->shared) { 907 if (!l3->shared) {
887 if (!(nc = alloc_arraycache(node, 908 if (!(nc = alloc_arraycache(node,
888 cachep->shared*cachep->batchcount, 909 cachep->shared *
889 0xbaadf00d))) 910 cachep->batchcount,
890 goto bad; 911 0xbaadf00d)))
912 goto bad;
891 913
892 /* we are serialised from CPU_DEAD or 914 /* we are serialised from CPU_DEAD or
893 CPU_UP_CANCELLED by the cpucontrol lock */ 915 CPU_UP_CANCELLED by the cpucontrol lock */
894 l3->shared = nc; 916 l3->shared = nc;
895 } 917 }
896 } 918 }
897 up(&cache_chain_sem); 919 mutex_unlock(&cache_chain_mutex);
898 break; 920 break;
899 case CPU_ONLINE: 921 case CPU_ONLINE:
900 start_cpu_timer(cpu); 922 start_cpu_timer(cpu);
@@ -903,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
903 case CPU_DEAD: 925 case CPU_DEAD:
904 /* fall thru */ 926 /* fall thru */
905 case CPU_UP_CANCELED: 927 case CPU_UP_CANCELED:
906 down(&cache_chain_sem); 928 mutex_lock(&cache_chain_mutex);
907 929
908 list_for_each_entry(cachep, &cache_chain, next) { 930 list_for_each_entry(cachep, &cache_chain, next) {
909 struct array_cache *nc; 931 struct array_cache *nc;
@@ -927,13 +949,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
927 free_block(cachep, nc->entry, nc->avail, node); 949 free_block(cachep, nc->entry, nc->avail, node);
928 950
929 if (!cpus_empty(mask)) { 951 if (!cpus_empty(mask)) {
930 spin_unlock(&l3->list_lock); 952 spin_unlock(&l3->list_lock);
931 goto unlock_cache; 953 goto unlock_cache;
932 } 954 }
933 955
934 if (l3->shared) { 956 if (l3->shared) {
935 free_block(cachep, l3->shared->entry, 957 free_block(cachep, l3->shared->entry,
936 l3->shared->avail, node); 958 l3->shared->avail, node);
937 kfree(l3->shared); 959 kfree(l3->shared);
938 l3->shared = NULL; 960 l3->shared = NULL;
939 } 961 }
@@ -951,17 +973,17 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
951 } else { 973 } else {
952 spin_unlock(&l3->list_lock); 974 spin_unlock(&l3->list_lock);
953 } 975 }
954unlock_cache: 976 unlock_cache:
955 spin_unlock_irq(&cachep->spinlock); 977 spin_unlock_irq(&cachep->spinlock);
956 kfree(nc); 978 kfree(nc);
957 } 979 }
958 up(&cache_chain_sem); 980 mutex_unlock(&cache_chain_mutex);
959 break; 981 break;
960#endif 982#endif
961 } 983 }
962 return NOTIFY_OK; 984 return NOTIFY_OK;
963bad: 985 bad:
964 up(&cache_chain_sem); 986 mutex_unlock(&cache_chain_mutex);
965 return NOTIFY_BAD; 987 return NOTIFY_BAD;
966} 988}
967 989
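For orientation on the long region above: cpuup_callback() is the slab allocator's CPU-hotplug notifier. CPU_UP_PREPARE builds the per-node kmem_list3 plus the per-CPU and shared array caches under cache_chain_mutex, CPU_ONLINE starts the periodic reaper, and CPU_DEAD/CPU_UP_CANCELED tear the state down again. A hedged skeleton of that shape, with the allocation and teardown bodies elided:

	static int example_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
	{
		long cpu = (long)hcpu;

		switch (action) {
		case CPU_UP_PREPARE:
			/* allocate per-cpu and per-node state before the cpu runs */
			break;
		case CPU_ONLINE:
			start_cpu_timer(cpu);	/* kick the periodic cache_reap() */
			break;
		case CPU_DEAD:		/* under CONFIG_HOTPLUG_CPU in the real file */
		case CPU_UP_CANCELED:
			/* free what CPU_UP_PREPARE built for this cpu */
			break;
		}
		return NOTIFY_OK;
	}
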
@@ -970,8 +992,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
970/* 992/*
971 * swap the static kmem_list3 with kmalloced memory 993 * swap the static kmem_list3 with kmalloced memory
972 */ 994 */
973static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, 995static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
974 int nodeid)
975{ 996{
976 struct kmem_list3 *ptr; 997 struct kmem_list3 *ptr;
977 998
@@ -1030,7 +1051,6 @@ void __init kmem_cache_init(void)
1030 */ 1051 */
1031 1052
1032 /* 1) create the cache_cache */ 1053 /* 1) create the cache_cache */
1033 init_MUTEX(&cache_chain_sem);
1034 INIT_LIST_HEAD(&cache_chain); 1054 INIT_LIST_HEAD(&cache_chain);
1035 list_add(&cache_cache.next, &cache_chain); 1055 list_add(&cache_cache.next, &cache_chain);
1036 cache_cache.colour_off = cache_line_size(); 1056 cache_cache.colour_off = cache_line_size();
@@ -1040,14 +1060,14 @@ void __init kmem_cache_init(void)
1040 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1060 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
1041 1061
1042 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1062 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
1043 &left_over, &cache_cache.num); 1063 &left_over, &cache_cache.num);
1044 if (!cache_cache.num) 1064 if (!cache_cache.num)
1045 BUG(); 1065 BUG();
1046 1066
1047 cache_cache.colour = left_over/cache_cache.colour_off; 1067 cache_cache.colour = left_over / cache_cache.colour_off;
1048 cache_cache.colour_next = 0; 1068 cache_cache.colour_next = 0;
1049 cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + 1069 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1050 sizeof(struct slab), cache_line_size()); 1070 sizeof(struct slab), cache_line_size());
1051 1071
1052 /* 2+3) create the kmalloc caches */ 1072 /* 2+3) create the kmalloc caches */
1053 sizes = malloc_sizes; 1073 sizes = malloc_sizes;
@@ -1059,14 +1079,18 @@ void __init kmem_cache_init(void)
1059 */ 1079 */
1060 1080
1061 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1081 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1062 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, 1082 sizes[INDEX_AC].cs_size,
1063 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1083 ARCH_KMALLOC_MINALIGN,
1084 (ARCH_KMALLOC_FLAGS |
1085 SLAB_PANIC), NULL, NULL);
1064 1086
1065 if (INDEX_AC != INDEX_L3) 1087 if (INDEX_AC != INDEX_L3)
1066 sizes[INDEX_L3].cs_cachep = 1088 sizes[INDEX_L3].cs_cachep =
1067 kmem_cache_create(names[INDEX_L3].name, 1089 kmem_cache_create(names[INDEX_L3].name,
1068 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, 1090 sizes[INDEX_L3].cs_size,
1069 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1091 ARCH_KMALLOC_MINALIGN,
1092 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
1093 NULL);
1070 1094
1071 while (sizes->cs_size != ULONG_MAX) { 1095 while (sizes->cs_size != ULONG_MAX) {
1072 /* 1096 /*
@@ -1076,35 +1100,41 @@ void __init kmem_cache_init(void)
1076 * Note for systems short on memory removing the alignment will 1100 * Note for systems short on memory removing the alignment will
1077 * allow tighter packing of the smaller caches. 1101 * allow tighter packing of the smaller caches.
1078 */ 1102 */
1079 if(!sizes->cs_cachep) 1103 if (!sizes->cs_cachep)
1080 sizes->cs_cachep = kmem_cache_create(names->name, 1104 sizes->cs_cachep = kmem_cache_create(names->name,
1081 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1105 sizes->cs_size,
1082 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1106 ARCH_KMALLOC_MINALIGN,
1107 (ARCH_KMALLOC_FLAGS
1108 | SLAB_PANIC),
1109 NULL, NULL);
1083 1110
1084 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1111 /* Inc off-slab bufctl limit until the ceiling is hit. */
1085 if (!(OFF_SLAB(sizes->cs_cachep))) { 1112 if (!(OFF_SLAB(sizes->cs_cachep))) {
1086 offslab_limit = sizes->cs_size-sizeof(struct slab); 1113 offslab_limit = sizes->cs_size - sizeof(struct slab);
1087 offslab_limit /= sizeof(kmem_bufctl_t); 1114 offslab_limit /= sizeof(kmem_bufctl_t);
1088 } 1115 }
1089 1116
1090 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1117 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1091 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1118 sizes->cs_size,
1092 (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), 1119 ARCH_KMALLOC_MINALIGN,
1093 NULL, NULL); 1120 (ARCH_KMALLOC_FLAGS |
1121 SLAB_CACHE_DMA |
1122 SLAB_PANIC), NULL,
1123 NULL);
1094 1124
1095 sizes++; 1125 sizes++;
1096 names++; 1126 names++;
1097 } 1127 }
1098 /* 4) Replace the bootstrap head arrays */ 1128 /* 4) Replace the bootstrap head arrays */
1099 { 1129 {
1100 void * ptr; 1130 void *ptr;
1101 1131
1102 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1132 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1103 1133
1104 local_irq_disable(); 1134 local_irq_disable();
1105 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1135 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
1106 memcpy(ptr, ac_data(&cache_cache), 1136 memcpy(ptr, ac_data(&cache_cache),
1107 sizeof(struct arraycache_init)); 1137 sizeof(struct arraycache_init));
1108 cache_cache.array[smp_processor_id()] = ptr; 1138 cache_cache.array[smp_processor_id()] = ptr;
1109 local_irq_enable(); 1139 local_irq_enable();
1110 1140
@@ -1112,11 +1142,11 @@ void __init kmem_cache_init(void)
1112 1142
1113 local_irq_disable(); 1143 local_irq_disable();
1114 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1144 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
1115 != &initarray_generic.cache); 1145 != &initarray_generic.cache);
1116 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1146 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
1117 sizeof(struct arraycache_init)); 1147 sizeof(struct arraycache_init));
1118 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1148 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1119 ptr; 1149 ptr;
1120 local_irq_enable(); 1150 local_irq_enable();
1121 } 1151 }
1122 /* 5) Replace the bootstrap kmem_list3's */ 1152 /* 5) Replace the bootstrap kmem_list3's */
@@ -1124,16 +1154,16 @@ void __init kmem_cache_init(void)
1124 int node; 1154 int node;
1125 /* Replace the static kmem_list3 structures for the boot cpu */ 1155 /* Replace the static kmem_list3 structures for the boot cpu */
1126 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], 1156 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1127 numa_node_id()); 1157 numa_node_id());
1128 1158
1129 for_each_online_node(node) { 1159 for_each_online_node(node) {
1130 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1160 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1131 &initkmem_list3[SIZE_AC+node], node); 1161 &initkmem_list3[SIZE_AC + node], node);
1132 1162
1133 if (INDEX_AC != INDEX_L3) { 1163 if (INDEX_AC != INDEX_L3) {
1134 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1164 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1135 &initkmem_list3[SIZE_L3+node], 1165 &initkmem_list3[SIZE_L3 + node],
1136 node); 1166 node);
1137 } 1167 }
1138 } 1168 }
1139 } 1169 }
@@ -1141,10 +1171,10 @@ void __init kmem_cache_init(void)
1141 /* 6) resize the head arrays to their final sizes */ 1171 /* 6) resize the head arrays to their final sizes */
1142 { 1172 {
1143 kmem_cache_t *cachep; 1173 kmem_cache_t *cachep;
1144 down(&cache_chain_sem); 1174 mutex_lock(&cache_chain_mutex);
1145 list_for_each_entry(cachep, &cache_chain, next) 1175 list_for_each_entry(cachep, &cache_chain, next)
1146 enable_cpucache(cachep); 1176 enable_cpucache(cachep);
1147 up(&cache_chain_sem); 1177 mutex_unlock(&cache_chain_mutex);
1148 } 1178 }
1149 1179
1150 /* Done! */ 1180 /* Done! */
@@ -1169,7 +1199,7 @@ static int __init cpucache_init(void)
1169 * pages to gfp. 1199 * pages to gfp.
1170 */ 1200 */
1171 for_each_online_cpu(cpu) 1201 for_each_online_cpu(cpu)
1172 start_cpu_timer(cpu); 1202 start_cpu_timer(cpu);
1173 1203
1174 return 0; 1204 return 0;
1175} 1205}
@@ -1190,11 +1220,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1190 int i; 1220 int i;
1191 1221
1192 flags |= cachep->gfpflags; 1222 flags |= cachep->gfpflags;
1193 if (likely(nodeid == -1)) { 1223 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1194 page = alloc_pages(flags, cachep->gfporder);
1195 } else {
1196 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1197 }
1198 if (!page) 1224 if (!page)
1199 return NULL; 1225 return NULL;
1200 addr = page_address(page); 1226 addr = page_address(page);
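A note on the simplification above: the nodeid == -1 special case disappears because alloc_pages_node() is relied on to treat a negative node id as "no preference" and fall back to the local node; that fallback is an assumption of this note rather than something spelled out in the patch. Roughly:

	/* assumed behaviour inside alloc_pages_node() */
	if (nid < 0)
		nid = numa_node_id();	/* no preferred node: use the local one */
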
@@ -1215,7 +1241,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1215 */ 1241 */
1216static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1242static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1217{ 1243{
1218 unsigned long i = (1<<cachep->gfporder); 1244 unsigned long i = (1 << cachep->gfporder);
1219 struct page *page = virt_to_page(addr); 1245 struct page *page = virt_to_page(addr);
1220 const unsigned long nr_freed = i; 1246 const unsigned long nr_freed = i;
1221 1247
@@ -1228,13 +1254,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1228 if (current->reclaim_state) 1254 if (current->reclaim_state)
1229 current->reclaim_state->reclaimed_slab += nr_freed; 1255 current->reclaim_state->reclaimed_slab += nr_freed;
1230 free_pages((unsigned long)addr, cachep->gfporder); 1256 free_pages((unsigned long)addr, cachep->gfporder);
1231 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1257 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1232 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); 1258 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1233} 1259}
1234 1260
1235static void kmem_rcu_free(struct rcu_head *head) 1261static void kmem_rcu_free(struct rcu_head *head)
1236{ 1262{
1237 struct slab_rcu *slab_rcu = (struct slab_rcu *) head; 1263 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1238 kmem_cache_t *cachep = slab_rcu->cachep; 1264 kmem_cache_t *cachep = slab_rcu->cachep;
1239 1265
1240 kmem_freepages(cachep, slab_rcu->addr); 1266 kmem_freepages(cachep, slab_rcu->addr);
@@ -1246,19 +1272,19 @@ static void kmem_rcu_free(struct rcu_head *head)
1246 1272
1247#ifdef CONFIG_DEBUG_PAGEALLOC 1273#ifdef CONFIG_DEBUG_PAGEALLOC
1248static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1274static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1249 unsigned long caller) 1275 unsigned long caller)
1250{ 1276{
1251 int size = obj_reallen(cachep); 1277 int size = obj_reallen(cachep);
1252 1278
1253 addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; 1279 addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
1254 1280
1255 if (size < 5*sizeof(unsigned long)) 1281 if (size < 5 * sizeof(unsigned long))
1256 return; 1282 return;
1257 1283
1258 *addr++=0x12345678; 1284 *addr++ = 0x12345678;
1259 *addr++=caller; 1285 *addr++ = caller;
1260 *addr++=smp_processor_id(); 1286 *addr++ = smp_processor_id();
1261 size -= 3*sizeof(unsigned long); 1287 size -= 3 * sizeof(unsigned long);
1262 { 1288 {
1263 unsigned long *sptr = &caller; 1289 unsigned long *sptr = &caller;
1264 unsigned long svalue; 1290 unsigned long svalue;
@@ -1266,7 +1292,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1266 while (!kstack_end(sptr)) { 1292 while (!kstack_end(sptr)) {
1267 svalue = *sptr++; 1293 svalue = *sptr++;
1268 if (kernel_text_address(svalue)) { 1294 if (kernel_text_address(svalue)) {
1269 *addr++=svalue; 1295 *addr++ = svalue;
1270 size -= sizeof(unsigned long); 1296 size -= sizeof(unsigned long);
1271 if (size <= sizeof(unsigned long)) 1297 if (size <= sizeof(unsigned long))
1272 break; 1298 break;
@@ -1274,25 +1300,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1274 } 1300 }
1275 1301
1276 } 1302 }
1277 *addr++=0x87654321; 1303 *addr++ = 0x87654321;
1278} 1304}
1279#endif 1305#endif
1280 1306
1281static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1307static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
1282{ 1308{
1283 int size = obj_reallen(cachep); 1309 int size = obj_reallen(cachep);
1284 addr = &((char*)addr)[obj_dbghead(cachep)]; 1310 addr = &((char *)addr)[obj_dbghead(cachep)];
1285 1311
1286 memset(addr, val, size); 1312 memset(addr, val, size);
1287 *(unsigned char *)(addr+size-1) = POISON_END; 1313 *(unsigned char *)(addr + size - 1) = POISON_END;
1288} 1314}
1289 1315
1290static void dump_line(char *data, int offset, int limit) 1316static void dump_line(char *data, int offset, int limit)
1291{ 1317{
1292 int i; 1318 int i;
1293 printk(KERN_ERR "%03x:", offset); 1319 printk(KERN_ERR "%03x:", offset);
1294 for (i=0;i<limit;i++) { 1320 for (i = 0; i < limit; i++) {
1295 printk(" %02x", (unsigned char)data[offset+i]); 1321 printk(" %02x", (unsigned char)data[offset + i]);
1296 } 1322 }
1297 printk("\n"); 1323 printk("\n");
1298} 1324}
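To visualise what poison_obj() and dump_line() above operate on: the usable bytes of a free object are filled with a repeating poison byte and the final byte gets a distinct end marker, so a later check can spot both stray writes and one-byte overruns. A standalone model, with the poison values treated as assumptions of the sketch:

	#include <stdio.h>
	#include <string.h>

	#define POISON_FREE	0x6b	/* assumed fill byte */
	#define POISON_END	0xa5	/* assumed end marker */

	int main(void)
	{
		unsigned char obj[16];
		unsigned int i;

		memset(obj, POISON_FREE, sizeof(obj));	/* poison_obj(): fill the object */
		obj[sizeof(obj) - 1] = POISON_END;	/* ...and mark its last byte */

		for (i = 0; i < sizeof(obj); i++)
			printf("%02x ", obj[i]);	/* 6b 6b ... 6b a5 */
		printf("\n");
		return 0;
	}
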
@@ -1307,24 +1333,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1307 1333
1308 if (cachep->flags & SLAB_RED_ZONE) { 1334 if (cachep->flags & SLAB_RED_ZONE) {
1309 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1335 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1310 *dbg_redzone1(cachep, objp), 1336 *dbg_redzone1(cachep, objp),
1311 *dbg_redzone2(cachep, objp)); 1337 *dbg_redzone2(cachep, objp));
1312 } 1338 }
1313 1339
1314 if (cachep->flags & SLAB_STORE_USER) { 1340 if (cachep->flags & SLAB_STORE_USER) {
1315 printk(KERN_ERR "Last user: [<%p>]", 1341 printk(KERN_ERR "Last user: [<%p>]",
1316 *dbg_userword(cachep, objp)); 1342 *dbg_userword(cachep, objp));
1317 print_symbol("(%s)", 1343 print_symbol("(%s)",
1318 (unsigned long)*dbg_userword(cachep, objp)); 1344 (unsigned long)*dbg_userword(cachep, objp));
1319 printk("\n"); 1345 printk("\n");
1320 } 1346 }
1321 realobj = (char*)objp+obj_dbghead(cachep); 1347 realobj = (char *)objp + obj_dbghead(cachep);
1322 size = obj_reallen(cachep); 1348 size = obj_reallen(cachep);
1323 for (i=0; i<size && lines;i+=16, lines--) { 1349 for (i = 0; i < size && lines; i += 16, lines--) {
1324 int limit; 1350 int limit;
1325 limit = 16; 1351 limit = 16;
1326 if (i+limit > size) 1352 if (i + limit > size)
1327 limit = size-i; 1353 limit = size - i;
1328 dump_line(realobj, i, limit); 1354 dump_line(realobj, i, limit);
1329 } 1355 }
1330} 1356}
@@ -1335,27 +1361,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1335 int size, i; 1361 int size, i;
1336 int lines = 0; 1362 int lines = 0;
1337 1363
1338 realobj = (char*)objp+obj_dbghead(cachep); 1364 realobj = (char *)objp + obj_dbghead(cachep);
1339 size = obj_reallen(cachep); 1365 size = obj_reallen(cachep);
1340 1366
1341 for (i=0;i<size;i++) { 1367 for (i = 0; i < size; i++) {
1342 char exp = POISON_FREE; 1368 char exp = POISON_FREE;
1343 if (i == size-1) 1369 if (i == size - 1)
1344 exp = POISON_END; 1370 exp = POISON_END;
1345 if (realobj[i] != exp) { 1371 if (realobj[i] != exp) {
1346 int limit; 1372 int limit;
1347 /* Mismatch ! */ 1373 /* Mismatch ! */
1348 /* Print header */ 1374 /* Print header */
1349 if (lines == 0) { 1375 if (lines == 0) {
1350 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", 1376 printk(KERN_ERR
1351 realobj, size); 1377 "Slab corruption: start=%p, len=%d\n",
1378 realobj, size);
1352 print_objinfo(cachep, objp, 0); 1379 print_objinfo(cachep, objp, 0);
1353 } 1380 }
1354 /* Hexdump the affected line */ 1381 /* Hexdump the affected line */
1355 i = (i/16)*16; 1382 i = (i / 16) * 16;
1356 limit = 16; 1383 limit = 16;
1357 if (i+limit > size) 1384 if (i + limit > size)
1358 limit = size-i; 1385 limit = size - i;
1359 dump_line(realobj, i, limit); 1386 dump_line(realobj, i, limit);
1360 i += 16; 1387 i += 16;
1361 lines++; 1388 lines++;
@@ -1368,22 +1395,22 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1368 /* Print some data about the neighboring objects, if they 1395 /* Print some data about the neighboring objects, if they
1369 * exist: 1396 * exist:
1370 */ 1397 */
1371 struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); 1398 struct slab *slabp = page_get_slab(virt_to_page(objp));
1372 int objnr; 1399 int objnr;
1373 1400
1374 objnr = (objp-slabp->s_mem)/cachep->objsize; 1401 objnr = (objp - slabp->s_mem) / cachep->objsize;
1375 if (objnr) { 1402 if (objnr) {
1376 objp = slabp->s_mem+(objnr-1)*cachep->objsize; 1403 objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
1377 realobj = (char*)objp+obj_dbghead(cachep); 1404 realobj = (char *)objp + obj_dbghead(cachep);
1378 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1405 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1379 realobj, size); 1406 realobj, size);
1380 print_objinfo(cachep, objp, 2); 1407 print_objinfo(cachep, objp, 2);
1381 } 1408 }
1382 if (objnr+1 < cachep->num) { 1409 if (objnr + 1 < cachep->num) {
1383 objp = slabp->s_mem+(objnr+1)*cachep->objsize; 1410 objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
1384 realobj = (char*)objp+obj_dbghead(cachep); 1411 realobj = (char *)objp + obj_dbghead(cachep);
1385 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1412 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1386 realobj, size); 1413 realobj, size);
1387 print_objinfo(cachep, objp, 2); 1414 print_objinfo(cachep, objp, 2);
1388 } 1415 }
1389 } 1416 }
@@ -1394,7 +1421,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1394 * Before calling the slab must have been unlinked from the cache. 1421 * Before calling the slab must have been unlinked from the cache.
1395 * The cache-lock is not held/needed. 1422 * The cache-lock is not held/needed.
1396 */ 1423 */
1397static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) 1424static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1398{ 1425{
1399 void *addr = slabp->s_mem - slabp->colouroff; 1426 void *addr = slabp->s_mem - slabp->colouroff;
1400 1427
@@ -1405,8 +1432,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1405 1432
1406 if (cachep->flags & SLAB_POISON) { 1433 if (cachep->flags & SLAB_POISON) {
1407#ifdef CONFIG_DEBUG_PAGEALLOC 1434#ifdef CONFIG_DEBUG_PAGEALLOC
1408 if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) 1435 if ((cachep->objsize % PAGE_SIZE) == 0
1409 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); 1436 && OFF_SLAB(cachep))
1437 kernel_map_pages(virt_to_page(objp),
1438 cachep->objsize / PAGE_SIZE,
1439 1);
1410 else 1440 else
1411 check_poison_obj(cachep, objp); 1441 check_poison_obj(cachep, objp);
1412#else 1442#else
@@ -1416,20 +1446,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1416 if (cachep->flags & SLAB_RED_ZONE) { 1446 if (cachep->flags & SLAB_RED_ZONE) {
1417 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1447 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1418 slab_error(cachep, "start of a freed object " 1448 slab_error(cachep, "start of a freed object "
1419 "was overwritten"); 1449 "was overwritten");
1420 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1450 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1421 slab_error(cachep, "end of a freed object " 1451 slab_error(cachep, "end of a freed object "
1422 "was overwritten"); 1452 "was overwritten");
1423 } 1453 }
1424 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1454 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1425 (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); 1455 (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
1426 } 1456 }
1427#else 1457#else
1428 if (cachep->dtor) { 1458 if (cachep->dtor) {
1429 int i; 1459 int i;
1430 for (i = 0; i < cachep->num; i++) { 1460 for (i = 0; i < cachep->num; i++) {
1431 void* objp = slabp->s_mem+cachep->objsize*i; 1461 void *objp = slabp->s_mem + cachep->objsize * i;
1432 (cachep->dtor)(objp, cachep, 0); 1462 (cachep->dtor) (objp, cachep, 0);
1433 } 1463 }
1434 } 1464 }
1435#endif 1465#endif
@@ -1437,7 +1467,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1437 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1467 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1438 struct slab_rcu *slab_rcu; 1468 struct slab_rcu *slab_rcu;
1439 1469
1440 slab_rcu = (struct slab_rcu *) slabp; 1470 slab_rcu = (struct slab_rcu *)slabp;
1441 slab_rcu->cachep = cachep; 1471 slab_rcu->cachep = cachep;
1442 slab_rcu->addr = addr; 1472 slab_rcu->addr = addr;
1443 call_rcu(&slab_rcu->head, kmem_rcu_free); 1473 call_rcu(&slab_rcu->head, kmem_rcu_free);
@@ -1455,11 +1485,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1455 int node; 1485 int node;
1456 1486
1457 for_each_online_node(node) { 1487 for_each_online_node(node) {
1458 cachep->nodelists[node] = &initkmem_list3[index+node]; 1488 cachep->nodelists[node] = &initkmem_list3[index + node];
1459 cachep->nodelists[node]->next_reap = jiffies + 1489 cachep->nodelists[node]->next_reap = jiffies +
1460 REAPTIMEOUT_LIST3 + 1490 REAPTIMEOUT_LIST3 +
1461 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1491 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1492 }
1493}
1494
1495/**
1496 * calculate_slab_order - calculate size (page order) of slabs and the number
1497 * of objects per slab.
1498 *
1499 * This could be made much more intelligent. For now, try to avoid using
1500 * high order pages for slabs. When the gfp() functions are more friendly
1501 * towards high-order requests, this should be changed.
1502 */
1503static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1504 size_t align, gfp_t flags)
1505{
1506 size_t left_over = 0;
1507
1508 for (;; cachep->gfporder++) {
1509 unsigned int num;
1510 size_t remainder;
1511
1512 if (cachep->gfporder > MAX_GFP_ORDER) {
1513 cachep->num = 0;
1514 break;
1515 }
1516
1517 cache_estimate(cachep->gfporder, size, align, flags,
1518 &remainder, &num);
1519 if (!num)
1520 continue;
1521 /* More than offslab_limit objects will cause problems */
1522 if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
1523 break;
1524
1525 cachep->num = num;
1526 left_over = remainder;
1527
1528 /*
1529 * Large number of objects is good, but very large slabs are
1530 * currently bad for the gfp()s.
1531 */
1532 if (cachep->gfporder >= slab_break_gfp_order)
1533 break;
1534
1535 if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
1536 /* Acceptable internal fragmentation */
1537 break;
1462 } 1538 }
1539 return left_over;
1463} 1540}
1464 1541
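A reading of the new calculate_slab_order() helper above: starting at the current cachep->gfporder it asks cache_estimate() how many objects fit at each order, keeps the best answer, and stops once the order reaches slab_break_gfp_order or the leftover falls to an eighth of the slab or less. A trimmed standalone model of those stop conditions; the constants and the toy estimate are assumptions, and the off-slab bufctl limit is ignored:

	#include <stdio.h>

	#define PAGE_BYTES	4096UL	/* model assumption */
	#define MAX_ORDER_MODEL	5	/* model assumption */
	#define BREAK_ORDER	1	/* stand-in for slab_break_gfp_order */

	/* Toy estimate: how many size-byte objects fit in 2^order pages. */
	static unsigned int estimate(int order, unsigned long size, unsigned long *left)
	{
		unsigned long bytes = PAGE_BYTES << order;
		unsigned int num = bytes / size;

		*left = bytes - num * size;
		return num;
	}

	int main(void)
	{
		unsigned long size = 3000;	/* an awkward object size */
		unsigned long left = 0;
		int order;

		for (order = 0; order <= MAX_ORDER_MODEL; order++) {
			if (!estimate(order, size, &left))
				continue;	/* object does not fit yet */
			if (order >= BREAK_ORDER)
				break;		/* avoid high-order pages */
			if (left * 8 <= (PAGE_BYTES << order))
				break;		/* fragmentation is acceptable */
		}
		printf("chose order %d, %lu bytes left over\n", order, left);
		return 0;
	}

With a 3000-byte object this picks order 1 with 2192 bytes left over, mirroring how the real helper trades slab size against internal fragmentation.
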
1465/** 1542/**
@@ -1508,16 +1585,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1508 * Sanity checks... these are all serious usage bugs. 1585 * Sanity checks... these are all serious usage bugs.
1509 */ 1586 */
1510 if ((!name) || 1587 if ((!name) ||
1511 in_interrupt() || 1588 in_interrupt() ||
1512 (size < BYTES_PER_WORD) || 1589 (size < BYTES_PER_WORD) ||
1513 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || 1590 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1514 (dtor && !ctor)) { 1591 printk(KERN_ERR "%s: Early error in slab %s\n",
1515 printk(KERN_ERR "%s: Early error in slab %s\n", 1592 __FUNCTION__, name);
1516 __FUNCTION__, name); 1593 BUG();
1517 BUG(); 1594 }
1518 }
1519 1595
1520 down(&cache_chain_sem); 1596 mutex_lock(&cache_chain_mutex);
1521 1597
1522 list_for_each(p, &cache_chain) { 1598 list_for_each(p, &cache_chain) {
1523 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); 1599 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
@@ -1535,11 +1611,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1535 set_fs(old_fs); 1611 set_fs(old_fs);
1536 if (res) { 1612 if (res) {
1537 printk("SLAB: cache with size %d has lost its name\n", 1613 printk("SLAB: cache with size %d has lost its name\n",
1538 pc->objsize); 1614 pc->objsize);
1539 continue; 1615 continue;
1540 } 1616 }
1541 1617
1542 if (!strcmp(pc->name,name)) { 1618 if (!strcmp(pc->name, name)) {
1543 printk("kmem_cache_create: duplicate cache %s\n", name); 1619 printk("kmem_cache_create: duplicate cache %s\n", name);
1544 dump_stack(); 1620 dump_stack();
1545 goto oops; 1621 goto oops;
@@ -1551,10 +1627,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1551 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 1627 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1552 /* No constructor, but inital state check requested */ 1628 /* No constructor, but inital state check requested */
1553 printk(KERN_ERR "%s: No con, but init state check " 1629 printk(KERN_ERR "%s: No con, but init state check "
1554 "requested - %s\n", __FUNCTION__, name); 1630 "requested - %s\n", __FUNCTION__, name);
1555 flags &= ~SLAB_DEBUG_INITIAL; 1631 flags &= ~SLAB_DEBUG_INITIAL;
1556 } 1632 }
1557
1558#if FORCED_DEBUG 1633#if FORCED_DEBUG
1559 /* 1634 /*
1560 * Enable redzoning and last user accounting, except for caches with 1635 * Enable redzoning and last user accounting, except for caches with
@@ -1562,8 +1637,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1562 * above the next power of two: caches with object sizes just above a 1637 * above the next power of two: caches with object sizes just above a
1563 * power of two have a significant amount of internal fragmentation. 1638 * power of two have a significant amount of internal fragmentation.
1564 */ 1639 */
1565 if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) 1640 if ((size < 4096
1566 flags |= SLAB_RED_ZONE|SLAB_STORE_USER; 1641 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1642 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1567 if (!(flags & SLAB_DESTROY_BY_RCU)) 1643 if (!(flags & SLAB_DESTROY_BY_RCU))
1568 flags |= SLAB_POISON; 1644 flags |= SLAB_POISON;
1569#endif 1645#endif
@@ -1584,9 +1660,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1584 * unaligned accesses for some archs when redzoning is used, and makes 1660 * unaligned accesses for some archs when redzoning is used, and makes
1585 * sure any on-slab bufctl's are also correctly aligned. 1661 * sure any on-slab bufctl's are also correctly aligned.
1586 */ 1662 */
1587 if (size & (BYTES_PER_WORD-1)) { 1663 if (size & (BYTES_PER_WORD - 1)) {
1588 size += (BYTES_PER_WORD-1); 1664 size += (BYTES_PER_WORD - 1);
1589 size &= ~(BYTES_PER_WORD-1); 1665 size &= ~(BYTES_PER_WORD - 1);
1590 } 1666 }
1591 1667
1592 /* calculate out the final buffer alignment: */ 1668 /* calculate out the final buffer alignment: */
@@ -1597,7 +1673,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1597 * objects into one cacheline. 1673 * objects into one cacheline.
1598 */ 1674 */
1599 ralign = cache_line_size(); 1675 ralign = cache_line_size();
1600 while (size <= ralign/2) 1676 while (size <= ralign / 2)
1601 ralign /= 2; 1677 ralign /= 2;
1602 } else { 1678 } else {
1603 ralign = BYTES_PER_WORD; 1679 ralign = BYTES_PER_WORD;
@@ -1606,13 +1682,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1606 if (ralign < ARCH_SLAB_MINALIGN) { 1682 if (ralign < ARCH_SLAB_MINALIGN) {
1607 ralign = ARCH_SLAB_MINALIGN; 1683 ralign = ARCH_SLAB_MINALIGN;
1608 if (ralign > BYTES_PER_WORD) 1684 if (ralign > BYTES_PER_WORD)
1609 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1685 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1610 } 1686 }
1611 /* 3) caller mandated alignment: disables debug if necessary */ 1687 /* 3) caller mandated alignment: disables debug if necessary */
1612 if (ralign < align) { 1688 if (ralign < align) {
1613 ralign = align; 1689 ralign = align;
1614 if (ralign > BYTES_PER_WORD) 1690 if (ralign > BYTES_PER_WORD)
1615 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1691 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1616 } 1692 }
1617 /* 4) Store it. Note that the debug code below can reduce 1693 /* 4) Store it. Note that the debug code below can reduce
1618 * the alignment to BYTES_PER_WORD. 1694 * the alignment to BYTES_PER_WORD.
@@ -1634,7 +1710,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1634 1710
1635 /* add space for red zone words */ 1711 /* add space for red zone words */
1636 cachep->dbghead += BYTES_PER_WORD; 1712 cachep->dbghead += BYTES_PER_WORD;
1637 size += 2*BYTES_PER_WORD; 1713 size += 2 * BYTES_PER_WORD;
1638 } 1714 }
1639 if (flags & SLAB_STORE_USER) { 1715 if (flags & SLAB_STORE_USER) {
1640 /* user store requires word alignment and 1716 /* user store requires word alignment and
@@ -1645,7 +1721,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1645 size += BYTES_PER_WORD; 1721 size += BYTES_PER_WORD;
1646 } 1722 }
1647#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1723#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1648 if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1724 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1725 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1649 cachep->dbghead += PAGE_SIZE - size; 1726 cachep->dbghead += PAGE_SIZE - size;
1650 size = PAGE_SIZE; 1727 size = PAGE_SIZE;
1651 } 1728 }
@@ -1653,7 +1730,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1653#endif 1730#endif
1654 1731
1655 /* Determine if the slab management is 'on' or 'off' slab. */ 1732 /* Determine if the slab management is 'on' or 'off' slab. */
1656 if (size >= (PAGE_SIZE>>3)) 1733 if (size >= (PAGE_SIZE >> 3))
1657 /* 1734 /*
1658 * Size is large, assume best to place the slab management obj 1735 * Size is large, assume best to place the slab management obj
1659 * off-slab (should allow better packing of objs). 1736 * off-slab (should allow better packing of objs).
@@ -1670,47 +1747,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1670 */ 1747 */
1671 cachep->gfporder = 0; 1748 cachep->gfporder = 0;
1672 cache_estimate(cachep->gfporder, size, align, flags, 1749 cache_estimate(cachep->gfporder, size, align, flags,
1673 &left_over, &cachep->num); 1750 &left_over, &cachep->num);
1674 } else { 1751 } else
1675 /* 1752 left_over = calculate_slab_order(cachep, size, align, flags);
1676 * Calculate size (in pages) of slabs, and the num of objs per
1677 * slab. This could be made much more intelligent. For now,
1678 * try to avoid using high page-orders for slabs. When the
1679 * gfp() funcs are more friendly towards high-order requests,
1680 * this should be changed.
1681 */
1682 do {
1683 unsigned int break_flag = 0;
1684cal_wastage:
1685 cache_estimate(cachep->gfporder, size, align, flags,
1686 &left_over, &cachep->num);
1687 if (break_flag)
1688 break;
1689 if (cachep->gfporder >= MAX_GFP_ORDER)
1690 break;
1691 if (!cachep->num)
1692 goto next;
1693 if (flags & CFLGS_OFF_SLAB &&
1694 cachep->num > offslab_limit) {
1695 /* This num of objs will cause problems. */
1696 cachep->gfporder--;
1697 break_flag++;
1698 goto cal_wastage;
1699 }
1700
1701 /*
1702 * Large num of objs is good, but v. large slabs are
1703 * currently bad for the gfp()s.
1704 */
1705 if (cachep->gfporder >= slab_break_gfp_order)
1706 break;
1707
1708 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1709 break; /* Acceptable internal fragmentation. */
1710next:
1711 cachep->gfporder++;
1712 } while (1);
1713 }
1714 1753
1715 if (!cachep->num) { 1754 if (!cachep->num) {
1716 printk("kmem_cache_create: couldn't create cache %s.\n", name); 1755 printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1718,8 +1757,8 @@ next:
1718 cachep = NULL; 1757 cachep = NULL;
1719 goto oops; 1758 goto oops;
1720 } 1759 }
1721 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) 1760 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
1722 + sizeof(struct slab), align); 1761 + sizeof(struct slab), align);
1723 1762
1724 /* 1763 /*
1725 * If the slab has been placed off-slab, and we have enough space then 1764 * If the slab has been placed off-slab, and we have enough space then
@@ -1732,14 +1771,15 @@ next:
1732 1771
1733 if (flags & CFLGS_OFF_SLAB) { 1772 if (flags & CFLGS_OFF_SLAB) {
1734 /* really off slab. No need for manual alignment */ 1773 /* really off slab. No need for manual alignment */
1735 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); 1774 slab_size =
1775 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
1736 } 1776 }
1737 1777
1738 cachep->colour_off = cache_line_size(); 1778 cachep->colour_off = cache_line_size();
1739 /* Offset must be a multiple of the alignment. */ 1779 /* Offset must be a multiple of the alignment. */
1740 if (cachep->colour_off < align) 1780 if (cachep->colour_off < align)
1741 cachep->colour_off = align; 1781 cachep->colour_off = align;
1742 cachep->colour = left_over/cachep->colour_off; 1782 cachep->colour = left_over / cachep->colour_off;
1743 cachep->slab_size = slab_size; 1783 cachep->slab_size = slab_size;
1744 cachep->flags = flags; 1784 cachep->flags = flags;
1745 cachep->gfpflags = 0; 1785 cachep->gfpflags = 0;
@@ -1766,7 +1806,7 @@ next:
1766 * the creation of further caches will BUG(). 1806 * the creation of further caches will BUG().
1767 */ 1807 */
1768 cachep->array[smp_processor_id()] = 1808 cachep->array[smp_processor_id()] =
1769 &initarray_generic.cache; 1809 &initarray_generic.cache;
1770 1810
1771 /* If the cache that's used by 1811 /* If the cache that's used by
1772 * kmalloc(sizeof(kmem_list3)) is the first cache, 1812 * kmalloc(sizeof(kmem_list3)) is the first cache,
@@ -1780,8 +1820,7 @@ next:
1780 g_cpucache_up = PARTIAL_AC; 1820 g_cpucache_up = PARTIAL_AC;
1781 } else { 1821 } else {
1782 cachep->array[smp_processor_id()] = 1822 cachep->array[smp_processor_id()] =
1783 kmalloc(sizeof(struct arraycache_init), 1823 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1784 GFP_KERNEL);
1785 1824
1786 if (g_cpucache_up == PARTIAL_AC) { 1825 if (g_cpucache_up == PARTIAL_AC) {
1787 set_up_list3s(cachep, SIZE_L3); 1826 set_up_list3s(cachep, SIZE_L3);
@@ -1791,16 +1830,18 @@ next:
1791 for_each_online_node(node) { 1830 for_each_online_node(node) {
1792 1831
1793 cachep->nodelists[node] = 1832 cachep->nodelists[node] =
1794 kmalloc_node(sizeof(struct kmem_list3), 1833 kmalloc_node(sizeof
1795 GFP_KERNEL, node); 1834 (struct kmem_list3),
1835 GFP_KERNEL, node);
1796 BUG_ON(!cachep->nodelists[node]); 1836 BUG_ON(!cachep->nodelists[node]);
1797 kmem_list3_init(cachep->nodelists[node]); 1837 kmem_list3_init(cachep->
1838 nodelists[node]);
1798 } 1839 }
1799 } 1840 }
1800 } 1841 }
1801 cachep->nodelists[numa_node_id()]->next_reap = 1842 cachep->nodelists[numa_node_id()]->next_reap =
1802 jiffies + REAPTIMEOUT_LIST3 + 1843 jiffies + REAPTIMEOUT_LIST3 +
1803 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1844 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1804 1845
1805 BUG_ON(!ac_data(cachep)); 1846 BUG_ON(!ac_data(cachep));
1806 ac_data(cachep)->avail = 0; 1847 ac_data(cachep)->avail = 0;
@@ -1809,16 +1850,16 @@ next:
1809 ac_data(cachep)->touched = 0; 1850 ac_data(cachep)->touched = 0;
1810 cachep->batchcount = 1; 1851 cachep->batchcount = 1;
1811 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1852 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1812 } 1853 }
1813 1854
1814 /* cache setup completed, link it into the list */ 1855 /* cache setup completed, link it into the list */
1815 list_add(&cachep->next, &cache_chain); 1856 list_add(&cachep->next, &cache_chain);
1816 unlock_cpu_hotplug(); 1857 unlock_cpu_hotplug();
1817oops: 1858 oops:
1818 if (!cachep && (flags & SLAB_PANIC)) 1859 if (!cachep && (flags & SLAB_PANIC))
1819 panic("kmem_cache_create(): failed to create slab `%s'\n", 1860 panic("kmem_cache_create(): failed to create slab `%s'\n",
1820 name); 1861 name);
1821 up(&cache_chain_sem); 1862 mutex_unlock(&cache_chain_mutex);
1822 return cachep; 1863 return cachep;
1823} 1864}
1824EXPORT_SYMBOL(kmem_cache_create); 1865EXPORT_SYMBOL(kmem_cache_create);
@@ -1860,7 +1901,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1860/* 1901/*
1861 * Waits for all CPUs to execute func(). 1902 * Waits for all CPUs to execute func().
1862 */ 1903 */
1863static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1904static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1864{ 1905{
1865 check_irq_on(); 1906 check_irq_on();
1866 preempt_disable(); 1907 preempt_disable();
@@ -1875,12 +1916,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1875 preempt_enable(); 1916 preempt_enable();
1876} 1917}
1877 1918
1878static void drain_array_locked(kmem_cache_t* cachep, 1919static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
1879 struct array_cache *ac, int force, int node); 1920 int force, int node);
1880 1921
1881static void do_drain(void *arg) 1922static void do_drain(void *arg)
1882{ 1923{
1883 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1924 kmem_cache_t *cachep = (kmem_cache_t *) arg;
1884 struct array_cache *ac; 1925 struct array_cache *ac;
1885 int node = numa_node_id(); 1926 int node = numa_node_id();
1886 1927
@@ -1900,7 +1941,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep)
1900 smp_call_function_all_cpus(do_drain, cachep); 1941 smp_call_function_all_cpus(do_drain, cachep);
1901 check_irq_on(); 1942 check_irq_on();
1902 spin_lock_irq(&cachep->spinlock); 1943 spin_lock_irq(&cachep->spinlock);
1903 for_each_online_node(node) { 1944 for_each_online_node(node) {
1904 l3 = cachep->nodelists[node]; 1945 l3 = cachep->nodelists[node];
1905 if (l3) { 1946 if (l3) {
1906 spin_lock(&l3->list_lock); 1947 spin_lock(&l3->list_lock);
@@ -1938,8 +1979,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
1938 slab_destroy(cachep, slabp); 1979 slab_destroy(cachep, slabp);
1939 spin_lock_irq(&l3->list_lock); 1980 spin_lock_irq(&l3->list_lock);
1940 } 1981 }
1941 ret = !list_empty(&l3->slabs_full) || 1982 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
1942 !list_empty(&l3->slabs_partial);
1943 return ret; 1983 return ret;
1944} 1984}
1945 1985
@@ -1995,7 +2035,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
1995 * The caller must guarantee that noone will allocate memory from the cache 2035 * The caller must guarantee that noone will allocate memory from the cache
1996 * during the kmem_cache_destroy(). 2036 * during the kmem_cache_destroy().
1997 */ 2037 */
1998int kmem_cache_destroy(kmem_cache_t * cachep) 2038int kmem_cache_destroy(kmem_cache_t *cachep)
1999{ 2039{
2000 int i; 2040 int i;
2001 struct kmem_list3 *l3; 2041 struct kmem_list3 *l3;
@@ -2007,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2007 lock_cpu_hotplug(); 2047 lock_cpu_hotplug();
2008 2048
2009 /* Find the cache in the chain of caches. */ 2049 /* Find the cache in the chain of caches. */
2010 down(&cache_chain_sem); 2050 mutex_lock(&cache_chain_mutex);
2011 /* 2051 /*
2012 * the chain is never empty, cache_cache is never destroyed 2052 * the chain is never empty, cache_cache is never destroyed
2013 */ 2053 */
2014 list_del(&cachep->next); 2054 list_del(&cachep->next);
2015 up(&cache_chain_sem); 2055 mutex_unlock(&cache_chain_mutex);
2016 2056
2017 if (__cache_shrink(cachep)) { 2057 if (__cache_shrink(cachep)) {
2018 slab_error(cachep, "Can't free all objects"); 2058 slab_error(cachep, "Can't free all objects");
2019 down(&cache_chain_sem); 2059 mutex_lock(&cache_chain_mutex);
2020 list_add(&cachep->next,&cache_chain); 2060 list_add(&cachep->next, &cache_chain);
2021 up(&cache_chain_sem); 2061 mutex_unlock(&cache_chain_mutex);
2022 unlock_cpu_hotplug(); 2062 unlock_cpu_hotplug();
2023 return 1; 2063 return 1;
2024 } 2064 }
@@ -2027,7 +2067,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2027 synchronize_rcu(); 2067 synchronize_rcu();
2028 2068
2029 for_each_online_cpu(i) 2069 for_each_online_cpu(i)
2030 kfree(cachep->array[i]); 2070 kfree(cachep->array[i]);
2031 2071
2032 /* NUMA: free the list3 structures */ 2072 /* NUMA: free the list3 structures */
2033 for_each_online_node(i) { 2073 for_each_online_node(i) {
@@ -2046,39 +2086,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2046EXPORT_SYMBOL(kmem_cache_destroy); 2086EXPORT_SYMBOL(kmem_cache_destroy);
2047 2087
2048/* Get the memory for a slab management obj. */ 2088/* Get the memory for a slab management obj. */
2049static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2089static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
2050 int colour_off, gfp_t local_flags) 2090 int colour_off, gfp_t local_flags)
2051{ 2091{
2052 struct slab *slabp; 2092 struct slab *slabp;
2053 2093
2054 if (OFF_SLAB(cachep)) { 2094 if (OFF_SLAB(cachep)) {
2055 /* Slab management obj is off-slab. */ 2095 /* Slab management obj is off-slab. */
2056 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2096 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2057 if (!slabp) 2097 if (!slabp)
2058 return NULL; 2098 return NULL;
2059 } else { 2099 } else {
2060 slabp = objp+colour_off; 2100 slabp = objp + colour_off;
2061 colour_off += cachep->slab_size; 2101 colour_off += cachep->slab_size;
2062 } 2102 }
2063 slabp->inuse = 0; 2103 slabp->inuse = 0;
2064 slabp->colouroff = colour_off; 2104 slabp->colouroff = colour_off;
2065 slabp->s_mem = objp+colour_off; 2105 slabp->s_mem = objp + colour_off;
2066 2106
2067 return slabp; 2107 return slabp;
2068} 2108}
2069 2109
2070static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2110static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2071{ 2111{
2072 return (kmem_bufctl_t *)(slabp+1); 2112 return (kmem_bufctl_t *) (slabp + 1);
2073} 2113}
2074 2114
2075static void cache_init_objs(kmem_cache_t *cachep, 2115static void cache_init_objs(kmem_cache_t *cachep,
2076 struct slab *slabp, unsigned long ctor_flags) 2116 struct slab *slabp, unsigned long ctor_flags)
2077{ 2117{
2078 int i; 2118 int i;
2079 2119
2080 for (i = 0; i < cachep->num; i++) { 2120 for (i = 0; i < cachep->num; i++) {
2081 void *objp = slabp->s_mem+cachep->objsize*i; 2121 void *objp = slabp->s_mem + cachep->objsize * i;
2082#if DEBUG 2122#if DEBUG
2083 /* need to poison the objs? */ 2123 /* need to poison the objs? */
2084 if (cachep->flags & SLAB_POISON) 2124 if (cachep->flags & SLAB_POISON)
@@ -2096,25 +2136,28 @@ static void cache_init_objs(kmem_cache_t *cachep,
2096 * Otherwise, deadlock. They must also be threaded. 2136 * Otherwise, deadlock. They must also be threaded.
2097 */ 2137 */
2098 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2138 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2099 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 2139 cachep->ctor(objp + obj_dbghead(cachep), cachep,
2140 ctor_flags);
2100 2141
2101 if (cachep->flags & SLAB_RED_ZONE) { 2142 if (cachep->flags & SLAB_RED_ZONE) {
2102 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2143 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2103 slab_error(cachep, "constructor overwrote the" 2144 slab_error(cachep, "constructor overwrote the"
2104 " end of an object"); 2145 " end of an object");
2105 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2146 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2106 slab_error(cachep, "constructor overwrote the" 2147 slab_error(cachep, "constructor overwrote the"
2107 " start of an object"); 2148 " start of an object");
2108 } 2149 }
2109 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2150 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2110 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2151 && cachep->flags & SLAB_POISON)
2152 kernel_map_pages(virt_to_page(objp),
2153 cachep->objsize / PAGE_SIZE, 0);
2111#else 2154#else
2112 if (cachep->ctor) 2155 if (cachep->ctor)
2113 cachep->ctor(objp, cachep, ctor_flags); 2156 cachep->ctor(objp, cachep, ctor_flags);
2114#endif 2157#endif
2115 slab_bufctl(slabp)[i] = i+1; 2158 slab_bufctl(slabp)[i] = i + 1;
2116 } 2159 }
2117 slab_bufctl(slabp)[i-1] = BUFCTL_END; 2160 slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2118 slabp->free = 0; 2161 slabp->free = 0;
2119} 2162}
2120 2163
@@ -2138,8 +2181,8 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2138 i = 1 << cachep->gfporder; 2181 i = 1 << cachep->gfporder;
2139 page = virt_to_page(objp); 2182 page = virt_to_page(objp);
2140 do { 2183 do {
2141 SET_PAGE_CACHE(page, cachep); 2184 page_set_cache(page, cachep);
2142 SET_PAGE_SLAB(page, slabp); 2185 page_set_slab(page, slabp);
2143 page++; 2186 page++;
2144 } while (--i); 2187 } while (--i);
2145} 2188}
@@ -2150,17 +2193,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2150 */ 2193 */
2151static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2194static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2152{ 2195{
2153 struct slab *slabp; 2196 struct slab *slabp;
2154 void *objp; 2197 void *objp;
2155 size_t offset; 2198 size_t offset;
2156 gfp_t local_flags; 2199 gfp_t local_flags;
2157 unsigned long ctor_flags; 2200 unsigned long ctor_flags;
2158 struct kmem_list3 *l3; 2201 struct kmem_list3 *l3;
2159 2202
2160 /* Be lazy and only check for valid flags here, 2203 /* Be lazy and only check for valid flags here,
2161 * keeping it out of the critical path in kmem_cache_alloc(). 2204 * keeping it out of the critical path in kmem_cache_alloc().
2162 */ 2205 */
2163 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 2206 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2164 BUG(); 2207 BUG();
2165 if (flags & SLAB_NO_GROW) 2208 if (flags & SLAB_NO_GROW)
2166 return 0; 2209 return 0;
@@ -2226,9 +2269,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2226 l3->free_objects += cachep->num; 2269 l3->free_objects += cachep->num;
2227 spin_unlock(&l3->list_lock); 2270 spin_unlock(&l3->list_lock);
2228 return 1; 2271 return 1;
2229opps1: 2272 opps1:
2230 kmem_freepages(cachep, objp); 2273 kmem_freepages(cachep, objp);
2231failed: 2274 failed:
2232 if (local_flags & __GFP_WAIT) 2275 if (local_flags & __GFP_WAIT)
2233 local_irq_disable(); 2276 local_irq_disable();
2234 return 0; 2277 return 0;
@@ -2248,18 +2291,19 @@ static void kfree_debugcheck(const void *objp)
2248 2291
2249 if (!virt_addr_valid(objp)) { 2292 if (!virt_addr_valid(objp)) {
2250 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2293 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2251 (unsigned long)objp); 2294 (unsigned long)objp);
2252 BUG(); 2295 BUG();
2253 } 2296 }
2254 page = virt_to_page(objp); 2297 page = virt_to_page(objp);
2255 if (!PageSlab(page)) { 2298 if (!PageSlab(page)) {
2256 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 2299 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2300 (unsigned long)objp);
2257 BUG(); 2301 BUG();
2258 } 2302 }
2259} 2303}
2260 2304
2261static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2305static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2262 void *caller) 2306 void *caller)
2263{ 2307{
2264 struct page *page; 2308 struct page *page;
2265 unsigned int objnr; 2309 unsigned int objnr;
@@ -2269,21 +2313,27 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2269 kfree_debugcheck(objp); 2313 kfree_debugcheck(objp);
2270 page = virt_to_page(objp); 2314 page = virt_to_page(objp);
2271 2315
2272 if (GET_PAGE_CACHE(page) != cachep) { 2316 if (page_get_cache(page) != cachep) {
2273 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2317 printk(KERN_ERR
2274 GET_PAGE_CACHE(page),cachep); 2318 "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2319 page_get_cache(page), cachep);
2275 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2320 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2276 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); 2321 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2322 page_get_cache(page)->name);
2277 WARN_ON(1); 2323 WARN_ON(1);
2278 } 2324 }
2279 slabp = GET_PAGE_SLAB(page); 2325 slabp = page_get_slab(page);
2280 2326
2281 if (cachep->flags & SLAB_RED_ZONE) { 2327 if (cachep->flags & SLAB_RED_ZONE) {
2282 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2328 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
2283 slab_error(cachep, "double free, or memory outside" 2329 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2284 " object was overwritten"); 2330 slab_error(cachep,
2285 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2331 "double free, or memory outside"
2286 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2332 " object was overwritten");
2333 printk(KERN_ERR
2334 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2335 objp, *dbg_redzone1(cachep, objp),
2336 *dbg_redzone2(cachep, objp));
2287 } 2337 }
2288 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2338 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2289 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2339 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
@@ -2291,30 +2341,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2291 if (cachep->flags & SLAB_STORE_USER) 2341 if (cachep->flags & SLAB_STORE_USER)
2292 *dbg_userword(cachep, objp) = caller; 2342 *dbg_userword(cachep, objp) = caller;
2293 2343
2294 objnr = (objp-slabp->s_mem)/cachep->objsize; 2344 objnr = (objp - slabp->s_mem) / cachep->objsize;
2295 2345
2296 BUG_ON(objnr >= cachep->num); 2346 BUG_ON(objnr >= cachep->num);
2297 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 2347 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
2298 2348
2299 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2349 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2300 /* Need to call the slab's constructor so the 2350 /* Need to call the slab's constructor so the
2301 * caller can perform a verify of its state (debugging). 2351 * caller can perform a verify of its state (debugging).
2302 * Called without the cache-lock held. 2352 * Called without the cache-lock held.
2303 */ 2353 */
2304 cachep->ctor(objp+obj_dbghead(cachep), 2354 cachep->ctor(objp + obj_dbghead(cachep),
2305 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 2355 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2306 } 2356 }
2307 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2357 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2308 /* we want to cache poison the object, 2358 /* we want to cache poison the object,
2309 * call the destruction callback 2359 * call the destruction callback
2310 */ 2360 */
2311 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 2361 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
2312 } 2362 }
2313 if (cachep->flags & SLAB_POISON) { 2363 if (cachep->flags & SLAB_POISON) {
2314#ifdef CONFIG_DEBUG_PAGEALLOC 2364#ifdef CONFIG_DEBUG_PAGEALLOC
2315 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2365 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2316 store_stackinfo(cachep, objp, (unsigned long)caller); 2366 store_stackinfo(cachep, objp, (unsigned long)caller);
2317 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2367 kernel_map_pages(virt_to_page(objp),
2368 cachep->objsize / PAGE_SIZE, 0);
2318 } else { 2369 } else {
2319 poison_obj(cachep, objp, POISON_FREE); 2370 poison_obj(cachep, objp, POISON_FREE);
2320 } 2371 }
@@ -2329,7 +2380,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2329{ 2380{
2330 kmem_bufctl_t i; 2381 kmem_bufctl_t i;
2331 int entries = 0; 2382 int entries = 0;
2332 2383
2333 /* Check slab's freelist to see if this obj is there. */ 2384 /* Check slab's freelist to see if this obj is there. */
2334 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2385 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2335 entries++; 2386 entries++;
@@ -2337,13 +2388,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2337 goto bad; 2388 goto bad;
2338 } 2389 }
2339 if (entries != cachep->num - slabp->inuse) { 2390 if (entries != cachep->num - slabp->inuse) {
2340bad: 2391 bad:
2341 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2392 printk(KERN_ERR
2342 cachep->name, cachep->num, slabp, slabp->inuse); 2393 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2343 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 2394 cachep->name, cachep->num, slabp, slabp->inuse);
2344 if ((i%16)==0) 2395 for (i = 0;
2396 i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
2397 i++) {
2398 if ((i % 16) == 0)
2345 printk("\n%03x:", i); 2399 printk("\n%03x:", i);
2346 printk(" %02x", ((unsigned char*)slabp)[i]); 2400 printk(" %02x", ((unsigned char *)slabp)[i]);
2347 } 2401 }
2348 printk("\n"); 2402 printk("\n");
2349 BUG(); 2403 BUG();
@@ -2363,7 +2417,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2363 2417
2364 check_irq_off(); 2418 check_irq_off();
2365 ac = ac_data(cachep); 2419 ac = ac_data(cachep);
2366retry: 2420 retry:
2367 batchcount = ac->batchcount; 2421 batchcount = ac->batchcount;
2368 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2422 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2369 /* if there was little recent activity on this 2423 /* if there was little recent activity on this
@@ -2385,8 +2439,8 @@ retry:
2385 shared_array->avail -= batchcount; 2439 shared_array->avail -= batchcount;
2386 ac->avail = batchcount; 2440 ac->avail = batchcount;
2387 memcpy(ac->entry, 2441 memcpy(ac->entry,
2388 &(shared_array->entry[shared_array->avail]), 2442 &(shared_array->entry[shared_array->avail]),
2389 sizeof(void*)*batchcount); 2443 sizeof(void *) * batchcount);
2390 shared_array->touched = 1; 2444 shared_array->touched = 1;
2391 goto alloc_done; 2445 goto alloc_done;
2392 } 2446 }
@@ -2414,7 +2468,7 @@ retry:
2414 2468
2415 /* get obj pointer */ 2469 /* get obj pointer */
2416 ac->entry[ac->avail++] = slabp->s_mem + 2470 ac->entry[ac->avail++] = slabp->s_mem +
2417 slabp->free*cachep->objsize; 2471 slabp->free * cachep->objsize;
2418 2472
2419 slabp->inuse++; 2473 slabp->inuse++;
2420 next = slab_bufctl(slabp)[slabp->free]; 2474 next = slab_bufctl(slabp)[slabp->free];
@@ -2422,7 +2476,7 @@ retry:
2422 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2476 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2423 WARN_ON(numa_node_id() != slabp->nodeid); 2477 WARN_ON(numa_node_id() != slabp->nodeid);
2424#endif 2478#endif
2425 slabp->free = next; 2479 slabp->free = next;
2426 } 2480 }
2427 check_slabp(cachep, slabp); 2481 check_slabp(cachep, slabp);
2428 2482
@@ -2434,9 +2488,9 @@ retry:
2434 list_add(&slabp->list, &l3->slabs_partial); 2488 list_add(&slabp->list, &l3->slabs_partial);
2435 } 2489 }
2436 2490
2437must_grow: 2491 must_grow:
2438 l3->free_objects -= ac->avail; 2492 l3->free_objects -= ac->avail;
2439alloc_done: 2493 alloc_done:
2440 spin_unlock(&l3->list_lock); 2494 spin_unlock(&l3->list_lock);
2441 2495
2442 if (unlikely(!ac->avail)) { 2496 if (unlikely(!ac->avail)) {
@@ -2448,7 +2502,7 @@ alloc_done:
2448 if (!x && ac->avail == 0) // no objects in sight? abort 2502 if (!x && ac->avail == 0) // no objects in sight? abort
2449 return NULL; 2503 return NULL;
2450 2504
2451 if (!ac->avail) // objects refilled by interrupt? 2505 if (!ac->avail) // objects refilled by interrupt?
2452 goto retry; 2506 goto retry;
2453 } 2507 }
2454 ac->touched = 1; 2508 ac->touched = 1;
@@ -2465,16 +2519,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2465} 2519}
2466 2520
2467#if DEBUG 2521#if DEBUG
2468static void * 2522static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2469cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2523 void *objp, void *caller)
2470 gfp_t flags, void *objp, void *caller)
2471{ 2524{
2472 if (!objp) 2525 if (!objp)
2473 return objp; 2526 return objp;
2474 if (cachep->flags & SLAB_POISON) { 2527 if (cachep->flags & SLAB_POISON) {
2475#ifdef CONFIG_DEBUG_PAGEALLOC 2528#ifdef CONFIG_DEBUG_PAGEALLOC
2476 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2529 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2477 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2530 kernel_map_pages(virt_to_page(objp),
2531 cachep->objsize / PAGE_SIZE, 1);
2478 else 2532 else
2479 check_poison_obj(cachep, objp); 2533 check_poison_obj(cachep, objp);
2480#else 2534#else
@@ -2486,24 +2540,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2486 *dbg_userword(cachep, objp) = caller; 2540 *dbg_userword(cachep, objp) = caller;
2487 2541
2488 if (cachep->flags & SLAB_RED_ZONE) { 2542 if (cachep->flags & SLAB_RED_ZONE) {
2489 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2543 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
2490 slab_error(cachep, "double free, or memory outside" 2544 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2491 " object was overwritten"); 2545 slab_error(cachep,
2492 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2546 "double free, or memory outside"
2493 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2547 " object was overwritten");
2548 printk(KERN_ERR
2549 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2550 objp, *dbg_redzone1(cachep, objp),
2551 *dbg_redzone2(cachep, objp));
2494 } 2552 }
2495 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2553 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2496 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2554 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2497 } 2555 }
2498 objp += obj_dbghead(cachep); 2556 objp += obj_dbghead(cachep);
2499 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2557 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2500 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2558 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2501 2559
2502 if (!(flags & __GFP_WAIT)) 2560 if (!(flags & __GFP_WAIT))
2503 ctor_flags |= SLAB_CTOR_ATOMIC; 2561 ctor_flags |= SLAB_CTOR_ATOMIC;
2504 2562
2505 cachep->ctor(objp, cachep, ctor_flags); 2563 cachep->ctor(objp, cachep, ctor_flags);
2506 } 2564 }
2507 return objp; 2565 return objp;
2508} 2566}
2509#else 2567#else
@@ -2512,9 +2570,18 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2512 2570
2513static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2571static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2514{ 2572{
2515 void* objp; 2573 void *objp;
2516 struct array_cache *ac; 2574 struct array_cache *ac;
2517 2575
2576#ifdef CONFIG_NUMA
2577 if (unlikely(current->mempolicy && !in_interrupt())) {
2578 int nid = slab_node(current->mempolicy);
2579
2580 if (nid != numa_node_id())
2581 return __cache_alloc_node(cachep, flags, nid);
2582 }
2583#endif
2584
2518 check_irq_off(); 2585 check_irq_off();
2519 ac = ac_data(cachep); 2586 ac = ac_data(cachep);
2520 if (likely(ac->avail)) { 2587 if (likely(ac->avail)) {
@@ -2531,7 +2598,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2531static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2598static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2532{ 2599{
2533 unsigned long save_flags; 2600 unsigned long save_flags;
2534 void* objp; 2601 void *objp;
2535 2602
2536 cache_alloc_debugcheck_before(cachep, flags); 2603 cache_alloc_debugcheck_before(cachep, flags);
2537 2604
@@ -2539,7 +2606,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2539 objp = ____cache_alloc(cachep, flags); 2606 objp = ____cache_alloc(cachep, flags);
2540 local_irq_restore(save_flags); 2607 local_irq_restore(save_flags);
2541 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2608 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2542 __builtin_return_address(0)); 2609 __builtin_return_address(0));
2543 prefetchw(objp); 2610 prefetchw(objp);
2544 return objp; 2611 return objp;
2545} 2612}
@@ -2551,74 +2618,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2551static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2618static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2552{ 2619{
2553 struct list_head *entry; 2620 struct list_head *entry;
2554 struct slab *slabp; 2621 struct slab *slabp;
2555 struct kmem_list3 *l3; 2622 struct kmem_list3 *l3;
2556 void *obj; 2623 void *obj;
2557 kmem_bufctl_t next; 2624 kmem_bufctl_t next;
2558 int x; 2625 int x;
2559 2626
2560 l3 = cachep->nodelists[nodeid]; 2627 l3 = cachep->nodelists[nodeid];
2561 BUG_ON(!l3); 2628 BUG_ON(!l3);
2562 2629
2563retry: 2630 retry:
2564 spin_lock(&l3->list_lock); 2631 spin_lock(&l3->list_lock);
2565 entry = l3->slabs_partial.next; 2632 entry = l3->slabs_partial.next;
2566 if (entry == &l3->slabs_partial) { 2633 if (entry == &l3->slabs_partial) {
2567 l3->free_touched = 1; 2634 l3->free_touched = 1;
2568 entry = l3->slabs_free.next; 2635 entry = l3->slabs_free.next;
2569 if (entry == &l3->slabs_free) 2636 if (entry == &l3->slabs_free)
2570 goto must_grow; 2637 goto must_grow;
2571 } 2638 }
2572 2639
2573 slabp = list_entry(entry, struct slab, list); 2640 slabp = list_entry(entry, struct slab, list);
2574 check_spinlock_acquired_node(cachep, nodeid); 2641 check_spinlock_acquired_node(cachep, nodeid);
2575 check_slabp(cachep, slabp); 2642 check_slabp(cachep, slabp);
2576 2643
2577 STATS_INC_NODEALLOCS(cachep); 2644 STATS_INC_NODEALLOCS(cachep);
2578 STATS_INC_ACTIVE(cachep); 2645 STATS_INC_ACTIVE(cachep);
2579 STATS_SET_HIGH(cachep); 2646 STATS_SET_HIGH(cachep);
2580 2647
2581 BUG_ON(slabp->inuse == cachep->num); 2648 BUG_ON(slabp->inuse == cachep->num);
2582 2649
2583 /* get obj pointer */ 2650 /* get obj pointer */
2584 obj = slabp->s_mem + slabp->free*cachep->objsize; 2651 obj = slabp->s_mem + slabp->free * cachep->objsize;
2585 slabp->inuse++; 2652 slabp->inuse++;
2586 next = slab_bufctl(slabp)[slabp->free]; 2653 next = slab_bufctl(slabp)[slabp->free];
2587#if DEBUG 2654#if DEBUG
2588 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2655 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2589#endif 2656#endif
2590 slabp->free = next; 2657 slabp->free = next;
2591 check_slabp(cachep, slabp); 2658 check_slabp(cachep, slabp);
2592 l3->free_objects--; 2659 l3->free_objects--;
2593 /* move slabp to correct slabp list: */ 2660 /* move slabp to correct slabp list: */
2594 list_del(&slabp->list); 2661 list_del(&slabp->list);
2595 2662
2596 if (slabp->free == BUFCTL_END) { 2663 if (slabp->free == BUFCTL_END) {
2597 list_add(&slabp->list, &l3->slabs_full); 2664 list_add(&slabp->list, &l3->slabs_full);
2598 } else { 2665 } else {
2599 list_add(&slabp->list, &l3->slabs_partial); 2666 list_add(&slabp->list, &l3->slabs_partial);
2600 } 2667 }
2601 2668
2602 spin_unlock(&l3->list_lock); 2669 spin_unlock(&l3->list_lock);
2603 goto done; 2670 goto done;
2604 2671
2605must_grow: 2672 must_grow:
2606 spin_unlock(&l3->list_lock); 2673 spin_unlock(&l3->list_lock);
2607 x = cache_grow(cachep, flags, nodeid); 2674 x = cache_grow(cachep, flags, nodeid);
2608 2675
2609 if (!x) 2676 if (!x)
2610 return NULL; 2677 return NULL;
2611 2678
2612 goto retry; 2679 goto retry;
2613done: 2680 done:
2614 return obj; 2681 return obj;
2615} 2682}
2616#endif 2683#endif
2617 2684
2618/* 2685/*
2619 * Caller needs to acquire correct kmem_list's list_lock 2686 * Caller needs to acquire correct kmem_list's list_lock
2620 */ 2687 */
2621static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) 2688static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2689 int node)
2622{ 2690{
2623 int i; 2691 int i;
2624 struct kmem_list3 *l3; 2692 struct kmem_list3 *l3;
@@ -2628,7 +2696,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2628 struct slab *slabp; 2696 struct slab *slabp;
2629 unsigned int objnr; 2697 unsigned int objnr;
2630 2698
2631 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2699 slabp = page_get_slab(virt_to_page(objp));
2632 l3 = cachep->nodelists[node]; 2700 l3 = cachep->nodelists[node];
2633 list_del(&slabp->list); 2701 list_del(&slabp->list);
2634 objnr = (objp - slabp->s_mem) / cachep->objsize; 2702 objnr = (objp - slabp->s_mem) / cachep->objsize;
@@ -2641,7 +2709,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2641 2709
2642 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2710 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2643 printk(KERN_ERR "slab: double free detected in cache " 2711 printk(KERN_ERR "slab: double free detected in cache "
2644 "'%s', objp %p\n", cachep->name, objp); 2712 "'%s', objp %p\n", cachep->name, objp);
2645 BUG(); 2713 BUG();
2646 } 2714 }
2647#endif 2715#endif
@@ -2685,20 +2753,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2685 spin_lock(&l3->list_lock); 2753 spin_lock(&l3->list_lock);
2686 if (l3->shared) { 2754 if (l3->shared) {
2687 struct array_cache *shared_array = l3->shared; 2755 struct array_cache *shared_array = l3->shared;
2688 int max = shared_array->limit-shared_array->avail; 2756 int max = shared_array->limit - shared_array->avail;
2689 if (max) { 2757 if (max) {
2690 if (batchcount > max) 2758 if (batchcount > max)
2691 batchcount = max; 2759 batchcount = max;
2692 memcpy(&(shared_array->entry[shared_array->avail]), 2760 memcpy(&(shared_array->entry[shared_array->avail]),
2693 ac->entry, 2761 ac->entry, sizeof(void *) * batchcount);
2694 sizeof(void*)*batchcount);
2695 shared_array->avail += batchcount; 2762 shared_array->avail += batchcount;
2696 goto free_done; 2763 goto free_done;
2697 } 2764 }
2698 } 2765 }
2699 2766
2700 free_block(cachep, ac->entry, batchcount, node); 2767 free_block(cachep, ac->entry, batchcount, node);
2701free_done: 2768 free_done:
2702#if STATS 2769#if STATS
2703 { 2770 {
2704 int i = 0; 2771 int i = 0;
@@ -2720,10 +2787,9 @@ free_done:
2720 spin_unlock(&l3->list_lock); 2787 spin_unlock(&l3->list_lock);
2721 ac->avail -= batchcount; 2788 ac->avail -= batchcount;
2722 memmove(ac->entry, &(ac->entry[batchcount]), 2789 memmove(ac->entry, &(ac->entry[batchcount]),
2723 sizeof(void*)*ac->avail); 2790 sizeof(void *) * ac->avail);
2724} 2791}
2725 2792
2726
2727/* 2793/*
2728 * __cache_free 2794 * __cache_free
2729 * Release an obj back to its cache. If the obj has a constructed 2795 * Release an obj back to its cache. If the obj has a constructed
@@ -2744,11 +2810,12 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2744#ifdef CONFIG_NUMA 2810#ifdef CONFIG_NUMA
2745 { 2811 {
2746 struct slab *slabp; 2812 struct slab *slabp;
2747 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2813 slabp = page_get_slab(virt_to_page(objp));
2748 if (unlikely(slabp->nodeid != numa_node_id())) { 2814 if (unlikely(slabp->nodeid != numa_node_id())) {
2749 struct array_cache *alien = NULL; 2815 struct array_cache *alien = NULL;
2750 int nodeid = slabp->nodeid; 2816 int nodeid = slabp->nodeid;
2751 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; 2817 struct kmem_list3 *l3 =
2818 cachep->nodelists[numa_node_id()];
2752 2819
2753 STATS_INC_NODEFREES(cachep); 2820 STATS_INC_NODEFREES(cachep);
2754 if (l3->alien && l3->alien[nodeid]) { 2821 if (l3->alien && l3->alien[nodeid]) {
@@ -2756,15 +2823,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2756 spin_lock(&alien->lock); 2823 spin_lock(&alien->lock);
2757 if (unlikely(alien->avail == alien->limit)) 2824 if (unlikely(alien->avail == alien->limit))
2758 __drain_alien_cache(cachep, 2825 __drain_alien_cache(cachep,
2759 alien, nodeid); 2826 alien, nodeid);
2760 alien->entry[alien->avail++] = objp; 2827 alien->entry[alien->avail++] = objp;
2761 spin_unlock(&alien->lock); 2828 spin_unlock(&alien->lock);
2762 } else { 2829 } else {
2763 spin_lock(&(cachep->nodelists[nodeid])-> 2830 spin_lock(&(cachep->nodelists[nodeid])->
2764 list_lock); 2831 list_lock);
2765 free_block(cachep, &objp, 1, nodeid); 2832 free_block(cachep, &objp, 1, nodeid);
2766 spin_unlock(&(cachep->nodelists[nodeid])-> 2833 spin_unlock(&(cachep->nodelists[nodeid])->
2767 list_lock); 2834 list_lock);
2768 } 2835 }
2769 return; 2836 return;
2770 } 2837 }
@@ -2811,9 +2878,9 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2811 */ 2878 */
2812int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2879int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2813{ 2880{
2814 unsigned long addr = (unsigned long) ptr; 2881 unsigned long addr = (unsigned long)ptr;
2815 unsigned long min_addr = PAGE_OFFSET; 2882 unsigned long min_addr = PAGE_OFFSET;
2816 unsigned long align_mask = BYTES_PER_WORD-1; 2883 unsigned long align_mask = BYTES_PER_WORD - 1;
2817 unsigned long size = cachep->objsize; 2884 unsigned long size = cachep->objsize;
2818 struct page *page; 2885 struct page *page;
2819 2886
@@ -2830,10 +2897,10 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2830 page = virt_to_page(ptr); 2897 page = virt_to_page(ptr);
2831 if (unlikely(!PageSlab(page))) 2898 if (unlikely(!PageSlab(page)))
2832 goto out; 2899 goto out;
2833 if (unlikely(GET_PAGE_CACHE(page) != cachep)) 2900 if (unlikely(page_get_cache(page) != cachep))
2834 goto out; 2901 goto out;
2835 return 1; 2902 return 1;
2836out: 2903 out:
2837 return 0; 2904 return 0;
2838} 2905}
2839 2906
@@ -2860,8 +2927,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2860 2927
2861 if (unlikely(!cachep->nodelists[nodeid])) { 2928 if (unlikely(!cachep->nodelists[nodeid])) {
2862 /* Fall back to __cache_alloc if we run into trouble */ 2929 /* Fall back to __cache_alloc if we run into trouble */
2863 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); 2930 printk(KERN_WARNING
2864 return __cache_alloc(cachep,flags); 2931 "slab: not allocating in inactive node %d for cache %s\n",
2932 nodeid, cachep->name);
2933 return __cache_alloc(cachep, flags);
2865 } 2934 }
2866 2935
2867 cache_alloc_debugcheck_before(cachep, flags); 2936 cache_alloc_debugcheck_before(cachep, flags);
@@ -2871,7 +2940,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2871 else 2940 else
2872 ptr = __cache_alloc_node(cachep, flags, nodeid); 2941 ptr = __cache_alloc_node(cachep, flags, nodeid);
2873 local_irq_restore(save_flags); 2942 local_irq_restore(save_flags);
2874 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); 2943 ptr =
2944 cache_alloc_debugcheck_after(cachep, flags, ptr,
2945 __builtin_return_address(0));
2875 2946
2876 return ptr; 2947 return ptr;
2877} 2948}
@@ -2933,12 +3004,11 @@ EXPORT_SYMBOL(__kmalloc);
2933 * Objects should be dereferenced using the per_cpu_ptr macro only. 3004 * Objects should be dereferenced using the per_cpu_ptr macro only.
2934 * 3005 *
2935 * @size: how many bytes of memory are required. 3006 * @size: how many bytes of memory are required.
2936 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
2937 */ 3007 */
2938void *__alloc_percpu(size_t size, size_t align) 3008void *__alloc_percpu(size_t size)
2939{ 3009{
2940 int i; 3010 int i;
2941 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 3011 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
2942 3012
2943 if (!pdata) 3013 if (!pdata)
2944 return NULL; 3014 return NULL;
@@ -2962,9 +3032,9 @@ void *__alloc_percpu(size_t size, size_t align)
2962 } 3032 }
2963 3033
2964 /* Catch derefs w/o wrappers */ 3034 /* Catch derefs w/o wrappers */
2965 return (void *) (~(unsigned long) pdata); 3035 return (void *)(~(unsigned long)pdata);
2966 3036
2967unwind_oom: 3037 unwind_oom:
2968 while (--i >= 0) { 3038 while (--i >= 0) {
2969 if (!cpu_possible(i)) 3039 if (!cpu_possible(i))
2970 continue; 3040 continue;
@@ -2995,20 +3065,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
2995EXPORT_SYMBOL(kmem_cache_free); 3065EXPORT_SYMBOL(kmem_cache_free);
2996 3066
2997/** 3067/**
2998 * kzalloc - allocate memory. The memory is set to zero.
2999 * @size: how many bytes of memory are required.
3000 * @flags: the type of memory to allocate.
3001 */
3002void *kzalloc(size_t size, gfp_t flags)
3003{
3004 void *ret = kmalloc(size, flags);
3005 if (ret)
3006 memset(ret, 0, size);
3007 return ret;
3008}
3009EXPORT_SYMBOL(kzalloc);
3010
3011/**
3012 * kfree - free previously allocated memory 3068 * kfree - free previously allocated memory
3013 * @objp: pointer returned by kmalloc. 3069 * @objp: pointer returned by kmalloc.
3014 * 3070 *
@@ -3026,8 +3082,9 @@ void kfree(const void *objp)
3026 return; 3082 return;
3027 local_irq_save(flags); 3083 local_irq_save(flags);
3028 kfree_debugcheck(objp); 3084 kfree_debugcheck(objp);
3029 c = GET_PAGE_CACHE(virt_to_page(objp)); 3085 c = page_get_cache(virt_to_page(objp));
3030 __cache_free(c, (void*)objp); 3086 mutex_debug_check_no_locks_freed(objp, obj_reallen(c));
3087 __cache_free(c, (void *)objp);
3031 local_irq_restore(flags); 3088 local_irq_restore(flags);
3032} 3089}
3033EXPORT_SYMBOL(kfree); 3090EXPORT_SYMBOL(kfree);
@@ -3040,17 +3097,16 @@ EXPORT_SYMBOL(kfree);
3040 * Don't free memory not originally allocated by alloc_percpu() 3097 * Don't free memory not originally allocated by alloc_percpu()
3041 * The complemented objp is to check for that. 3098 * The complemented objp is to check for that.
3042 */ 3099 */
3043void 3100void free_percpu(const void *objp)
3044free_percpu(const void *objp)
3045{ 3101{
3046 int i; 3102 int i;
3047 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3103 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3048 3104
3049 /* 3105 /*
3050 * We allocate for all cpus so we cannot use for online cpu here. 3106 * We allocate for all cpus so we cannot use for online cpu here.
3051 */ 3107 */
3052 for_each_cpu(i) 3108 for_each_cpu(i)
3053 kfree(p->ptrs[i]); 3109 kfree(p->ptrs[i]);
3054 kfree(p); 3110 kfree(p);
3055} 3111}
3056EXPORT_SYMBOL(free_percpu); 3112EXPORT_SYMBOL(free_percpu);
@@ -3084,44 +3140,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
3084 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3140 if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3085 goto fail; 3141 goto fail;
3086#endif 3142#endif
3087 if (!(new = alloc_arraycache(node, (cachep->shared* 3143 if (!(new = alloc_arraycache(node, (cachep->shared *
3088 cachep->batchcount), 0xbaadf00d))) 3144 cachep->batchcount),
3145 0xbaadf00d)))
3089 goto fail; 3146 goto fail;
3090 if ((l3 = cachep->nodelists[node])) { 3147 if ((l3 = cachep->nodelists[node])) {
3091 3148
3092 spin_lock_irq(&l3->list_lock); 3149 spin_lock_irq(&l3->list_lock);
3093 3150
3094 if ((nc = cachep->nodelists[node]->shared)) 3151 if ((nc = cachep->nodelists[node]->shared))
3095 free_block(cachep, nc->entry, 3152 free_block(cachep, nc->entry, nc->avail, node);
3096 nc->avail, node);
3097 3153
3098 l3->shared = new; 3154 l3->shared = new;
3099 if (!cachep->nodelists[node]->alien) { 3155 if (!cachep->nodelists[node]->alien) {
3100 l3->alien = new_alien; 3156 l3->alien = new_alien;
3101 new_alien = NULL; 3157 new_alien = NULL;
3102 } 3158 }
3103 l3->free_limit = (1 + nr_cpus_node(node))* 3159 l3->free_limit = (1 + nr_cpus_node(node)) *
3104 cachep->batchcount + cachep->num; 3160 cachep->batchcount + cachep->num;
3105 spin_unlock_irq(&l3->list_lock); 3161 spin_unlock_irq(&l3->list_lock);
3106 kfree(nc); 3162 kfree(nc);
3107 free_alien_cache(new_alien); 3163 free_alien_cache(new_alien);
3108 continue; 3164 continue;
3109 } 3165 }
3110 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3166 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3111 GFP_KERNEL, node))) 3167 GFP_KERNEL, node)))
3112 goto fail; 3168 goto fail;
3113 3169
3114 kmem_list3_init(l3); 3170 kmem_list3_init(l3);
3115 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3171 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3116 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 3172 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3117 l3->shared = new; 3173 l3->shared = new;
3118 l3->alien = new_alien; 3174 l3->alien = new_alien;
3119 l3->free_limit = (1 + nr_cpus_node(node))* 3175 l3->free_limit = (1 + nr_cpus_node(node)) *
3120 cachep->batchcount + cachep->num; 3176 cachep->batchcount + cachep->num;
3121 cachep->nodelists[node] = l3; 3177 cachep->nodelists[node] = l3;
3122 } 3178 }
3123 return err; 3179 return err;
3124fail: 3180 fail:
3125 err = -ENOMEM; 3181 err = -ENOMEM;
3126 return err; 3182 return err;
3127} 3183}
@@ -3143,18 +3199,19 @@ static void do_ccupdate_local(void *info)
3143 new->new[smp_processor_id()] = old; 3199 new->new[smp_processor_id()] = old;
3144} 3200}
3145 3201
3146
3147static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3202static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3148 int shared) 3203 int shared)
3149{ 3204{
3150 struct ccupdate_struct new; 3205 struct ccupdate_struct new;
3151 int i, err; 3206 int i, err;
3152 3207
3153 memset(&new.new,0,sizeof(new.new)); 3208 memset(&new.new, 0, sizeof(new.new));
3154 for_each_online_cpu(i) { 3209 for_each_online_cpu(i) {
3155 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); 3210 new.new[i] =
3211 alloc_arraycache(cpu_to_node(i), limit, batchcount);
3156 if (!new.new[i]) { 3212 if (!new.new[i]) {
3157 for (i--; i >= 0; i--) kfree(new.new[i]); 3213 for (i--; i >= 0; i--)
3214 kfree(new.new[i]);
3158 return -ENOMEM; 3215 return -ENOMEM;
3159 } 3216 }
3160 } 3217 }
@@ -3182,13 +3239,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3182 err = alloc_kmemlist(cachep); 3239 err = alloc_kmemlist(cachep);
3183 if (err) { 3240 if (err) {
3184 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3241 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3185 cachep->name, -err); 3242 cachep->name, -err);
3186 BUG(); 3243 BUG();
3187 } 3244 }
3188 return 0; 3245 return 0;
3189} 3246}
3190 3247
3191
3192static void enable_cpucache(kmem_cache_t *cachep) 3248static void enable_cpucache(kmem_cache_t *cachep)
3193{ 3249{
3194 int err; 3250 int err;
@@ -3235,14 +3291,14 @@ static void enable_cpucache(kmem_cache_t *cachep)
3235 if (limit > 32) 3291 if (limit > 32)
3236 limit = 32; 3292 limit = 32;
3237#endif 3293#endif
3238 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 3294 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3239 if (err) 3295 if (err)
3240 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3296 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3241 cachep->name, -err); 3297 cachep->name, -err);
3242} 3298}
3243 3299
3244static void drain_array_locked(kmem_cache_t *cachep, 3300static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
3245 struct array_cache *ac, int force, int node) 3301 int force, int node)
3246{ 3302{
3247 int tofree; 3303 int tofree;
3248 3304
@@ -3250,14 +3306,14 @@ static void drain_array_locked(kmem_cache_t *cachep,
3250 if (ac->touched && !force) { 3306 if (ac->touched && !force) {
3251 ac->touched = 0; 3307 ac->touched = 0;
3252 } else if (ac->avail) { 3308 } else if (ac->avail) {
3253 tofree = force ? ac->avail : (ac->limit+4)/5; 3309 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3254 if (tofree > ac->avail) { 3310 if (tofree > ac->avail) {
3255 tofree = (ac->avail+1)/2; 3311 tofree = (ac->avail + 1) / 2;
3256 } 3312 }
3257 free_block(cachep, ac->entry, tofree, node); 3313 free_block(cachep, ac->entry, tofree, node);
3258 ac->avail -= tofree; 3314 ac->avail -= tofree;
3259 memmove(ac->entry, &(ac->entry[tofree]), 3315 memmove(ac->entry, &(ac->entry[tofree]),
3260 sizeof(void*)*ac->avail); 3316 sizeof(void *) * ac->avail);
3261 } 3317 }
3262} 3318}
3263 3319
@@ -3270,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
3270 * - clear the per-cpu caches for this CPU. 3326 * - clear the per-cpu caches for this CPU.
3271 * - return freeable pages to the main free memory pool. 3327 * - return freeable pages to the main free memory pool.
3272 * 3328 *
3273 * If we cannot acquire the cache chain semaphore then just give up - we'll 3329 * If we cannot acquire the cache chain mutex then just give up - we'll
3274 * try again on the next iteration. 3330 * try again on the next iteration.
3275 */ 3331 */
3276static void cache_reap(void *unused) 3332static void cache_reap(void *unused)
@@ -3278,15 +3334,16 @@ static void cache_reap(void *unused)
3278 struct list_head *walk; 3334 struct list_head *walk;
3279 struct kmem_list3 *l3; 3335 struct kmem_list3 *l3;
3280 3336
3281 if (down_trylock(&cache_chain_sem)) { 3337 if (!mutex_trylock(&cache_chain_mutex)) {
3282 /* Give up. Setup the next iteration. */ 3338 /* Give up. Setup the next iteration. */
3283 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3339 schedule_delayed_work(&__get_cpu_var(reap_work),
3340 REAPTIMEOUT_CPUC);
3284 return; 3341 return;
3285 } 3342 }
3286 3343
3287 list_for_each(walk, &cache_chain) { 3344 list_for_each(walk, &cache_chain) {
3288 kmem_cache_t *searchp; 3345 kmem_cache_t *searchp;
3289 struct list_head* p; 3346 struct list_head *p;
3290 int tofree; 3347 int tofree;
3291 struct slab *slabp; 3348 struct slab *slabp;
3292 3349
@@ -3303,7 +3360,7 @@ static void cache_reap(void *unused)
3303 spin_lock_irq(&l3->list_lock); 3360 spin_lock_irq(&l3->list_lock);
3304 3361
3305 drain_array_locked(searchp, ac_data(searchp), 0, 3362 drain_array_locked(searchp, ac_data(searchp), 0,
3306 numa_node_id()); 3363 numa_node_id());
3307 3364
3308 if (time_after(l3->next_reap, jiffies)) 3365 if (time_after(l3->next_reap, jiffies))
3309 goto next_unlock; 3366 goto next_unlock;
@@ -3312,14 +3369,16 @@ static void cache_reap(void *unused)
3312 3369
3313 if (l3->shared) 3370 if (l3->shared)
3314 drain_array_locked(searchp, l3->shared, 0, 3371 drain_array_locked(searchp, l3->shared, 0,
3315 numa_node_id()); 3372 numa_node_id());
3316 3373
3317 if (l3->free_touched) { 3374 if (l3->free_touched) {
3318 l3->free_touched = 0; 3375 l3->free_touched = 0;
3319 goto next_unlock; 3376 goto next_unlock;
3320 } 3377 }
3321 3378
3322 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); 3379 tofree =
3380 (l3->free_limit + 5 * searchp->num -
3381 1) / (5 * searchp->num);
3323 do { 3382 do {
3324 p = l3->slabs_free.next; 3383 p = l3->slabs_free.next;
3325 if (p == &(l3->slabs_free)) 3384 if (p == &(l3->slabs_free))
@@ -3339,14 +3398,14 @@ static void cache_reap(void *unused)
3339 spin_unlock_irq(&l3->list_lock); 3398 spin_unlock_irq(&l3->list_lock);
3340 slab_destroy(searchp, slabp); 3399 slab_destroy(searchp, slabp);
3341 spin_lock_irq(&l3->list_lock); 3400 spin_lock_irq(&l3->list_lock);
3342 } while(--tofree > 0); 3401 } while (--tofree > 0);
3343next_unlock: 3402 next_unlock:
3344 spin_unlock_irq(&l3->list_lock); 3403 spin_unlock_irq(&l3->list_lock);
3345next: 3404 next:
3346 cond_resched(); 3405 cond_resched();
3347 } 3406 }
3348 check_irq_on(); 3407 check_irq_on();
3349 up(&cache_chain_sem); 3408 mutex_unlock(&cache_chain_mutex);
3350 drain_remote_pages(); 3409 drain_remote_pages();
3351 /* Setup the next iteration */ 3410 /* Setup the next iteration */
3352 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3411 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
@@ -3354,32 +3413,37 @@ next:
3354 3413
3355#ifdef CONFIG_PROC_FS 3414#ifdef CONFIG_PROC_FS
3356 3415
3357static void *s_start(struct seq_file *m, loff_t *pos) 3416static void print_slabinfo_header(struct seq_file *m)
3358{ 3417{
3359 loff_t n = *pos; 3418 /*
3360 struct list_head *p; 3419 * Output format version, so at least we can change it
3361 3420 * without _too_ many complaints.
3362 down(&cache_chain_sem); 3421 */
3363 if (!n) {
3364 /*
3365 * Output format version, so at least we can change it
3366 * without _too_ many complaints.
3367 */
3368#if STATS 3422#if STATS
3369 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3423 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3370#else 3424#else
3371 seq_puts(m, "slabinfo - version: 2.1\n"); 3425 seq_puts(m, "slabinfo - version: 2.1\n");
3372#endif 3426#endif
3373 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 3427 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3374 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3428 "<objperslab> <pagesperslab>");
3375 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3429 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3430 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3376#if STATS 3431#if STATS
3377 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3432 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3378 " <error> <maxfreeable> <nodeallocs> <remotefrees>"); 3433 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3379 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3434 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3380#endif 3435#endif
3381 seq_putc(m, '\n'); 3436 seq_putc(m, '\n');
3382 } 3437}
3438
3439static void *s_start(struct seq_file *m, loff_t *pos)
3440{
3441 loff_t n = *pos;
3442 struct list_head *p;
3443
3444 mutex_lock(&cache_chain_mutex);
3445 if (!n)
3446 print_slabinfo_header(m);
3383 p = cache_chain.next; 3447 p = cache_chain.next;
3384 while (n--) { 3448 while (n--) {
3385 p = p->next; 3449 p = p->next;
@@ -3394,23 +3458,23 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3394 kmem_cache_t *cachep = p; 3458 kmem_cache_t *cachep = p;
3395 ++*pos; 3459 ++*pos;
3396 return cachep->next.next == &cache_chain ? NULL 3460 return cachep->next.next == &cache_chain ? NULL
3397 : list_entry(cachep->next.next, kmem_cache_t, next); 3461 : list_entry(cachep->next.next, kmem_cache_t, next);
3398} 3462}
3399 3463
3400static void s_stop(struct seq_file *m, void *p) 3464static void s_stop(struct seq_file *m, void *p)
3401{ 3465{
3402 up(&cache_chain_sem); 3466 mutex_unlock(&cache_chain_mutex);
3403} 3467}
3404 3468
3405static int s_show(struct seq_file *m, void *p) 3469static int s_show(struct seq_file *m, void *p)
3406{ 3470{
3407 kmem_cache_t *cachep = p; 3471 kmem_cache_t *cachep = p;
3408 struct list_head *q; 3472 struct list_head *q;
3409 struct slab *slabp; 3473 struct slab *slabp;
3410 unsigned long active_objs; 3474 unsigned long active_objs;
3411 unsigned long num_objs; 3475 unsigned long num_objs;
3412 unsigned long active_slabs = 0; 3476 unsigned long active_slabs = 0;
3413 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3477 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3414 const char *name; 3478 const char *name;
3415 char *error = NULL; 3479 char *error = NULL;
3416 int node; 3480 int node;
@@ -3427,14 +3491,14 @@ static int s_show(struct seq_file *m, void *p)
3427 3491
3428 spin_lock(&l3->list_lock); 3492 spin_lock(&l3->list_lock);
3429 3493
3430 list_for_each(q,&l3->slabs_full) { 3494 list_for_each(q, &l3->slabs_full) {
3431 slabp = list_entry(q, struct slab, list); 3495 slabp = list_entry(q, struct slab, list);
3432 if (slabp->inuse != cachep->num && !error) 3496 if (slabp->inuse != cachep->num && !error)
3433 error = "slabs_full accounting error"; 3497 error = "slabs_full accounting error";
3434 active_objs += cachep->num; 3498 active_objs += cachep->num;
3435 active_slabs++; 3499 active_slabs++;
3436 } 3500 }
3437 list_for_each(q,&l3->slabs_partial) { 3501 list_for_each(q, &l3->slabs_partial) {
3438 slabp = list_entry(q, struct slab, list); 3502 slabp = list_entry(q, struct slab, list);
3439 if (slabp->inuse == cachep->num && !error) 3503 if (slabp->inuse == cachep->num && !error)
3440 error = "slabs_partial inuse accounting error"; 3504 error = "slabs_partial inuse accounting error";
@@ -3443,7 +3507,7 @@ static int s_show(struct seq_file *m, void *p)
3443 active_objs += slabp->inuse; 3507 active_objs += slabp->inuse;
3444 active_slabs++; 3508 active_slabs++;
3445 } 3509 }
3446 list_for_each(q,&l3->slabs_free) { 3510 list_for_each(q, &l3->slabs_free) {
3447 slabp = list_entry(q, struct slab, list); 3511 slabp = list_entry(q, struct slab, list);
3448 if (slabp->inuse && !error) 3512 if (slabp->inuse && !error)
3449 error = "slabs_free/inuse accounting error"; 3513 error = "slabs_free/inuse accounting error";
@@ -3454,25 +3518,24 @@ static int s_show(struct seq_file *m, void *p)
3454 3518
3455 spin_unlock(&l3->list_lock); 3519 spin_unlock(&l3->list_lock);
3456 } 3520 }
3457 num_slabs+=active_slabs; 3521 num_slabs += active_slabs;
3458 num_objs = num_slabs*cachep->num; 3522 num_objs = num_slabs * cachep->num;
3459 if (num_objs - active_objs != free_objects && !error) 3523 if (num_objs - active_objs != free_objects && !error)
3460 error = "free_objects accounting error"; 3524 error = "free_objects accounting error";
3461 3525
3462 name = cachep->name; 3526 name = cachep->name;
3463 if (error) 3527 if (error)
3464 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3528 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3465 3529
3466 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3530 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3467 name, active_objs, num_objs, cachep->objsize, 3531 name, active_objs, num_objs, cachep->objsize,
3468 cachep->num, (1<<cachep->gfporder)); 3532 cachep->num, (1 << cachep->gfporder));
3469 seq_printf(m, " : tunables %4u %4u %4u", 3533 seq_printf(m, " : tunables %4u %4u %4u",
3470 cachep->limit, cachep->batchcount, 3534 cachep->limit, cachep->batchcount, cachep->shared);
3471 cachep->shared);
3472 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3535 seq_printf(m, " : slabdata %6lu %6lu %6lu",
3473 active_slabs, num_slabs, shared_avail); 3536 active_slabs, num_slabs, shared_avail);
3474#if STATS 3537#if STATS
3475 { /* list3 stats */ 3538 { /* list3 stats */
3476 unsigned long high = cachep->high_mark; 3539 unsigned long high = cachep->high_mark;
3477 unsigned long allocs = cachep->num_allocations; 3540 unsigned long allocs = cachep->num_allocations;
3478 unsigned long grown = cachep->grown; 3541 unsigned long grown = cachep->grown;
@@ -3483,9 +3546,7 @@ static int s_show(struct seq_file *m, void *p)
3483 unsigned long node_frees = cachep->node_frees; 3546 unsigned long node_frees = cachep->node_frees;
3484 3547
3485 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3548 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3486 %4lu %4lu %4lu %4lu", 3549 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
3487 allocs, high, grown, reaped, errors,
3488 max_freeable, node_allocs, node_frees);
3489 } 3550 }
3490 /* cpu stats */ 3551 /* cpu stats */
3491 { 3552 {
@@ -3495,7 +3556,7 @@ static int s_show(struct seq_file *m, void *p)
3495 unsigned long freemiss = atomic_read(&cachep->freemiss); 3556 unsigned long freemiss = atomic_read(&cachep->freemiss);
3496 3557
3497 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3558 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3498 allochit, allocmiss, freehit, freemiss); 3559 allochit, allocmiss, freehit, freemiss);
3499 } 3560 }
3500#endif 3561#endif
3501 seq_putc(m, '\n'); 3562 seq_putc(m, '\n');
@@ -3518,10 +3579,10 @@ static int s_show(struct seq_file *m, void *p)
3518 */ 3579 */
3519 3580
3520struct seq_operations slabinfo_op = { 3581struct seq_operations slabinfo_op = {
3521 .start = s_start, 3582 .start = s_start,
3522 .next = s_next, 3583 .next = s_next,
3523 .stop = s_stop, 3584 .stop = s_stop,
3524 .show = s_show, 3585 .show = s_show,
3525}; 3586};
3526 3587
3527#define MAX_SLABINFO_WRITE 128 3588#define MAX_SLABINFO_WRITE 128
@@ -3532,18 +3593,18 @@ struct seq_operations slabinfo_op = {
3532 * @count: data length 3593 * @count: data length
3533 * @ppos: unused 3594 * @ppos: unused
3534 */ 3595 */
3535ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3596ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3536 size_t count, loff_t *ppos) 3597 size_t count, loff_t *ppos)
3537{ 3598{
3538 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 3599 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3539 int limit, batchcount, shared, res; 3600 int limit, batchcount, shared, res;
3540 struct list_head *p; 3601 struct list_head *p;
3541 3602
3542 if (count > MAX_SLABINFO_WRITE) 3603 if (count > MAX_SLABINFO_WRITE)
3543 return -EINVAL; 3604 return -EINVAL;
3544 if (copy_from_user(&kbuf, buffer, count)) 3605 if (copy_from_user(&kbuf, buffer, count))
3545 return -EFAULT; 3606 return -EFAULT;
3546 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3607 kbuf[MAX_SLABINFO_WRITE] = '\0';
3547 3608
3548 tmp = strchr(kbuf, ' '); 3609 tmp = strchr(kbuf, ' ');
3549 if (!tmp) 3610 if (!tmp)
@@ -3554,25 +3615,24 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3554 return -EINVAL; 3615 return -EINVAL;
3555 3616
3556 /* Find the cache in the chain of caches. */ 3617 /* Find the cache in the chain of caches. */
3557 down(&cache_chain_sem); 3618 mutex_lock(&cache_chain_mutex);
3558 res = -EINVAL; 3619 res = -EINVAL;
3559 list_for_each(p,&cache_chain) { 3620 list_for_each(p, &cache_chain) {
3560 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3621 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3561 3622
3562 if (!strcmp(cachep->name, kbuf)) { 3623 if (!strcmp(cachep->name, kbuf)) {
3563 if (limit < 1 || 3624 if (limit < 1 ||
3564 batchcount < 1 || 3625 batchcount < 1 ||
3565 batchcount > limit || 3626 batchcount > limit || shared < 0) {
3566 shared < 0) {
3567 res = 0; 3627 res = 0;
3568 } else { 3628 } else {
3569 res = do_tune_cpucache(cachep, limit, 3629 res = do_tune_cpucache(cachep, limit,
3570 batchcount, shared); 3630 batchcount, shared);
3571 } 3631 }
3572 break; 3632 break;
3573 } 3633 }
3574 } 3634 }
3575 up(&cache_chain_sem); 3635 mutex_unlock(&cache_chain_mutex);
3576 if (res >= 0) 3636 if (res >= 0)
3577 res = count; 3637 res = count;
3578 return res; 3638 return res;
@@ -3596,28 +3656,5 @@ unsigned int ksize(const void *objp)
3596 if (unlikely(objp == NULL)) 3656 if (unlikely(objp == NULL))
3597 return 0; 3657 return 0;
3598 3658
3599 return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); 3659 return obj_reallen(page_get_cache(virt_to_page(objp)));
3600}
3601
3602
3603/*
3604 * kstrdup - allocate space for and copy an existing string
3605 *
3606 * @s: the string to duplicate
3607 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3608 */
3609char *kstrdup(const char *s, gfp_t gfp)
3610{
3611 size_t len;
3612 char *buf;
3613
3614 if (!s)
3615 return NULL;
3616
3617 len = strlen(s) + 1;
3618 buf = kmalloc(len, gfp);
3619 if (buf)
3620 memcpy(buf, s, len);
3621 return buf;
3622} 3660}
3623EXPORT_SYMBOL(kstrdup);
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 0000000000..1c240c4b71
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
1/*
2 * SLOB Allocator: Simple List Of Blocks
3 *
4 * Matt Mackall <mpm@selenic.com> 12/30/03
5 *
6 * How SLOB works:
7 *
8 * The core of SLOB is a traditional K&R style heap allocator, with
9 * support for returning aligned objects. The granularity of this
10 * allocator is 8 bytes on x86, though it's perhaps possible to reduce
11 * this to 4 if it's deemed worth the effort. The slob heap is a
12 * singly-linked list of pages from __get_free_page, grown on demand
13 * and allocation from the heap is currently first-fit.
14 *
15 * Above this is an implementation of kmalloc/kfree. Blocks returned
 16 * from kmalloc are 8-byte aligned and prepended with an 8-byte header.
17 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
18 * __get_free_pages directly so that it can return page-aligned blocks
19 * and keeps a linked list of such pages and their orders. These
20 * objects are detected in kfree() by their page alignment.
21 *
22 * SLAB is emulated on top of SLOB by simply calling constructors and
23 * destructors for every SLAB allocation. Objects are returned with
24 * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
25 * set, in which case the low-level allocator will fragment blocks to
26 * create the proper alignment. Again, objects of page-size or greater
27 * are allocated by calling __get_free_pages. As SLAB objects know
28 * their size, no separate size bookkeeping is necessary and there is
29 * essentially no allocation space overhead.
30 */
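
The block comment above is the only design description SLOB gets, so a compact user-space sketch of the same first-fit free-list idea follows. It is not the kernel code: the toy_* names, the fixed compile-time arena, and the omission of locking, alignment handling and coalescing are simplifications made purely for illustration.

/*
 * Minimal user-space sketch of a first-fit, singly-linked free list,
 * in the spirit of the comment above.  Assumptions: toy_* names and
 * the fixed arena are invented here; no locking, no page growth.
 */
#include <stddef.h>
#include <stdio.h>

struct blk {				/* free-block header, like slob_t    */
	size_t units;			/* block size in header-sized units  */
	struct blk *next;		/* next free block (circular list)   */
};

#define UNIT		sizeof(struct blk)
#define UNITS(sz)	(((sz) + UNIT - 1) / UNIT)

static struct blk arena[1024];		/* the whole heap: 1024 units        */
static struct blk head = { 0, &head };	/* dummy list head, never handed out */

static void toy_init(void)
{
	arena[0].units = 1024;
	arena[0].next = &head;
	head.next = arena;
}

static void *toy_alloc(size_t size)
{
	size_t units = UNITS(size) + 1;		/* +1 unit for the header */
	struct blk *prev = &head, *cur;

	/* first fit: take the first free block that is large enough */
	for (cur = prev->next; ; prev = cur, cur = cur->next) {
		if (cur->units >= units) {
			if (cur->units == units) {
				prev->next = cur->next;	/* exact fit: unlink  */
			} else {
				cur->units -= units;	/* split: keep head,  */
				cur += cur->units;	/* hand out the tail  */
				cur->units = units;
			}
			return cur + 1;			/* skip the header    */
		}
		if (cur == &head)			/* wrapped: no room   */
			return NULL;
	}
}

static void toy_free(void *p)
{
	struct blk *b = (struct blk *)p - 1;	/* recover the header        */

	b->next = head.next;			/* push back, no coalescing  */
	head.next = b;
}

int main(void)
{
	toy_init();
	char *s = toy_alloc(100);

	printf("allocated 100 bytes at %p\n", (void *)s);
	toy_free(s);
	return 0;
}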
31
32#include <linux/config.h>
33#include <linux/slab.h>
34#include <linux/mm.h>
35#include <linux/cache.h>
36#include <linux/init.h>
37#include <linux/module.h>
38#include <linux/timer.h>
39
40struct slob_block {
41 int units;
42 struct slob_block *next;
43};
44typedef struct slob_block slob_t;
45
46#define SLOB_UNIT sizeof(slob_t)
47#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
48#define SLOB_ALIGN L1_CACHE_BYTES
49
50struct bigblock {
51 int order;
52 void *pages;
53 struct bigblock *next;
54};
55typedef struct bigblock bigblock_t;
56
57static slob_t arena = { .next = &arena, .units = 1 };
58static slob_t *slobfree = &arena;
59static bigblock_t *bigblocks;
60static DEFINE_SPINLOCK(slob_lock);
61static DEFINE_SPINLOCK(block_lock);
62
63static void slob_free(void *b, int size);
64
65static void *slob_alloc(size_t size, gfp_t gfp, int align)
66{
67 slob_t *prev, *cur, *aligned = 0;
68 int delta = 0, units = SLOB_UNITS(size);
69 unsigned long flags;
70
71 spin_lock_irqsave(&slob_lock, flags);
72 prev = slobfree;
73 for (cur = prev->next; ; prev = cur, cur = cur->next) {
74 if (align) {
75 aligned = (slob_t *)ALIGN((unsigned long)cur, align);
76 delta = aligned - cur;
77 }
78 if (cur->units >= units + delta) { /* room enough? */
79 if (delta) { /* need to fragment head to align? */
80 aligned->units = cur->units - delta;
81 aligned->next = cur->next;
82 cur->next = aligned;
83 cur->units = delta;
84 prev = cur;
85 cur = aligned;
86 }
87
88 if (cur->units == units) /* exact fit? */
89 prev->next = cur->next; /* unlink */
90 else { /* fragment */
91 prev->next = cur + units;
92 prev->next->units = cur->units - units;
93 prev->next->next = cur->next;
94 cur->units = units;
95 }
96
97 slobfree = prev;
98 spin_unlock_irqrestore(&slob_lock, flags);
99 return cur;
100 }
101 if (cur == slobfree) {
102 spin_unlock_irqrestore(&slob_lock, flags);
103
104 if (size == PAGE_SIZE) /* trying to shrink arena? */
105 return 0;
106
107 cur = (slob_t *)__get_free_page(gfp);
108 if (!cur)
109 return 0;
110
111 slob_free(cur, PAGE_SIZE);
112 spin_lock_irqsave(&slob_lock, flags);
113 cur = slobfree;
114 }
115 }
116}
117
118static void slob_free(void *block, int size)
119{
120 slob_t *cur, *b = (slob_t *)block;
121 unsigned long flags;
122
123 if (!block)
124 return;
125
126 if (size)
127 b->units = SLOB_UNITS(size);
128
129 /* Find reinsertion point */
130 spin_lock_irqsave(&slob_lock, flags);
131 for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
132 if (cur >= cur->next && (b > cur || b < cur->next))
133 break;
134
135 if (b + b->units == cur->next) {
136 b->units += cur->next->units;
137 b->next = cur->next->next;
138 } else
139 b->next = cur->next;
140
141 if (cur + cur->units == b) {
142 cur->units += b->units;
143 cur->next = b->next;
144 } else
145 cur->next = b;
146
147 slobfree = cur;
148
149 spin_unlock_irqrestore(&slob_lock, flags);
150}
151
152static int FASTCALL(find_order(int size));
153static int fastcall find_order(int size)
154{
155 int order = 0;
156 for ( ; size > 4096 ; size >>=1)
157 order++;
158 return order;
159}
160
161void *kmalloc(size_t size, gfp_t gfp)
162{
163 slob_t *m;
164 bigblock_t *bb;
165 unsigned long flags;
166
167 if (size < PAGE_SIZE - SLOB_UNIT) {
168 m = slob_alloc(size + SLOB_UNIT, gfp, 0);
169 return m ? (void *)(m + 1) : 0;
170 }
171
172 bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
173 if (!bb)
174 return 0;
175
176 bb->order = find_order(size);
177 bb->pages = (void *)__get_free_pages(gfp, bb->order);
178
179 if (bb->pages) {
180 spin_lock_irqsave(&block_lock, flags);
181 bb->next = bigblocks;
182 bigblocks = bb;
183 spin_unlock_irqrestore(&block_lock, flags);
184 return bb->pages;
185 }
186
187 slob_free(bb, sizeof(bigblock_t));
188 return 0;
189}
190
191EXPORT_SYMBOL(kmalloc);
192
193void kfree(const void *block)
194{
195 bigblock_t *bb, **last = &bigblocks;
196 unsigned long flags;
197
198 if (!block)
199 return;
200
201 if (!((unsigned long)block & (PAGE_SIZE-1))) {
202 /* might be on the big block list */
203 spin_lock_irqsave(&block_lock, flags);
204 for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
205 if (bb->pages == block) {
206 *last = bb->next;
207 spin_unlock_irqrestore(&block_lock, flags);
208 free_pages((unsigned long)block, bb->order);
209 slob_free(bb, sizeof(bigblock_t));
210 return;
211 }
212 }
213 spin_unlock_irqrestore(&block_lock, flags);
214 }
215
216 slob_free((slob_t *)block - 1, 0);
217 return;
218}
219
220EXPORT_SYMBOL(kfree);
221
222unsigned int ksize(const void *block)
223{
224 bigblock_t *bb;
225 unsigned long flags;
226
227 if (!block)
228 return 0;
229
230 if (!((unsigned long)block & (PAGE_SIZE-1))) {
231 spin_lock_irqsave(&block_lock, flags);
232 for (bb = bigblocks; bb; bb = bb->next)
233 if (bb->pages == block) {
 234 				spin_unlock_irqrestore(&block_lock, flags);
235 return PAGE_SIZE << bb->order;
236 }
237 spin_unlock_irqrestore(&block_lock, flags);
238 }
239
240 return ((slob_t *)block - 1)->units * SLOB_UNIT;
241}
242
243struct kmem_cache {
244 unsigned int size, align;
245 const char *name;
246 void (*ctor)(void *, struct kmem_cache *, unsigned long);
247 void (*dtor)(void *, struct kmem_cache *, unsigned long);
248};
249
250struct kmem_cache *kmem_cache_create(const char *name, size_t size,
251 size_t align, unsigned long flags,
252 void (*ctor)(void*, struct kmem_cache *, unsigned long),
253 void (*dtor)(void*, struct kmem_cache *, unsigned long))
254{
255 struct kmem_cache *c;
256
257 c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
258
259 if (c) {
260 c->name = name;
261 c->size = size;
262 c->ctor = ctor;
263 c->dtor = dtor;
264 /* ignore alignment unless it's forced */
265 c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
266 if (c->align < align)
267 c->align = align;
268 }
269
270 return c;
271}
272EXPORT_SYMBOL(kmem_cache_create);
273
274int kmem_cache_destroy(struct kmem_cache *c)
275{
276 slob_free(c, sizeof(struct kmem_cache));
277 return 0;
278}
279EXPORT_SYMBOL(kmem_cache_destroy);
280
281void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
282{
283 void *b;
284
285 if (c->size < PAGE_SIZE)
286 b = slob_alloc(c->size, flags, c->align);
287 else
288 b = (void *)__get_free_pages(flags, find_order(c->size));
289
 290 	if (b && c->ctor)
291 c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
292
293 return b;
294}
295EXPORT_SYMBOL(kmem_cache_alloc);
296
297void kmem_cache_free(struct kmem_cache *c, void *b)
298{
299 if (c->dtor)
300 c->dtor(b, c, 0);
301
302 if (c->size < PAGE_SIZE)
303 slob_free(b, c->size);
304 else
305 free_pages((unsigned long)b, find_order(c->size));
306}
307EXPORT_SYMBOL(kmem_cache_free);
308
309unsigned int kmem_cache_size(struct kmem_cache *c)
310{
311 return c->size;
312}
313EXPORT_SYMBOL(kmem_cache_size);
314
315const char *kmem_cache_name(struct kmem_cache *c)
316{
317 return c->name;
318}
319EXPORT_SYMBOL(kmem_cache_name);
320
321static struct timer_list slob_timer = TIMER_INITIALIZER(
322 (void (*)(unsigned long))kmem_cache_init, 0, 0);
323
324void kmem_cache_init(void)
325{
326 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
327
328 if (p)
329 free_page((unsigned long)p);
330
331 mod_timer(&slob_timer, jiffies + HZ);
332}
333
334atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
335EXPORT_SYMBOL(slab_reclaim_pages);
336
337#ifdef CONFIG_SMP
338
339void *__alloc_percpu(size_t size, size_t align)
340{
341 int i;
342 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
343
344 if (!pdata)
345 return NULL;
346
347 for (i = 0; i < NR_CPUS; i++) {
348 if (!cpu_possible(i))
349 continue;
350 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
351 if (!pdata->ptrs[i])
352 goto unwind_oom;
353 memset(pdata->ptrs[i], 0, size);
354 }
355
356 /* Catch derefs w/o wrappers */
357 return (void *) (~(unsigned long) pdata);
358
359unwind_oom:
360 while (--i >= 0) {
361 if (!cpu_possible(i))
362 continue;
363 kfree(pdata->ptrs[i]);
364 }
365 kfree(pdata);
366 return NULL;
367}
368EXPORT_SYMBOL(__alloc_percpu);
369
370void
371free_percpu(const void *objp)
372{
373 int i;
374 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
375
376 for (i = 0; i < NR_CPUS; i++) {
377 if (!cpu_possible(i))
378 continue;
379 kfree(p->ptrs[i]);
380 }
381 kfree(p);
382}
383EXPORT_SYMBOL(free_percpu);
384
385#endif
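
The header comment at the top of slob.c describes what the code above implements: small requests are carved from a first-fit free list in SLOB_UNIT granules, kmalloc() hides one extra unit in front of the payload so that ksize() can recover the size, and anything near page size is handed to __get_free_pages() via the bigblock list. Below is a minimal user-space sketch of that unit arithmetic only; the toy_* names are hypothetical stand-ins, malloc() replaces the slob arena, and nothing here is kernel code.

/* Illustrative only: a user-space sketch of the unit arithmetic used by
 * the SLOB code above.  It mirrors SLOB_UNIT/SLOB_UNITS and the one-unit
 * header that kmalloc() prepends so ksize() can find the size again;
 * malloc() replaces the slob arena, so this is not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct slob_block {
	int units;
	struct slob_block *next;
};
typedef struct slob_block slob_t;

#define SLOB_UNIT	sizeof(slob_t)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1) / SLOB_UNIT)

/* Mimic kmalloc()'s small-object path: one extra unit stores the size. */
static void *toy_kmalloc(size_t size)
{
	slob_t *m = malloc(SLOB_UNITS(size + SLOB_UNIT) * SLOB_UNIT);

	if (!m)
		return NULL;
	m->units = SLOB_UNITS(size + SLOB_UNIT);
	return m + 1;			/* caller only sees the payload */
}

/* Mimic ksize(): step back one unit and read the stored unit count. */
static size_t toy_ksize(const void *block)
{
	return ((const slob_t *)block - 1)->units * SLOB_UNIT;
}

int main(void)
{
	void *p = toy_kmalloc(100);

	printf("SLOB_UNIT = %zu bytes\n", SLOB_UNIT);
	printf("SLOB_UNITS(100) = %zu units (%zu bytes)\n",
	       SLOB_UNITS(100), SLOB_UNITS(100) * SLOB_UNIT);
	if (p) {
		printf("toy_kmalloc(100): toy_ksize() reports %zu bytes\n",
		       toy_ksize(p));
		free((slob_t *)p - 1);
	}
	return 0;
}

As in the real allocator, the stored unit count includes the header itself, so the reported size is a little larger than the requested one.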
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e..0a51f36ba3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
18 */ 18 */
19#ifdef CONFIG_SPARSEMEM_EXTREME 19#ifdef CONFIG_SPARSEMEM_EXTREME
20struct mem_section *mem_section[NR_SECTION_ROOTS] 20struct mem_section *mem_section[NR_SECTION_ROOTS]
21 ____cacheline_maxaligned_in_smp; 21 ____cacheline_internodealigned_in_smp;
22#else 22#else
23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] 23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
24 ____cacheline_maxaligned_in_smp; 24 ____cacheline_internodealigned_in_smp;
25#endif 25#endif
26EXPORT_SYMBOL(mem_section); 26EXPORT_SYMBOL(mem_section);
27 27
diff --git a/mm/swap.c b/mm/swap.c
index d09cf7f03e..bc2442a7b0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,8 +34,6 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37#ifdef CONFIG_HUGETLB_PAGE
38
39void put_page(struct page *page) 37void put_page(struct page *page)
40{ 38{
41 if (unlikely(PageCompound(page))) { 39 if (unlikely(PageCompound(page))) {
@@ -52,7 +50,6 @@ void put_page(struct page *page)
52 __page_cache_release(page); 50 __page_cache_release(page);
53} 51}
54EXPORT_SYMBOL(put_page); 52EXPORT_SYMBOL(put_page);
55#endif
56 53
57/* 54/*
58 * Writeback is about to end against a page which has been marked for immediate 55 * Writeback is about to end against a page which has been marked for immediate
@@ -159,18 +156,50 @@ void fastcall lru_cache_add_active(struct page *page)
159 put_cpu_var(lru_add_active_pvecs); 156 put_cpu_var(lru_add_active_pvecs);
160} 157}
161 158
162void lru_add_drain(void) 159static void __lru_add_drain(int cpu)
163{ 160{
164 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 161 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
165 162
163 /* CPU is dead, so no locking needed. */
166 if (pagevec_count(pvec)) 164 if (pagevec_count(pvec))
167 __pagevec_lru_add(pvec); 165 __pagevec_lru_add(pvec);
168 pvec = &__get_cpu_var(lru_add_active_pvecs); 166 pvec = &per_cpu(lru_add_active_pvecs, cpu);
169 if (pagevec_count(pvec)) 167 if (pagevec_count(pvec))
170 __pagevec_lru_add_active(pvec); 168 __pagevec_lru_add_active(pvec);
171 put_cpu_var(lru_add_pvecs);
172} 169}
173 170
171void lru_add_drain(void)
172{
173 __lru_add_drain(get_cpu());
174 put_cpu();
175}
176
177#ifdef CONFIG_NUMA
178static void lru_add_drain_per_cpu(void *dummy)
179{
180 lru_add_drain();
181}
182
183/*
184 * Returns 0 for success
185 */
186int lru_add_drain_all(void)
187{
188 return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
189}
190
191#else
192
193/*
194 * Returns 0 for success
195 */
196int lru_add_drain_all(void)
197{
198 lru_add_drain();
199 return 0;
200}
201#endif
202
174/* 203/*
175 * This path almost never happens for VM activity - pages are normally 204 * This path almost never happens for VM activity - pages are normally
176 * freed via pagevecs. But it gets used by networking. 205 * freed via pagevecs. But it gets used by networking.
@@ -381,6 +410,8 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
381 return pagevec_count(pvec); 410 return pagevec_count(pvec);
382} 411}
383 412
413EXPORT_SYMBOL(pagevec_lookup);
414
384unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 415unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
385 pgoff_t *index, int tag, unsigned nr_pages) 416 pgoff_t *index, int tag, unsigned nr_pages)
386{ 417{
@@ -415,17 +446,6 @@ void vm_acct_memory(long pages)
415} 446}
416 447
417#ifdef CONFIG_HOTPLUG_CPU 448#ifdef CONFIG_HOTPLUG_CPU
418static void lru_drain_cache(unsigned int cpu)
419{
420 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
421
422 /* CPU is dead, so no locking needed. */
423 if (pagevec_count(pvec))
424 __pagevec_lru_add(pvec);
425 pvec = &per_cpu(lru_add_active_pvecs, cpu);
426 if (pagevec_count(pvec))
427 __pagevec_lru_add_active(pvec);
428}
429 449
430/* Drop the CPU's cached committed space back into the central pool. */ 450/* Drop the CPU's cached committed space back into the central pool. */
431static int cpu_swap_callback(struct notifier_block *nfb, 451static int cpu_swap_callback(struct notifier_block *nfb,
@@ -438,7 +458,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
438 if (action == CPU_DEAD) { 458 if (action == CPU_DEAD) {
439 atomic_add(*committed, &vm_committed_space); 459 atomic_add(*committed, &vm_committed_space);
440 *committed = 0; 460 *committed = 0;
441 lru_drain_cache((long)hcpu); 461 __lru_add_drain((long)hcpu);
442 } 462 }
443 return NOTIFY_OK; 463 return NOTIFY_OK;
444} 464}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1d..7b09ac503f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
@@ -140,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
140 * Allocate swap space for the page and add the page to the 141 * Allocate swap space for the page and add the page to the
141 * swap cache. Caller needs to hold the page lock. 142 * swap cache. Caller needs to hold the page lock.
142 */ 143 */
143int add_to_swap(struct page * page) 144int add_to_swap(struct page * page, gfp_t gfp_mask)
144{ 145{
145 swp_entry_t entry; 146 swp_entry_t entry;
146 int err; 147 int err;
@@ -165,7 +166,7 @@ int add_to_swap(struct page * page)
165 * Add it to the swap cache and mark it dirty 166 * Add it to the swap cache and mark it dirty
166 */ 167 */
167 err = __add_to_swap_cache(page, entry, 168 err = __add_to_swap_cache(page, entry,
168 GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); 169 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
169 170
170 switch (err) { 171 switch (err) {
171 case 0: /* Success */ 172 case 0: /* Success */
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
272 */ 273 */
273void free_pages_and_swap_cache(struct page **pages, int nr) 274void free_pages_and_swap_cache(struct page **pages, int nr)
274{ 275{
275 int chunk = 16;
276 struct page **pagep = pages; 276 struct page **pagep = pages;
277 277
278 lru_add_drain(); 278 lru_add_drain();
279 while (nr) { 279 while (nr) {
280 int todo = min(chunk, nr); 280 int todo = min(nr, PAGEVEC_SIZE);
281 int i; 281 int i;
282 282
283 for (i = 0; i < todo; i++) 283 for (i = 0; i < todo; i++)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace30..f1e69c30d2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,8 @@
25#include <linux/rmap.h> 25#include <linux/rmap.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
28#include <linux/syscalls.h> 30#include <linux/syscalls.h>
29 31
30#include <asm/pgtable.h> 32#include <asm/pgtable.h>
@@ -45,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1};
45 47
46struct swap_info_struct swap_info[MAX_SWAPFILES]; 48struct swap_info_struct swap_info[MAX_SWAPFILES];
47 49
48static DECLARE_MUTEX(swapon_sem); 50static DEFINE_MUTEX(swapon_mutex);
49 51
50/* 52/*
51 * We need this because the bdev->unplug_fn can sleep and we cannot 53 * We need this because the bdev->unplug_fn can sleep and we cannot
52 * hold swap_lock while calling the unplug_fn. And swap_lock 54 * hold swap_lock while calling the unplug_fn. And swap_lock
53 * cannot be turned into a semaphore. 55 * cannot be turned into a mutex.
54 */ 56 */
55static DECLARE_RWSEM(swap_unplug_sem); 57static DECLARE_RWSEM(swap_unplug_sem);
56 58
@@ -211,6 +213,26 @@ noswap:
211 return (swp_entry_t) {0}; 213 return (swp_entry_t) {0};
212} 214}
213 215
216swp_entry_t get_swap_page_of_type(int type)
217{
218 struct swap_info_struct *si;
219 pgoff_t offset;
220
221 spin_lock(&swap_lock);
222 si = swap_info + type;
223 if (si->flags & SWP_WRITEOK) {
224 nr_swap_pages--;
225 offset = scan_swap_map(si);
226 if (offset) {
227 spin_unlock(&swap_lock);
228 return swp_entry(type, offset);
229 }
230 nr_swap_pages++;
231 }
232 spin_unlock(&swap_lock);
233 return (swp_entry_t) {0};
234}
235
214static struct swap_info_struct * swap_info_get(swp_entry_t entry) 236static struct swap_info_struct * swap_info_get(swp_entry_t entry)
215{ 237{
216 struct swap_info_struct * p; 238 struct swap_info_struct * p;
@@ -1140,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1140 up_write(&swap_unplug_sem); 1162 up_write(&swap_unplug_sem);
1141 1163
1142 destroy_swap_extents(p); 1164 destroy_swap_extents(p);
1143 down(&swapon_sem); 1165 mutex_lock(&swapon_mutex);
1144 spin_lock(&swap_lock); 1166 spin_lock(&swap_lock);
1145 drain_mmlist(); 1167 drain_mmlist();
1146 1168
@@ -1159,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1159 p->swap_map = NULL; 1181 p->swap_map = NULL;
1160 p->flags = 0; 1182 p->flags = 0;
1161 spin_unlock(&swap_lock); 1183 spin_unlock(&swap_lock);
1162 up(&swapon_sem); 1184 mutex_unlock(&swapon_mutex);
1163 vfree(swap_map); 1185 vfree(swap_map);
1164 inode = mapping->host; 1186 inode = mapping->host;
1165 if (S_ISBLK(inode->i_mode)) { 1187 if (S_ISBLK(inode->i_mode)) {
@@ -1167,9 +1189,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1167 set_blocksize(bdev, p->old_block_size); 1189 set_blocksize(bdev, p->old_block_size);
1168 bd_release(bdev); 1190 bd_release(bdev);
1169 } else { 1191 } else {
1170 down(&inode->i_sem); 1192 mutex_lock(&inode->i_mutex);
1171 inode->i_flags &= ~S_SWAPFILE; 1193 inode->i_flags &= ~S_SWAPFILE;
1172 up(&inode->i_sem); 1194 mutex_unlock(&inode->i_mutex);
1173 } 1195 }
1174 filp_close(swap_file, NULL); 1196 filp_close(swap_file, NULL);
1175 err = 0; 1197 err = 0;
@@ -1188,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1188 int i; 1210 int i;
1189 loff_t l = *pos; 1211 loff_t l = *pos;
1190 1212
1191 down(&swapon_sem); 1213 mutex_lock(&swapon_mutex);
1192 1214
1193 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1215 for (i = 0; i < nr_swapfiles; i++, ptr++) {
1194 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1216 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
@@ -1217,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1217 1239
1218static void swap_stop(struct seq_file *swap, void *v) 1240static void swap_stop(struct seq_file *swap, void *v)
1219{ 1241{
1220 up(&swapon_sem); 1242 mutex_unlock(&swapon_mutex);
1221} 1243}
1222 1244
1223static int swap_show(struct seq_file *swap, void *v) 1245static int swap_show(struct seq_file *swap, void *v)
@@ -1386,7 +1408,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1386 p->bdev = bdev; 1408 p->bdev = bdev;
1387 } else if (S_ISREG(inode->i_mode)) { 1409 } else if (S_ISREG(inode->i_mode)) {
1388 p->bdev = inode->i_sb->s_bdev; 1410 p->bdev = inode->i_sb->s_bdev;
1389 down(&inode->i_sem); 1411 mutex_lock(&inode->i_mutex);
1390 did_down = 1; 1412 did_down = 1;
1391 if (IS_SWAPFILE(inode)) { 1413 if (IS_SWAPFILE(inode)) {
1392 error = -EBUSY; 1414 error = -EBUSY;
@@ -1422,7 +1444,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1422 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) 1444 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1423 swap_header_version = 2; 1445 swap_header_version = 2;
1424 else { 1446 else {
1425 printk("Unable to find swap-space signature\n"); 1447 printk(KERN_ERR "Unable to find swap-space signature\n");
1426 error = -EINVAL; 1448 error = -EINVAL;
1427 goto bad_swap; 1449 goto bad_swap;
1428 } 1450 }
@@ -1473,7 +1495,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1473 goto bad_swap; 1495 goto bad_swap;
1474 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1496 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1475 goto bad_swap; 1497 goto bad_swap;
1476 1498
1477 /* OK, set up the swap map and apply the bad block list */ 1499 /* OK, set up the swap map and apply the bad block list */
1478 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1500 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
1479 error = -ENOMEM; 1501 error = -ENOMEM;
@@ -1482,17 +1504,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1482 1504
1483 error = 0; 1505 error = 0;
1484 memset(p->swap_map, 0, maxpages * sizeof(short)); 1506 memset(p->swap_map, 0, maxpages * sizeof(short));
1485 for (i=0; i<swap_header->info.nr_badpages; i++) { 1507 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1486 int page = swap_header->info.badpages[i]; 1508 int page_nr = swap_header->info.badpages[i];
1487 if (page <= 0 || page >= swap_header->info.last_page) 1509 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1488 error = -EINVAL; 1510 error = -EINVAL;
1489 else 1511 else
1490 p->swap_map[page] = SWAP_MAP_BAD; 1512 p->swap_map[page_nr] = SWAP_MAP_BAD;
1491 } 1513 }
1492 nr_good_pages = swap_header->info.last_page - 1514 nr_good_pages = swap_header->info.last_page -
1493 swap_header->info.nr_badpages - 1515 swap_header->info.nr_badpages -
1494 1 /* header page */; 1516 1 /* header page */;
1495 if (error) 1517 if (error)
1496 goto bad_swap; 1518 goto bad_swap;
1497 } 1519 }
1498 1520
@@ -1519,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1519 goto bad_swap; 1541 goto bad_swap;
1520 } 1542 }
1521 1543
1522 down(&swapon_sem); 1544 mutex_lock(&swapon_mutex);
1523 spin_lock(&swap_lock); 1545 spin_lock(&swap_lock);
1524 p->flags = SWP_ACTIVE; 1546 p->flags = SWP_ACTIVE;
1525 nr_swap_pages += nr_good_pages; 1547 nr_swap_pages += nr_good_pages;
@@ -1545,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1545 swap_info[prev].next = p - swap_info; 1567 swap_info[prev].next = p - swap_info;
1546 } 1568 }
1547 spin_unlock(&swap_lock); 1569 spin_unlock(&swap_lock);
1548 up(&swapon_sem); 1570 mutex_unlock(&swapon_mutex);
1549 error = 0; 1571 error = 0;
1550 goto out; 1572 goto out;
1551bad_swap: 1573bad_swap:
@@ -1576,7 +1598,7 @@ out:
1576 if (did_down) { 1598 if (did_down) {
1577 if (!error) 1599 if (!error)
1578 inode->i_flags |= S_SWAPFILE; 1600 inode->i_flags |= S_SWAPFILE;
1579 up(&inode->i_sem); 1601 mutex_unlock(&inode->i_mutex);
1580 } 1602 }
1581 return error; 1603 return error;
1582} 1604}
diff --git a/mm/thrash.c b/mm/thrash.c
index eff3c18c33..f4c560b4a2 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -57,14 +57,17 @@ void grab_swap_token(void)
57 /* We have the token. Let others know we still need it. */ 57 /* We have the token. Let others know we still need it. */
58 if (has_swap_token(current->mm)) { 58 if (has_swap_token(current->mm)) {
59 current->mm->recent_pagein = 1; 59 current->mm->recent_pagein = 1;
60 if (unlikely(!swap_token_default_timeout))
61 disable_swap_token();
60 return; 62 return;
61 } 63 }
62 64
63 if (time_after(jiffies, swap_token_check)) { 65 if (time_after(jiffies, swap_token_check)) {
64 66
65 /* Can't get swapout protection if we exceed our RSS limit. */ 67 if (!swap_token_default_timeout) {
66 // if (current->mm->rss > current->mm->rlimit_rss) 68 swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
67 // return; 69 return;
70 }
68 71
69 /* ... or if we recently held the token. */ 72 /* ... or if we recently held the token. */
70 if (time_before(jiffies, current->mm->swap_token_time)) 73 if (time_before(jiffies, current->mm->swap_token_time))
@@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm)
95{ 98{
96 spin_lock(&swap_token_lock); 99 spin_lock(&swap_token_lock);
97 if (likely(mm == swap_token_mm)) { 100 if (likely(mm == swap_token_mm)) {
101 mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
98 swap_token_mm = &init_mm; 102 swap_token_mm = &init_mm;
99 swap_token_check = jiffies; 103 swap_token_check = jiffies;
100 } 104 }
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44e..f9d6a9cc91 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
81 goto close_file; 81 goto close_file;
82 82
83 d_instantiate(dentry, inode); 83 d_instantiate(dentry, inode);
84 inode->i_size = size;
85 inode->i_nlink = 0; /* It is unlinked */ 84 inode->i_nlink = 0; /* It is unlinked */
85
86 file->f_vfsmnt = mntget(shm_mnt); 86 file->f_vfsmnt = mntget(shm_mnt);
87 file->f_dentry = dentry; 87 file->f_dentry = dentry;
88 file->f_mapping = inode->i_mapping; 88 file->f_mapping = inode->i_mapping;
89 file->f_op = &ramfs_file_operations; 89 file->f_op = &ramfs_file_operations;
90 file->f_mode = FMODE_WRITE | FMODE_READ; 90 file->f_mode = FMODE_WRITE | FMODE_READ;
91
92 /* notify everyone as to the change of file size */
93 error = do_truncate(dentry, size, 0, file);
94 if (error < 0)
95 goto close_file;
96
91 return file; 97 return file;
92 98
93close_file: 99close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
123{ 129{
124 return 0; 130 return 0;
125} 131}
132
133int shmem_mmap(struct file *file, struct vm_area_struct *vma)
134{
135 file_accessed(file);
136#ifndef CONFIG_MMU
137 return ramfs_nommu_mmap(file, vma);
138#else
139 return 0;
140#endif
141}
142
143#ifndef CONFIG_MMU
144unsigned long shmem_get_unmapped_area(struct file *file,
145 unsigned long addr,
146 unsigned long len,
147 unsigned long pgoff,
148 unsigned long flags)
149{
150 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
151}
152#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 29c18f68dc..6cb3fff25f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
82} 82}
83 83
84/** 84/**
85 * truncate_inode_pages - truncate *all* the pages from an offset 85 * truncate_inode_pages_range - truncate range of pages specified by start and
86 * end byte offsets
86 * @mapping: mapping to truncate 87 * @mapping: mapping to truncate
87 * @lstart: offset from which to truncate 88 * @lstart: offset from which to truncate
89 * @lend: offset to which to truncate
88 * 90 *
89 * Truncate the page cache at a set offset, removing the pages that are beyond 91 * Truncate the page cache, removing the pages that are between
90 * that offset (and zeroing out partial pages). 92 * the specified offsets (and zeroing out the partial page
93 * if lstart is not page aligned).
91 * 94 *
92 * Truncate takes two passes - the first pass is nonblocking. It will not 95 * Truncate takes two passes - the first pass is nonblocking. It will not
93 * block on page locks and it will not block on writeback. The second pass 96 * block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
101 * We pass down the cache-hot hint to the page freeing code. Even if the 104 * We pass down the cache-hot hint to the page freeing code. Even if the
102 * mapping is large, it is probably the case that the final pages are the most 105 * mapping is large, it is probably the case that the final pages are the most
103 * recently touched, and freeing happens in ascending file offset order. 106 * recently touched, and freeing happens in ascending file offset order.
104 *
105 * Called under (and serialised by) inode->i_sem.
106 */ 107 */
107void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 108void truncate_inode_pages_range(struct address_space *mapping,
109 loff_t lstart, loff_t lend)
108{ 110{
109 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 111 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
112 pgoff_t end;
110 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 113 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
111 struct pagevec pvec; 114 struct pagevec pvec;
112 pgoff_t next; 115 pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
115 if (mapping->nrpages == 0) 118 if (mapping->nrpages == 0)
116 return; 119 return;
117 120
121 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
122 end = (lend >> PAGE_CACHE_SHIFT);
123
118 pagevec_init(&pvec, 0); 124 pagevec_init(&pvec, 0);
119 next = start; 125 next = start;
120 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 126 while (next <= end &&
127 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
121 for (i = 0; i < pagevec_count(&pvec); i++) { 128 for (i = 0; i < pagevec_count(&pvec); i++) {
122 struct page *page = pvec.pages[i]; 129 struct page *page = pvec.pages[i];
123 pgoff_t page_index = page->index; 130 pgoff_t page_index = page->index;
124 131
132 if (page_index > end) {
133 next = page_index;
134 break;
135 }
136
125 if (page_index > next) 137 if (page_index > next)
126 next = page_index; 138 next = page_index;
127 next++; 139 next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
157 next = start; 169 next = start;
158 continue; 170 continue;
159 } 171 }
172 if (pvec.pages[0]->index > end) {
173 pagevec_release(&pvec);
174 break;
175 }
160 for (i = 0; i < pagevec_count(&pvec); i++) { 176 for (i = 0; i < pagevec_count(&pvec); i++) {
161 struct page *page = pvec.pages[i]; 177 struct page *page = pvec.pages[i];
162 178
179 if (page->index > end)
180 break;
163 lock_page(page); 181 lock_page(page);
164 wait_on_page_writeback(page); 182 wait_on_page_writeback(page);
165 if (page->index > next) 183 if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
171 pagevec_release(&pvec); 189 pagevec_release(&pvec);
172 } 190 }
173} 191}
192EXPORT_SYMBOL(truncate_inode_pages_range);
174 193
194/**
195 * truncate_inode_pages - truncate *all* the pages from an offset
196 * @mapping: mapping to truncate
197 * @lstart: offset from which to truncate
198 *
199 * Called under (and serialised by) inode->i_mutex.
200 */
201void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
202{
203 truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
204}
175EXPORT_SYMBOL(truncate_inode_pages); 205EXPORT_SYMBOL(truncate_inode_pages);
176 206
177/** 207/**
@@ -219,7 +249,6 @@ unlock:
219 break; 249 break;
220 } 250 }
221 pagevec_release(&pvec); 251 pagevec_release(&pvec);
222 cond_resched();
223 } 252 }
224 return ret; 253 return ret;
225} 254}
@@ -282,8 +311,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
282 * Zap the rest of the file in one hit. 311 * Zap the rest of the file in one hit.
283 */ 312 */
284 unmap_mapping_range(mapping, 313 unmap_mapping_range(mapping,
285 page_index << PAGE_CACHE_SHIFT, 314 (loff_t)page_index<<PAGE_CACHE_SHIFT,
286 (end - page_index + 1) 315 (loff_t)(end - page_index + 1)
287 << PAGE_CACHE_SHIFT, 316 << PAGE_CACHE_SHIFT,
288 0); 317 0);
289 did_range_unmap = 1; 318 did_range_unmap = 1;
@@ -292,7 +321,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
292 * Just zap this page 321 * Just zap this page
293 */ 322 */
294 unmap_mapping_range(mapping, 323 unmap_mapping_range(mapping,
295 page_index << PAGE_CACHE_SHIFT, 324 (loff_t)page_index<<PAGE_CACHE_SHIFT,
296 PAGE_CACHE_SIZE, 0); 325 PAGE_CACHE_SIZE, 0);
297 } 326 }
298 } 327 }
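
The last two hunks above replace page_index << PAGE_CACHE_SHIFT with (loff_t)page_index << PAGE_CACHE_SHIFT. Since page_index is a pgoff_t (an unsigned long), the un-cast shift is done in 32 bits on a 32-bit kernel, and the byte offset passed to unmap_mapping_range() wraps for files past 4 GiB; widening first preserves the full offset. A small stand-alone illustration follows, with uint32_t standing in for a 32-bit pgoff_t, uint64_t for loff_t, and a 4 KiB page size assumed:

/* Illustrative only: why the hunks above cast page_index to loff_t before
 * shifting.  uint32_t stands in for a 32-bit pgoff_t/unsigned long,
 * uint64_t for loff_t, and a 4 KiB page (shift of 12) is assumed. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define PAGE_CACHE_SHIFT 12		/* assume 4 KiB pages */

int main(void)
{
	uint32_t page_index = 0x00150000;	/* ~5.25 GiB into the file */

	/* Un-cast: the shift happens in 32 bits and the high bits are lost. */
	uint64_t wrapped = page_index << PAGE_CACHE_SHIFT;

	/* Widened first, as the patch does with (loff_t)page_index. */
	uint64_t correct = (uint64_t)page_index << PAGE_CACHE_SHIFT;

	printf("without cast: 0x%" PRIx64 "\n", wrapped);
	printf("with cast:    0x%" PRIx64 "\n", correct);
	return 0;
}

On 64-bit configurations unsigned long is already 64 bits wide, so the cast changes nothing there and the patch can apply it unconditionally.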
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 0000000000..5f4bb59da6
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,39 @@
1#include <linux/slab.h>
2#include <linux/string.h>
3#include <linux/module.h>
4
5/**
6 * kzalloc - allocate memory. The memory is set to zero.
7 * @size: how many bytes of memory are required.
8 * @flags: the type of memory to allocate.
9 */
10void *kzalloc(size_t size, gfp_t flags)
11{
12 void *ret = kmalloc(size, flags);
13 if (ret)
14 memset(ret, 0, size);
15 return ret;
16}
17EXPORT_SYMBOL(kzalloc);
18
19/*
20 * kstrdup - allocate space for and copy an existing string
21 *
22 * @s: the string to duplicate
23 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
24 */
25char *kstrdup(const char *s, gfp_t gfp)
26{
27 size_t len;
28 char *buf;
29
30 if (!s)
31 return NULL;
32
33 len = strlen(s) + 1;
34 buf = kmalloc(len, gfp);
35 if (buf)
36 memcpy(buf, s, len);
37 return buf;
38}
39EXPORT_SYMBOL(kstrdup);
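
mm/util.c above adds two small helpers: kzalloc(), which returns zeroed memory or NULL, and kstrdup(), which duplicates a string including its terminating NUL and tolerates a NULL input. The following user-space sketch mirrors those contracts; the toy_* names are hypothetical, malloc() stands in for kmalloc(), and the gfp flags are dropped.

/* Illustrative only: user-space stand-ins for the helpers added above.
 * malloc() replaces kmalloc() and the gfp flags are dropped; the mirrored
 * behaviour is "zeroed buffer or NULL" and "NULL-safe strdup". */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void *toy_kzalloc(size_t size)
{
	void *ret = malloc(size);

	if (ret)
		memset(ret, 0, size);	/* same post-condition as kzalloc() */
	return ret;
}

static char *toy_kstrdup(const char *s)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;		/* kstrdup() accepts a NULL source */
	len = strlen(s) + 1;		/* copy the trailing '\0' as well */
	buf = malloc(len);
	if (buf)
		memcpy(buf, s, len);
	return buf;
}

int main(void)
{
	int *counters = toy_kzalloc(4 * sizeof(int));
	char *name = toy_kstrdup("swapfile");

	if (counters && name)
		printf("counters[3]=%d name=\"%s\"\n", counters[3], name);
	free(counters);
	free(name);
	return 0;
}

The point of both helpers is to let callers elsewhere in the tree drop open-coded kmalloc()+memset() and strlen()+kmalloc()+memcpy() sequences.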
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 135bf8ca96..2e34b61a70 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
63 63
64 unsigned long nr_mapped; /* From page_state */ 64 unsigned long nr_mapped; /* From page_state */
65 65
66 /* How many pages shrink_cache() should reclaim */
67 int nr_to_reclaim;
68
69 /* Ask shrink_caches, or shrink_zone to scan at this priority */ 66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
70 unsigned int priority; 67 unsigned int priority;
71 68
@@ -186,8 +183,7 @@ EXPORT_SYMBOL(remove_shrinker);
186 * 183 *
187 * Returns the number of slab objects which we shrunk. 184 * Returns the number of slab objects which we shrunk.
188 */ 185 */
189static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, 186int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
190 unsigned long lru_pages)
191{ 187{
192 struct shrinker *shrinker; 188 struct shrinker *shrinker;
193 int ret = 0; 189 int ret = 0;
@@ -201,13 +197,25 @@ static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
201 list_for_each_entry(shrinker, &shrinker_list, list) { 197 list_for_each_entry(shrinker, &shrinker_list, list) {
202 unsigned long long delta; 198 unsigned long long delta;
203 unsigned long total_scan; 199 unsigned long total_scan;
200 unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
204 201
205 delta = (4 * scanned) / shrinker->seeks; 202 delta = (4 * scanned) / shrinker->seeks;
206 delta *= (*shrinker->shrinker)(0, gfp_mask); 203 delta *= max_pass;
207 do_div(delta, lru_pages + 1); 204 do_div(delta, lru_pages + 1);
208 shrinker->nr += delta; 205 shrinker->nr += delta;
209 if (shrinker->nr < 0) 206 if (shrinker->nr < 0) {
210 shrinker->nr = LONG_MAX; /* It wrapped! */ 207 printk(KERN_ERR "%s: nr=%ld\n",
208 __FUNCTION__, shrinker->nr);
209 shrinker->nr = max_pass;
210 }
211
212 /*
213 * Avoid risking looping forever due to too large nr value:
214 * never try to free more than twice the estimate number of
215 * freeable entries.
216 */
217 if (shrinker->nr > max_pass * 2)
218 shrinker->nr = max_pass * 2;
211 219
212 total_scan = shrinker->nr; 220 total_scan = shrinker->nr;
213 shrinker->nr = 0; 221 shrinker->nr = 0;
@@ -263,9 +271,7 @@ static inline int is_page_cache_freeable(struct page *page)
263 271
264static int may_write_to_queue(struct backing_dev_info *bdi) 272static int may_write_to_queue(struct backing_dev_info *bdi)
265{ 273{
266 if (current_is_kswapd()) 274 if (current->flags & PF_SWAPWRITE)
267 return 1;
268 if (current_is_pdflush()) /* This is unlikely, but why not... */
269 return 1; 275 return 1;
270 if (!bdi_write_congested(bdi)) 276 if (!bdi_write_congested(bdi))
271 return 1; 277 return 1;
@@ -355,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
355 res = mapping->a_ops->writepage(page, &wbc); 361 res = mapping->a_ops->writepage(page, &wbc);
356 if (res < 0) 362 if (res < 0)
357 handle_write_error(mapping, page, res); 363 handle_write_error(mapping, page, res);
358 if (res == WRITEPAGE_ACTIVATE) { 364 if (res == AOP_WRITEPAGE_ACTIVATE) {
359 ClearPageReclaim(page); 365 ClearPageReclaim(page);
360 return PAGE_ACTIVATE; 366 return PAGE_ACTIVATE;
361 } 367 }
@@ -370,6 +376,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
370 return PAGE_CLEAN; 376 return PAGE_CLEAN;
371} 377}
372 378
379static int remove_mapping(struct address_space *mapping, struct page *page)
380{
381 if (!mapping)
382 return 0; /* truncate got there first */
383
384 write_lock_irq(&mapping->tree_lock);
385
386 /*
387 * The non-racy check for busy page. It is critical to check
388 * PageDirty _after_ making sure that the page is freeable and
389 * not in use by anybody. (pagecache + us == 2)
390 */
391 if (unlikely(page_count(page) != 2))
392 goto cannot_free;
393 smp_rmb();
394 if (unlikely(PageDirty(page)))
395 goto cannot_free;
396
397 if (PageSwapCache(page)) {
398 swp_entry_t swap = { .val = page_private(page) };
399 __delete_from_swap_cache(page);
400 write_unlock_irq(&mapping->tree_lock);
401 swap_free(swap);
402 __put_page(page); /* The pagecache ref */
403 return 1;
404 }
405
406 __remove_from_page_cache(page);
407 write_unlock_irq(&mapping->tree_lock);
408 __put_page(page);
409 return 1;
410
411cannot_free:
412 write_unlock_irq(&mapping->tree_lock);
413 return 0;
414}
415
373/* 416/*
374 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 417 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
375 */ 418 */
@@ -407,7 +450,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
407 if (PageWriteback(page)) 450 if (PageWriteback(page))
408 goto keep_locked; 451 goto keep_locked;
409 452
410 referenced = page_referenced(page, 1, sc->priority <= 0); 453 referenced = page_referenced(page, 1);
411 /* In active use or really unfreeable? Activate it. */ 454 /* In active use or really unfreeable? Activate it. */
412 if (referenced && page_mapping_inuse(page)) 455 if (referenced && page_mapping_inuse(page))
413 goto activate_locked; 456 goto activate_locked;
@@ -420,7 +463,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
420 if (PageAnon(page) && !PageSwapCache(page)) { 463 if (PageAnon(page) && !PageSwapCache(page)) {
421 if (!sc->may_swap) 464 if (!sc->may_swap)
422 goto keep_locked; 465 goto keep_locked;
423 if (!add_to_swap(page)) 466 if (!add_to_swap(page, GFP_ATOMIC))
424 goto activate_locked; 467 goto activate_locked;
425 } 468 }
426#endif /* CONFIG_SWAP */ 469#endif /* CONFIG_SWAP */
@@ -503,36 +546,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
503 goto free_it; 546 goto free_it;
504 } 547 }
505 548
506 if (!mapping) 549 if (!remove_mapping(mapping, page))
507 goto keep_locked; /* truncate got there first */ 550 goto keep_locked;
508
509 write_lock_irq(&mapping->tree_lock);
510
511 /*
512 * The non-racy check for busy page. It is critical to check
513 * PageDirty _after_ making sure that the page is freeable and
514 * not in use by anybody. (pagecache + us == 2)
515 */
516 if (unlikely(page_count(page) != 2))
517 goto cannot_free;
518 smp_rmb();
519 if (unlikely(PageDirty(page)))
520 goto cannot_free;
521
522#ifdef CONFIG_SWAP
523 if (PageSwapCache(page)) {
524 swp_entry_t swap = { .val = page_private(page) };
525 __delete_from_swap_cache(page);
526 write_unlock_irq(&mapping->tree_lock);
527 swap_free(swap);
528 __put_page(page); /* The pagecache ref */
529 goto free_it;
530 }
531#endif /* CONFIG_SWAP */
532
533 __remove_from_page_cache(page);
534 write_unlock_irq(&mapping->tree_lock);
535 __put_page(page);
536 551
537free_it: 552free_it:
538 unlock_page(page); 553 unlock_page(page);
@@ -541,10 +556,6 @@ free_it:
541 __pagevec_release_nonlru(&freed_pvec); 556 __pagevec_release_nonlru(&freed_pvec);
542 continue; 557 continue;
543 558
544cannot_free:
545 write_unlock_irq(&mapping->tree_lock);
546 goto keep_locked;
547
548activate_locked: 559activate_locked:
549 SetPageActive(page); 560 SetPageActive(page);
550 pgactivate++; 561 pgactivate++;
@@ -562,6 +573,228 @@ keep:
562 return reclaimed; 573 return reclaimed;
563} 574}
564 575
576#ifdef CONFIG_MIGRATION
577static inline void move_to_lru(struct page *page)
578{
579 list_del(&page->lru);
580 if (PageActive(page)) {
581 /*
582 * lru_cache_add_active checks that
583 * the PG_active bit is off.
584 */
585 ClearPageActive(page);
586 lru_cache_add_active(page);
587 } else {
588 lru_cache_add(page);
589 }
590 put_page(page);
591}
592
593/*
594 * Add isolated pages on the list back to the LRU.
595 *
596 * returns the number of pages put back.
597 */
598int putback_lru_pages(struct list_head *l)
599{
600 struct page *page;
601 struct page *page2;
602 int count = 0;
603
604 list_for_each_entry_safe(page, page2, l, lru) {
605 move_to_lru(page);
606 count++;
607 }
608 return count;
609}
610
611/*
612 * swapout a single page
613 * page is locked upon entry, unlocked on exit
614 */
615static int swap_page(struct page *page)
616{
617 struct address_space *mapping = page_mapping(page);
618
619 if (page_mapped(page) && mapping)
620 if (try_to_unmap(page) != SWAP_SUCCESS)
621 goto unlock_retry;
622
623 if (PageDirty(page)) {
624 /* Page is dirty, try to write it out here */
625 switch(pageout(page, mapping)) {
626 case PAGE_KEEP:
627 case PAGE_ACTIVATE:
628 goto unlock_retry;
629
630 case PAGE_SUCCESS:
631 goto retry;
632
633 case PAGE_CLEAN:
634 ; /* try to free the page below */
635 }
636 }
637
638 if (PagePrivate(page)) {
639 if (!try_to_release_page(page, GFP_KERNEL) ||
640 (!mapping && page_count(page) == 1))
641 goto unlock_retry;
642 }
643
644 if (remove_mapping(mapping, page)) {
645 /* Success */
646 unlock_page(page);
647 return 0;
648 }
649
650unlock_retry:
651 unlock_page(page);
652
653retry:
654 return -EAGAIN;
655}
656/*
657 * migrate_pages
658 *
659 * Two lists are passed to this function. The first list
660 * contains the pages isolated from the LRU to be migrated.
661 * The second list contains new pages that the pages isolated
662 * can be moved to. If the second list is NULL then all
663 * pages are swapped out.
664 *
665 * The function returns after 10 attempts or if no pages
 666 * are movable anymore because the "to" list has become empty
667 * or no retryable pages exist anymore.
668 *
669 * SIMPLIFIED VERSION: This implementation of migrate_pages
670 * is only swapping out pages and never touches the second
671 * list. The direct migration patchset
672 * extends this function to avoid the use of swap.
673 *
674 * Return: Number of pages not migrated when "to" ran empty.
675 */
676int migrate_pages(struct list_head *from, struct list_head *to,
677 struct list_head *moved, struct list_head *failed)
678{
679 int retry;
680 int nr_failed = 0;
681 int pass = 0;
682 struct page *page;
683 struct page *page2;
684 int swapwrite = current->flags & PF_SWAPWRITE;
685 int rc;
686
687 if (!swapwrite)
688 current->flags |= PF_SWAPWRITE;
689
690redo:
691 retry = 0;
692
693 list_for_each_entry_safe(page, page2, from, lru) {
694 cond_resched();
695
696 rc = 0;
697 if (page_count(page) == 1)
698 /* page was freed from under us. So we are done. */
699 goto next;
700
701 /*
702 * Skip locked pages during the first two passes to give the
703 * functions holding the lock time to release the page. Later we
704 * use lock_page() to have a higher chance of acquiring the
705 * lock.
706 */
707 rc = -EAGAIN;
708 if (pass > 2)
709 lock_page(page);
710 else
711 if (TestSetPageLocked(page))
712 goto next;
713
714 /*
715 * Only wait on writeback if we have already done a pass where
 716 * we may have triggered writeouts for lots of pages.
717 */
718 if (pass > 0) {
719 wait_on_page_writeback(page);
720 } else {
721 if (PageWriteback(page))
722 goto unlock_page;
723 }
724
725 /*
726 * Anonymous pages must have swap cache references otherwise
727 * the information contained in the page maps cannot be
728 * preserved.
729 */
730 if (PageAnon(page) && !PageSwapCache(page)) {
731 if (!add_to_swap(page, GFP_KERNEL)) {
732 rc = -ENOMEM;
733 goto unlock_page;
734 }
735 }
736
737 /*
738 * Page is properly locked and writeback is complete.
739 * Try to migrate the page.
740 */
741 rc = swap_page(page);
742 goto next;
743
744unlock_page:
745 unlock_page(page);
746
747next:
748 if (rc == -EAGAIN) {
749 retry++;
750 } else if (rc) {
751 /* Permanent failure */
752 list_move(&page->lru, failed);
753 nr_failed++;
754 } else {
755 /* Success */
756 list_move(&page->lru, moved);
757 }
758 }
759 if (retry && pass++ < 10)
760 goto redo;
761
762 if (!swapwrite)
763 current->flags &= ~PF_SWAPWRITE;
764
765 return nr_failed + retry;
766}
767
768/*
769 * Isolate one page from the LRU lists and put it on the
770 * indicated list with elevated refcount.
771 *
772 * Result:
773 * 0 = page not on LRU list
774 * 1 = page removed from LRU list and added to the specified list.
775 */
776int isolate_lru_page(struct page *page)
777{
778 int ret = 0;
779
780 if (PageLRU(page)) {
781 struct zone *zone = page_zone(page);
782 spin_lock_irq(&zone->lru_lock);
783 if (TestClearPageLRU(page)) {
784 ret = 1;
785 get_page(page);
786 if (PageActive(page))
787 del_page_from_active_list(zone, page);
788 else
789 del_page_from_inactive_list(zone, page);
790 }
791 spin_unlock_irq(&zone->lru_lock);
792 }
793
794 return ret;
795}
796#endif
797
565/* 798/*
566 * zone->lru_lock is heavily contended. Some of the functions that 799 * zone->lru_lock is heavily contended. Some of the functions that
567 * shrink the lists perform better by taking out a batch of pages 800 * shrink the lists perform better by taking out a batch of pages
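
The migrate_pages() comment above describes a bounded retry protocol: walk the isolated pages, count -EAGAIN results (for example locked or writeback pages) as retryable, move permanent failures and successes to separate lists, and repeat for at most ten passes. The sketch below is a user-space model of that control flow only; an array stands in for the page lists and handle_item() fakes the three outcomes, so everything here is hypothetical scaffolding rather than kernel code.

/* Illustrative only: a user-space model of the retry skeleton used by
 * migrate_pages() above.  Array slots stand in for pages; handle_item()
 * fakes the three outcomes (0 = migrated, -EAGAIN = retry next pass,
 * anything else = permanent failure). */
#include <stdio.h>

#define NITEMS		6
#define TOY_EAGAIN	11	/* numeric value of EAGAIN on Linux */

enum state { TODO, MOVED, FAILED };

/* Pretend odd-numbered items are locked on the first pass and item 4
 * can never be migrated. */
static int handle_item(int idx, int pass)
{
	if (idx == 4)
		return -1;
	if ((idx & 1) && pass == 0)
		return -TOY_EAGAIN;
	return 0;
}

int main(void)
{
	enum state items[NITEMS] = { TODO };
	int pass = 0, retry;

	do {
		retry = 0;
		for (int i = 0; i < NITEMS; i++) {
			int rc;

			if (items[i] != TODO)
				continue;
			rc = handle_item(i, pass);
			if (rc == -TOY_EAGAIN)
				retry++;		/* stays on the source list */
			else if (rc)
				items[i] = FAILED;	/* like list_move(..., failed) */
			else
				items[i] = MOVED;	/* like list_move(..., moved) */
		}
	} while (retry && ++pass < 10);

	for (int i = 0; i < NITEMS; i++)
		printf("item %d: %s\n", i,
		       items[i] == MOVED ? "moved" :
		       items[i] == FAILED ? "failed" : "still pending");
	return 0;
}

The real function additionally toggles PF_SWAPWRITE around the loop and switches from TestSetPageLocked() to lock_page() after a couple of passes, but the success/retry/fail bookkeeping has the same shape.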
@@ -641,17 +874,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
641 goto done; 874 goto done;
642 875
643 max_scan -= nr_scan; 876 max_scan -= nr_scan;
644 if (current_is_kswapd())
645 mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
646 else
647 mod_page_state_zone(zone, pgscan_direct, nr_scan);
648 nr_freed = shrink_list(&page_list, sc); 877 nr_freed = shrink_list(&page_list, sc);
649 if (current_is_kswapd())
650 mod_page_state(kswapd_steal, nr_freed);
651 mod_page_state_zone(zone, pgsteal, nr_freed);
652 sc->nr_to_reclaim -= nr_freed;
653 878
654 spin_lock_irq(&zone->lru_lock); 879 local_irq_disable();
880 if (current_is_kswapd()) {
881 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
882 __mod_page_state(kswapd_steal, nr_freed);
883 } else
884 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
885 __mod_page_state_zone(zone, pgsteal, nr_freed);
886
887 spin_lock(&zone->lru_lock);
655 /* 888 /*
656 * Put back any unfreeable pages. 889 * Put back any unfreeable pages.
657 */ 890 */
@@ -756,7 +989,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
756 if (page_mapped(page)) { 989 if (page_mapped(page)) {
757 if (!reclaim_mapped || 990 if (!reclaim_mapped ||
758 (total_swap_pages == 0 && PageAnon(page)) || 991 (total_swap_pages == 0 && PageAnon(page)) ||
759 page_referenced(page, 0, sc->priority <= 0)) { 992 page_referenced(page, 0)) {
760 list_add(&page->lru, &l_active); 993 list_add(&page->lru, &l_active);
761 continue; 994 continue;
762 } 995 }
@@ -813,11 +1046,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
813 } 1046 }
814 } 1047 }
815 zone->nr_active += pgmoved; 1048 zone->nr_active += pgmoved;
816 spin_unlock_irq(&zone->lru_lock); 1049 spin_unlock(&zone->lru_lock);
817 pagevec_release(&pvec); 1050
1051 __mod_page_state_zone(zone, pgrefill, pgscanned);
1052 __mod_page_state(pgdeactivate, pgdeactivate);
1053 local_irq_enable();
818 1054
819 mod_page_state_zone(zone, pgrefill, pgscanned); 1055 pagevec_release(&pvec);
820 mod_page_state(pgdeactivate, pgdeactivate);
821} 1056}
822 1057
823/* 1058/*
@@ -849,8 +1084,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
849 else 1084 else
850 nr_inactive = 0; 1085 nr_inactive = 0;
851 1086
852 sc->nr_to_reclaim = sc->swap_cluster_max;
853
854 while (nr_active || nr_inactive) { 1087 while (nr_active || nr_inactive) {
855 if (nr_active) { 1088 if (nr_active) {
856 sc->nr_to_scan = min(nr_active, 1089 sc->nr_to_scan = min(nr_active,
@@ -864,8 +1097,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
864 (unsigned long)sc->swap_cluster_max); 1097 (unsigned long)sc->swap_cluster_max);
865 nr_inactive -= sc->nr_to_scan; 1098 nr_inactive -= sc->nr_to_scan;
866 shrink_cache(zone, sc); 1099 shrink_cache(zone, sc);
867 if (sc->nr_to_reclaim <= 0)
868 break;
869 } 1100 }
870 } 1101 }
871 1102
@@ -898,7 +1129,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
898 for (i = 0; zones[i] != NULL; i++) { 1129 for (i = 0; zones[i] != NULL; i++) {
899 struct zone *zone = zones[i]; 1130 struct zone *zone = zones[i];
900 1131
901 if (zone->present_pages == 0) 1132 if (!populated_zone(zone))
902 continue; 1133 continue;
903 1134
904 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1135 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -960,6 +1191,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
960 sc.nr_reclaimed = 0; 1191 sc.nr_reclaimed = 0;
961 sc.priority = priority; 1192 sc.priority = priority;
962 sc.swap_cluster_max = SWAP_CLUSTER_MAX; 1193 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1194 if (!priority)
1195 disable_swap_token();
963 shrink_caches(zones, &sc); 1196 shrink_caches(zones, &sc);
964 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 1197 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
965 if (reclaim_state) { 1198 if (reclaim_state) {
@@ -1056,6 +1289,10 @@ loop_again:
1056 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1289 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1057 unsigned long lru_pages = 0; 1290 unsigned long lru_pages = 0;
1058 1291
1292 /* The swap token gets in the way of swapout... */
1293 if (!priority)
1294 disable_swap_token();
1295
1059 all_zones_ok = 1; 1296 all_zones_ok = 1;
1060 1297
1061 if (nr_pages == 0) { 1298 if (nr_pages == 0) {
@@ -1066,7 +1303,7 @@ loop_again:
1066 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1303 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1067 struct zone *zone = pgdat->node_zones + i; 1304 struct zone *zone = pgdat->node_zones + i;
1068 1305
1069 if (zone->present_pages == 0) 1306 if (!populated_zone(zone))
1070 continue; 1307 continue;
1071 1308
1072 if (zone->all_unreclaimable && 1309 if (zone->all_unreclaimable &&
@@ -1074,7 +1311,7 @@ loop_again:
1074 continue; 1311 continue;
1075 1312
1076 if (!zone_watermark_ok(zone, order, 1313 if (!zone_watermark_ok(zone, order,
1077 zone->pages_high, 0, 0, 0)) { 1314 zone->pages_high, 0, 0)) {
1078 end_zone = i; 1315 end_zone = i;
1079 goto scan; 1316 goto scan;
1080 } 1317 }
@@ -1103,7 +1340,7 @@ scan:
1103 struct zone *zone = pgdat->node_zones + i; 1340 struct zone *zone = pgdat->node_zones + i;
1104 int nr_slab; 1341 int nr_slab;
1105 1342
1106 if (zone->present_pages == 0) 1343 if (!populated_zone(zone))
1107 continue; 1344 continue;
1108 1345
1109 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1346 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1111,7 +1348,7 @@ scan:
1111 1348
1112 if (nr_pages == 0) { /* Not software suspend */ 1349 if (nr_pages == 0) { /* Not software suspend */
1113 if (!zone_watermark_ok(zone, order, 1350 if (!zone_watermark_ok(zone, order,
1114 zone->pages_high, end_zone, 0, 0)) 1351 zone->pages_high, end_zone, 0))
1115 all_zones_ok = 0; 1352 all_zones_ok = 0;
1116 } 1353 }
1117 zone->temp_priority = priority; 1354 zone->temp_priority = priority;
@@ -1220,7 +1457,7 @@ static int kswapd(void *p)
1220 * us from recursively trying to free more memory as we're 1457 * us from recursively trying to free more memory as we're
1221 * trying to free the first piece of memory in the first place). 1458 * trying to free the first piece of memory in the first place).
1222 */ 1459 */
1223 tsk->flags |= PF_MEMALLOC|PF_KSWAPD; 1460 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1224 1461
1225 order = 0; 1462 order = 0;
1226 for ( ; ; ) { 1463 for ( ; ; ) {
@@ -1255,11 +1492,11 @@ void wakeup_kswapd(struct zone *zone, int order)
1255{ 1492{
1256 pg_data_t *pgdat; 1493 pg_data_t *pgdat;
1257 1494
1258 if (zone->present_pages == 0) 1495 if (!populated_zone(zone))
1259 return; 1496 return;
1260 1497
1261 pgdat = zone->zone_pgdat; 1498 pgdat = zone->zone_pgdat;
1262 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) 1499 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1263 return; 1500 return;
1264 if (pgdat->kswapd_max_order < order) 1501 if (pgdat->kswapd_max_order < order)
1265 pgdat->kswapd_max_order = order; 1502 pgdat->kswapd_max_order = order;
@@ -1336,74 +1573,70 @@ static int __init kswapd_init(void)
1336 1573
1337module_init(kswapd_init) 1574module_init(kswapd_init)
1338 1575
1576#ifdef CONFIG_NUMA
1577/*
1578 * Zone reclaim mode
1579 *
1580 * If non-zero call zone_reclaim when the number of free pages falls below
1581 * the watermarks.
1582 *
1583 * In the future we may add flags to the mode. However, the page allocator
1584 * should only have to check that zone_reclaim_mode != 0 before calling
1585 * zone_reclaim().
1586 */
1587int zone_reclaim_mode __read_mostly;
1339 1588
1340/* 1589/*
 1590 * Minimum time between zone reclaim scans
1591 */
1592#define ZONE_RECLAIM_INTERVAL HZ/2
1593/*
1341 * Try to free up some pages from this zone through reclaim. 1594 * Try to free up some pages from this zone through reclaim.
1342 */ 1595 */
1343int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1596int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1344{ 1597{
1345 struct scan_control sc;
1346 int nr_pages = 1 << order; 1598 int nr_pages = 1 << order;
1347 int total_reclaimed = 0; 1599 struct task_struct *p = current;
1600 struct reclaim_state reclaim_state;
1601 struct scan_control sc = {
1602 .gfp_mask = gfp_mask,
1603 .may_writepage = 0,
1604 .may_swap = 0,
1605 .nr_mapped = read_page_state(nr_mapped),
1606 .nr_scanned = 0,
1607 .nr_reclaimed = 0,
1608 .priority = 0
1609 };
1348 1610
1349 /* The reclaim may sleep, so don't do it if sleep isn't allowed */ 1611 if (!(gfp_mask & __GFP_WAIT) ||
1350 if (!(gfp_mask & __GFP_WAIT)) 1612 zone->zone_pgdat->node_id != numa_node_id() ||
1351 return 0; 1613 zone->all_unreclaimable ||
1352 if (zone->all_unreclaimable) 1614 atomic_read(&zone->reclaim_in_progress) > 0)
1353 return 0; 1615 return 0;
1354 1616
1355 sc.gfp_mask = gfp_mask; 1617 if (time_before(jiffies,
1356 sc.may_writepage = 0; 1618 zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
1357 sc.may_swap = 0; 1619 return 0;
1358 sc.nr_mapped = read_page_state(nr_mapped); 1620
1359 sc.nr_scanned = 0; 1621 disable_swap_token();
1360 sc.nr_reclaimed = 0;
1361 /* scan at the highest priority */
1362 sc.priority = 0;
1363 1622
1364 if (nr_pages > SWAP_CLUSTER_MAX) 1623 if (nr_pages > SWAP_CLUSTER_MAX)
1365 sc.swap_cluster_max = nr_pages; 1624 sc.swap_cluster_max = nr_pages;
1366 else 1625 else
1367 sc.swap_cluster_max = SWAP_CLUSTER_MAX; 1626 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1368 1627
1369 /* Don't reclaim the zone if there are other reclaimers active */ 1628 cond_resched();
1370 if (atomic_read(&zone->reclaim_in_progress) > 0) 1629 p->flags |= PF_MEMALLOC;
1371 goto out; 1630 reclaim_state.reclaimed_slab = 0;
1372 1631 p->reclaim_state = &reclaim_state;
1373 shrink_zone(zone, &sc); 1632 shrink_zone(zone, &sc);
1374 total_reclaimed = sc.nr_reclaimed; 1633 p->reclaim_state = NULL;
1375 1634 current->flags &= ~PF_MEMALLOC;
1376 out:
1377 return total_reclaimed;
1378}
1379
1380asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1381 unsigned int state)
1382{
1383 struct zone *z;
1384 int i;
1385
1386 if (!capable(CAP_SYS_ADMIN))
1387 return -EACCES;
1388 1635
1389 if (node >= MAX_NUMNODES || !node_online(node)) 1636 if (sc.nr_reclaimed == 0)
1390 return -EINVAL; 1637 zone->last_unsuccessful_zone_reclaim = jiffies;
1391 1638
1392 /* This will break if we ever add more zones */ 1639 return sc.nr_reclaimed > nr_pages;
1393 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1394 return -EINVAL;
1395
1396 for (i = 0; i < MAX_NR_ZONES; i++) {
1397 if (!(zone & 1<<i))
1398 continue;
1399
1400 z = &NODE_DATA(node)->node_zones[i];
1401
1402 if (state)
1403 z->reclaim_pages = 1;
1404 else
1405 z->reclaim_pages = 0;
1406 }
1407
1408 return 0;
1409} 1640}
1641#endif
1642