author     Steven Whitehouse <swhiteho@redhat.com>   2006-03-31 15:34:58 -0500
committer  Steven Whitehouse <swhiteho@redhat.com>   2006-03-31 15:34:58 -0500
commit     86579dd06deecfa6ac88d5e84e4d63c397cd6f6d (patch)
tree       b4475d3ccde53015ad84a06e4e55e64591171b75 /mm
parent     7ea9ea832212c4a755650f7c7cc1ff0b63292a41 (diff)
parent     a0f067802576d4eb4c65d40b8ee7d6ea3c81dd61 (diff)
Merge branch 'master'
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   10
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/bootmem.c         |   41
-rw-r--r--  mm/fadvise.c         |   46
-rw-r--r--  mm/filemap.c         |   43
-rw-r--r--  mm/highmem.c         |   26
-rw-r--r--  mm/hugetlb.c         |  286
-rw-r--r--  mm/internal.h        |   34
-rw-r--r--  mm/memory.c          |   21
-rw-r--r--  mm/mempolicy.c       |  151
-rw-r--r--  mm/mempool.c         |   50
-rw-r--r--  mm/migrate.c         |  655
-rw-r--r--  mm/mmap.c            |   16
-rw-r--r--  mm/mmzone.c          |   50
-rw-r--r--  mm/mprotect.c        |   12
-rw-r--r--  mm/msync.c           |  139
-rw-r--r--  mm/nommu.c           |    4
-rw-r--r--  mm/page-writeback.c  |   64
-rw-r--r--  mm/page_alloc.c      |  180
-rw-r--r--  mm/readahead.c       |   33
-rw-r--r--  mm/rmap.c            |   14
-rw-r--r--  mm/shmem.c           |    7
-rw-r--r--  mm/slab.c            | 1233
-rw-r--r--  mm/slob.c            |   10
-rw-r--r--  mm/swap.c            |   66
-rw-r--r--  mm/swap_state.c      |    1
-rw-r--r--  mm/swapfile.c        |   59
-rw-r--r--  mm/util.c            |   47
-rw-r--r--  mm/vmscan.c          |  888
29 files changed, 2603 insertions, 1587 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae6409..332f5c29b53a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
137# support for page migration 137# support for page migration
138# 138#
139config MIGRATION 139config MIGRATION
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM 140 bool "Page migration"
141 depends on SWAP 141 def_bool y if NUMA
142 depends on SWAP && NUMA
143 help
144 Allows the migration of the physical location of pages of processes
145 while the virtual addresses are not changed. This is useful for
146 example on NUMA systems to put pages nearer to the processors accessing
147 the page.
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dcc3..0b8f73f2ed16 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o $(mmu-y) 13 prio_tree.o util.o mmzone.o $(mmu-y)
14 14
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o 22obj-$(CONFIG_SLAB) += slab.o
23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
24obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
25obj-$(CONFIG_MIGRATION) += migrate.o
26
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 35c32290f717..d3e3bd2ffcea 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so
33 * dma_get_required_mask(), which uses 33 * dma_get_required_mask(), which uses
34 * it, can be an inline function */ 34 * it, can be an inline function */
35 35
36static LIST_HEAD(bdata_list);
36#ifdef CONFIG_CRASH_DUMP 37#ifdef CONFIG_CRASH_DUMP
37/* 38/*
38 * If we have booted due to a crash, max_pfn will be a very low value. We need 39 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
52 53
53 return mapsize; 54 return mapsize;
54} 55}
56/*
57 * link bdata in order
58 */
59static void link_bootmem(bootmem_data_t *bdata)
60{
61 bootmem_data_t *ent;
62 if (list_empty(&bdata_list)) {
63 list_add(&bdata->list, &bdata_list);
64 return;
65 }
66 /* insert in order */
67 list_for_each_entry(ent, &bdata_list, list) {
68 if (bdata->node_boot_start < ent->node_boot_start) {
69 list_add_tail(&bdata->list, &ent->list);
70 return;
71 }
72 }
73 list_add_tail(&bdata->list, &bdata_list);
74 return;
75}
76
55 77
56/* 78/*
57 * Called once to set up the allocator itself. 79 * Called once to set up the allocator itself.
@@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
62 bootmem_data_t *bdata = pgdat->bdata; 84 bootmem_data_t *bdata = pgdat->bdata;
63 unsigned long mapsize = ((end - start)+7)/8; 85 unsigned long mapsize = ((end - start)+7)/8;
64 86
65 pgdat->pgdat_next = pgdat_list;
66 pgdat_list = pgdat;
67
68 mapsize = ALIGN(mapsize, sizeof(long)); 87 mapsize = ALIGN(mapsize, sizeof(long));
69 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); 88 bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
70 bdata->node_boot_start = (start << PAGE_SHIFT); 89 bdata->node_boot_start = (start << PAGE_SHIFT);
71 bdata->node_low_pfn = end; 90 bdata->node_low_pfn = end;
91 link_bootmem(bdata);
72 92
73 /* 93 /*
74 * Initially all pages are reserved - setup_arch() has to 94 * Initially all pages are reserved - setup_arch() has to
@@ -152,7 +172,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
152 * 172 *
153 * NOTE: This function is _not_ reentrant. 173 * NOTE: This function is _not_ reentrant.
154 */ 174 */
155static void * __init 175void * __init
156__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, 176__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
157 unsigned long align, unsigned long goal, unsigned long limit) 177 unsigned long align, unsigned long goal, unsigned long limit)
158{ 178{
@@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void)
383 403
384void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) 404void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
385{ 405{
386 pg_data_t *pgdat = pgdat_list; 406 bootmem_data_t *bdata;
387 void *ptr; 407 void *ptr;
388 408
389 for_each_pgdat(pgdat) 409 list_for_each_entry(bdata, &bdata_list, list)
390 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 410 if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0)))
391 align, goal, 0)))
392 return(ptr); 411 return(ptr);
393 412
394 /* 413 /*
@@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne
416 435
417void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) 436void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
418{ 437{
419 pg_data_t *pgdat = pgdat_list; 438 bootmem_data_t *bdata;
420 void *ptr; 439 void *ptr;
421 440
422 for_each_pgdat(pgdat) 441 list_for_each_entry(bdata, &bdata_list, list)
423 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 442 if ((ptr = __alloc_bootmem_core(bdata, size,
424 align, goal, LOW32LIMIT))) 443 align, goal, LOW32LIMIT)))
425 return(ptr); 444 return(ptr);
426 445
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e7704..907c39257ca0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/fadvise.h> 17#include <linux/fadvise.h>
18#include <linux/writeback.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19 20
20#include <asm/unistd.h> 21#include <asm/unistd.h>
@@ -22,13 +23,36 @@
22/* 23/*
23 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could 24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
24 * deactivate the pages and clear PG_Referenced. 25 * deactivate the pages and clear PG_Referenced.
26 *
27 * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
28 * offsets `offset' and `offset+len' inclusive. Any pages which are currently
29 * under writeout are skipped, whether or not they are dirty.
30 *
31 * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
32 * offsets `offset' and `offset+len'.
33 *
34 * By combining these two operations the application may do several things:
35 *
36 * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
37 *
38 * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
39 * dirty pages at the disk.
40 *
41 * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
42 * all of the currently dirty pages at the disk, wait until they have been
43 * written.
44 *
45 * It should be noted that none of these operations write out the file's
46 * metadata. So unless the application is strictly performing overwrites of
47 * already-instantiated disk blocks, there are no guarantees here that the data
48 * will be available after a crash.
25 */ 49 */
26asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) 50asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
27{ 51{
28 struct file *file = fget(fd); 52 struct file *file = fget(fd);
29 struct address_space *mapping; 53 struct address_space *mapping;
30 struct backing_dev_info *bdi; 54 struct backing_dev_info *bdi;
31 loff_t endbyte; 55 loff_t endbyte; /* inclusive */
32 pgoff_t start_index; 56 pgoff_t start_index;
33 pgoff_t end_index; 57 pgoff_t end_index;
34 unsigned long nrpages; 58 unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
56 endbyte = offset + len; 80 endbyte = offset + len;
57 if (!len || endbyte < len) 81 if (!len || endbyte < len)
58 endbyte = -1; 82 endbyte = -1;
83 else
84 endbyte--; /* inclusive */
59 85
60 bdi = mapping->backing_dev_info; 86 bdi = mapping->backing_dev_info;
61 87
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
78 104
79 /* First and last PARTIAL page! */ 105 /* First and last PARTIAL page! */
80 start_index = offset >> PAGE_CACHE_SHIFT; 106 start_index = offset >> PAGE_CACHE_SHIFT;
81 end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; 107 end_index = endbyte >> PAGE_CACHE_SHIFT;
82 108
83 /* Careful about overflow on the "+1" */ 109 /* Careful about overflow on the "+1" */
84 nrpages = end_index - start_index + 1; 110 nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
96 filemap_flush(mapping); 122 filemap_flush(mapping);
97 123
98 /* First and last FULL page! */ 124 /* First and last FULL page! */
99 start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 125 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
100 end_index = (endbyte >> PAGE_CACHE_SHIFT); 126 end_index = (endbyte >> PAGE_CACHE_SHIFT);
101 127
102 if (end_index > start_index) 128 if (end_index >= start_index)
103 invalidate_mapping_pages(mapping, start_index, end_index-1); 129 invalidate_mapping_pages(mapping, start_index,
130 end_index);
131 break;
132 case LINUX_FADV_ASYNC_WRITE:
133 ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
134 WB_SYNC_NONE);
135 break;
136 case LINUX_FADV_WRITE_WAIT:
137 ret = wait_on_page_writeback_range(mapping,
138 offset >> PAGE_CACHE_SHIFT,
139 endbyte >> PAGE_CACHE_SHIFT);
104 break; 140 break;
105 default: 141 default:
106 ret = -EINVAL; 142 ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 7624c26fcea6..1120338a5d0f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,10 @@
29#include <linux/blkdev.h> 29#include <linux/blkdev.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/cpuset.h>
32#include "filemap.h" 33#include "filemap.h"
34#include "internal.h"
35
33/* 36/*
34 * FIXME: remove all knowledge of the buffer layer from the core VM 37 * FIXME: remove all knowledge of the buffer layer from the core VM
35 */ 38 */
@@ -172,7 +175,7 @@ static int sync_page(void *word)
172 * dirty pages that lie within the byte offsets <start, end> 175 * dirty pages that lie within the byte offsets <start, end>
173 * @mapping: address space structure to write 176 * @mapping: address space structure to write
174 * @start: offset in bytes where the range starts 177 * @start: offset in bytes where the range starts
175 * @end: offset in bytes where the range ends 178 * @end: offset in bytes where the range ends (inclusive)
176 * @sync_mode: enable synchronous operation 179 * @sync_mode: enable synchronous operation
177 * 180 *
178 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 181 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -180,8 +183,8 @@ static int sync_page(void *word)
180 * these two operations is that if a dirty page/buffer is encountered, it must 183 * these two operations is that if a dirty page/buffer is encountered, it must
181 * be waited upon, and not just skipped over. 184 * be waited upon, and not just skipped over.
182 */ 185 */
183static int __filemap_fdatawrite_range(struct address_space *mapping, 186int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
184 loff_t start, loff_t end, int sync_mode) 187 loff_t end, int sync_mode)
185{ 188{
186 int ret; 189 int ret;
187 struct writeback_control wbc = { 190 struct writeback_control wbc = {
@@ -210,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
210} 213}
211EXPORT_SYMBOL(filemap_fdatawrite); 214EXPORT_SYMBOL(filemap_fdatawrite);
212 215
213static int filemap_fdatawrite_range(struct address_space *mapping, 216static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
214 loff_t start, loff_t end) 217 loff_t end)
215{ 218{
216 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
217} 220}
@@ -230,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
230 * Wait for writeback to complete against pages indexed by start->end 233 * Wait for writeback to complete against pages indexed by start->end
231 * inclusive 234 * inclusive
232 */ 235 */
233static int wait_on_page_writeback_range(struct address_space *mapping, 236int wait_on_page_writeback_range(struct address_space *mapping,
234 pgoff_t start, pgoff_t end) 237 pgoff_t start, pgoff_t end)
235{ 238{
236 struct pagevec pvec; 239 struct pagevec pvec;
@@ -365,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping)
365} 368}
366EXPORT_SYMBOL(filemap_write_and_wait); 369EXPORT_SYMBOL(filemap_write_and_wait);
367 370
371/*
372 * Write out and wait upon file offsets lstart->lend, inclusive.
373 *
374 * Note that `lend' is inclusive (describes the last byte to be written) so
375 * that this function can be used to write to the very end-of-file (end = -1).
376 */
368int filemap_write_and_wait_range(struct address_space *mapping, 377int filemap_write_and_wait_range(struct address_space *mapping,
369 loff_t lstart, loff_t lend) 378 loff_t lstart, loff_t lend)
370{ 379{
@@ -425,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
425 return ret; 434 return ret;
426} 435}
427 436
437#ifdef CONFIG_NUMA
438struct page *page_cache_alloc(struct address_space *x)
439{
440 if (cpuset_do_page_mem_spread()) {
441 int n = cpuset_mem_spread_node();
442 return alloc_pages_node(n, mapping_gfp_mask(x), 0);
443 }
444 return alloc_pages(mapping_gfp_mask(x), 0);
445}
446EXPORT_SYMBOL(page_cache_alloc);
447
448struct page *page_cache_alloc_cold(struct address_space *x)
449{
450 if (cpuset_do_page_mem_spread()) {
451 int n = cpuset_mem_spread_node();
452 return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
453 }
454 return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
455}
456EXPORT_SYMBOL(page_cache_alloc_cold);
457#endif
458
428/* 459/*
429 * In order to wait for pages to become available there must be 460 * In order to wait for pages to become available there must be
430 * waitqueues associated with pages. By using a hash table of 461 * waitqueues associated with pages. By using a hash table of
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa7..55885f64af40 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,18 +26,14 @@
26#include <linux/init.h> 26#include <linux/init.h>
27#include <linux/hash.h> 27#include <linux/hash.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/blktrace_api.h>
29#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
30 31
31static mempool_t *page_pool, *isa_page_pool; 32static mempool_t *page_pool, *isa_page_pool;
32 33
33static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data) 34static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
34{ 35{
35 return alloc_page(gfp_mask | GFP_DMA); 36 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
36}
37
38static void page_pool_free(void *page, void *data)
39{
40 __free_page(page);
41} 37}
42 38
43/* 39/*
@@ -50,11 +46,6 @@ static void page_pool_free(void *page, void *data)
50 */ 46 */
51#ifdef CONFIG_HIGHMEM 47#ifdef CONFIG_HIGHMEM
52 48
53static void *page_pool_alloc(gfp_t gfp_mask, void *data)
54{
55 return alloc_page(gfp_mask);
56}
57
58static int pkmap_count[LAST_PKMAP]; 49static int pkmap_count[LAST_PKMAP];
59static unsigned int last_pkmap_nr; 50static unsigned int last_pkmap_nr;
60static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); 51static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -228,7 +219,7 @@ static __init int init_emergency_pool(void)
228 if (!i.totalhigh) 219 if (!i.totalhigh)
229 return 0; 220 return 0;
230 221
231 page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); 222 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
232 if (!page_pool) 223 if (!page_pool)
233 BUG(); 224 BUG();
234 printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 225 printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
@@ -271,7 +262,8 @@ int init_emergency_isa_pool(void)
271 if (isa_page_pool) 262 if (isa_page_pool)
272 return 0; 263 return 0;
273 264
274 isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL); 265 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
266 mempool_free_pages, (void *) 0);
275 if (!isa_page_pool) 267 if (!isa_page_pool)
276 BUG(); 268 BUG();
277 269
@@ -336,7 +328,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
336 bio_put(bio); 328 bio_put(bio);
337} 329}
338 330
339static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) 331static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
340{ 332{
341 if (bio->bi_size) 333 if (bio->bi_size)
342 return 1; 334 return 1;
@@ -383,7 +375,7 @@ static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int
383} 375}
384 376
385static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, 377static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
386 mempool_t *pool) 378 mempool_t *pool)
387{ 379{
388 struct page *page; 380 struct page *page;
389 struct bio *bio = NULL; 381 struct bio *bio = NULL;
@@ -483,6 +475,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
483 pool = isa_page_pool; 475 pool = isa_page_pool;
484 } 476 }
485 477
478 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
479
486 /* 480 /*
487 * slow path 481 * slow path
488 */ 482 */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2c..ebad6bbb3501 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,24 +13,48 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 15#include <linux/cpuset.h>
16#include <linux/mutex.h>
16 17
17#include <asm/page.h> 18#include <asm/page.h>
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
20#include <linux/hugetlb.h> 21#include <linux/hugetlb.h>
22#include "internal.h"
21 23
22const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
23static unsigned long nr_huge_pages, free_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
24unsigned long max_huge_pages; 26unsigned long max_huge_pages;
25static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
26static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
27static unsigned int free_huge_pages_node[MAX_NUMNODES]; 29static unsigned int free_huge_pages_node[MAX_NUMNODES];
28
29/* 30/*
30 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 31 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
31 */ 32 */
32static DEFINE_SPINLOCK(hugetlb_lock); 33static DEFINE_SPINLOCK(hugetlb_lock);
33 34
35static void clear_huge_page(struct page *page, unsigned long addr)
36{
37 int i;
38
39 might_sleep();
40 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
41 cond_resched();
42 clear_user_highpage(page + i, addr);
43 }
44}
45
46static void copy_huge_page(struct page *dst, struct page *src,
47 unsigned long addr)
48{
49 int i;
50
51 might_sleep();
52 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
53 cond_resched();
54 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
55 }
56}
57
34static void enqueue_huge_page(struct page *page) 58static void enqueue_huge_page(struct page *page)
35{ 59{
36 int nid = page_to_nid(page); 60 int nid = page_to_nid(page);
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
64 return page; 88 return page;
65} 89}
66 90
67static struct page *alloc_fresh_huge_page(void) 91static void free_huge_page(struct page *page)
92{
93 BUG_ON(page_count(page));
94
95 INIT_LIST_HEAD(&page->lru);
96
97 spin_lock(&hugetlb_lock);
98 enqueue_huge_page(page);
99 spin_unlock(&hugetlb_lock);
100}
101
102static int alloc_fresh_huge_page(void)
68{ 103{
69 static int nid = 0; 104 static int nid = 0;
70 struct page *page; 105 struct page *page;
71 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, 106 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
72 HUGETLB_PAGE_ORDER); 107 HUGETLB_PAGE_ORDER);
73 nid = (nid + 1) % num_online_nodes(); 108 nid = next_node(nid, node_online_map);
109 if (nid == MAX_NUMNODES)
110 nid = first_node(node_online_map);
74 if (page) { 111 if (page) {
112 page[1].lru.next = (void *)free_huge_page; /* dtor */
75 spin_lock(&hugetlb_lock); 113 spin_lock(&hugetlb_lock);
76 nr_huge_pages++; 114 nr_huge_pages++;
77 nr_huge_pages_node[page_to_nid(page)]++; 115 nr_huge_pages_node[page_to_nid(page)]++;
78 spin_unlock(&hugetlb_lock); 116 spin_unlock(&hugetlb_lock);
117 put_page(page); /* free it into the hugepage allocator */
118 return 1;
79 } 119 }
80 return page; 120 return 0;
81} 121}
82 122
83void free_huge_page(struct page *page) 123static struct page *alloc_huge_page(struct vm_area_struct *vma,
124 unsigned long addr)
84{ 125{
85 BUG_ON(page_count(page)); 126 struct inode *inode = vma->vm_file->f_dentry->d_inode;
127 struct page *page;
128 int use_reserve = 0;
129 unsigned long idx;
86 130
87 INIT_LIST_HEAD(&page->lru); 131 spin_lock(&hugetlb_lock);
88 page[1].lru.next = NULL; /* reset dtor */ 132
133 if (vma->vm_flags & VM_MAYSHARE) {
134
135 /* idx = radix tree index, i.e. offset into file in
136 * HPAGE_SIZE units */
137 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
138 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
139
140 /* The hugetlbfs specific inode info stores the number
141 * of "guaranteed available" (huge) pages. That is,
142 * the first 'prereserved_hpages' pages of the inode
143 * are either already instantiated, or have been
144 * pre-reserved (by hugetlb_reserve_for_inode()). Here
145 * we're in the process of instantiating the page, so
146 * we use this to determine whether to draw from the
147 * pre-reserved pool or the truly free pool. */
148 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
149 use_reserve = 1;
150 }
151
152 if (!use_reserve) {
153 if (free_huge_pages <= reserved_huge_pages)
154 goto fail;
155 } else {
156 BUG_ON(reserved_huge_pages == 0);
157 reserved_huge_pages--;
158 }
159
160 page = dequeue_huge_page(vma, addr);
161 if (!page)
162 goto fail;
163
164 spin_unlock(&hugetlb_lock);
165 set_page_refcounted(page);
166 return page;
167
168 fail:
169 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
170 spin_unlock(&hugetlb_lock);
171 return NULL;
172}
173
174/* hugetlb_extend_reservation()
175 *
176 * Ensure that at least 'atleast' hugepages are, and will remain,
177 * available to instantiate the first 'atleast' pages of the given
178 * inode. If the inode doesn't already have this many pages reserved
179 * or instantiated, set aside some hugepages in the reserved pool to
180 * satisfy later faults (or fail now if there aren't enough, rather
181 * than getting the SIGBUS later).
182 */
183int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
184 unsigned long atleast)
185{
186 struct inode *inode = &info->vfs_inode;
187 unsigned long change_in_reserve = 0;
188 int ret = 0;
89 189
90 spin_lock(&hugetlb_lock); 190 spin_lock(&hugetlb_lock);
91 enqueue_huge_page(page); 191 read_lock_irq(&inode->i_mapping->tree_lock);
192
193 if (info->prereserved_hpages >= atleast)
194 goto out;
195
196 /* Because we always call this on shared mappings, none of the
197 * pages beyond info->prereserved_hpages can have been
198 * instantiated, so we need to reserve all of them now. */
199 change_in_reserve = atleast - info->prereserved_hpages;
200
201 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
202 ret = -ENOMEM;
203 goto out;
204 }
205
206 reserved_huge_pages += change_in_reserve;
207 info->prereserved_hpages = atleast;
208
209 out:
210 read_unlock_irq(&inode->i_mapping->tree_lock);
92 spin_unlock(&hugetlb_lock); 211 spin_unlock(&hugetlb_lock);
212
213 return ret;
93} 214}
94 215
95struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) 216/* hugetlb_truncate_reservation()
217 *
218 * This returns pages reserved for the given inode to the general free
219 * hugepage pool. If the inode has any pages prereserved, but not
220 * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
221 * them.
222 */
223void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
224 unsigned long atmost)
96{ 225{
226 struct inode *inode = &info->vfs_inode;
227 struct address_space *mapping = inode->i_mapping;
228 unsigned long idx;
229 unsigned long change_in_reserve = 0;
97 struct page *page; 230 struct page *page;
98 int i;
99 231
100 spin_lock(&hugetlb_lock); 232 spin_lock(&hugetlb_lock);
101 page = dequeue_huge_page(vma, addr); 233 read_lock_irq(&inode->i_mapping->tree_lock);
102 if (!page) { 234
103 spin_unlock(&hugetlb_lock); 235 if (info->prereserved_hpages <= atmost)
104 return NULL; 236 goto out;
237
238 /* Count pages which were reserved, but not instantiated, and
239 * which we can now release. */
240 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
241 page = radix_tree_lookup(&mapping->page_tree, idx);
242 if (!page)
243 /* Pages which are already instantiated can't
244 * be unreserved (and in fact have already
245 * been removed from the reserved pool) */
246 change_in_reserve++;
105 } 247 }
248
249 BUG_ON(reserved_huge_pages < change_in_reserve);
250 reserved_huge_pages -= change_in_reserve;
251 info->prereserved_hpages = atmost;
252
253 out:
254 read_unlock_irq(&inode->i_mapping->tree_lock);
106 spin_unlock(&hugetlb_lock); 255 spin_unlock(&hugetlb_lock);
107 set_page_count(page, 1);
108 page[1].lru.next = (void *)free_huge_page; /* set dtor */
109 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
110 clear_user_highpage(&page[i], addr);
111 return page;
112} 256}
113 257
114static int __init hugetlb_init(void) 258static int __init hugetlb_init(void)
115{ 259{
116 unsigned long i; 260 unsigned long i;
117 struct page *page;
118 261
119 if (HPAGE_SHIFT == 0) 262 if (HPAGE_SHIFT == 0)
120 return 0; 263 return 0;
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void)
123 INIT_LIST_HEAD(&hugepage_freelists[i]); 266 INIT_LIST_HEAD(&hugepage_freelists[i]);
124 267
125 for (i = 0; i < max_huge_pages; ++i) { 268 for (i = 0; i < max_huge_pages; ++i) {
126 page = alloc_fresh_huge_page(); 269 if (!alloc_fresh_huge_page())
127 if (!page)
128 break; 270 break;
129 spin_lock(&hugetlb_lock);
130 enqueue_huge_page(page);
131 spin_unlock(&hugetlb_lock);
132 } 271 }
133 max_huge_pages = free_huge_pages = nr_huge_pages = i; 272 max_huge_pages = free_huge_pages = nr_huge_pages = i;
134 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); 273 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page)
154 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 293 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
155 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 294 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
156 1 << PG_private | 1<< PG_writeback); 295 1 << PG_private | 1<< PG_writeback);
157 set_page_count(&page[i], 0);
158 } 296 }
159 set_page_count(page, 1); 297 page[1].lru.next = NULL;
298 set_page_refcounted(page);
160 __free_pages(page, HUGETLB_PAGE_ORDER); 299 __free_pages(page, HUGETLB_PAGE_ORDER);
161} 300}
162 301
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count)
188static unsigned long set_max_huge_pages(unsigned long count) 327static unsigned long set_max_huge_pages(unsigned long count)
189{ 328{
190 while (count > nr_huge_pages) { 329 while (count > nr_huge_pages) {
191 struct page *page = alloc_fresh_huge_page(); 330 if (!alloc_fresh_huge_page())
192 if (!page)
193 return nr_huge_pages; 331 return nr_huge_pages;
194 spin_lock(&hugetlb_lock);
195 enqueue_huge_page(page);
196 spin_unlock(&hugetlb_lock);
197 } 332 }
198 if (count >= nr_huge_pages) 333 if (count >= nr_huge_pages)
199 return nr_huge_pages; 334 return nr_huge_pages;
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf)
225 return sprintf(buf, 360 return sprintf(buf,
226 "HugePages_Total: %5lu\n" 361 "HugePages_Total: %5lu\n"
227 "HugePages_Free: %5lu\n" 362 "HugePages_Free: %5lu\n"
363 "HugePages_Rsvd: %5lu\n"
228 "Hugepagesize: %5lu kB\n", 364 "Hugepagesize: %5lu kB\n",
229 nr_huge_pages, 365 nr_huge_pages,
230 free_huge_pages, 366 free_huge_pages,
367 reserved_huge_pages,
231 HPAGE_SIZE/1024); 368 HPAGE_SIZE/1024);
232} 369}
233 370
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
240 nid, free_huge_pages_node[nid]); 377 nid, free_huge_pages_node[nid]);
241} 378}
242 379
243int is_hugepage_mem_enough(size_t size)
244{
245 return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
246}
247
248/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 380/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
249unsigned long hugetlb_total_pages(void) 381unsigned long hugetlb_total_pages(void)
250{ 382{
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
374 unsigned long address, pte_t *ptep, pte_t pte) 506 unsigned long address, pte_t *ptep, pte_t pte)
375{ 507{
376 struct page *old_page, *new_page; 508 struct page *old_page, *new_page;
377 int i, avoidcopy; 509 int avoidcopy;
378 510
379 old_page = pte_page(pte); 511 old_page = pte_page(pte);
380 512
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
395 } 527 }
396 528
397 spin_unlock(&mm->page_table_lock); 529 spin_unlock(&mm->page_table_lock);
398 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) 530 copy_huge_page(new_page, old_page, address);
399 copy_user_highpage(new_page + i, old_page + i,
400 address + i*PAGE_SIZE);
401 spin_lock(&mm->page_table_lock); 531 spin_lock(&mm->page_table_lock);
402 532
403 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 533 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -442,6 +572,7 @@ retry:
442 ret = VM_FAULT_OOM; 572 ret = VM_FAULT_OOM;
443 goto out; 573 goto out;
444 } 574 }
575 clear_huge_page(page, address);
445 576
446 if (vma->vm_flags & VM_SHARED) { 577 if (vma->vm_flags & VM_SHARED) {
447 int err; 578 int err;
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
496 pte_t *ptep; 627 pte_t *ptep;
497 pte_t entry; 628 pte_t entry;
498 int ret; 629 int ret;
630 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
499 631
500 ptep = huge_pte_alloc(mm, address); 632 ptep = huge_pte_alloc(mm, address);
501 if (!ptep) 633 if (!ptep)
502 return VM_FAULT_OOM; 634 return VM_FAULT_OOM;
503 635
636 /*
637 * Serialize hugepage allocation and instantiation, so that we don't
638 * get spurious allocation failures if two CPUs race to instantiate
639 * the same page in the page cache.
640 */
641 mutex_lock(&hugetlb_instantiation_mutex);
504 entry = *ptep; 642 entry = *ptep;
505 if (pte_none(entry)) 643 if (pte_none(entry)) {
506 return hugetlb_no_page(mm, vma, address, ptep, write_access); 644 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
645 mutex_unlock(&hugetlb_instantiation_mutex);
646 return ret;
647 }
507 648
508 ret = VM_FAULT_MINOR; 649 ret = VM_FAULT_MINOR;
509 650
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
513 if (write_access && !pte_write(entry)) 654 if (write_access && !pte_write(entry))
514 ret = hugetlb_cow(mm, vma, address, ptep, entry); 655 ret = hugetlb_cow(mm, vma, address, ptep, entry);
515 spin_unlock(&mm->page_table_lock); 656 spin_unlock(&mm->page_table_lock);
657 mutex_unlock(&hugetlb_instantiation_mutex);
516 658
517 return ret; 659 return ret;
518} 660}
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
521 struct page **pages, struct vm_area_struct **vmas, 663 struct page **pages, struct vm_area_struct **vmas,
522 unsigned long *position, int *length, int i) 664 unsigned long *position, int *length, int i)
523{ 665{
524 unsigned long vpfn, vaddr = *position; 666 unsigned long pfn_offset;
667 unsigned long vaddr = *position;
525 int remainder = *length; 668 int remainder = *length;
526 669
527 vpfn = vaddr/PAGE_SIZE;
528 spin_lock(&mm->page_table_lock); 670 spin_lock(&mm->page_table_lock);
529 while (vaddr < vma->vm_end && remainder) { 671 while (vaddr < vma->vm_end && remainder) {
530 pte_t *pte; 672 pte_t *pte;
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
552 break; 694 break;
553 } 695 }
554 696
555 if (pages) { 697 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
556 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; 698 page = pte_page(*pte);
557 get_page(page); 699same_page:
558 pages[i] = page; 700 get_page(page);
559 } 701 if (pages)
702 pages[i] = page + pfn_offset;
560 703
561 if (vmas) 704 if (vmas)
562 vmas[i] = vma; 705 vmas[i] = vma;
563 706
564 vaddr += PAGE_SIZE; 707 vaddr += PAGE_SIZE;
565 ++vpfn; 708 ++pfn_offset;
566 --remainder; 709 --remainder;
567 ++i; 710 ++i;
711 if (vaddr < vma->vm_end && remainder &&
712 pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
713 /*
714 * We use pfn_offset to avoid touching the pageframes
715 * of this compound page.
716 */
717 goto same_page;
718 }
568 } 719 }
569 spin_unlock(&mm->page_table_lock); 720 spin_unlock(&mm->page_table_lock);
570 *length = remainder; 721 *length = remainder;
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
572 723
573 return i; 724 return i;
574} 725}
726
727void hugetlb_change_protection(struct vm_area_struct *vma,
728 unsigned long address, unsigned long end, pgprot_t newprot)
729{
730 struct mm_struct *mm = vma->vm_mm;
731 unsigned long start = address;
732 pte_t *ptep;
733 pte_t pte;
734
735 BUG_ON(address >= end);
736 flush_cache_range(vma, address, end);
737
738 spin_lock(&mm->page_table_lock);
739 for (; address < end; address += HPAGE_SIZE) {
740 ptep = huge_pte_offset(mm, address);
741 if (!ptep)
742 continue;
743 if (!pte_none(*ptep)) {
744 pte = huge_ptep_get_and_clear(mm, address, ptep);
745 pte = pte_mkhuge(pte_modify(pte, newprot));
746 set_huge_pte_at(mm, address, ptep, pte);
747 lazy_mmu_prot_update(pte);
748 }
749 }
750 spin_unlock(&mm->page_table_lock);
751
752 flush_tlb_range(vma, start, end);
753}
754
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4ef..d20e3cc4aef0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,23 +8,33 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11#ifndef __MM_INTERNAL_H
12#define __MM_INTERNAL_H
11 13
12static inline void set_page_refs(struct page *page, int order) 14#include <linux/mm.h>
15
16static inline void set_page_count(struct page *page, int v)
17{
18 atomic_set(&page->_count, v);
19}
20
21/*
22 * Turn a non-refcounted page (->_count == 0) into refcounted with
23 * a count of one.
24 */
25static inline void set_page_refcounted(struct page *page)
13{ 26{
14#ifdef CONFIG_MMU 27 BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
28 BUG_ON(atomic_read(&page->_count));
15 set_page_count(page, 1); 29 set_page_count(page, 1);
16#else 30}
17 int i;
18 31
19 /* 32static inline void __put_page(struct page *page)
20 * We need to reference all the pages for this order, otherwise if 33{
21 * anyone accesses one of the pages with (get/put) it will be freed. 34 atomic_dec(&page->_count);
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27} 35}
28 36
29extern void fastcall __init __free_pages_bootmem(struct page *page, 37extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order); 38 unsigned int order);
39
40#endif
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db29..8d8f52569f32 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
277 anon_vma_unlink(vma); 277 anon_vma_unlink(vma);
278 unlink_file_vma(vma); 278 unlink_file_vma(vma);
279 279
280 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { 280 if (is_vm_hugetlb_page(vma)) {
281 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 281 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
282 floor, next? next->vm_start: ceiling); 282 floor, next? next->vm_start: ceiling);
283 } else { 283 } else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
285 * Optimization: gather nearby vmas into one call down 285 * Optimization: gather nearby vmas into one call down
286 */ 286 */
287 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 287 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
288 && !is_hugepage_only_range(vma->vm_mm, next->vm_start, 288 && !is_vm_hugetlb_page(next)) {
289 HPAGE_SIZE)) {
290 vma = next; 289 vma = next;
291 next = vma->vm_next; 290 next = vma->vm_next;
292 anon_vma_unlink(vma); 291 anon_vma_unlink(vma);
@@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
388{ 387{
389 unsigned long pfn = pte_pfn(pte); 388 unsigned long pfn = pte_pfn(pte);
390 389
391 if (vma->vm_flags & VM_PFNMAP) { 390 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
392 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; 391 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
393 if (pfn == vma->vm_pgoff + off) 392 if (pfn == vma->vm_pgoff + off)
394 return NULL; 393 return NULL;
@@ -401,8 +400,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
401 * we should just do "return pfn_to_page(pfn)", but 400 * we should just do "return pfn_to_page(pfn)", but
402 * in the meantime we check that we get a valid pfn, 401 * in the meantime we check that we get a valid pfn,
403 * and that the resulting page looks ok. 402 * and that the resulting page looks ok.
404 *
405 * Remove this test eventually!
406 */ 403 */
407 if (unlikely(!pfn_valid(pfn))) { 404 if (unlikely(!pfn_valid(pfn))) {
408 print_bad_pte(vma, pte, addr); 405 print_bad_pte(vma, pte, addr);
@@ -1074,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1074 } 1071 }
1075 if (pages) { 1072 if (pages) {
1076 pages[i] = page; 1073 pages[i] = page;
1074
1075 flush_anon_page(page, start);
1077 flush_dcache_page(page); 1076 flush_dcache_page(page);
1078 } 1077 }
1079 if (vmas) 1078 if (vmas)
@@ -1221,9 +1220,7 @@ out:
1221 * The page has to be a nice clean _individual_ kernel allocation. 1220 * The page has to be a nice clean _individual_ kernel allocation.
1222 * If you allocate a compound page, you need to have marked it as 1221 * If you allocate a compound page, you need to have marked it as
1223 * such (__GFP_COMP), or manually just split the page up yourself 1222 * such (__GFP_COMP), or manually just split the page up yourself
1224 * (which is mainly an issue of doing "set_page_count(page, 1)" for 1223 * (see split_page()).
1225 * each sub-page, and then freeing them one by one when you free
1226 * them rather than freeing it as a compound page).
1227 * 1224 *
1228 * NOTE! Traditionally this was done with "remap_pfn_range()" which 1225 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1229 * took an arbitrary page protection parameter. This doesn't allow 1226 * took an arbitrary page protection parameter. This doesn't allow
@@ -2357,10 +2354,8 @@ int make_pages_present(unsigned long addr, unsigned long end)
2357 if (!vma) 2354 if (!vma)
2358 return -1; 2355 return -1;
2359 write = (vma->vm_flags & VM_WRITE) != 0; 2356 write = (vma->vm_flags & VM_WRITE) != 0;
2360 if (addr >= end) 2357 BUG_ON(addr >= end);
2361 BUG(); 2358 BUG_ON(end > vma->vm_end);
2362 if (end > vma->vm_end)
2363 BUG();
2364 len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; 2359 len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
2365 ret = get_user_pages(current, current->mm, addr, 2360 ret = get_user_pages(current, current->mm, addr,
2366 len, write, 0, NULL, NULL); 2361 len, write, 0, NULL, NULL);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a39f0b..dec8249e972d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
86#include <linux/swap.h> 86#include <linux/swap.h>
87#include <linux/seq_file.h> 87#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 88#include <linux/proc_fs.h>
89#include <linux/migrate.h>
89 90
90#include <asm/tlbflush.h> 91#include <asm/tlbflush.h>
91#include <asm/uaccess.h> 92#include <asm/uaccess.h>
@@ -95,11 +96,8 @@
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 96#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ 97#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97 98
98/* The number of pages to migrate per call to migrate_pages() */ 99static struct kmem_cache *policy_cache;
99#define MIGRATE_CHUNK_SIZE 256 100static struct kmem_cache *sn_cache;
100
101static kmem_cache_t *policy_cache;
102static kmem_cache_t *sn_cache;
103 101
104#define PDprintk(fmt...) 102#define PDprintk(fmt...)
105 103
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
331 struct vm_area_struct *first, *vma, *prev; 329 struct vm_area_struct *first, *vma, *prev;
332 330
333 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 331 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
334 /* Must have swap device for migration */
335 if (nr_swap_pages <= 0)
336 return ERR_PTR(-ENODEV);
337 332
338 /* 333 err = migrate_prep();
339 * Clear the LRU lists so pages can be isolated. 334 if (err)
340 * Note that pages may be moved off the LRU after we have 335 return ERR_PTR(err);
341 * drained them. Those pages will fail to migrate like other
342 * pages that may be busy.
343 */
344 lru_add_drain_all();
345 } 336 }
346 337
347 first = find_vma(mm, start); 338 first = find_vma(mm, start);
@@ -431,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
431 return mpol_check_policy(mode, nodes); 422 return mpol_check_policy(mode, nodes);
432} 423}
433 424
425
426/*
427 * Update task->flags PF_MEMPOLICY bit: set iff non-default
428 * mempolicy. Allows more rapid checking of this (combined perhaps
429 * with other PF_* flag bits) on memory allocation hot code paths.
430 *
431 * If called from outside this file, the task 'p' should -only- be
432 * a newly forked child not yet visible on the task list, because
433 * manipulating the task flags of a visible task is not safe.
434 *
435 * The above limitation is why this routine has the funny name
436 * mpol_fix_fork_child_flag().
437 *
438 * It is also safe to call this with a task pointer of current,
439 * which the static wrapper mpol_set_task_struct_flag() does,
440 * for use within this file.
441 */
442
443void mpol_fix_fork_child_flag(struct task_struct *p)
444{
445 if (p->mempolicy)
446 p->flags |= PF_MEMPOLICY;
447 else
448 p->flags &= ~PF_MEMPOLICY;
449}
450
451static void mpol_set_task_struct_flag(void)
452{
453 mpol_fix_fork_child_flag(current);
454}
455
434/* Set the process memory policy */ 456/* Set the process memory policy */
435long do_set_mempolicy(int mode, nodemask_t *nodes) 457long do_set_mempolicy(int mode, nodemask_t *nodes)
436{ 458{
@@ -443,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
443 return PTR_ERR(new); 465 return PTR_ERR(new);
444 mpol_free(current->mempolicy); 466 mpol_free(current->mempolicy);
445 current->mempolicy = new; 467 current->mempolicy = new;
468 mpol_set_task_struct_flag();
446 if (new && new->policy == MPOL_INTERLEAVE) 469 if (new && new->policy == MPOL_INTERLEAVE)
447 current->il_next = first_node(new->v.nodes); 470 current->il_next = first_node(new->v.nodes);
448 return 0; 471 return 0;
@@ -550,92 +573,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
550 return err; 573 return err;
551} 574}
552 575
576#ifdef CONFIG_MIGRATION
553/* 577/*
554 * page migration 578 * page migration
555 */ 579 */
556
557static void migrate_page_add(struct page *page, struct list_head *pagelist, 580static void migrate_page_add(struct page *page, struct list_head *pagelist,
558 unsigned long flags) 581 unsigned long flags)
559{ 582{
560 /* 583 /*
561 * Avoid migrating a page that is shared with others. 584 * Avoid migrating a page that is shared with others.
562 */ 585 */
563 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 586 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
564 if (isolate_lru_page(page)) 587 isolate_lru_page(page, pagelist);
565 list_add_tail(&page->lru, pagelist);
566 }
567}
568
569/*
570 * Migrate the list 'pagelist' of pages to a certain destination.
571 *
572 * Specify destination with either non-NULL vma or dest_node >= 0
573 * Return the number of pages not migrated or error code
574 */
575static int migrate_pages_to(struct list_head *pagelist,
576 struct vm_area_struct *vma, int dest)
577{
578 LIST_HEAD(newlist);
579 LIST_HEAD(moved);
580 LIST_HEAD(failed);
581 int err = 0;
582 unsigned long offset = 0;
583 int nr_pages;
584 struct page *page;
585 struct list_head *p;
586
587redo:
588 nr_pages = 0;
589 list_for_each(p, pagelist) {
590 if (vma) {
591 /*
592 * The address passed to alloc_page_vma is used to
593 * generate the proper interleave behavior. We fake
594 * the address here by an increasing offset in order
595 * to get the proper distribution of pages.
596 *
597 * No decision has been made as to which page
598 * a certain old page is moved to so we cannot
599 * specify the correct address.
600 */
601 page = alloc_page_vma(GFP_HIGHUSER, vma,
602 offset + vma->vm_start);
603 offset += PAGE_SIZE;
604 }
605 else
606 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
607
608 if (!page) {
609 err = -ENOMEM;
610 goto out;
611 }
612 list_add_tail(&page->lru, &newlist);
613 nr_pages++;
614 if (nr_pages > MIGRATE_CHUNK_SIZE)
615 break;
616 }
617 err = migrate_pages(pagelist, &newlist, &moved, &failed);
618
619 putback_lru_pages(&moved); /* Call release pages instead ?? */
620
621 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
622 goto redo;
623out:
624 /* Return leftover allocated pages */
625 while (!list_empty(&newlist)) {
626 page = list_entry(newlist.next, struct page, lru);
627 list_del(&page->lru);
628 __free_page(page);
629 }
630 list_splice(&failed, pagelist);
631 if (err < 0)
632 return err;
633
634 /* Calculate number of leftover pages */
635 nr_pages = 0;
636 list_for_each(p, pagelist)
637 nr_pages++;
638 return nr_pages;
639} 588}
640 589
641/* 590/*
@@ -742,8 +691,23 @@ int do_migrate_pages(struct mm_struct *mm,
742 if (err < 0) 691 if (err < 0)
743 return err; 692 return err;
744 return busy; 693 return busy;
694
745} 695}
746 696
697#else
698
699static void migrate_page_add(struct page *page, struct list_head *pagelist,
700 unsigned long flags)
701{
702}
703
704int do_migrate_pages(struct mm_struct *mm,
705 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
706{
707 return -ENOSYS;
708}
709#endif
710
747long do_mbind(unsigned long start, unsigned long len, 711long do_mbind(unsigned long start, unsigned long len,
748 unsigned long mode, nodemask_t *nmask, unsigned long flags) 712 unsigned long mode, nodemask_t *nmask, unsigned long flags)
749{ 713{
@@ -808,6 +772,7 @@ long do_mbind(unsigned long start, unsigned long len,
808 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 772 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
809 err = -EIO; 773 err = -EIO;
810 } 774 }
775
811 if (!list_empty(&pagelist)) 776 if (!list_empty(&pagelist))
812 putback_lru_pages(&pagelist); 777 putback_lru_pages(&pagelist);
813 778
@@ -947,7 +912,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
947 /* 912 /*
948 * Check if this process has the right to modify the specified 913 * Check if this process has the right to modify the specified
949 * process. The right exists if the process has administrative 914 * process. The right exists if the process has administrative
950 * capabilities, superuser priviledges or the same 915 * capabilities, superuser privileges or the same
951 * userid as the target process. 916 * userid as the target process.
952 */ 917 */
953 if ((current->euid != task->suid) && (current->euid != task->uid) && 918 if ((current->euid != task->suid) && (current->euid != task->uid) &&
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80480d3..fe6e05289cc5 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -183,8 +183,8 @@ EXPORT_SYMBOL(mempool_resize);
183 */ 183 */
184void mempool_destroy(mempool_t *pool) 184void mempool_destroy(mempool_t *pool)
185{ 185{
186 if (pool->curr_nr != pool->min_nr) 186 /* Check for outstanding elements */
187 BUG(); /* There were outstanding elements */ 187 BUG_ON(pool->curr_nr != pool->min_nr);
188 free_pool(pool); 188 free_pool(pool);
189} 189}
190EXPORT_SYMBOL(mempool_destroy); 190EXPORT_SYMBOL(mempool_destroy);
@@ -278,14 +278,56 @@ EXPORT_SYMBOL(mempool_free);
278 */ 278 */
279void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) 279void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
280{ 280{
281 kmem_cache_t *mem = (kmem_cache_t *) pool_data; 281 struct kmem_cache *mem = pool_data;
282 return kmem_cache_alloc(mem, gfp_mask); 282 return kmem_cache_alloc(mem, gfp_mask);
283} 283}
284EXPORT_SYMBOL(mempool_alloc_slab); 284EXPORT_SYMBOL(mempool_alloc_slab);
285 285
286void mempool_free_slab(void *element, void *pool_data) 286void mempool_free_slab(void *element, void *pool_data)
287{ 287{
288 kmem_cache_t *mem = (kmem_cache_t *) pool_data; 288 struct kmem_cache *mem = pool_data;
289 kmem_cache_free(mem, element); 289 kmem_cache_free(mem, element);
290} 290}
291EXPORT_SYMBOL(mempool_free_slab); 291EXPORT_SYMBOL(mempool_free_slab);
292
293/*
294 * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
295 * specfied by pool_data
296 */
297void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
298{
299 size_t size = (size_t)(long)pool_data;
300 return kmalloc(size, gfp_mask);
301}
302EXPORT_SYMBOL(mempool_kmalloc);
303
304void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
305{
306 size_t size = (size_t) pool_data;
307 return kzalloc(size, gfp_mask);
308}
309EXPORT_SYMBOL(mempool_kzalloc);
310
311void mempool_kfree(void *element, void *pool_data)
312{
313 kfree(element);
314}
315EXPORT_SYMBOL(mempool_kfree);
316
317/*
318 * A simple mempool-backed page allocator that allocates pages
319 * of the order specified by pool_data.
320 */
321void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
322{
323 int order = (int)(long)pool_data;
324 return alloc_pages(gfp_mask, order);
325}
326EXPORT_SYMBOL(mempool_alloc_pages);
327
328void mempool_free_pages(void *element, void *pool_data)
329{
330 int order = (int)(long)pool_data;
331 __free_pages(element, order);
332}
333EXPORT_SYMBOL(mempool_free_pages);
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 000000000000..09f6e4aa87fc
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
1/*
2 * Memory Migration functionality - linux/mm/migration.c
3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 *
6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are:
8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter <clameter@sgi.com>
13 */
14
15#include <linux/migrate.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/buffer_head.h> /* for try_to_release_page(),
20 buffer_heads_over_limit */
21#include <linux/mm_inline.h>
22#include <linux/pagevec.h>
23#include <linux/rmap.h>
24#include <linux/topology.h>
25#include <linux/cpu.h>
26#include <linux/cpuset.h>
27#include <linux/swapops.h>
28
29#include "internal.h"
30
31#include "internal.h"
32
33/* The maximum number of pages to take off the LRU for migration */
34#define MIGRATE_CHUNK_SIZE 256
35
36#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
37
38/*
39 * Isolate one page from the LRU lists. If successful put it onto
40 * the indicated list with elevated page count.
41 *
42 * Result:
43 * -EBUSY: page not on LRU list
44 * 0: page removed from LRU list and added to the specified list.
45 */
46int isolate_lru_page(struct page *page, struct list_head *pagelist)
47{
48 int ret = -EBUSY;
49
50 if (PageLRU(page)) {
51 struct zone *zone = page_zone(page);
52
53 spin_lock_irq(&zone->lru_lock);
54 if (PageLRU(page)) {
55 ret = 0;
56 get_page(page);
57 ClearPageLRU(page);
58 if (PageActive(page))
59 del_page_from_active_list(zone, page);
60 else
61 del_page_from_inactive_list(zone, page);
62 list_add_tail(&page->lru, pagelist);
63 }
64 spin_unlock_irq(&zone->lru_lock);
65 }
66 return ret;
67}
68
69/*
70 * migrate_prep() needs to be called after we have compiled the list of pages
71 * to be migrated using isolate_lru_page() but before we begin a series of calls
72 * to migrate_pages().
73 */
74int migrate_prep(void)
75{
76 /* Must have swap device for migration */
77 if (nr_swap_pages <= 0)
78 return -ENODEV;
79
80 /*
81 * Clear the LRU lists so pages can be isolated.
82 * Note that pages may be moved off the LRU after we have
83 * drained them. Those pages will fail to migrate like other
84 * pages that may be busy.
85 */
86 lru_add_drain_all();
87
88 return 0;
89}
90
91static inline void move_to_lru(struct page *page)
92{
93 list_del(&page->lru);
94 if (PageActive(page)) {
95 /*
96 * lru_cache_add_active checks that
97 * the PG_active bit is off.
98 */
99 ClearPageActive(page);
100 lru_cache_add_active(page);
101 } else {
102 lru_cache_add(page);
103 }
104 put_page(page);
105}
106
107/*
108 * Add isolated pages on the list back to the LRU.
109 *
110 * returns the number of pages put back.
111 */
112int putback_lru_pages(struct list_head *l)
113{
114 struct page *page;
115 struct page *page2;
116 int count = 0;
117
118 list_for_each_entry_safe(page, page2, l, lru) {
119 move_to_lru(page);
120 count++;
121 }
122 return count;
123}
124
125/*
126 * Non migratable page
127 */
128int fail_migrate_page(struct page *newpage, struct page *page)
129{
130 return -EIO;
131}
132EXPORT_SYMBOL(fail_migrate_page);
133
134/*
135 * swapout a single page
136 * page is locked upon entry, unlocked on exit
137 */
138static int swap_page(struct page *page)
139{
140 struct address_space *mapping = page_mapping(page);
141
142 if (page_mapped(page) && mapping)
143 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
144 goto unlock_retry;
145
146 if (PageDirty(page)) {
147 /* Page is dirty, try to write it out here */
148 switch(pageout(page, mapping)) {
149 case PAGE_KEEP:
150 case PAGE_ACTIVATE:
151 goto unlock_retry;
152
153 case PAGE_SUCCESS:
154 goto retry;
155
156 case PAGE_CLEAN:
157 ; /* try to free the page below */
158 }
159 }
160
161 if (PagePrivate(page)) {
162 if (!try_to_release_page(page, GFP_KERNEL) ||
163 (!mapping && page_count(page) == 1))
164 goto unlock_retry;
165 }
166
167 if (remove_mapping(mapping, page)) {
168 /* Success */
169 unlock_page(page);
170 return 0;
171 }
172
173unlock_retry:
174 unlock_page(page);
175
176retry:
177 return -EAGAIN;
178}
179EXPORT_SYMBOL(swap_page);
180
181/*
182 * Remove references for a page and establish the new page with the correct
183 * basic settings to be able to stop accesses to the page.
184 */
185int migrate_page_remove_references(struct page *newpage,
186 struct page *page, int nr_refs)
187{
188 struct address_space *mapping = page_mapping(page);
189 struct page **radix_pointer;
190
191 /*
192 * Avoid doing any of the following work if the page count
193 * indicates that the page is in use or truncate has removed
194 * the page.
195 */
196 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
197 return -EAGAIN;
198
199 /*
200 * Establish swap ptes for anonymous pages or destroy pte
201 * maps for files.
202 *
203 * In order to reestablish file backed mappings the fault handlers
204 * will take the radix tree_lock which may then be used to stop
 205 * processes from accessing this page until the new page is ready.
206 *
207 * A process accessing via a swap pte (an anonymous page) will take a
208 * page_lock on the old page which will block the process until the
209 * migration attempt is complete. At that time the PageSwapCache bit
210 * will be examined. If the page was migrated then the PageSwapCache
211 * bit will be clear and the operation to retrieve the page will be
212 * retried which will find the new page in the radix tree. Then a new
213 * direct mapping may be generated based on the radix tree contents.
214 *
215 * If the page was not migrated then the PageSwapCache bit
216 * is still set and the operation may continue.
217 */
218 if (try_to_unmap(page, 1) == SWAP_FAIL)
219 /* A vma has VM_LOCKED set -> permanent failure */
220 return -EPERM;
221
222 /*
223 * Give up if we were unable to remove all mappings.
224 */
225 if (page_mapcount(page))
226 return -EAGAIN;
227
228 write_lock_irq(&mapping->tree_lock);
229
230 radix_pointer = (struct page **)radix_tree_lookup_slot(
231 &mapping->page_tree,
232 page_index(page));
233
234 if (!page_mapping(page) || page_count(page) != nr_refs ||
235 *radix_pointer != page) {
236 write_unlock_irq(&mapping->tree_lock);
237 return 1;
238 }
239
240 /*
241 * Now we know that no one else is looking at the page.
242 *
243 * Certain minimal information about a page must be available
244 * in order for other subsystems to properly handle the page if they
245 * find it through the radix tree update before we are finished
246 * copying the page.
247 */
248 get_page(newpage);
249 newpage->index = page->index;
250 newpage->mapping = page->mapping;
251 if (PageSwapCache(page)) {
252 SetPageSwapCache(newpage);
253 set_page_private(newpage, page_private(page));
254 }
255
256 *radix_pointer = newpage;
257 __put_page(page);
258 write_unlock_irq(&mapping->tree_lock);
259
260 return 0;
261}
262EXPORT_SYMBOL(migrate_page_remove_references);
263
264/*
265 * Copy the page to its new location
266 */
267void migrate_page_copy(struct page *newpage, struct page *page)
268{
269 copy_highpage(newpage, page);
270
271 if (PageError(page))
272 SetPageError(newpage);
273 if (PageReferenced(page))
274 SetPageReferenced(newpage);
275 if (PageUptodate(page))
276 SetPageUptodate(newpage);
277 if (PageActive(page))
278 SetPageActive(newpage);
279 if (PageChecked(page))
280 SetPageChecked(newpage);
281 if (PageMappedToDisk(page))
282 SetPageMappedToDisk(newpage);
283
284 if (PageDirty(page)) {
285 clear_page_dirty_for_io(page);
286 set_page_dirty(newpage);
287 }
288
289 ClearPageSwapCache(page);
290 ClearPageActive(page);
291 ClearPagePrivate(page);
292 set_page_private(page, 0);
293 page->mapping = NULL;
294
295 /*
296 * If any waiters have accumulated on the new page then
297 * wake them up.
298 */
299 if (PageWriteback(newpage))
300 end_page_writeback(newpage);
301}
302EXPORT_SYMBOL(migrate_page_copy);
303
304/*
305 * Common logic to directly migrate a single page suitable for
306 * pages that do not use PagePrivate.
307 *
308 * Pages are locked upon entry and exit.
309 */
310int migrate_page(struct page *newpage, struct page *page)
311{
312 int rc;
313
314 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
315
316 rc = migrate_page_remove_references(newpage, page, 2);
317
318 if (rc)
319 return rc;
320
321 migrate_page_copy(newpage, page);
322
323 /*
324 * Remove auxiliary swap entries and replace
325 * them with real ptes.
326 *
327 * Note that a real pte entry will allow processes that are not
328 * waiting on the page lock to use the new page via the page tables
329 * before the new page is unlocked.
330 */
331 remove_from_swap(newpage);
332 return 0;
333}
334EXPORT_SYMBOL(migrate_page);
335
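As a sketch of how a filesystem opts in: a mapping whose pages never carry PagePrivate data can point its address_space_operations directly at migrate_page(), while a filesystem that cannot support migration at all may use fail_migrate_page(). The aops and methods below are hypothetical:

	/* Hypothetical filesystem with no per-page private data. */
	static struct address_space_operations examplefs_aops = {
		.readpage	= examplefs_readpage,	/* assumed to exist */
		.writepage	= examplefs_writepage,	/* assumed to exist */
		.migratepage	= migrate_page,		/* generic helper above */
		/* or: .migratepage = fail_migrate_page, to refuse migration */
	};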
336/*
337 * migrate_pages
338 *
339 * Two lists are passed to this function. The first list
340 * contains the pages isolated from the LRU to be migrated.
341 * The second list contains new pages that the pages isolated
342 * can be moved to. If the second list is NULL then all
343 * pages are swapped out.
344 *
345 * The function returns after 10 attempts or if no pages
 346 * are movable anymore because the 'to' list has become empty
347 * or no retryable pages exist anymore.
348 *
349 * Return: Number of pages not migrated when "to" ran empty.
350 */
351int migrate_pages(struct list_head *from, struct list_head *to,
352 struct list_head *moved, struct list_head *failed)
353{
354 int retry;
355 int nr_failed = 0;
356 int pass = 0;
357 struct page *page;
358 struct page *page2;
359 int swapwrite = current->flags & PF_SWAPWRITE;
360 int rc;
361
362 if (!swapwrite)
363 current->flags |= PF_SWAPWRITE;
364
365redo:
366 retry = 0;
367
368 list_for_each_entry_safe(page, page2, from, lru) {
369 struct page *newpage = NULL;
370 struct address_space *mapping;
371
372 cond_resched();
373
374 rc = 0;
375 if (page_count(page) == 1)
376 /* page was freed from under us. So we are done. */
377 goto next;
378
379 if (to && list_empty(to))
380 break;
381
382 /*
383 * Skip locked pages during the first two passes to give the
384 * functions holding the lock time to release the page. Later we
385 * use lock_page() to have a higher chance of acquiring the
386 * lock.
387 */
388 rc = -EAGAIN;
389 if (pass > 2)
390 lock_page(page);
391 else
392 if (TestSetPageLocked(page))
393 goto next;
394
395 /*
396 * Only wait on writeback if we have already done a pass where
 397 * we may have triggered writeouts for lots of pages.
398 */
399 if (pass > 0) {
400 wait_on_page_writeback(page);
401 } else {
402 if (PageWriteback(page))
403 goto unlock_page;
404 }
405
406 /*
407 * Anonymous pages must have swap cache references otherwise
408 * the information contained in the page maps cannot be
409 * preserved.
410 */
411 if (PageAnon(page) && !PageSwapCache(page)) {
412 if (!add_to_swap(page, GFP_KERNEL)) {
413 rc = -ENOMEM;
414 goto unlock_page;
415 }
416 }
417
418 if (!to) {
419 rc = swap_page(page);
420 goto next;
421 }
422
423 newpage = lru_to_page(to);
424 lock_page(newpage);
425
426 /*
427 * Pages are properly locked and writeback is complete.
428 * Try to migrate the page.
429 */
430 mapping = page_mapping(page);
431 if (!mapping)
432 goto unlock_both;
433
434 if (mapping->a_ops->migratepage) {
435 /*
436 * Most pages have a mapping and most filesystems
437 * should provide a migration function. Anonymous
438 * pages are part of swap space which also has its
439 * own migration function. This is the most common
440 * path for page migration.
441 */
442 rc = mapping->a_ops->migratepage(newpage, page);
443 goto unlock_both;
444 }
445
446 /*
447 * Default handling if a filesystem does not provide
448 * a migration function. We can only migrate clean
449 * pages so try to write out any dirty pages first.
450 */
451 if (PageDirty(page)) {
452 switch (pageout(page, mapping)) {
453 case PAGE_KEEP:
454 case PAGE_ACTIVATE:
455 goto unlock_both;
456
457 case PAGE_SUCCESS:
458 unlock_page(newpage);
459 goto next;
460
461 case PAGE_CLEAN:
462 ; /* try to migrate the page below */
463 }
464 }
465
466 /*
467 * Buffers are managed in a filesystem specific way.
468 * We must have no buffers or drop them.
469 */
470 if (!page_has_buffers(page) ||
471 try_to_release_page(page, GFP_KERNEL)) {
472 rc = migrate_page(newpage, page);
473 goto unlock_both;
474 }
475
476 /*
477 * On early passes with mapped pages simply
478 * retry. There may be a lock held for some
479 * buffers that may go away. Later
480 * swap them out.
481 */
482 if (pass > 4) {
483 /*
484 * Persistently unable to drop buffers..... As a
485 * measure of last resort we fall back to
486 * swap_page().
487 */
488 unlock_page(newpage);
489 newpage = NULL;
490 rc = swap_page(page);
491 goto next;
492 }
493
494unlock_both:
495 unlock_page(newpage);
496
497unlock_page:
498 unlock_page(page);
499
500next:
501 if (rc == -EAGAIN) {
502 retry++;
503 } else if (rc) {
504 /* Permanent failure */
505 list_move(&page->lru, failed);
506 nr_failed++;
507 } else {
508 if (newpage) {
509 /* Successful migration. Return page to LRU */
510 move_to_lru(newpage);
511 }
512 list_move(&page->lru, moved);
513 }
514 }
515 if (retry && pass++ < 10)
516 goto redo;
517
518 if (!swapwrite)
519 current->flags &= ~PF_SWAPWRITE;
520
521 return nr_failed + retry;
522}
523
524/*
525 * Migration function for pages with buffers. This function can only be used
526 * if the underlying filesystem guarantees that no other references to "page"
527 * exist.
528 */
529int buffer_migrate_page(struct page *newpage, struct page *page)
530{
531 struct address_space *mapping = page->mapping;
532 struct buffer_head *bh, *head;
533 int rc;
534
535 if (!mapping)
536 return -EAGAIN;
537
538 if (!page_has_buffers(page))
539 return migrate_page(newpage, page);
540
541 head = page_buffers(page);
542
543 rc = migrate_page_remove_references(newpage, page, 3);
544
545 if (rc)
546 return rc;
547
548 bh = head;
549 do {
550 get_bh(bh);
551 lock_buffer(bh);
552 bh = bh->b_this_page;
553
554 } while (bh != head);
555
556 ClearPagePrivate(page);
557 set_page_private(newpage, page_private(page));
558 set_page_private(page, 0);
559 put_page(page);
560 get_page(newpage);
561
562 bh = head;
563 do {
564 set_bh_page(bh, newpage, bh_offset(bh));
565 bh = bh->b_this_page;
566
567 } while (bh != head);
568
569 SetPagePrivate(newpage);
570
571 migrate_page_copy(newpage, page);
572
573 bh = head;
574 do {
575 unlock_buffer(bh);
576 put_bh(bh);
577 bh = bh->b_this_page;
578
579 } while (bh != head);
580
581 return 0;
582}
583EXPORT_SYMBOL(buffer_migrate_page);
584
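For a filesystem whose pages do carry buffer_heads (and which can guarantee no other references to them, per the comment above), the buffer-aware helper is wired up the same way; the surrounding methods are again invented for illustration:

	/* Hypothetical block-backed filesystem. */
	static struct address_space_operations exampleblkfs_aops = {
		.readpage	= exampleblkfs_readpage,	/* assumed to exist */
		.writepage	= exampleblkfs_writepage,	/* assumed to exist */
		.migratepage	= buffer_migrate_page,
	};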
585/*
586 * Migrate the list 'pagelist' of pages to a certain destination.
587 *
588 * Specify destination with either non-NULL vma or dest_node >= 0
589 * Return the number of pages not migrated or error code
590 */
591int migrate_pages_to(struct list_head *pagelist,
592 struct vm_area_struct *vma, int dest)
593{
594 LIST_HEAD(newlist);
595 LIST_HEAD(moved);
596 LIST_HEAD(failed);
597 int err = 0;
598 unsigned long offset = 0;
599 int nr_pages;
600 struct page *page;
601 struct list_head *p;
602
603redo:
604 nr_pages = 0;
605 list_for_each(p, pagelist) {
606 if (vma) {
607 /*
608 * The address passed to alloc_page_vma is used to
609 * generate the proper interleave behavior. We fake
610 * the address here by an increasing offset in order
611 * to get the proper distribution of pages.
612 *
613 * No decision has been made as to which page
614 * a certain old page is moved to so we cannot
615 * specify the correct address.
616 */
617 page = alloc_page_vma(GFP_HIGHUSER, vma,
618 offset + vma->vm_start);
619 offset += PAGE_SIZE;
620 }
621 else
622 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
623
624 if (!page) {
625 err = -ENOMEM;
626 goto out;
627 }
628 list_add_tail(&page->lru, &newlist);
629 nr_pages++;
630 if (nr_pages > MIGRATE_CHUNK_SIZE)
631 break;
632 }
633 err = migrate_pages(pagelist, &newlist, &moved, &failed);
634
635 putback_lru_pages(&moved); /* Call release pages instead ?? */
636
637 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
638 goto redo;
639out:
640 /* Return leftover allocated pages */
641 while (!list_empty(&newlist)) {
642 page = list_entry(newlist.next, struct page, lru);
643 list_del(&page->lru);
644 __free_page(page);
645 }
646 list_splice(&failed, pagelist);
647 if (err < 0)
648 return err;
649
650 /* Calculate number of leftover pages */
651 nr_pages = 0;
652 list_for_each(p, pagelist)
653 nr_pages++;
654 return nr_pages;
655}
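Putting the pieces together, a NUMA caller might look roughly like the sketch below; the wrapper function is invented, only migrate_pages_to() and putback_lru_pages() are real entry points from this file:

	/* Sketch: move the already-isolated pages on 'pagelist' to 'target_node'. */
	static int example_move_to_node(struct list_head *pagelist, int target_node)
	{
		int left;

		left = migrate_pages_to(pagelist, NULL, target_node);

		/* anything still on 'pagelist' failed; return it to the LRU */
		putback_lru_pages(pagelist);

		return left;	/* < 0 on hard error, else pages not migrated */
	}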
diff --git a/mm/mmap.c b/mm/mmap.c
index 47556d2b3e90..4f5b5709136a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end);
612 * If the vma has a ->close operation then the driver probably needs to release 612 * If the vma has a ->close operation then the driver probably needs to release
613 * per-vma resources, so we don't attempt to merge those. 613 * per-vma resources, so we don't attempt to merge those.
614 */ 614 */
615#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) 615#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
616 616
617static inline int is_mergeable_vma(struct vm_area_struct *vma, 617static inline int is_mergeable_vma(struct vm_area_struct *vma,
618 struct file *file, unsigned long vm_flags) 618 struct file *file, unsigned long vm_flags)
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
845 const unsigned long stack_flags 845 const unsigned long stack_flags
846 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 846 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
847 847
848#ifdef CONFIG_HUGETLB
849 if (flags & VM_HUGETLB) {
850 if (!(flags & VM_DONTCOPY))
851 mm->shared_vm += pages;
852 return;
853 }
854#endif /* CONFIG_HUGETLB */
855
856 if (file) { 848 if (file) {
857 mm->shared_vm += pages; 849 mm->shared_vm += pages;
858 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 850 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1048,12 +1040,11 @@ munmap_back:
1048 * specific mapper. the address has already been validated, but 1040 * specific mapper. the address has already been validated, but
1049 * not unmapped, but the maps are removed from the list. 1041 * not unmapped, but the maps are removed from the list.
1050 */ 1042 */
1051 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 1043 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1052 if (!vma) { 1044 if (!vma) {
1053 error = -ENOMEM; 1045 error = -ENOMEM;
1054 goto unacct_error; 1046 goto unacct_error;
1055 } 1047 }
1056 memset(vma, 0, sizeof(*vma));
1057 1048
1058 vma->vm_mm = mm; 1049 vma->vm_mm = mm;
1059 vma->vm_start = addr; 1050 vma->vm_start = addr;
@@ -1904,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1904 /* 1895 /*
1905 * create a vma struct for an anonymous mapping 1896 * create a vma struct for an anonymous mapping
1906 */ 1897 */
1907 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 1898 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1908 if (!vma) { 1899 if (!vma) {
1909 vm_unacct_memory(len >> PAGE_SHIFT); 1900 vm_unacct_memory(len >> PAGE_SHIFT);
1910 return -ENOMEM; 1901 return -ENOMEM;
1911 } 1902 }
1912 memset(vma, 0, sizeof(*vma));
1913 1903
1914 vma->vm_mm = mm; 1904 vma->vm_mm = mm;
1915 vma->vm_start = addr; 1905 vma->vm_start = addr;
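The mmap.c hunks above collapse the old allocate-then-memset sequence into a single kmem_cache_zalloc() call (and switch the historical SLAB_KERNEL flag to GFP_KERNEL); in general the transformation looks like this, shown on a hypothetical cache:

	/* before */
	obj = kmem_cache_alloc(example_cachep, GFP_KERNEL);
	if (obj)
		memset(obj, 0, sizeof(*obj));

	/* after: one call, object returned already zeroed */
	obj = kmem_cache_zalloc(example_cachep, GFP_KERNEL);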
diff --git a/mm/mmzone.c b/mm/mmzone.c
new file mode 100644
index 000000000000..b022370e612e
--- /dev/null
+++ b/mm/mmzone.c
@@ -0,0 +1,50 @@
1/*
2 * linux/mm/mmzone.c
3 *
 4 * management code for pgdats and zones.
5 */
6
7
8#include <linux/config.h>
9#include <linux/stddef.h>
10#include <linux/mmzone.h>
11#include <linux/module.h>
12
13struct pglist_data *first_online_pgdat(void)
14{
15 return NODE_DATA(first_online_node);
16}
17
18EXPORT_SYMBOL(first_online_pgdat);
19
20struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
21{
22 int nid = next_online_node(pgdat->node_id);
23
24 if (nid == MAX_NUMNODES)
25 return NULL;
26 return NODE_DATA(nid);
27}
28EXPORT_SYMBOL(next_online_pgdat);
29
30
31/*
32 * next_zone - helper magic for for_each_zone()
33 */
34struct zone *next_zone(struct zone *zone)
35{
36 pg_data_t *pgdat = zone->zone_pgdat;
37
38 if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
39 zone++;
40 else {
41 pgdat = next_online_pgdat(pgdat);
42 if (pgdat)
43 zone = pgdat->node_zones;
44 else
45 zone = NULL;
46 }
47 return zone;
48}
49EXPORT_SYMBOL(next_zone);
50
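These helpers exist so that for_each_online_pgdat()/for_each_zone() can walk nodes and zones without the old pgdat_list; used directly, the walk looks like the sketch below (the counting is only an example):

	/* Sketch: visit every zone of every online node. */
	static unsigned long example_count_present_pages(void)
	{
		struct zone *zone;
		unsigned long total = 0;

		for (zone = first_online_pgdat()->node_zones;
		     zone;
		     zone = next_zone(zone))
			total += zone->present_pages;

		return total;
	}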
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b8571c1ed..4c14d4289b61 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
124 * a MAP_NORESERVE private mapping to writable will now reserve. 124 * a MAP_NORESERVE private mapping to writable will now reserve.
125 */ 125 */
126 if (newflags & VM_WRITE) { 126 if (newflags & VM_WRITE) {
127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
128 charged = nrpages; 128 charged = nrpages;
129 if (security_vm_enough_memory(charged)) 129 if (security_vm_enough_memory(charged))
130 return -ENOMEM; 130 return -ENOMEM;
@@ -166,7 +166,10 @@ success:
166 */ 166 */
167 vma->vm_flags = newflags; 167 vma->vm_flags = newflags;
168 vma->vm_page_prot = newprot; 168 vma->vm_page_prot = newprot;
169 change_protection(vma, start, end, newprot); 169 if (is_vm_hugetlb_page(vma))
170 hugetlb_change_protection(vma, start, end, newprot);
171 else
172 change_protection(vma, start, end, newprot);
170 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 173 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
171 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 174 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
172 return 0; 175 return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
240 243
241 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 244 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
242 245
243 if (is_vm_hugetlb_page(vma)) {
244 error = -EACCES;
245 goto out;
246 }
247
248 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 246 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
249 247
250 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 248 /* newflags >> 4 shift VM_MAY% in place of VM_% */
diff --git a/mm/msync.c b/mm/msync.c
index 3563a56e1a51..bc6c95376366 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -9,20 +9,24 @@
9 */ 9 */
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/fs.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
16#include <linux/writeback.h>
17#include <linux/file.h>
15#include <linux/syscalls.h> 18#include <linux/syscalls.h>
16 19
17#include <asm/pgtable.h> 20#include <asm/pgtable.h>
18#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
19 22
20static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 23static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
21 unsigned long addr, unsigned long end) 24 unsigned long addr, unsigned long end)
22{ 25{
23 pte_t *pte; 26 pte_t *pte;
24 spinlock_t *ptl; 27 spinlock_t *ptl;
25 int progress = 0; 28 int progress = 0;
29 unsigned long ret = 0;
26 30
27again: 31again:
28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 32 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -43,58 +47,64 @@ again:
43 if (!page) 47 if (!page)
44 continue; 48 continue;
45 if (ptep_clear_flush_dirty(vma, addr, pte) || 49 if (ptep_clear_flush_dirty(vma, addr, pte) ||
46 page_test_and_clear_dirty(page)) 50 page_test_and_clear_dirty(page))
47 set_page_dirty(page); 51 ret += set_page_dirty(page);
48 progress += 3; 52 progress += 3;
49 } while (pte++, addr += PAGE_SIZE, addr != end); 53 } while (pte++, addr += PAGE_SIZE, addr != end);
50 pte_unmap_unlock(pte - 1, ptl); 54 pte_unmap_unlock(pte - 1, ptl);
51 cond_resched(); 55 cond_resched();
52 if (addr != end) 56 if (addr != end)
53 goto again; 57 goto again;
58 return ret;
54} 59}
55 60
56static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, 61static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
57 unsigned long addr, unsigned long end) 62 pud_t *pud, unsigned long addr, unsigned long end)
58{ 63{
59 pmd_t *pmd; 64 pmd_t *pmd;
60 unsigned long next; 65 unsigned long next;
66 unsigned long ret = 0;
61 67
62 pmd = pmd_offset(pud, addr); 68 pmd = pmd_offset(pud, addr);
63 do { 69 do {
64 next = pmd_addr_end(addr, end); 70 next = pmd_addr_end(addr, end);
65 if (pmd_none_or_clear_bad(pmd)) 71 if (pmd_none_or_clear_bad(pmd))
66 continue; 72 continue;
67 msync_pte_range(vma, pmd, addr, next); 73 ret += msync_pte_range(vma, pmd, addr, next);
68 } while (pmd++, addr = next, addr != end); 74 } while (pmd++, addr = next, addr != end);
75 return ret;
69} 76}
70 77
71static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 78static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
72 unsigned long addr, unsigned long end) 79 pgd_t *pgd, unsigned long addr, unsigned long end)
73{ 80{
74 pud_t *pud; 81 pud_t *pud;
75 unsigned long next; 82 unsigned long next;
83 unsigned long ret = 0;
76 84
77 pud = pud_offset(pgd, addr); 85 pud = pud_offset(pgd, addr);
78 do { 86 do {
79 next = pud_addr_end(addr, end); 87 next = pud_addr_end(addr, end);
80 if (pud_none_or_clear_bad(pud)) 88 if (pud_none_or_clear_bad(pud))
81 continue; 89 continue;
82 msync_pmd_range(vma, pud, addr, next); 90 ret += msync_pmd_range(vma, pud, addr, next);
83 } while (pud++, addr = next, addr != end); 91 } while (pud++, addr = next, addr != end);
92 return ret;
84} 93}
85 94
86static void msync_page_range(struct vm_area_struct *vma, 95static unsigned long msync_page_range(struct vm_area_struct *vma,
87 unsigned long addr, unsigned long end) 96 unsigned long addr, unsigned long end)
88{ 97{
89 pgd_t *pgd; 98 pgd_t *pgd;
90 unsigned long next; 99 unsigned long next;
100 unsigned long ret = 0;
91 101
92 /* For hugepages we can't go walking the page table normally, 102 /* For hugepages we can't go walking the page table normally,
93 * but that's ok, hugetlbfs is memory based, so we don't need 103 * but that's ok, hugetlbfs is memory based, so we don't need
94 * to do anything more on an msync(). 104 * to do anything more on an msync().
95 */ 105 */
96 if (vma->vm_flags & VM_HUGETLB) 106 if (vma->vm_flags & VM_HUGETLB)
97 return; 107 return 0;
98 108
99 BUG_ON(addr >= end); 109 BUG_ON(addr >= end);
100 pgd = pgd_offset(vma->vm_mm, addr); 110 pgd = pgd_offset(vma->vm_mm, addr);
@@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma,
103 next = pgd_addr_end(addr, end); 113 next = pgd_addr_end(addr, end);
104 if (pgd_none_or_clear_bad(pgd)) 114 if (pgd_none_or_clear_bad(pgd))
105 continue; 115 continue;
106 msync_pud_range(vma, pgd, addr, next); 116 ret += msync_pud_range(vma, pgd, addr, next);
107 } while (pgd++, addr = next, addr != end); 117 } while (pgd++, addr = next, addr != end);
118 return ret;
108} 119}
109 120
110/* 121/*
@@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma,
115 * write out the dirty pages and wait on the writeout and check the result. 126 * write out the dirty pages and wait on the writeout and check the result.
116 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start 127 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
117 * async writeout immediately. 128 * async writeout immediately.
118 * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to 129 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
119 * applications. 130 * applications.
120 */ 131 */
121static int msync_interval(struct vm_area_struct *vma, 132static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
122 unsigned long addr, unsigned long end, int flags) 133 unsigned long end, int flags,
134 unsigned long *nr_pages_dirtied)
123{ 135{
124 int ret = 0;
125 struct file *file = vma->vm_file; 136 struct file *file = vma->vm_file;
126 137
127 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) 138 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
128 return -EBUSY; 139 return -EBUSY;
129 140
130 if (file && (vma->vm_flags & VM_SHARED)) { 141 if (file && (vma->vm_flags & VM_SHARED))
131 msync_page_range(vma, addr, end); 142 *nr_pages_dirtied = msync_page_range(vma, addr, end);
132 143 return 0;
133 if (flags & MS_SYNC) {
134 struct address_space *mapping = file->f_mapping;
135 int err;
136
137 ret = filemap_fdatawrite(mapping);
138 if (file->f_op && file->f_op->fsync) {
139 /*
140 * We don't take i_mutex here because mmap_sem
141 * is already held.
142 */
143 err = file->f_op->fsync(file,file->f_dentry,1);
144 if (err && !ret)
145 ret = err;
146 }
147 err = filemap_fdatawait(mapping);
148 if (!ret)
149 ret = err;
150 }
151 }
152 return ret;
153} 144}
154 145
155asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 146asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
156{ 147{
157 unsigned long end; 148 unsigned long end;
158 struct vm_area_struct *vma; 149 struct vm_area_struct *vma;
159 int unmapped_error, error = -EINVAL; 150 int unmapped_error = 0;
160 151 int error = -EINVAL;
161 if (flags & MS_SYNC) 152 int done = 0;
162 current->flags |= PF_SYNCWRITE;
163 153
164 down_read(&current->mm->mmap_sem);
165 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 154 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
166 goto out; 155 goto out;
167 if (start & ~PAGE_MASK) 156 if (start & ~PAGE_MASK)
@@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
180 * If the interval [start,end) covers some unmapped address ranges, 169 * If the interval [start,end) covers some unmapped address ranges,
181 * just ignore them, but return -ENOMEM at the end. 170 * just ignore them, but return -ENOMEM at the end.
182 */ 171 */
172 down_read(&current->mm->mmap_sem);
173 if (flags & MS_SYNC)
174 current->flags |= PF_SYNCWRITE;
183 vma = find_vma(current->mm, start); 175 vma = find_vma(current->mm, start);
184 unmapped_error = 0; 176 if (!vma) {
185 for (;;) {
186 /* Still start < end. */
187 error = -ENOMEM; 177 error = -ENOMEM;
188 if (!vma) 178 goto out_unlock;
189 goto out; 179 }
180 do {
181 unsigned long nr_pages_dirtied = 0;
182 struct file *file;
183
190 /* Here start < vma->vm_end. */ 184 /* Here start < vma->vm_end. */
191 if (start < vma->vm_start) { 185 if (start < vma->vm_start) {
192 unmapped_error = -ENOMEM; 186 unmapped_error = -ENOMEM;
@@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
195 /* Here vma->vm_start <= start < vma->vm_end. */ 189 /* Here vma->vm_start <= start < vma->vm_end. */
196 if (end <= vma->vm_end) { 190 if (end <= vma->vm_end) {
197 if (start < end) { 191 if (start < end) {
198 error = msync_interval(vma, start, end, flags); 192 error = msync_interval(vma, start, end, flags,
193 &nr_pages_dirtied);
199 if (error) 194 if (error)
200 goto out; 195 goto out_unlock;
201 } 196 }
202 error = unmapped_error; 197 error = unmapped_error;
203 goto out; 198 done = 1;
199 } else {
200 /* Here vma->vm_start <= start < vma->vm_end < end. */
201 error = msync_interval(vma, start, vma->vm_end, flags,
202 &nr_pages_dirtied);
203 if (error)
204 goto out_unlock;
204 } 205 }
205 /* Here vma->vm_start <= start < vma->vm_end < end. */ 206 file = vma->vm_file;
206 error = msync_interval(vma, start, vma->vm_end, flags);
207 if (error)
208 goto out;
209 start = vma->vm_end; 207 start = vma->vm_end;
210 vma = vma->vm_next; 208 if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
211 } 209 get_file(file);
212out: 210 up_read(&current->mm->mmap_sem);
213 up_read(&current->mm->mmap_sem); 211 balance_dirty_pages_ratelimited_nr(file->f_mapping,
212 nr_pages_dirtied);
213 fput(file);
214 down_read(&current->mm->mmap_sem);
215 vma = find_vma(current->mm, start);
216 } else if ((flags & MS_SYNC) && file &&
217 (vma->vm_flags & VM_SHARED)) {
218 get_file(file);
219 up_read(&current->mm->mmap_sem);
220 error = do_fsync(file, 0);
221 fput(file);
222 down_read(&current->mm->mmap_sem);
223 if (error)
224 goto out_unlock;
225 vma = find_vma(current->mm, start);
226 } else {
227 vma = vma->vm_next;
228 }
229 } while (vma && !done);
230out_unlock:
214 current->flags &= ~PF_SYNCWRITE; 231 current->flags &= ~PF_SYNCWRITE;
232 up_read(&current->mm->mmap_sem);
233out:
215 return error; 234 return error;
216} 235}
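Seen from userspace, the rewritten sys_msync() means MS_ASYNC only accounts the freshly dirtied pages (throttling the caller if needed) while MS_SYNC still writes and waits via do_fsync(); a minimal illustrative caller, assuming the usual <sys/mman.h> setup:

	/* Userspace sketch: dirty a shared file mapping, then sync it. */
	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	if (map != MAP_FAILED) {
		memset(map, 0xab, len);
		msync(map, len, MS_ASYNC);	/* no I/O is started here anymore */
		msync(map, len, MS_SYNC);	/* writes the pages and waits */
		munmap(map, len);
	}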
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f4786f28..db45efac17cc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
159 /* 159 /*
160 * kmalloc doesn't like __GFP_HIGHMEM for some reason 160 * kmalloc doesn't like __GFP_HIGHMEM for some reason
161 */ 161 */
162 return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); 162 return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
163} 163}
164 164
165struct page * vmalloc_to_page(void *addr) 165struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
623 * - note that this may not return a page-aligned address if the object 623 * - note that this may not return a page-aligned address if the object
624 * we're allocating is smaller than a page 624 * we're allocating is smaller than a page
625 */ 625 */
626 base = kmalloc(len, GFP_KERNEL); 626 base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
627 if (!base) 627 if (!base)
628 goto enomem; 628 goto enomem;
629 629
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d2..893d7677579e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
75 * The interval between `kupdate'-style writebacks, in centiseconds 75 * The interval between `kupdate'-style writebacks, in centiseconds
76 * (hundredths of a second) 76 * (hundredths of a second)
77 */ 77 */
78int dirty_writeback_centisecs = 5 * 100; 78int dirty_writeback_interval = 5 * HZ;
79 79
80/* 80/*
81 * The longest number of centiseconds for which data is allowed to remain dirty 81 * The longest number of centiseconds for which data is allowed to remain dirty
82 */ 82 */
83int dirty_expire_centisecs = 30 * 100; 83int dirty_expire_interval = 30 * HZ;
84 84
85/* 85/*
86 * Flag that makes the machine dump writes/reads and block dirtyings. 86 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100;
88int block_dump; 88int block_dump;
89 89
90/* 90/*
91 * Flag that puts the machine in "laptop mode". 91 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
92 * a full sync is triggered after this time elapses without any disk activity.
92 */ 93 */
93int laptop_mode; 94int laptop_mode;
94 95
@@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping)
255} 256}
256 257
257/** 258/**
258 * balance_dirty_pages_ratelimited - balance dirty memory state 259 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
259 * @mapping: address_space which was dirtied 260 * @mapping: address_space which was dirtied
261 * @nr_pages: number of pages which the caller has just dirtied
260 * 262 *
261 * Processes which are dirtying memory should call in here once for each page 263 * Processes which are dirtying memory should call in here once for each page
262 * which was newly dirtied. The function will periodically check the system's 264 * which was newly dirtied. The function will periodically check the system's
@@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping)
267 * limit we decrease the ratelimiting by a lot, to prevent individual processes 269 * limit we decrease the ratelimiting by a lot, to prevent individual processes
268 * from overshooting the limit by (ratelimit_pages) each. 270 * from overshooting the limit by (ratelimit_pages) each.
269 */ 271 */
270void balance_dirty_pages_ratelimited(struct address_space *mapping) 272void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
273 unsigned long nr_pages_dirtied)
271{ 274{
272 static DEFINE_PER_CPU(int, ratelimits) = 0; 275 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
273 long ratelimit; 276 unsigned long ratelimit;
277 unsigned long *p;
274 278
275 ratelimit = ratelimit_pages; 279 ratelimit = ratelimit_pages;
276 if (dirty_exceeded) 280 if (dirty_exceeded)
@@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
280 * Check the rate limiting. Also, we do not want to throttle real-time 284 * Check the rate limiting. Also, we do not want to throttle real-time
281 * tasks in balance_dirty_pages(). Period. 285 * tasks in balance_dirty_pages(). Period.
282 */ 286 */
283 if (get_cpu_var(ratelimits)++ >= ratelimit) { 287 preempt_disable();
284 __get_cpu_var(ratelimits) = 0; 288 p = &__get_cpu_var(ratelimits);
285 put_cpu_var(ratelimits); 289 *p += nr_pages_dirtied;
290 if (unlikely(*p >= ratelimit)) {
291 *p = 0;
292 preempt_enable();
286 balance_dirty_pages(mapping); 293 balance_dirty_pages(mapping);
287 return; 294 return;
288 } 295 }
289 put_cpu_var(ratelimits); 296 preempt_enable();
290} 297}
291EXPORT_SYMBOL(balance_dirty_pages_ratelimited); 298EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
292 299
293void throttle_vm_writeout(void) 300void throttle_vm_writeout(void)
294{ 301{
@@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
380 * just walks the superblock inode list, writing back any inodes which are 387 * just walks the superblock inode list, writing back any inodes which are
381 * older than a specific point in time. 388 * older than a specific point in time.
382 * 389 *
383 * Try to run once per dirty_writeback_centisecs. But if a writeback event 390 * Try to run once per dirty_writeback_interval. But if a writeback event
384 * takes longer than a dirty_writeback_centisecs interval, then leave a 391 * takes longer than a dirty_writeback_interval interval, then leave a
385 * one-second gap. 392 * one-second gap.
386 * 393 *
387 * older_than_this takes precedence over nr_to_write. So we'll only write back 394 * older_than_this takes precedence over nr_to_write. So we'll only write back
@@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg)
406 sync_supers(); 413 sync_supers();
407 414
408 get_writeback_state(&wbs); 415 get_writeback_state(&wbs);
409 oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; 416 oldest_jif = jiffies - dirty_expire_interval;
410 start_jif = jiffies; 417 start_jif = jiffies;
411 next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; 418 next_jif = start_jif + dirty_writeback_interval;
412 nr_to_write = wbs.nr_dirty + wbs.nr_unstable + 419 nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
413 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 420 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
414 while (nr_to_write > 0) { 421 while (nr_to_write > 0) {
@@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg)
425 } 432 }
426 if (time_before(next_jif, jiffies + HZ)) 433 if (time_before(next_jif, jiffies + HZ))
427 next_jif = jiffies + HZ; 434 next_jif = jiffies + HZ;
428 if (dirty_writeback_centisecs) 435 if (dirty_writeback_interval)
429 mod_timer(&wb_timer, next_jif); 436 mod_timer(&wb_timer, next_jif);
430} 437}
431 438
@@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg)
435int dirty_writeback_centisecs_handler(ctl_table *table, int write, 442int dirty_writeback_centisecs_handler(ctl_table *table, int write,
436 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 443 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
437{ 444{
438 proc_dointvec(table, write, file, buffer, length, ppos); 445 proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
439 if (dirty_writeback_centisecs) { 446 if (dirty_writeback_interval) {
440 mod_timer(&wb_timer, 447 mod_timer(&wb_timer,
441 jiffies + (dirty_writeback_centisecs * HZ) / 100); 448 jiffies + dirty_writeback_interval);
442 } else { 449 } else {
443 del_timer(&wb_timer); 450 del_timer(&wb_timer);
444 } 451 }
445 return 0; 452 return 0;
@@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused)
468 */ 475 */
469void laptop_io_completion(void) 476void laptop_io_completion(void)
470{ 477{
471 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); 478 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
472} 479}
473 480
474/* 481/*
@@ -544,7 +551,7 @@ void __init page_writeback_init(void)
544 if (vm_dirty_ratio <= 0) 551 if (vm_dirty_ratio <= 0)
545 vm_dirty_ratio = 1; 552 vm_dirty_ratio = 1;
546 } 553 }
547 mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); 554 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
548 set_ratelimit(); 555 set_ratelimit();
549 register_cpu_notifier(&ratelimit_nb); 556 register_cpu_notifier(&ratelimit_nb);
550} 557}
@@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page);
621 */ 628 */
622int __set_page_dirty_nobuffers(struct page *page) 629int __set_page_dirty_nobuffers(struct page *page)
623{ 630{
624 int ret = 0;
625
626 if (!TestSetPageDirty(page)) { 631 if (!TestSetPageDirty(page)) {
627 struct address_space *mapping = page_mapping(page); 632 struct address_space *mapping = page_mapping(page);
628 struct address_space *mapping2; 633 struct address_space *mapping2;
@@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page)
644 I_DIRTY_PAGES); 649 I_DIRTY_PAGES);
645 } 650 }
646 } 651 }
652 return 1;
647 } 653 }
648 return ret; 654 return 0;
649} 655}
650EXPORT_SYMBOL(__set_page_dirty_nobuffers); 656EXPORT_SYMBOL(__set_page_dirty_nobuffers);
651 657
@@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page)
675 return (*spd)(page); 681 return (*spd)(page);
676 return __set_page_dirty_buffers(page); 682 return __set_page_dirty_buffers(page);
677 } 683 }
678 if (!PageDirty(page)) 684 if (!PageDirty(page)) {
679 SetPageDirty(page); 685 if (!TestSetPageDirty(page))
686 return 1;
687 }
680 return 0; 688 return 0;
681} 689}
682EXPORT_SYMBOL(set_page_dirty); 690EXPORT_SYMBOL(set_page_dirty);
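A caller that dirties a whole batch of pages in one operation (sys_msync() above is the first such user) now reports the batch size in a single call; single-page paths are expected to keep working through a one-page wrapper. Sketch, with the mapping and count supplied by the caller:

	/* after dirtying 'nr_dirtied' pages of 'mapping' in one go */
	balance_dirty_pages_ratelimited_nr(mapping, nr_dirtied);

	/* the historical per-page entry point reduces to the same call */
	balance_dirty_pages_ratelimited_nr(mapping, 1);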
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 234bd4895d14..dc523a1f270d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,13 +49,11 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
49EXPORT_SYMBOL(node_online_map); 49EXPORT_SYMBOL(node_online_map);
50nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 50nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
51EXPORT_SYMBOL(node_possible_map); 51EXPORT_SYMBOL(node_possible_map);
52struct pglist_data *pgdat_list __read_mostly;
53unsigned long totalram_pages __read_mostly; 52unsigned long totalram_pages __read_mostly;
54unsigned long totalhigh_pages __read_mostly; 53unsigned long totalhigh_pages __read_mostly;
55long nr_swap_pages; 54long nr_swap_pages;
56int percpu_pagelist_fraction; 55int percpu_pagelist_fraction;
57 56
58static void fastcall free_hot_cold_page(struct page *page, int cold);
59static void __free_pages_ok(struct page *page, unsigned int order); 57static void __free_pages_ok(struct page *page, unsigned int order);
60 58
61/* 59/*
@@ -190,7 +188,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
190 for (i = 0; i < nr_pages; i++) { 188 for (i = 0; i < nr_pages; i++) {
191 struct page *p = page + i; 189 struct page *p = page + i;
192 190
193 SetPageCompound(p); 191 __SetPageCompound(p);
194 set_page_private(p, (unsigned long)page); 192 set_page_private(p, (unsigned long)page);
195 } 193 }
196} 194}
@@ -209,10 +207,24 @@ static void destroy_compound_page(struct page *page, unsigned long order)
209 if (unlikely(!PageCompound(p) | 207 if (unlikely(!PageCompound(p) |
210 (page_private(p) != (unsigned long)page))) 208 (page_private(p) != (unsigned long)page)))
211 bad_page(page); 209 bad_page(page);
212 ClearPageCompound(p); 210 __ClearPageCompound(p);
213 } 211 }
214} 212}
215 213
214static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
215{
216 int i;
217
218 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
219 /*
220 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
221 * and __GFP_HIGHMEM from hard or soft interrupt context.
222 */
223 BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
224 for (i = 0; i < (1 << order); i++)
225 clear_highpage(page + i);
226}
227
216/* 228/*
217 * function for dealing with page's order in buddy system. 229 * function for dealing with page's order in buddy system.
218 * zone->lock is already acquired when we use these. 230 * zone->lock is already acquired when we use these.
@@ -423,11 +435,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
423 mutex_debug_check_no_locks_freed(page_address(page), 435 mutex_debug_check_no_locks_freed(page_address(page),
424 PAGE_SIZE<<order); 436 PAGE_SIZE<<order);
425 437
426#ifndef CONFIG_MMU
427 for (i = 1 ; i < (1 << order) ; ++i)
428 __put_page(page + i);
429#endif
430
431 for (i = 0 ; i < (1 << order) ; ++i) 438 for (i = 0 ; i < (1 << order) ; ++i)
432 reserved += free_pages_check(page + i); 439 reserved += free_pages_check(page + i);
433 if (reserved) 440 if (reserved)
@@ -448,28 +455,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
448 if (order == 0) { 455 if (order == 0) {
449 __ClearPageReserved(page); 456 __ClearPageReserved(page);
450 set_page_count(page, 0); 457 set_page_count(page, 0);
451 458 set_page_refcounted(page);
452 free_hot_cold_page(page, 0); 459 __free_page(page);
453 } else { 460 } else {
454 LIST_HEAD(list);
455 int loop; 461 int loop;
456 462
463 prefetchw(page);
457 for (loop = 0; loop < BITS_PER_LONG; loop++) { 464 for (loop = 0; loop < BITS_PER_LONG; loop++) {
458 struct page *p = &page[loop]; 465 struct page *p = &page[loop];
459 466
460 if (loop + 16 < BITS_PER_LONG) 467 if (loop + 1 < BITS_PER_LONG)
461 prefetchw(p + 16); 468 prefetchw(p + 1);
462 __ClearPageReserved(p); 469 __ClearPageReserved(p);
463 set_page_count(p, 0); 470 set_page_count(p, 0);
464 } 471 }
465 472
466 arch_free_page(page, order); 473 set_page_refcounted(page);
467 474 __free_pages(page, order);
468 mod_page_state(pgfree, 1 << order);
469
470 list_add(&page->lru, &list);
471 kernel_map_pages(page, 1 << order, 0);
472 free_pages_bulk(page_zone(page), 1, &list, order);
473 } 475 }
474} 476}
475 477
@@ -507,7 +509,7 @@ static inline void expand(struct zone *zone, struct page *page,
507/* 509/*
508 * This page is about to be returned from the page allocator 510 * This page is about to be returned from the page allocator
509 */ 511 */
510static int prep_new_page(struct page *page, int order) 512static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
511{ 513{
512 if (unlikely(page_mapcount(page) | 514 if (unlikely(page_mapcount(page) |
513 (page->mapping != NULL) | 515 (page->mapping != NULL) |
@@ -536,8 +538,15 @@ static int prep_new_page(struct page *page, int order)
536 1 << PG_referenced | 1 << PG_arch_1 | 538 1 << PG_referenced | 1 << PG_arch_1 |
537 1 << PG_checked | 1 << PG_mappedtodisk); 539 1 << PG_checked | 1 << PG_mappedtodisk);
538 set_page_private(page, 0); 540 set_page_private(page, 0);
539 set_page_refs(page, order); 541 set_page_refcounted(page);
540 kernel_map_pages(page, 1 << order, 1); 542 kernel_map_pages(page, 1 << order, 1);
543
544 if (gfp_flags & __GFP_ZERO)
545 prep_zero_page(page, order, gfp_flags);
546
547 if (order && (gfp_flags & __GFP_COMP))
548 prep_compound_page(page, order);
549
541 return 0; 550 return 0;
542} 551}
543 552
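With zeroing and compound-page setup now folded into prep_new_page(), a higher-order allocation asks for both purely via gfp flags; a small hedged example:

	/* Sketch: an order-1 (two page) allocation, zeroed and compound. */
	struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO | __GFP_COMP, 1);

	if (page) {
		/* ... use page_address(page) for 2 * PAGE_SIZE bytes ... */
		__free_pages(page, 1);
	}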
@@ -593,13 +602,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
593/* 602/*
594 * Called from the slab reaper to drain pagesets on a particular node that 603 * Called from the slab reaper to drain pagesets on a particular node that
595 * belong to the currently executing processor. 604 * belong to the currently executing processor.
605 * Note that this function must be called with the thread pinned to
606 * a single processor.
596 */ 607 */
597void drain_node_pages(int nodeid) 608void drain_node_pages(int nodeid)
598{ 609{
599 int i, z; 610 int i, z;
600 unsigned long flags; 611 unsigned long flags;
601 612
602 local_irq_save(flags);
603 for (z = 0; z < MAX_NR_ZONES; z++) { 613 for (z = 0; z < MAX_NR_ZONES; z++) {
604 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 614 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
605 struct per_cpu_pageset *pset; 615 struct per_cpu_pageset *pset;
@@ -609,11 +619,14 @@ void drain_node_pages(int nodeid)
609 struct per_cpu_pages *pcp; 619 struct per_cpu_pages *pcp;
610 620
611 pcp = &pset->pcp[i]; 621 pcp = &pset->pcp[i];
612 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 622 if (pcp->count) {
613 pcp->count = 0; 623 local_irq_save(flags);
624 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
625 pcp->count = 0;
626 local_irq_restore(flags);
627 }
614 } 628 }
615 } 629 }
616 local_irq_restore(flags);
617} 630}
618#endif 631#endif
619 632
@@ -743,13 +756,22 @@ void fastcall free_cold_page(struct page *page)
743 free_hot_cold_page(page, 1); 756 free_hot_cold_page(page, 1);
744} 757}
745 758
746static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 759/*
760 * split_page takes a non-compound higher-order page, and splits it into
761 * n (1<<order) sub-pages: page[0..n]
762 * Each sub-page must be freed individually.
763 *
764 * Note: this is probably too low level an operation for use in drivers.
765 * Please consult with lkml before using this in your driver.
766 */
767void split_page(struct page *page, unsigned int order)
747{ 768{
748 int i; 769 int i;
749 770
750 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 771 BUG_ON(PageCompound(page));
751 for(i = 0; i < (1 << order); i++) 772 BUG_ON(!page_count(page));
752 clear_highpage(page + i); 773 for (i = 1; i < (1 << order); i++)
774 set_page_refcounted(page + i);
753} 775}
754 776
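The new split_page() turns one non-compound higher-order allocation into independently freeable order-0 pages (note the comment's warning that this is a low-level operation); the intended pattern is roughly:

	/* Sketch: allocate four pages as order-2, then free them one by one. */
	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* non-compound */
	int i;

	if (page) {
		split_page(page, 2);
		for (i = 0; i < 4; i++)
			__free_page(page + i);
	}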
755/* 777/*
@@ -795,14 +817,8 @@ again:
795 put_cpu(); 817 put_cpu();
796 818
797 BUG_ON(bad_range(zone, page)); 819 BUG_ON(bad_range(zone, page));
798 if (prep_new_page(page, order)) 820 if (prep_new_page(page, order, gfp_flags))
799 goto again; 821 goto again;
800
801 if (gfp_flags & __GFP_ZERO)
802 prep_zero_page(page, order, gfp_flags);
803
804 if (order && (gfp_flags & __GFP_COMP))
805 prep_compound_page(page, order);
806 return page; 822 return page;
807 823
808failed: 824failed:
@@ -926,7 +942,8 @@ restart:
926 goto got_pg; 942 goto got_pg;
927 943
928 do { 944 do {
929 wakeup_kswapd(*z, order); 945 if (cpuset_zone_allowed(*z, gfp_mask))
946 wakeup_kswapd(*z, order);
930 } while (*(++z)); 947 } while (*(++z));
931 948
932 /* 949 /*
@@ -1183,7 +1200,7 @@ unsigned int nr_free_highpages (void)
1183 pg_data_t *pgdat; 1200 pg_data_t *pgdat;
1184 unsigned int pages = 0; 1201 unsigned int pages = 0;
1185 1202
1186 for_each_pgdat(pgdat) 1203 for_each_online_pgdat(pgdat)
1187 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; 1204 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
1188 1205
1189 return pages; 1206 return pages;
@@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1214 1231
1215static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1232static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1216{ 1233{
1217 int cpu = 0; 1234 unsigned cpu;
1218 1235
1219 memset(ret, 0, nr * sizeof(unsigned long)); 1236 memset(ret, 0, nr * sizeof(unsigned long));
1220 cpus_and(*cpumask, *cpumask, cpu_online_map); 1237 cpus_and(*cpumask, *cpumask, cpu_online_map);
1221 1238
1222 cpu = first_cpu(*cpumask); 1239 for_each_cpu_mask(cpu, *cpumask) {
1223 while (cpu < NR_CPUS) { 1240 unsigned long *in;
1224 unsigned long *in, *out, off; 1241 unsigned long *out;
1225 1242 unsigned off;
1226 if (!cpu_isset(cpu, *cpumask)) 1243 unsigned next_cpu;
1227 continue;
1228 1244
1229 in = (unsigned long *)&per_cpu(page_states, cpu); 1245 in = (unsigned long *)&per_cpu(page_states, cpu);
1230 1246
1231 cpu = next_cpu(cpu, *cpumask); 1247 next_cpu = next_cpu(cpu, *cpumask);
1232 1248 if (likely(next_cpu < NR_CPUS))
1233 if (likely(cpu < NR_CPUS)) 1249 prefetch(&per_cpu(page_states, next_cpu));
1234 prefetch(&per_cpu(page_states, cpu));
1235 1250
1236 out = (unsigned long *)ret; 1251 out = (unsigned long *)ret;
1237 for (off = 0; off < nr; off++) 1252 for (off = 0; off < nr; off++)
@@ -1327,7 +1342,7 @@ void get_zone_counts(unsigned long *active,
1327 *active = 0; 1342 *active = 0;
1328 *inactive = 0; 1343 *inactive = 0;
1329 *free = 0; 1344 *free = 0;
1330 for_each_pgdat(pgdat) { 1345 for_each_online_pgdat(pgdat) {
1331 unsigned long l, m, n; 1346 unsigned long l, m, n;
1332 __get_zone_counts(&l, &m, &n, pgdat); 1347 __get_zone_counts(&l, &m, &n, pgdat);
1333 *active += l; 1348 *active += l;
@@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1764 continue; 1779 continue;
1765 page = pfn_to_page(pfn); 1780 page = pfn_to_page(pfn);
1766 set_page_links(page, zone, nid, pfn); 1781 set_page_links(page, zone, nid, pfn);
1767 set_page_count(page, 1); 1782 init_page_count(page);
1768 reset_page_mapcount(page); 1783 reset_page_mapcount(page);
1769 SetPageReserved(page); 1784 SetPageReserved(page);
1770 INIT_LIST_HEAD(&page->lru); 1785 INIT_LIST_HEAD(&page->lru);
@@ -2013,8 +2028,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
2013 setup_pageset(zone_pcp(zone,cpu), batch); 2028 setup_pageset(zone_pcp(zone,cpu), batch);
2014#endif 2029#endif
2015 } 2030 }
2016 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2031 if (zone->present_pages)
2017 zone->name, zone->present_pages, batch); 2032 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2033 zone->name, zone->present_pages, batch);
2018} 2034}
2019 2035
2020static __meminit void init_currently_empty_zone(struct zone *zone, 2036static __meminit void init_currently_empty_zone(struct zone *zone,
@@ -2025,7 +2041,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2025 zone_wait_table_init(zone, size); 2041 zone_wait_table_init(zone, size);
2026 pgdat->nr_zones = zone_idx(zone) + 1; 2042 pgdat->nr_zones = zone_idx(zone) + 1;
2027 2043
2028 zone->zone_mem_map = pfn_to_page(zone_start_pfn);
2029 zone->zone_start_pfn = zone_start_pfn; 2044 zone->zone_start_pfn = zone_start_pfn;
2030 2045
2031 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2046 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
@@ -2153,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos)
2153{ 2168{
2154 pg_data_t *pgdat; 2169 pg_data_t *pgdat;
2155 loff_t node = *pos; 2170 loff_t node = *pos;
2156 2171 for (pgdat = first_online_pgdat();
2157 for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) 2172 pgdat && node;
2173 pgdat = next_online_pgdat(pgdat))
2158 --node; 2174 --node;
2159 2175
2160 return pgdat; 2176 return pgdat;
@@ -2165,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
2165 pg_data_t *pgdat = (pg_data_t *)arg; 2181 pg_data_t *pgdat = (pg_data_t *)arg;
2166 2182
2167 (*pos)++; 2183 (*pos)++;
2168 return pgdat->pgdat_next; 2184 return next_online_pgdat(pgdat);
2169} 2185}
2170 2186
2171static void frag_stop(struct seq_file *m, void *arg) 2187static void frag_stop(struct seq_file *m, void *arg)
@@ -2466,7 +2482,7 @@ static void setup_per_zone_lowmem_reserve(void)
2466 struct pglist_data *pgdat; 2482 struct pglist_data *pgdat;
2467 int j, idx; 2483 int j, idx;
2468 2484
2469 for_each_pgdat(pgdat) { 2485 for_each_online_pgdat(pgdat) {
2470 for (j = 0; j < MAX_NR_ZONES; j++) { 2486 for (j = 0; j < MAX_NR_ZONES; j++) {
2471 struct zone *zone = pgdat->node_zones + j; 2487 struct zone *zone = pgdat->node_zones + j;
2472 unsigned long present_pages = zone->present_pages; 2488 unsigned long present_pages = zone->present_pages;
@@ -2685,8 +2701,7 @@ void *__init alloc_large_system_hash(const char *tablename,
2685 else 2701 else
2686 numentries <<= (PAGE_SHIFT - scale); 2702 numentries <<= (PAGE_SHIFT - scale);
2687 } 2703 }
2688 /* rounded up to nearest power of 2 in size */ 2704 numentries = roundup_pow_of_two(numentries);
2689 numentries = 1UL << (long_log2(numentries) + 1);
2690 2705
2691 /* limit allocation size to 1/16 total memory by default */ 2706 /* limit allocation size to 1/16 total memory by default */
2692 if (max == 0) { 2707 if (max == 0) {
@@ -2729,3 +2744,44 @@ void *__init alloc_large_system_hash(const char *tablename,
2729 2744
2730 return table; 2745 return table;
2731} 2746}
2747
2748#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
2749/*
2750 * pfn <-> page translation. out-of-line version.
2751 * (see asm-generic/memory_model.h)
2752 */
2753#if defined(CONFIG_FLATMEM)
2754struct page *pfn_to_page(unsigned long pfn)
2755{
2756 return mem_map + (pfn - ARCH_PFN_OFFSET);
2757}
2758unsigned long page_to_pfn(struct page *page)
2759{
2760 return (page - mem_map) + ARCH_PFN_OFFSET;
2761}
2762#elif defined(CONFIG_DISCONTIGMEM)
2763struct page *pfn_to_page(unsigned long pfn)
2764{
2765 int nid = arch_pfn_to_nid(pfn);
2766 return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
2767}
2768unsigned long page_to_pfn(struct page *page)
2769{
2770 struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
2771 return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
2772}
2773#elif defined(CONFIG_SPARSEMEM)
2774struct page *pfn_to_page(unsigned long pfn)
2775{
2776 return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
2777}
2778
2779unsigned long page_to_pfn(struct page *page)
2780{
2781 long section_id = page_to_section(page);
2782 return page - __section_mem_map_addr(__nr_to_section(section_id));
2783}
 2784#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
2785EXPORT_SYMBOL(pfn_to_page);
2786EXPORT_SYMBOL(page_to_pfn);
2787#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
diff --git a/mm/readahead.c b/mm/readahead.c
index 9f0b98227b41..ba7db816f4c8 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -53,13 +53,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
53 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; 53 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
54} 54}
55 55
56static inline void reset_ahead_window(struct file_ra_state *ra)
57{
58 /*
59 * ... but preserve ahead_start + ahead_size value,
60 * see 'recheck:' label in page_cache_readahead().
61 * Note: We never use ->ahead_size as rvalue without
62 * checking ->ahead_start != 0 first.
63 */
64 ra->ahead_size += ra->ahead_start;
65 ra->ahead_start = 0;
66}
67
56static inline void ra_off(struct file_ra_state *ra) 68static inline void ra_off(struct file_ra_state *ra)
57{ 69{
58 ra->start = 0; 70 ra->start = 0;
59 ra->flags = 0; 71 ra->flags = 0;
60 ra->size = 0; 72 ra->size = 0;
61 ra->ahead_start = 0; 73 reset_ahead_window(ra);
62 ra->ahead_size = 0;
63 return; 74 return;
64} 75}
65 76
@@ -73,10 +84,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
73{ 84{
74 unsigned long newsize = roundup_pow_of_two(size); 85 unsigned long newsize = roundup_pow_of_two(size);
75 86
76 if (newsize <= max / 64) 87 if (newsize <= max / 32)
77 newsize = newsize * newsize; 88 newsize = newsize * 4;
78 else if (newsize <= max / 4) 89 else if (newsize <= max / 4)
79 newsize = max / 4; 90 newsize = newsize * 2;
80 else 91 else
81 newsize = max; 92 newsize = max;
82 return newsize; 93 return newsize;
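Worked through by hand for illustration, with a maximum readahead window of 128 pages, the new sizing grows the initial window step by step instead of jumping straight to max/4:

	get_init_ra_size(2, 128)  : roundup(2)  = 2,  2 <= 128/32, so 2 * 4 = 8 pages
	get_init_ra_size(8, 128)  : roundup(8)  = 8,  8 <= 128/4,  so 8 * 2 = 16 pages
	get_init_ra_size(40, 128) : roundup(40) = 64, 64 > 128/4,  so clamped to 128 pages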
@@ -427,8 +438,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
427 * congestion. The ahead window will be closed anyway 438 * congestion. The ahead window will be closed anyway
428 * in case we failed due to excessive page cache hits. 439 * in case we failed due to excessive page cache hits.
429 */ 440 */
430 ra->ahead_start = 0; 441 reset_ahead_window(ra);
431 ra->ahead_size = 0;
432 } 442 }
433 443
434 return ret; 444 return ret;
@@ -521,11 +531,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
521 * If we get here we are doing sequential IO and this was not the first 531 * If we get here we are doing sequential IO and this was not the first
522 * occurrence (i.e. we have an existing window) 532 * occurrence (i.e. we have an existing window)
523 */ 533 */
524
525 if (ra->ahead_start == 0) { /* no ahead window yet */ 534 if (ra->ahead_start == 0) { /* no ahead window yet */
526 if (!make_ahead_window(mapping, filp, ra, 0)) 535 if (!make_ahead_window(mapping, filp, ra, 0))
527 goto out; 536 goto recheck;
528 } 537 }
538
529 /* 539 /*
530 * Already have an ahead window, check if we crossed into it. 540 * Already have an ahead window, check if we crossed into it.
531 * If so, shift windows and issue a new ahead window. 541 * If so, shift windows and issue a new ahead window.
@@ -537,11 +547,16 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
537 ra->start = ra->ahead_start; 547 ra->start = ra->ahead_start;
538 ra->size = ra->ahead_size; 548 ra->size = ra->ahead_size;
539 make_ahead_window(mapping, filp, ra, 0); 549 make_ahead_window(mapping, filp, ra, 0);
550recheck:
551 /* prev_page shouldn't overrun the ahead window */
552 ra->prev_page = min(ra->prev_page,
553 ra->ahead_start + ra->ahead_size - 1);
540 } 554 }
541 555
542out: 556out:
543 return ra->prev_page + 1; 557 return ra->prev_page + 1;
544} 558}
559EXPORT_SYMBOL_GPL(page_cache_readahead);
545 560
546/* 561/*
547 * handle_ra_miss() is called when it is known that a page which should have 562 * handle_ra_miss() is called when it is known that a page which should have
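
The new 'recheck' path above works together with reset_ahead_window(): after a failed window allocation, ahead_start is zero while ahead_size still holds the old window end, so the clamp on prev_page never reads an undefined value. A small sketch of that clamp, using plain unsigned longs instead of struct file_ra_state:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* clamp the cached "previous page" index so it never runs past the ahead window */
static unsigned long clamp_prev_page(unsigned long prev_page,
                                     unsigned long ahead_start,
                                     unsigned long ahead_size)
{
        return MIN(prev_page, ahead_start + ahead_size - 1);
}

int main(void)
{
        /* ahead window covers pages [100, 131]; a prev_page of 200 gets pulled back */
        printf("%lu\n", clamp_prev_page(200, 100, 32));   /* 131 */
        printf("%lu\n", clamp_prev_page(110, 100, 32));   /* 110, unchanged */
        return 0;
}
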
diff --git a/mm/rmap.c b/mm/rmap.c
index 67f0e20b101f..1963e269314d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,13 +56,11 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58 58
59//#define RMAP_DEBUG /* can be enabled only for debugging */ 59struct kmem_cache *anon_vma_cachep;
60
61kmem_cache_t *anon_vma_cachep;
62 60
63static inline void validate_anon_vma(struct vm_area_struct *find_vma) 61static inline void validate_anon_vma(struct vm_area_struct *find_vma)
64{ 62{
65#ifdef RMAP_DEBUG 63#ifdef CONFIG_DEBUG_VM
66 struct anon_vma *anon_vma = find_vma->anon_vma; 64 struct anon_vma *anon_vma = find_vma->anon_vma;
67 struct vm_area_struct *vma; 65 struct vm_area_struct *vma;
68 unsigned int mapcount = 0; 66 unsigned int mapcount = 0;
@@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma)
166 anon_vma_free(anon_vma); 164 anon_vma_free(anon_vma);
167} 165}
168 166
169static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 167static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
168 unsigned long flags)
170{ 169{
171 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
172 SLAB_CTOR_CONSTRUCTOR) { 171 SLAB_CTOR_CONSTRUCTOR) {
@@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page)
550void page_remove_rmap(struct page *page) 549void page_remove_rmap(struct page *page)
551{ 550{
552 if (atomic_add_negative(-1, &page->_mapcount)) { 551 if (atomic_add_negative(-1, &page->_mapcount)) {
553 if (page_mapcount(page) < 0) { 552#ifdef CONFIG_DEBUG_VM
553 if (unlikely(page_mapcount(page) < 0)) {
554 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 554 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
555 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 555 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
556 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 556 printk (KERN_EMERG " page->count = %x\n", page_count(page));
557 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 557 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
558 } 558 }
559 559#endif
560 BUG_ON(page_mapcount(page) < 0); 560 BUG_ON(page_mapcount(page) < 0);
561 /* 561 /*
562 * It would be tidy to reset the PageAnon mapping here, 562 * It would be tidy to reset the PageAnon mapping here,
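
page_remove_rmap() above relies on atomic_add_negative() to detect the final unmap; the kernel's _mapcount convention starts at -1 for an unmapped page. A userspace analogue with C11 atomics, illustrative only:

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

/* _mapcount convention: -1 means "not mapped", 0 means one mapping, and so on */
static atomic_int mapcount = -1;

static void map_page(void)  { atomic_fetch_add(&mapcount, 1); }

static int unmap_page(void)
{
        /* returns non-zero when this was the last mapping (counter drops back to -1) */
        return atomic_fetch_sub(&mapcount, 1) - 1 < 0;
}

int main(void)
{
        map_page();
        map_page();
        assert(!unmap_page());          /* one mapping still left */
        assert(unmap_page());           /* last unmap detected */
        printf("final mapcount = %d\n", atomic_load(&mapcount));
        return 0;
}
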
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c455fbaff7b..37eaf42ed2c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -875,7 +875,7 @@ redirty:
875} 875}
876 876
877#ifdef CONFIG_NUMA 877#ifdef CONFIG_NUMA
878static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 878static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
879{ 879{
880 char *nodelist = strchr(value, ':'); 880 char *nodelist = strchr(value, ':');
881 int err = 1; 881 int err = 1;
@@ -2119,7 +2119,7 @@ failed:
2119 return err; 2119 return err;
2120} 2120}
2121 2121
2122static kmem_cache_t *shmem_inode_cachep; 2122static struct kmem_cache *shmem_inode_cachep;
2123 2123
2124static struct inode *shmem_alloc_inode(struct super_block *sb) 2124static struct inode *shmem_alloc_inode(struct super_block *sb)
2125{ 2125{
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode)
2139 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2139 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2140} 2140}
2141 2141
2142static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) 2142static void init_once(void *foo, struct kmem_cache *cachep,
2143 unsigned long flags)
2143{ 2144{
2144 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2145 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2145 2146
diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f07ab04..4cbf8bb13557 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -50,7 +50,7 @@
50 * The head array is strictly LIFO and should improve the cache hit rates. 50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations. 51 * On SMP, it additionally reduces the spinlock operations.
52 * 52 *
53 * The c_cpuarray may not be read with enabled local interrupts - 53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function(). 54 * it's changed with a smp_call_function().
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
@@ -94,6 +94,7 @@
94#include <linux/interrupt.h> 94#include <linux/interrupt.h>
95#include <linux/init.h> 95#include <linux/init.h>
96#include <linux/compiler.h> 96#include <linux/compiler.h>
97#include <linux/cpuset.h>
97#include <linux/seq_file.h> 98#include <linux/seq_file.h>
98#include <linux/notifier.h> 99#include <linux/notifier.h>
99#include <linux/kallsyms.h> 100#include <linux/kallsyms.h>
@@ -170,15 +171,15 @@
170#if DEBUG 171#if DEBUG
171# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 172# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
172 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 173 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
173 SLAB_NO_REAP | SLAB_CACHE_DMA | \ 174 SLAB_CACHE_DMA | \
174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 175 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 176 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
176 SLAB_DESTROY_BY_RCU) 177 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
177#else 178#else
178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ 179# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 180 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 181 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 SLAB_DESTROY_BY_RCU) 182 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
182#endif 183#endif
183 184
184/* 185/*
@@ -203,7 +204,8 @@
203typedef unsigned int kmem_bufctl_t; 204typedef unsigned int kmem_bufctl_t;
204#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) 205#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
205#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) 206#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
206#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) 207#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
208#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
207 209
208/* Max number of objs-per-slab for caches which use off-slab slabs. 210/* Max number of objs-per-slab for caches which use off-slab slabs.
209 * Needed to avoid a possible looping condition in cache_grow(). 211 * Needed to avoid a possible looping condition in cache_grow().
@@ -266,16 +268,17 @@ struct array_cache {
266 unsigned int batchcount; 268 unsigned int batchcount;
267 unsigned int touched; 269 unsigned int touched;
268 spinlock_t lock; 270 spinlock_t lock;
269 void *entry[0]; /* 271 void *entry[0]; /*
270 * Must have this definition in here for the proper 272 * Must have this definition in here for the proper
271 * alignment of array_cache. Also simplifies accessing 273 * alignment of array_cache. Also simplifies accessing
272 * the entries. 274 * the entries.
273 * [0] is for gcc 2.95. It should really be []. 275 * [0] is for gcc 2.95. It should really be [].
274 */ 276 */
275}; 277};
276 278
277/* bootstrap: The caches do not work without cpuarrays anymore, 279/*
278 * but the cpuarrays are allocated from the generic caches... 280 * bootstrap: The caches do not work without cpuarrays anymore, but the
281 * cpuarrays are allocated from the generic caches...
279 */ 282 */
280#define BOOT_CPUCACHE_ENTRIES 1 283#define BOOT_CPUCACHE_ENTRIES 1
281struct arraycache_init { 284struct arraycache_init {
@@ -291,13 +294,13 @@ struct kmem_list3 {
291 struct list_head slabs_full; 294 struct list_head slabs_full;
292 struct list_head slabs_free; 295 struct list_head slabs_free;
293 unsigned long free_objects; 296 unsigned long free_objects;
294 unsigned long next_reap;
295 int free_touched;
296 unsigned int free_limit; 297 unsigned int free_limit;
297 unsigned int colour_next; /* Per-node cache coloring */ 298 unsigned int colour_next; /* Per-node cache coloring */
298 spinlock_t list_lock; 299 spinlock_t list_lock;
299 struct array_cache *shared; /* shared per node */ 300 struct array_cache *shared; /* shared per node */
300 struct array_cache **alien; /* on other nodes */ 301 struct array_cache **alien; /* on other nodes */
302 unsigned long next_reap; /* updated without locking */
303 int free_touched; /* updated without locking */
301}; 304};
302 305
303/* 306/*
@@ -310,10 +313,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
310#define SIZE_L3 (1 + MAX_NUMNODES) 313#define SIZE_L3 (1 + MAX_NUMNODES)
311 314
312/* 315/*
313 * This function must be completely optimized away if 316 * This function must be completely optimized away if a constant is passed to
314 * a constant is passed to it. Mostly the same as 317 * it. Mostly the same as what is in linux/slab.h except it returns an index.
315 * what is in linux/slab.h except it returns an
316 * index.
317 */ 318 */
318static __always_inline int index_of(const size_t size) 319static __always_inline int index_of(const size_t size)
319{ 320{
@@ -351,14 +352,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
351 parent->free_touched = 0; 352 parent->free_touched = 0;
352} 353}
353 354
354#define MAKE_LIST(cachep, listp, slab, nodeid) \ 355#define MAKE_LIST(cachep, listp, slab, nodeid) \
355 do { \ 356 do { \
356 INIT_LIST_HEAD(listp); \ 357 INIT_LIST_HEAD(listp); \
357 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ 358 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
358 } while (0) 359 } while (0)
359 360
360#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 361#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
361 do { \ 362 do { \
362 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 363 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
363 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 364 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
364 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 365 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
@@ -373,28 +374,30 @@ static void kmem_list3_init(struct kmem_list3 *parent)
373struct kmem_cache { 374struct kmem_cache {
374/* 1) per-cpu data, touched during every alloc/free */ 375/* 1) per-cpu data, touched during every alloc/free */
375 struct array_cache *array[NR_CPUS]; 376 struct array_cache *array[NR_CPUS];
377/* 2) Cache tunables. Protected by cache_chain_mutex */
376 unsigned int batchcount; 378 unsigned int batchcount;
377 unsigned int limit; 379 unsigned int limit;
378 unsigned int shared; 380 unsigned int shared;
381
379 unsigned int buffer_size; 382 unsigned int buffer_size;
380/* 2) touched by every alloc & free from the backend */ 383/* 3) touched by every alloc & free from the backend */
381 struct kmem_list3 *nodelists[MAX_NUMNODES]; 384 struct kmem_list3 *nodelists[MAX_NUMNODES];
382 unsigned int flags; /* constant flags */
383 unsigned int num; /* # of objs per slab */
384 spinlock_t spinlock;
385 385
386/* 3) cache_grow/shrink */ 386 unsigned int flags; /* constant flags */
387 unsigned int num; /* # of objs per slab */
388
389/* 4) cache_grow/shrink */
387 /* order of pgs per slab (2^n) */ 390 /* order of pgs per slab (2^n) */
388 unsigned int gfporder; 391 unsigned int gfporder;
389 392
390 /* force GFP flags, e.g. GFP_DMA */ 393 /* force GFP flags, e.g. GFP_DMA */
391 gfp_t gfpflags; 394 gfp_t gfpflags;
392 395
393 size_t colour; /* cache colouring range */ 396 size_t colour; /* cache colouring range */
394 unsigned int colour_off; /* colour offset */ 397 unsigned int colour_off; /* colour offset */
395 struct kmem_cache *slabp_cache; 398 struct kmem_cache *slabp_cache;
396 unsigned int slab_size; 399 unsigned int slab_size;
397 unsigned int dflags; /* dynamic flags */ 400 unsigned int dflags; /* dynamic flags */
398 401
399 /* constructor func */ 402 /* constructor func */
400 void (*ctor) (void *, struct kmem_cache *, unsigned long); 403 void (*ctor) (void *, struct kmem_cache *, unsigned long);
@@ -402,11 +405,11 @@ struct kmem_cache {
402 /* de-constructor func */ 405 /* de-constructor func */
403 void (*dtor) (void *, struct kmem_cache *, unsigned long); 406 void (*dtor) (void *, struct kmem_cache *, unsigned long);
404 407
405/* 4) cache creation/removal */ 408/* 5) cache creation/removal */
406 const char *name; 409 const char *name;
407 struct list_head next; 410 struct list_head next;
408 411
409/* 5) statistics */ 412/* 6) statistics */
410#if STATS 413#if STATS
411 unsigned long num_active; 414 unsigned long num_active;
412 unsigned long num_allocations; 415 unsigned long num_allocations;
@@ -438,8 +441,9 @@ struct kmem_cache {
438#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 441#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
439 442
440#define BATCHREFILL_LIMIT 16 443#define BATCHREFILL_LIMIT 16
441/* Optimization question: fewer reaps means less 444/*
442 * probability for unnecessary cpucache drain/refill cycles. 445 * Optimization question: fewer reaps means less probability for unnecessary
446 * cpucache drain/refill cycles.
443 * 447 *
444 * OTOH the cpuarrays can contain lots of objects, 448 * OTOH the cpuarrays can contain lots of objects,
445 * which could lock up otherwise freeable slabs. 449 * which could lock up otherwise freeable slabs.
@@ -453,17 +457,19 @@ struct kmem_cache {
453#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 457#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
454#define STATS_INC_GROWN(x) ((x)->grown++) 458#define STATS_INC_GROWN(x) ((x)->grown++)
455#define STATS_INC_REAPED(x) ((x)->reaped++) 459#define STATS_INC_REAPED(x) ((x)->reaped++)
456#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ 460#define STATS_SET_HIGH(x) \
457 (x)->high_mark = (x)->num_active; \ 461 do { \
458 } while (0) 462 if ((x)->num_active > (x)->high_mark) \
463 (x)->high_mark = (x)->num_active; \
464 } while (0)
459#define STATS_INC_ERR(x) ((x)->errors++) 465#define STATS_INC_ERR(x) ((x)->errors++)
460#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 466#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
461#define STATS_INC_NODEFREES(x) ((x)->node_frees++) 467#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
462#define STATS_SET_FREEABLE(x, i) \ 468#define STATS_SET_FREEABLE(x, i) \
463 do { if ((x)->max_freeable < i) \ 469 do { \
464 (x)->max_freeable = i; \ 470 if ((x)->max_freeable < i) \
465 } while (0) 471 (x)->max_freeable = i; \
466 472 } while (0)
467#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 473#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
468#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 474#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
469#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 475#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
@@ -478,9 +484,7 @@ struct kmem_cache {
478#define STATS_INC_ERR(x) do { } while (0) 484#define STATS_INC_ERR(x) do { } while (0)
479#define STATS_INC_NODEALLOCS(x) do { } while (0) 485#define STATS_INC_NODEALLOCS(x) do { } while (0)
480#define STATS_INC_NODEFREES(x) do { } while (0) 486#define STATS_INC_NODEFREES(x) do { } while (0)
481#define STATS_SET_FREEABLE(x, i) \ 487#define STATS_SET_FREEABLE(x, i) do { } while (0)
482 do { } while (0)
483
484#define STATS_INC_ALLOCHIT(x) do { } while (0) 488#define STATS_INC_ALLOCHIT(x) do { } while (0)
485#define STATS_INC_ALLOCMISS(x) do { } while (0) 489#define STATS_INC_ALLOCMISS(x) do { } while (0)
486#define STATS_INC_FREEHIT(x) do { } while (0) 490#define STATS_INC_FREEHIT(x) do { } while (0)
@@ -488,7 +492,8 @@ struct kmem_cache {
488#endif 492#endif
489 493
490#if DEBUG 494#if DEBUG
491/* Magic nums for obj red zoning. 495/*
496 * Magic nums for obj red zoning.
492 * Placed in the first word before and the first word after an obj. 497 * Placed in the first word before and the first word after an obj.
493 */ 498 */
494#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ 499#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
@@ -499,7 +504,8 @@ struct kmem_cache {
499#define POISON_FREE 0x6b /* for use-after-free poisoning */ 504#define POISON_FREE 0x6b /* for use-after-free poisoning */
500#define POISON_END 0xa5 /* end-byte of poisoning */ 505#define POISON_END 0xa5 /* end-byte of poisoning */
501 506
502/* memory layout of objects: 507/*
508 * memory layout of objects:
503 * 0 : objp 509 * 0 : objp
504 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 510 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
505 * the end of an object is aligned with the end of the real 511 * the end of an object is aligned with the end of the real
@@ -508,7 +514,8 @@ struct kmem_cache {
508 * redzone word. 514 * redzone word.
509 * cachep->obj_offset: The real object. 515 * cachep->obj_offset: The real object.
510 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 516 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
511 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 517 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
518 * [BYTES_PER_WORD long]
512 */ 519 */
513static int obj_offset(struct kmem_cache *cachep) 520static int obj_offset(struct kmem_cache *cachep)
514{ 521{
@@ -552,8 +559,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
552#endif 559#endif
553 560
554/* 561/*
555 * Maximum size of an obj (in 2^order pages) 562 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
556 * and absolute limit for the gfp order. 563 * order.
557 */ 564 */
558#if defined(CONFIG_LARGE_ALLOCS) 565#if defined(CONFIG_LARGE_ALLOCS)
559#define MAX_OBJ_ORDER 13 /* up to 32Mb */ 566#define MAX_OBJ_ORDER 13 /* up to 32Mb */
@@ -573,9 +580,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
573#define BREAK_GFP_ORDER_LO 0 580#define BREAK_GFP_ORDER_LO 0
574static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 581static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
575 582
576/* Functions for storing/retrieving the cachep and or slab from the 583/*
577 * global 'mem_map'. These are used to find the slab an obj belongs to. 584 * Functions for storing/retrieving the cachep and or slab from the page
578 * With kfree(), these are used to find the cache which an obj belongs to. 585 * allocator. These are used to find the slab an obj belongs to. With kfree(),
586 * these are used to find the cache which an obj belongs to.
579 */ 587 */
580static inline void page_set_cache(struct page *page, struct kmem_cache *cache) 588static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
581{ 589{
@@ -584,6 +592,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
584 592
585static inline struct kmem_cache *page_get_cache(struct page *page) 593static inline struct kmem_cache *page_get_cache(struct page *page)
586{ 594{
595 if (unlikely(PageCompound(page)))
596 page = (struct page *)page_private(page);
587 return (struct kmem_cache *)page->lru.next; 597 return (struct kmem_cache *)page->lru.next;
588} 598}
589 599
@@ -594,6 +604,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
594 604
595static inline struct slab *page_get_slab(struct page *page) 605static inline struct slab *page_get_slab(struct page *page)
596{ 606{
607 if (unlikely(PageCompound(page)))
608 page = (struct page *)page_private(page);
597 return (struct slab *)page->lru.prev; 609 return (struct slab *)page->lru.prev;
598} 610}
599 611
@@ -609,7 +621,21 @@ static inline struct slab *virt_to_slab(const void *obj)
609 return page_get_slab(page); 621 return page_get_slab(page);
610} 622}
611 623
612/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 624static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
625 unsigned int idx)
626{
627 return slab->s_mem + cache->buffer_size * idx;
628}
629
630static inline unsigned int obj_to_index(struct kmem_cache *cache,
631 struct slab *slab, void *obj)
632{
633 return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
634}
635
636/*
637 * These are the default caches for kmalloc. Custom caches can have other sizes.
638 */
613struct cache_sizes malloc_sizes[] = { 639struct cache_sizes malloc_sizes[] = {
614#define CACHE(x) { .cs_size = (x) }, 640#define CACHE(x) { .cs_size = (x) },
615#include <linux/kmalloc_sizes.h> 641#include <linux/kmalloc_sizes.h>
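
The new index_to_obj() and obj_to_index() helpers above are base-plus-stride arithmetic over a slab's object area; a self-contained model with a mock slab layout (not the kernel structures):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

struct mock_slab {
        char *s_mem;                    /* start of the object area */
};

static void *index_to_obj(struct mock_slab *slab, size_t buffer_size, unsigned int idx)
{
        return slab->s_mem + buffer_size * idx;
}

static unsigned int obj_to_index(struct mock_slab *slab, size_t buffer_size, void *obj)
{
        return (unsigned int)(((char *)obj - slab->s_mem) / buffer_size);
}

int main(void)
{
        size_t buffer_size = 64;        /* assumed per-object footprint */
        struct mock_slab slab = { .s_mem = malloc(buffer_size * 8) };
        void *obj = index_to_obj(&slab, buffer_size, 3);

        assert(obj_to_index(&slab, buffer_size, obj) == 3);
        printf("object 3 lives at offset %ld\n", (long)((char *)obj - slab.s_mem));
        free(slab.s_mem);
        return 0;
}
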
@@ -642,8 +668,6 @@ static struct kmem_cache cache_cache = {
642 .limit = BOOT_CPUCACHE_ENTRIES, 668 .limit = BOOT_CPUCACHE_ENTRIES,
643 .shared = 1, 669 .shared = 1,
644 .buffer_size = sizeof(struct kmem_cache), 670 .buffer_size = sizeof(struct kmem_cache),
645 .flags = SLAB_NO_REAP,
646 .spinlock = SPIN_LOCK_UNLOCKED,
647 .name = "kmem_cache", 671 .name = "kmem_cache",
648#if DEBUG 672#if DEBUG
649 .obj_size = sizeof(struct kmem_cache), 673 .obj_size = sizeof(struct kmem_cache),
@@ -655,8 +679,8 @@ static DEFINE_MUTEX(cache_chain_mutex);
655static struct list_head cache_chain; 679static struct list_head cache_chain;
656 680
657/* 681/*
658 * vm_enough_memory() looks at this to determine how many 682 * vm_enough_memory() looks at this to determine how many slab-allocated pages
659 * slab-allocated pages are possibly freeable under pressure 683 * are possibly freeable under pressure
660 * 684 *
661 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 685 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
662 */ 686 */
@@ -675,7 +699,8 @@ static enum {
675 699
676static DEFINE_PER_CPU(struct work_struct, reap_work); 700static DEFINE_PER_CPU(struct work_struct, reap_work);
677 701
678static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); 702static void free_block(struct kmem_cache *cachep, void **objpp, int len,
703 int node);
679static void enable_cpucache(struct kmem_cache *cachep); 704static void enable_cpucache(struct kmem_cache *cachep);
680static void cache_reap(void *unused); 705static void cache_reap(void *unused);
681static int __node_shrink(struct kmem_cache *cachep, int node); 706static int __node_shrink(struct kmem_cache *cachep, int node);
@@ -685,7 +710,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
685 return cachep->array[smp_processor_id()]; 710 return cachep->array[smp_processor_id()];
686} 711}
687 712
688static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) 713static inline struct kmem_cache *__find_general_cachep(size_t size,
714 gfp_t gfpflags)
689{ 715{
690 struct cache_sizes *csizep = malloc_sizes; 716 struct cache_sizes *csizep = malloc_sizes;
691 717
@@ -720,8 +746,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align)
720 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 746 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
721} 747}
722 748
723/* Calculate the number of objects and left-over bytes for a given 749/*
724 buffer size. */ 750 * Calculate the number of objects and left-over bytes for a given buffer size.
751 */
725static void cache_estimate(unsigned long gfporder, size_t buffer_size, 752static void cache_estimate(unsigned long gfporder, size_t buffer_size,
726 size_t align, int flags, size_t *left_over, 753 size_t align, int flags, size_t *left_over,
727 unsigned int *num) 754 unsigned int *num)
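
cache_estimate() above answers "how many objects of this size fit in a 2^gfporder-page slab, and how many bytes are left over". A simplified model of the off-slab-management case only; the real function also accounts for on-slab struct slab and bufctl overhead:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* off-slab management: the whole slab is available for objects */
static void estimate(unsigned long gfporder, unsigned long buffer_size,
                     unsigned long *num, unsigned long *left_over)
{
        unsigned long slab_size = PAGE_SIZE << gfporder;

        *num = slab_size / buffer_size;
        *left_over = slab_size - *num * buffer_size;
}

int main(void)
{
        unsigned long num, left;

        estimate(0, 100, &num, &left);  /* one page, 100-byte objects */
        printf("order 0: %lu objects, %lu bytes wasted\n", num, left);
        estimate(1, 100, &num, &left);  /* two pages reduce the relative waste */
        printf("order 1: %lu objects, %lu bytes wasted\n", num, left);
        return 0;
}
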
@@ -782,7 +809,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
782 809
783#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 810#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
784 811
785static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) 812static void __slab_error(const char *function, struct kmem_cache *cachep,
813 char *msg)
786{ 814{
787 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 815 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
788 function, cachep->name, msg); 816 function, cachep->name, msg);
@@ -804,7 +832,7 @@ static void init_reap_node(int cpu)
804 832
805 node = next_node(cpu_to_node(cpu), node_online_map); 833 node = next_node(cpu_to_node(cpu), node_online_map);
806 if (node == MAX_NUMNODES) 834 if (node == MAX_NUMNODES)
807 node = 0; 835 node = first_node(node_online_map);
808 836
809 __get_cpu_var(reap_node) = node; 837 __get_cpu_var(reap_node) = node;
810} 838}
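
The init_reap_node() fix above makes the per-cpu reap node wrap to the first online node instead of hard-coding node 0, which presumably matters on systems where node 0 is not online. A sketch of that round-robin over a sparse node set, using an array of online node ids in place of a nodemask:

#include <stdio.h>

/* assumed: nodes 1, 3 and 4 are online; node 0 is absent */
static const int online_nodes[] = { 1, 3, 4 };
#define NR_ONLINE (sizeof(online_nodes) / sizeof(online_nodes[0]))

/* pick the next online node after 'node', wrapping to the first one */
static int next_online_node(int node)
{
        for (unsigned int i = 0; i < NR_ONLINE; i++)
                if (online_nodes[i] > node)
                        return online_nodes[i];
        return online_nodes[0];        /* wrap: first_node(), not 0 */
}

int main(void)
{
        int node = 1;

        for (int i = 0; i < 5; i++) {
                node = next_online_node(node);
                printf("reap node -> %d\n", node);      /* 3 4 1 3 4 */
        }
        return 0;
}
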
@@ -870,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries,
870 return nc; 898 return nc;
871} 899}
872 900
901/*
902 * Transfer objects in one arraycache to another.
903 * Locking must be handled by the caller.
904 *
905 * Return the number of entries transferred.
906 */
907static int transfer_objects(struct array_cache *to,
908 struct array_cache *from, unsigned int max)
909{
910 /* Figure out how many entries to transfer */
911 int nr = min(min(from->avail, max), to->limit - to->avail);
912
913 if (!nr)
914 return 0;
915
916 memcpy(to->entry + to->avail, from->entry + from->avail -nr,
917 sizeof(void *) *nr);
918
919 from->avail -= nr;
920 to->avail += nr;
921 to->touched = 1;
922 return nr;
923}
924
873#ifdef CONFIG_NUMA 925#ifdef CONFIG_NUMA
874static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 926static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
927static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
875 928
876static struct array_cache **alloc_alien_cache(int node, int limit) 929static struct array_cache **alloc_alien_cache(int node, int limit)
877{ 930{
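
transfer_objects() above moves a batch of free pointers from one array cache to another with a single memcpy, bounded by both the source's available entries and the destination's free space. A userspace model of that transfer, with fixed-size pointer stacks standing in for struct array_cache:

#include <stdio.h>
#include <string.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

struct stack {
        unsigned int avail;     /* number of valid entries */
        unsigned int limit;     /* capacity */
        void *entry[32];
};

static unsigned int transfer(struct stack *to, struct stack *from, unsigned int max)
{
        unsigned int nr = MIN(MIN(from->avail, max), to->limit - to->avail);

        if (!nr)
                return 0;
        /* take the most recently pushed entries from the source */
        memcpy(to->entry + to->avail, from->entry + from->avail - nr,
               sizeof(void *) * nr);
        from->avail -= nr;
        to->avail += nr;
        return nr;
}

int main(void)
{
        static int objs[10];
        struct stack a = { .avail = 10, .limit = 32 };
        struct stack b = { .avail = 0, .limit = 4 };

        for (int i = 0; i < 10; i++)
                a.entry[i] = &objs[i];

        /* destination space (4) is the limiting factor here, not max (8) */
        printf("moved %u entries\n", transfer(&b, &a, 8));      /* moved 4 entries */
        printf("a.avail=%u b.avail=%u\n", a.avail, b.avail);    /* 6 and 4 */
        return 0;
}
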
@@ -906,10 +959,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
906 959
907 if (!ac_ptr) 960 if (!ac_ptr)
908 return; 961 return;
909
910 for_each_node(i) 962 for_each_node(i)
911 kfree(ac_ptr[i]); 963 kfree(ac_ptr[i]);
912
913 kfree(ac_ptr); 964 kfree(ac_ptr);
914} 965}
915 966
@@ -920,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
920 971
921 if (ac->avail) { 972 if (ac->avail) {
922 spin_lock(&rl3->list_lock); 973 spin_lock(&rl3->list_lock);
974 /*
975 * Stuff objects into the remote nodes shared array first.
976 * That way we could avoid the overhead of putting the objects
977 * into the free lists and getting them back later.
978 */
979 transfer_objects(rl3->shared, ac, ac->limit);
980
923 free_block(cachep, ac->entry, ac->avail, node); 981 free_block(cachep, ac->entry, ac->avail, node);
924 ac->avail = 0; 982 ac->avail = 0;
925 spin_unlock(&rl3->list_lock); 983 spin_unlock(&rl3->list_lock);
@@ -935,15 +993,16 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
935 993
936 if (l3->alien) { 994 if (l3->alien) {
937 struct array_cache *ac = l3->alien[node]; 995 struct array_cache *ac = l3->alien[node];
938 if (ac && ac->avail) { 996
939 spin_lock_irq(&ac->lock); 997 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
940 __drain_alien_cache(cachep, ac, node); 998 __drain_alien_cache(cachep, ac, node);
941 spin_unlock_irq(&ac->lock); 999 spin_unlock_irq(&ac->lock);
942 } 1000 }
943 } 1001 }
944} 1002}
945 1003
946static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) 1004static void drain_alien_cache(struct kmem_cache *cachep,
1005 struct array_cache **alien)
947{ 1006{
948 int i = 0; 1007 int i = 0;
949 struct array_cache *ac; 1008 struct array_cache *ac;
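
The reap_alien() change above switches the periodic reaper from spin_lock_irq() to spin_trylock_irq(), so a background pass skips a contended alien cache instead of stalling on it. A userspace analogue of that "drain only if the lock is free" pattern with pthreads (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t alien_lock = PTHREAD_MUTEX_INITIALIZER;
static int alien_avail = 8;            /* pretend cached objects awaiting drain */

/* periodic reaper: best effort, never blocks on a busy cache */
static void reap_alien(void)
{
        if (alien_avail && pthread_mutex_trylock(&alien_lock) == 0) {
                printf("draining %d objects\n", alien_avail);
                alien_avail = 0;
                pthread_mutex_unlock(&alien_lock);
        } else {
                printf("cache busy or empty, skipping this pass\n");
        }
}

int main(void)
{
        reap_alien();                   /* lock free: drains */
        reap_alien();                   /* nothing left: skips */

        pthread_mutex_lock(&alien_lock);
        alien_avail = 3;
        reap_alien();                   /* lock held: skips instead of blocking */
        pthread_mutex_unlock(&alien_lock);
        return 0;
}
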
@@ -986,20 +1045,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
986 switch (action) { 1045 switch (action) {
987 case CPU_UP_PREPARE: 1046 case CPU_UP_PREPARE:
988 mutex_lock(&cache_chain_mutex); 1047 mutex_lock(&cache_chain_mutex);
989 /* we need to do this right in the beginning since 1048 /*
1049 * We need to do this right in the beginning since
990 * alloc_arraycache's are going to use this list. 1050 * alloc_arraycache's are going to use this list.
991 * kmalloc_node allows us to add the slab to the right 1051 * kmalloc_node allows us to add the slab to the right
992 * kmem_list3 and not this cpu's kmem_list3 1052 * kmem_list3 and not this cpu's kmem_list3
993 */ 1053 */
994 1054
995 list_for_each_entry(cachep, &cache_chain, next) { 1055 list_for_each_entry(cachep, &cache_chain, next) {
996 /* setup the size64 kmemlist for cpu before we can 1056 /*
1057 * Set up the size64 kmemlist for cpu before we can
997 * begin anything. Make sure some other cpu on this 1058 * begin anything. Make sure some other cpu on this
998 * node has not already allocated this 1059 * node has not already allocated this
999 */ 1060 */
1000 if (!cachep->nodelists[node]) { 1061 if (!cachep->nodelists[node]) {
1001 if (!(l3 = kmalloc_node(memsize, 1062 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1002 GFP_KERNEL, node))) 1063 if (!l3)
1003 goto bad; 1064 goto bad;
1004 kmem_list3_init(l3); 1065 kmem_list3_init(l3);
1005 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 1066 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -1015,13 +1076,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1015 1076
1016 spin_lock_irq(&cachep->nodelists[node]->list_lock); 1077 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1017 cachep->nodelists[node]->free_limit = 1078 cachep->nodelists[node]->free_limit =
1018 (1 + nr_cpus_node(node)) * 1079 (1 + nr_cpus_node(node)) *
1019 cachep->batchcount + cachep->num; 1080 cachep->batchcount + cachep->num;
1020 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 1081 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1021 } 1082 }
1022 1083
1023 /* Now we can go ahead with allocating the shared array's 1084 /*
1024 & array cache's */ 1085 * Now we can go ahead with allocating the shared arrays and
1086 * array caches
1087 */
1025 list_for_each_entry(cachep, &cache_chain, next) { 1088 list_for_each_entry(cachep, &cache_chain, next) {
1026 struct array_cache *nc; 1089 struct array_cache *nc;
1027 struct array_cache *shared; 1090 struct array_cache *shared;
@@ -1041,7 +1104,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1041 if (!alien) 1104 if (!alien)
1042 goto bad; 1105 goto bad;
1043 cachep->array[cpu] = nc; 1106 cachep->array[cpu] = nc;
1044
1045 l3 = cachep->nodelists[node]; 1107 l3 = cachep->nodelists[node];
1046 BUG_ON(!l3); 1108 BUG_ON(!l3);
1047 1109
@@ -1061,7 +1123,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1061 } 1123 }
1062#endif 1124#endif
1063 spin_unlock_irq(&l3->list_lock); 1125 spin_unlock_irq(&l3->list_lock);
1064
1065 kfree(shared); 1126 kfree(shared);
1066 free_alien_cache(alien); 1127 free_alien_cache(alien);
1067 } 1128 }
@@ -1083,7 +1144,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1083 /* fall thru */ 1144 /* fall thru */
1084 case CPU_UP_CANCELED: 1145 case CPU_UP_CANCELED:
1085 mutex_lock(&cache_chain_mutex); 1146 mutex_lock(&cache_chain_mutex);
1086
1087 list_for_each_entry(cachep, &cache_chain, next) { 1147 list_for_each_entry(cachep, &cache_chain, next) {
1088 struct array_cache *nc; 1148 struct array_cache *nc;
1089 struct array_cache *shared; 1149 struct array_cache *shared;
@@ -1150,7 +1210,7 @@ free_array_cache:
1150#endif 1210#endif
1151 } 1211 }
1152 return NOTIFY_OK; 1212 return NOTIFY_OK;
1153 bad: 1213bad:
1154 mutex_unlock(&cache_chain_mutex); 1214 mutex_unlock(&cache_chain_mutex);
1155 return NOTIFY_BAD; 1215 return NOTIFY_BAD;
1156} 1216}
@@ -1160,7 +1220,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
1160/* 1220/*
1161 * swap the static kmem_list3 with kmalloced memory 1221 * swap the static kmem_list3 with kmalloced memory
1162 */ 1222 */
1163static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) 1223static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1224 int nodeid)
1164{ 1225{
1165 struct kmem_list3 *ptr; 1226 struct kmem_list3 *ptr;
1166 1227
@@ -1175,8 +1236,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no
1175 local_irq_enable(); 1236 local_irq_enable();
1176} 1237}
1177 1238
1178/* Initialisation. 1239/*
1179 * Called after the gfp() functions have been enabled, and before smp_init(). 1240 * Initialisation. Called after the page allocator has been initialised and
1241 * before smp_init().
1180 */ 1242 */
1181void __init kmem_cache_init(void) 1243void __init kmem_cache_init(void)
1182{ 1244{
@@ -1201,9 +1263,9 @@ void __init kmem_cache_init(void)
1201 1263
1202 /* Bootstrap is tricky, because several objects are allocated 1264 /* Bootstrap is tricky, because several objects are allocated
1203 * from caches that do not exist yet: 1265 * from caches that do not exist yet:
1204 * 1) initialize the cache_cache cache: it contains the struct kmem_cache 1266 * 1) initialize the cache_cache cache: it contains the struct
1205 * structures of all caches, except cache_cache itself: cache_cache 1267 * kmem_cache structures of all caches, except cache_cache itself:
1206 * is statically allocated. 1268 * cache_cache is statically allocated.
1207 * Initially an __init data area is used for the head array and the 1269 * Initially an __init data area is used for the head array and the
1208 * kmem_list3 structures, it's replaced with a kmalloc allocated 1270 * kmem_list3 structures, it's replaced with a kmalloc allocated
1209 * array at the end of the bootstrap. 1271 * array at the end of the bootstrap.
@@ -1226,7 +1288,8 @@ void __init kmem_cache_init(void)
1226 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1288 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1227 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1289 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1228 1290
1229 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); 1291 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1292 cache_line_size());
1230 1293
1231 for (order = 0; order < MAX_ORDER; order++) { 1294 for (order = 0; order < MAX_ORDER; order++) {
1232 cache_estimate(order, cache_cache.buffer_size, 1295 cache_estimate(order, cache_cache.buffer_size,
@@ -1245,24 +1308,26 @@ void __init kmem_cache_init(void)
1245 sizes = malloc_sizes; 1308 sizes = malloc_sizes;
1246 names = cache_names; 1309 names = cache_names;
1247 1310
1248 /* Initialize the caches that provide memory for the array cache 1311 /*
1249 * and the kmem_list3 structures first. 1312 * Initialize the caches that provide memory for the array cache and the
1250 * Without this, further allocations will bug 1313 * kmem_list3 structures first. Without this, further allocations will
1314 * bug.
1251 */ 1315 */
1252 1316
1253 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1317 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1254 sizes[INDEX_AC].cs_size, 1318 sizes[INDEX_AC].cs_size,
1255 ARCH_KMALLOC_MINALIGN, 1319 ARCH_KMALLOC_MINALIGN,
1256 (ARCH_KMALLOC_FLAGS | 1320 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1257 SLAB_PANIC), NULL, NULL); 1321 NULL, NULL);
1258 1322
1259 if (INDEX_AC != INDEX_L3) 1323 if (INDEX_AC != INDEX_L3) {
1260 sizes[INDEX_L3].cs_cachep = 1324 sizes[INDEX_L3].cs_cachep =
1261 kmem_cache_create(names[INDEX_L3].name, 1325 kmem_cache_create(names[INDEX_L3].name,
1262 sizes[INDEX_L3].cs_size, 1326 sizes[INDEX_L3].cs_size,
1263 ARCH_KMALLOC_MINALIGN, 1327 ARCH_KMALLOC_MINALIGN,
1264 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, 1328 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1265 NULL); 1329 NULL, NULL);
1330 }
1266 1331
1267 while (sizes->cs_size != ULONG_MAX) { 1332 while (sizes->cs_size != ULONG_MAX) {
1268 /* 1333 /*
@@ -1272,13 +1337,13 @@ void __init kmem_cache_init(void)
1272 * Note for systems short on memory removing the alignment will 1337 * Note for systems short on memory removing the alignment will
1273 * allow tighter packing of the smaller caches. 1338 * allow tighter packing of the smaller caches.
1274 */ 1339 */
1275 if (!sizes->cs_cachep) 1340 if (!sizes->cs_cachep) {
1276 sizes->cs_cachep = kmem_cache_create(names->name, 1341 sizes->cs_cachep = kmem_cache_create(names->name,
1277 sizes->cs_size, 1342 sizes->cs_size,
1278 ARCH_KMALLOC_MINALIGN, 1343 ARCH_KMALLOC_MINALIGN,
1279 (ARCH_KMALLOC_FLAGS 1344 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1280 | SLAB_PANIC), 1345 NULL, NULL);
1281 NULL, NULL); 1346 }
1282 1347
1283 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1348 /* Inc off-slab bufctl limit until the ceiling is hit. */
1284 if (!(OFF_SLAB(sizes->cs_cachep))) { 1349 if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1287,13 +1352,11 @@ void __init kmem_cache_init(void)
1287 } 1352 }
1288 1353
1289 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1354 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1290 sizes->cs_size, 1355 sizes->cs_size,
1291 ARCH_KMALLOC_MINALIGN, 1356 ARCH_KMALLOC_MINALIGN,
1292 (ARCH_KMALLOC_FLAGS | 1357 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1293 SLAB_CACHE_DMA | 1358 SLAB_PANIC,
1294 SLAB_PANIC), NULL, 1359 NULL, NULL);
1295 NULL);
1296
1297 sizes++; 1360 sizes++;
1298 names++; 1361 names++;
1299 } 1362 }
@@ -1345,20 +1408,22 @@ void __init kmem_cache_init(void)
1345 struct kmem_cache *cachep; 1408 struct kmem_cache *cachep;
1346 mutex_lock(&cache_chain_mutex); 1409 mutex_lock(&cache_chain_mutex);
1347 list_for_each_entry(cachep, &cache_chain, next) 1410 list_for_each_entry(cachep, &cache_chain, next)
1348 enable_cpucache(cachep); 1411 enable_cpucache(cachep);
1349 mutex_unlock(&cache_chain_mutex); 1412 mutex_unlock(&cache_chain_mutex);
1350 } 1413 }
1351 1414
1352 /* Done! */ 1415 /* Done! */
1353 g_cpucache_up = FULL; 1416 g_cpucache_up = FULL;
1354 1417
1355 /* Register a cpu startup notifier callback 1418 /*
1356 * that initializes cpu_cache_get for all new cpus 1419 * Register a cpu startup notifier callback that initializes
1420 * cpu_cache_get for all new cpus
1357 */ 1421 */
1358 register_cpu_notifier(&cpucache_notifier); 1422 register_cpu_notifier(&cpucache_notifier);
1359 1423
1360 /* The reap timers are started later, with a module init call: 1424 /*
1361 * That part of the kernel is not yet operational. 1425 * The reap timers are started later, with a module init call: That part
1426 * of the kernel is not yet operational.
1362 */ 1427 */
1363} 1428}
1364 1429
@@ -1366,16 +1431,13 @@ static int __init cpucache_init(void)
1366{ 1431{
1367 int cpu; 1432 int cpu;
1368 1433
1369 /* 1434 /*
1370 * Register the timers that return unneeded 1435 * Register the timers that return unneeded pages to the page allocator
1371 * pages to gfp.
1372 */ 1436 */
1373 for_each_online_cpu(cpu) 1437 for_each_online_cpu(cpu)
1374 start_cpu_timer(cpu); 1438 start_cpu_timer(cpu);
1375
1376 return 0; 1439 return 0;
1377} 1440}
1378
1379__initcall(cpucache_init); 1441__initcall(cpucache_init);
1380 1442
1381/* 1443/*
@@ -1402,7 +1464,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1402 atomic_add(i, &slab_reclaim_pages); 1464 atomic_add(i, &slab_reclaim_pages);
1403 add_page_state(nr_slab, i); 1465 add_page_state(nr_slab, i);
1404 while (i--) { 1466 while (i--) {
1405 SetPageSlab(page); 1467 __SetPageSlab(page);
1406 page++; 1468 page++;
1407 } 1469 }
1408 return addr; 1470 return addr;
@@ -1418,8 +1480,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1418 const unsigned long nr_freed = i; 1480 const unsigned long nr_freed = i;
1419 1481
1420 while (i--) { 1482 while (i--) {
1421 if (!TestClearPageSlab(page)) 1483 BUG_ON(!PageSlab(page));
1422 BUG(); 1484 __ClearPageSlab(page);
1423 page++; 1485 page++;
1424 } 1486 }
1425 sub_page_state(nr_slab, nr_freed); 1487 sub_page_state(nr_slab, nr_freed);
@@ -1489,9 +1551,8 @@ static void dump_line(char *data, int offset, int limit)
1489{ 1551{
1490 int i; 1552 int i;
1491 printk(KERN_ERR "%03x:", offset); 1553 printk(KERN_ERR "%03x:", offset);
1492 for (i = 0; i < limit; i++) { 1554 for (i = 0; i < limit; i++)
1493 printk(" %02x", (unsigned char)data[offset + i]); 1555 printk(" %02x", (unsigned char)data[offset + i]);
1494 }
1495 printk("\n"); 1556 printk("\n");
1496} 1557}
1497#endif 1558#endif
@@ -1505,15 +1566,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1505 1566
1506 if (cachep->flags & SLAB_RED_ZONE) { 1567 if (cachep->flags & SLAB_RED_ZONE) {
1507 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1568 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1508 *dbg_redzone1(cachep, objp), 1569 *dbg_redzone1(cachep, objp),
1509 *dbg_redzone2(cachep, objp)); 1570 *dbg_redzone2(cachep, objp));
1510 } 1571 }
1511 1572
1512 if (cachep->flags & SLAB_STORE_USER) { 1573 if (cachep->flags & SLAB_STORE_USER) {
1513 printk(KERN_ERR "Last user: [<%p>]", 1574 printk(KERN_ERR "Last user: [<%p>]",
1514 *dbg_userword(cachep, objp)); 1575 *dbg_userword(cachep, objp));
1515 print_symbol("(%s)", 1576 print_symbol("(%s)",
1516 (unsigned long)*dbg_userword(cachep, objp)); 1577 (unsigned long)*dbg_userword(cachep, objp));
1517 printk("\n"); 1578 printk("\n");
1518 } 1579 }
1519 realobj = (char *)objp + obj_offset(cachep); 1580 realobj = (char *)objp + obj_offset(cachep);
@@ -1546,8 +1607,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1546 /* Print header */ 1607 /* Print header */
1547 if (lines == 0) { 1608 if (lines == 0) {
1548 printk(KERN_ERR 1609 printk(KERN_ERR
1549 "Slab corruption: start=%p, len=%d\n", 1610 "Slab corruption: start=%p, len=%d\n",
1550 realobj, size); 1611 realobj, size);
1551 print_objinfo(cachep, objp, 0); 1612 print_objinfo(cachep, objp, 0);
1552 } 1613 }
1553 /* Hexdump the affected line */ 1614 /* Hexdump the affected line */
@@ -1568,18 +1629,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1568 * exist: 1629 * exist:
1569 */ 1630 */
1570 struct slab *slabp = virt_to_slab(objp); 1631 struct slab *slabp = virt_to_slab(objp);
1571 int objnr; 1632 unsigned int objnr;
1572 1633
1573 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 1634 objnr = obj_to_index(cachep, slabp, objp);
1574 if (objnr) { 1635 if (objnr) {
1575 objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; 1636 objp = index_to_obj(cachep, slabp, objnr - 1);
1576 realobj = (char *)objp + obj_offset(cachep); 1637 realobj = (char *)objp + obj_offset(cachep);
1577 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1638 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1578 realobj, size); 1639 realobj, size);
1579 print_objinfo(cachep, objp, 2); 1640 print_objinfo(cachep, objp, 2);
1580 } 1641 }
1581 if (objnr + 1 < cachep->num) { 1642 if (objnr + 1 < cachep->num) {
1582 objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; 1643 objp = index_to_obj(cachep, slabp, objnr + 1);
1583 realobj = (char *)objp + obj_offset(cachep); 1644 realobj = (char *)objp + obj_offset(cachep);
1584 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1645 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1585 realobj, size); 1646 realobj, size);
@@ -1591,22 +1652,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1591 1652
1592#if DEBUG 1653#if DEBUG
1593/** 1654/**
1594 * slab_destroy_objs - call the registered destructor for each object in 1655 * slab_destroy_objs - destroy a slab and its objects
1595 * a slab that is to be destroyed. 1656 * @cachep: cache pointer being destroyed
1657 * @slabp: slab pointer being destroyed
1658 *
1659 * Call the registered destructor for each object in a slab that is being
1660 * destroyed.
1596 */ 1661 */
1597static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1662static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1598{ 1663{
1599 int i; 1664 int i;
1600 for (i = 0; i < cachep->num; i++) { 1665 for (i = 0; i < cachep->num; i++) {
1601 void *objp = slabp->s_mem + cachep->buffer_size * i; 1666 void *objp = index_to_obj(cachep, slabp, i);
1602 1667
1603 if (cachep->flags & SLAB_POISON) { 1668 if (cachep->flags & SLAB_POISON) {
1604#ifdef CONFIG_DEBUG_PAGEALLOC 1669#ifdef CONFIG_DEBUG_PAGEALLOC
1605 if ((cachep->buffer_size % PAGE_SIZE) == 0 1670 if (cachep->buffer_size % PAGE_SIZE == 0 &&
1606 && OFF_SLAB(cachep)) 1671 OFF_SLAB(cachep))
1607 kernel_map_pages(virt_to_page(objp), 1672 kernel_map_pages(virt_to_page(objp),
1608 cachep->buffer_size / PAGE_SIZE, 1673 cachep->buffer_size / PAGE_SIZE, 1);
1609 1);
1610 else 1674 else
1611 check_poison_obj(cachep, objp); 1675 check_poison_obj(cachep, objp);
1612#else 1676#else
@@ -1631,7 +1695,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1631 if (cachep->dtor) { 1695 if (cachep->dtor) {
1632 int i; 1696 int i;
1633 for (i = 0; i < cachep->num; i++) { 1697 for (i = 0; i < cachep->num; i++) {
1634 void *objp = slabp->s_mem + cachep->buffer_size * i; 1698 void *objp = index_to_obj(cachep, slabp, i);
1635 (cachep->dtor) (objp, cachep, 0); 1699 (cachep->dtor) (objp, cachep, 0);
1636 } 1700 }
1637 } 1701 }
@@ -1639,9 +1703,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1639#endif 1703#endif
1640 1704
1641/** 1705/**
1706 * slab_destroy - destroy and release all objects in a slab
1707 * @cachep: cache pointer being destroyed
1708 * @slabp: slab pointer being destroyed
1709 *
1642 * Destroy all the objs in a slab, and release the mem back to the system. 1710 * Destroy all the objs in a slab, and release the mem back to the system.
1643 * Before calling the slab must have been unlinked from the cache. 1711 * Before calling the slab must have been unlinked from the cache. The
1644 * The cache-lock is not held/needed. 1712 * cache-lock is not held/needed.
1645 */ 1713 */
1646static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1714static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1647{ 1715{
@@ -1662,8 +1730,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1662 } 1730 }
1663} 1731}
1664 1732
1665/* For setting up all the kmem_list3s for cache whose buffer_size is same 1733/*
1666 as size of kmem_list3. */ 1734 * For setting up all the kmem_list3s for cache whose buffer_size is same as
1735 * size of kmem_list3.
1736 */
1667static void set_up_list3s(struct kmem_cache *cachep, int index) 1737static void set_up_list3s(struct kmem_cache *cachep, int index)
1668{ 1738{
1669 int node; 1739 int node;
@@ -1689,13 +1759,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
1689 * high order pages for slabs. When the gfp() functions are more friendly 1759 * high order pages for slabs. When the gfp() functions are more friendly
1690 * towards high-order requests, this should be changed. 1760 * towards high-order requests, this should be changed.
1691 */ 1761 */
1692static inline size_t calculate_slab_order(struct kmem_cache *cachep, 1762static size_t calculate_slab_order(struct kmem_cache *cachep,
1693 size_t size, size_t align, unsigned long flags) 1763 size_t size, size_t align, unsigned long flags)
1694{ 1764{
1695 size_t left_over = 0; 1765 size_t left_over = 0;
1696 int gfporder; 1766 int gfporder;
1697 1767
1698 for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { 1768 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1699 unsigned int num; 1769 unsigned int num;
1700 size_t remainder; 1770 size_t remainder;
1701 1771
@@ -1730,12 +1800,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1730 /* 1800 /*
1731 * Acceptable internal fragmentation? 1801 * Acceptable internal fragmentation?
1732 */ 1802 */
1733 if ((left_over * 8) <= (PAGE_SIZE << gfporder)) 1803 if (left_over * 8 <= (PAGE_SIZE << gfporder))
1734 break; 1804 break;
1735 } 1805 }
1736 return left_over; 1806 return left_over;
1737} 1807}
1738 1808
1809static void setup_cpu_cache(struct kmem_cache *cachep)
1810{
1811 if (g_cpucache_up == FULL) {
1812 enable_cpucache(cachep);
1813 return;
1814 }
1815 if (g_cpucache_up == NONE) {
1816 /*
1817 * Note: the first kmem_cache_create must create the cache
1818 * that's used by kmalloc(24), otherwise the creation of
1819 * further caches will BUG().
1820 */
1821 cachep->array[smp_processor_id()] = &initarray_generic.cache;
1822
1823 /*
1824 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
1825 * the first cache, then we need to set up all its list3s,
1826 * otherwise the creation of further caches will BUG().
1827 */
1828 set_up_list3s(cachep, SIZE_AC);
1829 if (INDEX_AC == INDEX_L3)
1830 g_cpucache_up = PARTIAL_L3;
1831 else
1832 g_cpucache_up = PARTIAL_AC;
1833 } else {
1834 cachep->array[smp_processor_id()] =
1835 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1836
1837 if (g_cpucache_up == PARTIAL_AC) {
1838 set_up_list3s(cachep, SIZE_L3);
1839 g_cpucache_up = PARTIAL_L3;
1840 } else {
1841 int node;
1842 for_each_online_node(node) {
1843 cachep->nodelists[node] =
1844 kmalloc_node(sizeof(struct kmem_list3),
1845 GFP_KERNEL, node);
1846 BUG_ON(!cachep->nodelists[node]);
1847 kmem_list3_init(cachep->nodelists[node]);
1848 }
1849 }
1850 }
1851 cachep->nodelists[numa_node_id()]->next_reap =
1852 jiffies + REAPTIMEOUT_LIST3 +
1853 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1854
1855 cpu_cache_get(cachep)->avail = 0;
1856 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1857 cpu_cache_get(cachep)->batchcount = 1;
1858 cpu_cache_get(cachep)->touched = 0;
1859 cachep->batchcount = 1;
1860 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1861}
1862
1739/** 1863/**
1740 * kmem_cache_create - Create a cache. 1864 * kmem_cache_create - Create a cache.
1741 * @name: A string which is used in /proc/slabinfo to identify this cache. 1865 * @name: A string which is used in /proc/slabinfo to identify this cache.
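
calculate_slab_order() above walks page orders from 0 upward and stops at the first order whose internal fragmentation is acceptable (wasted bytes no more than 1/8 of the slab). A compact userspace model of that loop, with estimate() as a crude local stand-in for cache_estimate():

#include <stdio.h>

#define PAGE_SIZE     4096UL
#define MAX_GFP_ORDER 5

/* stand-in for cache_estimate(): whole slab usable for objects */
static void estimate(unsigned long order, unsigned long size,
                     unsigned long *num, unsigned long *left_over)
{
        unsigned long slab_size = PAGE_SIZE << order;

        *num = slab_size / size;
        *left_over = slab_size - *num * size;
}

static unsigned long pick_order(unsigned long size)
{
        for (unsigned long order = 0; order <= MAX_GFP_ORDER; order++) {
                unsigned long num, left_over;

                estimate(order, size, &num, &left_over);
                if (!num)
                        continue;               /* object does not fit yet */
                if (left_over * 8 <= (PAGE_SIZE << order))
                        return order;           /* <= 1/8 of the slab wasted */
        }
        return MAX_GFP_ORDER;
}

int main(void)
{
        printf("size 1000 -> order %lu\n", pick_order(1000));   /* 0 */
        printf("size 3000 -> order %lu\n", pick_order(3000));   /* 2 */
        return 0;
}
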
@@ -1751,9 +1875,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1751 * and the @dtor is run before the pages are handed back. 1875 * and the @dtor is run before the pages are handed back.
1752 * 1876 *
1753 * @name must be valid until the cache is destroyed. This implies that 1877 * @name must be valid until the cache is destroyed. This implies that
1754 * the module calling this has to destroy the cache before getting 1878 * the module calling this has to destroy the cache before getting unloaded.
1755 * unloaded. 1879 *
1756 *
1757 * The flags are 1880 * The flags are
1758 * 1881 *
1759 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 1882 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -1762,16 +1885,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1762 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 1885 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1763 * for buffer overruns. 1886 * for buffer overruns.
1764 * 1887 *
1765 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1766 * memory pressure.
1767 *
1768 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1888 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1769 * cacheline. This can be beneficial if you're counting cycles as closely 1889 * cacheline. This can be beneficial if you're counting cycles as closely
1770 * as davem. 1890 * as davem.
1771 */ 1891 */
1772struct kmem_cache * 1892struct kmem_cache *
1773kmem_cache_create (const char *name, size_t size, size_t align, 1893kmem_cache_create (const char *name, size_t size, size_t align,
1774 unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), 1894 unsigned long flags,
1895 void (*ctor)(void*, struct kmem_cache *, unsigned long),
1775 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 1896 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1776{ 1897{
1777 size_t left_over, slab_size, ralign; 1898 size_t left_over, slab_size, ralign;
@@ -1781,12 +1902,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1781 /* 1902 /*
1782 * Sanity checks... these are all serious usage bugs. 1903 * Sanity checks... these are all serious usage bugs.
1783 */ 1904 */
1784 if ((!name) || 1905 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
1785 in_interrupt() ||
1786 (size < BYTES_PER_WORD) ||
1787 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { 1906 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1788 printk(KERN_ERR "%s: Early error in slab %s\n", 1907 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
1789 __FUNCTION__, name); 1908 name);
1790 BUG(); 1909 BUG();
1791 } 1910 }
1792 1911
@@ -1840,8 +1959,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1840 * above the next power of two: caches with object sizes just above a 1959 * above the next power of two: caches with object sizes just above a
1841 * power of two have a significant amount of internal fragmentation. 1960 * power of two have a significant amount of internal fragmentation.
1842 */ 1961 */
1843 if ((size < 4096 1962 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
1844 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1845 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 1963 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1846 if (!(flags & SLAB_DESTROY_BY_RCU)) 1964 if (!(flags & SLAB_DESTROY_BY_RCU))
1847 flags |= SLAB_POISON; 1965 flags |= SLAB_POISON;
@@ -1853,13 +1971,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1853 BUG_ON(dtor); 1971 BUG_ON(dtor);
1854 1972
1855 /* 1973 /*
1856 * Always checks flags, a caller might be expecting debug 1974 * Always checks flags, a caller might be expecting debug support which
1857 * support which isn't available. 1975 * isn't available.
1858 */ 1976 */
1859 if (flags & ~CREATE_MASK) 1977 if (flags & ~CREATE_MASK)
1860 BUG(); 1978 BUG();
1861 1979
1862 /* Check that size is in terms of words. This is needed to avoid 1980 /*
1981 * Check that size is in terms of words. This is needed to avoid
1863 * unaligned accesses for some archs when redzoning is used, and makes 1982 * unaligned accesses for some archs when redzoning is used, and makes
1864 * sure any on-slab bufctl's are also correctly aligned. 1983 * sure any on-slab bufctl's are also correctly aligned.
1865 */ 1984 */
@@ -1868,12 +1987,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1868 size &= ~(BYTES_PER_WORD - 1); 1987 size &= ~(BYTES_PER_WORD - 1);
1869 } 1988 }
1870 1989
1871 /* calculate out the final buffer alignment: */ 1990 /* calculate the final buffer alignment: */
1991
1872 /* 1) arch recommendation: can be overridden for debug */ 1992 /* 1) arch recommendation: can be overridden for debug */
1873 if (flags & SLAB_HWCACHE_ALIGN) { 1993 if (flags & SLAB_HWCACHE_ALIGN) {
1874 /* Default alignment: as specified by the arch code. 1994 /*
1875 * Except if an object is really small, then squeeze multiple 1995 * Default alignment: as specified by the arch code. Except if
1876 * objects into one cacheline. 1996 * an object is really small, then squeeze multiple objects into
1997 * one cacheline.
1877 */ 1998 */
1878 ralign = cache_line_size(); 1999 ralign = cache_line_size();
1879 while (size <= ralign / 2) 2000 while (size <= ralign / 2)
@@ -1893,16 +2014,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1893 if (ralign > BYTES_PER_WORD) 2014 if (ralign > BYTES_PER_WORD)
1894 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2015 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1895 } 2016 }
1896 /* 4) Store it. Note that the debug code below can reduce 2017 /*
2018 * 4) Store it. Note that the debug code below can reduce
1897 * the alignment to BYTES_PER_WORD. 2019 * the alignment to BYTES_PER_WORD.
1898 */ 2020 */
1899 align = ralign; 2021 align = ralign;
1900 2022
1901 /* Get cache's description obj. */ 2023 /* Get cache's description obj. */
1902 cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 2024 cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
1903 if (!cachep) 2025 if (!cachep)
1904 goto oops; 2026 goto oops;
1905 memset(cachep, 0, sizeof(struct kmem_cache));
1906 2027
1907#if DEBUG 2028#if DEBUG
1908 cachep->obj_size = size; 2029 cachep->obj_size = size;
@@ -1978,7 +2099,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1978 cachep->gfpflags = 0; 2099 cachep->gfpflags = 0;
1979 if (flags & SLAB_CACHE_DMA) 2100 if (flags & SLAB_CACHE_DMA)
1980 cachep->gfpflags |= GFP_DMA; 2101 cachep->gfpflags |= GFP_DMA;
1981 spin_lock_init(&cachep->spinlock);
1982 cachep->buffer_size = size; 2102 cachep->buffer_size = size;
1983 2103
1984 if (flags & CFLGS_OFF_SLAB) 2104 if (flags & CFLGS_OFF_SLAB)
@@ -1988,64 +2108,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1988 cachep->name = name; 2108 cachep->name = name;
1989 2109
1990 2110
1991 if (g_cpucache_up == FULL) { 2111 setup_cpu_cache(cachep);
1992 enable_cpucache(cachep);
1993 } else {
1994 if (g_cpucache_up == NONE) {
1995 /* Note: the first kmem_cache_create must create
1996 * the cache that's used by kmalloc(24), otherwise
1997 * the creation of further caches will BUG().
1998 */
1999 cachep->array[smp_processor_id()] =
2000 &initarray_generic.cache;
2001
2002 /* If the cache that's used by
2003 * kmalloc(sizeof(kmem_list3)) is the first cache,
2004 * then we need to set up all its list3s, otherwise
2005 * the creation of further caches will BUG().
2006 */
2007 set_up_list3s(cachep, SIZE_AC);
2008 if (INDEX_AC == INDEX_L3)
2009 g_cpucache_up = PARTIAL_L3;
2010 else
2011 g_cpucache_up = PARTIAL_AC;
2012 } else {
2013 cachep->array[smp_processor_id()] =
2014 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2015
2016 if (g_cpucache_up == PARTIAL_AC) {
2017 set_up_list3s(cachep, SIZE_L3);
2018 g_cpucache_up = PARTIAL_L3;
2019 } else {
2020 int node;
2021 for_each_online_node(node) {
2022
2023 cachep->nodelists[node] =
2024 kmalloc_node(sizeof
2025 (struct kmem_list3),
2026 GFP_KERNEL, node);
2027 BUG_ON(!cachep->nodelists[node]);
2028 kmem_list3_init(cachep->
2029 nodelists[node]);
2030 }
2031 }
2032 }
2033 cachep->nodelists[numa_node_id()]->next_reap =
2034 jiffies + REAPTIMEOUT_LIST3 +
2035 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2036
2037 BUG_ON(!cpu_cache_get(cachep));
2038 cpu_cache_get(cachep)->avail = 0;
2039 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2040 cpu_cache_get(cachep)->batchcount = 1;
2041 cpu_cache_get(cachep)->touched = 0;
2042 cachep->batchcount = 1;
2043 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2044 }
2045 2112
2046 /* cache setup completed, link it into the list */ 2113 /* cache setup completed, link it into the list */
2047 list_add(&cachep->next, &cache_chain); 2114 list_add(&cachep->next, &cache_chain);
2048 oops: 2115oops:
2049 if (!cachep && (flags & SLAB_PANIC)) 2116 if (!cachep && (flags & SLAB_PANIC))
2050 panic("kmem_cache_create(): failed to create slab `%s'\n", 2117 panic("kmem_cache_create(): failed to create slab `%s'\n",
2051 name); 2118 name);
@@ -2089,30 +2156,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2089#define check_spinlock_acquired_node(x, y) do { } while(0) 2156#define check_spinlock_acquired_node(x, y) do { } while(0)
2090#endif 2157#endif
2091 2158
2092/* 2159static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2093 * Waits for all CPUs to execute func(). 2160 struct array_cache *ac,
2094 */ 2161 int force, int node);
2095static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
2096{
2097 check_irq_on();
2098 preempt_disable();
2099
2100 local_irq_disable();
2101 func(arg);
2102 local_irq_enable();
2103
2104 if (smp_call_function(func, arg, 1, 1))
2105 BUG();
2106
2107 preempt_enable();
2108}
2109
2110static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
2111 int force, int node);
2112 2162
2113static void do_drain(void *arg) 2163static void do_drain(void *arg)
2114{ 2164{
2115 struct kmem_cache *cachep = (struct kmem_cache *) arg; 2165 struct kmem_cache *cachep = arg;
2116 struct array_cache *ac; 2166 struct array_cache *ac;
2117 int node = numa_node_id(); 2167 int node = numa_node_id();
2118 2168
@@ -2129,14 +2179,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2129 struct kmem_list3 *l3; 2179 struct kmem_list3 *l3;
2130 int node; 2180 int node;
2131 2181
2132 smp_call_function_all_cpus(do_drain, cachep); 2182 on_each_cpu(do_drain, cachep, 1, 1);
2133 check_irq_on(); 2183 check_irq_on();
2134 for_each_online_node(node) { 2184 for_each_online_node(node) {
2135 l3 = cachep->nodelists[node]; 2185 l3 = cachep->nodelists[node];
2136 if (l3) { 2186 if (l3) {
2137 spin_lock_irq(&l3->list_lock); 2187 drain_array(cachep, l3, l3->shared, 1, node);
2138 drain_array_locked(cachep, l3->shared, 1, node);
2139 spin_unlock_irq(&l3->list_lock);
2140 if (l3->alien) 2188 if (l3->alien)
2141 drain_alien_cache(cachep, l3->alien); 2189 drain_alien_cache(cachep, l3->alien);
2142 } 2190 }
@@ -2260,16 +2308,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2260 2308
2261 /* NUMA: free the list3 structures */ 2309 /* NUMA: free the list3 structures */
2262 for_each_online_node(i) { 2310 for_each_online_node(i) {
2263 if ((l3 = cachep->nodelists[i])) { 2311 l3 = cachep->nodelists[i];
2312 if (l3) {
2264 kfree(l3->shared); 2313 kfree(l3->shared);
2265 free_alien_cache(l3->alien); 2314 free_alien_cache(l3->alien);
2266 kfree(l3); 2315 kfree(l3);
2267 } 2316 }
2268 } 2317 }
2269 kmem_cache_free(&cache_cache, cachep); 2318 kmem_cache_free(&cache_cache, cachep);
2270
2271 unlock_cpu_hotplug(); 2319 unlock_cpu_hotplug();
2272
2273 return 0; 2320 return 0;
2274} 2321}
2275EXPORT_SYMBOL(kmem_cache_destroy); 2322EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2292,7 +2339,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2292 slabp->inuse = 0; 2339 slabp->inuse = 0;
2293 slabp->colouroff = colour_off; 2340 slabp->colouroff = colour_off;
2294 slabp->s_mem = objp + colour_off; 2341 slabp->s_mem = objp + colour_off;
2295
2296 return slabp; 2342 return slabp;
2297} 2343}
2298 2344
@@ -2307,7 +2353,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2307 int i; 2353 int i;
2308 2354
2309 for (i = 0; i < cachep->num; i++) { 2355 for (i = 0; i < cachep->num; i++) {
2310 void *objp = slabp->s_mem + cachep->buffer_size * i; 2356 void *objp = index_to_obj(cachep, slabp, i);
2311#if DEBUG 2357#if DEBUG
2312 /* need to poison the objs? */ 2358 /* need to poison the objs? */
2313 if (cachep->flags & SLAB_POISON) 2359 if (cachep->flags & SLAB_POISON)
@@ -2320,9 +2366,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
2320 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2366 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2321 } 2367 }
2322 /* 2368 /*
2323 * Constructors are not allowed to allocate memory from 2369 * Constructors are not allowed to allocate memory from the same
2324 * the same cache which they are a constructor for. 2370 * cache which they are a constructor for. Otherwise, deadlock.
2325 * Otherwise, deadlock. They must also be threaded. 2371 * They must also be threaded.
2326 */ 2372 */
2327 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2373 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2328 cachep->ctor(objp + obj_offset(cachep), cachep, 2374 cachep->ctor(objp + obj_offset(cachep), cachep,
@@ -2336,8 +2382,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
2336 slab_error(cachep, "constructor overwrote the" 2382 slab_error(cachep, "constructor overwrote the"
2337 " start of an object"); 2383 " start of an object");
2338 } 2384 }
2339 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2385 if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2340 && cachep->flags & SLAB_POISON) 2386 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2341 kernel_map_pages(virt_to_page(objp), 2387 kernel_map_pages(virt_to_page(objp),
2342 cachep->buffer_size / PAGE_SIZE, 0); 2388 cachep->buffer_size / PAGE_SIZE, 0);
2343#else 2389#else
@@ -2352,18 +2398,16 @@ static void cache_init_objs(struct kmem_cache *cachep,
2352 2398
2353static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2399static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2354{ 2400{
2355 if (flags & SLAB_DMA) { 2401 if (flags & SLAB_DMA)
2356 if (!(cachep->gfpflags & GFP_DMA)) 2402 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2357 BUG(); 2403 else
2358 } else { 2404 BUG_ON(cachep->gfpflags & GFP_DMA);
2359 if (cachep->gfpflags & GFP_DMA)
2360 BUG();
2361 }
2362} 2405}
2363 2406
2364static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) 2407static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2408 int nodeid)
2365{ 2409{
2366 void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); 2410 void *objp = index_to_obj(cachep, slabp, slabp->free);
2367 kmem_bufctl_t next; 2411 kmem_bufctl_t next;
2368 2412
2369 slabp->inuse++; 2413 slabp->inuse++;
@@ -2377,18 +2421,18 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
2377 return objp; 2421 return objp;
2378} 2422}
2379 2423
2380static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, 2424static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2381 int nodeid) 2425 void *objp, int nodeid)
2382{ 2426{
2383 unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; 2427 unsigned int objnr = obj_to_index(cachep, slabp, objp);
2384 2428
2385#if DEBUG 2429#if DEBUG
2386 /* Verify that the slab belongs to the intended node */ 2430 /* Verify that the slab belongs to the intended node */
2387 WARN_ON(slabp->nodeid != nodeid); 2431 WARN_ON(slabp->nodeid != nodeid);
2388 2432
2389 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2433 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2390 printk(KERN_ERR "slab: double free detected in cache " 2434 printk(KERN_ERR "slab: double free detected in cache "
2391 "'%s', objp %p\n", cachep->name, objp); 2435 "'%s', objp %p\n", cachep->name, objp);
2392 BUG(); 2436 BUG();
2393 } 2437 }
2394#endif 2438#endif
@@ -2397,14 +2441,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
2397 slabp->inuse--; 2441 slabp->inuse--;
2398} 2442}
2399 2443
2400static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) 2444static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
2445 void *objp)
2401{ 2446{
2402 int i; 2447 int i;
2403 struct page *page; 2448 struct page *page;
2404 2449
2405 /* Nasty!!!!!! I hope this is OK. */ 2450 /* Nasty!!!!!! I hope this is OK. */
2406 i = 1 << cachep->gfporder;
2407 page = virt_to_page(objp); 2451 page = virt_to_page(objp);
2452
2453 i = 1;
2454 if (likely(!PageCompound(page)))
2455 i <<= cachep->gfporder;
2408 do { 2456 do {
2409 page_set_cache(page, cachep); 2457 page_set_cache(page, cachep);
2410 page_set_slab(page, slabp); 2458 page_set_slab(page, slabp);
@@ -2425,8 +2473,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2425 unsigned long ctor_flags; 2473 unsigned long ctor_flags;
2426 struct kmem_list3 *l3; 2474 struct kmem_list3 *l3;
2427 2475
2428 /* Be lazy and only check for valid flags here, 2476 /*
2429 * keeping it out of the critical path in kmem_cache_alloc(). 2477 * Be lazy and only check for valid flags here, keeping it out of the
2478 * critical path in kmem_cache_alloc().
2430 */ 2479 */
2431 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) 2480 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2432 BUG(); 2481 BUG();
@@ -2467,14 +2516,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2467 */ 2516 */
2468 kmem_flagcheck(cachep, flags); 2517 kmem_flagcheck(cachep, flags);
2469 2518
2470 /* Get mem for the objs. 2519 /*
2471 * Attempt to allocate a physical page from 'nodeid', 2520 * Get mem for the objs. Attempt to allocate a physical page from
2521 * 'nodeid'.
2472 */ 2522 */
2473 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2523 objp = kmem_getpages(cachep, flags, nodeid);
2524 if (!objp)
2474 goto failed; 2525 goto failed;
2475 2526
2476 /* Get slab management. */ 2527 /* Get slab management. */
2477 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2528 slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
2529 if (!slabp)
2478 goto opps1; 2530 goto opps1;
2479 2531
2480 slabp->nodeid = nodeid; 2532 slabp->nodeid = nodeid;
@@ -2493,9 +2545,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2493 l3->free_objects += cachep->num; 2545 l3->free_objects += cachep->num;
2494 spin_unlock(&l3->list_lock); 2546 spin_unlock(&l3->list_lock);
2495 return 1; 2547 return 1;
2496 opps1: 2548opps1:
2497 kmem_freepages(cachep, objp); 2549 kmem_freepages(cachep, objp);
2498 failed: 2550failed:
2499 if (local_flags & __GFP_WAIT) 2551 if (local_flags & __GFP_WAIT)
2500 local_irq_disable(); 2552 local_irq_disable();
2501 return 0; 2553 return 0;
@@ -2538,8 +2590,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2538 page = virt_to_page(objp); 2590 page = virt_to_page(objp);
2539 2591
2540 if (page_get_cache(page) != cachep) { 2592 if (page_get_cache(page) != cachep) {
2541 printk(KERN_ERR 2593 printk(KERN_ERR "mismatch in kmem_cache_free: expected "
2542 "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2594 "cache %p, got %p\n",
2543 page_get_cache(page), cachep); 2595 page_get_cache(page), cachep);
2544 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2596 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2545 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), 2597 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
@@ -2549,13 +2601,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2549 slabp = page_get_slab(page); 2601 slabp = page_get_slab(page);
2550 2602
2551 if (cachep->flags & SLAB_RED_ZONE) { 2603 if (cachep->flags & SLAB_RED_ZONE) {
2552 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE 2604 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
2553 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2605 *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2554 slab_error(cachep, 2606 slab_error(cachep, "double free, or memory outside"
2555 "double free, or memory outside" 2607 " object was overwritten");
2556 " object was overwritten"); 2608 printk(KERN_ERR "%p: redzone 1:0x%lx, "
2557 printk(KERN_ERR 2609 "redzone 2:0x%lx.\n",
2558 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2559 objp, *dbg_redzone1(cachep, objp), 2610 objp, *dbg_redzone1(cachep, objp),
2560 *dbg_redzone2(cachep, objp)); 2611 *dbg_redzone2(cachep, objp));
2561 } 2612 }
@@ -2565,15 +2616,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2565 if (cachep->flags & SLAB_STORE_USER) 2616 if (cachep->flags & SLAB_STORE_USER)
2566 *dbg_userword(cachep, objp) = caller; 2617 *dbg_userword(cachep, objp) = caller;
2567 2618
2568 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 2619 objnr = obj_to_index(cachep, slabp, objp);
2569 2620
2570 BUG_ON(objnr >= cachep->num); 2621 BUG_ON(objnr >= cachep->num);
2571 BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); 2622 BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2572 2623
2573 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2624 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2574 /* Need to call the slab's constructor so the 2625 /*
2575 * caller can perform a verify of its state (debugging). 2626 * Need to call the slab's constructor so the caller can
2576 * Called without the cache-lock held. 2627 * perform a verify of its state (debugging). Called without
2628 * the cache-lock held.
2577 */ 2629 */
2578 cachep->ctor(objp + obj_offset(cachep), 2630 cachep->ctor(objp + obj_offset(cachep),
2579 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2631 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
@@ -2584,9 +2636,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2584 */ 2636 */
2585 cachep->dtor(objp + obj_offset(cachep), cachep, 0); 2637 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2586 } 2638 }
2639#ifdef CONFIG_DEBUG_SLAB_LEAK
2640 slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2641#endif
2587 if (cachep->flags & SLAB_POISON) { 2642 if (cachep->flags & SLAB_POISON) {
2588#ifdef CONFIG_DEBUG_PAGEALLOC 2643#ifdef CONFIG_DEBUG_PAGEALLOC
2589 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2644 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2590 store_stackinfo(cachep, objp, (unsigned long)caller); 2645 store_stackinfo(cachep, objp, (unsigned long)caller);
2591 kernel_map_pages(virt_to_page(objp), 2646 kernel_map_pages(virt_to_page(objp),
2592 cachep->buffer_size / PAGE_SIZE, 0); 2647 cachep->buffer_size / PAGE_SIZE, 0);
@@ -2612,14 +2667,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2612 goto bad; 2667 goto bad;
2613 } 2668 }
2614 if (entries != cachep->num - slabp->inuse) { 2669 if (entries != cachep->num - slabp->inuse) {
2615 bad: 2670bad:
2616 printk(KERN_ERR 2671 printk(KERN_ERR "slab: Internal list corruption detected in "
2617 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2672 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2618 cachep->name, cachep->num, slabp, slabp->inuse); 2673 cachep->name, cachep->num, slabp, slabp->inuse);
2619 for (i = 0; 2674 for (i = 0;
2620 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2675 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2621 i++) { 2676 i++) {
2622 if ((i % 16) == 0) 2677 if (i % 16 == 0)
2623 printk("\n%03x:", i); 2678 printk("\n%03x:", i);
2624 printk(" %02x", ((unsigned char *)slabp)[i]); 2679 printk(" %02x", ((unsigned char *)slabp)[i]);
2625 } 2680 }
@@ -2641,12 +2696,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2641 2696
2642 check_irq_off(); 2697 check_irq_off();
2643 ac = cpu_cache_get(cachep); 2698 ac = cpu_cache_get(cachep);
2644 retry: 2699retry:
2645 batchcount = ac->batchcount; 2700 batchcount = ac->batchcount;
2646 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2701 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2647 /* if there was little recent activity on this 2702 /*
2648 * cache, then perform only a partial refill. 2703 * If there was little recent activity on this cache, then
2649 * Otherwise we could generate refill bouncing. 2704 * perform only a partial refill. Otherwise we could generate
2705 * refill bouncing.
2650 */ 2706 */
2651 batchcount = BATCHREFILL_LIMIT; 2707 batchcount = BATCHREFILL_LIMIT;
2652 } 2708 }
@@ -2655,20 +2711,10 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2655 BUG_ON(ac->avail > 0 || !l3); 2711 BUG_ON(ac->avail > 0 || !l3);
2656 spin_lock(&l3->list_lock); 2712 spin_lock(&l3->list_lock);
2657 2713
2658 if (l3->shared) { 2714 /* See if we can refill from the shared array */
2659 struct array_cache *shared_array = l3->shared; 2715 if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2660 if (shared_array->avail) { 2716 goto alloc_done;
2661 if (batchcount > shared_array->avail) 2717
2662 batchcount = shared_array->avail;
2663 shared_array->avail -= batchcount;
2664 ac->avail = batchcount;
2665 memcpy(ac->entry,
2666 &(shared_array->entry[shared_array->avail]),
2667 sizeof(void *) * batchcount);
2668 shared_array->touched = 1;
2669 goto alloc_done;
2670 }
2671 }
2672 while (batchcount > 0) { 2718 while (batchcount > 0) {
2673 struct list_head *entry; 2719 struct list_head *entry;
2674 struct slab *slabp; 2720 struct slab *slabp;
@@ -2702,29 +2748,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2702 list_add(&slabp->list, &l3->slabs_partial); 2748 list_add(&slabp->list, &l3->slabs_partial);
2703 } 2749 }
2704 2750
2705 must_grow: 2751must_grow:
2706 l3->free_objects -= ac->avail; 2752 l3->free_objects -= ac->avail;
2707 alloc_done: 2753alloc_done:
2708 spin_unlock(&l3->list_lock); 2754 spin_unlock(&l3->list_lock);
2709 2755
2710 if (unlikely(!ac->avail)) { 2756 if (unlikely(!ac->avail)) {
2711 int x; 2757 int x;
2712 x = cache_grow(cachep, flags, numa_node_id()); 2758 x = cache_grow(cachep, flags, numa_node_id());
2713 2759
2714 // cache_grow can reenable interrupts, then ac could change. 2760 /* cache_grow can reenable interrupts, then ac could change. */
2715 ac = cpu_cache_get(cachep); 2761 ac = cpu_cache_get(cachep);
2716 if (!x && ac->avail == 0) // no objects in sight? abort 2762 if (!x && ac->avail == 0) /* no objects in sight? abort */
2717 return NULL; 2763 return NULL;
2718 2764
2719 if (!ac->avail) // objects refilled by interrupt? 2765 if (!ac->avail) /* objects refilled by interrupt? */
2720 goto retry; 2766 goto retry;
2721 } 2767 }
2722 ac->touched = 1; 2768 ac->touched = 1;
2723 return ac->entry[--ac->avail]; 2769 return ac->entry[--ac->avail];
2724} 2770}
2725 2771
2726static inline void 2772static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2727cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) 2773 gfp_t flags)
2728{ 2774{
2729 might_sleep_if(flags & __GFP_WAIT); 2775 might_sleep_if(flags & __GFP_WAIT);
2730#if DEBUG 2776#if DEBUG
@@ -2733,8 +2779,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
2733} 2779}
2734 2780
2735#if DEBUG 2781#if DEBUG
2736static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, 2782static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2737 void *objp, void *caller) 2783 gfp_t flags, void *objp, void *caller)
2738{ 2784{
2739 if (!objp) 2785 if (!objp)
2740 return objp; 2786 return objp;
@@ -2754,19 +2800,28 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags
2754 *dbg_userword(cachep, objp) = caller; 2800 *dbg_userword(cachep, objp) = caller;
2755 2801
2756 if (cachep->flags & SLAB_RED_ZONE) { 2802 if (cachep->flags & SLAB_RED_ZONE) {
2757 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE 2803 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2758 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2804 *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2759 slab_error(cachep, 2805 slab_error(cachep, "double free, or memory outside"
2760 "double free, or memory outside" 2806 " object was overwritten");
2761 " object was overwritten");
2762 printk(KERN_ERR 2807 printk(KERN_ERR
2763 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2808 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
2764 objp, *dbg_redzone1(cachep, objp), 2809 objp, *dbg_redzone1(cachep, objp),
2765 *dbg_redzone2(cachep, objp)); 2810 *dbg_redzone2(cachep, objp));
2766 } 2811 }
2767 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2812 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2768 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2813 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2769 } 2814 }
2815#ifdef CONFIG_DEBUG_SLAB_LEAK
2816 {
2817 struct slab *slabp;
2818 unsigned objnr;
2819
2820 slabp = page_get_slab(virt_to_page(objp));
2821 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
2822 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
2823 }
2824#endif
2770 objp += obj_offset(cachep); 2825 objp += obj_offset(cachep);
2771 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2826 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2772 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2827 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
@@ -2788,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2788 struct array_cache *ac; 2843 struct array_cache *ac;
2789 2844
2790#ifdef CONFIG_NUMA 2845#ifdef CONFIG_NUMA
2791 if (unlikely(current->mempolicy && !in_interrupt())) { 2846 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
2792 int nid = slab_node(current->mempolicy); 2847 objp = alternate_node_alloc(cachep, flags);
2793 2848 if (objp != NULL)
2794 if (nid != numa_node_id()) 2849 return objp;
2795 return __cache_alloc_node(cachep, flags, nid);
2796 } 2850 }
2797#endif 2851#endif
2798 2852
@@ -2809,8 +2863,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2809 return objp; 2863 return objp;
2810} 2864}
2811 2865
2812static __always_inline void * 2866static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2813__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 2867 gfp_t flags, void *caller)
2814{ 2868{
2815 unsigned long save_flags; 2869 unsigned long save_flags;
2816 void *objp; 2870 void *objp;
@@ -2828,9 +2882,32 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
2828 2882
2829#ifdef CONFIG_NUMA 2883#ifdef CONFIG_NUMA
2830/* 2884/*
2885 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
2886 *
2887 * If we are in_interrupt, then process context, including cpusets and
2888 * mempolicy, may not apply and should not be used for allocation policy.
2889 */
2890static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
2891{
2892 int nid_alloc, nid_here;
2893
2894 if (in_interrupt())
2895 return NULL;
2896 nid_alloc = nid_here = numa_node_id();
2897 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
2898 nid_alloc = cpuset_mem_spread_node();
2899 else if (current->mempolicy)
2900 nid_alloc = slab_node(current->mempolicy);
2901 if (nid_alloc != nid_here)
2902 return __cache_alloc_node(cachep, flags, nid_alloc);
2903 return NULL;
2904}
2905
2906/*
2831 * An interface to enable slab creation on nodeid 2907
2832 */ 2908 */
2833static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 2909static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
2910 int nodeid)
2834{ 2911{
2835 struct list_head *entry; 2912 struct list_head *entry;
2836 struct slab *slabp; 2913 struct slab *slabp;
@@ -2841,7 +2918,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2841 l3 = cachep->nodelists[nodeid]; 2918 l3 = cachep->nodelists[nodeid];
2842 BUG_ON(!l3); 2919 BUG_ON(!l3);
2843 2920
2844 retry: 2921retry:
2845 check_irq_off(); 2922 check_irq_off();
2846 spin_lock(&l3->list_lock); 2923 spin_lock(&l3->list_lock);
2847 entry = l3->slabs_partial.next; 2924 entry = l3->slabs_partial.next;
@@ -2868,16 +2945,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2868 /* move slabp to correct slabp list: */ 2945 /* move slabp to correct slabp list: */
2869 list_del(&slabp->list); 2946 list_del(&slabp->list);
2870 2947
2871 if (slabp->free == BUFCTL_END) { 2948 if (slabp->free == BUFCTL_END)
2872 list_add(&slabp->list, &l3->slabs_full); 2949 list_add(&slabp->list, &l3->slabs_full);
2873 } else { 2950 else
2874 list_add(&slabp->list, &l3->slabs_partial); 2951 list_add(&slabp->list, &l3->slabs_partial);
2875 }
2876 2952
2877 spin_unlock(&l3->list_lock); 2953 spin_unlock(&l3->list_lock);
2878 goto done; 2954 goto done;
2879 2955
2880 must_grow: 2956must_grow:
2881 spin_unlock(&l3->list_lock); 2957 spin_unlock(&l3->list_lock);
2882 x = cache_grow(cachep, flags, nodeid); 2958 x = cache_grow(cachep, flags, nodeid);
2883 2959
@@ -2885,7 +2961,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2885 return NULL; 2961 return NULL;
2886 2962
2887 goto retry; 2963 goto retry;
2888 done: 2964done:
2889 return obj; 2965 return obj;
2890} 2966}
2891#endif 2967#endif
@@ -2958,7 +3034,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2958 } 3034 }
2959 3035
2960 free_block(cachep, ac->entry, batchcount, node); 3036 free_block(cachep, ac->entry, batchcount, node);
2961 free_done: 3037free_done:
2962#if STATS 3038#if STATS
2963 { 3039 {
2964 int i = 0; 3040 int i = 0;
@@ -2979,16 +3055,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2979#endif 3055#endif
2980 spin_unlock(&l3->list_lock); 3056 spin_unlock(&l3->list_lock);
2981 ac->avail -= batchcount; 3057 ac->avail -= batchcount;
2982 memmove(ac->entry, &(ac->entry[batchcount]), 3058 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
2983 sizeof(void *) * ac->avail);
2984} 3059}
2985 3060
2986/* 3061/*
2987 * __cache_free 3062 * Release an obj back to its cache. If the obj has a constructed state, it must
2988 * Release an obj back to its cache. If the obj has a constructed 3063 * be in this state _before_ it is released. Called with disabled ints.
2989 * state, it must be in this state _before_ it is released.
2990 *
2991 * Called with disabled ints.
2992 */ 3064 */
2993static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3065static inline void __cache_free(struct kmem_cache *cachep, void *objp)
2994{ 3066{
@@ -3007,9 +3079,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3007 if (unlikely(slabp->nodeid != numa_node_id())) { 3079 if (unlikely(slabp->nodeid != numa_node_id())) {
3008 struct array_cache *alien = NULL; 3080 struct array_cache *alien = NULL;
3009 int nodeid = slabp->nodeid; 3081 int nodeid = slabp->nodeid;
3010 struct kmem_list3 *l3 = 3082 struct kmem_list3 *l3;
3011 cachep->nodelists[numa_node_id()];
3012 3083
3084 l3 = cachep->nodelists[numa_node_id()];
3013 STATS_INC_NODEFREES(cachep); 3085 STATS_INC_NODEFREES(cachep);
3014 if (l3->alien && l3->alien[nodeid]) { 3086 if (l3->alien && l3->alien[nodeid]) {
3015 alien = l3->alien[nodeid]; 3087 alien = l3->alien[nodeid];
@@ -3056,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3056EXPORT_SYMBOL(kmem_cache_alloc); 3128EXPORT_SYMBOL(kmem_cache_alloc);
3057 3129
3058/** 3130/**
3131 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3132 * @cache: The cache to allocate from.
3133 * @flags: See kmalloc().
3134 *
3135 * Allocate an object from this cache and set the allocated memory to zero.
3136 * The flags are only relevant if the cache has no available objects.
3137 */
3138void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3139{
3140 void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3141 if (ret)
3142 memset(ret, 0, obj_size(cache));
3143 return ret;
3144}
3145EXPORT_SYMBOL(kmem_cache_zalloc);
3146
3147/**
3059 * kmem_ptr_validate - check if an untrusted pointer might 3148 * kmem_ptr_validate - check if an untrusted pointer might
3060 * be a slab entry. 3149 * be a slab entry.
3061 * @cachep: the cache we're checking against 3150 * @cachep: the cache we're checking against
@@ -3093,7 +3182,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
3093 if (unlikely(page_get_cache(page) != cachep)) 3182 if (unlikely(page_get_cache(page) != cachep))
3094 goto out; 3183 goto out;
3095 return 1; 3184 return 1;
3096 out: 3185out:
3097 return 0; 3186 return 0;
3098} 3187}
3099 3188
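
kmem_cache_zalloc() is a thin wrapper: allocate from the cache, then zero the object on success, which lets callers such as the cache_cache allocation earlier in this patch drop their explicit memset(). A minimal userspace analogue of the same zero-on-allocate wrapper (toy types, not the kernel API):

#include <stdlib.h>
#include <string.h>

/* toy object cache: only remembers its fixed object size */
struct obj_cache {
        size_t obj_size;
};

static void *cache_alloc(struct obj_cache *c)
{
        return malloc(c->obj_size);
}

/* same shape as kmem_cache_zalloc(): allocate, then zero on success */
static void *cache_zalloc(struct obj_cache *c)
{
        void *ret = cache_alloc(c);

        if (ret)
                memset(ret, 0, c->obj_size);
        return ret;
}

The slob.c hunk later in this diff adds the identical wrapper on top of its own kmem_cache_alloc().
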
@@ -3119,7 +3208,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3119 local_irq_save(save_flags); 3208 local_irq_save(save_flags);
3120 3209
3121 if (nodeid == -1 || nodeid == numa_node_id() || 3210 if (nodeid == -1 || nodeid == numa_node_id() ||
3122 !cachep->nodelists[nodeid]) 3211 !cachep->nodelists[nodeid])
3123 ptr = ____cache_alloc(cachep, flags); 3212 ptr = ____cache_alloc(cachep, flags);
3124 else 3213 else
3125 ptr = __cache_alloc_node(cachep, flags, nodeid); 3214 ptr = __cache_alloc_node(cachep, flags, nodeid);
@@ -3148,6 +3237,7 @@ EXPORT_SYMBOL(kmalloc_node);
3148 * kmalloc - allocate memory 3237 * kmalloc - allocate memory
3149 * @size: how many bytes of memory are required. 3238 * @size: how many bytes of memory are required.
3150 * @flags: the type of memory to allocate. 3239 * @flags: the type of memory to allocate.
3240 * @caller: function caller for debug tracking of the caller
3151 * 3241 *
3152 * kmalloc is the normal method of allocating memory 3242 * kmalloc is the normal method of allocating memory
3153 * in the kernel. 3243 * in the kernel.
@@ -3181,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3181 return __cache_alloc(cachep, flags, caller); 3271 return __cache_alloc(cachep, flags, caller);
3182} 3272}
3183 3273
3184#ifndef CONFIG_DEBUG_SLAB
3185 3274
3186void *__kmalloc(size_t size, gfp_t flags) 3275void *__kmalloc(size_t size, gfp_t flags)
3187{ 3276{
3277#ifndef CONFIG_DEBUG_SLAB
3188 return __do_kmalloc(size, flags, NULL); 3278 return __do_kmalloc(size, flags, NULL);
3279#else
3280 return __do_kmalloc(size, flags, __builtin_return_address(0));
3281#endif
3189} 3282}
3190EXPORT_SYMBOL(__kmalloc); 3283EXPORT_SYMBOL(__kmalloc);
3191 3284
3192#else 3285#ifdef CONFIG_DEBUG_SLAB
3193
3194void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) 3286void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3195{ 3287{
3196 return __do_kmalloc(size, flags, caller); 3288 return __do_kmalloc(size, flags, caller);
3197} 3289}
3198EXPORT_SYMBOL(__kmalloc_track_caller); 3290EXPORT_SYMBOL(__kmalloc_track_caller);
3199
3200#endif 3291#endif
3201 3292
3202#ifdef CONFIG_SMP 3293#ifdef CONFIG_SMP
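
Moving the CONFIG_DEBUG_SLAB conditional inside __kmalloc() lets the debug build hand __builtin_return_address(0) down to __do_kmalloc(), so every allocation can be attributed to its real caller. A small, compilable userspace sketch of that caller-recording trick (GCC/Clang builtin; the wrapper names are made up):

#include <stdio.h>
#include <stdlib.h>

/* inner allocator that records who asked for the memory */
static void *do_alloc(size_t size, void *caller)
{
        printf("%zu bytes requested from %p\n", size, caller);
        return malloc(size);
}

/* debug wrapper: capture our caller's return address and pass it down */
__attribute__((noinline))
static void *alloc_track_caller(size_t size)
{
        return do_alloc(size, __builtin_return_address(0));
}

int main(void)
{
        free(alloc_track_caller(32));
        return 0;
}
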
@@ -3220,7 +3311,7 @@ void *__alloc_percpu(size_t size)
3220 * and we have no way of figuring out how to fix the array 3311 * and we have no way of figuring out how to fix the array
3221 * that we have allocated then.... 3312 * that we have allocated then....
3222 */ 3313 */
3223 for_each_cpu(i) { 3314 for_each_possible_cpu(i) {
3224 int node = cpu_to_node(i); 3315 int node = cpu_to_node(i);
3225 3316
3226 if (node_online(node)) 3317 if (node_online(node))
@@ -3236,7 +3327,7 @@ void *__alloc_percpu(size_t size)
3236 /* Catch derefs w/o wrappers */ 3327 /* Catch derefs w/o wrappers */
3237 return (void *)(~(unsigned long)pdata); 3328 return (void *)(~(unsigned long)pdata);
3238 3329
3239 unwind_oom: 3330unwind_oom:
3240 while (--i >= 0) { 3331 while (--i >= 0) {
3241 if (!cpu_possible(i)) 3332 if (!cpu_possible(i))
3242 continue; 3333 continue;
@@ -3307,7 +3398,7 @@ void free_percpu(const void *objp)
3307 /* 3398 /*
3308 * We allocate for all cpus so we cannot use for online cpu here. 3399 * We allocate for all cpus so we cannot use for online cpu here.
3309 */ 3400 */
3310 for_each_cpu(i) 3401 for_each_possible_cpu(i)
3311 kfree(p->ptrs[i]); 3402 kfree(p->ptrs[i]);
3312 kfree(p); 3403 kfree(p);
3313} 3404}
@@ -3327,61 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep)
3327EXPORT_SYMBOL_GPL(kmem_cache_name); 3418EXPORT_SYMBOL_GPL(kmem_cache_name);
3328 3419
3329/* 3420/*
3330 * This initializes kmem_list3 for all nodes. 3421 * This initializes kmem_list3 or resizes various caches for all nodes.
3331 */ 3422 */
3332static int alloc_kmemlist(struct kmem_cache *cachep) 3423static int alloc_kmemlist(struct kmem_cache *cachep)
3333{ 3424{
3334 int node; 3425 int node;
3335 struct kmem_list3 *l3; 3426 struct kmem_list3 *l3;
3336 int err = 0; 3427 struct array_cache *new_shared;
3428 struct array_cache **new_alien;
3337 3429
3338 for_each_online_node(node) { 3430 for_each_online_node(node) {
3339 struct array_cache *nc = NULL, *new; 3431
3340 struct array_cache **new_alien = NULL; 3432 new_alien = alloc_alien_cache(node, cachep->limit);
3341#ifdef CONFIG_NUMA 3433 if (!new_alien)
3342 if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3343 goto fail; 3434 goto fail;
3344#endif 3435
3345 if (!(new = alloc_arraycache(node, (cachep->shared * 3436 new_shared = alloc_arraycache(node,
3346 cachep->batchcount), 3437 cachep->shared*cachep->batchcount,
3347 0xbaadf00d))) 3438 0xbaadf00d);
3439 if (!new_shared) {
3440 free_alien_cache(new_alien);
3348 goto fail; 3441 goto fail;
3349 if ((l3 = cachep->nodelists[node])) { 3442 }
3443
3444 l3 = cachep->nodelists[node];
3445 if (l3) {
3446 struct array_cache *shared = l3->shared;
3350 3447
3351 spin_lock_irq(&l3->list_lock); 3448 spin_lock_irq(&l3->list_lock);
3352 3449
3353 if ((nc = cachep->nodelists[node]->shared)) 3450 if (shared)
3354 free_block(cachep, nc->entry, nc->avail, node); 3451 free_block(cachep, shared->entry,
3452 shared->avail, node);
3355 3453
3356 l3->shared = new; 3454 l3->shared = new_shared;
3357 if (!cachep->nodelists[node]->alien) { 3455 if (!l3->alien) {
3358 l3->alien = new_alien; 3456 l3->alien = new_alien;
3359 new_alien = NULL; 3457 new_alien = NULL;
3360 } 3458 }
3361 l3->free_limit = (1 + nr_cpus_node(node)) * 3459 l3->free_limit = (1 + nr_cpus_node(node)) *
3362 cachep->batchcount + cachep->num; 3460 cachep->batchcount + cachep->num;
3363 spin_unlock_irq(&l3->list_lock); 3461 spin_unlock_irq(&l3->list_lock);
3364 kfree(nc); 3462 kfree(shared);
3365 free_alien_cache(new_alien); 3463 free_alien_cache(new_alien);
3366 continue; 3464 continue;
3367 } 3465 }
3368 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3466 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3369 GFP_KERNEL, node))) 3467 if (!l3) {
3468 free_alien_cache(new_alien);
3469 kfree(new_shared);
3370 goto fail; 3470 goto fail;
3471 }
3371 3472
3372 kmem_list3_init(l3); 3473 kmem_list3_init(l3);
3373 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3474 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3374 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3475 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3375 l3->shared = new; 3476 l3->shared = new_shared;
3376 l3->alien = new_alien; 3477 l3->alien = new_alien;
3377 l3->free_limit = (1 + nr_cpus_node(node)) * 3478 l3->free_limit = (1 + nr_cpus_node(node)) *
3378 cachep->batchcount + cachep->num; 3479 cachep->batchcount + cachep->num;
3379 cachep->nodelists[node] = l3; 3480 cachep->nodelists[node] = l3;
3380 } 3481 }
3381 return err; 3482 return 0;
3382 fail: 3483
3383 err = -ENOMEM; 3484fail:
3384 return err; 3485 if (!cachep->next.next) {
3486 /* Cache is not active yet. Roll back what we did */
3487 node--;
3488 while (node >= 0) {
3489 if (cachep->nodelists[node]) {
3490 l3 = cachep->nodelists[node];
3491
3492 kfree(l3->shared);
3493 free_alien_cache(l3->alien);
3494 kfree(l3);
3495 cachep->nodelists[node] = NULL;
3496 }
3497 node--;
3498 }
3499 }
3500 return -ENOMEM;
3385} 3501}
3386 3502
3387struct ccupdate_struct { 3503struct ccupdate_struct {
@@ -3391,7 +3507,7 @@ struct ccupdate_struct {
3391 3507
3392static void do_ccupdate_local(void *info) 3508static void do_ccupdate_local(void *info)
3393{ 3509{
3394 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3510 struct ccupdate_struct *new = info;
3395 struct array_cache *old; 3511 struct array_cache *old;
3396 3512
3397 check_irq_off(); 3513 check_irq_off();
@@ -3401,16 +3517,17 @@ static void do_ccupdate_local(void *info)
3401 new->new[smp_processor_id()] = old; 3517 new->new[smp_processor_id()] = old;
3402} 3518}
3403 3519
3404static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, 3520/* Always called with the cache_chain_mutex held */
3405 int shared) 3521static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3522 int batchcount, int shared)
3406{ 3523{
3407 struct ccupdate_struct new; 3524 struct ccupdate_struct new;
3408 int i, err; 3525 int i, err;
3409 3526
3410 memset(&new.new, 0, sizeof(new.new)); 3527 memset(&new.new, 0, sizeof(new.new));
3411 for_each_online_cpu(i) { 3528 for_each_online_cpu(i) {
3412 new.new[i] = 3529 new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
3413 alloc_arraycache(cpu_to_node(i), limit, batchcount); 3530 batchcount);
3414 if (!new.new[i]) { 3531 if (!new.new[i]) {
3415 for (i--; i >= 0; i--) 3532 for (i--; i >= 0; i--)
3416 kfree(new.new[i]); 3533 kfree(new.new[i]);
@@ -3419,14 +3536,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
3419 } 3536 }
3420 new.cachep = cachep; 3537 new.cachep = cachep;
3421 3538
3422 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3539 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
3423 3540
3424 check_irq_on(); 3541 check_irq_on();
3425 spin_lock(&cachep->spinlock);
3426 cachep->batchcount = batchcount; 3542 cachep->batchcount = batchcount;
3427 cachep->limit = limit; 3543 cachep->limit = limit;
3428 cachep->shared = shared; 3544 cachep->shared = shared;
3429 spin_unlock(&cachep->spinlock);
3430 3545
3431 for_each_online_cpu(i) { 3546 for_each_online_cpu(i) {
3432 struct array_cache *ccold = new.new[i]; 3547 struct array_cache *ccold = new.new[i];
@@ -3447,15 +3562,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
3447 return 0; 3562 return 0;
3448} 3563}
3449 3564
3565/* Called with cache_chain_mutex held always */
3450static void enable_cpucache(struct kmem_cache *cachep) 3566static void enable_cpucache(struct kmem_cache *cachep)
3451{ 3567{
3452 int err; 3568 int err;
3453 int limit, shared; 3569 int limit, shared;
3454 3570
3455 /* The head array serves three purposes: 3571 /*
3572 * The head array serves three purposes:
3456 * - create a LIFO ordering, i.e. return objects that are cache-warm 3573 * - create a LIFO ordering, i.e. return objects that are cache-warm
3457 * - reduce the number of spinlock operations. 3574 * - reduce the number of spinlock operations.
3458 * - reduce the number of linked list operations on the slab and 3575 * - reduce the number of linked list operations on the slab and
3459 * bufctl chains: array operations are cheaper. 3576 * bufctl chains: array operations are cheaper.
3460 * The numbers are guessed, we should auto-tune as described by 3577 * The numbers are guessed, we should auto-tune as described by
3461 * Bonwick. 3578 * Bonwick.
@@ -3471,7 +3588,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
3471 else 3588 else
3472 limit = 120; 3589 limit = 120;
3473 3590
3474 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3591 /*
3592 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3475 * allocation behaviour: Most allocs on one cpu, most free operations 3593 * allocation behaviour: Most allocs on one cpu, most free operations
3476 * on another cpu. For these cases, an efficient object passing between 3594 * on another cpu. For these cases, an efficient object passing between
3477 * cpus is necessary. This is provided by a shared array. The array 3595 * cpus is necessary. This is provided by a shared array. The array
@@ -3486,9 +3604,9 @@ static void enable_cpucache(struct kmem_cache *cachep)
3486#endif 3604#endif
3487 3605
3488#if DEBUG 3606#if DEBUG
3489 /* With debugging enabled, large batchcount lead to excessively 3607 /*
3490 * long periods with disabled local interrupts. Limit the 3608 * With debugging enabled, large batchcount lead to excessively long
3491 * batchcount 3609 * periods with disabled local interrupts. Limit the batchcount
3492 */ 3610 */
3493 if (limit > 32) 3611 if (limit > 32)
3494 limit = 32; 3612 limit = 32;
@@ -3499,23 +3617,32 @@ static void enable_cpucache(struct kmem_cache *cachep)
3499 cachep->name, -err); 3617 cachep->name, -err);
3500} 3618}
3501 3619
3502static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, 3620/*
3503 int force, int node) 3621 * Drain an array if it contains any elements taking the l3 lock only if
3622 * necessary. Note that the l3 listlock also protects the array_cache
3623 * if drain_array() is used on the shared array.
3624 */
3625void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3626 struct array_cache *ac, int force, int node)
3504{ 3627{
3505 int tofree; 3628 int tofree;
3506 3629
3507 check_spinlock_acquired_node(cachep, node); 3630 if (!ac || !ac->avail)
3631 return;
3508 if (ac->touched && !force) { 3632 if (ac->touched && !force) {
3509 ac->touched = 0; 3633 ac->touched = 0;
3510 } else if (ac->avail) { 3634 } else {
3511 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3635 spin_lock_irq(&l3->list_lock);
3512 if (tofree > ac->avail) { 3636 if (ac->avail) {
3513 tofree = (ac->avail + 1) / 2; 3637 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3638 if (tofree > ac->avail)
3639 tofree = (ac->avail + 1) / 2;
3640 free_block(cachep, ac->entry, tofree, node);
3641 ac->avail -= tofree;
3642 memmove(ac->entry, &(ac->entry[tofree]),
3643 sizeof(void *) * ac->avail);
3514 } 3644 }
3515 free_block(cachep, ac->entry, tofree, node); 3645 spin_unlock_irq(&l3->list_lock);
3516 ac->avail -= tofree;
3517 memmove(ac->entry, &(ac->entry[tofree]),
3518 sizeof(void *) * ac->avail);
3519 } 3646 }
3520} 3647}
3521 3648
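
The new drain_array() reads ac->avail before taking the list lock and re-checks it once the lock is held, so idle arrays are skipped without any locking while a racing consumer that emptied the array in between is still handled correctly. A pthread-based userspace sketch of that check, lock, re-check pattern (structure and helper names are invented; the unlocked read is deliberately racy, as the patch's comments note):

#include <pthread.h>
#include <string.h>

struct cache_array {
        pthread_mutex_t lock;           /* stands in for the l3 list lock */
        int avail;                      /* cached elements */
        int limit;
        int touched;                    /* used since the last drain? */
        void *entry[128];
};

static void free_entries(void **entry, int nr)
{
        /* stub: release nr cached objects */
        (void)entry;
        (void)nr;
}

static void drain_sketch(struct cache_array *ac, int force)
{
        int tofree;

        if (!ac->avail)                 /* unlocked fast path, racy on purpose */
                return;
        if (ac->touched && !force) {    /* recently used: just clear the flag */
                ac->touched = 0;
                return;
        }
        pthread_mutex_lock(&ac->lock);
        if (ac->avail) {                /* re-check now that the lock is held */
                tofree = force ? ac->avail : (ac->limit + 4) / 5;
                if (tofree > ac->avail)
                        tofree = (ac->avail + 1) / 2;
                free_entries(ac->entry, tofree);
                ac->avail -= tofree;
                memmove(ac->entry, &ac->entry[tofree],
                        sizeof(void *) * ac->avail);
        }
        pthread_mutex_unlock(&ac->lock);
}
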
@@ -3528,13 +3655,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
3528 * - clear the per-cpu caches for this CPU. 3655 * - clear the per-cpu caches for this CPU.
3529 * - return freeable pages to the main free memory pool. 3656 * - return freeable pages to the main free memory pool.
3530 * 3657 *
3531 * If we cannot acquire the cache chain mutex then just give up - we'll 3658 * If we cannot acquire the cache chain mutex then just give up - we'll try
3532 * try again on the next iteration. 3659 * again on the next iteration.
3533 */ 3660 */
3534static void cache_reap(void *unused) 3661static void cache_reap(void *unused)
3535{ 3662{
3536 struct list_head *walk; 3663 struct list_head *walk;
3537 struct kmem_list3 *l3; 3664 struct kmem_list3 *l3;
3665 int node = numa_node_id();
3538 3666
3539 if (!mutex_trylock(&cache_chain_mutex)) { 3667 if (!mutex_trylock(&cache_chain_mutex)) {
3540 /* Give up. Setup the next iteration. */ 3668 /* Give up. Setup the next iteration. */
@@ -3550,65 +3678,72 @@ static void cache_reap(void *unused)
3550 struct slab *slabp; 3678 struct slab *slabp;
3551 3679
3552 searchp = list_entry(walk, struct kmem_cache, next); 3680 searchp = list_entry(walk, struct kmem_cache, next);
3553
3554 if (searchp->flags & SLAB_NO_REAP)
3555 goto next;
3556
3557 check_irq_on(); 3681 check_irq_on();
3558 3682
3559 l3 = searchp->nodelists[numa_node_id()]; 3683 /*
3684 * We only take the l3 lock if absolutely necessary and we
3685 * have established with reasonable certainty that
3686 * we can do some work if the lock was obtained.
3687 */
3688 l3 = searchp->nodelists[node];
3689
3560 reap_alien(searchp, l3); 3690 reap_alien(searchp, l3);
3561 spin_lock_irq(&l3->list_lock);
3562 3691
3563 drain_array_locked(searchp, cpu_cache_get(searchp), 0, 3692 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
3564 numa_node_id());
3565 3693
3694 /*
3695 * These are racy checks but it does not matter
3696 * if we skip one check or scan twice.
3697 */
3566 if (time_after(l3->next_reap, jiffies)) 3698 if (time_after(l3->next_reap, jiffies))
3567 goto next_unlock; 3699 goto next;
3568 3700
3569 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3701 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3570 3702
3571 if (l3->shared) 3703 drain_array(searchp, l3, l3->shared, 0, node);
3572 drain_array_locked(searchp, l3->shared, 0,
3573 numa_node_id());
3574 3704
3575 if (l3->free_touched) { 3705 if (l3->free_touched) {
3576 l3->free_touched = 0; 3706 l3->free_touched = 0;
3577 goto next_unlock; 3707 goto next;
3578 } 3708 }
3579 3709
3580 tofree = 3710 tofree = (l3->free_limit + 5 * searchp->num - 1) /
3581 (l3->free_limit + 5 * searchp->num - 3711 (5 * searchp->num);
3582 1) / (5 * searchp->num);
3583 do { 3712 do {
3713 /*
3714 * Do not lock if there are no free blocks.
3715 */
3716 if (list_empty(&l3->slabs_free))
3717 break;
3718
3719 spin_lock_irq(&l3->list_lock);
3584 p = l3->slabs_free.next; 3720 p = l3->slabs_free.next;
3585 if (p == &(l3->slabs_free)) 3721 if (p == &(l3->slabs_free)) {
3722 spin_unlock_irq(&l3->list_lock);
3586 break; 3723 break;
3724 }
3587 3725
3588 slabp = list_entry(p, struct slab, list); 3726 slabp = list_entry(p, struct slab, list);
3589 BUG_ON(slabp->inuse); 3727 BUG_ON(slabp->inuse);
3590 list_del(&slabp->list); 3728 list_del(&slabp->list);
3591 STATS_INC_REAPED(searchp); 3729 STATS_INC_REAPED(searchp);
3592 3730
3593 /* Safe to drop the lock. The slab is no longer 3731 /*
3594 * linked to the cache. 3732 * Safe to drop the lock. The slab is no longer linked
3595 * searchp cannot disappear, we hold 3733 * to the cache. searchp cannot disappear, we hold
3596 * cache_chain_lock 3734 * cache_chain_lock
3597 */ 3735 */
3598 l3->free_objects -= searchp->num; 3736 l3->free_objects -= searchp->num;
3599 spin_unlock_irq(&l3->list_lock); 3737 spin_unlock_irq(&l3->list_lock);
3600 slab_destroy(searchp, slabp); 3738 slab_destroy(searchp, slabp);
3601 spin_lock_irq(&l3->list_lock);
3602 } while (--tofree > 0); 3739 } while (--tofree > 0);
3603 next_unlock: 3740next:
3604 spin_unlock_irq(&l3->list_lock);
3605 next:
3606 cond_resched(); 3741 cond_resched();
3607 } 3742 }
3608 check_irq_on(); 3743 check_irq_on();
3609 mutex_unlock(&cache_chain_mutex); 3744 mutex_unlock(&cache_chain_mutex);
3610 next_reap_node(); 3745 next_reap_node();
3611 /* Setup the next iteration */ 3746 /* Set up the next iteration */
3612 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3747 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3613} 3748}
3614 3749
@@ -3658,8 +3793,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3658{ 3793{
3659 struct kmem_cache *cachep = p; 3794 struct kmem_cache *cachep = p;
3660 ++*pos; 3795 ++*pos;
3661 return cachep->next.next == &cache_chain ? NULL 3796 return cachep->next.next == &cache_chain ?
3662 : list_entry(cachep->next.next, struct kmem_cache, next); 3797 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
3663} 3798}
3664 3799
3665static void s_stop(struct seq_file *m, void *p) 3800static void s_stop(struct seq_file *m, void *p)
@@ -3681,7 +3816,6 @@ static int s_show(struct seq_file *m, void *p)
3681 int node; 3816 int node;
3682 struct kmem_list3 *l3; 3817 struct kmem_list3 *l3;
3683 3818
3684 spin_lock(&cachep->spinlock);
3685 active_objs = 0; 3819 active_objs = 0;
3686 num_slabs = 0; 3820 num_slabs = 0;
3687 for_each_online_node(node) { 3821 for_each_online_node(node) {
@@ -3748,7 +3882,9 @@ static int s_show(struct seq_file *m, void *p)
3748 unsigned long node_frees = cachep->node_frees; 3882 unsigned long node_frees = cachep->node_frees;
3749 3883
3750 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3884 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3751 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); 3885 %4lu %4lu %4lu %4lu", allocs, high, grown,
3886 reaped, errors, max_freeable, node_allocs,
3887 node_frees);
3752 } 3888 }
3753 /* cpu stats */ 3889 /* cpu stats */
3754 { 3890 {
@@ -3762,7 +3898,6 @@ static int s_show(struct seq_file *m, void *p)
3762 } 3898 }
3763#endif 3899#endif
3764 seq_putc(m, '\n'); 3900 seq_putc(m, '\n');
3765 spin_unlock(&cachep->spinlock);
3766 return 0; 3901 return 0;
3767} 3902}
3768 3903
@@ -3820,13 +3955,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3820 mutex_lock(&cache_chain_mutex); 3955 mutex_lock(&cache_chain_mutex);
3821 res = -EINVAL; 3956 res = -EINVAL;
3822 list_for_each(p, &cache_chain) { 3957 list_for_each(p, &cache_chain) {
3823 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, 3958 struct kmem_cache *cachep;
3824 next);
3825 3959
3960 cachep = list_entry(p, struct kmem_cache, next);
3826 if (!strcmp(cachep->name, kbuf)) { 3961 if (!strcmp(cachep->name, kbuf)) {
3827 if (limit < 1 || 3962 if (limit < 1 || batchcount < 1 ||
3828 batchcount < 1 || 3963 batchcount > limit || shared < 0) {
3829 batchcount > limit || shared < 0) {
3830 res = 0; 3964 res = 0;
3831 } else { 3965 } else {
3832 res = do_tune_cpucache(cachep, limit, 3966 res = do_tune_cpucache(cachep, limit,
@@ -3840,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3840 res = count; 3974 res = count;
3841 return res; 3975 return res;
3842} 3976}
3977
3978#ifdef CONFIG_DEBUG_SLAB_LEAK
3979
3980static void *leaks_start(struct seq_file *m, loff_t *pos)
3981{
3982 loff_t n = *pos;
3983 struct list_head *p;
3984
3985 mutex_lock(&cache_chain_mutex);
3986 p = cache_chain.next;
3987 while (n--) {
3988 p = p->next;
3989 if (p == &cache_chain)
3990 return NULL;
3991 }
3992 return list_entry(p, struct kmem_cache, next);
3993}
3994
3995static inline int add_caller(unsigned long *n, unsigned long v)
3996{
3997 unsigned long *p;
3998 int l;
3999 if (!v)
4000 return 1;
4001 l = n[1];
4002 p = n + 2;
4003 while (l) {
4004 int i = l/2;
4005 unsigned long *q = p + 2 * i;
4006 if (*q == v) {
4007 q[1]++;
4008 return 1;
4009 }
4010 if (*q > v) {
4011 l = i;
4012 } else {
4013 p = q + 2;
4014 l -= i + 1;
4015 }
4016 }
4017 if (++n[1] == n[0])
4018 return 0;
4019 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4020 p[0] = v;
4021 p[1] = 1;
4022 return 1;
4023}
4024
4025static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4026{
4027 void *p;
4028 int i;
4029 if (n[0] == n[1])
4030 return;
4031 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4032 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4033 continue;
4034 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4035 return;
4036 }
4037}
4038
4039static void show_symbol(struct seq_file *m, unsigned long address)
4040{
4041#ifdef CONFIG_KALLSYMS
4042 char *modname;
4043 const char *name;
4044 unsigned long offset, size;
4045 char namebuf[KSYM_NAME_LEN+1];
4046
4047 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4048
4049 if (name) {
4050 seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4051 if (modname)
4052 seq_printf(m, " [%s]", modname);
4053 return;
4054 }
4055#endif
4056 seq_printf(m, "%p", (void *)address);
4057}
4058
4059static int leaks_show(struct seq_file *m, void *p)
4060{
4061 struct kmem_cache *cachep = p;
4062 struct list_head *q;
4063 struct slab *slabp;
4064 struct kmem_list3 *l3;
4065 const char *name;
4066 unsigned long *n = m->private;
4067 int node;
4068 int i;
4069
4070 if (!(cachep->flags & SLAB_STORE_USER))
4071 return 0;
4072 if (!(cachep->flags & SLAB_RED_ZONE))
4073 return 0;
4074
4075 /* OK, we can do it */
4076
4077 n[1] = 0;
4078
4079 for_each_online_node(node) {
4080 l3 = cachep->nodelists[node];
4081 if (!l3)
4082 continue;
4083
4084 check_irq_on();
4085 spin_lock_irq(&l3->list_lock);
4086
4087 list_for_each(q, &l3->slabs_full) {
4088 slabp = list_entry(q, struct slab, list);
4089 handle_slab(n, cachep, slabp);
4090 }
4091 list_for_each(q, &l3->slabs_partial) {
4092 slabp = list_entry(q, struct slab, list);
4093 handle_slab(n, cachep, slabp);
4094 }
4095 spin_unlock_irq(&l3->list_lock);
4096 }
4097 name = cachep->name;
4098 if (n[0] == n[1]) {
4099 /* Increase the buffer size */
4100 mutex_unlock(&cache_chain_mutex);
4101 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4102 if (!m->private) {
4103 /* Too bad, we are really out */
4104 m->private = n;
4105 mutex_lock(&cache_chain_mutex);
4106 return -ENOMEM;
4107 }
4108 *(unsigned long *)m->private = n[0] * 2;
4109 kfree(n);
4110 mutex_lock(&cache_chain_mutex);
4111 /* Now make sure this entry will be retried */
4112 m->count = m->size;
4113 return 0;
4114 }
4115 for (i = 0; i < n[1]; i++) {
4116 seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4117 show_symbol(m, n[2*i+2]);
4118 seq_putc(m, '\n');
4119 }
4120 return 0;
4121}
4122
4123struct seq_operations slabstats_op = {
4124 .start = leaks_start,
4125 .next = s_next,
4126 .stop = s_stop,
4127 .show = leaks_show,
4128};
4129#endif
3843#endif 4130#endif
3844 4131
3845/** 4132/**
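
The new /proc leak reporter stores its results in a flat unsigned long array: slot 0 holds the capacity in pairs, slot 1 the number of distinct callers, and the rest holds sorted (address, count) pairs. add_caller() binary-searches the pairs, bumps the count on a hit, otherwise memmove()s the tail to insert the new address, and returns 0 when the table is full so leaks_show() can reallocate it at twice the size. A self-contained userspace sketch of that counting structure (illustrative only):

#include <stdio.h>
#include <string.h>

/*
 * n[0] = capacity in pairs, n[1] = pairs used,
 * n[2 + 2*i] = caller address, n[3 + 2*i] = hit count, sorted by address.
 */
static int add_caller(unsigned long *n, unsigned long v)
{
        unsigned long *p = n + 2;
        unsigned long l = n[1];

        if (!v)
                return 1;
        while (l) {
                unsigned long i = l / 2;
                unsigned long *q = p + 2 * i;

                if (*q == v) {          /* already known: just count it */
                        q[1]++;
                        return 1;
                }
                if (*q > v) {
                        l = i;
                } else {
                        p = q + 2;
                        l -= i + 1;
                }
        }
        if (n[1] == n[0])               /* table full: caller must grow it */
                return 0;
        memmove(p + 2, p, (char *)(n + 2 + 2 * n[1]) - (char *)p);
        n[1]++;
        p[0] = v;
        p[1] = 1;
        return 1;
}

int main(void)
{
        unsigned long table[2 + 2 * 8] = { 8, 0 };
        unsigned long callers[] = { 0x400, 0x100, 0x400, 0x300, 0x100, 0x400 };

        for (unsigned int i = 0; i < sizeof(callers) / sizeof(callers[0]); i++)
                add_caller(table, callers[i]);
        for (unsigned long i = 0; i < table[1]; i++)
                printf("caller %#lx seen %lu times\n",
                       table[2 + 2 * i], table[3 + 2 * i]);
        return 0;
}
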
diff --git a/mm/slob.c b/mm/slob.c
index a1f42bdc0245..9bcc7e2cabfd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
294} 294}
295EXPORT_SYMBOL(kmem_cache_alloc); 295EXPORT_SYMBOL(kmem_cache_alloc);
296 296
297void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
298{
299 void *ret = kmem_cache_alloc(c, flags);
300 if (ret)
301 memset(ret, 0, c->size);
302
303 return ret;
304}
305EXPORT_SYMBOL(kmem_cache_zalloc);
306
297void kmem_cache_free(struct kmem_cache *c, void *b) 307void kmem_cache_free(struct kmem_cache *c, void *b)
298{ 308{
299 if (c->dtor) 309 if (c->dtor)
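
The kmem_cache_zalloc() addition to slob above is the usual "allocate, then zero" convenience wrapper. A hedged user-space analogue of the same shape (hypothetical xzalloc(), not a kernel API):

#include <stdlib.h>
#include <string.h>

/* Allocate size bytes and zero them, mirroring kmem_cache_zalloc()'s structure. */
static void *xzalloc(size_t size)
{
	void *ret = malloc(size);

	if (ret)
		memset(ret, 0, size);
	return ret;
}

int main(void)
{
	struct { int a; char b[16]; } *obj = xzalloc(sizeof(*obj));

	if (!obj)
		return 1;
	/* obj->a == 0 and obj->b is all zeroes here, exactly as a zalloc caller expects */
	free(obj);
	return 0;
}
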
diff --git a/mm/swap.c b/mm/swap.c
index b524ea90bddb..88895c249bc9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,19 +209,18 @@ int lru_add_drain_all(void)
209 */ 209 */
210void fastcall __page_cache_release(struct page *page) 210void fastcall __page_cache_release(struct page *page)
211{ 211{
212 unsigned long flags; 212 if (PageLRU(page)) {
213 struct zone *zone = page_zone(page); 213 unsigned long flags;
214 struct zone *zone = page_zone(page);
214 215
215 spin_lock_irqsave(&zone->lru_lock, flags); 216 spin_lock_irqsave(&zone->lru_lock, flags);
216 if (TestClearPageLRU(page)) 217 BUG_ON(!PageLRU(page));
218 __ClearPageLRU(page);
217 del_page_from_lru(zone, page); 219 del_page_from_lru(zone, page);
218 if (page_count(page) != 0) 220 spin_unlock_irqrestore(&zone->lru_lock, flags);
219 page = NULL; 221 }
220 spin_unlock_irqrestore(&zone->lru_lock, flags); 222 free_hot_page(page);
221 if (page)
222 free_hot_page(page);
223} 223}
224
225EXPORT_SYMBOL(__page_cache_release); 224EXPORT_SYMBOL(__page_cache_release);
226 225
227/* 226/*
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold)
245 pagevec_init(&pages_to_free, cold); 244 pagevec_init(&pages_to_free, cold);
246 for (i = 0; i < nr; i++) { 245 for (i = 0; i < nr; i++) {
247 struct page *page = pages[i]; 246 struct page *page = pages[i];
248 struct zone *pagezone;
249 247
250 if (unlikely(PageCompound(page))) { 248 if (unlikely(PageCompound(page))) {
251 if (zone) { 249 if (zone) {
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold)
259 if (!put_page_testzero(page)) 257 if (!put_page_testzero(page))
260 continue; 258 continue;
261 259
262 pagezone = page_zone(page); 260 if (PageLRU(page)) {
263 if (pagezone != zone) { 261 struct zone *pagezone = page_zone(page);
264 if (zone) 262 if (pagezone != zone) {
265 spin_unlock_irq(&zone->lru_lock); 263 if (zone)
266 zone = pagezone; 264 spin_unlock_irq(&zone->lru_lock);
267 spin_lock_irq(&zone->lru_lock); 265 zone = pagezone;
268 } 266 spin_lock_irq(&zone->lru_lock);
269 if (TestClearPageLRU(page)) 267 }
268 BUG_ON(!PageLRU(page));
269 __ClearPageLRU(page);
270 del_page_from_lru(zone, page); 270 del_page_from_lru(zone, page);
271 if (page_count(page) == 0) { 271 }
272 if (!pagevec_add(&pages_to_free, page)) { 272
273 if (!pagevec_add(&pages_to_free, page)) {
274 if (zone) {
273 spin_unlock_irq(&zone->lru_lock); 275 spin_unlock_irq(&zone->lru_lock);
274 __pagevec_free(&pages_to_free); 276 zone = NULL;
275 pagevec_reinit(&pages_to_free);
276 zone = NULL; /* No lock is held */
277 } 277 }
278 } 278 __pagevec_free(&pages_to_free);
279 pagevec_reinit(&pages_to_free);
280 }
279 } 281 }
280 if (zone) 282 if (zone)
281 spin_unlock_irq(&zone->lru_lock); 283 spin_unlock_irq(&zone->lru_lock);
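
The reworked release_pages() above only touches zone->lru_lock for pages that are actually on the LRU, and keeps the lock held across consecutive pages from the same zone, switching locks only when the zone changes. A user-space sketch of that "walk a batch, re-lock lazily on boundary changes" pattern, using hypothetical types and pthread mutexes in place of the zone locks:

#include <pthread.h>
#include <stdio.h>

#define NR_ZONES 3

struct item { int zone; int on_lru; };

static pthread_mutex_t zone_lock[NR_ZONES] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

static void release_batch(struct item *items, int nr)
{
	int locked = -1;	/* index of the zone lock currently held, -1 = none */
	int i;

	for (i = 0; i < nr; i++) {
		struct item *it = &items[i];

		if (it->on_lru) {
			if (it->zone != locked) {	/* only re-lock on a zone change */
				if (locked >= 0)
					pthread_mutex_unlock(&zone_lock[locked]);
				locked = it->zone;
				pthread_mutex_lock(&zone_lock[locked]);
			}
			it->on_lru = 0;			/* stand-in for del_page_from_lru() */
		}
		/* the actual free of the item happens here, independent of LRU state */
	}
	if (locked >= 0)
		pthread_mutex_unlock(&zone_lock[locked]);
}

int main(void)
{
	struct item batch[] = { {0, 1}, {0, 1}, {1, 0}, {1, 1}, {2, 1} };

	release_batch(batch, 5);
	puts("released batch with one lock switch per run of same-zone LRU pages");
	return 0;
}
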
@@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
343 zone = pagezone; 345 zone = pagezone;
344 spin_lock_irq(&zone->lru_lock); 346 spin_lock_irq(&zone->lru_lock);
345 } 347 }
346 if (TestSetPageLRU(page)) 348 BUG_ON(PageLRU(page));
347 BUG(); 349 SetPageLRU(page);
348 add_page_to_inactive_list(zone, page); 350 add_page_to_inactive_list(zone, page);
349 } 351 }
350 if (zone) 352 if (zone)
@@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
370 zone = pagezone; 372 zone = pagezone;
371 spin_lock_irq(&zone->lru_lock); 373 spin_lock_irq(&zone->lru_lock);
372 } 374 }
373 if (TestSetPageLRU(page)) 375 BUG_ON(PageLRU(page));
374 BUG(); 376 SetPageLRU(page);
375 if (TestSetPageActive(page)) 377 BUG_ON(PageActive(page));
376 BUG(); 378 SetPageActive(page);
377 add_page_to_active_list(zone, page); 379 add_page_to_active_list(zone, page);
378 } 380 }
379 if (zone) 381 if (zone)
@@ -510,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc)
510 512
511 spin_lock(&fbc->lock); 513 spin_lock(&fbc->lock);
512 ret = fbc->count; 514 ret = fbc->count;
513 for_each_cpu(cpu) { 515 for_each_possible_cpu(cpu) {
514 long *pcount = per_cpu_ptr(fbc->counters, cpu); 516 long *pcount = per_cpu_ptr(fbc->counters, cpu);
515 ret += *pcount; 517 ret += *pcount;
516 } 518 }
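
The percpu_counter_sum() hunk above switches to the explicit for_each_possible_cpu() iterator; the exact sum has to visit every possible CPU, since CPUs that are currently offline can still hold accumulated per-CPU deltas. A small user-space analogue, assuming a fixed set of "possible" slots:

#include <stdio.h>

#define NR_POSSIBLE 4	/* stand-in for all possible CPUs, online or not */

struct pc_counter {
	long count;			/* global approximate value */
	long percpu[NR_POSSIBLE];	/* per-CPU deltas; may be nonzero for offline CPUs */
};

/* Exact sum: global value plus every possible per-CPU delta. */
static long pc_sum(const struct pc_counter *c)
{
	long ret = c->count;
	int cpu;

	for (cpu = 0; cpu < NR_POSSIBLE; cpu++)
		ret += c->percpu[cpu];
	return ret;
}

int main(void)
{
	struct pc_counter c = { .count = 100, .percpu = { 3, -1, 0, 5 } };

	printf("exact value: %ld\n", pc_sum(&c));	/* prints 107 */
	return 0;
}
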
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e1636..d7af296833fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h> 17#include <linux/pagevec.h>
18#include <linux/migrate.h>
18 19
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20 21
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f9cf0d073b8..39aa9d129612 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
45 45
46struct swap_list_t swap_list = {-1, -1}; 46struct swap_list_t swap_list = {-1, -1};
47 47
48struct swap_info_struct swap_info[MAX_SWAPFILES]; 48static struct swap_info_struct swap_info[MAX_SWAPFILES];
49 49
50static DEFINE_MUTEX(swapon_mutex); 50static DEFINE_MUTEX(swapon_mutex);
51 51
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
116 last_in_cluster = offset + SWAPFILE_CLUSTER; 116 last_in_cluster = offset + SWAPFILE_CLUSTER;
117 else if (offset == last_in_cluster) { 117 else if (offset == last_in_cluster) {
118 spin_lock(&swap_lock); 118 spin_lock(&swap_lock);
119 si->cluster_next = offset-SWAPFILE_CLUSTER-1; 119 si->cluster_next = offset-SWAPFILE_CLUSTER+1;
120 goto cluster; 120 goto cluster;
121 } 121 }
122 if (unlikely(--latency_ration < 0)) { 122 if (unlikely(--latency_ration < 0)) {
@@ -417,6 +417,61 @@ void free_swap_and_cache(swp_entry_t entry)
417 } 417 }
418} 418}
419 419
420#ifdef CONFIG_SOFTWARE_SUSPEND
421/*
422 * Find the swap type that corresponds to given device (if any)
423 *
424 * This is needed for software suspend and is done in such a way that inode
425 * aliasing is allowed.
426 */
427int swap_type_of(dev_t device)
428{
429 int i;
430
431 spin_lock(&swap_lock);
432 for (i = 0; i < nr_swapfiles; i++) {
433 struct inode *inode;
434
435 if (!(swap_info[i].flags & SWP_WRITEOK))
436 continue;
437 if (!device) {
438 spin_unlock(&swap_lock);
439 return i;
440 }
441 inode = swap_info[i].swap_file->f_dentry->d_inode;
442 if (S_ISBLK(inode->i_mode) &&
443 device == MKDEV(imajor(inode), iminor(inode))) {
444 spin_unlock(&swap_lock);
445 return i;
446 }
447 }
448 spin_unlock(&swap_lock);
449 return -ENODEV;
450}
451
452/*
453 * Return either the total number of swap pages of given type, or the number
454 * of free pages of that type (depending on @free)
455 *
456 * This is needed for software suspend
457 */
458unsigned int count_swap_pages(int type, int free)
459{
460 unsigned int n = 0;
461
462 if (type < nr_swapfiles) {
463 spin_lock(&swap_lock);
464 if (swap_info[type].flags & SWP_WRITEOK) {
465 n = swap_info[type].pages;
466 if (free)
467 n -= swap_info[type].inuse_pages;
468 }
469 spin_unlock(&swap_lock);
470 }
471 return n;
472}
473#endif
474
420/* 475/*
421 * No need to decide whether this PTE shares the swap entry with others, 476 * No need to decide whether this PTE shares the swap entry with others,
422 * just let do_wp_page work it out if a write is requested later - to 477 * just let do_wp_page work it out if a write is requested later - to
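
The two helpers added to swapfile.c above exist for the software-suspend path: it needs to map a resume device to a swap type and to know how much swap of that type is free. A hedged sketch of how a caller might combine them; the function, its name, and the error choices are hypothetical, and it assumes a kernel context rather than being standalone:

/* Hypothetical swsusp-style caller; relies only on the helpers added above. */
static int hypothetical_pick_swap(dev_t resume_device)
{
	int type = swap_type_of(resume_device);

	if (type < 0)
		return type;		/* -ENODEV: no writable swap on that device */
	if (!count_swap_pages(type, 1 /* count free pages only */))
		return -ENOSPC;		/* device found but no free swap left */
	return type;			/* swap type to hand to the snapshot writer */
}
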
diff --git a/mm/util.c b/mm/util.c
index 5f4bb59da63c..7368479220b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,20 +1,22 @@
1#include <linux/slab.h> 1#include <linux/slab.h>
2#include <linux/string.h> 2#include <linux/string.h>
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/err.h>
5#include <asm/uaccess.h>
4 6
5/** 7/**
6 * kzalloc - allocate memory. The memory is set to zero. 8 * __kzalloc - allocate memory. The memory is set to zero.
7 * @size: how many bytes of memory are required. 9 * @size: how many bytes of memory are required.
8 * @flags: the type of memory to allocate. 10 * @flags: the type of memory to allocate.
9 */ 11 */
10void *kzalloc(size_t size, gfp_t flags) 12void *__kzalloc(size_t size, gfp_t flags)
11{ 13{
12 void *ret = kmalloc(size, flags); 14 void *ret = ____kmalloc(size, flags);
13 if (ret) 15 if (ret)
14 memset(ret, 0, size); 16 memset(ret, 0, size);
15 return ret; 17 return ret;
16} 18}
17EXPORT_SYMBOL(kzalloc); 19EXPORT_SYMBOL(__kzalloc);
18 20
19/* 21/*
20 * kstrdup - allocate space for and copy an existing string 22 * kstrdup - allocate space for and copy an existing string
@@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp)
31 return NULL; 33 return NULL;
32 34
33 len = strlen(s) + 1; 35 len = strlen(s) + 1;
34 buf = kmalloc(len, gfp); 36 buf = ____kmalloc(len, gfp);
35 if (buf) 37 if (buf)
36 memcpy(buf, s, len); 38 memcpy(buf, s, len);
37 return buf; 39 return buf;
38} 40}
39EXPORT_SYMBOL(kstrdup); 41EXPORT_SYMBOL(kstrdup);
42
43/*
44 * strndup_user - duplicate an existing string from user space
45 *
46 * @s: The string to duplicate
47 * @n: Maximum number of bytes to copy, including the trailing NUL.
48 */
49char *strndup_user(const char __user *s, long n)
50{
51 char *p;
52 long length;
53
54 length = strnlen_user(s, n);
55
56 if (!length)
57 return ERR_PTR(-EFAULT);
58
59 if (length > n)
60 return ERR_PTR(-EINVAL);
61
62 p = kmalloc(length, GFP_KERNEL);
63
64 if (!p)
65 return ERR_PTR(-ENOMEM);
66
67 if (copy_from_user(p, s, length)) {
68 kfree(p);
69 return ERR_PTR(-EFAULT);
70 }
71
72 p[length - 1] = '\0';
73
74 return p;
75}
76EXPORT_SYMBOL(strndup_user);
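
strndup_user() above wraps the strnlen_user()/copy_from_user() dance and reports failures as ERR_PTR values, so callers get one pointer back and one error check. A hedged usage sketch: the handler, its name, and the PAGE_SIZE limit are hypothetical, only the helper itself comes from this patch, and it assumes a kernel context:

/* Hypothetical handler taking a NUL-terminated string from userspace. */
static long hypothetical_set_name(const char __user *ubuf)
{
	char *name = strndup_user(ubuf, PAGE_SIZE);

	if (IS_ERR(name))
		return PTR_ERR(name);	/* -EFAULT, -EINVAL or -ENOMEM from the helper */

	pr_debug("new name: %s\n", name);
	kfree(name);
	return 0;
}
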
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3aa02e2..acdf001d6941 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,39 +33,21 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/notifier.h> 34#include <linux/notifier.h>
35#include <linux/rwsem.h> 35#include <linux/rwsem.h>
36#include <linux/delay.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include <asm/div64.h> 39#include <asm/div64.h>
39 40
40#include <linux/swapops.h> 41#include <linux/swapops.h>
41 42
42/* possible outcome of pageout() */ 43#include "internal.h"
43typedef enum {
44 /* failed to write page out, page is locked */
45 PAGE_KEEP,
46 /* move page to the active list, page is locked */
47 PAGE_ACTIVATE,
48 /* page has been sent to the disk successfully, page is unlocked */
49 PAGE_SUCCESS,
50 /* page is clean and locked */
51 PAGE_CLEAN,
52} pageout_t;
53 44
54struct scan_control { 45struct scan_control {
55 /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
56 unsigned long nr_to_scan;
57
58 /* Incremented by the number of inactive pages that were scanned */ 46 /* Incremented by the number of inactive pages that were scanned */
59 unsigned long nr_scanned; 47 unsigned long nr_scanned;
60 48
61 /* Incremented by the number of pages reclaimed */
62 unsigned long nr_reclaimed;
63
64 unsigned long nr_mapped; /* From page_state */ 49 unsigned long nr_mapped; /* From page_state */
65 50
66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
67 unsigned int priority;
68
69 /* This context's GFP mask */ 51 /* This context's GFP mask */
70 gfp_t gfp_mask; 52 gfp_t gfp_mask;
71 53
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker);
183 * 165 *
184 * Returns the number of slab objects which we shrunk. 166 * Returns the number of slab objects which we shrunk.
185 */ 167 */
186int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) 168unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
169 unsigned long lru_pages)
187{ 170{
188 struct shrinker *shrinker; 171 struct shrinker *shrinker;
189 int ret = 0; 172 unsigned long ret = 0;
190 173
191 if (scanned == 0) 174 if (scanned == 0)
192 scanned = SWAP_CLUSTER_MAX; 175 scanned = SWAP_CLUSTER_MAX;
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping,
306} 289}
307 290
308/* 291/*
309 * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). 292 * pageout is called by shrink_page_list() for each dirty page.
293 * Calls ->writepage().
310 */ 294 */
311static pageout_t pageout(struct page *page, struct address_space *mapping) 295pageout_t pageout(struct page *page, struct address_space *mapping)
312{ 296{
313 /* 297 /*
314 * If the page is dirty, only perform writeback if that write 298 * If the page is dirty, only perform writeback if that write
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
376 return PAGE_CLEAN; 360 return PAGE_CLEAN;
377} 361}
378 362
379static int remove_mapping(struct address_space *mapping, struct page *page) 363int remove_mapping(struct address_space *mapping, struct page *page)
380{ 364{
381 if (!mapping) 365 if (!mapping)
382 return 0; /* truncate got there first */ 366 return 0; /* truncate got there first */
@@ -414,14 +398,15 @@ cannot_free:
414} 398}
415 399
416/* 400/*
417 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 401 * shrink_page_list() returns the number of reclaimed pages
418 */ 402 */
419static int shrink_list(struct list_head *page_list, struct scan_control *sc) 403static unsigned long shrink_page_list(struct list_head *page_list,
404 struct scan_control *sc)
420{ 405{
421 LIST_HEAD(ret_pages); 406 LIST_HEAD(ret_pages);
422 struct pagevec freed_pvec; 407 struct pagevec freed_pvec;
423 int pgactivate = 0; 408 int pgactivate = 0;
424 int reclaimed = 0; 409 unsigned long nr_reclaimed = 0;
425 410
426 cond_resched(); 411 cond_resched();
427 412
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
464 * Anonymous process memory has backing store? 449 * Anonymous process memory has backing store?
465 * Try to allocate it some swap space here. 450 * Try to allocate it some swap space here.
466 */ 451 */
467 if (PageAnon(page) && !PageSwapCache(page)) { 452 if (PageAnon(page) && !PageSwapCache(page))
468 if (!sc->may_swap)
469 goto keep_locked;
470 if (!add_to_swap(page, GFP_ATOMIC)) 453 if (!add_to_swap(page, GFP_ATOMIC))
471 goto activate_locked; 454 goto activate_locked;
472 }
473#endif /* CONFIG_SWAP */ 455#endif /* CONFIG_SWAP */
474 456
475 mapping = page_mapping(page); 457 mapping = page_mapping(page);
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
481 * processes. Try to unmap it here. 463 * processes. Try to unmap it here.
482 */ 464 */
483 if (page_mapped(page) && mapping) { 465 if (page_mapped(page) && mapping) {
484 /*
485 * No unmapping if we do not swap
486 */
487 if (!sc->may_swap)
488 goto keep_locked;
489
490 switch (try_to_unmap(page, 0)) { 466 switch (try_to_unmap(page, 0)) {
491 case SWAP_FAIL: 467 case SWAP_FAIL:
492 goto activate_locked; 468 goto activate_locked;
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
561 537
562free_it: 538free_it:
563 unlock_page(page); 539 unlock_page(page);
564 reclaimed++; 540 nr_reclaimed++;
565 if (!pagevec_add(&freed_pvec, page)) 541 if (!pagevec_add(&freed_pvec, page))
566 __pagevec_release_nonlru(&freed_pvec); 542 __pagevec_release_nonlru(&freed_pvec);
567 continue; 543 continue;
@@ -579,483 +555,8 @@ keep:
579 if (pagevec_count(&freed_pvec)) 555 if (pagevec_count(&freed_pvec))
580 __pagevec_release_nonlru(&freed_pvec); 556 __pagevec_release_nonlru(&freed_pvec);
581 mod_page_state(pgactivate, pgactivate); 557 mod_page_state(pgactivate, pgactivate);
582 sc->nr_reclaimed += reclaimed; 558 return nr_reclaimed;
583 return reclaimed;
584}
585
586#ifdef CONFIG_MIGRATION
587static inline void move_to_lru(struct page *page)
588{
589 list_del(&page->lru);
590 if (PageActive(page)) {
591 /*
592 * lru_cache_add_active checks that
593 * the PG_active bit is off.
594 */
595 ClearPageActive(page);
596 lru_cache_add_active(page);
597 } else {
598 lru_cache_add(page);
599 }
600 put_page(page);
601}
602
603/*
604 * Add isolated pages on the list back to the LRU.
605 *
606 * returns the number of pages put back.
607 */
608int putback_lru_pages(struct list_head *l)
609{
610 struct page *page;
611 struct page *page2;
612 int count = 0;
613
614 list_for_each_entry_safe(page, page2, l, lru) {
615 move_to_lru(page);
616 count++;
617 }
618 return count;
619}
620
621/*
622 * Non migratable page
623 */
624int fail_migrate_page(struct page *newpage, struct page *page)
625{
626 return -EIO;
627}
628EXPORT_SYMBOL(fail_migrate_page);
629
630/*
631 * swapout a single page
632 * page is locked upon entry, unlocked on exit
633 */
634static int swap_page(struct page *page)
635{
636 struct address_space *mapping = page_mapping(page);
637
638 if (page_mapped(page) && mapping)
639 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
640 goto unlock_retry;
641
642 if (PageDirty(page)) {
643 /* Page is dirty, try to write it out here */
644 switch(pageout(page, mapping)) {
645 case PAGE_KEEP:
646 case PAGE_ACTIVATE:
647 goto unlock_retry;
648
649 case PAGE_SUCCESS:
650 goto retry;
651
652 case PAGE_CLEAN:
653 ; /* try to free the page below */
654 }
655 }
656
657 if (PagePrivate(page)) {
658 if (!try_to_release_page(page, GFP_KERNEL) ||
659 (!mapping && page_count(page) == 1))
660 goto unlock_retry;
661 }
662
663 if (remove_mapping(mapping, page)) {
664 /* Success */
665 unlock_page(page);
666 return 0;
667 }
668
669unlock_retry:
670 unlock_page(page);
671
672retry:
673 return -EAGAIN;
674} 559}
675EXPORT_SYMBOL(swap_page);
676
677/*
678 * Page migration was first developed in the context of the memory hotplug
679 * project. The main authors of the migration code are:
680 *
681 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
682 * Hirokazu Takahashi <taka@valinux.co.jp>
683 * Dave Hansen <haveblue@us.ibm.com>
684 * Christoph Lameter <clameter@sgi.com>
685 */
686
687/*
688 * Remove references for a page and establish the new page with the correct
689 * basic settings to be able to stop accesses to the page.
690 */
691int migrate_page_remove_references(struct page *newpage,
692 struct page *page, int nr_refs)
693{
694 struct address_space *mapping = page_mapping(page);
695 struct page **radix_pointer;
696
697 /*
698 * Avoid doing any of the following work if the page count
699 * indicates that the page is in use or truncate has removed
700 * the page.
701 */
702 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
703 return -EAGAIN;
704
705 /*
706 * Establish swap ptes for anonymous pages or destroy pte
707 * maps for files.
708 *
709 * In order to reestablish file backed mappings the fault handlers
710 * will take the radix tree_lock which may then be used to stop
711 * processes from accessing this page until the new page is ready.
712 *
713 * A process accessing via a swap pte (an anonymous page) will take a
714 * page_lock on the old page which will block the process until the
715 * migration attempt is complete. At that time the PageSwapCache bit
716 * will be examined. If the page was migrated then the PageSwapCache
717 * bit will be clear and the operation to retrieve the page will be
718 * retried which will find the new page in the radix tree. Then a new
719 * direct mapping may be generated based on the radix tree contents.
720 *
721 * If the page was not migrated then the PageSwapCache bit
722 * is still set and the operation may continue.
723 */
724 if (try_to_unmap(page, 1) == SWAP_FAIL)
725 /* A vma has VM_LOCKED set -> Permanent failure */
726 return -EPERM;
727
728 /*
729 * Give up if we were unable to remove all mappings.
730 */
731 if (page_mapcount(page))
732 return -EAGAIN;
733
734 write_lock_irq(&mapping->tree_lock);
735
736 radix_pointer = (struct page **)radix_tree_lookup_slot(
737 &mapping->page_tree,
738 page_index(page));
739
740 if (!page_mapping(page) || page_count(page) != nr_refs ||
741 *radix_pointer != page) {
742 write_unlock_irq(&mapping->tree_lock);
743 return -EAGAIN;
744 }
745
746 /*
747 * Now we know that no one else is looking at the page.
748 *
749 * Certain minimal information about a page must be available
750 * in order for other subsystems to properly handle the page if they
751 * find it through the radix tree update before we are finished
752 * copying the page.
753 */
754 get_page(newpage);
755 newpage->index = page->index;
756 newpage->mapping = page->mapping;
757 if (PageSwapCache(page)) {
758 SetPageSwapCache(newpage);
759 set_page_private(newpage, page_private(page));
760 }
761
762 *radix_pointer = newpage;
763 __put_page(page);
764 write_unlock_irq(&mapping->tree_lock);
765
766 return 0;
767}
768EXPORT_SYMBOL(migrate_page_remove_references);
769
770/*
771 * Copy the page to its new location
772 */
773void migrate_page_copy(struct page *newpage, struct page *page)
774{
775 copy_highpage(newpage, page);
776
777 if (PageError(page))
778 SetPageError(newpage);
779 if (PageReferenced(page))
780 SetPageReferenced(newpage);
781 if (PageUptodate(page))
782 SetPageUptodate(newpage);
783 if (PageActive(page))
784 SetPageActive(newpage);
785 if (PageChecked(page))
786 SetPageChecked(newpage);
787 if (PageMappedToDisk(page))
788 SetPageMappedToDisk(newpage);
789
790 if (PageDirty(page)) {
791 clear_page_dirty_for_io(page);
792 set_page_dirty(newpage);
793 }
794
795 ClearPageSwapCache(page);
796 ClearPageActive(page);
797 ClearPagePrivate(page);
798 set_page_private(page, 0);
799 page->mapping = NULL;
800
801 /*
802 * If any waiters have accumulated on the new page then
803 * wake them up.
804 */
805 if (PageWriteback(newpage))
806 end_page_writeback(newpage);
807}
808EXPORT_SYMBOL(migrate_page_copy);
809
810/*
811 * Common logic to directly migrate a single page suitable for
812 * pages that do not use PagePrivate.
813 *
814 * Pages are locked upon entry and exit.
815 */
816int migrate_page(struct page *newpage, struct page *page)
817{
818 int rc;
819
820 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
821
822 rc = migrate_page_remove_references(newpage, page, 2);
823
824 if (rc)
825 return rc;
826
827 migrate_page_copy(newpage, page);
828
829 /*
830 * Remove auxiliary swap entries and replace
831 * them with real ptes.
832 *
833 * Note that a real pte entry will allow processes that are not
834 * waiting on the page lock to use the new page via the page tables
835 * before the new page is unlocked.
836 */
837 remove_from_swap(newpage);
838 return 0;
839}
840EXPORT_SYMBOL(migrate_page);
841
842/*
843 * migrate_pages
844 *
845 * Two lists are passed to this function. The first list
846 * contains the pages isolated from the LRU to be migrated.
847 * The second list contains new pages that the pages isolated
848 * can be moved to. If the second list is NULL then all
849 * pages are swapped out.
850 *
851 * The function returns after 10 attempts or if no pages
852 * are movable anymore because the 'to' list has become empty
853 * or no retryable pages exist anymore.
854 *
855 * Return: Number of pages not migrated when "to" ran empty.
856 */
857int migrate_pages(struct list_head *from, struct list_head *to,
858 struct list_head *moved, struct list_head *failed)
859{
860 int retry;
861 int nr_failed = 0;
862 int pass = 0;
863 struct page *page;
864 struct page *page2;
865 int swapwrite = current->flags & PF_SWAPWRITE;
866 int rc;
867
868 if (!swapwrite)
869 current->flags |= PF_SWAPWRITE;
870
871redo:
872 retry = 0;
873
874 list_for_each_entry_safe(page, page2, from, lru) {
875 struct page *newpage = NULL;
876 struct address_space *mapping;
877
878 cond_resched();
879
880 rc = 0;
881 if (page_count(page) == 1)
882 /* page was freed from under us. So we are done. */
883 goto next;
884
885 if (to && list_empty(to))
886 break;
887
888 /*
889 * Skip locked pages during the first two passes to give the
890 * functions holding the lock time to release the page. Later we
891 * use lock_page() to have a higher chance of acquiring the
892 * lock.
893 */
894 rc = -EAGAIN;
895 if (pass > 2)
896 lock_page(page);
897 else
898 if (TestSetPageLocked(page))
899 goto next;
900
901 /*
902 * Only wait on writeback if we have already done a pass where
903 * we may have triggered writeouts for lots of pages.
904 */
905 if (pass > 0) {
906 wait_on_page_writeback(page);
907 } else {
908 if (PageWriteback(page))
909 goto unlock_page;
910 }
911
912 /*
913 * Anonymous pages must have swap cache references otherwise
914 * the information contained in the page maps cannot be
915 * preserved.
916 */
917 if (PageAnon(page) && !PageSwapCache(page)) {
918 if (!add_to_swap(page, GFP_KERNEL)) {
919 rc = -ENOMEM;
920 goto unlock_page;
921 }
922 }
923
924 if (!to) {
925 rc = swap_page(page);
926 goto next;
927 }
928
929 newpage = lru_to_page(to);
930 lock_page(newpage);
931
932 /*
933 * Pages are properly locked and writeback is complete.
934 * Try to migrate the page.
935 */
936 mapping = page_mapping(page);
937 if (!mapping)
938 goto unlock_both;
939
940 if (mapping->a_ops->migratepage) {
941 /*
942 * Most pages have a mapping and most filesystems
943 * should provide a migration function. Anonymous
944 * pages are part of swap space which also has its
945 * own migration function. This is the most common
946 * path for page migration.
947 */
948 rc = mapping->a_ops->migratepage(newpage, page);
949 goto unlock_both;
950 }
951
952 /*
953 * Default handling if a filesystem does not provide
954 * a migration function. We can only migrate clean
955 * pages so try to write out any dirty pages first.
956 */
957 if (PageDirty(page)) {
958 switch (pageout(page, mapping)) {
959 case PAGE_KEEP:
960 case PAGE_ACTIVATE:
961 goto unlock_both;
962
963 case PAGE_SUCCESS:
964 unlock_page(newpage);
965 goto next;
966
967 case PAGE_CLEAN:
968 ; /* try to migrate the page below */
969 }
970 }
971
972 /*
973 * Buffers are managed in a filesystem specific way.
974 * We must have no buffers or drop them.
975 */
976 if (!page_has_buffers(page) ||
977 try_to_release_page(page, GFP_KERNEL)) {
978 rc = migrate_page(newpage, page);
979 goto unlock_both;
980 }
981
982 /*
983 * On early passes with mapped pages simply
984 * retry. There may be a lock held for some
985 * buffers that may go away. Later
986 * swap them out.
987 */
988 if (pass > 4) {
989 /*
990 * Persistently unable to drop buffers..... As a
991 * measure of last resort we fall back to
992 * swap_page().
993 */
994 unlock_page(newpage);
995 newpage = NULL;
996 rc = swap_page(page);
997 goto next;
998 }
999
1000unlock_both:
1001 unlock_page(newpage);
1002
1003unlock_page:
1004 unlock_page(page);
1005
1006next:
1007 if (rc == -EAGAIN) {
1008 retry++;
1009 } else if (rc) {
1010 /* Permanent failure */
1011 list_move(&page->lru, failed);
1012 nr_failed++;
1013 } else {
1014 if (newpage) {
1015 /* Successful migration. Return page to LRU */
1016 move_to_lru(newpage);
1017 }
1018 list_move(&page->lru, moved);
1019 }
1020 }
1021 if (retry && pass++ < 10)
1022 goto redo;
1023
1024 if (!swapwrite)
1025 current->flags &= ~PF_SWAPWRITE;
1026
1027 return nr_failed + retry;
1028}
1029
1030/*
1031 * Isolate one page from the LRU lists and put it on the
1032 * indicated list with elevated refcount.
1033 *
1034 * Result:
1035 * 0 = page not on LRU list
1036 * 1 = page removed from LRU list and added to the specified list.
1037 */
1038int isolate_lru_page(struct page *page)
1039{
1040 int ret = 0;
1041
1042 if (PageLRU(page)) {
1043 struct zone *zone = page_zone(page);
1044 spin_lock_irq(&zone->lru_lock);
1045 if (TestClearPageLRU(page)) {
1046 ret = 1;
1047 get_page(page);
1048 if (PageActive(page))
1049 del_page_from_active_list(zone, page);
1050 else
1051 del_page_from_inactive_list(zone, page);
1052 }
1053 spin_unlock_irq(&zone->lru_lock);
1054 }
1055
1056 return ret;
1057}
1058#endif
1059 560
1060/* 561/*
1061 * zone->lru_lock is heavily contended. Some of the functions that 562 * zone->lru_lock is heavily contended. Some of the functions that
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page)
1074 * 575 *
1075 * returns how many pages were moved onto *@dst. 576 * returns how many pages were moved onto *@dst.
1076 */ 577 */
1077static int isolate_lru_pages(int nr_to_scan, struct list_head *src, 578static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1078 struct list_head *dst, int *scanned) 579 struct list_head *src, struct list_head *dst,
580 unsigned long *scanned)
1079{ 581{
1080 int nr_taken = 0; 582 unsigned long nr_taken = 0;
1081 struct page *page; 583 struct page *page;
1082 int scan = 0; 584 unsigned long scan;
1083 585
1084 while (scan++ < nr_to_scan && !list_empty(src)) { 586 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
587 struct list_head *target;
1085 page = lru_to_page(src); 588 page = lru_to_page(src);
1086 prefetchw_prev_lru_page(page, src, flags); 589 prefetchw_prev_lru_page(page, src, flags);
1087 590
1088 if (!TestClearPageLRU(page)) 591 BUG_ON(!PageLRU(page));
1089 BUG(); 592
1090 list_del(&page->lru); 593 list_del(&page->lru);
1091 if (get_page_testone(page)) { 594 target = src;
595 if (likely(get_page_unless_zero(page))) {
1092 /* 596 /*
1093 * It is being freed elsewhere 597 * Be careful not to clear PageLRU until after we're
598 * sure the page is not being freed elsewhere -- the
599 * page release code relies on it.
1094 */ 600 */
1095 __put_page(page); 601 ClearPageLRU(page);
1096 SetPageLRU(page); 602 target = dst;
1097 list_add(&page->lru, src);
1098 continue;
1099 } else {
1100 list_add(&page->lru, dst);
1101 nr_taken++; 603 nr_taken++;
1102 } 604 } /* else it is being freed elsewhere */
605
606 list_add(&page->lru, target);
1103 } 607 }
1104 608
1105 *scanned = scan; 609 *scanned = scan;
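
The isolate_lru_pages() rewrite above hinges on get_page_unless_zero(): take a reference only if the refcount is not already zero, so the scanner can never resurrect a page that is concurrently being freed, and PageLRU is cleared only once that reference is safely held. A user-space sketch of the inc-unless-zero primitive using C11 atomics (hypothetical names, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

/* Increment *count unless it is zero; returns 1 on success, 0 if it was zero. */
static int get_unless_zero(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 0) {
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return 1;	/* we now hold a reference */
		/* old was reloaded by the failed exchange; retry */
	}
	return 0;			/* object is being freed elsewhere */
}

int main(void)
{
	atomic_int live = 2, dying = 0;

	printf("live:  %d\n", get_unless_zero(&live));	/* 1, refcount is now 3 */
	printf("dying: %d\n", get_unless_zero(&dying));	/* 0, left untouched */
	return 0;
}
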
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
1107} 611}
1108 612
1109/* 613/*
1110 * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed 614 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
615 * of reclaimed pages
1111 */ 616 */
1112static void shrink_cache(struct zone *zone, struct scan_control *sc) 617static unsigned long shrink_inactive_list(unsigned long max_scan,
618 struct zone *zone, struct scan_control *sc)
1113{ 619{
1114 LIST_HEAD(page_list); 620 LIST_HEAD(page_list);
1115 struct pagevec pvec; 621 struct pagevec pvec;
1116 int max_scan = sc->nr_to_scan; 622 unsigned long nr_scanned = 0;
623 unsigned long nr_reclaimed = 0;
1117 624
1118 pagevec_init(&pvec, 1); 625 pagevec_init(&pvec, 1);
1119 626
1120 lru_add_drain(); 627 lru_add_drain();
1121 spin_lock_irq(&zone->lru_lock); 628 spin_lock_irq(&zone->lru_lock);
1122 while (max_scan > 0) { 629 do {
1123 struct page *page; 630 struct page *page;
1124 int nr_taken; 631 unsigned long nr_taken;
1125 int nr_scan; 632 unsigned long nr_scan;
1126 int nr_freed; 633 unsigned long nr_freed;
1127 634
1128 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 635 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
1129 &zone->inactive_list, 636 &zone->inactive_list,
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1132 zone->pages_scanned += nr_scan; 639 zone->pages_scanned += nr_scan;
1133 spin_unlock_irq(&zone->lru_lock); 640 spin_unlock_irq(&zone->lru_lock);
1134 641
1135 if (nr_taken == 0) 642 nr_scanned += nr_scan;
1136 goto done; 643 nr_freed = shrink_page_list(&page_list, sc);
1137 644 nr_reclaimed += nr_freed;
1138 max_scan -= nr_scan;
1139 nr_freed = shrink_list(&page_list, sc);
1140
1141 local_irq_disable(); 645 local_irq_disable();
1142 if (current_is_kswapd()) { 646 if (current_is_kswapd()) {
1143 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 647 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1146 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 650 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
1147 __mod_page_state_zone(zone, pgsteal, nr_freed); 651 __mod_page_state_zone(zone, pgsteal, nr_freed);
1148 652
653 if (nr_taken == 0)
654 goto done;
655
1149 spin_lock(&zone->lru_lock); 656 spin_lock(&zone->lru_lock);
1150 /* 657 /*
1151 * Put back any unfreeable pages. 658 * Put back any unfreeable pages.
1152 */ 659 */
1153 while (!list_empty(&page_list)) { 660 while (!list_empty(&page_list)) {
1154 page = lru_to_page(&page_list); 661 page = lru_to_page(&page_list);
1155 if (TestSetPageLRU(page)) 662 BUG_ON(PageLRU(page));
1156 BUG(); 663 SetPageLRU(page);
1157 list_del(&page->lru); 664 list_del(&page->lru);
1158 if (PageActive(page)) 665 if (PageActive(page))
1159 add_page_to_active_list(zone, page); 666 add_page_to_active_list(zone, page);
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1165 spin_lock_irq(&zone->lru_lock); 672 spin_lock_irq(&zone->lru_lock);
1166 } 673 }
1167 } 674 }
1168 } 675 } while (nr_scanned < max_scan);
1169 spin_unlock_irq(&zone->lru_lock); 676 spin_unlock(&zone->lru_lock);
1170done: 677done:
678 local_irq_enable();
1171 pagevec_release(&pvec); 679 pagevec_release(&pvec);
680 return nr_reclaimed;
1172} 681}
1173 682
1174/* 683/*
@@ -1188,13 +697,12 @@ done:
1188 * The downside is that we have to touch page->_count against each page. 697 * The downside is that we have to touch page->_count against each page.
1189 * But we had to alter page->flags anyway. 698 * But we had to alter page->flags anyway.
1190 */ 699 */
1191static void 700static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1192refill_inactive_zone(struct zone *zone, struct scan_control *sc) 701 struct scan_control *sc)
1193{ 702{
1194 int pgmoved; 703 unsigned long pgmoved;
1195 int pgdeactivate = 0; 704 int pgdeactivate = 0;
1196 int pgscanned; 705 unsigned long pgscanned;
1197 int nr_pages = sc->nr_to_scan;
1198 LIST_HEAD(l_hold); /* The pages which were snipped off */ 706 LIST_HEAD(l_hold); /* The pages which were snipped off */
1199 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 707 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
1200 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 708 LIST_HEAD(l_active); /* Pages to go onto the active_list */
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1202 struct pagevec pvec; 710 struct pagevec pvec;
1203 int reclaim_mapped = 0; 711 int reclaim_mapped = 0;
1204 712
1205 if (unlikely(sc->may_swap)) { 713 if (sc->may_swap) {
1206 long mapped_ratio; 714 long mapped_ratio;
1207 long distress; 715 long distress;
1208 long swap_tendency; 716 long swap_tendency;
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1272 while (!list_empty(&l_inactive)) { 780 while (!list_empty(&l_inactive)) {
1273 page = lru_to_page(&l_inactive); 781 page = lru_to_page(&l_inactive);
1274 prefetchw_prev_lru_page(page, &l_inactive, flags); 782 prefetchw_prev_lru_page(page, &l_inactive, flags);
1275 if (TestSetPageLRU(page)) 783 BUG_ON(PageLRU(page));
1276 BUG(); 784 SetPageLRU(page);
1277 if (!TestClearPageActive(page)) 785 BUG_ON(!PageActive(page));
1278 BUG(); 786 ClearPageActive(page);
787
1279 list_move(&page->lru, &zone->inactive_list); 788 list_move(&page->lru, &zone->inactive_list);
1280 pgmoved++; 789 pgmoved++;
1281 if (!pagevec_add(&pvec, page)) { 790 if (!pagevec_add(&pvec, page)) {
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1301 while (!list_empty(&l_active)) { 810 while (!list_empty(&l_active)) {
1302 page = lru_to_page(&l_active); 811 page = lru_to_page(&l_active);
1303 prefetchw_prev_lru_page(page, &l_active, flags); 812 prefetchw_prev_lru_page(page, &l_active, flags);
1304 if (TestSetPageLRU(page)) 813 BUG_ON(PageLRU(page));
1305 BUG(); 814 SetPageLRU(page);
1306 BUG_ON(!PageActive(page)); 815 BUG_ON(!PageActive(page));
1307 list_move(&page->lru, &zone->active_list); 816 list_move(&page->lru, &zone->active_list);
1308 pgmoved++; 817 pgmoved++;
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1327/* 836/*
1328 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 837 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1329 */ 838 */
1330static void 839static unsigned long shrink_zone(int priority, struct zone *zone,
1331shrink_zone(struct zone *zone, struct scan_control *sc) 840 struct scan_control *sc)
1332{ 841{
1333 unsigned long nr_active; 842 unsigned long nr_active;
1334 unsigned long nr_inactive; 843 unsigned long nr_inactive;
844 unsigned long nr_to_scan;
845 unsigned long nr_reclaimed = 0;
1335 846
1336 atomic_inc(&zone->reclaim_in_progress); 847 atomic_inc(&zone->reclaim_in_progress);
1337 848
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1339 * Add one to `nr_to_scan' just to make sure that the kernel will 850 * Add one to `nr_to_scan' just to make sure that the kernel will
1340 * slowly sift through the active list. 851 * slowly sift through the active list.
1341 */ 852 */
1342 zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; 853 zone->nr_scan_active += (zone->nr_active >> priority) + 1;
1343 nr_active = zone->nr_scan_active; 854 nr_active = zone->nr_scan_active;
1344 if (nr_active >= sc->swap_cluster_max) 855 if (nr_active >= sc->swap_cluster_max)
1345 zone->nr_scan_active = 0; 856 zone->nr_scan_active = 0;
1346 else 857 else
1347 nr_active = 0; 858 nr_active = 0;
1348 859
1349 zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; 860 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
1350 nr_inactive = zone->nr_scan_inactive; 861 nr_inactive = zone->nr_scan_inactive;
1351 if (nr_inactive >= sc->swap_cluster_max) 862 if (nr_inactive >= sc->swap_cluster_max)
1352 zone->nr_scan_inactive = 0; 863 zone->nr_scan_inactive = 0;
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1355 866
1356 while (nr_active || nr_inactive) { 867 while (nr_active || nr_inactive) {
1357 if (nr_active) { 868 if (nr_active) {
1358 sc->nr_to_scan = min(nr_active, 869 nr_to_scan = min(nr_active,
1359 (unsigned long)sc->swap_cluster_max); 870 (unsigned long)sc->swap_cluster_max);
1360 nr_active -= sc->nr_to_scan; 871 nr_active -= nr_to_scan;
1361 refill_inactive_zone(zone, sc); 872 shrink_active_list(nr_to_scan, zone, sc);
1362 } 873 }
1363 874
1364 if (nr_inactive) { 875 if (nr_inactive) {
1365 sc->nr_to_scan = min(nr_inactive, 876 nr_to_scan = min(nr_inactive,
1366 (unsigned long)sc->swap_cluster_max); 877 (unsigned long)sc->swap_cluster_max);
1367 nr_inactive -= sc->nr_to_scan; 878 nr_inactive -= nr_to_scan;
1368 shrink_cache(zone, sc); 879 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
880 sc);
1369 } 881 }
1370 } 882 }
1371 883
1372 throttle_vm_writeout(); 884 throttle_vm_writeout();
1373 885
1374 atomic_dec(&zone->reclaim_in_progress); 886 atomic_dec(&zone->reclaim_in_progress);
887 return nr_reclaimed;
1375} 888}
1376 889
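
shrink_zone() above now walks the active and inactive lists in interleaved batches of at most swap_cluster_max pages, so neither list can starve the other within one call. A user-space sketch of that interleaving, with made-up list sizes and placeholder work in place of shrink_active_list()/shrink_inactive_list():

#include <stdio.h>

#define BATCH 32UL	/* stand-in for sc->swap_cluster_max */

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

int main(void)
{
	unsigned long nr_active = 70, nr_inactive = 100, nr_reclaimed = 0;

	while (nr_active || nr_inactive) {
		if (nr_active) {
			unsigned long n = min_ul(nr_active, BATCH);
			nr_active -= n;
			printf("age %lu active pages\n", n);	/* shrink_active_list() */
		}
		if (nr_inactive) {
			unsigned long n = min_ul(nr_inactive, BATCH);
			nr_inactive -= n;
			nr_reclaimed += n / 2;			/* pretend half get freed */
			printf("scan %lu inactive pages\n", n);	/* shrink_inactive_list() */
		}
	}
	printf("reclaimed %lu pages\n", nr_reclaimed);
	return 0;
}
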
1377/* 890/*
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1390 * If a zone is deemed to be full of pinned pages then just give it a light 903 * If a zone is deemed to be full of pinned pages then just give it a light
1391 * scan then give up on it. 904 * scan then give up on it.
1392 */ 905 */
1393static void 906static unsigned long shrink_zones(int priority, struct zone **zones,
1394shrink_caches(struct zone **zones, struct scan_control *sc) 907 struct scan_control *sc)
1395{ 908{
909 unsigned long nr_reclaimed = 0;
1396 int i; 910 int i;
1397 911
1398 for (i = 0; zones[i] != NULL; i++) { 912 for (i = 0; zones[i] != NULL; i++) {
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1404 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 918 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1405 continue; 919 continue;
1406 920
1407 zone->temp_priority = sc->priority; 921 zone->temp_priority = priority;
1408 if (zone->prev_priority > sc->priority) 922 if (zone->prev_priority > priority)
1409 zone->prev_priority = sc->priority; 923 zone->prev_priority = priority;
1410 924
1411 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) 925 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1412 continue; /* Let kswapd poll it */ 926 continue; /* Let kswapd poll it */
1413 927
1414 shrink_zone(zone, sc); 928 nr_reclaimed += shrink_zone(priority, zone, sc);
1415 } 929 }
930 return nr_reclaimed;
1416} 931}
1417 932
1418/* 933/*
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1428 * holds filesystem locks which prevent writeout this might not work, and the 943 * holds filesystem locks which prevent writeout this might not work, and the
1429 * allocation attempt will fail. 944 * allocation attempt will fail.
1430 */ 945 */
1431int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) 946unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1432{ 947{
1433 int priority; 948 int priority;
1434 int ret = 0; 949 int ret = 0;
1435 int total_scanned = 0, total_reclaimed = 0; 950 unsigned long total_scanned = 0;
951 unsigned long nr_reclaimed = 0;
1436 struct reclaim_state *reclaim_state = current->reclaim_state; 952 struct reclaim_state *reclaim_state = current->reclaim_state;
1437 struct scan_control sc;
1438 unsigned long lru_pages = 0; 953 unsigned long lru_pages = 0;
1439 int i; 954 int i;
1440 955 struct scan_control sc = {
1441 sc.gfp_mask = gfp_mask; 956 .gfp_mask = gfp_mask,
1442 sc.may_writepage = !laptop_mode; 957 .may_writepage = !laptop_mode,
1443 sc.may_swap = 1; 958 .swap_cluster_max = SWAP_CLUSTER_MAX,
959 .may_swap = 1,
960 };
1444 961
1445 inc_page_state(allocstall); 962 inc_page_state(allocstall);
1446 963
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1457 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 974 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1458 sc.nr_mapped = read_page_state(nr_mapped); 975 sc.nr_mapped = read_page_state(nr_mapped);
1459 sc.nr_scanned = 0; 976 sc.nr_scanned = 0;
1460 sc.nr_reclaimed = 0;
1461 sc.priority = priority;
1462 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1463 if (!priority) 977 if (!priority)
1464 disable_swap_token(); 978 disable_swap_token();
1465 shrink_caches(zones, &sc); 979 nr_reclaimed += shrink_zones(priority, zones, &sc);
1466 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 980 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
1467 if (reclaim_state) { 981 if (reclaim_state) {
1468 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 982 nr_reclaimed += reclaim_state->reclaimed_slab;
1469 reclaim_state->reclaimed_slab = 0; 983 reclaim_state->reclaimed_slab = 0;
1470 } 984 }
1471 total_scanned += sc.nr_scanned; 985 total_scanned += sc.nr_scanned;
1472 total_reclaimed += sc.nr_reclaimed; 986 if (nr_reclaimed >= sc.swap_cluster_max) {
1473 if (total_reclaimed >= sc.swap_cluster_max) {
1474 ret = 1; 987 ret = 1;
1475 goto out; 988 goto out;
1476 } 989 }
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1482 * that's undesirable in laptop mode, where we *want* lumpy 995 * that's undesirable in laptop mode, where we *want* lumpy
1483 * writeout. So in laptop mode, write out the whole world. 996 * writeout. So in laptop mode, write out the whole world.
1484 */ 997 */
1485 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { 998 if (total_scanned > sc.swap_cluster_max +
999 sc.swap_cluster_max / 2) {
1486 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1000 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1487 sc.may_writepage = 1; 1001 sc.may_writepage = 1;
1488 } 1002 }
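
try_to_free_pages() above drives reclaim with a falling priority: at DEF_PRIORITY only a small fraction of each list (roughly list size >> priority) is scanned, and every unsuccessful pass lowers the priority so the scan window doubles, until either swap_cluster_max pages have been reclaimed or priority reaches zero. A compact user-space model of that escalation, with hypothetical zone size and reclaim rate:

#include <stdio.h>

#define DEF_PRIORITY	12
#define GOAL		32UL	/* stand-in for sc.swap_cluster_max */

int main(void)
{
	unsigned long zone_pages = 1 << 20;	/* hypothetical zone size */
	unsigned long reclaimed = 0;
	int priority;

	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
		unsigned long window = (zone_pages >> priority) + 1;

		/* pretend one page in a thousand scanned is reclaimable */
		reclaimed += window / 1000;
		printf("priority %2d: scan window %7lu, total reclaimed %lu\n",
		       priority, window, reclaimed);
		if (reclaimed >= GOAL)
			break;			/* enough progress, stop escalating */
	}
	return reclaimed >= GOAL ? 0 : 1;
}
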
@@ -1528,22 +1042,26 @@ out:
1528 * the page allocator fallback scheme to ensure that aging of pages is balanced 1042 * the page allocator fallback scheme to ensure that aging of pages is balanced
1529 * across the zones. 1043 * across the zones.
1530 */ 1044 */
1531static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) 1045static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
1046 int order)
1532{ 1047{
1533 int to_free = nr_pages; 1048 unsigned long to_free = nr_pages;
1534 int all_zones_ok; 1049 int all_zones_ok;
1535 int priority; 1050 int priority;
1536 int i; 1051 int i;
1537 int total_scanned, total_reclaimed; 1052 unsigned long total_scanned;
1053 unsigned long nr_reclaimed;
1538 struct reclaim_state *reclaim_state = current->reclaim_state; 1054 struct reclaim_state *reclaim_state = current->reclaim_state;
1539 struct scan_control sc; 1055 struct scan_control sc = {
1056 .gfp_mask = GFP_KERNEL,
1057 .may_swap = 1,
1058 .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
1059 };
1540 1060
1541loop_again: 1061loop_again:
1542 total_scanned = 0; 1062 total_scanned = 0;
1543 total_reclaimed = 0; 1063 nr_reclaimed = 0;
1544 sc.gfp_mask = GFP_KERNEL; 1064 sc.may_writepage = !laptop_mode;
1545 sc.may_writepage = !laptop_mode;
1546 sc.may_swap = 1;
1547 sc.nr_mapped = read_page_state(nr_mapped); 1065 sc.nr_mapped = read_page_state(nr_mapped);
1548 1066
1549 inc_page_state(pageoutrun); 1067 inc_page_state(pageoutrun);
@@ -1624,15 +1142,11 @@ scan:
1624 if (zone->prev_priority > priority) 1142 if (zone->prev_priority > priority)
1625 zone->prev_priority = priority; 1143 zone->prev_priority = priority;
1626 sc.nr_scanned = 0; 1144 sc.nr_scanned = 0;
1627 sc.nr_reclaimed = 0; 1145 nr_reclaimed += shrink_zone(priority, zone, &sc);
1628 sc.priority = priority;
1629 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1630 shrink_zone(zone, &sc);
1631 reclaim_state->reclaimed_slab = 0; 1146 reclaim_state->reclaimed_slab = 0;
1632 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1147 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1633 lru_pages); 1148 lru_pages);
1634 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 1149 nr_reclaimed += reclaim_state->reclaimed_slab;
1635 total_reclaimed += sc.nr_reclaimed;
1636 total_scanned += sc.nr_scanned; 1150 total_scanned += sc.nr_scanned;
1637 if (zone->all_unreclaimable) 1151 if (zone->all_unreclaimable)
1638 continue; 1152 continue;
@@ -1645,10 +1159,10 @@ scan:
1645 * even in laptop mode 1159 * even in laptop mode
1646 */ 1160 */
1647 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1161 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1648 total_scanned > total_reclaimed+total_reclaimed/2) 1162 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1649 sc.may_writepage = 1; 1163 sc.may_writepage = 1;
1650 } 1164 }
1651 if (nr_pages && to_free > total_reclaimed) 1165 if (nr_pages && to_free > nr_reclaimed)
1652 continue; /* swsusp: need to do more work */ 1166 continue; /* swsusp: need to do more work */
1653 if (all_zones_ok) 1167 if (all_zones_ok)
1654 break; /* kswapd: all done */ 1168 break; /* kswapd: all done */
@@ -1665,7 +1179,7 @@ scan:
1665 * matches the direct reclaim path behaviour in terms of impact 1179 * matches the direct reclaim path behaviour in terms of impact
1666 * on zone->*_priority. 1180 * on zone->*_priority.
1667 */ 1181 */
1668 if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) 1182 if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
1669 break; 1183 break;
1670 } 1184 }
1671out: 1185out:
@@ -1679,7 +1193,7 @@ out:
1679 goto loop_again; 1193 goto loop_again;
1680 } 1194 }
1681 1195
1682 return total_reclaimed; 1196 return nr_reclaimed;
1683} 1197}
1684 1198
1685/* 1199/*
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order)
1779 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1293 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
1780 * pages. 1294 * pages.
1781 */ 1295 */
1782int shrink_all_memory(int nr_pages) 1296unsigned long shrink_all_memory(unsigned long nr_pages)
1783{ 1297{
1784 pg_data_t *pgdat; 1298 pg_data_t *pgdat;
1785 int nr_to_free = nr_pages; 1299 unsigned long nr_to_free = nr_pages;
1786 int ret = 0; 1300 unsigned long ret = 0;
1301 unsigned retry = 2;
1787 struct reclaim_state reclaim_state = { 1302 struct reclaim_state reclaim_state = {
1788 .reclaimed_slab = 0, 1303 .reclaimed_slab = 0,
1789 }; 1304 };
1790 1305
1791 current->reclaim_state = &reclaim_state; 1306 current->reclaim_state = &reclaim_state;
1792 for_each_pgdat(pgdat) { 1307repeat:
1793 int freed; 1308 for_each_online_pgdat(pgdat) {
1309 unsigned long freed;
1310
1794 freed = balance_pgdat(pgdat, nr_to_free, 0); 1311 freed = balance_pgdat(pgdat, nr_to_free, 0);
1795 ret += freed; 1312 ret += freed;
1796 nr_to_free -= freed; 1313 nr_to_free -= freed;
1797 if (nr_to_free <= 0) 1314 if ((long)nr_to_free <= 0)
1798 break; 1315 break;
1799 } 1316 }
1317 if (retry-- && ret < nr_pages) {
1318 blk_congestion_wait(WRITE, HZ/5);
1319 goto repeat;
1320 }
1800 current->reclaim_state = NULL; 1321 current->reclaim_state = NULL;
1801 return ret; 1322 return ret;
1802} 1323}
@@ -1808,14 +1329,13 @@ int shrink_all_memory(int nr_pages)
1808 away, we get changed to run anywhere: as the first one comes back, 1329 away, we get changed to run anywhere: as the first one comes back,
1809 restore their cpu bindings. */ 1330 restore their cpu bindings. */
1810static int __devinit cpu_callback(struct notifier_block *nfb, 1331static int __devinit cpu_callback(struct notifier_block *nfb,
1811 unsigned long action, 1332 unsigned long action, void *hcpu)
1812 void *hcpu)
1813{ 1333{
1814 pg_data_t *pgdat; 1334 pg_data_t *pgdat;
1815 cpumask_t mask; 1335 cpumask_t mask;
1816 1336
1817 if (action == CPU_ONLINE) { 1337 if (action == CPU_ONLINE) {
1818 for_each_pgdat(pgdat) { 1338 for_each_online_pgdat(pgdat) {
1819 mask = node_to_cpumask(pgdat->node_id); 1339 mask = node_to_cpumask(pgdat->node_id);
1820 if (any_online_cpu(mask) != NR_CPUS) 1340 if (any_online_cpu(mask) != NR_CPUS)
1821 /* One of our CPUs online: restore mask */ 1341 /* One of our CPUs online: restore mask */
@@ -1829,10 +1349,17 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1829static int __init kswapd_init(void) 1349static int __init kswapd_init(void)
1830{ 1350{
1831 pg_data_t *pgdat; 1351 pg_data_t *pgdat;
1352
1832 swap_setup(); 1353 swap_setup();
1833 for_each_pgdat(pgdat) 1354 for_each_online_pgdat(pgdat) {
1834 pgdat->kswapd 1355 pid_t pid;
1835 = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); 1356
1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1358 BUG_ON(pid < 0);
1359 read_lock(&tasklist_lock);
1360 pgdat->kswapd = find_task_by_pid(pid);
1361 read_unlock(&tasklist_lock);
1362 }
1836 total_memory = nr_free_pagecache_pages(); 1363 total_memory = nr_free_pagecache_pages();
1837 hotcpu_notifier(cpu_callback, 0); 1364 hotcpu_notifier(cpu_callback, 0);
1838 return 0; 1365 return 0;
@@ -1874,46 +1401,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
1874/* 1401/*
1875 * Try to free up some pages from this zone through reclaim. 1402 * Try to free up some pages from this zone through reclaim.
1876 */ 1403 */
1877int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1404static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1878{ 1405{
1879 int nr_pages; 1406 /* Minimum pages needed in order to stay on node */
1407 const unsigned long nr_pages = 1 << order;
1880 struct task_struct *p = current; 1408 struct task_struct *p = current;
1881 struct reclaim_state reclaim_state; 1409 struct reclaim_state reclaim_state;
1882 struct scan_control sc; 1410 int priority;
1883 cpumask_t mask; 1411 unsigned long nr_reclaimed = 0;
1884 int node_id; 1412 struct scan_control sc = {
1885 1413 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1886 if (time_before(jiffies, 1414 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1887 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1415 .nr_mapped = read_page_state(nr_mapped),
1888 return 0; 1416 .swap_cluster_max = max_t(unsigned long, nr_pages,
1889 1417 SWAP_CLUSTER_MAX),
1890 if (!(gfp_mask & __GFP_WAIT) || 1418 .gfp_mask = gfp_mask,
1891 zone->all_unreclaimable || 1419 };
1892 atomic_read(&zone->reclaim_in_progress) > 0 ||
1893 (p->flags & PF_MEMALLOC))
1894 return 0;
1895
1896 node_id = zone->zone_pgdat->node_id;
1897 mask = node_to_cpumask(node_id);
1898 if (!cpus_empty(mask) && node_id != numa_node_id())
1899 return 0;
1900
1901 sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1902 sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1903 sc.nr_scanned = 0;
1904 sc.nr_reclaimed = 0;
1905 sc.priority = ZONE_RECLAIM_PRIORITY + 1;
1906 sc.nr_mapped = read_page_state(nr_mapped);
1907 sc.gfp_mask = gfp_mask;
1908 1420
1909 disable_swap_token(); 1421 disable_swap_token();
1910
1911 nr_pages = 1 << order;
1912 if (nr_pages > SWAP_CLUSTER_MAX)
1913 sc.swap_cluster_max = nr_pages;
1914 else
1915 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1916
1917 cond_resched(); 1422 cond_resched();
1918 /* 1423 /*
1919 * We need to be able to allocate from the reserves for RECLAIM_SWAP 1424 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1928,17 +1433,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1928 * Free memory by calling shrink zone with increasing priorities 1433 * Free memory by calling shrink zone with increasing priorities
1929 * until we have enough memory freed. 1434 * until we have enough memory freed.
1930 */ 1435 */
1436 priority = ZONE_RECLAIM_PRIORITY;
1931 do { 1437 do {
1932 sc.priority--; 1438 nr_reclaimed += shrink_zone(priority, zone, &sc);
1933 shrink_zone(zone, &sc); 1439 priority--;
1440 } while (priority >= 0 && nr_reclaimed < nr_pages);
1934 1441
1935 } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); 1442 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1936
1937 if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1938 /* 1443 /*
1939 * shrink_slab does not currently allow us to determine 1444 * shrink_slab() does not currently allow us to determine how
1940 * how many pages were freed in the zone. So we just 1445 * many pages were freed in this zone. So we just shake the slab
1941 * shake the slab and then go offnode for a single allocation. 1446 * a bit and then go off node for this particular allocation
1447 * despite possibly having freed enough memory to allocate in
1448 * this zone. If we freed local memory then the next
1449 * allocations will be local again.
1942 * 1450 *
1943 * shrink_slab will free memory on all zones and may take 1451 * shrink_slab will free memory on all zones and may take
1944 * a long time. 1452 * a long time.
@@ -1949,10 +1457,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1949 p->reclaim_state = NULL; 1457 p->reclaim_state = NULL;
1950 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1458 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1951 1459
1952 if (sc.nr_reclaimed == 0) 1460 if (nr_reclaimed == 0) {
1461 /*
1462 * We were unable to reclaim enough pages to stay on node. We
1463 * now allow off node accesses for a certain time period before
1464 * trying again to reclaim pages from the local zone.
1465 */
1953 zone->last_unsuccessful_zone_reclaim = jiffies; 1466 zone->last_unsuccessful_zone_reclaim = jiffies;
1467 }
1954 1468
1955 return sc.nr_reclaimed >= nr_pages; 1469 return nr_reclaimed >= nr_pages;
1956} 1470}
1957#endif
1958 1471
1472int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1473{
1474 cpumask_t mask;
1475 int node_id;
1476
1477 /*
1478 * Do not reclaim if there was a recent unsuccessful attempt at zone
1479 * reclaim. In that case we let allocations go off node for the
1480 * zone_reclaim_interval. Otherwise we would scan for each off-node
1481 * page allocation.
1482 */
1483 if (time_before(jiffies,
1484 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1485 return 0;
1486
1487 /*
1488 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
1489 * not have reclaimable pages and if we should not delay the allocation
1490 * then do not scan.
1491 */
1492 if (!(gfp_mask & __GFP_WAIT) ||
1493 zone->all_unreclaimable ||
1494 atomic_read(&zone->reclaim_in_progress) > 0 ||
1495 (current->flags & PF_MEMALLOC))
1496 return 0;
1497
1498 /*
1499 * Only run zone reclaim on the local zone or on zones that do not
1500 * have associated processors. This will favor the local processor
1501 * over remote processors and spread off node memory allocations
1502 * as wide as possible.
1503 */
1504 node_id = zone->zone_pgdat->node_id;
1505 mask = node_to_cpumask(node_id);
1506 if (!cpus_empty(mask) && node_id != numa_node_id())
1507 return 0;
1508 return __zone_reclaim(zone, gfp_mask, order);
1509}
1510#endif
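
The refactor above splits zone_reclaim() into a cheap gatekeeper (recent failure, GFP flags, concurrent reclaim, locality checks) and __zone_reclaim(), which does the actual priority-driven scan. The structure is the familiar fast-path guard around a slow path; the user-space sketch below models it under stated assumptions (hypothetical state variables and a dummy slow path, with wall-clock time standing in for jiffies):

#include <stdio.h>
#include <time.h>

static time_t last_failure;		/* stand-in for last_unsuccessful_zone_reclaim */
#define RETRY_INTERVAL 30		/* seconds, like zone_reclaim_interval */

static int reclaim_in_progress;		/* stand-in for zone->reclaim_in_progress */

static int do_slow_reclaim(void)	/* stand-in for __zone_reclaim() */
{
	return 0;			/* pretend nothing could be reclaimed */
}

static int maybe_reclaim(void)
{
	/* Fast path: give up cheaply if a recent attempt failed or one is running. */
	if (time(NULL) < last_failure + RETRY_INTERVAL)
		return 0;
	if (reclaim_in_progress)
		return 0;

	/* Slow path: do the real work and remember failures for the fast path. */
	if (!do_slow_reclaim()) {
		last_failure = time(NULL);
		return 0;
	}
	return 1;
}

int main(void)
{
	printf("first attempt:  %d\n", maybe_reclaim());	/* runs the slow path */
	printf("second attempt: %d\n", maybe_reclaim());	/* short-circuits for 30s */
	return 0;
}
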