Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               9
-rw-r--r--  mm/Makefile              6
-rw-r--r--  mm/bootmem.c            58
-rw-r--r--  mm/fadvise.c             5
-rw-r--r--  mm/filemap.c           159
-rw-r--r--  mm/filemap_xip.c         8
-rw-r--r--  mm/hugetlb.c           194
-rw-r--r--  mm/internal.h           21
-rw-r--r--  mm/madvise.c            35
-rw-r--r--  mm/memory.c             34
-rw-r--r--  mm/memory_hotplug.c      1
-rw-r--r--  mm/mempolicy.c         669
-rw-r--r--  mm/mlock.c               1
-rw-r--r--  mm/mmap.c                1
-rw-r--r--  mm/mremap.c              1
-rw-r--r--  mm/msync.c               2
-rw-r--r--  mm/nommu.c               7
-rw-r--r--  mm/oom_kill.c            5
-rw-r--r--  mm/page-writeback.c     10
-rw-r--r--  mm/page_alloc.c        472
-rw-r--r--  mm/pdflush.c             2
-rw-r--r--  mm/readahead.c          15
-rw-r--r--  mm/rmap.c               72
-rw-r--r--  mm/shmem.c              42
-rw-r--r--  mm/slab.c             1140
-rw-r--r--  mm/slob.c              385
-rw-r--r--  mm/sparse.c              4
-rw-r--r--  mm/swap.c               29
-rw-r--r--  mm/swap_state.c          8
-rw-r--r--  mm/swapfile.c           43
-rw-r--r--  mm/tiny-shmem.c         29
-rw-r--r--  mm/truncate.c           45
-rw-r--r--  mm/util.c               39
-rw-r--r--  mm/vmscan.c            468
34 files changed, 2722 insertions, 1297 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8f..a9cb80ae6409 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
11 11
12config FLATMEM_MANUAL 12config FLATMEM_MANUAL
13 bool "Flat Memory" 13 bool "Flat Memory"
14 depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE 14 depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
15 help 15 help
16 This option allows you to change some of the ways that 16 This option allows you to change some of the ways that
17 Linux manages its memory internally. Most users will 17 Linux manages its memory internally. Most users will
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS
132 default "4096" if ARM && !CPU_CACHE_VIPT 132 default "4096" if ARM && !CPU_CACHE_VIPT
133 default "4096" if PARISC && !PA20 133 default "4096" if PARISC && !PA20
134 default "4" 134 default "4"
135
136#
137# support for page migration
138#
139config MIGRATION
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f28..9aa03fa1dcc3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o slab.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o $(mmu-y) 13 prio_tree.o util.o $(mmu-y)
14 14
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 18obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o
21obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
22obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4eb..35c32290f717 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
296 unsigned long v = ~map[i / BITS_PER_LONG]; 296 unsigned long v = ~map[i / BITS_PER_LONG];
297 297
298 if (gofast && v == ~0UL) { 298 if (gofast && v == ~0UL) {
299 int j, order; 299 int order;
300 300
301 page = pfn_to_page(pfn); 301 page = pfn_to_page(pfn);
302 count += BITS_PER_LONG; 302 count += BITS_PER_LONG;
303 __ClearPageReserved(page);
304 order = ffs(BITS_PER_LONG) - 1; 303 order = ffs(BITS_PER_LONG) - 1;
305 set_page_refs(page, order); 304 __free_pages_bootmem(page, order);
306 for (j = 1; j < BITS_PER_LONG; j++) {
307 if (j + 16 < BITS_PER_LONG)
308 prefetchw(page + j + 16);
309 __ClearPageReserved(page + j);
310 set_page_count(page + j, 0);
311 }
312 __free_pages(page, order);
313 i += BITS_PER_LONG; 305 i += BITS_PER_LONG;
314 page += BITS_PER_LONG; 306 page += BITS_PER_LONG;
315 } else if (v) { 307 } else if (v) {
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
319 for (m = 1; m && i < idx; m<<=1, page++, i++) { 311 for (m = 1; m && i < idx; m<<=1, page++, i++) {
320 if (v & m) { 312 if (v & m) {
321 count++; 313 count++;
322 __ClearPageReserved(page); 314 __free_pages_bootmem(page, 0);
323 set_page_refs(page, 0);
324 __free_page(page);
325 } 315 }
326 } 316 }
327 } else { 317 } else {
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
339 count = 0; 329 count = 0;
340 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 330 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
341 count++; 331 count++;
342 __ClearPageReserved(page); 332 __free_pages_bootmem(page, 0);
343 set_page_count(page, 1);
344 __free_page(page);
345 } 333 }
346 total += count; 334 total += count;
347 bdata->node_bootmem_map = NULL; 335 bdata->node_bootmem_map = NULL;
@@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
393 return(free_all_bootmem_core(NODE_DATA(0))); 381 return(free_all_bootmem_core(NODE_DATA(0)));
394} 382}
395 383
396void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, 384void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
397 unsigned long limit)
398{ 385{
399 pg_data_t *pgdat = pgdat_list; 386 pg_data_t *pgdat = pgdat_list;
400 void *ptr; 387 void *ptr;
401 388
402 for_each_pgdat(pgdat) 389 for_each_pgdat(pgdat)
403 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 390 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
404 align, goal, limit))) 391 align, goal, 0)))
405 return(ptr); 392 return(ptr);
406 393
407 /* 394 /*
@@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
413} 400}
414 401
415 402
416void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, 403void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
417 unsigned long goal, unsigned long limit) 404 unsigned long goal)
418{ 405{
419 void *ptr; 406 void *ptr;
420 407
421 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); 408 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
422 if (ptr) 409 if (ptr)
423 return (ptr); 410 return (ptr);
424 411
425 return __alloc_bootmem_limit(size, align, goal, limit); 412 return __alloc_bootmem(size, align, goal);
413}
414
415#define LOW32LIMIT 0xffffffff
416
417void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
418{
419 pg_data_t *pgdat = pgdat_list;
420 void *ptr;
421
422 for_each_pgdat(pgdat)
423 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
424 align, goal, LOW32LIMIT)))
425 return(ptr);
426
427 /*
428 * Whoops, we cannot satisfy the allocation request.
429 */
430 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
431 panic("Out of low memory");
432 return NULL;
426} 433}
427 434
435void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
436 unsigned long align, unsigned long goal)
437{
438 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
439}
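
The bootmem rework above funnels all boot-time page freeing through __free_pages_bootmem() and splits the allocator into a plain variant plus a low-memory variant capped at LOW32LIMIT (4GB). As a hedged illustration of the new low-memory entry point (the caller name, the SMP_CACHE_BYTES alignment, and the goal of 0 are assumptions, not taken from this diff):

/* Illustrative sketch only: boot-time allocation of a buffer that must live
 * below 4GB, via the __alloc_bootmem_low() variant added above. */
void * __init example_alloc_dma32_buffer(unsigned long size)
{
	/* Internally this walks every node and passes LOW32LIMIT (0xffffffff)
	 * as the limit to __alloc_bootmem_core(), so the returned memory is
	 * guaranteed to be addressable with 32 bits. */
	return __alloc_bootmem_low(size, SMP_CACHE_BYTES, 0);
}
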
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5af..d257c89e7704 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
37 if (!file) 37 if (!file)
38 return -EBADF; 38 return -EBADF;
39 39
40 if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
41 ret = -ESPIPE;
42 goto out;
43 }
44
40 mapping = file->f_mapping; 45 mapping = file->f_mapping;
41 if (!mapping || len < 0) { 46 if (!mapping || len < 0) {
42 ret = -EINVAL; 47 ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde158..a965b6b35f26 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/aio.h> 17#include <linux/aio.h>
18#include <linux/capability.h>
18#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
19#include <linux/mm.h> 20#include <linux/mm.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
@@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
61 * ->swap_lock (exclusive_swap_page, others) 62 * ->swap_lock (exclusive_swap_page, others)
62 * ->mapping->tree_lock 63 * ->mapping->tree_lock
63 * 64 *
64 * ->i_sem 65 * ->i_mutex
65 * ->i_mmap_lock (truncate->unmap_mapping_range) 66 * ->i_mmap_lock (truncate->unmap_mapping_range)
66 * 67 *
67 * ->mmap_sem 68 * ->mmap_sem
@@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
73 * ->lock_page (access_process_vm) 74 * ->lock_page (access_process_vm)
74 * 75 *
75 * ->mmap_sem 76 * ->mmap_sem
76 * ->i_sem (msync) 77 * ->i_mutex (msync)
77 * 78 *
78 * ->i_sem 79 * ->i_mutex
79 * ->i_alloc_sem (various) 80 * ->i_alloc_sem (various)
80 * 81 *
81 * ->inode_lock 82 * ->inode_lock
@@ -276,11 +277,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
276 * integrity" operation. It waits upon in-flight writeout before starting and 277 * integrity" operation. It waits upon in-flight writeout before starting and
277 * waiting upon new writeout. If there was an IO error, return it. 278 * waiting upon new writeout. If there was an IO error, return it.
278 * 279 *
279 * We need to re-take i_sem during the generic_osync_inode list walk because 280 * We need to re-take i_mutex during the generic_osync_inode list walk because
280 * it is otherwise livelockable. 281 * it is otherwise livelockable.
281 */ 282 */
282int sync_page_range(struct inode *inode, struct address_space *mapping, 283int sync_page_range(struct inode *inode, struct address_space *mapping,
283 loff_t pos, size_t count) 284 loff_t pos, loff_t count)
284{ 285{
285 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 286 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
286 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 287 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -290,9 +291,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
290 return 0; 291 return 0;
291 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); 292 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
292 if (ret == 0) { 293 if (ret == 0) {
293 down(&inode->i_sem); 294 mutex_lock(&inode->i_mutex);
294 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); 295 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
295 up(&inode->i_sem); 296 mutex_unlock(&inode->i_mutex);
296 } 297 }
297 if (ret == 0) 298 if (ret == 0)
298 ret = wait_on_page_writeback_range(mapping, start, end); 299 ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,13 +302,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
301EXPORT_SYMBOL(sync_page_range); 302EXPORT_SYMBOL(sync_page_range);
302 303
303/* 304/*
304 * Note: Holding i_sem across sync_page_range_nolock is not a good idea 305 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
305 * as it forces O_SYNC writers to different parts of the same file 306 * as it forces O_SYNC writers to different parts of the same file
306 * to be serialised right until io completion. 307 * to be serialised right until io completion.
307 */ 308 */
308static int sync_page_range_nolock(struct inode *inode, 309int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
309 struct address_space *mapping, 310 loff_t pos, loff_t count)
310 loff_t pos, size_t count)
311{ 311{
312 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 312 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
313 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 313 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +322,7 @@ static int sync_page_range_nolock(struct inode *inode,
322 ret = wait_on_page_writeback_range(mapping, start, end); 322 ret = wait_on_page_writeback_range(mapping, start, end);
323 return ret; 323 return ret;
324} 324}
325EXPORT_SYMBOL(sync_page_range_nolock);
325 326
326/** 327/**
327 * filemap_fdatawait - walk the list of under-writeback pages of the given 328 * filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -343,30 +344,44 @@ EXPORT_SYMBOL(filemap_fdatawait);
343 344
344int filemap_write_and_wait(struct address_space *mapping) 345int filemap_write_and_wait(struct address_space *mapping)
345{ 346{
346 int retval = 0; 347 int err = 0;
347 348
348 if (mapping->nrpages) { 349 if (mapping->nrpages) {
349 retval = filemap_fdatawrite(mapping); 350 err = filemap_fdatawrite(mapping);
350 if (retval == 0) 351 /*
351 retval = filemap_fdatawait(mapping); 352 * Even if the above returned error, the pages may be
353 * written partially (e.g. -ENOSPC), so we wait for it.
354 * But the -EIO is special case, it may indicate the worst
355 * thing (e.g. bug) happened, so we avoid waiting for it.
356 */
357 if (err != -EIO) {
358 int err2 = filemap_fdatawait(mapping);
359 if (!err)
360 err = err2;
361 }
352 } 362 }
353 return retval; 363 return err;
354} 364}
365EXPORT_SYMBOL(filemap_write_and_wait);
355 366
356int filemap_write_and_wait_range(struct address_space *mapping, 367int filemap_write_and_wait_range(struct address_space *mapping,
357 loff_t lstart, loff_t lend) 368 loff_t lstart, loff_t lend)
358{ 369{
359 int retval = 0; 370 int err = 0;
360 371
361 if (mapping->nrpages) { 372 if (mapping->nrpages) {
362 retval = __filemap_fdatawrite_range(mapping, lstart, lend, 373 err = __filemap_fdatawrite_range(mapping, lstart, lend,
363 WB_SYNC_ALL); 374 WB_SYNC_ALL);
364 if (retval == 0) 375 /* See comment of filemap_write_and_wait() */
365 retval = wait_on_page_writeback_range(mapping, 376 if (err != -EIO) {
366 lstart >> PAGE_CACHE_SHIFT, 377 int err2 = wait_on_page_writeback_range(mapping,
367 lend >> PAGE_CACHE_SHIFT); 378 lstart >> PAGE_CACHE_SHIFT,
379 lend >> PAGE_CACHE_SHIFT);
380 if (!err)
381 err = err2;
382 }
368 } 383 }
369 return retval; 384 return err;
370} 385}
371 386
372/* 387/*
@@ -555,11 +570,12 @@ repeat:
555 page_cache_get(page); 570 page_cache_get(page);
556 if (TestSetPageLocked(page)) { 571 if (TestSetPageLocked(page)) {
557 read_unlock_irq(&mapping->tree_lock); 572 read_unlock_irq(&mapping->tree_lock);
558 lock_page(page); 573 __lock_page(page);
559 read_lock_irq(&mapping->tree_lock); 574 read_lock_irq(&mapping->tree_lock);
560 575
561 /* Has the page been truncated while we slept? */ 576 /* Has the page been truncated while we slept? */
562 if (page->mapping != mapping || page->index != offset) { 577 if (unlikely(page->mapping != mapping ||
578 page->index != offset)) {
563 unlock_page(page); 579 unlock_page(page);
564 page_cache_release(page); 580 page_cache_release(page);
565 goto repeat; 581 goto repeat;
@@ -831,8 +847,13 @@ readpage:
831 /* Start the actual read. The read will unlock the page. */ 847 /* Start the actual read. The read will unlock the page. */
832 error = mapping->a_ops->readpage(filp, page); 848 error = mapping->a_ops->readpage(filp, page);
833 849
834 if (unlikely(error)) 850 if (unlikely(error)) {
851 if (error == AOP_TRUNCATED_PAGE) {
852 page_cache_release(page);
853 goto find_page;
854 }
835 goto readpage_error; 855 goto readpage_error;
856 }
836 857
837 if (!PageUptodate(page)) { 858 if (!PageUptodate(page)) {
838 lock_page(page); 859 lock_page(page);
@@ -1152,26 +1173,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152{ 1173{
1153 struct address_space *mapping = file->f_mapping; 1174 struct address_space *mapping = file->f_mapping;
1154 struct page *page; 1175 struct page *page;
1155 int error; 1176 int ret;
1156 1177
1157 page = page_cache_alloc_cold(mapping); 1178 do {
1158 if (!page) 1179 page = page_cache_alloc_cold(mapping);
1159 return -ENOMEM; 1180 if (!page)
1181 return -ENOMEM;
1182
1183 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1184 if (ret == 0)
1185 ret = mapping->a_ops->readpage(file, page);
1186 else if (ret == -EEXIST)
1187 ret = 0; /* losing race to add is OK */
1160 1188
1161 error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1162 if (!error) {
1163 error = mapping->a_ops->readpage(file, page);
1164 page_cache_release(page); 1189 page_cache_release(page);
1165 return error;
1166 }
1167 1190
1168 /* 1191 } while (ret == AOP_TRUNCATED_PAGE);
1169 * We arrive here in the unlikely event that someone 1192
1170 * raced with us and added our page to the cache first 1193 return ret;
1171 * or we are out of memory for radix-tree nodes.
1172 */
1173 page_cache_release(page);
1174 return error == -EEXIST ? 0 : error;
1175} 1194}
1176 1195
1177#define MMAP_LOTSAMISS (100) 1196#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1350,14 @@ page_not_uptodate:
1331 goto success; 1350 goto success;
1332 } 1351 }
1333 1352
1334 if (!mapping->a_ops->readpage(file, page)) { 1353 error = mapping->a_ops->readpage(file, page);
1354 if (!error) {
1335 wait_on_page_locked(page); 1355 wait_on_page_locked(page);
1336 if (PageUptodate(page)) 1356 if (PageUptodate(page))
1337 goto success; 1357 goto success;
1358 } else if (error == AOP_TRUNCATED_PAGE) {
1359 page_cache_release(page);
1360 goto retry_find;
1338 } 1361 }
1339 1362
1340 /* 1363 /*
@@ -1358,10 +1381,14 @@ page_not_uptodate:
1358 goto success; 1381 goto success;
1359 } 1382 }
1360 ClearPageError(page); 1383 ClearPageError(page);
1361 if (!mapping->a_ops->readpage(file, page)) { 1384 error = mapping->a_ops->readpage(file, page);
1385 if (!error) {
1362 wait_on_page_locked(page); 1386 wait_on_page_locked(page);
1363 if (PageUptodate(page)) 1387 if (PageUptodate(page))
1364 goto success; 1388 goto success;
1389 } else if (error == AOP_TRUNCATED_PAGE) {
1390 page_cache_release(page);
1391 goto retry_find;
1365 } 1392 }
1366 1393
1367 /* 1394 /*
@@ -1444,10 +1471,14 @@ page_not_uptodate:
1444 goto success; 1471 goto success;
1445 } 1472 }
1446 1473
1447 if (!mapping->a_ops->readpage(file, page)) { 1474 error = mapping->a_ops->readpage(file, page);
1475 if (!error) {
1448 wait_on_page_locked(page); 1476 wait_on_page_locked(page);
1449 if (PageUptodate(page)) 1477 if (PageUptodate(page))
1450 goto success; 1478 goto success;
1479 } else if (error == AOP_TRUNCATED_PAGE) {
1480 page_cache_release(page);
1481 goto retry_find;
1451 } 1482 }
1452 1483
1453 /* 1484 /*
@@ -1470,10 +1501,14 @@ page_not_uptodate:
1470 } 1501 }
1471 1502
1472 ClearPageError(page); 1503 ClearPageError(page);
1473 if (!mapping->a_ops->readpage(file, page)) { 1504 error = mapping->a_ops->readpage(file, page);
1505 if (!error) {
1474 wait_on_page_locked(page); 1506 wait_on_page_locked(page);
1475 if (PageUptodate(page)) 1507 if (PageUptodate(page))
1476 goto success; 1508 goto success;
1509 } else if (error == AOP_TRUNCATED_PAGE) {
1510 page_cache_release(page);
1511 goto retry_find;
1477 } 1512 }
1478 1513
1479 /* 1514 /*
@@ -1858,7 +1893,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1858 /* 1893 /*
1859 * Sync the fs metadata but not the minor inode changes and 1894 * Sync the fs metadata but not the minor inode changes and
1860 * of course not the data as we did direct DMA for the IO. 1895 * of course not the data as we did direct DMA for the IO.
1861 * i_sem is held, which protects generic_osync_inode() from 1896 * i_mutex is held, which protects generic_osync_inode() from
1862 * livelocking. 1897 * livelocking.
1863 */ 1898 */
1864 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 1899 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -1934,12 +1969,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1934 status = a_ops->prepare_write(file, page, offset, offset+bytes); 1969 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1935 if (unlikely(status)) { 1970 if (unlikely(status)) {
1936 loff_t isize = i_size_read(inode); 1971 loff_t isize = i_size_read(inode);
1972
1973 if (status != AOP_TRUNCATED_PAGE)
1974 unlock_page(page);
1975 page_cache_release(page);
1976 if (status == AOP_TRUNCATED_PAGE)
1977 continue;
1937 /* 1978 /*
1938 * prepare_write() may have instantiated a few blocks 1979 * prepare_write() may have instantiated a few blocks
1939 * outside i_size. Trim these off again. 1980 * outside i_size. Trim these off again.
1940 */ 1981 */
1941 unlock_page(page);
1942 page_cache_release(page);
1943 if (pos + bytes > isize) 1982 if (pos + bytes > isize)
1944 vmtruncate(inode, isize); 1983 vmtruncate(inode, isize);
1945 break; 1984 break;
@@ -1952,6 +1991,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1952 cur_iov, iov_base, bytes); 1991 cur_iov, iov_base, bytes);
1953 flush_dcache_page(page); 1992 flush_dcache_page(page);
1954 status = a_ops->commit_write(file, page, offset, offset+bytes); 1993 status = a_ops->commit_write(file, page, offset, offset+bytes);
1994 if (status == AOP_TRUNCATED_PAGE) {
1995 page_cache_release(page);
1996 continue;
1997 }
1955 if (likely(copied > 0)) { 1998 if (likely(copied > 0)) {
1956 if (!status) 1999 if (!status)
1957 status = copied; 2000 status = copied;
@@ -2066,7 +2109,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2066 if (err) 2109 if (err)
2067 goto out; 2110 goto out;
2068 2111
2069 inode_update_time(inode, 1); 2112 file_update_time(file);
2070 2113
2071 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2114 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2072 if (unlikely(file->f_flags & O_DIRECT)) { 2115 if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2153,10 +2196,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2153 2196
2154 BUG_ON(iocb->ki_pos != pos); 2197 BUG_ON(iocb->ki_pos != pos);
2155 2198
2156 down(&inode->i_sem); 2199 mutex_lock(&inode->i_mutex);
2157 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, 2200 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2158 &iocb->ki_pos); 2201 &iocb->ki_pos);
2159 up(&inode->i_sem); 2202 mutex_unlock(&inode->i_mutex);
2160 2203
2161 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2204 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2162 ssize_t err; 2205 ssize_t err;
@@ -2178,9 +2221,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
2178 struct iovec local_iov = { .iov_base = (void __user *)buf, 2221 struct iovec local_iov = { .iov_base = (void __user *)buf,
2179 .iov_len = count }; 2222 .iov_len = count };
2180 2223
2181 down(&inode->i_sem); 2224 mutex_lock(&inode->i_mutex);
2182 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); 2225 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2183 up(&inode->i_sem); 2226 mutex_unlock(&inode->i_mutex);
2184 2227
2185 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2228 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2186 ssize_t err; 2229 ssize_t err;
@@ -2214,9 +2257,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2214 struct inode *inode = mapping->host; 2257 struct inode *inode = mapping->host;
2215 ssize_t ret; 2258 ssize_t ret;
2216 2259
2217 down(&inode->i_sem); 2260 mutex_lock(&inode->i_mutex);
2218 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); 2261 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2219 up(&inode->i_sem); 2262 mutex_unlock(&inode->i_mutex);
2220 2263
2221 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2264 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2222 int err; 2265 int err;
@@ -2230,7 +2273,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2230EXPORT_SYMBOL(generic_file_writev); 2273EXPORT_SYMBOL(generic_file_writev);
2231 2274
2232/* 2275/*
2233 * Called under i_sem for writes to S_ISREG files. Returns -EIO if something 2276 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2234 * went wrong during pagecache shootdown. 2277 * went wrong during pagecache shootdown.
2235 */ 2278 */
2236static ssize_t 2279static ssize_t
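
Several paths above (page_cache_read(), the filemap_nopage() retry labels, generic_file_buffered_write()) now treat AOP_TRUNCATED_PAGE as "the page went away under the address_space operation, drop the reference and retry". The other half of that contract is the filesystem side; a hedged sketch of how a ->readpage() implementation might use it (the example filesystem and its locking helpers are hypothetical, not part of this diff):

/* Hypothetical filesystem: shows the AOP_TRUNCATED_PAGE convention that the
 * retry loops above rely on.  If the page was truncated while the filesystem
 * blocked on its own locks, it unlocks the page and returns
 * AOP_TRUNCATED_PAGE so the caller releases its reference and retries. */
static int examplefs_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int ret;

	ret = examplefs_lock_inode(inode);	/* may drop/retake the page lock */
	if (ret) {
		unlock_page(page);
		return ret;
	}

	if (!page->mapping) {			/* truncated while we blocked */
		examplefs_unlock_inode(inode);
		unlock_page(page);
		return AOP_TRUNCATED_PAGE;
	}

	ret = examplefs_fill_page(inode, page);	/* does the actual I/O */
	examplefs_unlock_inode(inode);
	return ret;
}
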
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9cf687e4a29a..b960ac8e5918 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
338 *ppos = pos; 338 *ppos = pos;
339 /* 339 /*
340 * No need to use i_size_read() here, the i_size 340 * No need to use i_size_read() here, the i_size
341 * cannot change under us because we hold i_sem. 341 * cannot change under us because we hold i_mutex.
342 */ 342 */
343 if (pos > inode->i_size) { 343 if (pos > inode->i_size) {
344 i_size_write(inode, pos); 344 i_size_write(inode, pos);
@@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
358 loff_t pos; 358 loff_t pos;
359 ssize_t ret; 359 ssize_t ret;
360 360
361 down(&inode->i_sem); 361 mutex_lock(&inode->i_mutex);
362 362
363 if (!access_ok(VERIFY_READ, buf, len)) { 363 if (!access_ok(VERIFY_READ, buf, len)) {
364 ret=-EFAULT; 364 ret=-EFAULT;
@@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
383 if (ret) 383 if (ret)
384 goto out_backing; 384 goto out_backing;
385 385
386 inode_update_time(inode, 1); 386 file_update_time(filp);
387 387
388 ret = __xip_file_write (filp, buf, count, pos, ppos); 388 ret = __xip_file_write (filp, buf, count, pos, ppos);
389 389
390 out_backing: 390 out_backing:
391 current->backing_dev_info = NULL; 391 current->backing_dev_info = NULL;
392 out_up: 392 out_up:
393 up(&inode->i_sem); 393 mutex_unlock(&inode->i_mutex);
394 return ret; 394 return ret;
395} 395}
396EXPORT_SYMBOL_GPL(xip_file_write); 396EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471b..b21d78c941b5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,9 @@
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15#include <linux/cpuset.h>
16
14#include <asm/page.h> 17#include <asm/page.h>
15#include <asm/pgtable.h> 18#include <asm/pgtable.h>
16 19
@@ -36,18 +39,22 @@ static void enqueue_huge_page(struct page *page)
36 free_huge_pages_node[nid]++; 39 free_huge_pages_node[nid]++;
37} 40}
38 41
39static struct page *dequeue_huge_page(void) 42static struct page *dequeue_huge_page(struct vm_area_struct *vma,
43 unsigned long address)
40{ 44{
41 int nid = numa_node_id(); 45 int nid = numa_node_id();
42 struct page *page = NULL; 46 struct page *page = NULL;
47 struct zonelist *zonelist = huge_zonelist(vma, address);
48 struct zone **z;
43 49
44 if (list_empty(&hugepage_freelists[nid])) { 50 for (z = zonelist->zones; *z; z++) {
45 for (nid = 0; nid < MAX_NUMNODES; ++nid) 51 nid = (*z)->zone_pgdat->node_id;
46 if (!list_empty(&hugepage_freelists[nid])) 52 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
47 break; 53 !list_empty(&hugepage_freelists[nid]))
54 break;
48 } 55 }
49 if (nid >= 0 && nid < MAX_NUMNODES && 56
50 !list_empty(&hugepage_freelists[nid])) { 57 if (*z) {
51 page = list_entry(hugepage_freelists[nid].next, 58 page = list_entry(hugepage_freelists[nid].next,
52 struct page, lru); 59 struct page, lru);
53 list_del(&page->lru); 60 list_del(&page->lru);
@@ -85,13 +92,13 @@ void free_huge_page(struct page *page)
85 spin_unlock(&hugetlb_lock); 92 spin_unlock(&hugetlb_lock);
86} 93}
87 94
88struct page *alloc_huge_page(void) 95struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
89{ 96{
90 struct page *page; 97 struct page *page;
91 int i; 98 int i;
92 99
93 spin_lock(&hugetlb_lock); 100 spin_lock(&hugetlb_lock);
94 page = dequeue_huge_page(); 101 page = dequeue_huge_page(vma, addr);
95 if (!page) { 102 if (!page) {
96 spin_unlock(&hugetlb_lock); 103 spin_unlock(&hugetlb_lock);
97 return NULL; 104 return NULL;
@@ -194,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
194 spin_lock(&hugetlb_lock); 201 spin_lock(&hugetlb_lock);
195 try_to_free_low(count); 202 try_to_free_low(count);
196 while (count < nr_huge_pages) { 203 while (count < nr_huge_pages) {
197 struct page *page = dequeue_huge_page(); 204 struct page *page = dequeue_huge_page(NULL, 0);
198 if (!page) 205 if (!page)
199 break; 206 break;
200 update_and_free_page(page); 207 update_and_free_page(page);
@@ -261,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
261 .nopage = hugetlb_nopage, 268 .nopage = hugetlb_nopage,
262}; 269};
263 270
264static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) 271static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
272 int writable)
265{ 273{
266 pte_t entry; 274 pte_t entry;
267 275
268 if (vma->vm_flags & VM_WRITE) { 276 if (writable) {
269 entry = 277 entry =
270 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 278 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
271 } else { 279 } else {
@@ -277,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
277 return entry; 285 return entry;
278} 286}
279 287
288static void set_huge_ptep_writable(struct vm_area_struct *vma,
289 unsigned long address, pte_t *ptep)
290{
291 pte_t entry;
292
293 entry = pte_mkwrite(pte_mkdirty(*ptep));
294 ptep_set_access_flags(vma, address, ptep, entry, 1);
295 update_mmu_cache(vma, address, entry);
296 lazy_mmu_prot_update(entry);
297}
298
299
280int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 300int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
281 struct vm_area_struct *vma) 301 struct vm_area_struct *vma)
282{ 302{
283 pte_t *src_pte, *dst_pte, entry; 303 pte_t *src_pte, *dst_pte, entry;
284 struct page *ptepage; 304 struct page *ptepage;
285 unsigned long addr; 305 unsigned long addr;
306 int cow;
307
308 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
286 309
287 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 310 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
288 src_pte = huge_pte_offset(src, addr); 311 src_pte = huge_pte_offset(src, addr);
@@ -294,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
294 spin_lock(&dst->page_table_lock); 317 spin_lock(&dst->page_table_lock);
295 spin_lock(&src->page_table_lock); 318 spin_lock(&src->page_table_lock);
296 if (!pte_none(*src_pte)) { 319 if (!pte_none(*src_pte)) {
320 if (cow)
321 ptep_set_wrprotect(src, addr, src_pte);
297 entry = *src_pte; 322 entry = *src_pte;
298 ptepage = pte_page(entry); 323 ptepage = pte_page(entry);
299 get_page(ptepage); 324 get_page(ptepage);
@@ -345,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
345 flush_tlb_range(vma, start, end); 370 flush_tlb_range(vma, start, end);
346} 371}
347 372
348static struct page *find_lock_huge_page(struct address_space *mapping, 373static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
349 unsigned long idx) 374 unsigned long address, pte_t *ptep, pte_t pte)
350{ 375{
351 struct page *page; 376 struct page *old_page, *new_page;
352 int err; 377 int i, avoidcopy;
353 struct inode *inode = mapping->host;
354 unsigned long size;
355 378
356retry: 379 old_page = pte_page(pte);
357 page = find_lock_page(mapping, idx);
358 if (page)
359 goto out;
360 380
361 /* Check to make sure the mapping hasn't been truncated */ 381 /* If no-one else is actually using this page, avoid the copy
362 size = i_size_read(inode) >> HPAGE_SHIFT; 382 * and just make the page writable */
363 if (idx >= size) 383 avoidcopy = (page_count(old_page) == 1);
364 goto out; 384 if (avoidcopy) {
385 set_huge_ptep_writable(vma, address, ptep);
386 return VM_FAULT_MINOR;
387 }
365 388
366 if (hugetlb_get_quota(mapping)) 389 page_cache_get(old_page);
367 goto out; 390 new_page = alloc_huge_page(vma, address);
368 page = alloc_huge_page(); 391
369 if (!page) { 392 if (!new_page) {
370 hugetlb_put_quota(mapping); 393 page_cache_release(old_page);
371 goto out; 394
395 /* Logically this is OOM, not a SIGBUS, but an OOM
396 * could cause the kernel to go killing other
397 * processes which won't help the hugepage situation
398 * at all (?) */
399 return VM_FAULT_SIGBUS;
372 } 400 }
373 401
374 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 402 spin_unlock(&mm->page_table_lock);
375 if (err) { 403 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
376 put_page(page); 404 copy_user_highpage(new_page + i, old_page + i,
377 hugetlb_put_quota(mapping); 405 address + i*PAGE_SIZE);
378 if (err == -EEXIST) 406 spin_lock(&mm->page_table_lock);
379 goto retry; 407
380 page = NULL; 408 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
409 if (likely(pte_same(*ptep, pte))) {
410 /* Break COW */
411 set_huge_pte_at(mm, address, ptep,
412 make_huge_pte(vma, new_page, 1));
413 /* Make the old page be freed below */
414 new_page = old_page;
381 } 415 }
382out: 416 page_cache_release(new_page);
383 return page; 417 page_cache_release(old_page);
418 return VM_FAULT_MINOR;
384} 419}
385 420
386int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 421int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
387 unsigned long address, int write_access) 422 unsigned long address, pte_t *ptep, int write_access)
388{ 423{
389 int ret = VM_FAULT_SIGBUS; 424 int ret = VM_FAULT_SIGBUS;
390 unsigned long idx; 425 unsigned long idx;
391 unsigned long size; 426 unsigned long size;
392 pte_t *pte;
393 struct page *page; 427 struct page *page;
394 struct address_space *mapping; 428 struct address_space *mapping;
395 429 pte_t new_pte;
396 pte = huge_pte_alloc(mm, address);
397 if (!pte)
398 goto out;
399 430
400 mapping = vma->vm_file->f_mapping; 431 mapping = vma->vm_file->f_mapping;
401 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 432 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -405,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
405 * Use page lock to guard against racing truncation 436 * Use page lock to guard against racing truncation
406 * before we get page_table_lock. 437 * before we get page_table_lock.
407 */ 438 */
408 page = find_lock_huge_page(mapping, idx); 439retry:
409 if (!page) 440 page = find_lock_page(mapping, idx);
410 goto out; 441 if (!page) {
442 if (hugetlb_get_quota(mapping))
443 goto out;
444 page = alloc_huge_page(vma, address);
445 if (!page) {
446 hugetlb_put_quota(mapping);
447 goto out;
448 }
449
450 if (vma->vm_flags & VM_SHARED) {
451 int err;
452
453 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
454 if (err) {
455 put_page(page);
456 hugetlb_put_quota(mapping);
457 if (err == -EEXIST)
458 goto retry;
459 goto out;
460 }
461 } else
462 lock_page(page);
463 }
411 464
412 spin_lock(&mm->page_table_lock); 465 spin_lock(&mm->page_table_lock);
413 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 466 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
415 goto backout; 468 goto backout;
416 469
417 ret = VM_FAULT_MINOR; 470 ret = VM_FAULT_MINOR;
418 if (!pte_none(*pte)) 471 if (!pte_none(*ptep))
419 goto backout; 472 goto backout;
420 473
421 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 474 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
422 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); 475 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
476 && (vma->vm_flags & VM_SHARED)));
477 set_huge_pte_at(mm, address, ptep, new_pte);
478
479 if (write_access && !(vma->vm_flags & VM_SHARED)) {
480 /* Optimization, do the COW without a second fault */
481 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
482 }
483
423 spin_unlock(&mm->page_table_lock); 484 spin_unlock(&mm->page_table_lock);
424 unlock_page(page); 485 unlock_page(page);
425out: 486out:
@@ -433,6 +494,33 @@ backout:
433 goto out; 494 goto out;
434} 495}
435 496
497int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
498 unsigned long address, int write_access)
499{
500 pte_t *ptep;
501 pte_t entry;
502 int ret;
503
504 ptep = huge_pte_alloc(mm, address);
505 if (!ptep)
506 return VM_FAULT_OOM;
507
508 entry = *ptep;
509 if (pte_none(entry))
510 return hugetlb_no_page(mm, vma, address, ptep, write_access);
511
512 ret = VM_FAULT_MINOR;
513
514 spin_lock(&mm->page_table_lock);
515 /* Check for a racing update before calling hugetlb_cow */
516 if (likely(pte_same(entry, *ptep)))
517 if (write_access && !pte_write(entry))
518 ret = hugetlb_cow(mm, vma, address, ptep, entry);
519 spin_unlock(&mm->page_table_lock);
520
521 return ret;
522}
523
436int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 524int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
437 struct page **pages, struct vm_area_struct **vmas, 525 struct page **pages, struct vm_area_struct **vmas,
438 unsigned long *position, int *length, int i) 526 unsigned long *position, int *length, int i)
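
The hugetlb changes above add copy-on-write handling: hugetlb_cow(), write-protected copies in copy_hugetlb_page_range(), and the write_access fast path in hugetlb_no_page(). That is what makes writable MAP_PRIVATE hugetlb mappings behave like ordinary anonymous memory. A hedged userspace sketch, assuming a hugetlbfs mount at /mnt/huge and 2MB huge pages (both assumptions):

/* Illustrative only: private hugetlb mapping whose write fault is served by
 * the hugetlb COW path added above, so the write never reaches the file. */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0)
		return 1;

	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	p[0] = 'x';	/* write fault: hugetlb_cow() breaks COW on the huge page */

	munmap(p, HPAGE_SIZE);
	close(fd);
	unlink("/mnt/huge/example");
	return 0;
}
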
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3d..17256bb2f4ef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12/* page_alloc.c */ 12static inline void set_page_refs(struct page *page, int order)
13extern void set_page_refs(struct page *page, int order); 13{
14#ifdef CONFIG_MMU
15 set_page_count(page, 1);
16#else
17 int i;
18
19 /*
20 * We need to reference all the pages for this order, otherwise if
21 * anyone accesses one of the pages with (get/put) it will be freed.
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27}
28
29extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a21..ae0ae3ea299a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
140 return 0; 140 return 0;
141} 141}
142 142
143/*
144 * Application wants to free up the pages and associated backing store.
145 * This is effectively punching a hole into the middle of a file.
146 *
147 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
148 * Other filesystems return -ENOSYS.
149 */
150static long madvise_remove(struct vm_area_struct *vma,
151 unsigned long start, unsigned long end)
152{
153 struct address_space *mapping;
154 loff_t offset, endoff;
155
156 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
157 return -EINVAL;
158
159 if (!vma->vm_file || !vma->vm_file->f_mapping
160 || !vma->vm_file->f_mapping->host) {
161 return -EINVAL;
162 }
163
164 mapping = vma->vm_file->f_mapping;
165
166 offset = (loff_t)(start - vma->vm_start)
167 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
168 endoff = (loff_t)(end - vma->vm_start - 1)
169 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
170 return vmtruncate_range(mapping->host, offset, endoff);
171}
172
143static long 173static long
144madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 174madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
145 unsigned long start, unsigned long end, int behavior) 175 unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
152 case MADV_RANDOM: 182 case MADV_RANDOM:
153 error = madvise_behavior(vma, prev, start, end, behavior); 183 error = madvise_behavior(vma, prev, start, end, behavior);
154 break; 184 break;
185 case MADV_REMOVE:
186 error = madvise_remove(vma, start, end);
187 break;
155 188
156 case MADV_WILLNEED: 189 case MADV_WILLNEED:
157 error = madvise_willneed(vma, prev, start, end); 190 error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
190 * some pages ahead. 223 * some pages ahead.
191 * MADV_DONTNEED - the application is finished with the given range, 224 * MADV_DONTNEED - the application is finished with the given range,
192 * so the kernel can free resources associated with it. 225 * so the kernel can free resources associated with it.
226 * MADV_REMOVE - the application wants to free up the given range of
227 * pages and associated backing store.
193 * 228 *
194 * return values: 229 * return values:
195 * zero - success 230 * zero - success
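
A hedged userspace sketch of the new MADV_REMOVE advice wired up above: punch a hole in a tmpfs-backed shared mapping so both the pages and their backing store are released. The shared-memory object name, sizes, and offsets are assumptions; on older C libraries shm_open() needs -lrt at link time, and per the note above filesystems without ->truncate_range return -ENOSYS.

/* Illustrative only: MADV_REMOVE on a tmpfs (shm) backed mapping. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * 4096;
	int fd = shm_open("/madv_remove_demo", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, len) != 0)
		return 1;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	memset(p, 0xaa, len);			/* populate the pages */

	/* free pages *and* backing store for the middle 8 pages */
	if (madvise(p + 4 * 4096, 8 * 4096, MADV_REMOVE) != 0)
		perror("madvise(MADV_REMOVE)");

	munmap(p, len);
	close(fd);
	shm_unlink("/madv_remove_demo");
	return 0;
}
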
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a3656..7a11ddd5060f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
1498 update_mmu_cache(vma, address, entry); 1498 update_mmu_cache(vma, address, entry);
1499 lazy_mmu_prot_update(entry); 1499 lazy_mmu_prot_update(entry);
1500 lru_cache_add_active(new_page); 1500 lru_cache_add_active(new_page);
1501 page_add_anon_rmap(new_page, vma, address); 1501 page_add_new_anon_rmap(new_page, vma, address);
1502 1502
1503 /* Free the old page.. */ 1503 /* Free the old page.. */
1504 new_page = old_page; 1504 new_page = old_page;
@@ -1770,9 +1770,32 @@ out_big:
1770out_busy: 1770out_busy:
1771 return -ETXTBSY; 1771 return -ETXTBSY;
1772} 1772}
1773
1774EXPORT_SYMBOL(vmtruncate); 1773EXPORT_SYMBOL(vmtruncate);
1775 1774
1775int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1776{
1777 struct address_space *mapping = inode->i_mapping;
1778
1779 /*
1780 * If the underlying filesystem is not going to provide
1781 * a way to truncate a range of blocks (punch a hole) -
1782 * we should return failure right now.
1783 */
1784 if (!inode->i_op || !inode->i_op->truncate_range)
1785 return -ENOSYS;
1786
1787 mutex_lock(&inode->i_mutex);
1788 down_write(&inode->i_alloc_sem);
1789 unmap_mapping_range(mapping, offset, (end - offset), 1);
1790 truncate_inode_pages_range(mapping, offset, end);
1791 inode->i_op->truncate_range(inode, offset, end);
1792 up_write(&inode->i_alloc_sem);
1793 mutex_unlock(&inode->i_mutex);
1794
1795 return 0;
1796}
1797EXPORT_SYMBOL(vmtruncate_range);
1798
1776/* 1799/*
1777 * Primitive swap readahead code. We simply read an aligned block of 1800 * Primitive swap readahead code. We simply read an aligned block of
1778 * (1 << page_cluster) entries in the swap area. This method is chosen 1801 * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1954 goto release; 1977 goto release;
1955 inc_mm_counter(mm, anon_rss); 1978 inc_mm_counter(mm, anon_rss);
1956 lru_cache_add_active(page); 1979 lru_cache_add_active(page);
1957 SetPageReferenced(page); 1980 page_add_new_anon_rmap(page, vma, address);
1958 page_add_anon_rmap(page, vma, address);
1959 } else { 1981 } else {
1960 /* Map the ZERO_PAGE - vm_page_prot is readonly */ 1982 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1961 page = ZERO_PAGE(address); 1983 page = ZERO_PAGE(address);
@@ -2086,7 +2108,7 @@ retry:
2086 if (anon) { 2108 if (anon) {
2087 inc_mm_counter(mm, anon_rss); 2109 inc_mm_counter(mm, anon_rss);
2088 lru_cache_add_active(new_page); 2110 lru_cache_add_active(new_page);
2089 page_add_anon_rmap(new_page, vma, address); 2111 page_add_new_anon_rmap(new_page, vma, address);
2090 } else { 2112 } else {
2091 inc_mm_counter(mm, file_rss); 2113 inc_mm_counter(mm, file_rss);
2092 page_add_file_rmap(new_page); 2114 page_add_file_rmap(new_page);
@@ -2245,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2245 return handle_pte_fault(mm, vma, address, pte, pmd, write_access); 2267 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2246} 2268}
2247 2269
2270EXPORT_SYMBOL_GPL(__handle_mm_fault);
2271
2248#ifndef __PAGETABLE_PUD_FOLDED 2272#ifndef __PAGETABLE_PUD_FOLDED
2249/* 2273/*
2250 * Allocate page upper directory. 2274 * Allocate page upper directory.
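
vmtruncate_range() above only succeeds when the filesystem provides an i_op->truncate_range() hook; per the madvise.c note, only shmem/tmpfs does so in this series. A hedged sketch of what such a hook looks like from the filesystem side ("examplefs" and its helpers are hypothetical, and the void return type is inferred from the call site above):

/* Hypothetical filesystem wiring for the ->truncate_range() hook used by
 * vmtruncate_range().  By the time it runs, the caller has already unmapped
 * and truncated the page cache for [start, end]; the hook only has to drop
 * the backing blocks. */
static void examplefs_truncate_range(struct inode *inode,
				     loff_t start, loff_t end)
{
	examplefs_punch_hole(inode, start, end);
}

static struct inode_operations examplefs_inode_ops = {
	.truncate	= examplefs_truncate,
	.truncate_range	= examplefs_truncate_range,
};
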
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8af8a8..a918f77f02f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages); 42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{ 44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION; 45 int nr_pages = PAGES_PER_SECTION;
47 int ret; 46 int ret;
48 47
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bec88c81244e..b62cab575a84 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
83#include <linux/init.h> 83#include <linux/init.h>
84#include <linux/compat.h> 84#include <linux/compat.h>
85#include <linux/mempolicy.h> 85#include <linux/mempolicy.h>
86#include <linux/swap.h>
87#include <linux/seq_file.h>
88#include <linux/proc_fs.h>
89
86#include <asm/tlbflush.h> 90#include <asm/tlbflush.h>
87#include <asm/uaccess.h> 91#include <asm/uaccess.h>
88 92
93/* Internal flags */
94#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97
89static kmem_cache_t *policy_cache; 98static kmem_cache_t *policy_cache;
90static kmem_cache_t *sn_cache; 99static kmem_cache_t *sn_cache;
91 100
@@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache;
93 102
94/* Highest zone. An specific allocation for a zone below that is not 103/* Highest zone. An specific allocation for a zone below that is not
95 policied. */ 104 policied. */
96static int policy_zone; 105int policy_zone = ZONE_DMA;
97 106
98struct mempolicy default_policy = { 107struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */ 108 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 if (!zl) 140 if (!zl)
132 return NULL; 141 return NULL;
133 num = 0; 142 num = 0;
134 for_each_node_mask(nd, *nodes) { 143 for_each_node_mask(nd, *nodes)
135 int k; 144 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL; 145 zl->zones[num] = NULL;
146 return zl; 146 return zl;
147} 147}
@@ -161,6 +161,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
161 switch (mode) { 161 switch (mode) {
162 case MPOL_INTERLEAVE: 162 case MPOL_INTERLEAVE:
163 policy->v.nodes = *nodes; 163 policy->v.nodes = *nodes;
164 if (nodes_weight(*nodes) == 0) {
165 kmem_cache_free(policy_cache, policy);
166 return ERR_PTR(-EINVAL);
167 }
164 break; 168 break;
165 case MPOL_PREFERRED: 169 case MPOL_PREFERRED:
166 policy->v.preferred_node = first_node(*nodes); 170 policy->v.preferred_node = first_node(*nodes);
@@ -176,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
176 break; 180 break;
177 } 181 }
178 policy->policy = mode; 182 policy->policy = mode;
183 policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
179 return policy; 184 return policy;
180} 185}
181 186
182/* Ensure all existing pages follow the policy. */ 187static void gather_stats(struct page *, void *);
188static void migrate_page_add(struct vm_area_struct *vma,
189 struct page *page, struct list_head *pagelist, unsigned long flags);
190
191/* Scan through pages checking if pages follow certain conditions. */
183static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 192static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 unsigned long addr, unsigned long end, nodemask_t *nodes) 193 unsigned long addr, unsigned long end,
194 const nodemask_t *nodes, unsigned long flags,
195 void *private)
185{ 196{
186 pte_t *orig_pte; 197 pte_t *orig_pte;
187 pte_t *pte; 198 pte_t *pte;
@@ -197,8 +208,20 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
197 page = vm_normal_page(vma, addr, *pte); 208 page = vm_normal_page(vma, addr, *pte);
198 if (!page) 209 if (!page)
199 continue; 210 continue;
211 if (PageReserved(page))
212 continue;
200 nid = page_to_nid(page); 213 nid = page_to_nid(page);
201 if (!node_isset(nid, *nodes)) 214 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
215 continue;
216
217 if (flags & MPOL_MF_STATS)
218 gather_stats(page, private);
219 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
220 spin_unlock(ptl);
221 migrate_page_add(vma, page, private, flags);
222 spin_lock(ptl);
223 }
224 else
202 break; 225 break;
203 } while (pte++, addr += PAGE_SIZE, addr != end); 226 } while (pte++, addr += PAGE_SIZE, addr != end);
204 pte_unmap_unlock(orig_pte, ptl); 227 pte_unmap_unlock(orig_pte, ptl);
@@ -206,7 +229,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
206} 229}
207 230
208static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 231static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
209 unsigned long addr, unsigned long end, nodemask_t *nodes) 232 unsigned long addr, unsigned long end,
233 const nodemask_t *nodes, unsigned long flags,
234 void *private)
210{ 235{
211 pmd_t *pmd; 236 pmd_t *pmd;
212 unsigned long next; 237 unsigned long next;
@@ -216,14 +241,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
216 next = pmd_addr_end(addr, end); 241 next = pmd_addr_end(addr, end);
217 if (pmd_none_or_clear_bad(pmd)) 242 if (pmd_none_or_clear_bad(pmd))
218 continue; 243 continue;
219 if (check_pte_range(vma, pmd, addr, next, nodes)) 244 if (check_pte_range(vma, pmd, addr, next, nodes,
245 flags, private))
220 return -EIO; 246 return -EIO;
221 } while (pmd++, addr = next, addr != end); 247 } while (pmd++, addr = next, addr != end);
222 return 0; 248 return 0;
223} 249}
224 250
225static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 251static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
226 unsigned long addr, unsigned long end, nodemask_t *nodes) 252 unsigned long addr, unsigned long end,
253 const nodemask_t *nodes, unsigned long flags,
254 void *private)
227{ 255{
228 pud_t *pud; 256 pud_t *pud;
229 unsigned long next; 257 unsigned long next;
@@ -233,14 +261,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
233 next = pud_addr_end(addr, end); 261 next = pud_addr_end(addr, end);
234 if (pud_none_or_clear_bad(pud)) 262 if (pud_none_or_clear_bad(pud))
235 continue; 263 continue;
236 if (check_pmd_range(vma, pud, addr, next, nodes)) 264 if (check_pmd_range(vma, pud, addr, next, nodes,
265 flags, private))
237 return -EIO; 266 return -EIO;
238 } while (pud++, addr = next, addr != end); 267 } while (pud++, addr = next, addr != end);
239 return 0; 268 return 0;
240} 269}
241 270
242static inline int check_pgd_range(struct vm_area_struct *vma, 271static inline int check_pgd_range(struct vm_area_struct *vma,
243 unsigned long addr, unsigned long end, nodemask_t *nodes) 272 unsigned long addr, unsigned long end,
273 const nodemask_t *nodes, unsigned long flags,
274 void *private)
244{ 275{
245 pgd_t *pgd; 276 pgd_t *pgd;
246 unsigned long next; 277 unsigned long next;
@@ -250,16 +281,30 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
250 next = pgd_addr_end(addr, end); 281 next = pgd_addr_end(addr, end);
251 if (pgd_none_or_clear_bad(pgd)) 282 if (pgd_none_or_clear_bad(pgd))
252 continue; 283 continue;
253 if (check_pud_range(vma, pgd, addr, next, nodes)) 284 if (check_pud_range(vma, pgd, addr, next, nodes,
285 flags, private))
254 return -EIO; 286 return -EIO;
255 } while (pgd++, addr = next, addr != end); 287 } while (pgd++, addr = next, addr != end);
256 return 0; 288 return 0;
257} 289}
258 290
259/* Step 1: check the range */ 291/* Check if a vma is migratable */
292static inline int vma_migratable(struct vm_area_struct *vma)
293{
294 if (vma->vm_flags & (
295 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
296 return 0;
297 return 1;
298}
299
300/*
301 * Check if all pages in a range are on a set of nodes.
302 * If pagelist != NULL then isolate pages from the LRU and
303 * put them on the pagelist.
304 */
260static struct vm_area_struct * 305static struct vm_area_struct *
261check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 306check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
262 nodemask_t *nodes, unsigned long flags) 307 const nodemask_t *nodes, unsigned long flags, void *private)
263{ 308{
264 int err; 309 int err;
265 struct vm_area_struct *first, *vma, *prev; 310 struct vm_area_struct *first, *vma, *prev;
@@ -269,17 +314,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
269 return ERR_PTR(-EFAULT); 314 return ERR_PTR(-EFAULT);
270 prev = NULL; 315 prev = NULL;
271 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 316 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
272 if (!vma->vm_next && vma->vm_end < end) 317 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
273 return ERR_PTR(-EFAULT); 318 if (!vma->vm_next && vma->vm_end < end)
274 if (prev && prev->vm_end < vma->vm_start) 319 return ERR_PTR(-EFAULT);
275 return ERR_PTR(-EFAULT); 320 if (prev && prev->vm_end < vma->vm_start)
276 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 321 return ERR_PTR(-EFAULT);
322 }
323 if (!is_vm_hugetlb_page(vma) &&
324 ((flags & MPOL_MF_STRICT) ||
325 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
326 vma_migratable(vma)))) {
277 unsigned long endvma = vma->vm_end; 327 unsigned long endvma = vma->vm_end;
328
278 if (endvma > end) 329 if (endvma > end)
279 endvma = end; 330 endvma = end;
280 if (vma->vm_start > start) 331 if (vma->vm_start > start)
281 start = vma->vm_start; 332 start = vma->vm_start;
282 err = check_pgd_range(vma, start, endvma, nodes); 333 err = check_pgd_range(vma, start, endvma, nodes,
334 flags, private);
283 if (err) { 335 if (err) {
284 first = ERR_PTR(err); 336 first = ERR_PTR(err);
285 break; 337 break;
@@ -338,51 +390,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
338 if (!nodes) 390 if (!nodes)
339 return 0; 391 return 0;
340 392
341 /* Update current mems_allowed */ 393 cpuset_update_task_memory_state();
342 cpuset_update_current_mems_allowed(); 394 if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
343 /* Ignore nodes not set in current->mems_allowed */
344 cpuset_restrict_to_mems_allowed(nodes->bits);
345 return mpol_check_policy(mode, nodes);
346}
347
348long do_mbind(unsigned long start, unsigned long len,
349 unsigned long mode, nodemask_t *nmask, unsigned long flags)
350{
351 struct vm_area_struct *vma;
352 struct mm_struct *mm = current->mm;
353 struct mempolicy *new;
354 unsigned long end;
355 int err;
356
357 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
358 return -EINVAL;
359 if (start & ~PAGE_MASK)
360 return -EINVAL;
361 if (mode == MPOL_DEFAULT)
362 flags &= ~MPOL_MF_STRICT;
363 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
364 end = start + len;
365 if (end < start)
366 return -EINVAL;
367 if (end == start)
368 return 0;
369 if (mpol_check_policy(mode, nmask))
370 return -EINVAL; 395 return -EINVAL;
371 new = mpol_new(mode, nmask); 396 return mpol_check_policy(mode, nodes);
372 if (IS_ERR(new))
373 return PTR_ERR(new);
374
375 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
376 mode,nodes_addr(nodes)[0]);
377
378 down_write(&mm->mmap_sem);
379 vma = check_range(mm, start, end, nmask, flags);
380 err = PTR_ERR(vma);
381 if (!IS_ERR(vma))
382 err = mbind_range(vma, start, end, new);
383 up_write(&mm->mmap_sem);
384 mpol_free(new);
385 return err;
386} 397}
387 398
388/* Set the process memory policy */ 399/* Set the process memory policy */
@@ -453,7 +464,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
453 struct vm_area_struct *vma = NULL; 464 struct vm_area_struct *vma = NULL;
454 struct mempolicy *pol = current->mempolicy; 465 struct mempolicy *pol = current->mempolicy;
455 466
456 cpuset_update_current_mems_allowed(); 467 cpuset_update_task_memory_state();
457 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 468 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
458 return -EINVAL; 469 return -EINVAL;
459 if (flags & MPOL_F_ADDR) { 470 if (flags & MPOL_F_ADDR) {
@@ -505,11 +516,177 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
505} 516}
506 517
507/* 518/*
519 * page migration
520 */
521
522/* Check if we are the only process mapping the page in question */
523static inline int single_mm_mapping(struct mm_struct *mm,
524 struct address_space *mapping)
525{
526 struct vm_area_struct *vma;
527 struct prio_tree_iter iter;
528 int rc = 1;
529
530 spin_lock(&mapping->i_mmap_lock);
531 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
532 if (mm != vma->vm_mm) {
533 rc = 0;
534 goto out;
535 }
536 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
537 if (mm != vma->vm_mm) {
538 rc = 0;
539 goto out;
540 }
541out:
542 spin_unlock(&mapping->i_mmap_lock);
543 return rc;
544}
545
546/*
547 * Add a page to be migrated to the pagelist
548 */
549static void migrate_page_add(struct vm_area_struct *vma,
550 struct page *page, struct list_head *pagelist, unsigned long flags)
551{
552 /*
553 * Avoid migrating a page that is shared by others and not writable.
554 */
555 if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
556 mapping_writably_mapped(page->mapping) ||
557 single_mm_mapping(vma->vm_mm, page->mapping)) {
558 int rc = isolate_lru_page(page);
559
560 if (rc == 1)
561 list_add(&page->lru, pagelist);
562 /*
563 * If the isolate attempt was not successful then we just
564 * encountered an unswappable page. Something must be wrong.
565 */
566 WARN_ON(rc == 0);
567 }
568}
569
570static int swap_pages(struct list_head *pagelist)
571{
572 LIST_HEAD(moved);
573 LIST_HEAD(failed);
574 int n;
575
576 n = migrate_pages(pagelist, NULL, &moved, &failed);
577 putback_lru_pages(&failed);
578 putback_lru_pages(&moved);
579
580 return n;
581}
582
583/*
584 * For now migrate_pages simply swaps out the pages from nodes that are in
585 * the source set but not in the target set. In the future, we would
586 * want a function that moves pages between the two nodesets in such
587 * a way as to preserve the physical layout as much as possible.
588 *
589 * Returns the number of pages that could not be moved.
590 */
591int do_migrate_pages(struct mm_struct *mm,
592 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
593{
594 LIST_HEAD(pagelist);
595 int count = 0;
596 nodemask_t nodes;
597
598 nodes_andnot(nodes, *from_nodes, *to_nodes);
599
600 down_read(&mm->mmap_sem);
601 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
602 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
603
604 if (!list_empty(&pagelist)) {
605 count = swap_pages(&pagelist);
606 putback_lru_pages(&pagelist);
607 }
608
609 up_read(&mm->mmap_sem);
610 return count;
611}
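A minimal user-space sketch of the nodes_andnot() step above (illustrative only, not kernel code and not how nodemask_t is implemented): only nodes that are in the source set but not in the destination set end up in the mask handed to check_range(), so nodes common to both sets keep their pages.

#include <stdio.h>

int main(void)
{
	unsigned long from  = (1UL << 0) | (1UL << 1);	/* old_nodes = {0,1} */
	unsigned long to    = (1UL << 1) | (1UL << 2);	/* new_nodes = {1,2} */
	unsigned long drain = from & ~to;		/* {0}: only node 0 is drained */

	printf("drain mask: %#lx\n", drain);		/* prints 0x1 */
	return 0;
}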
612
613long do_mbind(unsigned long start, unsigned long len,
614 unsigned long mode, nodemask_t *nmask, unsigned long flags)
615{
616 struct vm_area_struct *vma;
617 struct mm_struct *mm = current->mm;
618 struct mempolicy *new;
619 unsigned long end;
620 int err;
621 LIST_HEAD(pagelist);
622
623 if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
624 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
625 || mode > MPOL_MAX)
626 return -EINVAL;
627 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
628 return -EPERM;
629
630 if (start & ~PAGE_MASK)
631 return -EINVAL;
632
633 if (mode == MPOL_DEFAULT)
634 flags &= ~MPOL_MF_STRICT;
635
636 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
637 end = start + len;
638
639 if (end < start)
640 return -EINVAL;
641 if (end == start)
642 return 0;
643
644 if (mpol_check_policy(mode, nmask))
645 return -EINVAL;
646
647 new = mpol_new(mode, nmask);
648 if (IS_ERR(new))
649 return PTR_ERR(new);
650
651 /*
652 * If we are using the default policy then operating
653 * on discontinuous address spaces is okay after all
654 */
655 if (!new)
656 flags |= MPOL_MF_DISCONTIG_OK;
657
658 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
659 mode,nodes_addr(nodes)[0]);
660
661 down_write(&mm->mmap_sem);
662 vma = check_range(mm, start, end, nmask,
663 flags | MPOL_MF_INVERT, &pagelist);
664
665 err = PTR_ERR(vma);
666 if (!IS_ERR(vma)) {
667 int nr_failed = 0;
668
669 err = mbind_range(vma, start, end, new);
670 if (!list_empty(&pagelist))
671 nr_failed = swap_pages(&pagelist);
672
673 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
674 err = -EIO;
675 }
676 if (!list_empty(&pagelist))
677 putback_lru_pages(&pagelist);
678
679 up_write(&mm->mmap_sem);
680 mpol_free(new);
681 return err;
682}
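The new MPOL_MF_MOVE / MPOL_MF_MOVE_ALL flags are reachable from user space through mbind(2). A hedged sketch, assuming libnuma's <numaif.h> wrapper is available (link with -lnuma); the node number and mapping size are arbitrary example values:

#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 4UL << 20;				/* 4 MiB, arbitrary */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long nodemask = 1UL << 0;		/* bind to node 0 */

	if (buf == MAP_FAILED)
		return 1;
	/* MPOL_MF_MOVE asks the kernel to migrate pages already faulted in;
	 * with MPOL_MF_STRICT, pages that could not be moved turn into -EIO. */
	if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");
	return 0;
}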
683
684/*
508 * User space interface with variable sized bitmaps for nodelists. 685 * User space interface with variable sized bitmaps for nodelists.
509 */ 686 */
510 687
511/* Copy a node mask from user space. */ 688/* Copy a node mask from user space. */
512static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, 689static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
513 unsigned long maxnode) 690 unsigned long maxnode)
514{ 691{
515 unsigned long k; 692 unsigned long k;
@@ -598,6 +775,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
598 return do_set_mempolicy(mode, &nodes); 775 return do_set_mempolicy(mode, &nodes);
599} 776}
600 777
778asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
779 const unsigned long __user *old_nodes,
780 const unsigned long __user *new_nodes)
781{
782 struct mm_struct *mm;
783 struct task_struct *task;
784 nodemask_t old;
785 nodemask_t new;
786 nodemask_t task_nodes;
787 int err;
788
789 err = get_nodes(&old, old_nodes, maxnode);
790 if (err)
791 return err;
792
793 err = get_nodes(&new, new_nodes, maxnode);
794 if (err)
795 return err;
796
797 /* Find the mm_struct */
798 read_lock(&tasklist_lock);
799 task = pid ? find_task_by_pid(pid) : current;
800 if (!task) {
801 read_unlock(&tasklist_lock);
802 return -ESRCH;
803 }
804 mm = get_task_mm(task);
805 read_unlock(&tasklist_lock);
806
807 if (!mm)
808 return -EINVAL;
809
810 /*
811 * Check if this process has the right to modify the specified
812 * process. The right exists if the process has administrative
813 * capabilities, superuser privileges or the same
814 * userid as the target process.
815 */
816 if ((current->euid != task->suid) && (current->euid != task->uid) &&
817 (current->uid != task->suid) && (current->uid != task->uid) &&
818 !capable(CAP_SYS_ADMIN)) {
819 err = -EPERM;
820 goto out;
821 }
822
823 task_nodes = cpuset_mems_allowed(task);
824 /* Is the user allowed to access the target nodes? */
825 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
826 err = -EPERM;
827 goto out;
828 }
829
830 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
831out:
832 mmput(mm);
833 return err;
834}
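A hedged user-space sketch of the new syscall, assuming libnuma's <numaif.h> declares the migrate_pages() wrapper (link with -lnuma). Passing pid 0 targets the calling process, matching the "pid ? find_task_by_pid(pid) : current" logic above, and the return value is the number of pages that could not be moved:

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* move pages off node 0 */
	unsigned long new_nodes = 1UL << 1;	/* ... onto node 1 */
	long left;

	left = migrate_pages(0 /* self */, sizeof(old_nodes) * 8,
			     &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld page(s) could not be moved\n", left);
	return 0;
}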
835
836
601/* Retrieve NUMA policy */ 837/* Retrieve NUMA policy */
602asmlinkage long sys_get_mempolicy(int __user *policy, 838asmlinkage long sys_get_mempolicy(int __user *policy,
603 unsigned long __user *nmask, 839 unsigned long __user *nmask,
@@ -704,8 +940,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
704#endif 940#endif
705 941
706/* Return effective policy for a VMA */ 942/* Return effective policy for a VMA */
707struct mempolicy * 943static struct mempolicy * get_vma_policy(struct task_struct *task,
708get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) 944 struct vm_area_struct *vma, unsigned long addr)
709{ 945{
710 struct mempolicy *pol = task->mempolicy; 946 struct mempolicy *pol = task->mempolicy;
711 947
@@ -781,6 +1017,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
781 return nid; 1017 return nid;
782} 1018}
783 1019
1020/* Determine a node number for interleave */
1021static inline unsigned interleave_nid(struct mempolicy *pol,
1022 struct vm_area_struct *vma, unsigned long addr, int shift)
1023{
1024 if (vma) {
1025 unsigned long off;
1026
1027 off = vma->vm_pgoff;
1028 off += (addr - vma->vm_start) >> shift;
1029 return offset_il_node(pol, vma, off);
1030 } else
1031 return interleave_nodes(pol);
1032}
1033
1034/* Return a zonelist suitable for a huge page allocation. */
1035struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1036{
1037 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1038
1039 if (pol->policy == MPOL_INTERLEAVE) {
1040 unsigned nid;
1041
1042 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1043 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1044 }
1045 return zonelist_policy(GFP_HIGHUSER, pol);
1046}
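A user-space model of the interleave arithmetic behind interleave_nid() above (illustrative only; offset_il_node() itself is not part of this hunk, so treat the exact lookup as an assumption): the page's offset within the VMA, in units of 1 << shift and biased by vm_pgoff, selects the (off % nr_nodes)-th set node of the policy mask, so consecutive pages cycle round-robin over the allowed nodes.

#include <stdio.h>

/* Pick the (off % popcount(mask))-th set bit of a non-empty node mask. */
static int pick_interleave_node(unsigned long mask, unsigned long off)
{
	int target = off % __builtin_popcountl(mask);
	int nid;

	for (nid = 0; ; nid++)
		if (((mask >> nid) & 1) && target-- == 0)
			return nid;
}

int main(void)
{
	unsigned long mask = 0xFUL;		/* interleave over nodes 0-3 */
	unsigned long off;

	for (off = 0; off < 6; off++)
		printf("offset %lu -> node %d\n", off,
		       pick_interleave_node(mask, off));
	return 0;
}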
1047
784/* Allocate a page in interleaved policy. 1048/* Allocate a page in interleaved policy.
785 Own path because it needs to do special accounting. */ 1049 Own path because it needs to do special accounting. */
786static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1050static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -825,19 +1089,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
825{ 1089{
826 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1090 struct mempolicy *pol = get_vma_policy(current, vma, addr);
827 1091
828 cpuset_update_current_mems_allowed(); 1092 cpuset_update_task_memory_state();
829 1093
830 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 1094 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
831 unsigned nid; 1095 unsigned nid;
832 if (vma) { 1096
833 unsigned long off; 1097 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
834 off = vma->vm_pgoff;
835 off += (addr - vma->vm_start) >> PAGE_SHIFT;
836 nid = offset_il_node(pol, vma, off);
837 } else {
838 /* fall back to process interleaving */
839 nid = interleave_nodes(pol);
840 }
841 return alloc_page_interleave(gfp, 0, nid); 1098 return alloc_page_interleave(gfp, 0, nid);
842 } 1099 }
843 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 1100 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -858,7 +1115,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
858 * interrupt context and apply the current process NUMA policy. 1115 * interrupt context and apply the current process NUMA policy.
859 * Returns NULL when no page can be allocated. 1116 * Returns NULL when no page can be allocated.
860 * 1117 *
861 * Don't call cpuset_update_current_mems_allowed() unless 1118 * Don't call cpuset_update_task_memory_state() unless
862 * 1) it's ok to take cpuset_sem (can WAIT), and 1119 * 1) it's ok to take cpuset_sem (can WAIT), and
863 * 2) allocating for current task (not interrupt). 1120 * 2) allocating for current task (not interrupt).
864 */ 1121 */
@@ -867,7 +1124,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
867 struct mempolicy *pol = current->mempolicy; 1124 struct mempolicy *pol = current->mempolicy;
868 1125
869 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1126 if ((gfp & __GFP_WAIT) && !in_interrupt())
870 cpuset_update_current_mems_allowed(); 1127 cpuset_update_task_memory_state();
871 if (!pol || in_interrupt()) 1128 if (!pol || in_interrupt())
872 pol = &default_policy; 1129 pol = &default_policy;
873 if (pol->policy == MPOL_INTERLEAVE) 1130 if (pol->policy == MPOL_INTERLEAVE)
@@ -876,6 +1133,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
876} 1133}
877EXPORT_SYMBOL(alloc_pages_current); 1134EXPORT_SYMBOL(alloc_pages_current);
878 1135
1136/*
1137 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1138 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1139 * with the mems_allowed returned by cpuset_mems_allowed(). This
1140 * keeps mempolicies cpuset-relative after the task's cpuset moves. See
1141 * further kernel/cpuset.c update_nodemask().
1142 */
1143void *cpuset_being_rebound;
1144
879/* Slow path of a mempolicy copy */ 1145/* Slow path of a mempolicy copy */
880struct mempolicy *__mpol_copy(struct mempolicy *old) 1146struct mempolicy *__mpol_copy(struct mempolicy *old)
881{ 1147{
@@ -883,6 +1149,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
883 1149
884 if (!new) 1150 if (!new)
885 return ERR_PTR(-ENOMEM); 1151 return ERR_PTR(-ENOMEM);
1152 if (current_cpuset_is_being_rebound()) {
1153 nodemask_t mems = cpuset_mems_allowed(current);
1154 mpol_rebind_policy(old, &mems);
1155 }
886 *new = *old; 1156 *new = *old;
887 atomic_set(&new->refcnt, 1); 1157 atomic_set(&new->refcnt, 1);
888 if (new->policy == MPOL_BIND) { 1158 if (new->policy == MPOL_BIND) {
@@ -936,54 +1206,6 @@ void __mpol_free(struct mempolicy *p)
936} 1206}
937 1207
938/* 1208/*
939 * Hugetlb policy. Same as above, just works with node numbers instead of
940 * zonelists.
941 */
942
943/* Find first node suitable for an allocation */
944int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
945{
946 struct mempolicy *pol = get_vma_policy(current, vma, addr);
947
948 switch (pol->policy) {
949 case MPOL_DEFAULT:
950 return numa_node_id();
951 case MPOL_BIND:
952 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
953 case MPOL_INTERLEAVE:
954 return interleave_nodes(pol);
955 case MPOL_PREFERRED:
956 return pol->v.preferred_node >= 0 ?
957 pol->v.preferred_node : numa_node_id();
958 }
959 BUG();
960 return 0;
961}
962
963/* Find secondary valid nodes for an allocation */
964int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
965{
966 struct mempolicy *pol = get_vma_policy(current, vma, addr);
967
968 switch (pol->policy) {
969 case MPOL_PREFERRED:
970 case MPOL_DEFAULT:
971 case MPOL_INTERLEAVE:
972 return 1;
973 case MPOL_BIND: {
974 struct zone **z;
975 for (z = pol->v.zonelist->zones; *z; z++)
976 if ((*z)->zone_pgdat->node_id == nid)
977 return 1;
978 return 0;
979 }
980 default:
981 BUG();
982 return 0;
983 }
984}
985
986/*
987 * Shared memory backing store policy support. 1209 * Shared memory backing store policy support.
988 * 1210 *
989 * Remember policies even when nobody has shared memory mapped. 1211 * Remember policies even when nobody has shared memory mapped.
@@ -1205,25 +1427,31 @@ void numa_default_policy(void)
1205} 1427}
1206 1428
1207/* Migrate a policy to a different set of nodes */ 1429/* Migrate a policy to a different set of nodes */
1208static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, 1430void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1209 const nodemask_t *new)
1210{ 1431{
1432 nodemask_t *mpolmask;
1211 nodemask_t tmp; 1433 nodemask_t tmp;
1212 1434
1213 if (!pol) 1435 if (!pol)
1214 return; 1436 return;
1437 mpolmask = &pol->cpuset_mems_allowed;
1438 if (nodes_equal(*mpolmask, *newmask))
1439 return;
1215 1440
1216 switch (pol->policy) { 1441 switch (pol->policy) {
1217 case MPOL_DEFAULT: 1442 case MPOL_DEFAULT:
1218 break; 1443 break;
1219 case MPOL_INTERLEAVE: 1444 case MPOL_INTERLEAVE:
1220 nodes_remap(tmp, pol->v.nodes, *old, *new); 1445 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1221 pol->v.nodes = tmp; 1446 pol->v.nodes = tmp;
1222 current->il_next = node_remap(current->il_next, *old, *new); 1447 *mpolmask = *newmask;
1448 current->il_next = node_remap(current->il_next,
1449 *mpolmask, *newmask);
1223 break; 1450 break;
1224 case MPOL_PREFERRED: 1451 case MPOL_PREFERRED:
1225 pol->v.preferred_node = node_remap(pol->v.preferred_node, 1452 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1226 *old, *new); 1453 *mpolmask, *newmask);
1454 *mpolmask = *newmask;
1227 break; 1455 break;
1228 case MPOL_BIND: { 1456 case MPOL_BIND: {
1229 nodemask_t nodes; 1457 nodemask_t nodes;
@@ -1233,7 +1461,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1233 nodes_clear(nodes); 1461 nodes_clear(nodes);
1234 for (z = pol->v.zonelist->zones; *z; z++) 1462 for (z = pol->v.zonelist->zones; *z; z++)
1235 node_set((*z)->zone_pgdat->node_id, nodes); 1463 node_set((*z)->zone_pgdat->node_id, nodes);
1236 nodes_remap(tmp, nodes, *old, *new); 1464 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1237 nodes = tmp; 1465 nodes = tmp;
1238 1466
1239 zonelist = bind_zonelist(&nodes); 1467 zonelist = bind_zonelist(&nodes);
@@ -1248,6 +1476,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1248 kfree(pol->v.zonelist); 1476 kfree(pol->v.zonelist);
1249 pol->v.zonelist = zonelist; 1477 pol->v.zonelist = zonelist;
1250 } 1478 }
1479 *mpolmask = *newmask;
1251 break; 1480 break;
1252 } 1481 }
1253 default: 1482 default:
@@ -1257,12 +1486,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1257} 1486}
1258 1487
1259/* 1488/*
1260 * Someone moved this task to different nodes. Fixup mempolicies. 1489 * Wrapper for mpol_rebind_policy() that just requires task
1490 * pointer, and updates task mempolicy.
1491 */
1492
1493void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1494{
1495 mpol_rebind_policy(tsk->mempolicy, new);
1496}
1497
1498/*
1499 * Rebind each vma in mm to new nodemask.
1261 * 1500 *
1262 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, 1501 * Call holding a reference to mm. Takes mm->mmap_sem during call.
1263 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1264 */ 1502 */
1265void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) 1503
1504void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1505{
1506 struct vm_area_struct *vma;
1507
1508 down_write(&mm->mmap_sem);
1509 for (vma = mm->mmap; vma; vma = vma->vm_next)
1510 mpol_rebind_policy(vma->vm_policy, new);
1511 up_write(&mm->mmap_sem);
1512}
1513
1514/*
1515 * Display pages allocated per node and memory policy via /proc.
1516 */
1517
1518static const char *policy_types[] = { "default", "prefer", "bind",
1519 "interleave" };
1520
1521/*
1522 * Convert a mempolicy into a string.
1523 * Returns the number of characters in buffer (if positive)
1524 * or an error (negative)
1525 */
1526static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1527{
1528 char *p = buffer;
1529 int l;
1530 nodemask_t nodes;
1531 int mode = pol ? pol->policy : MPOL_DEFAULT;
1532
1533 switch (mode) {
1534 case MPOL_DEFAULT:
1535 nodes_clear(nodes);
1536 break;
1537
1538 case MPOL_PREFERRED:
1539 nodes_clear(nodes);
1540 node_set(pol->v.preferred_node, nodes);
1541 break;
1542
1543 case MPOL_BIND:
1544 get_zonemask(pol, &nodes);
1545 break;
1546
1547 case MPOL_INTERLEAVE:
1548 nodes = pol->v.nodes;
1549 break;
1550
1551 default:
1552 BUG();
1553 return -EFAULT;
1554 }
1555
1556 l = strlen(policy_types[mode]);
1557 if (buffer + maxlen < p + l + 1)
1558 return -ENOSPC;
1559
1560 strcpy(p, policy_types[mode]);
1561 p += l;
1562
1563 if (!nodes_empty(nodes)) {
1564 if (buffer + maxlen < p + 2)
1565 return -ENOSPC;
1566 *p++ = '=';
1567 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1568 }
1569 return p - buffer;
1570}
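For reference, the strings this produces are the policy name from policy_types[], optionally followed by '=' and the node list as formatted by nodelist_scnprintf(); the node values below are made-up examples:

	default
	prefer=1
	bind=0-1
	interleave=0-3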
1571
1572struct numa_maps {
1573 unsigned long pages;
1574 unsigned long anon;
1575 unsigned long mapped;
1576 unsigned long mapcount_max;
1577 unsigned long node[MAX_NUMNODES];
1578};
1579
1580static void gather_stats(struct page *page, void *private)
1266{ 1581{
1267 rebind_policy(current->mempolicy, old, new); 1582 struct numa_maps *md = private;
1583 int count = page_mapcount(page);
1584
1585 if (count)
1586 md->mapped++;
1587
1588 if (count > md->mapcount_max)
1589 md->mapcount_max = count;
1590
1591 md->pages++;
1592
1593 if (PageAnon(page))
1594 md->anon++;
1595
1596 md->node[page_to_nid(page)]++;
1597 cond_resched();
1598}
1599
1600int show_numa_map(struct seq_file *m, void *v)
1601{
1602 struct task_struct *task = m->private;
1603 struct vm_area_struct *vma = v;
1604 struct numa_maps *md;
1605 int n;
1606 char buffer[50];
1607
1608 if (!vma->vm_mm)
1609 return 0;
1610
1611 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1612 if (!md)
1613 return 0;
1614
1615 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1616 &node_online_map, MPOL_MF_STATS, md);
1617
1618 if (md->pages) {
1619 mpol_to_str(buffer, sizeof(buffer),
1620 get_vma_policy(task, vma, vma->vm_start));
1621
1622 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1623 vma->vm_start, buffer, md->pages,
1624 md->mapped, md->mapcount_max);
1625
1626 if (md->anon)
1627 seq_printf(m," anon=%lu",md->anon);
1628
1629 for_each_online_node(n)
1630 if (md->node[n])
1631 seq_printf(m, " N%d=%lu", n, md->node[n]);
1632
1633 seq_putc(m, '\n');
1634 }
1635 kfree(md);
1636
1637 if (m->count < m->size)
1638 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1639 return 0;
1268} 1640}
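Putting gather_stats() and mpol_to_str() together, each line show_numa_map() emits to /proc/<pid>/numa_maps looks like the following (addresses and counts are made-up example values; the anon= and N<node>= fields only appear when non-zero):

	2aaaaac00000 interleave=0-1 pages=512 mapped=512 maxref=1 anon=512 N0=256 N1=256
	2aaaab000000 default pages=16 mapped=16 maxref=3 N0=16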
1641
diff --git a/mm/mlock.c b/mm/mlock.c
index 4ae3a46ff768..b90c59573abf 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -5,6 +5,7 @@
5 * (C) Copyright 2002 Christoph Hellwig 5 * (C) Copyright 2002 Christoph Hellwig
6 */ 6 */
7 7
8#include <linux/capability.h>
8#include <linux/mman.h> 9#include <linux/mman.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 64ba4dbcb7de..47556d2b3e90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,6 +13,7 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/capability.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/file.h> 18#include <linux/file.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index ddaeee9a0b69..1903bdf65e42 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -13,6 +13,7 @@
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/mman.h> 14#include <linux/mman.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/capability.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17#include <linux/highmem.h> 18#include <linux/highmem.h>
18#include <linux/security.h> 19#include <linux/security.h>
diff --git a/mm/msync.c b/mm/msync.c
index 1b5b6f662dcf..3563a56e1a51 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -137,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma,
137 ret = filemap_fdatawrite(mapping); 137 ret = filemap_fdatawrite(mapping);
138 if (file->f_op && file->f_op->fsync) { 138 if (file->f_op && file->f_op->fsync) {
139 /* 139 /*
140 * We don't take i_sem here because mmap_sem 140 * We don't take i_mutex here because mmap_sem
141 * is already held. 141 * is already held.
142 */ 142 */
143 err = file->f_op->fsync(file,file->f_dentry,1); 143 err = file->f_op->fsync(file,file->f_dentry,1);
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876b..c10262d68232 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
1177{ 1177{
1178 return 0; 1178 return 0;
1179} 1179}
1180
1181struct page *filemap_nopage(struct vm_area_struct *area,
1182 unsigned long address, int *type)
1183{
1184 BUG();
1185 return NULL;
1186}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d348b9035955..4748b906aff2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -298,7 +298,8 @@ retry:
298 298
299 /* 299 /*
300 * Give "p" a good chance of killing itself before we 300 * Give "p" a good chance of killing itself before we
301 * retry to allocate memory. 301 * retry to allocate memory, unless "p" is current.
302 */ 302 */
303 schedule_timeout_interruptible(1); 303 if (!test_thread_flag(TIF_MEMDIE))
304 schedule_timeout_interruptible(1);
304} 305}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0166ea15c9ee..5240e426c1f7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -550,11 +550,17 @@ void __init page_writeback_init(void)
550 550
551int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 551int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
552{ 552{
553 int ret;
554
553 if (wbc->nr_to_write <= 0) 555 if (wbc->nr_to_write <= 0)
554 return 0; 556 return 0;
557 wbc->for_writepages = 1;
555 if (mapping->a_ops->writepages) 558 if (mapping->a_ops->writepages)
556 return mapping->a_ops->writepages(mapping, wbc); 559 ret = mapping->a_ops->writepages(mapping, wbc);
557 return generic_writepages(mapping, wbc); 560 else
561 ret = generic_writepages(mapping, wbc);
562 wbc->for_writepages = 0;
563 return ret;
558} 564}
559 565
560/** 566/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc2..8c960b469593 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly;
52unsigned long totalram_pages __read_mostly; 53unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction;
57
58static void fastcall free_hot_cold_page(struct page *page, int cold);
55 59
56/* 60/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 61 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -81,6 +85,7 @@ int min_free_kbytes = 1024;
81unsigned long __initdata nr_kernel_pages; 85unsigned long __initdata nr_kernel_pages;
82unsigned long __initdata nr_all_pages; 86unsigned long __initdata nr_all_pages;
83 87
88#ifdef CONFIG_DEBUG_VM
84static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 89static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
85{ 90{
86 int ret = 0; 91 int ret = 0;
@@ -122,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page)
122 return 0; 127 return 0;
123} 128}
124 129
125static void bad_page(const char *function, struct page *page) 130#else
131static inline int bad_range(struct zone *zone, struct page *page)
126{ 132{
127 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 133 return 0;
128 function, current->comm, page); 134}
129 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 135#endif
130 (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, 136
131 page->mapping, page_mapcount(page), page_count(page)); 137static void bad_page(struct page *page)
132 printk(KERN_EMERG "Backtrace:\n"); 138{
139 printk(KERN_EMERG "Bad page state in process '%s'\n"
140 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
141 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
142 KERN_EMERG "Backtrace:\n",
143 current->comm, page, (int)(2*sizeof(unsigned long)),
144 (unsigned long)page->flags, page->mapping,
145 page_mapcount(page), page_count(page));
133 dump_stack(); 146 dump_stack();
134 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
135 page->flags &= ~(1 << PG_lru | 147 page->flags &= ~(1 << PG_lru |
136 1 << PG_private | 148 1 << PG_private |
137 1 << PG_locked | 149 1 << PG_locked |
@@ -184,19 +196,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
184 int i; 196 int i;
185 int nr_pages = 1 << order; 197 int nr_pages = 1 << order;
186 198
187 if (!PageCompound(page)) 199 if (unlikely(page[1].index != order))
188 return; 200 bad_page(page);
189
190 if (page[1].index != order)
191 bad_page(__FUNCTION__, page);
192 201
193 for (i = 0; i < nr_pages; i++) { 202 for (i = 0; i < nr_pages; i++) {
194 struct page *p = page + i; 203 struct page *p = page + i;
195 204
196 if (!PageCompound(p)) 205 if (unlikely(!PageCompound(p) |
197 bad_page(__FUNCTION__, page); 206 (page_private(p) != (unsigned long)page)))
198 if (page_private(p) != (unsigned long)page) 207 bad_page(page);
199 bad_page(__FUNCTION__, page);
200 ClearPageCompound(p); 208 ClearPageCompound(p);
201 } 209 }
202} 210}
@@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
255/* 263/*
256 * This function checks whether a page is free && is the buddy 264 * This function checks whether a page is free && is the buddy
257 * we can coalesce a page and its buddy if 265 * we can coalesce a page and its buddy if
258 * (a) the buddy is free && 266 * (a) the buddy is not in a hole &&
259 * (b) the buddy is on the buddy system && 267 * (b) the buddy is free &&
260 * (c) a page and its buddy have the same order. 268 * (c) the buddy is on the buddy system &&
269 * (d) a page and its buddy have the same order.
261 * for recording page's order, we use page_private(page) and PG_private. 270 * for recording page's order, we use page_private(page) and PG_private.
262 * 271 *
263 */ 272 */
264static inline int page_is_buddy(struct page *page, int order) 273static inline int page_is_buddy(struct page *page, int order)
265{ 274{
275#ifdef CONFIG_HOLES_IN_ZONE
276 if (!pfn_valid(page_to_pfn(page)))
277 return 0;
278#endif
279
266 if (PagePrivate(page) && 280 if (PagePrivate(page) &&
267 (page_order(page) == order) && 281 (page_order(page) == order) &&
268 page_count(page) == 0) 282 page_count(page) == 0)
@@ -294,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order)
294 * -- wli 308 * -- wli
295 */ 309 */
296 310
297static inline void __free_pages_bulk (struct page *page, 311static inline void __free_one_page(struct page *page,
298 struct zone *zone, unsigned int order) 312 struct zone *zone, unsigned int order)
299{ 313{
300 unsigned long page_idx; 314 unsigned long page_idx;
301 int order_size = 1 << order; 315 int order_size = 1 << order;
302 316
303 if (unlikely(order)) 317 if (unlikely(PageCompound(page)))
304 destroy_compound_page(page, order); 318 destroy_compound_page(page, order);
305 319
306 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 320 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
314 struct free_area *area; 328 struct free_area *area;
315 struct page *buddy; 329 struct page *buddy;
316 330
317 combined_idx = __find_combined_index(page_idx, order);
318 buddy = __page_find_buddy(page, page_idx, order); 331 buddy = __page_find_buddy(page, page_idx, order);
319
320 if (bad_range(zone, buddy))
321 break;
322 if (!page_is_buddy(buddy, order)) 332 if (!page_is_buddy(buddy, order))
323 break; /* Move the buddy up one level. */ 333 break; /* Move the buddy up one level. */
334
324 list_del(&buddy->lru); 335 list_del(&buddy->lru);
325 area = zone->free_area + order; 336 area = zone->free_area + order;
326 area->nr_free--; 337 area->nr_free--;
327 rmv_page_order(buddy); 338 rmv_page_order(buddy);
339 combined_idx = __find_combined_index(page_idx, order);
328 page = page + (combined_idx - page_idx); 340 page = page + (combined_idx - page_idx);
329 page_idx = combined_idx; 341 page_idx = combined_idx;
330 order++; 342 order++;
@@ -334,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page,
334 zone->free_area[order].nr_free++; 346 zone->free_area[order].nr_free++;
335} 347}
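A worked example of the index arithmetic the coalescing loop above relies on, assuming the usual XOR-based __page_find_buddy() / __find_combined_index() helpers (they are not part of this hunk, so take the exact formulas as an assumption): the buddy of a block differs only in bit 'order', and the merged block starts at the index with that bit cleared.

#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 12;			/* 0b1100 */
	unsigned int order = 2;				/* a 4-page block */

	unsigned long buddy_idx    = page_idx ^ (1UL << order);  /* 12 ^ 4  = 8 */
	unsigned long combined_idx = page_idx & ~(1UL << order); /* 12 & ~4 = 8 */

	printf("buddy=%lu, merged block starts at %lu\n", buddy_idx, combined_idx);
	return 0;
}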
336 348
337static inline int free_pages_check(const char *function, struct page *page) 349static inline int free_pages_check(struct page *page)
338{ 350{
339 if ( page_mapcount(page) || 351 if (unlikely(page_mapcount(page) |
340 page->mapping != NULL || 352 (page->mapping != NULL) |
341 page_count(page) != 0 || 353 (page_count(page) != 0) |
342 (page->flags & ( 354 (page->flags & (
343 1 << PG_lru | 355 1 << PG_lru |
344 1 << PG_private | 356 1 << PG_private |
@@ -348,8 +360,8 @@ static inline int free_pages_check(const char *function, struct page *page)
348 1 << PG_slab | 360 1 << PG_slab |
349 1 << PG_swapcache | 361 1 << PG_swapcache |
350 1 << PG_writeback | 362 1 << PG_writeback |
351 1 << PG_reserved ))) 363 1 << PG_reserved ))))
352 bad_page(function, page); 364 bad_page(page);
353 if (PageDirty(page)) 365 if (PageDirty(page))
354 __ClearPageDirty(page); 366 __ClearPageDirty(page);
355 /* 367 /*
@@ -371,51 +383,90 @@ static inline int free_pages_check(const char *function, struct page *page)
371 * And clear the zone's pages_scanned counter, to hold off the "all pages are 383 * And clear the zone's pages_scanned counter, to hold off the "all pages are
372 * pinned" detection logic. 384 * pinned" detection logic.
373 */ 385 */
374static int 386static void free_pages_bulk(struct zone *zone, int count,
375free_pages_bulk(struct zone *zone, int count, 387 struct list_head *list, int order)
376 struct list_head *list, unsigned int order)
377{ 388{
378 unsigned long flags; 389 spin_lock(&zone->lock);
379 struct page *page = NULL;
380 int ret = 0;
381
382 spin_lock_irqsave(&zone->lock, flags);
383 zone->all_unreclaimable = 0; 390 zone->all_unreclaimable = 0;
384 zone->pages_scanned = 0; 391 zone->pages_scanned = 0;
385 while (!list_empty(list) && count--) { 392 while (count--) {
393 struct page *page;
394
395 BUG_ON(list_empty(list));
386 page = list_entry(list->prev, struct page, lru); 396 page = list_entry(list->prev, struct page, lru);
387 /* have to delete it as __free_pages_bulk list manipulates */ 397 /* have to delete it as __free_one_page list manipulates */
388 list_del(&page->lru); 398 list_del(&page->lru);
389 __free_pages_bulk(page, zone, order); 399 __free_one_page(page, zone, order);
390 ret++;
391 } 400 }
392 spin_unlock_irqrestore(&zone->lock, flags); 401 spin_unlock(&zone->lock);
393 return ret;
394} 402}
395 403
396void __free_pages_ok(struct page *page, unsigned int order) 404static void free_one_page(struct zone *zone, struct page *page, int order)
397{ 405{
398 LIST_HEAD(list); 406 LIST_HEAD(list);
407 list_add(&page->lru, &list);
408 free_pages_bulk(zone, 1, &list, order);
409}
410
411static void __free_pages_ok(struct page *page, unsigned int order)
412{
413 unsigned long flags;
399 int i; 414 int i;
400 int reserved = 0; 415 int reserved = 0;
401 416
402 arch_free_page(page, order); 417 arch_free_page(page, order);
418 if (!PageHighMem(page))
419 mutex_debug_check_no_locks_freed(page_address(page),
420 PAGE_SIZE<<order);
403 421
404#ifndef CONFIG_MMU 422#ifndef CONFIG_MMU
405 if (order > 0) 423 for (i = 1 ; i < (1 << order) ; ++i)
406 for (i = 1 ; i < (1 << order) ; ++i) 424 __put_page(page + i);
407 __put_page(page + i);
408#endif 425#endif
409 426
410 for (i = 0 ; i < (1 << order) ; ++i) 427 for (i = 0 ; i < (1 << order) ; ++i)
411 reserved += free_pages_check(__FUNCTION__, page + i); 428 reserved += free_pages_check(page + i);
412 if (reserved) 429 if (reserved)
413 return; 430 return;
414 431
415 list_add(&page->lru, &list); 432 kernel_map_pages(page, 1 << order, 0);
416 mod_page_state(pgfree, 1 << order); 433 local_irq_save(flags);
417 kernel_map_pages(page, 1<<order, 0); 434 __mod_page_state(pgfree, 1 << order);
418 free_pages_bulk(page_zone(page), 1, &list, order); 435 free_one_page(page_zone(page), page, order);
436 local_irq_restore(flags);
437}
438
439/*
440 * permit the bootmem allocator to evade page validation on high-order frees
441 */
442void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
443{
444 if (order == 0) {
445 __ClearPageReserved(page);
446 set_page_count(page, 0);
447
448 free_hot_cold_page(page, 0);
449 } else {
450 LIST_HEAD(list);
451 int loop;
452
453 for (loop = 0; loop < BITS_PER_LONG; loop++) {
454 struct page *p = &page[loop];
455
456 if (loop + 16 < BITS_PER_LONG)
457 prefetchw(p + 16);
458 __ClearPageReserved(p);
459 set_page_count(p, 0);
460 }
461
462 arch_free_page(page, order);
463
464 mod_page_state(pgfree, 1 << order);
465
466 list_add(&page->lru, &list);
467 kernel_map_pages(page, 1 << order, 0);
468 free_pages_bulk(page_zone(page), 1, &list, order);
469 }
419} 470}
420 471
421 472
@@ -433,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
433 * 484 *
434 * -- wli 485 * -- wli
435 */ 486 */
436static inline struct page * 487static inline void expand(struct zone *zone, struct page *page,
437expand(struct zone *zone, struct page *page,
438 int low, int high, struct free_area *area) 488 int low, int high, struct free_area *area)
439{ 489{
440 unsigned long size = 1 << high; 490 unsigned long size = 1 << high;
@@ -448,24 +498,6 @@ expand(struct zone *zone, struct page *page,
448 area->nr_free++; 498 area->nr_free++;
449 set_page_order(&page[size], high); 499 set_page_order(&page[size], high);
450 } 500 }
451 return page;
452}
453
454void set_page_refs(struct page *page, int order)
455{
456#ifdef CONFIG_MMU
457 set_page_count(page, 1);
458#else
459 int i;
460
461 /*
462 * We need to reference all the pages for this order, otherwise if
463 * anyone accesses one of the pages with (get/put) it will be freed.
464 * - eg: access_process_vm()
465 */
466 for (i = 0; i < (1 << order); i++)
467 set_page_count(page + i, 1);
468#endif /* CONFIG_MMU */
469} 501}
470 502
471/* 503/*
@@ -473,9 +505,9 @@ void set_page_refs(struct page *page, int order)
473 */ 505 */
474static int prep_new_page(struct page *page, int order) 506static int prep_new_page(struct page *page, int order)
475{ 507{
476 if ( page_mapcount(page) || 508 if (unlikely(page_mapcount(page) |
477 page->mapping != NULL || 509 (page->mapping != NULL) |
478 page_count(page) != 0 || 510 (page_count(page) != 0) |
479 (page->flags & ( 511 (page->flags & (
480 1 << PG_lru | 512 1 << PG_lru |
481 1 << PG_private | 513 1 << PG_private |
@@ -486,8 +518,8 @@ static int prep_new_page(struct page *page, int order)
486 1 << PG_slab | 518 1 << PG_slab |
487 1 << PG_swapcache | 519 1 << PG_swapcache |
488 1 << PG_writeback | 520 1 << PG_writeback |
489 1 << PG_reserved ))) 521 1 << PG_reserved ))))
490 bad_page(__FUNCTION__, page); 522 bad_page(page);
491 523
492 /* 524 /*
493 * For now, we report if PG_reserved was found set, but do not 525 * For now, we report if PG_reserved was found set, but do not
@@ -525,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525 rmv_page_order(page); 557 rmv_page_order(page);
526 area->nr_free--; 558 area->nr_free--;
527 zone->free_pages -= 1UL << order; 559 zone->free_pages -= 1UL << order;
528 return expand(zone, page, order, current_order, area); 560 expand(zone, page, order, current_order, area);
561 return page;
529 } 562 }
530 563
531 return NULL; 564 return NULL;
@@ -539,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
539static int rmqueue_bulk(struct zone *zone, unsigned int order, 572static int rmqueue_bulk(struct zone *zone, unsigned int order,
540 unsigned long count, struct list_head *list) 573 unsigned long count, struct list_head *list)
541{ 574{
542 unsigned long flags;
543 int i; 575 int i;
544 int allocated = 0;
545 struct page *page;
546 576
547 spin_lock_irqsave(&zone->lock, flags); 577 spin_lock(&zone->lock);
548 for (i = 0; i < count; ++i) { 578 for (i = 0; i < count; ++i) {
549 page = __rmqueue(zone, order); 579 struct page *page = __rmqueue(zone, order);
550 if (page == NULL) 580 if (unlikely(page == NULL))
551 break; 581 break;
552 allocated++;
553 list_add_tail(&page->lru, list); 582 list_add_tail(&page->lru, list);
554 } 583 }
555 spin_unlock_irqrestore(&zone->lock, flags); 584 spin_unlock(&zone->lock);
556 return allocated; 585 return i;
557} 586}
558 587
559#ifdef CONFIG_NUMA 588#ifdef CONFIG_NUMA
@@ -572,14 +601,13 @@ void drain_remote_pages(void)
572 if (zone->zone_pgdat->node_id == numa_node_id()) 601 if (zone->zone_pgdat->node_id == numa_node_id())
573 continue; 602 continue;
574 603
575 pset = zone->pageset[smp_processor_id()]; 604 pset = zone_pcp(zone, smp_processor_id());
576 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 605 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
577 struct per_cpu_pages *pcp; 606 struct per_cpu_pages *pcp;
578 607
579 pcp = &pset->pcp[i]; 608 pcp = &pset->pcp[i];
580 if (pcp->count) 609 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
581 pcp->count -= free_pages_bulk(zone, pcp->count, 610 pcp->count = 0;
582 &pcp->list, 0);
583 } 611 }
584 } 612 }
585 local_irq_restore(flags); 613 local_irq_restore(flags);
@@ -589,6 +617,7 @@ void drain_remote_pages(void)
589#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 617#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
590static void __drain_pages(unsigned int cpu) 618static void __drain_pages(unsigned int cpu)
591{ 619{
620 unsigned long flags;
592 struct zone *zone; 621 struct zone *zone;
593 int i; 622 int i;
594 623
@@ -600,8 +629,10 @@ static void __drain_pages(unsigned int cpu)
600 struct per_cpu_pages *pcp; 629 struct per_cpu_pages *pcp;
601 630
602 pcp = &pset->pcp[i]; 631 pcp = &pset->pcp[i];
603 pcp->count -= free_pages_bulk(zone, pcp->count, 632 local_irq_save(flags);
604 &pcp->list, 0); 633 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
634 pcp->count = 0;
635 local_irq_restore(flags);
605 } 636 }
606 } 637 }
607} 638}
@@ -647,18 +678,14 @@ void drain_local_pages(void)
647} 678}
648#endif /* CONFIG_PM */ 679#endif /* CONFIG_PM */
649 680
650static void zone_statistics(struct zonelist *zonelist, struct zone *z) 681static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
651{ 682{
652#ifdef CONFIG_NUMA 683#ifdef CONFIG_NUMA
653 unsigned long flags;
654 int cpu;
655 pg_data_t *pg = z->zone_pgdat; 684 pg_data_t *pg = z->zone_pgdat;
656 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 685 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
657 struct per_cpu_pageset *p; 686 struct per_cpu_pageset *p;
658 687
659 local_irq_save(flags); 688 p = zone_pcp(z, cpu);
660 cpu = smp_processor_id();
661 p = zone_pcp(z,cpu);
662 if (pg == orig) { 689 if (pg == orig) {
663 p->numa_hit++; 690 p->numa_hit++;
664 } else { 691 } else {
@@ -669,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
669 p->local_node++; 696 p->local_node++;
670 else 697 else
671 p->other_node++; 698 p->other_node++;
672 local_irq_restore(flags);
673#endif 699#endif
674} 700}
675 701
676/* 702/*
677 * Free a 0-order page 703 * Free a 0-order page
678 */ 704 */
679static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
680static void fastcall free_hot_cold_page(struct page *page, int cold) 705static void fastcall free_hot_cold_page(struct page *page, int cold)
681{ 706{
682 struct zone *zone = page_zone(page); 707 struct zone *zone = page_zone(page);
@@ -687,18 +712,20 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
687 712
688 if (PageAnon(page)) 713 if (PageAnon(page))
689 page->mapping = NULL; 714 page->mapping = NULL;
690 if (free_pages_check(__FUNCTION__, page)) 715 if (free_pages_check(page))
691 return; 716 return;
692 717
693 inc_page_state(pgfree);
694 kernel_map_pages(page, 1, 0); 718 kernel_map_pages(page, 1, 0);
695 719
696 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 720 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
697 local_irq_save(flags); 721 local_irq_save(flags);
722 __inc_page_state(pgfree);
698 list_add(&page->lru, &pcp->list); 723 list_add(&page->lru, &pcp->list);
699 pcp->count++; 724 pcp->count++;
700 if (pcp->count >= pcp->high) 725 if (pcp->count >= pcp->high) {
701 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 726 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
727 pcp->count -= pcp->batch;
728 }
702 local_irq_restore(flags); 729 local_irq_restore(flags);
703 put_cpu(); 730 put_cpu();
704} 731}
@@ -727,49 +754,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
727 * we cheat by calling it from here, in the order > 0 path. Saves a branch 754 * we cheat by calling it from here, in the order > 0 path. Saves a branch
728 * or two. 755 * or two.
729 */ 756 */
730static struct page * 757static struct page *buffered_rmqueue(struct zonelist *zonelist,
731buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 758 struct zone *zone, int order, gfp_t gfp_flags)
732{ 759{
733 unsigned long flags; 760 unsigned long flags;
734 struct page *page; 761 struct page *page;
735 int cold = !!(gfp_flags & __GFP_COLD); 762 int cold = !!(gfp_flags & __GFP_COLD);
763 int cpu;
736 764
737again: 765again:
738 if (order == 0) { 766 cpu = get_cpu();
767 if (likely(order == 0)) {
739 struct per_cpu_pages *pcp; 768 struct per_cpu_pages *pcp;
740 769
741 page = NULL; 770 pcp = &zone_pcp(zone, cpu)->pcp[cold];
742 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
743 local_irq_save(flags); 771 local_irq_save(flags);
744 if (pcp->count <= pcp->low) 772 if (!pcp->count) {
745 pcp->count += rmqueue_bulk(zone, 0, 773 pcp->count += rmqueue_bulk(zone, 0,
746 pcp->batch, &pcp->list); 774 pcp->batch, &pcp->list);
747 if (pcp->count) { 775 if (unlikely(!pcp->count))
748 page = list_entry(pcp->list.next, struct page, lru); 776 goto failed;
749 list_del(&page->lru);
750 pcp->count--;
751 } 777 }
752 local_irq_restore(flags); 778 page = list_entry(pcp->list.next, struct page, lru);
753 put_cpu(); 779 list_del(&page->lru);
780 pcp->count--;
754 } else { 781 } else {
755 spin_lock_irqsave(&zone->lock, flags); 782 spin_lock_irqsave(&zone->lock, flags);
756 page = __rmqueue(zone, order); 783 page = __rmqueue(zone, order);
757 spin_unlock_irqrestore(&zone->lock, flags); 784 spin_unlock(&zone->lock);
785 if (!page)
786 goto failed;
758 } 787 }
759 788
760 if (page != NULL) { 789 __mod_page_state_zone(zone, pgalloc, 1 << order);
761 BUG_ON(bad_range(zone, page)); 790 zone_statistics(zonelist, zone, cpu);
762 mod_page_state_zone(zone, pgalloc, 1 << order); 791 local_irq_restore(flags);
763 if (prep_new_page(page, order)) 792 put_cpu();
764 goto again;
765 793
766 if (gfp_flags & __GFP_ZERO) 794 BUG_ON(bad_range(zone, page));
767 prep_zero_page(page, order, gfp_flags); 795 if (prep_new_page(page, order))
796 goto again;
768 797
769 if (order && (gfp_flags & __GFP_COMP)) 798 if (gfp_flags & __GFP_ZERO)
770 prep_compound_page(page, order); 799 prep_zero_page(page, order, gfp_flags);
771 } 800
801 if (order && (gfp_flags & __GFP_COMP))
802 prep_compound_page(page, order);
772 return page; 803 return page;
804
805failed:
806 local_irq_restore(flags);
807 put_cpu();
808 return NULL;
773} 809}
774 810
775#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 811#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
@@ -845,9 +881,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
845 continue; 881 continue;
846 } 882 }
847 883
848 page = buffered_rmqueue(*z, order, gfp_mask); 884 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
849 if (page) { 885 if (page) {
850 zone_statistics(zonelist, *z);
851 break; 886 break;
852 } 887 }
853 } while (*(++z) != NULL); 888 } while (*(++z) != NULL);
@@ -896,15 +931,15 @@ restart:
896 * 931 *
897 * The caller may dip into page reserves a bit more if the caller 932 * The caller may dip into page reserves a bit more if the caller
898 * cannot run direct reclaim, or if the caller has realtime scheduling 933 * cannot run direct reclaim, or if the caller has realtime scheduling
899 * policy. 934 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
935 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
900 */ 936 */
901 alloc_flags = ALLOC_WMARK_MIN; 937 alloc_flags = ALLOC_WMARK_MIN;
902 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 938 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
903 alloc_flags |= ALLOC_HARDER; 939 alloc_flags |= ALLOC_HARDER;
904 if (gfp_mask & __GFP_HIGH) 940 if (gfp_mask & __GFP_HIGH)
905 alloc_flags |= ALLOC_HIGH; 941 alloc_flags |= ALLOC_HIGH;
906 if (wait) 942 alloc_flags |= ALLOC_CPUSET;
907 alloc_flags |= ALLOC_CPUSET;
908 943
909 /* 944 /*
910 * Go through the zonelist again. Let __GFP_HIGH and allocations 945 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +961,7 @@ restart:
926nofail_alloc: 961nofail_alloc:
927 /* go through the zonelist yet again, ignoring mins */ 962 /* go through the zonelist yet again, ignoring mins */
928 page = get_page_from_freelist(gfp_mask, order, 963 page = get_page_from_freelist(gfp_mask, order,
929 zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); 964 zonelist, ALLOC_NO_WATERMARKS);
930 if (page) 965 if (page)
931 goto got_pg; 966 goto got_pg;
932 if (gfp_mask & __GFP_NOFAIL) { 967 if (gfp_mask & __GFP_NOFAIL) {
@@ -945,6 +980,7 @@ rebalance:
945 cond_resched(); 980 cond_resched();
946 981
947 /* We now go into synchronous reclaim */ 982 /* We now go into synchronous reclaim */
983 cpuset_memory_pressure_bump();
948 p->flags |= PF_MEMALLOC; 984 p->flags |= PF_MEMALLOC;
949 reclaim_state.reclaimed_slab = 0; 985 reclaim_state.reclaimed_slab = 0;
950 p->reclaim_state = &reclaim_state; 986 p->reclaim_state = &reclaim_state;
@@ -1171,7 +1207,7 @@ EXPORT_SYMBOL(nr_pagecache);
1171DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1207DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1172#endif 1208#endif
1173 1209
1174void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1210static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1175{ 1211{
1176 int cpu = 0; 1212 int cpu = 0;
1177 1213
@@ -1224,7 +1260,7 @@ void get_full_page_state(struct page_state *ret)
1224 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1260 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1225} 1261}
1226 1262
1227unsigned long __read_page_state(unsigned long offset) 1263unsigned long read_page_state_offset(unsigned long offset)
1228{ 1264{
1229 unsigned long ret = 0; 1265 unsigned long ret = 0;
1230 int cpu; 1266 int cpu;
@@ -1238,18 +1274,26 @@ unsigned long __read_page_state(unsigned long offset)
1238 return ret; 1274 return ret;
1239} 1275}
1240 1276
1241void __mod_page_state(unsigned long offset, unsigned long delta) 1277void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1278{
1279 void *ptr;
1280
1281 ptr = &__get_cpu_var(page_states);
1282 *(unsigned long *)(ptr + offset) += delta;
1283}
1284EXPORT_SYMBOL(__mod_page_state_offset);
1285
1286void mod_page_state_offset(unsigned long offset, unsigned long delta)
1242{ 1287{
1243 unsigned long flags; 1288 unsigned long flags;
1244 void* ptr; 1289 void *ptr;
1245 1290
1246 local_irq_save(flags); 1291 local_irq_save(flags);
1247 ptr = &__get_cpu_var(page_states); 1292 ptr = &__get_cpu_var(page_states);
1248 *(unsigned long*)(ptr + offset) += delta; 1293 *(unsigned long *)(ptr + offset) += delta;
1249 local_irq_restore(flags); 1294 local_irq_restore(flags);
1250} 1295}
1251 1296EXPORT_SYMBOL(mod_page_state_offset);
1252EXPORT_SYMBOL(__mod_page_state);
1253 1297
1254void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1298void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1255 unsigned long *free, struct pglist_data *pgdat) 1299 unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1379,7 @@ void show_free_areas(void)
1335 show_node(zone); 1379 show_node(zone);
1336 printk("%s per-cpu:", zone->name); 1380 printk("%s per-cpu:", zone->name);
1337 1381
1338 if (!zone->present_pages) { 1382 if (!populated_zone(zone)) {
1339 printk(" empty\n"); 1383 printk(" empty\n");
1340 continue; 1384 continue;
1341 } else 1385 } else
@@ -1347,10 +1391,9 @@ void show_free_areas(void)
1347 pageset = zone_pcp(zone, cpu); 1391 pageset = zone_pcp(zone, cpu);
1348 1392
1349 for (temperature = 0; temperature < 2; temperature++) 1393 for (temperature = 0; temperature < 2; temperature++)
1350 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1394 printk("cpu %d %s: high %d, batch %d used:%d\n",
1351 cpu, 1395 cpu,
1352 temperature ? "cold" : "hot", 1396 temperature ? "cold" : "hot",
1353 pageset->pcp[temperature].low,
1354 pageset->pcp[temperature].high, 1397 pageset->pcp[temperature].high,
1355 pageset->pcp[temperature].batch, 1398 pageset->pcp[temperature].batch,
1356 pageset->pcp[temperature].count); 1399 pageset->pcp[temperature].count);
@@ -1413,7 +1456,7 @@ void show_free_areas(void)
1413 1456
1414 show_node(zone); 1457 show_node(zone);
1415 printk("%s: ", zone->name); 1458 printk("%s: ", zone->name);
1416 if (!zone->present_pages) { 1459 if (!populated_zone(zone)) {
1417 printk("empty\n"); 1460 printk("empty\n");
1418 continue; 1461 continue;
1419 } 1462 }
@@ -1433,36 +1476,29 @@ void show_free_areas(void)
1433 1476
1434/* 1477/*
1435 * Builds allocation fallback zone lists. 1478 * Builds allocation fallback zone lists.
1479 *
1480 * Add all populated zones of a node to the zonelist.
1436 */ 1481 */
1437static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1482static int __init build_zonelists_node(pg_data_t *pgdat,
1438{ 1483 struct zonelist *zonelist, int nr_zones, int zone_type)
1439 switch (k) { 1484{
1440 struct zone *zone; 1485 struct zone *zone;
1441 default: 1486
1442 BUG(); 1487 BUG_ON(zone_type > ZONE_HIGHMEM);
1443 case ZONE_HIGHMEM: 1488
1444 zone = pgdat->node_zones + ZONE_HIGHMEM; 1489 do {
1445 if (zone->present_pages) { 1490 zone = pgdat->node_zones + zone_type;
1491 if (populated_zone(zone)) {
1446#ifndef CONFIG_HIGHMEM 1492#ifndef CONFIG_HIGHMEM
1447 BUG(); 1493 BUG_ON(zone_type > ZONE_NORMAL);
1448#endif 1494#endif
1449 zonelist->zones[j++] = zone; 1495 zonelist->zones[nr_zones++] = zone;
1496 check_highest_zone(zone_type);
1450 } 1497 }
1451 case ZONE_NORMAL: 1498 zone_type--;
1452 zone = pgdat->node_zones + ZONE_NORMAL;
1453 if (zone->present_pages)
1454 zonelist->zones[j++] = zone;
1455 case ZONE_DMA32:
1456 zone = pgdat->node_zones + ZONE_DMA32;
1457 if (zone->present_pages)
1458 zonelist->zones[j++] = zone;
1459 case ZONE_DMA:
1460 zone = pgdat->node_zones + ZONE_DMA;
1461 if (zone->present_pages)
1462 zonelist->zones[j++] = zone;
1463 }
1464 1499
1465 return j; 1500 } while (zone_type >= 0);
1501 return nr_zones;
1466} 1502}
1467 1503
1468static inline int highest_zone(int zone_bits) 1504static inline int highest_zone(int zone_bits)
@@ -1706,11 +1742,9 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1706 unsigned long end_pfn = start_pfn + size; 1742 unsigned long end_pfn = start_pfn + size;
1707 unsigned long pfn; 1743 unsigned long pfn;
1708 1744
1709 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1745 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1710 if (!early_pfn_valid(pfn)) 1746 if (!early_pfn_valid(pfn))
1711 continue; 1747 continue;
1712 if (!early_pfn_in_nid(pfn, nid))
1713 continue;
1714 page = pfn_to_page(pfn); 1748 page = pfn_to_page(pfn);
1715 set_page_links(page, zone, nid, pfn); 1749 set_page_links(page, zone, nid, pfn);
1716 set_page_count(page, 1); 1750 set_page_count(page, 1);
@@ -1794,19 +1828,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1794 1828
1795 pcp = &p->pcp[0]; /* hot */ 1829 pcp = &p->pcp[0]; /* hot */
1796 pcp->count = 0; 1830 pcp->count = 0;
1797 pcp->low = 0;
1798 pcp->high = 6 * batch; 1831 pcp->high = 6 * batch;
1799 pcp->batch = max(1UL, 1 * batch); 1832 pcp->batch = max(1UL, 1 * batch);
1800 INIT_LIST_HEAD(&pcp->list); 1833 INIT_LIST_HEAD(&pcp->list);
1801 1834
1802 pcp = &p->pcp[1]; /* cold*/ 1835 pcp = &p->pcp[1]; /* cold*/
1803 pcp->count = 0; 1836 pcp->count = 0;
1804 pcp->low = 0;
1805 pcp->high = 2 * batch; 1837 pcp->high = 2 * batch;
1806 pcp->batch = max(1UL, batch/2); 1838 pcp->batch = max(1UL, batch/2);
1807 INIT_LIST_HEAD(&pcp->list); 1839 INIT_LIST_HEAD(&pcp->list);
1808} 1840}
1809 1841
1842/*
1843 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1844 * to the value high for the pageset p.
1845 */
1846
1847static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1848 unsigned long high)
1849{
1850 struct per_cpu_pages *pcp;
1851
1852 pcp = &p->pcp[0]; /* hot list */
1853 pcp->high = high;
1854 pcp->batch = max(1UL, high/4);
1855 if ((high/4) > (PAGE_SHIFT * 8))
1856 pcp->batch = PAGE_SHIFT * 8;
1857}
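A worked example of the clamping above, assuming PAGE_SHIFT == 12 (4 KiB pages) and the percpu_pagelist_fraction sysctl set to 8 on a zone of 262144 pages (1 GiB): high becomes 32768, and since high/4 = 8192 exceeds PAGE_SHIFT * 8 = 96, the batch is clamped to 96.

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long present_pages = 262144;		/* 1 GiB of 4 KiB pages */
	unsigned long fraction = 8;			/* percpu_pagelist_fraction */
	unsigned long high  = present_pages / fraction;	/* 32768 */
	unsigned long batch = high / 4 ? high / 4 : 1;	/* max(1, high/4) */

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;			/* clamped to 96 */

	printf("high=%lu batch=%lu\n", high, batch);
	return 0;
}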
1858
1859
1810#ifdef CONFIG_NUMA 1860#ifdef CONFIG_NUMA
1811/* 1861/*
1812 * Boot pageset table. One per cpu which is going to be used for all 1862 * Boot pageset table. One per cpu which is going to be used for all
@@ -1838,12 +1888,16 @@ static int __devinit process_zones(int cpu)
1838 1888
1839 for_each_zone(zone) { 1889 for_each_zone(zone) {
1840 1890
1841 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1891 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1842 GFP_KERNEL, cpu_to_node(cpu)); 1892 GFP_KERNEL, cpu_to_node(cpu));
1843 if (!zone->pageset[cpu]) 1893 if (!zone_pcp(zone, cpu))
1844 goto bad; 1894 goto bad;
1845 1895
1846 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1896 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
1897
1898 if (percpu_pagelist_fraction)
1899 setup_pagelist_highmark(zone_pcp(zone, cpu),
1900 (zone->present_pages / percpu_pagelist_fraction));
1847 } 1901 }
1848 1902
1849 return 0; 1903 return 0;
@@ -1851,15 +1905,14 @@ bad:
1851 for_each_zone(dzone) { 1905 for_each_zone(dzone) {
1852 if (dzone == zone) 1906 if (dzone == zone)
1853 break; 1907 break;
1854 kfree(dzone->pageset[cpu]); 1908 kfree(zone_pcp(dzone, cpu));
1855 dzone->pageset[cpu] = NULL; 1909 zone_pcp(dzone, cpu) = NULL;
1856 } 1910 }
1857 return -ENOMEM; 1911 return -ENOMEM;
1858} 1912}
1859 1913
1860static inline void free_zone_pagesets(int cpu) 1914static inline void free_zone_pagesets(int cpu)
1861{ 1915{
1862#ifdef CONFIG_NUMA
1863 struct zone *zone; 1916 struct zone *zone;
1864 1917
1865 for_each_zone(zone) { 1918 for_each_zone(zone) {
@@ -1868,7 +1921,6 @@ static inline void free_zone_pagesets(int cpu)
1868 zone_pcp(zone, cpu) = NULL; 1921 zone_pcp(zone, cpu) = NULL;
1869 kfree(pset); 1922 kfree(pset);
1870 } 1923 }
1871#endif
1872} 1924}
1873 1925
1874static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1926static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
@@ -1939,7 +1991,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1939 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1991 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1940#ifdef CONFIG_NUMA 1992#ifdef CONFIG_NUMA
1941 /* Early boot. Slab allocator not functional yet */ 1993 /* Early boot. Slab allocator not functional yet */
1942 zone->pageset[cpu] = &boot_pageset[cpu]; 1994 zone_pcp(zone, cpu) = &boot_pageset[cpu];
1943 setup_pageset(&boot_pageset[cpu],0); 1995 setup_pageset(&boot_pageset[cpu],0);
1944#else 1996#else
1945 setup_pageset(zone_pcp(zone,cpu), batch); 1997 setup_pageset(zone_pcp(zone,cpu), batch);
@@ -2116,7 +2168,7 @@ static int frag_show(struct seq_file *m, void *arg)
2116 int order; 2168 int order;
2117 2169
2118 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2170 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2119 if (!zone->present_pages) 2171 if (!populated_zone(zone))
2120 continue; 2172 continue;
2121 2173
2122 spin_lock_irqsave(&zone->lock, flags); 2174 spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2201,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2149 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2201 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2150 int i; 2202 int i;
2151 2203
2152 if (!zone->present_pages) 2204 if (!populated_zone(zone))
2153 continue; 2205 continue;
2154 2206
2155 spin_lock_irqsave(&zone->lock, flags); 2207 spin_lock_irqsave(&zone->lock, flags);
@@ -2182,7 +2234,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2182 seq_printf(m, 2234 seq_printf(m,
2183 ")" 2235 ")"
2184 "\n pagesets"); 2236 "\n pagesets");
2185 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2237 for_each_online_cpu(i) {
2186 struct per_cpu_pageset *pageset; 2238 struct per_cpu_pageset *pageset;
2187 int j; 2239 int j;
2188 2240
@@ -2197,12 +2249,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2197 seq_printf(m, 2249 seq_printf(m,
2198 "\n cpu: %i pcp: %i" 2250 "\n cpu: %i pcp: %i"
2199 "\n count: %i" 2251 "\n count: %i"
2200 "\n low: %i"
2201 "\n high: %i" 2252 "\n high: %i"
2202 "\n batch: %i", 2253 "\n batch: %i",
2203 i, j, 2254 i, j,
2204 pageset->pcp[j].count, 2255 pageset->pcp[j].count,
2205 pageset->pcp[j].low,
2206 pageset->pcp[j].high, 2256 pageset->pcp[j].high,
2207 pageset->pcp[j].batch); 2257 pageset->pcp[j].batch);
2208 } 2258 }
@@ -2257,32 +2307,40 @@ static char *vmstat_text[] = {
2257 "pgpgout", 2307 "pgpgout",
2258 "pswpin", 2308 "pswpin",
2259 "pswpout", 2309 "pswpout",
2260 "pgalloc_high",
2261 2310
2311 "pgalloc_high",
2262 "pgalloc_normal", 2312 "pgalloc_normal",
2313 "pgalloc_dma32",
2263 "pgalloc_dma", 2314 "pgalloc_dma",
2315
2264 "pgfree", 2316 "pgfree",
2265 "pgactivate", 2317 "pgactivate",
2266 "pgdeactivate", 2318 "pgdeactivate",
2267 2319
2268 "pgfault", 2320 "pgfault",
2269 "pgmajfault", 2321 "pgmajfault",
2322
2270 "pgrefill_high", 2323 "pgrefill_high",
2271 "pgrefill_normal", 2324 "pgrefill_normal",
2325 "pgrefill_dma32",
2272 "pgrefill_dma", 2326 "pgrefill_dma",
2273 2327
2274 "pgsteal_high", 2328 "pgsteal_high",
2275 "pgsteal_normal", 2329 "pgsteal_normal",
2330 "pgsteal_dma32",
2276 "pgsteal_dma", 2331 "pgsteal_dma",
2332
2277 "pgscan_kswapd_high", 2333 "pgscan_kswapd_high",
2278 "pgscan_kswapd_normal", 2334 "pgscan_kswapd_normal",
2279 2335 "pgscan_kswapd_dma32",
2280 "pgscan_kswapd_dma", 2336 "pgscan_kswapd_dma",
2337
2281 "pgscan_direct_high", 2338 "pgscan_direct_high",
2282 "pgscan_direct_normal", 2339 "pgscan_direct_normal",
2340 "pgscan_direct_dma32",
2283 "pgscan_direct_dma", 2341 "pgscan_direct_dma",
2284 "pginodesteal",
2285 2342
2343 "pginodesteal",
2286 "slabs_scanned", 2344 "slabs_scanned",
2287 "kswapd_steal", 2345 "kswapd_steal",
2288 "kswapd_inodesteal", 2346 "kswapd_inodesteal",
@@ -2539,6 +2597,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2539 return 0; 2597 return 0;
2540} 2598}
2541 2599
2600/*
2601 * percpu_pagelist_fraction - changes pcp->high for each zone on each cpu.
2602 * It is the fraction of the total pages in each zone that a hot per-cpu
2603 * pagelist may hold before it is flushed back to the buddy allocator.
2604 */
2605
2606int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2607 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2608{
2609 struct zone *zone;
2610 unsigned int cpu;
2611 int ret;
2612
2613 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2614 if (!write || (ret == -EINVAL))
2615 return ret;
2616 for_each_zone(zone) {
2617 for_each_online_cpu(cpu) {
2618 unsigned long high;
2619 high = zone->present_pages / percpu_pagelist_fraction;
2620 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2621 }
2622 }
2623 return 0;
2624}
2625
2542__initdata int hashdist = HASHDIST_DEFAULT; 2626__initdata int hashdist = HASHDIST_DEFAULT;
2543 2627
2544#ifdef CONFIG_NUMA 2628#ifdef CONFIG_NUMA
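
The handler above recomputes every zone's hot high mark whenever the tunable is written. Assuming it is exposed as /proc/sys/vm/percpu_pagelist_fraction (the sysctl table entry itself is not part of this hunk), a trivial program to set it could look like this:

    #include <stdio.h>

    /* Assumed path; adjust if the knob is registered elsewhere. */
    #define PCP_FRACTION "/proc/sys/vm/percpu_pagelist_fraction"

    int main(int argc, char **argv)
    {
        const char *val = (argc > 1) ? argv[1] : "8";
        FILE *f = fopen(PCP_FRACTION, "w");

        if (!f) {
            perror(PCP_FRACTION);
            return 1;
        }
        /* Writing N asks for pcp->high = zone->present_pages / N on every
         * online CPU, applied immediately by the handler above. */
        fprintf(f, "%s\n", val);
        return fclose(f) ? 1 : 0;
    }

Writing 8, for example, lets each hot per-cpu list grow to one eighth of its zone before pages are handed back.
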
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c489..c4b6d0afd736 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
90 90
91static int __pdflush(struct pdflush_work *my_work) 91static int __pdflush(struct pdflush_work *my_work)
92{ 92{
93 current->flags |= PF_FLUSHER; 93 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
94 my_work->fn = NULL; 94 my_work->fn = NULL;
95 my_work->who = current; 95 my_work->who = current;
96 INIT_LIST_HEAD(&my_work->list); 96 INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c7..8d6eeaaa6296 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
158{ 158{
159 unsigned page_idx; 159 unsigned page_idx;
160 struct pagevec lru_pvec; 160 struct pagevec lru_pvec;
161 int ret = 0; 161 int ret;
162 162
163 if (mapping->a_ops->readpages) { 163 if (mapping->a_ops->readpages) {
164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
171 list_del(&page->lru); 171 list_del(&page->lru);
172 if (!add_to_page_cache(page, mapping, 172 if (!add_to_page_cache(page, mapping,
173 page->index, GFP_KERNEL)) { 173 page->index, GFP_KERNEL)) {
174 mapping->a_ops->readpage(filp, page); 174 ret = mapping->a_ops->readpage(filp, page);
175 if (!pagevec_add(&lru_pvec, page)) 175 if (ret != AOP_TRUNCATED_PAGE) {
176 __pagevec_lru_add(&lru_pvec); 176 if (!pagevec_add(&lru_pvec, page))
177 } else { 177 __pagevec_lru_add(&lru_pvec);
178 page_cache_release(page); 178 continue;
179 } /* else fall through to release */
179 } 180 }
181 page_cache_release(page);
180 } 182 }
181 pagevec_lru_add(&lru_pvec); 183 pagevec_lru_add(&lru_pvec);
184 ret = 0;
182out: 185out:
183 return ret; 186 return ret;
184} 187}
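
The interesting part of the read_pages() change is the control flow: when ->readpage() reports AOP_TRUNCATED_PAGE the page is no longer queued on the LRU pagevec but falls through to page_cache_release(). A toy user-space sketch of that shape (the constant's value and the readpage stub are invented for the demo):

    #include <stdio.h>

    #define AOP_TRUNCATED_PAGE 0x80001           /* local stand-in; only its role matters */

    /* Toy ->readpage(): pretend every third page was truncated under us. */
    static int toy_readpage(int idx)
    {
        return (idx % 3 == 2) ? AOP_TRUNCATED_PAGE : 0;
    }

    int main(void)
    {
        int idx, on_lru = 0, released = 0;

        for (idx = 0; idx < 9; idx++) {
            if (toy_readpage(idx) != AOP_TRUNCATED_PAGE) {
                on_lru++;                        /* pagevec_add()/__pagevec_lru_add() path */
                continue;
            }
            released++;                          /* fall through: drop our reference */
        }
        printf("%d pages on LRU, %d released after truncation\n", on_lru, released);
        return 0;
    }
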
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def159..dfbb89f99a15 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,13 +20,13 @@
20/* 20/*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_sem (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem 24 * inode->i_alloc_sem
25 * 25 *
26 * When a page fault occurs in writing from user to file, down_read 26 * When a page fault occurs in writing from user to file, down_read
27 * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within 27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
28 * down_read of mmap_sem; i_sem and down_write of mmap_sem are never 28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
29 * taken together; in truncation, i_sem is taken outermost. 29 * taken together; in truncation, i_mutex is taken outermost.
30 * 30 *
31 * mm->mmap_sem 31 * mm->mmap_sem
32 * page->flags PG_locked (lock_page) 32 * page->flags PG_locked (lock_page)
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked)
435} 435}
436 436
437/** 437/**
 438 * __page_set_anon_rmap - set up new anonymous rmap
439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped
442 */
443static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address)
445{
446 struct anon_vma *anon_vma = vma->anon_vma;
447
448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma;
451
452 page->index = linear_page_index(vma, address);
453
454 /*
455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt.
457 */
458 __inc_page_state(nr_mapped);
459}
460
461/**
438 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
439 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
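
__page_set_anon_rmap() stores the anon_vma pointer in page->mapping with PAGE_MAPPING_ANON folded into the low bit, which is how PageAnon() later distinguishes anonymous pages from file-backed ones. A user-space sketch of that pointer-tagging trick (the struct and the bit value 0x1 are local stand-ins):

    #include <stdio.h>
    #include <stdint.h>

    #define MAPPING_ANON 0x1                     /* stand-in for PAGE_MAPPING_ANON */

    struct toy_anon_vma { int dummy; };

    int main(void)
    {
        static struct toy_anon_vma av;           /* word aligned, so bit 0 is free */
        void *mapping = (char *)&av + MAPPING_ANON;   /* tag, as the hunk does */

        if ((uintptr_t)mapping & MAPPING_ANON) { /* PageAnon()-style test */
            struct toy_anon_vma *back = (struct toy_anon_vma *)
                ((uintptr_t)mapping & ~(uintptr_t)MAPPING_ANON);
            printf("anon mapping, anon_vma recovered: %s\n", back == &av ? "yes" : "no");
        }
        return 0;
    }

The trick only works because an anon_vma is never allocated at an odd address, leaving bit 0 of the pointer unused.
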
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked)
445void page_add_anon_rmap(struct page *page, 469void page_add_anon_rmap(struct page *page,
446 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
447{ 471{
448 if (atomic_inc_and_test(&page->_mapcount)) { 472 if (atomic_inc_and_test(&page->_mapcount))
449 struct anon_vma *anon_vma = vma->anon_vma; 473 __page_set_anon_rmap(page, vma, address);
450
451 BUG_ON(!anon_vma);
452 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
453 page->mapping = (struct address_space *) anon_vma;
454
455 page->index = linear_page_index(vma, address);
456
457 inc_page_state(nr_mapped);
458 }
459 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
460} 475}
461 476
477/**
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped
482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed.
485 */
486void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address)
488{
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address);
491}
492
462/** 493/**
463 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
464 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page)
471 BUG_ON(!pfn_valid(page_to_pfn(page))); 502 BUG_ON(!pfn_valid(page_to_pfn(page)));
472 503
473 if (atomic_inc_and_test(&page->_mapcount)) 504 if (atomic_inc_and_test(&page->_mapcount))
474 inc_page_state(nr_mapped); 505 __inc_page_state(nr_mapped);
475} 506}
476 507
477/** 508/**
@@ -483,6 +514,13 @@ void page_add_file_rmap(struct page *page)
483void page_remove_rmap(struct page *page) 514void page_remove_rmap(struct page *page)
484{ 515{
485 if (atomic_add_negative(-1, &page->_mapcount)) { 516 if (atomic_add_negative(-1, &page->_mapcount)) {
517 if (page_mapcount(page) < 0) {
518 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
519 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
520 printk (KERN_EMERG " page->count = %x\n", page_count(page));
521 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
522 }
523
486 BUG_ON(page_mapcount(page) < 0); 524 BUG_ON(page_mapcount(page) < 0);
487 /* 525 /*
488 * It would be tidy to reset the PageAnon mapping here, 526 * It would be tidy to reset the PageAnon mapping here,
@@ -495,7 +533,7 @@ void page_remove_rmap(struct page *page)
495 */ 533 */
496 if (page_test_and_clear_dirty(page)) 534 if (page_test_and_clear_dirty(page))
497 set_page_dirty(page); 535 set_page_dirty(page);
498 dec_page_state(nr_mapped); 536 __dec_page_state(nr_mapped);
499 } 537 }
500} 538}
501 539
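
All of the rmap changes above revolve around _mapcount, which starts at -1 for an unmapped page: page_add_anon_rmap() relies on atomic_inc_and_test() firing on the -1 to 0 transition, the new page_add_new_anon_rmap() simply sets it to 0 because a brand new page cannot be mapped concurrently, and page_remove_rmap() uses atomic_add_negative() to catch the final 0 to -1 transition. A single-threaded sketch of that counter protocol (plain ints stand in for atomic_t):

    #include <stdio.h>

    /* Single-threaded stand-ins for the atomic ops the rmap code uses. */
    static int inc_and_test(int *v)        { return ++(*v) == 0; }
    static int add_negative(int a, int *v) { return (*v += a) < 0; }

    int main(void)
    {
        int mapcount = -1;                       /* _mapcount of an unmapped page */
        int nr_mapped = 0;

        if (inc_and_test(&mapcount))             /* first mapping: -1 -> 0 */
            nr_mapped++;
        if (inc_and_test(&mapcount))             /* second mapping: no accounting */
            nr_mapped++;

        /* page_add_new_anon_rmap() would instead start from mapcount = 0,
         * skipping the inc-and-test entirely. */

        add_negative(-1, &mapcount);             /* 1 -> 0, still mapped somewhere */
        if (add_negative(-1, &mapcount))         /* 0 -> -1, last unmap */
            nr_mapped--;

        printf("mapcount=%d nr_mapped=%d\n", mapcount, nr_mapped);
        return 0;
    }
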
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e9..343b3c0937e5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
457 } while (next); 457 } while (next);
458} 458}
459 459
460static void shmem_truncate(struct inode *inode) 460static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
461{ 461{
462 struct shmem_inode_info *info = SHMEM_I(inode); 462 struct shmem_inode_info *info = SHMEM_I(inode);
463 unsigned long idx; 463 unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
475 long nr_swaps_freed = 0; 475 long nr_swaps_freed = 0;
476 int offset; 476 int offset;
477 int freed; 477 int freed;
478 int punch_hole = 0;
478 479
479 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 480 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
480 idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 481 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
481 if (idx >= info->next_index) 482 if (idx >= info->next_index)
482 return; 483 return;
483 484
484 spin_lock(&info->lock); 485 spin_lock(&info->lock);
485 info->flags |= SHMEM_TRUNCATE; 486 info->flags |= SHMEM_TRUNCATE;
486 limit = info->next_index; 487 if (likely(end == (loff_t) -1)) {
487 info->next_index = idx; 488 limit = info->next_index;
489 info->next_index = idx;
490 } else {
491 limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
492 if (limit > info->next_index)
493 limit = info->next_index;
494 punch_hole = 1;
495 }
496
488 topdir = info->i_indirect; 497 topdir = info->i_indirect;
489 if (topdir && idx <= SHMEM_NR_DIRECT) { 498 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
490 info->i_indirect = NULL; 499 info->i_indirect = NULL;
491 nr_pages_to_free++; 500 nr_pages_to_free++;
492 list_add(&topdir->lru, &pages_to_free); 501 list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
573 set_page_private(subdir, page_private(subdir) - freed); 582 set_page_private(subdir, page_private(subdir) - freed);
574 if (offset) 583 if (offset)
575 spin_unlock(&info->lock); 584 spin_unlock(&info->lock);
576 BUG_ON(page_private(subdir) > offset); 585 if (!punch_hole)
586 BUG_ON(page_private(subdir) > offset);
577 } 587 }
578 if (offset) 588 if (offset)
579 offset = 0; 589 offset = 0;
580 else if (subdir) { 590 else if (subdir && !page_private(subdir)) {
581 dir[diroff] = NULL; 591 dir[diroff] = NULL;
582 nr_pages_to_free++; 592 nr_pages_to_free++;
583 list_add(&subdir->lru, &pages_to_free); 593 list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
594 * Also, though shmem_getpage checks i_size before adding to 604 * Also, though shmem_getpage checks i_size before adding to
595 * cache, no recheck after: so fix the narrow window there too. 605 * cache, no recheck after: so fix the narrow window there too.
596 */ 606 */
597 truncate_inode_pages(inode->i_mapping, inode->i_size); 607 truncate_inode_pages_range(inode->i_mapping, start, end);
598 } 608 }
599 609
600 spin_lock(&info->lock); 610 spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
614 } 624 }
615} 625}
616 626
627static void shmem_truncate(struct inode *inode)
628{
629 shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
630}
631
617static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 632static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
618{ 633{
619 struct inode *inode = dentry->d_inode; 634 struct inode *inode = dentry->d_inode;
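
shmem_truncate_range() converts a byte range into page-cache indices, rounding start up so a partially covered first page is preserved and deriving an upper limit from end; the plain truncate path becomes the special case end == (loff_t)-1, where the limit simply stays at info->next_index. A small sketch of that rounding (PAGE_CACHE_SHIFT is assumed to be 12, i.e. 4 KiB pages):

    #include <stdio.h>

    #define PAGE_CACHE_SHIFT 12                  /* assumption: 4 KiB page cache pages */
    #define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

    int main(void)
    {
        long long start = 6000, end = 20000;     /* example byte range */
        unsigned long idx, limit;

        /* Round 'start' up and derive an upper bound from 'end' exactly as the
         * hunk does; the real code then clamps 'limit' to info->next_index. */
        idx   = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

        printf("bytes [%lld, %lld] -> idx=%lu limit=%lu\n", start, end, idx, limit);
        return 0;
    }
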
@@ -855,7 +870,7 @@ unlock:
855 swap_free(swap); 870 swap_free(swap);
856redirty: 871redirty:
857 set_page_dirty(page); 872 set_page_dirty(page);
858 return WRITEPAGE_ACTIVATE; /* Return with the page locked */ 873 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
859} 874}
860 875
861#ifdef CONFIG_NUMA 876#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
1255 return retval; 1270 return retval;
1256} 1271}
1257 1272
1258static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1273int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259{ 1274{
1260 file_accessed(file); 1275 file_accessed(file);
1261 vma->vm_ops = &shmem_vm_ops; 1276 vma->vm_ops = &shmem_vm_ops;
@@ -1355,7 +1370,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1355 if (!access_ok(VERIFY_READ, buf, count)) 1370 if (!access_ok(VERIFY_READ, buf, count))
1356 return -EFAULT; 1371 return -EFAULT;
1357 1372
1358 down(&inode->i_sem); 1373 mutex_lock(&inode->i_mutex);
1359 1374
1360 pos = *ppos; 1375 pos = *ppos;
1361 written = 0; 1376 written = 0;
@@ -1440,7 +1455,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1440 if (written) 1455 if (written)
1441 err = written; 1456 err = written;
1442out: 1457out:
1443 up(&inode->i_sem); 1458 mutex_unlock(&inode->i_mutex);
1444 return err; 1459 return err;
1445} 1460}
1446 1461
@@ -1476,7 +1491,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1476 1491
1477 /* 1492 /*
1478 * We must evaluate after, since reads (unlike writes) 1493 * We must evaluate after, since reads (unlike writes)
1479 * are called without i_sem protection against truncate 1494 * are called without i_mutex protection against truncate
1480 */ 1495 */
1481 nr = PAGE_CACHE_SIZE; 1496 nr = PAGE_CACHE_SIZE;
1482 i_size = i_size_read(inode); 1497 i_size = i_size_read(inode);
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = {
2083static struct inode_operations shmem_inode_operations = { 2098static struct inode_operations shmem_inode_operations = {
2084 .truncate = shmem_truncate, 2099 .truncate = shmem_truncate,
2085 .setattr = shmem_notify_change, 2100 .setattr = shmem_notify_change,
2101 .truncate_range = shmem_truncate_range,
2086}; 2102};
2087 2103
2088static struct inode_operations shmem_dir_inode_operations = { 2104static struct inode_operations shmem_dir_inode_operations = {
diff --git a/mm/slab.c b/mm/slab.c
index e5ec26e0c460..9374293a3012 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -130,7 +130,6 @@
130#define FORCED_DEBUG 0 130#define FORCED_DEBUG 0
131#endif 131#endif
132 132
133
134/* Shouldn't this be in a header file somewhere? */ 133/* Shouldn't this be in a header file somewhere? */
135#define BYTES_PER_WORD sizeof(void *) 134#define BYTES_PER_WORD sizeof(void *)
136 135
@@ -217,12 +216,12 @@ static unsigned long offslab_limit;
217 * Slabs are chained into three list: fully used, partial, fully free slabs. 216 * Slabs are chained into three list: fully used, partial, fully free slabs.
218 */ 217 */
219struct slab { 218struct slab {
220 struct list_head list; 219 struct list_head list;
221 unsigned long colouroff; 220 unsigned long colouroff;
222 void *s_mem; /* including colour offset */ 221 void *s_mem; /* including colour offset */
223 unsigned int inuse; /* num of objs active in slab */ 222 unsigned int inuse; /* num of objs active in slab */
224 kmem_bufctl_t free; 223 kmem_bufctl_t free;
225 unsigned short nodeid; 224 unsigned short nodeid;
226}; 225};
227 226
228/* 227/*
@@ -242,9 +241,9 @@ struct slab {
242 * We assume struct slab_rcu can overlay struct slab when destroying. 241 * We assume struct slab_rcu can overlay struct slab when destroying.
243 */ 242 */
244struct slab_rcu { 243struct slab_rcu {
245 struct rcu_head head; 244 struct rcu_head head;
246 kmem_cache_t *cachep; 245 kmem_cache_t *cachep;
247 void *addr; 246 void *addr;
248}; 247};
249 248
250/* 249/*
@@ -279,23 +278,23 @@ struct array_cache {
279#define BOOT_CPUCACHE_ENTRIES 1 278#define BOOT_CPUCACHE_ENTRIES 1
280struct arraycache_init { 279struct arraycache_init {
281 struct array_cache cache; 280 struct array_cache cache;
282 void * entries[BOOT_CPUCACHE_ENTRIES]; 281 void *entries[BOOT_CPUCACHE_ENTRIES];
283}; 282};
284 283
285/* 284/*
286 * The slab lists for all objects. 285 * The slab lists for all objects.
287 */ 286 */
288struct kmem_list3 { 287struct kmem_list3 {
289 struct list_head slabs_partial; /* partial list first, better asm code */ 288 struct list_head slabs_partial; /* partial list first, better asm code */
290 struct list_head slabs_full; 289 struct list_head slabs_full;
291 struct list_head slabs_free; 290 struct list_head slabs_free;
292 unsigned long free_objects; 291 unsigned long free_objects;
293 unsigned long next_reap; 292 unsigned long next_reap;
294 int free_touched; 293 int free_touched;
295 unsigned int free_limit; 294 unsigned int free_limit;
296 spinlock_t list_lock; 295 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */ 296 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */ 297 struct array_cache **alien; /* on other nodes */
299}; 298};
300 299
301/* 300/*
@@ -367,63 +366,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
367 * 366 *
368 * manages a cache. 367 * manages a cache.
369 */ 368 */
370 369
371struct kmem_cache { 370struct kmem_cache {
372/* 1) per-cpu data, touched during every alloc/free */ 371/* 1) per-cpu data, touched during every alloc/free */
373 struct array_cache *array[NR_CPUS]; 372 struct array_cache *array[NR_CPUS];
374 unsigned int batchcount; 373 unsigned int batchcount;
375 unsigned int limit; 374 unsigned int limit;
376 unsigned int shared; 375 unsigned int shared;
377 unsigned int objsize; 376 unsigned int objsize;
378/* 2) touched by every alloc & free from the backend */ 377/* 2) touched by every alloc & free from the backend */
379 struct kmem_list3 *nodelists[MAX_NUMNODES]; 378 struct kmem_list3 *nodelists[MAX_NUMNODES];
380 unsigned int flags; /* constant flags */ 379 unsigned int flags; /* constant flags */
381 unsigned int num; /* # of objs per slab */ 380 unsigned int num; /* # of objs per slab */
382 spinlock_t spinlock; 381 spinlock_t spinlock;
383 382
384/* 3) cache_grow/shrink */ 383/* 3) cache_grow/shrink */
385 /* order of pgs per slab (2^n) */ 384 /* order of pgs per slab (2^n) */
386 unsigned int gfporder; 385 unsigned int gfporder;
387 386
388 /* force GFP flags, e.g. GFP_DMA */ 387 /* force GFP flags, e.g. GFP_DMA */
389 gfp_t gfpflags; 388 gfp_t gfpflags;
390 389
391 size_t colour; /* cache colouring range */ 390 size_t colour; /* cache colouring range */
392 unsigned int colour_off; /* colour offset */ 391 unsigned int colour_off; /* colour offset */
393 unsigned int colour_next; /* cache colouring */ 392 unsigned int colour_next; /* cache colouring */
394 kmem_cache_t *slabp_cache; 393 kmem_cache_t *slabp_cache;
395 unsigned int slab_size; 394 unsigned int slab_size;
396 unsigned int dflags; /* dynamic flags */ 395 unsigned int dflags; /* dynamic flags */
397 396
398 /* constructor func */ 397 /* constructor func */
399 void (*ctor)(void *, kmem_cache_t *, unsigned long); 398 void (*ctor) (void *, kmem_cache_t *, unsigned long);
400 399
401 /* de-constructor func */ 400 /* de-constructor func */
402 void (*dtor)(void *, kmem_cache_t *, unsigned long); 401 void (*dtor) (void *, kmem_cache_t *, unsigned long);
403 402
404/* 4) cache creation/removal */ 403/* 4) cache creation/removal */
405 const char *name; 404 const char *name;
406 struct list_head next; 405 struct list_head next;
407 406
408/* 5) statistics */ 407/* 5) statistics */
409#if STATS 408#if STATS
410 unsigned long num_active; 409 unsigned long num_active;
411 unsigned long num_allocations; 410 unsigned long num_allocations;
412 unsigned long high_mark; 411 unsigned long high_mark;
413 unsigned long grown; 412 unsigned long grown;
414 unsigned long reaped; 413 unsigned long reaped;
415 unsigned long errors; 414 unsigned long errors;
416 unsigned long max_freeable; 415 unsigned long max_freeable;
417 unsigned long node_allocs; 416 unsigned long node_allocs;
418 unsigned long node_frees; 417 unsigned long node_frees;
419 atomic_t allochit; 418 atomic_t allochit;
420 atomic_t allocmiss; 419 atomic_t allocmiss;
421 atomic_t freehit; 420 atomic_t freehit;
422 atomic_t freemiss; 421 atomic_t freemiss;
423#endif 422#endif
424#if DEBUG 423#if DEBUG
425 int dbghead; 424 int dbghead;
426 int reallen; 425 int reallen;
427#endif 426#endif
428}; 427};
429 428
@@ -523,14 +522,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
523{ 522{
524 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 523 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
525 if (cachep->flags & SLAB_STORE_USER) 524 if (cachep->flags & SLAB_STORE_USER)
526 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); 525 return (unsigned long *)(objp + cachep->objsize -
527 return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); 526 2 * BYTES_PER_WORD);
527 return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
528} 528}
529 529
530static void **dbg_userword(kmem_cache_t *cachep, void *objp) 530static void **dbg_userword(kmem_cache_t *cachep, void *objp)
531{ 531{
532 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 532 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
533 return (void**)(objp+cachep->objsize-BYTES_PER_WORD); 533 return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
534} 534}
535 535
536#else 536#else
@@ -607,31 +607,31 @@ struct cache_names {
607static struct cache_names __initdata cache_names[] = { 607static struct cache_names __initdata cache_names[] = {
608#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 608#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
609#include <linux/kmalloc_sizes.h> 609#include <linux/kmalloc_sizes.h>
610 { NULL, } 610 {NULL,}
611#undef CACHE 611#undef CACHE
612}; 612};
613 613
614static struct arraycache_init initarray_cache __initdata = 614static struct arraycache_init initarray_cache __initdata =
615 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 615 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
616static struct arraycache_init initarray_generic = 616static struct arraycache_init initarray_generic =
617 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 617 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
618 618
619/* internal cache of cache description objs */ 619/* internal cache of cache description objs */
620static kmem_cache_t cache_cache = { 620static kmem_cache_t cache_cache = {
621 .batchcount = 1, 621 .batchcount = 1,
622 .limit = BOOT_CPUCACHE_ENTRIES, 622 .limit = BOOT_CPUCACHE_ENTRIES,
623 .shared = 1, 623 .shared = 1,
624 .objsize = sizeof(kmem_cache_t), 624 .objsize = sizeof(kmem_cache_t),
625 .flags = SLAB_NO_REAP, 625 .flags = SLAB_NO_REAP,
626 .spinlock = SPIN_LOCK_UNLOCKED, 626 .spinlock = SPIN_LOCK_UNLOCKED,
627 .name = "kmem_cache", 627 .name = "kmem_cache",
628#if DEBUG 628#if DEBUG
629 .reallen = sizeof(kmem_cache_t), 629 .reallen = sizeof(kmem_cache_t),
630#endif 630#endif
631}; 631};
632 632
633/* Guard access to the cache-chain. */ 633/* Guard access to the cache-chain. */
634static struct semaphore cache_chain_sem; 634static struct semaphore cache_chain_sem;
635static struct list_head cache_chain; 635static struct list_head cache_chain;
636 636
637/* 637/*
@@ -655,9 +655,9 @@ static enum {
655 655
656static DEFINE_PER_CPU(struct work_struct, reap_work); 656static DEFINE_PER_CPU(struct work_struct, reap_work);
657 657
658static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); 658static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
659static void enable_cpucache (kmem_cache_t *cachep); 659static void enable_cpucache(kmem_cache_t *cachep);
660static void cache_reap (void *unused); 660static void cache_reap(void *unused);
661static int __node_shrink(kmem_cache_t *cachep, int node); 661static int __node_shrink(kmem_cache_t *cachep, int node);
662 662
663static inline struct array_cache *ac_data(kmem_cache_t *cachep) 663static inline struct array_cache *ac_data(kmem_cache_t *cachep)
@@ -671,9 +671,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
671 671
672#if DEBUG 672#if DEBUG
673 /* This happens if someone tries to call 673 /* This happens if someone tries to call
674 * kmem_cache_create(), or __kmalloc(), before 674 * kmem_cache_create(), or __kmalloc(), before
675 * the generic caches are initialized. 675 * the generic caches are initialized.
676 */ 676 */
677 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 677 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
678#endif 678#endif
679 while (size > csizep->cs_size) 679 while (size > csizep->cs_size)
@@ -697,10 +697,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
697 697
698/* Cal the num objs, wastage, and bytes left over for a given slab size. */ 698/* Cal the num objs, wastage, and bytes left over for a given slab size. */
699static void cache_estimate(unsigned long gfporder, size_t size, size_t align, 699static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
700 int flags, size_t *left_over, unsigned int *num) 700 int flags, size_t *left_over, unsigned int *num)
701{ 701{
702 int i; 702 int i;
703 size_t wastage = PAGE_SIZE<<gfporder; 703 size_t wastage = PAGE_SIZE << gfporder;
704 size_t extra = 0; 704 size_t extra = 0;
705 size_t base = 0; 705 size_t base = 0;
706 706
@@ -709,7 +709,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
709 extra = sizeof(kmem_bufctl_t); 709 extra = sizeof(kmem_bufctl_t);
710 } 710 }
711 i = 0; 711 i = 0;
712 while (i*size + ALIGN(base+i*extra, align) <= wastage) 712 while (i * size + ALIGN(base + i * extra, align) <= wastage)
713 i++; 713 i++;
714 if (i > 0) 714 if (i > 0)
715 i--; 715 i--;
@@ -718,8 +718,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
718 i = SLAB_LIMIT; 718 i = SLAB_LIMIT;
719 719
720 *num = i; 720 *num = i;
721 wastage -= i*size; 721 wastage -= i * size;
722 wastage -= ALIGN(base+i*extra, align); 722 wastage -= ALIGN(base + i * extra, align);
723 *left_over = wastage; 723 *left_over = wastage;
724} 724}
725 725
@@ -728,7 +728,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
728static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 728static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
729{ 729{
730 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 730 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
731 function, cachep->name, msg); 731 function, cachep->name, msg);
732 dump_stack(); 732 dump_stack();
733} 733}
734 734
@@ -755,9 +755,9 @@ static void __devinit start_cpu_timer(int cpu)
755} 755}
756 756
757static struct array_cache *alloc_arraycache(int node, int entries, 757static struct array_cache *alloc_arraycache(int node, int entries,
758 int batchcount) 758 int batchcount)
759{ 759{
760 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 760 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
761 struct array_cache *nc = NULL; 761 struct array_cache *nc = NULL;
762 762
763 nc = kmalloc_node(memsize, GFP_KERNEL, node); 763 nc = kmalloc_node(memsize, GFP_KERNEL, node);
@@ -775,7 +775,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
775static inline struct array_cache **alloc_alien_cache(int node, int limit) 775static inline struct array_cache **alloc_alien_cache(int node, int limit)
776{ 776{
777 struct array_cache **ac_ptr; 777 struct array_cache **ac_ptr;
778 int memsize = sizeof(void*)*MAX_NUMNODES; 778 int memsize = sizeof(void *) * MAX_NUMNODES;
779 int i; 779 int i;
780 780
781 if (limit > 1) 781 if (limit > 1)
@@ -789,7 +789,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
789 } 789 }
790 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 790 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
791 if (!ac_ptr[i]) { 791 if (!ac_ptr[i]) {
792 for (i--; i <=0; i--) 792 for (i--; i <= 0; i--)
793 kfree(ac_ptr[i]); 793 kfree(ac_ptr[i]);
794 kfree(ac_ptr); 794 kfree(ac_ptr);
795 return NULL; 795 return NULL;
@@ -807,12 +807,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
807 return; 807 return;
808 808
809 for_each_node(i) 809 for_each_node(i)
810 kfree(ac_ptr[i]); 810 kfree(ac_ptr[i]);
811 811
812 kfree(ac_ptr); 812 kfree(ac_ptr);
813} 813}
814 814
815static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) 815static inline void __drain_alien_cache(kmem_cache_t *cachep,
816 struct array_cache *ac, int node)
816{ 817{
817 struct kmem_list3 *rl3 = cachep->nodelists[node]; 818 struct kmem_list3 *rl3 = cachep->nodelists[node];
818 819
@@ -826,7 +827,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
826 827
827static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 828static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
828{ 829{
829 int i=0; 830 int i = 0;
830 struct array_cache *ac; 831 struct array_cache *ac;
831 unsigned long flags; 832 unsigned long flags;
832 833
@@ -846,14 +847,13 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
846#endif 847#endif
847 848
848static int __devinit cpuup_callback(struct notifier_block *nfb, 849static int __devinit cpuup_callback(struct notifier_block *nfb,
849 unsigned long action, void *hcpu) 850 unsigned long action, void *hcpu)
850{ 851{
851 long cpu = (long)hcpu; 852 long cpu = (long)hcpu;
852 kmem_cache_t* cachep; 853 kmem_cache_t *cachep;
853 struct kmem_list3 *l3 = NULL; 854 struct kmem_list3 *l3 = NULL;
854 int node = cpu_to_node(cpu); 855 int node = cpu_to_node(cpu);
855 int memsize = sizeof(struct kmem_list3); 856 int memsize = sizeof(struct kmem_list3);
856 struct array_cache *nc = NULL;
857 857
858 switch (action) { 858 switch (action) {
859 case CPU_UP_PREPARE: 859 case CPU_UP_PREPARE:
@@ -871,27 +871,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
871 */ 871 */
872 if (!cachep->nodelists[node]) { 872 if (!cachep->nodelists[node]) {
873 if (!(l3 = kmalloc_node(memsize, 873 if (!(l3 = kmalloc_node(memsize,
874 GFP_KERNEL, node))) 874 GFP_KERNEL, node)))
875 goto bad; 875 goto bad;
876 kmem_list3_init(l3); 876 kmem_list3_init(l3);
877 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 877 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
878 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 878 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
879 879
880 cachep->nodelists[node] = l3; 880 cachep->nodelists[node] = l3;
881 } 881 }
882 882
883 spin_lock_irq(&cachep->nodelists[node]->list_lock); 883 spin_lock_irq(&cachep->nodelists[node]->list_lock);
884 cachep->nodelists[node]->free_limit = 884 cachep->nodelists[node]->free_limit =
885 (1 + nr_cpus_node(node)) * 885 (1 + nr_cpus_node(node)) *
886 cachep->batchcount + cachep->num; 886 cachep->batchcount + cachep->num;
887 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 887 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
888 } 888 }
889 889
890 /* Now we can go ahead with allocating the shared array's 890 /* Now we can go ahead with allocating the shared array's
891 & array cache's */ 891 & array cache's */
892 list_for_each_entry(cachep, &cache_chain, next) { 892 list_for_each_entry(cachep, &cache_chain, next) {
893 struct array_cache *nc;
894
893 nc = alloc_arraycache(node, cachep->limit, 895 nc = alloc_arraycache(node, cachep->limit,
894 cachep->batchcount); 896 cachep->batchcount);
895 if (!nc) 897 if (!nc)
896 goto bad; 898 goto bad;
897 cachep->array[cpu] = nc; 899 cachep->array[cpu] = nc;
@@ -900,12 +902,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
900 BUG_ON(!l3); 902 BUG_ON(!l3);
901 if (!l3->shared) { 903 if (!l3->shared) {
902 if (!(nc = alloc_arraycache(node, 904 if (!(nc = alloc_arraycache(node,
903 cachep->shared*cachep->batchcount, 905 cachep->shared *
904 0xbaadf00d))) 906 cachep->batchcount,
905 goto bad; 907 0xbaadf00d)))
908 goto bad;
906 909
907 /* we are serialised from CPU_DEAD or 910 /* we are serialised from CPU_DEAD or
908 CPU_UP_CANCELLED by the cpucontrol lock */ 911 CPU_UP_CANCELLED by the cpucontrol lock */
909 l3->shared = nc; 912 l3->shared = nc;
910 } 913 }
911 } 914 }
@@ -942,13 +945,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
942 free_block(cachep, nc->entry, nc->avail, node); 945 free_block(cachep, nc->entry, nc->avail, node);
943 946
944 if (!cpus_empty(mask)) { 947 if (!cpus_empty(mask)) {
945 spin_unlock(&l3->list_lock); 948 spin_unlock(&l3->list_lock);
946 goto unlock_cache; 949 goto unlock_cache;
947 } 950 }
948 951
949 if (l3->shared) { 952 if (l3->shared) {
950 free_block(cachep, l3->shared->entry, 953 free_block(cachep, l3->shared->entry,
951 l3->shared->avail, node); 954 l3->shared->avail, node);
952 kfree(l3->shared); 955 kfree(l3->shared);
953 l3->shared = NULL; 956 l3->shared = NULL;
954 } 957 }
@@ -966,7 +969,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
966 } else { 969 } else {
967 spin_unlock(&l3->list_lock); 970 spin_unlock(&l3->list_lock);
968 } 971 }
969unlock_cache: 972 unlock_cache:
970 spin_unlock_irq(&cachep->spinlock); 973 spin_unlock_irq(&cachep->spinlock);
971 kfree(nc); 974 kfree(nc);
972 } 975 }
@@ -975,7 +978,7 @@ unlock_cache:
975#endif 978#endif
976 } 979 }
977 return NOTIFY_OK; 980 return NOTIFY_OK;
978bad: 981 bad:
979 up(&cache_chain_sem); 982 up(&cache_chain_sem);
980 return NOTIFY_BAD; 983 return NOTIFY_BAD;
981} 984}
@@ -985,8 +988,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
985/* 988/*
986 * swap the static kmem_list3 with kmalloced memory 989 * swap the static kmem_list3 with kmalloced memory
987 */ 990 */
988static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, 991static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
989 int nodeid)
990{ 992{
991 struct kmem_list3 *ptr; 993 struct kmem_list3 *ptr;
992 994
@@ -1055,14 +1057,14 @@ void __init kmem_cache_init(void)
1055 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1057 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
1056 1058
1057 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1059 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
1058 &left_over, &cache_cache.num); 1060 &left_over, &cache_cache.num);
1059 if (!cache_cache.num) 1061 if (!cache_cache.num)
1060 BUG(); 1062 BUG();
1061 1063
1062 cache_cache.colour = left_over/cache_cache.colour_off; 1064 cache_cache.colour = left_over / cache_cache.colour_off;
1063 cache_cache.colour_next = 0; 1065 cache_cache.colour_next = 0;
1064 cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + 1066 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1065 sizeof(struct slab), cache_line_size()); 1067 sizeof(struct slab), cache_line_size());
1066 1068
1067 /* 2+3) create the kmalloc caches */ 1069 /* 2+3) create the kmalloc caches */
1068 sizes = malloc_sizes; 1070 sizes = malloc_sizes;
@@ -1074,14 +1076,18 @@ void __init kmem_cache_init(void)
1074 */ 1076 */
1075 1077
1076 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1078 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1077 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, 1079 sizes[INDEX_AC].cs_size,
1078 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1080 ARCH_KMALLOC_MINALIGN,
1081 (ARCH_KMALLOC_FLAGS |
1082 SLAB_PANIC), NULL, NULL);
1079 1083
1080 if (INDEX_AC != INDEX_L3) 1084 if (INDEX_AC != INDEX_L3)
1081 sizes[INDEX_L3].cs_cachep = 1085 sizes[INDEX_L3].cs_cachep =
1082 kmem_cache_create(names[INDEX_L3].name, 1086 kmem_cache_create(names[INDEX_L3].name,
1083 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, 1087 sizes[INDEX_L3].cs_size,
1084 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1088 ARCH_KMALLOC_MINALIGN,
1089 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
1090 NULL);
1085 1091
1086 while (sizes->cs_size != ULONG_MAX) { 1092 while (sizes->cs_size != ULONG_MAX) {
1087 /* 1093 /*
@@ -1091,35 +1097,41 @@ void __init kmem_cache_init(void)
1091 * Note for systems short on memory removing the alignment will 1097 * Note for systems short on memory removing the alignment will
1092 * allow tighter packing of the smaller caches. 1098 * allow tighter packing of the smaller caches.
1093 */ 1099 */
1094 if(!sizes->cs_cachep) 1100 if (!sizes->cs_cachep)
1095 sizes->cs_cachep = kmem_cache_create(names->name, 1101 sizes->cs_cachep = kmem_cache_create(names->name,
1096 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1102 sizes->cs_size,
1097 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1103 ARCH_KMALLOC_MINALIGN,
1104 (ARCH_KMALLOC_FLAGS
1105 | SLAB_PANIC),
1106 NULL, NULL);
1098 1107
1099 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1108 /* Inc off-slab bufctl limit until the ceiling is hit. */
1100 if (!(OFF_SLAB(sizes->cs_cachep))) { 1109 if (!(OFF_SLAB(sizes->cs_cachep))) {
1101 offslab_limit = sizes->cs_size-sizeof(struct slab); 1110 offslab_limit = sizes->cs_size - sizeof(struct slab);
1102 offslab_limit /= sizeof(kmem_bufctl_t); 1111 offslab_limit /= sizeof(kmem_bufctl_t);
1103 } 1112 }
1104 1113
1105 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1114 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1106 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1115 sizes->cs_size,
1107 (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), 1116 ARCH_KMALLOC_MINALIGN,
1108 NULL, NULL); 1117 (ARCH_KMALLOC_FLAGS |
1118 SLAB_CACHE_DMA |
1119 SLAB_PANIC), NULL,
1120 NULL);
1109 1121
1110 sizes++; 1122 sizes++;
1111 names++; 1123 names++;
1112 } 1124 }
1113 /* 4) Replace the bootstrap head arrays */ 1125 /* 4) Replace the bootstrap head arrays */
1114 { 1126 {
1115 void * ptr; 1127 void *ptr;
1116 1128
1117 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1129 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1118 1130
1119 local_irq_disable(); 1131 local_irq_disable();
1120 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1132 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
1121 memcpy(ptr, ac_data(&cache_cache), 1133 memcpy(ptr, ac_data(&cache_cache),
1122 sizeof(struct arraycache_init)); 1134 sizeof(struct arraycache_init));
1123 cache_cache.array[smp_processor_id()] = ptr; 1135 cache_cache.array[smp_processor_id()] = ptr;
1124 local_irq_enable(); 1136 local_irq_enable();
1125 1137
@@ -1127,11 +1139,11 @@ void __init kmem_cache_init(void)
1127 1139
1128 local_irq_disable(); 1140 local_irq_disable();
1129 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1141 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
1130 != &initarray_generic.cache); 1142 != &initarray_generic.cache);
1131 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1143 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
1132 sizeof(struct arraycache_init)); 1144 sizeof(struct arraycache_init));
1133 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1145 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1134 ptr; 1146 ptr;
1135 local_irq_enable(); 1147 local_irq_enable();
1136 } 1148 }
1137 /* 5) Replace the bootstrap kmem_list3's */ 1149 /* 5) Replace the bootstrap kmem_list3's */
@@ -1139,16 +1151,16 @@ void __init kmem_cache_init(void)
1139 int node; 1151 int node;
1140 /* Replace the static kmem_list3 structures for the boot cpu */ 1152 /* Replace the static kmem_list3 structures for the boot cpu */
1141 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], 1153 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1142 numa_node_id()); 1154 numa_node_id());
1143 1155
1144 for_each_online_node(node) { 1156 for_each_online_node(node) {
1145 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1157 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1146 &initkmem_list3[SIZE_AC+node], node); 1158 &initkmem_list3[SIZE_AC + node], node);
1147 1159
1148 if (INDEX_AC != INDEX_L3) { 1160 if (INDEX_AC != INDEX_L3) {
1149 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1161 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1150 &initkmem_list3[SIZE_L3+node], 1162 &initkmem_list3[SIZE_L3 + node],
1151 node); 1163 node);
1152 } 1164 }
1153 } 1165 }
1154 } 1166 }
@@ -1158,7 +1170,7 @@ void __init kmem_cache_init(void)
1158 kmem_cache_t *cachep; 1170 kmem_cache_t *cachep;
1159 down(&cache_chain_sem); 1171 down(&cache_chain_sem);
1160 list_for_each_entry(cachep, &cache_chain, next) 1172 list_for_each_entry(cachep, &cache_chain, next)
1161 enable_cpucache(cachep); 1173 enable_cpucache(cachep);
1162 up(&cache_chain_sem); 1174 up(&cache_chain_sem);
1163 } 1175 }
1164 1176
@@ -1184,7 +1196,7 @@ static int __init cpucache_init(void)
1184 * pages to gfp. 1196 * pages to gfp.
1185 */ 1197 */
1186 for_each_online_cpu(cpu) 1198 for_each_online_cpu(cpu)
1187 start_cpu_timer(cpu); 1199 start_cpu_timer(cpu);
1188 1200
1189 return 0; 1201 return 0;
1190} 1202}
@@ -1226,7 +1238,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1226 */ 1238 */
1227static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1239static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1228{ 1240{
1229 unsigned long i = (1<<cachep->gfporder); 1241 unsigned long i = (1 << cachep->gfporder);
1230 struct page *page = virt_to_page(addr); 1242 struct page *page = virt_to_page(addr);
1231 const unsigned long nr_freed = i; 1243 const unsigned long nr_freed = i;
1232 1244
@@ -1239,13 +1251,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1239 if (current->reclaim_state) 1251 if (current->reclaim_state)
1240 current->reclaim_state->reclaimed_slab += nr_freed; 1252 current->reclaim_state->reclaimed_slab += nr_freed;
1241 free_pages((unsigned long)addr, cachep->gfporder); 1253 free_pages((unsigned long)addr, cachep->gfporder);
1242 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1254 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1243 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); 1255 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1244} 1256}
1245 1257
1246static void kmem_rcu_free(struct rcu_head *head) 1258static void kmem_rcu_free(struct rcu_head *head)
1247{ 1259{
1248 struct slab_rcu *slab_rcu = (struct slab_rcu *) head; 1260 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1249 kmem_cache_t *cachep = slab_rcu->cachep; 1261 kmem_cache_t *cachep = slab_rcu->cachep;
1250 1262
1251 kmem_freepages(cachep, slab_rcu->addr); 1263 kmem_freepages(cachep, slab_rcu->addr);
@@ -1257,19 +1269,19 @@ static void kmem_rcu_free(struct rcu_head *head)
1257 1269
1258#ifdef CONFIG_DEBUG_PAGEALLOC 1270#ifdef CONFIG_DEBUG_PAGEALLOC
1259static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1271static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1260 unsigned long caller) 1272 unsigned long caller)
1261{ 1273{
1262 int size = obj_reallen(cachep); 1274 int size = obj_reallen(cachep);
1263 1275
1264 addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; 1276 addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
1265 1277
1266 if (size < 5*sizeof(unsigned long)) 1278 if (size < 5 * sizeof(unsigned long))
1267 return; 1279 return;
1268 1280
1269 *addr++=0x12345678; 1281 *addr++ = 0x12345678;
1270 *addr++=caller; 1282 *addr++ = caller;
1271 *addr++=smp_processor_id(); 1283 *addr++ = smp_processor_id();
1272 size -= 3*sizeof(unsigned long); 1284 size -= 3 * sizeof(unsigned long);
1273 { 1285 {
1274 unsigned long *sptr = &caller; 1286 unsigned long *sptr = &caller;
1275 unsigned long svalue; 1287 unsigned long svalue;
@@ -1277,7 +1289,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1277 while (!kstack_end(sptr)) { 1289 while (!kstack_end(sptr)) {
1278 svalue = *sptr++; 1290 svalue = *sptr++;
1279 if (kernel_text_address(svalue)) { 1291 if (kernel_text_address(svalue)) {
1280 *addr++=svalue; 1292 *addr++ = svalue;
1281 size -= sizeof(unsigned long); 1293 size -= sizeof(unsigned long);
1282 if (size <= sizeof(unsigned long)) 1294 if (size <= sizeof(unsigned long))
1283 break; 1295 break;
@@ -1285,25 +1297,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1285 } 1297 }
1286 1298
1287 } 1299 }
1288 *addr++=0x87654321; 1300 *addr++ = 0x87654321;
1289} 1301}
1290#endif 1302#endif
1291 1303
1292static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1304static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
1293{ 1305{
1294 int size = obj_reallen(cachep); 1306 int size = obj_reallen(cachep);
1295 addr = &((char*)addr)[obj_dbghead(cachep)]; 1307 addr = &((char *)addr)[obj_dbghead(cachep)];
1296 1308
1297 memset(addr, val, size); 1309 memset(addr, val, size);
1298 *(unsigned char *)(addr+size-1) = POISON_END; 1310 *(unsigned char *)(addr + size - 1) = POISON_END;
1299} 1311}
1300 1312
1301static void dump_line(char *data, int offset, int limit) 1313static void dump_line(char *data, int offset, int limit)
1302{ 1314{
1303 int i; 1315 int i;
1304 printk(KERN_ERR "%03x:", offset); 1316 printk(KERN_ERR "%03x:", offset);
1305 for (i=0;i<limit;i++) { 1317 for (i = 0; i < limit; i++) {
1306 printk(" %02x", (unsigned char)data[offset+i]); 1318 printk(" %02x", (unsigned char)data[offset + i]);
1307 } 1319 }
1308 printk("\n"); 1320 printk("\n");
1309} 1321}
@@ -1318,24 +1330,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1318 1330
1319 if (cachep->flags & SLAB_RED_ZONE) { 1331 if (cachep->flags & SLAB_RED_ZONE) {
1320 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1332 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1321 *dbg_redzone1(cachep, objp), 1333 *dbg_redzone1(cachep, objp),
1322 *dbg_redzone2(cachep, objp)); 1334 *dbg_redzone2(cachep, objp));
1323 } 1335 }
1324 1336
1325 if (cachep->flags & SLAB_STORE_USER) { 1337 if (cachep->flags & SLAB_STORE_USER) {
1326 printk(KERN_ERR "Last user: [<%p>]", 1338 printk(KERN_ERR "Last user: [<%p>]",
1327 *dbg_userword(cachep, objp)); 1339 *dbg_userword(cachep, objp));
1328 print_symbol("(%s)", 1340 print_symbol("(%s)",
1329 (unsigned long)*dbg_userword(cachep, objp)); 1341 (unsigned long)*dbg_userword(cachep, objp));
1330 printk("\n"); 1342 printk("\n");
1331 } 1343 }
1332 realobj = (char*)objp+obj_dbghead(cachep); 1344 realobj = (char *)objp + obj_dbghead(cachep);
1333 size = obj_reallen(cachep); 1345 size = obj_reallen(cachep);
1334 for (i=0; i<size && lines;i+=16, lines--) { 1346 for (i = 0; i < size && lines; i += 16, lines--) {
1335 int limit; 1347 int limit;
1336 limit = 16; 1348 limit = 16;
1337 if (i+limit > size) 1349 if (i + limit > size)
1338 limit = size-i; 1350 limit = size - i;
1339 dump_line(realobj, i, limit); 1351 dump_line(realobj, i, limit);
1340 } 1352 }
1341} 1353}
@@ -1346,27 +1358,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1346 int size, i; 1358 int size, i;
1347 int lines = 0; 1359 int lines = 0;
1348 1360
1349 realobj = (char*)objp+obj_dbghead(cachep); 1361 realobj = (char *)objp + obj_dbghead(cachep);
1350 size = obj_reallen(cachep); 1362 size = obj_reallen(cachep);
1351 1363
1352 for (i=0;i<size;i++) { 1364 for (i = 0; i < size; i++) {
1353 char exp = POISON_FREE; 1365 char exp = POISON_FREE;
1354 if (i == size-1) 1366 if (i == size - 1)
1355 exp = POISON_END; 1367 exp = POISON_END;
1356 if (realobj[i] != exp) { 1368 if (realobj[i] != exp) {
1357 int limit; 1369 int limit;
1358 /* Mismatch ! */ 1370 /* Mismatch ! */
1359 /* Print header */ 1371 /* Print header */
1360 if (lines == 0) { 1372 if (lines == 0) {
1361 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", 1373 printk(KERN_ERR
1362 realobj, size); 1374 "Slab corruption: start=%p, len=%d\n",
1375 realobj, size);
1363 print_objinfo(cachep, objp, 0); 1376 print_objinfo(cachep, objp, 0);
1364 } 1377 }
1365 /* Hexdump the affected line */ 1378 /* Hexdump the affected line */
1366 i = (i/16)*16; 1379 i = (i / 16) * 16;
1367 limit = 16; 1380 limit = 16;
1368 if (i+limit > size) 1381 if (i + limit > size)
1369 limit = size-i; 1382 limit = size - i;
1370 dump_line(realobj, i, limit); 1383 dump_line(realobj, i, limit);
1371 i += 16; 1384 i += 16;
1372 lines++; 1385 lines++;
@@ -1382,19 +1395,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1382 struct slab *slabp = page_get_slab(virt_to_page(objp)); 1395 struct slab *slabp = page_get_slab(virt_to_page(objp));
1383 int objnr; 1396 int objnr;
1384 1397
1385 objnr = (objp-slabp->s_mem)/cachep->objsize; 1398 objnr = (objp - slabp->s_mem) / cachep->objsize;
1386 if (objnr) { 1399 if (objnr) {
1387 objp = slabp->s_mem+(objnr-1)*cachep->objsize; 1400 objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
1388 realobj = (char*)objp+obj_dbghead(cachep); 1401 realobj = (char *)objp + obj_dbghead(cachep);
1389 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1402 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1390 realobj, size); 1403 realobj, size);
1391 print_objinfo(cachep, objp, 2); 1404 print_objinfo(cachep, objp, 2);
1392 } 1405 }
1393 if (objnr+1 < cachep->num) { 1406 if (objnr + 1 < cachep->num) {
1394 objp = slabp->s_mem+(objnr+1)*cachep->objsize; 1407 objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
1395 realobj = (char*)objp+obj_dbghead(cachep); 1408 realobj = (char *)objp + obj_dbghead(cachep);
1396 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1409 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1397 realobj, size); 1410 realobj, size);
1398 print_objinfo(cachep, objp, 2); 1411 print_objinfo(cachep, objp, 2);
1399 } 1412 }
1400 } 1413 }
@@ -1405,7 +1418,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1405 * Before calling the slab must have been unlinked from the cache. 1418 * Before calling the slab must have been unlinked from the cache.
1406 * The cache-lock is not held/needed. 1419 * The cache-lock is not held/needed.
1407 */ 1420 */
1408static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) 1421static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1409{ 1422{
1410 void *addr = slabp->s_mem - slabp->colouroff; 1423 void *addr = slabp->s_mem - slabp->colouroff;
1411 1424
@@ -1416,8 +1429,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1416 1429
1417 if (cachep->flags & SLAB_POISON) { 1430 if (cachep->flags & SLAB_POISON) {
1418#ifdef CONFIG_DEBUG_PAGEALLOC 1431#ifdef CONFIG_DEBUG_PAGEALLOC
1419 if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) 1432 if ((cachep->objsize % PAGE_SIZE) == 0
1420 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); 1433 && OFF_SLAB(cachep))
1434 kernel_map_pages(virt_to_page(objp),
1435 cachep->objsize / PAGE_SIZE,
1436 1);
1421 else 1437 else
1422 check_poison_obj(cachep, objp); 1438 check_poison_obj(cachep, objp);
1423#else 1439#else
@@ -1427,20 +1443,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1427 if (cachep->flags & SLAB_RED_ZONE) { 1443 if (cachep->flags & SLAB_RED_ZONE) {
1428 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1444 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1429 slab_error(cachep, "start of a freed object " 1445 slab_error(cachep, "start of a freed object "
1430 "was overwritten"); 1446 "was overwritten");
1431 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1447 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1432 slab_error(cachep, "end of a freed object " 1448 slab_error(cachep, "end of a freed object "
1433 "was overwritten"); 1449 "was overwritten");
1434 } 1450 }
1435 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1451 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1436 (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); 1452 (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
1437 } 1453 }
1438#else 1454#else
1439 if (cachep->dtor) { 1455 if (cachep->dtor) {
1440 int i; 1456 int i;
1441 for (i = 0; i < cachep->num; i++) { 1457 for (i = 0; i < cachep->num; i++) {
1442 void* objp = slabp->s_mem+cachep->objsize*i; 1458 void *objp = slabp->s_mem + cachep->objsize * i;
1443 (cachep->dtor)(objp, cachep, 0); 1459 (cachep->dtor) (objp, cachep, 0);
1444 } 1460 }
1445 } 1461 }
1446#endif 1462#endif
@@ -1448,7 +1464,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1448 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1464 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1449 struct slab_rcu *slab_rcu; 1465 struct slab_rcu *slab_rcu;
1450 1466
1451 slab_rcu = (struct slab_rcu *) slabp; 1467 slab_rcu = (struct slab_rcu *)slabp;
1452 slab_rcu->cachep = cachep; 1468 slab_rcu->cachep = cachep;
1453 slab_rcu->addr = addr; 1469 slab_rcu->addr = addr;
1454 call_rcu(&slab_rcu->head, kmem_rcu_free); 1470 call_rcu(&slab_rcu->head, kmem_rcu_free);
@@ -1466,11 +1482,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1466 int node; 1482 int node;
1467 1483
1468 for_each_online_node(node) { 1484 for_each_online_node(node) {
1469 cachep->nodelists[node] = &initkmem_list3[index+node]; 1485 cachep->nodelists[node] = &initkmem_list3[index + node];
1470 cachep->nodelists[node]->next_reap = jiffies + 1486 cachep->nodelists[node]->next_reap = jiffies +
1471 REAPTIMEOUT_LIST3 + 1487 REAPTIMEOUT_LIST3 +
1472 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1488 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1489 }
1490}
1491
1492/**
1493 * calculate_slab_order - calculate size (page order) of slabs and the number
1494 * of objects per slab.
1495 *
1496 * This could be made much more intelligent. For now, try to avoid using
1497 * high order pages for slabs. When the gfp() functions are more friendly
1498 * towards high-order requests, this should be changed.
1499 */
1500static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1501 size_t align, gfp_t flags)
1502{
1503 size_t left_over = 0;
1504
1505 for (;; cachep->gfporder++) {
1506 unsigned int num;
1507 size_t remainder;
1508
1509 if (cachep->gfporder > MAX_GFP_ORDER) {
1510 cachep->num = 0;
1511 break;
1512 }
1513
1514 cache_estimate(cachep->gfporder, size, align, flags,
1515 &remainder, &num);
1516 if (!num)
1517 continue;
1518 /* More than offslab_limit objects will cause problems */
1519 if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
1520 break;
1521
1522 cachep->num = num;
1523 left_over = remainder;
1524
1525 /*
1526 * Large number of objects is good, but very large slabs are
1527 * currently bad for the gfp()s.
1528 */
1529 if (cachep->gfporder >= slab_break_gfp_order)
1530 break;
1531
1532 if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
1533 /* Acceptable internal fragmentation */
1534 break;
1473 } 1535 }
1536 return left_over;
1474} 1537}
1475 1538
1476/** 1539/**
@@ -1519,14 +1582,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1519 * Sanity checks... these are all serious usage bugs. 1582 * Sanity checks... these are all serious usage bugs.
1520 */ 1583 */
1521 if ((!name) || 1584 if ((!name) ||
1522 in_interrupt() || 1585 in_interrupt() ||
1523 (size < BYTES_PER_WORD) || 1586 (size < BYTES_PER_WORD) ||
1524 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || 1587 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1525 (dtor && !ctor)) { 1588 printk(KERN_ERR "%s: Early error in slab %s\n",
1526 printk(KERN_ERR "%s: Early error in slab %s\n", 1589 __FUNCTION__, name);
1527 __FUNCTION__, name); 1590 BUG();
1528 BUG(); 1591 }
1529 }
1530 1592
1531 down(&cache_chain_sem); 1593 down(&cache_chain_sem);
1532 1594
@@ -1546,11 +1608,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1546 set_fs(old_fs); 1608 set_fs(old_fs);
1547 if (res) { 1609 if (res) {
1548 printk("SLAB: cache with size %d has lost its name\n", 1610 printk("SLAB: cache with size %d has lost its name\n",
1549 pc->objsize); 1611 pc->objsize);
1550 continue; 1612 continue;
1551 } 1613 }
1552 1614
1553 if (!strcmp(pc->name,name)) { 1615 if (!strcmp(pc->name, name)) {
1554 printk("kmem_cache_create: duplicate cache %s\n", name); 1616 printk("kmem_cache_create: duplicate cache %s\n", name);
1555 dump_stack(); 1617 dump_stack();
1556 goto oops; 1618 goto oops;
@@ -1562,10 +1624,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1562 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 1624 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1563 /* No constructor, but inital state check requested */ 1625 /* No constructor, but inital state check requested */
1564 printk(KERN_ERR "%s: No con, but init state check " 1626 printk(KERN_ERR "%s: No con, but init state check "
1565 "requested - %s\n", __FUNCTION__, name); 1627 "requested - %s\n", __FUNCTION__, name);
1566 flags &= ~SLAB_DEBUG_INITIAL; 1628 flags &= ~SLAB_DEBUG_INITIAL;
1567 } 1629 }
1568
1569#if FORCED_DEBUG 1630#if FORCED_DEBUG
1570 /* 1631 /*
1571 * Enable redzoning and last user accounting, except for caches with 1632 * Enable redzoning and last user accounting, except for caches with
@@ -1573,8 +1634,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1573 * above the next power of two: caches with object sizes just above a 1634 * above the next power of two: caches with object sizes just above a
1574 * power of two have a significant amount of internal fragmentation. 1635 * power of two have a significant amount of internal fragmentation.
1575 */ 1636 */
1576 if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) 1637 if ((size < 4096
1577 flags |= SLAB_RED_ZONE|SLAB_STORE_USER; 1638 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1639 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1578 if (!(flags & SLAB_DESTROY_BY_RCU)) 1640 if (!(flags & SLAB_DESTROY_BY_RCU))
1579 flags |= SLAB_POISON; 1641 flags |= SLAB_POISON;
1580#endif 1642#endif
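The FORCED_DEBUG condition reflowed above decides whether the extra debug words (two redzones plus the last-user pointer) are worth their cost: they are added for any object smaller than 4096 bytes, and for larger objects only when fls(size - 1) is unchanged by the padding, i.e. when the padded size stays under the same power-of-two boundary. A small userspace illustration, using a local fls() built on a GCC builtin and assuming BYTES_PER_WORD is sizeof(void *) as in slab.c:

#include <stdio.h>
#include <stddef.h>

#define BYTES_PER_WORD	sizeof(void *)	/* as in slab.c */

/* Index (1-based) of the most-significant set bit, 0 for x == 0; a local
 * stand-in for the kernel's fls(), built on a GCC builtin. */
static int fls(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

/* Would FORCED_DEBUG add redzones and last-user tracking for this size? */
static int wants_debug(size_t size)
{
	return size < 4096 ||
	       fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD);
}

int main(void)
{
	size_t sizes[] = { 4096, 4100, 8192, 8200 };
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("size %5zu: debug padding %s\n", sizes[i],
		       wants_debug(sizes[i]) ? "added" : "skipped");
	return 0;
}

So an object of exactly 4096 or 8192 bytes skips the padding (it would spill past the power-of-two boundary), while sizes just above those boundaries keep it.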
@@ -1595,9 +1657,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1595 * unaligned accesses for some archs when redzoning is used, and makes 1657 * unaligned accesses for some archs when redzoning is used, and makes
1596 * sure any on-slab bufctl's are also correctly aligned. 1658 * sure any on-slab bufctl's are also correctly aligned.
1597 */ 1659 */
1598 if (size & (BYTES_PER_WORD-1)) { 1660 if (size & (BYTES_PER_WORD - 1)) {
1599 size += (BYTES_PER_WORD-1); 1661 size += (BYTES_PER_WORD - 1);
1600 size &= ~(BYTES_PER_WORD-1); 1662 size &= ~(BYTES_PER_WORD - 1);
1601 } 1663 }
1602 1664
1603 /* calculate out the final buffer alignment: */ 1665 /* calculate out the final buffer alignment: */
@@ -1608,7 +1670,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1608 * objects into one cacheline. 1670 * objects into one cacheline.
1609 */ 1671 */
1610 ralign = cache_line_size(); 1672 ralign = cache_line_size();
1611 while (size <= ralign/2) 1673 while (size <= ralign / 2)
1612 ralign /= 2; 1674 ralign /= 2;
1613 } else { 1675 } else {
1614 ralign = BYTES_PER_WORD; 1676 ralign = BYTES_PER_WORD;
@@ -1617,13 +1679,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1617 if (ralign < ARCH_SLAB_MINALIGN) { 1679 if (ralign < ARCH_SLAB_MINALIGN) {
1618 ralign = ARCH_SLAB_MINALIGN; 1680 ralign = ARCH_SLAB_MINALIGN;
1619 if (ralign > BYTES_PER_WORD) 1681 if (ralign > BYTES_PER_WORD)
1620 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1682 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1621 } 1683 }
1622 /* 3) caller mandated alignment: disables debug if necessary */ 1684 /* 3) caller mandated alignment: disables debug if necessary */
1623 if (ralign < align) { 1685 if (ralign < align) {
1624 ralign = align; 1686 ralign = align;
1625 if (ralign > BYTES_PER_WORD) 1687 if (ralign > BYTES_PER_WORD)
1626 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1688 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1627 } 1689 }
1628 /* 4) Store it. Note that the debug code below can reduce 1690 /* 4) Store it. Note that the debug code below can reduce
1629 * the alignment to BYTES_PER_WORD. 1691 * the alignment to BYTES_PER_WORD.
@@ -1645,7 +1707,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1645 1707
1646 /* add space for red zone words */ 1708 /* add space for red zone words */
1647 cachep->dbghead += BYTES_PER_WORD; 1709 cachep->dbghead += BYTES_PER_WORD;
1648 size += 2*BYTES_PER_WORD; 1710 size += 2 * BYTES_PER_WORD;
1649 } 1711 }
1650 if (flags & SLAB_STORE_USER) { 1712 if (flags & SLAB_STORE_USER) {
1651 /* user store requires word alignment and 1713 /* user store requires word alignment and
@@ -1656,7 +1718,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1656 size += BYTES_PER_WORD; 1718 size += BYTES_PER_WORD;
1657 } 1719 }
1658#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1720#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1659 if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1721 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1722 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1660 cachep->dbghead += PAGE_SIZE - size; 1723 cachep->dbghead += PAGE_SIZE - size;
1661 size = PAGE_SIZE; 1724 size = PAGE_SIZE;
1662 } 1725 }
@@ -1664,7 +1727,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1664#endif 1727#endif
1665 1728
1666 /* Determine if the slab management is 'on' or 'off' slab. */ 1729 /* Determine if the slab management is 'on' or 'off' slab. */
1667 if (size >= (PAGE_SIZE>>3)) 1730 if (size >= (PAGE_SIZE >> 3))
1668 /* 1731 /*
1669 * Size is large, assume best to place the slab management obj 1732 * Size is large, assume best to place the slab management obj
1670 * off-slab (should allow better packing of objs). 1733 * off-slab (should allow better packing of objs).
@@ -1681,47 +1744,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1681 */ 1744 */
1682 cachep->gfporder = 0; 1745 cachep->gfporder = 0;
1683 cache_estimate(cachep->gfporder, size, align, flags, 1746 cache_estimate(cachep->gfporder, size, align, flags,
1684 &left_over, &cachep->num); 1747 &left_over, &cachep->num);
1685 } else { 1748 } else
1686 /* 1749 left_over = calculate_slab_order(cachep, size, align, flags);
1687 * Calculate size (in pages) of slabs, and the num of objs per
1688 * slab. This could be made much more intelligent. For now,
1689 * try to avoid using high page-orders for slabs. When the
1690 * gfp() funcs are more friendly towards high-order requests,
1691 * this should be changed.
1692 */
1693 do {
1694 unsigned int break_flag = 0;
1695cal_wastage:
1696 cache_estimate(cachep->gfporder, size, align, flags,
1697 &left_over, &cachep->num);
1698 if (break_flag)
1699 break;
1700 if (cachep->gfporder >= MAX_GFP_ORDER)
1701 break;
1702 if (!cachep->num)
1703 goto next;
1704 if (flags & CFLGS_OFF_SLAB &&
1705 cachep->num > offslab_limit) {
1706 /* This num of objs will cause problems. */
1707 cachep->gfporder--;
1708 break_flag++;
1709 goto cal_wastage;
1710 }
1711
1712 /*
1713 * Large num of objs is good, but v. large slabs are
1714 * currently bad for the gfp()s.
1715 */
1716 if (cachep->gfporder >= slab_break_gfp_order)
1717 break;
1718
1719 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1720 break; /* Acceptable internal fragmentation. */
1721next:
1722 cachep->gfporder++;
1723 } while (1);
1724 }
1725 1750
1726 if (!cachep->num) { 1751 if (!cachep->num) {
1727 printk("kmem_cache_create: couldn't create cache %s.\n", name); 1752 printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1729,8 +1754,8 @@ next:
1729 cachep = NULL; 1754 cachep = NULL;
1730 goto oops; 1755 goto oops;
1731 } 1756 }
1732 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) 1757 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
1733 + sizeof(struct slab), align); 1758 + sizeof(struct slab), align);
1734 1759
1735 /* 1760 /*
1736 * If the slab has been placed off-slab, and we have enough space then 1761 * If the slab has been placed off-slab, and we have enough space then
@@ -1743,14 +1768,15 @@ next:
1743 1768
1744 if (flags & CFLGS_OFF_SLAB) { 1769 if (flags & CFLGS_OFF_SLAB) {
1745 /* really off slab. No need for manual alignment */ 1770 /* really off slab. No need for manual alignment */
1746 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); 1771 slab_size =
1772 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
1747 } 1773 }
1748 1774
1749 cachep->colour_off = cache_line_size(); 1775 cachep->colour_off = cache_line_size();
1750 /* Offset must be a multiple of the alignment. */ 1776 /* Offset must be a multiple of the alignment. */
1751 if (cachep->colour_off < align) 1777 if (cachep->colour_off < align)
1752 cachep->colour_off = align; 1778 cachep->colour_off = align;
1753 cachep->colour = left_over/cachep->colour_off; 1779 cachep->colour = left_over / cachep->colour_off;
1754 cachep->slab_size = slab_size; 1780 cachep->slab_size = slab_size;
1755 cachep->flags = flags; 1781 cachep->flags = flags;
1756 cachep->gfpflags = 0; 1782 cachep->gfpflags = 0;
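cachep->colour computed above is simply how many distinct cache-colour offsets fit into the leftover space of one slab: left_over / colour_off, where colour_off is at least a cache line and at least the requested alignment. Successive slabs then start their objects at 0, colour_off, 2 * colour_off, and so on, wrapping around, so objects from different slabs do not all compete for the same cache lines. A toy illustration with made-up numbers:

#include <stdio.h>

/* Slab colouring with made-up numbers: each new slab shifts its first object
 * by one more colour_off, cycling once the leftover space is used up. */
int main(void)
{
	unsigned int colour_off  = 64;		/* >= cache line size and >= align */
	unsigned int left_over   = 384;		/* unused bytes per slab */
	unsigned int colour      = left_over / colour_off;	/* 6 distinct offsets */
	unsigned int colour_next = 0;
	int slab;

	for (slab = 0; slab < 8; slab++) {
		unsigned int offset = colour_next * colour_off;

		printf("slab %d: first object at byte offset %u\n", slab, offset);
		if (++colour_next >= colour)
			colour_next = 0;	/* wrap, as cache_grow() does */
	}
	return 0;
}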
@@ -1777,7 +1803,7 @@ next:
1777 * the creation of further caches will BUG(). 1803 * the creation of further caches will BUG().
1778 */ 1804 */
1779 cachep->array[smp_processor_id()] = 1805 cachep->array[smp_processor_id()] =
1780 &initarray_generic.cache; 1806 &initarray_generic.cache;
1781 1807
1782 /* If the cache that's used by 1808 /* If the cache that's used by
1783 * kmalloc(sizeof(kmem_list3)) is the first cache, 1809 * kmalloc(sizeof(kmem_list3)) is the first cache,
@@ -1791,8 +1817,7 @@ next:
1791 g_cpucache_up = PARTIAL_AC; 1817 g_cpucache_up = PARTIAL_AC;
1792 } else { 1818 } else {
1793 cachep->array[smp_processor_id()] = 1819 cachep->array[smp_processor_id()] =
1794 kmalloc(sizeof(struct arraycache_init), 1820 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1795 GFP_KERNEL);
1796 1821
1797 if (g_cpucache_up == PARTIAL_AC) { 1822 if (g_cpucache_up == PARTIAL_AC) {
1798 set_up_list3s(cachep, SIZE_L3); 1823 set_up_list3s(cachep, SIZE_L3);
@@ -1802,16 +1827,18 @@ next:
1802 for_each_online_node(node) { 1827 for_each_online_node(node) {
1803 1828
1804 cachep->nodelists[node] = 1829 cachep->nodelists[node] =
1805 kmalloc_node(sizeof(struct kmem_list3), 1830 kmalloc_node(sizeof
1806 GFP_KERNEL, node); 1831 (struct kmem_list3),
1832 GFP_KERNEL, node);
1807 BUG_ON(!cachep->nodelists[node]); 1833 BUG_ON(!cachep->nodelists[node]);
1808 kmem_list3_init(cachep->nodelists[node]); 1834 kmem_list3_init(cachep->
1835 nodelists[node]);
1809 } 1836 }
1810 } 1837 }
1811 } 1838 }
1812 cachep->nodelists[numa_node_id()]->next_reap = 1839 cachep->nodelists[numa_node_id()]->next_reap =
1813 jiffies + REAPTIMEOUT_LIST3 + 1840 jiffies + REAPTIMEOUT_LIST3 +
1814 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1841 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1815 1842
1816 BUG_ON(!ac_data(cachep)); 1843 BUG_ON(!ac_data(cachep));
1817 ac_data(cachep)->avail = 0; 1844 ac_data(cachep)->avail = 0;
@@ -1820,15 +1847,15 @@ next:
1820 ac_data(cachep)->touched = 0; 1847 ac_data(cachep)->touched = 0;
1821 cachep->batchcount = 1; 1848 cachep->batchcount = 1;
1822 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1849 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1823 } 1850 }
1824 1851
1825 /* cache setup completed, link it into the list */ 1852 /* cache setup completed, link it into the list */
1826 list_add(&cachep->next, &cache_chain); 1853 list_add(&cachep->next, &cache_chain);
1827 unlock_cpu_hotplug(); 1854 unlock_cpu_hotplug();
1828oops: 1855 oops:
1829 if (!cachep && (flags & SLAB_PANIC)) 1856 if (!cachep && (flags & SLAB_PANIC))
1830 panic("kmem_cache_create(): failed to create slab `%s'\n", 1857 panic("kmem_cache_create(): failed to create slab `%s'\n",
1831 name); 1858 name);
1832 up(&cache_chain_sem); 1859 up(&cache_chain_sem);
1833 return cachep; 1860 return cachep;
1834} 1861}
@@ -1871,7 +1898,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1871/* 1898/*
1872 * Waits for all CPUs to execute func(). 1899 * Waits for all CPUs to execute func().
1873 */ 1900 */
1874static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1901static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1875{ 1902{
1876 check_irq_on(); 1903 check_irq_on();
1877 preempt_disable(); 1904 preempt_disable();
@@ -1886,12 +1913,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1886 preempt_enable(); 1913 preempt_enable();
1887} 1914}
1888 1915
1889static void drain_array_locked(kmem_cache_t* cachep, 1916static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
1890 struct array_cache *ac, int force, int node); 1917 int force, int node);
1891 1918
1892static void do_drain(void *arg) 1919static void do_drain(void *arg)
1893{ 1920{
1894 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1921 kmem_cache_t *cachep = (kmem_cache_t *) arg;
1895 struct array_cache *ac; 1922 struct array_cache *ac;
1896 int node = numa_node_id(); 1923 int node = numa_node_id();
1897 1924
@@ -1911,7 +1938,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep)
1911 smp_call_function_all_cpus(do_drain, cachep); 1938 smp_call_function_all_cpus(do_drain, cachep);
1912 check_irq_on(); 1939 check_irq_on();
1913 spin_lock_irq(&cachep->spinlock); 1940 spin_lock_irq(&cachep->spinlock);
1914 for_each_online_node(node) { 1941 for_each_online_node(node) {
1915 l3 = cachep->nodelists[node]; 1942 l3 = cachep->nodelists[node];
1916 if (l3) { 1943 if (l3) {
1917 spin_lock(&l3->list_lock); 1944 spin_lock(&l3->list_lock);
@@ -1949,8 +1976,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
1949 slab_destroy(cachep, slabp); 1976 slab_destroy(cachep, slabp);
1950 spin_lock_irq(&l3->list_lock); 1977 spin_lock_irq(&l3->list_lock);
1951 } 1978 }
1952 ret = !list_empty(&l3->slabs_full) || 1979 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
1953 !list_empty(&l3->slabs_partial);
1954 return ret; 1980 return ret;
1955} 1981}
1956 1982
@@ -2006,7 +2032,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2006 * The caller must guarantee that noone will allocate memory from the cache 2032 * The caller must guarantee that noone will allocate memory from the cache
2007 * during the kmem_cache_destroy(). 2033 * during the kmem_cache_destroy().
2008 */ 2034 */
2009int kmem_cache_destroy(kmem_cache_t * cachep) 2035int kmem_cache_destroy(kmem_cache_t *cachep)
2010{ 2036{
2011 int i; 2037 int i;
2012 struct kmem_list3 *l3; 2038 struct kmem_list3 *l3;
@@ -2028,7 +2054,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2028 if (__cache_shrink(cachep)) { 2054 if (__cache_shrink(cachep)) {
2029 slab_error(cachep, "Can't free all objects"); 2055 slab_error(cachep, "Can't free all objects");
2030 down(&cache_chain_sem); 2056 down(&cache_chain_sem);
2031 list_add(&cachep->next,&cache_chain); 2057 list_add(&cachep->next, &cache_chain);
2032 up(&cache_chain_sem); 2058 up(&cache_chain_sem);
2033 unlock_cpu_hotplug(); 2059 unlock_cpu_hotplug();
2034 return 1; 2060 return 1;
@@ -2038,7 +2064,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2038 synchronize_rcu(); 2064 synchronize_rcu();
2039 2065
2040 for_each_online_cpu(i) 2066 for_each_online_cpu(i)
2041 kfree(cachep->array[i]); 2067 kfree(cachep->array[i]);
2042 2068
2043 /* NUMA: free the list3 structures */ 2069 /* NUMA: free the list3 structures */
2044 for_each_online_node(i) { 2070 for_each_online_node(i) {
@@ -2057,39 +2083,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2057EXPORT_SYMBOL(kmem_cache_destroy); 2083EXPORT_SYMBOL(kmem_cache_destroy);
2058 2084
2059/* Get the memory for a slab management obj. */ 2085/* Get the memory for a slab management obj. */
2060static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2086static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
2061 int colour_off, gfp_t local_flags) 2087 int colour_off, gfp_t local_flags)
2062{ 2088{
2063 struct slab *slabp; 2089 struct slab *slabp;
2064 2090
2065 if (OFF_SLAB(cachep)) { 2091 if (OFF_SLAB(cachep)) {
2066 /* Slab management obj is off-slab. */ 2092 /* Slab management obj is off-slab. */
2067 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2093 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2068 if (!slabp) 2094 if (!slabp)
2069 return NULL; 2095 return NULL;
2070 } else { 2096 } else {
2071 slabp = objp+colour_off; 2097 slabp = objp + colour_off;
2072 colour_off += cachep->slab_size; 2098 colour_off += cachep->slab_size;
2073 } 2099 }
2074 slabp->inuse = 0; 2100 slabp->inuse = 0;
2075 slabp->colouroff = colour_off; 2101 slabp->colouroff = colour_off;
2076 slabp->s_mem = objp+colour_off; 2102 slabp->s_mem = objp + colour_off;
2077 2103
2078 return slabp; 2104 return slabp;
2079} 2105}
2080 2106
2081static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2107static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2082{ 2108{
2083 return (kmem_bufctl_t *)(slabp+1); 2109 return (kmem_bufctl_t *) (slabp + 1);
2084} 2110}
2085 2111
2086static void cache_init_objs(kmem_cache_t *cachep, 2112static void cache_init_objs(kmem_cache_t *cachep,
2087 struct slab *slabp, unsigned long ctor_flags) 2113 struct slab *slabp, unsigned long ctor_flags)
2088{ 2114{
2089 int i; 2115 int i;
2090 2116
2091 for (i = 0; i < cachep->num; i++) { 2117 for (i = 0; i < cachep->num; i++) {
2092 void *objp = slabp->s_mem+cachep->objsize*i; 2118 void *objp = slabp->s_mem + cachep->objsize * i;
2093#if DEBUG 2119#if DEBUG
2094 /* need to poison the objs? */ 2120 /* need to poison the objs? */
2095 if (cachep->flags & SLAB_POISON) 2121 if (cachep->flags & SLAB_POISON)
@@ -2107,25 +2133,28 @@ static void cache_init_objs(kmem_cache_t *cachep,
2107 * Otherwise, deadlock. They must also be threaded. 2133 * Otherwise, deadlock. They must also be threaded.
2108 */ 2134 */
2109 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2135 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2110 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 2136 cachep->ctor(objp + obj_dbghead(cachep), cachep,
2137 ctor_flags);
2111 2138
2112 if (cachep->flags & SLAB_RED_ZONE) { 2139 if (cachep->flags & SLAB_RED_ZONE) {
2113 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2140 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2114 slab_error(cachep, "constructor overwrote the" 2141 slab_error(cachep, "constructor overwrote the"
2115 " end of an object"); 2142 " end of an object");
2116 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2143 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2117 slab_error(cachep, "constructor overwrote the" 2144 slab_error(cachep, "constructor overwrote the"
2118 " start of an object"); 2145 " start of an object");
2119 } 2146 }
2120 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2147 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2121 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2148 && cachep->flags & SLAB_POISON)
2149 kernel_map_pages(virt_to_page(objp),
2150 cachep->objsize / PAGE_SIZE, 0);
2122#else 2151#else
2123 if (cachep->ctor) 2152 if (cachep->ctor)
2124 cachep->ctor(objp, cachep, ctor_flags); 2153 cachep->ctor(objp, cachep, ctor_flags);
2125#endif 2154#endif
2126 slab_bufctl(slabp)[i] = i+1; 2155 slab_bufctl(slabp)[i] = i + 1;
2127 } 2156 }
2128 slab_bufctl(slabp)[i-1] = BUFCTL_END; 2157 slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2129 slabp->free = 0; 2158 slabp->free = 0;
2130} 2159}
2131 2160
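cache_init_objs() above also threads the per-slab free list: slab_bufctl(slabp) is an index array stored directly behind the slab descriptor, entry i holds the index of the next free object, the final entry is BUFCTL_END, and slabp->free points at the head. Allocation pops the head; the free path pushes an index back. A self-contained model of that bookkeeping (the array size and sentinel value are illustrative):

#include <stdio.h>

#define NUM_OBJS	4
#define BUFCTL_END	0xffffffffU	/* end-of-list sentinel, in the same spirit */

static unsigned int bufctl[NUM_OBJS];	/* bufctl[i]: index of the next free object */
static unsigned int free_head;		/* plays the role of slabp->free */

static void init_objs(void)
{
	unsigned int i;

	for (i = 0; i < NUM_OBJS; i++)
		bufctl[i] = i + 1;
	bufctl[NUM_OBJS - 1] = BUFCTL_END;
	free_head = 0;
}

static unsigned int alloc_obj(void)
{
	unsigned int i = free_head;

	if (i == BUFCTL_END)
		return BUFCTL_END;	/* slab is full */
	free_head = bufctl[i];		/* pop the head of the list */
	return i;
}

static void free_obj(unsigned int i)
{
	bufctl[i] = free_head;		/* push the index back onto the head */
	free_head = i;
}

int main(void)
{
	unsigned int a, b;

	init_objs();
	a = alloc_obj();
	b = alloc_obj();
	printf("allocated objects %u and %u\n", a, b);
	free_obj(a);
	printf("after freeing %u, the next allocation returns %u\n",
	       a, alloc_obj());
	return 0;
}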
@@ -2161,17 +2190,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2161 */ 2190 */
2162static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2191static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2163{ 2192{
2164 struct slab *slabp; 2193 struct slab *slabp;
2165 void *objp; 2194 void *objp;
2166 size_t offset; 2195 size_t offset;
2167 gfp_t local_flags; 2196 gfp_t local_flags;
2168 unsigned long ctor_flags; 2197 unsigned long ctor_flags;
2169 struct kmem_list3 *l3; 2198 struct kmem_list3 *l3;
2170 2199
2171 /* Be lazy and only check for valid flags here, 2200 /* Be lazy and only check for valid flags here,
2172 * keeping it out of the critical path in kmem_cache_alloc(). 2201 * keeping it out of the critical path in kmem_cache_alloc().
2173 */ 2202 */
2174 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 2203 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2175 BUG(); 2204 BUG();
2176 if (flags & SLAB_NO_GROW) 2205 if (flags & SLAB_NO_GROW)
2177 return 0; 2206 return 0;
@@ -2237,9 +2266,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2237 l3->free_objects += cachep->num; 2266 l3->free_objects += cachep->num;
2238 spin_unlock(&l3->list_lock); 2267 spin_unlock(&l3->list_lock);
2239 return 1; 2268 return 1;
2240opps1: 2269 opps1:
2241 kmem_freepages(cachep, objp); 2270 kmem_freepages(cachep, objp);
2242failed: 2271 failed:
2243 if (local_flags & __GFP_WAIT) 2272 if (local_flags & __GFP_WAIT)
2244 local_irq_disable(); 2273 local_irq_disable();
2245 return 0; 2274 return 0;
@@ -2259,18 +2288,19 @@ static void kfree_debugcheck(const void *objp)
2259 2288
2260 if (!virt_addr_valid(objp)) { 2289 if (!virt_addr_valid(objp)) {
2261 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2290 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2262 (unsigned long)objp); 2291 (unsigned long)objp);
2263 BUG(); 2292 BUG();
2264 } 2293 }
2265 page = virt_to_page(objp); 2294 page = virt_to_page(objp);
2266 if (!PageSlab(page)) { 2295 if (!PageSlab(page)) {
2267 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 2296 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2297 (unsigned long)objp);
2268 BUG(); 2298 BUG();
2269 } 2299 }
2270} 2300}
2271 2301
2272static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2302static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2273 void *caller) 2303 void *caller)
2274{ 2304{
2275 struct page *page; 2305 struct page *page;
2276 unsigned int objnr; 2306 unsigned int objnr;
@@ -2281,20 +2311,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2281 page = virt_to_page(objp); 2311 page = virt_to_page(objp);
2282 2312
2283 if (page_get_cache(page) != cachep) { 2313 if (page_get_cache(page) != cachep) {
2284 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2314 printk(KERN_ERR
2285 page_get_cache(page),cachep); 2315 "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2316 page_get_cache(page), cachep);
2286 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2317 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2287 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name); 2318 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2319 page_get_cache(page)->name);
2288 WARN_ON(1); 2320 WARN_ON(1);
2289 } 2321 }
2290 slabp = page_get_slab(page); 2322 slabp = page_get_slab(page);
2291 2323
2292 if (cachep->flags & SLAB_RED_ZONE) { 2324 if (cachep->flags & SLAB_RED_ZONE) {
2293 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2325 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
2294 slab_error(cachep, "double free, or memory outside" 2326 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2295 " object was overwritten"); 2327 slab_error(cachep,
2296 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2328 "double free, or memory outside"
2297 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2329 " object was overwritten");
2330 printk(KERN_ERR
2331 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2332 objp, *dbg_redzone1(cachep, objp),
2333 *dbg_redzone2(cachep, objp));
2298 } 2334 }
2299 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2335 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2300 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2336 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
@@ -2302,30 +2338,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2302 if (cachep->flags & SLAB_STORE_USER) 2338 if (cachep->flags & SLAB_STORE_USER)
2303 *dbg_userword(cachep, objp) = caller; 2339 *dbg_userword(cachep, objp) = caller;
2304 2340
2305 objnr = (objp-slabp->s_mem)/cachep->objsize; 2341 objnr = (objp - slabp->s_mem) / cachep->objsize;
2306 2342
2307 BUG_ON(objnr >= cachep->num); 2343 BUG_ON(objnr >= cachep->num);
2308 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 2344 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
2309 2345
2310 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2346 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2311 /* Need to call the slab's constructor so the 2347 /* Need to call the slab's constructor so the
2312 * caller can perform a verify of its state (debugging). 2348 * caller can perform a verify of its state (debugging).
2313 * Called without the cache-lock held. 2349 * Called without the cache-lock held.
2314 */ 2350 */
2315 cachep->ctor(objp+obj_dbghead(cachep), 2351 cachep->ctor(objp + obj_dbghead(cachep),
2316 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 2352 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2317 } 2353 }
2318 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2354 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2319 /* we want to cache poison the object, 2355 /* we want to cache poison the object,
2320 * call the destruction callback 2356 * call the destruction callback
2321 */ 2357 */
2322 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 2358 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
2323 } 2359 }
2324 if (cachep->flags & SLAB_POISON) { 2360 if (cachep->flags & SLAB_POISON) {
2325#ifdef CONFIG_DEBUG_PAGEALLOC 2361#ifdef CONFIG_DEBUG_PAGEALLOC
2326 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2362 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2327 store_stackinfo(cachep, objp, (unsigned long)caller); 2363 store_stackinfo(cachep, objp, (unsigned long)caller);
2328 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2364 kernel_map_pages(virt_to_page(objp),
2365 cachep->objsize / PAGE_SIZE, 0);
2329 } else { 2366 } else {
2330 poison_obj(cachep, objp, POISON_FREE); 2367 poison_obj(cachep, objp, POISON_FREE);
2331 } 2368 }
@@ -2340,7 +2377,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2340{ 2377{
2341 kmem_bufctl_t i; 2378 kmem_bufctl_t i;
2342 int entries = 0; 2379 int entries = 0;
2343 2380
2344 /* Check slab's freelist to see if this obj is there. */ 2381 /* Check slab's freelist to see if this obj is there. */
2345 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2382 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2346 entries++; 2383 entries++;
@@ -2348,13 +2385,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2348 goto bad; 2385 goto bad;
2349 } 2386 }
2350 if (entries != cachep->num - slabp->inuse) { 2387 if (entries != cachep->num - slabp->inuse) {
2351bad: 2388 bad:
2352 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2389 printk(KERN_ERR
2353 cachep->name, cachep->num, slabp, slabp->inuse); 2390 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2354 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 2391 cachep->name, cachep->num, slabp, slabp->inuse);
2355 if ((i%16)==0) 2392 for (i = 0;
2393 i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
2394 i++) {
2395 if ((i % 16) == 0)
2356 printk("\n%03x:", i); 2396 printk("\n%03x:", i);
2357 printk(" %02x", ((unsigned char*)slabp)[i]); 2397 printk(" %02x", ((unsigned char *)slabp)[i]);
2358 } 2398 }
2359 printk("\n"); 2399 printk("\n");
2360 BUG(); 2400 BUG();
@@ -2374,7 +2414,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2374 2414
2375 check_irq_off(); 2415 check_irq_off();
2376 ac = ac_data(cachep); 2416 ac = ac_data(cachep);
2377retry: 2417 retry:
2378 batchcount = ac->batchcount; 2418 batchcount = ac->batchcount;
2379 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2419 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2380 /* if there was little recent activity on this 2420 /* if there was little recent activity on this
@@ -2396,8 +2436,8 @@ retry:
2396 shared_array->avail -= batchcount; 2436 shared_array->avail -= batchcount;
2397 ac->avail = batchcount; 2437 ac->avail = batchcount;
2398 memcpy(ac->entry, 2438 memcpy(ac->entry,
2399 &(shared_array->entry[shared_array->avail]), 2439 &(shared_array->entry[shared_array->avail]),
2400 sizeof(void*)*batchcount); 2440 sizeof(void *) * batchcount);
2401 shared_array->touched = 1; 2441 shared_array->touched = 1;
2402 goto alloc_done; 2442 goto alloc_done;
2403 } 2443 }
@@ -2425,7 +2465,7 @@ retry:
2425 2465
2426 /* get obj pointer */ 2466 /* get obj pointer */
2427 ac->entry[ac->avail++] = slabp->s_mem + 2467 ac->entry[ac->avail++] = slabp->s_mem +
2428 slabp->free*cachep->objsize; 2468 slabp->free * cachep->objsize;
2429 2469
2430 slabp->inuse++; 2470 slabp->inuse++;
2431 next = slab_bufctl(slabp)[slabp->free]; 2471 next = slab_bufctl(slabp)[slabp->free];
@@ -2433,7 +2473,7 @@ retry:
2433 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2473 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2434 WARN_ON(numa_node_id() != slabp->nodeid); 2474 WARN_ON(numa_node_id() != slabp->nodeid);
2435#endif 2475#endif
2436 slabp->free = next; 2476 slabp->free = next;
2437 } 2477 }
2438 check_slabp(cachep, slabp); 2478 check_slabp(cachep, slabp);
2439 2479
@@ -2445,9 +2485,9 @@ retry:
2445 list_add(&slabp->list, &l3->slabs_partial); 2485 list_add(&slabp->list, &l3->slabs_partial);
2446 } 2486 }
2447 2487
2448must_grow: 2488 must_grow:
2449 l3->free_objects -= ac->avail; 2489 l3->free_objects -= ac->avail;
2450alloc_done: 2490 alloc_done:
2451 spin_unlock(&l3->list_lock); 2491 spin_unlock(&l3->list_lock);
2452 2492
2453 if (unlikely(!ac->avail)) { 2493 if (unlikely(!ac->avail)) {
@@ -2459,7 +2499,7 @@ alloc_done:
2459 if (!x && ac->avail == 0) // no objects in sight? abort 2499 if (!x && ac->avail == 0) // no objects in sight? abort
2460 return NULL; 2500 return NULL;
2461 2501
2462 if (!ac->avail) // objects refilled by interrupt? 2502 if (!ac->avail) // objects refilled by interrupt?
2463 goto retry; 2503 goto retry;
2464 } 2504 }
2465 ac->touched = 1; 2505 ac->touched = 1;
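For context on the refill path reindented above: the per-CPU array cache behaves like a small LIFO stack of object pointers, the hot allocation path simply pops the newest entry, and cache_alloc_refill() only runs when that stack is empty, pulling a whole batch from the shared array or the slab lists before retrying. A toy userspace model of that behaviour; refill() here just fabricates object numbers instead of touching slab lists, and the sizes are made up:

#include <stdio.h>

#define AC_LIMIT 8

/* Toy per-CPU array cache: a small LIFO stack of object "pointers". */
struct array_cache {
	int avail;
	int limit;
	int entry[AC_LIMIT];
};

/* Stand-in for cache_alloc_refill(): fabricates a batch of objects instead
 * of pulling them from the shared array or the slab lists. */
static int refill(struct array_cache *ac, int batchcount)
{
	static int next_obj = 100;
	int i;

	for (i = 0; i < batchcount && ac->avail < ac->limit; i++)
		ac->entry[ac->avail++] = next_obj++;
	return ac->avail;
}

static int cache_alloc(struct array_cache *ac, int batchcount)
{
	if (ac->avail == 0 && !refill(ac, batchcount))
		return -1;			/* nothing in sight: fail */
	return ac->entry[--ac->avail];		/* hot path: pop the newest entry */
}

int main(void)
{
	struct array_cache ac = { .avail = 0, .limit = AC_LIMIT };
	int i;

	for (i = 0; i < 5; i++) {
		int obj = cache_alloc(&ac, 4);

		printf("allocated object %d (avail now %d)\n", obj, ac.avail);
	}
	return 0;
}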
@@ -2476,16 +2516,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2476} 2516}
2477 2517
2478#if DEBUG 2518#if DEBUG
2479static void * 2519static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2480cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2520 void *objp, void *caller)
2481 gfp_t flags, void *objp, void *caller)
2482{ 2521{
2483 if (!objp) 2522 if (!objp)
2484 return objp; 2523 return objp;
2485 if (cachep->flags & SLAB_POISON) { 2524 if (cachep->flags & SLAB_POISON) {
2486#ifdef CONFIG_DEBUG_PAGEALLOC 2525#ifdef CONFIG_DEBUG_PAGEALLOC
2487 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2526 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2488 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2527 kernel_map_pages(virt_to_page(objp),
2528 cachep->objsize / PAGE_SIZE, 1);
2489 else 2529 else
2490 check_poison_obj(cachep, objp); 2530 check_poison_obj(cachep, objp);
2491#else 2531#else
@@ -2497,24 +2537,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2497 *dbg_userword(cachep, objp) = caller; 2537 *dbg_userword(cachep, objp) = caller;
2498 2538
2499 if (cachep->flags & SLAB_RED_ZONE) { 2539 if (cachep->flags & SLAB_RED_ZONE) {
2500 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2540 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
2501 slab_error(cachep, "double free, or memory outside" 2541 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2502 " object was overwritten"); 2542 slab_error(cachep,
2503 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2543 "double free, or memory outside"
2504 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2544 " object was overwritten");
2545 printk(KERN_ERR
2546 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2547 objp, *dbg_redzone1(cachep, objp),
2548 *dbg_redzone2(cachep, objp));
2505 } 2549 }
2506 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2550 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2507 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2551 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2508 } 2552 }
2509 objp += obj_dbghead(cachep); 2553 objp += obj_dbghead(cachep);
2510 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2554 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2511 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2555 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2512 2556
2513 if (!(flags & __GFP_WAIT)) 2557 if (!(flags & __GFP_WAIT))
2514 ctor_flags |= SLAB_CTOR_ATOMIC; 2558 ctor_flags |= SLAB_CTOR_ATOMIC;
2515 2559
2516 cachep->ctor(objp, cachep, ctor_flags); 2560 cachep->ctor(objp, cachep, ctor_flags);
2517 } 2561 }
2518 return objp; 2562 return objp;
2519} 2563}
2520#else 2564#else
@@ -2523,7 +2567,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2523 2567
2524static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2568static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2525{ 2569{
2526 void* objp; 2570 void *objp;
2527 struct array_cache *ac; 2571 struct array_cache *ac;
2528 2572
2529 check_irq_off(); 2573 check_irq_off();
@@ -2542,7 +2586,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2542static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2586static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2543{ 2587{
2544 unsigned long save_flags; 2588 unsigned long save_flags;
2545 void* objp; 2589 void *objp;
2546 2590
2547 cache_alloc_debugcheck_before(cachep, flags); 2591 cache_alloc_debugcheck_before(cachep, flags);
2548 2592
@@ -2550,7 +2594,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2550 objp = ____cache_alloc(cachep, flags); 2594 objp = ____cache_alloc(cachep, flags);
2551 local_irq_restore(save_flags); 2595 local_irq_restore(save_flags);
2552 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2596 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2553 __builtin_return_address(0)); 2597 __builtin_return_address(0));
2554 prefetchw(objp); 2598 prefetchw(objp);
2555 return objp; 2599 return objp;
2556} 2600}
@@ -2562,74 +2606,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2562static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2606static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2563{ 2607{
2564 struct list_head *entry; 2608 struct list_head *entry;
2565 struct slab *slabp; 2609 struct slab *slabp;
2566 struct kmem_list3 *l3; 2610 struct kmem_list3 *l3;
2567 void *obj; 2611 void *obj;
2568 kmem_bufctl_t next; 2612 kmem_bufctl_t next;
2569 int x; 2613 int x;
2570 2614
2571 l3 = cachep->nodelists[nodeid]; 2615 l3 = cachep->nodelists[nodeid];
2572 BUG_ON(!l3); 2616 BUG_ON(!l3);
2573 2617
2574retry: 2618 retry:
2575 spin_lock(&l3->list_lock); 2619 spin_lock(&l3->list_lock);
2576 entry = l3->slabs_partial.next; 2620 entry = l3->slabs_partial.next;
2577 if (entry == &l3->slabs_partial) { 2621 if (entry == &l3->slabs_partial) {
2578 l3->free_touched = 1; 2622 l3->free_touched = 1;
2579 entry = l3->slabs_free.next; 2623 entry = l3->slabs_free.next;
2580 if (entry == &l3->slabs_free) 2624 if (entry == &l3->slabs_free)
2581 goto must_grow; 2625 goto must_grow;
2582 } 2626 }
2583 2627
2584 slabp = list_entry(entry, struct slab, list); 2628 slabp = list_entry(entry, struct slab, list);
2585 check_spinlock_acquired_node(cachep, nodeid); 2629 check_spinlock_acquired_node(cachep, nodeid);
2586 check_slabp(cachep, slabp); 2630 check_slabp(cachep, slabp);
2587 2631
2588 STATS_INC_NODEALLOCS(cachep); 2632 STATS_INC_NODEALLOCS(cachep);
2589 STATS_INC_ACTIVE(cachep); 2633 STATS_INC_ACTIVE(cachep);
2590 STATS_SET_HIGH(cachep); 2634 STATS_SET_HIGH(cachep);
2591 2635
2592 BUG_ON(slabp->inuse == cachep->num); 2636 BUG_ON(slabp->inuse == cachep->num);
2593 2637
2594 /* get obj pointer */ 2638 /* get obj pointer */
2595 obj = slabp->s_mem + slabp->free*cachep->objsize; 2639 obj = slabp->s_mem + slabp->free * cachep->objsize;
2596 slabp->inuse++; 2640 slabp->inuse++;
2597 next = slab_bufctl(slabp)[slabp->free]; 2641 next = slab_bufctl(slabp)[slabp->free];
2598#if DEBUG 2642#if DEBUG
2599 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2643 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2600#endif 2644#endif
2601 slabp->free = next; 2645 slabp->free = next;
2602 check_slabp(cachep, slabp); 2646 check_slabp(cachep, slabp);
2603 l3->free_objects--; 2647 l3->free_objects--;
2604 /* move slabp to correct slabp list: */ 2648 /* move slabp to correct slabp list: */
2605 list_del(&slabp->list); 2649 list_del(&slabp->list);
2606 2650
2607 if (slabp->free == BUFCTL_END) { 2651 if (slabp->free == BUFCTL_END) {
2608 list_add(&slabp->list, &l3->slabs_full); 2652 list_add(&slabp->list, &l3->slabs_full);
2609 } else { 2653 } else {
2610 list_add(&slabp->list, &l3->slabs_partial); 2654 list_add(&slabp->list, &l3->slabs_partial);
2611 } 2655 }
2612 2656
2613 spin_unlock(&l3->list_lock); 2657 spin_unlock(&l3->list_lock);
2614 goto done; 2658 goto done;
2615 2659
2616must_grow: 2660 must_grow:
2617 spin_unlock(&l3->list_lock); 2661 spin_unlock(&l3->list_lock);
2618 x = cache_grow(cachep, flags, nodeid); 2662 x = cache_grow(cachep, flags, nodeid);
2619 2663
2620 if (!x) 2664 if (!x)
2621 return NULL; 2665 return NULL;
2622 2666
2623 goto retry; 2667 goto retry;
2624done: 2668 done:
2625 return obj; 2669 return obj;
2626} 2670}
2627#endif 2671#endif
2628 2672
2629/* 2673/*
2630 * Caller needs to acquire correct kmem_list's list_lock 2674 * Caller needs to acquire correct kmem_list's list_lock
2631 */ 2675 */
2632static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) 2676static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2677 int node)
2633{ 2678{
2634 int i; 2679 int i;
2635 struct kmem_list3 *l3; 2680 struct kmem_list3 *l3;
@@ -2652,7 +2697,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2652 2697
2653 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2698 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2654 printk(KERN_ERR "slab: double free detected in cache " 2699 printk(KERN_ERR "slab: double free detected in cache "
2655 "'%s', objp %p\n", cachep->name, objp); 2700 "'%s', objp %p\n", cachep->name, objp);
2656 BUG(); 2701 BUG();
2657 } 2702 }
2658#endif 2703#endif
@@ -2696,20 +2741,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2696 spin_lock(&l3->list_lock); 2741 spin_lock(&l3->list_lock);
2697 if (l3->shared) { 2742 if (l3->shared) {
2698 struct array_cache *shared_array = l3->shared; 2743 struct array_cache *shared_array = l3->shared;
2699 int max = shared_array->limit-shared_array->avail; 2744 int max = shared_array->limit - shared_array->avail;
2700 if (max) { 2745 if (max) {
2701 if (batchcount > max) 2746 if (batchcount > max)
2702 batchcount = max; 2747 batchcount = max;
2703 memcpy(&(shared_array->entry[shared_array->avail]), 2748 memcpy(&(shared_array->entry[shared_array->avail]),
2704 ac->entry, 2749 ac->entry, sizeof(void *) * batchcount);
2705 sizeof(void*)*batchcount);
2706 shared_array->avail += batchcount; 2750 shared_array->avail += batchcount;
2707 goto free_done; 2751 goto free_done;
2708 } 2752 }
2709 } 2753 }
2710 2754
2711 free_block(cachep, ac->entry, batchcount, node); 2755 free_block(cachep, ac->entry, batchcount, node);
2712free_done: 2756 free_done:
2713#if STATS 2757#if STATS
2714 { 2758 {
2715 int i = 0; 2759 int i = 0;
@@ -2731,10 +2775,9 @@ free_done:
2731 spin_unlock(&l3->list_lock); 2775 spin_unlock(&l3->list_lock);
2732 ac->avail -= batchcount; 2776 ac->avail -= batchcount;
2733 memmove(ac->entry, &(ac->entry[batchcount]), 2777 memmove(ac->entry, &(ac->entry[batchcount]),
2734 sizeof(void*)*ac->avail); 2778 sizeof(void *) * ac->avail);
2735} 2779}
2736 2780
2737
2738/* 2781/*
2739 * __cache_free 2782 * __cache_free
2740 * Release an obj back to its cache. If the obj has a constructed 2783 * Release an obj back to its cache. If the obj has a constructed
@@ -2759,7 +2802,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2759 if (unlikely(slabp->nodeid != numa_node_id())) { 2802 if (unlikely(slabp->nodeid != numa_node_id())) {
2760 struct array_cache *alien = NULL; 2803 struct array_cache *alien = NULL;
2761 int nodeid = slabp->nodeid; 2804 int nodeid = slabp->nodeid;
2762 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; 2805 struct kmem_list3 *l3 =
2806 cachep->nodelists[numa_node_id()];
2763 2807
2764 STATS_INC_NODEFREES(cachep); 2808 STATS_INC_NODEFREES(cachep);
2765 if (l3->alien && l3->alien[nodeid]) { 2809 if (l3->alien && l3->alien[nodeid]) {
@@ -2767,15 +2811,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2767 spin_lock(&alien->lock); 2811 spin_lock(&alien->lock);
2768 if (unlikely(alien->avail == alien->limit)) 2812 if (unlikely(alien->avail == alien->limit))
2769 __drain_alien_cache(cachep, 2813 __drain_alien_cache(cachep,
2770 alien, nodeid); 2814 alien, nodeid);
2771 alien->entry[alien->avail++] = objp; 2815 alien->entry[alien->avail++] = objp;
2772 spin_unlock(&alien->lock); 2816 spin_unlock(&alien->lock);
2773 } else { 2817 } else {
2774 spin_lock(&(cachep->nodelists[nodeid])-> 2818 spin_lock(&(cachep->nodelists[nodeid])->
2775 list_lock); 2819 list_lock);
2776 free_block(cachep, &objp, 1, nodeid); 2820 free_block(cachep, &objp, 1, nodeid);
2777 spin_unlock(&(cachep->nodelists[nodeid])-> 2821 spin_unlock(&(cachep->nodelists[nodeid])->
2778 list_lock); 2822 list_lock);
2779 } 2823 }
2780 return; 2824 return;
2781 } 2825 }
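The NUMA branch reindented above is the "alien cache" path: when an object being freed belongs to a slab on another node, it is parked in a small per-remote-node array and pushed back to the owning node's lists in batches (or freed directly under the remote list_lock if no alien array exists), so the remote lock is not taken on every single free. A toy model of that batching, with made-up node counts and capacities:

#include <stdio.h>

#define NODES		2
#define ALIEN_CAP	4

/* alien[local][remote]: objects freed on 'local' that belong to 'remote'. */
struct alien_cache {
	int avail;
	int entry[ALIEN_CAP];
};

static struct alien_cache alien[NODES][NODES];

static void flush_alien(int local, int remote)
{
	struct alien_cache *ac = &alien[local][remote];

	/* The real code calls free_block() here under the remote list_lock. */
	printf("node %d: flushing %d objects back to node %d\n",
	       local, ac->avail, remote);
	ac->avail = 0;
}

static void toy_cache_free(int local, int obj, int owner)
{
	struct alien_cache *ac;

	if (owner == local) {
		printf("node %d: object %d freed locally\n", local, obj);
		return;
	}
	ac = &alien[local][owner];
	if (ac->avail == ALIEN_CAP)
		flush_alien(local, owner);	/* batch the expensive remote free */
	ac->entry[ac->avail++] = obj;
}

int main(void)
{
	int obj;

	for (obj = 0; obj < 10; obj++)
		toy_cache_free(0, obj, obj % 2);	/* odd objects live on node 1 */
	return 0;
}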
@@ -2822,9 +2866,9 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2822 */ 2866 */
2823int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2867int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2824{ 2868{
2825 unsigned long addr = (unsigned long) ptr; 2869 unsigned long addr = (unsigned long)ptr;
2826 unsigned long min_addr = PAGE_OFFSET; 2870 unsigned long min_addr = PAGE_OFFSET;
2827 unsigned long align_mask = BYTES_PER_WORD-1; 2871 unsigned long align_mask = BYTES_PER_WORD - 1;
2828 unsigned long size = cachep->objsize; 2872 unsigned long size = cachep->objsize;
2829 struct page *page; 2873 struct page *page;
2830 2874
@@ -2844,7 +2888,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2844 if (unlikely(page_get_cache(page) != cachep)) 2888 if (unlikely(page_get_cache(page) != cachep))
2845 goto out; 2889 goto out;
2846 return 1; 2890 return 1;
2847out: 2891 out:
2848 return 0; 2892 return 0;
2849} 2893}
2850 2894
@@ -2871,8 +2915,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2871 2915
2872 if (unlikely(!cachep->nodelists[nodeid])) { 2916 if (unlikely(!cachep->nodelists[nodeid])) {
2873 /* Fall back to __cache_alloc if we run into trouble */ 2917 /* Fall back to __cache_alloc if we run into trouble */
2874 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); 2918 printk(KERN_WARNING
2875 return __cache_alloc(cachep,flags); 2919 "slab: not allocating in inactive node %d for cache %s\n",
2920 nodeid, cachep->name);
2921 return __cache_alloc(cachep, flags);
2876 } 2922 }
2877 2923
2878 cache_alloc_debugcheck_before(cachep, flags); 2924 cache_alloc_debugcheck_before(cachep, flags);
@@ -2882,7 +2928,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2882 else 2928 else
2883 ptr = __cache_alloc_node(cachep, flags, nodeid); 2929 ptr = __cache_alloc_node(cachep, flags, nodeid);
2884 local_irq_restore(save_flags); 2930 local_irq_restore(save_flags);
2885 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); 2931 ptr =
2932 cache_alloc_debugcheck_after(cachep, flags, ptr,
2933 __builtin_return_address(0));
2886 2934
2887 return ptr; 2935 return ptr;
2888} 2936}
@@ -2944,12 +2992,11 @@ EXPORT_SYMBOL(__kmalloc);
2944 * Objects should be dereferenced using the per_cpu_ptr macro only. 2992 * Objects should be dereferenced using the per_cpu_ptr macro only.
2945 * 2993 *
2946 * @size: how many bytes of memory are required. 2994 * @size: how many bytes of memory are required.
2947 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
2948 */ 2995 */
2949void *__alloc_percpu(size_t size, size_t align) 2996void *__alloc_percpu(size_t size)
2950{ 2997{
2951 int i; 2998 int i;
2952 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 2999 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
2953 3000
2954 if (!pdata) 3001 if (!pdata)
2955 return NULL; 3002 return NULL;
@@ -2973,9 +3020,9 @@ void *__alloc_percpu(size_t size, size_t align)
2973 } 3020 }
2974 3021
2975 /* Catch derefs w/o wrappers */ 3022 /* Catch derefs w/o wrappers */
2976 return (void *) (~(unsigned long) pdata); 3023 return (void *)(~(unsigned long)pdata);
2977 3024
2978unwind_oom: 3025 unwind_oom:
2979 while (--i >= 0) { 3026 while (--i >= 0) {
2980 if (!cpu_possible(i)) 3027 if (!cpu_possible(i))
2981 continue; 3028 continue;
@@ -3006,20 +3053,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
3006EXPORT_SYMBOL(kmem_cache_free); 3053EXPORT_SYMBOL(kmem_cache_free);
3007 3054
3008/** 3055/**
3009 * kzalloc - allocate memory. The memory is set to zero.
3010 * @size: how many bytes of memory are required.
3011 * @flags: the type of memory to allocate.
3012 */
3013void *kzalloc(size_t size, gfp_t flags)
3014{
3015 void *ret = kmalloc(size, flags);
3016 if (ret)
3017 memset(ret, 0, size);
3018 return ret;
3019}
3020EXPORT_SYMBOL(kzalloc);
3021
3022/**
3023 * kfree - free previously allocated memory 3056 * kfree - free previously allocated memory
3024 * @objp: pointer returned by kmalloc. 3057 * @objp: pointer returned by kmalloc.
3025 * 3058 *
@@ -3038,7 +3071,8 @@ void kfree(const void *objp)
3038 local_irq_save(flags); 3071 local_irq_save(flags);
3039 kfree_debugcheck(objp); 3072 kfree_debugcheck(objp);
3040 c = page_get_cache(virt_to_page(objp)); 3073 c = page_get_cache(virt_to_page(objp));
3041 __cache_free(c, (void*)objp); 3074 mutex_debug_check_no_locks_freed(objp, obj_reallen(c));
3075 __cache_free(c, (void *)objp);
3042 local_irq_restore(flags); 3076 local_irq_restore(flags);
3043} 3077}
3044EXPORT_SYMBOL(kfree); 3078EXPORT_SYMBOL(kfree);
@@ -3051,17 +3085,16 @@ EXPORT_SYMBOL(kfree);
3051 * Don't free memory not originally allocated by alloc_percpu() 3085 * Don't free memory not originally allocated by alloc_percpu()
3052 * The complemented objp is to check for that. 3086 * The complemented objp is to check for that.
3053 */ 3087 */
3054void 3088void free_percpu(const void *objp)
3055free_percpu(const void *objp)
3056{ 3089{
3057 int i; 3090 int i;
3058 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3091 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3059 3092
3060 /* 3093 /*
3061 * We allocate for all cpus so we cannot use for online cpu here. 3094 * We allocate for all cpus so we cannot use for online cpu here.
3062 */ 3095 */
3063 for_each_cpu(i) 3096 for_each_cpu(i)
3064 kfree(p->ptrs[i]); 3097 kfree(p->ptrs[i]);
3065 kfree(p); 3098 kfree(p);
3066} 3099}
3067EXPORT_SYMBOL(free_percpu); 3100EXPORT_SYMBOL(free_percpu);
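__alloc_percpu() and free_percpu() above rely on a small trick: the handle returned to callers is the bitwise complement of the real percpu_data pointer, so dereferencing the handle without the per_cpu_ptr() wrapper faults immediately, and both the accessor and free_percpu() recover the table by complementing again. A userspace sketch of the idea; the *_sketch names are mine, NCPUS is made up, and error handling is trimmed:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define NCPUS 4

struct percpu_data {
	void *ptrs[NCPUS];
};

/* Hand back the complemented pointer so a raw dereference of the handle
 * blows up instead of silently reading the pointer table.
 * (Per-CPU allocation failures are ignored here for brevity.) */
static void *alloc_percpu_sketch(size_t size)
{
	struct percpu_data *pdata = calloc(1, sizeof(*pdata));
	int cpu;

	if (!pdata)
		return NULL;
	for (cpu = 0; cpu < NCPUS; cpu++)
		pdata->ptrs[cpu] = calloc(1, size);
	return (void *)(~(uintptr_t)pdata);
}

/* The accessor undoes the complement and indexes the per-CPU slot. */
static void *per_cpu_ptr_sketch(void *handle, int cpu)
{
	struct percpu_data *pdata = (struct percpu_data *)(~(uintptr_t)handle);

	return pdata->ptrs[cpu];
}

static void free_percpu_sketch(void *handle)
{
	struct percpu_data *pdata = (struct percpu_data *)(~(uintptr_t)handle);
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		free(pdata->ptrs[cpu]);
	free(pdata);
}

int main(void)
{
	void *counters = alloc_percpu_sketch(sizeof(long));

	if (!counters)
		return 1;
	*(long *)per_cpu_ptr_sketch(counters, 2) = 42;
	printf("cpu 2 counter: %ld\n", *(long *)per_cpu_ptr_sketch(counters, 2));
	free_percpu_sketch(counters);
	return 0;
}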
@@ -3095,44 +3128,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
3095 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3128 if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3096 goto fail; 3129 goto fail;
3097#endif 3130#endif
3098 if (!(new = alloc_arraycache(node, (cachep->shared* 3131 if (!(new = alloc_arraycache(node, (cachep->shared *
3099 cachep->batchcount), 0xbaadf00d))) 3132 cachep->batchcount),
3133 0xbaadf00d)))
3100 goto fail; 3134 goto fail;
3101 if ((l3 = cachep->nodelists[node])) { 3135 if ((l3 = cachep->nodelists[node])) {
3102 3136
3103 spin_lock_irq(&l3->list_lock); 3137 spin_lock_irq(&l3->list_lock);
3104 3138
3105 if ((nc = cachep->nodelists[node]->shared)) 3139 if ((nc = cachep->nodelists[node]->shared))
3106 free_block(cachep, nc->entry, 3140 free_block(cachep, nc->entry, nc->avail, node);
3107 nc->avail, node);
3108 3141
3109 l3->shared = new; 3142 l3->shared = new;
3110 if (!cachep->nodelists[node]->alien) { 3143 if (!cachep->nodelists[node]->alien) {
3111 l3->alien = new_alien; 3144 l3->alien = new_alien;
3112 new_alien = NULL; 3145 new_alien = NULL;
3113 } 3146 }
3114 l3->free_limit = (1 + nr_cpus_node(node))* 3147 l3->free_limit = (1 + nr_cpus_node(node)) *
3115 cachep->batchcount + cachep->num; 3148 cachep->batchcount + cachep->num;
3116 spin_unlock_irq(&l3->list_lock); 3149 spin_unlock_irq(&l3->list_lock);
3117 kfree(nc); 3150 kfree(nc);
3118 free_alien_cache(new_alien); 3151 free_alien_cache(new_alien);
3119 continue; 3152 continue;
3120 } 3153 }
3121 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3154 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3122 GFP_KERNEL, node))) 3155 GFP_KERNEL, node)))
3123 goto fail; 3156 goto fail;
3124 3157
3125 kmem_list3_init(l3); 3158 kmem_list3_init(l3);
3126 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3159 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3127 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 3160 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3128 l3->shared = new; 3161 l3->shared = new;
3129 l3->alien = new_alien; 3162 l3->alien = new_alien;
3130 l3->free_limit = (1 + nr_cpus_node(node))* 3163 l3->free_limit = (1 + nr_cpus_node(node)) *
3131 cachep->batchcount + cachep->num; 3164 cachep->batchcount + cachep->num;
3132 cachep->nodelists[node] = l3; 3165 cachep->nodelists[node] = l3;
3133 } 3166 }
3134 return err; 3167 return err;
3135fail: 3168 fail:
3136 err = -ENOMEM; 3169 err = -ENOMEM;
3137 return err; 3170 return err;
3138} 3171}
@@ -3154,18 +3187,19 @@ static void do_ccupdate_local(void *info)
3154 new->new[smp_processor_id()] = old; 3187 new->new[smp_processor_id()] = old;
3155} 3188}
3156 3189
3157
3158static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3190static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3159 int shared) 3191 int shared)
3160{ 3192{
3161 struct ccupdate_struct new; 3193 struct ccupdate_struct new;
3162 int i, err; 3194 int i, err;
3163 3195
3164 memset(&new.new,0,sizeof(new.new)); 3196 memset(&new.new, 0, sizeof(new.new));
3165 for_each_online_cpu(i) { 3197 for_each_online_cpu(i) {
3166 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); 3198 new.new[i] =
3199 alloc_arraycache(cpu_to_node(i), limit, batchcount);
3167 if (!new.new[i]) { 3200 if (!new.new[i]) {
3168 for (i--; i >= 0; i--) kfree(new.new[i]); 3201 for (i--; i >= 0; i--)
3202 kfree(new.new[i]);
3169 return -ENOMEM; 3203 return -ENOMEM;
3170 } 3204 }
3171 } 3205 }
@@ -3193,13 +3227,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3193 err = alloc_kmemlist(cachep); 3227 err = alloc_kmemlist(cachep);
3194 if (err) { 3228 if (err) {
3195 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3229 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3196 cachep->name, -err); 3230 cachep->name, -err);
3197 BUG(); 3231 BUG();
3198 } 3232 }
3199 return 0; 3233 return 0;
3200} 3234}
3201 3235
3202
3203static void enable_cpucache(kmem_cache_t *cachep) 3236static void enable_cpucache(kmem_cache_t *cachep)
3204{ 3237{
3205 int err; 3238 int err;
@@ -3246,14 +3279,14 @@ static void enable_cpucache(kmem_cache_t *cachep)
3246 if (limit > 32) 3279 if (limit > 32)
3247 limit = 32; 3280 limit = 32;
3248#endif 3281#endif
3249 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 3282 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3250 if (err) 3283 if (err)
3251 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3284 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3252 cachep->name, -err); 3285 cachep->name, -err);
3253} 3286}
3254 3287
3255static void drain_array_locked(kmem_cache_t *cachep, 3288static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
3256 struct array_cache *ac, int force, int node) 3289 int force, int node)
3257{ 3290{
3258 int tofree; 3291 int tofree;
3259 3292
@@ -3261,14 +3294,14 @@ static void drain_array_locked(kmem_cache_t *cachep,
3261 if (ac->touched && !force) { 3294 if (ac->touched && !force) {
3262 ac->touched = 0; 3295 ac->touched = 0;
3263 } else if (ac->avail) { 3296 } else if (ac->avail) {
3264 tofree = force ? ac->avail : (ac->limit+4)/5; 3297 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3265 if (tofree > ac->avail) { 3298 if (tofree > ac->avail) {
3266 tofree = (ac->avail+1)/2; 3299 tofree = (ac->avail + 1) / 2;
3267 } 3300 }
3268 free_block(cachep, ac->entry, tofree, node); 3301 free_block(cachep, ac->entry, tofree, node);
3269 ac->avail -= tofree; 3302 ac->avail -= tofree;
3270 memmove(ac->entry, &(ac->entry[tofree]), 3303 memmove(ac->entry, &(ac->entry[tofree]),
3271 sizeof(void*)*ac->avail); 3304 sizeof(void *) * ac->avail);
3272 } 3305 }
3273} 3306}
3274 3307
@@ -3291,13 +3324,14 @@ static void cache_reap(void *unused)
3291 3324
3292 if (down_trylock(&cache_chain_sem)) { 3325 if (down_trylock(&cache_chain_sem)) {
3293 /* Give up. Setup the next iteration. */ 3326 /* Give up. Setup the next iteration. */
3294 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3327 schedule_delayed_work(&__get_cpu_var(reap_work),
3328 REAPTIMEOUT_CPUC);
3295 return; 3329 return;
3296 } 3330 }
3297 3331
3298 list_for_each(walk, &cache_chain) { 3332 list_for_each(walk, &cache_chain) {
3299 kmem_cache_t *searchp; 3333 kmem_cache_t *searchp;
3300 struct list_head* p; 3334 struct list_head *p;
3301 int tofree; 3335 int tofree;
3302 struct slab *slabp; 3336 struct slab *slabp;
3303 3337
@@ -3314,7 +3348,7 @@ static void cache_reap(void *unused)
3314 spin_lock_irq(&l3->list_lock); 3348 spin_lock_irq(&l3->list_lock);
3315 3349
3316 drain_array_locked(searchp, ac_data(searchp), 0, 3350 drain_array_locked(searchp, ac_data(searchp), 0,
3317 numa_node_id()); 3351 numa_node_id());
3318 3352
3319 if (time_after(l3->next_reap, jiffies)) 3353 if (time_after(l3->next_reap, jiffies))
3320 goto next_unlock; 3354 goto next_unlock;
@@ -3323,14 +3357,16 @@ static void cache_reap(void *unused)
3323 3357
3324 if (l3->shared) 3358 if (l3->shared)
3325 drain_array_locked(searchp, l3->shared, 0, 3359 drain_array_locked(searchp, l3->shared, 0,
3326 numa_node_id()); 3360 numa_node_id());
3327 3361
3328 if (l3->free_touched) { 3362 if (l3->free_touched) {
3329 l3->free_touched = 0; 3363 l3->free_touched = 0;
3330 goto next_unlock; 3364 goto next_unlock;
3331 } 3365 }
3332 3366
3333 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); 3367 tofree =
3368 (l3->free_limit + 5 * searchp->num -
3369 1) / (5 * searchp->num);
3334 do { 3370 do {
3335 p = l3->slabs_free.next; 3371 p = l3->slabs_free.next;
3336 if (p == &(l3->slabs_free)) 3372 if (p == &(l3->slabs_free))
@@ -3350,10 +3386,10 @@ static void cache_reap(void *unused)
3350 spin_unlock_irq(&l3->list_lock); 3386 spin_unlock_irq(&l3->list_lock);
3351 slab_destroy(searchp, slabp); 3387 slab_destroy(searchp, slabp);
3352 spin_lock_irq(&l3->list_lock); 3388 spin_lock_irq(&l3->list_lock);
3353 } while(--tofree > 0); 3389 } while (--tofree > 0);
3354next_unlock: 3390 next_unlock:
3355 spin_unlock_irq(&l3->list_lock); 3391 spin_unlock_irq(&l3->list_lock);
3356next: 3392 next:
3357 cond_resched(); 3393 cond_resched();
3358 } 3394 }
3359 check_irq_on(); 3395 check_irq_on();
@@ -3365,32 +3401,37 @@ next:
3365 3401
3366#ifdef CONFIG_PROC_FS 3402#ifdef CONFIG_PROC_FS
3367 3403
3368static void *s_start(struct seq_file *m, loff_t *pos) 3404static void print_slabinfo_header(struct seq_file *m)
3369{ 3405{
3370 loff_t n = *pos; 3406 /*
3371 struct list_head *p; 3407 * Output format version, so at least we can change it
3372 3408 * without _too_ many complaints.
3373 down(&cache_chain_sem); 3409 */
3374 if (!n) {
3375 /*
3376 * Output format version, so at least we can change it
3377 * without _too_ many complaints.
3378 */
3379#if STATS 3410#if STATS
3380 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3411 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3381#else 3412#else
3382 seq_puts(m, "slabinfo - version: 2.1\n"); 3413 seq_puts(m, "slabinfo - version: 2.1\n");
3383#endif 3414#endif
3384 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 3415 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3385 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3416 "<objperslab> <pagesperslab>");
3386 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3417 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3418 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3387#if STATS 3419#if STATS
3388 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3420 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3389 " <error> <maxfreeable> <nodeallocs> <remotefrees>"); 3421 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3390 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3422 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3391#endif 3423#endif
3392 seq_putc(m, '\n'); 3424 seq_putc(m, '\n');
3393 } 3425}
3426
3427static void *s_start(struct seq_file *m, loff_t *pos)
3428{
3429 loff_t n = *pos;
3430 struct list_head *p;
3431
3432 down(&cache_chain_sem);
3433 if (!n)
3434 print_slabinfo_header(m);
3394 p = cache_chain.next; 3435 p = cache_chain.next;
3395 while (n--) { 3436 while (n--) {
3396 p = p->next; 3437 p = p->next;
@@ -3405,7 +3446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3405 kmem_cache_t *cachep = p; 3446 kmem_cache_t *cachep = p;
3406 ++*pos; 3447 ++*pos;
3407 return cachep->next.next == &cache_chain ? NULL 3448 return cachep->next.next == &cache_chain ? NULL
3408 : list_entry(cachep->next.next, kmem_cache_t, next); 3449 : list_entry(cachep->next.next, kmem_cache_t, next);
3409} 3450}
3410 3451
3411static void s_stop(struct seq_file *m, void *p) 3452static void s_stop(struct seq_file *m, void *p)
@@ -3417,11 +3458,11 @@ static int s_show(struct seq_file *m, void *p)
3417{ 3458{
3418 kmem_cache_t *cachep = p; 3459 kmem_cache_t *cachep = p;
3419 struct list_head *q; 3460 struct list_head *q;
3420 struct slab *slabp; 3461 struct slab *slabp;
3421 unsigned long active_objs; 3462 unsigned long active_objs;
3422 unsigned long num_objs; 3463 unsigned long num_objs;
3423 unsigned long active_slabs = 0; 3464 unsigned long active_slabs = 0;
3424 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3465 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3425 const char *name; 3466 const char *name;
3426 char *error = NULL; 3467 char *error = NULL;
3427 int node; 3468 int node;
@@ -3438,14 +3479,14 @@ static int s_show(struct seq_file *m, void *p)
3438 3479
3439 spin_lock(&l3->list_lock); 3480 spin_lock(&l3->list_lock);
3440 3481
3441 list_for_each(q,&l3->slabs_full) { 3482 list_for_each(q, &l3->slabs_full) {
3442 slabp = list_entry(q, struct slab, list); 3483 slabp = list_entry(q, struct slab, list);
3443 if (slabp->inuse != cachep->num && !error) 3484 if (slabp->inuse != cachep->num && !error)
3444 error = "slabs_full accounting error"; 3485 error = "slabs_full accounting error";
3445 active_objs += cachep->num; 3486 active_objs += cachep->num;
3446 active_slabs++; 3487 active_slabs++;
3447 } 3488 }
3448 list_for_each(q,&l3->slabs_partial) { 3489 list_for_each(q, &l3->slabs_partial) {
3449 slabp = list_entry(q, struct slab, list); 3490 slabp = list_entry(q, struct slab, list);
3450 if (slabp->inuse == cachep->num && !error) 3491 if (slabp->inuse == cachep->num && !error)
3451 error = "slabs_partial inuse accounting error"; 3492 error = "slabs_partial inuse accounting error";
@@ -3454,7 +3495,7 @@ static int s_show(struct seq_file *m, void *p)
3454 active_objs += slabp->inuse; 3495 active_objs += slabp->inuse;
3455 active_slabs++; 3496 active_slabs++;
3456 } 3497 }
3457 list_for_each(q,&l3->slabs_free) { 3498 list_for_each(q, &l3->slabs_free) {
3458 slabp = list_entry(q, struct slab, list); 3499 slabp = list_entry(q, struct slab, list);
3459 if (slabp->inuse && !error) 3500 if (slabp->inuse && !error)
3460 error = "slabs_free/inuse accounting error"; 3501 error = "slabs_free/inuse accounting error";
@@ -3465,25 +3506,24 @@ static int s_show(struct seq_file *m, void *p)
3465 3506
3466 spin_unlock(&l3->list_lock); 3507 spin_unlock(&l3->list_lock);
3467 } 3508 }
3468 num_slabs+=active_slabs; 3509 num_slabs += active_slabs;
3469 num_objs = num_slabs*cachep->num; 3510 num_objs = num_slabs * cachep->num;
3470 if (num_objs - active_objs != free_objects && !error) 3511 if (num_objs - active_objs != free_objects && !error)
3471 error = "free_objects accounting error"; 3512 error = "free_objects accounting error";
3472 3513
3473 name = cachep->name; 3514 name = cachep->name;
3474 if (error) 3515 if (error)
3475 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3516 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3476 3517
3477 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3518 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3478 name, active_objs, num_objs, cachep->objsize, 3519 name, active_objs, num_objs, cachep->objsize,
3479 cachep->num, (1<<cachep->gfporder)); 3520 cachep->num, (1 << cachep->gfporder));
3480 seq_printf(m, " : tunables %4u %4u %4u", 3521 seq_printf(m, " : tunables %4u %4u %4u",
3481 cachep->limit, cachep->batchcount, 3522 cachep->limit, cachep->batchcount, cachep->shared);
3482 cachep->shared);
3483 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3523 seq_printf(m, " : slabdata %6lu %6lu %6lu",
3484 active_slabs, num_slabs, shared_avail); 3524 active_slabs, num_slabs, shared_avail);
3485#if STATS 3525#if STATS
3486 { /* list3 stats */ 3526 { /* list3 stats */
3487 unsigned long high = cachep->high_mark; 3527 unsigned long high = cachep->high_mark;
3488 unsigned long allocs = cachep->num_allocations; 3528 unsigned long allocs = cachep->num_allocations;
3489 unsigned long grown = cachep->grown; 3529 unsigned long grown = cachep->grown;
@@ -3494,9 +3534,7 @@ static int s_show(struct seq_file *m, void *p)
3494 unsigned long node_frees = cachep->node_frees; 3534 unsigned long node_frees = cachep->node_frees;
3495 3535
3496 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3536 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3497 %4lu %4lu %4lu %4lu", 3537 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
3498 allocs, high, grown, reaped, errors,
3499 max_freeable, node_allocs, node_frees);
3500 } 3538 }
3501 /* cpu stats */ 3539 /* cpu stats */
3502 { 3540 {
@@ -3506,7 +3544,7 @@ static int s_show(struct seq_file *m, void *p)
3506 unsigned long freemiss = atomic_read(&cachep->freemiss); 3544 unsigned long freemiss = atomic_read(&cachep->freemiss);
3507 3545
3508 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3546 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3509 allochit, allocmiss, freehit, freemiss); 3547 allochit, allocmiss, freehit, freemiss);
3510 } 3548 }
3511#endif 3549#endif
3512 seq_putc(m, '\n'); 3550 seq_putc(m, '\n');
@@ -3529,10 +3567,10 @@ static int s_show(struct seq_file *m, void *p)
3529 */ 3567 */
3530 3568
3531struct seq_operations slabinfo_op = { 3569struct seq_operations slabinfo_op = {
3532 .start = s_start, 3570 .start = s_start,
3533 .next = s_next, 3571 .next = s_next,
3534 .stop = s_stop, 3572 .stop = s_stop,
3535 .show = s_show, 3573 .show = s_show,
3536}; 3574};
3537 3575
3538#define MAX_SLABINFO_WRITE 128 3576#define MAX_SLABINFO_WRITE 128
@@ -3543,18 +3581,18 @@ struct seq_operations slabinfo_op = {
3543 * @count: data length 3581 * @count: data length
3544 * @ppos: unused 3582 * @ppos: unused
3545 */ 3583 */
3546ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3584ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3547 size_t count, loff_t *ppos) 3585 size_t count, loff_t *ppos)
3548{ 3586{
3549 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 3587 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3550 int limit, batchcount, shared, res; 3588 int limit, batchcount, shared, res;
3551 struct list_head *p; 3589 struct list_head *p;
3552 3590
3553 if (count > MAX_SLABINFO_WRITE) 3591 if (count > MAX_SLABINFO_WRITE)
3554 return -EINVAL; 3592 return -EINVAL;
3555 if (copy_from_user(&kbuf, buffer, count)) 3593 if (copy_from_user(&kbuf, buffer, count))
3556 return -EFAULT; 3594 return -EFAULT;
3557 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3595 kbuf[MAX_SLABINFO_WRITE] = '\0';
3558 3596
3559 tmp = strchr(kbuf, ' '); 3597 tmp = strchr(kbuf, ' ');
3560 if (!tmp) 3598 if (!tmp)
@@ -3567,18 +3605,17 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3567 /* Find the cache in the chain of caches. */ 3605 /* Find the cache in the chain of caches. */
3568 down(&cache_chain_sem); 3606 down(&cache_chain_sem);
3569 res = -EINVAL; 3607 res = -EINVAL;
3570 list_for_each(p,&cache_chain) { 3608 list_for_each(p, &cache_chain) {
3571 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3609 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3572 3610
3573 if (!strcmp(cachep->name, kbuf)) { 3611 if (!strcmp(cachep->name, kbuf)) {
3574 if (limit < 1 || 3612 if (limit < 1 ||
3575 batchcount < 1 || 3613 batchcount < 1 ||
3576 batchcount > limit || 3614 batchcount > limit || shared < 0) {
3577 shared < 0) {
3578 res = 0; 3615 res = 0;
3579 } else { 3616 } else {
3580 res = do_tune_cpucache(cachep, limit, 3617 res = do_tune_cpucache(cachep, limit,
3581 batchcount, shared); 3618 batchcount, shared);
3582 } 3619 }
3583 break; 3620 break;
3584 } 3621 }
@@ -3609,26 +3646,3 @@ unsigned int ksize(const void *objp)
3609 3646
3610 return obj_reallen(page_get_cache(virt_to_page(objp))); 3647 return obj_reallen(page_get_cache(virt_to_page(objp)));
3611} 3648}
3612
3613
3614/*
3615 * kstrdup - allocate space for and copy an existing string
3616 *
3617 * @s: the string to duplicate
3618 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3619 */
3620char *kstrdup(const char *s, gfp_t gfp)
3621{
3622 size_t len;
3623 char *buf;
3624
3625 if (!s)
3626 return NULL;
3627
3628 len = strlen(s) + 1;
3629 buf = kmalloc(len, gfp);
3630 if (buf)
3631 memcpy(buf, s, len);
3632 return buf;
3633}
3634EXPORT_SYMBOL(kstrdup);
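
The slabinfo_write() hunk above splits a "cache-name limit batchcount shared" line at the first space, reads the three integers, and only calls do_tune_cpucache() when they pass its range checks. A minimal userspace sketch of that parsing, assuming nothing beyond standard C (parse_slabinfo_line and struct tunables are illustrative names, not kernel symbols):

#include <stdio.h>
#include <string.h>

struct tunables {
	char name[64];
	int limit, batchcount, shared;
};

/*
 * Parse a "cache-name limit batchcount shared" line the way
 * slabinfo_write() does: split at the first space, then sscanf the
 * three integers.  Returns 0 on success, -1 on malformed input or
 * out-of-range tunables.
 */
static int parse_slabinfo_line(const char *line, struct tunables *t)
{
	char buf[128];
	char *tmp;

	if (strlen(line) >= sizeof(buf))
		return -1;
	strcpy(buf, line);

	tmp = strchr(buf, ' ');		/* split name from the numbers */
	if (!tmp)
		return -1;
	*tmp = '\0';

	if (sscanf(tmp + 1, "%d %d %d",
		   &t->limit, &t->batchcount, &t->shared) != 3)
		return -1;

	/* The same range checks that gate the call to do_tune_cpucache(). */
	if (t->limit < 1 || t->batchcount < 1 ||
	    t->batchcount > t->limit || t->shared < 0)
		return -1;

	strncpy(t->name, buf, sizeof(t->name) - 1);
	t->name[sizeof(t->name) - 1] = '\0';
	return 0;
}

int main(void)
{
	struct tunables t;

	if (parse_slabinfo_line("dentry_cache 120 60 8", &t) == 0)
		printf("%s: limit=%d batchcount=%d shared=%d\n",
		       t.name, t.limit, t.batchcount, t.shared);
	return 0;
}

Note that the kernel silently ignores out-of-range tunables (it reports success without tuning); the sketch simply rejects them.
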
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 000000000000..1c240c4b71d9
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
1/*
2 * SLOB Allocator: Simple List Of Blocks
3 *
4 * Matt Mackall <mpm@selenic.com> 12/30/03
5 *
6 * How SLOB works:
7 *
8 * The core of SLOB is a traditional K&R style heap allocator, with
9 * support for returning aligned objects. The granularity of this
10 * allocator is 8 bytes on x86, though it's perhaps possible to reduce
11 * this to 4 if it's deemed worth the effort. The slob heap is a
12 * singly-linked list of pages from __get_free_page, grown on demand
13 * and allocation from the heap is currently first-fit.
14 *
15 * Above this is an implementation of kmalloc/kfree. Blocks returned
16 * from kmalloc are 8-byte aligned and prepended with an 8-byte header.
17 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
18 * __get_free_pages directly so that it can return page-aligned blocks
19 * and keeps a linked list of such pages and their orders. These
20 * objects are detected in kfree() by their page alignment.
21 *
22 * SLAB is emulated on top of SLOB by simply calling constructors and
23 * destructors for every SLAB allocation. Objects are returned with
24 * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
25 * set, in which case the low-level allocator will fragment blocks to
26 * create the proper alignment. Again, objects of page-size or greater
27 * are allocated by calling __get_free_pages. As SLAB objects know
28 * their size, no separate size bookkeeping is necessary and there is
29 * essentially no allocation space overhead.
30 */
31
32#include <linux/config.h>
33#include <linux/slab.h>
34#include <linux/mm.h>
35#include <linux/cache.h>
36#include <linux/init.h>
37#include <linux/module.h>
38#include <linux/timer.h>
39
40struct slob_block {
41 int units;
42 struct slob_block *next;
43};
44typedef struct slob_block slob_t;
45
46#define SLOB_UNIT sizeof(slob_t)
47#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
48#define SLOB_ALIGN L1_CACHE_BYTES
49
50struct bigblock {
51 int order;
52 void *pages;
53 struct bigblock *next;
54};
55typedef struct bigblock bigblock_t;
56
57static slob_t arena = { .next = &arena, .units = 1 };
58static slob_t *slobfree = &arena;
59static bigblock_t *bigblocks;
60static DEFINE_SPINLOCK(slob_lock);
61static DEFINE_SPINLOCK(block_lock);
62
63static void slob_free(void *b, int size);
64
65static void *slob_alloc(size_t size, gfp_t gfp, int align)
66{
67 slob_t *prev, *cur, *aligned = 0;
68 int delta = 0, units = SLOB_UNITS(size);
69 unsigned long flags;
70
71 spin_lock_irqsave(&slob_lock, flags);
72 prev = slobfree;
73 for (cur = prev->next; ; prev = cur, cur = cur->next) {
74 if (align) {
75 aligned = (slob_t *)ALIGN((unsigned long)cur, align);
76 delta = aligned - cur;
77 }
78 if (cur->units >= units + delta) { /* room enough? */
79 if (delta) { /* need to fragment head to align? */
80 aligned->units = cur->units - delta;
81 aligned->next = cur->next;
82 cur->next = aligned;
83 cur->units = delta;
84 prev = cur;
85 cur = aligned;
86 }
87
88 if (cur->units == units) /* exact fit? */
89 prev->next = cur->next; /* unlink */
90 else { /* fragment */
91 prev->next = cur + units;
92 prev->next->units = cur->units - units;
93 prev->next->next = cur->next;
94 cur->units = units;
95 }
96
97 slobfree = prev;
98 spin_unlock_irqrestore(&slob_lock, flags);
99 return cur;
100 }
101 if (cur == slobfree) {
102 spin_unlock_irqrestore(&slob_lock, flags);
103
104 if (size == PAGE_SIZE) /* trying to shrink arena? */
105 return 0;
106
107 cur = (slob_t *)__get_free_page(gfp);
108 if (!cur)
109 return 0;
110
111 slob_free(cur, PAGE_SIZE);
112 spin_lock_irqsave(&slob_lock, flags);
113 cur = slobfree;
114 }
115 }
116}
117
118static void slob_free(void *block, int size)
119{
120 slob_t *cur, *b = (slob_t *)block;
121 unsigned long flags;
122
123 if (!block)
124 return;
125
126 if (size)
127 b->units = SLOB_UNITS(size);
128
129 /* Find reinsertion point */
130 spin_lock_irqsave(&slob_lock, flags);
131 for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
132 if (cur >= cur->next && (b > cur || b < cur->next))
133 break;
134
135 if (b + b->units == cur->next) {
136 b->units += cur->next->units;
137 b->next = cur->next->next;
138 } else
139 b->next = cur->next;
140
141 if (cur + cur->units == b) {
142 cur->units += b->units;
143 cur->next = b->next;
144 } else
145 cur->next = b;
146
147 slobfree = cur;
148
149 spin_unlock_irqrestore(&slob_lock, flags);
150}
151
152static int FASTCALL(find_order(int size));
153static int fastcall find_order(int size)
154{
155 int order = 0;
156 for ( ; size > 4096 ; size >>=1)
157 order++;
158 return order;
159}
160
161void *kmalloc(size_t size, gfp_t gfp)
162{
163 slob_t *m;
164 bigblock_t *bb;
165 unsigned long flags;
166
167 if (size < PAGE_SIZE - SLOB_UNIT) {
168 m = slob_alloc(size + SLOB_UNIT, gfp, 0);
169 return m ? (void *)(m + 1) : 0;
170 }
171
172 bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
173 if (!bb)
174 return 0;
175
176 bb->order = find_order(size);
177 bb->pages = (void *)__get_free_pages(gfp, bb->order);
178
179 if (bb->pages) {
180 spin_lock_irqsave(&block_lock, flags);
181 bb->next = bigblocks;
182 bigblocks = bb;
183 spin_unlock_irqrestore(&block_lock, flags);
184 return bb->pages;
185 }
186
187 slob_free(bb, sizeof(bigblock_t));
188 return 0;
189}
190
191EXPORT_SYMBOL(kmalloc);
192
193void kfree(const void *block)
194{
195 bigblock_t *bb, **last = &bigblocks;
196 unsigned long flags;
197
198 if (!block)
199 return;
200
201 if (!((unsigned long)block & (PAGE_SIZE-1))) {
202 /* might be on the big block list */
203 spin_lock_irqsave(&block_lock, flags);
204 for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
205 if (bb->pages == block) {
206 *last = bb->next;
207 spin_unlock_irqrestore(&block_lock, flags);
208 free_pages((unsigned long)block, bb->order);
209 slob_free(bb, sizeof(bigblock_t));
210 return;
211 }
212 }
213 spin_unlock_irqrestore(&block_lock, flags);
214 }
215
216 slob_free((slob_t *)block - 1, 0);
217 return;
218}
219
220EXPORT_SYMBOL(kfree);
221
222unsigned int ksize(const void *block)
223{
224 bigblock_t *bb;
225 unsigned long flags;
226
227 if (!block)
228 return 0;
229
230 if (!((unsigned long)block & (PAGE_SIZE-1))) {
231 spin_lock_irqsave(&block_lock, flags);
232 for (bb = bigblocks; bb; bb = bb->next)
233 if (bb->pages == block) {
234 spin_unlock_irqrestore(&slob_lock, flags);
235 return PAGE_SIZE << bb->order;
236 }
237 spin_unlock_irqrestore(&block_lock, flags);
238 }
239
240 return ((slob_t *)block - 1)->units * SLOB_UNIT;
241}
242
243struct kmem_cache {
244 unsigned int size, align;
245 const char *name;
246 void (*ctor)(void *, struct kmem_cache *, unsigned long);
247 void (*dtor)(void *, struct kmem_cache *, unsigned long);
248};
249
250struct kmem_cache *kmem_cache_create(const char *name, size_t size,
251 size_t align, unsigned long flags,
252 void (*ctor)(void*, struct kmem_cache *, unsigned long),
253 void (*dtor)(void*, struct kmem_cache *, unsigned long))
254{
255 struct kmem_cache *c;
256
257 c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
258
259 if (c) {
260 c->name = name;
261 c->size = size;
262 c->ctor = ctor;
263 c->dtor = dtor;
264 /* ignore alignment unless it's forced */
265 c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
266 if (c->align < align)
267 c->align = align;
268 }
269
270 return c;
271}
272EXPORT_SYMBOL(kmem_cache_create);
273
274int kmem_cache_destroy(struct kmem_cache *c)
275{
276 slob_free(c, sizeof(struct kmem_cache));
277 return 0;
278}
279EXPORT_SYMBOL(kmem_cache_destroy);
280
281void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
282{
283 void *b;
284
285 if (c->size < PAGE_SIZE)
286 b = slob_alloc(c->size, flags, c->align);
287 else
288 b = (void *)__get_free_pages(flags, find_order(c->size));
289
290 if (c->ctor)
291 c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
292
293 return b;
294}
295EXPORT_SYMBOL(kmem_cache_alloc);
296
297void kmem_cache_free(struct kmem_cache *c, void *b)
298{
299 if (c->dtor)
300 c->dtor(b, c, 0);
301
302 if (c->size < PAGE_SIZE)
303 slob_free(b, c->size);
304 else
305 free_pages((unsigned long)b, find_order(c->size));
306}
307EXPORT_SYMBOL(kmem_cache_free);
308
309unsigned int kmem_cache_size(struct kmem_cache *c)
310{
311 return c->size;
312}
313EXPORT_SYMBOL(kmem_cache_size);
314
315const char *kmem_cache_name(struct kmem_cache *c)
316{
317 return c->name;
318}
319EXPORT_SYMBOL(kmem_cache_name);
320
321static struct timer_list slob_timer = TIMER_INITIALIZER(
322 (void (*)(unsigned long))kmem_cache_init, 0, 0);
323
324void kmem_cache_init(void)
325{
326 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
327
328 if (p)
329 free_page((unsigned long)p);
330
331 mod_timer(&slob_timer, jiffies + HZ);
332}
333
334atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
335EXPORT_SYMBOL(slab_reclaim_pages);
336
337#ifdef CONFIG_SMP
338
339void *__alloc_percpu(size_t size, size_t align)
340{
341 int i;
342 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
343
344 if (!pdata)
345 return NULL;
346
347 for (i = 0; i < NR_CPUS; i++) {
348 if (!cpu_possible(i))
349 continue;
350 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
351 if (!pdata->ptrs[i])
352 goto unwind_oom;
353 memset(pdata->ptrs[i], 0, size);
354 }
355
356 /* Catch derefs w/o wrappers */
357 return (void *) (~(unsigned long) pdata);
358
359unwind_oom:
360 while (--i >= 0) {
361 if (!cpu_possible(i))
362 continue;
363 kfree(pdata->ptrs[i]);
364 }
365 kfree(pdata);
366 return NULL;
367}
368EXPORT_SYMBOL(__alloc_percpu);
369
370void
371free_percpu(const void *objp)
372{
373 int i;
374 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
375
376 for (i = 0; i < NR_CPUS; i++) {
377 if (!cpu_possible(i))
378 continue;
379 kfree(p->ptrs[i]);
380 }
381 kfree(p);
382}
383EXPORT_SYMBOL(free_percpu);
384
385#endif
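
The header comment of the new allocator describes a K&R-style first-fit heap with 8-byte granularity. As a rough userspace illustration of the two cases slob_alloc() handles on a candidate free block (exact fit: unlink it; otherwise: split it and keep the tail on the free list), here is a toy allocator over a static arena. It uses a NULL-terminated list instead of SLOB's circular one, never grows, and omits the free/coalesce path, so it is a sketch of the allocation side only, not the kernel code.

#include <stdio.h>
#include <stddef.h>

struct block {
	size_t units;		/* size of this free block, in units */
	struct block *next;	/* next free block */
};

#define UNIT		sizeof(struct block)
#define UNITS(size)	(((size) + UNIT - 1) / UNIT)	/* round up, like SLOB_UNITS */
#define ARENA_UNITS	1024

static struct block arena[ARENA_UNITS];
static struct block freelist = { 0, NULL };	/* dummy list head */

static void heap_init(void)
{
	arena[0].units = ARENA_UNITS;
	arena[0].next = NULL;
	freelist.next = &arena[0];
}

/* First-fit allocation: walk the free list, split or unlink a block. */
static void *toy_alloc(size_t size)
{
	size_t units = UNITS(size) + 1;		/* +1 unit for the header */
	struct block *prev = &freelist, *cur;

	for (cur = prev->next; cur; prev = cur, cur = cur->next) {
		if (cur->units < units)
			continue;
		if (cur->units == units) {	/* exact fit: unlink */
			prev->next = cur->next;
		} else {			/* fragment: keep the tail free */
			struct block *tail = cur + units;
			tail->units = cur->units - units;
			tail->next = cur->next;
			prev->next = tail;
			cur->units = units;
		}
		return cur + 1;			/* payload follows the header */
	}
	return NULL;				/* no block big enough */
}

int main(void)
{
	heap_init();
	void *a = toy_alloc(100);
	void *b = toy_alloc(200);
	printf("a=%p b=%p (each prefixed by a %zu-byte header)\n", a, b, UNIT);
	return 0;
}

slob_free() completes the picture by merging a returned block with adjacent free neighbours, which is what keeps the arena from decaying into single units.
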
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e2d..0a51f36ba3a1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
18 */ 18 */
19#ifdef CONFIG_SPARSEMEM_EXTREME 19#ifdef CONFIG_SPARSEMEM_EXTREME
20struct mem_section *mem_section[NR_SECTION_ROOTS] 20struct mem_section *mem_section[NR_SECTION_ROOTS]
21 ____cacheline_maxaligned_in_smp; 21 ____cacheline_internodealigned_in_smp;
22#else 22#else
23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] 23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
24 ____cacheline_maxaligned_in_smp; 24 ____cacheline_internodealigned_in_smp;
25#endif 25#endif
26EXPORT_SYMBOL(mem_section); 26EXPORT_SYMBOL(mem_section);
27 27
diff --git a/mm/swap.c b/mm/swap.c
index 73d351439ef6..cbb48e721ab9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page)
156 put_cpu_var(lru_add_active_pvecs); 156 put_cpu_var(lru_add_active_pvecs);
157} 157}
158 158
159void lru_add_drain(void) 159static void __lru_add_drain(int cpu)
160{ 160{
161 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 161 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
162 162
163 /* CPU is dead, so no locking needed. */
163 if (pagevec_count(pvec)) 164 if (pagevec_count(pvec))
164 __pagevec_lru_add(pvec); 165 __pagevec_lru_add(pvec);
165 pvec = &__get_cpu_var(lru_add_active_pvecs); 166 pvec = &per_cpu(lru_add_active_pvecs, cpu);
166 if (pagevec_count(pvec)) 167 if (pagevec_count(pvec))
167 __pagevec_lru_add_active(pvec); 168 __pagevec_lru_add_active(pvec);
168 put_cpu_var(lru_add_pvecs); 169}
170
171void lru_add_drain(void)
172{
173 __lru_add_drain(get_cpu());
174 put_cpu();
169} 175}
170 176
171/* 177/*
@@ -378,6 +384,8 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
378 return pagevec_count(pvec); 384 return pagevec_count(pvec);
379} 385}
380 386
387EXPORT_SYMBOL(pagevec_lookup);
388
381unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 389unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
382 pgoff_t *index, int tag, unsigned nr_pages) 390 pgoff_t *index, int tag, unsigned nr_pages)
383{ 391{
@@ -412,17 +420,6 @@ void vm_acct_memory(long pages)
412} 420}
413 421
414#ifdef CONFIG_HOTPLUG_CPU 422#ifdef CONFIG_HOTPLUG_CPU
415static void lru_drain_cache(unsigned int cpu)
416{
417 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
418
419 /* CPU is dead, so no locking needed. */
420 if (pagevec_count(pvec))
421 __pagevec_lru_add(pvec);
422 pvec = &per_cpu(lru_add_active_pvecs, cpu);
423 if (pagevec_count(pvec))
424 __pagevec_lru_add_active(pvec);
425}
426 423
427/* Drop the CPU's cached committed space back into the central pool. */ 424/* Drop the CPU's cached committed space back into the central pool. */
428static int cpu_swap_callback(struct notifier_block *nfb, 425static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +432,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
435 if (action == CPU_DEAD) { 432 if (action == CPU_DEAD) {
436 atomic_add(*committed, &vm_committed_space); 433 atomic_add(*committed, &vm_committed_space);
437 *committed = 0; 434 *committed = 0;
438 lru_drain_cache((long)hcpu); 435 __lru_add_drain((long)hcpu);
439 } 436 }
440 return NOTIFY_OK; 437 return NOTIFY_OK;
441} 438}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1de8..7b09ac503fec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
@@ -140,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
140 * Allocate swap space for the page and add the page to the 141 * Allocate swap space for the page and add the page to the
141 * swap cache. Caller needs to hold the page lock. 142 * swap cache. Caller needs to hold the page lock.
142 */ 143 */
143int add_to_swap(struct page * page) 144int add_to_swap(struct page * page, gfp_t gfp_mask)
144{ 145{
145 swp_entry_t entry; 146 swp_entry_t entry;
146 int err; 147 int err;
@@ -165,7 +166,7 @@ int add_to_swap(struct page * page)
165 * Add it to the swap cache and mark it dirty 166 * Add it to the swap cache and mark it dirty
166 */ 167 */
167 err = __add_to_swap_cache(page, entry, 168 err = __add_to_swap_cache(page, entry,
168 GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); 169 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
169 170
170 switch (err) { 171 switch (err) {
171 case 0: /* Success */ 172 case 0: /* Success */
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
272 */ 273 */
273void free_pages_and_swap_cache(struct page **pages, int nr) 274void free_pages_and_swap_cache(struct page **pages, int nr)
274{ 275{
275 int chunk = 16;
276 struct page **pagep = pages; 276 struct page **pagep = pages;
277 277
278 lru_add_drain(); 278 lru_add_drain();
279 while (nr) { 279 while (nr) {
280 int todo = min(chunk, nr); 280 int todo = min(nr, PAGEVEC_SIZE);
281 int i; 281 int i;
282 282
283 for (i = 0; i < todo; i++) 283 for (i = 0; i < todo; i++)
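
The free_pages_and_swap_cache() change above only replaces the hard-coded batch of 16 with PAGEVEC_SIZE; the loop itself is the usual fixed-size batching pattern, sketched standalone below (release_batch() stands in for release_pages(), and PAGEVEC_SIZE is assumed to be 14, the pagevec capacity in this kernel):

#include <stdio.h>

#define PAGEVEC_SIZE 14
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Stand-in for release_pages(): just report the batch. */
static void release_batch(int *items, int count)
{
	printf("releasing %d items starting at %d\n", count, items[0]);
}

static void release_all(int *items, int nr)
{
	int *p = items;

	while (nr) {
		int todo = MIN(nr, PAGEVEC_SIZE);

		release_batch(p, todo);
		p += todo;
		nr -= todo;
	}
}

int main(void)
{
	int items[40];
	int i;

	for (i = 0; i < 40; i++)
		items[i] = i;
	release_all(items, 40);		/* batches of 14 + 14 + 12 */
	return 0;
}
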
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301f..957fef43fa60 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,7 @@
25#include <linux/rmap.h> 25#include <linux/rmap.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/capability.h>
28#include <linux/syscalls.h> 29#include <linux/syscalls.h>
29 30
30#include <asm/pgtable.h> 31#include <asm/pgtable.h>
@@ -211,6 +212,26 @@ noswap:
211 return (swp_entry_t) {0}; 212 return (swp_entry_t) {0};
212} 213}
213 214
215swp_entry_t get_swap_page_of_type(int type)
216{
217 struct swap_info_struct *si;
218 pgoff_t offset;
219
220 spin_lock(&swap_lock);
221 si = swap_info + type;
222 if (si->flags & SWP_WRITEOK) {
223 nr_swap_pages--;
224 offset = scan_swap_map(si);
225 if (offset) {
226 spin_unlock(&swap_lock);
227 return swp_entry(type, offset);
228 }
229 nr_swap_pages++;
230 }
231 spin_unlock(&swap_lock);
232 return (swp_entry_t) {0};
233}
234
214static struct swap_info_struct * swap_info_get(swp_entry_t entry) 235static struct swap_info_struct * swap_info_get(swp_entry_t entry)
215{ 236{
216 struct swap_info_struct * p; 237 struct swap_info_struct * p;
@@ -1167,9 +1188,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1167 set_blocksize(bdev, p->old_block_size); 1188 set_blocksize(bdev, p->old_block_size);
1168 bd_release(bdev); 1189 bd_release(bdev);
1169 } else { 1190 } else {
1170 down(&inode->i_sem); 1191 mutex_lock(&inode->i_mutex);
1171 inode->i_flags &= ~S_SWAPFILE; 1192 inode->i_flags &= ~S_SWAPFILE;
1172 up(&inode->i_sem); 1193 mutex_unlock(&inode->i_mutex);
1173 } 1194 }
1174 filp_close(swap_file, NULL); 1195 filp_close(swap_file, NULL);
1175 err = 0; 1196 err = 0;
@@ -1386,7 +1407,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1386 p->bdev = bdev; 1407 p->bdev = bdev;
1387 } else if (S_ISREG(inode->i_mode)) { 1408 } else if (S_ISREG(inode->i_mode)) {
1388 p->bdev = inode->i_sb->s_bdev; 1409 p->bdev = inode->i_sb->s_bdev;
1389 down(&inode->i_sem); 1410 mutex_lock(&inode->i_mutex);
1390 did_down = 1; 1411 did_down = 1;
1391 if (IS_SWAPFILE(inode)) { 1412 if (IS_SWAPFILE(inode)) {
1392 error = -EBUSY; 1413 error = -EBUSY;
@@ -1422,7 +1443,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1422 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) 1443 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1423 swap_header_version = 2; 1444 swap_header_version = 2;
1424 else { 1445 else {
1425 printk("Unable to find swap-space signature\n"); 1446 printk(KERN_ERR "Unable to find swap-space signature\n");
1426 error = -EINVAL; 1447 error = -EINVAL;
1427 goto bad_swap; 1448 goto bad_swap;
1428 } 1449 }
@@ -1473,7 +1494,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1473 goto bad_swap; 1494 goto bad_swap;
1474 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1495 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1475 goto bad_swap; 1496 goto bad_swap;
1476 1497
1477 /* OK, set up the swap map and apply the bad block list */ 1498 /* OK, set up the swap map and apply the bad block list */
1478 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1499 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
1479 error = -ENOMEM; 1500 error = -ENOMEM;
@@ -1482,17 +1503,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1482 1503
1483 error = 0; 1504 error = 0;
1484 memset(p->swap_map, 0, maxpages * sizeof(short)); 1505 memset(p->swap_map, 0, maxpages * sizeof(short));
1485 for (i=0; i<swap_header->info.nr_badpages; i++) { 1506 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1486 int page = swap_header->info.badpages[i]; 1507 int page_nr = swap_header->info.badpages[i];
1487 if (page <= 0 || page >= swap_header->info.last_page) 1508 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1488 error = -EINVAL; 1509 error = -EINVAL;
1489 else 1510 else
1490 p->swap_map[page] = SWAP_MAP_BAD; 1511 p->swap_map[page_nr] = SWAP_MAP_BAD;
1491 } 1512 }
1492 nr_good_pages = swap_header->info.last_page - 1513 nr_good_pages = swap_header->info.last_page -
1493 swap_header->info.nr_badpages - 1514 swap_header->info.nr_badpages -
1494 1 /* header page */; 1515 1 /* header page */;
1495 if (error) 1516 if (error)
1496 goto bad_swap; 1517 goto bad_swap;
1497 } 1518 }
1498 1519
@@ -1576,7 +1597,7 @@ out:
1576 if (did_down) { 1597 if (did_down) {
1577 if (!error) 1598 if (!error)
1578 inode->i_flags |= S_SWAPFILE; 1599 inode->i_flags |= S_SWAPFILE;
1579 up(&inode->i_sem); 1600 mutex_unlock(&inode->i_mutex);
1580 } 1601 }
1581 return error; 1602 return error;
1582} 1603}
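
In the sys_swapon() hunk above the loop variable becomes page_nr but the arithmetic is unchanged: every listed bad page must fall strictly between the header page and last_page, and the usable count is everything minus the bad pages and the header. A standalone check of that arithmetic (struct swap_hdr_info and setup_swap_map are illustrative names mirroring only the fields used here):

#include <stdio.h>

#define SWAP_MAP_BAD 0x8000

/* Illustrative mirror of the swap header fields sys_swapon() reads. */
struct swap_hdr_info {
	int last_page;		/* pages in the swap area; valid offsets are 1..last_page-1 */
	int nr_badpages;
	int badpages[8];
};

/*
 * Mark the listed bad pages and return the number of usable pages,
 * or -1 if a bad-page index is out of range (the kernel sets -EINVAL).
 */
static long setup_swap_map(const struct swap_hdr_info *info,
			   unsigned short *swap_map)
{
	int i;

	for (i = 0; i < info->nr_badpages; i++) {
		int page_nr = info->badpages[i];

		/* page 0 holds the header and is never a data page */
		if (page_nr <= 0 || page_nr >= info->last_page)
			return -1;
		swap_map[page_nr] = SWAP_MAP_BAD;
	}

	return (long)info->last_page - info->nr_badpages - 1 /* header page */;
}

int main(void)
{
	struct swap_hdr_info info = {
		.last_page = 1024, .nr_badpages = 2, .badpages = { 17, 513 },
	};
	static unsigned short swap_map[1024];

	printf("good pages: %ld\n", setup_swap_map(&info, swap_map));	/* 1021 */
	return 0;
}
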
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed6..f9d6a9cc91c4 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
81 goto close_file; 81 goto close_file;
82 82
83 d_instantiate(dentry, inode); 83 d_instantiate(dentry, inode);
84 inode->i_size = size;
85 inode->i_nlink = 0; /* It is unlinked */ 84 inode->i_nlink = 0; /* It is unlinked */
85
86 file->f_vfsmnt = mntget(shm_mnt); 86 file->f_vfsmnt = mntget(shm_mnt);
87 file->f_dentry = dentry; 87 file->f_dentry = dentry;
88 file->f_mapping = inode->i_mapping; 88 file->f_mapping = inode->i_mapping;
89 file->f_op = &ramfs_file_operations; 89 file->f_op = &ramfs_file_operations;
90 file->f_mode = FMODE_WRITE | FMODE_READ; 90 file->f_mode = FMODE_WRITE | FMODE_READ;
91
92 /* notify everyone as to the change of file size */
93 error = do_truncate(dentry, size, 0, file);
94 if (error < 0)
95 goto close_file;
96
91 return file; 97 return file;
92 98
93close_file: 99close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
123{ 129{
124 return 0; 130 return 0;
125} 131}
132
133int shmem_mmap(struct file *file, struct vm_area_struct *vma)
134{
135 file_accessed(file);
136#ifndef CONFIG_MMU
137 return ramfs_nommu_mmap(file, vma);
138#else
139 return 0;
140#endif
141}
142
143#ifndef CONFIG_MMU
144unsigned long shmem_get_unmapped_area(struct file *file,
145 unsigned long addr,
146 unsigned long len,
147 unsigned long pgoff,
148 unsigned long flags)
149{
150 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
151}
152#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab500604..6cb3fff25f67 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
82} 82}
83 83
84/** 84/**
85 * truncate_inode_pages - truncate *all* the pages from an offset 85 * truncate_inode_pages_range - truncate range of pages specified by start and
86 * end byte offsets
86 * @mapping: mapping to truncate 87 * @mapping: mapping to truncate
87 * @lstart: offset from which to truncate 88 * @lstart: offset from which to truncate
89 * @lend: offset to which to truncate
88 * 90 *
89 * Truncate the page cache at a set offset, removing the pages that are beyond 91 * Truncate the page cache, removing the pages that are between
90 * that offset (and zeroing out partial pages). 92 * specified offsets (and zeroing out partial page
93 * (if lstart is not page aligned)).
91 * 94 *
92 * Truncate takes two passes - the first pass is nonblocking. It will not 95 * Truncate takes two passes - the first pass is nonblocking. It will not
93 * block on page locks and it will not block on writeback. The second pass 96 * block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
101 * We pass down the cache-hot hint to the page freeing code. Even if the 104 * We pass down the cache-hot hint to the page freeing code. Even if the
102 * mapping is large, it is probably the case that the final pages are the most 105 * mapping is large, it is probably the case that the final pages are the most
103 * recently touched, and freeing happens in ascending file offset order. 106 * recently touched, and freeing happens in ascending file offset order.
104 *
105 * Called under (and serialised by) inode->i_sem.
106 */ 107 */
107void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 108void truncate_inode_pages_range(struct address_space *mapping,
109 loff_t lstart, loff_t lend)
108{ 110{
109 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 111 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
112 pgoff_t end;
110 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 113 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
111 struct pagevec pvec; 114 struct pagevec pvec;
112 pgoff_t next; 115 pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
115 if (mapping->nrpages == 0) 118 if (mapping->nrpages == 0)
116 return; 119 return;
117 120
121 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
122 end = (lend >> PAGE_CACHE_SHIFT);
123
118 pagevec_init(&pvec, 0); 124 pagevec_init(&pvec, 0);
119 next = start; 125 next = start;
120 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 126 while (next <= end &&
127 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
121 for (i = 0; i < pagevec_count(&pvec); i++) { 128 for (i = 0; i < pagevec_count(&pvec); i++) {
122 struct page *page = pvec.pages[i]; 129 struct page *page = pvec.pages[i];
123 pgoff_t page_index = page->index; 130 pgoff_t page_index = page->index;
124 131
132 if (page_index > end) {
133 next = page_index;
134 break;
135 }
136
125 if (page_index > next) 137 if (page_index > next)
126 next = page_index; 138 next = page_index;
127 next++; 139 next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
157 next = start; 169 next = start;
158 continue; 170 continue;
159 } 171 }
172 if (pvec.pages[0]->index > end) {
173 pagevec_release(&pvec);
174 break;
175 }
160 for (i = 0; i < pagevec_count(&pvec); i++) { 176 for (i = 0; i < pagevec_count(&pvec); i++) {
161 struct page *page = pvec.pages[i]; 177 struct page *page = pvec.pages[i];
162 178
179 if (page->index > end)
180 break;
163 lock_page(page); 181 lock_page(page);
164 wait_on_page_writeback(page); 182 wait_on_page_writeback(page);
165 if (page->index > next) 183 if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
171 pagevec_release(&pvec); 189 pagevec_release(&pvec);
172 } 190 }
173} 191}
192EXPORT_SYMBOL(truncate_inode_pages_range);
174 193
194/**
195 * truncate_inode_pages - truncate *all* the pages from an offset
196 * @mapping: mapping to truncate
197 * @lstart: offset from which to truncate
198 *
199 * Called under (and serialised by) inode->i_mutex.
200 */
201void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
202{
203 truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
204}
175EXPORT_SYMBOL(truncate_inode_pages); 205EXPORT_SYMBOL(truncate_inode_pages);
176 206
177/** 207/**
@@ -219,7 +249,6 @@ unlock:
219 break; 249 break;
220 } 250 }
221 pagevec_release(&pvec); 251 pagevec_release(&pvec);
222 cond_resched();
223 } 252 }
224 return ret; 253 return ret;
225} 254}
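
truncate_inode_pages_range() converts byte offsets into page indices the same way the old function did, plus an inclusive end index; the BUG_ON requires lend to be the last byte of a page so that lend >> PAGE_CACHE_SHIFT lands on the final page to remove. The arithmetic, worked through standalone under the assumption of 4 KiB pages:

#include <stdio.h>
#include <assert.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)	/* 4096 */

static void show_range(unsigned long long lstart, unsigned long long lend)
{
	unsigned long long start, partial, end;

	/* first whole page at or after lstart */
	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	/* bytes kept at the start of the partially-truncated page (0 if aligned) */
	partial = lstart & (PAGE_CACHE_SIZE - 1);

	/* lend must be the last byte of a page (e.g. 20479, or ~0 for "all") */
	assert((lend & (PAGE_CACHE_SIZE - 1)) == PAGE_CACHE_SIZE - 1);
	end = lend >> PAGE_CACHE_SHIFT;		/* inclusive last page index */

	printf("lstart=%llu lend=%llu -> start=%llu partial=%llu end=%llu\n",
	       lstart, lend, start, partial, end);
}

int main(void)
{
	show_range(0, ~0ULL);		/* truncate everything: start=0, partial=0 */
	show_range(6000, 20479);	/* pages 2..4 removed; page 1 zeroed from byte 1904 */
	return 0;
}
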
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 000000000000..5f4bb59da63c
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,39 @@
1#include <linux/slab.h>
2#include <linux/string.h>
3#include <linux/module.h>
4
5/**
6 * kzalloc - allocate memory. The memory is set to zero.
7 * @size: how many bytes of memory are required.
8 * @flags: the type of memory to allocate.
9 */
10void *kzalloc(size_t size, gfp_t flags)
11{
12 void *ret = kmalloc(size, flags);
13 if (ret)
14 memset(ret, 0, size);
15 return ret;
16}
17EXPORT_SYMBOL(kzalloc);
18
19/*
20 * kstrdup - allocate space for and copy an existing string
21 *
22 * @s: the string to duplicate
23 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
24 */
25char *kstrdup(const char *s, gfp_t gfp)
26{
27 size_t len;
28 char *buf;
29
30 if (!s)
31 return NULL;
32
33 len = strlen(s) + 1;
34 buf = kmalloc(len, gfp);
35 if (buf)
36 memcpy(buf, s, len);
37 return buf;
38}
39EXPORT_SYMBOL(kstrdup);
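
The new mm/util.c collects two helpers callers used to open-code (kstrdup also moves here from slab.c, as shown earlier). Their behaviour is easy to mirror in userspace with malloc(); xzalloc and xstrdup below are illustrative names, not kernel symbols:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace analogue of kzalloc(): allocate and zero-fill. */
static void *xzalloc(size_t size)
{
	void *ret = malloc(size);
	if (ret)
		memset(ret, 0, size);
	return ret;
}

/* Userspace analogue of kstrdup(): NULL-safe string duplication. */
static char *xstrdup(const char *s)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strlen(s) + 1;		/* include the terminating NUL */
	buf = malloc(len);
	if (buf)
		memcpy(buf, s, len);
	return buf;
}

int main(void)
{
	int *zeroed = xzalloc(4 * sizeof(int));
	char *copy = xstrdup("swapfile");

	if (!zeroed || !copy)
		return 1;
	printf("%d %d %s\n", zeroed[0], zeroed[3], copy);
	free(zeroed);
	free(copy);
	return 0;
}
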
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de6..bf903b2d198f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
63 63
64 unsigned long nr_mapped; /* From page_state */ 64 unsigned long nr_mapped; /* From page_state */
65 65
66 /* How many pages shrink_cache() should reclaim */
67 int nr_to_reclaim;
68
69 /* Ask shrink_caches, or shrink_zone to scan at this priority */ 66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
70 unsigned int priority; 67 unsigned int priority;
71 68
@@ -74,9 +71,6 @@ struct scan_control {
74 71
75 int may_writepage; 72 int may_writepage;
76 73
77 /* Can pages be swapped as part of reclaim? */
78 int may_swap;
79
80 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 74 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
81 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 75 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
82 * In this context, it doesn't matter that we scan the 76 * In this context, it doesn't matter that we scan the
@@ -186,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker);
186 * 180 *
187 * Returns the number of slab objects which we shrunk. 181 * Returns the number of slab objects which we shrunk.
188 */ 182 */
189static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, 183int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
190 unsigned long lru_pages)
191{ 184{
192 struct shrinker *shrinker; 185 struct shrinker *shrinker;
193 int ret = 0; 186 int ret = 0;
@@ -275,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page)
275 268
276static int may_write_to_queue(struct backing_dev_info *bdi) 269static int may_write_to_queue(struct backing_dev_info *bdi)
277{ 270{
278 if (current_is_kswapd()) 271 if (current->flags & PF_SWAPWRITE)
279 return 1;
280 if (current_is_pdflush()) /* This is unlikely, but why not... */
281 return 1; 272 return 1;
282 if (!bdi_write_congested(bdi)) 273 if (!bdi_write_congested(bdi))
283 return 1; 274 return 1;
@@ -367,7 +358,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
367 res = mapping->a_ops->writepage(page, &wbc); 358 res = mapping->a_ops->writepage(page, &wbc);
368 if (res < 0) 359 if (res < 0)
369 handle_write_error(mapping, page, res); 360 handle_write_error(mapping, page, res);
370 if (res == WRITEPAGE_ACTIVATE) { 361 if (res == AOP_WRITEPAGE_ACTIVATE) {
371 ClearPageReclaim(page); 362 ClearPageReclaim(page);
372 return PAGE_ACTIVATE; 363 return PAGE_ACTIVATE;
373 } 364 }
@@ -382,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
382 return PAGE_CLEAN; 373 return PAGE_CLEAN;
383} 374}
384 375
376static int remove_mapping(struct address_space *mapping, struct page *page)
377{
378 if (!mapping)
379 return 0; /* truncate got there first */
380
381 write_lock_irq(&mapping->tree_lock);
382
383 /*
384 * The non-racy check for busy page. It is critical to check
385 * PageDirty _after_ making sure that the page is freeable and
386 * not in use by anybody. (pagecache + us == 2)
387 */
388 if (unlikely(page_count(page) != 2))
389 goto cannot_free;
390 smp_rmb();
391 if (unlikely(PageDirty(page)))
392 goto cannot_free;
393
394 if (PageSwapCache(page)) {
395 swp_entry_t swap = { .val = page_private(page) };
396 __delete_from_swap_cache(page);
397 write_unlock_irq(&mapping->tree_lock);
398 swap_free(swap);
399 __put_page(page); /* The pagecache ref */
400 return 1;
401 }
402
403 __remove_from_page_cache(page);
404 write_unlock_irq(&mapping->tree_lock);
405 __put_page(page);
406 return 1;
407
408cannot_free:
409 write_unlock_irq(&mapping->tree_lock);
410 return 0;
411}
412
385/* 413/*
386 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 414 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
387 */ 415 */
@@ -430,9 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
430 * Try to allocate it some swap space here. 458 * Try to allocate it some swap space here.
431 */ 459 */
432 if (PageAnon(page) && !PageSwapCache(page)) { 460 if (PageAnon(page) && !PageSwapCache(page)) {
433 if (!sc->may_swap) 461 if (!add_to_swap(page, GFP_ATOMIC))
434 goto keep_locked;
435 if (!add_to_swap(page))
436 goto activate_locked; 462 goto activate_locked;
437 } 463 }
438#endif /* CONFIG_SWAP */ 464#endif /* CONFIG_SWAP */
@@ -515,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
515 goto free_it; 541 goto free_it;
516 } 542 }
517 543
518 if (!mapping) 544 if (!remove_mapping(mapping, page))
519 goto keep_locked; /* truncate got there first */ 545 goto keep_locked;
520
521 write_lock_irq(&mapping->tree_lock);
522
523 /*
524 * The non-racy check for busy page. It is critical to check
525 * PageDirty _after_ making sure that the page is freeable and
526 * not in use by anybody. (pagecache + us == 2)
527 */
528 if (unlikely(page_count(page) != 2))
529 goto cannot_free;
530 smp_rmb();
531 if (unlikely(PageDirty(page)))
532 goto cannot_free;
533
534#ifdef CONFIG_SWAP
535 if (PageSwapCache(page)) {
536 swp_entry_t swap = { .val = page_private(page) };
537 __delete_from_swap_cache(page);
538 write_unlock_irq(&mapping->tree_lock);
539 swap_free(swap);
540 __put_page(page); /* The pagecache ref */
541 goto free_it;
542 }
543#endif /* CONFIG_SWAP */
544
545 __remove_from_page_cache(page);
546 write_unlock_irq(&mapping->tree_lock);
547 __put_page(page);
548 546
549free_it: 547free_it:
550 unlock_page(page); 548 unlock_page(page);
@@ -553,10 +551,6 @@ free_it:
553 __pagevec_release_nonlru(&freed_pvec); 551 __pagevec_release_nonlru(&freed_pvec);
554 continue; 552 continue;
555 553
556cannot_free:
557 write_unlock_irq(&mapping->tree_lock);
558 goto keep_locked;
559
560activate_locked: 554activate_locked:
561 SetPageActive(page); 555 SetPageActive(page);
562 pgactivate++; 556 pgactivate++;
@@ -574,6 +568,241 @@ keep:
574 return reclaimed; 568 return reclaimed;
575} 569}
576 570
571#ifdef CONFIG_MIGRATION
572static inline void move_to_lru(struct page *page)
573{
574 list_del(&page->lru);
575 if (PageActive(page)) {
576 /*
577 * lru_cache_add_active checks that
578 * the PG_active bit is off.
579 */
580 ClearPageActive(page);
581 lru_cache_add_active(page);
582 } else {
583 lru_cache_add(page);
584 }
585 put_page(page);
586}
587
588/*
589 * Add isolated pages on the list back to the LRU
590 *
591 * returns the number of pages put back.
592 */
593int putback_lru_pages(struct list_head *l)
594{
595 struct page *page;
596 struct page *page2;
597 int count = 0;
598
599 list_for_each_entry_safe(page, page2, l, lru) {
600 move_to_lru(page);
601 count++;
602 }
603 return count;
604}
605
606/*
607 * swapout a single page
608 * page is locked upon entry, unlocked on exit
609 */
610static int swap_page(struct page *page)
611{
612 struct address_space *mapping = page_mapping(page);
613
614 if (page_mapped(page) && mapping)
615 if (try_to_unmap(page) != SWAP_SUCCESS)
616 goto unlock_retry;
617
618 if (PageDirty(page)) {
619 /* Page is dirty, try to write it out here */
620 switch(pageout(page, mapping)) {
621 case PAGE_KEEP:
622 case PAGE_ACTIVATE:
623 goto unlock_retry;
624
625 case PAGE_SUCCESS:
626 goto retry;
627
628 case PAGE_CLEAN:
629 ; /* try to free the page below */
630 }
631 }
632
633 if (PagePrivate(page)) {
634 if (!try_to_release_page(page, GFP_KERNEL) ||
635 (!mapping && page_count(page) == 1))
636 goto unlock_retry;
637 }
638
639 if (remove_mapping(mapping, page)) {
640 /* Success */
641 unlock_page(page);
642 return 0;
643 }
644
645unlock_retry:
646 unlock_page(page);
647
648retry:
649 return -EAGAIN;
650}
651/*
652 * migrate_pages
653 *
654 * Two lists are passed to this function. The first list
655 * contains the pages isolated from the LRU to be migrated.
656 * The second list contains new pages that the isolated pages
657 * can be moved to. If the second list is NULL then all
658 * pages are swapped out.
659 *
660 * The function returns after 10 attempts or if no pages
661 * are movable anymore because the list has become empty
662 * or no retryable pages exist anymore.
663 *
664 * SIMPLIFIED VERSION: This implementation of migrate_pages
665 * is only swapping out pages and never touches the second
666 * list. The direct migration patchset
667 * extends this function to avoid the use of swap.
668 *
669 * Return: Number of pages not migrated when "to" ran empty.
670 */
671int migrate_pages(struct list_head *from, struct list_head *to,
672 struct list_head *moved, struct list_head *failed)
673{
674 int retry;
675 int nr_failed = 0;
676 int pass = 0;
677 struct page *page;
678 struct page *page2;
679 int swapwrite = current->flags & PF_SWAPWRITE;
680 int rc;
681
682 if (!swapwrite)
683 current->flags |= PF_SWAPWRITE;
684
685redo:
686 retry = 0;
687
688 list_for_each_entry_safe(page, page2, from, lru) {
689 cond_resched();
690
691 rc = 0;
692 if (page_count(page) == 1)
693 /* page was freed from under us. So we are done. */
694 goto next;
695
696 /*
697 * Skip locked pages during the first three passes to give the
698 * functions holding the lock time to release the page. Later we
699 * use lock_page() to have a higher chance of acquiring the
700 * lock.
701 */
702 rc = -EAGAIN;
703 if (pass > 2)
704 lock_page(page);
705 else
706 if (TestSetPageLocked(page))
707 goto next;
708
709 /*
710 * Only wait on writeback if we have already done a pass where
711 * we may have triggered writeouts for lots of pages.
712 */
713 if (pass > 0) {
714 wait_on_page_writeback(page);
715 } else {
716 if (PageWriteback(page))
717 goto unlock_page;
718 }
719
720 /*
721 * Anonymous pages must have swap cache references otherwise
722 * the information contained in the page maps cannot be
723 * preserved.
724 */
725 if (PageAnon(page) && !PageSwapCache(page)) {
726 if (!add_to_swap(page, GFP_KERNEL)) {
727 rc = -ENOMEM;
728 goto unlock_page;
729 }
730 }
731
732 /*
733 * Page is properly locked and writeback is complete.
734 * Try to migrate the page.
735 */
736 rc = swap_page(page);
737 goto next;
738
739unlock_page:
740 unlock_page(page);
741
742next:
743 if (rc == -EAGAIN) {
744 retry++;
745 } else if (rc) {
746 /* Permanent failure */
747 list_move(&page->lru, failed);
748 nr_failed++;
749 } else {
750 /* Success */
751 list_move(&page->lru, moved);
752 }
753 }
754 if (retry && pass++ < 10)
755 goto redo;
756
757 if (!swapwrite)
758 current->flags &= ~PF_SWAPWRITE;
759
760 return nr_failed + retry;
761}
762
763static void lru_add_drain_per_cpu(void *dummy)
764{
765 lru_add_drain();
766}
767
768/*
769 * Isolate one page from the LRU lists and put it on the
770 * indicated list. Do necessary cache draining if the
771 * page is not on the LRU lists yet.
772 *
773 * Result:
774 * 0 = page not on LRU list
775 * 1 = page removed from LRU list and added to the specified list.
776 * -ENOENT = page is being freed elsewhere.
777 */
778int isolate_lru_page(struct page *page)
779{
780 int rc = 0;
781 struct zone *zone = page_zone(page);
782
783redo:
784 spin_lock_irq(&zone->lru_lock);
785 rc = __isolate_lru_page(page);
786 if (rc == 1) {
787 if (PageActive(page))
788 del_page_from_active_list(zone, page);
789 else
790 del_page_from_inactive_list(zone, page);
791 }
792 spin_unlock_irq(&zone->lru_lock);
793 if (rc == 0) {
794 /*
795 * Maybe this page is still waiting for a cpu to drain it
796 * from one of the lru lists?
797 */
798 rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
799 if (rc == 0 && PageLRU(page))
800 goto redo;
801 }
802 return rc;
803}
804#endif
805
577/* 806/*
578 * zone->lru_lock is heavily contended. Some of the functions that 807 * zone->lru_lock is heavily contended. Some of the functions that
579 * shrink the lists perform better by taking out a batch of pages 808 * shrink the lists perform better by taking out a batch of pages
@@ -602,20 +831,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
602 page = lru_to_page(src); 831 page = lru_to_page(src);
603 prefetchw_prev_lru_page(page, src, flags); 832 prefetchw_prev_lru_page(page, src, flags);
604 833
605 if (!TestClearPageLRU(page)) 834 switch (__isolate_lru_page(page)) {
606 BUG(); 835 case 1:
607 list_del(&page->lru); 836 /* Succeeded to isolate page */
608 if (get_page_testone(page)) { 837 list_move(&page->lru, dst);
609 /*
610 * It is being freed elsewhere
611 */
612 __put_page(page);
613 SetPageLRU(page);
614 list_add(&page->lru, src);
615 continue;
616 } else {
617 list_add(&page->lru, dst);
618 nr_taken++; 838 nr_taken++;
839 break;
840 case -ENOENT:
841 /* Not possible to isolate */
842 list_move(&page->lru, src);
843 break;
844 default:
845 BUG();
619 } 846 }
620 } 847 }
621 848
@@ -653,17 +880,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
653 goto done; 880 goto done;
654 881
655 max_scan -= nr_scan; 882 max_scan -= nr_scan;
656 if (current_is_kswapd())
657 mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
658 else
659 mod_page_state_zone(zone, pgscan_direct, nr_scan);
660 nr_freed = shrink_list(&page_list, sc); 883 nr_freed = shrink_list(&page_list, sc);
661 if (current_is_kswapd())
662 mod_page_state(kswapd_steal, nr_freed);
663 mod_page_state_zone(zone, pgsteal, nr_freed);
664 sc->nr_to_reclaim -= nr_freed;
665 884
666 spin_lock_irq(&zone->lru_lock); 885 local_irq_disable();
886 if (current_is_kswapd()) {
887 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
888 __mod_page_state(kswapd_steal, nr_freed);
889 } else
890 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
891 __mod_page_state_zone(zone, pgsteal, nr_freed);
892
893 spin_lock(&zone->lru_lock);
667 /* 894 /*
668 * Put back any unfreeable pages. 895 * Put back any unfreeable pages.
669 */ 896 */
@@ -825,11 +1052,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
825 } 1052 }
826 } 1053 }
827 zone->nr_active += pgmoved; 1054 zone->nr_active += pgmoved;
828 spin_unlock_irq(&zone->lru_lock); 1055 spin_unlock(&zone->lru_lock);
829 pagevec_release(&pvec);
830 1056
831 mod_page_state_zone(zone, pgrefill, pgscanned); 1057 __mod_page_state_zone(zone, pgrefill, pgscanned);
832 mod_page_state(pgdeactivate, pgdeactivate); 1058 __mod_page_state(pgdeactivate, pgdeactivate);
1059 local_irq_enable();
1060
1061 pagevec_release(&pvec);
833} 1062}
834 1063
835/* 1064/*
@@ -861,8 +1090,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
861 else 1090 else
862 nr_inactive = 0; 1091 nr_inactive = 0;
863 1092
864 sc->nr_to_reclaim = sc->swap_cluster_max;
865
866 while (nr_active || nr_inactive) { 1093 while (nr_active || nr_inactive) {
867 if (nr_active) { 1094 if (nr_active) {
868 sc->nr_to_scan = min(nr_active, 1095 sc->nr_to_scan = min(nr_active,
@@ -876,8 +1103,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
876 (unsigned long)sc->swap_cluster_max); 1103 (unsigned long)sc->swap_cluster_max);
877 nr_inactive -= sc->nr_to_scan; 1104 nr_inactive -= sc->nr_to_scan;
878 shrink_cache(zone, sc); 1105 shrink_cache(zone, sc);
879 if (sc->nr_to_reclaim <= 0)
880 break;
881 } 1106 }
882 } 1107 }
883 1108
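
With sc->nr_to_reclaim gone, shrink_zone() now works both counts down to zero in swap_cluster_max-sized chunks instead of bailing out early once enough pages were reclaimed. A runnable sketch of that loop shape, with made-up counts:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long nr_active = 70, nr_inactive = 45;

	while (nr_active || nr_inactive) {
		if (nr_active) {
			unsigned long chunk = min_ul(nr_active, SWAP_CLUSTER_MAX);
			nr_active -= chunk;
			printf("refill_inactive_zone: scan %lu active\n", chunk);
		}
		if (nr_inactive) {
			unsigned long chunk = min_ul(nr_inactive, SWAP_CLUSTER_MAX);
			nr_inactive -= chunk;
			printf("shrink_cache: scan %lu inactive\n", chunk);
		}
	}
	return 0;
}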
@@ -910,7 +1135,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
910 for (i = 0; zones[i] != NULL; i++) { 1135 for (i = 0; zones[i] != NULL; i++) {
911 struct zone *zone = zones[i]; 1136 struct zone *zone = zones[i];
912 1137
913 if (zone->present_pages == 0) 1138 if (!populated_zone(zone))
914 continue; 1139 continue;
915 1140
916 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1141 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
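
This hunk, like the later ones in balance_pgdat() and wakeup_kswapd(), replaces the open-coded zone->present_pages == 0 test with the populated_zone() helper. A minimal sketch of such an accessor over a stand-in struct (not the kernel's struct zone):

#include <stdbool.h>
#include <stdio.h>

struct zone_stub { unsigned long present_pages; };

/* Accessor wrapping the raw field test, in the spirit of populated_zone(). */
static inline bool populated_zone(const struct zone_stub *zone)
{
	return zone->present_pages != 0;
}

int main(void)
{
	struct zone_stub empty = { 0 }, normal = { 4096 };

	if (!populated_zone(&empty))
		puts("skipping empty zone");
	if (populated_zone(&normal))
		puts("scanning populated zone");
	return 0;
}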
@@ -952,7 +1177,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
952 1177
953 sc.gfp_mask = gfp_mask; 1178 sc.gfp_mask = gfp_mask;
954 sc.may_writepage = 0; 1179 sc.may_writepage = 0;
955 sc.may_swap = 1;
956 1180
957 inc_page_state(allocstall); 1181 inc_page_state(allocstall);
958 1182
@@ -1055,7 +1279,6 @@ loop_again:
1055 total_reclaimed = 0; 1279 total_reclaimed = 0;
1056 sc.gfp_mask = GFP_KERNEL; 1280 sc.gfp_mask = GFP_KERNEL;
1057 sc.may_writepage = 0; 1281 sc.may_writepage = 0;
1058 sc.may_swap = 1;
1059 sc.nr_mapped = read_page_state(nr_mapped); 1282 sc.nr_mapped = read_page_state(nr_mapped);
1060 1283
1061 inc_page_state(pageoutrun); 1284 inc_page_state(pageoutrun);
@@ -1084,7 +1307,7 @@ loop_again:
1084 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1307 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1085 struct zone *zone = pgdat->node_zones + i; 1308 struct zone *zone = pgdat->node_zones + i;
1086 1309
1087 if (zone->present_pages == 0) 1310 if (!populated_zone(zone))
1088 continue; 1311 continue;
1089 1312
1090 if (zone->all_unreclaimable && 1313 if (zone->all_unreclaimable &&
@@ -1121,7 +1344,7 @@ scan:
1121 struct zone *zone = pgdat->node_zones + i; 1344 struct zone *zone = pgdat->node_zones + i;
1122 int nr_slab; 1345 int nr_slab;
1123 1346
1124 if (zone->present_pages == 0) 1347 if (!populated_zone(zone))
1125 continue; 1348 continue;
1126 1349
1127 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1350 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1238,7 +1461,7 @@ static int kswapd(void *p)
1238 * us from recursively trying to free more memory as we're 1461 * us from recursively trying to free more memory as we're
1239 * trying to free the first piece of memory in the first place). 1462 * trying to free the first piece of memory in the first place).
1240 */ 1463 */
1241 tsk->flags |= PF_MEMALLOC|PF_KSWAPD; 1464 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1242 1465
1243 order = 0; 1466 order = 0;
1244 for ( ; ; ) { 1467 for ( ; ; ) {
@@ -1273,7 +1496,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1273{ 1496{
1274 pg_data_t *pgdat; 1497 pg_data_t *pgdat;
1275 1498
1276 if (zone->present_pages == 0) 1499 if (!populated_zone(zone))
1277 return; 1500 return;
1278 1501
1279 pgdat = zone->zone_pgdat; 1502 pgdat = zone->zone_pgdat;
@@ -1353,76 +1576,3 @@ static int __init kswapd_init(void)
1353} 1576}
1354 1577
1355module_init(kswapd_init) 1578module_init(kswapd_init)
1356
1357
1358/*
1359 * Try to free up some pages from this zone through reclaim.
1360 */
1361int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1362{
1363 struct scan_control sc;
1364 int nr_pages = 1 << order;
1365 int total_reclaimed = 0;
1366
1367 /* The reclaim may sleep, so don't do it if sleep isn't allowed */
1368 if (!(gfp_mask & __GFP_WAIT))
1369 return 0;
1370 if (zone->all_unreclaimable)
1371 return 0;
1372
1373 sc.gfp_mask = gfp_mask;
1374 sc.may_writepage = 0;
1375 sc.may_swap = 0;
1376 sc.nr_mapped = read_page_state(nr_mapped);
1377 sc.nr_scanned = 0;
1378 sc.nr_reclaimed = 0;
1379 /* scan at the highest priority */
1380 sc.priority = 0;
1381 disable_swap_token();
1382
1383 if (nr_pages > SWAP_CLUSTER_MAX)
1384 sc.swap_cluster_max = nr_pages;
1385 else
1386 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1387
1388 /* Don't reclaim the zone if there are other reclaimers active */
1389 if (atomic_read(&zone->reclaim_in_progress) > 0)
1390 goto out;
1391
1392 shrink_zone(zone, &sc);
1393 total_reclaimed = sc.nr_reclaimed;
1394
1395 out:
1396 return total_reclaimed;
1397}
1398
1399asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1400 unsigned int state)
1401{
1402 struct zone *z;
1403 int i;
1404
1405 if (!capable(CAP_SYS_ADMIN))
1406 return -EACCES;
1407
1408 if (node >= MAX_NUMNODES || !node_online(node))
1409 return -EINVAL;
1410
1411 /* This will break if we ever add more zones */
1412 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1413 return -EINVAL;
1414
1415 for (i = 0; i < MAX_NR_ZONES; i++) {
1416 if (!(zone & 1<<i))
1417 continue;
1418
1419 z = &NODE_DATA(node)->node_zones[i];
1420
1421 if (state)
1422 z->reclaim_pages = 1;
1423 else
1424 z->reclaim_pages = 0;
1425 }
1426
1427 return 0;
1428}
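
The removed zone_reclaim() above guarded against piling up reclaimers by checking zone->reclaim_in_progress before scanning. A small userspace sketch of that guard, using C11 atomics as a stand-in for the kernel's atomic_t; the counter handling here is simplified, since in the kernel shrink_zone() maintains it.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int reclaim_in_progress;

static int zone_reclaim_mock(void)
{
	int reclaimed = 0;

	/* Bail out if another reclaimer is already active on this zone. */
	if (atomic_load(&reclaim_in_progress) > 0)
		return 0;

	atomic_fetch_add(&reclaim_in_progress, 1);
	reclaimed = 32;			/* pretend shrink_zone() freed pages */
	atomic_fetch_sub(&reclaim_in_progress, 1);

	return reclaimed;
}

int main(void)
{
	printf("reclaimed %d pages\n", zone_reclaim_mock());
	return 0;
}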