diff options
Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 9 | ||||
| -rw-r--r-- | mm/Makefile | 6 | ||||
| -rw-r--r-- | mm/bootmem.c | 58 | ||||
| -rw-r--r-- | mm/fadvise.c | 5 | ||||
| -rw-r--r-- | mm/filemap.c | 159 | ||||
| -rw-r--r-- | mm/filemap_xip.c | 8 | ||||
| -rw-r--r-- | mm/hugetlb.c | 194 | ||||
| -rw-r--r-- | mm/internal.h | 21 | ||||
| -rw-r--r-- | mm/madvise.c | 35 | ||||
| -rw-r--r-- | mm/memory.c | 34 | ||||
| -rw-r--r-- | mm/memory_hotplug.c | 1 | ||||
| -rw-r--r-- | mm/mempolicy.c | 669 | ||||
| -rw-r--r-- | mm/mlock.c | 1 | ||||
| -rw-r--r-- | mm/mmap.c | 1 | ||||
| -rw-r--r-- | mm/mremap.c | 1 | ||||
| -rw-r--r-- | mm/msync.c | 2 | ||||
| -rw-r--r-- | mm/nommu.c | 7 | ||||
| -rw-r--r-- | mm/oom_kill.c | 5 | ||||
| -rw-r--r-- | mm/page-writeback.c | 10 | ||||
| -rw-r--r-- | mm/page_alloc.c | 472 | ||||
| -rw-r--r-- | mm/pdflush.c | 2 | ||||
| -rw-r--r-- | mm/readahead.c | 15 | ||||
| -rw-r--r-- | mm/rmap.c | 72 | ||||
| -rw-r--r-- | mm/shmem.c | 42 | ||||
| -rw-r--r-- | mm/slab.c | 1140 | ||||
| -rw-r--r-- | mm/slob.c | 385 | ||||
| -rw-r--r-- | mm/sparse.c | 4 | ||||
| -rw-r--r-- | mm/swap.c | 29 | ||||
| -rw-r--r-- | mm/swap_state.c | 8 | ||||
| -rw-r--r-- | mm/swapfile.c | 43 | ||||
| -rw-r--r-- | mm/tiny-shmem.c | 29 | ||||
| -rw-r--r-- | mm/truncate.c | 45 | ||||
| -rw-r--r-- | mm/util.c | 39 | ||||
| -rw-r--r-- | mm/vmscan.c | 468 |
34 files changed, 2722 insertions, 1297 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 21eb51d4da8f..a9cb80ae6409 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
| @@ -11,7 +11,7 @@ choice | |||
| 11 | 11 | ||
| 12 | config FLATMEM_MANUAL | 12 | config FLATMEM_MANUAL |
| 13 | bool "Flat Memory" | 13 | bool "Flat Memory" |
| 14 | depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE | 14 | depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE |
| 15 | help | 15 | help |
| 16 | This option allows you to change some of the ways that | 16 | This option allows you to change some of the ways that |
| 17 | Linux manages its memory internally. Most users will | 17 | Linux manages its memory internally. Most users will |
| @@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS | |||
| 132 | default "4096" if ARM && !CPU_CACHE_VIPT | 132 | default "4096" if ARM && !CPU_CACHE_VIPT |
| 133 | default "4096" if PARISC && !PA20 | 133 | default "4096" if PARISC && !PA20 |
| 134 | default "4" | 134 | default "4" |
| 135 | |||
| 136 | # | ||
| 137 | # support for page migration | ||
| 138 | # | ||
| 139 | config MIGRATION | ||
| 140 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM | ||
| 141 | depends on SWAP | ||
diff --git a/mm/Makefile b/mm/Makefile index 2fa6d2ca9f28..9aa03fa1dcc3 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
| @@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
| 9 | 9 | ||
| 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
| 11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
| 12 | readahead.o slab.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
| 13 | prio_tree.o $(mmu-y) | 13 | prio_tree.o util.o $(mmu-y) |
| 14 | 14 | ||
| 15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
| 16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
| @@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o | |||
| 18 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 18 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
| 19 | obj-$(CONFIG_SHMEM) += shmem.o | 19 | obj-$(CONFIG_SHMEM) += shmem.o |
| 20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
| 21 | obj-$(CONFIG_SLOB) += slob.o | ||
| 22 | obj-$(CONFIG_SLAB) += slab.o | ||
| 21 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 22 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
diff --git a/mm/bootmem.c b/mm/bootmem.c index 16b9465eb4eb..35c32290f717 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
| @@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
| 296 | unsigned long v = ~map[i / BITS_PER_LONG]; | 296 | unsigned long v = ~map[i / BITS_PER_LONG]; |
| 297 | 297 | ||
| 298 | if (gofast && v == ~0UL) { | 298 | if (gofast && v == ~0UL) { |
| 299 | int j, order; | 299 | int order; |
| 300 | 300 | ||
| 301 | page = pfn_to_page(pfn); | 301 | page = pfn_to_page(pfn); |
| 302 | count += BITS_PER_LONG; | 302 | count += BITS_PER_LONG; |
| 303 | __ClearPageReserved(page); | ||
| 304 | order = ffs(BITS_PER_LONG) - 1; | 303 | order = ffs(BITS_PER_LONG) - 1; |
| 305 | set_page_refs(page, order); | 304 | __free_pages_bootmem(page, order); |
| 306 | for (j = 1; j < BITS_PER_LONG; j++) { | ||
| 307 | if (j + 16 < BITS_PER_LONG) | ||
| 308 | prefetchw(page + j + 16); | ||
| 309 | __ClearPageReserved(page + j); | ||
| 310 | set_page_count(page + j, 0); | ||
| 311 | } | ||
| 312 | __free_pages(page, order); | ||
| 313 | i += BITS_PER_LONG; | 305 | i += BITS_PER_LONG; |
| 314 | page += BITS_PER_LONG; | 306 | page += BITS_PER_LONG; |
| 315 | } else if (v) { | 307 | } else if (v) { |
| @@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
| 319 | for (m = 1; m && i < idx; m<<=1, page++, i++) { | 311 | for (m = 1; m && i < idx; m<<=1, page++, i++) { |
| 320 | if (v & m) { | 312 | if (v & m) { |
| 321 | count++; | 313 | count++; |
| 322 | __ClearPageReserved(page); | 314 | __free_pages_bootmem(page, 0); |
| 323 | set_page_refs(page, 0); | ||
| 324 | __free_page(page); | ||
| 325 | } | 315 | } |
| 326 | } | 316 | } |
| 327 | } else { | 317 | } else { |
| @@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | |||
| 339 | count = 0; | 329 | count = 0; |
| 340 | for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { | 330 | for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { |
| 341 | count++; | 331 | count++; |
| 342 | __ClearPageReserved(page); | 332 | __free_pages_bootmem(page, 0); |
| 343 | set_page_count(page, 1); | ||
| 344 | __free_page(page); | ||
| 345 | } | 333 | } |
| 346 | total += count; | 334 | total += count; |
| 347 | bdata->node_bootmem_map = NULL; | 335 | bdata->node_bootmem_map = NULL; |
| @@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void) | |||
| 393 | return(free_all_bootmem_core(NODE_DATA(0))); | 381 | return(free_all_bootmem_core(NODE_DATA(0))); |
| 394 | } | 382 | } |
| 395 | 383 | ||
| 396 | void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, | 384 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) |
| 397 | unsigned long limit) | ||
| 398 | { | 385 | { |
| 399 | pg_data_t *pgdat = pgdat_list; | 386 | pg_data_t *pgdat = pgdat_list; |
| 400 | void *ptr; | 387 | void *ptr; |
| 401 | 388 | ||
| 402 | for_each_pgdat(pgdat) | 389 | for_each_pgdat(pgdat) |
| 403 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, | 390 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, |
| 404 | align, goal, limit))) | 391 | align, goal, 0))) |
| 405 | return(ptr); | 392 | return(ptr); |
| 406 | 393 | ||
| 407 | /* | 394 | /* |
| @@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un | |||
| 413 | } | 400 | } |
| 414 | 401 | ||
| 415 | 402 | ||
| 416 | void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, | 403 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, |
| 417 | unsigned long goal, unsigned long limit) | 404 | unsigned long goal) |
| 418 | { | 405 | { |
| 419 | void *ptr; | 406 | void *ptr; |
| 420 | 407 | ||
| 421 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); | 408 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
| 422 | if (ptr) | 409 | if (ptr) |
| 423 | return (ptr); | 410 | return (ptr); |
| 424 | 411 | ||
| 425 | return __alloc_bootmem_limit(size, align, goal, limit); | 412 | return __alloc_bootmem(size, align, goal); |
| 413 | } | ||
| 414 | |||
| 415 | #define LOW32LIMIT 0xffffffff | ||
| 416 | |||
| 417 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) | ||
| 418 | { | ||
| 419 | pg_data_t *pgdat = pgdat_list; | ||
| 420 | void *ptr; | ||
| 421 | |||
| 422 | for_each_pgdat(pgdat) | ||
| 423 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, | ||
| 424 | align, goal, LOW32LIMIT))) | ||
| 425 | return(ptr); | ||
| 426 | |||
| 427 | /* | ||
| 428 | * Whoops, we cannot satisfy the allocation request. | ||
| 429 | */ | ||
| 430 | printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); | ||
| 431 | panic("Out of low memory"); | ||
| 432 | return NULL; | ||
| 426 | } | 433 | } |
| 427 | 434 | ||
| 435 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | ||
| 436 | unsigned long align, unsigned long goal) | ||
| 437 | { | ||
| 438 | return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT); | ||
| 439 | } | ||
diff --git a/mm/fadvise.c b/mm/fadvise.c index 5f19e87bc5af..d257c89e7704 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
| @@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
| 37 | if (!file) | 37 | if (!file) |
| 38 | return -EBADF; | 38 | return -EBADF; |
| 39 | 39 | ||
| 40 | if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { | ||
| 41 | ret = -ESPIPE; | ||
| 42 | goto out; | ||
| 43 | } | ||
| 44 | |||
| 40 | mapping = file->f_mapping; | 45 | mapping = file->f_mapping; |
| 41 | if (!mapping || len < 0) { | 46 | if (!mapping || len < 0) { |
| 42 | ret = -EINVAL; | 47 | ret = -EINVAL; |
diff --git a/mm/filemap.c b/mm/filemap.c index 33a28bfde158..a965b6b35f26 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/compiler.h> | 15 | #include <linux/compiler.h> |
| 16 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
| 17 | #include <linux/aio.h> | 17 | #include <linux/aio.h> |
| 18 | #include <linux/capability.h> | ||
| 18 | #include <linux/kernel_stat.h> | 19 | #include <linux/kernel_stat.h> |
| 19 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
| 20 | #include <linux/swap.h> | 21 | #include <linux/swap.h> |
| @@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
| 61 | * ->swap_lock (exclusive_swap_page, others) | 62 | * ->swap_lock (exclusive_swap_page, others) |
| 62 | * ->mapping->tree_lock | 63 | * ->mapping->tree_lock |
| 63 | * | 64 | * |
| 64 | * ->i_sem | 65 | * ->i_mutex |
| 65 | * ->i_mmap_lock (truncate->unmap_mapping_range) | 66 | * ->i_mmap_lock (truncate->unmap_mapping_range) |
| 66 | * | 67 | * |
| 67 | * ->mmap_sem | 68 | * ->mmap_sem |
| @@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
| 73 | * ->lock_page (access_process_vm) | 74 | * ->lock_page (access_process_vm) |
| 74 | * | 75 | * |
| 75 | * ->mmap_sem | 76 | * ->mmap_sem |
| 76 | * ->i_sem (msync) | 77 | * ->i_mutex (msync) |
| 77 | * | 78 | * |
| 78 | * ->i_sem | 79 | * ->i_mutex |
| 79 | * ->i_alloc_sem (various) | 80 | * ->i_alloc_sem (various) |
| 80 | * | 81 | * |
| 81 | * ->inode_lock | 82 | * ->inode_lock |
| @@ -276,11 +277,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping, | |||
| 276 | * integrity" operation. It waits upon in-flight writeout before starting and | 277 | * integrity" operation. It waits upon in-flight writeout before starting and |
| 277 | * waiting upon new writeout. If there was an IO error, return it. | 278 | * waiting upon new writeout. If there was an IO error, return it. |
| 278 | * | 279 | * |
| 279 | * We need to re-take i_sem during the generic_osync_inode list walk because | 280 | * We need to re-take i_mutex during the generic_osync_inode list walk because |
| 280 | * it is otherwise livelockable. | 281 | * it is otherwise livelockable. |
| 281 | */ | 282 | */ |
| 282 | int sync_page_range(struct inode *inode, struct address_space *mapping, | 283 | int sync_page_range(struct inode *inode, struct address_space *mapping, |
| 283 | loff_t pos, size_t count) | 284 | loff_t pos, loff_t count) |
| 284 | { | 285 | { |
| 285 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; | 286 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; |
| 286 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; | 287 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; |
| @@ -290,9 +291,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, | |||
| 290 | return 0; | 291 | return 0; |
| 291 | ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); | 292 | ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); |
| 292 | if (ret == 0) { | 293 | if (ret == 0) { |
| 293 | down(&inode->i_sem); | 294 | mutex_lock(&inode->i_mutex); |
| 294 | ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); | 295 | ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); |
| 295 | up(&inode->i_sem); | 296 | mutex_unlock(&inode->i_mutex); |
| 296 | } | 297 | } |
| 297 | if (ret == 0) | 298 | if (ret == 0) |
| 298 | ret = wait_on_page_writeback_range(mapping, start, end); | 299 | ret = wait_on_page_writeback_range(mapping, start, end); |
| @@ -301,13 +302,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, | |||
| 301 | EXPORT_SYMBOL(sync_page_range); | 302 | EXPORT_SYMBOL(sync_page_range); |
| 302 | 303 | ||
| 303 | /* | 304 | /* |
| 304 | * Note: Holding i_sem across sync_page_range_nolock is not a good idea | 305 | * Note: Holding i_mutex across sync_page_range_nolock is not a good idea |
| 305 | * as it forces O_SYNC writers to different parts of the same file | 306 | * as it forces O_SYNC writers to different parts of the same file |
| 306 | * to be serialised right until io completion. | 307 | * to be serialised right until io completion. |
| 307 | */ | 308 | */ |
| 308 | static int sync_page_range_nolock(struct inode *inode, | 309 | int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, |
| 309 | struct address_space *mapping, | 310 | loff_t pos, loff_t count) |
| 310 | loff_t pos, size_t count) | ||
| 311 | { | 311 | { |
| 312 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; | 312 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; |
| 313 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; | 313 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; |
| @@ -322,6 +322,7 @@ static int sync_page_range_nolock(struct inode *inode, | |||
| 322 | ret = wait_on_page_writeback_range(mapping, start, end); | 322 | ret = wait_on_page_writeback_range(mapping, start, end); |
| 323 | return ret; | 323 | return ret; |
| 324 | } | 324 | } |
| 325 | EXPORT_SYMBOL(sync_page_range_nolock); | ||
| 325 | 326 | ||
| 326 | /** | 327 | /** |
| 327 | * filemap_fdatawait - walk the list of under-writeback pages of the given | 328 | * filemap_fdatawait - walk the list of under-writeback pages of the given |
| @@ -343,30 +344,44 @@ EXPORT_SYMBOL(filemap_fdatawait); | |||
| 343 | 344 | ||
| 344 | int filemap_write_and_wait(struct address_space *mapping) | 345 | int filemap_write_and_wait(struct address_space *mapping) |
| 345 | { | 346 | { |
| 346 | int retval = 0; | 347 | int err = 0; |
| 347 | 348 | ||
| 348 | if (mapping->nrpages) { | 349 | if (mapping->nrpages) { |
| 349 | retval = filemap_fdatawrite(mapping); | 350 | err = filemap_fdatawrite(mapping); |
| 350 | if (retval == 0) | 351 | /* |
| 351 | retval = filemap_fdatawait(mapping); | 352 | * Even if the above returned error, the pages may be |
| 353 | * written partially (e.g. -ENOSPC), so we wait for it. | ||
| 354 | * But the -EIO is special case, it may indicate the worst | ||
| 355 | * thing (e.g. bug) happened, so we avoid waiting for it. | ||
| 356 | */ | ||
| 357 | if (err != -EIO) { | ||
| 358 | int err2 = filemap_fdatawait(mapping); | ||
| 359 | if (!err) | ||
| 360 | err = err2; | ||
| 361 | } | ||
| 352 | } | 362 | } |
| 353 | return retval; | 363 | return err; |
| 354 | } | 364 | } |
| 365 | EXPORT_SYMBOL(filemap_write_and_wait); | ||
| 355 | 366 | ||
| 356 | int filemap_write_and_wait_range(struct address_space *mapping, | 367 | int filemap_write_and_wait_range(struct address_space *mapping, |
| 357 | loff_t lstart, loff_t lend) | 368 | loff_t lstart, loff_t lend) |
| 358 | { | 369 | { |
| 359 | int retval = 0; | 370 | int err = 0; |
| 360 | 371 | ||
| 361 | if (mapping->nrpages) { | 372 | if (mapping->nrpages) { |
| 362 | retval = __filemap_fdatawrite_range(mapping, lstart, lend, | 373 | err = __filemap_fdatawrite_range(mapping, lstart, lend, |
| 363 | WB_SYNC_ALL); | 374 | WB_SYNC_ALL); |
| 364 | if (retval == 0) | 375 | /* See comment of filemap_write_and_wait() */ |
| 365 | retval = wait_on_page_writeback_range(mapping, | 376 | if (err != -EIO) { |
| 366 | lstart >> PAGE_CACHE_SHIFT, | 377 | int err2 = wait_on_page_writeback_range(mapping, |
| 367 | lend >> PAGE_CACHE_SHIFT); | 378 | lstart >> PAGE_CACHE_SHIFT, |
| 379 | lend >> PAGE_CACHE_SHIFT); | ||
| 380 | if (!err) | ||
| 381 | err = err2; | ||
| 382 | } | ||
| 368 | } | 383 | } |
| 369 | return retval; | 384 | return err; |
| 370 | } | 385 | } |
| 371 | 386 | ||
| 372 | /* | 387 | /* |
| @@ -555,11 +570,12 @@ repeat: | |||
| 555 | page_cache_get(page); | 570 | page_cache_get(page); |
| 556 | if (TestSetPageLocked(page)) { | 571 | if (TestSetPageLocked(page)) { |
| 557 | read_unlock_irq(&mapping->tree_lock); | 572 | read_unlock_irq(&mapping->tree_lock); |
| 558 | lock_page(page); | 573 | __lock_page(page); |
| 559 | read_lock_irq(&mapping->tree_lock); | 574 | read_lock_irq(&mapping->tree_lock); |
| 560 | 575 | ||
| 561 | /* Has the page been truncated while we slept? */ | 576 | /* Has the page been truncated while we slept? */ |
| 562 | if (page->mapping != mapping || page->index != offset) { | 577 | if (unlikely(page->mapping != mapping || |
| 578 | page->index != offset)) { | ||
| 563 | unlock_page(page); | 579 | unlock_page(page); |
| 564 | page_cache_release(page); | 580 | page_cache_release(page); |
| 565 | goto repeat; | 581 | goto repeat; |
| @@ -831,8 +847,13 @@ readpage: | |||
| 831 | /* Start the actual read. The read will unlock the page. */ | 847 | /* Start the actual read. The read will unlock the page. */ |
| 832 | error = mapping->a_ops->readpage(filp, page); | 848 | error = mapping->a_ops->readpage(filp, page); |
| 833 | 849 | ||
| 834 | if (unlikely(error)) | 850 | if (unlikely(error)) { |
| 851 | if (error == AOP_TRUNCATED_PAGE) { | ||
| 852 | page_cache_release(page); | ||
| 853 | goto find_page; | ||
| 854 | } | ||
| 835 | goto readpage_error; | 855 | goto readpage_error; |
| 856 | } | ||
| 836 | 857 | ||
| 837 | if (!PageUptodate(page)) { | 858 | if (!PageUptodate(page)) { |
| 838 | lock_page(page); | 859 | lock_page(page); |
| @@ -1152,26 +1173,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) | |||
| 1152 | { | 1173 | { |
| 1153 | struct address_space *mapping = file->f_mapping; | 1174 | struct address_space *mapping = file->f_mapping; |
| 1154 | struct page *page; | 1175 | struct page *page; |
| 1155 | int error; | 1176 | int ret; |
| 1156 | 1177 | ||
| 1157 | page = page_cache_alloc_cold(mapping); | 1178 | do { |
| 1158 | if (!page) | 1179 | page = page_cache_alloc_cold(mapping); |
| 1159 | return -ENOMEM; | 1180 | if (!page) |
| 1181 | return -ENOMEM; | ||
| 1182 | |||
| 1183 | ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | ||
| 1184 | if (ret == 0) | ||
| 1185 | ret = mapping->a_ops->readpage(file, page); | ||
| 1186 | else if (ret == -EEXIST) | ||
| 1187 | ret = 0; /* losing race to add is OK */ | ||
| 1160 | 1188 | ||
| 1161 | error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL); | ||
| 1162 | if (!error) { | ||
| 1163 | error = mapping->a_ops->readpage(file, page); | ||
| 1164 | page_cache_release(page); | 1189 | page_cache_release(page); |
| 1165 | return error; | ||
| 1166 | } | ||
| 1167 | 1190 | ||
| 1168 | /* | 1191 | } while (ret == AOP_TRUNCATED_PAGE); |
| 1169 | * We arrive here in the unlikely event that someone | 1192 | |
| 1170 | * raced with us and added our page to the cache first | 1193 | return ret; |
| 1171 | * or we are out of memory for radix-tree nodes. | ||
| 1172 | */ | ||
| 1173 | page_cache_release(page); | ||
| 1174 | return error == -EEXIST ? 0 : error; | ||
| 1175 | } | 1194 | } |
| 1176 | 1195 | ||
| 1177 | #define MMAP_LOTSAMISS (100) | 1196 | #define MMAP_LOTSAMISS (100) |
| @@ -1331,10 +1350,14 @@ page_not_uptodate: | |||
| 1331 | goto success; | 1350 | goto success; |
| 1332 | } | 1351 | } |
| 1333 | 1352 | ||
| 1334 | if (!mapping->a_ops->readpage(file, page)) { | 1353 | error = mapping->a_ops->readpage(file, page); |
| 1354 | if (!error) { | ||
| 1335 | wait_on_page_locked(page); | 1355 | wait_on_page_locked(page); |
| 1336 | if (PageUptodate(page)) | 1356 | if (PageUptodate(page)) |
| 1337 | goto success; | 1357 | goto success; |
| 1358 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
| 1359 | page_cache_release(page); | ||
| 1360 | goto retry_find; | ||
| 1338 | } | 1361 | } |
| 1339 | 1362 | ||
| 1340 | /* | 1363 | /* |
| @@ -1358,10 +1381,14 @@ page_not_uptodate: | |||
| 1358 | goto success; | 1381 | goto success; |
| 1359 | } | 1382 | } |
| 1360 | ClearPageError(page); | 1383 | ClearPageError(page); |
| 1361 | if (!mapping->a_ops->readpage(file, page)) { | 1384 | error = mapping->a_ops->readpage(file, page); |
| 1385 | if (!error) { | ||
| 1362 | wait_on_page_locked(page); | 1386 | wait_on_page_locked(page); |
| 1363 | if (PageUptodate(page)) | 1387 | if (PageUptodate(page)) |
| 1364 | goto success; | 1388 | goto success; |
| 1389 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
| 1390 | page_cache_release(page); | ||
| 1391 | goto retry_find; | ||
| 1365 | } | 1392 | } |
| 1366 | 1393 | ||
| 1367 | /* | 1394 | /* |
| @@ -1444,10 +1471,14 @@ page_not_uptodate: | |||
| 1444 | goto success; | 1471 | goto success; |
| 1445 | } | 1472 | } |
| 1446 | 1473 | ||
| 1447 | if (!mapping->a_ops->readpage(file, page)) { | 1474 | error = mapping->a_ops->readpage(file, page); |
| 1475 | if (!error) { | ||
| 1448 | wait_on_page_locked(page); | 1476 | wait_on_page_locked(page); |
| 1449 | if (PageUptodate(page)) | 1477 | if (PageUptodate(page)) |
| 1450 | goto success; | 1478 | goto success; |
| 1479 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
| 1480 | page_cache_release(page); | ||
| 1481 | goto retry_find; | ||
| 1451 | } | 1482 | } |
| 1452 | 1483 | ||
| 1453 | /* | 1484 | /* |
| @@ -1470,10 +1501,14 @@ page_not_uptodate: | |||
| 1470 | } | 1501 | } |
| 1471 | 1502 | ||
| 1472 | ClearPageError(page); | 1503 | ClearPageError(page); |
| 1473 | if (!mapping->a_ops->readpage(file, page)) { | 1504 | error = mapping->a_ops->readpage(file, page); |
| 1505 | if (!error) { | ||
| 1474 | wait_on_page_locked(page); | 1506 | wait_on_page_locked(page); |
| 1475 | if (PageUptodate(page)) | 1507 | if (PageUptodate(page)) |
| 1476 | goto success; | 1508 | goto success; |
| 1509 | } else if (error == AOP_TRUNCATED_PAGE) { | ||
| 1510 | page_cache_release(page); | ||
| 1511 | goto retry_find; | ||
| 1477 | } | 1512 | } |
| 1478 | 1513 | ||
| 1479 | /* | 1514 | /* |
| @@ -1858,7 +1893,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1858 | /* | 1893 | /* |
| 1859 | * Sync the fs metadata but not the minor inode changes and | 1894 | * Sync the fs metadata but not the minor inode changes and |
| 1860 | * of course not the data as we did direct DMA for the IO. | 1895 | * of course not the data as we did direct DMA for the IO. |
| 1861 | * i_sem is held, which protects generic_osync_inode() from | 1896 | * i_mutex is held, which protects generic_osync_inode() from |
| 1862 | * livelocking. | 1897 | * livelocking. |
| 1863 | */ | 1898 | */ |
| 1864 | if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 1899 | if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| @@ -1934,12 +1969,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1934 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 1969 | status = a_ops->prepare_write(file, page, offset, offset+bytes); |
| 1935 | if (unlikely(status)) { | 1970 | if (unlikely(status)) { |
| 1936 | loff_t isize = i_size_read(inode); | 1971 | loff_t isize = i_size_read(inode); |
| 1972 | |||
| 1973 | if (status != AOP_TRUNCATED_PAGE) | ||
| 1974 | unlock_page(page); | ||
| 1975 | page_cache_release(page); | ||
| 1976 | if (status == AOP_TRUNCATED_PAGE) | ||
| 1977 | continue; | ||
| 1937 | /* | 1978 | /* |
| 1938 | * prepare_write() may have instantiated a few blocks | 1979 | * prepare_write() may have instantiated a few blocks |
| 1939 | * outside i_size. Trim these off again. | 1980 | * outside i_size. Trim these off again. |
| 1940 | */ | 1981 | */ |
| 1941 | unlock_page(page); | ||
| 1942 | page_cache_release(page); | ||
| 1943 | if (pos + bytes > isize) | 1982 | if (pos + bytes > isize) |
| 1944 | vmtruncate(inode, isize); | 1983 | vmtruncate(inode, isize); |
| 1945 | break; | 1984 | break; |
| @@ -1952,6 +1991,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1952 | cur_iov, iov_base, bytes); | 1991 | cur_iov, iov_base, bytes); |
| 1953 | flush_dcache_page(page); | 1992 | flush_dcache_page(page); |
| 1954 | status = a_ops->commit_write(file, page, offset, offset+bytes); | 1993 | status = a_ops->commit_write(file, page, offset, offset+bytes); |
| 1994 | if (status == AOP_TRUNCATED_PAGE) { | ||
| 1995 | page_cache_release(page); | ||
| 1996 | continue; | ||
| 1997 | } | ||
| 1955 | if (likely(copied > 0)) { | 1998 | if (likely(copied > 0)) { |
| 1956 | if (!status) | 1999 | if (!status) |
| 1957 | status = copied; | 2000 | status = copied; |
| @@ -2066,7 +2109,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
| 2066 | if (err) | 2109 | if (err) |
| 2067 | goto out; | 2110 | goto out; |
| 2068 | 2111 | ||
| 2069 | inode_update_time(inode, 1); | 2112 | file_update_time(file); |
| 2070 | 2113 | ||
| 2071 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 2114 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
| 2072 | if (unlikely(file->f_flags & O_DIRECT)) { | 2115 | if (unlikely(file->f_flags & O_DIRECT)) { |
| @@ -2153,10 +2196,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf, | |||
| 2153 | 2196 | ||
| 2154 | BUG_ON(iocb->ki_pos != pos); | 2197 | BUG_ON(iocb->ki_pos != pos); |
| 2155 | 2198 | ||
| 2156 | down(&inode->i_sem); | 2199 | mutex_lock(&inode->i_mutex); |
| 2157 | ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, | 2200 | ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, |
| 2158 | &iocb->ki_pos); | 2201 | &iocb->ki_pos); |
| 2159 | up(&inode->i_sem); | 2202 | mutex_unlock(&inode->i_mutex); |
| 2160 | 2203 | ||
| 2161 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2204 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 2162 | ssize_t err; | 2205 | ssize_t err; |
| @@ -2178,9 +2221,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf, | |||
| 2178 | struct iovec local_iov = { .iov_base = (void __user *)buf, | 2221 | struct iovec local_iov = { .iov_base = (void __user *)buf, |
| 2179 | .iov_len = count }; | 2222 | .iov_len = count }; |
| 2180 | 2223 | ||
| 2181 | down(&inode->i_sem); | 2224 | mutex_lock(&inode->i_mutex); |
| 2182 | ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); | 2225 | ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); |
| 2183 | up(&inode->i_sem); | 2226 | mutex_unlock(&inode->i_mutex); |
| 2184 | 2227 | ||
| 2185 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2228 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 2186 | ssize_t err; | 2229 | ssize_t err; |
| @@ -2214,9 +2257,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, | |||
| 2214 | struct inode *inode = mapping->host; | 2257 | struct inode *inode = mapping->host; |
| 2215 | ssize_t ret; | 2258 | ssize_t ret; |
| 2216 | 2259 | ||
| 2217 | down(&inode->i_sem); | 2260 | mutex_lock(&inode->i_mutex); |
| 2218 | ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); | 2261 | ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); |
| 2219 | up(&inode->i_sem); | 2262 | mutex_unlock(&inode->i_mutex); |
| 2220 | 2263 | ||
| 2221 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2264 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 2222 | int err; | 2265 | int err; |
| @@ -2230,7 +2273,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov, | |||
| 2230 | EXPORT_SYMBOL(generic_file_writev); | 2273 | EXPORT_SYMBOL(generic_file_writev); |
| 2231 | 2274 | ||
| 2232 | /* | 2275 | /* |
| 2233 | * Called under i_sem for writes to S_ISREG files. Returns -EIO if something | 2276 | * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something |
| 2234 | * went wrong during pagecache shootdown. | 2277 | * went wrong during pagecache shootdown. |
| 2235 | */ | 2278 | */ |
| 2236 | static ssize_t | 2279 | static ssize_t |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 9cf687e4a29a..b960ac8e5918 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
| @@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 338 | *ppos = pos; | 338 | *ppos = pos; |
| 339 | /* | 339 | /* |
| 340 | * No need to use i_size_read() here, the i_size | 340 | * No need to use i_size_read() here, the i_size |
| 341 | * cannot change under us because we hold i_sem. | 341 | * cannot change under us because we hold i_mutex. |
| 342 | */ | 342 | */ |
| 343 | if (pos > inode->i_size) { | 343 | if (pos > inode->i_size) { |
| 344 | i_size_write(inode, pos); | 344 | i_size_write(inode, pos); |
| @@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
| 358 | loff_t pos; | 358 | loff_t pos; |
| 359 | ssize_t ret; | 359 | ssize_t ret; |
| 360 | 360 | ||
| 361 | down(&inode->i_sem); | 361 | mutex_lock(&inode->i_mutex); |
| 362 | 362 | ||
| 363 | if (!access_ok(VERIFY_READ, buf, len)) { | 363 | if (!access_ok(VERIFY_READ, buf, len)) { |
| 364 | ret=-EFAULT; | 364 | ret=-EFAULT; |
| @@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
| 383 | if (ret) | 383 | if (ret) |
| 384 | goto out_backing; | 384 | goto out_backing; |
| 385 | 385 | ||
| 386 | inode_update_time(inode, 1); | 386 | file_update_time(filp); |
| 387 | 387 | ||
| 388 | ret = __xip_file_write (filp, buf, count, pos, ppos); | 388 | ret = __xip_file_write (filp, buf, count, pos, ppos); |
| 389 | 389 | ||
| 390 | out_backing: | 390 | out_backing: |
| 391 | current->backing_dev_info = NULL; | 391 | current->backing_dev_info = NULL; |
| 392 | out_up: | 392 | out_up: |
| 393 | up(&inode->i_sem); | 393 | mutex_unlock(&inode->i_mutex); |
| 394 | return ret; | 394 | return ret; |
| 395 | } | 395 | } |
| 396 | EXPORT_SYMBOL_GPL(xip_file_write); | 396 | EXPORT_SYMBOL_GPL(xip_file_write); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3e52df7c471b..b21d78c941b5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -11,6 +11,9 @@ | |||
| 11 | #include <linux/highmem.h> | 11 | #include <linux/highmem.h> |
| 12 | #include <linux/nodemask.h> | 12 | #include <linux/nodemask.h> |
| 13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
| 14 | #include <linux/mempolicy.h> | ||
| 15 | #include <linux/cpuset.h> | ||
| 16 | |||
| 14 | #include <asm/page.h> | 17 | #include <asm/page.h> |
| 15 | #include <asm/pgtable.h> | 18 | #include <asm/pgtable.h> |
| 16 | 19 | ||
| @@ -36,18 +39,22 @@ static void enqueue_huge_page(struct page *page) | |||
| 36 | free_huge_pages_node[nid]++; | 39 | free_huge_pages_node[nid]++; |
| 37 | } | 40 | } |
| 38 | 41 | ||
| 39 | static struct page *dequeue_huge_page(void) | 42 | static struct page *dequeue_huge_page(struct vm_area_struct *vma, |
| 43 | unsigned long address) | ||
| 40 | { | 44 | { |
| 41 | int nid = numa_node_id(); | 45 | int nid = numa_node_id(); |
| 42 | struct page *page = NULL; | 46 | struct page *page = NULL; |
| 47 | struct zonelist *zonelist = huge_zonelist(vma, address); | ||
| 48 | struct zone **z; | ||
| 43 | 49 | ||
| 44 | if (list_empty(&hugepage_freelists[nid])) { | 50 | for (z = zonelist->zones; *z; z++) { |
| 45 | for (nid = 0; nid < MAX_NUMNODES; ++nid) | 51 | nid = (*z)->zone_pgdat->node_id; |
| 46 | if (!list_empty(&hugepage_freelists[nid])) | 52 | if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && |
| 47 | break; | 53 | !list_empty(&hugepage_freelists[nid])) |
| 54 | break; | ||
| 48 | } | 55 | } |
| 49 | if (nid >= 0 && nid < MAX_NUMNODES && | 56 | |
| 50 | !list_empty(&hugepage_freelists[nid])) { | 57 | if (*z) { |
| 51 | page = list_entry(hugepage_freelists[nid].next, | 58 | page = list_entry(hugepage_freelists[nid].next, |
| 52 | struct page, lru); | 59 | struct page, lru); |
| 53 | list_del(&page->lru); | 60 | list_del(&page->lru); |
| @@ -85,13 +92,13 @@ void free_huge_page(struct page *page) | |||
| 85 | spin_unlock(&hugetlb_lock); | 92 | spin_unlock(&hugetlb_lock); |
| 86 | } | 93 | } |
| 87 | 94 | ||
| 88 | struct page *alloc_huge_page(void) | 95 | struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) |
| 89 | { | 96 | { |
| 90 | struct page *page; | 97 | struct page *page; |
| 91 | int i; | 98 | int i; |
| 92 | 99 | ||
| 93 | spin_lock(&hugetlb_lock); | 100 | spin_lock(&hugetlb_lock); |
| 94 | page = dequeue_huge_page(); | 101 | page = dequeue_huge_page(vma, addr); |
| 95 | if (!page) { | 102 | if (!page) { |
| 96 | spin_unlock(&hugetlb_lock); | 103 | spin_unlock(&hugetlb_lock); |
| 97 | return NULL; | 104 | return NULL; |
| @@ -194,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
| 194 | spin_lock(&hugetlb_lock); | 201 | spin_lock(&hugetlb_lock); |
| 195 | try_to_free_low(count); | 202 | try_to_free_low(count); |
| 196 | while (count < nr_huge_pages) { | 203 | while (count < nr_huge_pages) { |
| 197 | struct page *page = dequeue_huge_page(); | 204 | struct page *page = dequeue_huge_page(NULL, 0); |
| 198 | if (!page) | 205 | if (!page) |
| 199 | break; | 206 | break; |
| 200 | update_and_free_page(page); | 207 | update_and_free_page(page); |
| @@ -261,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = { | |||
| 261 | .nopage = hugetlb_nopage, | 268 | .nopage = hugetlb_nopage, |
| 262 | }; | 269 | }; |
| 263 | 270 | ||
| 264 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) | 271 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, |
| 272 | int writable) | ||
| 265 | { | 273 | { |
| 266 | pte_t entry; | 274 | pte_t entry; |
| 267 | 275 | ||
| 268 | if (vma->vm_flags & VM_WRITE) { | 276 | if (writable) { |
| 269 | entry = | 277 | entry = |
| 270 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); | 278 | pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); |
| 271 | } else { | 279 | } else { |
| @@ -277,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) | |||
| 277 | return entry; | 285 | return entry; |
| 278 | } | 286 | } |
| 279 | 287 | ||
| 288 | static void set_huge_ptep_writable(struct vm_area_struct *vma, | ||
| 289 | unsigned long address, pte_t *ptep) | ||
| 290 | { | ||
| 291 | pte_t entry; | ||
| 292 | |||
| 293 | entry = pte_mkwrite(pte_mkdirty(*ptep)); | ||
| 294 | ptep_set_access_flags(vma, address, ptep, entry, 1); | ||
| 295 | update_mmu_cache(vma, address, entry); | ||
| 296 | lazy_mmu_prot_update(entry); | ||
| 297 | } | ||
| 298 | |||
| 299 | |||
| 280 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | 300 | int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, |
| 281 | struct vm_area_struct *vma) | 301 | struct vm_area_struct *vma) |
| 282 | { | 302 | { |
| 283 | pte_t *src_pte, *dst_pte, entry; | 303 | pte_t *src_pte, *dst_pte, entry; |
| 284 | struct page *ptepage; | 304 | struct page *ptepage; |
| 285 | unsigned long addr; | 305 | unsigned long addr; |
| 306 | int cow; | ||
| 307 | |||
| 308 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | ||
| 286 | 309 | ||
| 287 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 310 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { |
| 288 | src_pte = huge_pte_offset(src, addr); | 311 | src_pte = huge_pte_offset(src, addr); |
| @@ -294,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
| 294 | spin_lock(&dst->page_table_lock); | 317 | spin_lock(&dst->page_table_lock); |
| 295 | spin_lock(&src->page_table_lock); | 318 | spin_lock(&src->page_table_lock); |
| 296 | if (!pte_none(*src_pte)) { | 319 | if (!pte_none(*src_pte)) { |
| 320 | if (cow) | ||
| 321 | ptep_set_wrprotect(src, addr, src_pte); | ||
| 297 | entry = *src_pte; | 322 | entry = *src_pte; |
| 298 | ptepage = pte_page(entry); | 323 | ptepage = pte_page(entry); |
| 299 | get_page(ptepage); | 324 | get_page(ptepage); |
| @@ -345,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
| 345 | flush_tlb_range(vma, start, end); | 370 | flush_tlb_range(vma, start, end); |
| 346 | } | 371 | } |
| 347 | 372 | ||
| 348 | static struct page *find_lock_huge_page(struct address_space *mapping, | 373 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
| 349 | unsigned long idx) | 374 | unsigned long address, pte_t *ptep, pte_t pte) |
| 350 | { | 375 | { |
| 351 | struct page *page; | 376 | struct page *old_page, *new_page; |
| 352 | int err; | 377 | int i, avoidcopy; |
| 353 | struct inode *inode = mapping->host; | ||
| 354 | unsigned long size; | ||
| 355 | 378 | ||
| 356 | retry: | 379 | old_page = pte_page(pte); |
| 357 | page = find_lock_page(mapping, idx); | ||
| 358 | if (page) | ||
| 359 | goto out; | ||
| 360 | 380 | ||
| 361 | /* Check to make sure the mapping hasn't been truncated */ | 381 | /* If no-one else is actually using this page, avoid the copy |
| 362 | size = i_size_read(inode) >> HPAGE_SHIFT; | 382 | * and just make the page writable */ |
| 363 | if (idx >= size) | 383 | avoidcopy = (page_count(old_page) == 1); |
| 364 | goto out; | 384 | if (avoidcopy) { |
| 385 | set_huge_ptep_writable(vma, address, ptep); | ||
| 386 | return VM_FAULT_MINOR; | ||
| 387 | } | ||
| 365 | 388 | ||
| 366 | if (hugetlb_get_quota(mapping)) | 389 | page_cache_get(old_page); |
| 367 | goto out; | 390 | new_page = alloc_huge_page(vma, address); |
| 368 | page = alloc_huge_page(); | 391 | |
| 369 | if (!page) { | 392 | if (!new_page) { |
| 370 | hugetlb_put_quota(mapping); | 393 | page_cache_release(old_page); |
| 371 | goto out; | 394 | |
| 395 | /* Logically this is OOM, not a SIGBUS, but an OOM | ||
| 396 | * could cause the kernel to go killing other | ||
| 397 | * processes which won't help the hugepage situation | ||
| 398 | * at all (?) */ | ||
| 399 | return VM_FAULT_SIGBUS; | ||
| 372 | } | 400 | } |
| 373 | 401 | ||
| 374 | err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); | 402 | spin_unlock(&mm->page_table_lock); |
| 375 | if (err) { | 403 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) |
| 376 | put_page(page); | 404 | copy_user_highpage(new_page + i, old_page + i, |
| 377 | hugetlb_put_quota(mapping); | 405 | address + i*PAGE_SIZE); |
| 378 | if (err == -EEXIST) | 406 | spin_lock(&mm->page_table_lock); |
| 379 | goto retry; | 407 | |
| 380 | page = NULL; | 408 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); |
| 409 | if (likely(pte_same(*ptep, pte))) { | ||
| 410 | /* Break COW */ | ||
| 411 | set_huge_pte_at(mm, address, ptep, | ||
| 412 | make_huge_pte(vma, new_page, 1)); | ||
| 413 | /* Make the old page be freed below */ | ||
| 414 | new_page = old_page; | ||
| 381 | } | 415 | } |
| 382 | out: | 416 | page_cache_release(new_page); |
| 383 | return page; | 417 | page_cache_release(old_page); |
| 418 | return VM_FAULT_MINOR; | ||
| 384 | } | 419 | } |
| 385 | 420 | ||
| 386 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 421 | int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 387 | unsigned long address, int write_access) | 422 | unsigned long address, pte_t *ptep, int write_access) |
| 388 | { | 423 | { |
| 389 | int ret = VM_FAULT_SIGBUS; | 424 | int ret = VM_FAULT_SIGBUS; |
| 390 | unsigned long idx; | 425 | unsigned long idx; |
| 391 | unsigned long size; | 426 | unsigned long size; |
| 392 | pte_t *pte; | ||
| 393 | struct page *page; | 427 | struct page *page; |
| 394 | struct address_space *mapping; | 428 | struct address_space *mapping; |
| 395 | 429 | pte_t new_pte; | |
| 396 | pte = huge_pte_alloc(mm, address); | ||
| 397 | if (!pte) | ||
| 398 | goto out; | ||
| 399 | 430 | ||
| 400 | mapping = vma->vm_file->f_mapping; | 431 | mapping = vma->vm_file->f_mapping; |
| 401 | idx = ((address - vma->vm_start) >> HPAGE_SHIFT) | 432 | idx = ((address - vma->vm_start) >> HPAGE_SHIFT) |
| @@ -405,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 405 | * Use page lock to guard against racing truncation | 436 | * Use page lock to guard against racing truncation |
| 406 | * before we get page_table_lock. | 437 | * before we get page_table_lock. |
| 407 | */ | 438 | */ |
| 408 | page = find_lock_huge_page(mapping, idx); | 439 | retry: |
| 409 | if (!page) | 440 | page = find_lock_page(mapping, idx); |
| 410 | goto out; | 441 | if (!page) { |
| 442 | if (hugetlb_get_quota(mapping)) | ||
| 443 | goto out; | ||
| 444 | page = alloc_huge_page(vma, address); | ||
| 445 | if (!page) { | ||
| 446 | hugetlb_put_quota(mapping); | ||
| 447 | goto out; | ||
| 448 | } | ||
| 449 | |||
| 450 | if (vma->vm_flags & VM_SHARED) { | ||
| 451 | int err; | ||
| 452 | |||
| 453 | err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); | ||
| 454 | if (err) { | ||
| 455 | put_page(page); | ||
| 456 | hugetlb_put_quota(mapping); | ||
| 457 | if (err == -EEXIST) | ||
| 458 | goto retry; | ||
| 459 | goto out; | ||
| 460 | } | ||
| 461 | } else | ||
| 462 | lock_page(page); | ||
| 463 | } | ||
| 411 | 464 | ||
| 412 | spin_lock(&mm->page_table_lock); | 465 | spin_lock(&mm->page_table_lock); |
| 413 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 466 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; |
| @@ -415,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 415 | goto backout; | 468 | goto backout; |
| 416 | 469 | ||
| 417 | ret = VM_FAULT_MINOR; | 470 | ret = VM_FAULT_MINOR; |
| 418 | if (!pte_none(*pte)) | 471 | if (!pte_none(*ptep)) |
| 419 | goto backout; | 472 | goto backout; |
| 420 | 473 | ||
| 421 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); | 474 | add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); |
| 422 | set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); | 475 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
| 476 | && (vma->vm_flags & VM_SHARED))); | ||
| 477 | set_huge_pte_at(mm, address, ptep, new_pte); | ||
| 478 | |||
| 479 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | ||
| 480 | /* Optimization, do the COW without a second fault */ | ||
| 481 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte); | ||
| 482 | } | ||
| 483 | |||
| 423 | spin_unlock(&mm->page_table_lock); | 484 | spin_unlock(&mm->page_table_lock); |
| 424 | unlock_page(page); | 485 | unlock_page(page); |
| 425 | out: | 486 | out: |
| @@ -433,6 +494,33 @@ backout: | |||
| 433 | goto out; | 494 | goto out; |
| 434 | } | 495 | } |
| 435 | 496 | ||
| 497 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 498 | unsigned long address, int write_access) | ||
| 499 | { | ||
| 500 | pte_t *ptep; | ||
| 501 | pte_t entry; | ||
| 502 | int ret; | ||
| 503 | |||
| 504 | ptep = huge_pte_alloc(mm, address); | ||
| 505 | if (!ptep) | ||
| 506 | return VM_FAULT_OOM; | ||
| 507 | |||
| 508 | entry = *ptep; | ||
| 509 | if (pte_none(entry)) | ||
| 510 | return hugetlb_no_page(mm, vma, address, ptep, write_access); | ||
| 511 | |||
| 512 | ret = VM_FAULT_MINOR; | ||
| 513 | |||
| 514 | spin_lock(&mm->page_table_lock); | ||
| 515 | /* Check for a racing update before calling hugetlb_cow */ | ||
| 516 | if (likely(pte_same(entry, *ptep))) | ||
| 517 | if (write_access && !pte_write(entry)) | ||
| 518 | ret = hugetlb_cow(mm, vma, address, ptep, entry); | ||
| 519 | spin_unlock(&mm->page_table_lock); | ||
| 520 | |||
| 521 | return ret; | ||
| 522 | } | ||
| 523 | |||
| 436 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 524 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 437 | struct page **pages, struct vm_area_struct **vmas, | 525 | struct page **pages, struct vm_area_struct **vmas, |
| 438 | unsigned long *position, int *length, int i) | 526 | unsigned long *position, int *length, int i) |
diff --git a/mm/internal.h b/mm/internal.h index 6bf134e8fb3d..17256bb2f4ef 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -9,5 +9,22 @@ | |||
| 9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
| 10 | */ | 10 | */ |
| 11 | 11 | ||
| 12 | /* page_alloc.c */ | 12 | static inline void set_page_refs(struct page *page, int order) |
| 13 | extern void set_page_refs(struct page *page, int order); | 13 | { |
| 14 | #ifdef CONFIG_MMU | ||
| 15 | set_page_count(page, 1); | ||
| 16 | #else | ||
| 17 | int i; | ||
| 18 | |||
| 19 | /* | ||
| 20 | * We need to reference all the pages for this order, otherwise if | ||
| 21 | * anyone accesses one of the pages with (get/put) it will be freed. | ||
| 22 | * - eg: access_process_vm() | ||
| 23 | */ | ||
| 24 | for (i = 0; i < (1 << order); i++) | ||
| 25 | set_page_count(page + i, 1); | ||
| 26 | #endif /* CONFIG_MMU */ | ||
| 27 | } | ||
| 28 | |||
| 29 | extern void fastcall __init __free_pages_bootmem(struct page *page, | ||
| 30 | unsigned int order); | ||
diff --git a/mm/madvise.c b/mm/madvise.c index 2b7cf0400a21..ae0ae3ea299a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma, | |||
| 140 | return 0; | 140 | return 0; |
| 141 | } | 141 | } |
| 142 | 142 | ||
| 143 | /* | ||
| 144 | * Application wants to free up the pages and associated backing store. | ||
| 145 | * This is effectively punching a hole into the middle of a file. | ||
| 146 | * | ||
| 147 | * NOTE: Currently, only shmfs/tmpfs is supported for this operation. | ||
| 148 | * Other filesystems return -ENOSYS. | ||
| 149 | */ | ||
| 150 | static long madvise_remove(struct vm_area_struct *vma, | ||
| 151 | unsigned long start, unsigned long end) | ||
| 152 | { | ||
| 153 | struct address_space *mapping; | ||
| 154 | loff_t offset, endoff; | ||
| 155 | |||
| 156 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | ||
| 157 | return -EINVAL; | ||
| 158 | |||
| 159 | if (!vma->vm_file || !vma->vm_file->f_mapping | ||
| 160 | || !vma->vm_file->f_mapping->host) { | ||
| 161 | return -EINVAL; | ||
| 162 | } | ||
| 163 | |||
| 164 | mapping = vma->vm_file->f_mapping; | ||
| 165 | |||
| 166 | offset = (loff_t)(start - vma->vm_start) | ||
| 167 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
| 168 | endoff = (loff_t)(end - vma->vm_start - 1) | ||
| 169 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
| 170 | return vmtruncate_range(mapping->host, offset, endoff); | ||
| 171 | } | ||
| 172 | |||
| 143 | static long | 173 | static long |
| 144 | madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | 174 | madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, |
| 145 | unsigned long start, unsigned long end, int behavior) | 175 | unsigned long start, unsigned long end, int behavior) |
| @@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
| 152 | case MADV_RANDOM: | 182 | case MADV_RANDOM: |
| 153 | error = madvise_behavior(vma, prev, start, end, behavior); | 183 | error = madvise_behavior(vma, prev, start, end, behavior); |
| 154 | break; | 184 | break; |
| 185 | case MADV_REMOVE: | ||
| 186 | error = madvise_remove(vma, start, end); | ||
| 187 | break; | ||
| 155 | 188 | ||
| 156 | case MADV_WILLNEED: | 189 | case MADV_WILLNEED: |
| 157 | error = madvise_willneed(vma, prev, start, end); | 190 | error = madvise_willneed(vma, prev, start, end); |
| @@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
| 190 | * some pages ahead. | 223 | * some pages ahead. |
| 191 | * MADV_DONTNEED - the application is finished with the given range, | 224 | * MADV_DONTNEED - the application is finished with the given range, |
| 192 | * so the kernel can free resources associated with it. | 225 | * so the kernel can free resources associated with it. |
| 226 | * MADV_REMOVE - the application wants to free up the given range of | ||
| 227 | * pages and associated backing store. | ||
| 193 | * | 228 | * |
| 194 | * return values: | 229 | * return values: |
| 195 | * zero - success | 230 | * zero - success |
diff --git a/mm/memory.c b/mm/memory.c index d8dde07a3656..7a11ddd5060f 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -1498,7 +1498,7 @@ gotten: | |||
| 1498 | update_mmu_cache(vma, address, entry); | 1498 | update_mmu_cache(vma, address, entry); |
| 1499 | lazy_mmu_prot_update(entry); | 1499 | lazy_mmu_prot_update(entry); |
| 1500 | lru_cache_add_active(new_page); | 1500 | lru_cache_add_active(new_page); |
| 1501 | page_add_anon_rmap(new_page, vma, address); | 1501 | page_add_new_anon_rmap(new_page, vma, address); |
| 1502 | 1502 | ||
| 1503 | /* Free the old page.. */ | 1503 | /* Free the old page.. */ |
| 1504 | new_page = old_page; | 1504 | new_page = old_page; |
| @@ -1770,9 +1770,32 @@ out_big: | |||
| 1770 | out_busy: | 1770 | out_busy: |
| 1771 | return -ETXTBSY; | 1771 | return -ETXTBSY; |
| 1772 | } | 1772 | } |
| 1773 | |||
| 1774 | EXPORT_SYMBOL(vmtruncate); | 1773 | EXPORT_SYMBOL(vmtruncate); |
| 1775 | 1774 | ||
| 1775 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | ||
| 1776 | { | ||
| 1777 | struct address_space *mapping = inode->i_mapping; | ||
| 1778 | |||
| 1779 | /* | ||
| 1780 | * If the underlying filesystem is not going to provide | ||
| 1781 | * a way to truncate a range of blocks (punch a hole) - | ||
| 1782 | * we should return failure right now. | ||
| 1783 | */ | ||
| 1784 | if (!inode->i_op || !inode->i_op->truncate_range) | ||
| 1785 | return -ENOSYS; | ||
| 1786 | |||
| 1787 | mutex_lock(&inode->i_mutex); | ||
| 1788 | down_write(&inode->i_alloc_sem); | ||
| 1789 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
| 1790 | truncate_inode_pages_range(mapping, offset, end); | ||
| 1791 | inode->i_op->truncate_range(inode, offset, end); | ||
| 1792 | up_write(&inode->i_alloc_sem); | ||
| 1793 | mutex_unlock(&inode->i_mutex); | ||
| 1794 | |||
| 1795 | return 0; | ||
| 1796 | } | ||
| 1797 | EXPORT_SYMBOL(vmtruncate_range); | ||
| 1798 | |||
| 1776 | /* | 1799 | /* |
| 1777 | * Primitive swap readahead code. We simply read an aligned block of | 1800 | * Primitive swap readahead code. We simply read an aligned block of |
| 1778 | * (1 << page_cluster) entries in the swap area. This method is chosen | 1801 | * (1 << page_cluster) entries in the swap area. This method is chosen |
| @@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1954 | goto release; | 1977 | goto release; |
| 1955 | inc_mm_counter(mm, anon_rss); | 1978 | inc_mm_counter(mm, anon_rss); |
| 1956 | lru_cache_add_active(page); | 1979 | lru_cache_add_active(page); |
| 1957 | SetPageReferenced(page); | 1980 | page_add_new_anon_rmap(page, vma, address); |
| 1958 | page_add_anon_rmap(page, vma, address); | ||
| 1959 | } else { | 1981 | } else { |
| 1960 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ | 1982 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ |
| 1961 | page = ZERO_PAGE(address); | 1983 | page = ZERO_PAGE(address); |
| @@ -2086,7 +2108,7 @@ retry: | |||
| 2086 | if (anon) { | 2108 | if (anon) { |
| 2087 | inc_mm_counter(mm, anon_rss); | 2109 | inc_mm_counter(mm, anon_rss); |
| 2088 | lru_cache_add_active(new_page); | 2110 | lru_cache_add_active(new_page); |
| 2089 | page_add_anon_rmap(new_page, vma, address); | 2111 | page_add_new_anon_rmap(new_page, vma, address); |
| 2090 | } else { | 2112 | } else { |
| 2091 | inc_mm_counter(mm, file_rss); | 2113 | inc_mm_counter(mm, file_rss); |
| 2092 | page_add_file_rmap(new_page); | 2114 | page_add_file_rmap(new_page); |
| @@ -2245,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2245 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); | 2267 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); |
| 2246 | } | 2268 | } |
| 2247 | 2269 | ||
| 2270 | EXPORT_SYMBOL_GPL(__handle_mm_fault); | ||
| 2271 | |||
| 2248 | #ifndef __PAGETABLE_PUD_FOLDED | 2272 | #ifndef __PAGETABLE_PUD_FOLDED |
| 2249 | /* | 2273 | /* |
| 2250 | * Allocate page upper directory. | 2274 | * Allocate page upper directory. |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f6d4af8af8a8..a918f77f02f3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
| 42 | int nr_pages); | 42 | int nr_pages); |
| 43 | static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | 43 | static int __add_section(struct zone *zone, unsigned long phys_start_pfn) |
| 44 | { | 44 | { |
| 45 | struct pglist_data *pgdat = zone->zone_pgdat; | ||
| 46 | int nr_pages = PAGES_PER_SECTION; | 45 | int nr_pages = PAGES_PER_SECTION; |
| 47 | int ret; | 46 | int ret; |
| 48 | 47 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bec88c81244e..b62cab575a84 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -83,9 +83,18 @@ | |||
| 83 | #include <linux/init.h> | 83 | #include <linux/init.h> |
| 84 | #include <linux/compat.h> | 84 | #include <linux/compat.h> |
| 85 | #include <linux/mempolicy.h> | 85 | #include <linux/mempolicy.h> |
| 86 | #include <linux/swap.h> | ||
| 87 | #include <linux/seq_file.h> | ||
| 88 | #include <linux/proc_fs.h> | ||
| 89 | |||
| 86 | #include <asm/tlbflush.h> | 90 | #include <asm/tlbflush.h> |
| 87 | #include <asm/uaccess.h> | 91 | #include <asm/uaccess.h> |
| 88 | 92 | ||
| 93 | /* Internal flags */ | ||
| 94 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ | ||
| 95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | ||
| 96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | ||
| 97 | |||
| 89 | static kmem_cache_t *policy_cache; | 98 | static kmem_cache_t *policy_cache; |
| 90 | static kmem_cache_t *sn_cache; | 99 | static kmem_cache_t *sn_cache; |
| 91 | 100 | ||
| @@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache; | |||
| 93 | 102 | ||
| 94 | /* Highest zone. An specific allocation for a zone below that is not | 103 | /* Highest zone. An specific allocation for a zone below that is not |
| 95 | policied. */ | 104 | policied. */ |
| 96 | static int policy_zone; | 105 | int policy_zone = ZONE_DMA; |
| 97 | 106 | ||
| 98 | struct mempolicy default_policy = { | 107 | struct mempolicy default_policy = { |
| 99 | .refcnt = ATOMIC_INIT(1), /* never free it */ | 108 | .refcnt = ATOMIC_INIT(1), /* never free it */ |
| @@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) | |||
| 131 | if (!zl) | 140 | if (!zl) |
| 132 | return NULL; | 141 | return NULL; |
| 133 | num = 0; | 142 | num = 0; |
| 134 | for_each_node_mask(nd, *nodes) { | 143 | for_each_node_mask(nd, *nodes) |
| 135 | int k; | 144 | zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; |
| 136 | for (k = MAX_NR_ZONES-1; k >= 0; k--) { | ||
| 137 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | ||
| 138 | if (!z->present_pages) | ||
| 139 | continue; | ||
| 140 | zl->zones[num++] = z; | ||
| 141 | if (k > policy_zone) | ||
| 142 | policy_zone = k; | ||
| 143 | } | ||
| 144 | } | ||
| 145 | zl->zones[num] = NULL; | 145 | zl->zones[num] = NULL; |
| 146 | return zl; | 146 | return zl; |
| 147 | } | 147 | } |
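bind_zonelist() no longer walks every zone of every node and no longer recomputes policy_zone as a side effect; it simply takes the single "policied" zone per node, with policy_zone presumably kept up to date elsewhere. A toy userspace model of the new loop (node count, zone names and the policy_zone value are made up for illustration):

/*
 * Toy model of the simplified bind_zonelist(): one zone per node,
 * always the highest policied zone.
 */
#include <stdio.h>

#define MAX_NODES 4
#define MAX_ZONES 3		/* DMA, NORMAL, HIGHMEM in this toy */

static const char *zone_name[MAX_ZONES] = { "DMA", "NORMAL", "HIGHMEM" };
static int policy_zone = 1;	/* e.g. ZONE_NORMAL on a 64-bit box */

int main(void)
{
	unsigned long nodemask = 0x5;	/* nodes 0 and 2 */
	int zl_node[MAX_NODES + 1];	/* the "zonelist" */
	int num = 0, nd;

	for (nd = 0; nd < MAX_NODES; nd++)
		if (nodemask & (1UL << nd))
			zl_node[num++] = nd;	/* &NODE_DATA(nd)->node_zones[policy_zone] */
	zl_node[num] = -1;			/* NULL terminator in the real code */

	for (nd = 0; zl_node[nd] >= 0; nd++)
		printf("zone %s of node %d\n", zone_name[policy_zone], zl_node[nd]);
	return 0;
}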
| @@ -161,6 +161,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
| 161 | switch (mode) { | 161 | switch (mode) { |
| 162 | case MPOL_INTERLEAVE: | 162 | case MPOL_INTERLEAVE: |
| 163 | policy->v.nodes = *nodes; | 163 | policy->v.nodes = *nodes; |
| 164 | if (nodes_weight(*nodes) == 0) { | ||
| 165 | kmem_cache_free(policy_cache, policy); | ||
| 166 | return ERR_PTR(-EINVAL); | ||
| 167 | } | ||
| 164 | break; | 168 | break; |
| 165 | case MPOL_PREFERRED: | 169 | case MPOL_PREFERRED: |
| 166 | policy->v.preferred_node = first_node(*nodes); | 170 | policy->v.preferred_node = first_node(*nodes); |
| @@ -176,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
| 176 | break; | 180 | break; |
| 177 | } | 181 | } |
| 178 | policy->policy = mode; | 182 | policy->policy = mode; |
| 183 | policy->cpuset_mems_allowed = cpuset_mems_allowed(current); | ||
| 179 | return policy; | 184 | return policy; |
| 180 | } | 185 | } |
| 181 | 186 | ||
| 182 | /* Ensure all existing pages follow the policy. */ | 187 | static void gather_stats(struct page *, void *); |
| 188 | static void migrate_page_add(struct vm_area_struct *vma, | ||
| 189 | struct page *page, struct list_head *pagelist, unsigned long flags); | ||
| 190 | |||
| 191 | /* Scan through pages checking if pages follow certain conditions. */ | ||
| 183 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 192 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
| 184 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 193 | unsigned long addr, unsigned long end, |
| 194 | const nodemask_t *nodes, unsigned long flags, | ||
| 195 | void *private) | ||
| 185 | { | 196 | { |
| 186 | pte_t *orig_pte; | 197 | pte_t *orig_pte; |
| 187 | pte_t *pte; | 198 | pte_t *pte; |
| @@ -197,8 +208,20 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 197 | page = vm_normal_page(vma, addr, *pte); | 208 | page = vm_normal_page(vma, addr, *pte); |
| 198 | if (!page) | 209 | if (!page) |
| 199 | continue; | 210 | continue; |
| 211 | if (PageReserved(page)) | ||
| 212 | continue; | ||
| 200 | nid = page_to_nid(page); | 213 | nid = page_to_nid(page); |
| 201 | if (!node_isset(nid, *nodes)) | 214 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
| 215 | continue; | ||
| 216 | |||
| 217 | if (flags & MPOL_MF_STATS) | ||
| 218 | gather_stats(page, private); | ||
| 219 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | ||
| 220 | spin_unlock(ptl); | ||
| 221 | migrate_page_add(vma, page, private, flags); | ||
| 222 | spin_lock(ptl); | ||
| 223 | } | ||
| 224 | else | ||
| 202 | break; | 225 | break; |
| 203 | } while (pte++, addr += PAGE_SIZE, addr != end); | 226 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 204 | pte_unmap_unlock(orig_pte, ptl); | 227 | pte_unmap_unlock(orig_pte, ptl); |
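The rewritten walk folds the old "is this page on an allowed node?" test and its new inverted variant into one comparison, node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT). Without MPOL_MF_INVERT the caller acts on pages that are in the node set (as do_migrate_pages() wants); with it, on pages that are not (as do_mbind() wants, to find misplaced pages). A tiny truth-table check of that condition:

/*
 * Sketch: truth table for the check_pte_range() skip condition.
 * "act" means the page falls through to gather_stats()/migrate_page_add().
 */
#include <stdio.h>

int main(void)
{
	int invert, in_mask;

	for (invert = 0; invert <= 1; invert++)
		for (in_mask = 0; in_mask <= 1; in_mask++) {
			/* node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT) */
			int skip = (in_mask == invert);

			printf("MPOL_MF_INVERT=%d node-in-mask=%d -> %s\n",
			       invert, in_mask, skip ? "skip" : "act");
		}
	return 0;
}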
| @@ -206,7 +229,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 206 | } | 229 | } |
| 207 | 230 | ||
| 208 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 231 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
| 209 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 232 | unsigned long addr, unsigned long end, |
| 233 | const nodemask_t *nodes, unsigned long flags, | ||
| 234 | void *private) | ||
| 210 | { | 235 | { |
| 211 | pmd_t *pmd; | 236 | pmd_t *pmd; |
| 212 | unsigned long next; | 237 | unsigned long next; |
| @@ -216,14 +241,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
| 216 | next = pmd_addr_end(addr, end); | 241 | next = pmd_addr_end(addr, end); |
| 217 | if (pmd_none_or_clear_bad(pmd)) | 242 | if (pmd_none_or_clear_bad(pmd)) |
| 218 | continue; | 243 | continue; |
| 219 | if (check_pte_range(vma, pmd, addr, next, nodes)) | 244 | if (check_pte_range(vma, pmd, addr, next, nodes, |
| 245 | flags, private)) | ||
| 220 | return -EIO; | 246 | return -EIO; |
| 221 | } while (pmd++, addr = next, addr != end); | 247 | } while (pmd++, addr = next, addr != end); |
| 222 | return 0; | 248 | return 0; |
| 223 | } | 249 | } |
| 224 | 250 | ||
| 225 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 251 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
| 226 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 252 | unsigned long addr, unsigned long end, |
| 253 | const nodemask_t *nodes, unsigned long flags, | ||
| 254 | void *private) | ||
| 227 | { | 255 | { |
| 228 | pud_t *pud; | 256 | pud_t *pud; |
| 229 | unsigned long next; | 257 | unsigned long next; |
| @@ -233,14 +261,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
| 233 | next = pud_addr_end(addr, end); | 261 | next = pud_addr_end(addr, end); |
| 234 | if (pud_none_or_clear_bad(pud)) | 262 | if (pud_none_or_clear_bad(pud)) |
| 235 | continue; | 263 | continue; |
| 236 | if (check_pmd_range(vma, pud, addr, next, nodes)) | 264 | if (check_pmd_range(vma, pud, addr, next, nodes, |
| 265 | flags, private)) | ||
| 237 | return -EIO; | 266 | return -EIO; |
| 238 | } while (pud++, addr = next, addr != end); | 267 | } while (pud++, addr = next, addr != end); |
| 239 | return 0; | 268 | return 0; |
| 240 | } | 269 | } |
| 241 | 270 | ||
| 242 | static inline int check_pgd_range(struct vm_area_struct *vma, | 271 | static inline int check_pgd_range(struct vm_area_struct *vma, |
| 243 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 272 | unsigned long addr, unsigned long end, |
| 273 | const nodemask_t *nodes, unsigned long flags, | ||
| 274 | void *private) | ||
| 244 | { | 275 | { |
| 245 | pgd_t *pgd; | 276 | pgd_t *pgd; |
| 246 | unsigned long next; | 277 | unsigned long next; |
| @@ -250,16 +281,30 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
| 250 | next = pgd_addr_end(addr, end); | 281 | next = pgd_addr_end(addr, end); |
| 251 | if (pgd_none_or_clear_bad(pgd)) | 282 | if (pgd_none_or_clear_bad(pgd)) |
| 252 | continue; | 283 | continue; |
| 253 | if (check_pud_range(vma, pgd, addr, next, nodes)) | 284 | if (check_pud_range(vma, pgd, addr, next, nodes, |
| 285 | flags, private)) | ||
| 254 | return -EIO; | 286 | return -EIO; |
| 255 | } while (pgd++, addr = next, addr != end); | 287 | } while (pgd++, addr = next, addr != end); |
| 256 | return 0; | 288 | return 0; |
| 257 | } | 289 | } |
| 258 | 290 | ||
| 259 | /* Step 1: check the range */ | 291 | /* Check if a vma is migratable */ |
| 292 | static inline int vma_migratable(struct vm_area_struct *vma) | ||
| 293 | { | ||
| 294 | if (vma->vm_flags & ( | ||
| 295 | VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) | ||
| 296 | return 0; | ||
| 297 | return 1; | ||
| 298 | } | ||
| 299 | |||
| 300 | /* | ||
| 301 | * Check if all pages in a range are on a set of nodes. | ||
| 302 | * If pagelist != NULL then isolate pages from the LRU and | ||
| 303 | * put them on the pagelist. | ||
| 304 | */ | ||
| 260 | static struct vm_area_struct * | 305 | static struct vm_area_struct * |
| 261 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 306 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
| 262 | nodemask_t *nodes, unsigned long flags) | 307 | const nodemask_t *nodes, unsigned long flags, void *private) |
| 263 | { | 308 | { |
| 264 | int err; | 309 | int err; |
| 265 | struct vm_area_struct *first, *vma, *prev; | 310 | struct vm_area_struct *first, *vma, *prev; |
| @@ -269,17 +314,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
| 269 | return ERR_PTR(-EFAULT); | 314 | return ERR_PTR(-EFAULT); |
| 270 | prev = NULL; | 315 | prev = NULL; |
| 271 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 316 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
| 272 | if (!vma->vm_next && vma->vm_end < end) | 317 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
| 273 | return ERR_PTR(-EFAULT); | 318 | if (!vma->vm_next && vma->vm_end < end) |
| 274 | if (prev && prev->vm_end < vma->vm_start) | 319 | return ERR_PTR(-EFAULT); |
| 275 | return ERR_PTR(-EFAULT); | 320 | if (prev && prev->vm_end < vma->vm_start) |
| 276 | if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { | 321 | return ERR_PTR(-EFAULT); |
| 322 | } | ||
| 323 | if (!is_vm_hugetlb_page(vma) && | ||
| 324 | ((flags & MPOL_MF_STRICT) || | ||
| 325 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
| 326 | vma_migratable(vma)))) { | ||
| 277 | unsigned long endvma = vma->vm_end; | 327 | unsigned long endvma = vma->vm_end; |
| 328 | |||
| 278 | if (endvma > end) | 329 | if (endvma > end) |
| 279 | endvma = end; | 330 | endvma = end; |
| 280 | if (vma->vm_start > start) | 331 | if (vma->vm_start > start) |
| 281 | start = vma->vm_start; | 332 | start = vma->vm_start; |
| 282 | err = check_pgd_range(vma, start, endvma, nodes); | 333 | err = check_pgd_range(vma, start, endvma, nodes, |
| 334 | flags, private); | ||
| 283 | if (err) { | 335 | if (err) { |
| 284 | first = ERR_PTR(err); | 336 | first = ERR_PTR(err); |
| 285 | break; | 337 | break; |
| @@ -338,51 +390,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes) | |||
| 338 | if (!nodes) | 390 | if (!nodes) |
| 339 | return 0; | 391 | return 0; |
| 340 | 392 | ||
| 341 | /* Update current mems_allowed */ | 393 | cpuset_update_task_memory_state(); |
| 342 | cpuset_update_current_mems_allowed(); | 394 | if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) |
| 343 | /* Ignore nodes not set in current->mems_allowed */ | ||
| 344 | cpuset_restrict_to_mems_allowed(nodes->bits); | ||
| 345 | return mpol_check_policy(mode, nodes); | ||
| 346 | } | ||
| 347 | |||
| 348 | long do_mbind(unsigned long start, unsigned long len, | ||
| 349 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
| 350 | { | ||
| 351 | struct vm_area_struct *vma; | ||
| 352 | struct mm_struct *mm = current->mm; | ||
| 353 | struct mempolicy *new; | ||
| 354 | unsigned long end; | ||
| 355 | int err; | ||
| 356 | |||
| 357 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) | ||
| 358 | return -EINVAL; | ||
| 359 | if (start & ~PAGE_MASK) | ||
| 360 | return -EINVAL; | ||
| 361 | if (mode == MPOL_DEFAULT) | ||
| 362 | flags &= ~MPOL_MF_STRICT; | ||
| 363 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
| 364 | end = start + len; | ||
| 365 | if (end < start) | ||
| 366 | return -EINVAL; | ||
| 367 | if (end == start) | ||
| 368 | return 0; | ||
| 369 | if (mpol_check_policy(mode, nmask)) | ||
| 370 | return -EINVAL; | 395 | return -EINVAL; |
| 371 | new = mpol_new(mode, nmask); | 396 | return mpol_check_policy(mode, nodes); |
| 372 | if (IS_ERR(new)) | ||
| 373 | return PTR_ERR(new); | ||
| 374 | |||
| 375 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
| 376 | mode,nodes_addr(nodes)[0]); | ||
| 377 | |||
| 378 | down_write(&mm->mmap_sem); | ||
| 379 | vma = check_range(mm, start, end, nmask, flags); | ||
| 380 | err = PTR_ERR(vma); | ||
| 381 | if (!IS_ERR(vma)) | ||
| 382 | err = mbind_range(vma, start, end, new); | ||
| 383 | up_write(&mm->mmap_sem); | ||
| 384 | mpol_free(new); | ||
| 385 | return err; | ||
| 386 | } | 397 | } |
| 387 | 398 | ||
| 388 | /* Set the process memory policy */ | 399 | /* Set the process memory policy */ |
| @@ -453,7 +464,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 453 | struct vm_area_struct *vma = NULL; | 464 | struct vm_area_struct *vma = NULL; |
| 454 | struct mempolicy *pol = current->mempolicy; | 465 | struct mempolicy *pol = current->mempolicy; |
| 455 | 466 | ||
| 456 | cpuset_update_current_mems_allowed(); | 467 | cpuset_update_task_memory_state(); |
| 457 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 468 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) |
| 458 | return -EINVAL; | 469 | return -EINVAL; |
| 459 | if (flags & MPOL_F_ADDR) { | 470 | if (flags & MPOL_F_ADDR) { |
| @@ -505,11 +516,177 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 505 | } | 516 | } |
| 506 | 517 | ||
| 507 | /* | 518 | /* |
| 519 | * page migration | ||
| 520 | */ | ||
| 521 | |||
| 522 | /* Check if we are the only process mapping the page in question */ | ||
| 523 | static inline int single_mm_mapping(struct mm_struct *mm, | ||
| 524 | struct address_space *mapping) | ||
| 525 | { | ||
| 526 | struct vm_area_struct *vma; | ||
| 527 | struct prio_tree_iter iter; | ||
| 528 | int rc = 1; | ||
| 529 | |||
| 530 | spin_lock(&mapping->i_mmap_lock); | ||
| 531 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
| 532 | if (mm != vma->vm_mm) { | ||
| 533 | rc = 0; | ||
| 534 | goto out; | ||
| 535 | } | ||
| 536 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
| 537 | if (mm != vma->vm_mm) { | ||
| 538 | rc = 0; | ||
| 539 | goto out; | ||
| 540 | } | ||
| 541 | out: | ||
| 542 | spin_unlock(&mapping->i_mmap_lock); | ||
| 543 | return rc; | ||
| 544 | } | ||
| 545 | |||
| 546 | /* | ||
| 547 | * Add a page to be migrated to the pagelist | ||
| 548 | */ | ||
| 549 | static void migrate_page_add(struct vm_area_struct *vma, | ||
| 550 | struct page *page, struct list_head *pagelist, unsigned long flags) | ||
| 551 | { | ||
| 552 | /* | ||
| 553 | * Avoid migrating a page that is shared by others and not writable. | ||
| 554 | */ | ||
| 555 | if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || | ||
| 556 | mapping_writably_mapped(page->mapping) || | ||
| 557 | single_mm_mapping(vma->vm_mm, page->mapping)) { | ||
| 558 | int rc = isolate_lru_page(page); | ||
| 559 | |||
| 560 | if (rc == 1) | ||
| 561 | list_add(&page->lru, pagelist); | ||
| 562 | /* | ||
| 563 | * If the isolate attempt was not successful then we just | ||
| 564 | * encountered an unswappable page. Something must be wrong. | ||
| 565 | */ | ||
| 566 | WARN_ON(rc == 0); | ||
| 567 | } | ||
| 568 | } | ||
| 569 | |||
| 570 | static int swap_pages(struct list_head *pagelist) | ||
| 571 | { | ||
| 572 | LIST_HEAD(moved); | ||
| 573 | LIST_HEAD(failed); | ||
| 574 | int n; | ||
| 575 | |||
| 576 | n = migrate_pages(pagelist, NULL, &moved, &failed); | ||
| 577 | putback_lru_pages(&failed); | ||
| 578 | putback_lru_pages(&moved); | ||
| 579 | |||
| 580 | return n; | ||
| 581 | } | ||
| 582 | |||
| 583 | /* | ||
| 584 | * For now migrate_pages simply swaps out the pages from nodes that are in | ||
| 585 | * the source set but not in the target set. In the future, we would | ||
| 586 | * want a function that moves pages between the two nodesets in such | ||
| 587 | * a way as to preserve the physical layout as much as possible. | ||
| 588 | * | ||
| 589 | * Returns the number of pages that could not be moved. | ||
| 590 | */ | ||
| 591 | int do_migrate_pages(struct mm_struct *mm, | ||
| 592 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | ||
| 593 | { | ||
| 594 | LIST_HEAD(pagelist); | ||
| 595 | int count = 0; | ||
| 596 | nodemask_t nodes; | ||
| 597 | |||
| 598 | nodes_andnot(nodes, *from_nodes, *to_nodes); | ||
| 599 | |||
| 600 | down_read(&mm->mmap_sem); | ||
| 601 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, | ||
| 602 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | ||
| 603 | |||
| 604 | if (!list_empty(&pagelist)) { | ||
| 605 | count = swap_pages(&pagelist); | ||
| 606 | putback_lru_pages(&pagelist); | ||
| 607 | } | ||
| 608 | |||
| 609 | up_read(&mm->mmap_sem); | ||
| 610 | return count; | ||
| 611 | } | ||
| 612 | |||
| 613 | long do_mbind(unsigned long start, unsigned long len, | ||
| 614 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
| 615 | { | ||
| 616 | struct vm_area_struct *vma; | ||
| 617 | struct mm_struct *mm = current->mm; | ||
| 618 | struct mempolicy *new; | ||
| 619 | unsigned long end; | ||
| 620 | int err; | ||
| 621 | LIST_HEAD(pagelist); | ||
| 622 | |||
| 623 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT | | ||
| 624 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
| 625 | || mode > MPOL_MAX) | ||
| 626 | return -EINVAL; | ||
| 627 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) | ||
| 628 | return -EPERM; | ||
| 629 | |||
| 630 | if (start & ~PAGE_MASK) | ||
| 631 | return -EINVAL; | ||
| 632 | |||
| 633 | if (mode == MPOL_DEFAULT) | ||
| 634 | flags &= ~MPOL_MF_STRICT; | ||
| 635 | |||
| 636 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
| 637 | end = start + len; | ||
| 638 | |||
| 639 | if (end < start) | ||
| 640 | return -EINVAL; | ||
| 641 | if (end == start) | ||
| 642 | return 0; | ||
| 643 | |||
| 644 | if (mpol_check_policy(mode, nmask)) | ||
| 645 | return -EINVAL; | ||
| 646 | |||
| 647 | new = mpol_new(mode, nmask); | ||
| 648 | if (IS_ERR(new)) | ||
| 649 | return PTR_ERR(new); | ||
| 650 | |||
| 651 | /* | ||
| 652 | * If we are using the default policy then operation | ||
| 653 | * on discontinuous address spaces is okay after all | ||
| 654 | */ | ||
| 655 | if (!new) | ||
| 656 | flags |= MPOL_MF_DISCONTIG_OK; | ||
| 657 | |||
| 658 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
| 659 | mode,nodes_addr(nodes)[0]); | ||
| 660 | |||
| 661 | down_write(&mm->mmap_sem); | ||
| 662 | vma = check_range(mm, start, end, nmask, | ||
| 663 | flags | MPOL_MF_INVERT, &pagelist); | ||
| 664 | |||
| 665 | err = PTR_ERR(vma); | ||
| 666 | if (!IS_ERR(vma)) { | ||
| 667 | int nr_failed = 0; | ||
| 668 | |||
| 669 | err = mbind_range(vma, start, end, new); | ||
| 670 | if (!list_empty(&pagelist)) | ||
| 671 | nr_failed = swap_pages(&pagelist); | ||
| 672 | |||
| 673 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | ||
| 674 | err = -EIO; | ||
| 675 | } | ||
| 676 | if (!list_empty(&pagelist)) | ||
| 677 | putback_lru_pages(&pagelist); | ||
| 678 | |||
| 679 | up_write(&mm->mmap_sem); | ||
| 680 | mpol_free(new); | ||
| 681 | return err; | ||
| 682 | } | ||
| 683 | |||
| 684 | /* | ||
| 508 | * User space interface with variable sized bitmaps for nodelists. | 685 | * User space interface with variable sized bitmaps for nodelists. |
| 509 | */ | 686 | */ |
| 510 | 687 | ||
| 511 | /* Copy a node mask from user space. */ | 688 | /* Copy a node mask from user space. */ |
| 512 | static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | 689 | static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, |
| 513 | unsigned long maxnode) | 690 | unsigned long maxnode) |
| 514 | { | 691 | { |
| 515 | unsigned long k; | 692 | unsigned long k; |
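Taken together, the new flags and the migration hooks mean userspace can now ask mbind() not only to set a VMA policy but also to push already-faulted pages off nodes the policy forbids. A hedged userspace sketch follows; it assumes the libc headers provide __NR_mbind, and the MPOL_* constants are copied in rather than taken from a header, with the usual 2.6-era values.

/*
 * Hedged userspace sketch: bind a private mapping to node 0 and ask the
 * kernel to move any pages already allocated elsewhere.  The constant
 * values and the availability of __NR_mbind are assumptions.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_BIND       2
#define MPOL_MF_STRICT  (1 << 0)
#define MPOL_MF_MOVE    (1 << 1)

int main(void)
{
	size_t len = 4 * 1024 * 1024;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long nodemask = 1;	/* node 0 only */

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 0, len);		/* fault the pages in wherever they land first */

	if (syscall(__NR_mbind, buf, len, MPOL_BIND, &nodemask,
		    8 * sizeof(nodemask), MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
		perror("mbind");
	else
		printf("pages now bound to node 0 (and migrated if needed)\n");
	return 0;
}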
| @@ -598,6 +775,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | |||
| 598 | return do_set_mempolicy(mode, &nodes); | 775 | return do_set_mempolicy(mode, &nodes); |
| 599 | } | 776 | } |
| 600 | 777 | ||
| 778 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | ||
| 779 | const unsigned long __user *old_nodes, | ||
| 780 | const unsigned long __user *new_nodes) | ||
| 781 | { | ||
| 782 | struct mm_struct *mm; | ||
| 783 | struct task_struct *task; | ||
| 784 | nodemask_t old; | ||
| 785 | nodemask_t new; | ||
| 786 | nodemask_t task_nodes; | ||
| 787 | int err; | ||
| 788 | |||
| 789 | err = get_nodes(&old, old_nodes, maxnode); | ||
| 790 | if (err) | ||
| 791 | return err; | ||
| 792 | |||
| 793 | err = get_nodes(&new, new_nodes, maxnode); | ||
| 794 | if (err) | ||
| 795 | return err; | ||
| 796 | |||
| 797 | /* Find the mm_struct */ | ||
| 798 | read_lock(&tasklist_lock); | ||
| 799 | task = pid ? find_task_by_pid(pid) : current; | ||
| 800 | if (!task) { | ||
| 801 | read_unlock(&tasklist_lock); | ||
| 802 | return -ESRCH; | ||
| 803 | } | ||
| 804 | mm = get_task_mm(task); | ||
| 805 | read_unlock(&tasklist_lock); | ||
| 806 | |||
| 807 | if (!mm) | ||
| 808 | return -EINVAL; | ||
| 809 | |||
| 810 | /* | ||
| 811 | * Check if this process has the right to modify the specified | ||
| 812 | * process. The right exists if the process has administrative | ||
| 813 | * capabilities, superuser privileges or the same | ||
| 814 | * userid as the target process. | ||
| 815 | */ | ||
| 816 | if ((current->euid != task->suid) && (current->euid != task->uid) && | ||
| 817 | (current->uid != task->suid) && (current->uid != task->uid) && | ||
| 818 | !capable(CAP_SYS_ADMIN)) { | ||
| 819 | err = -EPERM; | ||
| 820 | goto out; | ||
| 821 | } | ||
| 822 | |||
| 823 | task_nodes = cpuset_mems_allowed(task); | ||
| 824 | /* Is the user allowed to access the target nodes? */ | ||
| 825 | if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { | ||
| 826 | err = -EPERM; | ||
| 827 | goto out; | ||
| 828 | } | ||
| 829 | |||
| 830 | err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); | ||
| 831 | out: | ||
| 832 | mmput(mm); | ||
| 833 | return err; | ||
| 834 | } | ||
| 835 | |||
| 836 | |||
| 601 | /* Retrieve NUMA policy */ | 837 | /* Retrieve NUMA policy */ |
| 602 | asmlinkage long sys_get_mempolicy(int __user *policy, | 838 | asmlinkage long sys_get_mempolicy(int __user *policy, |
| 603 | unsigned long __user *nmask, | 839 | unsigned long __user *nmask, |
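The new sys_migrate_pages() gives userspace a whole-process counterpart to mbind(MPOL_MF_MOVE): move every page of a target task from one node set to another, subject to the uid/capability checks above. A hedged sketch of a caller, assuming the (patched) headers define __NR_migrate_pages; pid 0 means the calling task, and a non-negative return is the number of pages that could not be moved.

/*
 * Hedged sketch of the new migrate_pages(2) call: move the calling
 * process's pages from node 0 to node 1.  __NR_migrate_pages is assumed
 * to be provided by the kernel/libc headers built against this patch.
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	unsigned long from = 1UL << 0;	/* old_nodes = {0} */
	unsigned long to   = 1UL << 1;	/* new_nodes = {1} */
	long left;

	left = syscall(__NR_migrate_pages, 0 /* pid 0 = current */,
		       8 * sizeof(unsigned long), &from, &to);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld pages could not be moved\n", left);
	return 0;
}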
| @@ -704,8 +940,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
| 704 | #endif | 940 | #endif |
| 705 | 941 | ||
| 706 | /* Return effective policy for a VMA */ | 942 | /* Return effective policy for a VMA */ |
| 707 | struct mempolicy * | 943 | static struct mempolicy * get_vma_policy(struct task_struct *task, |
| 708 | get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) | 944 | struct vm_area_struct *vma, unsigned long addr) |
| 709 | { | 945 | { |
| 710 | struct mempolicy *pol = task->mempolicy; | 946 | struct mempolicy *pol = task->mempolicy; |
| 711 | 947 | ||
| @@ -781,6 +1017,34 @@ static unsigned offset_il_node(struct mempolicy *pol, | |||
| 781 | return nid; | 1017 | return nid; |
| 782 | } | 1018 | } |
| 783 | 1019 | ||
| 1020 | /* Determine a node number for interleave */ | ||
| 1021 | static inline unsigned interleave_nid(struct mempolicy *pol, | ||
| 1022 | struct vm_area_struct *vma, unsigned long addr, int shift) | ||
| 1023 | { | ||
| 1024 | if (vma) { | ||
| 1025 | unsigned long off; | ||
| 1026 | |||
| 1027 | off = vma->vm_pgoff; | ||
| 1028 | off += (addr - vma->vm_start) >> shift; | ||
| 1029 | return offset_il_node(pol, vma, off); | ||
| 1030 | } else | ||
| 1031 | return interleave_nodes(pol); | ||
| 1032 | } | ||
| 1033 | |||
| 1034 | /* Return a zonelist suitable for a huge page allocation. */ | ||
| 1035 | struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) | ||
| 1036 | { | ||
| 1037 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
| 1038 | |||
| 1039 | if (pol->policy == MPOL_INTERLEAVE) { | ||
| 1040 | unsigned nid; | ||
| 1041 | |||
| 1042 | nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); | ||
| 1043 | return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); | ||
| 1044 | } | ||
| 1045 | return zonelist_policy(GFP_HIGHUSER, pol); | ||
| 1046 | } | ||
| 1047 | |||
| 784 | /* Allocate a page in interleaved policy. | 1048 | /* Allocate a page in interleaved policy. |
| 785 | Own path because it needs to do special accounting. */ | 1049 | Own path because it needs to do special accounting. */ |
| 786 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | 1050 | static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, |
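interleave_nid() folds vm_pgoff and the page index of the faulting address into offset_il_node(), so consecutive pages of a mapping are spread round-robin over the interleave set; huge_zonelist() passes HPAGE_SHIFT so the spreading happens per huge page rather than per base page. A userspace model of that arithmetic (the node set, mapping start and the 2 MB huge-page shift are illustrative values only):

/*
 * Userspace model of interleave_nid(): which node gets each page of a
 * hypothetical interleaved mapping.
 */
#include <stdio.h>

static int il_nodes[] = { 0, 2, 3 };	/* pol->v.nodes as a list */
#define NNODES (sizeof(il_nodes) / sizeof(il_nodes[0]))

/* offset_il_node(): pick the (off % nnodes)-th node of the set */
static int offset_il_node(unsigned long off)
{
	return il_nodes[off % NNODES];
}

/* interleave_nid(): fold vm_pgoff and the page index into the offset */
static int interleave_nid(unsigned long vm_pgoff, unsigned long vm_start,
			  unsigned long addr, int shift)
{
	return offset_il_node(vm_pgoff + ((addr - vm_start) >> shift));
}

int main(void)
{
	unsigned long vm_start = 0x40000000UL, vm_pgoff = 0;
	int page_shift = 12, hpage_shift = 21;
	unsigned long a;

	for (a = vm_start; a < vm_start + 4 * (1UL << hpage_shift);
	     a += 1UL << hpage_shift)
		printf("huge page at %#lx -> node %d (base pages would use shift %d)\n",
		       a, interleave_nid(vm_pgoff, vm_start, a, hpage_shift),
		       page_shift);
	return 0;
}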
| @@ -825,19 +1089,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 825 | { | 1089 | { |
| 826 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1090 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
| 827 | 1091 | ||
| 828 | cpuset_update_current_mems_allowed(); | 1092 | cpuset_update_task_memory_state(); |
| 829 | 1093 | ||
| 830 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { | 1094 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { |
| 831 | unsigned nid; | 1095 | unsigned nid; |
| 832 | if (vma) { | 1096 | |
| 833 | unsigned long off; | 1097 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); |
| 834 | off = vma->vm_pgoff; | ||
| 835 | off += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
| 836 | nid = offset_il_node(pol, vma, off); | ||
| 837 | } else { | ||
| 838 | /* fall back to process interleaving */ | ||
| 839 | nid = interleave_nodes(pol); | ||
| 840 | } | ||
| 841 | return alloc_page_interleave(gfp, 0, nid); | 1098 | return alloc_page_interleave(gfp, 0, nid); |
| 842 | } | 1099 | } |
| 843 | return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); | 1100 | return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); |
| @@ -858,7 +1115,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 858 | * interrupt context and apply the current process NUMA policy. | 1115 | * interrupt context and apply the current process NUMA policy. |
| 859 | * Returns NULL when no page can be allocated. | 1116 | * Returns NULL when no page can be allocated. |
| 860 | * | 1117 | * |
| 861 | * Don't call cpuset_update_current_mems_allowed() unless | 1118 | * Don't call cpuset_update_task_memory_state() unless |
| 862 | * 1) it's ok to take cpuset_sem (can WAIT), and | 1119 | * 1) it's ok to take cpuset_sem (can WAIT), and |
| 863 | * 2) allocating for current task (not interrupt). | 1120 | * 2) allocating for current task (not interrupt). |
| 864 | */ | 1121 | */ |
| @@ -867,7 +1124,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
| 867 | struct mempolicy *pol = current->mempolicy; | 1124 | struct mempolicy *pol = current->mempolicy; |
| 868 | 1125 | ||
| 869 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | 1126 | if ((gfp & __GFP_WAIT) && !in_interrupt()) |
| 870 | cpuset_update_current_mems_allowed(); | 1127 | cpuset_update_task_memory_state(); |
| 871 | if (!pol || in_interrupt()) | 1128 | if (!pol || in_interrupt()) |
| 872 | pol = &default_policy; | 1129 | pol = &default_policy; |
| 873 | if (pol->policy == MPOL_INTERLEAVE) | 1130 | if (pol->policy == MPOL_INTERLEAVE) |
| @@ -876,6 +1133,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
| 876 | } | 1133 | } |
| 877 | EXPORT_SYMBOL(alloc_pages_current); | 1134 | EXPORT_SYMBOL(alloc_pages_current); |
| 878 | 1135 | ||
| 1136 | /* | ||
| 1137 | * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it | ||
| 1138 | * rebinds the mempolicy it is copying by calling mpol_rebind_policy() | ||
| 1139 | * with the mems_allowed returned by cpuset_mems_allowed(). This | ||
| 1140 | * keeps mempolicies cpuset relative after its cpuset moves. See | ||
| 1141 | * further kernel/cpuset.c update_nodemask(). | ||
| 1142 | */ | ||
| 1143 | void *cpuset_being_rebound; | ||
| 1144 | |||
| 879 | /* Slow path of a mempolicy copy */ | 1145 | /* Slow path of a mempolicy copy */ |
| 880 | struct mempolicy *__mpol_copy(struct mempolicy *old) | 1146 | struct mempolicy *__mpol_copy(struct mempolicy *old) |
| 881 | { | 1147 | { |
| @@ -883,6 +1149,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
| 883 | 1149 | ||
| 884 | if (!new) | 1150 | if (!new) |
| 885 | return ERR_PTR(-ENOMEM); | 1151 | return ERR_PTR(-ENOMEM); |
| 1152 | if (current_cpuset_is_being_rebound()) { | ||
| 1153 | nodemask_t mems = cpuset_mems_allowed(current); | ||
| 1154 | mpol_rebind_policy(old, &mems); | ||
| 1155 | } | ||
| 886 | *new = *old; | 1156 | *new = *old; |
| 887 | atomic_set(&new->refcnt, 1); | 1157 | atomic_set(&new->refcnt, 1); |
| 888 | if (new->policy == MPOL_BIND) { | 1158 | if (new->policy == MPOL_BIND) { |
| @@ -936,54 +1206,6 @@ void __mpol_free(struct mempolicy *p) | |||
| 936 | } | 1206 | } |
| 937 | 1207 | ||
| 938 | /* | 1208 | /* |
| 939 | * Hugetlb policy. Same as above, just works with node numbers instead of | ||
| 940 | * zonelists. | ||
| 941 | */ | ||
| 942 | |||
| 943 | /* Find first node suitable for an allocation */ | ||
| 944 | int mpol_first_node(struct vm_area_struct *vma, unsigned long addr) | ||
| 945 | { | ||
| 946 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
| 947 | |||
| 948 | switch (pol->policy) { | ||
| 949 | case MPOL_DEFAULT: | ||
| 950 | return numa_node_id(); | ||
| 951 | case MPOL_BIND: | ||
| 952 | return pol->v.zonelist->zones[0]->zone_pgdat->node_id; | ||
| 953 | case MPOL_INTERLEAVE: | ||
| 954 | return interleave_nodes(pol); | ||
| 955 | case MPOL_PREFERRED: | ||
| 956 | return pol->v.preferred_node >= 0 ? | ||
| 957 | pol->v.preferred_node : numa_node_id(); | ||
| 958 | } | ||
| 959 | BUG(); | ||
| 960 | return 0; | ||
| 961 | } | ||
| 962 | |||
| 963 | /* Find secondary valid nodes for an allocation */ | ||
| 964 | int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr) | ||
| 965 | { | ||
| 966 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | ||
| 967 | |||
| 968 | switch (pol->policy) { | ||
| 969 | case MPOL_PREFERRED: | ||
| 970 | case MPOL_DEFAULT: | ||
| 971 | case MPOL_INTERLEAVE: | ||
| 972 | return 1; | ||
| 973 | case MPOL_BIND: { | ||
| 974 | struct zone **z; | ||
| 975 | for (z = pol->v.zonelist->zones; *z; z++) | ||
| 976 | if ((*z)->zone_pgdat->node_id == nid) | ||
| 977 | return 1; | ||
| 978 | return 0; | ||
| 979 | } | ||
| 980 | default: | ||
| 981 | BUG(); | ||
| 982 | return 0; | ||
| 983 | } | ||
| 984 | } | ||
| 985 | |||
| 986 | /* | ||
| 987 | * Shared memory backing store policy support. | 1209 | * Shared memory backing store policy support. |
| 988 | * | 1210 | * |
| 989 | * Remember policies even when nobody has shared memory mapped. | 1211 | * Remember policies even when nobody has shared memory mapped. |
| @@ -1205,25 +1427,31 @@ void numa_default_policy(void) | |||
| 1205 | } | 1427 | } |
| 1206 | 1428 | ||
| 1207 | /* Migrate a policy to a different set of nodes */ | 1429 | /* Migrate a policy to a different set of nodes */ |
| 1208 | static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | 1430 | void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) |
| 1209 | const nodemask_t *new) | ||
| 1210 | { | 1431 | { |
| 1432 | nodemask_t *mpolmask; | ||
| 1211 | nodemask_t tmp; | 1433 | nodemask_t tmp; |
| 1212 | 1434 | ||
| 1213 | if (!pol) | 1435 | if (!pol) |
| 1214 | return; | 1436 | return; |
| 1437 | mpolmask = &pol->cpuset_mems_allowed; | ||
| 1438 | if (nodes_equal(*mpolmask, *newmask)) | ||
| 1439 | return; | ||
| 1215 | 1440 | ||
| 1216 | switch (pol->policy) { | 1441 | switch (pol->policy) { |
| 1217 | case MPOL_DEFAULT: | 1442 | case MPOL_DEFAULT: |
| 1218 | break; | 1443 | break; |
| 1219 | case MPOL_INTERLEAVE: | 1444 | case MPOL_INTERLEAVE: |
| 1220 | nodes_remap(tmp, pol->v.nodes, *old, *new); | 1445 | nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); |
| 1221 | pol->v.nodes = tmp; | 1446 | pol->v.nodes = tmp; |
| 1222 | current->il_next = node_remap(current->il_next, *old, *new); | 1447 | *mpolmask = *newmask; |
| 1448 | current->il_next = node_remap(current->il_next, | ||
| 1449 | *mpolmask, *newmask); | ||
| 1223 | break; | 1450 | break; |
| 1224 | case MPOL_PREFERRED: | 1451 | case MPOL_PREFERRED: |
| 1225 | pol->v.preferred_node = node_remap(pol->v.preferred_node, | 1452 | pol->v.preferred_node = node_remap(pol->v.preferred_node, |
| 1226 | *old, *new); | 1453 | *mpolmask, *newmask); |
| 1454 | *mpolmask = *newmask; | ||
| 1227 | break; | 1455 | break; |
| 1228 | case MPOL_BIND: { | 1456 | case MPOL_BIND: { |
| 1229 | nodemask_t nodes; | 1457 | nodemask_t nodes; |
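The remap relies on node_remap() preserving a node's relative position: the n-th node of the policy's remembered cpuset_mems_allowed becomes the n-th node of the new mask. A minimal userspace model of that mapping (only the common "node is in the old mask" case, with plain words standing in for nodemask_t):

/*
 * Minimal model of node_remap() as used by mpol_rebind_policy(): a node
 * keeps its relative position when the policy follows its cpuset from
 * one set of nodes to another.
 */
#include <stdio.h>

static int ord_in_mask(unsigned long mask, int node)	/* set bits below 'node' */
{
	int i, ord = 0;

	for (i = 0; i < node; i++)
		if (mask & (1UL << i))
			ord++;
	return ord;
}

static int nth_node(unsigned long mask, int n)		/* position of n-th set bit */
{
	int i;

	for (i = 0; i < (int)(8 * sizeof(mask)); i++)
		if ((mask & (1UL << i)) && n-- == 0)
			return i;
	return -1;
}

static int node_remap(int node, unsigned long old, unsigned long new)
{
	return nth_node(new, ord_in_mask(old, node));
}

int main(void)
{
	unsigned long old = 0x2a;	/* nodes {1,3,5} */
	unsigned long new = 0x54;	/* nodes {2,4,6} */
	int n;

	for (n = 1; n <= 5; n += 2)
		printf("node %d -> node %d\n", n, node_remap(n, old, new));
	return 0;
}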
| @@ -1233,7 +1461,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
| 1233 | nodes_clear(nodes); | 1461 | nodes_clear(nodes); |
| 1234 | for (z = pol->v.zonelist->zones; *z; z++) | 1462 | for (z = pol->v.zonelist->zones; *z; z++) |
| 1235 | node_set((*z)->zone_pgdat->node_id, nodes); | 1463 | node_set((*z)->zone_pgdat->node_id, nodes); |
| 1236 | nodes_remap(tmp, nodes, *old, *new); | 1464 | nodes_remap(tmp, nodes, *mpolmask, *newmask); |
| 1237 | nodes = tmp; | 1465 | nodes = tmp; |
| 1238 | 1466 | ||
| 1239 | zonelist = bind_zonelist(&nodes); | 1467 | zonelist = bind_zonelist(&nodes); |
| @@ -1248,6 +1476,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
| 1248 | kfree(pol->v.zonelist); | 1476 | kfree(pol->v.zonelist); |
| 1249 | pol->v.zonelist = zonelist; | 1477 | pol->v.zonelist = zonelist; |
| 1250 | } | 1478 | } |
| 1479 | *mpolmask = *newmask; | ||
| 1251 | break; | 1480 | break; |
| 1252 | } | 1481 | } |
| 1253 | default: | 1482 | default: |
| @@ -1257,12 +1486,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
| 1257 | } | 1486 | } |
| 1258 | 1487 | ||
| 1259 | /* | 1488 | /* |
| 1260 | * Someone moved this task to different nodes. Fixup mempolicies. | 1489 | * Wrapper for mpol_rebind_policy() that just requires task |
| 1490 | * pointer, and updates task mempolicy. | ||
| 1491 | */ | ||
| 1492 | |||
| 1493 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | ||
| 1494 | { | ||
| 1495 | mpol_rebind_policy(tsk->mempolicy, new); | ||
| 1496 | } | ||
| 1497 | |||
| 1498 | /* | ||
| 1499 | * Rebind each vma in mm to new nodemask. | ||
| 1261 | * | 1500 | * |
| 1262 | * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, | 1501 | * Call holding a reference to mm. Takes mm->mmap_sem during call. |
| 1263 | * once we have a cpuset mechanism to mark which cpuset subtree is migrating. | ||
| 1264 | */ | 1502 | */ |
| 1265 | void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) | 1503 | |
| 1504 | void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | ||
| 1505 | { | ||
| 1506 | struct vm_area_struct *vma; | ||
| 1507 | |||
| 1508 | down_write(&mm->mmap_sem); | ||
| 1509 | for (vma = mm->mmap; vma; vma = vma->vm_next) | ||
| 1510 | mpol_rebind_policy(vma->vm_policy, new); | ||
| 1511 | up_write(&mm->mmap_sem); | ||
| 1512 | } | ||
| 1513 | |||
| 1514 | /* | ||
| 1515 | * Display pages allocated per node and memory policy via /proc. | ||
| 1516 | */ | ||
| 1517 | |||
| 1518 | static const char *policy_types[] = { "default", "prefer", "bind", | ||
| 1519 | "interleave" }; | ||
| 1520 | |||
| 1521 | /* | ||
| 1522 | * Convert a mempolicy into a string. | ||
| 1523 | * Returns the number of characters in buffer (if positive) | ||
| 1524 | * or an error (negative) | ||
| 1525 | */ | ||
| 1526 | static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | ||
| 1527 | { | ||
| 1528 | char *p = buffer; | ||
| 1529 | int l; | ||
| 1530 | nodemask_t nodes; | ||
| 1531 | int mode = pol ? pol->policy : MPOL_DEFAULT; | ||
| 1532 | |||
| 1533 | switch (mode) { | ||
| 1534 | case MPOL_DEFAULT: | ||
| 1535 | nodes_clear(nodes); | ||
| 1536 | break; | ||
| 1537 | |||
| 1538 | case MPOL_PREFERRED: | ||
| 1539 | nodes_clear(nodes); | ||
| 1540 | node_set(pol->v.preferred_node, nodes); | ||
| 1541 | break; | ||
| 1542 | |||
| 1543 | case MPOL_BIND: | ||
| 1544 | get_zonemask(pol, &nodes); | ||
| 1545 | break; | ||
| 1546 | |||
| 1547 | case MPOL_INTERLEAVE: | ||
| 1548 | nodes = pol->v.nodes; | ||
| 1549 | break; | ||
| 1550 | |||
| 1551 | default: | ||
| 1552 | BUG(); | ||
| 1553 | return -EFAULT; | ||
| 1554 | } | ||
| 1555 | |||
| 1556 | l = strlen(policy_types[mode]); | ||
| 1557 | if (buffer + maxlen < p + l + 1) | ||
| 1558 | return -ENOSPC; | ||
| 1559 | |||
| 1560 | strcpy(p, policy_types[mode]); | ||
| 1561 | p += l; | ||
| 1562 | |||
| 1563 | if (!nodes_empty(nodes)) { | ||
| 1564 | if (buffer + maxlen < p + 2) | ||
| 1565 | return -ENOSPC; | ||
| 1566 | *p++ = '='; | ||
| 1567 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | ||
| 1568 | } | ||
| 1569 | return p - buffer; | ||
| 1570 | } | ||
| 1571 | |||
| 1572 | struct numa_maps { | ||
| 1573 | unsigned long pages; | ||
| 1574 | unsigned long anon; | ||
| 1575 | unsigned long mapped; | ||
| 1576 | unsigned long mapcount_max; | ||
| 1577 | unsigned long node[MAX_NUMNODES]; | ||
| 1578 | }; | ||
| 1579 | |||
| 1580 | static void gather_stats(struct page *page, void *private) | ||
| 1266 | { | 1581 | { |
| 1267 | rebind_policy(current->mempolicy, old, new); | 1582 | struct numa_maps *md = private; |
| 1583 | int count = page_mapcount(page); | ||
| 1584 | |||
| 1585 | if (count) | ||
| 1586 | md->mapped++; | ||
| 1587 | |||
| 1588 | if (count > md->mapcount_max) | ||
| 1589 | md->mapcount_max = count; | ||
| 1590 | |||
| 1591 | md->pages++; | ||
| 1592 | |||
| 1593 | if (PageAnon(page)) | ||
| 1594 | md->anon++; | ||
| 1595 | |||
| 1596 | md->node[page_to_nid(page)]++; | ||
| 1597 | cond_resched(); | ||
| 1598 | } | ||
| 1599 | |||
| 1600 | int show_numa_map(struct seq_file *m, void *v) | ||
| 1601 | { | ||
| 1602 | struct task_struct *task = m->private; | ||
| 1603 | struct vm_area_struct *vma = v; | ||
| 1604 | struct numa_maps *md; | ||
| 1605 | int n; | ||
| 1606 | char buffer[50]; | ||
| 1607 | |||
| 1608 | if (!vma->vm_mm) | ||
| 1609 | return 0; | ||
| 1610 | |||
| 1611 | md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); | ||
| 1612 | if (!md) | ||
| 1613 | return 0; | ||
| 1614 | |||
| 1615 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | ||
| 1616 | &node_online_map, MPOL_MF_STATS, md); | ||
| 1617 | |||
| 1618 | if (md->pages) { | ||
| 1619 | mpol_to_str(buffer, sizeof(buffer), | ||
| 1620 | get_vma_policy(task, vma, vma->vm_start)); | ||
| 1621 | |||
| 1622 | seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", | ||
| 1623 | vma->vm_start, buffer, md->pages, | ||
| 1624 | md->mapped, md->mapcount_max); | ||
| 1625 | |||
| 1626 | if (md->anon) | ||
| 1627 | seq_printf(m," anon=%lu",md->anon); | ||
| 1628 | |||
| 1629 | for_each_online_node(n) | ||
| 1630 | if (md->node[n]) | ||
| 1631 | seq_printf(m, " N%d=%lu", n, md->node[n]); | ||
| 1632 | |||
| 1633 | seq_putc(m, '\n'); | ||
| 1634 | } | ||
| 1635 | kfree(md); | ||
| 1636 | |||
| 1637 | if (m->count < m->size) | ||
| 1638 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | ||
| 1639 | return 0; | ||
| 1268 | } | 1640 | } |
| 1641 | |||
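show_numa_map() prints one line per VMA: the start address, the mpol_to_str() policy string, then the counters gathered by walking the VMA with MPOL_MF_STATS and, per node, how many pages live there. Assuming the companion fs/proc hook (not part of this hunk) exposes this as /proc/<pid>/numa_maps, a trivial reader looks like:

/*
 * Hedged sketch: dump the new per-VMA NUMA statistics for the current
 * process.  The /proc/self/numa_maps path assumes the fs/proc wiring
 * that accompanies this patch; lines look roughly like
 * "00400000 prefer=1 pages=12 mapped=12 maxref=1 N1=12".
 */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/numa_maps", "r");

	if (!f) {
		perror("/proc/self/numa_maps");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}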
diff --git a/mm/mlock.c b/mm/mlock.c index 4ae3a46ff768..b90c59573abf 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | * (C) Copyright 2002 Christoph Hellwig | 5 | * (C) Copyright 2002 Christoph Hellwig |
| 6 | */ | 6 | */ |
| 7 | 7 | ||
| 8 | #include <linux/capability.h> | ||
| 8 | #include <linux/mman.h> | 9 | #include <linux/mman.h> |
| 9 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
| 10 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
| 14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
| 15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
| 16 | #include <linux/capability.h> | ||
| 16 | #include <linux/init.h> | 17 | #include <linux/init.h> |
| 17 | #include <linux/file.h> | 18 | #include <linux/file.h> |
| 18 | #include <linux/fs.h> | 19 | #include <linux/fs.h> |
diff --git a/mm/mremap.c b/mm/mremap.c index ddaeee9a0b69..1903bdf65e42 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/shm.h> | 13 | #include <linux/shm.h> |
| 14 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
| 15 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
| 16 | #include <linux/capability.h> | ||
| 16 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
| 17 | #include <linux/highmem.h> | 18 | #include <linux/highmem.h> |
| 18 | #include <linux/security.h> | 19 | #include <linux/security.h> |
diff --git a/mm/msync.c b/mm/msync.c index 1b5b6f662dcf..3563a56e1a51 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
| @@ -137,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma, | |||
| 137 | ret = filemap_fdatawrite(mapping); | 137 | ret = filemap_fdatawrite(mapping); |
| 138 | if (file->f_op && file->f_op->fsync) { | 138 | if (file->f_op && file->f_op->fsync) { |
| 139 | /* | 139 | /* |
| 140 | * We don't take i_sem here because mmap_sem | 140 | * We don't take i_mutex here because mmap_sem |
| 141 | * is already held. | 141 | * is already held. |
| 142 | */ | 142 | */ |
| 143 | err = file->f_op->fsync(file,file->f_dentry,1); | 143 | err = file->f_op->fsync(file,file->f_dentry,1); |
diff --git a/mm/nommu.c b/mm/nommu.c index c1196812876b..c10262d68232 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr) | |||
| 1177 | { | 1177 | { |
| 1178 | return 0; | 1178 | return 0; |
| 1179 | } | 1179 | } |
| 1180 | |||
| 1181 | struct page *filemap_nopage(struct vm_area_struct *area, | ||
| 1182 | unsigned long address, int *type) | ||
| 1183 | { | ||
| 1184 | BUG(); | ||
| 1185 | return NULL; | ||
| 1186 | } | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d348b9035955..4748b906aff2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -298,7 +298,8 @@ retry: | |||
| 298 | 298 | ||
| 299 | /* | 299 | /* |
| 300 | * Give "p" a good chance of killing itself before we | 300 | * Give "p" a good chance of killing itself before we |
| 301 | * retry to allocate memory. | 301 | * retry to allocate memory unless "p" is current |
| 302 | */ | 302 | */ |
| 303 | schedule_timeout_interruptible(1); | 303 | if (!test_thread_flag(TIF_MEMDIE)) |
| 304 | schedule_timeout_interruptible(1); | ||
| 304 | } | 305 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0166ea15c9ee..5240e426c1f7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -550,11 +550,17 @@ void __init page_writeback_init(void) | |||
| 550 | 550 | ||
| 551 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | 551 | int do_writepages(struct address_space *mapping, struct writeback_control *wbc) |
| 552 | { | 552 | { |
| 553 | int ret; | ||
| 554 | |||
| 553 | if (wbc->nr_to_write <= 0) | 555 | if (wbc->nr_to_write <= 0) |
| 554 | return 0; | 556 | return 0; |
| 557 | wbc->for_writepages = 1; | ||
| 555 | if (mapping->a_ops->writepages) | 558 | if (mapping->a_ops->writepages) |
| 556 | return mapping->a_ops->writepages(mapping, wbc); | 559 | ret = mapping->a_ops->writepages(mapping, wbc); |
| 557 | return generic_writepages(mapping, wbc); | 560 | else |
| 561 | ret = generic_writepages(mapping, wbc); | ||
| 562 | wbc->for_writepages = 0; | ||
| 563 | return ret; | ||
| 558 | } | 564 | } |
| 559 | 565 | ||
| 560 | /** | 566 | /** |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe14a8c87fc2..8c960b469593 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -36,6 +36,7 @@ | |||
| 36 | #include <linux/memory_hotplug.h> | 36 | #include <linux/memory_hotplug.h> |
| 37 | #include <linux/nodemask.h> | 37 | #include <linux/nodemask.h> |
| 38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
| 39 | #include <linux/mempolicy.h> | ||
| 39 | 40 | ||
| 40 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
| 41 | #include "internal.h" | 42 | #include "internal.h" |
| @@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly; | |||
| 52 | unsigned long totalram_pages __read_mostly; | 53 | unsigned long totalram_pages __read_mostly; |
| 53 | unsigned long totalhigh_pages __read_mostly; | 54 | unsigned long totalhigh_pages __read_mostly; |
| 54 | long nr_swap_pages; | 55 | long nr_swap_pages; |
| 56 | int percpu_pagelist_fraction; | ||
| 57 | |||
| 58 | static void fastcall free_hot_cold_page(struct page *page, int cold); | ||
| 55 | 59 | ||
| 56 | /* | 60 | /* |
| 57 | * results with 256, 32 in the lowmem_reserve sysctl: | 61 | * results with 256, 32 in the lowmem_reserve sysctl: |
| @@ -81,6 +85,7 @@ int min_free_kbytes = 1024; | |||
| 81 | unsigned long __initdata nr_kernel_pages; | 85 | unsigned long __initdata nr_kernel_pages; |
| 82 | unsigned long __initdata nr_all_pages; | 86 | unsigned long __initdata nr_all_pages; |
| 83 | 87 | ||
| 88 | #ifdef CONFIG_DEBUG_VM | ||
| 84 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 89 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| 85 | { | 90 | { |
| 86 | int ret = 0; | 91 | int ret = 0; |
| @@ -122,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page) | |||
| 122 | return 0; | 127 | return 0; |
| 123 | } | 128 | } |
| 124 | 129 | ||
| 125 | static void bad_page(const char *function, struct page *page) | 130 | #else |
| 131 | static inline int bad_range(struct zone *zone, struct page *page) | ||
| 126 | { | 132 | { |
| 127 | printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", | 133 | return 0; |
| 128 | function, current->comm, page); | 134 | } |
| 129 | printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", | 135 | #endif |
| 130 | (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, | 136 | |
| 131 | page->mapping, page_mapcount(page), page_count(page)); | 137 | static void bad_page(struct page *page) |
| 132 | printk(KERN_EMERG "Backtrace:\n"); | 138 | { |
| 139 | printk(KERN_EMERG "Bad page state in process '%s'\n" | ||
| 140 | KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" | ||
| 141 | KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | ||
| 142 | KERN_EMERG "Backtrace:\n", | ||
| 143 | current->comm, page, (int)(2*sizeof(unsigned long)), | ||
| 144 | (unsigned long)page->flags, page->mapping, | ||
| 145 | page_mapcount(page), page_count(page)); | ||
| 133 | dump_stack(); | 146 | dump_stack(); |
| 134 | printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); | ||
| 135 | page->flags &= ~(1 << PG_lru | | 147 | page->flags &= ~(1 << PG_lru | |
| 136 | 1 << PG_private | | 148 | 1 << PG_private | |
| 137 | 1 << PG_locked | | 149 | 1 << PG_locked | |
| @@ -184,19 +196,15 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
| 184 | int i; | 196 | int i; |
| 185 | int nr_pages = 1 << order; | 197 | int nr_pages = 1 << order; |
| 186 | 198 | ||
| 187 | if (!PageCompound(page)) | 199 | if (unlikely(page[1].index != order)) |
| 188 | return; | 200 | bad_page(page); |
| 189 | |||
| 190 | if (page[1].index != order) | ||
| 191 | bad_page(__FUNCTION__, page); | ||
| 192 | 201 | ||
| 193 | for (i = 0; i < nr_pages; i++) { | 202 | for (i = 0; i < nr_pages; i++) { |
| 194 | struct page *p = page + i; | 203 | struct page *p = page + i; |
| 195 | 204 | ||
| 196 | if (!PageCompound(p)) | 205 | if (unlikely(!PageCompound(p) | |
| 197 | bad_page(__FUNCTION__, page); | 206 | (page_private(p) != (unsigned long)page))) |
| 198 | if (page_private(p) != (unsigned long)page) | 207 | bad_page(page); |
| 199 | bad_page(__FUNCTION__, page); | ||
| 200 | ClearPageCompound(p); | 208 | ClearPageCompound(p); |
| 201 | } | 209 | } |
| 202 | } | 210 | } |
| @@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
| 255 | /* | 263 | /* |
| 256 | * This function checks whether a page is free && is the buddy | 264 | * This function checks whether a page is free && is the buddy |
| 257 | * we can do coalesce a page and its buddy if | 265 | * we can do coalesce a page and its buddy if |
| 258 | * (a) the buddy is free && | 266 | * (a) the buddy is not in a hole && |
| 259 | * (b) the buddy is on the buddy system && | 267 | * (b) the buddy is free && |
| 260 | * (c) a page and its buddy have the same order. | 268 | * (c) the buddy is on the buddy system && |
| 269 | * (d) a page and its buddy have the same order. | ||
| 261 | * for recording page's order, we use page_private(page) and PG_private. | 270 | * for recording page's order, we use page_private(page) and PG_private. |
| 262 | * | 271 | * |
| 263 | */ | 272 | */ |
| 264 | static inline int page_is_buddy(struct page *page, int order) | 273 | static inline int page_is_buddy(struct page *page, int order) |
| 265 | { | 274 | { |
| 275 | #ifdef CONFIG_HOLES_IN_ZONE | ||
| 276 | if (!pfn_valid(page_to_pfn(page))) | ||
| 277 | return 0; | ||
| 278 | #endif | ||
| 279 | |||
| 266 | if (PagePrivate(page) && | 280 | if (PagePrivate(page) && |
| 267 | (page_order(page) == order) && | 281 | (page_order(page) == order) && |
| 268 | page_count(page) == 0) | 282 | page_count(page) == 0) |
| @@ -294,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order) | |||
| 294 | * -- wli | 308 | * -- wli |
| 295 | */ | 309 | */ |
| 296 | 310 | ||
| 297 | static inline void __free_pages_bulk (struct page *page, | 311 | static inline void __free_one_page(struct page *page, |
| 298 | struct zone *zone, unsigned int order) | 312 | struct zone *zone, unsigned int order) |
| 299 | { | 313 | { |
| 300 | unsigned long page_idx; | 314 | unsigned long page_idx; |
| 301 | int order_size = 1 << order; | 315 | int order_size = 1 << order; |
| 302 | 316 | ||
| 303 | if (unlikely(order)) | 317 | if (unlikely(PageCompound(page))) |
| 304 | destroy_compound_page(page, order); | 318 | destroy_compound_page(page, order); |
| 305 | 319 | ||
| 306 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 320 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
| @@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page, | |||
| 314 | struct free_area *area; | 328 | struct free_area *area; |
| 315 | struct page *buddy; | 329 | struct page *buddy; |
| 316 | 330 | ||
| 317 | combined_idx = __find_combined_index(page_idx, order); | ||
| 318 | buddy = __page_find_buddy(page, page_idx, order); | 331 | buddy = __page_find_buddy(page, page_idx, order); |
| 319 | |||
| 320 | if (bad_range(zone, buddy)) | ||
| 321 | break; | ||
| 322 | if (!page_is_buddy(buddy, order)) | 332 | if (!page_is_buddy(buddy, order)) |
| 323 | break; /* Move the buddy up one level. */ | 333 | break; /* Move the buddy up one level. */ |
| 334 | |||
| 324 | list_del(&buddy->lru); | 335 | list_del(&buddy->lru); |
| 325 | area = zone->free_area + order; | 336 | area = zone->free_area + order; |
| 326 | area->nr_free--; | 337 | area->nr_free--; |
| 327 | rmv_page_order(buddy); | 338 | rmv_page_order(buddy); |
| 339 | combined_idx = __find_combined_index(page_idx, order); | ||
| 328 | page = page + (combined_idx - page_idx); | 340 | page = page + (combined_idx - page_idx); |
| 329 | page_idx = combined_idx; | 341 | page_idx = combined_idx; |
| 330 | order++; | 342 | order++; |
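__free_one_page() leans on the usual buddy arithmetic from __page_find_buddy() and __find_combined_index(): the buddy of a block differs from it only in bit 'order', and clearing that bit gives the index of the merged block, so the loop keeps coalescing upward as long as page_is_buddy() agrees. A short demonstration of those two index operations:

/*
 * Demonstration of the buddy index arithmetic used by __free_one_page():
 * toggling bit 'order' finds the buddy, clearing it gives the index the
 * merged block collapses onto.
 */
#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

static unsigned long find_combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	unsigned long idx = 44;		/* a free order-2 block at pfn offset 44 */
	unsigned int order;

	for (order = 2; order < 5; order++) {
		unsigned long buddy = find_buddy_index(idx, order);
		unsigned long combined = find_combined_index(idx, order);

		printf("order %u: block %lu merges with buddy %lu into block %lu\n",
		       order, idx, buddy, combined);
		idx = combined;		/* keep merging upward, as the while loop does */
	}
	return 0;
}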
| @@ -334,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page, | |||
| 334 | zone->free_area[order].nr_free++; | 346 | zone->free_area[order].nr_free++; |
| 335 | } | 347 | } |
| 336 | 348 | ||
| 337 | static inline int free_pages_check(const char *function, struct page *page) | 349 | static inline int free_pages_check(struct page *page) |
| 338 | { | 350 | { |
| 339 | if ( page_mapcount(page) || | 351 | if (unlikely(page_mapcount(page) | |
| 340 | page->mapping != NULL || | 352 | (page->mapping != NULL) | |
| 341 | page_count(page) != 0 || | 353 | (page_count(page) != 0) | |
| 342 | (page->flags & ( | 354 | (page->flags & ( |
| 343 | 1 << PG_lru | | 355 | 1 << PG_lru | |
| 344 | 1 << PG_private | | 356 | 1 << PG_private | |
| @@ -348,8 +360,8 @@ static inline int free_pages_check(const char *function, struct page *page) | |||
| 348 | 1 << PG_slab | | 360 | 1 << PG_slab | |
| 349 | 1 << PG_swapcache | | 361 | 1 << PG_swapcache | |
| 350 | 1 << PG_writeback | | 362 | 1 << PG_writeback | |
| 351 | 1 << PG_reserved ))) | 363 | 1 << PG_reserved )))) |
| 352 | bad_page(function, page); | 364 | bad_page(page); |
| 353 | if (PageDirty(page)) | 365 | if (PageDirty(page)) |
| 354 | __ClearPageDirty(page); | 366 | __ClearPageDirty(page); |
| 355 | /* | 367 | /* |
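free_pages_check() (and prep_new_page() further down) now combine the individual tests with bitwise '|' instead of '||'. For an if() only zero versus non-zero matters, and since none of the tests has side effects, giving up short-circuit evaluation costs nothing while letting the compiler evaluate them all without a branch per test. A quick equivalence check over a few sample values:

/*
 * Sketch: '|' and '||' give the same zero/non-zero answer for
 * side-effect-free operands, which is all an if() cares about.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	int samples[] = { 0, 1, 2, 0x40000 };	/* mapcount-like and flag-like values */
	int n = sizeof(samples) / sizeof(samples[0]);
	int a, b, c;

	for (a = 0; a < n; a++)
		for (b = 0; b < n; b++)
			for (c = 0; c < n; c++) {
				int logical = (samples[a] || samples[b] || samples[c]);
				int bitwise = ((samples[a] | samples[b] | samples[c]) != 0);

				assert(logical == bitwise);
			}
	printf("'|' and '||' agree on zero/non-zero for all sampled inputs\n");
	return 0;
}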
| @@ -371,51 +383,90 @@ static inline int free_pages_check(const char *function, struct page *page) | |||
| 371 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 383 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
| 372 | * pinned" detection logic. | 384 | * pinned" detection logic. |
| 373 | */ | 385 | */ |
| 374 | static int | 386 | static void free_pages_bulk(struct zone *zone, int count, |
| 375 | free_pages_bulk(struct zone *zone, int count, | 387 | struct list_head *list, int order) |
| 376 | struct list_head *list, unsigned int order) | ||
| 377 | { | 388 | { |
| 378 | unsigned long flags; | 389 | spin_lock(&zone->lock); |
| 379 | struct page *page = NULL; | ||
| 380 | int ret = 0; | ||
| 381 | |||
| 382 | spin_lock_irqsave(&zone->lock, flags); | ||
| 383 | zone->all_unreclaimable = 0; | 390 | zone->all_unreclaimable = 0; |
| 384 | zone->pages_scanned = 0; | 391 | zone->pages_scanned = 0; |
| 385 | while (!list_empty(list) && count--) { | 392 | while (count--) { |
| 393 | struct page *page; | ||
| 394 | |||
| 395 | BUG_ON(list_empty(list)); | ||
| 386 | page = list_entry(list->prev, struct page, lru); | 396 | page = list_entry(list->prev, struct page, lru); |
| 387 | /* have to delete it as __free_pages_bulk list manipulates */ | 397 | /* have to delete it as __free_one_page list manipulates */ |
| 388 | list_del(&page->lru); | 398 | list_del(&page->lru); |
| 389 | __free_pages_bulk(page, zone, order); | 399 | __free_one_page(page, zone, order); |
| 390 | ret++; | ||
| 391 | } | 400 | } |
| 392 | spin_unlock_irqrestore(&zone->lock, flags); | 401 | spin_unlock(&zone->lock); |
| 393 | return ret; | ||
| 394 | } | 402 | } |
| 395 | 403 | ||
| 396 | void __free_pages_ok(struct page *page, unsigned int order) | 404 | static void free_one_page(struct zone *zone, struct page *page, int order) |
| 397 | { | 405 | { |
| 398 | LIST_HEAD(list); | 406 | LIST_HEAD(list); |
| 407 | list_add(&page->lru, &list); | ||
| 408 | free_pages_bulk(zone, 1, &list, order); | ||
| 409 | } | ||
| 410 | |||
| 411 | static void __free_pages_ok(struct page *page, unsigned int order) | ||
| 412 | { | ||
| 413 | unsigned long flags; | ||
| 399 | int i; | 414 | int i; |
| 400 | int reserved = 0; | 415 | int reserved = 0; |
| 401 | 416 | ||
| 402 | arch_free_page(page, order); | 417 | arch_free_page(page, order); |
| 418 | if (!PageHighMem(page)) | ||
| 419 | mutex_debug_check_no_locks_freed(page_address(page), | ||
| 420 | PAGE_SIZE<<order); | ||
| 403 | 421 | ||
| 404 | #ifndef CONFIG_MMU | 422 | #ifndef CONFIG_MMU |
| 405 | if (order > 0) | 423 | for (i = 1 ; i < (1 << order) ; ++i) |
| 406 | for (i = 1 ; i < (1 << order) ; ++i) | 424 | __put_page(page + i); |
| 407 | __put_page(page + i); | ||
| 408 | #endif | 425 | #endif |
| 409 | 426 | ||
| 410 | for (i = 0 ; i < (1 << order) ; ++i) | 427 | for (i = 0 ; i < (1 << order) ; ++i) |
| 411 | reserved += free_pages_check(__FUNCTION__, page + i); | 428 | reserved += free_pages_check(page + i); |
| 412 | if (reserved) | 429 | if (reserved) |
| 413 | return; | 430 | return; |
| 414 | 431 | ||
| 415 | list_add(&page->lru, &list); | 432 | kernel_map_pages(page, 1 << order, 0); |
| 416 | mod_page_state(pgfree, 1 << order); | 433 | local_irq_save(flags); |
| 417 | kernel_map_pages(page, 1<<order, 0); | 434 | __mod_page_state(pgfree, 1 << order); |
| 418 | free_pages_bulk(page_zone(page), 1, &list, order); | 435 | free_one_page(page_zone(page), page, order); |
| 436 | local_irq_restore(flags); | ||
| 437 | } | ||
| 438 | |||
| 439 | /* | ||
| 440 | * permit the bootmem allocator to evade page validation on high-order frees | ||
| 441 | */ | ||
| 442 | void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | ||
| 443 | { | ||
| 444 | if (order == 0) { | ||
| 445 | __ClearPageReserved(page); | ||
| 446 | set_page_count(page, 0); | ||
| 447 | |||
| 448 | free_hot_cold_page(page, 0); | ||
| 449 | } else { | ||
| 450 | LIST_HEAD(list); | ||
| 451 | int loop; | ||
| 452 | |||
| 453 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | ||
| 454 | struct page *p = &page[loop]; | ||
| 455 | |||
| 456 | if (loop + 16 < BITS_PER_LONG) | ||
| 457 | prefetchw(p + 16); | ||
| 458 | __ClearPageReserved(p); | ||
| 459 | set_page_count(p, 0); | ||
| 460 | } | ||
| 461 | |||
| 462 | arch_free_page(page, order); | ||
| 463 | |||
| 464 | mod_page_state(pgfree, 1 << order); | ||
| 465 | |||
| 466 | list_add(&page->lru, &list); | ||
| 467 | kernel_map_pages(page, 1 << order, 0); | ||
| 468 | free_pages_bulk(page_zone(page), 1, &list, order); | ||
| 469 | } | ||
| 419 | } | 470 | } |
| 420 | 471 | ||
| 421 | 472 | ||
| @@ -433,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order) | |||
| 433 | * | 484 | * |
| 434 | * -- wli | 485 | * -- wli |
| 435 | */ | 486 | */ |
| 436 | static inline struct page * | 487 | static inline void expand(struct zone *zone, struct page *page, |
| 437 | expand(struct zone *zone, struct page *page, | ||
| 438 | int low, int high, struct free_area *area) | 488 | int low, int high, struct free_area *area) |
| 439 | { | 489 | { |
| 440 | unsigned long size = 1 << high; | 490 | unsigned long size = 1 << high; |
| @@ -448,24 +498,6 @@ expand(struct zone *zone, struct page *page, | |||
| 448 | area->nr_free++; | 498 | area->nr_free++; |
| 449 | set_page_order(&page[size], high); | 499 | set_page_order(&page[size], high); |
| 450 | } | 500 | } |
| 451 | return page; | ||
| 452 | } | ||
| 453 | |||
| 454 | void set_page_refs(struct page *page, int order) | ||
| 455 | { | ||
| 456 | #ifdef CONFIG_MMU | ||
| 457 | set_page_count(page, 1); | ||
| 458 | #else | ||
| 459 | int i; | ||
| 460 | |||
| 461 | /* | ||
| 462 | * We need to reference all the pages for this order, otherwise if | ||
| 463 | * anyone accesses one of the pages with (get/put) it will be freed. | ||
| 464 | * - eg: access_process_vm() | ||
| 465 | */ | ||
| 466 | for (i = 0; i < (1 << order); i++) | ||
| 467 | set_page_count(page + i, 1); | ||
| 468 | #endif /* CONFIG_MMU */ | ||
| 469 | } | 501 | } |
| 470 | 502 | ||
| 471 | /* | 503 | /* |
| @@ -473,9 +505,9 @@ void set_page_refs(struct page *page, int order) | |||
| 473 | */ | 505 | */ |
| 474 | static int prep_new_page(struct page *page, int order) | 506 | static int prep_new_page(struct page *page, int order) |
| 475 | { | 507 | { |
| 476 | if ( page_mapcount(page) || | 508 | if (unlikely(page_mapcount(page) | |
| 477 | page->mapping != NULL || | 509 | (page->mapping != NULL) | |
| 478 | page_count(page) != 0 || | 510 | (page_count(page) != 0) | |
| 479 | (page->flags & ( | 511 | (page->flags & ( |
| 480 | 1 << PG_lru | | 512 | 1 << PG_lru | |
| 481 | 1 << PG_private | | 513 | 1 << PG_private | |
| @@ -486,8 +518,8 @@ static int prep_new_page(struct page *page, int order) | |||
| 486 | 1 << PG_slab | | 518 | 1 << PG_slab | |
| 487 | 1 << PG_swapcache | | 519 | 1 << PG_swapcache | |
| 488 | 1 << PG_writeback | | 520 | 1 << PG_writeback | |
| 489 | 1 << PG_reserved ))) | 521 | 1 << PG_reserved )))) |
| 490 | bad_page(__FUNCTION__, page); | 522 | bad_page(page); |
| 491 | 523 | ||
| 492 | /* | 524 | /* |
| 493 | * For now, we report if PG_reserved was found set, but do not | 525 | * For now, we report if PG_reserved was found set, but do not |
| @@ -525,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) | |||
| 525 | rmv_page_order(page); | 557 | rmv_page_order(page); |
| 526 | area->nr_free--; | 558 | area->nr_free--; |
| 527 | zone->free_pages -= 1UL << order; | 559 | zone->free_pages -= 1UL << order; |
| 528 | return expand(zone, page, order, current_order, area); | 560 | expand(zone, page, order, current_order, area); |
| 561 | return page; | ||
| 529 | } | 562 | } |
| 530 | 563 | ||
| 531 | return NULL; | 564 | return NULL; |
| @@ -539,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) | |||
| 539 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 572 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| 540 | unsigned long count, struct list_head *list) | 573 | unsigned long count, struct list_head *list) |
| 541 | { | 574 | { |
| 542 | unsigned long flags; | ||
| 543 | int i; | 575 | int i; |
| 544 | int allocated = 0; | ||
| 545 | struct page *page; | ||
| 546 | 576 | ||
| 547 | spin_lock_irqsave(&zone->lock, flags); | 577 | spin_lock(&zone->lock); |
| 548 | for (i = 0; i < count; ++i) { | 578 | for (i = 0; i < count; ++i) { |
| 549 | page = __rmqueue(zone, order); | 579 | struct page *page = __rmqueue(zone, order); |
| 550 | if (page == NULL) | 580 | if (unlikely(page == NULL)) |
| 551 | break; | 581 | break; |
| 552 | allocated++; | ||
| 553 | list_add_tail(&page->lru, list); | 582 | list_add_tail(&page->lru, list); |
| 554 | } | 583 | } |
| 555 | spin_unlock_irqrestore(&zone->lock, flags); | 584 | spin_unlock(&zone->lock); |
| 556 | return allocated; | 585 | return i; |
| 557 | } | 586 | } |
| 558 | 587 | ||
| 559 | #ifdef CONFIG_NUMA | 588 | #ifdef CONFIG_NUMA |
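rmqueue_bulk() above drops both the private allocated counter and the irqsave locking: the loop index already equals the number of pages moved, and the callers now disable interrupts themselves. A small self-contained sketch of the counting idiom (the stub allocator and names are invented for the example):

    #include <stdio.h>

    static int avail = 3;                       /* stub: only three items left */
    static void *grab_one(void) { return avail-- > 0 ? &avail : NULL; }

    /* fill up to 'count' slots; the loop index doubles as the return value */
    static int grab_bulk(void *slots[], int count)
    {
        int i;
        for (i = 0; i < count; i++) {
            void *item = grab_one();
            if (item == NULL)
                break;
            slots[i] = item;
        }
        return i;                               /* may be less than 'count' */
    }

    int main(void)
    {
        void *slots[8];
        printf("got %d of 8\n", grab_bulk(slots, 8));
        return 0;
    }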
| @@ -572,14 +601,13 @@ void drain_remote_pages(void) | |||
| 572 | if (zone->zone_pgdat->node_id == numa_node_id()) | 601 | if (zone->zone_pgdat->node_id == numa_node_id()) |
| 573 | continue; | 602 | continue; |
| 574 | 603 | ||
| 575 | pset = zone->pageset[smp_processor_id()]; | 604 | pset = zone_pcp(zone, smp_processor_id()); |
| 576 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 605 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
| 577 | struct per_cpu_pages *pcp; | 606 | struct per_cpu_pages *pcp; |
| 578 | 607 | ||
| 579 | pcp = &pset->pcp[i]; | 608 | pcp = &pset->pcp[i]; |
| 580 | if (pcp->count) | 609 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
| 581 | pcp->count -= free_pages_bulk(zone, pcp->count, | 610 | pcp->count = 0; |
| 582 | &pcp->list, 0); | ||
| 583 | } | 611 | } |
| 584 | } | 612 | } |
| 585 | local_irq_restore(flags); | 613 | local_irq_restore(flags); |
| @@ -589,6 +617,7 @@ void drain_remote_pages(void) | |||
| 589 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) | 617 | #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) |
| 590 | static void __drain_pages(unsigned int cpu) | 618 | static void __drain_pages(unsigned int cpu) |
| 591 | { | 619 | { |
| 620 | unsigned long flags; | ||
| 592 | struct zone *zone; | 621 | struct zone *zone; |
| 593 | int i; | 622 | int i; |
| 594 | 623 | ||
| @@ -600,8 +629,10 @@ static void __drain_pages(unsigned int cpu) | |||
| 600 | struct per_cpu_pages *pcp; | 629 | struct per_cpu_pages *pcp; |
| 601 | 630 | ||
| 602 | pcp = &pset->pcp[i]; | 631 | pcp = &pset->pcp[i]; |
| 603 | pcp->count -= free_pages_bulk(zone, pcp->count, | 632 | local_irq_save(flags); |
| 604 | &pcp->list, 0); | 633 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
| 634 | pcp->count = 0; | ||
| 635 | local_irq_restore(flags); | ||
| 605 | } | 636 | } |
| 606 | } | 637 | } |
| 607 | } | 638 | } |
| @@ -647,18 +678,14 @@ void drain_local_pages(void) | |||
| 647 | } | 678 | } |
| 648 | #endif /* CONFIG_PM */ | 679 | #endif /* CONFIG_PM */ |
| 649 | 680 | ||
| 650 | static void zone_statistics(struct zonelist *zonelist, struct zone *z) | 681 | static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) |
| 651 | { | 682 | { |
| 652 | #ifdef CONFIG_NUMA | 683 | #ifdef CONFIG_NUMA |
| 653 | unsigned long flags; | ||
| 654 | int cpu; | ||
| 655 | pg_data_t *pg = z->zone_pgdat; | 684 | pg_data_t *pg = z->zone_pgdat; |
| 656 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | 685 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; |
| 657 | struct per_cpu_pageset *p; | 686 | struct per_cpu_pageset *p; |
| 658 | 687 | ||
| 659 | local_irq_save(flags); | 688 | p = zone_pcp(z, cpu); |
| 660 | cpu = smp_processor_id(); | ||
| 661 | p = zone_pcp(z,cpu); | ||
| 662 | if (pg == orig) { | 689 | if (pg == orig) { |
| 663 | p->numa_hit++; | 690 | p->numa_hit++; |
| 664 | } else { | 691 | } else { |
| @@ -669,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
| 669 | p->local_node++; | 696 | p->local_node++; |
| 670 | else | 697 | else |
| 671 | p->other_node++; | 698 | p->other_node++; |
| 672 | local_irq_restore(flags); | ||
| 673 | #endif | 699 | #endif |
| 674 | } | 700 | } |
| 675 | 701 | ||
| 676 | /* | 702 | /* |
| 677 | * Free a 0-order page | 703 | * Free a 0-order page |
| 678 | */ | 704 | */ |
| 679 | static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); | ||
| 680 | static void fastcall free_hot_cold_page(struct page *page, int cold) | 705 | static void fastcall free_hot_cold_page(struct page *page, int cold) |
| 681 | { | 706 | { |
| 682 | struct zone *zone = page_zone(page); | 707 | struct zone *zone = page_zone(page); |
| @@ -687,18 +712,20 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
| 687 | 712 | ||
| 688 | if (PageAnon(page)) | 713 | if (PageAnon(page)) |
| 689 | page->mapping = NULL; | 714 | page->mapping = NULL; |
| 690 | if (free_pages_check(__FUNCTION__, page)) | 715 | if (free_pages_check(page)) |
| 691 | return; | 716 | return; |
| 692 | 717 | ||
| 693 | inc_page_state(pgfree); | ||
| 694 | kernel_map_pages(page, 1, 0); | 718 | kernel_map_pages(page, 1, 0); |
| 695 | 719 | ||
| 696 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 720 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
| 697 | local_irq_save(flags); | 721 | local_irq_save(flags); |
| 722 | __inc_page_state(pgfree); | ||
| 698 | list_add(&page->lru, &pcp->list); | 723 | list_add(&page->lru, &pcp->list); |
| 699 | pcp->count++; | 724 | pcp->count++; |
| 700 | if (pcp->count >= pcp->high) | 725 | if (pcp->count >= pcp->high) { |
| 701 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 726 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
| 727 | pcp->count -= pcp->batch; | ||
| 728 | } | ||
| 702 | local_irq_restore(flags); | 729 | local_irq_restore(flags); |
| 703 | put_cpu(); | 730 | put_cpu(); |
| 704 | } | 731 | } |
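With the low watermark gone, free_hot_cold_page() above lets the per-cpu list grow to pcp->high and then drains exactly pcp->batch pages back to the buddy lists, adjusting pcp->count itself rather than trusting a return value. A toy model of that high/batch behaviour (array-free, single-threaded, purely illustrative constants):

    #include <stdio.h>

    #define HIGH  6
    #define BATCH 2

    static int pcp_count;

    static void drain(int nr)           /* stand-in for free_pages_bulk() */
    {
        printf("draining %d pages back to the buddy allocator\n", nr);
    }

    static void free_hot_page_model(void)
    {
        pcp_count++;                    /* page parked on the per-cpu list */
        if (pcp_count >= HIGH) {
            drain(BATCH);
            pcp_count -= BATCH;         /* caller, not drain(), fixes the count */
        }
    }

    int main(void)
    {
        for (int i = 0; i < 10; i++)
            free_hot_page_model();
        printf("left on per-cpu list: %d\n", pcp_count);
        return 0;
    }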
| @@ -727,49 +754,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 727 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 754 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
| 728 | * or two. | 755 | * or two. |
| 729 | */ | 756 | */ |
| 730 | static struct page * | 757 | static struct page *buffered_rmqueue(struct zonelist *zonelist, |
| 731 | buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) | 758 | struct zone *zone, int order, gfp_t gfp_flags) |
| 732 | { | 759 | { |
| 733 | unsigned long flags; | 760 | unsigned long flags; |
| 734 | struct page *page; | 761 | struct page *page; |
| 735 | int cold = !!(gfp_flags & __GFP_COLD); | 762 | int cold = !!(gfp_flags & __GFP_COLD); |
| 763 | int cpu; | ||
| 736 | 764 | ||
| 737 | again: | 765 | again: |
| 738 | if (order == 0) { | 766 | cpu = get_cpu(); |
| 767 | if (likely(order == 0)) { | ||
| 739 | struct per_cpu_pages *pcp; | 768 | struct per_cpu_pages *pcp; |
| 740 | 769 | ||
| 741 | page = NULL; | 770 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; |
| 742 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | ||
| 743 | local_irq_save(flags); | 771 | local_irq_save(flags); |
| 744 | if (pcp->count <= pcp->low) | 772 | if (!pcp->count) { |
| 745 | pcp->count += rmqueue_bulk(zone, 0, | 773 | pcp->count += rmqueue_bulk(zone, 0, |
| 746 | pcp->batch, &pcp->list); | 774 | pcp->batch, &pcp->list); |
| 747 | if (pcp->count) { | 775 | if (unlikely(!pcp->count)) |
| 748 | page = list_entry(pcp->list.next, struct page, lru); | 776 | goto failed; |
| 749 | list_del(&page->lru); | ||
| 750 | pcp->count--; | ||
| 751 | } | 777 | } |
| 752 | local_irq_restore(flags); | 778 | page = list_entry(pcp->list.next, struct page, lru); |
| 753 | put_cpu(); | 779 | list_del(&page->lru); |
| 780 | pcp->count--; | ||
| 754 | } else { | 781 | } else { |
| 755 | spin_lock_irqsave(&zone->lock, flags); | 782 | spin_lock_irqsave(&zone->lock, flags); |
| 756 | page = __rmqueue(zone, order); | 783 | page = __rmqueue(zone, order); |
| 757 | spin_unlock_irqrestore(&zone->lock, flags); | 784 | spin_unlock(&zone->lock); |
| 785 | if (!page) | ||
| 786 | goto failed; | ||
| 758 | } | 787 | } |
| 759 | 788 | ||
| 760 | if (page != NULL) { | 789 | __mod_page_state_zone(zone, pgalloc, 1 << order); |
| 761 | BUG_ON(bad_range(zone, page)); | 790 | zone_statistics(zonelist, zone, cpu); |
| 762 | mod_page_state_zone(zone, pgalloc, 1 << order); | 791 | local_irq_restore(flags); |
| 763 | if (prep_new_page(page, order)) | 792 | put_cpu(); |
| 764 | goto again; | ||
| 765 | 793 | ||
| 766 | if (gfp_flags & __GFP_ZERO) | 794 | BUG_ON(bad_range(zone, page)); |
| 767 | prep_zero_page(page, order, gfp_flags); | 795 | if (prep_new_page(page, order)) |
| 796 | goto again; | ||
| 768 | 797 | ||
| 769 | if (order && (gfp_flags & __GFP_COMP)) | 798 | if (gfp_flags & __GFP_ZERO) |
| 770 | prep_compound_page(page, order); | 799 | prep_zero_page(page, order, gfp_flags); |
| 771 | } | 800 | |
| 801 | if (order && (gfp_flags & __GFP_COMP)) | ||
| 802 | prep_compound_page(page, order); | ||
| 772 | return page; | 803 | return page; |
| 804 | |||
| 805 | failed: | ||
| 806 | local_irq_restore(flags); | ||
| 807 | put_cpu(); | ||
| 808 | return NULL; | ||
| 773 | } | 809 | } |
| 774 | 810 | ||
| 775 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 811 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ |
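buffered_rmqueue() is reworked around a single exit path: refill the per-cpu list only when it is empty, fall back to __rmqueue() for higher orders, and unwind interrupts and the cpu reference through one failed: label. A compact sketch of that control flow (the refill and buddy stubs here are invented for the example, not kernel interfaces):

    #include <stdio.h>

    static int pcp_count;                                /* pages parked per-cpu */
    static int refill(void)             { pcp_count = 4; return pcp_count; }
    static void *pop_pcp(void)          { pcp_count--; return &pcp_count; }
    static void *buddy_alloc(int order) { return order < 11 ? &pcp_count : NULL; }

    static void *alloc_model(int order)
    {
        void *page;

        /* ...interrupts would be disabled here... */
        if (order == 0) {
            if (pcp_count == 0 && !refill())
                goto failed;
            page = pop_pcp();                            /* fast path */
        } else {
            page = buddy_alloc(order);                   /* order > 0: buddy lists */
            if (page == NULL)
                goto failed;
        }
        /* ...statistics updated, interrupts re-enabled... */
        return page;

    failed:
        /* single unwind point: re-enable interrupts, drop the cpu reference */
        return NULL;
    }

    int main(void)
    {
        printf("order-0: %p, order-12: %p\n", alloc_model(0), alloc_model(12));
        return 0;
    }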
| @@ -845,9 +881,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | |||
| 845 | continue; | 881 | continue; |
| 846 | } | 882 | } |
| 847 | 883 | ||
| 848 | page = buffered_rmqueue(*z, order, gfp_mask); | 884 | page = buffered_rmqueue(zonelist, *z, order, gfp_mask); |
| 849 | if (page) { | 885 | if (page) { |
| 850 | zone_statistics(zonelist, *z); | ||
| 851 | break; | 886 | break; |
| 852 | } | 887 | } |
| 853 | } while (*(++z) != NULL); | 888 | } while (*(++z) != NULL); |
| @@ -896,15 +931,15 @@ restart: | |||
| 896 | * | 931 | * |
| 897 | * The caller may dip into page reserves a bit more if the caller | 932 | * The caller may dip into page reserves a bit more if the caller |
| 898 | * cannot run direct reclaim, or if the caller has realtime scheduling | 933 | * cannot run direct reclaim, or if the caller has realtime scheduling |
| 899 | * policy. | 934 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
| 935 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
| 900 | */ | 936 | */ |
| 901 | alloc_flags = ALLOC_WMARK_MIN; | 937 | alloc_flags = ALLOC_WMARK_MIN; |
| 902 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | 938 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) |
| 903 | alloc_flags |= ALLOC_HARDER; | 939 | alloc_flags |= ALLOC_HARDER; |
| 904 | if (gfp_mask & __GFP_HIGH) | 940 | if (gfp_mask & __GFP_HIGH) |
| 905 | alloc_flags |= ALLOC_HIGH; | 941 | alloc_flags |= ALLOC_HIGH; |
| 906 | if (wait) | 942 | alloc_flags |= ALLOC_CPUSET; |
| 907 | alloc_flags |= ALLOC_CPUSET; | ||
| 908 | 943 | ||
| 909 | /* | 944 | /* |
| 910 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 945 | * Go through the zonelist again. Let __GFP_HIGH and allocations |
| @@ -926,7 +961,7 @@ restart: | |||
| 926 | nofail_alloc: | 961 | nofail_alloc: |
| 927 | /* go through the zonelist yet again, ignoring mins */ | 962 | /* go through the zonelist yet again, ignoring mins */ |
| 928 | page = get_page_from_freelist(gfp_mask, order, | 963 | page = get_page_from_freelist(gfp_mask, order, |
| 929 | zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); | 964 | zonelist, ALLOC_NO_WATERMARKS); |
| 930 | if (page) | 965 | if (page) |
| 931 | goto got_pg; | 966 | goto got_pg; |
| 932 | if (gfp_mask & __GFP_NOFAIL) { | 967 | if (gfp_mask & __GFP_NOFAIL) { |
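The allocation-flags block is also simplified: ALLOC_CPUSET is now set unconditionally rather than only for waiting callers, and the comment spells out that GFP_ATOMIC (!wait plus __GFP_HIGH) ends up with both ALLOC_HARDER and ALLOC_HIGH. Restating that selection as a tiny standalone function (the flag values are arbitrary placeholders, not the kernel's):

    #include <stdio.h>
    #include <stdbool.h>

    #define ALLOC_WMARK_MIN 0x02   /* placeholder values, not the kernel's */
    #define ALLOC_HARDER    0x10
    #define ALLOC_HIGH      0x20
    #define ALLOC_CPUSET    0x40

    static int pick_alloc_flags(bool wait, bool gfp_high, bool rt_not_irq)
    {
        int flags = ALLOC_WMARK_MIN;

        if (rt_not_irq || !wait)   /* caller cannot reclaim: allow digging deeper */
            flags |= ALLOC_HARDER;
        if (gfp_high)              /* __GFP_HIGH: deeper still */
            flags |= ALLOC_HIGH;
        flags |= ALLOC_CPUSET;     /* cpuset constraints now always apply */
        return flags;
    }

    int main(void)
    {
        /* GFP_ATOMIC-like: !wait and __GFP_HIGH set, so HARDER and HIGH are both on */
        printf("flags: 0x%x\n", pick_alloc_flags(false, true, false));
        return 0;
    }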
| @@ -945,6 +980,7 @@ rebalance: | |||
| 945 | cond_resched(); | 980 | cond_resched(); |
| 946 | 981 | ||
| 947 | /* We now go into synchronous reclaim */ | 982 | /* We now go into synchronous reclaim */ |
| 983 | cpuset_memory_pressure_bump(); | ||
| 948 | p->flags |= PF_MEMALLOC; | 984 | p->flags |= PF_MEMALLOC; |
| 949 | reclaim_state.reclaimed_slab = 0; | 985 | reclaim_state.reclaimed_slab = 0; |
| 950 | p->reclaim_state = &reclaim_state; | 986 | p->reclaim_state = &reclaim_state; |
| @@ -1171,7 +1207,7 @@ EXPORT_SYMBOL(nr_pagecache); | |||
| 1171 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | 1207 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; |
| 1172 | #endif | 1208 | #endif |
| 1173 | 1209 | ||
| 1174 | void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 1210 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) |
| 1175 | { | 1211 | { |
| 1176 | int cpu = 0; | 1212 | int cpu = 0; |
| 1177 | 1213 | ||
| @@ -1224,7 +1260,7 @@ void get_full_page_state(struct page_state *ret) | |||
| 1224 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | 1260 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); |
| 1225 | } | 1261 | } |
| 1226 | 1262 | ||
| 1227 | unsigned long __read_page_state(unsigned long offset) | 1263 | unsigned long read_page_state_offset(unsigned long offset) |
| 1228 | { | 1264 | { |
| 1229 | unsigned long ret = 0; | 1265 | unsigned long ret = 0; |
| 1230 | int cpu; | 1266 | int cpu; |
| @@ -1238,18 +1274,26 @@ unsigned long __read_page_state(unsigned long offset) | |||
| 1238 | return ret; | 1274 | return ret; |
| 1239 | } | 1275 | } |
| 1240 | 1276 | ||
| 1241 | void __mod_page_state(unsigned long offset, unsigned long delta) | 1277 | void __mod_page_state_offset(unsigned long offset, unsigned long delta) |
| 1278 | { | ||
| 1279 | void *ptr; | ||
| 1280 | |||
| 1281 | ptr = &__get_cpu_var(page_states); | ||
| 1282 | *(unsigned long *)(ptr + offset) += delta; | ||
| 1283 | } | ||
| 1284 | EXPORT_SYMBOL(__mod_page_state_offset); | ||
| 1285 | |||
| 1286 | void mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
| 1242 | { | 1287 | { |
| 1243 | unsigned long flags; | 1288 | unsigned long flags; |
| 1244 | void* ptr; | 1289 | void *ptr; |
| 1245 | 1290 | ||
| 1246 | local_irq_save(flags); | 1291 | local_irq_save(flags); |
| 1247 | ptr = &__get_cpu_var(page_states); | 1292 | ptr = &__get_cpu_var(page_states); |
| 1248 | *(unsigned long*)(ptr + offset) += delta; | 1293 | *(unsigned long *)(ptr + offset) += delta; |
| 1249 | local_irq_restore(flags); | 1294 | local_irq_restore(flags); |
| 1250 | } | 1295 | } |
| 1251 | 1296 | EXPORT_SYMBOL(mod_page_state_offset); | |
| 1252 | EXPORT_SYMBOL(__mod_page_state); | ||
| 1253 | 1297 | ||
| 1254 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | 1298 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, |
| 1255 | unsigned long *free, struct pglist_data *pgdat) | 1299 | unsigned long *free, struct pglist_data *pgdat) |
| @@ -1335,7 +1379,7 @@ void show_free_areas(void) | |||
| 1335 | show_node(zone); | 1379 | show_node(zone); |
| 1336 | printk("%s per-cpu:", zone->name); | 1380 | printk("%s per-cpu:", zone->name); |
| 1337 | 1381 | ||
| 1338 | if (!zone->present_pages) { | 1382 | if (!populated_zone(zone)) { |
| 1339 | printk(" empty\n"); | 1383 | printk(" empty\n"); |
| 1340 | continue; | 1384 | continue; |
| 1341 | } else | 1385 | } else |
| @@ -1347,10 +1391,9 @@ void show_free_areas(void) | |||
| 1347 | pageset = zone_pcp(zone, cpu); | 1391 | pageset = zone_pcp(zone, cpu); |
| 1348 | 1392 | ||
| 1349 | for (temperature = 0; temperature < 2; temperature++) | 1393 | for (temperature = 0; temperature < 2; temperature++) |
| 1350 | printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", | 1394 | printk("cpu %d %s: high %d, batch %d used:%d\n", |
| 1351 | cpu, | 1395 | cpu, |
| 1352 | temperature ? "cold" : "hot", | 1396 | temperature ? "cold" : "hot", |
| 1353 | pageset->pcp[temperature].low, | ||
| 1354 | pageset->pcp[temperature].high, | 1397 | pageset->pcp[temperature].high, |
| 1355 | pageset->pcp[temperature].batch, | 1398 | pageset->pcp[temperature].batch, |
| 1356 | pageset->pcp[temperature].count); | 1399 | pageset->pcp[temperature].count); |
| @@ -1413,7 +1456,7 @@ void show_free_areas(void) | |||
| 1413 | 1456 | ||
| 1414 | show_node(zone); | 1457 | show_node(zone); |
| 1415 | printk("%s: ", zone->name); | 1458 | printk("%s: ", zone->name); |
| 1416 | if (!zone->present_pages) { | 1459 | if (!populated_zone(zone)) { |
| 1417 | printk("empty\n"); | 1460 | printk("empty\n"); |
| 1418 | continue; | 1461 | continue; |
| 1419 | } | 1462 | } |
| @@ -1433,36 +1476,29 @@ void show_free_areas(void) | |||
| 1433 | 1476 | ||
| 1434 | /* | 1477 | /* |
| 1435 | * Builds allocation fallback zone lists. | 1478 | * Builds allocation fallback zone lists. |
| 1479 | * | ||
| 1480 | * Add all populated zones of a node to the zonelist. | ||
| 1436 | */ | 1481 | */ |
| 1437 | static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) | 1482 | static int __init build_zonelists_node(pg_data_t *pgdat, |
| 1438 | { | 1483 | struct zonelist *zonelist, int nr_zones, int zone_type) |
| 1439 | switch (k) { | 1484 | { |
| 1440 | struct zone *zone; | 1485 | struct zone *zone; |
| 1441 | default: | 1486 | |
| 1442 | BUG(); | 1487 | BUG_ON(zone_type > ZONE_HIGHMEM); |
| 1443 | case ZONE_HIGHMEM: | 1488 | |
| 1444 | zone = pgdat->node_zones + ZONE_HIGHMEM; | 1489 | do { |
| 1445 | if (zone->present_pages) { | 1490 | zone = pgdat->node_zones + zone_type; |
| 1491 | if (populated_zone(zone)) { | ||
| 1446 | #ifndef CONFIG_HIGHMEM | 1492 | #ifndef CONFIG_HIGHMEM |
| 1447 | BUG(); | 1493 | BUG_ON(zone_type > ZONE_NORMAL); |
| 1448 | #endif | 1494 | #endif |
| 1449 | zonelist->zones[j++] = zone; | 1495 | zonelist->zones[nr_zones++] = zone; |
| 1496 | check_highest_zone(zone_type); | ||
| 1450 | } | 1497 | } |
| 1451 | case ZONE_NORMAL: | 1498 | zone_type--; |
| 1452 | zone = pgdat->node_zones + ZONE_NORMAL; | ||
| 1453 | if (zone->present_pages) | ||
| 1454 | zonelist->zones[j++] = zone; | ||
| 1455 | case ZONE_DMA32: | ||
| 1456 | zone = pgdat->node_zones + ZONE_DMA32; | ||
| 1457 | if (zone->present_pages) | ||
| 1458 | zonelist->zones[j++] = zone; | ||
| 1459 | case ZONE_DMA: | ||
| 1460 | zone = pgdat->node_zones + ZONE_DMA; | ||
| 1461 | if (zone->present_pages) | ||
| 1462 | zonelist->zones[j++] = zone; | ||
| 1463 | } | ||
| 1464 | 1499 | ||
| 1465 | return j; | 1500 | } while (zone_type >= 0); |
| 1501 | return nr_zones; | ||
| 1466 | } | 1502 | } |
| 1467 | 1503 | ||
| 1468 | static inline int highest_zone(int zone_bits) | 1504 | static inline int highest_zone(int zone_bits) |
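build_zonelists_node() trades the fall-through switch for a loop that walks zone types from the requested one down to ZONE_DMA, appending every populated zone. The same descending-walk idea in a self-contained form (zone names and the population table are example values only):

    #include <stdio.h>

    enum { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM, MAX_ZONES };

    static const char *zone_name[MAX_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
    static const int populated[MAX_ZONES]   = { 1, 0, 1, 1 };   /* example layout */

    /* add populated zones from 'zone_type' downward, highest preference first */
    static int build_zonelist_model(int zonelist[], int nr, int zone_type)
    {
        do {
            if (populated[zone_type])
                zonelist[nr++] = zone_type;
            zone_type--;
        } while (zone_type >= 0);
        return nr;
    }

    int main(void)
    {
        int list[MAX_ZONES], n = build_zonelist_model(list, 0, ZONE_HIGHMEM);
        for (int i = 0; i < n; i++)
            printf("%s\n", zone_name[list[i]]);
        return 0;
    }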
| @@ -1706,11 +1742,9 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 1706 | unsigned long end_pfn = start_pfn + size; | 1742 | unsigned long end_pfn = start_pfn + size; |
| 1707 | unsigned long pfn; | 1743 | unsigned long pfn; |
| 1708 | 1744 | ||
| 1709 | for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { | 1745 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
| 1710 | if (!early_pfn_valid(pfn)) | 1746 | if (!early_pfn_valid(pfn)) |
| 1711 | continue; | 1747 | continue; |
| 1712 | if (!early_pfn_in_nid(pfn, nid)) | ||
| 1713 | continue; | ||
| 1714 | page = pfn_to_page(pfn); | 1748 | page = pfn_to_page(pfn); |
| 1715 | set_page_links(page, zone, nid, pfn); | 1749 | set_page_links(page, zone, nid, pfn); |
| 1716 | set_page_count(page, 1); | 1750 | set_page_count(page, 1); |
| @@ -1794,19 +1828,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
| 1794 | 1828 | ||
| 1795 | pcp = &p->pcp[0]; /* hot */ | 1829 | pcp = &p->pcp[0]; /* hot */ |
| 1796 | pcp->count = 0; | 1830 | pcp->count = 0; |
| 1797 | pcp->low = 0; | ||
| 1798 | pcp->high = 6 * batch; | 1831 | pcp->high = 6 * batch; |
| 1799 | pcp->batch = max(1UL, 1 * batch); | 1832 | pcp->batch = max(1UL, 1 * batch); |
| 1800 | INIT_LIST_HEAD(&pcp->list); | 1833 | INIT_LIST_HEAD(&pcp->list); |
| 1801 | 1834 | ||
| 1802 | pcp = &p->pcp[1]; /* cold*/ | 1835 | pcp = &p->pcp[1]; /* cold*/ |
| 1803 | pcp->count = 0; | 1836 | pcp->count = 0; |
| 1804 | pcp->low = 0; | ||
| 1805 | pcp->high = 2 * batch; | 1837 | pcp->high = 2 * batch; |
| 1806 | pcp->batch = max(1UL, batch/2); | 1838 | pcp->batch = max(1UL, batch/2); |
| 1807 | INIT_LIST_HEAD(&pcp->list); | 1839 | INIT_LIST_HEAD(&pcp->list); |
| 1808 | } | 1840 | } |
| 1809 | 1841 | ||
| 1842 | /* | ||
| 1843 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | ||
| 1844 | * to the value high for the pageset p. | ||
| 1845 | */ | ||
| 1846 | |||
| 1847 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | ||
| 1848 | unsigned long high) | ||
| 1849 | { | ||
| 1850 | struct per_cpu_pages *pcp; | ||
| 1851 | |||
| 1852 | pcp = &p->pcp[0]; /* hot list */ | ||
| 1853 | pcp->high = high; | ||
| 1854 | pcp->batch = max(1UL, high/4); | ||
| 1855 | if ((high/4) > (PAGE_SHIFT * 8)) | ||
| 1856 | pcp->batch = PAGE_SHIFT * 8; | ||
| 1857 | } | ||
| 1858 | |||
| 1859 | |||
| 1810 | #ifdef CONFIG_NUMA | 1860 | #ifdef CONFIG_NUMA |
| 1811 | /* | 1861 | /* |
| 1812 | * Boot pageset table. One per cpu which is going to be used for all | 1862 | * Boot pageset table. One per cpu which is going to be used for all |
| @@ -1838,12 +1888,16 @@ static int __devinit process_zones(int cpu) | |||
| 1838 | 1888 | ||
| 1839 | for_each_zone(zone) { | 1889 | for_each_zone(zone) { |
| 1840 | 1890 | ||
| 1841 | zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), | 1891 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
| 1842 | GFP_KERNEL, cpu_to_node(cpu)); | 1892 | GFP_KERNEL, cpu_to_node(cpu)); |
| 1843 | if (!zone->pageset[cpu]) | 1893 | if (!zone_pcp(zone, cpu)) |
| 1844 | goto bad; | 1894 | goto bad; |
| 1845 | 1895 | ||
| 1846 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); | 1896 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); |
| 1897 | |||
| 1898 | if (percpu_pagelist_fraction) | ||
| 1899 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
| 1900 | (zone->present_pages / percpu_pagelist_fraction)); | ||
| 1847 | } | 1901 | } |
| 1848 | 1902 | ||
| 1849 | return 0; | 1903 | return 0; |
| @@ -1851,15 +1905,14 @@ bad: | |||
| 1851 | for_each_zone(dzone) { | 1905 | for_each_zone(dzone) { |
| 1852 | if (dzone == zone) | 1906 | if (dzone == zone) |
| 1853 | break; | 1907 | break; |
| 1854 | kfree(dzone->pageset[cpu]); | 1908 | kfree(zone_pcp(dzone, cpu)); |
| 1855 | dzone->pageset[cpu] = NULL; | 1909 | zone_pcp(dzone, cpu) = NULL; |
| 1856 | } | 1910 | } |
| 1857 | return -ENOMEM; | 1911 | return -ENOMEM; |
| 1858 | } | 1912 | } |
| 1859 | 1913 | ||
| 1860 | static inline void free_zone_pagesets(int cpu) | 1914 | static inline void free_zone_pagesets(int cpu) |
| 1861 | { | 1915 | { |
| 1862 | #ifdef CONFIG_NUMA | ||
| 1863 | struct zone *zone; | 1916 | struct zone *zone; |
| 1864 | 1917 | ||
| 1865 | for_each_zone(zone) { | 1918 | for_each_zone(zone) { |
| @@ -1868,7 +1921,6 @@ static inline void free_zone_pagesets(int cpu) | |||
| 1868 | zone_pcp(zone, cpu) = NULL; | 1921 | zone_pcp(zone, cpu) = NULL; |
| 1869 | kfree(pset); | 1922 | kfree(pset); |
| 1870 | } | 1923 | } |
| 1871 | #endif | ||
| 1872 | } | 1924 | } |
| 1873 | 1925 | ||
| 1874 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | 1926 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, |
| @@ -1939,7 +1991,7 @@ static __devinit void zone_pcp_init(struct zone *zone) | |||
| 1939 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 1991 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
| 1940 | #ifdef CONFIG_NUMA | 1992 | #ifdef CONFIG_NUMA |
| 1941 | /* Early boot. Slab allocator not functional yet */ | 1993 | /* Early boot. Slab allocator not functional yet */ |
| 1942 | zone->pageset[cpu] = &boot_pageset[cpu]; | 1994 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; |
| 1943 | setup_pageset(&boot_pageset[cpu],0); | 1995 | setup_pageset(&boot_pageset[cpu],0); |
| 1944 | #else | 1996 | #else |
| 1945 | setup_pageset(zone_pcp(zone,cpu), batch); | 1997 | setup_pageset(zone_pcp(zone,cpu), batch); |
| @@ -2116,7 +2168,7 @@ static int frag_show(struct seq_file *m, void *arg) | |||
| 2116 | int order; | 2168 | int order; |
| 2117 | 2169 | ||
| 2118 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 2170 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
| 2119 | if (!zone->present_pages) | 2171 | if (!populated_zone(zone)) |
| 2120 | continue; | 2172 | continue; |
| 2121 | 2173 | ||
| 2122 | spin_lock_irqsave(&zone->lock, flags); | 2174 | spin_lock_irqsave(&zone->lock, flags); |
| @@ -2149,7 +2201,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
| 2149 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 2201 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { |
| 2150 | int i; | 2202 | int i; |
| 2151 | 2203 | ||
| 2152 | if (!zone->present_pages) | 2204 | if (!populated_zone(zone)) |
| 2153 | continue; | 2205 | continue; |
| 2154 | 2206 | ||
| 2155 | spin_lock_irqsave(&zone->lock, flags); | 2207 | spin_lock_irqsave(&zone->lock, flags); |
| @@ -2182,7 +2234,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
| 2182 | seq_printf(m, | 2234 | seq_printf(m, |
| 2183 | ")" | 2235 | ")" |
| 2184 | "\n pagesets"); | 2236 | "\n pagesets"); |
| 2185 | for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { | 2237 | for_each_online_cpu(i) { |
| 2186 | struct per_cpu_pageset *pageset; | 2238 | struct per_cpu_pageset *pageset; |
| 2187 | int j; | 2239 | int j; |
| 2188 | 2240 | ||
| @@ -2197,12 +2249,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
| 2197 | seq_printf(m, | 2249 | seq_printf(m, |
| 2198 | "\n cpu: %i pcp: %i" | 2250 | "\n cpu: %i pcp: %i" |
| 2199 | "\n count: %i" | 2251 | "\n count: %i" |
| 2200 | "\n low: %i" | ||
| 2201 | "\n high: %i" | 2252 | "\n high: %i" |
| 2202 | "\n batch: %i", | 2253 | "\n batch: %i", |
| 2203 | i, j, | 2254 | i, j, |
| 2204 | pageset->pcp[j].count, | 2255 | pageset->pcp[j].count, |
| 2205 | pageset->pcp[j].low, | ||
| 2206 | pageset->pcp[j].high, | 2256 | pageset->pcp[j].high, |
| 2207 | pageset->pcp[j].batch); | 2257 | pageset->pcp[j].batch); |
| 2208 | } | 2258 | } |
| @@ -2257,32 +2307,40 @@ static char *vmstat_text[] = { | |||
| 2257 | "pgpgout", | 2307 | "pgpgout", |
| 2258 | "pswpin", | 2308 | "pswpin", |
| 2259 | "pswpout", | 2309 | "pswpout", |
| 2260 | "pgalloc_high", | ||
| 2261 | 2310 | ||
| 2311 | "pgalloc_high", | ||
| 2262 | "pgalloc_normal", | 2312 | "pgalloc_normal", |
| 2313 | "pgalloc_dma32", | ||
| 2263 | "pgalloc_dma", | 2314 | "pgalloc_dma", |
| 2315 | |||
| 2264 | "pgfree", | 2316 | "pgfree", |
| 2265 | "pgactivate", | 2317 | "pgactivate", |
| 2266 | "pgdeactivate", | 2318 | "pgdeactivate", |
| 2267 | 2319 | ||
| 2268 | "pgfault", | 2320 | "pgfault", |
| 2269 | "pgmajfault", | 2321 | "pgmajfault", |
| 2322 | |||
| 2270 | "pgrefill_high", | 2323 | "pgrefill_high", |
| 2271 | "pgrefill_normal", | 2324 | "pgrefill_normal", |
| 2325 | "pgrefill_dma32", | ||
| 2272 | "pgrefill_dma", | 2326 | "pgrefill_dma", |
| 2273 | 2327 | ||
| 2274 | "pgsteal_high", | 2328 | "pgsteal_high", |
| 2275 | "pgsteal_normal", | 2329 | "pgsteal_normal", |
| 2330 | "pgsteal_dma32", | ||
| 2276 | "pgsteal_dma", | 2331 | "pgsteal_dma", |
| 2332 | |||
| 2277 | "pgscan_kswapd_high", | 2333 | "pgscan_kswapd_high", |
| 2278 | "pgscan_kswapd_normal", | 2334 | "pgscan_kswapd_normal", |
| 2279 | 2335 | "pgscan_kswapd_dma32", | |
| 2280 | "pgscan_kswapd_dma", | 2336 | "pgscan_kswapd_dma", |
| 2337 | |||
| 2281 | "pgscan_direct_high", | 2338 | "pgscan_direct_high", |
| 2282 | "pgscan_direct_normal", | 2339 | "pgscan_direct_normal", |
| 2340 | "pgscan_direct_dma32", | ||
| 2283 | "pgscan_direct_dma", | 2341 | "pgscan_direct_dma", |
| 2284 | "pginodesteal", | ||
| 2285 | 2342 | ||
| 2343 | "pginodesteal", | ||
| 2286 | "slabs_scanned", | 2344 | "slabs_scanned", |
| 2287 | "kswapd_steal", | 2345 | "kswapd_steal", |
| 2288 | "kswapd_inodesteal", | 2346 | "kswapd_inodesteal", |
| @@ -2539,6 +2597,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
| 2539 | return 0; | 2597 | return 0; |
| 2540 | } | 2598 | } |
| 2541 | 2599 | ||
| 2600 | /* | ||
| 2601 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | ||
| 2602 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | ||
| 2603 | * can have before it gets flushed back to buddy allocator. | ||
| 2604 | */ | ||
| 2605 | |||
| 2606 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | ||
| 2607 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
| 2608 | { | ||
| 2609 | struct zone *zone; | ||
| 2610 | unsigned int cpu; | ||
| 2611 | int ret; | ||
| 2612 | |||
| 2613 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
| 2614 | if (!write || (ret == -EINVAL)) | ||
| 2615 | return ret; | ||
| 2616 | for_each_zone(zone) { | ||
| 2617 | for_each_online_cpu(cpu) { | ||
| 2618 | unsigned long high; | ||
| 2619 | high = zone->present_pages / percpu_pagelist_fraction; | ||
| 2620 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | ||
| 2621 | } | ||
| 2622 | } | ||
| 2623 | return 0; | ||
| 2624 | } | ||
| 2625 | |||
| 2542 | __initdata int hashdist = HASHDIST_DEFAULT; | 2626 | __initdata int hashdist = HASHDIST_DEFAULT; |
| 2543 | 2627 | ||
| 2544 | #ifdef CONFIG_NUMA | 2628 | #ifdef CONFIG_NUMA |
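The new percpu_pagelist_fraction sysctl sizes each hot per-cpu list as present_pages/fraction, with the batch derived as a quarter of that and clamped to PAGE_SHIFT*8 pages, matching setup_pagelist_highmark() above. A worked example of the arithmetic (zone size and fraction chosen arbitrarily; 4 KiB pages assumed):

    #include <stdio.h>

    #define PAGE_SHIFT 12            /* 4 KiB pages assumed for the example */

    int main(void)
    {
        unsigned long present_pages = 262144;   /* a 1 GiB zone of 4 KiB pages */
        unsigned long fraction = 32;            /* value written to the sysctl */

        unsigned long high  = present_pages / fraction;      /* 8192 pages */
        unsigned long batch = high / 4 ? high / 4 : 1;
        if (high / 4 > PAGE_SHIFT * 8)
            batch = PAGE_SHIFT * 8;                          /* clamp, here to 96 */

        printf("pcp->high = %lu, pcp->batch = %lu\n", high, batch);
        return 0;
    }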
diff --git a/mm/pdflush.c b/mm/pdflush.c index 52822c98c489..c4b6d0afd736 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
| @@ -90,7 +90,7 @@ struct pdflush_work { | |||
| 90 | 90 | ||
| 91 | static int __pdflush(struct pdflush_work *my_work) | 91 | static int __pdflush(struct pdflush_work *my_work) |
| 92 | { | 92 | { |
| 93 | current->flags |= PF_FLUSHER; | 93 | current->flags |= PF_FLUSHER | PF_SWAPWRITE; |
| 94 | my_work->fn = NULL; | 94 | my_work->fn = NULL; |
| 95 | my_work->who = current; | 95 | my_work->who = current; |
| 96 | INIT_LIST_HEAD(&my_work->list); | 96 | INIT_LIST_HEAD(&my_work->list); |
diff --git a/mm/readahead.c b/mm/readahead.c index 72e7adbb87c7..8d6eeaaa6296 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
| 158 | { | 158 | { |
| 159 | unsigned page_idx; | 159 | unsigned page_idx; |
| 160 | struct pagevec lru_pvec; | 160 | struct pagevec lru_pvec; |
| 161 | int ret = 0; | 161 | int ret; |
| 162 | 162 | ||
| 163 | if (mapping->a_ops->readpages) { | 163 | if (mapping->a_ops->readpages) { |
| 164 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); | 164 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); |
| @@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
| 171 | list_del(&page->lru); | 171 | list_del(&page->lru); |
| 172 | if (!add_to_page_cache(page, mapping, | 172 | if (!add_to_page_cache(page, mapping, |
| 173 | page->index, GFP_KERNEL)) { | 173 | page->index, GFP_KERNEL)) { |
| 174 | mapping->a_ops->readpage(filp, page); | 174 | ret = mapping->a_ops->readpage(filp, page); |
| 175 | if (!pagevec_add(&lru_pvec, page)) | 175 | if (ret != AOP_TRUNCATED_PAGE) { |
| 176 | __pagevec_lru_add(&lru_pvec); | 176 | if (!pagevec_add(&lru_pvec, page)) |
| 177 | } else { | 177 | __pagevec_lru_add(&lru_pvec); |
| 178 | page_cache_release(page); | 178 | continue; |
| 179 | } /* else fall through to release */ | ||
| 179 | } | 180 | } |
| 181 | page_cache_release(page); | ||
| 180 | } | 182 | } |
| 181 | pagevec_lru_add(&lru_pvec); | 183 | pagevec_lru_add(&lru_pvec); |
| 184 | ret = 0; | ||
| 182 | out: | 185 | out: |
| 183 | return ret; | 186 | return ret; |
| 184 | } | 187 | } |
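read_pages() now checks the ->readpage() return value: AOP_TRUNCATED_PAGE means the page was dropped under us, so it is released rather than batched onto the LRU, and ret is cleared on the fallback path. A rough sketch of that shape (the return value, stubs and names below are stand-ins, not the readahead API):

    #include <stddef.h>
    #include <stdio.h>

    #define AOP_TRUNCATED_PAGE 0x80001          /* placeholder value for the sketch */

    struct page;                                 /* opaque here */
    static int  readpage_stub(struct page *p)   { (void)p; return 0; }
    static void lru_add_stub(struct page *p)    { (void)p; puts("queued for LRU"); }
    static void release_stub(struct page *p)    { (void)p; puts("released"); }

    static void read_one_model(struct page *page)
    {
        int ret = readpage_stub(page);

        if (ret == AOP_TRUNCATED_PAGE) {
            release_stub(page);                  /* page vanished under us: drop it */
            return;
        }
        lru_add_stub(page);                      /* normal case: batch onto the LRU */
    }

    int main(void)
    {
        read_one_model(NULL);
        return 0;
    }

diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||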
| @@ -20,13 +20,13 @@ | |||
| 20 | /* | 20 | /* |
| 21 | * Lock ordering in mm: | 21 | * Lock ordering in mm: |
| 22 | * | 22 | * |
| 23 | * inode->i_sem (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
| 24 | * inode->i_alloc_sem | 24 | * inode->i_alloc_sem |
| 25 | * | 25 | * |
| 26 | * When a page fault occurs in writing from user to file, down_read | 26 | * When a page fault occurs in writing from user to file, down_read |
| 27 | * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within | 27 | * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within |
| 28 | * down_read of mmap_sem; i_sem and down_write of mmap_sem are never | 28 | * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never |
| 29 | * taken together; in truncation, i_sem is taken outermost. | 29 | * taken together; in truncation, i_mutex is taken outermost. |
| 30 | * | 30 | * |
| 31 | * mm->mmap_sem | 31 | * mm->mmap_sem |
| 32 | * page->flags PG_locked (lock_page) | 32 | * page->flags PG_locked (lock_page) |
| @@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked) | |||
| 435 | } | 435 | } |
| 436 | 436 | ||
| 437 | /** | 437 | /** |
| 438 | * page_set_anon_rmap - setup new anonymous rmap | ||
| 439 | * @page: the page to add the mapping to | ||
| 440 | * @vma: the vm area in which the mapping is added | ||
| 441 | * @address: the user virtual address mapped | ||
| 442 | */ | ||
| 443 | static void __page_set_anon_rmap(struct page *page, | ||
| 444 | struct vm_area_struct *vma, unsigned long address) | ||
| 445 | { | ||
| 446 | struct anon_vma *anon_vma = vma->anon_vma; | ||
| 447 | |||
| 448 | BUG_ON(!anon_vma); | ||
| 449 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
| 450 | page->mapping = (struct address_space *) anon_vma; | ||
| 451 | |||
| 452 | page->index = linear_page_index(vma, address); | ||
| 453 | |||
| 454 | /* | ||
| 455 | * nr_mapped state can be updated without turning off | ||
| 456 | * interrupts because it is not modified via interrupt. | ||
| 457 | */ | ||
| 458 | __inc_page_state(nr_mapped); | ||
| 459 | } | ||
| 460 | |||
| 461 | /** | ||
| 438 | * page_add_anon_rmap - add pte mapping to an anonymous page | 462 | * page_add_anon_rmap - add pte mapping to an anonymous page |
| 439 | * @page: the page to add the mapping to | 463 | * @page: the page to add the mapping to |
| 440 | * @vma: the vm area in which the mapping is added | 464 | * @vma: the vm area in which the mapping is added |
| @@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked) | |||
| 445 | void page_add_anon_rmap(struct page *page, | 469 | void page_add_anon_rmap(struct page *page, |
| 446 | struct vm_area_struct *vma, unsigned long address) | 470 | struct vm_area_struct *vma, unsigned long address) |
| 447 | { | 471 | { |
| 448 | if (atomic_inc_and_test(&page->_mapcount)) { | 472 | if (atomic_inc_and_test(&page->_mapcount)) |
| 449 | struct anon_vma *anon_vma = vma->anon_vma; | 473 | __page_set_anon_rmap(page, vma, address); |
| 450 | |||
| 451 | BUG_ON(!anon_vma); | ||
| 452 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
| 453 | page->mapping = (struct address_space *) anon_vma; | ||
| 454 | |||
| 455 | page->index = linear_page_index(vma, address); | ||
| 456 | |||
| 457 | inc_page_state(nr_mapped); | ||
| 458 | } | ||
| 459 | /* else checking page index and mapping is racy */ | 474 | /* else checking page index and mapping is racy */ |
| 460 | } | 475 | } |
| 461 | 476 | ||
| 477 | /* | ||
| 478 | * page_add_new_anon_rmap - add pte mapping to a new anonymous page | ||
| 479 | * @page: the page to add the mapping to | ||
| 480 | * @vma: the vm area in which the mapping is added | ||
| 481 | * @address: the user virtual address mapped | ||
| 482 | * | ||
| 483 | * Same as page_add_anon_rmap but must only be called on *new* pages. | ||
| 484 | * This means the inc-and-test can be bypassed. | ||
| 485 | */ | ||
| 486 | void page_add_new_anon_rmap(struct page *page, | ||
| 487 | struct vm_area_struct *vma, unsigned long address) | ||
| 488 | { | ||
| 489 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | ||
| 490 | __page_set_anon_rmap(page, vma, address); | ||
| 491 | } | ||
| 492 | |||
| 462 | /** | 493 | /** |
| 463 | * page_add_file_rmap - add pte mapping to a file page | 494 | * page_add_file_rmap - add pte mapping to a file page |
| 464 | * @page: the page to add the mapping to | 495 | * @page: the page to add the mapping to |
| @@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page) | |||
| 471 | BUG_ON(!pfn_valid(page_to_pfn(page))); | 502 | BUG_ON(!pfn_valid(page_to_pfn(page))); |
| 472 | 503 | ||
| 473 | if (atomic_inc_and_test(&page->_mapcount)) | 504 | if (atomic_inc_and_test(&page->_mapcount)) |
| 474 | inc_page_state(nr_mapped); | 505 | __inc_page_state(nr_mapped); |
| 475 | } | 506 | } |
| 476 | 507 | ||
| 477 | /** | 508 | /** |
| @@ -483,6 +514,13 @@ void page_add_file_rmap(struct page *page) | |||
| 483 | void page_remove_rmap(struct page *page) | 514 | void page_remove_rmap(struct page *page) |
| 484 | { | 515 | { |
| 485 | if (atomic_add_negative(-1, &page->_mapcount)) { | 516 | if (atomic_add_negative(-1, &page->_mapcount)) { |
| 517 | if (page_mapcount(page) < 0) { | ||
| 518 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | ||
| 519 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | ||
| 520 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | ||
| 521 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | ||
| 522 | } | ||
| 523 | |||
| 486 | BUG_ON(page_mapcount(page) < 0); | 524 | BUG_ON(page_mapcount(page) < 0); |
| 487 | /* | 525 | /* |
| 488 | * It would be tidy to reset the PageAnon mapping here, | 526 | * It would be tidy to reset the PageAnon mapping here, |
| @@ -495,7 +533,7 @@ void page_remove_rmap(struct page *page) | |||
| 495 | */ | 533 | */ |
| 496 | if (page_test_and_clear_dirty(page)) | 534 | if (page_test_and_clear_dirty(page)) |
| 497 | set_page_dirty(page); | 535 | set_page_dirty(page); |
| 498 | dec_page_state(nr_mapped); | 536 | __dec_page_state(nr_mapped); |
| 499 | } | 537 | } |
| 500 | } | 538 | } |
| 501 | 539 | ||
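page_add_new_anon_rmap() can skip the atomic inc-and-test because a brand new page is known to start with _mapcount at -1, so a plain set to 0 performs the same +1. A tiny user-space model of why the two are equivalent for a fresh counter (C11 atomics, illustrative only):

    #include <stdatomic.h>
    #include <assert.h>

    int main(void)
    {
        atomic_int existing = -1;   /* _mapcount of an unmapped page starts at -1 */
        atomic_int fresh    = -1;

        /* page_add_anon_rmap(): atomic inc-and-test, first mapper observes 0 */
        int first_mapper = (atomic_fetch_add(&existing, 1) + 1 == 0);

        /* page_add_new_anon_rmap(): the page is brand new, just set mapcount to 0 */
        atomic_store(&fresh, 0);

        assert(first_mapper);
        assert(atomic_load(&existing) == atomic_load(&fresh));
        return 0;
    }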
diff --git a/mm/shmem.c b/mm/shmem.c index dc25565a61e9..343b3c0937e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next) | |||
| 457 | } while (next); | 457 | } while (next); |
| 458 | } | 458 | } |
| 459 | 459 | ||
| 460 | static void shmem_truncate(struct inode *inode) | 460 | static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) |
| 461 | { | 461 | { |
| 462 | struct shmem_inode_info *info = SHMEM_I(inode); | 462 | struct shmem_inode_info *info = SHMEM_I(inode); |
| 463 | unsigned long idx; | 463 | unsigned long idx; |
| @@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode) | |||
| 475 | long nr_swaps_freed = 0; | 475 | long nr_swaps_freed = 0; |
| 476 | int offset; | 476 | int offset; |
| 477 | int freed; | 477 | int freed; |
| 478 | int punch_hole = 0; | ||
| 478 | 479 | ||
| 479 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 480 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
| 480 | idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 481 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
| 481 | if (idx >= info->next_index) | 482 | if (idx >= info->next_index) |
| 482 | return; | 483 | return; |
| 483 | 484 | ||
| 484 | spin_lock(&info->lock); | 485 | spin_lock(&info->lock); |
| 485 | info->flags |= SHMEM_TRUNCATE; | 486 | info->flags |= SHMEM_TRUNCATE; |
| 486 | limit = info->next_index; | 487 | if (likely(end == (loff_t) -1)) { |
| 487 | info->next_index = idx; | 488 | limit = info->next_index; |
| 489 | info->next_index = idx; | ||
| 490 | } else { | ||
| 491 | limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 492 | if (limit > info->next_index) | ||
| 493 | limit = info->next_index; | ||
| 494 | punch_hole = 1; | ||
| 495 | } | ||
| 496 | |||
| 488 | topdir = info->i_indirect; | 497 | topdir = info->i_indirect; |
| 489 | if (topdir && idx <= SHMEM_NR_DIRECT) { | 498 | if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { |
| 490 | info->i_indirect = NULL; | 499 | info->i_indirect = NULL; |
| 491 | nr_pages_to_free++; | 500 | nr_pages_to_free++; |
| 492 | list_add(&topdir->lru, &pages_to_free); | 501 | list_add(&topdir->lru, &pages_to_free); |
| @@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode) | |||
| 573 | set_page_private(subdir, page_private(subdir) - freed); | 582 | set_page_private(subdir, page_private(subdir) - freed); |
| 574 | if (offset) | 583 | if (offset) |
| 575 | spin_unlock(&info->lock); | 584 | spin_unlock(&info->lock); |
| 576 | BUG_ON(page_private(subdir) > offset); | 585 | if (!punch_hole) |
| 586 | BUG_ON(page_private(subdir) > offset); | ||
| 577 | } | 587 | } |
| 578 | if (offset) | 588 | if (offset) |
| 579 | offset = 0; | 589 | offset = 0; |
| 580 | else if (subdir) { | 590 | else if (subdir && !page_private(subdir)) { |
| 581 | dir[diroff] = NULL; | 591 | dir[diroff] = NULL; |
| 582 | nr_pages_to_free++; | 592 | nr_pages_to_free++; |
| 583 | list_add(&subdir->lru, &pages_to_free); | 593 | list_add(&subdir->lru, &pages_to_free); |
| @@ -594,7 +604,7 @@ done2: | |||
| 594 | * Also, though shmem_getpage checks i_size before adding to | 604 | * Also, though shmem_getpage checks i_size before adding to |
| 595 | * cache, no recheck after: so fix the narrow window there too. | 605 | * cache, no recheck after: so fix the narrow window there too. |
| 596 | */ | 606 | */ |
| 597 | truncate_inode_pages(inode->i_mapping, inode->i_size); | 607 | truncate_inode_pages_range(inode->i_mapping, start, end); |
| 598 | } | 608 | } |
| 599 | 609 | ||
| 600 | spin_lock(&info->lock); | 610 | spin_lock(&info->lock); |
| @@ -614,6 +624,11 @@ done2: | |||
| 614 | } | 624 | } |
| 615 | } | 625 | } |
| 616 | 626 | ||
| 627 | static void shmem_truncate(struct inode *inode) | ||
| 628 | { | ||
| 629 | shmem_truncate_range(inode, inode->i_size, (loff_t)-1); | ||
| 630 | } | ||
| 631 | |||
| 617 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | 632 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) |
| 618 | { | 633 | { |
| 619 | struct inode *inode = dentry->d_inode; | 634 | struct inode *inode = dentry->d_inode; |
| @@ -855,7 +870,7 @@ unlock: | |||
| 855 | swap_free(swap); | 870 | swap_free(swap); |
| 856 | redirty: | 871 | redirty: |
| 857 | set_page_dirty(page); | 872 | set_page_dirty(page); |
| 858 | return WRITEPAGE_ACTIVATE; /* Return with the page locked */ | 873 | return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */ |
| 859 | } | 874 | } |
| 860 | 875 | ||
| 861 | #ifdef CONFIG_NUMA | 876 | #ifdef CONFIG_NUMA |
| @@ -1255,7 +1270,7 @@ out_nomem: | |||
| 1255 | return retval; | 1270 | return retval; |
| 1256 | } | 1271 | } |
| 1257 | 1272 | ||
| 1258 | static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | 1273 | int shmem_mmap(struct file *file, struct vm_area_struct *vma) |
| 1259 | { | 1274 | { |
| 1260 | file_accessed(file); | 1275 | file_accessed(file); |
| 1261 | vma->vm_ops = &shmem_vm_ops; | 1276 | vma->vm_ops = &shmem_vm_ops; |
| @@ -1355,7 +1370,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t | |||
| 1355 | if (!access_ok(VERIFY_READ, buf, count)) | 1370 | if (!access_ok(VERIFY_READ, buf, count)) |
| 1356 | return -EFAULT; | 1371 | return -EFAULT; |
| 1357 | 1372 | ||
| 1358 | down(&inode->i_sem); | 1373 | mutex_lock(&inode->i_mutex); |
| 1359 | 1374 | ||
| 1360 | pos = *ppos; | 1375 | pos = *ppos; |
| 1361 | written = 0; | 1376 | written = 0; |
| @@ -1440,7 +1455,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t | |||
| 1440 | if (written) | 1455 | if (written) |
| 1441 | err = written; | 1456 | err = written; |
| 1442 | out: | 1457 | out: |
| 1443 | up(&inode->i_sem); | 1458 | mutex_unlock(&inode->i_mutex); |
| 1444 | return err; | 1459 | return err; |
| 1445 | } | 1460 | } |
| 1446 | 1461 | ||
| @@ -1476,7 +1491,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
| 1476 | 1491 | ||
| 1477 | /* | 1492 | /* |
| 1478 | * We must evaluate after, since reads (unlike writes) | 1493 | * We must evaluate after, since reads (unlike writes) |
| 1479 | * are called without i_sem protection against truncate | 1494 | * are called without i_mutex protection against truncate |
| 1480 | */ | 1495 | */ |
| 1481 | nr = PAGE_CACHE_SIZE; | 1496 | nr = PAGE_CACHE_SIZE; |
| 1482 | i_size = i_size_read(inode); | 1497 | i_size = i_size_read(inode); |
| @@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = { | |||
| 2083 | static struct inode_operations shmem_inode_operations = { | 2098 | static struct inode_operations shmem_inode_operations = { |
| 2084 | .truncate = shmem_truncate, | 2099 | .truncate = shmem_truncate, |
| 2085 | .setattr = shmem_notify_change, | 2100 | .setattr = shmem_notify_change, |
| 2101 | .truncate_range = shmem_truncate_range, | ||
| 2086 | }; | 2102 | }; |
| 2087 | 2103 | ||
| 2088 | static struct inode_operations shmem_dir_inode_operations = { | 2104 | static struct inode_operations shmem_dir_inode_operations = { |
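shmem_truncate() becomes a thin wrapper over shmem_truncate_range(), which takes an explicit start/end window so hole punching via ->truncate_range can reuse the same walk; end == -1 keeps the old whole-file behaviour. The index arithmetic it relies on, in isolation (the constants are the usual 4 KiB page-cache values, assumed here; byte offsets are made up):

    #include <stdio.h>

    #define PAGE_CACHE_SHIFT 12
    #define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

    int main(void)
    {
        unsigned long long start = 6000, end = 20000;   /* byte range to punch */
        unsigned long next_index = 10;                  /* first never-used page */

        /* first index wholly inside the hole, last index just past it */
        unsigned long idx   = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        unsigned long limit = (end   + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (limit > next_index)
            limit = next_index;

        printf("punching page indices [%lu, %lu)\n", idx, limit);
        return 0;
    }

diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||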
| @@ -130,7 +130,6 @@ | |||
| 130 | #define FORCED_DEBUG 0 | 130 | #define FORCED_DEBUG 0 |
| 131 | #endif | 131 | #endif |
| 132 | 132 | ||
| 133 | |||
| 134 | /* Shouldn't this be in a header file somewhere? */ | 133 | /* Shouldn't this be in a header file somewhere? */ |
| 135 | #define BYTES_PER_WORD sizeof(void *) | 134 | #define BYTES_PER_WORD sizeof(void *) |
| 136 | 135 | ||
| @@ -217,12 +216,12 @@ static unsigned long offslab_limit; | |||
| 217 | * Slabs are chained into three list: fully used, partial, fully free slabs. | 216 | * Slabs are chained into three list: fully used, partial, fully free slabs. |
| 218 | */ | 217 | */ |
| 219 | struct slab { | 218 | struct slab { |
| 220 | struct list_head list; | 219 | struct list_head list; |
| 221 | unsigned long colouroff; | 220 | unsigned long colouroff; |
| 222 | void *s_mem; /* including colour offset */ | 221 | void *s_mem; /* including colour offset */ |
| 223 | unsigned int inuse; /* num of objs active in slab */ | 222 | unsigned int inuse; /* num of objs active in slab */ |
| 224 | kmem_bufctl_t free; | 223 | kmem_bufctl_t free; |
| 225 | unsigned short nodeid; | 224 | unsigned short nodeid; |
| 226 | }; | 225 | }; |
| 227 | 226 | ||
| 228 | /* | 227 | /* |
| @@ -242,9 +241,9 @@ struct slab { | |||
| 242 | * We assume struct slab_rcu can overlay struct slab when destroying. | 241 | * We assume struct slab_rcu can overlay struct slab when destroying. |
| 243 | */ | 242 | */ |
| 244 | struct slab_rcu { | 243 | struct slab_rcu { |
| 245 | struct rcu_head head; | 244 | struct rcu_head head; |
| 246 | kmem_cache_t *cachep; | 245 | kmem_cache_t *cachep; |
| 247 | void *addr; | 246 | void *addr; |
| 248 | }; | 247 | }; |
| 249 | 248 | ||
| 250 | /* | 249 | /* |
| @@ -279,23 +278,23 @@ struct array_cache { | |||
| 279 | #define BOOT_CPUCACHE_ENTRIES 1 | 278 | #define BOOT_CPUCACHE_ENTRIES 1 |
| 280 | struct arraycache_init { | 279 | struct arraycache_init { |
| 281 | struct array_cache cache; | 280 | struct array_cache cache; |
| 282 | void * entries[BOOT_CPUCACHE_ENTRIES]; | 281 | void *entries[BOOT_CPUCACHE_ENTRIES]; |
| 283 | }; | 282 | }; |
| 284 | 283 | ||
| 285 | /* | 284 | /* |
| 286 | * The slab lists for all objects. | 285 | * The slab lists for all objects. |
| 287 | */ | 286 | */ |
| 288 | struct kmem_list3 { | 287 | struct kmem_list3 { |
| 289 | struct list_head slabs_partial; /* partial list first, better asm code */ | 288 | struct list_head slabs_partial; /* partial list first, better asm code */ |
| 290 | struct list_head slabs_full; | 289 | struct list_head slabs_full; |
| 291 | struct list_head slabs_free; | 290 | struct list_head slabs_free; |
| 292 | unsigned long free_objects; | 291 | unsigned long free_objects; |
| 293 | unsigned long next_reap; | 292 | unsigned long next_reap; |
| 294 | int free_touched; | 293 | int free_touched; |
| 295 | unsigned int free_limit; | 294 | unsigned int free_limit; |
| 296 | spinlock_t list_lock; | 295 | spinlock_t list_lock; |
| 297 | struct array_cache *shared; /* shared per node */ | 296 | struct array_cache *shared; /* shared per node */ |
| 298 | struct array_cache **alien; /* on other nodes */ | 297 | struct array_cache **alien; /* on other nodes */ |
| 299 | }; | 298 | }; |
| 300 | 299 | ||
| 301 | /* | 300 | /* |
| @@ -367,63 +366,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent) | |||
| 367 | * | 366 | * |
| 368 | * manages a cache. | 367 | * manages a cache. |
| 369 | */ | 368 | */ |
| 370 | 369 | ||
| 371 | struct kmem_cache { | 370 | struct kmem_cache { |
| 372 | /* 1) per-cpu data, touched during every alloc/free */ | 371 | /* 1) per-cpu data, touched during every alloc/free */ |
| 373 | struct array_cache *array[NR_CPUS]; | 372 | struct array_cache *array[NR_CPUS]; |
| 374 | unsigned int batchcount; | 373 | unsigned int batchcount; |
| 375 | unsigned int limit; | 374 | unsigned int limit; |
| 376 | unsigned int shared; | 375 | unsigned int shared; |
| 377 | unsigned int objsize; | 376 | unsigned int objsize; |
| 378 | /* 2) touched by every alloc & free from the backend */ | 377 | /* 2) touched by every alloc & free from the backend */ |
| 379 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | 378 | struct kmem_list3 *nodelists[MAX_NUMNODES]; |
| 380 | unsigned int flags; /* constant flags */ | 379 | unsigned int flags; /* constant flags */ |
| 381 | unsigned int num; /* # of objs per slab */ | 380 | unsigned int num; /* # of objs per slab */ |
| 382 | spinlock_t spinlock; | 381 | spinlock_t spinlock; |
| 383 | 382 | ||
| 384 | /* 3) cache_grow/shrink */ | 383 | /* 3) cache_grow/shrink */ |
| 385 | /* order of pgs per slab (2^n) */ | 384 | /* order of pgs per slab (2^n) */ |
| 386 | unsigned int gfporder; | 385 | unsigned int gfporder; |
| 387 | 386 | ||
| 388 | /* force GFP flags, e.g. GFP_DMA */ | 387 | /* force GFP flags, e.g. GFP_DMA */ |
| 389 | gfp_t gfpflags; | 388 | gfp_t gfpflags; |
| 390 | 389 | ||
| 391 | size_t colour; /* cache colouring range */ | 390 | size_t colour; /* cache colouring range */ |
| 392 | unsigned int colour_off; /* colour offset */ | 391 | unsigned int colour_off; /* colour offset */ |
| 393 | unsigned int colour_next; /* cache colouring */ | 392 | unsigned int colour_next; /* cache colouring */ |
| 394 | kmem_cache_t *slabp_cache; | 393 | kmem_cache_t *slabp_cache; |
| 395 | unsigned int slab_size; | 394 | unsigned int slab_size; |
| 396 | unsigned int dflags; /* dynamic flags */ | 395 | unsigned int dflags; /* dynamic flags */ |
| 397 | 396 | ||
| 398 | /* constructor func */ | 397 | /* constructor func */ |
| 399 | void (*ctor)(void *, kmem_cache_t *, unsigned long); | 398 | void (*ctor) (void *, kmem_cache_t *, unsigned long); |
| 400 | 399 | ||
| 401 | /* de-constructor func */ | 400 | /* de-constructor func */ |
| 402 | void (*dtor)(void *, kmem_cache_t *, unsigned long); | 401 | void (*dtor) (void *, kmem_cache_t *, unsigned long); |
| 403 | 402 | ||
| 404 | /* 4) cache creation/removal */ | 403 | /* 4) cache creation/removal */ |
| 405 | const char *name; | 404 | const char *name; |
| 406 | struct list_head next; | 405 | struct list_head next; |
| 407 | 406 | ||
| 408 | /* 5) statistics */ | 407 | /* 5) statistics */ |
| 409 | #if STATS | 408 | #if STATS |
| 410 | unsigned long num_active; | 409 | unsigned long num_active; |
| 411 | unsigned long num_allocations; | 410 | unsigned long num_allocations; |
| 412 | unsigned long high_mark; | 411 | unsigned long high_mark; |
| 413 | unsigned long grown; | 412 | unsigned long grown; |
| 414 | unsigned long reaped; | 413 | unsigned long reaped; |
| 415 | unsigned long errors; | 414 | unsigned long errors; |
| 416 | unsigned long max_freeable; | 415 | unsigned long max_freeable; |
| 417 | unsigned long node_allocs; | 416 | unsigned long node_allocs; |
| 418 | unsigned long node_frees; | 417 | unsigned long node_frees; |
| 419 | atomic_t allochit; | 418 | atomic_t allochit; |
| 420 | atomic_t allocmiss; | 419 | atomic_t allocmiss; |
| 421 | atomic_t freehit; | 420 | atomic_t freehit; |
| 422 | atomic_t freemiss; | 421 | atomic_t freemiss; |
| 423 | #endif | 422 | #endif |
| 424 | #if DEBUG | 423 | #if DEBUG |
| 425 | int dbghead; | 424 | int dbghead; |
| 426 | int reallen; | 425 | int reallen; |
| 427 | #endif | 426 | #endif |
| 428 | }; | 427 | }; |
| 429 | 428 | ||
| @@ -523,14 +522,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) | |||
| 523 | { | 522 | { |
| 524 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 523 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
| 525 | if (cachep->flags & SLAB_STORE_USER) | 524 | if (cachep->flags & SLAB_STORE_USER) |
| 526 | return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); | 525 | return (unsigned long *)(objp + cachep->objsize - |
| 527 | return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); | 526 | 2 * BYTES_PER_WORD); |
| 527 | return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); | ||
| 528 | } | 528 | } |
| 529 | 529 | ||
| 530 | static void **dbg_userword(kmem_cache_t *cachep, void *objp) | 530 | static void **dbg_userword(kmem_cache_t *cachep, void *objp) |
| 531 | { | 531 | { |
| 532 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); | 532 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); |
| 533 | return (void**)(objp+cachep->objsize-BYTES_PER_WORD); | 533 | return (void **)(objp + cachep->objsize - BYTES_PER_WORD); |
| 534 | } | 534 | } |
| 535 | 535 | ||
| 536 | #else | 536 | #else |
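The two accessors reformatted in the hunk above locate the trailing red-zone word and the "last user" word purely by offset from the end of the object. A minimal userspace sketch of that offset arithmetic, with an ordinary size_t standing in for cachep->objsize (this is only an illustration of the layout, not the kernel's actual debug machinery):

#include <stdio.h>
#include <stddef.h>

#define BYTES_PER_WORD  sizeof(void *)   /* assumption: word == pointer size */

/* Mirrors dbg_redzone2()/dbg_userword(): with SLAB_STORE_USER the last word
 * of the object records the caller, so the second red-zone word sits one
 * word earlier; otherwise it occupies the last word itself. */
static void show_debug_layout(size_t objsize, int store_user)
{
    size_t redzone2 = store_user ? objsize - 2 * BYTES_PER_WORD
                                 : objsize - BYTES_PER_WORD;

    printf("objsize=%zu: redzone2 at offset %zu", objsize, redzone2);
    if (store_user)
        printf(", last-user word at offset %zu", objsize - BYTES_PER_WORD);
    printf("\n");
}

int main(void)
{
    show_debug_layout(128, 0);
    show_debug_layout(128, 1);
    return 0;
}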
| @@ -607,31 +607,31 @@ struct cache_names { | |||
| 607 | static struct cache_names __initdata cache_names[] = { | 607 | static struct cache_names __initdata cache_names[] = { |
| 608 | #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, | 608 | #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, |
| 609 | #include <linux/kmalloc_sizes.h> | 609 | #include <linux/kmalloc_sizes.h> |
| 610 | { NULL, } | 610 | {NULL,} |
| 611 | #undef CACHE | 611 | #undef CACHE |
| 612 | }; | 612 | }; |
| 613 | 613 | ||
| 614 | static struct arraycache_init initarray_cache __initdata = | 614 | static struct arraycache_init initarray_cache __initdata = |
| 615 | { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 615 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
| 616 | static struct arraycache_init initarray_generic = | 616 | static struct arraycache_init initarray_generic = |
| 617 | { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 617 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
| 618 | 618 | ||
| 619 | /* internal cache of cache description objs */ | 619 | /* internal cache of cache description objs */ |
| 620 | static kmem_cache_t cache_cache = { | 620 | static kmem_cache_t cache_cache = { |
| 621 | .batchcount = 1, | 621 | .batchcount = 1, |
| 622 | .limit = BOOT_CPUCACHE_ENTRIES, | 622 | .limit = BOOT_CPUCACHE_ENTRIES, |
| 623 | .shared = 1, | 623 | .shared = 1, |
| 624 | .objsize = sizeof(kmem_cache_t), | 624 | .objsize = sizeof(kmem_cache_t), |
| 625 | .flags = SLAB_NO_REAP, | 625 | .flags = SLAB_NO_REAP, |
| 626 | .spinlock = SPIN_LOCK_UNLOCKED, | 626 | .spinlock = SPIN_LOCK_UNLOCKED, |
| 627 | .name = "kmem_cache", | 627 | .name = "kmem_cache", |
| 628 | #if DEBUG | 628 | #if DEBUG |
| 629 | .reallen = sizeof(kmem_cache_t), | 629 | .reallen = sizeof(kmem_cache_t), |
| 630 | #endif | 630 | #endif |
| 631 | }; | 631 | }; |
| 632 | 632 | ||
| 633 | /* Guard access to the cache-chain. */ | 633 | /* Guard access to the cache-chain. */ |
| 634 | static struct semaphore cache_chain_sem; | 634 | static struct semaphore cache_chain_sem; |
| 635 | static struct list_head cache_chain; | 635 | static struct list_head cache_chain; |
| 636 | 636 | ||
| 637 | /* | 637 | /* |
| @@ -655,9 +655,9 @@ static enum { | |||
| 655 | 655 | ||
| 656 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 656 | static DEFINE_PER_CPU(struct work_struct, reap_work); |
| 657 | 657 | ||
| 658 | static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); | 658 | static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); |
| 659 | static void enable_cpucache (kmem_cache_t *cachep); | 659 | static void enable_cpucache(kmem_cache_t *cachep); |
| 660 | static void cache_reap (void *unused); | 660 | static void cache_reap(void *unused); |
| 661 | static int __node_shrink(kmem_cache_t *cachep, int node); | 661 | static int __node_shrink(kmem_cache_t *cachep, int node); |
| 662 | 662 | ||
| 663 | static inline struct array_cache *ac_data(kmem_cache_t *cachep) | 663 | static inline struct array_cache *ac_data(kmem_cache_t *cachep) |
| @@ -671,9 +671,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) | |||
| 671 | 671 | ||
| 672 | #if DEBUG | 672 | #if DEBUG |
| 673 | /* This happens if someone tries to call | 673 | /* This happens if someone tries to call |
| 674 | * kmem_cache_create(), or __kmalloc(), before | 674 | * kmem_cache_create(), or __kmalloc(), before |
| 675 | * the generic caches are initialized. | 675 | * the generic caches are initialized. |
| 676 | */ | 676 | */ |
| 677 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); | 677 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); |
| 678 | #endif | 678 | #endif |
| 679 | while (size > csizep->cs_size) | 679 | while (size > csizep->cs_size) |
| @@ -697,10 +697,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep); | |||
| 697 | 697 | ||
| 698 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ | 698 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ |
| 699 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | 699 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, |
| 700 | int flags, size_t *left_over, unsigned int *num) | 700 | int flags, size_t *left_over, unsigned int *num) |
| 701 | { | 701 | { |
| 702 | int i; | 702 | int i; |
| 703 | size_t wastage = PAGE_SIZE<<gfporder; | 703 | size_t wastage = PAGE_SIZE << gfporder; |
| 704 | size_t extra = 0; | 704 | size_t extra = 0; |
| 705 | size_t base = 0; | 705 | size_t base = 0; |
| 706 | 706 | ||
| @@ -709,7 +709,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | |||
| 709 | extra = sizeof(kmem_bufctl_t); | 709 | extra = sizeof(kmem_bufctl_t); |
| 710 | } | 710 | } |
| 711 | i = 0; | 711 | i = 0; |
| 712 | while (i*size + ALIGN(base+i*extra, align) <= wastage) | 712 | while (i * size + ALIGN(base + i * extra, align) <= wastage) |
| 713 | i++; | 713 | i++; |
| 714 | if (i > 0) | 714 | if (i > 0) |
| 715 | i--; | 715 | i--; |
| @@ -718,8 +718,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | |||
| 718 | i = SLAB_LIMIT; | 718 | i = SLAB_LIMIT; |
| 719 | 719 | ||
| 720 | *num = i; | 720 | *num = i; |
| 721 | wastage -= i*size; | 721 | wastage -= i * size; |
| 722 | wastage -= ALIGN(base+i*extra, align); | 722 | wastage -= ALIGN(base + i * extra, align); |
| 723 | *left_over = wastage; | 723 | *left_over = wastage; |
| 724 | } | 724 | } |
| 725 | 725 | ||
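cache_estimate(), whose body is reindented across the three hunks above, answers one question: how many objects of the given size (plus per-object bufctl overhead and, for on-slab management, the struct slab header) fit in PAGE_SIZE << gfporder, and how many bytes are left over. A standalone sketch of the same loop, with assumed values for PAGE_SIZE and for the header/bufctl sizes rather than the kernel's real ones:

#include <stdio.h>
#include <stddef.h>

#define ALIGN(x, a)   (((x) + (a) - 1) & ~((size_t)(a) - 1))
#define PAGE_SIZE     4096u               /* assumption for the demo */

/* Same shape as cache_estimate(): 'base' models the on-slab struct slab
 * header, 'extra' the per-object kmem_bufctl_t entry. */
static void estimate(unsigned order, size_t size, size_t align,
                     size_t base, size_t extra,
                     size_t *left_over, unsigned *num)
{
    size_t wastage = (size_t)PAGE_SIZE << order;
    unsigned i = 0;

    while (i * size + ALIGN(base + i * extra, align) <= wastage)
        i++;
    if (i > 0)
        i--;

    *num = i;
    *left_over = wastage - i * size - ALIGN(base + i * extra, align);
}

int main(void)
{
    size_t left;
    unsigned num;

    /* e.g. 100-byte objects, 32-byte alignment, 32-byte header, 4-byte bufctl */
    estimate(0, 100, 32, 32, 4, &left, &num);
    printf("order-0 slab: %u objects, %zu bytes left over\n", num, left);
    return 0;
}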
| @@ -728,7 +728,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | |||
| 728 | static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) | 728 | static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) |
| 729 | { | 729 | { |
| 730 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 730 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
| 731 | function, cachep->name, msg); | 731 | function, cachep->name, msg); |
| 732 | dump_stack(); | 732 | dump_stack(); |
| 733 | } | 733 | } |
| 734 | 734 | ||
| @@ -755,9 +755,9 @@ static void __devinit start_cpu_timer(int cpu) | |||
| 755 | } | 755 | } |
| 756 | 756 | ||
| 757 | static struct array_cache *alloc_arraycache(int node, int entries, | 757 | static struct array_cache *alloc_arraycache(int node, int entries, |
| 758 | int batchcount) | 758 | int batchcount) |
| 759 | { | 759 | { |
| 760 | int memsize = sizeof(void*)*entries+sizeof(struct array_cache); | 760 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); |
| 761 | struct array_cache *nc = NULL; | 761 | struct array_cache *nc = NULL; |
| 762 | 762 | ||
| 763 | nc = kmalloc_node(memsize, GFP_KERNEL, node); | 763 | nc = kmalloc_node(memsize, GFP_KERNEL, node); |
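The memsize computed in alloc_arraycache() above is the usual "struct followed by a variable-length pointer array" allocation. A userspace equivalent using a C99 flexible array member, with a made-up struct name and plain malloc() standing in for kmalloc_node():

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct array_cache: a small header plus a trailing array
 * of object pointers sized at allocation time. */
struct demo_cache {
    unsigned avail;
    unsigned limit;
    unsigned batchcount;
    unsigned touched;
    void *entry[];                        /* flexible array member */
};

static struct demo_cache *demo_alloc(unsigned entries, unsigned batchcount)
{
    size_t memsize = sizeof(struct demo_cache) + entries * sizeof(void *);
    struct demo_cache *c = malloc(memsize);

    if (!c)
        return NULL;
    c->avail = 0;
    c->limit = entries;
    c->batchcount = batchcount;
    c->touched = 0;
    return c;
}

int main(void)
{
    struct demo_cache *c = demo_alloc(120, 60);

    if (c)
        printf("allocated %zu bytes for %u entries\n",
               sizeof(*c) + c->limit * sizeof(void *), c->limit);
    free(c);
    return 0;
}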
| @@ -775,7 +775,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
| 775 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | 775 | static inline struct array_cache **alloc_alien_cache(int node, int limit) |
| 776 | { | 776 | { |
| 777 | struct array_cache **ac_ptr; | 777 | struct array_cache **ac_ptr; |
| 778 | int memsize = sizeof(void*)*MAX_NUMNODES; | 778 | int memsize = sizeof(void *) * MAX_NUMNODES; |
| 779 | int i; | 779 | int i; |
| 780 | 780 | ||
| 781 | if (limit > 1) | 781 | if (limit > 1) |
| @@ -789,7 +789,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit) | |||
| 789 | } | 789 | } |
| 790 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); | 790 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); |
| 791 | if (!ac_ptr[i]) { | 791 | if (!ac_ptr[i]) { |
| 792 | for (i--; i <=0; i--) | 792 | for (i--; i <= 0; i--) |
| 793 | kfree(ac_ptr[i]); | 793 | kfree(ac_ptr[i]); |
| 794 | kfree(ac_ptr); | 794 | kfree(ac_ptr); |
| 795 | return NULL; | 795 | return NULL; |
| @@ -807,12 +807,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) | |||
| 807 | return; | 807 | return; |
| 808 | 808 | ||
| 809 | for_each_node(i) | 809 | for_each_node(i) |
| 810 | kfree(ac_ptr[i]); | 810 | kfree(ac_ptr[i]); |
| 811 | 811 | ||
| 812 | kfree(ac_ptr); | 812 | kfree(ac_ptr); |
| 813 | } | 813 | } |
| 814 | 814 | ||
| 815 | static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) | 815 | static inline void __drain_alien_cache(kmem_cache_t *cachep, |
| 816 | struct array_cache *ac, int node) | ||
| 816 | { | 817 | { |
| 817 | struct kmem_list3 *rl3 = cachep->nodelists[node]; | 818 | struct kmem_list3 *rl3 = cachep->nodelists[node]; |
| 818 | 819 | ||
| @@ -826,7 +827,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache | |||
| 826 | 827 | ||
| 827 | static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) | 828 | static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) |
| 828 | { | 829 | { |
| 829 | int i=0; | 830 | int i = 0; |
| 830 | struct array_cache *ac; | 831 | struct array_cache *ac; |
| 831 | unsigned long flags; | 832 | unsigned long flags; |
| 832 | 833 | ||
| @@ -846,14 +847,13 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) | |||
| 846 | #endif | 847 | #endif |
| 847 | 848 | ||
| 848 | static int __devinit cpuup_callback(struct notifier_block *nfb, | 849 | static int __devinit cpuup_callback(struct notifier_block *nfb, |
| 849 | unsigned long action, void *hcpu) | 850 | unsigned long action, void *hcpu) |
| 850 | { | 851 | { |
| 851 | long cpu = (long)hcpu; | 852 | long cpu = (long)hcpu; |
| 852 | kmem_cache_t* cachep; | 853 | kmem_cache_t *cachep; |
| 853 | struct kmem_list3 *l3 = NULL; | 854 | struct kmem_list3 *l3 = NULL; |
| 854 | int node = cpu_to_node(cpu); | 855 | int node = cpu_to_node(cpu); |
| 855 | int memsize = sizeof(struct kmem_list3); | 856 | int memsize = sizeof(struct kmem_list3); |
| 856 | struct array_cache *nc = NULL; | ||
| 857 | 857 | ||
| 858 | switch (action) { | 858 | switch (action) { |
| 859 | case CPU_UP_PREPARE: | 859 | case CPU_UP_PREPARE: |
| @@ -871,27 +871,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 871 | */ | 871 | */ |
| 872 | if (!cachep->nodelists[node]) { | 872 | if (!cachep->nodelists[node]) { |
| 873 | if (!(l3 = kmalloc_node(memsize, | 873 | if (!(l3 = kmalloc_node(memsize, |
| 874 | GFP_KERNEL, node))) | 874 | GFP_KERNEL, node))) |
| 875 | goto bad; | 875 | goto bad; |
| 876 | kmem_list3_init(l3); | 876 | kmem_list3_init(l3); |
| 877 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 877 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
| 878 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | 878 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
| 879 | 879 | ||
| 880 | cachep->nodelists[node] = l3; | 880 | cachep->nodelists[node] = l3; |
| 881 | } | 881 | } |
| 882 | 882 | ||
| 883 | spin_lock_irq(&cachep->nodelists[node]->list_lock); | 883 | spin_lock_irq(&cachep->nodelists[node]->list_lock); |
| 884 | cachep->nodelists[node]->free_limit = | 884 | cachep->nodelists[node]->free_limit = |
| 885 | (1 + nr_cpus_node(node)) * | 885 | (1 + nr_cpus_node(node)) * |
| 886 | cachep->batchcount + cachep->num; | 886 | cachep->batchcount + cachep->num; |
| 887 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); | 887 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); |
| 888 | } | 888 | } |
| 889 | 889 | ||
| 890 | /* Now we can go ahead with allocating the shared array's | 890 | /* Now we can go ahead with allocating the shared array's |
| 891 | & array cache's */ | 891 | & array cache's */ |
| 892 | list_for_each_entry(cachep, &cache_chain, next) { | 892 | list_for_each_entry(cachep, &cache_chain, next) { |
| 893 | struct array_cache *nc; | ||
| 894 | |||
| 893 | nc = alloc_arraycache(node, cachep->limit, | 895 | nc = alloc_arraycache(node, cachep->limit, |
| 894 | cachep->batchcount); | 896 | cachep->batchcount); |
| 895 | if (!nc) | 897 | if (!nc) |
| 896 | goto bad; | 898 | goto bad; |
| 897 | cachep->array[cpu] = nc; | 899 | cachep->array[cpu] = nc; |
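The next_reap expression reformatted in this hunk staggers per-cache reap deadlines: every cache waits REAPTIMEOUT_LIST3 plus a cache-specific offset derived from its own pointer, so the reap timer does not fire for every cache in the same tick. A small sketch of that jitter calculation with a placeholder timeout value (the real REAPTIMEOUT_LIST3 and jiffies come from the kernel):

#include <stdio.h>

#define REAPTIMEOUT_LIST3  400UL          /* assumed placeholder, in "ticks" */

/* Base timeout plus a per-object offset in [0, REAPTIMEOUT_LIST3) derived
 * from the cache pointer, spreading reap work over time. */
static unsigned long next_reap(unsigned long now, const void *cachep)
{
    return now + REAPTIMEOUT_LIST3 +
           ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
}

int main(void)
{
    int a, b;                              /* two dummy "caches" at distinct addresses */

    printf("cache A reaps at %lu\n", next_reap(1000, &a));
    printf("cache B reaps at %lu\n", next_reap(1000, &b));
    return 0;
}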
| @@ -900,12 +902,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 900 | BUG_ON(!l3); | 902 | BUG_ON(!l3); |
| 901 | if (!l3->shared) { | 903 | if (!l3->shared) { |
| 902 | if (!(nc = alloc_arraycache(node, | 904 | if (!(nc = alloc_arraycache(node, |
| 903 | cachep->shared*cachep->batchcount, | 905 | cachep->shared * |
| 904 | 0xbaadf00d))) | 906 | cachep->batchcount, |
| 905 | goto bad; | 907 | 0xbaadf00d))) |
| 908 | goto bad; | ||
| 906 | 909 | ||
| 907 | /* we are serialised from CPU_DEAD or | 910 | /* we are serialised from CPU_DEAD or |
| 908 | CPU_UP_CANCELLED by the cpucontrol lock */ | 911 | CPU_UP_CANCELLED by the cpucontrol lock */ |
| 909 | l3->shared = nc; | 912 | l3->shared = nc; |
| 910 | } | 913 | } |
| 911 | } | 914 | } |
| @@ -942,13 +945,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 942 | free_block(cachep, nc->entry, nc->avail, node); | 945 | free_block(cachep, nc->entry, nc->avail, node); |
| 943 | 946 | ||
| 944 | if (!cpus_empty(mask)) { | 947 | if (!cpus_empty(mask)) { |
| 945 | spin_unlock(&l3->list_lock); | 948 | spin_unlock(&l3->list_lock); |
| 946 | goto unlock_cache; | 949 | goto unlock_cache; |
| 947 | } | 950 | } |
| 948 | 951 | ||
| 949 | if (l3->shared) { | 952 | if (l3->shared) { |
| 950 | free_block(cachep, l3->shared->entry, | 953 | free_block(cachep, l3->shared->entry, |
| 951 | l3->shared->avail, node); | 954 | l3->shared->avail, node); |
| 952 | kfree(l3->shared); | 955 | kfree(l3->shared); |
| 953 | l3->shared = NULL; | 956 | l3->shared = NULL; |
| 954 | } | 957 | } |
| @@ -966,7 +969,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 966 | } else { | 969 | } else { |
| 967 | spin_unlock(&l3->list_lock); | 970 | spin_unlock(&l3->list_lock); |
| 968 | } | 971 | } |
| 969 | unlock_cache: | 972 | unlock_cache: |
| 970 | spin_unlock_irq(&cachep->spinlock); | 973 | spin_unlock_irq(&cachep->spinlock); |
| 971 | kfree(nc); | 974 | kfree(nc); |
| 972 | } | 975 | } |
| @@ -975,7 +978,7 @@ unlock_cache: | |||
| 975 | #endif | 978 | #endif |
| 976 | } | 979 | } |
| 977 | return NOTIFY_OK; | 980 | return NOTIFY_OK; |
| 978 | bad: | 981 | bad: |
| 979 | up(&cache_chain_sem); | 982 | up(&cache_chain_sem); |
| 980 | return NOTIFY_BAD; | 983 | return NOTIFY_BAD; |
| 981 | } | 984 | } |
| @@ -985,8 +988,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | |||
| 985 | /* | 988 | /* |
| 986 | * swap the static kmem_list3 with kmalloced memory | 989 | * swap the static kmem_list3 with kmalloced memory |
| 987 | */ | 990 | */ |
| 988 | static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, | 991 | static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) |
| 989 | int nodeid) | ||
| 990 | { | 992 | { |
| 991 | struct kmem_list3 *ptr; | 993 | struct kmem_list3 *ptr; |
| 992 | 994 | ||
| @@ -1055,14 +1057,14 @@ void __init kmem_cache_init(void) | |||
| 1055 | cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); | 1057 | cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); |
| 1056 | 1058 | ||
| 1057 | cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, | 1059 | cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, |
| 1058 | &left_over, &cache_cache.num); | 1060 | &left_over, &cache_cache.num); |
| 1059 | if (!cache_cache.num) | 1061 | if (!cache_cache.num) |
| 1060 | BUG(); | 1062 | BUG(); |
| 1061 | 1063 | ||
| 1062 | cache_cache.colour = left_over/cache_cache.colour_off; | 1064 | cache_cache.colour = left_over / cache_cache.colour_off; |
| 1063 | cache_cache.colour_next = 0; | 1065 | cache_cache.colour_next = 0; |
| 1064 | cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + | 1066 | cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + |
| 1065 | sizeof(struct slab), cache_line_size()); | 1067 | sizeof(struct slab), cache_line_size()); |
| 1066 | 1068 | ||
| 1067 | /* 2+3) create the kmalloc caches */ | 1069 | /* 2+3) create the kmalloc caches */ |
| 1068 | sizes = malloc_sizes; | 1070 | sizes = malloc_sizes; |
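The colour fields initialised in this hunk come straight from the leftover bytes reported by cache_estimate(): colour_off is the cache line size and colour is how many line-sized offsets fit into that slack, so successive slabs can start their objects at different cache-line offsets. A sketch of the arithmetic with assumed numbers (this mimics how colour_next wraps, not the kernel's exact code path):

#include <stdio.h>

int main(void)
{
    unsigned long left_over  = 400;       /* assumed leftover from cache_estimate() */
    unsigned long colour_off = 64;        /* assumed L1 cache line size */
    unsigned long colour     = left_over / colour_off;
    unsigned long next, slab;

    printf("%lu colours available\n", colour);
    for (slab = 0, next = 0; slab < 8; slab++) {
        printf("slab %lu starts objects at offset %lu\n",
               slab, next * colour_off);
        if (++next >= colour)             /* wrap, as colour_next does */
            next = 0;
    }
    return 0;
}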
| @@ -1074,14 +1076,18 @@ void __init kmem_cache_init(void) | |||
| 1074 | */ | 1076 | */ |
| 1075 | 1077 | ||
| 1076 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, | 1078 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, |
| 1077 | sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, | 1079 | sizes[INDEX_AC].cs_size, |
| 1078 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); | 1080 | ARCH_KMALLOC_MINALIGN, |
| 1081 | (ARCH_KMALLOC_FLAGS | | ||
| 1082 | SLAB_PANIC), NULL, NULL); | ||
| 1079 | 1083 | ||
| 1080 | if (INDEX_AC != INDEX_L3) | 1084 | if (INDEX_AC != INDEX_L3) |
| 1081 | sizes[INDEX_L3].cs_cachep = | 1085 | sizes[INDEX_L3].cs_cachep = |
| 1082 | kmem_cache_create(names[INDEX_L3].name, | 1086 | kmem_cache_create(names[INDEX_L3].name, |
| 1083 | sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, | 1087 | sizes[INDEX_L3].cs_size, |
| 1084 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); | 1088 | ARCH_KMALLOC_MINALIGN, |
| 1089 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, | ||
| 1090 | NULL); | ||
| 1085 | 1091 | ||
| 1086 | while (sizes->cs_size != ULONG_MAX) { | 1092 | while (sizes->cs_size != ULONG_MAX) { |
| 1087 | /* | 1093 | /* |
| @@ -1091,35 +1097,41 @@ void __init kmem_cache_init(void) | |||
| 1091 | * Note for systems short on memory removing the alignment will | 1097 | * Note for systems short on memory removing the alignment will |
| 1092 | * allow tighter packing of the smaller caches. | 1098 | * allow tighter packing of the smaller caches. |
| 1093 | */ | 1099 | */ |
| 1094 | if(!sizes->cs_cachep) | 1100 | if (!sizes->cs_cachep) |
| 1095 | sizes->cs_cachep = kmem_cache_create(names->name, | 1101 | sizes->cs_cachep = kmem_cache_create(names->name, |
| 1096 | sizes->cs_size, ARCH_KMALLOC_MINALIGN, | 1102 | sizes->cs_size, |
| 1097 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); | 1103 | ARCH_KMALLOC_MINALIGN, |
| 1104 | (ARCH_KMALLOC_FLAGS | ||
| 1105 | | SLAB_PANIC), | ||
| 1106 | NULL, NULL); | ||
| 1098 | 1107 | ||
| 1099 | /* Inc off-slab bufctl limit until the ceiling is hit. */ | 1108 | /* Inc off-slab bufctl limit until the ceiling is hit. */ |
| 1100 | if (!(OFF_SLAB(sizes->cs_cachep))) { | 1109 | if (!(OFF_SLAB(sizes->cs_cachep))) { |
| 1101 | offslab_limit = sizes->cs_size-sizeof(struct slab); | 1110 | offslab_limit = sizes->cs_size - sizeof(struct slab); |
| 1102 | offslab_limit /= sizeof(kmem_bufctl_t); | 1111 | offslab_limit /= sizeof(kmem_bufctl_t); |
| 1103 | } | 1112 | } |
| 1104 | 1113 | ||
| 1105 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, | 1114 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, |
| 1106 | sizes->cs_size, ARCH_KMALLOC_MINALIGN, | 1115 | sizes->cs_size, |
| 1107 | (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), | 1116 | ARCH_KMALLOC_MINALIGN, |
| 1108 | NULL, NULL); | 1117 | (ARCH_KMALLOC_FLAGS | |
| 1118 | SLAB_CACHE_DMA | | ||
| 1119 | SLAB_PANIC), NULL, | ||
| 1120 | NULL); | ||
| 1109 | 1121 | ||
| 1110 | sizes++; | 1122 | sizes++; |
| 1111 | names++; | 1123 | names++; |
| 1112 | } | 1124 | } |
| 1113 | /* 4) Replace the bootstrap head arrays */ | 1125 | /* 4) Replace the bootstrap head arrays */ |
| 1114 | { | 1126 | { |
| 1115 | void * ptr; | 1127 | void *ptr; |
| 1116 | 1128 | ||
| 1117 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1129 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); |
| 1118 | 1130 | ||
| 1119 | local_irq_disable(); | 1131 | local_irq_disable(); |
| 1120 | BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); | 1132 | BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); |
| 1121 | memcpy(ptr, ac_data(&cache_cache), | 1133 | memcpy(ptr, ac_data(&cache_cache), |
| 1122 | sizeof(struct arraycache_init)); | 1134 | sizeof(struct arraycache_init)); |
| 1123 | cache_cache.array[smp_processor_id()] = ptr; | 1135 | cache_cache.array[smp_processor_id()] = ptr; |
| 1124 | local_irq_enable(); | 1136 | local_irq_enable(); |
| 1125 | 1137 | ||
| @@ -1127,11 +1139,11 @@ void __init kmem_cache_init(void) | |||
| 1127 | 1139 | ||
| 1128 | local_irq_disable(); | 1140 | local_irq_disable(); |
| 1129 | BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) | 1141 | BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) |
| 1130 | != &initarray_generic.cache); | 1142 | != &initarray_generic.cache); |
| 1131 | memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), | 1143 | memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), |
| 1132 | sizeof(struct arraycache_init)); | 1144 | sizeof(struct arraycache_init)); |
| 1133 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = | 1145 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = |
| 1134 | ptr; | 1146 | ptr; |
| 1135 | local_irq_enable(); | 1147 | local_irq_enable(); |
| 1136 | } | 1148 | } |
| 1137 | /* 5) Replace the bootstrap kmem_list3's */ | 1149 | /* 5) Replace the bootstrap kmem_list3's */ |
| @@ -1139,16 +1151,16 @@ void __init kmem_cache_init(void) | |||
| 1139 | int node; | 1151 | int node; |
| 1140 | /* Replace the static kmem_list3 structures for the boot cpu */ | 1152 | /* Replace the static kmem_list3 structures for the boot cpu */ |
| 1141 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], | 1153 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], |
| 1142 | numa_node_id()); | 1154 | numa_node_id()); |
| 1143 | 1155 | ||
| 1144 | for_each_online_node(node) { | 1156 | for_each_online_node(node) { |
| 1145 | init_list(malloc_sizes[INDEX_AC].cs_cachep, | 1157 | init_list(malloc_sizes[INDEX_AC].cs_cachep, |
| 1146 | &initkmem_list3[SIZE_AC+node], node); | 1158 | &initkmem_list3[SIZE_AC + node], node); |
| 1147 | 1159 | ||
| 1148 | if (INDEX_AC != INDEX_L3) { | 1160 | if (INDEX_AC != INDEX_L3) { |
| 1149 | init_list(malloc_sizes[INDEX_L3].cs_cachep, | 1161 | init_list(malloc_sizes[INDEX_L3].cs_cachep, |
| 1150 | &initkmem_list3[SIZE_L3+node], | 1162 | &initkmem_list3[SIZE_L3 + node], |
| 1151 | node); | 1163 | node); |
| 1152 | } | 1164 | } |
| 1153 | } | 1165 | } |
| 1154 | } | 1166 | } |
| @@ -1158,7 +1170,7 @@ void __init kmem_cache_init(void) | |||
| 1158 | kmem_cache_t *cachep; | 1170 | kmem_cache_t *cachep; |
| 1159 | down(&cache_chain_sem); | 1171 | down(&cache_chain_sem); |
| 1160 | list_for_each_entry(cachep, &cache_chain, next) | 1172 | list_for_each_entry(cachep, &cache_chain, next) |
| 1161 | enable_cpucache(cachep); | 1173 | enable_cpucache(cachep); |
| 1162 | up(&cache_chain_sem); | 1174 | up(&cache_chain_sem); |
| 1163 | } | 1175 | } |
| 1164 | 1176 | ||
| @@ -1184,7 +1196,7 @@ static int __init cpucache_init(void) | |||
| 1184 | * pages to gfp. | 1196 | * pages to gfp. |
| 1185 | */ | 1197 | */ |
| 1186 | for_each_online_cpu(cpu) | 1198 | for_each_online_cpu(cpu) |
| 1187 | start_cpu_timer(cpu); | 1199 | start_cpu_timer(cpu); |
| 1188 | 1200 | ||
| 1189 | return 0; | 1201 | return 0; |
| 1190 | } | 1202 | } |
| @@ -1226,7 +1238,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) | |||
| 1226 | */ | 1238 | */ |
| 1227 | static void kmem_freepages(kmem_cache_t *cachep, void *addr) | 1239 | static void kmem_freepages(kmem_cache_t *cachep, void *addr) |
| 1228 | { | 1240 | { |
| 1229 | unsigned long i = (1<<cachep->gfporder); | 1241 | unsigned long i = (1 << cachep->gfporder); |
| 1230 | struct page *page = virt_to_page(addr); | 1242 | struct page *page = virt_to_page(addr); |
| 1231 | const unsigned long nr_freed = i; | 1243 | const unsigned long nr_freed = i; |
| 1232 | 1244 | ||
| @@ -1239,13 +1251,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr) | |||
| 1239 | if (current->reclaim_state) | 1251 | if (current->reclaim_state) |
| 1240 | current->reclaim_state->reclaimed_slab += nr_freed; | 1252 | current->reclaim_state->reclaimed_slab += nr_freed; |
| 1241 | free_pages((unsigned long)addr, cachep->gfporder); | 1253 | free_pages((unsigned long)addr, cachep->gfporder); |
| 1242 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1254 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
| 1243 | atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); | 1255 | atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); |
| 1244 | } | 1256 | } |
| 1245 | 1257 | ||
| 1246 | static void kmem_rcu_free(struct rcu_head *head) | 1258 | static void kmem_rcu_free(struct rcu_head *head) |
| 1247 | { | 1259 | { |
| 1248 | struct slab_rcu *slab_rcu = (struct slab_rcu *) head; | 1260 | struct slab_rcu *slab_rcu = (struct slab_rcu *)head; |
| 1249 | kmem_cache_t *cachep = slab_rcu->cachep; | 1261 | kmem_cache_t *cachep = slab_rcu->cachep; |
| 1250 | 1262 | ||
| 1251 | kmem_freepages(cachep, slab_rcu->addr); | 1263 | kmem_freepages(cachep, slab_rcu->addr); |
| @@ -1257,19 +1269,19 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
| 1257 | 1269 | ||
| 1258 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1270 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 1259 | static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, | 1271 | static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, |
| 1260 | unsigned long caller) | 1272 | unsigned long caller) |
| 1261 | { | 1273 | { |
| 1262 | int size = obj_reallen(cachep); | 1274 | int size = obj_reallen(cachep); |
| 1263 | 1275 | ||
| 1264 | addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; | 1276 | addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; |
| 1265 | 1277 | ||
| 1266 | if (size < 5*sizeof(unsigned long)) | 1278 | if (size < 5 * sizeof(unsigned long)) |
| 1267 | return; | 1279 | return; |
| 1268 | 1280 | ||
| 1269 | *addr++=0x12345678; | 1281 | *addr++ = 0x12345678; |
| 1270 | *addr++=caller; | 1282 | *addr++ = caller; |
| 1271 | *addr++=smp_processor_id(); | 1283 | *addr++ = smp_processor_id(); |
| 1272 | size -= 3*sizeof(unsigned long); | 1284 | size -= 3 * sizeof(unsigned long); |
| 1273 | { | 1285 | { |
| 1274 | unsigned long *sptr = &caller; | 1286 | unsigned long *sptr = &caller; |
| 1275 | unsigned long svalue; | 1287 | unsigned long svalue; |
| @@ -1277,7 +1289,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, | |||
| 1277 | while (!kstack_end(sptr)) { | 1289 | while (!kstack_end(sptr)) { |
| 1278 | svalue = *sptr++; | 1290 | svalue = *sptr++; |
| 1279 | if (kernel_text_address(svalue)) { | 1291 | if (kernel_text_address(svalue)) { |
| 1280 | *addr++=svalue; | 1292 | *addr++ = svalue; |
| 1281 | size -= sizeof(unsigned long); | 1293 | size -= sizeof(unsigned long); |
| 1282 | if (size <= sizeof(unsigned long)) | 1294 | if (size <= sizeof(unsigned long)) |
| 1283 | break; | 1295 | break; |
| @@ -1285,25 +1297,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, | |||
| 1285 | } | 1297 | } |
| 1286 | 1298 | ||
| 1287 | } | 1299 | } |
| 1288 | *addr++=0x87654321; | 1300 | *addr++ = 0x87654321; |
| 1289 | } | 1301 | } |
| 1290 | #endif | 1302 | #endif |
| 1291 | 1303 | ||
| 1292 | static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) | 1304 | static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) |
| 1293 | { | 1305 | { |
| 1294 | int size = obj_reallen(cachep); | 1306 | int size = obj_reallen(cachep); |
| 1295 | addr = &((char*)addr)[obj_dbghead(cachep)]; | 1307 | addr = &((char *)addr)[obj_dbghead(cachep)]; |
| 1296 | 1308 | ||
| 1297 | memset(addr, val, size); | 1309 | memset(addr, val, size); |
| 1298 | *(unsigned char *)(addr+size-1) = POISON_END; | 1310 | *(unsigned char *)(addr + size - 1) = POISON_END; |
| 1299 | } | 1311 | } |
| 1300 | 1312 | ||
| 1301 | static void dump_line(char *data, int offset, int limit) | 1313 | static void dump_line(char *data, int offset, int limit) |
| 1302 | { | 1314 | { |
| 1303 | int i; | 1315 | int i; |
| 1304 | printk(KERN_ERR "%03x:", offset); | 1316 | printk(KERN_ERR "%03x:", offset); |
| 1305 | for (i=0;i<limit;i++) { | 1317 | for (i = 0; i < limit; i++) { |
| 1306 | printk(" %02x", (unsigned char)data[offset+i]); | 1318 | printk(" %02x", (unsigned char)data[offset + i]); |
| 1307 | } | 1319 | } |
| 1308 | printk("\n"); | 1320 | printk("\n"); |
| 1309 | } | 1321 | } |
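poison_obj()/dump_line() in this hunk fill a freed object with a poison byte, terminate it with a distinct end marker, and later hexdump any line that no longer matches. A userspace sketch of the same poison-and-verify pattern, with byte values chosen for the demo rather than taken from the kernel's headers:

#include <stdio.h>
#include <string.h>

#define POISON_FREE 0x6b                   /* demo values */
#define POISON_END  0xa5

static void poison(unsigned char *obj, size_t size)
{
    memset(obj, POISON_FREE, size);
    obj[size - 1] = POISON_END;
}

/* Returns the offset of the first corrupted byte, or -1 if intact. */
static long check_poison(const unsigned char *obj, size_t size)
{
    for (size_t i = 0; i < size; i++) {
        unsigned char exp = (i == size - 1) ? POISON_END : POISON_FREE;

        if (obj[i] != exp)
            return (long)i;
    }
    return -1;
}

int main(void)
{
    unsigned char obj[64];

    poison(obj, sizeof(obj));
    obj[17] = 0x00;                        /* simulate a use-after-free write */
    printf("first bad byte at offset %ld\n", check_poison(obj, sizeof(obj)));
    return 0;
}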
| @@ -1318,24 +1330,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) | |||
| 1318 | 1330 | ||
| 1319 | if (cachep->flags & SLAB_RED_ZONE) { | 1331 | if (cachep->flags & SLAB_RED_ZONE) { |
| 1320 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", | 1332 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", |
| 1321 | *dbg_redzone1(cachep, objp), | 1333 | *dbg_redzone1(cachep, objp), |
| 1322 | *dbg_redzone2(cachep, objp)); | 1334 | *dbg_redzone2(cachep, objp)); |
| 1323 | } | 1335 | } |
| 1324 | 1336 | ||
| 1325 | if (cachep->flags & SLAB_STORE_USER) { | 1337 | if (cachep->flags & SLAB_STORE_USER) { |
| 1326 | printk(KERN_ERR "Last user: [<%p>]", | 1338 | printk(KERN_ERR "Last user: [<%p>]", |
| 1327 | *dbg_userword(cachep, objp)); | 1339 | *dbg_userword(cachep, objp)); |
| 1328 | print_symbol("(%s)", | 1340 | print_symbol("(%s)", |
| 1329 | (unsigned long)*dbg_userword(cachep, objp)); | 1341 | (unsigned long)*dbg_userword(cachep, objp)); |
| 1330 | printk("\n"); | 1342 | printk("\n"); |
| 1331 | } | 1343 | } |
| 1332 | realobj = (char*)objp+obj_dbghead(cachep); | 1344 | realobj = (char *)objp + obj_dbghead(cachep); |
| 1333 | size = obj_reallen(cachep); | 1345 | size = obj_reallen(cachep); |
| 1334 | for (i=0; i<size && lines;i+=16, lines--) { | 1346 | for (i = 0; i < size && lines; i += 16, lines--) { |
| 1335 | int limit; | 1347 | int limit; |
| 1336 | limit = 16; | 1348 | limit = 16; |
| 1337 | if (i+limit > size) | 1349 | if (i + limit > size) |
| 1338 | limit = size-i; | 1350 | limit = size - i; |
| 1339 | dump_line(realobj, i, limit); | 1351 | dump_line(realobj, i, limit); |
| 1340 | } | 1352 | } |
| 1341 | } | 1353 | } |
| @@ -1346,27 +1358,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) | |||
| 1346 | int size, i; | 1358 | int size, i; |
| 1347 | int lines = 0; | 1359 | int lines = 0; |
| 1348 | 1360 | ||
| 1349 | realobj = (char*)objp+obj_dbghead(cachep); | 1361 | realobj = (char *)objp + obj_dbghead(cachep); |
| 1350 | size = obj_reallen(cachep); | 1362 | size = obj_reallen(cachep); |
| 1351 | 1363 | ||
| 1352 | for (i=0;i<size;i++) { | 1364 | for (i = 0; i < size; i++) { |
| 1353 | char exp = POISON_FREE; | 1365 | char exp = POISON_FREE; |
| 1354 | if (i == size-1) | 1366 | if (i == size - 1) |
| 1355 | exp = POISON_END; | 1367 | exp = POISON_END; |
| 1356 | if (realobj[i] != exp) { | 1368 | if (realobj[i] != exp) { |
| 1357 | int limit; | 1369 | int limit; |
| 1358 | /* Mismatch ! */ | 1370 | /* Mismatch ! */ |
| 1359 | /* Print header */ | 1371 | /* Print header */ |
| 1360 | if (lines == 0) { | 1372 | if (lines == 0) { |
| 1361 | printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", | 1373 | printk(KERN_ERR |
| 1362 | realobj, size); | 1374 | "Slab corruption: start=%p, len=%d\n", |
| 1375 | realobj, size); | ||
| 1363 | print_objinfo(cachep, objp, 0); | 1376 | print_objinfo(cachep, objp, 0); |
| 1364 | } | 1377 | } |
| 1365 | /* Hexdump the affected line */ | 1378 | /* Hexdump the affected line */ |
| 1366 | i = (i/16)*16; | 1379 | i = (i / 16) * 16; |
| 1367 | limit = 16; | 1380 | limit = 16; |
| 1368 | if (i+limit > size) | 1381 | if (i + limit > size) |
| 1369 | limit = size-i; | 1382 | limit = size - i; |
| 1370 | dump_line(realobj, i, limit); | 1383 | dump_line(realobj, i, limit); |
| 1371 | i += 16; | 1384 | i += 16; |
| 1372 | lines++; | 1385 | lines++; |
| @@ -1382,19 +1395,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) | |||
| 1382 | struct slab *slabp = page_get_slab(virt_to_page(objp)); | 1395 | struct slab *slabp = page_get_slab(virt_to_page(objp)); |
| 1383 | int objnr; | 1396 | int objnr; |
| 1384 | 1397 | ||
| 1385 | objnr = (objp-slabp->s_mem)/cachep->objsize; | 1398 | objnr = (objp - slabp->s_mem) / cachep->objsize; |
| 1386 | if (objnr) { | 1399 | if (objnr) { |
| 1387 | objp = slabp->s_mem+(objnr-1)*cachep->objsize; | 1400 | objp = slabp->s_mem + (objnr - 1) * cachep->objsize; |
| 1388 | realobj = (char*)objp+obj_dbghead(cachep); | 1401 | realobj = (char *)objp + obj_dbghead(cachep); |
| 1389 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", | 1402 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", |
| 1390 | realobj, size); | 1403 | realobj, size); |
| 1391 | print_objinfo(cachep, objp, 2); | 1404 | print_objinfo(cachep, objp, 2); |
| 1392 | } | 1405 | } |
| 1393 | if (objnr+1 < cachep->num) { | 1406 | if (objnr + 1 < cachep->num) { |
| 1394 | objp = slabp->s_mem+(objnr+1)*cachep->objsize; | 1407 | objp = slabp->s_mem + (objnr + 1) * cachep->objsize; |
| 1395 | realobj = (char*)objp+obj_dbghead(cachep); | 1408 | realobj = (char *)objp + obj_dbghead(cachep); |
| 1396 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", | 1409 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", |
| 1397 | realobj, size); | 1410 | realobj, size); |
| 1398 | print_objinfo(cachep, objp, 2); | 1411 | print_objinfo(cachep, objp, 2); |
| 1399 | } | 1412 | } |
| 1400 | } | 1413 | } |
| @@ -1405,7 +1418,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) | |||
| 1405 | * Before calling the slab must have been unlinked from the cache. | 1418 | * Before calling the slab must have been unlinked from the cache. |
| 1406 | * The cache-lock is not held/needed. | 1419 | * The cache-lock is not held/needed. |
| 1407 | */ | 1420 | */ |
| 1408 | static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | 1421 | static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) |
| 1409 | { | 1422 | { |
| 1410 | void *addr = slabp->s_mem - slabp->colouroff; | 1423 | void *addr = slabp->s_mem - slabp->colouroff; |
| 1411 | 1424 | ||
| @@ -1416,8 +1429,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | |||
| 1416 | 1429 | ||
| 1417 | if (cachep->flags & SLAB_POISON) { | 1430 | if (cachep->flags & SLAB_POISON) { |
| 1418 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1431 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 1419 | if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) | 1432 | if ((cachep->objsize % PAGE_SIZE) == 0 |
| 1420 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); | 1433 | && OFF_SLAB(cachep)) |
| 1434 | kernel_map_pages(virt_to_page(objp), | ||
| 1435 | cachep->objsize / PAGE_SIZE, | ||
| 1436 | 1); | ||
| 1421 | else | 1437 | else |
| 1422 | check_poison_obj(cachep, objp); | 1438 | check_poison_obj(cachep, objp); |
| 1423 | #else | 1439 | #else |
| @@ -1427,20 +1443,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | |||
| 1427 | if (cachep->flags & SLAB_RED_ZONE) { | 1443 | if (cachep->flags & SLAB_RED_ZONE) { |
| 1428 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | 1444 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) |
| 1429 | slab_error(cachep, "start of a freed object " | 1445 | slab_error(cachep, "start of a freed object " |
| 1430 | "was overwritten"); | 1446 | "was overwritten"); |
| 1431 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 1447 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
| 1432 | slab_error(cachep, "end of a freed object " | 1448 | slab_error(cachep, "end of a freed object " |
| 1433 | "was overwritten"); | 1449 | "was overwritten"); |
| 1434 | } | 1450 | } |
| 1435 | if (cachep->dtor && !(cachep->flags & SLAB_POISON)) | 1451 | if (cachep->dtor && !(cachep->flags & SLAB_POISON)) |
| 1436 | (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); | 1452 | (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); |
| 1437 | } | 1453 | } |
| 1438 | #else | 1454 | #else |
| 1439 | if (cachep->dtor) { | 1455 | if (cachep->dtor) { |
| 1440 | int i; | 1456 | int i; |
| 1441 | for (i = 0; i < cachep->num; i++) { | 1457 | for (i = 0; i < cachep->num; i++) { |
| 1442 | void* objp = slabp->s_mem+cachep->objsize*i; | 1458 | void *objp = slabp->s_mem + cachep->objsize * i; |
| 1443 | (cachep->dtor)(objp, cachep, 0); | 1459 | (cachep->dtor) (objp, cachep, 0); |
| 1444 | } | 1460 | } |
| 1445 | } | 1461 | } |
| 1446 | #endif | 1462 | #endif |
| @@ -1448,7 +1464,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | |||
| 1448 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { | 1464 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { |
| 1449 | struct slab_rcu *slab_rcu; | 1465 | struct slab_rcu *slab_rcu; |
| 1450 | 1466 | ||
| 1451 | slab_rcu = (struct slab_rcu *) slabp; | 1467 | slab_rcu = (struct slab_rcu *)slabp; |
| 1452 | slab_rcu->cachep = cachep; | 1468 | slab_rcu->cachep = cachep; |
| 1453 | slab_rcu->addr = addr; | 1469 | slab_rcu->addr = addr; |
| 1454 | call_rcu(&slab_rcu->head, kmem_rcu_free); | 1470 | call_rcu(&slab_rcu->head, kmem_rcu_free); |
| @@ -1466,11 +1482,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index) | |||
| 1466 | int node; | 1482 | int node; |
| 1467 | 1483 | ||
| 1468 | for_each_online_node(node) { | 1484 | for_each_online_node(node) { |
| 1469 | cachep->nodelists[node] = &initkmem_list3[index+node]; | 1485 | cachep->nodelists[node] = &initkmem_list3[index + node]; |
| 1470 | cachep->nodelists[node]->next_reap = jiffies + | 1486 | cachep->nodelists[node]->next_reap = jiffies + |
| 1471 | REAPTIMEOUT_LIST3 + | 1487 | REAPTIMEOUT_LIST3 + |
| 1472 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | 1488 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
| 1489 | } | ||
| 1490 | } | ||
| 1491 | |||
| 1492 | /** | ||
| 1493 | * calculate_slab_order - calculate size (page order) of slabs and the number | ||
| 1494 | * of objects per slab. | ||
| 1495 | * | ||
| 1496 | * This could be made much more intelligent. For now, try to avoid using | ||
| 1497 | * high order pages for slabs. When the gfp() functions are more friendly | ||
| 1498 | * towards high-order requests, this should be changed. | ||
| 1499 | */ | ||
| 1500 | static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, | ||
| 1501 | size_t align, gfp_t flags) | ||
| 1502 | { | ||
| 1503 | size_t left_over = 0; | ||
| 1504 | |||
| 1505 | for (;; cachep->gfporder++) { | ||
| 1506 | unsigned int num; | ||
| 1507 | size_t remainder; | ||
| 1508 | |||
| 1509 | if (cachep->gfporder > MAX_GFP_ORDER) { | ||
| 1510 | cachep->num = 0; | ||
| 1511 | break; | ||
| 1512 | } | ||
| 1513 | |||
| 1514 | cache_estimate(cachep->gfporder, size, align, flags, | ||
| 1515 | &remainder, &num); | ||
| 1516 | if (!num) | ||
| 1517 | continue; | ||
| 1518 | /* More than offslab_limit objects will cause problems */ | ||
| 1519 | if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) | ||
| 1520 | break; | ||
| 1521 | |||
| 1522 | cachep->num = num; | ||
| 1523 | left_over = remainder; | ||
| 1524 | |||
| 1525 | /* | ||
| 1526 | * Large number of objects is good, but very large slabs are | ||
| 1527 | * currently bad for the gfp()s. | ||
| 1528 | */ | ||
| 1529 | if (cachep->gfporder >= slab_break_gfp_order) | ||
| 1530 | break; | ||
| 1531 | |||
| 1532 | if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder)) | ||
| 1533 | /* Acceptable internal fragmentation */ | ||
| 1534 | break; | ||
| 1473 | } | 1535 | } |
| 1536 | return left_over; | ||
| 1474 | } | 1537 | } |
| 1475 | 1538 | ||
| 1476 | /** | 1539 | /** |
| @@ -1519,14 +1582,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1519 | * Sanity checks... these are all serious usage bugs. | 1582 | * Sanity checks... these are all serious usage bugs. |
| 1520 | */ | 1583 | */ |
| 1521 | if ((!name) || | 1584 | if ((!name) || |
| 1522 | in_interrupt() || | 1585 | in_interrupt() || |
| 1523 | (size < BYTES_PER_WORD) || | 1586 | (size < BYTES_PER_WORD) || |
| 1524 | (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || | 1587 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { |
| 1525 | (dtor && !ctor)) { | 1588 | printk(KERN_ERR "%s: Early error in slab %s\n", |
| 1526 | printk(KERN_ERR "%s: Early error in slab %s\n", | 1589 | __FUNCTION__, name); |
| 1527 | __FUNCTION__, name); | 1590 | BUG(); |
| 1528 | BUG(); | 1591 | } |
| 1529 | } | ||
| 1530 | 1592 | ||
| 1531 | down(&cache_chain_sem); | 1593 | down(&cache_chain_sem); |
| 1532 | 1594 | ||
| @@ -1546,11 +1608,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1546 | set_fs(old_fs); | 1608 | set_fs(old_fs); |
| 1547 | if (res) { | 1609 | if (res) { |
| 1548 | printk("SLAB: cache with size %d has lost its name\n", | 1610 | printk("SLAB: cache with size %d has lost its name\n", |
| 1549 | pc->objsize); | 1611 | pc->objsize); |
| 1550 | continue; | 1612 | continue; |
| 1551 | } | 1613 | } |
| 1552 | 1614 | ||
| 1553 | if (!strcmp(pc->name,name)) { | 1615 | if (!strcmp(pc->name, name)) { |
| 1554 | printk("kmem_cache_create: duplicate cache %s\n", name); | 1616 | printk("kmem_cache_create: duplicate cache %s\n", name); |
| 1555 | dump_stack(); | 1617 | dump_stack(); |
| 1556 | goto oops; | 1618 | goto oops; |
| @@ -1562,10 +1624,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1562 | if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { | 1624 | if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { |
| 1563 | /* No constructor, but inital state check requested */ | 1625 | /* No constructor, but inital state check requested */ |
| 1564 | printk(KERN_ERR "%s: No con, but init state check " | 1626 | printk(KERN_ERR "%s: No con, but init state check " |
| 1565 | "requested - %s\n", __FUNCTION__, name); | 1627 | "requested - %s\n", __FUNCTION__, name); |
| 1566 | flags &= ~SLAB_DEBUG_INITIAL; | 1628 | flags &= ~SLAB_DEBUG_INITIAL; |
| 1567 | } | 1629 | } |
| 1568 | |||
| 1569 | #if FORCED_DEBUG | 1630 | #if FORCED_DEBUG |
| 1570 | /* | 1631 | /* |
| 1571 | * Enable redzoning and last user accounting, except for caches with | 1632 | * Enable redzoning and last user accounting, except for caches with |
| @@ -1573,8 +1634,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1573 | * above the next power of two: caches with object sizes just above a | 1634 | * above the next power of two: caches with object sizes just above a |
| 1574 | * power of two have a significant amount of internal fragmentation. | 1635 | * power of two have a significant amount of internal fragmentation. |
| 1575 | */ | 1636 | */ |
| 1576 | if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) | 1637 | if ((size < 4096 |
| 1577 | flags |= SLAB_RED_ZONE|SLAB_STORE_USER; | 1638 | || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) |
| 1639 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | ||
| 1578 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 1640 | if (!(flags & SLAB_DESTROY_BY_RCU)) |
| 1579 | flags |= SLAB_POISON; | 1641 | flags |= SLAB_POISON; |
| 1580 | #endif | 1642 | #endif |
| @@ -1595,9 +1657,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1595 | * unaligned accesses for some archs when redzoning is used, and makes | 1657 | * unaligned accesses for some archs when redzoning is used, and makes |
| 1596 | * sure any on-slab bufctl's are also correctly aligned. | 1658 | * sure any on-slab bufctl's are also correctly aligned. |
| 1597 | */ | 1659 | */ |
| 1598 | if (size & (BYTES_PER_WORD-1)) { | 1660 | if (size & (BYTES_PER_WORD - 1)) { |
| 1599 | size += (BYTES_PER_WORD-1); | 1661 | size += (BYTES_PER_WORD - 1); |
| 1600 | size &= ~(BYTES_PER_WORD-1); | 1662 | size &= ~(BYTES_PER_WORD - 1); |
| 1601 | } | 1663 | } |
| 1602 | 1664 | ||
| 1603 | /* calculate out the final buffer alignment: */ | 1665 | /* calculate out the final buffer alignment: */ |
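Before the final buffer alignment is computed in the next hunk, the rounding just above bumps size up to the next multiple of BYTES_PER_WORD with the usual add-then-mask idiom. A short demonstration of the same arithmetic on a few sample sizes (word size assumed to equal the pointer size, as in the demo macro below):

#include <stdio.h>
#include <stddef.h>

#define BYTES_PER_WORD  sizeof(void *)    /* assumption: word == pointer size */

/* Round size up to a word multiple: add (word - 1), then clear the low bits. */
static size_t word_align(size_t size)
{
    if (size & (BYTES_PER_WORD - 1)) {
        size += BYTES_PER_WORD - 1;
        size &= ~(BYTES_PER_WORD - 1);
    }
    return size;
}

int main(void)
{
    size_t sizes[] = { 1, 7, 8, 13, 24 };

    for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
        printf("%zu -> %zu\n", sizes[i], word_align(sizes[i]));
    return 0;
}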
| @@ -1608,7 +1670,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1608 | * objects into one cacheline. | 1670 | * objects into one cacheline. |
| 1609 | */ | 1671 | */ |
| 1610 | ralign = cache_line_size(); | 1672 | ralign = cache_line_size(); |
| 1611 | while (size <= ralign/2) | 1673 | while (size <= ralign / 2) |
| 1612 | ralign /= 2; | 1674 | ralign /= 2; |
| 1613 | } else { | 1675 | } else { |
| 1614 | ralign = BYTES_PER_WORD; | 1676 | ralign = BYTES_PER_WORD; |
| @@ -1617,13 +1679,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1617 | if (ralign < ARCH_SLAB_MINALIGN) { | 1679 | if (ralign < ARCH_SLAB_MINALIGN) { |
| 1618 | ralign = ARCH_SLAB_MINALIGN; | 1680 | ralign = ARCH_SLAB_MINALIGN; |
| 1619 | if (ralign > BYTES_PER_WORD) | 1681 | if (ralign > BYTES_PER_WORD) |
| 1620 | flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); | 1682 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
| 1621 | } | 1683 | } |
| 1622 | /* 3) caller mandated alignment: disables debug if necessary */ | 1684 | /* 3) caller mandated alignment: disables debug if necessary */ |
| 1623 | if (ralign < align) { | 1685 | if (ralign < align) { |
| 1624 | ralign = align; | 1686 | ralign = align; |
| 1625 | if (ralign > BYTES_PER_WORD) | 1687 | if (ralign > BYTES_PER_WORD) |
| 1626 | flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); | 1688 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
| 1627 | } | 1689 | } |
| 1628 | /* 4) Store it. Note that the debug code below can reduce | 1690 | /* 4) Store it. Note that the debug code below can reduce |
| 1629 | * the alignment to BYTES_PER_WORD. | 1691 | * the alignment to BYTES_PER_WORD. |
| @@ -1645,7 +1707,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1645 | 1707 | ||
| 1646 | /* add space for red zone words */ | 1708 | /* add space for red zone words */ |
| 1647 | cachep->dbghead += BYTES_PER_WORD; | 1709 | cachep->dbghead += BYTES_PER_WORD; |
| 1648 | size += 2*BYTES_PER_WORD; | 1710 | size += 2 * BYTES_PER_WORD; |
| 1649 | } | 1711 | } |
| 1650 | if (flags & SLAB_STORE_USER) { | 1712 | if (flags & SLAB_STORE_USER) { |
| 1651 | /* user store requires word alignment and | 1713 | /* user store requires word alignment and |
| @@ -1656,7 +1718,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1656 | size += BYTES_PER_WORD; | 1718 | size += BYTES_PER_WORD; |
| 1657 | } | 1719 | } |
| 1658 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 1720 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
| 1659 | if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { | 1721 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size |
| 1722 | && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { | ||
| 1660 | cachep->dbghead += PAGE_SIZE - size; | 1723 | cachep->dbghead += PAGE_SIZE - size; |
| 1661 | size = PAGE_SIZE; | 1724 | size = PAGE_SIZE; |
| 1662 | } | 1725 | } |
| @@ -1664,7 +1727,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1664 | #endif | 1727 | #endif |
| 1665 | 1728 | ||
| 1666 | /* Determine if the slab management is 'on' or 'off' slab. */ | 1729 | /* Determine if the slab management is 'on' or 'off' slab. */ |
| 1667 | if (size >= (PAGE_SIZE>>3)) | 1730 | if (size >= (PAGE_SIZE >> 3)) |
| 1668 | /* | 1731 | /* |
| 1669 | * Size is large, assume best to place the slab management obj | 1732 | * Size is large, assume best to place the slab management obj |
| 1670 | * off-slab (should allow better packing of objs). | 1733 | * off-slab (should allow better packing of objs). |
| @@ -1681,47 +1744,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1681 | */ | 1744 | */ |
| 1682 | cachep->gfporder = 0; | 1745 | cachep->gfporder = 0; |
| 1683 | cache_estimate(cachep->gfporder, size, align, flags, | 1746 | cache_estimate(cachep->gfporder, size, align, flags, |
| 1684 | &left_over, &cachep->num); | 1747 | &left_over, &cachep->num); |
| 1685 | } else { | 1748 | } else |
| 1686 | /* | 1749 | left_over = calculate_slab_order(cachep, size, align, flags); |
| 1687 | * Calculate size (in pages) of slabs, and the num of objs per | ||
| 1688 | * slab. This could be made much more intelligent. For now, | ||
| 1689 | * try to avoid using high page-orders for slabs. When the | ||
| 1690 | * gfp() funcs are more friendly towards high-order requests, | ||
| 1691 | * this should be changed. | ||
| 1692 | */ | ||
| 1693 | do { | ||
| 1694 | unsigned int break_flag = 0; | ||
| 1695 | cal_wastage: | ||
| 1696 | cache_estimate(cachep->gfporder, size, align, flags, | ||
| 1697 | &left_over, &cachep->num); | ||
| 1698 | if (break_flag) | ||
| 1699 | break; | ||
| 1700 | if (cachep->gfporder >= MAX_GFP_ORDER) | ||
| 1701 | break; | ||
| 1702 | if (!cachep->num) | ||
| 1703 | goto next; | ||
| 1704 | if (flags & CFLGS_OFF_SLAB && | ||
| 1705 | cachep->num > offslab_limit) { | ||
| 1706 | /* This num of objs will cause problems. */ | ||
| 1707 | cachep->gfporder--; | ||
| 1708 | break_flag++; | ||
| 1709 | goto cal_wastage; | ||
| 1710 | } | ||
| 1711 | |||
| 1712 | /* | ||
| 1713 | * Large num of objs is good, but v. large slabs are | ||
| 1714 | * currently bad for the gfp()s. | ||
| 1715 | */ | ||
| 1716 | if (cachep->gfporder >= slab_break_gfp_order) | ||
| 1717 | break; | ||
| 1718 | |||
| 1719 | if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder)) | ||
| 1720 | break; /* Acceptable internal fragmentation. */ | ||
| 1721 | next: | ||
| 1722 | cachep->gfporder++; | ||
| 1723 | } while (1); | ||
| 1724 | } | ||
| 1725 | 1750 | ||
| 1726 | if (!cachep->num) { | 1751 | if (!cachep->num) { |
| 1727 | printk("kmem_cache_create: couldn't create cache %s.\n", name); | 1752 | printk("kmem_cache_create: couldn't create cache %s.\n", name); |
| @@ -1729,8 +1754,8 @@ next: | |||
| 1729 | cachep = NULL; | 1754 | cachep = NULL; |
| 1730 | goto oops; | 1755 | goto oops; |
| 1731 | } | 1756 | } |
| 1732 | slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) | 1757 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) |
| 1733 | + sizeof(struct slab), align); | 1758 | + sizeof(struct slab), align); |
| 1734 | 1759 | ||
| 1735 | /* | 1760 | /* |
| 1736 | * If the slab has been placed off-slab, and we have enough space then | 1761 | * If the slab has been placed off-slab, and we have enough space then |
| @@ -1743,14 +1768,15 @@ next: | |||
| 1743 | 1768 | ||
| 1744 | if (flags & CFLGS_OFF_SLAB) { | 1769 | if (flags & CFLGS_OFF_SLAB) { |
| 1745 | /* really off slab. No need for manual alignment */ | 1770 | /* really off slab. No need for manual alignment */ |
| 1746 | slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); | 1771 | slab_size = |
| 1772 | cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); | ||
| 1747 | } | 1773 | } |
| 1748 | 1774 | ||
| 1749 | cachep->colour_off = cache_line_size(); | 1775 | cachep->colour_off = cache_line_size(); |
| 1750 | /* Offset must be a multiple of the alignment. */ | 1776 | /* Offset must be a multiple of the alignment. */ |
| 1751 | if (cachep->colour_off < align) | 1777 | if (cachep->colour_off < align) |
| 1752 | cachep->colour_off = align; | 1778 | cachep->colour_off = align; |
| 1753 | cachep->colour = left_over/cachep->colour_off; | 1779 | cachep->colour = left_over / cachep->colour_off; |
| 1754 | cachep->slab_size = slab_size; | 1780 | cachep->slab_size = slab_size; |
| 1755 | cachep->flags = flags; | 1781 | cachep->flags = flags; |
| 1756 | cachep->gfpflags = 0; | 1782 | cachep->gfpflags = 0; |
| @@ -1777,7 +1803,7 @@ next: | |||
| 1777 | * the creation of further caches will BUG(). | 1803 | * the creation of further caches will BUG(). |
| 1778 | */ | 1804 | */ |
| 1779 | cachep->array[smp_processor_id()] = | 1805 | cachep->array[smp_processor_id()] = |
| 1780 | &initarray_generic.cache; | 1806 | &initarray_generic.cache; |
| 1781 | 1807 | ||
| 1782 | /* If the cache that's used by | 1808 | /* If the cache that's used by |
| 1783 | * kmalloc(sizeof(kmem_list3)) is the first cache, | 1809 | * kmalloc(sizeof(kmem_list3)) is the first cache, |
| @@ -1791,8 +1817,7 @@ next: | |||
| 1791 | g_cpucache_up = PARTIAL_AC; | 1817 | g_cpucache_up = PARTIAL_AC; |
| 1792 | } else { | 1818 | } else { |
| 1793 | cachep->array[smp_processor_id()] = | 1819 | cachep->array[smp_processor_id()] = |
| 1794 | kmalloc(sizeof(struct arraycache_init), | 1820 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); |
| 1795 | GFP_KERNEL); | ||
| 1796 | 1821 | ||
| 1797 | if (g_cpucache_up == PARTIAL_AC) { | 1822 | if (g_cpucache_up == PARTIAL_AC) { |
| 1798 | set_up_list3s(cachep, SIZE_L3); | 1823 | set_up_list3s(cachep, SIZE_L3); |
| @@ -1802,16 +1827,18 @@ next: | |||
| 1802 | for_each_online_node(node) { | 1827 | for_each_online_node(node) { |
| 1803 | 1828 | ||
| 1804 | cachep->nodelists[node] = | 1829 | cachep->nodelists[node] = |
| 1805 | kmalloc_node(sizeof(struct kmem_list3), | 1830 | kmalloc_node(sizeof |
| 1806 | GFP_KERNEL, node); | 1831 | (struct kmem_list3), |
| 1832 | GFP_KERNEL, node); | ||
| 1807 | BUG_ON(!cachep->nodelists[node]); | 1833 | BUG_ON(!cachep->nodelists[node]); |
| 1808 | kmem_list3_init(cachep->nodelists[node]); | 1834 | kmem_list3_init(cachep-> |
| 1835 | nodelists[node]); | ||
| 1809 | } | 1836 | } |
| 1810 | } | 1837 | } |
| 1811 | } | 1838 | } |
| 1812 | cachep->nodelists[numa_node_id()]->next_reap = | 1839 | cachep->nodelists[numa_node_id()]->next_reap = |
| 1813 | jiffies + REAPTIMEOUT_LIST3 + | 1840 | jiffies + REAPTIMEOUT_LIST3 + |
| 1814 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | 1841 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
| 1815 | 1842 | ||
| 1816 | BUG_ON(!ac_data(cachep)); | 1843 | BUG_ON(!ac_data(cachep)); |
| 1817 | ac_data(cachep)->avail = 0; | 1844 | ac_data(cachep)->avail = 0; |
| @@ -1820,15 +1847,15 @@ next: | |||
| 1820 | ac_data(cachep)->touched = 0; | 1847 | ac_data(cachep)->touched = 0; |
| 1821 | cachep->batchcount = 1; | 1848 | cachep->batchcount = 1; |
| 1822 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | 1849 | cachep->limit = BOOT_CPUCACHE_ENTRIES; |
| 1823 | } | 1850 | } |
| 1824 | 1851 | ||
| 1825 | /* cache setup completed, link it into the list */ | 1852 | /* cache setup completed, link it into the list */ |
| 1826 | list_add(&cachep->next, &cache_chain); | 1853 | list_add(&cachep->next, &cache_chain); |
| 1827 | unlock_cpu_hotplug(); | 1854 | unlock_cpu_hotplug(); |
| 1828 | oops: | 1855 | oops: |
| 1829 | if (!cachep && (flags & SLAB_PANIC)) | 1856 | if (!cachep && (flags & SLAB_PANIC)) |
| 1830 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 1857 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
| 1831 | name); | 1858 | name); |
| 1832 | up(&cache_chain_sem); | 1859 | up(&cache_chain_sem); |
| 1833 | return cachep; | 1860 | return cachep; |
| 1834 | } | 1861 | } |
| @@ -1871,7 +1898,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) | |||
| 1871 | /* | 1898 | /* |
| 1872 | * Waits for all CPUs to execute func(). | 1899 | * Waits for all CPUs to execute func(). |
| 1873 | */ | 1900 | */ |
| 1874 | static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) | 1901 | static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) |
| 1875 | { | 1902 | { |
| 1876 | check_irq_on(); | 1903 | check_irq_on(); |
| 1877 | preempt_disable(); | 1904 | preempt_disable(); |
| @@ -1886,12 +1913,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) | |||
| 1886 | preempt_enable(); | 1913 | preempt_enable(); |
| 1887 | } | 1914 | } |
| 1888 | 1915 | ||
| 1889 | static void drain_array_locked(kmem_cache_t* cachep, | 1916 | static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, |
| 1890 | struct array_cache *ac, int force, int node); | 1917 | int force, int node); |
| 1891 | 1918 | ||
| 1892 | static void do_drain(void *arg) | 1919 | static void do_drain(void *arg) |
| 1893 | { | 1920 | { |
| 1894 | kmem_cache_t *cachep = (kmem_cache_t*)arg; | 1921 | kmem_cache_t *cachep = (kmem_cache_t *) arg; |
| 1895 | struct array_cache *ac; | 1922 | struct array_cache *ac; |
| 1896 | int node = numa_node_id(); | 1923 | int node = numa_node_id(); |
| 1897 | 1924 | ||
| @@ -1911,7 +1938,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep) | |||
| 1911 | smp_call_function_all_cpus(do_drain, cachep); | 1938 | smp_call_function_all_cpus(do_drain, cachep); |
| 1912 | check_irq_on(); | 1939 | check_irq_on(); |
| 1913 | spin_lock_irq(&cachep->spinlock); | 1940 | spin_lock_irq(&cachep->spinlock); |
| 1914 | for_each_online_node(node) { | 1941 | for_each_online_node(node) { |
| 1915 | l3 = cachep->nodelists[node]; | 1942 | l3 = cachep->nodelists[node]; |
| 1916 | if (l3) { | 1943 | if (l3) { |
| 1917 | spin_lock(&l3->list_lock); | 1944 | spin_lock(&l3->list_lock); |
| @@ -1949,8 +1976,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node) | |||
| 1949 | slab_destroy(cachep, slabp); | 1976 | slab_destroy(cachep, slabp); |
| 1950 | spin_lock_irq(&l3->list_lock); | 1977 | spin_lock_irq(&l3->list_lock); |
| 1951 | } | 1978 | } |
| 1952 | ret = !list_empty(&l3->slabs_full) || | 1979 | ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); |
| 1953 | !list_empty(&l3->slabs_partial); | ||
| 1954 | return ret; | 1980 | return ret; |
| 1955 | } | 1981 | } |
| 1956 | 1982 | ||
| @@ -2006,7 +2032,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
| 2006 | * The caller must guarantee that noone will allocate memory from the cache | 2032 | * The caller must guarantee that noone will allocate memory from the cache |
| 2007 | * during the kmem_cache_destroy(). | 2033 | * during the kmem_cache_destroy(). |
| 2008 | */ | 2034 | */ |
| 2009 | int kmem_cache_destroy(kmem_cache_t * cachep) | 2035 | int kmem_cache_destroy(kmem_cache_t *cachep) |
| 2010 | { | 2036 | { |
| 2011 | int i; | 2037 | int i; |
| 2012 | struct kmem_list3 *l3; | 2038 | struct kmem_list3 *l3; |
| @@ -2028,7 +2054,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) | |||
| 2028 | if (__cache_shrink(cachep)) { | 2054 | if (__cache_shrink(cachep)) { |
| 2029 | slab_error(cachep, "Can't free all objects"); | 2055 | slab_error(cachep, "Can't free all objects"); |
| 2030 | down(&cache_chain_sem); | 2056 | down(&cache_chain_sem); |
| 2031 | list_add(&cachep->next,&cache_chain); | 2057 | list_add(&cachep->next, &cache_chain); |
| 2032 | up(&cache_chain_sem); | 2058 | up(&cache_chain_sem); |
| 2033 | unlock_cpu_hotplug(); | 2059 | unlock_cpu_hotplug(); |
| 2034 | return 1; | 2060 | return 1; |
| @@ -2038,7 +2064,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) | |||
| 2038 | synchronize_rcu(); | 2064 | synchronize_rcu(); |
| 2039 | 2065 | ||
| 2040 | for_each_online_cpu(i) | 2066 | for_each_online_cpu(i) |
| 2041 | kfree(cachep->array[i]); | 2067 | kfree(cachep->array[i]); |
| 2042 | 2068 | ||
| 2043 | /* NUMA: free the list3 structures */ | 2069 | /* NUMA: free the list3 structures */ |
| 2044 | for_each_online_node(i) { | 2070 | for_each_online_node(i) { |
| @@ -2057,39 +2083,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep) | |||
| 2057 | EXPORT_SYMBOL(kmem_cache_destroy); | 2083 | EXPORT_SYMBOL(kmem_cache_destroy); |
| 2058 | 2084 | ||
| 2059 | /* Get the memory for a slab management obj. */ | 2085 | /* Get the memory for a slab management obj. */ |
| 2060 | static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, | 2086 | static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, |
| 2061 | int colour_off, gfp_t local_flags) | 2087 | int colour_off, gfp_t local_flags) |
| 2062 | { | 2088 | { |
| 2063 | struct slab *slabp; | 2089 | struct slab *slabp; |
| 2064 | 2090 | ||
| 2065 | if (OFF_SLAB(cachep)) { | 2091 | if (OFF_SLAB(cachep)) { |
| 2066 | /* Slab management obj is off-slab. */ | 2092 | /* Slab management obj is off-slab. */ |
| 2067 | slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); | 2093 | slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); |
| 2068 | if (!slabp) | 2094 | if (!slabp) |
| 2069 | return NULL; | 2095 | return NULL; |
| 2070 | } else { | 2096 | } else { |
| 2071 | slabp = objp+colour_off; | 2097 | slabp = objp + colour_off; |
| 2072 | colour_off += cachep->slab_size; | 2098 | colour_off += cachep->slab_size; |
| 2073 | } | 2099 | } |
| 2074 | slabp->inuse = 0; | 2100 | slabp->inuse = 0; |
| 2075 | slabp->colouroff = colour_off; | 2101 | slabp->colouroff = colour_off; |
| 2076 | slabp->s_mem = objp+colour_off; | 2102 | slabp->s_mem = objp + colour_off; |
| 2077 | 2103 | ||
| 2078 | return slabp; | 2104 | return slabp; |
| 2079 | } | 2105 | } |
| 2080 | 2106 | ||
| 2081 | static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) | 2107 | static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) |
| 2082 | { | 2108 | { |
| 2083 | return (kmem_bufctl_t *)(slabp+1); | 2109 | return (kmem_bufctl_t *) (slabp + 1); |
| 2084 | } | 2110 | } |
| 2085 | 2111 | ||
| 2086 | static void cache_init_objs(kmem_cache_t *cachep, | 2112 | static void cache_init_objs(kmem_cache_t *cachep, |
| 2087 | struct slab *slabp, unsigned long ctor_flags) | 2113 | struct slab *slabp, unsigned long ctor_flags) |
| 2088 | { | 2114 | { |
| 2089 | int i; | 2115 | int i; |
| 2090 | 2116 | ||
| 2091 | for (i = 0; i < cachep->num; i++) { | 2117 | for (i = 0; i < cachep->num; i++) { |
| 2092 | void *objp = slabp->s_mem+cachep->objsize*i; | 2118 | void *objp = slabp->s_mem + cachep->objsize * i; |
| 2093 | #if DEBUG | 2119 | #if DEBUG |
| 2094 | /* need to poison the objs? */ | 2120 | /* need to poison the objs? */ |
| 2095 | if (cachep->flags & SLAB_POISON) | 2121 | if (cachep->flags & SLAB_POISON) |
| @@ -2107,25 +2133,28 @@ static void cache_init_objs(kmem_cache_t *cachep, | |||
| 2107 | * Otherwise, deadlock. They must also be threaded. | 2133 | * Otherwise, deadlock. They must also be threaded. |
| 2108 | */ | 2134 | */ |
| 2109 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2135 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
| 2110 | cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); | 2136 | cachep->ctor(objp + obj_dbghead(cachep), cachep, |
| 2137 | ctor_flags); | ||
| 2111 | 2138 | ||
| 2112 | if (cachep->flags & SLAB_RED_ZONE) { | 2139 | if (cachep->flags & SLAB_RED_ZONE) { |
| 2113 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2140 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
| 2114 | slab_error(cachep, "constructor overwrote the" | 2141 | slab_error(cachep, "constructor overwrote the" |
| 2115 | " end of an object"); | 2142 | " end of an object"); |
| 2116 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | 2143 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) |
| 2117 | slab_error(cachep, "constructor overwrote the" | 2144 | slab_error(cachep, "constructor overwrote the" |
| 2118 | " start of an object"); | 2145 | " start of an object"); |
| 2119 | } | 2146 | } |
| 2120 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) | 2147 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) |
| 2121 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); | 2148 | && cachep->flags & SLAB_POISON) |
| 2149 | kernel_map_pages(virt_to_page(objp), | ||
| 2150 | cachep->objsize / PAGE_SIZE, 0); | ||
| 2122 | #else | 2151 | #else |
| 2123 | if (cachep->ctor) | 2152 | if (cachep->ctor) |
| 2124 | cachep->ctor(objp, cachep, ctor_flags); | 2153 | cachep->ctor(objp, cachep, ctor_flags); |
| 2125 | #endif | 2154 | #endif |
| 2126 | slab_bufctl(slabp)[i] = i+1; | 2155 | slab_bufctl(slabp)[i] = i + 1; |
| 2127 | } | 2156 | } |
| 2128 | slab_bufctl(slabp)[i-1] = BUFCTL_END; | 2157 | slab_bufctl(slabp)[i - 1] = BUFCTL_END; |
| 2129 | slabp->free = 0; | 2158 | slabp->free = 0; |
| 2130 | } | 2159 | } |
| 2131 | 2160 | ||
| @@ -2161,17 +2190,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) | |||
| 2161 | */ | 2190 | */ |
| 2162 | static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) | 2191 | static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) |
| 2163 | { | 2192 | { |
| 2164 | struct slab *slabp; | 2193 | struct slab *slabp; |
| 2165 | void *objp; | 2194 | void *objp; |
| 2166 | size_t offset; | 2195 | size_t offset; |
| 2167 | gfp_t local_flags; | 2196 | gfp_t local_flags; |
| 2168 | unsigned long ctor_flags; | 2197 | unsigned long ctor_flags; |
| 2169 | struct kmem_list3 *l3; | 2198 | struct kmem_list3 *l3; |
| 2170 | 2199 | ||
| 2171 | /* Be lazy and only check for valid flags here, | 2200 | /* Be lazy and only check for valid flags here, |
| 2172 | * keeping it out of the critical path in kmem_cache_alloc(). | 2201 | * keeping it out of the critical path in kmem_cache_alloc(). |
| 2173 | */ | 2202 | */ |
| 2174 | if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) | 2203 | if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) |
| 2175 | BUG(); | 2204 | BUG(); |
| 2176 | if (flags & SLAB_NO_GROW) | 2205 | if (flags & SLAB_NO_GROW) |
| 2177 | return 0; | 2206 | return 0; |
| @@ -2237,9 +2266,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) | |||
| 2237 | l3->free_objects += cachep->num; | 2266 | l3->free_objects += cachep->num; |
| 2238 | spin_unlock(&l3->list_lock); | 2267 | spin_unlock(&l3->list_lock); |
| 2239 | return 1; | 2268 | return 1; |
| 2240 | opps1: | 2269 | opps1: |
| 2241 | kmem_freepages(cachep, objp); | 2270 | kmem_freepages(cachep, objp); |
| 2242 | failed: | 2271 | failed: |
| 2243 | if (local_flags & __GFP_WAIT) | 2272 | if (local_flags & __GFP_WAIT) |
| 2244 | local_irq_disable(); | 2273 | local_irq_disable(); |
| 2245 | return 0; | 2274 | return 0; |
| @@ -2259,18 +2288,19 @@ static void kfree_debugcheck(const void *objp) | |||
| 2259 | 2288 | ||
| 2260 | if (!virt_addr_valid(objp)) { | 2289 | if (!virt_addr_valid(objp)) { |
| 2261 | printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", | 2290 | printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", |
| 2262 | (unsigned long)objp); | 2291 | (unsigned long)objp); |
| 2263 | BUG(); | 2292 | BUG(); |
| 2264 | } | 2293 | } |
| 2265 | page = virt_to_page(objp); | 2294 | page = virt_to_page(objp); |
| 2266 | if (!PageSlab(page)) { | 2295 | if (!PageSlab(page)) { |
| 2267 | printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); | 2296 | printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", |
| 2297 | (unsigned long)objp); | ||
| 2268 | BUG(); | 2298 | BUG(); |
| 2269 | } | 2299 | } |
| 2270 | } | 2300 | } |
| 2271 | 2301 | ||
| 2272 | static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, | 2302 | static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, |
| 2273 | void *caller) | 2303 | void *caller) |
| 2274 | { | 2304 | { |
| 2275 | struct page *page; | 2305 | struct page *page; |
| 2276 | unsigned int objnr; | 2306 | unsigned int objnr; |
| @@ -2281,20 +2311,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, | |||
| 2281 | page = virt_to_page(objp); | 2311 | page = virt_to_page(objp); |
| 2282 | 2312 | ||
| 2283 | if (page_get_cache(page) != cachep) { | 2313 | if (page_get_cache(page) != cachep) { |
| 2284 | printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", | 2314 | printk(KERN_ERR |
| 2285 | page_get_cache(page),cachep); | 2315 | "mismatch in kmem_cache_free: expected cache %p, got %p\n", |
| 2316 | page_get_cache(page), cachep); | ||
| 2286 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | 2317 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); |
| 2287 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name); | 2318 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), |
| 2319 | page_get_cache(page)->name); | ||
| 2288 | WARN_ON(1); | 2320 | WARN_ON(1); |
| 2289 | } | 2321 | } |
| 2290 | slabp = page_get_slab(page); | 2322 | slabp = page_get_slab(page); |
| 2291 | 2323 | ||
| 2292 | if (cachep->flags & SLAB_RED_ZONE) { | 2324 | if (cachep->flags & SLAB_RED_ZONE) { |
| 2293 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | 2325 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE |
| 2294 | slab_error(cachep, "double free, or memory outside" | 2326 | || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { |
| 2295 | " object was overwritten"); | 2327 | slab_error(cachep, |
| 2296 | printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | 2328 | "double free, or memory outside" |
| 2297 | objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); | 2329 | " object was overwritten"); |
| 2330 | printk(KERN_ERR | ||
| 2331 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
| 2332 | objp, *dbg_redzone1(cachep, objp), | ||
| 2333 | *dbg_redzone2(cachep, objp)); | ||
| 2298 | } | 2334 | } |
| 2299 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | 2335 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; |
| 2300 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2336 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
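The free-path debug code above compares the two red-zone words against RED_ACTIVE, reports a double free or overrun if either has changed, and then rewrites them as RED_INACTIVE. A self-contained sketch of that guard-word idea; the magic values and struct layout are invented for illustration, the kernel keeps its red zones in each object's debug header and footer:

    #include <stddef.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define GUARD_ACTIVE   0xd84156c5ul     /* illustrative magic values, not the kernel's */
    #define GUARD_INACTIVE 0x09f911e9ul

    struct guarded {
        unsigned long redzone1;
        char payload[32];
        unsigned long redzone2;
    };

    static struct guarded *guarded_alloc(void)
    {
        struct guarded *g = malloc(sizeof(*g));

        if (g) {
            g->redzone1 = GUARD_ACTIVE;
            g->redzone2 = GUARD_ACTIVE;
        }
        return g;
    }

    static void guarded_free(struct guarded *g)
    {
        if (g->redzone1 != GUARD_ACTIVE || g->redzone2 != GUARD_ACTIVE)
            fprintf(stderr, "double free, or memory outside object was overwritten\n");
        g->redzone1 = GUARD_INACTIVE;       /* mark the object as freed */
        g->redzone2 = GUARD_INACTIVE;
        free(g);
    }

    int main(void)
    {
        struct guarded *g = guarded_alloc();
        unsigned char *raw;

        if (!g)
            return 1;
        raw = (unsigned char *)g;
        raw[offsetof(struct guarded, redzone2)] ^= 0xff;    /* simulate a one-byte overrun */
        guarded_free(g);                                    /* the guard check fires */
        return 0;
    }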
| @@ -2302,30 +2338,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, | |||
| 2302 | if (cachep->flags & SLAB_STORE_USER) | 2338 | if (cachep->flags & SLAB_STORE_USER) |
| 2303 | *dbg_userword(cachep, objp) = caller; | 2339 | *dbg_userword(cachep, objp) = caller; |
| 2304 | 2340 | ||
| 2305 | objnr = (objp-slabp->s_mem)/cachep->objsize; | 2341 | objnr = (objp - slabp->s_mem) / cachep->objsize; |
| 2306 | 2342 | ||
| 2307 | BUG_ON(objnr >= cachep->num); | 2343 | BUG_ON(objnr >= cachep->num); |
| 2308 | BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); | 2344 | BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); |
| 2309 | 2345 | ||
| 2310 | if (cachep->flags & SLAB_DEBUG_INITIAL) { | 2346 | if (cachep->flags & SLAB_DEBUG_INITIAL) { |
| 2311 | /* Need to call the slab's constructor so the | 2347 | /* Need to call the slab's constructor so the |
| 2312 | * caller can perform a verify of its state (debugging). | 2348 | * caller can perform a verify of its state (debugging). |
| 2313 | * Called without the cache-lock held. | 2349 | * Called without the cache-lock held. |
| 2314 | */ | 2350 | */ |
| 2315 | cachep->ctor(objp+obj_dbghead(cachep), | 2351 | cachep->ctor(objp + obj_dbghead(cachep), |
| 2316 | cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); | 2352 | cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); |
| 2317 | } | 2353 | } |
| 2318 | if (cachep->flags & SLAB_POISON && cachep->dtor) { | 2354 | if (cachep->flags & SLAB_POISON && cachep->dtor) { |
| 2319 | /* we want to cache poison the object, | 2355 | /* we want to cache poison the object, |
| 2320 | * call the destruction callback | 2356 | * call the destruction callback |
| 2321 | */ | 2357 | */ |
| 2322 | cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); | 2358 | cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); |
| 2323 | } | 2359 | } |
| 2324 | if (cachep->flags & SLAB_POISON) { | 2360 | if (cachep->flags & SLAB_POISON) { |
| 2325 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2361 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 2326 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { | 2362 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { |
| 2327 | store_stackinfo(cachep, objp, (unsigned long)caller); | 2363 | store_stackinfo(cachep, objp, (unsigned long)caller); |
| 2328 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); | 2364 | kernel_map_pages(virt_to_page(objp), |
| 2365 | cachep->objsize / PAGE_SIZE, 0); | ||
| 2329 | } else { | 2366 | } else { |
| 2330 | poison_obj(cachep, objp, POISON_FREE); | 2367 | poison_obj(cachep, objp, POISON_FREE); |
| 2331 | } | 2368 | } |
| @@ -2340,7 +2377,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) | |||
| 2340 | { | 2377 | { |
| 2341 | kmem_bufctl_t i; | 2378 | kmem_bufctl_t i; |
| 2342 | int entries = 0; | 2379 | int entries = 0; |
| 2343 | 2380 | ||
| 2344 | /* Check slab's freelist to see if this obj is there. */ | 2381 | /* Check slab's freelist to see if this obj is there. */ |
| 2345 | for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { | 2382 | for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { |
| 2346 | entries++; | 2383 | entries++; |
| @@ -2348,13 +2385,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) | |||
| 2348 | goto bad; | 2385 | goto bad; |
| 2349 | } | 2386 | } |
| 2350 | if (entries != cachep->num - slabp->inuse) { | 2387 | if (entries != cachep->num - slabp->inuse) { |
| 2351 | bad: | 2388 | bad: |
| 2352 | printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", | 2389 | printk(KERN_ERR |
| 2353 | cachep->name, cachep->num, slabp, slabp->inuse); | 2390 | "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", |
| 2354 | for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { | 2391 | cachep->name, cachep->num, slabp, slabp->inuse); |
| 2355 | if ((i%16)==0) | 2392 | for (i = 0; |
| 2393 | i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t); | ||
| 2394 | i++) { | ||
| 2395 | if ((i % 16) == 0) | ||
| 2356 | printk("\n%03x:", i); | 2396 | printk("\n%03x:", i); |
| 2357 | printk(" %02x", ((unsigned char*)slabp)[i]); | 2397 | printk(" %02x", ((unsigned char *)slabp)[i]); |
| 2358 | } | 2398 | } |
| 2359 | printk("\n"); | 2399 | printk("\n"); |
| 2360 | BUG(); | 2400 | BUG(); |
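check_slabp()'s corruption report above walks the slab header plus its bufctl array and prints it sixteen bytes per row, prefixing each row with its offset. The same dump loop, pulled out into a standalone helper; the buffer contents below are arbitrary test data:

    #include <stdio.h>

    /* Dump 'len' bytes at 'buf', 16 per line, each line prefixed with its offset. */
    static void hexdump(const void *buf, size_t len)
    {
        const unsigned char *p = buf;
        size_t i;

        for (i = 0; i < len; i++) {
            if ((i % 16) == 0)
                printf("\n%03zx:", i);
            printf(" %02x", p[i]);
        }
        printf("\n");
    }

    int main(void)
    {
        unsigned char data[40];
        size_t i;

        for (i = 0; i < sizeof(data); i++)
            data[i] = (unsigned char)(i * 7);
        hexdump(data, sizeof(data));
        return 0;
    }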
| @@ -2374,7 +2414,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) | |||
| 2374 | 2414 | ||
| 2375 | check_irq_off(); | 2415 | check_irq_off(); |
| 2376 | ac = ac_data(cachep); | 2416 | ac = ac_data(cachep); |
| 2377 | retry: | 2417 | retry: |
| 2378 | batchcount = ac->batchcount; | 2418 | batchcount = ac->batchcount; |
| 2379 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 2419 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
| 2380 | /* if there was little recent activity on this | 2420 | /* if there was little recent activity on this |
| @@ -2396,8 +2436,8 @@ retry: | |||
| 2396 | shared_array->avail -= batchcount; | 2436 | shared_array->avail -= batchcount; |
| 2397 | ac->avail = batchcount; | 2437 | ac->avail = batchcount; |
| 2398 | memcpy(ac->entry, | 2438 | memcpy(ac->entry, |
| 2399 | &(shared_array->entry[shared_array->avail]), | 2439 | &(shared_array->entry[shared_array->avail]), |
| 2400 | sizeof(void*)*batchcount); | 2440 | sizeof(void *) * batchcount); |
| 2401 | shared_array->touched = 1; | 2441 | shared_array->touched = 1; |
| 2402 | goto alloc_done; | 2442 | goto alloc_done; |
| 2403 | } | 2443 | } |
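The refill path above grabs up to batchcount pointers in one go from the per-node shared array: it decrements shared_array->avail and memcpy()s that tail of entry[] into the per-CPU cache. A userspace sketch of that batched hand-off between two pointer stacks; the struct shape and sizes are illustrative, and the sketch appends to the destination rather than assuming it is empty as the kernel's refill does:

    #include <stdio.h>
    #include <string.h>

    #define CAP 16

    struct ptr_cache {
        unsigned int avail;
        void *entry[CAP];
    };

    /* Move up to 'batch' pointers from the tail of 'src' into 'dst'.
     * The caller guarantees 'dst' has room, as the kernel's limit field does. */
    static unsigned int transfer_batch(struct ptr_cache *dst,
                                       struct ptr_cache *src,
                                       unsigned int batch)
    {
        if (batch > src->avail)
            batch = src->avail;
        if (batch == 0)
            return 0;
        src->avail -= batch;
        memcpy(&dst->entry[dst->avail],
               &src->entry[src->avail],
               sizeof(void *) * batch);
        dst->avail += batch;
        return batch;
    }

    int main(void)
    {
        static int objs[8];
        struct ptr_cache shared = { 0 }, local = { 0 };
        unsigned int i, moved;

        for (i = 0; i < 8; i++)
            shared.entry[shared.avail++] = &objs[i];
        moved = transfer_batch(&local, &shared, 5);
        printf("moved %u, shared now %u, local now %u\n",
               moved, shared.avail, local.avail);
        return 0;
    }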
| @@ -2425,7 +2465,7 @@ retry: | |||
| 2425 | 2465 | ||
| 2426 | /* get obj pointer */ | 2466 | /* get obj pointer */ |
| 2427 | ac->entry[ac->avail++] = slabp->s_mem + | 2467 | ac->entry[ac->avail++] = slabp->s_mem + |
| 2428 | slabp->free*cachep->objsize; | 2468 | slabp->free * cachep->objsize; |
| 2429 | 2469 | ||
| 2430 | slabp->inuse++; | 2470 | slabp->inuse++; |
| 2431 | next = slab_bufctl(slabp)[slabp->free]; | 2471 | next = slab_bufctl(slabp)[slabp->free]; |
| @@ -2433,7 +2473,7 @@ retry: | |||
| 2433 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; | 2473 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; |
| 2434 | WARN_ON(numa_node_id() != slabp->nodeid); | 2474 | WARN_ON(numa_node_id() != slabp->nodeid); |
| 2435 | #endif | 2475 | #endif |
| 2436 | slabp->free = next; | 2476 | slabp->free = next; |
| 2437 | } | 2477 | } |
| 2438 | check_slabp(cachep, slabp); | 2478 | check_slabp(cachep, slabp); |
| 2439 | 2479 | ||
| @@ -2445,9 +2485,9 @@ retry: | |||
| 2445 | list_add(&slabp->list, &l3->slabs_partial); | 2485 | list_add(&slabp->list, &l3->slabs_partial); |
| 2446 | } | 2486 | } |
| 2447 | 2487 | ||
| 2448 | must_grow: | 2488 | must_grow: |
| 2449 | l3->free_objects -= ac->avail; | 2489 | l3->free_objects -= ac->avail; |
| 2450 | alloc_done: | 2490 | alloc_done: |
| 2451 | spin_unlock(&l3->list_lock); | 2491 | spin_unlock(&l3->list_lock); |
| 2452 | 2492 | ||
| 2453 | if (unlikely(!ac->avail)) { | 2493 | if (unlikely(!ac->avail)) { |
| @@ -2459,7 +2499,7 @@ alloc_done: | |||
| 2459 | if (!x && ac->avail == 0) // no objects in sight? abort | 2499 | if (!x && ac->avail == 0) // no objects in sight? abort |
| 2460 | return NULL; | 2500 | return NULL; |
| 2461 | 2501 | ||
| 2462 | if (!ac->avail) // objects refilled by interrupt? | 2502 | if (!ac->avail) // objects refilled by interrupt? |
| 2463 | goto retry; | 2503 | goto retry; |
| 2464 | } | 2504 | } |
| 2465 | ac->touched = 1; | 2505 | ac->touched = 1; |
| @@ -2476,16 +2516,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) | |||
| 2476 | } | 2516 | } |
| 2477 | 2517 | ||
| 2478 | #if DEBUG | 2518 | #if DEBUG |
| 2479 | static void * | 2519 | static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, |
| 2480 | cache_alloc_debugcheck_after(kmem_cache_t *cachep, | 2520 | void *objp, void *caller) |
| 2481 | gfp_t flags, void *objp, void *caller) | ||
| 2482 | { | 2521 | { |
| 2483 | if (!objp) | 2522 | if (!objp) |
| 2484 | return objp; | 2523 | return objp; |
| 2485 | if (cachep->flags & SLAB_POISON) { | 2524 | if (cachep->flags & SLAB_POISON) { |
| 2486 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2525 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 2487 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) | 2526 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) |
| 2488 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); | 2527 | kernel_map_pages(virt_to_page(objp), |
| 2528 | cachep->objsize / PAGE_SIZE, 1); | ||
| 2489 | else | 2529 | else |
| 2490 | check_poison_obj(cachep, objp); | 2530 | check_poison_obj(cachep, objp); |
| 2491 | #else | 2531 | #else |
| @@ -2497,24 +2537,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep, | |||
| 2497 | *dbg_userword(cachep, objp) = caller; | 2537 | *dbg_userword(cachep, objp) = caller; |
| 2498 | 2538 | ||
| 2499 | if (cachep->flags & SLAB_RED_ZONE) { | 2539 | if (cachep->flags & SLAB_RED_ZONE) { |
| 2500 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { | 2540 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE |
| 2501 | slab_error(cachep, "double free, or memory outside" | 2541 | || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { |
| 2502 | " object was overwritten"); | 2542 | slab_error(cachep, |
| 2503 | printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | 2543 | "double free, or memory outside" |
| 2504 | objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); | 2544 | " object was overwritten"); |
| 2545 | printk(KERN_ERR | ||
| 2546 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
| 2547 | objp, *dbg_redzone1(cachep, objp), | ||
| 2548 | *dbg_redzone2(cachep, objp)); | ||
| 2505 | } | 2549 | } |
| 2506 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | 2550 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; |
| 2507 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 2551 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
| 2508 | } | 2552 | } |
| 2509 | objp += obj_dbghead(cachep); | 2553 | objp += obj_dbghead(cachep); |
| 2510 | if (cachep->ctor && cachep->flags & SLAB_POISON) { | 2554 | if (cachep->ctor && cachep->flags & SLAB_POISON) { |
| 2511 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; | 2555 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; |
| 2512 | 2556 | ||
| 2513 | if (!(flags & __GFP_WAIT)) | 2557 | if (!(flags & __GFP_WAIT)) |
| 2514 | ctor_flags |= SLAB_CTOR_ATOMIC; | 2558 | ctor_flags |= SLAB_CTOR_ATOMIC; |
| 2515 | 2559 | ||
| 2516 | cachep->ctor(objp, cachep, ctor_flags); | 2560 | cachep->ctor(objp, cachep, ctor_flags); |
| 2517 | } | 2561 | } |
| 2518 | return objp; | 2562 | return objp; |
| 2519 | } | 2563 | } |
| 2520 | #else | 2564 | #else |
| @@ -2523,7 +2567,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep, | |||
| 2523 | 2567 | ||
| 2524 | static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) | 2568 | static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) |
| 2525 | { | 2569 | { |
| 2526 | void* objp; | 2570 | void *objp; |
| 2527 | struct array_cache *ac; | 2571 | struct array_cache *ac; |
| 2528 | 2572 | ||
| 2529 | check_irq_off(); | 2573 | check_irq_off(); |
| @@ -2542,7 +2586,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) | |||
| 2542 | static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) | 2586 | static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) |
| 2543 | { | 2587 | { |
| 2544 | unsigned long save_flags; | 2588 | unsigned long save_flags; |
| 2545 | void* objp; | 2589 | void *objp; |
| 2546 | 2590 | ||
| 2547 | cache_alloc_debugcheck_before(cachep, flags); | 2591 | cache_alloc_debugcheck_before(cachep, flags); |
| 2548 | 2592 | ||
| @@ -2550,7 +2594,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) | |||
| 2550 | objp = ____cache_alloc(cachep, flags); | 2594 | objp = ____cache_alloc(cachep, flags); |
| 2551 | local_irq_restore(save_flags); | 2595 | local_irq_restore(save_flags); |
| 2552 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 2596 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, |
| 2553 | __builtin_return_address(0)); | 2597 | __builtin_return_address(0)); |
| 2554 | prefetchw(objp); | 2598 | prefetchw(objp); |
| 2555 | return objp; | 2599 | return objp; |
| 2556 | } | 2600 | } |
| @@ -2562,74 +2606,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) | |||
| 2562 | static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) | 2606 | static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) |
| 2563 | { | 2607 | { |
| 2564 | struct list_head *entry; | 2608 | struct list_head *entry; |
| 2565 | struct slab *slabp; | 2609 | struct slab *slabp; |
| 2566 | struct kmem_list3 *l3; | 2610 | struct kmem_list3 *l3; |
| 2567 | void *obj; | 2611 | void *obj; |
| 2568 | kmem_bufctl_t next; | 2612 | kmem_bufctl_t next; |
| 2569 | int x; | 2613 | int x; |
| 2570 | 2614 | ||
| 2571 | l3 = cachep->nodelists[nodeid]; | 2615 | l3 = cachep->nodelists[nodeid]; |
| 2572 | BUG_ON(!l3); | 2616 | BUG_ON(!l3); |
| 2573 | 2617 | ||
| 2574 | retry: | 2618 | retry: |
| 2575 | spin_lock(&l3->list_lock); | 2619 | spin_lock(&l3->list_lock); |
| 2576 | entry = l3->slabs_partial.next; | 2620 | entry = l3->slabs_partial.next; |
| 2577 | if (entry == &l3->slabs_partial) { | 2621 | if (entry == &l3->slabs_partial) { |
| 2578 | l3->free_touched = 1; | 2622 | l3->free_touched = 1; |
| 2579 | entry = l3->slabs_free.next; | 2623 | entry = l3->slabs_free.next; |
| 2580 | if (entry == &l3->slabs_free) | 2624 | if (entry == &l3->slabs_free) |
| 2581 | goto must_grow; | 2625 | goto must_grow; |
| 2582 | } | 2626 | } |
| 2583 | 2627 | ||
| 2584 | slabp = list_entry(entry, struct slab, list); | 2628 | slabp = list_entry(entry, struct slab, list); |
| 2585 | check_spinlock_acquired_node(cachep, nodeid); | 2629 | check_spinlock_acquired_node(cachep, nodeid); |
| 2586 | check_slabp(cachep, slabp); | 2630 | check_slabp(cachep, slabp); |
| 2587 | 2631 | ||
| 2588 | STATS_INC_NODEALLOCS(cachep); | 2632 | STATS_INC_NODEALLOCS(cachep); |
| 2589 | STATS_INC_ACTIVE(cachep); | 2633 | STATS_INC_ACTIVE(cachep); |
| 2590 | STATS_SET_HIGH(cachep); | 2634 | STATS_SET_HIGH(cachep); |
| 2591 | 2635 | ||
| 2592 | BUG_ON(slabp->inuse == cachep->num); | 2636 | BUG_ON(slabp->inuse == cachep->num); |
| 2593 | 2637 | ||
| 2594 | /* get obj pointer */ | 2638 | /* get obj pointer */ |
| 2595 | obj = slabp->s_mem + slabp->free*cachep->objsize; | 2639 | obj = slabp->s_mem + slabp->free * cachep->objsize; |
| 2596 | slabp->inuse++; | 2640 | slabp->inuse++; |
| 2597 | next = slab_bufctl(slabp)[slabp->free]; | 2641 | next = slab_bufctl(slabp)[slabp->free]; |
| 2598 | #if DEBUG | 2642 | #if DEBUG |
| 2599 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; | 2643 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; |
| 2600 | #endif | 2644 | #endif |
| 2601 | slabp->free = next; | 2645 | slabp->free = next; |
| 2602 | check_slabp(cachep, slabp); | 2646 | check_slabp(cachep, slabp); |
| 2603 | l3->free_objects--; | 2647 | l3->free_objects--; |
| 2604 | /* move slabp to correct slabp list: */ | 2648 | /* move slabp to correct slabp list: */ |
| 2605 | list_del(&slabp->list); | 2649 | list_del(&slabp->list); |
| 2606 | 2650 | ||
| 2607 | if (slabp->free == BUFCTL_END) { | 2651 | if (slabp->free == BUFCTL_END) { |
| 2608 | list_add(&slabp->list, &l3->slabs_full); | 2652 | list_add(&slabp->list, &l3->slabs_full); |
| 2609 | } else { | 2653 | } else { |
| 2610 | list_add(&slabp->list, &l3->slabs_partial); | 2654 | list_add(&slabp->list, &l3->slabs_partial); |
| 2611 | } | 2655 | } |
| 2612 | 2656 | ||
| 2613 | spin_unlock(&l3->list_lock); | 2657 | spin_unlock(&l3->list_lock); |
| 2614 | goto done; | 2658 | goto done; |
| 2615 | 2659 | ||
| 2616 | must_grow: | 2660 | must_grow: |
| 2617 | spin_unlock(&l3->list_lock); | 2661 | spin_unlock(&l3->list_lock); |
| 2618 | x = cache_grow(cachep, flags, nodeid); | 2662 | x = cache_grow(cachep, flags, nodeid); |
| 2619 | 2663 | ||
| 2620 | if (!x) | 2664 | if (!x) |
| 2621 | return NULL; | 2665 | return NULL; |
| 2622 | 2666 | ||
| 2623 | goto retry; | 2667 | goto retry; |
| 2624 | done: | 2668 | done: |
| 2625 | return obj; | 2669 | return obj; |
| 2626 | } | 2670 | } |
| 2627 | #endif | 2671 | #endif |
| 2628 | 2672 | ||
| 2629 | /* | 2673 | /* |
| 2630 | * Caller needs to acquire correct kmem_list's list_lock | 2674 | * Caller needs to acquire correct kmem_list's list_lock |
| 2631 | */ | 2675 | */ |
| 2632 | static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) | 2676 | static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, |
| 2677 | int node) | ||
| 2633 | { | 2678 | { |
| 2634 | int i; | 2679 | int i; |
| 2635 | struct kmem_list3 *l3; | 2680 | struct kmem_list3 *l3; |
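Both allocation paths above compute an object's address as s_mem + index * objsize, and the free-path debug check earlier inverts it with objnr = (objp - s_mem) / objsize. A tiny sketch of that two-way mapping between object index and object pointer inside one slab; the sizes are arbitrary:

    #include <assert.h>
    #include <stdio.h>

    #define OBJ_SIZE 64
    #define NUM_OBJS 8

    static unsigned char slab_mem[OBJ_SIZE * NUM_OBJS];

    /* index -> object pointer, as in cache_alloc_refill()/__cache_alloc_node() */
    static void *index_to_obj(unsigned int idx)
    {
        return slab_mem + (size_t)idx * OBJ_SIZE;
    }

    /* object pointer -> index, as in cache_free_debugcheck() */
    static unsigned int obj_to_index(const void *objp)
    {
        return (unsigned int)(((const unsigned char *)objp - slab_mem) / OBJ_SIZE);
    }

    int main(void)
    {
        unsigned int i;

        for (i = 0; i < NUM_OBJS; i++)
            assert(obj_to_index(index_to_obj(i)) == i);
        printf("obj 3 lives at offset %zu\n",
               (size_t)((unsigned char *)index_to_obj(3) - slab_mem));
        return 0;
    }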
| @@ -2652,7 +2697,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n | |||
| 2652 | 2697 | ||
| 2653 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { | 2698 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { |
| 2654 | printk(KERN_ERR "slab: double free detected in cache " | 2699 | printk(KERN_ERR "slab: double free detected in cache " |
| 2655 | "'%s', objp %p\n", cachep->name, objp); | 2700 | "'%s', objp %p\n", cachep->name, objp); |
| 2656 | BUG(); | 2701 | BUG(); |
| 2657 | } | 2702 | } |
| 2658 | #endif | 2703 | #endif |
| @@ -2696,20 +2741,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) | |||
| 2696 | spin_lock(&l3->list_lock); | 2741 | spin_lock(&l3->list_lock); |
| 2697 | if (l3->shared) { | 2742 | if (l3->shared) { |
| 2698 | struct array_cache *shared_array = l3->shared; | 2743 | struct array_cache *shared_array = l3->shared; |
| 2699 | int max = shared_array->limit-shared_array->avail; | 2744 | int max = shared_array->limit - shared_array->avail; |
| 2700 | if (max) { | 2745 | if (max) { |
| 2701 | if (batchcount > max) | 2746 | if (batchcount > max) |
| 2702 | batchcount = max; | 2747 | batchcount = max; |
| 2703 | memcpy(&(shared_array->entry[shared_array->avail]), | 2748 | memcpy(&(shared_array->entry[shared_array->avail]), |
| 2704 | ac->entry, | 2749 | ac->entry, sizeof(void *) * batchcount); |
| 2705 | sizeof(void*)*batchcount); | ||
| 2706 | shared_array->avail += batchcount; | 2750 | shared_array->avail += batchcount; |
| 2707 | goto free_done; | 2751 | goto free_done; |
| 2708 | } | 2752 | } |
| 2709 | } | 2753 | } |
| 2710 | 2754 | ||
| 2711 | free_block(cachep, ac->entry, batchcount, node); | 2755 | free_block(cachep, ac->entry, batchcount, node); |
| 2712 | free_done: | 2756 | free_done: |
| 2713 | #if STATS | 2757 | #if STATS |
| 2714 | { | 2758 | { |
| 2715 | int i = 0; | 2759 | int i = 0; |
| @@ -2731,10 +2775,9 @@ free_done: | |||
| 2731 | spin_unlock(&l3->list_lock); | 2775 | spin_unlock(&l3->list_lock); |
| 2732 | ac->avail -= batchcount; | 2776 | ac->avail -= batchcount; |
| 2733 | memmove(ac->entry, &(ac->entry[batchcount]), | 2777 | memmove(ac->entry, &(ac->entry[batchcount]), |
| 2734 | sizeof(void*)*ac->avail); | 2778 | sizeof(void *) * ac->avail); |
| 2735 | } | 2779 | } |
| 2736 | 2780 | ||
| 2737 | |||
| 2738 | /* | 2781 | /* |
| 2739 | * __cache_free | 2782 | * __cache_free |
| 2740 | * Release an obj back to its cache. If the obj has a constructed | 2783 | * Release an obj back to its cache. If the obj has a constructed |
| @@ -2759,7 +2802,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) | |||
| 2759 | if (unlikely(slabp->nodeid != numa_node_id())) { | 2802 | if (unlikely(slabp->nodeid != numa_node_id())) { |
| 2760 | struct array_cache *alien = NULL; | 2803 | struct array_cache *alien = NULL; |
| 2761 | int nodeid = slabp->nodeid; | 2804 | int nodeid = slabp->nodeid; |
| 2762 | struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; | 2805 | struct kmem_list3 *l3 = |
| 2806 | cachep->nodelists[numa_node_id()]; | ||
| 2763 | 2807 | ||
| 2764 | STATS_INC_NODEFREES(cachep); | 2808 | STATS_INC_NODEFREES(cachep); |
| 2765 | if (l3->alien && l3->alien[nodeid]) { | 2809 | if (l3->alien && l3->alien[nodeid]) { |
| @@ -2767,15 +2811,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) | |||
| 2767 | spin_lock(&alien->lock); | 2811 | spin_lock(&alien->lock); |
| 2768 | if (unlikely(alien->avail == alien->limit)) | 2812 | if (unlikely(alien->avail == alien->limit)) |
| 2769 | __drain_alien_cache(cachep, | 2813 | __drain_alien_cache(cachep, |
| 2770 | alien, nodeid); | 2814 | alien, nodeid); |
| 2771 | alien->entry[alien->avail++] = objp; | 2815 | alien->entry[alien->avail++] = objp; |
| 2772 | spin_unlock(&alien->lock); | 2816 | spin_unlock(&alien->lock); |
| 2773 | } else { | 2817 | } else { |
| 2774 | spin_lock(&(cachep->nodelists[nodeid])-> | 2818 | spin_lock(&(cachep->nodelists[nodeid])-> |
| 2775 | list_lock); | 2819 | list_lock); |
| 2776 | free_block(cachep, &objp, 1, nodeid); | 2820 | free_block(cachep, &objp, 1, nodeid); |
| 2777 | spin_unlock(&(cachep->nodelists[nodeid])-> | 2821 | spin_unlock(&(cachep->nodelists[nodeid])-> |
| 2778 | list_lock); | 2822 | list_lock); |
| 2779 | } | 2823 | } |
| 2780 | return; | 2824 | return; |
| 2781 | } | 2825 | } |
| @@ -2822,9 +2866,9 @@ EXPORT_SYMBOL(kmem_cache_alloc); | |||
| 2822 | */ | 2866 | */ |
| 2823 | int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) | 2867 | int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) |
| 2824 | { | 2868 | { |
| 2825 | unsigned long addr = (unsigned long) ptr; | 2869 | unsigned long addr = (unsigned long)ptr; |
| 2826 | unsigned long min_addr = PAGE_OFFSET; | 2870 | unsigned long min_addr = PAGE_OFFSET; |
| 2827 | unsigned long align_mask = BYTES_PER_WORD-1; | 2871 | unsigned long align_mask = BYTES_PER_WORD - 1; |
| 2828 | unsigned long size = cachep->objsize; | 2872 | unsigned long size = cachep->objsize; |
| 2829 | struct page *page; | 2873 | struct page *page; |
| 2830 | 2874 | ||
| @@ -2844,7 +2888,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) | |||
| 2844 | if (unlikely(page_get_cache(page) != cachep)) | 2888 | if (unlikely(page_get_cache(page) != cachep)) |
| 2845 | goto out; | 2889 | goto out; |
| 2846 | return 1; | 2890 | return 1; |
| 2847 | out: | 2891 | out: |
| 2848 | return 0; | 2892 | return 0; |
| 2849 | } | 2893 | } |
| 2850 | 2894 | ||
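kmem_ptr_validate() above sets up a minimum address, a word-alignment mask and the object size, and (in the elided middle of this hunk) uses them to reject implausible pointers before the final check that the backing page belongs to this cache. A userspace sketch of the cheap range-and-alignment style of test; the bounds and the exact order of checks here are illustrative guesses, and the page-ownership check has no userspace analogue:

    #include <stdint.h>
    #include <stdio.h>

    #define WORD_BYTES sizeof(void *)

    /* Illustrative bounds; the kernel compares against PAGE_OFFSET and high memory. */
    static int ptr_plausible(const void *ptr, uintptr_t min_addr,
                             uintptr_t max_addr, size_t objsize)
    {
        uintptr_t addr = (uintptr_t)ptr;
        uintptr_t align_mask = WORD_BYTES - 1;

        if (addr < min_addr)
            return 0;
        if (addr & align_mask)
            return 0;                   /* not word aligned */
        if (addr + objsize > max_addr)
            return 0;                   /* object would run past the region */
        return 1;
    }

    int main(void)
    {
        static long region[64];
        uintptr_t lo = (uintptr_t)region;
        uintptr_t hi = (uintptr_t)(region + 64);

        printf("%d %d\n",
               ptr_plausible(&region[3], lo, hi, sizeof(long)),               /* 1 */
               ptr_plausible((char *)&region[3] + 1, lo, hi, sizeof(long)));  /* 0 */
        return 0;
    }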
| @@ -2871,8 +2915,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) | |||
| 2871 | 2915 | ||
| 2872 | if (unlikely(!cachep->nodelists[nodeid])) { | 2916 | if (unlikely(!cachep->nodelists[nodeid])) { |
| 2873 | /* Fall back to __cache_alloc if we run into trouble */ | 2917 | /* Fall back to __cache_alloc if we run into trouble */ |
| 2874 | printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); | 2918 | printk(KERN_WARNING |
| 2875 | return __cache_alloc(cachep,flags); | 2919 | "slab: not allocating in inactive node %d for cache %s\n", |
| 2920 | nodeid, cachep->name); | ||
| 2921 | return __cache_alloc(cachep, flags); | ||
| 2876 | } | 2922 | } |
| 2877 | 2923 | ||
| 2878 | cache_alloc_debugcheck_before(cachep, flags); | 2924 | cache_alloc_debugcheck_before(cachep, flags); |
| @@ -2882,7 +2928,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) | |||
| 2882 | else | 2928 | else |
| 2883 | ptr = __cache_alloc_node(cachep, flags, nodeid); | 2929 | ptr = __cache_alloc_node(cachep, flags, nodeid); |
| 2884 | local_irq_restore(save_flags); | 2930 | local_irq_restore(save_flags); |
| 2885 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); | 2931 | ptr = |
| 2932 | cache_alloc_debugcheck_after(cachep, flags, ptr, | ||
| 2933 | __builtin_return_address(0)); | ||
| 2886 | 2934 | ||
| 2887 | return ptr; | 2935 | return ptr; |
| 2888 | } | 2936 | } |
| @@ -2944,12 +2992,11 @@ EXPORT_SYMBOL(__kmalloc); | |||
| 2944 | * Objects should be dereferenced using the per_cpu_ptr macro only. | 2992 | * Objects should be dereferenced using the per_cpu_ptr macro only. |
| 2945 | * | 2993 | * |
| 2946 | * @size: how many bytes of memory are required. | 2994 | * @size: how many bytes of memory are required. |
| 2947 | * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. | ||
| 2948 | */ | 2995 | */ |
| 2949 | void *__alloc_percpu(size_t size, size_t align) | 2996 | void *__alloc_percpu(size_t size) |
| 2950 | { | 2997 | { |
| 2951 | int i; | 2998 | int i; |
| 2952 | struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); | 2999 | struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); |
| 2953 | 3000 | ||
| 2954 | if (!pdata) | 3001 | if (!pdata) |
| 2955 | return NULL; | 3002 | return NULL; |
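__alloc_percpu(), whose align argument is dropped by this hunk, kmallocs a percpu_data descriptor and then, in code outside the lines shown here, fills pdata->ptrs[] with one object per possible CPU; as the surviving comment says, callers reach those objects only through the per_cpu_ptr macro. A userspace sketch of that one-block-per-CPU layout; the CPU count, struct shape and function name are illustrative:

    #include <stdio.h>
    #include <stdlib.h>

    #define NR_CPUS 4               /* illustrative; the kernel walks all possible CPUs */

    struct percpu_data {
        void *ptrs[NR_CPUS];
    };

    static struct percpu_data *alloc_percpu_sketch(size_t size)
    {
        struct percpu_data *pdata = malloc(sizeof(*pdata));
        int i;

        if (!pdata)
            return NULL;
        for (i = 0; i < NR_CPUS; i++) {
            pdata->ptrs[i] = calloc(1, size);   /* one zeroed object per CPU */
            if (!pdata->ptrs[i])
                goto unwind;
        }
        return pdata;

    unwind:                                     /* mirrors the unwind_oom label */
        while (--i >= 0)
            free(pdata->ptrs[i]);
        free(pdata);
        return NULL;
    }

    int main(void)
    {
        struct percpu_data *counters = alloc_percpu_sketch(sizeof(long));
        int i;

        if (!counters)
            return 1;
        *(long *)counters->ptrs[2] += 1;        /* bump "CPU 2's" private copy */
        printf("cpu2 counter = %ld\n", *(long *)counters->ptrs[2]);
        for (i = 0; i < NR_CPUS; i++)
            free(counters->ptrs[i]);
        free(counters);
        return 0;
    }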
| @@ -2973,9 +3020,9 @@ void *__alloc_percpu(size_t size, size_t align) | |||
| 2973 | } | 3020 | } |
| 2974 | 3021 | ||
| 2975 | /* Catch derefs w/o wrappers */ | 3022 | /* Catch derefs w/o wrappers */ |
| 2976 | return (void *) (~(unsigned long) pdata); | 3023 | return (void *)(~(unsigned long)pdata); |
| 2977 | 3024 | ||
| 2978 | unwind_oom: | 3025 | unwind_oom: |
| 2979 | while (--i >= 0) { | 3026 | while (--i >= 0) { |
| 2980 | if (!cpu_possible(i)) | 3027 | if (!cpu_possible(i)) |
| 2981 | continue; | 3028 | continue; |
| @@ -3006,20 +3053,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp) | |||
| 3006 | EXPORT_SYMBOL(kmem_cache_free); | 3053 | EXPORT_SYMBOL(kmem_cache_free); |
| 3007 | 3054 | ||
| 3008 | /** | 3055 | /** |
| 3009 | * kzalloc - allocate memory. The memory is set to zero. | ||
| 3010 | * @size: how many bytes of memory are required. | ||
| 3011 | * @flags: the type of memory to allocate. | ||
| 3012 | */ | ||
| 3013 | void *kzalloc(size_t size, gfp_t flags) | ||
| 3014 | { | ||
| 3015 | void *ret = kmalloc(size, flags); | ||
| 3016 | if (ret) | ||
| 3017 | memset(ret, 0, size); | ||
| 3018 | return ret; | ||
| 3019 | } | ||
| 3020 | EXPORT_SYMBOL(kzalloc); | ||
| 3021 | |||
| 3022 | /** | ||
| 3023 | * kfree - free previously allocated memory | 3056 | * kfree - free previously allocated memory |
| 3024 | * @objp: pointer returned by kmalloc. | 3057 | * @objp: pointer returned by kmalloc. |
| 3025 | * | 3058 | * |
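This hunk removes the kzalloc() definition from slab.c; the deleted lines show it was nothing more than kmalloc() followed by memset(..., 0, size). A userspace analogue of the same allocate-and-zero helper, for reference; zmalloc is an invented name, not a kernel API, and plain C already offers the same contract via calloc():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Userspace analogue of the removed kzalloc(): allocate, then zero. */
    static void *zmalloc(size_t size)
    {
        void *ret = malloc(size);

        if (ret)
            memset(ret, 0, size);
        return ret;
    }

    int main(void)
    {
        unsigned char *buf = zmalloc(16);
        size_t i, sum = 0;

        if (!buf)
            return 1;
        for (i = 0; i < 16; i++)
            sum += buf[i];
        printf("sum of fresh buffer = %zu\n", sum);     /* always 0 */
        free(buf);
        return 0;
    }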
| @@ -3038,7 +3071,8 @@ void kfree(const void *objp) | |||
| 3038 | local_irq_save(flags); | 3071 | local_irq_save(flags); |
| 3039 | kfree_debugcheck(objp); | 3072 | kfree_debugcheck(objp); |
| 3040 | c = page_get_cache(virt_to_page(objp)); | 3073 | c = page_get_cache(virt_to_page(objp)); |
| 3041 | __cache_free(c, (void*)objp); | 3074 | mutex_debug_check_no_locks_freed(objp, obj_reallen(c)); |
| 3075 | __cache_free(c, (void *)objp); | ||
| 3042 | local_irq_restore(flags); | 3076 | local_irq_restore(flags); |
| 3043 | } | 3077 | } |
| 3044 | EXPORT_SYMBOL(kfree); | 3078 | EXPORT_SYMBOL(kfree); |
| @@ -3051,17 +3085,16 @@ EXPORT_SYMBOL(kfree); | |||
| 3051 | * Don't free memory not originally allocated by alloc_percpu() | 3085 | * Don't free memory not originally allocated by alloc_percpu() |
| 3052 | * The complemented objp is to check for that. | 3086 | * The complemented objp is to check for that. |
| 3053 | */ | 3087 | */ |
| 3054 | void | 3088 | void free_percpu(const void *objp) |
| 3055 | free_percpu(const void *objp) | ||
| 3056 | { | 3089 | { |
| 3057 | int i; | 3090 | int i; |
| 3058 | struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); | 3091 | struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); |
| 3059 | 3092 | ||
| 3060 | /* | 3093 | /* |
| 3061 | * We allocate for all cpus so we cannot use for online cpu here. | 3094 | * We allocate for all cpus so we cannot use for online cpu here. |
| 3062 | */ | 3095 | */ |
| 3063 | for_each_cpu(i) | 3096 | for_each_cpu(i) |
| 3064 | kfree(p->ptrs[i]); | 3097 | kfree(p->ptrs[i]); |
| 3065 | kfree(p); | 3098 | kfree(p); |
| 3066 | } | 3099 | } |
| 3067 | EXPORT_SYMBOL(free_percpu); | 3100 | EXPORT_SYMBOL(free_percpu); |
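free_percpu() recovers the descriptor by complementing the handle again, (struct percpu_data *)(~(unsigned long)objp), the counterpart of the ~pdata returned by __alloc_percpu() above; as the "Catch derefs w/o wrappers" comment notes, the complemented handle is not a dereferenceable address, which forces callers through the accessor macros. A tiny sketch of that encode/decode round trip; uintptr_t stands in for the kernel's unsigned long cast:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hide a pointer by bitwise complement; only the matching decode gets it back. */
    static void *encode_handle(void *p)
    {
        return (void *)(~(uintptr_t)p);
    }

    static void *decode_handle(const void *handle)
    {
        return (void *)(~(uintptr_t)handle);
    }

    int main(void)
    {
        int object = 42;
        void *handle = encode_handle(&object);

        /* The handle itself is not a usable address... */
        printf("object at %p, handle %p\n", (void *)&object, handle);

        /* ...but complementing it again recovers the original pointer. */
        assert(decode_handle(handle) == (void *)&object);
        printf("decoded value = %d\n", *(int *)decode_handle(handle));
        return 0;
    }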
| @@ -3095,44 +3128,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep) | |||
| 3095 | if (!(new_alien = alloc_alien_cache(node, cachep->limit))) | 3128 | if (!(new_alien = alloc_alien_cache(node, cachep->limit))) |
| 3096 | goto fail; | 3129 | goto fail; |
| 3097 | #endif | 3130 | #endif |
| 3098 | if (!(new = alloc_arraycache(node, (cachep->shared* | 3131 | if (!(new = alloc_arraycache(node, (cachep->shared * |
| 3099 | cachep->batchcount), 0xbaadf00d))) | 3132 | cachep->batchcount), |
| 3133 | 0xbaadf00d))) | ||
| 3100 | goto fail; | 3134 | goto fail; |
| 3101 | if ((l3 = cachep->nodelists[node])) { | 3135 | if ((l3 = cachep->nodelists[node])) { |
| 3102 | 3136 | ||
| 3103 | spin_lock_irq(&l3->list_lock); | 3137 | spin_lock_irq(&l3->list_lock); |
| 3104 | 3138 | ||
| 3105 | if ((nc = cachep->nodelists[node]->shared)) | 3139 | if ((nc = cachep->nodelists[node]->shared)) |
| 3106 | free_block(cachep, nc->entry, | 3140 | free_block(cachep, nc->entry, nc->avail, node); |
| 3107 | nc->avail, node); | ||
| 3108 | 3141 | ||
| 3109 | l3->shared = new; | 3142 | l3->shared = new; |
| 3110 | if (!cachep->nodelists[node]->alien) { | 3143 | if (!cachep->nodelists[node]->alien) { |
| 3111 | l3->alien = new_alien; | 3144 | l3->alien = new_alien; |
| 3112 | new_alien = NULL; | 3145 | new_alien = NULL; |
| 3113 | } | 3146 | } |
| 3114 | l3->free_limit = (1 + nr_cpus_node(node))* | 3147 | l3->free_limit = (1 + nr_cpus_node(node)) * |
| 3115 | cachep->batchcount + cachep->num; | 3148 | cachep->batchcount + cachep->num; |
| 3116 | spin_unlock_irq(&l3->list_lock); | 3149 | spin_unlock_irq(&l3->list_lock); |
| 3117 | kfree(nc); | 3150 | kfree(nc); |
| 3118 | free_alien_cache(new_alien); | 3151 | free_alien_cache(new_alien); |
| 3119 | continue; | 3152 | continue; |
| 3120 | } | 3153 | } |
| 3121 | if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), | 3154 | if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), |
| 3122 | GFP_KERNEL, node))) | 3155 | GFP_KERNEL, node))) |
| 3123 | goto fail; | 3156 | goto fail; |
| 3124 | 3157 | ||
| 3125 | kmem_list3_init(l3); | 3158 | kmem_list3_init(l3); |
| 3126 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 3159 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
| 3127 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | 3160 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
| 3128 | l3->shared = new; | 3161 | l3->shared = new; |
| 3129 | l3->alien = new_alien; | 3162 | l3->alien = new_alien; |
| 3130 | l3->free_limit = (1 + nr_cpus_node(node))* | 3163 | l3->free_limit = (1 + nr_cpus_node(node)) * |
| 3131 | cachep->batchcount + cachep->num; | 3164 | cachep->batchcount + cachep->num; |
| 3132 | cachep->nodelists[node] = l3; | 3165 | cachep->nodelists[node] = l3; |
| 3133 | } | 3166 | } |
| 3134 | return err; | 3167 | return err; |
| 3135 | fail: | 3168 | fail: |
| 3136 | err = -ENOMEM; | 3169 | err = -ENOMEM; |
| 3137 | return err; | 3170 | return err; |
| 3138 | } | 3171 | } |
| @@ -3154,18 +3187,19 @@ static void do_ccupdate_local(void *info) | |||
| 3154 | new->new[smp_processor_id()] = old; | 3187 | new->new[smp_processor_id()] = old; |
| 3155 | } | 3188 | } |
| 3156 | 3189 | ||
| 3157 | |||
| 3158 | static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, | 3190 | static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, |
| 3159 | int shared) | 3191 | int shared) |
| 3160 | { | 3192 | { |
| 3161 | struct ccupdate_struct new; | 3193 | struct ccupdate_struct new; |
| 3162 | int i, err; | 3194 | int i, err; |
| 3163 | 3195 | ||
| 3164 | memset(&new.new,0,sizeof(new.new)); | 3196 | memset(&new.new, 0, sizeof(new.new)); |
| 3165 | for_each_online_cpu(i) { | 3197 | for_each_online_cpu(i) { |
| 3166 | new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); | 3198 | new.new[i] = |
| 3199 | alloc_arraycache(cpu_to_node(i), limit, batchcount); | ||
| 3167 | if (!new.new[i]) { | 3200 | if (!new.new[i]) { |
| 3168 | for (i--; i >= 0; i--) kfree(new.new[i]); | 3201 | for (i--; i >= 0; i--) |
| 3202 | kfree(new.new[i]); | ||
| 3169 | return -ENOMEM; | 3203 | return -ENOMEM; |
| 3170 | } | 3204 | } |
| 3171 | } | 3205 | } |
| @@ -3193,13 +3227,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, | |||
| 3193 | err = alloc_kmemlist(cachep); | 3227 | err = alloc_kmemlist(cachep); |
| 3194 | if (err) { | 3228 | if (err) { |
| 3195 | printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", | 3229 | printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", |
| 3196 | cachep->name, -err); | 3230 | cachep->name, -err); |
| 3197 | BUG(); | 3231 | BUG(); |
| 3198 | } | 3232 | } |
| 3199 | return 0; | 3233 | return 0; |
| 3200 | } | 3234 | } |
| 3201 | 3235 | ||
| 3202 | |||
| 3203 | static void enable_cpucache(kmem_cache_t *cachep) | 3236 | static void enable_cpucache(kmem_cache_t *cachep) |
| 3204 | { | 3237 | { |
| 3205 | int err; | 3238 | int err; |
| @@ -3246,14 +3279,14 @@ static void enable_cpucache(kmem_cache_t *cachep) | |||
| 3246 | if (limit > 32) | 3279 | if (limit > 32) |
| 3247 | limit = 32; | 3280 | limit = 32; |
| 3248 | #endif | 3281 | #endif |
| 3249 | err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); | 3282 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); |
| 3250 | if (err) | 3283 | if (err) |
| 3251 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 3284 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
| 3252 | cachep->name, -err); | 3285 | cachep->name, -err); |
| 3253 | } | 3286 | } |
| 3254 | 3287 | ||
| 3255 | static void drain_array_locked(kmem_cache_t *cachep, | 3288 | static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, |
| 3256 | struct array_cache *ac, int force, int node) | 3289 | int force, int node) |
| 3257 | { | 3290 | { |
| 3258 | int tofree; | 3291 | int tofree; |
| 3259 | 3292 | ||
| @@ -3261,14 +3294,14 @@ static void drain_array_locked(kmem_cache_t *cachep, | |||
| 3261 | if (ac->touched && !force) { | 3294 | if (ac->touched && !force) { |
| 3262 | ac->touched = 0; | 3295 | ac->touched = 0; |
| 3263 | } else if (ac->avail) { | 3296 | } else if (ac->avail) { |
| 3264 | tofree = force ? ac->avail : (ac->limit+4)/5; | 3297 | tofree = force ? ac->avail : (ac->limit + 4) / 5; |
| 3265 | if (tofree > ac->avail) { | 3298 | if (tofree > ac->avail) { |
| 3266 | tofree = (ac->avail+1)/2; | 3299 | tofree = (ac->avail + 1) / 2; |
| 3267 | } | 3300 | } |
| 3268 | free_block(cachep, ac->entry, tofree, node); | 3301 | free_block(cachep, ac->entry, tofree, node); |
| 3269 | ac->avail -= tofree; | 3302 | ac->avail -= tofree; |
| 3270 | memmove(ac->entry, &(ac->entry[tofree]), | 3303 | memmove(ac->entry, &(ac->entry[tofree]), |
| 3271 | sizeof(void*)*ac->avail); | 3304 | sizeof(void *) * ac->avail); |
| 3272 | } | 3305 | } |
| 3273 | } | 3306 | } |
| 3274 | 3307 | ||
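drain_array_locked() above frees roughly a fifth of the array's limit per pass, (limit + 4) / 5 rounded up, caps that at about half of what is actually available, and then memmove()s the surviving pointers down to the start of entry[]. A standalone sketch of that trim-from-the-front step; the struct shape and capacity are illustrative:

    #include <stdio.h>
    #include <string.h>

    #define CAP 12

    struct ptr_cache {
        unsigned int avail;
        unsigned int limit;
        void *entry[CAP];
    };

    /* Trim roughly a fifth of 'limit' (everything if 'force'), from the front. */
    static unsigned int drain_sketch(struct ptr_cache *ac, int force)
    {
        unsigned int tofree;

        if (!ac->avail)
            return 0;
        tofree = force ? ac->avail : (ac->limit + 4) / 5;
        if (tofree > ac->avail)
            tofree = (ac->avail + 1) / 2;
        /* The kernel hands these 'tofree' pointers to free_block() here. */
        ac->avail -= tofree;
        memmove(ac->entry, &ac->entry[tofree], sizeof(void *) * ac->avail);
        return tofree;
    }

    int main(void)
    {
        static int objs[10];
        struct ptr_cache ac = { 10, 12, { 0 } };
        unsigned int i, freed;

        for (i = 0; i < 10; i++)
            ac.entry[i] = &objs[i];
        freed = drain_sketch(&ac, 0);
        printf("freed %u, %u left, new head is objs[%td]\n",
               freed, ac.avail, (int *)ac.entry[0] - objs);
        return 0;
    }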
| @@ -3291,13 +3324,14 @@ static void cache_reap(void *unused) | |||
| 3291 | 3324 | ||
| 3292 | if (down_trylock(&cache_chain_sem)) { | 3325 | if (down_trylock(&cache_chain_sem)) { |
| 3293 | /* Give up. Setup the next iteration. */ | 3326 | /* Give up. Setup the next iteration. */ |
| 3294 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3327 | schedule_delayed_work(&__get_cpu_var(reap_work), |
| 3328 | REAPTIMEOUT_CPUC); | ||
| 3295 | return; | 3329 | return; |
| 3296 | } | 3330 | } |
| 3297 | 3331 | ||
| 3298 | list_for_each(walk, &cache_chain) { | 3332 | list_for_each(walk, &cache_chain) { |
| 3299 | kmem_cache_t *searchp; | 3333 | kmem_cache_t *searchp; |
| 3300 | struct list_head* p; | 3334 | struct list_head *p; |
| 3301 | int tofree; | 3335 | int tofree; |
| 3302 | struct slab *slabp; | 3336 | struct slab *slabp; |
| 3303 | 3337 | ||
| @@ -3314,7 +3348,7 @@ static void cache_reap(void *unused) | |||
| 3314 | spin_lock_irq(&l3->list_lock); | 3348 | spin_lock_irq(&l3->list_lock); |
| 3315 | 3349 | ||
| 3316 | drain_array_locked(searchp, ac_data(searchp), 0, | 3350 | drain_array_locked(searchp, ac_data(searchp), 0, |
| 3317 | numa_node_id()); | 3351 | numa_node_id()); |
| 3318 | 3352 | ||
| 3319 | if (time_after(l3->next_reap, jiffies)) | 3353 | if (time_after(l3->next_reap, jiffies)) |
| 3320 | goto next_unlock; | 3354 | goto next_unlock; |
| @@ -3323,14 +3357,16 @@ static void cache_reap(void *unused) | |||
| 3323 | 3357 | ||
| 3324 | if (l3->shared) | 3358 | if (l3->shared) |
| 3325 | drain_array_locked(searchp, l3->shared, 0, | 3359 | drain_array_locked(searchp, l3->shared, 0, |
| 3326 | numa_node_id()); | 3360 | numa_node_id()); |
| 3327 | 3361 | ||
| 3328 | if (l3->free_touched) { | 3362 | if (l3->free_touched) { |
| 3329 | l3->free_touched = 0; | 3363 | l3->free_touched = 0; |
| 3330 | goto next_unlock; | 3364 | goto next_unlock; |
| 3331 | } | 3365 | } |
| 3332 | 3366 | ||
| 3333 | tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); | 3367 | tofree = |
| 3368 | (l3->free_limit + 5 * searchp->num - | ||
| 3369 | 1) / (5 * searchp->num); | ||
| 3334 | do { | 3370 | do { |
| 3335 | p = l3->slabs_free.next; | 3371 | p = l3->slabs_free.next; |
| 3336 | if (p == &(l3->slabs_free)) | 3372 | if (p == &(l3->slabs_free)) |
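The reap pass above computes tofree = (l3->free_limit + 5*num - 1) / (5*num), the usual round-up-division idiom, so each run destroys at most about a fifth of the node's free-object allowance, expressed in whole slabs. The idiom in isolation, with sample numbers that are not taken from any real cache:

    #include <stdio.h>

    /* Round-up integer division: ceil(a / b) for positive integers. */
    static unsigned long div_round_up(unsigned long a, unsigned long b)
    {
        return (a + b - 1) / b;
    }

    int main(void)
    {
        unsigned long free_limit = 126;     /* node's allowance of free objects */
        unsigned long num = 8;              /* objects per slab */
        unsigned long tofree = div_round_up(free_limit, 5 * num);

        printf("reap up to %lu free slab(s) this pass\n", tofree);  /* 4 */
        return 0;
    }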
| @@ -3350,10 +3386,10 @@ static void cache_reap(void *unused) | |||
| 3350 | spin_unlock_irq(&l3->list_lock); | 3386 | spin_unlock_irq(&l3->list_lock); |
| 3351 | slab_destroy(searchp, slabp); | 3387 | slab_destroy(searchp, slabp); |
| 3352 | spin_lock_irq(&l3->list_lock); | 3388 | spin_lock_irq(&l3->list_lock); |
| 3353 | } while(--tofree > 0); | 3389 | } while (--tofree > 0); |
| 3354 | next_unlock: | 3390 | next_unlock: |
| 3355 | spin_unlock_irq(&l3->list_lock); | 3391 | spin_unlock_irq(&l3->list_lock); |
| 3356 | next: | 3392 | next: |
| 3357 | cond_resched(); | 3393 | cond_resched(); |
| 3358 | } | 3394 | } |
| 3359 | check_irq_on(); | 3395 | check_irq_on(); |
| @@ -3365,32 +3401,37 @@ next: | |||
| 3365 | 3401 | ||
| 3366 | #ifdef CONFIG_PROC_FS | 3402 | #ifdef CONFIG_PROC_FS |
| 3367 | 3403 | ||
| 3368 | static void *s_start(struct seq_file *m, loff_t *pos) | 3404 | static void print_slabinfo_header(struct seq_file *m) |
| 3369 | { | 3405 | { |
| 3370 | loff_t n = *pos; | 3406 | /* |
| 3371 | struct list_head *p; | 3407 | * Output format version, so at least we can change it |
| 3372 | 3408 | * without _too_ many complaints. | |
| 3373 | down(&cache_chain_sem); | 3409 | */ |
| 3374 | if (!n) { | ||
| 3375 | /* | ||
| 3376 | * Output format version, so at least we can change it | ||
| 3377 | * without _too_ many complaints. | ||
| 3378 | */ | ||
| 3379 | #if STATS | 3410 | #if STATS |
| 3380 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); | 3411 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); |
| 3381 | #else | 3412 | #else |
| 3382 | seq_puts(m, "slabinfo - version: 2.1\n"); | 3413 | seq_puts(m, "slabinfo - version: 2.1\n"); |
| 3383 | #endif | 3414 | #endif |
| 3384 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); | 3415 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " |
| 3385 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | 3416 | "<objperslab> <pagesperslab>"); |
| 3386 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | 3417 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); |
| 3418 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
| 3387 | #if STATS | 3419 | #if STATS |
| 3388 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" | 3420 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " |
| 3389 | " <error> <maxfreeable> <nodeallocs> <remotefrees>"); | 3421 | "<error> <maxfreeable> <nodeallocs> <remotefrees>"); |
| 3390 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); | 3422 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); |
| 3391 | #endif | 3423 | #endif |
| 3392 | seq_putc(m, '\n'); | 3424 | seq_putc(m, '\n'); |
| 3393 | } | 3425 | } |
| 3426 | |||
| 3427 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
| 3428 | { | ||
| 3429 | loff_t n = *pos; | ||
| 3430 | struct list_head *p; | ||
| 3431 | |||
| 3432 | down(&cache_chain_sem); | ||
| 3433 | if (!n) | ||
| 3434 | print_slabinfo_header(m); | ||
| 3394 | p = cache_chain.next; | 3435 | p = cache_chain.next; |
| 3395 | while (n--) { | 3436 | while (n--) { |
| 3396 | p = p->next; | 3437 | p = p->next; |
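print_slabinfo_header() now emits the column legend from one helper instead of inline in s_start(); the per-cache rows are printed further down in this diff by s_show() with the "%-17s %6lu %6lu %6u %4u %4d" family of format strings. A quick userspace rendering of one header plus one row in that layout; the cache name and every number below are invented:

    #include <stdio.h>

    /* Print the version-2.1 header and one made-up cache row in /proc/slabinfo layout. */
    int main(void)
    {
        puts("slabinfo - version: 2.1");
        printf("# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"
               " : tunables <limit> <batchcount> <sharedfactor>"
               " : slabdata <active_slabs> <num_slabs> <sharedavail>\n");

        /* Same field widths as s_show(); the values are illustrative only. */
        printf("%-17s %6lu %6lu %6u %4u %4d", "example-cache",
               120UL, 144UL, 256U, 16U, 1);
        printf(" : tunables %4u %4u %4u", 120U, 60U, 8U);
        printf(" : slabdata %6lu %6lu %6lu\n", 8UL, 9UL, 0UL);
        return 0;
    }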
| @@ -3405,7 +3446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
| 3405 | kmem_cache_t *cachep = p; | 3446 | kmem_cache_t *cachep = p; |
| 3406 | ++*pos; | 3447 | ++*pos; |
| 3407 | return cachep->next.next == &cache_chain ? NULL | 3448 | return cachep->next.next == &cache_chain ? NULL |
| 3408 | : list_entry(cachep->next.next, kmem_cache_t, next); | 3449 | : list_entry(cachep->next.next, kmem_cache_t, next); |
| 3409 | } | 3450 | } |
| 3410 | 3451 | ||
| 3411 | static void s_stop(struct seq_file *m, void *p) | 3452 | static void s_stop(struct seq_file *m, void *p) |
| @@ -3417,11 +3458,11 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3417 | { | 3458 | { |
| 3418 | kmem_cache_t *cachep = p; | 3459 | kmem_cache_t *cachep = p; |
| 3419 | struct list_head *q; | 3460 | struct list_head *q; |
| 3420 | struct slab *slabp; | 3461 | struct slab *slabp; |
| 3421 | unsigned long active_objs; | 3462 | unsigned long active_objs; |
| 3422 | unsigned long num_objs; | 3463 | unsigned long num_objs; |
| 3423 | unsigned long active_slabs = 0; | 3464 | unsigned long active_slabs = 0; |
| 3424 | unsigned long num_slabs, free_objects = 0, shared_avail = 0; | 3465 | unsigned long num_slabs, free_objects = 0, shared_avail = 0; |
| 3425 | const char *name; | 3466 | const char *name; |
| 3426 | char *error = NULL; | 3467 | char *error = NULL; |
| 3427 | int node; | 3468 | int node; |
| @@ -3438,14 +3479,14 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3438 | 3479 | ||
| 3439 | spin_lock(&l3->list_lock); | 3480 | spin_lock(&l3->list_lock); |
| 3440 | 3481 | ||
| 3441 | list_for_each(q,&l3->slabs_full) { | 3482 | list_for_each(q, &l3->slabs_full) { |
| 3442 | slabp = list_entry(q, struct slab, list); | 3483 | slabp = list_entry(q, struct slab, list); |
| 3443 | if (slabp->inuse != cachep->num && !error) | 3484 | if (slabp->inuse != cachep->num && !error) |
| 3444 | error = "slabs_full accounting error"; | 3485 | error = "slabs_full accounting error"; |
| 3445 | active_objs += cachep->num; | 3486 | active_objs += cachep->num; |
| 3446 | active_slabs++; | 3487 | active_slabs++; |
| 3447 | } | 3488 | } |
| 3448 | list_for_each(q,&l3->slabs_partial) { | 3489 | list_for_each(q, &l3->slabs_partial) { |
| 3449 | slabp = list_entry(q, struct slab, list); | 3490 | slabp = list_entry(q, struct slab, list); |
| 3450 | if (slabp->inuse == cachep->num && !error) | 3491 | if (slabp->inuse == cachep->num && !error) |
| 3451 | error = "slabs_partial inuse accounting error"; | 3492 | error = "slabs_partial inuse accounting error"; |
| @@ -3454,7 +3495,7 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3454 | active_objs += slabp->inuse; | 3495 | active_objs += slabp->inuse; |
| 3455 | active_slabs++; | 3496 | active_slabs++; |
| 3456 | } | 3497 | } |
| 3457 | list_for_each(q,&l3->slabs_free) { | 3498 | list_for_each(q, &l3->slabs_free) { |
| 3458 | slabp = list_entry(q, struct slab, list); | 3499 | slabp = list_entry(q, struct slab, list); |
| 3459 | if (slabp->inuse && !error) | 3500 | if (slabp->inuse && !error) |
| 3460 | error = "slabs_free/inuse accounting error"; | 3501 | error = "slabs_free/inuse accounting error"; |
| @@ -3465,25 +3506,24 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3465 | 3506 | ||
| 3466 | spin_unlock(&l3->list_lock); | 3507 | spin_unlock(&l3->list_lock); |
| 3467 | } | 3508 | } |
| 3468 | num_slabs+=active_slabs; | 3509 | num_slabs += active_slabs; |
| 3469 | num_objs = num_slabs*cachep->num; | 3510 | num_objs = num_slabs * cachep->num; |
| 3470 | if (num_objs - active_objs != free_objects && !error) | 3511 | if (num_objs - active_objs != free_objects && !error) |
| 3471 | error = "free_objects accounting error"; | 3512 | error = "free_objects accounting error"; |
| 3472 | 3513 | ||
| 3473 | name = cachep->name; | 3514 | name = cachep->name; |
| 3474 | if (error) | 3515 | if (error) |
| 3475 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | 3516 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); |
| 3476 | 3517 | ||
| 3477 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | 3518 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", |
| 3478 | name, active_objs, num_objs, cachep->objsize, | 3519 | name, active_objs, num_objs, cachep->objsize, |
| 3479 | cachep->num, (1<<cachep->gfporder)); | 3520 | cachep->num, (1 << cachep->gfporder)); |
| 3480 | seq_printf(m, " : tunables %4u %4u %4u", | 3521 | seq_printf(m, " : tunables %4u %4u %4u", |
| 3481 | cachep->limit, cachep->batchcount, | 3522 | cachep->limit, cachep->batchcount, cachep->shared); |
| 3482 | cachep->shared); | ||
| 3483 | seq_printf(m, " : slabdata %6lu %6lu %6lu", | 3523 | seq_printf(m, " : slabdata %6lu %6lu %6lu", |
| 3484 | active_slabs, num_slabs, shared_avail); | 3524 | active_slabs, num_slabs, shared_avail); |
| 3485 | #if STATS | 3525 | #if STATS |
| 3486 | { /* list3 stats */ | 3526 | { /* list3 stats */ |
| 3487 | unsigned long high = cachep->high_mark; | 3527 | unsigned long high = cachep->high_mark; |
| 3488 | unsigned long allocs = cachep->num_allocations; | 3528 | unsigned long allocs = cachep->num_allocations; |
| 3489 | unsigned long grown = cachep->grown; | 3529 | unsigned long grown = cachep->grown; |
| @@ -3494,9 +3534,7 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3494 | unsigned long node_frees = cachep->node_frees; | 3534 | unsigned long node_frees = cachep->node_frees; |
| 3495 | 3535 | ||
| 3496 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ | 3536 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ |
| 3497 | %4lu %4lu %4lu %4lu", | 3537 | %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); |
| 3498 | allocs, high, grown, reaped, errors, | ||
| 3499 | max_freeable, node_allocs, node_frees); | ||
| 3500 | } | 3538 | } |
| 3501 | /* cpu stats */ | 3539 | /* cpu stats */ |
| 3502 | { | 3540 | { |
| @@ -3506,7 +3544,7 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3506 | unsigned long freemiss = atomic_read(&cachep->freemiss); | 3544 | unsigned long freemiss = atomic_read(&cachep->freemiss); |
| 3507 | 3545 | ||
| 3508 | seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", | 3546 | seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", |
| 3509 | allochit, allocmiss, freehit, freemiss); | 3547 | allochit, allocmiss, freehit, freemiss); |
| 3510 | } | 3548 | } |
| 3511 | #endif | 3549 | #endif |
| 3512 | seq_putc(m, '\n'); | 3550 | seq_putc(m, '\n'); |
| @@ -3529,10 +3567,10 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3529 | */ | 3567 | */ |
| 3530 | 3568 | ||
| 3531 | struct seq_operations slabinfo_op = { | 3569 | struct seq_operations slabinfo_op = { |
| 3532 | .start = s_start, | 3570 | .start = s_start, |
| 3533 | .next = s_next, | 3571 | .next = s_next, |
| 3534 | .stop = s_stop, | 3572 | .stop = s_stop, |
| 3535 | .show = s_show, | 3573 | .show = s_show, |
| 3536 | }; | 3574 | }; |
| 3537 | 3575 | ||
| 3538 | #define MAX_SLABINFO_WRITE 128 | 3576 | #define MAX_SLABINFO_WRITE 128 |
| @@ -3543,18 +3581,18 @@ struct seq_operations slabinfo_op = { | |||
| 3543 | * @count: data length | 3581 | * @count: data length |
| 3544 | * @ppos: unused | 3582 | * @ppos: unused |
| 3545 | */ | 3583 | */ |
| 3546 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | 3584 | ssize_t slabinfo_write(struct file *file, const char __user * buffer, |
| 3547 | size_t count, loff_t *ppos) | 3585 | size_t count, loff_t *ppos) |
| 3548 | { | 3586 | { |
| 3549 | char kbuf[MAX_SLABINFO_WRITE+1], *tmp; | 3587 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
| 3550 | int limit, batchcount, shared, res; | 3588 | int limit, batchcount, shared, res; |
| 3551 | struct list_head *p; | 3589 | struct list_head *p; |
| 3552 | 3590 | ||
| 3553 | if (count > MAX_SLABINFO_WRITE) | 3591 | if (count > MAX_SLABINFO_WRITE) |
| 3554 | return -EINVAL; | 3592 | return -EINVAL; |
| 3555 | if (copy_from_user(&kbuf, buffer, count)) | 3593 | if (copy_from_user(&kbuf, buffer, count)) |
| 3556 | return -EFAULT; | 3594 | return -EFAULT; |
| 3557 | kbuf[MAX_SLABINFO_WRITE] = '\0'; | 3595 | kbuf[MAX_SLABINFO_WRITE] = '\0'; |
| 3558 | 3596 | ||
| 3559 | tmp = strchr(kbuf, ' '); | 3597 | tmp = strchr(kbuf, ' '); |
| 3560 | if (!tmp) | 3598 | if (!tmp) |
| @@ -3567,18 +3605,17 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
| 3567 | /* Find the cache in the chain of caches. */ | 3605 | /* Find the cache in the chain of caches. */ |
| 3568 | down(&cache_chain_sem); | 3606 | down(&cache_chain_sem); |
| 3569 | res = -EINVAL; | 3607 | res = -EINVAL; |
| 3570 | list_for_each(p,&cache_chain) { | 3608 | list_for_each(p, &cache_chain) { |
| 3571 | kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); | 3609 | kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); |
| 3572 | 3610 | ||
| 3573 | if (!strcmp(cachep->name, kbuf)) { | 3611 | if (!strcmp(cachep->name, kbuf)) { |
| 3574 | if (limit < 1 || | 3612 | if (limit < 1 || |
| 3575 | batchcount < 1 || | 3613 | batchcount < 1 || |
| 3576 | batchcount > limit || | 3614 | batchcount > limit || shared < 0) { |
| 3577 | shared < 0) { | ||
| 3578 | res = 0; | 3615 | res = 0; |
| 3579 | } else { | 3616 | } else { |
| 3580 | res = do_tune_cpucache(cachep, limit, | 3617 | res = do_tune_cpucache(cachep, limit, |
| 3581 | batchcount, shared); | 3618 | batchcount, shared); |
| 3582 | } | 3619 | } |
| 3583 | break; | 3620 | break; |
| 3584 | } | 3621 | } |
| @@ -3609,26 +3646,3 @@ unsigned int ksize(const void *objp) | |||
| 3609 | 3646 | ||
| 3610 | return obj_reallen(page_get_cache(virt_to_page(objp))); | 3647 | return obj_reallen(page_get_cache(virt_to_page(objp))); |
| 3611 | } | 3648 | } |
| 3612 | |||
| 3613 | |||
| 3614 | /* | ||
| 3615 | * kstrdup - allocate space for and copy an existing string | ||
| 3616 | * | ||
| 3617 | * @s: the string to duplicate | ||
| 3618 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | ||
| 3619 | */ | ||
| 3620 | char *kstrdup(const char *s, gfp_t gfp) | ||
| 3621 | { | ||
| 3622 | size_t len; | ||
| 3623 | char *buf; | ||
| 3624 | |||
| 3625 | if (!s) | ||
| 3626 | return NULL; | ||
| 3627 | |||
| 3628 | len = strlen(s) + 1; | ||
| 3629 | buf = kmalloc(len, gfp); | ||
| 3630 | if (buf) | ||
| 3631 | memcpy(buf, s, len); | ||
| 3632 | return buf; | ||
| 3633 | } | ||
| 3634 | EXPORT_SYMBOL(kstrdup); | ||
diff --git a/mm/slob.c b/mm/slob.c new file mode 100644 index 000000000000..1c240c4b71d9 --- /dev/null +++ b/mm/slob.c | |||
| @@ -0,0 +1,385 @@ | |||
| 1 | /* | ||
| 2 | * SLOB Allocator: Simple List Of Blocks | ||
| 3 | * | ||
| 4 | * Matt Mackall <mpm@selenic.com> 12/30/03 | ||
| 5 | * | ||
| 6 | * How SLOB works: | ||
| 7 | * | ||
| 8 | * The core of SLOB is a traditional K&R style heap allocator, with | ||
| 9 | * support for returning aligned objects. The granularity of this | ||
| 10 | * allocator is 8 bytes on x86, though it's perhaps possible to reduce | ||
| 11 | * this to 4 if it's deemed worth the effort. The slob heap is a | ||
| 12 | * singly-linked list of pages from __get_free_page, grown on demand | ||
| 13 | * and allocation from the heap is currently first-fit. | ||
| 14 | * | ||
| 15 | * Above this is an implementation of kmalloc/kfree. Blocks returned | ||
| 16 | * from kmalloc are 8-byte aligned and prepended with an 8-byte header. | ||
| 17 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls | ||
| 18 | * __get_free_pages directly so that it can return page-aligned blocks | ||
| 19 | * and keeps a linked list of such pages and their orders. These | ||
| 20 | * objects are detected in kfree() by their page alignment. | ||
| 21 | * | ||
| 22 | * SLAB is emulated on top of SLOB by simply calling constructors and | ||
| 23 | * destructors for every SLAB allocation. Objects are returned with | ||
| 24 | * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is | ||
| 25 | * set, in which case the low-level allocator will fragment blocks to | ||
| 26 | * create the proper alignment. Again, objects of page-size or greater | ||
| 27 | * are allocated by calling __get_free_pages. As SLAB objects know | ||
| 28 | * their size, no separate size bookkeeping is necessary and there is | ||
| 29 | * essentially no allocation space overhead. | ||
| 30 | */ | ||
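A worked example of the size arithmetic described above, assuming a 32-bit build where sizeof(slob_t) is 8 (an int plus a pointer): kmalloc(100) asks slob_alloc() for 100 + 8 = 108 bytes, SLOB_UNITS(108) = (108 + 7) / 8 rounds that up to 14 units, i.e. 112 bytes including the header, and the caller gets back the address just past the 8-byte header; kfree() later steps back one unit to recover the recorded size.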
| 31 | |||
| 32 | #include <linux/config.h> | ||
| 33 | #include <linux/slab.h> | ||
| 34 | #include <linux/mm.h> | ||
| 35 | #include <linux/cache.h> | ||
| 36 | #include <linux/init.h> | ||
| 37 | #include <linux/module.h> | ||
| 38 | #include <linux/timer.h> | ||
| 39 | |||
| 40 | struct slob_block { | ||
| 41 | int units; | ||
| 42 | struct slob_block *next; | ||
| 43 | }; | ||
| 44 | typedef struct slob_block slob_t; | ||
| 45 | |||
| 46 | #define SLOB_UNIT sizeof(slob_t) | ||
| 47 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) | ||
| 48 | #define SLOB_ALIGN L1_CACHE_BYTES | ||
| 49 | |||
| 50 | struct bigblock { | ||
| 51 | int order; | ||
| 52 | void *pages; | ||
| 53 | struct bigblock *next; | ||
| 54 | }; | ||
| 55 | typedef struct bigblock bigblock_t; | ||
| 56 | |||
| 57 | static slob_t arena = { .next = &arena, .units = 1 }; | ||
| 58 | static slob_t *slobfree = &arena; | ||
| 59 | static bigblock_t *bigblocks; | ||
| 60 | static DEFINE_SPINLOCK(slob_lock); | ||
| 61 | static DEFINE_SPINLOCK(block_lock); | ||
| 62 | |||
| 63 | static void slob_free(void *b, int size); | ||
| 64 | |||
| 65 | static void *slob_alloc(size_t size, gfp_t gfp, int align) | ||
| 66 | { | ||
| 67 | slob_t *prev, *cur, *aligned = 0; | ||
| 68 | int delta = 0, units = SLOB_UNITS(size); | ||
| 69 | unsigned long flags; | ||
| 70 | |||
| 71 | spin_lock_irqsave(&slob_lock, flags); | ||
| 72 | prev = slobfree; | ||
| 73 | for (cur = prev->next; ; prev = cur, cur = cur->next) { | ||
| 74 | if (align) { | ||
| 75 | aligned = (slob_t *)ALIGN((unsigned long)cur, align); | ||
| 76 | delta = aligned - cur; | ||
| 77 | } | ||
| 78 | if (cur->units >= units + delta) { /* room enough? */ | ||
| 79 | if (delta) { /* need to fragment head to align? */ | ||
| 80 | aligned->units = cur->units - delta; | ||
| 81 | aligned->next = cur->next; | ||
| 82 | cur->next = aligned; | ||
| 83 | cur->units = delta; | ||
| 84 | prev = cur; | ||
| 85 | cur = aligned; | ||
| 86 | } | ||
| 87 | |||
| 88 | if (cur->units == units) /* exact fit? */ | ||
| 89 | prev->next = cur->next; /* unlink */ | ||
| 90 | else { /* fragment */ | ||
| 91 | prev->next = cur + units; | ||
| 92 | prev->next->units = cur->units - units; | ||
| 93 | prev->next->next = cur->next; | ||
| 94 | cur->units = units; | ||
| 95 | } | ||
| 96 | |||
| 97 | slobfree = prev; | ||
| 98 | spin_unlock_irqrestore(&slob_lock, flags); | ||
| 99 | return cur; | ||
| 100 | } | ||
| 101 | if (cur == slobfree) { | ||
| 102 | spin_unlock_irqrestore(&slob_lock, flags); | ||
| 103 | |||
| 104 | if (size == PAGE_SIZE) /* trying to shrink arena? */ | ||
| 105 | return 0; | ||
| 106 | |||
| 107 | cur = (slob_t *)__get_free_page(gfp); | ||
| 108 | if (!cur) | ||
| 109 | return 0; | ||
| 110 | |||
| 111 | slob_free(cur, PAGE_SIZE); | ||
| 112 | spin_lock_irqsave(&slob_lock, flags); | ||
| 113 | cur = slobfree; | ||
| 114 | } | ||
| 115 | } | ||
| 116 | } | ||
| 117 | |||
| 118 | static void slob_free(void *block, int size) | ||
| 119 | { | ||
| 120 | slob_t *cur, *b = (slob_t *)block; | ||
| 121 | unsigned long flags; | ||
| 122 | |||
| 123 | if (!block) | ||
| 124 | return; | ||
| 125 | |||
| 126 | if (size) | ||
| 127 | b->units = SLOB_UNITS(size); | ||
| 128 | |||
| 129 | /* Find reinsertion point */ | ||
| 130 | spin_lock_irqsave(&slob_lock, flags); | ||
| 131 | for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) | ||
| 132 | if (cur >= cur->next && (b > cur || b < cur->next)) | ||
| 133 | break; | ||
| 134 | |||
| 135 | if (b + b->units == cur->next) { | ||
| 136 | b->units += cur->next->units; | ||
| 137 | b->next = cur->next->next; | ||
| 138 | } else | ||
| 139 | b->next = cur->next; | ||
| 140 | |||
| 141 | if (cur + cur->units == b) { | ||
| 142 | cur->units += b->units; | ||
| 143 | cur->next = b->next; | ||
| 144 | } else | ||
| 145 | cur->next = b; | ||
| 146 | |||
| 147 | slobfree = cur; | ||
| 148 | |||
| 149 | spin_unlock_irqrestore(&slob_lock, flags); | ||
| 150 | } | ||
| 151 | |||
| 152 | static int FASTCALL(find_order(int size)); | ||
| 153 | static int fastcall find_order(int size) | ||
| 154 | { | ||
| 155 | int order = 0; | ||
| 156 | for ( ; size > 4096 ; size >>=1) | ||
| 157 | order++; | ||
| 158 | return order; | ||
| 159 | } | ||
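For a concrete feel of find_order(): find_order(4096) returns 0 (a single page), while find_order(4097) shifts once and returns 1, so the request is backed by an order-1, two-page allocation. Note that the loop compares against the literal 4096 rather than PAGE_SIZE, so on architectures with larger pages the computed order can be higher than strictly necessary (it never under-allocates).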
| 160 | |||
| 161 | void *kmalloc(size_t size, gfp_t gfp) | ||
| 162 | { | ||
| 163 | slob_t *m; | ||
| 164 | bigblock_t *bb; | ||
| 165 | unsigned long flags; | ||
| 166 | |||
| 167 | if (size < PAGE_SIZE - SLOB_UNIT) { | ||
| 168 | m = slob_alloc(size + SLOB_UNIT, gfp, 0); | ||
| 169 | return m ? (void *)(m + 1) : 0; | ||
| 170 | } | ||
| 171 | |||
| 172 | bb = slob_alloc(sizeof(bigblock_t), gfp, 0); | ||
| 173 | if (!bb) | ||
| 174 | return 0; | ||
| 175 | |||
| 176 | bb->order = find_order(size); | ||
| 177 | bb->pages = (void *)__get_free_pages(gfp, bb->order); | ||
| 178 | |||
| 179 | if (bb->pages) { | ||
| 180 | spin_lock_irqsave(&block_lock, flags); | ||
| 181 | bb->next = bigblocks; | ||
| 182 | bigblocks = bb; | ||
| 183 | spin_unlock_irqrestore(&block_lock, flags); | ||
| 184 | return bb->pages; | ||
| 185 | } | ||
| 186 | |||
| 187 | slob_free(bb, sizeof(bigblock_t)); | ||
| 188 | return 0; | ||
| 189 | } | ||
| 190 | |||
| 191 | EXPORT_SYMBOL(kmalloc); | ||
| 192 | |||
| 193 | void kfree(const void *block) | ||
| 194 | { | ||
| 195 | bigblock_t *bb, **last = &bigblocks; | ||
| 196 | unsigned long flags; | ||
| 197 | |||
| 198 | if (!block) | ||
| 199 | return; | ||
| 200 | |||
| 201 | if (!((unsigned long)block & (PAGE_SIZE-1))) { | ||
| 202 | /* might be on the big block list */ | ||
| 203 | spin_lock_irqsave(&block_lock, flags); | ||
| 204 | for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { | ||
| 205 | if (bb->pages == block) { | ||
| 206 | *last = bb->next; | ||
| 207 | spin_unlock_irqrestore(&block_lock, flags); | ||
| 208 | free_pages((unsigned long)block, bb->order); | ||
| 209 | slob_free(bb, sizeof(bigblock_t)); | ||
| 210 | return; | ||
| 211 | } | ||
| 212 | } | ||
| 213 | spin_unlock_irqrestore(&block_lock, flags); | ||
| 214 | } | ||
| 215 | |||
| 216 | slob_free((slob_t *)block - 1, 0); | ||
| 217 | return; | ||
| 218 | } | ||
| 219 | |||
| 220 | EXPORT_SYMBOL(kfree); | ||
| 221 | |||
| 222 | unsigned int ksize(const void *block) | ||
| 223 | { | ||
| 224 | bigblock_t *bb; | ||
| 225 | unsigned long flags; | ||
| 226 | |||
| 227 | if (!block) | ||
| 228 | return 0; | ||
| 229 | |||
| 230 | if (!((unsigned long)block & (PAGE_SIZE-1))) { | ||
| 231 | spin_lock_irqsave(&block_lock, flags); | ||
| 232 | for (bb = bigblocks; bb; bb = bb->next) | ||
| 233 | if (bb->pages == block) { | ||
| 234 | spin_unlock_irqrestore(&block_lock, flags); | ||
| 235 | return PAGE_SIZE << bb->order; | ||
| 236 | } | ||
| 237 | spin_unlock_irqrestore(&block_lock, flags); | ||
| 238 | } | ||
| 239 | |||
| 240 | return ((slob_t *)block - 1)->units * SLOB_UNIT; | ||
| 241 | } | ||
| 242 | |||
| 243 | struct kmem_cache { | ||
| 244 | unsigned int size, align; | ||
| 245 | const char *name; | ||
| 246 | void (*ctor)(void *, struct kmem_cache *, unsigned long); | ||
| 247 | void (*dtor)(void *, struct kmem_cache *, unsigned long); | ||
| 248 | }; | ||
| 249 | |||
| 250 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | ||
| 251 | size_t align, unsigned long flags, | ||
| 252 | void (*ctor)(void*, struct kmem_cache *, unsigned long), | ||
| 253 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) | ||
| 254 | { | ||
| 255 | struct kmem_cache *c; | ||
| 256 | |||
| 257 | c = slob_alloc(sizeof(struct kmem_cache), flags, 0); | ||
| 258 | |||
| 259 | if (c) { | ||
| 260 | c->name = name; | ||
| 261 | c->size = size; | ||
| 262 | c->ctor = ctor; | ||
| 263 | c->dtor = dtor; | ||
| 264 | /* ignore alignment unless it's forced */ | ||
| 265 | c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | ||
| 266 | if (c->align < align) | ||
| 267 | c->align = align; | ||
| 268 | } | ||
| 269 | |||
| 270 | return c; | ||
| 271 | } | ||
| 272 | EXPORT_SYMBOL(kmem_cache_create); | ||
| 273 | |||
| 274 | int kmem_cache_destroy(struct kmem_cache *c) | ||
| 275 | { | ||
| 276 | slob_free(c, sizeof(struct kmem_cache)); | ||
| 277 | return 0; | ||
| 278 | } | ||
| 279 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
| 280 | |||
| 281 | void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) | ||
| 282 | { | ||
| 283 | void *b; | ||
| 284 | |||
| 285 | if (c->size < PAGE_SIZE) | ||
| 286 | b = slob_alloc(c->size, flags, c->align); | ||
| 287 | else | ||
| 288 | b = (void *)__get_free_pages(flags, find_order(c->size)); | ||
| 289 | |||
| 290 | if (c->ctor) | ||
| 291 | c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); | ||
| 292 | |||
| 293 | return b; | ||
| 294 | } | ||
| 295 | EXPORT_SYMBOL(kmem_cache_alloc); | ||
| 296 | |||
| 297 | void kmem_cache_free(struct kmem_cache *c, void *b) | ||
| 298 | { | ||
| 299 | if (c->dtor) | ||
| 300 | c->dtor(b, c, 0); | ||
| 301 | |||
| 302 | if (c->size < PAGE_SIZE) | ||
| 303 | slob_free(b, c->size); | ||
| 304 | else | ||
| 305 | free_pages((unsigned long)b, find_order(c->size)); | ||
| 306 | } | ||
| 307 | EXPORT_SYMBOL(kmem_cache_free); | ||
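The emulated cache interface above is call-compatible with the regular slab API, so existing call sites keep working. The following is a minimal hypothetical call site, not taken from this patch (struct foo, foo_cachep and the helper names are illustrative); note that under SLOB the constructor runs on every allocation, as the header comment explains.

#include <linux/slab.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/errno.h>

struct foo {                            /* hypothetical cached object */
        int refcount;
        struct list_head list;
};

static struct kmem_cache *foo_cachep;

/* Run by kmem_cache_alloc() on each object when backed by SLOB. */
static void foo_ctor(void *obj, struct kmem_cache *cachep, unsigned long flags)
{
        struct foo *f = obj;

        f->refcount = 1;
        INIT_LIST_HEAD(&f->list);
}

static int __init foo_cache_init(void)
{
        /* align and flags of 0: objects get the default 8-byte alignment */
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, 0, foo_ctor, NULL);
        return foo_cachep ? 0 : -ENOMEM;
}

static struct foo *foo_get(void)
{
        return kmem_cache_alloc(foo_cachep, GFP_KERNEL);
}

static void foo_put(struct foo *f)
{
        kmem_cache_free(foo_cachep, f);
}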
| 308 | |||
| 309 | unsigned int kmem_cache_size(struct kmem_cache *c) | ||
| 310 | { | ||
| 311 | return c->size; | ||
| 312 | } | ||
| 313 | EXPORT_SYMBOL(kmem_cache_size); | ||
| 314 | |||
| 315 | const char *kmem_cache_name(struct kmem_cache *c) | ||
| 316 | { | ||
| 317 | return c->name; | ||
| 318 | } | ||
| 319 | EXPORT_SYMBOL(kmem_cache_name); | ||
| 320 | |||
| 321 | static struct timer_list slob_timer = TIMER_INITIALIZER( | ||
| 322 | (void (*)(unsigned long))kmem_cache_init, 0, 0); | ||
| 323 | |||
| 324 | void kmem_cache_init(void) | ||
| 325 | { | ||
| 326 | void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); | ||
| 327 | |||
| 328 | if (p) | ||
| 329 | free_page((unsigned long)p); | ||
| 330 | |||
| 331 | mod_timer(&slob_timer, jiffies + HZ); | ||
| 332 | } | ||
| 333 | |||
| 334 | atomic_t slab_reclaim_pages = ATOMIC_INIT(0); | ||
| 335 | EXPORT_SYMBOL(slab_reclaim_pages); | ||
| 336 | |||
| 337 | #ifdef CONFIG_SMP | ||
| 338 | |||
| 339 | void *__alloc_percpu(size_t size, size_t align) | ||
| 340 | { | ||
| 341 | int i; | ||
| 342 | struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); | ||
| 343 | |||
| 344 | if (!pdata) | ||
| 345 | return NULL; | ||
| 346 | |||
| 347 | for (i = 0; i < NR_CPUS; i++) { | ||
| 348 | if (!cpu_possible(i)) | ||
| 349 | continue; | ||
| 350 | pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); | ||
| 351 | if (!pdata->ptrs[i]) | ||
| 352 | goto unwind_oom; | ||
| 353 | memset(pdata->ptrs[i], 0, size); | ||
| 354 | } | ||
| 355 | |||
| 356 | /* Catch derefs w/o wrappers */ | ||
| 357 | return (void *) (~(unsigned long) pdata); | ||
| 358 | |||
| 359 | unwind_oom: | ||
| 360 | while (--i >= 0) { | ||
| 361 | if (!cpu_possible(i)) | ||
| 362 | continue; | ||
| 363 | kfree(pdata->ptrs[i]); | ||
| 364 | } | ||
| 365 | kfree(pdata); | ||
| 366 | return NULL; | ||
| 367 | } | ||
| 368 | EXPORT_SYMBOL(__alloc_percpu); | ||
| 369 | |||
| 370 | void | ||
| 371 | free_percpu(const void *objp) | ||
| 372 | { | ||
| 373 | int i; | ||
| 374 | struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); | ||
| 375 | |||
| 376 | for (i = 0; i < NR_CPUS; i++) { | ||
| 377 | if (!cpu_possible(i)) | ||
| 378 | continue; | ||
| 379 | kfree(p->ptrs[i]); | ||
| 380 | } | ||
| 381 | kfree(p); | ||
| 382 | } | ||
| 383 | EXPORT_SYMBOL(free_percpu); | ||
| 384 | |||
| 385 | #endif | ||
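One detail of the per-cpu code above worth spelling out: __alloc_percpu() hands back the bitwise complement of the real struct percpu_data pointer, so dereferencing the returned cookie directly faults immediately; only the accessor wrappers in <linux/percpu.h> (per_cpu_ptr() and friends), which undo the complement and index ptrs[cpu], may touch it. A hedged sketch of the expected usage (struct hit_stats and the helpers are illustrative, not from this patch):

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/errno.h>

struct hit_stats {
        unsigned long hits;
};

static struct hit_stats *stats;         /* opaque cookie, never dereferenced */

static int hit_stats_init(void)
{
        stats = alloc_percpu(struct hit_stats);  /* wraps __alloc_percpu() */
        return stats ? 0 : -ENOMEM;
}

static void count_hit(void)
{
        int cpu = get_cpu();            /* stay on one CPU while touching it */

        /* per_cpu_ptr() undoes the ~ mangling and picks ptrs[cpu] */
        per_cpu_ptr(stats, cpu)->hits++;
        put_cpu();
}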
diff --git a/mm/sparse.c b/mm/sparse.c index 72079b538e2d..0a51f36ba3a1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -18,10 +18,10 @@ | |||
| 18 | */ | 18 | */ |
| 19 | #ifdef CONFIG_SPARSEMEM_EXTREME | 19 | #ifdef CONFIG_SPARSEMEM_EXTREME |
| 20 | struct mem_section *mem_section[NR_SECTION_ROOTS] | 20 | struct mem_section *mem_section[NR_SECTION_ROOTS] |
| 21 | ____cacheline_maxaligned_in_smp; | 21 | ____cacheline_internodealigned_in_smp; |
| 22 | #else | 22 | #else |
| 23 | struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] | 23 | struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] |
| 24 | ____cacheline_maxaligned_in_smp; | 24 | ____cacheline_internodealigned_in_smp; |
| 25 | #endif | 25 | #endif |
| 26 | EXPORT_SYMBOL(mem_section); | 26 | EXPORT_SYMBOL(mem_section); |
| 27 | 27 | ||
| @@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page) | |||
| 156 | put_cpu_var(lru_add_active_pvecs); | 156 | put_cpu_var(lru_add_active_pvecs); |
| 157 | } | 157 | } |
| 158 | 158 | ||
| 159 | void lru_add_drain(void) | 159 | static void __lru_add_drain(int cpu) |
| 160 | { | 160 | { |
| 161 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 161 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); |
| 162 | 162 | ||
| 163 | /* CPU is dead, so no locking needed. */ | ||
| 163 | if (pagevec_count(pvec)) | 164 | if (pagevec_count(pvec)) |
| 164 | __pagevec_lru_add(pvec); | 165 | __pagevec_lru_add(pvec); |
| 165 | pvec = &__get_cpu_var(lru_add_active_pvecs); | 166 | pvec = &per_cpu(lru_add_active_pvecs, cpu); |
| 166 | if (pagevec_count(pvec)) | 167 | if (pagevec_count(pvec)) |
| 167 | __pagevec_lru_add_active(pvec); | 168 | __pagevec_lru_add_active(pvec); |
| 168 | put_cpu_var(lru_add_pvecs); | 169 | } |
| 170 | |||
| 171 | void lru_add_drain(void) | ||
| 172 | { | ||
| 173 | __lru_add_drain(get_cpu()); | ||
| 174 | put_cpu(); | ||
| 169 | } | 175 | } |
| 170 | 176 | ||
| 171 | /* | 177 | /* |
| @@ -378,6 +384,8 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, | |||
| 378 | return pagevec_count(pvec); | 384 | return pagevec_count(pvec); |
| 379 | } | 385 | } |
| 380 | 386 | ||
| 387 | EXPORT_SYMBOL(pagevec_lookup); | ||
| 388 | |||
| 381 | unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, | 389 | unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, |
| 382 | pgoff_t *index, int tag, unsigned nr_pages) | 390 | pgoff_t *index, int tag, unsigned nr_pages) |
| 383 | { | 391 | { |
| @@ -412,17 +420,6 @@ void vm_acct_memory(long pages) | |||
| 412 | } | 420 | } |
| 413 | 421 | ||
| 414 | #ifdef CONFIG_HOTPLUG_CPU | 422 | #ifdef CONFIG_HOTPLUG_CPU |
| 415 | static void lru_drain_cache(unsigned int cpu) | ||
| 416 | { | ||
| 417 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); | ||
| 418 | |||
| 419 | /* CPU is dead, so no locking needed. */ | ||
| 420 | if (pagevec_count(pvec)) | ||
| 421 | __pagevec_lru_add(pvec); | ||
| 422 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | ||
| 423 | if (pagevec_count(pvec)) | ||
| 424 | __pagevec_lru_add_active(pvec); | ||
| 425 | } | ||
| 426 | 423 | ||
| 427 | /* Drop the CPU's cached committed space back into the central pool. */ | 424 | /* Drop the CPU's cached committed space back into the central pool. */ |
| 428 | static int cpu_swap_callback(struct notifier_block *nfb, | 425 | static int cpu_swap_callback(struct notifier_block *nfb, |
| @@ -435,7 +432,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
| 435 | if (action == CPU_DEAD) { | 432 | if (action == CPU_DEAD) { |
| 436 | atomic_add(*committed, &vm_committed_space); | 433 | atomic_add(*committed, &vm_committed_space); |
| 437 | *committed = 0; | 434 | *committed = 0; |
| 438 | lru_drain_cache((long)hcpu); | 435 | __lru_add_drain((long)hcpu); |
| 439 | } | 436 | } |
| 440 | return NOTIFY_OK; | 437 | return NOTIFY_OK; |
| 441 | } | 438 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 0df9a57b1de8..7b09ac503fec 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
| 15 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
| 16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
| 17 | #include <linux/pagevec.h> | ||
| 17 | 18 | ||
| 18 | #include <asm/pgtable.h> | 19 | #include <asm/pgtable.h> |
| 19 | 20 | ||
| @@ -140,7 +141,7 @@ void __delete_from_swap_cache(struct page *page) | |||
| 140 | * Allocate swap space for the page and add the page to the | 141 | * Allocate swap space for the page and add the page to the |
| 141 | * swap cache. Caller needs to hold the page lock. | 142 | * swap cache. Caller needs to hold the page lock. |
| 142 | */ | 143 | */ |
| 143 | int add_to_swap(struct page * page) | 144 | int add_to_swap(struct page * page, gfp_t gfp_mask) |
| 144 | { | 145 | { |
| 145 | swp_entry_t entry; | 146 | swp_entry_t entry; |
| 146 | int err; | 147 | int err; |
| @@ -165,7 +166,7 @@ int add_to_swap(struct page * page) | |||
| 165 | * Add it to the swap cache and mark it dirty | 166 | * Add it to the swap cache and mark it dirty |
| 166 | */ | 167 | */ |
| 167 | err = __add_to_swap_cache(page, entry, | 168 | err = __add_to_swap_cache(page, entry, |
| 168 | GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); | 169 | gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); |
| 169 | 170 | ||
| 170 | switch (err) { | 171 | switch (err) { |
| 171 | case 0: /* Success */ | 172 | case 0: /* Success */ |
| @@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page) | |||
| 272 | */ | 273 | */ |
| 273 | void free_pages_and_swap_cache(struct page **pages, int nr) | 274 | void free_pages_and_swap_cache(struct page **pages, int nr) |
| 274 | { | 275 | { |
| 275 | int chunk = 16; | ||
| 276 | struct page **pagep = pages; | 276 | struct page **pagep = pages; |
| 277 | 277 | ||
| 278 | lru_add_drain(); | 278 | lru_add_drain(); |
| 279 | while (nr) { | 279 | while (nr) { |
| 280 | int todo = min(chunk, nr); | 280 | int todo = min(nr, PAGEVEC_SIZE); |
| 281 | int i; | 281 | int i; |
| 282 | 282 | ||
| 283 | for (i = 0; i < todo; i++) | 283 | for (i = 0; i < todo; i++) |
diff --git a/mm/swapfile.c b/mm/swapfile.c index edafeace301f..957fef43fa60 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -25,6 +25,7 @@ | |||
| 25 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
| 26 | #include <linux/security.h> | 26 | #include <linux/security.h> |
| 27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
| 28 | #include <linux/capability.h> | ||
| 28 | #include <linux/syscalls.h> | 29 | #include <linux/syscalls.h> |
| 29 | 30 | ||
| 30 | #include <asm/pgtable.h> | 31 | #include <asm/pgtable.h> |
| @@ -211,6 +212,26 @@ noswap: | |||
| 211 | return (swp_entry_t) {0}; | 212 | return (swp_entry_t) {0}; |
| 212 | } | 213 | } |
| 213 | 214 | ||
| 215 | swp_entry_t get_swap_page_of_type(int type) | ||
| 216 | { | ||
| 217 | struct swap_info_struct *si; | ||
| 218 | pgoff_t offset; | ||
| 219 | |||
| 220 | spin_lock(&swap_lock); | ||
| 221 | si = swap_info + type; | ||
| 222 | if (si->flags & SWP_WRITEOK) { | ||
| 223 | nr_swap_pages--; | ||
| 224 | offset = scan_swap_map(si); | ||
| 225 | if (offset) { | ||
| 226 | spin_unlock(&swap_lock); | ||
| 227 | return swp_entry(type, offset); | ||
| 228 | } | ||
| 229 | nr_swap_pages++; | ||
| 230 | } | ||
| 231 | spin_unlock(&swap_lock); | ||
| 232 | return (swp_entry_t) {0}; | ||
| 233 | } | ||
| 234 | |||
| 214 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 235 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) |
| 215 | { | 236 | { |
| 216 | struct swap_info_struct * p; | 237 | struct swap_info_struct * p; |
| @@ -1167,9 +1188,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
| 1167 | set_blocksize(bdev, p->old_block_size); | 1188 | set_blocksize(bdev, p->old_block_size); |
| 1168 | bd_release(bdev); | 1189 | bd_release(bdev); |
| 1169 | } else { | 1190 | } else { |
| 1170 | down(&inode->i_sem); | 1191 | mutex_lock(&inode->i_mutex); |
| 1171 | inode->i_flags &= ~S_SWAPFILE; | 1192 | inode->i_flags &= ~S_SWAPFILE; |
| 1172 | up(&inode->i_sem); | 1193 | mutex_unlock(&inode->i_mutex); |
| 1173 | } | 1194 | } |
| 1174 | filp_close(swap_file, NULL); | 1195 | filp_close(swap_file, NULL); |
| 1175 | err = 0; | 1196 | err = 0; |
| @@ -1386,7 +1407,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1386 | p->bdev = bdev; | 1407 | p->bdev = bdev; |
| 1387 | } else if (S_ISREG(inode->i_mode)) { | 1408 | } else if (S_ISREG(inode->i_mode)) { |
| 1388 | p->bdev = inode->i_sb->s_bdev; | 1409 | p->bdev = inode->i_sb->s_bdev; |
| 1389 | down(&inode->i_sem); | 1410 | mutex_lock(&inode->i_mutex); |
| 1390 | did_down = 1; | 1411 | did_down = 1; |
| 1391 | if (IS_SWAPFILE(inode)) { | 1412 | if (IS_SWAPFILE(inode)) { |
| 1392 | error = -EBUSY; | 1413 | error = -EBUSY; |
| @@ -1422,7 +1443,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1422 | else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) | 1443 | else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) |
| 1423 | swap_header_version = 2; | 1444 | swap_header_version = 2; |
| 1424 | else { | 1445 | else { |
| 1425 | printk("Unable to find swap-space signature\n"); | 1446 | printk(KERN_ERR "Unable to find swap-space signature\n"); |
| 1426 | error = -EINVAL; | 1447 | error = -EINVAL; |
| 1427 | goto bad_swap; | 1448 | goto bad_swap; |
| 1428 | } | 1449 | } |
| @@ -1473,7 +1494,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1473 | goto bad_swap; | 1494 | goto bad_swap; |
| 1474 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1495 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
| 1475 | goto bad_swap; | 1496 | goto bad_swap; |
| 1476 | 1497 | ||
| 1477 | /* OK, set up the swap map and apply the bad block list */ | 1498 | /* OK, set up the swap map and apply the bad block list */ |
| 1478 | if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { | 1499 | if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { |
| 1479 | error = -ENOMEM; | 1500 | error = -ENOMEM; |
| @@ -1482,17 +1503,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1482 | 1503 | ||
| 1483 | error = 0; | 1504 | error = 0; |
| 1484 | memset(p->swap_map, 0, maxpages * sizeof(short)); | 1505 | memset(p->swap_map, 0, maxpages * sizeof(short)); |
| 1485 | for (i=0; i<swap_header->info.nr_badpages; i++) { | 1506 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
| 1486 | int page = swap_header->info.badpages[i]; | 1507 | int page_nr = swap_header->info.badpages[i]; |
| 1487 | if (page <= 0 || page >= swap_header->info.last_page) | 1508 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) |
| 1488 | error = -EINVAL; | 1509 | error = -EINVAL; |
| 1489 | else | 1510 | else |
| 1490 | p->swap_map[page] = SWAP_MAP_BAD; | 1511 | p->swap_map[page_nr] = SWAP_MAP_BAD; |
| 1491 | } | 1512 | } |
| 1492 | nr_good_pages = swap_header->info.last_page - | 1513 | nr_good_pages = swap_header->info.last_page - |
| 1493 | swap_header->info.nr_badpages - | 1514 | swap_header->info.nr_badpages - |
| 1494 | 1 /* header page */; | 1515 | 1 /* header page */; |
| 1495 | if (error) | 1516 | if (error) |
| 1496 | goto bad_swap; | 1517 | goto bad_swap; |
| 1497 | } | 1518 | } |
| 1498 | 1519 | ||
| @@ -1576,7 +1597,7 @@ out: | |||
| 1576 | if (did_down) { | 1597 | if (did_down) { |
| 1577 | if (!error) | 1598 | if (!error) |
| 1578 | inode->i_flags |= S_SWAPFILE; | 1599 | inode->i_flags |= S_SWAPFILE; |
| 1579 | up(&inode->i_sem); | 1600 | mutex_unlock(&inode->i_mutex); |
| 1580 | } | 1601 | } |
| 1581 | return error; | 1602 | return error; |
| 1582 | } | 1603 | } |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index b58abcf44ed6..f9d6a9cc91c4 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
| @@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
| 81 | goto close_file; | 81 | goto close_file; |
| 82 | 82 | ||
| 83 | d_instantiate(dentry, inode); | 83 | d_instantiate(dentry, inode); |
| 84 | inode->i_size = size; | ||
| 85 | inode->i_nlink = 0; /* It is unlinked */ | 84 | inode->i_nlink = 0; /* It is unlinked */ |
| 85 | |||
| 86 | file->f_vfsmnt = mntget(shm_mnt); | 86 | file->f_vfsmnt = mntget(shm_mnt); |
| 87 | file->f_dentry = dentry; | 87 | file->f_dentry = dentry; |
| 88 | file->f_mapping = inode->i_mapping; | 88 | file->f_mapping = inode->i_mapping; |
| 89 | file->f_op = &ramfs_file_operations; | 89 | file->f_op = &ramfs_file_operations; |
| 90 | file->f_mode = FMODE_WRITE | FMODE_READ; | 90 | file->f_mode = FMODE_WRITE | FMODE_READ; |
| 91 | |||
| 92 | /* notify everyone as to the change of file size */ | ||
| 93 | error = do_truncate(dentry, size, 0, file); | ||
| 94 | if (error < 0) | ||
| 95 | goto close_file; | ||
| 96 | |||
| 91 | return file; | 97 | return file; |
| 92 | 98 | ||
| 93 | close_file: | 99 | close_file: |
| @@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
| 123 | { | 129 | { |
| 124 | return 0; | 130 | return 0; |
| 125 | } | 131 | } |
| 132 | |||
| 133 | int shmem_mmap(struct file *file, struct vm_area_struct *vma) | ||
| 134 | { | ||
| 135 | file_accessed(file); | ||
| 136 | #ifndef CONFIG_MMU | ||
| 137 | return ramfs_nommu_mmap(file, vma); | ||
| 138 | #else | ||
| 139 | return 0; | ||
| 140 | #endif | ||
| 141 | } | ||
| 142 | |||
| 143 | #ifndef CONFIG_MMU | ||
| 144 | unsigned long shmem_get_unmapped_area(struct file *file, | ||
| 145 | unsigned long addr, | ||
| 146 | unsigned long len, | ||
| 147 | unsigned long pgoff, | ||
| 148 | unsigned long flags) | ||
| 149 | { | ||
| 150 | return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags); | ||
| 151 | } | ||
| 152 | #endif | ||
diff --git a/mm/truncate.c b/mm/truncate.c index 9173ab500604..6cb3fff25f67 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
| 82 | } | 82 | } |
| 83 | 83 | ||
| 84 | /** | 84 | /** |
| 85 | * truncate_inode_pages - truncate *all* the pages from an offset | 85 | * truncate_inode_pages_range - truncate range of pages specified by start and |
| 86 | * end byte offsets | ||
| 86 | * @mapping: mapping to truncate | 87 | * @mapping: mapping to truncate |
| 87 | * @lstart: offset from which to truncate | 88 | * @lstart: offset from which to truncate |
| 89 | * @lend: offset to which to truncate | ||
| 88 | * | 90 | * |
| 89 | * Truncate the page cache at a set offset, removing the pages that are beyond | 91 | * Truncate the page cache, removing the pages that are between |
| 90 | * that offset (and zeroing out partial pages). | 92 | * specified offsets (and zeroing out partial page |
| 93 | * (if lstart is not page aligned)). | ||
| 91 | * | 94 | * |
| 92 | * Truncate takes two passes - the first pass is nonblocking. It will not | 95 | * Truncate takes two passes - the first pass is nonblocking. It will not |
| 93 | * block on page locks and it will not block on writeback. The second pass | 96 | * block on page locks and it will not block on writeback. The second pass |
| @@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
| 101 | * We pass down the cache-hot hint to the page freeing code. Even if the | 104 | * We pass down the cache-hot hint to the page freeing code. Even if the |
| 102 | * mapping is large, it is probably the case that the final pages are the most | 105 | * mapping is large, it is probably the case that the final pages are the most |
| 103 | * recently touched, and freeing happens in ascending file offset order. | 106 | * recently touched, and freeing happens in ascending file offset order. |
| 104 | * | ||
| 105 | * Called under (and serialised by) inode->i_sem. | ||
| 106 | */ | 107 | */ |
| 107 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | 108 | void truncate_inode_pages_range(struct address_space *mapping, |
| 109 | loff_t lstart, loff_t lend) | ||
| 108 | { | 110 | { |
| 109 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 111 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
| 112 | pgoff_t end; | ||
| 110 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 113 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
| 111 | struct pagevec pvec; | 114 | struct pagevec pvec; |
| 112 | pgoff_t next; | 115 | pgoff_t next; |
| @@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
| 115 | if (mapping->nrpages == 0) | 118 | if (mapping->nrpages == 0) |
| 116 | return; | 119 | return; |
| 117 | 120 | ||
| 121 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); | ||
| 122 | end = (lend >> PAGE_CACHE_SHIFT); | ||
| 123 | |||
| 118 | pagevec_init(&pvec, 0); | 124 | pagevec_init(&pvec, 0); |
| 119 | next = start; | 125 | next = start; |
| 120 | while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 126 | while (next <= end && |
| 127 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | ||
| 121 | for (i = 0; i < pagevec_count(&pvec); i++) { | 128 | for (i = 0; i < pagevec_count(&pvec); i++) { |
| 122 | struct page *page = pvec.pages[i]; | 129 | struct page *page = pvec.pages[i]; |
| 123 | pgoff_t page_index = page->index; | 130 | pgoff_t page_index = page->index; |
| 124 | 131 | ||
| 132 | if (page_index > end) { | ||
| 133 | next = page_index; | ||
| 134 | break; | ||
| 135 | } | ||
| 136 | |||
| 125 | if (page_index > next) | 137 | if (page_index > next) |
| 126 | next = page_index; | 138 | next = page_index; |
| 127 | next++; | 139 | next++; |
| @@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
| 157 | next = start; | 169 | next = start; |
| 158 | continue; | 170 | continue; |
| 159 | } | 171 | } |
| 172 | if (pvec.pages[0]->index > end) { | ||
| 173 | pagevec_release(&pvec); | ||
| 174 | break; | ||
| 175 | } | ||
| 160 | for (i = 0; i < pagevec_count(&pvec); i++) { | 176 | for (i = 0; i < pagevec_count(&pvec); i++) { |
| 161 | struct page *page = pvec.pages[i]; | 177 | struct page *page = pvec.pages[i]; |
| 162 | 178 | ||
| 179 | if (page->index > end) | ||
| 180 | break; | ||
| 163 | lock_page(page); | 181 | lock_page(page); |
| 164 | wait_on_page_writeback(page); | 182 | wait_on_page_writeback(page); |
| 165 | if (page->index > next) | 183 | if (page->index > next) |
| @@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
| 171 | pagevec_release(&pvec); | 189 | pagevec_release(&pvec); |
| 172 | } | 190 | } |
| 173 | } | 191 | } |
| 192 | EXPORT_SYMBOL(truncate_inode_pages_range); | ||
| 174 | 193 | ||
| 194 | /** | ||
| 195 | * truncate_inode_pages - truncate *all* the pages from an offset | ||
| 196 | * @mapping: mapping to truncate | ||
| 197 | * @lstart: offset from which to truncate | ||
| 198 | * | ||
| 199 | * Called under (and serialised by) inode->i_mutex. | ||
| 200 | */ | ||
| 201 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | ||
| 202 | { | ||
| 203 | truncate_inode_pages_range(mapping, lstart, (loff_t)-1); | ||
| 204 | } | ||
| 175 | EXPORT_SYMBOL(truncate_inode_pages); | 205 | EXPORT_SYMBOL(truncate_inode_pages); |
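The wrapper makes the contract of the new range interface easy to see: lend names the last byte of the range, and the BUG_ON above requires that byte to be the last byte of a page, which (loff_t)-1 trivially satisfies. A hedged sketch of how a filesystem might drop whole pages first..last of a mapping, assuming the prototype this series adds to include/linux/mm.h (the helper name is illustrative):

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Drop page cache pages with indices first..last (inclusive). */
static void drop_page_range(struct address_space *mapping,
                            pgoff_t first, pgoff_t last)
{
        loff_t lstart = (loff_t)first << PAGE_CACHE_SHIFT;
        /* last byte of page 'last', so the alignment BUG_ON is satisfied */
        loff_t lend = ((loff_t)(last + 1) << PAGE_CACHE_SHIFT) - 1;

        truncate_inode_pages_range(mapping, lstart, lend);
}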
| 176 | 206 | ||
| 177 | /** | 207 | /** |
| @@ -219,7 +249,6 @@ unlock: | |||
| 219 | break; | 249 | break; |
| 220 | } | 250 | } |
| 221 | pagevec_release(&pvec); | 251 | pagevec_release(&pvec); |
| 222 | cond_resched(); | ||
| 223 | } | 252 | } |
| 224 | return ret; | 253 | return ret; |
| 225 | } | 254 | } |
diff --git a/mm/util.c b/mm/util.c new file mode 100644 index 000000000000..5f4bb59da63c --- /dev/null +++ b/mm/util.c | |||
| @@ -0,0 +1,39 @@ | |||
| 1 | #include <linux/slab.h> | ||
| 2 | #include <linux/string.h> | ||
| 3 | #include <linux/module.h> | ||
| 4 | |||
| 5 | /** | ||
| 6 | * kzalloc - allocate memory. The memory is set to zero. | ||
| 7 | * @size: how many bytes of memory are required. | ||
| 8 | * @flags: the type of memory to allocate. | ||
| 9 | */ | ||
| 10 | void *kzalloc(size_t size, gfp_t flags) | ||
| 11 | { | ||
| 12 | void *ret = kmalloc(size, flags); | ||
| 13 | if (ret) | ||
| 14 | memset(ret, 0, size); | ||
| 15 | return ret; | ||
| 16 | } | ||
| 17 | EXPORT_SYMBOL(kzalloc); | ||
| 18 | |||
| 19 | /* | ||
| 20 | * kstrdup - allocate space for and copy an existing string | ||
| 21 | * | ||
| 22 | * @s: the string to duplicate | ||
| 23 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | ||
| 24 | */ | ||
| 25 | char *kstrdup(const char *s, gfp_t gfp) | ||
| 26 | { | ||
| 27 | size_t len; | ||
| 28 | char *buf; | ||
| 29 | |||
| 30 | if (!s) | ||
| 31 | return NULL; | ||
| 32 | |||
| 33 | len = strlen(s) + 1; | ||
| 34 | buf = kmalloc(len, gfp); | ||
| 35 | if (buf) | ||
| 36 | memcpy(buf, s, len); | ||
| 37 | return buf; | ||
| 38 | } | ||
| 39 | EXPORT_SYMBOL(kstrdup); | ||
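Both helpers follow the usual kmalloc() conventions: NULL on allocation failure (and kstrdup(NULL, ...) simply returns NULL), with the results released by kfree(). A small hypothetical example of the intended pairing (struct tag and the helpers are illustrative):

#include <linux/slab.h>
#include <linux/string.h>

struct tag {
        char *name;
        unsigned long flags;
};

static struct tag *tag_create(const char *name, gfp_t gfp)
{
        struct tag *t = kzalloc(sizeof(*t), gfp);       /* zeroed struct */

        if (!t)
                return NULL;

        t->name = kstrdup(name, gfp);                   /* private copy */
        if (!t->name) {
                kfree(t);
                return NULL;
        }
        return t;
}

static void tag_destroy(struct tag *t)
{
        if (!t)
                return;
        kfree(t->name);
        kfree(t);
}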
diff --git a/mm/vmscan.c b/mm/vmscan.c index b0cd81c32de6..bf903b2d198f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -63,9 +63,6 @@ struct scan_control { | |||
| 63 | 63 | ||
| 64 | unsigned long nr_mapped; /* From page_state */ | 64 | unsigned long nr_mapped; /* From page_state */ |
| 65 | 65 | ||
| 66 | /* How many pages shrink_cache() should reclaim */ | ||
| 67 | int nr_to_reclaim; | ||
| 68 | |||
| 69 | /* Ask shrink_caches, or shrink_zone to scan at this priority */ | 66 | /* Ask shrink_caches, or shrink_zone to scan at this priority */ |
| 70 | unsigned int priority; | 67 | unsigned int priority; |
| 71 | 68 | ||
| @@ -74,9 +71,6 @@ struct scan_control { | |||
| 74 | 71 | ||
| 75 | int may_writepage; | 72 | int may_writepage; |
| 76 | 73 | ||
| 77 | /* Can pages be swapped as part of reclaim? */ | ||
| 78 | int may_swap; | ||
| 79 | |||
| 80 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | 74 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for |
| 81 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | 75 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. |
| 82 | * In this context, it doesn't matter that we scan the | 76 | * In this context, it doesn't matter that we scan the |
| @@ -186,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker); | |||
| 186 | * | 180 | * |
| 187 | * Returns the number of slab objects which we shrunk. | 181 | * Returns the number of slab objects which we shrunk. |
| 188 | */ | 182 | */ |
| 189 | static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, | 183 | int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) |
| 190 | unsigned long lru_pages) | ||
| 191 | { | 184 | { |
| 192 | struct shrinker *shrinker; | 185 | struct shrinker *shrinker; |
| 193 | int ret = 0; | 186 | int ret = 0; |
| @@ -275,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page) | |||
| 275 | 268 | ||
| 276 | static int may_write_to_queue(struct backing_dev_info *bdi) | 269 | static int may_write_to_queue(struct backing_dev_info *bdi) |
| 277 | { | 270 | { |
| 278 | if (current_is_kswapd()) | 271 | if (current->flags & PF_SWAPWRITE) |
| 279 | return 1; | ||
| 280 | if (current_is_pdflush()) /* This is unlikely, but why not... */ | ||
| 281 | return 1; | 272 | return 1; |
| 282 | if (!bdi_write_congested(bdi)) | 273 | if (!bdi_write_congested(bdi)) |
| 283 | return 1; | 274 | return 1; |
| @@ -367,7 +358,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
| 367 | res = mapping->a_ops->writepage(page, &wbc); | 358 | res = mapping->a_ops->writepage(page, &wbc); |
| 368 | if (res < 0) | 359 | if (res < 0) |
| 369 | handle_write_error(mapping, page, res); | 360 | handle_write_error(mapping, page, res); |
| 370 | if (res == WRITEPAGE_ACTIVATE) { | 361 | if (res == AOP_WRITEPAGE_ACTIVATE) { |
| 371 | ClearPageReclaim(page); | 362 | ClearPageReclaim(page); |
| 372 | return PAGE_ACTIVATE; | 363 | return PAGE_ACTIVATE; |
| 373 | } | 364 | } |
| @@ -382,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
| 382 | return PAGE_CLEAN; | 373 | return PAGE_CLEAN; |
| 383 | } | 374 | } |
| 384 | 375 | ||
| 376 | static int remove_mapping(struct address_space *mapping, struct page *page) | ||
| 377 | { | ||
| 378 | if (!mapping) | ||
| 379 | return 0; /* truncate got there first */ | ||
| 380 | |||
| 381 | write_lock_irq(&mapping->tree_lock); | ||
| 382 | |||
| 383 | /* | ||
| 384 | * The non-racy check for busy page. It is critical to check | ||
| 385 | * PageDirty _after_ making sure that the page is freeable and | ||
| 386 | * not in use by anybody. (pagecache + us == 2) | ||
| 387 | */ | ||
| 388 | if (unlikely(page_count(page) != 2)) | ||
| 389 | goto cannot_free; | ||
| 390 | smp_rmb(); | ||
| 391 | if (unlikely(PageDirty(page))) | ||
| 392 | goto cannot_free; | ||
| 393 | |||
| 394 | if (PageSwapCache(page)) { | ||
| 395 | swp_entry_t swap = { .val = page_private(page) }; | ||
| 396 | __delete_from_swap_cache(page); | ||
| 397 | write_unlock_irq(&mapping->tree_lock); | ||
| 398 | swap_free(swap); | ||
| 399 | __put_page(page); /* The pagecache ref */ | ||
| 400 | return 1; | ||
| 401 | } | ||
| 402 | |||
| 403 | __remove_from_page_cache(page); | ||
| 404 | write_unlock_irq(&mapping->tree_lock); | ||
| 405 | __put_page(page); | ||
| 406 | return 1; | ||
| 407 | |||
| 408 | cannot_free: | ||
| 409 | write_unlock_irq(&mapping->tree_lock); | ||
| 410 | return 0; | ||
| 411 | } | ||
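To make the "pagecache + us == 2" comment concrete: a page that is ready to be reclaimed is pinned exactly twice, once by the page cache (or swap cache) radix tree and once by the reference the caller took when it isolated the page, so page_count(page) == 2 is the quiescent state. A count of three or more means some other path, for example a concurrent find_get_page(), still holds or is about to gain a reference, so the page must not be freed; holding mapping->tree_lock across the check keeps new lookups from racing with it.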
| 412 | |||
| 385 | /* | 413 | /* |
| 386 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed | 414 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed |
| 387 | */ | 415 | */ |
| @@ -430,9 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
| 430 | * Try to allocate it some swap space here. | 458 | * Try to allocate it some swap space here. |
| 431 | */ | 459 | */ |
| 432 | if (PageAnon(page) && !PageSwapCache(page)) { | 460 | if (PageAnon(page) && !PageSwapCache(page)) { |
| 433 | if (!sc->may_swap) | 461 | if (!add_to_swap(page, GFP_ATOMIC)) |
| 434 | goto keep_locked; | ||
| 435 | if (!add_to_swap(page)) | ||
| 436 | goto activate_locked; | 462 | goto activate_locked; |
| 437 | } | 463 | } |
| 438 | #endif /* CONFIG_SWAP */ | 464 | #endif /* CONFIG_SWAP */ |
| @@ -515,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
| 515 | goto free_it; | 541 | goto free_it; |
| 516 | } | 542 | } |
| 517 | 543 | ||
| 518 | if (!mapping) | 544 | if (!remove_mapping(mapping, page)) |
| 519 | goto keep_locked; /* truncate got there first */ | 545 | goto keep_locked; |
| 520 | |||
| 521 | write_lock_irq(&mapping->tree_lock); | ||
| 522 | |||
| 523 | /* | ||
| 524 | * The non-racy check for busy page. It is critical to check | ||
| 525 | * PageDirty _after_ making sure that the page is freeable and | ||
| 526 | * not in use by anybody. (pagecache + us == 2) | ||
| 527 | */ | ||
| 528 | if (unlikely(page_count(page) != 2)) | ||
| 529 | goto cannot_free; | ||
| 530 | smp_rmb(); | ||
| 531 | if (unlikely(PageDirty(page))) | ||
| 532 | goto cannot_free; | ||
| 533 | |||
| 534 | #ifdef CONFIG_SWAP | ||
| 535 | if (PageSwapCache(page)) { | ||
| 536 | swp_entry_t swap = { .val = page_private(page) }; | ||
| 537 | __delete_from_swap_cache(page); | ||
| 538 | write_unlock_irq(&mapping->tree_lock); | ||
| 539 | swap_free(swap); | ||
| 540 | __put_page(page); /* The pagecache ref */ | ||
| 541 | goto free_it; | ||
| 542 | } | ||
| 543 | #endif /* CONFIG_SWAP */ | ||
| 544 | |||
| 545 | __remove_from_page_cache(page); | ||
| 546 | write_unlock_irq(&mapping->tree_lock); | ||
| 547 | __put_page(page); | ||
| 548 | 546 | ||
| 549 | free_it: | 547 | free_it: |
| 550 | unlock_page(page); | 548 | unlock_page(page); |
| @@ -553,10 +551,6 @@ free_it: | |||
| 553 | __pagevec_release_nonlru(&freed_pvec); | 551 | __pagevec_release_nonlru(&freed_pvec); |
| 554 | continue; | 552 | continue; |
| 555 | 553 | ||
| 556 | cannot_free: | ||
| 557 | write_unlock_irq(&mapping->tree_lock); | ||
| 558 | goto keep_locked; | ||
| 559 | |||
| 560 | activate_locked: | 554 | activate_locked: |
| 561 | SetPageActive(page); | 555 | SetPageActive(page); |
| 562 | pgactivate++; | 556 | pgactivate++; |
| @@ -574,6 +568,241 @@ keep: | |||
| 574 | return reclaimed; | 568 | return reclaimed; |
| 575 | } | 569 | } |
| 576 | 570 | ||
| 571 | #ifdef CONFIG_MIGRATION | ||
| 572 | static inline void move_to_lru(struct page *page) | ||
| 573 | { | ||
| 574 | list_del(&page->lru); | ||
| 575 | if (PageActive(page)) { | ||
| 576 | /* | ||
| 577 | * lru_cache_add_active checks that | ||
| 578 | * the PG_active bit is off. | ||
| 579 | */ | ||
| 580 | ClearPageActive(page); | ||
| 581 | lru_cache_add_active(page); | ||
| 582 | } else { | ||
| 583 | lru_cache_add(page); | ||
| 584 | } | ||
| 585 | put_page(page); | ||
| 586 | } | ||
| 587 | |||
| 588 | /* | ||
| 589 | * Add isolated pages on the list back to the LRU | ||
| 590 | * | ||
| 591 | * returns the number of pages put back. | ||
| 592 | */ | ||
| 593 | int putback_lru_pages(struct list_head *l) | ||
| 594 | { | ||
| 595 | struct page *page; | ||
| 596 | struct page *page2; | ||
| 597 | int count = 0; | ||
| 598 | |||
| 599 | list_for_each_entry_safe(page, page2, l, lru) { | ||
| 600 | move_to_lru(page); | ||
| 601 | count++; | ||
| 602 | } | ||
| 603 | return count; | ||
| 604 | } | ||
| 605 | |||
| 606 | /* | ||
| 607 | * swapout a single page | ||
| 608 | * page is locked upon entry, unlocked on exit | ||
| 609 | */ | ||
| 610 | static int swap_page(struct page *page) | ||
| 611 | { | ||
| 612 | struct address_space *mapping = page_mapping(page); | ||
| 613 | |||
| 614 | if (page_mapped(page) && mapping) | ||
| 615 | if (try_to_unmap(page) != SWAP_SUCCESS) | ||
| 616 | goto unlock_retry; | ||
| 617 | |||
| 618 | if (PageDirty(page)) { | ||
| 619 | /* Page is dirty, try to write it out here */ | ||
| 620 | switch(pageout(page, mapping)) { | ||
| 621 | case PAGE_KEEP: | ||
| 622 | case PAGE_ACTIVATE: | ||
| 623 | goto unlock_retry; | ||
| 624 | |||
| 625 | case PAGE_SUCCESS: | ||
| 626 | goto retry; | ||
| 627 | |||
| 628 | case PAGE_CLEAN: | ||
| 629 | ; /* try to free the page below */ | ||
| 630 | } | ||
| 631 | } | ||
| 632 | |||
| 633 | if (PagePrivate(page)) { | ||
| 634 | if (!try_to_release_page(page, GFP_KERNEL) || | ||
| 635 | (!mapping && page_count(page) == 1)) | ||
| 636 | goto unlock_retry; | ||
| 637 | } | ||
| 638 | |||
| 639 | if (remove_mapping(mapping, page)) { | ||
| 640 | /* Success */ | ||
| 641 | unlock_page(page); | ||
| 642 | return 0; | ||
| 643 | } | ||
| 644 | |||
| 645 | unlock_retry: | ||
| 646 | unlock_page(page); | ||
| 647 | |||
| 648 | retry: | ||
| 649 | return -EAGAIN; | ||
| 650 | } | ||
| 651 | /* | ||
| 652 | * migrate_pages | ||
| 653 | * | ||
| 654 | * Two lists are passed to this function. The first list | ||
| 655 | * contains the pages isolated from the LRU to be migrated. | ||
| 656 | * The second list contains new pages that the pages isolated | ||
| 657 | * can be moved to. If the second list is NULL then all | ||
| 658 | * pages are swapped out. | ||
| 659 | * | ||
| 660 | * The function returns after 10 attempts or if no pages | ||
| 661 | * are movable anymore because the "to" list has become empty | ||
| 662 | * or no retryable pages exist anymore. | ||
| 663 | * | ||
| 664 | * SIMPLIFIED VERSION: This implementation of migrate_pages | ||
| 665 | * is only swapping out pages and never touches the second | ||
| 666 | * list. The direct migration patchset | ||
| 667 | * extends this function to avoid the use of swap. | ||
| 668 | * | ||
| 669 | * Return: Number of pages not migrated when "to" ran empty. | ||
| 670 | */ | ||
| 671 | int migrate_pages(struct list_head *from, struct list_head *to, | ||
| 672 | struct list_head *moved, struct list_head *failed) | ||
| 673 | { | ||
| 674 | int retry; | ||
| 675 | int nr_failed = 0; | ||
| 676 | int pass = 0; | ||
| 677 | struct page *page; | ||
| 678 | struct page *page2; | ||
| 679 | int swapwrite = current->flags & PF_SWAPWRITE; | ||
| 680 | int rc; | ||
| 681 | |||
| 682 | if (!swapwrite) | ||
| 683 | current->flags |= PF_SWAPWRITE; | ||
| 684 | |||
| 685 | redo: | ||
| 686 | retry = 0; | ||
| 687 | |||
| 688 | list_for_each_entry_safe(page, page2, from, lru) { | ||
| 689 | cond_resched(); | ||
| 690 | |||
| 691 | rc = 0; | ||
| 692 | if (page_count(page) == 1) | ||
| 693 | /* page was freed from under us. So we are done. */ | ||
| 694 | goto next; | ||
| 695 | |||
| 696 | /* | ||
| 697 | * Skip locked pages during the first three passes to give the | ||
| 698 | * functions holding the lock time to release the page. Later we | ||
| 699 | * use lock_page() to have a higher chance of acquiring the | ||
| 700 | * lock. | ||
| 701 | */ | ||
| 702 | rc = -EAGAIN; | ||
| 703 | if (pass > 2) | ||
| 704 | lock_page(page); | ||
| 705 | else | ||
| 706 | if (TestSetPageLocked(page)) | ||
| 707 | goto next; | ||
| 708 | |||
| 709 | /* | ||
| 710 | * Only wait on writeback if we have already done a pass where | ||
| 711 | * we may have triggered writeouts for lots of pages. | ||
| 712 | */ | ||
| 713 | if (pass > 0) { | ||
| 714 | wait_on_page_writeback(page); | ||
| 715 | } else { | ||
| 716 | if (PageWriteback(page)) | ||
| 717 | goto unlock_page; | ||
| 718 | } | ||
| 719 | |||
| 720 | /* | ||
| 721 | * Anonymous pages must have swap cache references otherwise | ||
| 722 | * the information contained in the page maps cannot be | ||
| 723 | * preserved. | ||
| 724 | */ | ||
| 725 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
| 726 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
| 727 | rc = -ENOMEM; | ||
| 728 | goto unlock_page; | ||
| 729 | } | ||
| 730 | } | ||
| 731 | |||
| 732 | /* | ||
| 733 | * Page is properly locked and writeback is complete. | ||
| 734 | * Try to migrate the page. | ||
| 735 | */ | ||
| 736 | rc = swap_page(page); | ||
| 737 | goto next; | ||
| 738 | |||
| 739 | unlock_page: | ||
| 740 | unlock_page(page); | ||
| 741 | |||
| 742 | next: | ||
| 743 | if (rc == -EAGAIN) { | ||
| 744 | retry++; | ||
| 745 | } else if (rc) { | ||
| 746 | /* Permanent failure */ | ||
| 747 | list_move(&page->lru, failed); | ||
| 748 | nr_failed++; | ||
| 749 | } else { | ||
| 750 | /* Success */ | ||
| 751 | list_move(&page->lru, moved); | ||
| 752 | } | ||
| 753 | } | ||
| 754 | if (retry && pass++ < 10) | ||
| 755 | goto redo; | ||
| 756 | |||
| 757 | if (!swapwrite) | ||
| 758 | current->flags &= ~PF_SWAPWRITE; | ||
| 759 | |||
| 760 | return nr_failed + retry; | ||
| 761 | } | ||
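The expected calling pattern for the pieces added in this hunk, isolate_lru_page(), migrate_pages() and putback_lru_pages(), is roughly the sketch below. It is illustrative rather than code from this patch, assumes the declarations this series adds to include/linux/swap.h, and passes a NULL 'to' list because this simplified version only swaps pages out:

#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/list.h>
#include <linux/errno.h>

/* Push one page out to swap via the migration machinery (illustrative). */
static int swap_out_page(struct page *page)
{
        LIST_HEAD(pagelist);
        LIST_HEAD(moved);
        LIST_HEAD(failed);
        int nr_failed;

        /* 1 == taken off the LRU; we now hold the isolating reference */
        if (isolate_lru_page(page) != 1)
                return -EBUSY;
        list_add_tail(&page->lru, &pagelist);

        nr_failed = migrate_pages(&pagelist, NULL, &moved, &failed);

        /* everything goes back onto the LRU; 'failed' pages stay resident */
        putback_lru_pages(&moved);
        putback_lru_pages(&failed);
        if (!list_empty(&pagelist))             /* pages that kept -EAGAIN */
                putback_lru_pages(&pagelist);

        return nr_failed ? -EBUSY : 0;
}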
| 762 | |||
| 763 | static void lru_add_drain_per_cpu(void *dummy) | ||
| 764 | { | ||
| 765 | lru_add_drain(); | ||
| 766 | } | ||
| 767 | |||
| 768 | /* | ||
| 769 | * Isolate one page from the LRU lists and put it on the | ||
| 770 | * indicated list. Do necessary cache draining if the | ||
| 771 | * page is not on the LRU lists yet. | ||
| 772 | * | ||
| 773 | * Result: | ||
| 774 | * 0 = page not on LRU list | ||
| 775 | * 1 = page removed from LRU list and added to the specified list. | ||
| 776 | * -ENOENT = page is being freed elsewhere. | ||
| 777 | */ | ||
| 778 | int isolate_lru_page(struct page *page) | ||
| 779 | { | ||
| 780 | int rc = 0; | ||
| 781 | struct zone *zone = page_zone(page); | ||
| 782 | |||
| 783 | redo: | ||
| 784 | spin_lock_irq(&zone->lru_lock); | ||
| 785 | rc = __isolate_lru_page(page); | ||
| 786 | if (rc == 1) { | ||
| 787 | if (PageActive(page)) | ||
| 788 | del_page_from_active_list(zone, page); | ||
| 789 | else | ||
| 790 | del_page_from_inactive_list(zone, page); | ||
| 791 | } | ||
| 792 | spin_unlock_irq(&zone->lru_lock); | ||
| 793 | if (rc == 0) { | ||
| 794 | /* | ||
| 795 | * Maybe this page is still waiting for a cpu to drain it | ||
| 796 | * from one of the lru lists? | ||
| 797 | */ | ||
| 798 | rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); | ||
| 799 | if (rc == 0 && PageLRU(page)) | ||
| 800 | goto redo; | ||
| 801 | } | ||
| 802 | return rc; | ||
| 803 | } | ||
| 804 | #endif | ||
| 805 | |||
| 577 | /* | 806 | /* |
| 578 | * zone->lru_lock is heavily contended. Some of the functions that | 807 | * zone->lru_lock is heavily contended. Some of the functions that |
| 579 | * shrink the lists perform better by taking out a batch of pages | 808 | * shrink the lists perform better by taking out a batch of pages |
| @@ -602,20 +831,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | |||
| 602 | page = lru_to_page(src); | 831 | page = lru_to_page(src); |
| 603 | prefetchw_prev_lru_page(page, src, flags); | 832 | prefetchw_prev_lru_page(page, src, flags); |
| 604 | 833 | ||
| 605 | if (!TestClearPageLRU(page)) | 834 | switch (__isolate_lru_page(page)) { |
| 606 | BUG(); | 835 | case 1: |
| 607 | list_del(&page->lru); | 836 | /* Succeeded to isolate page */ |
| 608 | if (get_page_testone(page)) { | 837 | list_move(&page->lru, dst); |
| 609 | /* | ||
| 610 | * It is being freed elsewhere | ||
| 611 | */ | ||
| 612 | __put_page(page); | ||
| 613 | SetPageLRU(page); | ||
| 614 | list_add(&page->lru, src); | ||
| 615 | continue; | ||
| 616 | } else { | ||
| 617 | list_add(&page->lru, dst); | ||
| 618 | nr_taken++; | 838 | nr_taken++; |
| 839 | break; | ||
| 840 | case -ENOENT: | ||
| 841 | /* Not possible to isolate */ | ||
| 842 | list_move(&page->lru, src); | ||
| 843 | break; | ||
| 844 | default: | ||
| 845 | BUG(); | ||
| 619 | } | 846 | } |
| 620 | } | 847 | } |
| 621 | 848 | ||
| @@ -653,17 +880,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
| 653 | goto done; | 880 | goto done; |
| 654 | 881 | ||
| 655 | max_scan -= nr_scan; | 882 | max_scan -= nr_scan; |
| 656 | if (current_is_kswapd()) | ||
| 657 | mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | ||
| 658 | else | ||
| 659 | mod_page_state_zone(zone, pgscan_direct, nr_scan); | ||
| 660 | nr_freed = shrink_list(&page_list, sc); | 883 | nr_freed = shrink_list(&page_list, sc); |
| 661 | if (current_is_kswapd()) | ||
| 662 | mod_page_state(kswapd_steal, nr_freed); | ||
| 663 | mod_page_state_zone(zone, pgsteal, nr_freed); | ||
| 664 | sc->nr_to_reclaim -= nr_freed; | ||
| 665 | 884 | ||
| 666 | spin_lock_irq(&zone->lru_lock); | 885 | local_irq_disable(); |
| 886 | if (current_is_kswapd()) { | ||
| 887 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | ||
| 888 | __mod_page_state(kswapd_steal, nr_freed); | ||
| 889 | } else | ||
| 890 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | ||
| 891 | __mod_page_state_zone(zone, pgsteal, nr_freed); | ||
| 892 | |||
| 893 | spin_lock(&zone->lru_lock); | ||
| 667 | /* | 894 | /* |
| 668 | * Put back any unfreeable pages. | 895 | * Put back any unfreeable pages. |
| 669 | */ | 896 | */ |
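
Both this hunk and the refill_inactive_zone() hunk that follows switch from the irq-safe mod_page_state*() helpers, which each disable interrupts around a single update, to one explicit local_irq_disable() covering several __-prefixed updates plus a plain spin_lock() on lru_lock. Assuming the usual 2.6-era page-state convention (the __ variants expect the caller to have interrupts off already), the point is to pay for one interrupt toggle per batch instead of one per counter. A plain-C sketch of that batching idea, with a pthread mutex standing in for interrupt disabling and invented counter names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static long stats[4];   /* pgscan_kswapd, pgscan_direct, pgsteal, kswapd_steal */

static void mod_stat(int idx, long delta)       /* "irq-safe": locks per call */
{
        pthread_mutex_lock(&stat_lock);
        stats[idx] += delta;
        pthread_mutex_unlock(&stat_lock);
}

static void __mod_stat(int idx, long delta)     /* caller already holds stat_lock */
{
        stats[idx] += delta;
}

int main(void)
{
        /* One lock round-trip for the whole batch of updates. */
        pthread_mutex_lock(&stat_lock);
        __mod_stat(1, 32);      /* pgscan_direct += nr_scan */
        __mod_stat(2, 28);      /* pgsteal += nr_freed */
        pthread_mutex_unlock(&stat_lock);

        mod_stat(0, 32);        /* equivalent single update, one lock per call */
        printf("%ld %ld %ld\n", stats[0], stats[1], stats[2]);
        return 0;
}
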
| @@ -825,11 +1052,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
| 825 | } | 1052 | } |
| 826 | } | 1053 | } |
| 827 | zone->nr_active += pgmoved; | 1054 | zone->nr_active += pgmoved; |
| 828 | spin_unlock_irq(&zone->lru_lock); | 1055 | spin_unlock(&zone->lru_lock); |
| 829 | pagevec_release(&pvec); | ||
| 830 | 1056 | ||
| 831 | mod_page_state_zone(zone, pgrefill, pgscanned); | 1057 | __mod_page_state_zone(zone, pgrefill, pgscanned); |
| 832 | mod_page_state(pgdeactivate, pgdeactivate); | 1058 | __mod_page_state(pgdeactivate, pgdeactivate); |
| 1059 | local_irq_enable(); | ||
| 1060 | |||
| 1061 | pagevec_release(&pvec); | ||
| 833 | } | 1062 | } |
| 834 | 1063 | ||
| 835 | /* | 1064 | /* |
| @@ -861,8 +1090,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 861 | else | 1090 | else |
| 862 | nr_inactive = 0; | 1091 | nr_inactive = 0; |
| 863 | 1092 | ||
| 864 | sc->nr_to_reclaim = sc->swap_cluster_max; | ||
| 865 | |||
| 866 | while (nr_active || nr_inactive) { | 1093 | while (nr_active || nr_inactive) { |
| 867 | if (nr_active) { | 1094 | if (nr_active) { |
| 868 | sc->nr_to_scan = min(nr_active, | 1095 | sc->nr_to_scan = min(nr_active, |
| @@ -876,8 +1103,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 876 | (unsigned long)sc->swap_cluster_max); | 1103 | (unsigned long)sc->swap_cluster_max); |
| 877 | nr_inactive -= sc->nr_to_scan; | 1104 | nr_inactive -= sc->nr_to_scan; |
| 878 | shrink_cache(zone, sc); | 1105 | shrink_cache(zone, sc); |
| 879 | if (sc->nr_to_reclaim <= 0) | ||
| 880 | break; | ||
| 881 | } | 1106 | } |
| 882 | } | 1107 | } |
| 883 | 1108 | ||
| @@ -910,7 +1135,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
| 910 | for (i = 0; zones[i] != NULL; i++) { | 1135 | for (i = 0; zones[i] != NULL; i++) { |
| 911 | struct zone *zone = zones[i]; | 1136 | struct zone *zone = zones[i]; |
| 912 | 1137 | ||
| 913 | if (zone->present_pages == 0) | 1138 | if (!populated_zone(zone)) |
| 914 | continue; | 1139 | continue; |
| 915 | 1140 | ||
| 916 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 1141 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
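
This hunk, the two balance_pgdat() hunks and the wakeup_kswapd() hunk below all replace the open-coded zone->present_pages == 0 test with a populated_zone() helper whose definition is not part of this diff. Presumably it is a trivial wrapper over the same field; a standalone sketch under that assumption, with a cut-down struct zone stub so it compiles on its own:

#include <stdio.h>

/* Minimal stand-in for the kernel's struct zone; only the field the
 * helper reads is modelled here. */
struct zone {
        unsigned long present_pages;
};

/* Assumed shape of populated_zone(): the hunks read as a direct
 * replacement for the old "zone->present_pages == 0" check. */
static inline int populated_zone(struct zone *zone)
{
        return !!zone->present_pages;
}

int main(void)
{
        struct zone empty = { 0 }, z = { .present_pages = 4096 };

        printf("%d %d\n", populated_zone(&empty), populated_zone(&z));
        return 0;
}
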
| @@ -952,7 +1177,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
| 952 | 1177 | ||
| 953 | sc.gfp_mask = gfp_mask; | 1178 | sc.gfp_mask = gfp_mask; |
| 954 | sc.may_writepage = 0; | 1179 | sc.may_writepage = 0; |
| 955 | sc.may_swap = 1; | ||
| 956 | 1180 | ||
| 957 | inc_page_state(allocstall); | 1181 | inc_page_state(allocstall); |
| 958 | 1182 | ||
| @@ -1055,7 +1279,6 @@ loop_again: | |||
| 1055 | total_reclaimed = 0; | 1279 | total_reclaimed = 0; |
| 1056 | sc.gfp_mask = GFP_KERNEL; | 1280 | sc.gfp_mask = GFP_KERNEL; |
| 1057 | sc.may_writepage = 0; | 1281 | sc.may_writepage = 0; |
| 1058 | sc.may_swap = 1; | ||
| 1059 | sc.nr_mapped = read_page_state(nr_mapped); | 1282 | sc.nr_mapped = read_page_state(nr_mapped); |
| 1060 | 1283 | ||
| 1061 | inc_page_state(pageoutrun); | 1284 | inc_page_state(pageoutrun); |
| @@ -1084,7 +1307,7 @@ loop_again: | |||
| 1084 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | 1307 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { |
| 1085 | struct zone *zone = pgdat->node_zones + i; | 1308 | struct zone *zone = pgdat->node_zones + i; |
| 1086 | 1309 | ||
| 1087 | if (zone->present_pages == 0) | 1310 | if (!populated_zone(zone)) |
| 1088 | continue; | 1311 | continue; |
| 1089 | 1312 | ||
| 1090 | if (zone->all_unreclaimable && | 1313 | if (zone->all_unreclaimable && |
| @@ -1121,7 +1344,7 @@ scan: | |||
| 1121 | struct zone *zone = pgdat->node_zones + i; | 1344 | struct zone *zone = pgdat->node_zones + i; |
| 1122 | int nr_slab; | 1345 | int nr_slab; |
| 1123 | 1346 | ||
| 1124 | if (zone->present_pages == 0) | 1347 | if (!populated_zone(zone)) |
| 1125 | continue; | 1348 | continue; |
| 1126 | 1349 | ||
| 1127 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1350 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| @@ -1238,7 +1461,7 @@ static int kswapd(void *p) | |||
| 1238 | * us from recursively trying to free more memory as we're | 1461 | * us from recursively trying to free more memory as we're |
| 1239 | * trying to free the first piece of memory in the first place). | 1462 | * trying to free the first piece of memory in the first place). |
| 1240 | */ | 1463 | */ |
| 1241 | tsk->flags |= PF_MEMALLOC|PF_KSWAPD; | 1464 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
| 1242 | 1465 | ||
| 1243 | order = 0; | 1466 | order = 0; |
| 1244 | for ( ; ; ) { | 1467 | for ( ; ; ) { |
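
kswapd now keeps PF_SWAPWRITE set for its whole lifetime, whereas the migration path earlier in this file only enables it around the work and then restores whatever the caller had (the swapwrite test before clearing the flag near the end of that loop). A standalone model of that save-and-restore flag handling in plain C (the flag value and function name are invented for illustration):

#include <stdio.h>

#define PF_SWAPWRITE 0x00800000UL       /* illustrative value, not authoritative */

static unsigned long task_flags;        /* stand-in for current->flags */

static void migrate_with_swapwrite(void)
{
        /* Remember whether the caller already allowed writing to swap. */
        unsigned long swapwrite = task_flags & PF_SWAPWRITE;

        if (!swapwrite)
                task_flags |= PF_SWAPWRITE;

        /* ... page writeback / migration work would happen here ... */

        if (!swapwrite)
                task_flags &= ~PF_SWAPWRITE;    /* restore caller's setting */
}

int main(void)
{
        migrate_with_swapwrite();
        printf("after temporary use: %#lx\n", task_flags);     /* back to 0 */

        task_flags |= PF_SWAPWRITE;     /* kswapd-style: set for the task's lifetime */
        migrate_with_swapwrite();
        printf("kswapd-style task:   %#lx\n", task_flags);     /* still set */
        return 0;
}
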
| @@ -1273,7 +1496,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
| 1273 | { | 1496 | { |
| 1274 | pg_data_t *pgdat; | 1497 | pg_data_t *pgdat; |
| 1275 | 1498 | ||
| 1276 | if (zone->present_pages == 0) | 1499 | if (!populated_zone(zone)) |
| 1277 | return; | 1500 | return; |
| 1278 | 1501 | ||
| 1279 | pgdat = zone->zone_pgdat; | 1502 | pgdat = zone->zone_pgdat; |
| @@ -1353,76 +1576,3 @@ static int __init kswapd_init(void) | |||
| 1353 | } | 1576 | } |
| 1354 | 1577 | ||
| 1355 | module_init(kswapd_init) | 1578 | module_init(kswapd_init) |
| 1356 | |||
| 1357 | |||
| 1358 | /* | ||
| 1359 | * Try to free up some pages from this zone through reclaim. | ||
| 1360 | */ | ||
| 1361 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | ||
| 1362 | { | ||
| 1363 | struct scan_control sc; | ||
| 1364 | int nr_pages = 1 << order; | ||
| 1365 | int total_reclaimed = 0; | ||
| 1366 | |||
| 1367 | /* The reclaim may sleep, so don't do it if sleep isn't allowed */ | ||
| 1368 | if (!(gfp_mask & __GFP_WAIT)) | ||
| 1369 | return 0; | ||
| 1370 | if (zone->all_unreclaimable) | ||
| 1371 | return 0; | ||
| 1372 | |||
| 1373 | sc.gfp_mask = gfp_mask; | ||
| 1374 | sc.may_writepage = 0; | ||
| 1375 | sc.may_swap = 0; | ||
| 1376 | sc.nr_mapped = read_page_state(nr_mapped); | ||
| 1377 | sc.nr_scanned = 0; | ||
| 1378 | sc.nr_reclaimed = 0; | ||
| 1379 | /* scan at the highest priority */ | ||
| 1380 | sc.priority = 0; | ||
| 1381 | disable_swap_token(); | ||
| 1382 | |||
| 1383 | if (nr_pages > SWAP_CLUSTER_MAX) | ||
| 1384 | sc.swap_cluster_max = nr_pages; | ||
| 1385 | else | ||
| 1386 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
| 1387 | |||
| 1388 | /* Don't reclaim the zone if there are other reclaimers active */ | ||
| 1389 | if (atomic_read(&zone->reclaim_in_progress) > 0) | ||
| 1390 | goto out; | ||
| 1391 | |||
| 1392 | shrink_zone(zone, &sc); | ||
| 1393 | total_reclaimed = sc.nr_reclaimed; | ||
| 1394 | |||
| 1395 | out: | ||
| 1396 | return total_reclaimed; | ||
| 1397 | } | ||
| 1398 | |||
| 1399 | asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, | ||
| 1400 | unsigned int state) | ||
| 1401 | { | ||
| 1402 | struct zone *z; | ||
| 1403 | int i; | ||
| 1404 | |||
| 1405 | if (!capable(CAP_SYS_ADMIN)) | ||
| 1406 | return -EACCES; | ||
| 1407 | |||
| 1408 | if (node >= MAX_NUMNODES || !node_online(node)) | ||
| 1409 | return -EINVAL; | ||
| 1410 | |||
| 1411 | /* This will break if we ever add more zones */ | ||
| 1412 | if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM))) | ||
| 1413 | return -EINVAL; | ||
| 1414 | |||
| 1415 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
| 1416 | if (!(zone & 1<<i)) | ||
| 1417 | continue; | ||
| 1418 | |||
| 1419 | z = &NODE_DATA(node)->node_zones[i]; | ||
| 1420 | |||
| 1421 | if (state) | ||
| 1422 | z->reclaim_pages = 1; | ||
| 1423 | else | ||
| 1424 | z->reclaim_pages = 0; | ||
| 1425 | } | ||
| 1426 | |||
| 1427 | return 0; | ||
| 1428 | } | ||
