diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 10 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/bootmem.c | 41 | ||||
-rw-r--r-- | mm/fadvise.c | 46 | ||||
-rw-r--r-- | mm/filemap.c | 43 | ||||
-rw-r--r-- | mm/highmem.c | 26 | ||||
-rw-r--r-- | mm/hugetlb.c | 286 | ||||
-rw-r--r-- | mm/internal.h | 34 | ||||
-rw-r--r-- | mm/memory.c | 21 | ||||
-rw-r--r-- | mm/mempolicy.c | 151 | ||||
-rw-r--r-- | mm/mempool.c | 50 | ||||
-rw-r--r-- | mm/migrate.c | 655 | ||||
-rw-r--r-- | mm/mmap.c | 16 | ||||
-rw-r--r-- | mm/mmzone.c | 50 | ||||
-rw-r--r-- | mm/mprotect.c | 12 | ||||
-rw-r--r-- | mm/msync.c | 139 | ||||
-rw-r--r-- | mm/nommu.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 64 | ||||
-rw-r--r-- | mm/page_alloc.c | 180 | ||||
-rw-r--r-- | mm/readahead.c | 33 | ||||
-rw-r--r-- | mm/rmap.c | 14 | ||||
-rw-r--r-- | mm/shmem.c | 7 | ||||
-rw-r--r-- | mm/slab.c | 1233 | ||||
-rw-r--r-- | mm/slob.c | 10 | ||||
-rw-r--r-- | mm/swap.c | 66 | ||||
-rw-r--r-- | mm/swap_state.c | 1 | ||||
-rw-r--r-- | mm/swapfile.c | 59 | ||||
-rw-r--r-- | mm/util.c | 47 | ||||
-rw-r--r-- | mm/vmscan.c | 888 |
29 files changed, 2603 insertions, 1587 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a9cb80ae6409..332f5c29b53a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS | |||
137 | # support for page migration | 137 | # support for page migration |
138 | # | 138 | # |
139 | config MIGRATION | 139 | config MIGRATION |
140 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM | 140 | bool "Page migration" |
141 | depends on SWAP | 141 | def_bool y if NUMA |
142 | depends on SWAP && NUMA | ||
143 | help | ||
144 | Allows the migration of the physical location of pages of processes | ||
145 | while the virtual addresses are not changed. This is useful for | ||
146 | example on NUMA systems to put pages nearer to the processors accessing | ||
147 | the page. | ||
diff --git a/mm/Makefile b/mm/Makefile index 9aa03fa1dcc3..0b8f73f2ed16 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o util.o $(mmu-y) | 13 | prio_tree.o util.o mmzone.o $(mmu-y) |
14 | 14 | ||
15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o | |||
22 | obj-$(CONFIG_SLAB) += slab.o | 22 | obj-$(CONFIG_SLAB) += slab.o |
23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
25 | obj-$(CONFIG_MIGRATION) += migrate.o | ||
26 | |||
diff --git a/mm/bootmem.c b/mm/bootmem.c index 35c32290f717..d3e3bd2ffcea 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn); /* This is exported so | |||
33 | * dma_get_required_mask(), which uses | 33 | * dma_get_required_mask(), which uses |
34 | * it, can be an inline function */ | 34 | * it, can be an inline function */ |
35 | 35 | ||
36 | static LIST_HEAD(bdata_list); | ||
36 | #ifdef CONFIG_CRASH_DUMP | 37 | #ifdef CONFIG_CRASH_DUMP |
37 | /* | 38 | /* |
38 | * If we have booted due to a crash, max_pfn will be a very low value. We need | 39 | * If we have booted due to a crash, max_pfn will be a very low value. We need |
@@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages) | |||
52 | 53 | ||
53 | return mapsize; | 54 | return mapsize; |
54 | } | 55 | } |
56 | /* | ||
57 | * link bdata in order | ||
58 | */ | ||
59 | static void link_bootmem(bootmem_data_t *bdata) | ||
60 | { | ||
61 | bootmem_data_t *ent; | ||
62 | if (list_empty(&bdata_list)) { | ||
63 | list_add(&bdata->list, &bdata_list); | ||
64 | return; | ||
65 | } | ||
66 | /* insert in order */ | ||
67 | list_for_each_entry(ent, &bdata_list, list) { | ||
68 | if (bdata->node_boot_start < ent->node_boot_start) { | ||
69 | list_add_tail(&bdata->list, &ent->list); | ||
70 | return; | ||
71 | } | ||
72 | } | ||
73 | list_add_tail(&bdata->list, &bdata_list); | ||
74 | return; | ||
75 | } | ||
76 | |||
55 | 77 | ||
56 | /* | 78 | /* |
57 | * Called once to set up the allocator itself. | 79 | * Called once to set up the allocator itself. |
@@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat, | |||
62 | bootmem_data_t *bdata = pgdat->bdata; | 84 | bootmem_data_t *bdata = pgdat->bdata; |
63 | unsigned long mapsize = ((end - start)+7)/8; | 85 | unsigned long mapsize = ((end - start)+7)/8; |
64 | 86 | ||
65 | pgdat->pgdat_next = pgdat_list; | ||
66 | pgdat_list = pgdat; | ||
67 | |||
68 | mapsize = ALIGN(mapsize, sizeof(long)); | 87 | mapsize = ALIGN(mapsize, sizeof(long)); |
69 | bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); | 88 | bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT); |
70 | bdata->node_boot_start = (start << PAGE_SHIFT); | 89 | bdata->node_boot_start = (start << PAGE_SHIFT); |
71 | bdata->node_low_pfn = end; | 90 | bdata->node_low_pfn = end; |
91 | link_bootmem(bdata); | ||
72 | 92 | ||
73 | /* | 93 | /* |
74 | * Initially all pages are reserved - setup_arch() has to | 94 | * Initially all pages are reserved - setup_arch() has to |
@@ -152,7 +172,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | |||
152 | * | 172 | * |
153 | * NOTE: This function is _not_ reentrant. | 173 | * NOTE: This function is _not_ reentrant. |
154 | */ | 174 | */ |
155 | static void * __init | 175 | void * __init |
156 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | 176 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, |
157 | unsigned long align, unsigned long goal, unsigned long limit) | 177 | unsigned long align, unsigned long goal, unsigned long limit) |
158 | { | 178 | { |
@@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void) | |||
383 | 403 | ||
384 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) | 404 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) |
385 | { | 405 | { |
386 | pg_data_t *pgdat = pgdat_list; | 406 | bootmem_data_t *bdata; |
387 | void *ptr; | 407 | void *ptr; |
388 | 408 | ||
389 | for_each_pgdat(pgdat) | 409 | list_for_each_entry(bdata, &bdata_list, list) |
390 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, | 410 | if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0))) |
391 | align, goal, 0))) | ||
392 | return(ptr); | 411 | return(ptr); |
393 | 412 | ||
394 | /* | 413 | /* |
@@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne | |||
416 | 435 | ||
417 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) | 436 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) |
418 | { | 437 | { |
419 | pg_data_t *pgdat = pgdat_list; | 438 | bootmem_data_t *bdata; |
420 | void *ptr; | 439 | void *ptr; |
421 | 440 | ||
422 | for_each_pgdat(pgdat) | 441 | list_for_each_entry(bdata, &bdata_list, list) |
423 | if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, | 442 | if ((ptr = __alloc_bootmem_core(bdata, size, |
424 | align, goal, LOW32LIMIT))) | 443 | align, goal, LOW32LIMIT))) |
425 | return(ptr); | 444 | return(ptr); |
426 | 445 | ||
diff --git a/mm/fadvise.c b/mm/fadvise.c index d257c89e7704..907c39257ca0 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <linux/pagevec.h> | 16 | #include <linux/pagevec.h> |
17 | #include <linux/fadvise.h> | 17 | #include <linux/fadvise.h> |
18 | #include <linux/writeback.h> | ||
18 | #include <linux/syscalls.h> | 19 | #include <linux/syscalls.h> |
19 | 20 | ||
20 | #include <asm/unistd.h> | 21 | #include <asm/unistd.h> |
@@ -22,13 +23,36 @@ | |||
22 | /* | 23 | /* |
23 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could | 24 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could |
24 | * deactivate the pages and clear PG_Referenced. | 25 | * deactivate the pages and clear PG_Referenced. |
26 | * | ||
27 | * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file | ||
28 | * offsets `offset' and `offset+len' inclusive. Any pages which are currently | ||
29 | * under writeout are skipped, whether or not they are dirty. | ||
30 | * | ||
31 | * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file | ||
32 | * offsets `offset' and `offset+len'. | ||
33 | * | ||
34 | * By combining these two operations the application may do several things: | ||
35 | * | ||
36 | * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. | ||
37 | * | ||
38 | * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently | ||
39 | * dirty pages at the disk. | ||
40 | * | ||
41 | * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push | ||
42 | * all of the currently dirty pages at the disk, wait until they have been | ||
43 | * written. | ||
44 | * | ||
45 | * It should be noted that none of these operations write out the file's | ||
46 | * metadata. So unless the application is strictly performing overwrites of | ||
47 | * already-instantiated disk blocks, there are no guarantees here that the data | ||
48 | * will be available after a crash. | ||
25 | */ | 49 | */ |
26 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | 50 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) |
27 | { | 51 | { |
28 | struct file *file = fget(fd); | 52 | struct file *file = fget(fd); |
29 | struct address_space *mapping; | 53 | struct address_space *mapping; |
30 | struct backing_dev_info *bdi; | 54 | struct backing_dev_info *bdi; |
31 | loff_t endbyte; | 55 | loff_t endbyte; /* inclusive */ |
32 | pgoff_t start_index; | 56 | pgoff_t start_index; |
33 | pgoff_t end_index; | 57 | pgoff_t end_index; |
34 | unsigned long nrpages; | 58 | unsigned long nrpages; |
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
56 | endbyte = offset + len; | 80 | endbyte = offset + len; |
57 | if (!len || endbyte < len) | 81 | if (!len || endbyte < len) |
58 | endbyte = -1; | 82 | endbyte = -1; |
83 | else | ||
84 | endbyte--; /* inclusive */ | ||
59 | 85 | ||
60 | bdi = mapping->backing_dev_info; | 86 | bdi = mapping->backing_dev_info; |
61 | 87 | ||
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
78 | 104 | ||
79 | /* First and last PARTIAL page! */ | 105 | /* First and last PARTIAL page! */ |
80 | start_index = offset >> PAGE_CACHE_SHIFT; | 106 | start_index = offset >> PAGE_CACHE_SHIFT; |
81 | end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; | 107 | end_index = endbyte >> PAGE_CACHE_SHIFT; |
82 | 108 | ||
83 | /* Careful about overflow on the "+1" */ | 109 | /* Careful about overflow on the "+1" */ |
84 | nrpages = end_index - start_index + 1; | 110 | nrpages = end_index - start_index + 1; |
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
96 | filemap_flush(mapping); | 122 | filemap_flush(mapping); |
97 | 123 | ||
98 | /* First and last FULL page! */ | 124 | /* First and last FULL page! */ |
99 | start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 125 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
100 | end_index = (endbyte >> PAGE_CACHE_SHIFT); | 126 | end_index = (endbyte >> PAGE_CACHE_SHIFT); |
101 | 127 | ||
102 | if (end_index > start_index) | 128 | if (end_index >= start_index) |
103 | invalidate_mapping_pages(mapping, start_index, end_index-1); | 129 | invalidate_mapping_pages(mapping, start_index, |
130 | end_index); | ||
131 | break; | ||
132 | case LINUX_FADV_ASYNC_WRITE: | ||
133 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | ||
134 | WB_SYNC_NONE); | ||
135 | break; | ||
136 | case LINUX_FADV_WRITE_WAIT: | ||
137 | ret = wait_on_page_writeback_range(mapping, | ||
138 | offset >> PAGE_CACHE_SHIFT, | ||
139 | endbyte >> PAGE_CACHE_SHIFT); | ||
104 | break; | 140 | break; |
105 | default: | 141 | default: |
106 | ret = -EINVAL; | 142 | ret = -EINVAL; |
diff --git a/mm/filemap.c b/mm/filemap.c index 7624c26fcea6..1120338a5d0f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -29,7 +29,10 @@ | |||
29 | #include <linux/blkdev.h> | 29 | #include <linux/blkdev.h> |
30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
32 | #include <linux/cpuset.h> | ||
32 | #include "filemap.h" | 33 | #include "filemap.h" |
34 | #include "internal.h" | ||
35 | |||
33 | /* | 36 | /* |
34 | * FIXME: remove all knowledge of the buffer layer from the core VM | 37 | * FIXME: remove all knowledge of the buffer layer from the core VM |
35 | */ | 38 | */ |
@@ -172,7 +175,7 @@ static int sync_page(void *word) | |||
172 | * dirty pages that lie within the byte offsets <start, end> | 175 | * dirty pages that lie within the byte offsets <start, end> |
173 | * @mapping: address space structure to write | 176 | * @mapping: address space structure to write |
174 | * @start: offset in bytes where the range starts | 177 | * @start: offset in bytes where the range starts |
175 | * @end: offset in bytes where the range ends | 178 | * @end: offset in bytes where the range ends (inclusive) |
176 | * @sync_mode: enable synchronous operation | 179 | * @sync_mode: enable synchronous operation |
177 | * | 180 | * |
178 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | 181 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as |
@@ -180,8 +183,8 @@ static int sync_page(void *word) | |||
180 | * these two operations is that if a dirty page/buffer is encountered, it must | 183 | * these two operations is that if a dirty page/buffer is encountered, it must |
181 | * be waited upon, and not just skipped over. | 184 | * be waited upon, and not just skipped over. |
182 | */ | 185 | */ |
183 | static int __filemap_fdatawrite_range(struct address_space *mapping, | 186 | int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
184 | loff_t start, loff_t end, int sync_mode) | 187 | loff_t end, int sync_mode) |
185 | { | 188 | { |
186 | int ret; | 189 | int ret; |
187 | struct writeback_control wbc = { | 190 | struct writeback_control wbc = { |
@@ -210,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping) | |||
210 | } | 213 | } |
211 | EXPORT_SYMBOL(filemap_fdatawrite); | 214 | EXPORT_SYMBOL(filemap_fdatawrite); |
212 | 215 | ||
213 | static int filemap_fdatawrite_range(struct address_space *mapping, | 216 | static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
214 | loff_t start, loff_t end) | 217 | loff_t end) |
215 | { | 218 | { |
216 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | 219 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); |
217 | } | 220 | } |
@@ -230,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush); | |||
230 | * Wait for writeback to complete against pages indexed by start->end | 233 | * Wait for writeback to complete against pages indexed by start->end |
231 | * inclusive | 234 | * inclusive |
232 | */ | 235 | */ |
233 | static int wait_on_page_writeback_range(struct address_space *mapping, | 236 | int wait_on_page_writeback_range(struct address_space *mapping, |
234 | pgoff_t start, pgoff_t end) | 237 | pgoff_t start, pgoff_t end) |
235 | { | 238 | { |
236 | struct pagevec pvec; | 239 | struct pagevec pvec; |
@@ -365,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping) | |||
365 | } | 368 | } |
366 | EXPORT_SYMBOL(filemap_write_and_wait); | 369 | EXPORT_SYMBOL(filemap_write_and_wait); |
367 | 370 | ||
371 | /* | ||
372 | * Write out and wait upon file offsets lstart->lend, inclusive. | ||
373 | * | ||
374 | * Note that `lend' is inclusive (describes the last byte to be written) so | ||
375 | * that this function can be used to write to the very end-of-file (end = -1). | ||
376 | */ | ||
368 | int filemap_write_and_wait_range(struct address_space *mapping, | 377 | int filemap_write_and_wait_range(struct address_space *mapping, |
369 | loff_t lstart, loff_t lend) | 378 | loff_t lstart, loff_t lend) |
370 | { | 379 | { |
@@ -425,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
425 | return ret; | 434 | return ret; |
426 | } | 435 | } |
427 | 436 | ||
437 | #ifdef CONFIG_NUMA | ||
438 | struct page *page_cache_alloc(struct address_space *x) | ||
439 | { | ||
440 | if (cpuset_do_page_mem_spread()) { | ||
441 | int n = cpuset_mem_spread_node(); | ||
442 | return alloc_pages_node(n, mapping_gfp_mask(x), 0); | ||
443 | } | ||
444 | return alloc_pages(mapping_gfp_mask(x), 0); | ||
445 | } | ||
446 | EXPORT_SYMBOL(page_cache_alloc); | ||
447 | |||
448 | struct page *page_cache_alloc_cold(struct address_space *x) | ||
449 | { | ||
450 | if (cpuset_do_page_mem_spread()) { | ||
451 | int n = cpuset_mem_spread_node(); | ||
452 | return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); | ||
453 | } | ||
454 | return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); | ||
455 | } | ||
456 | EXPORT_SYMBOL(page_cache_alloc_cold); | ||
457 | #endif | ||
458 | |||
428 | /* | 459 | /* |
429 | * In order to wait for pages to become available there must be | 460 | * In order to wait for pages to become available there must be |
430 | * waitqueues associated with pages. By using a hash table of | 461 | * waitqueues associated with pages. By using a hash table of |
diff --git a/mm/highmem.c b/mm/highmem.c index ce2e7e8bbfa7..55885f64af40 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -26,18 +26,14 @@ | |||
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/hash.h> | 27 | #include <linux/hash.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/blktrace_api.h> | ||
29 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
30 | 31 | ||
31 | static mempool_t *page_pool, *isa_page_pool; | 32 | static mempool_t *page_pool, *isa_page_pool; |
32 | 33 | ||
33 | static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data) | 34 | static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) |
34 | { | 35 | { |
35 | return alloc_page(gfp_mask | GFP_DMA); | 36 | return mempool_alloc_pages(gfp_mask | GFP_DMA, data); |
36 | } | ||
37 | |||
38 | static void page_pool_free(void *page, void *data) | ||
39 | { | ||
40 | __free_page(page); | ||
41 | } | 37 | } |
42 | 38 | ||
43 | /* | 39 | /* |
@@ -50,11 +46,6 @@ static void page_pool_free(void *page, void *data) | |||
50 | */ | 46 | */ |
51 | #ifdef CONFIG_HIGHMEM | 47 | #ifdef CONFIG_HIGHMEM |
52 | 48 | ||
53 | static void *page_pool_alloc(gfp_t gfp_mask, void *data) | ||
54 | { | ||
55 | return alloc_page(gfp_mask); | ||
56 | } | ||
57 | |||
58 | static int pkmap_count[LAST_PKMAP]; | 49 | static int pkmap_count[LAST_PKMAP]; |
59 | static unsigned int last_pkmap_nr; | 50 | static unsigned int last_pkmap_nr; |
60 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); | 51 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock); |
@@ -228,7 +219,7 @@ static __init int init_emergency_pool(void) | |||
228 | if (!i.totalhigh) | 219 | if (!i.totalhigh) |
229 | return 0; | 220 | return 0; |
230 | 221 | ||
231 | page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL); | 222 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); |
232 | if (!page_pool) | 223 | if (!page_pool) |
233 | BUG(); | 224 | BUG(); |
234 | printk("highmem bounce pool size: %d pages\n", POOL_SIZE); | 225 | printk("highmem bounce pool size: %d pages\n", POOL_SIZE); |
@@ -271,7 +262,8 @@ int init_emergency_isa_pool(void) | |||
271 | if (isa_page_pool) | 262 | if (isa_page_pool) |
272 | return 0; | 263 | return 0; |
273 | 264 | ||
274 | isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL); | 265 | isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, |
266 | mempool_free_pages, (void *) 0); | ||
275 | if (!isa_page_pool) | 267 | if (!isa_page_pool) |
276 | BUG(); | 268 | BUG(); |
277 | 269 | ||
@@ -336,7 +328,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | |||
336 | bio_put(bio); | 328 | bio_put(bio); |
337 | } | 329 | } |
338 | 330 | ||
339 | static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err) | 331 | static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err) |
340 | { | 332 | { |
341 | if (bio->bi_size) | 333 | if (bio->bi_size) |
342 | return 1; | 334 | return 1; |
@@ -383,7 +375,7 @@ static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int | |||
383 | } | 375 | } |
384 | 376 | ||
385 | static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | 377 | static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, |
386 | mempool_t *pool) | 378 | mempool_t *pool) |
387 | { | 379 | { |
388 | struct page *page; | 380 | struct page *page; |
389 | struct bio *bio = NULL; | 381 | struct bio *bio = NULL; |
@@ -483,6 +475,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) | |||
483 | pool = isa_page_pool; | 475 | pool = isa_page_pool; |
484 | } | 476 | } |
485 | 477 | ||
478 | blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | ||
479 | |||
486 | /* | 480 | /* |
487 | * slow path | 481 | * slow path |
488 | */ | 482 | */ |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 508707704d2c..ebad6bbb3501 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -13,24 +13,48 @@ | |||
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/mempolicy.h> | 14 | #include <linux/mempolicy.h> |
15 | #include <linux/cpuset.h> | 15 | #include <linux/cpuset.h> |
16 | #include <linux/mutex.h> | ||
16 | 17 | ||
17 | #include <asm/page.h> | 18 | #include <asm/page.h> |
18 | #include <asm/pgtable.h> | 19 | #include <asm/pgtable.h> |
19 | 20 | ||
20 | #include <linux/hugetlb.h> | 21 | #include <linux/hugetlb.h> |
22 | #include "internal.h" | ||
21 | 23 | ||
22 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
23 | static unsigned long nr_huge_pages, free_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; |
24 | unsigned long max_huge_pages; | 26 | unsigned long max_huge_pages; |
25 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
26 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
27 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | 29 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; |
28 | |||
29 | /* | 30 | /* |
30 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 31 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
31 | */ | 32 | */ |
32 | static DEFINE_SPINLOCK(hugetlb_lock); | 33 | static DEFINE_SPINLOCK(hugetlb_lock); |
33 | 34 | ||
35 | static void clear_huge_page(struct page *page, unsigned long addr) | ||
36 | { | ||
37 | int i; | ||
38 | |||
39 | might_sleep(); | ||
40 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { | ||
41 | cond_resched(); | ||
42 | clear_user_highpage(page + i, addr); | ||
43 | } | ||
44 | } | ||
45 | |||
46 | static void copy_huge_page(struct page *dst, struct page *src, | ||
47 | unsigned long addr) | ||
48 | { | ||
49 | int i; | ||
50 | |||
51 | might_sleep(); | ||
52 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | ||
53 | cond_resched(); | ||
54 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); | ||
55 | } | ||
56 | } | ||
57 | |||
34 | static void enqueue_huge_page(struct page *page) | 58 | static void enqueue_huge_page(struct page *page) |
35 | { | 59 | { |
36 | int nid = page_to_nid(page); | 60 | int nid = page_to_nid(page); |
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
64 | return page; | 88 | return page; |
65 | } | 89 | } |
66 | 90 | ||
67 | static struct page *alloc_fresh_huge_page(void) | 91 | static void free_huge_page(struct page *page) |
92 | { | ||
93 | BUG_ON(page_count(page)); | ||
94 | |||
95 | INIT_LIST_HEAD(&page->lru); | ||
96 | |||
97 | spin_lock(&hugetlb_lock); | ||
98 | enqueue_huge_page(page); | ||
99 | spin_unlock(&hugetlb_lock); | ||
100 | } | ||
101 | |||
102 | static int alloc_fresh_huge_page(void) | ||
68 | { | 103 | { |
69 | static int nid = 0; | 104 | static int nid = 0; |
70 | struct page *page; | 105 | struct page *page; |
71 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, | 106 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, |
72 | HUGETLB_PAGE_ORDER); | 107 | HUGETLB_PAGE_ORDER); |
73 | nid = (nid + 1) % num_online_nodes(); | 108 | nid = next_node(nid, node_online_map); |
109 | if (nid == MAX_NUMNODES) | ||
110 | nid = first_node(node_online_map); | ||
74 | if (page) { | 111 | if (page) { |
112 | page[1].lru.next = (void *)free_huge_page; /* dtor */ | ||
75 | spin_lock(&hugetlb_lock); | 113 | spin_lock(&hugetlb_lock); |
76 | nr_huge_pages++; | 114 | nr_huge_pages++; |
77 | nr_huge_pages_node[page_to_nid(page)]++; | 115 | nr_huge_pages_node[page_to_nid(page)]++; |
78 | spin_unlock(&hugetlb_lock); | 116 | spin_unlock(&hugetlb_lock); |
117 | put_page(page); /* free it into the hugepage allocator */ | ||
118 | return 1; | ||
79 | } | 119 | } |
80 | return page; | 120 | return 0; |
81 | } | 121 | } |
82 | 122 | ||
83 | void free_huge_page(struct page *page) | 123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
124 | unsigned long addr) | ||
84 | { | 125 | { |
85 | BUG_ON(page_count(page)); | 126 | struct inode *inode = vma->vm_file->f_dentry->d_inode; |
127 | struct page *page; | ||
128 | int use_reserve = 0; | ||
129 | unsigned long idx; | ||
86 | 130 | ||
87 | INIT_LIST_HEAD(&page->lru); | 131 | spin_lock(&hugetlb_lock); |
88 | page[1].lru.next = NULL; /* reset dtor */ | 132 | |
133 | if (vma->vm_flags & VM_MAYSHARE) { | ||
134 | |||
135 | /* idx = radix tree index, i.e. offset into file in | ||
136 | * HPAGE_SIZE units */ | ||
137 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
138 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
139 | |||
140 | /* The hugetlbfs specific inode info stores the number | ||
141 | * of "guaranteed available" (huge) pages. That is, | ||
142 | * the first 'prereserved_hpages' pages of the inode | ||
143 | * are either already instantiated, or have been | ||
144 | * pre-reserved (by hugetlb_extend_reservation()). Here | ||
145 | * we're in the process of instantiating the page, so | ||
146 | * we use this to determine whether to draw from the | ||
147 | * pre-reserved pool or the truly free pool. */ | ||
148 | if (idx < HUGETLBFS_I(inode)->prereserved_hpages) | ||
149 | use_reserve = 1; | ||
150 | } | ||
151 | |||
152 | if (!use_reserve) { | ||
153 | if (free_huge_pages <= reserved_huge_pages) | ||
154 | goto fail; | ||
155 | } else { | ||
156 | BUG_ON(reserved_huge_pages == 0); | ||
157 | reserved_huge_pages--; | ||
158 | } | ||
159 | |||
160 | page = dequeue_huge_page(vma, addr); | ||
161 | if (!page) | ||
162 | goto fail; | ||
163 | |||
164 | spin_unlock(&hugetlb_lock); | ||
165 | set_page_refcounted(page); | ||
166 | return page; | ||
167 | |||
168 | fail: | ||
169 | WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ | ||
170 | spin_unlock(&hugetlb_lock); | ||
171 | return NULL; | ||
172 | } | ||
173 | |||
174 | /* hugetlb_extend_reservation() | ||
175 | * | ||
176 | * Ensure that at least 'atleast' hugepages are, and will remain, | ||
177 | * available to instantiate the first 'atleast' pages of the given | ||
178 | * inode. If the inode doesn't already have this many pages reserved | ||
179 | * or instantiated, set aside some hugepages in the reserved pool to | ||
180 | * satisfy later faults (or fail now if there aren't enough, rather | ||
181 | * than getting the SIGBUS later). | ||
182 | */ | ||
183 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
184 | unsigned long atleast) | ||
185 | { | ||
186 | struct inode *inode = &info->vfs_inode; | ||
187 | unsigned long change_in_reserve = 0; | ||
188 | int ret = 0; | ||
89 | 189 | ||
90 | spin_lock(&hugetlb_lock); | 190 | spin_lock(&hugetlb_lock); |
91 | enqueue_huge_page(page); | 191 | read_lock_irq(&inode->i_mapping->tree_lock); |
192 | |||
193 | if (info->prereserved_hpages >= atleast) | ||
194 | goto out; | ||
195 | |||
196 | /* Because we always call this on shared mappings, none of the | ||
197 | * pages beyond info->prereserved_hpages can have been | ||
198 | * instantiated, so we need to reserve all of them now. */ | ||
199 | change_in_reserve = atleast - info->prereserved_hpages; | ||
200 | |||
201 | if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { | ||
202 | ret = -ENOMEM; | ||
203 | goto out; | ||
204 | } | ||
205 | |||
206 | reserved_huge_pages += change_in_reserve; | ||
207 | info->prereserved_hpages = atleast; | ||
208 | |||
209 | out: | ||
210 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
92 | spin_unlock(&hugetlb_lock); | 211 | spin_unlock(&hugetlb_lock); |
212 | |||
213 | return ret; | ||
93 | } | 214 | } |
94 | 215 | ||
95 | struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) | 216 | /* hugetlb_truncate_reservation() |
217 | * | ||
218 | * This returns pages reserved for the given inode to the general free | ||
219 | * hugepage pool. If the inode has any pages prereserved, but not | ||
220 | * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release | ||
221 | * them. | ||
222 | */ | ||
223 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
224 | unsigned long atmost) | ||
96 | { | 225 | { |
226 | struct inode *inode = &info->vfs_inode; | ||
227 | struct address_space *mapping = inode->i_mapping; | ||
228 | unsigned long idx; | ||
229 | unsigned long change_in_reserve = 0; | ||
97 | struct page *page; | 230 | struct page *page; |
98 | int i; | ||
99 | 231 | ||
100 | spin_lock(&hugetlb_lock); | 232 | spin_lock(&hugetlb_lock); |
101 | page = dequeue_huge_page(vma, addr); | 233 | read_lock_irq(&inode->i_mapping->tree_lock); |
102 | if (!page) { | 234 | |
103 | spin_unlock(&hugetlb_lock); | 235 | if (info->prereserved_hpages <= atmost) |
104 | return NULL; | 236 | goto out; |
237 | |||
238 | /* Count pages which were reserved, but not instantiated, and | ||
239 | * which we can now release. */ | ||
240 | for (idx = atmost; idx < info->prereserved_hpages; idx++) { | ||
241 | page = radix_tree_lookup(&mapping->page_tree, idx); | ||
242 | if (!page) | ||
243 | /* Pages which are already instantiated can't | ||
244 | * be unreserved (and in fact have already | ||
245 | * been removed from the reserved pool) */ | ||
246 | change_in_reserve++; | ||
105 | } | 247 | } |
248 | |||
249 | BUG_ON(reserved_huge_pages < change_in_reserve); | ||
250 | reserved_huge_pages -= change_in_reserve; | ||
251 | info->prereserved_hpages = atmost; | ||
252 | |||
253 | out: | ||
254 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
106 | spin_unlock(&hugetlb_lock); | 255 | spin_unlock(&hugetlb_lock); |
107 | set_page_count(page, 1); | ||
108 | page[1].lru.next = (void *)free_huge_page; /* set dtor */ | ||
109 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) | ||
110 | clear_user_highpage(&page[i], addr); | ||
111 | return page; | ||
112 | } | 256 | } |
113 | 257 | ||
114 | static int __init hugetlb_init(void) | 258 | static int __init hugetlb_init(void) |
115 | { | 259 | { |
116 | unsigned long i; | 260 | unsigned long i; |
117 | struct page *page; | ||
118 | 261 | ||
119 | if (HPAGE_SHIFT == 0) | 262 | if (HPAGE_SHIFT == 0) |
120 | return 0; | 263 | return 0; |
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void) | |||
123 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 266 | INIT_LIST_HEAD(&hugepage_freelists[i]); |
124 | 267 | ||
125 | for (i = 0; i < max_huge_pages; ++i) { | 268 | for (i = 0; i < max_huge_pages; ++i) { |
126 | page = alloc_fresh_huge_page(); | 269 | if (!alloc_fresh_huge_page()) |
127 | if (!page) | ||
128 | break; | 270 | break; |
129 | spin_lock(&hugetlb_lock); | ||
130 | enqueue_huge_page(page); | ||
131 | spin_unlock(&hugetlb_lock); | ||
132 | } | 271 | } |
133 | max_huge_pages = free_huge_pages = nr_huge_pages = i; | 272 | max_huge_pages = free_huge_pages = nr_huge_pages = i; |
134 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); | 273 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); |
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page) | |||
154 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 293 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | |
155 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 294 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | |
156 | 1 << PG_private | 1<< PG_writeback); | 295 | 1 << PG_private | 1<< PG_writeback); |
157 | set_page_count(&page[i], 0); | ||
158 | } | 296 | } |
159 | set_page_count(page, 1); | 297 | page[1].lru.next = NULL; |
298 | set_page_refcounted(page); | ||
160 | __free_pages(page, HUGETLB_PAGE_ORDER); | 299 | __free_pages(page, HUGETLB_PAGE_ORDER); |
161 | } | 300 | } |
162 | 301 | ||
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count) | |||
188 | static unsigned long set_max_huge_pages(unsigned long count) | 327 | static unsigned long set_max_huge_pages(unsigned long count) |
189 | { | 328 | { |
190 | while (count > nr_huge_pages) { | 329 | while (count > nr_huge_pages) { |
191 | struct page *page = alloc_fresh_huge_page(); | 330 | if (!alloc_fresh_huge_page()) |
192 | if (!page) | ||
193 | return nr_huge_pages; | 331 | return nr_huge_pages; |
194 | spin_lock(&hugetlb_lock); | ||
195 | enqueue_huge_page(page); | ||
196 | spin_unlock(&hugetlb_lock); | ||
197 | } | 332 | } |
198 | if (count >= nr_huge_pages) | 333 | if (count >= nr_huge_pages) |
199 | return nr_huge_pages; | 334 | return nr_huge_pages; |
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf) | |||
225 | return sprintf(buf, | 360 | return sprintf(buf, |
226 | "HugePages_Total: %5lu\n" | 361 | "HugePages_Total: %5lu\n" |
227 | "HugePages_Free: %5lu\n" | 362 | "HugePages_Free: %5lu\n" |
363 | "HugePages_Rsvd: %5lu\n" | ||
228 | "Hugepagesize: %5lu kB\n", | 364 | "Hugepagesize: %5lu kB\n", |
229 | nr_huge_pages, | 365 | nr_huge_pages, |
230 | free_huge_pages, | 366 | free_huge_pages, |
367 | reserved_huge_pages, | ||
231 | HPAGE_SIZE/1024); | 368 | HPAGE_SIZE/1024); |
232 | } | 369 | } |
233 | 370 | ||
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) | |||
240 | nid, free_huge_pages_node[nid]); | 377 | nid, free_huge_pages_node[nid]); |
241 | } | 378 | } |
242 | 379 | ||
243 | int is_hugepage_mem_enough(size_t size) | ||
244 | { | ||
245 | return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; | ||
246 | } | ||
247 | |||
248 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ | 380 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ |
249 | unsigned long hugetlb_total_pages(void) | 381 | unsigned long hugetlb_total_pages(void) |
250 | { | 382 | { |
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
374 | unsigned long address, pte_t *ptep, pte_t pte) | 506 | unsigned long address, pte_t *ptep, pte_t pte) |
375 | { | 507 | { |
376 | struct page *old_page, *new_page; | 508 | struct page *old_page, *new_page; |
377 | int i, avoidcopy; | 509 | int avoidcopy; |
378 | 510 | ||
379 | old_page = pte_page(pte); | 511 | old_page = pte_page(pte); |
380 | 512 | ||
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
395 | } | 527 | } |
396 | 528 | ||
397 | spin_unlock(&mm->page_table_lock); | 529 | spin_unlock(&mm->page_table_lock); |
398 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) | 530 | copy_huge_page(new_page, old_page, address); |
399 | copy_user_highpage(new_page + i, old_page + i, | ||
400 | address + i*PAGE_SIZE); | ||
401 | spin_lock(&mm->page_table_lock); | 531 | spin_lock(&mm->page_table_lock); |
402 | 532 | ||
403 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 533 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); |
@@ -442,6 +572,7 @@ retry: | |||
442 | ret = VM_FAULT_OOM; | 572 | ret = VM_FAULT_OOM; |
443 | goto out; | 573 | goto out; |
444 | } | 574 | } |
575 | clear_huge_page(page, address); | ||
445 | 576 | ||
446 | if (vma->vm_flags & VM_SHARED) { | 577 | if (vma->vm_flags & VM_SHARED) { |
447 | int err; | 578 | int err; |
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
496 | pte_t *ptep; | 627 | pte_t *ptep; |
497 | pte_t entry; | 628 | pte_t entry; |
498 | int ret; | 629 | int ret; |
630 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | ||
499 | 631 | ||
500 | ptep = huge_pte_alloc(mm, address); | 632 | ptep = huge_pte_alloc(mm, address); |
501 | if (!ptep) | 633 | if (!ptep) |
502 | return VM_FAULT_OOM; | 634 | return VM_FAULT_OOM; |
503 | 635 | ||
636 | /* | ||
637 | * Serialize hugepage allocation and instantiation, so that we don't | ||
638 | * get spurious allocation failures if two CPUs race to instantiate | ||
639 | * the same page in the page cache. | ||
640 | */ | ||
641 | mutex_lock(&hugetlb_instantiation_mutex); | ||
504 | entry = *ptep; | 642 | entry = *ptep; |
505 | if (pte_none(entry)) | 643 | if (pte_none(entry)) { |
506 | return hugetlb_no_page(mm, vma, address, ptep, write_access); | 644 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); |
645 | mutex_unlock(&hugetlb_instantiation_mutex); | ||
646 | return ret; | ||
647 | } | ||
507 | 648 | ||
508 | ret = VM_FAULT_MINOR; | 649 | ret = VM_FAULT_MINOR; |
509 | 650 | ||
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
513 | if (write_access && !pte_write(entry)) | 654 | if (write_access && !pte_write(entry)) |
514 | ret = hugetlb_cow(mm, vma, address, ptep, entry); | 655 | ret = hugetlb_cow(mm, vma, address, ptep, entry); |
515 | spin_unlock(&mm->page_table_lock); | 656 | spin_unlock(&mm->page_table_lock); |
657 | mutex_unlock(&hugetlb_instantiation_mutex); | ||
516 | 658 | ||
517 | return ret; | 659 | return ret; |
518 | } | 660 | } |
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
521 | struct page **pages, struct vm_area_struct **vmas, | 663 | struct page **pages, struct vm_area_struct **vmas, |
522 | unsigned long *position, int *length, int i) | 664 | unsigned long *position, int *length, int i) |
523 | { | 665 | { |
524 | unsigned long vpfn, vaddr = *position; | 666 | unsigned long pfn_offset; |
667 | unsigned long vaddr = *position; | ||
525 | int remainder = *length; | 668 | int remainder = *length; |
526 | 669 | ||
527 | vpfn = vaddr/PAGE_SIZE; | ||
528 | spin_lock(&mm->page_table_lock); | 670 | spin_lock(&mm->page_table_lock); |
529 | while (vaddr < vma->vm_end && remainder) { | 671 | while (vaddr < vma->vm_end && remainder) { |
530 | pte_t *pte; | 672 | pte_t *pte; |
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
552 | break; | 694 | break; |
553 | } | 695 | } |
554 | 696 | ||
555 | if (pages) { | 697 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; |
556 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | 698 | page = pte_page(*pte); |
557 | get_page(page); | 699 | same_page: |
558 | pages[i] = page; | 700 | get_page(page); |
559 | } | 701 | if (pages) |
702 | pages[i] = page + pfn_offset; | ||
560 | 703 | ||
561 | if (vmas) | 704 | if (vmas) |
562 | vmas[i] = vma; | 705 | vmas[i] = vma; |
563 | 706 | ||
564 | vaddr += PAGE_SIZE; | 707 | vaddr += PAGE_SIZE; |
565 | ++vpfn; | 708 | ++pfn_offset; |
566 | --remainder; | 709 | --remainder; |
567 | ++i; | 710 | ++i; |
711 | if (vaddr < vma->vm_end && remainder && | ||
712 | pfn_offset < HPAGE_SIZE/PAGE_SIZE) { | ||
713 | /* | ||
714 | * We use pfn_offset to avoid touching the pageframes | ||
715 | * of this compound page. | ||
716 | */ | ||
717 | goto same_page; | ||
718 | } | ||
568 | } | 719 | } |
569 | spin_unlock(&mm->page_table_lock); | 720 | spin_unlock(&mm->page_table_lock); |
570 | *length = remainder; | 721 | *length = remainder; |
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
572 | 723 | ||
573 | return i; | 724 | return i; |
574 | } | 725 | } |
726 | |||
727 | void hugetlb_change_protection(struct vm_area_struct *vma, | ||
728 | unsigned long address, unsigned long end, pgprot_t newprot) | ||
729 | { | ||
730 | struct mm_struct *mm = vma->vm_mm; | ||
731 | unsigned long start = address; | ||
732 | pte_t *ptep; | ||
733 | pte_t pte; | ||
734 | |||
735 | BUG_ON(address >= end); | ||
736 | flush_cache_range(vma, address, end); | ||
737 | |||
738 | spin_lock(&mm->page_table_lock); | ||
739 | for (; address < end; address += HPAGE_SIZE) { | ||
740 | ptep = huge_pte_offset(mm, address); | ||
741 | if (!ptep) | ||
742 | continue; | ||
743 | if (!pte_none(*ptep)) { | ||
744 | pte = huge_ptep_get_and_clear(mm, address, ptep); | ||
745 | pte = pte_mkhuge(pte_modify(pte, newprot)); | ||
746 | set_huge_pte_at(mm, address, ptep, pte); | ||
747 | lazy_mmu_prot_update(pte); | ||
748 | } | ||
749 | } | ||
750 | spin_unlock(&mm->page_table_lock); | ||
751 | |||
752 | flush_tlb_range(vma, start, end); | ||
753 | } | ||
754 | |||
diff --git a/mm/internal.h b/mm/internal.h index 17256bb2f4ef..d20e3cc4aef0 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -8,23 +8,33 @@ | |||
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | #ifndef __MM_INTERNAL_H | ||
12 | #define __MM_INTERNAL_H | ||
11 | 13 | ||
12 | static inline void set_page_refs(struct page *page, int order) | 14 | #include <linux/mm.h> |
15 | |||
16 | static inline void set_page_count(struct page *page, int v) | ||
17 | { | ||
18 | atomic_set(&page->_count, v); | ||
19 | } | ||
20 | |||
21 | /* | ||
22 | * Turn a non-refcounted page (->_count == 0) into refcounted with | ||
23 | * a count of one. | ||
24 | */ | ||
25 | static inline void set_page_refcounted(struct page *page) | ||
13 | { | 26 | { |
14 | #ifdef CONFIG_MMU | 27 | BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); |
28 | BUG_ON(atomic_read(&page->_count)); | ||
15 | set_page_count(page, 1); | 29 | set_page_count(page, 1); |
16 | #else | 30 | } |
17 | int i; | ||
18 | 31 | ||
19 | /* | 32 | static inline void __put_page(struct page *page) |
20 | * We need to reference all the pages for this order, otherwise if | 33 | { |
21 | * anyone accesses one of the pages with (get/put) it will be freed. | 34 | atomic_dec(&page->_count); |
22 | * - eg: access_process_vm() | ||
23 | */ | ||
24 | for (i = 0; i < (1 << order); i++) | ||
25 | set_page_count(page + i, 1); | ||
26 | #endif /* CONFIG_MMU */ | ||
27 | } | 35 | } |
28 | 36 | ||
29 | extern void fastcall __init __free_pages_bootmem(struct page *page, | 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, |
30 | unsigned int order); | 38 | unsigned int order); |
39 | |||
40 | #endif | ||
diff --git a/mm/memory.c b/mm/memory.c index 85e80a57db29..8d8f52569f32 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
277 | anon_vma_unlink(vma); | 277 | anon_vma_unlink(vma); |
278 | unlink_file_vma(vma); | 278 | unlink_file_vma(vma); |
279 | 279 | ||
280 | if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { | 280 | if (is_vm_hugetlb_page(vma)) { |
281 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, | 281 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, |
282 | floor, next? next->vm_start: ceiling); | 282 | floor, next? next->vm_start: ceiling); |
283 | } else { | 283 | } else { |
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
285 | * Optimization: gather nearby vmas into one call down | 285 | * Optimization: gather nearby vmas into one call down |
286 | */ | 286 | */ |
287 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE | 287 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE |
288 | && !is_hugepage_only_range(vma->vm_mm, next->vm_start, | 288 | && !is_vm_hugetlb_page(next)) { |
289 | HPAGE_SIZE)) { | ||
290 | vma = next; | 289 | vma = next; |
291 | next = vma->vm_next; | 290 | next = vma->vm_next; |
292 | anon_vma_unlink(vma); | 291 | anon_vma_unlink(vma); |
@@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ | |||
388 | { | 387 | { |
389 | unsigned long pfn = pte_pfn(pte); | 388 | unsigned long pfn = pte_pfn(pte); |
390 | 389 | ||
391 | if (vma->vm_flags & VM_PFNMAP) { | 390 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
392 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; | 391 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; |
393 | if (pfn == vma->vm_pgoff + off) | 392 | if (pfn == vma->vm_pgoff + off) |
394 | return NULL; | 393 | return NULL; |
@@ -401,8 +400,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ | |||
401 | * we should just do "return pfn_to_page(pfn)", but | 400 | * we should just do "return pfn_to_page(pfn)", but |
402 | * in the meantime we check that we get a valid pfn, | 401 | * in the meantime we check that we get a valid pfn, |
403 | * and that the resulting page looks ok. | 402 | * and that the resulting page looks ok. |
404 | * | ||
405 | * Remove this test eventually! | ||
406 | */ | 403 | */ |
407 | if (unlikely(!pfn_valid(pfn))) { | 404 | if (unlikely(!pfn_valid(pfn))) { |
408 | print_bad_pte(vma, pte, addr); | 405 | print_bad_pte(vma, pte, addr); |
@@ -1074,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1074 | } | 1071 | } |
1075 | if (pages) { | 1072 | if (pages) { |
1076 | pages[i] = page; | 1073 | pages[i] = page; |
1074 | |||
1075 | flush_anon_page(page, start); | ||
1077 | flush_dcache_page(page); | 1076 | flush_dcache_page(page); |
1078 | } | 1077 | } |
1079 | if (vmas) | 1078 | if (vmas) |
@@ -1221,9 +1220,7 @@ out: | |||
1221 | * The page has to be a nice clean _individual_ kernel allocation. | 1220 | * The page has to be a nice clean _individual_ kernel allocation. |
1222 | * If you allocate a compound page, you need to have marked it as | 1221 | * If you allocate a compound page, you need to have marked it as |
1223 | * such (__GFP_COMP), or manually just split the page up yourself | 1222 | * such (__GFP_COMP), or manually just split the page up yourself |
1224 | * (which is mainly an issue of doing "set_page_count(page, 1)" for | 1223 | * (see split_page()). |
1225 | * each sub-page, and then freeing them one by one when you free | ||
1226 | * them rather than freeing it as a compound page). | ||
1227 | * | 1224 | * |
1228 | * NOTE! Traditionally this was done with "remap_pfn_range()" which | 1225 | * NOTE! Traditionally this was done with "remap_pfn_range()" which |
1229 | * took an arbitrary page protection parameter. This doesn't allow | 1226 | * took an arbitrary page protection parameter. This doesn't allow |
@@ -2357,10 +2354,8 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
2357 | if (!vma) | 2354 | if (!vma) |
2358 | return -1; | 2355 | return -1; |
2359 | write = (vma->vm_flags & VM_WRITE) != 0; | 2356 | write = (vma->vm_flags & VM_WRITE) != 0; |
2360 | if (addr >= end) | 2357 | BUG_ON(addr >= end); |
2361 | BUG(); | 2358 | BUG_ON(end > vma->vm_end); |
2362 | if (end > vma->vm_end) | ||
2363 | BUG(); | ||
2364 | len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; | 2359 | len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; |
2365 | ret = get_user_pages(current, current->mm, addr, | 2360 | ret = get_user_pages(current, current->mm, addr, |
2366 | len, write, 0, NULL, NULL); | 2361 | len, write, 0, NULL, NULL); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b21869a39f0b..dec8249e972d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -86,6 +86,7 @@ | |||
86 | #include <linux/swap.h> | 86 | #include <linux/swap.h> |
87 | #include <linux/seq_file.h> | 87 | #include <linux/seq_file.h> |
88 | #include <linux/proc_fs.h> | 88 | #include <linux/proc_fs.h> |
89 | #include <linux/migrate.h> | ||
89 | 90 | ||
90 | #include <asm/tlbflush.h> | 91 | #include <asm/tlbflush.h> |
91 | #include <asm/uaccess.h> | 92 | #include <asm/uaccess.h> |
@@ -95,11 +96,8 @@ | |||
95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | 96 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ |
96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | 97 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ |
97 | 98 | ||
98 | /* The number of pages to migrate per call to migrate_pages() */ | 99 | static struct kmem_cache *policy_cache; |
99 | #define MIGRATE_CHUNK_SIZE 256 | 100 | static struct kmem_cache *sn_cache; |
100 | |||
101 | static kmem_cache_t *policy_cache; | ||
102 | static kmem_cache_t *sn_cache; | ||
103 | 101 | ||
104 | #define PDprintk(fmt...) | 102 | #define PDprintk(fmt...) |
105 | 103 | ||
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
331 | struct vm_area_struct *first, *vma, *prev; | 329 | struct vm_area_struct *first, *vma, *prev; |
332 | 330 | ||
333 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 331 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { |
334 | /* Must have swap device for migration */ | ||
335 | if (nr_swap_pages <= 0) | ||
336 | return ERR_PTR(-ENODEV); | ||
337 | 332 | ||
338 | /* | 333 | err = migrate_prep(); |
339 | * Clear the LRU lists so pages can be isolated. | 334 | if (err) |
340 | * Note that pages may be moved off the LRU after we have | 335 | return ERR_PTR(err); |
341 | * drained them. Those pages will fail to migrate like other | ||
342 | * pages that may be busy. | ||
343 | */ | ||
344 | lru_add_drain_all(); | ||
345 | } | 336 | } |
346 | 337 | ||
347 | first = find_vma(mm, start); | 338 | first = find_vma(mm, start); |
@@ -431,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes) | |||
431 | return mpol_check_policy(mode, nodes); | 422 | return mpol_check_policy(mode, nodes); |
432 | } | 423 | } |
433 | 424 | ||
425 | |||
426 | /* | ||
427 | * Update task->flags PF_MEMPOLICY bit: set iff non-default | ||
428 | * mempolicy. Allows more rapid checking of this (combined perhaps | ||
429 | * with other PF_* flag bits) on memory allocation hot code paths. | ||
430 | * | ||
431 | * If called from outside this file, the task 'p' should -only- be | ||
432 | * a newly forked child not yet visible on the task list, because | ||
433 | * manipulating the task flags of a visible task is not safe. | ||
434 | * | ||
435 | * The above limitation is why this routine has the funny name | ||
436 | * mpol_fix_fork_child_flag(). | ||
437 | * | ||
438 | * It is also safe to call this with a task pointer of current, | ||
439 | * which the static wrapper mpol_set_task_struct_flag() does, | ||
440 | * for use within this file. | ||
441 | */ | ||
442 | |||
443 | void mpol_fix_fork_child_flag(struct task_struct *p) | ||
444 | { | ||
445 | if (p->mempolicy) | ||
446 | p->flags |= PF_MEMPOLICY; | ||
447 | else | ||
448 | p->flags &= ~PF_MEMPOLICY; | ||
449 | } | ||
450 | |||
451 | static void mpol_set_task_struct_flag(void) | ||
452 | { | ||
453 | mpol_fix_fork_child_flag(current); | ||
454 | } | ||
455 | |||
434 | /* Set the process memory policy */ | 456 | /* Set the process memory policy */ |
435 | long do_set_mempolicy(int mode, nodemask_t *nodes) | 457 | long do_set_mempolicy(int mode, nodemask_t *nodes) |
436 | { | 458 | { |
@@ -443,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) | |||
443 | return PTR_ERR(new); | 465 | return PTR_ERR(new); |
444 | mpol_free(current->mempolicy); | 466 | mpol_free(current->mempolicy); |
445 | current->mempolicy = new; | 467 | current->mempolicy = new; |
468 | mpol_set_task_struct_flag(); | ||
446 | if (new && new->policy == MPOL_INTERLEAVE) | 469 | if (new && new->policy == MPOL_INTERLEAVE) |
447 | current->il_next = first_node(new->v.nodes); | 470 | current->il_next = first_node(new->v.nodes); |
448 | return 0; | 471 | return 0; |
@@ -550,92 +573,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
550 | return err; | 573 | return err; |
551 | } | 574 | } |
552 | 575 | ||
576 | #ifdef CONFIG_MIGRATION | ||
553 | /* | 577 | /* |
554 | * page migration | 578 | * page migration |
555 | */ | 579 | */ |
556 | |||
557 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 580 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
558 | unsigned long flags) | 581 | unsigned long flags) |
559 | { | 582 | { |
560 | /* | 583 | /* |
561 | * Avoid migrating a page that is shared with others. | 584 | * Avoid migrating a page that is shared with others. |
562 | */ | 585 | */ |
563 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 586 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) |
564 | if (isolate_lru_page(page)) | 587 | isolate_lru_page(page, pagelist); |
565 | list_add_tail(&page->lru, pagelist); | ||
566 | } | ||
567 | } | ||
568 | |||
569 | /* | ||
570 | * Migrate the list 'pagelist' of pages to a certain destination. | ||
571 | * | ||
572 | * Specify destination with either non-NULL vma or dest_node >= 0 | ||
573 | * Return the number of pages not migrated or error code | ||
574 | */ | ||
575 | static int migrate_pages_to(struct list_head *pagelist, | ||
576 | struct vm_area_struct *vma, int dest) | ||
577 | { | ||
578 | LIST_HEAD(newlist); | ||
579 | LIST_HEAD(moved); | ||
580 | LIST_HEAD(failed); | ||
581 | int err = 0; | ||
582 | unsigned long offset = 0; | ||
583 | int nr_pages; | ||
584 | struct page *page; | ||
585 | struct list_head *p; | ||
586 | |||
587 | redo: | ||
588 | nr_pages = 0; | ||
589 | list_for_each(p, pagelist) { | ||
590 | if (vma) { | ||
591 | /* | ||
592 | * The address passed to alloc_page_vma is used to | ||
593 | * generate the proper interleave behavior. We fake | ||
594 | * the address here by an increasing offset in order | ||
595 | * to get the proper distribution of pages. | ||
596 | * | ||
597 | * No decision has been made as to which page | ||
598 | * a certain old page is moved to so we cannot | ||
599 | * specify the correct address. | ||
600 | */ | ||
601 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
602 | offset + vma->vm_start); | ||
603 | offset += PAGE_SIZE; | ||
604 | } | ||
605 | else | ||
606 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
607 | |||
608 | if (!page) { | ||
609 | err = -ENOMEM; | ||
610 | goto out; | ||
611 | } | ||
612 | list_add_tail(&page->lru, &newlist); | ||
613 | nr_pages++; | ||
614 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
615 | break; | ||
616 | } | ||
617 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | ||
618 | |||
619 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | ||
620 | |||
621 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
622 | goto redo; | ||
623 | out: | ||
624 | /* Return leftover allocated pages */ | ||
625 | while (!list_empty(&newlist)) { | ||
626 | page = list_entry(newlist.next, struct page, lru); | ||
627 | list_del(&page->lru); | ||
628 | __free_page(page); | ||
629 | } | ||
630 | list_splice(&failed, pagelist); | ||
631 | if (err < 0) | ||
632 | return err; | ||
633 | |||
634 | /* Calculate number of leftover pages */ | ||
635 | nr_pages = 0; | ||
636 | list_for_each(p, pagelist) | ||
637 | nr_pages++; | ||
638 | return nr_pages; | ||
639 | } | 588 | } |
640 | 589 | ||
641 | /* | 590 | /* |
@@ -742,8 +691,23 @@ int do_migrate_pages(struct mm_struct *mm, | |||
742 | if (err < 0) | 691 | if (err < 0) |
743 | return err; | 692 | return err; |
744 | return busy; | 693 | return busy; |
694 | |||
745 | } | 695 | } |
746 | 696 | ||
697 | #else | ||
698 | |||
699 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | ||
700 | unsigned long flags) | ||
701 | { | ||
702 | } | ||
703 | |||
704 | int do_migrate_pages(struct mm_struct *mm, | ||
705 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | ||
706 | { | ||
707 | return -ENOSYS; | ||
708 | } | ||
709 | #endif | ||
710 | |||
747 | long do_mbind(unsigned long start, unsigned long len, | 711 | long do_mbind(unsigned long start, unsigned long len, |
748 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | 712 | unsigned long mode, nodemask_t *nmask, unsigned long flags) |
749 | { | 713 | { |
@@ -808,6 +772,7 @@ long do_mbind(unsigned long start, unsigned long len, | |||
808 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 772 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
809 | err = -EIO; | 773 | err = -EIO; |
810 | } | 774 | } |
775 | |||
811 | if (!list_empty(&pagelist)) | 776 | if (!list_empty(&pagelist)) |
812 | putback_lru_pages(&pagelist); | 777 | putback_lru_pages(&pagelist); |
813 | 778 | ||
@@ -947,7 +912,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
947 | /* | 912 | /* |
948 | * Check if this process has the right to modify the specified | 913 | * Check if this process has the right to modify the specified |
949 | * process. The right exists if the process has administrative | 914 | * process. The right exists if the process has administrative |
950 | * capabilities, superuser priviledges or the same | 915 | * capabilities, superuser privileges or the same |
951 | * userid as the target process. | 916 | * userid as the target process. |
952 | */ | 917 | */ |
953 | if ((current->euid != task->suid) && (current->euid != task->uid) && | 918 | if ((current->euid != task->suid) && (current->euid != task->uid) && |
diff --git a/mm/mempool.c b/mm/mempool.c index 1a99b80480d3..fe6e05289cc5 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -183,8 +183,8 @@ EXPORT_SYMBOL(mempool_resize); | |||
183 | */ | 183 | */ |
184 | void mempool_destroy(mempool_t *pool) | 184 | void mempool_destroy(mempool_t *pool) |
185 | { | 185 | { |
186 | if (pool->curr_nr != pool->min_nr) | 186 | /* Check for outstanding elements */ |
187 | BUG(); /* There were outstanding elements */ | 187 | BUG_ON(pool->curr_nr != pool->min_nr); |
188 | free_pool(pool); | 188 | free_pool(pool); |
189 | } | 189 | } |
190 | EXPORT_SYMBOL(mempool_destroy); | 190 | EXPORT_SYMBOL(mempool_destroy); |
@@ -278,14 +278,56 @@ EXPORT_SYMBOL(mempool_free); | |||
278 | */ | 278 | */ |
279 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) | 279 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) |
280 | { | 280 | { |
281 | kmem_cache_t *mem = (kmem_cache_t *) pool_data; | 281 | struct kmem_cache *mem = pool_data; |
282 | return kmem_cache_alloc(mem, gfp_mask); | 282 | return kmem_cache_alloc(mem, gfp_mask); |
283 | } | 283 | } |
284 | EXPORT_SYMBOL(mempool_alloc_slab); | 284 | EXPORT_SYMBOL(mempool_alloc_slab); |
285 | 285 | ||
286 | void mempool_free_slab(void *element, void *pool_data) | 286 | void mempool_free_slab(void *element, void *pool_data) |
287 | { | 287 | { |
288 | kmem_cache_t *mem = (kmem_cache_t *) pool_data; | 288 | struct kmem_cache *mem = pool_data; |
289 | kmem_cache_free(mem, element); | 289 | kmem_cache_free(mem, element); |
290 | } | 290 | } |
291 | EXPORT_SYMBOL(mempool_free_slab); | 291 | EXPORT_SYMBOL(mempool_free_slab); |
292 | |||
293 | /* | ||
294 | * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory | ||
295 | * specified by pool_data | ||
296 | */ | ||
297 | void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) | ||
298 | { | ||
299 | size_t size = (size_t)(long)pool_data; | ||
300 | return kmalloc(size, gfp_mask); | ||
301 | } | ||
302 | EXPORT_SYMBOL(mempool_kmalloc); | ||
303 | |||
304 | void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) | ||
305 | { | ||
306 | size_t size = (size_t) pool_data; | ||
307 | return kzalloc(size, gfp_mask); | ||
308 | } | ||
309 | EXPORT_SYMBOL(mempool_kzalloc); | ||
310 | |||
311 | void mempool_kfree(void *element, void *pool_data) | ||
312 | { | ||
313 | kfree(element); | ||
314 | } | ||
315 | EXPORT_SYMBOL(mempool_kfree); | ||
316 | |||
317 | /* | ||
318 | * A simple mempool-backed page allocator that allocates pages | ||
319 | * of the order specified by pool_data. | ||
320 | */ | ||
321 | void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) | ||
322 | { | ||
323 | int order = (int)(long)pool_data; | ||
324 | return alloc_pages(gfp_mask, order); | ||
325 | } | ||
326 | EXPORT_SYMBOL(mempool_alloc_pages); | ||
327 | |||
328 | void mempool_free_pages(void *element, void *pool_data) | ||
329 | { | ||
330 | int order = (int)(long)pool_data; | ||
331 | __free_pages(element, order); | ||
332 | } | ||
333 | EXPORT_SYMBOL(mempool_free_pages); | ||
diff --git a/mm/migrate.c b/mm/migrate.c new file mode 100644 index 000000000000..09f6e4aa87fc --- /dev/null +++ b/mm/migrate.c | |||
@@ -0,0 +1,655 @@ | |||
1 | /* | ||
2 | * Memory Migration functionality - linux/mm/migrate.c | |
3 | * | ||
4 | * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter | ||
5 | * | ||
6 | * Page migration was first developed in the context of the memory hotplug | ||
7 | * project. The main authors of the migration code are: | ||
8 | * | ||
9 | * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> | ||
10 | * Hirokazu Takahashi <taka@valinux.co.jp> | ||
11 | * Dave Hansen <haveblue@us.ibm.com> | ||
12 | * Christoph Lameter <clameter@sgi.com> | ||
13 | */ | ||
14 | |||
15 | #include <linux/migrate.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/swap.h> | ||
18 | #include <linux/pagemap.h> | ||
19 | #include <linux/buffer_head.h> /* for try_to_release_page(), | ||
20 | buffer_heads_over_limit */ | ||
21 | #include <linux/mm_inline.h> | ||
22 | #include <linux/pagevec.h> | ||
23 | #include <linux/rmap.h> | ||
24 | #include <linux/topology.h> | ||
25 | #include <linux/cpu.h> | ||
26 | #include <linux/cpuset.h> | ||
27 | #include <linux/swapops.h> | ||
28 | |||
29 | #include "internal.h" | ||
30 | |||
31 | #include "internal.h" | ||
32 | |||
33 | /* The maximum number of pages to take off the LRU for migration */ | ||
34 | #define MIGRATE_CHUNK_SIZE 256 | ||
35 | |||
36 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | ||
37 | |||
38 | /* | ||
39 | * Isolate one page from the LRU lists. If successful put it onto | ||
40 | * the indicated list with elevated page count. | ||
41 | * | ||
42 | * Result: | ||
43 | * -EBUSY: page not on LRU list | ||
44 | * 0: page removed from LRU list and added to the specified list. | ||
45 | */ | ||
46 | int isolate_lru_page(struct page *page, struct list_head *pagelist) | ||
47 | { | ||
48 | int ret = -EBUSY; | ||
49 | |||
50 | if (PageLRU(page)) { | ||
51 | struct zone *zone = page_zone(page); | ||
52 | |||
53 | spin_lock_irq(&zone->lru_lock); | ||
54 | if (PageLRU(page)) { | ||
55 | ret = 0; | ||
56 | get_page(page); | ||
57 | ClearPageLRU(page); | ||
58 | if (PageActive(page)) | ||
59 | del_page_from_active_list(zone, page); | ||
60 | else | ||
61 | del_page_from_inactive_list(zone, page); | ||
62 | list_add_tail(&page->lru, pagelist); | ||
63 | } | ||
64 | spin_unlock_irq(&zone->lru_lock); | ||
65 | } | ||
66 | return ret; | ||
67 | } | ||
68 | |||
69 | /* | ||
70 | * migrate_prep() needs to be called before we start compiling the list of pages | |
71 | * to be migrated using isolate_lru_page(), and therefore also before any calls | |
72 | * to migrate_pages(): it drains the LRU so that the pages can be isolated. | |
73 | */ | ||
74 | int migrate_prep(void) | ||
75 | { | ||
76 | /* Must have swap device for migration */ | ||
77 | if (nr_swap_pages <= 0) | ||
78 | return -ENODEV; | ||
79 | |||
80 | /* | ||
81 | * Clear the LRU lists so pages can be isolated. | ||
82 | * Note that pages may be moved off the LRU after we have | ||
83 | * drained them. Those pages will fail to migrate like other | ||
84 | * pages that may be busy. | ||
85 | */ | ||
86 | lru_add_drain_all(); | ||
87 | |||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | static inline void move_to_lru(struct page *page) | ||
92 | { | ||
93 | list_del(&page->lru); | ||
94 | if (PageActive(page)) { | ||
95 | /* | ||
96 | * lru_cache_add_active checks that | ||
97 | * the PG_active bit is off. | ||
98 | */ | ||
99 | ClearPageActive(page); | ||
100 | lru_cache_add_active(page); | ||
101 | } else { | ||
102 | lru_cache_add(page); | ||
103 | } | ||
104 | put_page(page); | ||
105 | } | ||
106 | |||
107 | /* | ||
108 | * Add isolated pages on the list back to the LRU. | ||
109 | * | ||
110 | * returns the number of pages put back. | ||
111 | */ | ||
112 | int putback_lru_pages(struct list_head *l) | ||
113 | { | ||
114 | struct page *page; | ||
115 | struct page *page2; | ||
116 | int count = 0; | ||
117 | |||
118 | list_for_each_entry_safe(page, page2, l, lru) { | ||
119 | move_to_lru(page); | ||
120 | count++; | ||
121 | } | ||
122 | return count; | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * Non migratable page | ||
127 | */ | ||
128 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
129 | { | ||
130 | return -EIO; | ||
131 | } | ||
132 | EXPORT_SYMBOL(fail_migrate_page); | ||
133 | |||
134 | /* | ||
135 | * swapout a single page | ||
136 | * page is locked upon entry, unlocked on exit | ||
137 | */ | ||
138 | static int swap_page(struct page *page) | ||
139 | { | ||
140 | struct address_space *mapping = page_mapping(page); | ||
141 | |||
142 | if (page_mapped(page) && mapping) | ||
143 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | ||
144 | goto unlock_retry; | ||
145 | |||
146 | if (PageDirty(page)) { | ||
147 | /* Page is dirty, try to write it out here */ | ||
148 | switch(pageout(page, mapping)) { | ||
149 | case PAGE_KEEP: | ||
150 | case PAGE_ACTIVATE: | ||
151 | goto unlock_retry; | ||
152 | |||
153 | case PAGE_SUCCESS: | ||
154 | goto retry; | ||
155 | |||
156 | case PAGE_CLEAN: | ||
157 | ; /* try to free the page below */ | ||
158 | } | ||
159 | } | ||
160 | |||
161 | if (PagePrivate(page)) { | ||
162 | if (!try_to_release_page(page, GFP_KERNEL) || | ||
163 | (!mapping && page_count(page) == 1)) | ||
164 | goto unlock_retry; | ||
165 | } | ||
166 | |||
167 | if (remove_mapping(mapping, page)) { | ||
168 | /* Success */ | ||
169 | unlock_page(page); | ||
170 | return 0; | ||
171 | } | ||
172 | |||
173 | unlock_retry: | ||
174 | unlock_page(page); | ||
175 | |||
176 | retry: | ||
177 | return -EAGAIN; | ||
178 | } | ||
179 | EXPORT_SYMBOL(swap_page); | ||
180 | |||
181 | /* | ||
182 | * Remove references for a page and establish the new page with the correct | ||
183 | * basic settings to be able to stop accesses to the page. | ||
184 | */ | ||
185 | int migrate_page_remove_references(struct page *newpage, | ||
186 | struct page *page, int nr_refs) | ||
187 | { | ||
188 | struct address_space *mapping = page_mapping(page); | ||
189 | struct page **radix_pointer; | ||
190 | |||
191 | /* | ||
192 | * Avoid doing any of the following work if the page count | ||
193 | * indicates that the page is in use or truncate has removed | ||
194 | * the page. | ||
195 | */ | ||
196 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
197 | return -EAGAIN; | ||
198 | |||
199 | /* | ||
200 | * Establish swap ptes for anonymous pages or destroy pte | ||
201 | * maps for files. | ||
202 | * | ||
203 | * In order to reestablish file backed mappings the fault handlers | ||
204 | * will take the radix tree_lock which may then be used to stop | ||
205 | * processes from accessing this page until the new page is ready. | |
206 | * | ||
207 | * A process accessing via a swap pte (an anonymous page) will take a | ||
208 | * page_lock on the old page which will block the process until the | ||
209 | * migration attempt is complete. At that time the PageSwapCache bit | ||
210 | * will be examined. If the page was migrated then the PageSwapCache | ||
211 | * bit will be clear and the operation to retrieve the page will be | ||
212 | * retried which will find the new page in the radix tree. Then a new | ||
213 | * direct mapping may be generated based on the radix tree contents. | ||
214 | * | ||
215 | * If the page was not migrated then the PageSwapCache bit | ||
216 | * is still set and the operation may continue. | ||
217 | */ | ||
218 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
219 | /* A vma has VM_LOCKED set -> permanent failure */ | ||
220 | return -EPERM; | ||
221 | |||
222 | /* | ||
223 | * Give up if we were unable to remove all mappings. | ||
224 | */ | ||
225 | if (page_mapcount(page)) | ||
226 | return -EAGAIN; | ||
227 | |||
228 | write_lock_irq(&mapping->tree_lock); | ||
229 | |||
230 | radix_pointer = (struct page **)radix_tree_lookup_slot( | ||
231 | &mapping->page_tree, | ||
232 | page_index(page)); | ||
233 | |||
234 | if (!page_mapping(page) || page_count(page) != nr_refs || | ||
235 | *radix_pointer != page) { | ||
236 | write_unlock_irq(&mapping->tree_lock); | ||
237 | return 1; | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * Now we know that no one else is looking at the page. | ||
242 | * | ||
243 | * Certain minimal information about a page must be available | ||
244 | * in order for other subsystems to properly handle the page if they | ||
245 | * find it through the radix tree update before we are finished | ||
246 | * copying the page. | ||
247 | */ | ||
248 | get_page(newpage); | ||
249 | newpage->index = page->index; | ||
250 | newpage->mapping = page->mapping; | ||
251 | if (PageSwapCache(page)) { | ||
252 | SetPageSwapCache(newpage); | ||
253 | set_page_private(newpage, page_private(page)); | ||
254 | } | ||
255 | |||
256 | *radix_pointer = newpage; | ||
257 | __put_page(page); | ||
258 | write_unlock_irq(&mapping->tree_lock); | ||
259 | |||
260 | return 0; | ||
261 | } | ||
262 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
263 | |||
264 | /* | ||
265 | * Copy the page to its new location | ||
266 | */ | ||
267 | void migrate_page_copy(struct page *newpage, struct page *page) | ||
268 | { | ||
269 | copy_highpage(newpage, page); | ||
270 | |||
271 | if (PageError(page)) | ||
272 | SetPageError(newpage); | ||
273 | if (PageReferenced(page)) | ||
274 | SetPageReferenced(newpage); | ||
275 | if (PageUptodate(page)) | ||
276 | SetPageUptodate(newpage); | ||
277 | if (PageActive(page)) | ||
278 | SetPageActive(newpage); | ||
279 | if (PageChecked(page)) | ||
280 | SetPageChecked(newpage); | ||
281 | if (PageMappedToDisk(page)) | ||
282 | SetPageMappedToDisk(newpage); | ||
283 | |||
284 | if (PageDirty(page)) { | ||
285 | clear_page_dirty_for_io(page); | ||
286 | set_page_dirty(newpage); | ||
287 | } | ||
288 | |||
289 | ClearPageSwapCache(page); | ||
290 | ClearPageActive(page); | ||
291 | ClearPagePrivate(page); | ||
292 | set_page_private(page, 0); | ||
293 | page->mapping = NULL; | ||
294 | |||
295 | /* | ||
296 | * If any waiters have accumulated on the new page then | ||
297 | * wake them up. | ||
298 | */ | ||
299 | if (PageWriteback(newpage)) | ||
300 | end_page_writeback(newpage); | ||
301 | } | ||
302 | EXPORT_SYMBOL(migrate_page_copy); | ||
303 | |||
304 | /* | ||
305 | * Common logic to directly migrate a single page suitable for | ||
306 | * pages that do not use PagePrivate. | ||
307 | * | ||
308 | * Pages are locked upon entry and exit. | ||
309 | */ | ||
310 | int migrate_page(struct page *newpage, struct page *page) | ||
311 | { | ||
312 | int rc; | ||
313 | |||
314 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | ||
315 | |||
316 | rc = migrate_page_remove_references(newpage, page, 2); | ||
317 | |||
318 | if (rc) | ||
319 | return rc; | ||
320 | |||
321 | migrate_page_copy(newpage, page); | ||
322 | |||
323 | /* | ||
324 | * Remove auxiliary swap entries and replace | ||
325 | * them with real ptes. | ||
326 | * | ||
327 | * Note that a real pte entry will allow processes that are not | ||
328 | * waiting on the page lock to use the new page via the page tables | ||
329 | * before the new page is unlocked. | ||
330 | */ | ||
331 | remove_from_swap(newpage); | ||
332 | return 0; | ||
333 | } | ||
334 | EXPORT_SYMBOL(migrate_page); | ||
335 | |||
336 | /* | ||
337 | * migrate_pages | ||
338 | * | ||
339 | * Two lists are passed to this function. The first list | ||
340 | * contains the pages isolated from the LRU to be migrated. | ||
341 | * The second list contains new pages that the pages isolated | ||
342 | * can be moved to. If the second list is NULL then all | ||
343 | * pages are swapped out. | ||
344 | * | ||
345 | * The function returns after 10 attempts or if no pages | ||
346 | * are movable anymore because "to" has become empty | |
347 | * or no retryable pages exist anymore. | ||
348 | * | ||
349 | * Return: Number of pages not migrated when "to" ran empty. | ||
350 | */ | ||
351 | int migrate_pages(struct list_head *from, struct list_head *to, | ||
352 | struct list_head *moved, struct list_head *failed) | ||
353 | { | ||
354 | int retry; | ||
355 | int nr_failed = 0; | ||
356 | int pass = 0; | ||
357 | struct page *page; | ||
358 | struct page *page2; | ||
359 | int swapwrite = current->flags & PF_SWAPWRITE; | ||
360 | int rc; | ||
361 | |||
362 | if (!swapwrite) | ||
363 | current->flags |= PF_SWAPWRITE; | ||
364 | |||
365 | redo: | ||
366 | retry = 0; | ||
367 | |||
368 | list_for_each_entry_safe(page, page2, from, lru) { | ||
369 | struct page *newpage = NULL; | ||
370 | struct address_space *mapping; | ||
371 | |||
372 | cond_resched(); | ||
373 | |||
374 | rc = 0; | ||
375 | if (page_count(page) == 1) | ||
376 | /* page was freed from under us. So we are done. */ | ||
377 | goto next; | ||
378 | |||
379 | if (to && list_empty(to)) | ||
380 | break; | ||
381 | |||
382 | /* | ||
383 | * Skip locked pages during the first two passes to give the | ||
384 | * functions holding the lock time to release the page. Later we | ||
385 | * use lock_page() to have a higher chance of acquiring the | ||
386 | * lock. | ||
387 | */ | ||
388 | rc = -EAGAIN; | ||
389 | if (pass > 2) | ||
390 | lock_page(page); | ||
391 | else | ||
392 | if (TestSetPageLocked(page)) | ||
393 | goto next; | ||
394 | |||
395 | /* | ||
396 | * Only wait on writeback if we have already done a pass where | ||
397 | * we may have triggered writeouts for lots of pages. | |
398 | */ | ||
399 | if (pass > 0) { | ||
400 | wait_on_page_writeback(page); | ||
401 | } else { | ||
402 | if (PageWriteback(page)) | ||
403 | goto unlock_page; | ||
404 | } | ||
405 | |||
406 | /* | ||
407 | * Anonymous pages must have swap cache references otherwise | ||
408 | * the information contained in the page maps cannot be | ||
409 | * preserved. | ||
410 | */ | ||
411 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
412 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
413 | rc = -ENOMEM; | ||
414 | goto unlock_page; | ||
415 | } | ||
416 | } | ||
417 | |||
418 | if (!to) { | ||
419 | rc = swap_page(page); | ||
420 | goto next; | ||
421 | } | ||
422 | |||
423 | newpage = lru_to_page(to); | ||
424 | lock_page(newpage); | ||
425 | |||
426 | /* | ||
427 | * Pages are properly locked and writeback is complete. | ||
428 | * Try to migrate the page. | ||
429 | */ | ||
430 | mapping = page_mapping(page); | ||
431 | if (!mapping) | ||
432 | goto unlock_both; | ||
433 | |||
434 | if (mapping->a_ops->migratepage) { | ||
435 | /* | ||
436 | * Most pages have a mapping and most filesystems | ||
437 | * should provide a migration function. Anonymous | ||
438 | * pages are part of swap space which also has its | ||
439 | * own migration function. This is the most common | ||
440 | * path for page migration. | ||
441 | */ | ||
442 | rc = mapping->a_ops->migratepage(newpage, page); | ||
443 | goto unlock_both; | ||
444 | } | ||
445 | |||
446 | /* | ||
447 | * Default handling if a filesystem does not provide | ||
448 | * a migration function. We can only migrate clean | ||
449 | * pages so try to write out any dirty pages first. | ||
450 | */ | ||
451 | if (PageDirty(page)) { | ||
452 | switch (pageout(page, mapping)) { | ||
453 | case PAGE_KEEP: | ||
454 | case PAGE_ACTIVATE: | ||
455 | goto unlock_both; | ||
456 | |||
457 | case PAGE_SUCCESS: | ||
458 | unlock_page(newpage); | ||
459 | goto next; | ||
460 | |||
461 | case PAGE_CLEAN: | ||
462 | ; /* try to migrate the page below */ | ||
463 | } | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * Buffers are managed in a filesystem specific way. | ||
468 | * We must have no buffers or drop them. | ||
469 | */ | ||
470 | if (!page_has_buffers(page) || | ||
471 | try_to_release_page(page, GFP_KERNEL)) { | ||
472 | rc = migrate_page(newpage, page); | ||
473 | goto unlock_both; | ||
474 | } | ||
475 | |||
476 | /* | ||
477 | * On early passes with mapped pages simply | ||
478 | * retry. There may be a lock held for some | ||
479 | * buffers that may go away. Later | ||
480 | * swap them out. | ||
481 | */ | ||
482 | if (pass > 4) { | ||
483 | /* | ||
484 | * Persistently unable to drop buffers..... As a | ||
485 | * measure of last resort we fall back to | ||
486 | * swap_page(). | ||
487 | */ | ||
488 | unlock_page(newpage); | ||
489 | newpage = NULL; | ||
490 | rc = swap_page(page); | ||
491 | goto next; | ||
492 | } | ||
493 | |||
494 | unlock_both: | ||
495 | unlock_page(newpage); | ||
496 | |||
497 | unlock_page: | ||
498 | unlock_page(page); | ||
499 | |||
500 | next: | ||
501 | if (rc == -EAGAIN) { | ||
502 | retry++; | ||
503 | } else if (rc) { | ||
504 | /* Permanent failure */ | ||
505 | list_move(&page->lru, failed); | ||
506 | nr_failed++; | ||
507 | } else { | ||
508 | if (newpage) { | ||
509 | /* Successful migration. Return page to LRU */ | ||
510 | move_to_lru(newpage); | ||
511 | } | ||
512 | list_move(&page->lru, moved); | ||
513 | } | ||
514 | } | ||
515 | if (retry && pass++ < 10) | ||
516 | goto redo; | ||
517 | |||
518 | if (!swapwrite) | ||
519 | current->flags &= ~PF_SWAPWRITE; | ||
520 | |||
521 | return nr_failed + retry; | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * Migration function for pages with buffers. This function can only be used | ||
526 | * if the underlying filesystem guarantees that no other references to "page" | ||
527 | * exist. | ||
528 | */ | ||
529 | int buffer_migrate_page(struct page *newpage, struct page *page) | ||
530 | { | ||
531 | struct address_space *mapping = page->mapping; | ||
532 | struct buffer_head *bh, *head; | ||
533 | int rc; | ||
534 | |||
535 | if (!mapping) | ||
536 | return -EAGAIN; | ||
537 | |||
538 | if (!page_has_buffers(page)) | ||
539 | return migrate_page(newpage, page); | ||
540 | |||
541 | head = page_buffers(page); | ||
542 | |||
543 | rc = migrate_page_remove_references(newpage, page, 3); | ||
544 | |||
545 | if (rc) | ||
546 | return rc; | ||
547 | |||
548 | bh = head; | ||
549 | do { | ||
550 | get_bh(bh); | ||
551 | lock_buffer(bh); | ||
552 | bh = bh->b_this_page; | ||
553 | |||
554 | } while (bh != head); | ||
555 | |||
556 | ClearPagePrivate(page); | ||
557 | set_page_private(newpage, page_private(page)); | ||
558 | set_page_private(page, 0); | ||
559 | put_page(page); | ||
560 | get_page(newpage); | ||
561 | |||
562 | bh = head; | ||
563 | do { | ||
564 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
565 | bh = bh->b_this_page; | ||
566 | |||
567 | } while (bh != head); | ||
568 | |||
569 | SetPagePrivate(newpage); | ||
570 | |||
571 | migrate_page_copy(newpage, page); | ||
572 | |||
573 | bh = head; | ||
574 | do { | ||
575 | unlock_buffer(bh); | ||
576 | put_bh(bh); | ||
577 | bh = bh->b_this_page; | ||
578 | |||
579 | } while (bh != head); | ||
580 | |||
581 | return 0; | ||
582 | } | ||
583 | EXPORT_SYMBOL(buffer_migrate_page); | ||
584 | |||
585 | /* | ||
586 | * Migrate the list 'pagelist' of pages to a certain destination. | ||
587 | * | ||
588 | * Specify destination with either non-NULL vma or dest >= 0 | |
589 | * Return the number of pages not migrated or error code | ||
590 | */ | ||
591 | int migrate_pages_to(struct list_head *pagelist, | ||
592 | struct vm_area_struct *vma, int dest) | ||
593 | { | ||
594 | LIST_HEAD(newlist); | ||
595 | LIST_HEAD(moved); | ||
596 | LIST_HEAD(failed); | ||
597 | int err = 0; | ||
598 | unsigned long offset = 0; | ||
599 | int nr_pages; | ||
600 | struct page *page; | ||
601 | struct list_head *p; | ||
602 | |||
603 | redo: | ||
604 | nr_pages = 0; | ||
605 | list_for_each(p, pagelist) { | ||
606 | if (vma) { | ||
607 | /* | ||
608 | * The address passed to alloc_page_vma is used to | ||
609 | * generate the proper interleave behavior. We fake | ||
610 | * the address here by an increasing offset in order | ||
611 | * to get the proper distribution of pages. | ||
612 | * | ||
613 | * No decision has been made as to which page | ||
614 | * a certain old page is moved to so we cannot | ||
615 | * specify the correct address. | ||
616 | */ | ||
617 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
618 | offset + vma->vm_start); | ||
619 | offset += PAGE_SIZE; | ||
620 | } | ||
621 | else | ||
622 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
623 | |||
624 | if (!page) { | ||
625 | err = -ENOMEM; | ||
626 | goto out; | ||
627 | } | ||
628 | list_add_tail(&page->lru, &newlist); | ||
629 | nr_pages++; | ||
630 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
631 | break; | ||
632 | } | ||
633 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | ||
634 | |||
635 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | ||
636 | |||
637 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
638 | goto redo; | ||
639 | out: | ||
640 | /* Return leftover allocated pages */ | ||
641 | while (!list_empty(&newlist)) { | ||
642 | page = list_entry(newlist.next, struct page, lru); | ||
643 | list_del(&page->lru); | ||
644 | __free_page(page); | ||
645 | } | ||
646 | list_splice(&failed, pagelist); | ||
647 | if (err < 0) | ||
648 | return err; | ||
649 | |||
650 | /* Calculate number of leftover pages */ | ||
651 | nr_pages = 0; | ||
652 | list_for_each(p, pagelist) | ||
653 | nr_pages++; | ||
654 | return nr_pages; | ||
655 | } | ||
@@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
612 | * If the vma has a ->close operation then the driver probably needs to release | 612 | * If the vma has a ->close operation then the driver probably needs to release |
613 | * per-vma resources, so we don't attempt to merge those. | 613 | * per-vma resources, so we don't attempt to merge those. |
614 | */ | 614 | */ |
615 | #define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) | 615 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) |
616 | 616 | ||
617 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 617 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
618 | struct file *file, unsigned long vm_flags) | 618 | struct file *file, unsigned long vm_flags) |
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
845 | const unsigned long stack_flags | 845 | const unsigned long stack_flags |
846 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 846 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
847 | 847 | ||
848 | #ifdef CONFIG_HUGETLB | ||
849 | if (flags & VM_HUGETLB) { | ||
850 | if (!(flags & VM_DONTCOPY)) | ||
851 | mm->shared_vm += pages; | ||
852 | return; | ||
853 | } | ||
854 | #endif /* CONFIG_HUGETLB */ | ||
855 | |||
856 | if (file) { | 848 | if (file) { |
857 | mm->shared_vm += pages; | 849 | mm->shared_vm += pages; |
858 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 850 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
@@ -1048,12 +1040,11 @@ munmap_back: | |||
1048 | * specific mapper. the address has already been validated, but | 1040 | * specific mapper. the address has already been validated, but |
1049 | * not unmapped, but the maps are removed from the list. | 1041 | * not unmapped, but the maps are removed from the list. |
1050 | */ | 1042 | */ |
1051 | vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 1043 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
1052 | if (!vma) { | 1044 | if (!vma) { |
1053 | error = -ENOMEM; | 1045 | error = -ENOMEM; |
1054 | goto unacct_error; | 1046 | goto unacct_error; |
1055 | } | 1047 | } |
1056 | memset(vma, 0, sizeof(*vma)); | ||
1057 | 1048 | ||
1058 | vma->vm_mm = mm; | 1049 | vma->vm_mm = mm; |
1059 | vma->vm_start = addr; | 1050 | vma->vm_start = addr; |
@@ -1904,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1904 | /* | 1895 | /* |
1905 | * create a vma struct for an anonymous mapping | 1896 | * create a vma struct for an anonymous mapping |
1906 | */ | 1897 | */ |
1907 | vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 1898 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
1908 | if (!vma) { | 1899 | if (!vma) { |
1909 | vm_unacct_memory(len >> PAGE_SHIFT); | 1900 | vm_unacct_memory(len >> PAGE_SHIFT); |
1910 | return -ENOMEM; | 1901 | return -ENOMEM; |
1911 | } | 1902 | } |
1912 | memset(vma, 0, sizeof(*vma)); | ||
1913 | 1903 | ||
1914 | vma->vm_mm = mm; | 1904 | vma->vm_mm = mm; |
1915 | vma->vm_start = addr; | 1905 | vma->vm_start = addr; |
diff --git a/mm/mmzone.c b/mm/mmzone.c new file mode 100644 index 000000000000..b022370e612e --- /dev/null +++ b/mm/mmzone.c | |||
@@ -0,0 +1,50 @@ | |||
1 | /* | ||
2 | * linux/mm/mmzone.c | ||
3 | * | ||
4 | * management code for pgdats and zones. | |
5 | */ | ||
6 | |||
7 | |||
8 | #include <linux/config.h> | ||
9 | #include <linux/stddef.h> | ||
10 | #include <linux/mmzone.h> | ||
11 | #include <linux/module.h> | ||
12 | |||
13 | struct pglist_data *first_online_pgdat(void) | ||
14 | { | ||
15 | return NODE_DATA(first_online_node); | ||
16 | } | ||
17 | |||
18 | EXPORT_SYMBOL(first_online_pgdat); | ||
19 | |||
20 | struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) | ||
21 | { | ||
22 | int nid = next_online_node(pgdat->node_id); | ||
23 | |||
24 | if (nid == MAX_NUMNODES) | ||
25 | return NULL; | ||
26 | return NODE_DATA(nid); | ||
27 | } | ||
28 | EXPORT_SYMBOL(next_online_pgdat); | ||
29 | |||
30 | |||
31 | /* | ||
32 | * next_zone - helper magic for for_each_zone() | ||
33 | */ | ||
34 | struct zone *next_zone(struct zone *zone) | ||
35 | { | ||
36 | pg_data_t *pgdat = zone->zone_pgdat; | ||
37 | |||
38 | if (zone < pgdat->node_zones + MAX_NR_ZONES - 1) | ||
39 | zone++; | ||
40 | else { | ||
41 | pgdat = next_online_pgdat(pgdat); | ||
42 | if (pgdat) | ||
43 | zone = pgdat->node_zones; | ||
44 | else | ||
45 | zone = NULL; | ||
46 | } | ||
47 | return zone; | ||
48 | } | ||
49 | EXPORT_SYMBOL(next_zone); | ||
50 | |||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b8571c1ed..4c14d4289b61 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
124 | * a MAP_NORESERVE private mapping to writable will now reserve. | 124 | * a MAP_NORESERVE private mapping to writable will now reserve. |
125 | */ | 125 | */ |
126 | if (newflags & VM_WRITE) { | 126 | if (newflags & VM_WRITE) { |
127 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { | 127 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { |
128 | charged = nrpages; | 128 | charged = nrpages; |
129 | if (security_vm_enough_memory(charged)) | 129 | if (security_vm_enough_memory(charged)) |
130 | return -ENOMEM; | 130 | return -ENOMEM; |
@@ -166,7 +166,10 @@ success: | |||
166 | */ | 166 | */ |
167 | vma->vm_flags = newflags; | 167 | vma->vm_flags = newflags; |
168 | vma->vm_page_prot = newprot; | 168 | vma->vm_page_prot = newprot; |
169 | change_protection(vma, start, end, newprot); | 169 | if (is_vm_hugetlb_page(vma)) |
170 | hugetlb_change_protection(vma, start, end, newprot); | ||
171 | else | ||
172 | change_protection(vma, start, end, newprot); | ||
170 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 173 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
171 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 174 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
172 | return 0; | 175 | return 0; |
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) | |||
240 | 243 | ||
241 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 244 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
242 | 245 | ||
243 | if (is_vm_hugetlb_page(vma)) { | ||
244 | error = -EACCES; | ||
245 | goto out; | ||
246 | } | ||
247 | |||
248 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | 246 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); |
249 | 247 | ||
250 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ | 248 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ |
diff --git a/mm/msync.c b/mm/msync.c index 3563a56e1a51..bc6c95376366 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -9,20 +9,24 @@ | |||
9 | */ | 9 | */ |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/fs.h> | ||
12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
13 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
14 | #include <linux/hugetlb.h> | 15 | #include <linux/hugetlb.h> |
16 | #include <linux/writeback.h> | ||
17 | #include <linux/file.h> | ||
15 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
16 | 19 | ||
17 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
18 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
19 | 22 | ||
20 | static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 23 | static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
21 | unsigned long addr, unsigned long end) | 24 | unsigned long addr, unsigned long end) |
22 | { | 25 | { |
23 | pte_t *pte; | 26 | pte_t *pte; |
24 | spinlock_t *ptl; | 27 | spinlock_t *ptl; |
25 | int progress = 0; | 28 | int progress = 0; |
29 | unsigned long ret = 0; | ||
26 | 30 | ||
27 | again: | 31 | again: |
28 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 32 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
@@ -43,58 +47,64 @@ again: | |||
43 | if (!page) | 47 | if (!page) |
44 | continue; | 48 | continue; |
45 | if (ptep_clear_flush_dirty(vma, addr, pte) || | 49 | if (ptep_clear_flush_dirty(vma, addr, pte) || |
46 | page_test_and_clear_dirty(page)) | 50 | page_test_and_clear_dirty(page)) |
47 | set_page_dirty(page); | 51 | ret += set_page_dirty(page); |
48 | progress += 3; | 52 | progress += 3; |
49 | } while (pte++, addr += PAGE_SIZE, addr != end); | 53 | } while (pte++, addr += PAGE_SIZE, addr != end); |
50 | pte_unmap_unlock(pte - 1, ptl); | 54 | pte_unmap_unlock(pte - 1, ptl); |
51 | cond_resched(); | 55 | cond_resched(); |
52 | if (addr != end) | 56 | if (addr != end) |
53 | goto again; | 57 | goto again; |
58 | return ret; | ||
54 | } | 59 | } |
55 | 60 | ||
56 | static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 61 | static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, |
57 | unsigned long addr, unsigned long end) | 62 | pud_t *pud, unsigned long addr, unsigned long end) |
58 | { | 63 | { |
59 | pmd_t *pmd; | 64 | pmd_t *pmd; |
60 | unsigned long next; | 65 | unsigned long next; |
66 | unsigned long ret = 0; | ||
61 | 67 | ||
62 | pmd = pmd_offset(pud, addr); | 68 | pmd = pmd_offset(pud, addr); |
63 | do { | 69 | do { |
64 | next = pmd_addr_end(addr, end); | 70 | next = pmd_addr_end(addr, end); |
65 | if (pmd_none_or_clear_bad(pmd)) | 71 | if (pmd_none_or_clear_bad(pmd)) |
66 | continue; | 72 | continue; |
67 | msync_pte_range(vma, pmd, addr, next); | 73 | ret += msync_pte_range(vma, pmd, addr, next); |
68 | } while (pmd++, addr = next, addr != end); | 74 | } while (pmd++, addr = next, addr != end); |
75 | return ret; | ||
69 | } | 76 | } |
70 | 77 | ||
71 | static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 78 | static inline unsigned long msync_pud_range(struct vm_area_struct *vma, |
72 | unsigned long addr, unsigned long end) | 79 | pgd_t *pgd, unsigned long addr, unsigned long end) |
73 | { | 80 | { |
74 | pud_t *pud; | 81 | pud_t *pud; |
75 | unsigned long next; | 82 | unsigned long next; |
83 | unsigned long ret = 0; | ||
76 | 84 | ||
77 | pud = pud_offset(pgd, addr); | 85 | pud = pud_offset(pgd, addr); |
78 | do { | 86 | do { |
79 | next = pud_addr_end(addr, end); | 87 | next = pud_addr_end(addr, end); |
80 | if (pud_none_or_clear_bad(pud)) | 88 | if (pud_none_or_clear_bad(pud)) |
81 | continue; | 89 | continue; |
82 | msync_pmd_range(vma, pud, addr, next); | 90 | ret += msync_pmd_range(vma, pud, addr, next); |
83 | } while (pud++, addr = next, addr != end); | 91 | } while (pud++, addr = next, addr != end); |
92 | return ret; | ||
84 | } | 93 | } |
85 | 94 | ||
86 | static void msync_page_range(struct vm_area_struct *vma, | 95 | static unsigned long msync_page_range(struct vm_area_struct *vma, |
87 | unsigned long addr, unsigned long end) | 96 | unsigned long addr, unsigned long end) |
88 | { | 97 | { |
89 | pgd_t *pgd; | 98 | pgd_t *pgd; |
90 | unsigned long next; | 99 | unsigned long next; |
100 | unsigned long ret = 0; | ||
91 | 101 | ||
92 | /* For hugepages we can't go walking the page table normally, | 102 | /* For hugepages we can't go walking the page table normally, |
93 | * but that's ok, hugetlbfs is memory based, so we don't need | 103 | * but that's ok, hugetlbfs is memory based, so we don't need |
94 | * to do anything more on an msync(). | 104 | * to do anything more on an msync(). |
95 | */ | 105 | */ |
96 | if (vma->vm_flags & VM_HUGETLB) | 106 | if (vma->vm_flags & VM_HUGETLB) |
97 | return; | 107 | return 0; |
98 | 108 | ||
99 | BUG_ON(addr >= end); | 109 | BUG_ON(addr >= end); |
100 | pgd = pgd_offset(vma->vm_mm, addr); | 110 | pgd = pgd_offset(vma->vm_mm, addr); |
@@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma, | |||
103 | next = pgd_addr_end(addr, end); | 113 | next = pgd_addr_end(addr, end); |
104 | if (pgd_none_or_clear_bad(pgd)) | 114 | if (pgd_none_or_clear_bad(pgd)) |
105 | continue; | 115 | continue; |
106 | msync_pud_range(vma, pgd, addr, next); | 116 | ret += msync_pud_range(vma, pgd, addr, next); |
107 | } while (pgd++, addr = next, addr != end); | 117 | } while (pgd++, addr = next, addr != end); |
118 | return ret; | ||
108 | } | 119 | } |
109 | 120 | ||
110 | /* | 121 | /* |
@@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma, | |||
115 | * write out the dirty pages and wait on the writeout and check the result. | 126 | * write out the dirty pages and wait on the writeout and check the result. |
116 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start | 127 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start |
117 | * async writeout immediately. | 128 | * async writeout immediately. |
118 | * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to | 129 | * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to |
119 | * applications. | 130 | * applications. |
120 | */ | 131 | */ |
121 | static int msync_interval(struct vm_area_struct *vma, | 132 | static int msync_interval(struct vm_area_struct *vma, unsigned long addr, |
122 | unsigned long addr, unsigned long end, int flags) | 133 | unsigned long end, int flags, |
134 | unsigned long *nr_pages_dirtied) | ||
123 | { | 135 | { |
124 | int ret = 0; | ||
125 | struct file *file = vma->vm_file; | 136 | struct file *file = vma->vm_file; |
126 | 137 | ||
127 | if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) | 138 | if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) |
128 | return -EBUSY; | 139 | return -EBUSY; |
129 | 140 | ||
130 | if (file && (vma->vm_flags & VM_SHARED)) { | 141 | if (file && (vma->vm_flags & VM_SHARED)) |
131 | msync_page_range(vma, addr, end); | 142 | *nr_pages_dirtied = msync_page_range(vma, addr, end); |
132 | 143 | return 0; | |
133 | if (flags & MS_SYNC) { | ||
134 | struct address_space *mapping = file->f_mapping; | ||
135 | int err; | ||
136 | |||
137 | ret = filemap_fdatawrite(mapping); | ||
138 | if (file->f_op && file->f_op->fsync) { | ||
139 | /* | ||
140 | * We don't take i_mutex here because mmap_sem | ||
141 | * is already held. | ||
142 | */ | ||
143 | err = file->f_op->fsync(file,file->f_dentry,1); | ||
144 | if (err && !ret) | ||
145 | ret = err; | ||
146 | } | ||
147 | err = filemap_fdatawait(mapping); | ||
148 | if (!ret) | ||
149 | ret = err; | ||
150 | } | ||
151 | } | ||
152 | return ret; | ||
153 | } | 144 | } |
154 | 145 | ||
155 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | 146 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) |
156 | { | 147 | { |
157 | unsigned long end; | 148 | unsigned long end; |
158 | struct vm_area_struct *vma; | 149 | struct vm_area_struct *vma; |
159 | int unmapped_error, error = -EINVAL; | 150 | int unmapped_error = 0; |
160 | 151 | int error = -EINVAL; | |
161 | if (flags & MS_SYNC) | 152 | int done = 0; |
162 | current->flags |= PF_SYNCWRITE; | ||
163 | 153 | ||
164 | down_read(&current->mm->mmap_sem); | ||
165 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) | 154 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) |
166 | goto out; | 155 | goto out; |
167 | if (start & ~PAGE_MASK) | 156 | if (start & ~PAGE_MASK) |
@@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
180 | * If the interval [start,end) covers some unmapped address ranges, | 169 | * If the interval [start,end) covers some unmapped address ranges, |
181 | * just ignore them, but return -ENOMEM at the end. | 170 | * just ignore them, but return -ENOMEM at the end. |
182 | */ | 171 | */ |
172 | down_read(&current->mm->mmap_sem); | ||
173 | if (flags & MS_SYNC) | ||
174 | current->flags |= PF_SYNCWRITE; | ||
183 | vma = find_vma(current->mm, start); | 175 | vma = find_vma(current->mm, start); |
184 | unmapped_error = 0; | 176 | if (!vma) { |
185 | for (;;) { | ||
186 | /* Still start < end. */ | ||
187 | error = -ENOMEM; | 177 | error = -ENOMEM; |
188 | if (!vma) | 178 | goto out_unlock; |
189 | goto out; | 179 | } |
180 | do { | ||
181 | unsigned long nr_pages_dirtied = 0; | ||
182 | struct file *file; | ||
183 | |||
190 | /* Here start < vma->vm_end. */ | 184 | /* Here start < vma->vm_end. */ |
191 | if (start < vma->vm_start) { | 185 | if (start < vma->vm_start) { |
192 | unmapped_error = -ENOMEM; | 186 | unmapped_error = -ENOMEM; |
@@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
195 | /* Here vma->vm_start <= start < vma->vm_end. */ | 189 | /* Here vma->vm_start <= start < vma->vm_end. */ |
196 | if (end <= vma->vm_end) { | 190 | if (end <= vma->vm_end) { |
197 | if (start < end) { | 191 | if (start < end) { |
198 | error = msync_interval(vma, start, end, flags); | 192 | error = msync_interval(vma, start, end, flags, |
193 | &nr_pages_dirtied); | ||
199 | if (error) | 194 | if (error) |
200 | goto out; | 195 | goto out_unlock; |
201 | } | 196 | } |
202 | error = unmapped_error; | 197 | error = unmapped_error; |
203 | goto out; | 198 | done = 1; |
199 | } else { | ||
200 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | ||
201 | error = msync_interval(vma, start, vma->vm_end, flags, | ||
202 | &nr_pages_dirtied); | ||
203 | if (error) | ||
204 | goto out_unlock; | ||
204 | } | 205 | } |
205 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | 206 | file = vma->vm_file; |
206 | error = msync_interval(vma, start, vma->vm_end, flags); | ||
207 | if (error) | ||
208 | goto out; | ||
209 | start = vma->vm_end; | 207 | start = vma->vm_end; |
210 | vma = vma->vm_next; | 208 | if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { |
211 | } | 209 | get_file(file); |
212 | out: | 210 | up_read(&current->mm->mmap_sem); |
213 | up_read(&current->mm->mmap_sem); | 211 | balance_dirty_pages_ratelimited_nr(file->f_mapping, |
212 | nr_pages_dirtied); | ||
213 | fput(file); | ||
214 | down_read(&current->mm->mmap_sem); | ||
215 | vma = find_vma(current->mm, start); | ||
216 | } else if ((flags & MS_SYNC) && file && | ||
217 | (vma->vm_flags & VM_SHARED)) { | ||
218 | get_file(file); | ||
219 | up_read(&current->mm->mmap_sem); | ||
220 | error = do_fsync(file, 0); | ||
221 | fput(file); | ||
222 | down_read(&current->mm->mmap_sem); | ||
223 | if (error) | ||
224 | goto out_unlock; | ||
225 | vma = find_vma(current->mm, start); | ||
226 | } else { | ||
227 | vma = vma->vm_next; | ||
228 | } | ||
229 | } while (vma && !done); | ||
230 | out_unlock: | ||
214 | current->flags &= ~PF_SYNCWRITE; | 231 | current->flags &= ~PF_SYNCWRITE; |
232 | up_read(&current->mm->mmap_sem); | ||
233 | out: | ||
215 | return error; | 234 | return error; |
216 | } | 235 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index 4951f4786f28..db45efac17cc 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | |||
159 | /* | 159 | /* |
160 | * kmalloc doesn't like __GFP_HIGHMEM for some reason | 160 | * kmalloc doesn't like __GFP_HIGHMEM for some reason |
161 | */ | 161 | */ |
162 | return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); | 162 | return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); |
163 | } | 163 | } |
164 | 164 | ||
165 | struct page * vmalloc_to_page(void *addr) | 165 | struct page * vmalloc_to_page(void *addr) |
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | |||
623 | * - note that this may not return a page-aligned address if the object | 623 | * - note that this may not return a page-aligned address if the object |
624 | * we're allocating is smaller than a page | 624 | * we're allocating is smaller than a page |
625 | */ | 625 | */ |
626 | base = kmalloc(len, GFP_KERNEL); | 626 | base = kmalloc(len, GFP_KERNEL|__GFP_COMP); |
627 | if (!base) | 627 | if (!base) |
628 | goto enomem; | 628 | goto enomem; |
629 | 629 | ||
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 945559fb63d2..893d7677579e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40; | |||
75 | * The interval between `kupdate'-style writebacks, in centiseconds | 75 | * The interval between `kupdate'-style writebacks, in centiseconds |
76 | * (hundredths of a second) | 76 | * (hundredths of a second) |
77 | */ | 77 | */ |
78 | int dirty_writeback_centisecs = 5 * 100; | 78 | int dirty_writeback_interval = 5 * HZ; |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * The longest number of centiseconds for which data is allowed to remain dirty | 81 | * The longest number of centiseconds for which data is allowed to remain dirty |
82 | */ | 82 | */ |
83 | int dirty_expire_centisecs = 30 * 100; | 83 | int dirty_expire_interval = 30 * HZ; |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * Flag that makes the machine dump writes/reads and block dirtyings. | 86 | * Flag that makes the machine dump writes/reads and block dirtyings. |
@@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100; | |||
88 | int block_dump; | 88 | int block_dump; |
89 | 89 | ||
90 | /* | 90 | /* |
91 | * Flag that puts the machine in "laptop mode". | 91 | * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: |
92 | * a full sync is triggered after this time elapses without any disk activity. | ||
92 | */ | 93 | */ |
93 | int laptop_mode; | 94 | int laptop_mode; |
94 | 95 | ||
@@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
255 | } | 256 | } |
256 | 257 | ||
257 | /** | 258 | /** |
258 | * balance_dirty_pages_ratelimited - balance dirty memory state | 259 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
259 | * @mapping: address_space which was dirtied | 260 | * @mapping: address_space which was dirtied |
261 | * @nr_pages: number of pages which the caller has just dirtied | ||
260 | * | 262 | * |
261 | * Processes which are dirtying memory should call in here once for each page | 263 | * Processes which are dirtying memory should call in here once for each page |
262 | * which was newly dirtied. The function will periodically check the system's | 264 | * which was newly dirtied. The function will periodically check the system's |
@@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
267 | * limit we decrease the ratelimiting by a lot, to prevent individual processes | 269 | * limit we decrease the ratelimiting by a lot, to prevent individual processes |
268 | * from overshooting the limit by (ratelimit_pages) each. | 270 | * from overshooting the limit by (ratelimit_pages) each. |
269 | */ | 271 | */ |
270 | void balance_dirty_pages_ratelimited(struct address_space *mapping) | 272 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
273 | unsigned long nr_pages_dirtied) | ||
271 | { | 274 | { |
272 | static DEFINE_PER_CPU(int, ratelimits) = 0; | 275 | static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; |
273 | long ratelimit; | 276 | unsigned long ratelimit; |
277 | unsigned long *p; | ||
274 | 278 | ||
275 | ratelimit = ratelimit_pages; | 279 | ratelimit = ratelimit_pages; |
276 | if (dirty_exceeded) | 280 | if (dirty_exceeded) |
@@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
280 | * Check the rate limiting. Also, we do not want to throttle real-time | 284 | * Check the rate limiting. Also, we do not want to throttle real-time |
281 | * tasks in balance_dirty_pages(). Period. | 285 | * tasks in balance_dirty_pages(). Period. |
282 | */ | 286 | */ |
283 | if (get_cpu_var(ratelimits)++ >= ratelimit) { | 287 | preempt_disable(); |
284 | __get_cpu_var(ratelimits) = 0; | 288 | p = &__get_cpu_var(ratelimits); |
285 | put_cpu_var(ratelimits); | 289 | *p += nr_pages_dirtied; |
290 | if (unlikely(*p >= ratelimit)) { | ||
291 | *p = 0; | ||
292 | preempt_enable(); | ||
286 | balance_dirty_pages(mapping); | 293 | balance_dirty_pages(mapping); |
287 | return; | 294 | return; |
288 | } | 295 | } |
289 | put_cpu_var(ratelimits); | 296 | preempt_enable(); |
290 | } | 297 | } |
291 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited); | 298 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); |
292 | 299 | ||
293 | void throttle_vm_writeout(void) | 300 | void throttle_vm_writeout(void) |
294 | { | 301 | { |
@@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | |||
380 | * just walks the superblock inode list, writing back any inodes which are | 387 | * just walks the superblock inode list, writing back any inodes which are |
381 | * older than a specific point in time. | 388 | * older than a specific point in time. |
382 | * | 389 | * |
383 | * Try to run once per dirty_writeback_centisecs. But if a writeback event | 390 | * Try to run once per dirty_writeback_interval. But if a writeback event |
384 | * takes longer than a dirty_writeback_centisecs interval, then leave a | 391 | * takes longer than a dirty_writeback_interval interval, then leave a |
385 | * one-second gap. | 392 | * one-second gap. |
386 | * | 393 | * |
387 | * older_than_this takes precedence over nr_to_write. So we'll only write back | 394 | * older_than_this takes precedence over nr_to_write. So we'll only write back |
@@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg) | |||
406 | sync_supers(); | 413 | sync_supers(); |
407 | 414 | ||
408 | get_writeback_state(&wbs); | 415 | get_writeback_state(&wbs); |
409 | oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; | 416 | oldest_jif = jiffies - dirty_expire_interval; |
410 | start_jif = jiffies; | 417 | start_jif = jiffies; |
411 | next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; | 418 | next_jif = start_jif + dirty_writeback_interval; |
412 | nr_to_write = wbs.nr_dirty + wbs.nr_unstable + | 419 | nr_to_write = wbs.nr_dirty + wbs.nr_unstable + |
413 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 420 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
414 | while (nr_to_write > 0) { | 421 | while (nr_to_write > 0) { |
@@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg) | |||
425 | } | 432 | } |
426 | if (time_before(next_jif, jiffies + HZ)) | 433 | if (time_before(next_jif, jiffies + HZ)) |
427 | next_jif = jiffies + HZ; | 434 | next_jif = jiffies + HZ; |
428 | if (dirty_writeback_centisecs) | 435 | if (dirty_writeback_interval) |
429 | mod_timer(&wb_timer, next_jif); | 436 | mod_timer(&wb_timer, next_jif); |
430 | } | 437 | } |
431 | 438 | ||
@@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg) | |||
435 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | 442 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, |
436 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 443 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
437 | { | 444 | { |
438 | proc_dointvec(table, write, file, buffer, length, ppos); | 445 | proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); |
439 | if (dirty_writeback_centisecs) { | 446 | if (dirty_writeback_interval) { |
440 | mod_timer(&wb_timer, | 447 | mod_timer(&wb_timer, |
441 | jiffies + (dirty_writeback_centisecs * HZ) / 100); | 448 | jiffies + dirty_writeback_interval); |
442 | } else { | 449 | } else { |
443 | del_timer(&wb_timer); | 450 | del_timer(&wb_timer); |
444 | } | 451 | } |
445 | return 0; | 452 | return 0; |
@@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused) | |||
468 | */ | 475 | */ |
469 | void laptop_io_completion(void) | 476 | void laptop_io_completion(void) |
470 | { | 477 | { |
471 | mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); | 478 | mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); |
472 | } | 479 | } |
473 | 480 | ||
474 | /* | 481 | /* |
@@ -544,7 +551,7 @@ void __init page_writeback_init(void) | |||
544 | if (vm_dirty_ratio <= 0) | 551 | if (vm_dirty_ratio <= 0) |
545 | vm_dirty_ratio = 1; | 552 | vm_dirty_ratio = 1; |
546 | } | 553 | } |
547 | mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); | 554 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); |
548 | set_ratelimit(); | 555 | set_ratelimit(); |
549 | register_cpu_notifier(&ratelimit_nb); | 556 | register_cpu_notifier(&ratelimit_nb); |
550 | } | 557 | } |
@@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page); | |||
621 | */ | 628 | */ |
622 | int __set_page_dirty_nobuffers(struct page *page) | 629 | int __set_page_dirty_nobuffers(struct page *page) |
623 | { | 630 | { |
624 | int ret = 0; | ||
625 | |||
626 | if (!TestSetPageDirty(page)) { | 631 | if (!TestSetPageDirty(page)) { |
627 | struct address_space *mapping = page_mapping(page); | 632 | struct address_space *mapping = page_mapping(page); |
628 | struct address_space *mapping2; | 633 | struct address_space *mapping2; |
@@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
644 | I_DIRTY_PAGES); | 649 | I_DIRTY_PAGES); |
645 | } | 650 | } |
646 | } | 651 | } |
652 | return 1; | ||
647 | } | 653 | } |
648 | return ret; | 654 | return 0; |
649 | } | 655 | } |
650 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); | 656 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); |
651 | 657 | ||
@@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page) | |||
675 | return (*spd)(page); | 681 | return (*spd)(page); |
676 | return __set_page_dirty_buffers(page); | 682 | return __set_page_dirty_buffers(page); |
677 | } | 683 | } |
678 | if (!PageDirty(page)) | 684 | if (!PageDirty(page)) { |
679 | SetPageDirty(page); | 685 | if (!TestSetPageDirty(page)) |
686 | return 1; | ||
687 | } | ||
680 | return 0; | 688 | return 0; |
681 | } | 689 | } |
682 | EXPORT_SYMBOL(set_page_dirty); | 690 | EXPORT_SYMBOL(set_page_dirty); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 234bd4895d14..dc523a1f270d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -49,13 +49,11 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | |||
49 | EXPORT_SYMBOL(node_online_map); | 49 | EXPORT_SYMBOL(node_online_map); |
50 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 50 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; |
51 | EXPORT_SYMBOL(node_possible_map); | 51 | EXPORT_SYMBOL(node_possible_map); |
52 | struct pglist_data *pgdat_list __read_mostly; | ||
53 | unsigned long totalram_pages __read_mostly; | 52 | unsigned long totalram_pages __read_mostly; |
54 | unsigned long totalhigh_pages __read_mostly; | 53 | unsigned long totalhigh_pages __read_mostly; |
55 | long nr_swap_pages; | 54 | long nr_swap_pages; |
56 | int percpu_pagelist_fraction; | 55 | int percpu_pagelist_fraction; |
57 | 56 | ||
58 | static void fastcall free_hot_cold_page(struct page *page, int cold); | ||
59 | static void __free_pages_ok(struct page *page, unsigned int order); | 57 | static void __free_pages_ok(struct page *page, unsigned int order); |
60 | 58 | ||
61 | /* | 59 | /* |
@@ -190,7 +188,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
190 | for (i = 0; i < nr_pages; i++) { | 188 | for (i = 0; i < nr_pages; i++) { |
191 | struct page *p = page + i; | 189 | struct page *p = page + i; |
192 | 190 | ||
193 | SetPageCompound(p); | 191 | __SetPageCompound(p); |
194 | set_page_private(p, (unsigned long)page); | 192 | set_page_private(p, (unsigned long)page); |
195 | } | 193 | } |
196 | } | 194 | } |
@@ -209,10 +207,24 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
209 | if (unlikely(!PageCompound(p) | | 207 | if (unlikely(!PageCompound(p) | |
210 | (page_private(p) != (unsigned long)page))) | 208 | (page_private(p) != (unsigned long)page))) |
211 | bad_page(page); | 209 | bad_page(page); |
212 | ClearPageCompound(p); | 210 | __ClearPageCompound(p); |
213 | } | 211 | } |
214 | } | 212 | } |
215 | 213 | ||
214 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | ||
215 | { | ||
216 | int i; | ||
217 | |||
218 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | ||
219 | /* | ||
220 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | ||
221 | * and __GFP_HIGHMEM from hard or soft interrupt context. | ||
222 | */ | ||
223 | BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | ||
224 | for (i = 0; i < (1 << order); i++) | ||
225 | clear_highpage(page + i); | ||
226 | } | ||
227 | |||
216 | /* | 228 | /* |
217 | * function for dealing with page's order in buddy system. | 229 | * function for dealing with page's order in buddy system. |
218 | * zone->lock is already acquired when we use these. | 230 | * zone->lock is already acquired when we use these. |
@@ -423,11 +435,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
423 | mutex_debug_check_no_locks_freed(page_address(page), | 435 | mutex_debug_check_no_locks_freed(page_address(page), |
424 | PAGE_SIZE<<order); | 436 | PAGE_SIZE<<order); |
425 | 437 | ||
426 | #ifndef CONFIG_MMU | ||
427 | for (i = 1 ; i < (1 << order) ; ++i) | ||
428 | __put_page(page + i); | ||
429 | #endif | ||
430 | |||
431 | for (i = 0 ; i < (1 << order) ; ++i) | 438 | for (i = 0 ; i < (1 << order) ; ++i) |
432 | reserved += free_pages_check(page + i); | 439 | reserved += free_pages_check(page + i); |
433 | if (reserved) | 440 | if (reserved) |
@@ -448,28 +455,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | |||
448 | if (order == 0) { | 455 | if (order == 0) { |
449 | __ClearPageReserved(page); | 456 | __ClearPageReserved(page); |
450 | set_page_count(page, 0); | 457 | set_page_count(page, 0); |
451 | 458 | set_page_refcounted(page); | |
452 | free_hot_cold_page(page, 0); | 459 | __free_page(page); |
453 | } else { | 460 | } else { |
454 | LIST_HEAD(list); | ||
455 | int loop; | 461 | int loop; |
456 | 462 | ||
463 | prefetchw(page); | ||
457 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | 464 | for (loop = 0; loop < BITS_PER_LONG; loop++) { |
458 | struct page *p = &page[loop]; | 465 | struct page *p = &page[loop]; |
459 | 466 | ||
460 | if (loop + 16 < BITS_PER_LONG) | 467 | if (loop + 1 < BITS_PER_LONG) |
461 | prefetchw(p + 16); | 468 | prefetchw(p + 1); |
462 | __ClearPageReserved(p); | 469 | __ClearPageReserved(p); |
463 | set_page_count(p, 0); | 470 | set_page_count(p, 0); |
464 | } | 471 | } |
465 | 472 | ||
466 | arch_free_page(page, order); | 473 | set_page_refcounted(page); |
467 | 474 | __free_pages(page, order); | |
468 | mod_page_state(pgfree, 1 << order); | ||
469 | |||
470 | list_add(&page->lru, &list); | ||
471 | kernel_map_pages(page, 1 << order, 0); | ||
472 | free_pages_bulk(page_zone(page), 1, &list, order); | ||
473 | } | 475 | } |
474 | } | 476 | } |
475 | 477 | ||
@@ -507,7 +509,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
507 | /* | 509 | /* |
508 | * This page is about to be returned from the page allocator | 510 | * This page is about to be returned from the page allocator |
509 | */ | 511 | */ |
510 | static int prep_new_page(struct page *page, int order) | 512 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) |
511 | { | 513 | { |
512 | if (unlikely(page_mapcount(page) | | 514 | if (unlikely(page_mapcount(page) | |
513 | (page->mapping != NULL) | | 515 | (page->mapping != NULL) | |
@@ -536,8 +538,15 @@ static int prep_new_page(struct page *page, int order) | |||
536 | 1 << PG_referenced | 1 << PG_arch_1 | | 538 | 1 << PG_referenced | 1 << PG_arch_1 | |
537 | 1 << PG_checked | 1 << PG_mappedtodisk); | 539 | 1 << PG_checked | 1 << PG_mappedtodisk); |
538 | set_page_private(page, 0); | 540 | set_page_private(page, 0); |
539 | set_page_refs(page, order); | 541 | set_page_refcounted(page); |
540 | kernel_map_pages(page, 1 << order, 1); | 542 | kernel_map_pages(page, 1 << order, 1); |
543 | |||
544 | if (gfp_flags & __GFP_ZERO) | ||
545 | prep_zero_page(page, order, gfp_flags); | ||
546 | |||
547 | if (order && (gfp_flags & __GFP_COMP)) | ||
548 | prep_compound_page(page, order); | ||
549 | |||
541 | return 0; | 550 | return 0; |
542 | } | 551 | } |
543 | 552 | ||
@@ -593,13 +602,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
593 | /* | 602 | /* |
594 | * Called from the slab reaper to drain pagesets on a particular node that | 603 | * Called from the slab reaper to drain pagesets on a particular node that |
595 | * belong to the currently executing processor. | 604 | * belong to the currently executing processor. |
605 | * Note that this function must be called with the thread pinned to | ||
606 | * a single processor. | ||
596 | */ | 607 | */ |
597 | void drain_node_pages(int nodeid) | 608 | void drain_node_pages(int nodeid) |
598 | { | 609 | { |
599 | int i, z; | 610 | int i, z; |
600 | unsigned long flags; | 611 | unsigned long flags; |
601 | 612 | ||
602 | local_irq_save(flags); | ||
603 | for (z = 0; z < MAX_NR_ZONES; z++) { | 613 | for (z = 0; z < MAX_NR_ZONES; z++) { |
604 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 614 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; |
605 | struct per_cpu_pageset *pset; | 615 | struct per_cpu_pageset *pset; |
@@ -609,11 +619,14 @@ void drain_node_pages(int nodeid) | |||
609 | struct per_cpu_pages *pcp; | 619 | struct per_cpu_pages *pcp; |
610 | 620 | ||
611 | pcp = &pset->pcp[i]; | 621 | pcp = &pset->pcp[i]; |
612 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 622 | if (pcp->count) { |
613 | pcp->count = 0; | 623 | local_irq_save(flags); |
624 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | ||
625 | pcp->count = 0; | ||
626 | local_irq_restore(flags); | ||
627 | } | ||
614 | } | 628 | } |
615 | } | 629 | } |
616 | local_irq_restore(flags); | ||
617 | } | 630 | } |
618 | #endif | 631 | #endif |
619 | 632 | ||
@@ -743,13 +756,22 @@ void fastcall free_cold_page(struct page *page) | |||
743 | free_hot_cold_page(page, 1); | 756 | free_hot_cold_page(page, 1); |
744 | } | 757 | } |
745 | 758 | ||
746 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 759 | /* |
760 | * split_page takes a non-compound higher-order page, and splits it into | ||
761 | * n (1<<order) sub-pages: page[0..n] | ||
762 | * Each sub-page must be freed individually. | ||
763 | * | ||
764 | * Note: this is probably too low level an operation for use in drivers. | ||
765 | * Please consult with lkml before using this in your driver. | ||
766 | */ | ||
767 | void split_page(struct page *page, unsigned int order) | ||
747 | { | 768 | { |
748 | int i; | 769 | int i; |
749 | 770 | ||
750 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | 771 | BUG_ON(PageCompound(page)); |
751 | for(i = 0; i < (1 << order); i++) | 772 | BUG_ON(!page_count(page)); |
752 | clear_highpage(page + i); | 773 | for (i = 1; i < (1 << order); i++) |
774 | set_page_refcounted(page + i); | ||
753 | } | 775 | } |
754 | 776 | ||
755 | /* | 777 | /* |
@@ -795,14 +817,8 @@ again: | |||
795 | put_cpu(); | 817 | put_cpu(); |
796 | 818 | ||
797 | BUG_ON(bad_range(zone, page)); | 819 | BUG_ON(bad_range(zone, page)); |
798 | if (prep_new_page(page, order)) | 820 | if (prep_new_page(page, order, gfp_flags)) |
799 | goto again; | 821 | goto again; |
800 | |||
801 | if (gfp_flags & __GFP_ZERO) | ||
802 | prep_zero_page(page, order, gfp_flags); | ||
803 | |||
804 | if (order && (gfp_flags & __GFP_COMP)) | ||
805 | prep_compound_page(page, order); | ||
806 | return page; | 822 | return page; |
807 | 823 | ||
808 | failed: | 824 | failed: |
@@ -926,7 +942,8 @@ restart: | |||
926 | goto got_pg; | 942 | goto got_pg; |
927 | 943 | ||
928 | do { | 944 | do { |
929 | wakeup_kswapd(*z, order); | 945 | if (cpuset_zone_allowed(*z, gfp_mask)) |
946 | wakeup_kswapd(*z, order); | ||
930 | } while (*(++z)); | 947 | } while (*(++z)); |
931 | 948 | ||
932 | /* | 949 | /* |
@@ -1183,7 +1200,7 @@ unsigned int nr_free_highpages (void) | |||
1183 | pg_data_t *pgdat; | 1200 | pg_data_t *pgdat; |
1184 | unsigned int pages = 0; | 1201 | unsigned int pages = 0; |
1185 | 1202 | ||
1186 | for_each_pgdat(pgdat) | 1203 | for_each_online_pgdat(pgdat) |
1187 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; | 1204 | pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; |
1188 | 1205 | ||
1189 | return pages; | 1206 | return pages; |
@@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | |||
1214 | 1231 | ||
1215 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 1232 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) |
1216 | { | 1233 | { |
1217 | int cpu = 0; | 1234 | unsigned cpu; |
1218 | 1235 | ||
1219 | memset(ret, 0, nr * sizeof(unsigned long)); | 1236 | memset(ret, 0, nr * sizeof(unsigned long)); |
1220 | cpus_and(*cpumask, *cpumask, cpu_online_map); | 1237 | cpus_and(*cpumask, *cpumask, cpu_online_map); |
1221 | 1238 | ||
1222 | cpu = first_cpu(*cpumask); | 1239 | for_each_cpu_mask(cpu, *cpumask) { |
1223 | while (cpu < NR_CPUS) { | 1240 | unsigned long *in; |
1224 | unsigned long *in, *out, off; | 1241 | unsigned long *out; |
1225 | 1242 | unsigned off; | |
1226 | if (!cpu_isset(cpu, *cpumask)) | 1243 | unsigned next_cpu; |
1227 | continue; | ||
1228 | 1244 | ||
1229 | in = (unsigned long *)&per_cpu(page_states, cpu); | 1245 | in = (unsigned long *)&per_cpu(page_states, cpu); |
1230 | 1246 | ||
1231 | cpu = next_cpu(cpu, *cpumask); | 1247 | next_cpu = next_cpu(cpu, *cpumask); |
1232 | 1248 | if (likely(next_cpu < NR_CPUS)) | |
1233 | if (likely(cpu < NR_CPUS)) | 1249 | prefetch(&per_cpu(page_states, next_cpu)); |
1234 | prefetch(&per_cpu(page_states, cpu)); | ||
1235 | 1250 | ||
1236 | out = (unsigned long *)ret; | 1251 | out = (unsigned long *)ret; |
1237 | for (off = 0; off < nr; off++) | 1252 | for (off = 0; off < nr; off++) |
@@ -1327,7 +1342,7 @@ void get_zone_counts(unsigned long *active, | |||
1327 | *active = 0; | 1342 | *active = 0; |
1328 | *inactive = 0; | 1343 | *inactive = 0; |
1329 | *free = 0; | 1344 | *free = 0; |
1330 | for_each_pgdat(pgdat) { | 1345 | for_each_online_pgdat(pgdat) { |
1331 | unsigned long l, m, n; | 1346 | unsigned long l, m, n; |
1332 | __get_zone_counts(&l, &m, &n, pgdat); | 1347 | __get_zone_counts(&l, &m, &n, pgdat); |
1333 | *active += l; | 1348 | *active += l; |
@@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1764 | continue; | 1779 | continue; |
1765 | page = pfn_to_page(pfn); | 1780 | page = pfn_to_page(pfn); |
1766 | set_page_links(page, zone, nid, pfn); | 1781 | set_page_links(page, zone, nid, pfn); |
1767 | set_page_count(page, 1); | 1782 | init_page_count(page); |
1768 | reset_page_mapcount(page); | 1783 | reset_page_mapcount(page); |
1769 | SetPageReserved(page); | 1784 | SetPageReserved(page); |
1770 | INIT_LIST_HEAD(&page->lru); | 1785 | INIT_LIST_HEAD(&page->lru); |
@@ -2013,8 +2028,9 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
2013 | setup_pageset(zone_pcp(zone,cpu), batch); | 2028 | setup_pageset(zone_pcp(zone,cpu), batch); |
2014 | #endif | 2029 | #endif |
2015 | } | 2030 | } |
2016 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 2031 | if (zone->present_pages) |
2017 | zone->name, zone->present_pages, batch); | 2032 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", |
2033 | zone->name, zone->present_pages, batch); | ||
2018 | } | 2034 | } |
2019 | 2035 | ||
2020 | static __meminit void init_currently_empty_zone(struct zone *zone, | 2036 | static __meminit void init_currently_empty_zone(struct zone *zone, |
@@ -2025,7 +2041,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone, | |||
2025 | zone_wait_table_init(zone, size); | 2041 | zone_wait_table_init(zone, size); |
2026 | pgdat->nr_zones = zone_idx(zone) + 1; | 2042 | pgdat->nr_zones = zone_idx(zone) + 1; |
2027 | 2043 | ||
2028 | zone->zone_mem_map = pfn_to_page(zone_start_pfn); | ||
2029 | zone->zone_start_pfn = zone_start_pfn; | 2044 | zone->zone_start_pfn = zone_start_pfn; |
2030 | 2045 | ||
2031 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | 2046 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); |
@@ -2153,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos) | |||
2153 | { | 2168 | { |
2154 | pg_data_t *pgdat; | 2169 | pg_data_t *pgdat; |
2155 | loff_t node = *pos; | 2170 | loff_t node = *pos; |
2156 | 2171 | for (pgdat = first_online_pgdat(); | |
2157 | for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) | 2172 | pgdat && node; |
2173 | pgdat = next_online_pgdat(pgdat)) | ||
2158 | --node; | 2174 | --node; |
2159 | 2175 | ||
2160 | return pgdat; | 2176 | return pgdat; |
@@ -2165,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | |||
2165 | pg_data_t *pgdat = (pg_data_t *)arg; | 2181 | pg_data_t *pgdat = (pg_data_t *)arg; |
2166 | 2182 | ||
2167 | (*pos)++; | 2183 | (*pos)++; |
2168 | return pgdat->pgdat_next; | 2184 | return next_online_pgdat(pgdat); |
2169 | } | 2185 | } |
2170 | 2186 | ||
2171 | static void frag_stop(struct seq_file *m, void *arg) | 2187 | static void frag_stop(struct seq_file *m, void *arg) |
@@ -2466,7 +2482,7 @@ static void setup_per_zone_lowmem_reserve(void) | |||
2466 | struct pglist_data *pgdat; | 2482 | struct pglist_data *pgdat; |
2467 | int j, idx; | 2483 | int j, idx; |
2468 | 2484 | ||
2469 | for_each_pgdat(pgdat) { | 2485 | for_each_online_pgdat(pgdat) { |
2470 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2486 | for (j = 0; j < MAX_NR_ZONES; j++) { |
2471 | struct zone *zone = pgdat->node_zones + j; | 2487 | struct zone *zone = pgdat->node_zones + j; |
2472 | unsigned long present_pages = zone->present_pages; | 2488 | unsigned long present_pages = zone->present_pages; |
@@ -2685,8 +2701,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2685 | else | 2701 | else |
2686 | numentries <<= (PAGE_SHIFT - scale); | 2702 | numentries <<= (PAGE_SHIFT - scale); |
2687 | } | 2703 | } |
2688 | /* rounded up to nearest power of 2 in size */ | 2704 | numentries = roundup_pow_of_two(numentries); |
2689 | numentries = 1UL << (long_log2(numentries) + 1); | ||
2690 | 2705 | ||
2691 | /* limit allocation size to 1/16 total memory by default */ | 2706 | /* limit allocation size to 1/16 total memory by default */ |
2692 | if (max == 0) { | 2707 | if (max == 0) { |
@@ -2729,3 +2744,44 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2729 | 2744 | ||
2730 | return table; | 2745 | return table; |
2731 | } | 2746 | } |
2747 | |||
2748 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | ||
2749 | /* | ||
2750 | * pfn <-> page translation. out-of-line version. | ||
2751 | * (see asm-generic/memory_model.h) | ||
2752 | */ | ||
2753 | #if defined(CONFIG_FLATMEM) | ||
2754 | struct page *pfn_to_page(unsigned long pfn) | ||
2755 | { | ||
2756 | return mem_map + (pfn - ARCH_PFN_OFFSET); | ||
2757 | } | ||
2758 | unsigned long page_to_pfn(struct page *page) | ||
2759 | { | ||
2760 | return (page - mem_map) + ARCH_PFN_OFFSET; | ||
2761 | } | ||
2762 | #elif defined(CONFIG_DISCONTIGMEM) | ||
2763 | struct page *pfn_to_page(unsigned long pfn) | ||
2764 | { | ||
2765 | int nid = arch_pfn_to_nid(pfn); | ||
2766 | return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); | ||
2767 | } | ||
2768 | unsigned long page_to_pfn(struct page *page) | ||
2769 | { | ||
2770 | struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); | ||
2771 | return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; | ||
2772 | } | ||
2773 | #elif defined(CONFIG_SPARSEMEM) | ||
2774 | struct page *pfn_to_page(unsigned long pfn) | ||
2775 | { | ||
2776 | return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; | ||
2777 | } | ||
2778 | |||
2779 | unsigned long page_to_pfn(struct page *page) | ||
2780 | { | ||
2781 | long section_id = page_to_section(page); | ||
2782 | return page - __section_mem_map_addr(__nr_to_section(section_id)); | ||
2783 | } | ||
2784 | #endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */ | ||
2785 | EXPORT_SYMBOL(pfn_to_page); | ||
2786 | EXPORT_SYMBOL(page_to_pfn); | ||
2787 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | ||
diff --git a/mm/readahead.c b/mm/readahead.c index 9f0b98227b41..ba7db816f4c8 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -53,13 +53,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) | |||
53 | return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 53 | return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
54 | } | 54 | } |
55 | 55 | ||
56 | static inline void reset_ahead_window(struct file_ra_state *ra) | ||
57 | { | ||
58 | /* | ||
59 | * ... but preserve ahead_start + ahead_size value, | ||
60 | * see 'recheck:' label in page_cache_readahead(). | ||
61 | * Note: We never use ->ahead_size as rvalue without | ||
62 | * checking ->ahead_start != 0 first. | ||
63 | */ | ||
64 | ra->ahead_size += ra->ahead_start; | ||
65 | ra->ahead_start = 0; | ||
66 | } | ||
67 | |||
56 | static inline void ra_off(struct file_ra_state *ra) | 68 | static inline void ra_off(struct file_ra_state *ra) |
57 | { | 69 | { |
58 | ra->start = 0; | 70 | ra->start = 0; |
59 | ra->flags = 0; | 71 | ra->flags = 0; |
60 | ra->size = 0; | 72 | ra->size = 0; |
61 | ra->ahead_start = 0; | 73 | reset_ahead_window(ra); |
62 | ra->ahead_size = 0; | ||
63 | return; | 74 | return; |
64 | } | 75 | } |
65 | 76 | ||
@@ -73,10 +84,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) | |||
73 | { | 84 | { |
74 | unsigned long newsize = roundup_pow_of_two(size); | 85 | unsigned long newsize = roundup_pow_of_two(size); |
75 | 86 | ||
76 | if (newsize <= max / 64) | 87 | if (newsize <= max / 32) |
77 | newsize = newsize * newsize; | 88 | newsize = newsize * 4; |
78 | else if (newsize <= max / 4) | 89 | else if (newsize <= max / 4) |
79 | newsize = max / 4; | 90 | newsize = newsize * 2; |
80 | else | 91 | else |
81 | newsize = max; | 92 | newsize = max; |
82 | return newsize; | 93 | return newsize; |
@@ -427,8 +438,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, | |||
427 | * congestion. The ahead window will any way be closed | 438 | * congestion. The ahead window will any way be closed |
428 | * in case we failed due to excessive page cache hits. | 439 | * in case we failed due to excessive page cache hits. |
429 | */ | 440 | */ |
430 | ra->ahead_start = 0; | 441 | reset_ahead_window(ra); |
431 | ra->ahead_size = 0; | ||
432 | } | 442 | } |
433 | 443 | ||
434 | return ret; | 444 | return ret; |
@@ -521,11 +531,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, | |||
521 | * If we get here we are doing sequential IO and this was not the first | 531 | * If we get here we are doing sequential IO and this was not the first |
522 | * occurrence (ie we have an existing window) | 532 | * occurrence (ie we have an existing window) |
523 | */ | 533 | */ |
524 | |||
525 | if (ra->ahead_start == 0) { /* no ahead window yet */ | 534 | if (ra->ahead_start == 0) { /* no ahead window yet */ |
526 | if (!make_ahead_window(mapping, filp, ra, 0)) | 535 | if (!make_ahead_window(mapping, filp, ra, 0)) |
527 | goto out; | 536 | goto recheck; |
528 | } | 537 | } |
538 | |||
529 | /* | 539 | /* |
530 | * Already have an ahead window, check if we crossed into it. | 540 | * Already have an ahead window, check if we crossed into it. |
531 | * If so, shift windows and issue a new ahead window. | 541 | * If so, shift windows and issue a new ahead window. |
@@ -537,11 +547,16 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, | |||
537 | ra->start = ra->ahead_start; | 547 | ra->start = ra->ahead_start; |
538 | ra->size = ra->ahead_size; | 548 | ra->size = ra->ahead_size; |
539 | make_ahead_window(mapping, filp, ra, 0); | 549 | make_ahead_window(mapping, filp, ra, 0); |
550 | recheck: | ||
551 | /* prev_page shouldn't overrun the ahead window */ | ||
552 | ra->prev_page = min(ra->prev_page, | ||
553 | ra->ahead_start + ra->ahead_size - 1); | ||
540 | } | 554 | } |
541 | 555 | ||
542 | out: | 556 | out: |
543 | return ra->prev_page + 1; | 557 | return ra->prev_page + 1; |
544 | } | 558 | } |
559 | EXPORT_SYMBOL_GPL(page_cache_readahead); | ||
545 | 560 | ||
546 | /* | 561 | /* |
547 | * handle_ra_miss() is called when it is known that a page which should have | 562 | * handle_ra_miss() is called when it is known that a page which should have |
@@ -56,13 +56,11 @@ | |||
56 | 56 | ||
57 | #include <asm/tlbflush.h> | 57 | #include <asm/tlbflush.h> |
58 | 58 | ||
59 | //#define RMAP_DEBUG /* can be enabled only for debugging */ | 59 | struct kmem_cache *anon_vma_cachep; |
60 | |||
61 | kmem_cache_t *anon_vma_cachep; | ||
62 | 60 | ||
63 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) | 61 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) |
64 | { | 62 | { |
65 | #ifdef RMAP_DEBUG | 63 | #ifdef CONFIG_DEBUG_VM |
66 | struct anon_vma *anon_vma = find_vma->anon_vma; | 64 | struct anon_vma *anon_vma = find_vma->anon_vma; |
67 | struct vm_area_struct *vma; | 65 | struct vm_area_struct *vma; |
68 | unsigned int mapcount = 0; | 66 | unsigned int mapcount = 0; |
@@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
166 | anon_vma_free(anon_vma); | 164 | anon_vma_free(anon_vma); |
167 | } | 165 | } |
168 | 166 | ||
169 | static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) | 167 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, |
168 | unsigned long flags) | ||
170 | { | 169 | { |
171 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 170 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == |
172 | SLAB_CTOR_CONSTRUCTOR) { | 171 | SLAB_CTOR_CONSTRUCTOR) { |
@@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page) | |||
550 | void page_remove_rmap(struct page *page) | 549 | void page_remove_rmap(struct page *page) |
551 | { | 550 | { |
552 | if (atomic_add_negative(-1, &page->_mapcount)) { | 551 | if (atomic_add_negative(-1, &page->_mapcount)) { |
553 | if (page_mapcount(page) < 0) { | 552 | #ifdef CONFIG_DEBUG_VM |
553 | if (unlikely(page_mapcount(page) < 0)) { | ||
554 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | 554 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); |
555 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | 555 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); |
556 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | 556 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); |
557 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | 557 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); |
558 | } | 558 | } |
559 | 559 | #endif | |
560 | BUG_ON(page_mapcount(page) < 0); | 560 | BUG_ON(page_mapcount(page) < 0); |
561 | /* | 561 | /* |
562 | * It would be tidy to reset the PageAnon mapping here, | 562 | * It would be tidy to reset the PageAnon mapping here, |
diff --git a/mm/shmem.c b/mm/shmem.c index 7c455fbaff7b..37eaf42ed2c6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -875,7 +875,7 @@ redirty: | |||
875 | } | 875 | } |
876 | 876 | ||
877 | #ifdef CONFIG_NUMA | 877 | #ifdef CONFIG_NUMA |
878 | static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | 878 | static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) |
879 | { | 879 | { |
880 | char *nodelist = strchr(value, ':'); | 880 | char *nodelist = strchr(value, ':'); |
881 | int err = 1; | 881 | int err = 1; |
@@ -2119,7 +2119,7 @@ failed: | |||
2119 | return err; | 2119 | return err; |
2120 | } | 2120 | } |
2121 | 2121 | ||
2122 | static kmem_cache_t *shmem_inode_cachep; | 2122 | static struct kmem_cache *shmem_inode_cachep; |
2123 | 2123 | ||
2124 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2124 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2125 | { | 2125 | { |
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode) | |||
2139 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2139 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
2140 | } | 2140 | } |
2141 | 2141 | ||
2142 | static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) | 2142 | static void init_once(void *foo, struct kmem_cache *cachep, |
2143 | unsigned long flags) | ||
2143 | { | 2144 | { |
2144 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2145 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2145 | 2146 | ||
@@ -50,7 +50,7 @@ | |||
50 | * The head array is strictly LIFO and should improve the cache hit rates. | 50 | * The head array is strictly LIFO and should improve the cache hit rates. |
51 | * On SMP, it additionally reduces the spinlock operations. | 51 | * On SMP, it additionally reduces the spinlock operations. |
52 | * | 52 | * |
53 | * The c_cpuarray may not be read with enabled local interrupts - | 53 | * The c_cpuarray may not be read with enabled local interrupts - |
54 | * it's changed with a smp_call_function(). | 54 | * it's changed with a smp_call_function(). |
55 | * | 55 | * |
56 | * SMP synchronization: | 56 | * SMP synchronization: |
@@ -94,6 +94,7 @@ | |||
94 | #include <linux/interrupt.h> | 94 | #include <linux/interrupt.h> |
95 | #include <linux/init.h> | 95 | #include <linux/init.h> |
96 | #include <linux/compiler.h> | 96 | #include <linux/compiler.h> |
97 | #include <linux/cpuset.h> | ||
97 | #include <linux/seq_file.h> | 98 | #include <linux/seq_file.h> |
98 | #include <linux/notifier.h> | 99 | #include <linux/notifier.h> |
99 | #include <linux/kallsyms.h> | 100 | #include <linux/kallsyms.h> |
@@ -170,15 +171,15 @@ | |||
170 | #if DEBUG | 171 | #if DEBUG |
171 | # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ | 172 | # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ |
172 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ | 173 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ |
173 | SLAB_NO_REAP | SLAB_CACHE_DMA | \ | 174 | SLAB_CACHE_DMA | \ |
174 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ | 175 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ |
175 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 176 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
176 | SLAB_DESTROY_BY_RCU) | 177 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) |
177 | #else | 178 | #else |
178 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ | 179 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
179 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ | 180 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ |
180 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 181 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
181 | SLAB_DESTROY_BY_RCU) | 182 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) |
182 | #endif | 183 | #endif |
183 | 184 | ||
184 | /* | 185 | /* |
@@ -203,7 +204,8 @@ | |||
203 | typedef unsigned int kmem_bufctl_t; | 204 | typedef unsigned int kmem_bufctl_t; |
204 | #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) | 205 | #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) |
205 | #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) | 206 | #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) |
206 | #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) | 207 | #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) |
208 | #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) | ||
207 | 209 | ||
208 | /* Max number of objs-per-slab for caches which use off-slab slabs. | 210 | /* Max number of objs-per-slab for caches which use off-slab slabs. |
209 | * Needed to avoid a possible looping condition in cache_grow(). | 211 | * Needed to avoid a possible looping condition in cache_grow(). |
@@ -266,16 +268,17 @@ struct array_cache { | |||
266 | unsigned int batchcount; | 268 | unsigned int batchcount; |
267 | unsigned int touched; | 269 | unsigned int touched; |
268 | spinlock_t lock; | 270 | spinlock_t lock; |
269 | void *entry[0]; /* | 271 | void *entry[0]; /* |
270 | * Must have this definition in here for the proper | 272 | * Must have this definition in here for the proper |
271 | * alignment of array_cache. Also simplifies accessing | 273 | * alignment of array_cache. Also simplifies accessing |
272 | * the entries. | 274 | * the entries. |
273 | * [0] is for gcc 2.95. It should really be []. | 275 | * [0] is for gcc 2.95. It should really be []. |
274 | */ | 276 | */ |
275 | }; | 277 | }; |
276 | 278 | ||
277 | /* bootstrap: The caches do not work without cpuarrays anymore, | 279 | /* |
278 | * but the cpuarrays are allocated from the generic caches... | 280 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
281 | * cpuarrays are allocated from the generic caches... | ||
279 | */ | 282 | */ |
280 | #define BOOT_CPUCACHE_ENTRIES 1 | 283 | #define BOOT_CPUCACHE_ENTRIES 1 |
281 | struct arraycache_init { | 284 | struct arraycache_init { |
@@ -291,13 +294,13 @@ struct kmem_list3 { | |||
291 | struct list_head slabs_full; | 294 | struct list_head slabs_full; |
292 | struct list_head slabs_free; | 295 | struct list_head slabs_free; |
293 | unsigned long free_objects; | 296 | unsigned long free_objects; |
294 | unsigned long next_reap; | ||
295 | int free_touched; | ||
296 | unsigned int free_limit; | 297 | unsigned int free_limit; |
297 | unsigned int colour_next; /* Per-node cache coloring */ | 298 | unsigned int colour_next; /* Per-node cache coloring */ |
298 | spinlock_t list_lock; | 299 | spinlock_t list_lock; |
299 | struct array_cache *shared; /* shared per node */ | 300 | struct array_cache *shared; /* shared per node */ |
300 | struct array_cache **alien; /* on other nodes */ | 301 | struct array_cache **alien; /* on other nodes */ |
302 | unsigned long next_reap; /* updated without locking */ | ||
303 | int free_touched; /* updated without locking */ | ||
301 | }; | 304 | }; |
302 | 305 | ||
303 | /* | 306 | /* |
@@ -310,10 +313,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; | |||
310 | #define SIZE_L3 (1 + MAX_NUMNODES) | 313 | #define SIZE_L3 (1 + MAX_NUMNODES) |
311 | 314 | ||
312 | /* | 315 | /* |
313 | * This function must be completely optimized away if | 316 | * This function must be completely optimized away if a constant is passed to |
314 | * a constant is passed to it. Mostly the same as | 317 | * it. Mostly the same as what is in linux/slab.h except it returns an index. |
315 | * what is in linux/slab.h except it returns an | ||
316 | * index. | ||
317 | */ | 318 | */ |
318 | static __always_inline int index_of(const size_t size) | 319 | static __always_inline int index_of(const size_t size) |
319 | { | 320 | { |
@@ -351,14 +352,14 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
351 | parent->free_touched = 0; | 352 | parent->free_touched = 0; |
352 | } | 353 | } |
353 | 354 | ||
354 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ | 355 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ |
355 | do { \ | 356 | do { \ |
356 | INIT_LIST_HEAD(listp); \ | 357 | INIT_LIST_HEAD(listp); \ |
357 | list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ | 358 | list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ |
358 | } while (0) | 359 | } while (0) |
359 | 360 | ||
360 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ | 361 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ |
361 | do { \ | 362 | do { \ |
362 | MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ | 363 | MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ |
363 | MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ | 364 | MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ |
364 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 365 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
@@ -373,28 +374,30 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
373 | struct kmem_cache { | 374 | struct kmem_cache { |
374 | /* 1) per-cpu data, touched during every alloc/free */ | 375 | /* 1) per-cpu data, touched during every alloc/free */ |
375 | struct array_cache *array[NR_CPUS]; | 376 | struct array_cache *array[NR_CPUS]; |
377 | /* 2) Cache tunables. Protected by cache_chain_mutex */ | ||
376 | unsigned int batchcount; | 378 | unsigned int batchcount; |
377 | unsigned int limit; | 379 | unsigned int limit; |
378 | unsigned int shared; | 380 | unsigned int shared; |
381 | |||
379 | unsigned int buffer_size; | 382 | unsigned int buffer_size; |
380 | /* 2) touched by every alloc & free from the backend */ | 383 | /* 3) touched by every alloc & free from the backend */ |
381 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | 384 | struct kmem_list3 *nodelists[MAX_NUMNODES]; |
382 | unsigned int flags; /* constant flags */ | ||
383 | unsigned int num; /* # of objs per slab */ | ||
384 | spinlock_t spinlock; | ||
385 | 385 | ||
386 | /* 3) cache_grow/shrink */ | 386 | unsigned int flags; /* constant flags */ |
387 | unsigned int num; /* # of objs per slab */ | ||
388 | |||
389 | /* 4) cache_grow/shrink */ | ||
387 | /* order of pgs per slab (2^n) */ | 390 | /* order of pgs per slab (2^n) */ |
388 | unsigned int gfporder; | 391 | unsigned int gfporder; |
389 | 392 | ||
390 | /* force GFP flags, e.g. GFP_DMA */ | 393 | /* force GFP flags, e.g. GFP_DMA */ |
391 | gfp_t gfpflags; | 394 | gfp_t gfpflags; |
392 | 395 | ||
393 | size_t colour; /* cache colouring range */ | 396 | size_t colour; /* cache colouring range */ |
394 | unsigned int colour_off; /* colour offset */ | 397 | unsigned int colour_off; /* colour offset */ |
395 | struct kmem_cache *slabp_cache; | 398 | struct kmem_cache *slabp_cache; |
396 | unsigned int slab_size; | 399 | unsigned int slab_size; |
397 | unsigned int dflags; /* dynamic flags */ | 400 | unsigned int dflags; /* dynamic flags */ |
398 | 401 | ||
399 | /* constructor func */ | 402 | /* constructor func */ |
400 | void (*ctor) (void *, struct kmem_cache *, unsigned long); | 403 | void (*ctor) (void *, struct kmem_cache *, unsigned long); |
@@ -402,11 +405,11 @@ struct kmem_cache { | |||
402 | /* de-constructor func */ | 405 | /* de-constructor func */ |
403 | void (*dtor) (void *, struct kmem_cache *, unsigned long); | 406 | void (*dtor) (void *, struct kmem_cache *, unsigned long); |
404 | 407 | ||
405 | /* 4) cache creation/removal */ | 408 | /* 5) cache creation/removal */ |
406 | const char *name; | 409 | const char *name; |
407 | struct list_head next; | 410 | struct list_head next; |
408 | 411 | ||
409 | /* 5) statistics */ | 412 | /* 6) statistics */ |
410 | #if STATS | 413 | #if STATS |
411 | unsigned long num_active; | 414 | unsigned long num_active; |
412 | unsigned long num_allocations; | 415 | unsigned long num_allocations; |
@@ -438,8 +441,9 @@ struct kmem_cache { | |||
438 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 441 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
439 | 442 | ||
440 | #define BATCHREFILL_LIMIT 16 | 443 | #define BATCHREFILL_LIMIT 16 |
441 | /* Optimization question: fewer reaps means less | 444 | /* |
442 | * probability for unnecessary cpucache drain/refill cycles. | 445 | * Optimization question: fewer reaps means less probability for unnecessary |
446 | * cpucache drain/refill cycles. | ||
443 | * | 447 | * |
444 | * OTOH the cpuarrays can contain lots of objects, | 448 | * OTOH the cpuarrays can contain lots of objects, |
445 | * which could lock up otherwise freeable slabs. | 449 | * which could lock up otherwise freeable slabs. |
@@ -453,17 +457,19 @@ struct kmem_cache { | |||
453 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) | 457 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) |
454 | #define STATS_INC_GROWN(x) ((x)->grown++) | 458 | #define STATS_INC_GROWN(x) ((x)->grown++) |
455 | #define STATS_INC_REAPED(x) ((x)->reaped++) | 459 | #define STATS_INC_REAPED(x) ((x)->reaped++) |
456 | #define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ | 460 | #define STATS_SET_HIGH(x) \ |
457 | (x)->high_mark = (x)->num_active; \ | 461 | do { \ |
458 | } while (0) | 462 | if ((x)->num_active > (x)->high_mark) \ |
463 | (x)->high_mark = (x)->num_active; \ | ||
464 | } while (0) | ||
459 | #define STATS_INC_ERR(x) ((x)->errors++) | 465 | #define STATS_INC_ERR(x) ((x)->errors++) |
460 | #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) | 466 | #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) |
461 | #define STATS_INC_NODEFREES(x) ((x)->node_frees++) | 467 | #define STATS_INC_NODEFREES(x) ((x)->node_frees++) |
462 | #define STATS_SET_FREEABLE(x, i) \ | 468 | #define STATS_SET_FREEABLE(x, i) \ |
463 | do { if ((x)->max_freeable < i) \ | 469 | do { \ |
464 | (x)->max_freeable = i; \ | 470 | if ((x)->max_freeable < i) \ |
465 | } while (0) | 471 | (x)->max_freeable = i; \ |
466 | 472 | } while (0) | |
467 | #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) | 473 | #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) |
468 | #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) | 474 | #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) |
469 | #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) | 475 | #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) |
@@ -478,9 +484,7 @@ struct kmem_cache { | |||
478 | #define STATS_INC_ERR(x) do { } while (0) | 484 | #define STATS_INC_ERR(x) do { } while (0) |
479 | #define STATS_INC_NODEALLOCS(x) do { } while (0) | 485 | #define STATS_INC_NODEALLOCS(x) do { } while (0) |
480 | #define STATS_INC_NODEFREES(x) do { } while (0) | 486 | #define STATS_INC_NODEFREES(x) do { } while (0) |
481 | #define STATS_SET_FREEABLE(x, i) \ | 487 | #define STATS_SET_FREEABLE(x, i) do { } while (0) |
482 | do { } while (0) | ||
483 | |||
484 | #define STATS_INC_ALLOCHIT(x) do { } while (0) | 488 | #define STATS_INC_ALLOCHIT(x) do { } while (0) |
485 | #define STATS_INC_ALLOCMISS(x) do { } while (0) | 489 | #define STATS_INC_ALLOCMISS(x) do { } while (0) |
486 | #define STATS_INC_FREEHIT(x) do { } while (0) | 490 | #define STATS_INC_FREEHIT(x) do { } while (0) |
@@ -488,7 +492,8 @@ struct kmem_cache { | |||
488 | #endif | 492 | #endif |
489 | 493 | ||
490 | #if DEBUG | 494 | #if DEBUG |
491 | /* Magic nums for obj red zoning. | 495 | /* |
496 | * Magic nums for obj red zoning. | ||
492 | * Placed in the first word before and the first word after an obj. | 497 | * Placed in the first word before and the first word after an obj. |
493 | */ | 498 | */ |
494 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ | 499 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ |
@@ -499,7 +504,8 @@ struct kmem_cache { | |||
499 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ | 504 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ |
500 | #define POISON_END 0xa5 /* end-byte of poisoning */ | 505 | #define POISON_END 0xa5 /* end-byte of poisoning */ |
501 | 506 | ||
502 | /* memory layout of objects: | 507 | /* |
508 | * memory layout of objects: | ||
503 | * 0 : objp | 509 | * 0 : objp |
504 | * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that | 510 | * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that |
505 | * the end of an object is aligned with the end of the real | 511 | * the end of an object is aligned with the end of the real |
@@ -508,7 +514,8 @@ struct kmem_cache { | |||
508 | * redzone word. | 514 | * redzone word. |
509 | * cachep->obj_offset: The real object. | 515 | * cachep->obj_offset: The real object. |
510 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] | 516 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] |
511 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] | 517 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address |
518 | * [BYTES_PER_WORD long] | ||
512 | */ | 519 | */ |
513 | static int obj_offset(struct kmem_cache *cachep) | 520 | static int obj_offset(struct kmem_cache *cachep) |
514 | { | 521 | { |
@@ -552,8 +559,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
552 | #endif | 559 | #endif |
553 | 560 | ||
554 | /* | 561 | /* |
555 | * Maximum size of an obj (in 2^order pages) | 562 | * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp |
556 | * and absolute limit for the gfp order. | 563 | * order. |
557 | */ | 564 | */ |
558 | #if defined(CONFIG_LARGE_ALLOCS) | 565 | #if defined(CONFIG_LARGE_ALLOCS) |
559 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ | 566 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ |
@@ -573,9 +580,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
573 | #define BREAK_GFP_ORDER_LO 0 | 580 | #define BREAK_GFP_ORDER_LO 0 |
574 | static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; | 581 | static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; |
575 | 582 | ||
576 | /* Functions for storing/retrieving the cachep and or slab from the | 583 | /* |
577 | * global 'mem_map'. These are used to find the slab an obj belongs to. | 584 | * Functions for storing/retrieving the cachep and or slab from the page |
578 | * With kfree(), these are used to find the cache which an obj belongs to. | 585 | * allocator. These are used to find the slab an obj belongs to. With kfree(), |
586 | * these are used to find the cache which an obj belongs to. | ||
579 | */ | 587 | */ |
580 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | 588 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) |
581 | { | 589 | { |
@@ -584,6 +592,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | |||
584 | 592 | ||
585 | static inline struct kmem_cache *page_get_cache(struct page *page) | 593 | static inline struct kmem_cache *page_get_cache(struct page *page) |
586 | { | 594 | { |
595 | if (unlikely(PageCompound(page))) | ||
596 | page = (struct page *)page_private(page); | ||
587 | return (struct kmem_cache *)page->lru.next; | 597 | return (struct kmem_cache *)page->lru.next; |
588 | } | 598 | } |
589 | 599 | ||
@@ -594,6 +604,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab) | |||
594 | 604 | ||
595 | static inline struct slab *page_get_slab(struct page *page) | 605 | static inline struct slab *page_get_slab(struct page *page) |
596 | { | 606 | { |
607 | if (unlikely(PageCompound(page))) | ||
608 | page = (struct page *)page_private(page); | ||
597 | return (struct slab *)page->lru.prev; | 609 | return (struct slab *)page->lru.prev; |
598 | } | 610 | } |
599 | 611 | ||
@@ -609,7 +621,21 @@ static inline struct slab *virt_to_slab(const void *obj) | |||
609 | return page_get_slab(page); | 621 | return page_get_slab(page); |
610 | } | 622 | } |
611 | 623 | ||
612 | /* These are the default caches for kmalloc. Custom caches can have other sizes. */ | 624 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, |
625 | unsigned int idx) | ||
626 | { | ||
627 | return slab->s_mem + cache->buffer_size * idx; | ||
628 | } | ||
629 | |||
630 | static inline unsigned int obj_to_index(struct kmem_cache *cache, | ||
631 | struct slab *slab, void *obj) | ||
632 | { | ||
633 | return (unsigned)(obj - slab->s_mem) / cache->buffer_size; | ||
634 | } | ||
635 | |||
636 | /* | ||
637 | * These are the default caches for kmalloc. Custom caches can have other sizes. | ||
638 | */ | ||
613 | struct cache_sizes malloc_sizes[] = { | 639 | struct cache_sizes malloc_sizes[] = { |
614 | #define CACHE(x) { .cs_size = (x) }, | 640 | #define CACHE(x) { .cs_size = (x) }, |
615 | #include <linux/kmalloc_sizes.h> | 641 | #include <linux/kmalloc_sizes.h> |
@@ -642,8 +668,6 @@ static struct kmem_cache cache_cache = { | |||
642 | .limit = BOOT_CPUCACHE_ENTRIES, | 668 | .limit = BOOT_CPUCACHE_ENTRIES, |
643 | .shared = 1, | 669 | .shared = 1, |
644 | .buffer_size = sizeof(struct kmem_cache), | 670 | .buffer_size = sizeof(struct kmem_cache), |
645 | .flags = SLAB_NO_REAP, | ||
646 | .spinlock = SPIN_LOCK_UNLOCKED, | ||
647 | .name = "kmem_cache", | 671 | .name = "kmem_cache", |
648 | #if DEBUG | 672 | #if DEBUG |
649 | .obj_size = sizeof(struct kmem_cache), | 673 | .obj_size = sizeof(struct kmem_cache), |
@@ -655,8 +679,8 @@ static DEFINE_MUTEX(cache_chain_mutex); | |||
655 | static struct list_head cache_chain; | 679 | static struct list_head cache_chain; |
656 | 680 | ||
657 | /* | 681 | /* |
658 | * vm_enough_memory() looks at this to determine how many | 682 | * vm_enough_memory() looks at this to determine how many slab-allocated pages |
659 | * slab-allocated pages are possibly freeable under pressure | 683 | * are possibly freeable under pressure |
660 | * | 684 | * |
661 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab | 685 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab |
662 | */ | 686 | */ |
@@ -675,7 +699,8 @@ static enum { | |||
675 | 699 | ||
676 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 700 | static DEFINE_PER_CPU(struct work_struct, reap_work); |
677 | 701 | ||
678 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); | 702 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
703 | int node); | ||
679 | static void enable_cpucache(struct kmem_cache *cachep); | 704 | static void enable_cpucache(struct kmem_cache *cachep); |
680 | static void cache_reap(void *unused); | 705 | static void cache_reap(void *unused); |
681 | static int __node_shrink(struct kmem_cache *cachep, int node); | 706 | static int __node_shrink(struct kmem_cache *cachep, int node); |
@@ -685,7 +710,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | |||
685 | return cachep->array[smp_processor_id()]; | 710 | return cachep->array[smp_processor_id()]; |
686 | } | 711 | } |
687 | 712 | ||
688 | static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) | 713 | static inline struct kmem_cache *__find_general_cachep(size_t size, |
714 | gfp_t gfpflags) | ||
689 | { | 715 | { |
690 | struct cache_sizes *csizep = malloc_sizes; | 716 | struct cache_sizes *csizep = malloc_sizes; |
691 | 717 | ||
@@ -720,8 +746,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align) | |||
720 | return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); | 746 | return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); |
721 | } | 747 | } |
722 | 748 | ||
723 | /* Calculate the number of objects and left-over bytes for a given | 749 | /* |
724 | buffer size. */ | 750 | * Calculate the number of objects and left-over bytes for a given buffer size. |
751 | */ | ||
725 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, | 752 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, |
726 | size_t align, int flags, size_t *left_over, | 753 | size_t align, int flags, size_t *left_over, |
727 | unsigned int *num) | 754 | unsigned int *num) |
@@ -782,7 +809,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, | |||
782 | 809 | ||
783 | #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) | 810 | #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) |
784 | 811 | ||
785 | static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) | 812 | static void __slab_error(const char *function, struct kmem_cache *cachep, |
813 | char *msg) | ||
786 | { | 814 | { |
787 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 815 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
788 | function, cachep->name, msg); | 816 | function, cachep->name, msg); |
@@ -804,7 +832,7 @@ static void init_reap_node(int cpu) | |||
804 | 832 | ||
805 | node = next_node(cpu_to_node(cpu), node_online_map); | 833 | node = next_node(cpu_to_node(cpu), node_online_map); |
806 | if (node == MAX_NUMNODES) | 834 | if (node == MAX_NUMNODES) |
807 | node = 0; | 835 | node = first_node(node_online_map); |
808 | 836 | ||
809 | __get_cpu_var(reap_node) = node; | 837 | __get_cpu_var(reap_node) = node; |
810 | } | 838 | } |
@@ -870,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
870 | return nc; | 898 | return nc; |
871 | } | 899 | } |
872 | 900 | ||
901 | /* | ||
902 | * Transfer objects in one arraycache to another. | ||
903 | * Locking must be handled by the caller. | ||
904 | * | ||
905 | * Return the number of entries transferred. | ||
906 | */ | ||
907 | static int transfer_objects(struct array_cache *to, | ||
908 | struct array_cache *from, unsigned int max) | ||
909 | { | ||
910 | /* Figure out how many entries to transfer */ | ||
911 | int nr = min(min(from->avail, max), to->limit - to->avail); | ||
912 | |||
913 | if (!nr) | ||
914 | return 0; | ||
915 | |||
916 | memcpy(to->entry + to->avail, from->entry + from->avail -nr, | ||
917 | sizeof(void *) *nr); | ||
918 | |||
919 | from->avail -= nr; | ||
920 | to->avail += nr; | ||
921 | to->touched = 1; | ||
922 | return nr; | ||
923 | } | ||
924 | |||
873 | #ifdef CONFIG_NUMA | 925 | #ifdef CONFIG_NUMA |
874 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 926 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); |
927 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | ||
875 | 928 | ||
876 | static struct array_cache **alloc_alien_cache(int node, int limit) | 929 | static struct array_cache **alloc_alien_cache(int node, int limit) |
877 | { | 930 | { |
@@ -906,10 +959,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) | |||
906 | 959 | ||
907 | if (!ac_ptr) | 960 | if (!ac_ptr) |
908 | return; | 961 | return; |
909 | |||
910 | for_each_node(i) | 962 | for_each_node(i) |
911 | kfree(ac_ptr[i]); | 963 | kfree(ac_ptr[i]); |
912 | |||
913 | kfree(ac_ptr); | 964 | kfree(ac_ptr); |
914 | } | 965 | } |
915 | 966 | ||
@@ -920,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
920 | 971 | ||
921 | if (ac->avail) { | 972 | if (ac->avail) { |
922 | spin_lock(&rl3->list_lock); | 973 | spin_lock(&rl3->list_lock); |
974 | /* | ||
975 | * Stuff objects into the remote nodes shared array first. | ||
976 | * That way we could avoid the overhead of putting the objects | ||
977 | * into the free lists and getting them back later. | ||
978 | */ | ||
979 | transfer_objects(rl3->shared, ac, ac->limit); | ||
980 | |||
923 | free_block(cachep, ac->entry, ac->avail, node); | 981 | free_block(cachep, ac->entry, ac->avail, node); |
924 | ac->avail = 0; | 982 | ac->avail = 0; |
925 | spin_unlock(&rl3->list_lock); | 983 | spin_unlock(&rl3->list_lock); |
@@ -935,15 +993,16 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | |||
935 | 993 | ||
936 | if (l3->alien) { | 994 | if (l3->alien) { |
937 | struct array_cache *ac = l3->alien[node]; | 995 | struct array_cache *ac = l3->alien[node]; |
938 | if (ac && ac->avail) { | 996 | |
939 | spin_lock_irq(&ac->lock); | 997 | if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { |
940 | __drain_alien_cache(cachep, ac, node); | 998 | __drain_alien_cache(cachep, ac, node); |
941 | spin_unlock_irq(&ac->lock); | 999 | spin_unlock_irq(&ac->lock); |
942 | } | 1000 | } |
943 | } | 1001 | } |
944 | } | 1002 | } |
945 | 1003 | ||
946 | static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) | 1004 | static void drain_alien_cache(struct kmem_cache *cachep, |
1005 | struct array_cache **alien) | ||
947 | { | 1006 | { |
948 | int i = 0; | 1007 | int i = 0; |
949 | struct array_cache *ac; | 1008 | struct array_cache *ac; |
@@ -986,20 +1045,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
986 | switch (action) { | 1045 | switch (action) { |
987 | case CPU_UP_PREPARE: | 1046 | case CPU_UP_PREPARE: |
988 | mutex_lock(&cache_chain_mutex); | 1047 | mutex_lock(&cache_chain_mutex); |
989 | /* we need to do this right in the beginning since | 1048 | /* |
1049 | * We need to do this right in the beginning since | ||
990 | * alloc_arraycache's are going to use this list. | 1050 | * alloc_arraycache's are going to use this list. |
991 | * kmalloc_node allows us to add the slab to the right | 1051 | * kmalloc_node allows us to add the slab to the right |
992 | * kmem_list3 and not this cpu's kmem_list3 | 1052 | * kmem_list3 and not this cpu's kmem_list3 |
993 | */ | 1053 | */ |
994 | 1054 | ||
995 | list_for_each_entry(cachep, &cache_chain, next) { | 1055 | list_for_each_entry(cachep, &cache_chain, next) { |
996 | /* setup the size64 kmemlist for cpu before we can | 1056 | /* |
1057 | * Set up the size64 kmemlist for cpu before we can | ||
997 | * begin anything. Make sure some other cpu on this | 1058 | * begin anything. Make sure some other cpu on this |
998 | * node has not already allocated this | 1059 | * node has not already allocated this |
999 | */ | 1060 | */ |
1000 | if (!cachep->nodelists[node]) { | 1061 | if (!cachep->nodelists[node]) { |
1001 | if (!(l3 = kmalloc_node(memsize, | 1062 | l3 = kmalloc_node(memsize, GFP_KERNEL, node); |
1002 | GFP_KERNEL, node))) | 1063 | if (!l3) |
1003 | goto bad; | 1064 | goto bad; |
1004 | kmem_list3_init(l3); | 1065 | kmem_list3_init(l3); |
1005 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 1066 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
@@ -1015,13 +1076,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
1015 | 1076 | ||
1016 | spin_lock_irq(&cachep->nodelists[node]->list_lock); | 1077 | spin_lock_irq(&cachep->nodelists[node]->list_lock); |
1017 | cachep->nodelists[node]->free_limit = | 1078 | cachep->nodelists[node]->free_limit = |
1018 | (1 + nr_cpus_node(node)) * | 1079 | (1 + nr_cpus_node(node)) * |
1019 | cachep->batchcount + cachep->num; | 1080 | cachep->batchcount + cachep->num; |
1020 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); | 1081 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); |
1021 | } | 1082 | } |
1022 | 1083 | ||
1023 | /* Now we can go ahead with allocating the shared array's | 1084 | /* |
1024 | & array cache's */ | 1085 | * Now we can go ahead with allocating the shared arrays and |
1086 | * array caches | ||
1087 | */ | ||
1025 | list_for_each_entry(cachep, &cache_chain, next) { | 1088 | list_for_each_entry(cachep, &cache_chain, next) { |
1026 | struct array_cache *nc; | 1089 | struct array_cache *nc; |
1027 | struct array_cache *shared; | 1090 | struct array_cache *shared; |
@@ -1041,7 +1104,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
1041 | if (!alien) | 1104 | if (!alien) |
1042 | goto bad; | 1105 | goto bad; |
1043 | cachep->array[cpu] = nc; | 1106 | cachep->array[cpu] = nc; |
1044 | |||
1045 | l3 = cachep->nodelists[node]; | 1107 | l3 = cachep->nodelists[node]; |
1046 | BUG_ON(!l3); | 1108 | BUG_ON(!l3); |
1047 | 1109 | ||
@@ -1061,7 +1123,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
1061 | } | 1123 | } |
1062 | #endif | 1124 | #endif |
1063 | spin_unlock_irq(&l3->list_lock); | 1125 | spin_unlock_irq(&l3->list_lock); |
1064 | |||
1065 | kfree(shared); | 1126 | kfree(shared); |
1066 | free_alien_cache(alien); | 1127 | free_alien_cache(alien); |
1067 | } | 1128 | } |
@@ -1083,7 +1144,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
1083 | /* fall thru */ | 1144 | /* fall thru */ |
1084 | case CPU_UP_CANCELED: | 1145 | case CPU_UP_CANCELED: |
1085 | mutex_lock(&cache_chain_mutex); | 1146 | mutex_lock(&cache_chain_mutex); |
1086 | |||
1087 | list_for_each_entry(cachep, &cache_chain, next) { | 1147 | list_for_each_entry(cachep, &cache_chain, next) { |
1088 | struct array_cache *nc; | 1148 | struct array_cache *nc; |
1089 | struct array_cache *shared; | 1149 | struct array_cache *shared; |
@@ -1150,7 +1210,7 @@ free_array_cache: | |||
1150 | #endif | 1210 | #endif |
1151 | } | 1211 | } |
1152 | return NOTIFY_OK; | 1212 | return NOTIFY_OK; |
1153 | bad: | 1213 | bad: |
1154 | mutex_unlock(&cache_chain_mutex); | 1214 | mutex_unlock(&cache_chain_mutex); |
1155 | return NOTIFY_BAD; | 1215 | return NOTIFY_BAD; |
1156 | } | 1216 | } |
@@ -1160,7 +1220,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | |||
1160 | /* | 1220 | /* |
1161 | * swap the static kmem_list3 with kmalloced memory | 1221 | * swap the static kmem_list3 with kmalloced memory |
1162 | */ | 1222 | */ |
1163 | static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) | 1223 | static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, |
1224 | int nodeid) | ||
1164 | { | 1225 | { |
1165 | struct kmem_list3 *ptr; | 1226 | struct kmem_list3 *ptr; |
1166 | 1227 | ||
@@ -1175,8 +1236,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no | |||
1175 | local_irq_enable(); | 1236 | local_irq_enable(); |
1176 | } | 1237 | } |
1177 | 1238 | ||
1178 | /* Initialisation. | 1239 | /* |
1179 | * Called after the gfp() functions have been enabled, and before smp_init(). | 1240 | * Initialisation. Called after the page allocator have been initialised and |
1241 | * before smp_init(). | ||
1180 | */ | 1242 | */ |
1181 | void __init kmem_cache_init(void) | 1243 | void __init kmem_cache_init(void) |
1182 | { | 1244 | { |
@@ -1201,9 +1263,9 @@ void __init kmem_cache_init(void) | |||
1201 | 1263 | ||
1202 | /* Bootstrap is tricky, because several objects are allocated | 1264 | /* Bootstrap is tricky, because several objects are allocated |
1203 | * from caches that do not exist yet: | 1265 | * from caches that do not exist yet: |
1204 | * 1) initialize the cache_cache cache: it contains the struct kmem_cache | 1266 | * 1) initialize the cache_cache cache: it contains the struct |
1205 | * structures of all caches, except cache_cache itself: cache_cache | 1267 | * kmem_cache structures of all caches, except cache_cache itself: |
1206 | * is statically allocated. | 1268 | * cache_cache is statically allocated. |
1207 | * Initially an __init data area is used for the head array and the | 1269 | * Initially an __init data area is used for the head array and the |
1208 | * kmem_list3 structures, it's replaced with a kmalloc allocated | 1270 | * kmem_list3 structures, it's replaced with a kmalloc allocated |
1209 | * array at the end of the bootstrap. | 1271 | * array at the end of the bootstrap. |
@@ -1226,7 +1288,8 @@ void __init kmem_cache_init(void) | |||
1226 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | 1288 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; |
1227 | cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; | 1289 | cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; |
1228 | 1290 | ||
1229 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); | 1291 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, |
1292 | cache_line_size()); | ||
1230 | 1293 | ||
1231 | for (order = 0; order < MAX_ORDER; order++) { | 1294 | for (order = 0; order < MAX_ORDER; order++) { |
1232 | cache_estimate(order, cache_cache.buffer_size, | 1295 | cache_estimate(order, cache_cache.buffer_size, |
@@ -1245,24 +1308,26 @@ void __init kmem_cache_init(void) | |||
1245 | sizes = malloc_sizes; | 1308 | sizes = malloc_sizes; |
1246 | names = cache_names; | 1309 | names = cache_names; |
1247 | 1310 | ||
1248 | /* Initialize the caches that provide memory for the array cache | 1311 | /* |
1249 | * and the kmem_list3 structures first. | 1312 | * Initialize the caches that provide memory for the array cache and the |
1250 | * Without this, further allocations will bug | 1313 | * kmem_list3 structures first. Without this, further allocations will |
1314 | * bug. | ||
1251 | */ | 1315 | */ |
1252 | 1316 | ||
1253 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, | 1317 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, |
1254 | sizes[INDEX_AC].cs_size, | 1318 | sizes[INDEX_AC].cs_size, |
1255 | ARCH_KMALLOC_MINALIGN, | 1319 | ARCH_KMALLOC_MINALIGN, |
1256 | (ARCH_KMALLOC_FLAGS | | 1320 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1257 | SLAB_PANIC), NULL, NULL); | 1321 | NULL, NULL); |
1258 | 1322 | ||
1259 | if (INDEX_AC != INDEX_L3) | 1323 | if (INDEX_AC != INDEX_L3) { |
1260 | sizes[INDEX_L3].cs_cachep = | 1324 | sizes[INDEX_L3].cs_cachep = |
1261 | kmem_cache_create(names[INDEX_L3].name, | 1325 | kmem_cache_create(names[INDEX_L3].name, |
1262 | sizes[INDEX_L3].cs_size, | 1326 | sizes[INDEX_L3].cs_size, |
1263 | ARCH_KMALLOC_MINALIGN, | 1327 | ARCH_KMALLOC_MINALIGN, |
1264 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, | 1328 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1265 | NULL); | 1329 | NULL, NULL); |
1330 | } | ||
1266 | 1331 | ||
1267 | while (sizes->cs_size != ULONG_MAX) { | 1332 | while (sizes->cs_size != ULONG_MAX) { |
1268 | /* | 1333 | /* |
@@ -1272,13 +1337,13 @@ void __init kmem_cache_init(void) | |||
1272 | * Note for systems short on memory removing the alignment will | 1337 | * Note for systems short on memory removing the alignment will |
1273 | * allow tighter packing of the smaller caches. | 1338 | * allow tighter packing of the smaller caches. |
1274 | */ | 1339 | */ |
1275 | if (!sizes->cs_cachep) | 1340 | if (!sizes->cs_cachep) { |
1276 | sizes->cs_cachep = kmem_cache_create(names->name, | 1341 | sizes->cs_cachep = kmem_cache_create(names->name, |
1277 | sizes->cs_size, | 1342 | sizes->cs_size, |
1278 | ARCH_KMALLOC_MINALIGN, | 1343 | ARCH_KMALLOC_MINALIGN, |
1279 | (ARCH_KMALLOC_FLAGS | 1344 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1280 | | SLAB_PANIC), | 1345 | NULL, NULL); |
1281 | NULL, NULL); | 1346 | } |
1282 | 1347 | ||
1283 | /* Inc off-slab bufctl limit until the ceiling is hit. */ | 1348 | /* Inc off-slab bufctl limit until the ceiling is hit. */ |
1284 | if (!(OFF_SLAB(sizes->cs_cachep))) { | 1349 | if (!(OFF_SLAB(sizes->cs_cachep))) { |
@@ -1287,13 +1352,11 @@ void __init kmem_cache_init(void) | |||
1287 | } | 1352 | } |
1288 | 1353 | ||
1289 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, | 1354 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, |
1290 | sizes->cs_size, | 1355 | sizes->cs_size, |
1291 | ARCH_KMALLOC_MINALIGN, | 1356 | ARCH_KMALLOC_MINALIGN, |
1292 | (ARCH_KMALLOC_FLAGS | | 1357 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| |
1293 | SLAB_CACHE_DMA | | 1358 | SLAB_PANIC, |
1294 | SLAB_PANIC), NULL, | 1359 | NULL, NULL); |
1295 | NULL); | ||
1296 | |||
1297 | sizes++; | 1360 | sizes++; |
1298 | names++; | 1361 | names++; |
1299 | } | 1362 | } |
@@ -1345,20 +1408,22 @@ void __init kmem_cache_init(void) | |||
1345 | struct kmem_cache *cachep; | 1408 | struct kmem_cache *cachep; |
1346 | mutex_lock(&cache_chain_mutex); | 1409 | mutex_lock(&cache_chain_mutex); |
1347 | list_for_each_entry(cachep, &cache_chain, next) | 1410 | list_for_each_entry(cachep, &cache_chain, next) |
1348 | enable_cpucache(cachep); | 1411 | enable_cpucache(cachep); |
1349 | mutex_unlock(&cache_chain_mutex); | 1412 | mutex_unlock(&cache_chain_mutex); |
1350 | } | 1413 | } |
1351 | 1414 | ||
1352 | /* Done! */ | 1415 | /* Done! */ |
1353 | g_cpucache_up = FULL; | 1416 | g_cpucache_up = FULL; |
1354 | 1417 | ||
1355 | /* Register a cpu startup notifier callback | 1418 | /* |
1356 | * that initializes cpu_cache_get for all new cpus | 1419 | * Register a cpu startup notifier callback that initializes |
1420 | * cpu_cache_get for all new cpus | ||
1357 | */ | 1421 | */ |
1358 | register_cpu_notifier(&cpucache_notifier); | 1422 | register_cpu_notifier(&cpucache_notifier); |
1359 | 1423 | ||
1360 | /* The reap timers are started later, with a module init call: | 1424 | /* |
1361 | * That part of the kernel is not yet operational. | 1425 | * The reap timers are started later, with a module init call: That part |
1426 | * of the kernel is not yet operational. | ||
1362 | */ | 1427 | */ |
1363 | } | 1428 | } |
1364 | 1429 | ||
@@ -1366,16 +1431,13 @@ static int __init cpucache_init(void) | |||
1366 | { | 1431 | { |
1367 | int cpu; | 1432 | int cpu; |
1368 | 1433 | ||
1369 | /* | 1434 | /* |
1370 | * Register the timers that return unneeded | 1435 | * Register the timers that return unneeded pages to the page allocator |
1371 | * pages to gfp. | ||
1372 | */ | 1436 | */ |
1373 | for_each_online_cpu(cpu) | 1437 | for_each_online_cpu(cpu) |
1374 | start_cpu_timer(cpu); | 1438 | start_cpu_timer(cpu); |
1375 | |||
1376 | return 0; | 1439 | return 0; |
1377 | } | 1440 | } |
1378 | |||
1379 | __initcall(cpucache_init); | 1441 | __initcall(cpucache_init); |
1380 | 1442 | ||
1381 | /* | 1443 | /* |
@@ -1402,7 +1464,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1402 | atomic_add(i, &slab_reclaim_pages); | 1464 | atomic_add(i, &slab_reclaim_pages); |
1403 | add_page_state(nr_slab, i); | 1465 | add_page_state(nr_slab, i); |
1404 | while (i--) { | 1466 | while (i--) { |
1405 | SetPageSlab(page); | 1467 | __SetPageSlab(page); |
1406 | page++; | 1468 | page++; |
1407 | } | 1469 | } |
1408 | return addr; | 1470 | return addr; |
@@ -1418,8 +1480,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1418 | const unsigned long nr_freed = i; | 1480 | const unsigned long nr_freed = i; |
1419 | 1481 | ||
1420 | while (i--) { | 1482 | while (i--) { |
1421 | if (!TestClearPageSlab(page)) | 1483 | BUG_ON(!PageSlab(page)); |
1422 | BUG(); | 1484 | __ClearPageSlab(page); |
1423 | page++; | 1485 | page++; |
1424 | } | 1486 | } |
1425 | sub_page_state(nr_slab, nr_freed); | 1487 | sub_page_state(nr_slab, nr_freed); |
@@ -1489,9 +1551,8 @@ static void dump_line(char *data, int offset, int limit) | |||
1489 | { | 1551 | { |
1490 | int i; | 1552 | int i; |
1491 | printk(KERN_ERR "%03x:", offset); | 1553 | printk(KERN_ERR "%03x:", offset); |
1492 | for (i = 0; i < limit; i++) { | 1554 | for (i = 0; i < limit; i++) |
1493 | printk(" %02x", (unsigned char)data[offset + i]); | 1555 | printk(" %02x", (unsigned char)data[offset + i]); |
1494 | } | ||
1495 | printk("\n"); | 1556 | printk("\n"); |
1496 | } | 1557 | } |
1497 | #endif | 1558 | #endif |
@@ -1505,15 +1566,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
1505 | 1566 | ||
1506 | if (cachep->flags & SLAB_RED_ZONE) { | 1567 | if (cachep->flags & SLAB_RED_ZONE) { |
1507 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", | 1568 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", |
1508 | *dbg_redzone1(cachep, objp), | 1569 | *dbg_redzone1(cachep, objp), |
1509 | *dbg_redzone2(cachep, objp)); | 1570 | *dbg_redzone2(cachep, objp)); |
1510 | } | 1571 | } |
1511 | 1572 | ||
1512 | if (cachep->flags & SLAB_STORE_USER) { | 1573 | if (cachep->flags & SLAB_STORE_USER) { |
1513 | printk(KERN_ERR "Last user: [<%p>]", | 1574 | printk(KERN_ERR "Last user: [<%p>]", |
1514 | *dbg_userword(cachep, objp)); | 1575 | *dbg_userword(cachep, objp)); |
1515 | print_symbol("(%s)", | 1576 | print_symbol("(%s)", |
1516 | (unsigned long)*dbg_userword(cachep, objp)); | 1577 | (unsigned long)*dbg_userword(cachep, objp)); |
1517 | printk("\n"); | 1578 | printk("\n"); |
1518 | } | 1579 | } |
1519 | realobj = (char *)objp + obj_offset(cachep); | 1580 | realobj = (char *)objp + obj_offset(cachep); |
@@ -1546,8 +1607,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1546 | /* Print header */ | 1607 | /* Print header */ |
1547 | if (lines == 0) { | 1608 | if (lines == 0) { |
1548 | printk(KERN_ERR | 1609 | printk(KERN_ERR |
1549 | "Slab corruption: start=%p, len=%d\n", | 1610 | "Slab corruption: start=%p, len=%d\n", |
1550 | realobj, size); | 1611 | realobj, size); |
1551 | print_objinfo(cachep, objp, 0); | 1612 | print_objinfo(cachep, objp, 0); |
1552 | } | 1613 | } |
1553 | /* Hexdump the affected line */ | 1614 | /* Hexdump the affected line */ |
@@ -1568,18 +1629,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1568 | * exist: | 1629 | * exist: |
1569 | */ | 1630 | */ |
1570 | struct slab *slabp = virt_to_slab(objp); | 1631 | struct slab *slabp = virt_to_slab(objp); |
1571 | int objnr; | 1632 | unsigned int objnr; |
1572 | 1633 | ||
1573 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 1634 | objnr = obj_to_index(cachep, slabp, objp); |
1574 | if (objnr) { | 1635 | if (objnr) { |
1575 | objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; | 1636 | objp = index_to_obj(cachep, slabp, objnr - 1); |
1576 | realobj = (char *)objp + obj_offset(cachep); | 1637 | realobj = (char *)objp + obj_offset(cachep); |
1577 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", | 1638 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", |
1578 | realobj, size); | 1639 | realobj, size); |
1579 | print_objinfo(cachep, objp, 2); | 1640 | print_objinfo(cachep, objp, 2); |
1580 | } | 1641 | } |
1581 | if (objnr + 1 < cachep->num) { | 1642 | if (objnr + 1 < cachep->num) { |
1582 | objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; | 1643 | objp = index_to_obj(cachep, slabp, objnr + 1); |
1583 | realobj = (char *)objp + obj_offset(cachep); | 1644 | realobj = (char *)objp + obj_offset(cachep); |
1584 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", | 1645 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", |
1585 | realobj, size); | 1646 | realobj, size); |
@@ -1591,22 +1652,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1591 | 1652 | ||
1592 | #if DEBUG | 1653 | #if DEBUG |
1593 | /** | 1654 | /** |
1594 | * slab_destroy_objs - call the registered destructor for each object in | 1655 | * slab_destroy_objs - destroy a slab and its objects |
1595 | * a slab that is to be destroyed. | 1656 | * @cachep: cache pointer being destroyed |
1657 | * @slabp: slab pointer being destroyed | ||
1658 | * | ||
1659 | * Call the registered destructor for each object in a slab that is being | ||
1660 | * destroyed. | ||
1596 | */ | 1661 | */ |
1597 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | 1662 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) |
1598 | { | 1663 | { |
1599 | int i; | 1664 | int i; |
1600 | for (i = 0; i < cachep->num; i++) { | 1665 | for (i = 0; i < cachep->num; i++) { |
1601 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 1666 | void *objp = index_to_obj(cachep, slabp, i); |
1602 | 1667 | ||
1603 | if (cachep->flags & SLAB_POISON) { | 1668 | if (cachep->flags & SLAB_POISON) { |
1604 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1669 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1605 | if ((cachep->buffer_size % PAGE_SIZE) == 0 | 1670 | if (cachep->buffer_size % PAGE_SIZE == 0 && |
1606 | && OFF_SLAB(cachep)) | 1671 | OFF_SLAB(cachep)) |
1607 | kernel_map_pages(virt_to_page(objp), | 1672 | kernel_map_pages(virt_to_page(objp), |
1608 | cachep->buffer_size / PAGE_SIZE, | 1673 | cachep->buffer_size / PAGE_SIZE, 1); |
1609 | 1); | ||
1610 | else | 1674 | else |
1611 | check_poison_obj(cachep, objp); | 1675 | check_poison_obj(cachep, objp); |
1612 | #else | 1676 | #else |
@@ -1631,7 +1695,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | |||
1631 | if (cachep->dtor) { | 1695 | if (cachep->dtor) { |
1632 | int i; | 1696 | int i; |
1633 | for (i = 0; i < cachep->num; i++) { | 1697 | for (i = 0; i < cachep->num; i++) { |
1634 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 1698 | void *objp = index_to_obj(cachep, slabp, i); |
1635 | (cachep->dtor) (objp, cachep, 0); | 1699 | (cachep->dtor) (objp, cachep, 0); |
1636 | } | 1700 | } |
1637 | } | 1701 | } |
@@ -1639,9 +1703,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | |||
1639 | #endif | 1703 | #endif |
1640 | 1704 | ||
1641 | /** | 1705 | /** |
1706 | * slab_destroy - destroy and release all objects in a slab | ||
1707 | * @cachep: cache pointer being destroyed | ||
1708 | * @slabp: slab pointer being destroyed | ||
1709 | * | ||
1642 | * Destroy all the objs in a slab, and release the mem back to the system. | 1710 | * Destroy all the objs in a slab, and release the mem back to the system. |
1643 | * Before calling the slab must have been unlinked from the cache. | 1711 | * Before calling the slab must have been unlinked from the cache. The |
1644 | * The cache-lock is not held/needed. | 1712 | * cache-lock is not held/needed. |
1645 | */ | 1713 | */ |
1646 | static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) | 1714 | static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) |
1647 | { | 1715 | { |
@@ -1662,8 +1730,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) | |||
1662 | } | 1730 | } |
1663 | } | 1731 | } |
1664 | 1732 | ||
1665 | /* For setting up all the kmem_list3s for cache whose buffer_size is same | 1733 | /* |
1666 | as size of kmem_list3. */ | 1734 | * For setting up all the kmem_list3s for cache whose buffer_size is same as |
1735 | * size of kmem_list3. | ||
1736 | */ | ||
1667 | static void set_up_list3s(struct kmem_cache *cachep, int index) | 1737 | static void set_up_list3s(struct kmem_cache *cachep, int index) |
1668 | { | 1738 | { |
1669 | int node; | 1739 | int node; |
@@ -1689,13 +1759,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) | |||
1689 | * high order pages for slabs. When the gfp() functions are more friendly | 1759 | * high order pages for slabs. When the gfp() functions are more friendly |
1690 | * towards high-order requests, this should be changed. | 1760 | * towards high-order requests, this should be changed. |
1691 | */ | 1761 | */ |
1692 | static inline size_t calculate_slab_order(struct kmem_cache *cachep, | 1762 | static size_t calculate_slab_order(struct kmem_cache *cachep, |
1693 | size_t size, size_t align, unsigned long flags) | 1763 | size_t size, size_t align, unsigned long flags) |
1694 | { | 1764 | { |
1695 | size_t left_over = 0; | 1765 | size_t left_over = 0; |
1696 | int gfporder; | 1766 | int gfporder; |
1697 | 1767 | ||
1698 | for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { | 1768 | for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { |
1699 | unsigned int num; | 1769 | unsigned int num; |
1700 | size_t remainder; | 1770 | size_t remainder; |
1701 | 1771 | ||
@@ -1730,12 +1800,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1730 | /* | 1800 | /* |
1731 | * Acceptable internal fragmentation? | 1801 | * Acceptable internal fragmentation? |
1732 | */ | 1802 | */ |
1733 | if ((left_over * 8) <= (PAGE_SIZE << gfporder)) | 1803 | if (left_over * 8 <= (PAGE_SIZE << gfporder)) |
1734 | break; | 1804 | break; |
1735 | } | 1805 | } |
1736 | return left_over; | 1806 | return left_over; |
1737 | } | 1807 | } |
1738 | 1808 | ||
1809 | static void setup_cpu_cache(struct kmem_cache *cachep) | ||
1810 | { | ||
1811 | if (g_cpucache_up == FULL) { | ||
1812 | enable_cpucache(cachep); | ||
1813 | return; | ||
1814 | } | ||
1815 | if (g_cpucache_up == NONE) { | ||
1816 | /* | ||
1817 | * Note: the first kmem_cache_create must create the cache | ||
1818 | * that's used by kmalloc(24), otherwise the creation of | ||
1819 | * further caches will BUG(). | ||
1820 | */ | ||
1821 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
1822 | |||
1823 | /* | ||
1824 | * If the cache that's used by kmalloc(sizeof(kmem_list3)) is | ||
1825 | * the first cache, then we need to set up all its list3s, | ||
1826 | * otherwise the creation of further caches will BUG(). | ||
1827 | */ | ||
1828 | set_up_list3s(cachep, SIZE_AC); | ||
1829 | if (INDEX_AC == INDEX_L3) | ||
1830 | g_cpucache_up = PARTIAL_L3; | ||
1831 | else | ||
1832 | g_cpucache_up = PARTIAL_AC; | ||
1833 | } else { | ||
1834 | cachep->array[smp_processor_id()] = | ||
1835 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | ||
1836 | |||
1837 | if (g_cpucache_up == PARTIAL_AC) { | ||
1838 | set_up_list3s(cachep, SIZE_L3); | ||
1839 | g_cpucache_up = PARTIAL_L3; | ||
1840 | } else { | ||
1841 | int node; | ||
1842 | for_each_online_node(node) { | ||
1843 | cachep->nodelists[node] = | ||
1844 | kmalloc_node(sizeof(struct kmem_list3), | ||
1845 | GFP_KERNEL, node); | ||
1846 | BUG_ON(!cachep->nodelists[node]); | ||
1847 | kmem_list3_init(cachep->nodelists[node]); | ||
1848 | } | ||
1849 | } | ||
1850 | } | ||
1851 | cachep->nodelists[numa_node_id()]->next_reap = | ||
1852 | jiffies + REAPTIMEOUT_LIST3 + | ||
1853 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | ||
1854 | |||
1855 | cpu_cache_get(cachep)->avail = 0; | ||
1856 | cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; | ||
1857 | cpu_cache_get(cachep)->batchcount = 1; | ||
1858 | cpu_cache_get(cachep)->touched = 0; | ||
1859 | cachep->batchcount = 1; | ||
1860 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | ||
1861 | } | ||
1862 | |||
1739 | /** | 1863 | /** |
1740 | * kmem_cache_create - Create a cache. | 1864 | * kmem_cache_create - Create a cache. |
1741 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 1865 | * @name: A string which is used in /proc/slabinfo to identify this cache. |
@@ -1751,9 +1875,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1751 | * and the @dtor is run before the pages are handed back. | 1875 | * and the @dtor is run before the pages are handed back. |
1752 | * | 1876 | * |
1753 | * @name must be valid until the cache is destroyed. This implies that | 1877 | * @name must be valid until the cache is destroyed. This implies that |
1754 | * the module calling this has to destroy the cache before getting | 1878 | * the module calling this has to destroy the cache before getting unloaded. |
1755 | * unloaded. | 1879 | * |
1756 | * | ||
1757 | * The flags are | 1880 | * The flags are |
1758 | * | 1881 | * |
1759 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | 1882 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) |
@@ -1762,16 +1885,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1762 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check | 1885 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check |
1763 | * for buffer overruns. | 1886 | * for buffer overruns. |
1764 | * | 1887 | * |
1765 | * %SLAB_NO_REAP - Don't automatically reap this cache when we're under | ||
1766 | * memory pressure. | ||
1767 | * | ||
1768 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | 1888 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware |
1769 | * cacheline. This can be beneficial if you're counting cycles as closely | 1889 | * cacheline. This can be beneficial if you're counting cycles as closely |
1770 | * as davem. | 1890 | * as davem. |
1771 | */ | 1891 | */ |
1772 | struct kmem_cache * | 1892 | struct kmem_cache * |
1773 | kmem_cache_create (const char *name, size_t size, size_t align, | 1893 | kmem_cache_create (const char *name, size_t size, size_t align, |
1774 | unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), | 1894 | unsigned long flags, |
1895 | void (*ctor)(void*, struct kmem_cache *, unsigned long), | ||
1775 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) | 1896 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) |
1776 | { | 1897 | { |
1777 | size_t left_over, slab_size, ralign; | 1898 | size_t left_over, slab_size, ralign; |
@@ -1781,12 +1902,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1781 | /* | 1902 | /* |
1782 | * Sanity checks... these are all serious usage bugs. | 1903 | * Sanity checks... these are all serious usage bugs. |
1783 | */ | 1904 | */ |
1784 | if ((!name) || | 1905 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || |
1785 | in_interrupt() || | ||
1786 | (size < BYTES_PER_WORD) || | ||
1787 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { | 1906 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { |
1788 | printk(KERN_ERR "%s: Early error in slab %s\n", | 1907 | printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, |
1789 | __FUNCTION__, name); | 1908 | name); |
1790 | BUG(); | 1909 | BUG(); |
1791 | } | 1910 | } |
1792 | 1911 | ||
@@ -1840,8 +1959,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1840 | * above the next power of two: caches with object sizes just above a | 1959 | * above the next power of two: caches with object sizes just above a |
1841 | * power of two have a significant amount of internal fragmentation. | 1960 | * power of two have a significant amount of internal fragmentation. |
1842 | */ | 1961 | */ |
1843 | if ((size < 4096 | 1962 | if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) |
1844 | || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) | ||
1845 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | 1963 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; |
1846 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 1964 | if (!(flags & SLAB_DESTROY_BY_RCU)) |
1847 | flags |= SLAB_POISON; | 1965 | flags |= SLAB_POISON; |
@@ -1853,13 +1971,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1853 | BUG_ON(dtor); | 1971 | BUG_ON(dtor); |
1854 | 1972 | ||
1855 | /* | 1973 | /* |
1856 | * Always checks flags, a caller might be expecting debug | 1974 | * Always checks flags, a caller might be expecting debug support which |
1857 | * support which isn't available. | 1975 | * isn't available. |
1858 | */ | 1976 | */ |
1859 | if (flags & ~CREATE_MASK) | 1977 | if (flags & ~CREATE_MASK) |
1860 | BUG(); | 1978 | BUG(); |
1861 | 1979 | ||
1862 | /* Check that size is in terms of words. This is needed to avoid | 1980 | /* |
1981 | * Check that size is in terms of words. This is needed to avoid | ||
1863 | * unaligned accesses for some archs when redzoning is used, and makes | 1982 | * unaligned accesses for some archs when redzoning is used, and makes |
1864 | * sure any on-slab bufctl's are also correctly aligned. | 1983 | * sure any on-slab bufctl's are also correctly aligned. |
1865 | */ | 1984 | */ |
@@ -1868,12 +1987,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1868 | size &= ~(BYTES_PER_WORD - 1); | 1987 | size &= ~(BYTES_PER_WORD - 1); |
1869 | } | 1988 | } |
1870 | 1989 | ||
1871 | /* calculate out the final buffer alignment: */ | 1990 | /* calculate the final buffer alignment: */ |
1991 | |||
1872 | /* 1) arch recommendation: can be overridden for debug */ | 1992 | /* 1) arch recommendation: can be overridden for debug */ |
1873 | if (flags & SLAB_HWCACHE_ALIGN) { | 1993 | if (flags & SLAB_HWCACHE_ALIGN) { |
1874 | /* Default alignment: as specified by the arch code. | 1994 | /* |
1875 | * Except if an object is really small, then squeeze multiple | 1995 | * Default alignment: as specified by the arch code. Except if |
1876 | * objects into one cacheline. | 1996 | * an object is really small, then squeeze multiple objects into |
1997 | * one cacheline. | ||
1877 | */ | 1998 | */ |
1878 | ralign = cache_line_size(); | 1999 | ralign = cache_line_size(); |
1879 | while (size <= ralign / 2) | 2000 | while (size <= ralign / 2) |
@@ -1893,16 +2014,16 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1893 | if (ralign > BYTES_PER_WORD) | 2014 | if (ralign > BYTES_PER_WORD) |
1894 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 2015 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
1895 | } | 2016 | } |
1896 | /* 4) Store it. Note that the debug code below can reduce | 2017 | /* |
2018 | * 4) Store it. Note that the debug code below can reduce | ||
1897 | * the alignment to BYTES_PER_WORD. | 2019 | * the alignment to BYTES_PER_WORD. |
1898 | */ | 2020 | */ |
1899 | align = ralign; | 2021 | align = ralign; |
1900 | 2022 | ||
1901 | /* Get cache's description obj. */ | 2023 | /* Get cache's description obj. */ |
1902 | cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); | 2024 | cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); |
1903 | if (!cachep) | 2025 | if (!cachep) |
1904 | goto oops; | 2026 | goto oops; |
1905 | memset(cachep, 0, sizeof(struct kmem_cache)); | ||
1906 | 2027 | ||
1907 | #if DEBUG | 2028 | #if DEBUG |
1908 | cachep->obj_size = size; | 2029 | cachep->obj_size = size; |
@@ -1978,7 +2099,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1978 | cachep->gfpflags = 0; | 2099 | cachep->gfpflags = 0; |
1979 | if (flags & SLAB_CACHE_DMA) | 2100 | if (flags & SLAB_CACHE_DMA) |
1980 | cachep->gfpflags |= GFP_DMA; | 2101 | cachep->gfpflags |= GFP_DMA; |
1981 | spin_lock_init(&cachep->spinlock); | ||
1982 | cachep->buffer_size = size; | 2102 | cachep->buffer_size = size; |
1983 | 2103 | ||
1984 | if (flags & CFLGS_OFF_SLAB) | 2104 | if (flags & CFLGS_OFF_SLAB) |
@@ -1988,64 +2108,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1988 | cachep->name = name; | 2108 | cachep->name = name; |
1989 | 2109 | ||
1990 | 2110 | ||
1991 | if (g_cpucache_up == FULL) { | 2111 | setup_cpu_cache(cachep); |
1992 | enable_cpucache(cachep); | ||
1993 | } else { | ||
1994 | if (g_cpucache_up == NONE) { | ||
1995 | /* Note: the first kmem_cache_create must create | ||
1996 | * the cache that's used by kmalloc(24), otherwise | ||
1997 | * the creation of further caches will BUG(). | ||
1998 | */ | ||
1999 | cachep->array[smp_processor_id()] = | ||
2000 | &initarray_generic.cache; | ||
2001 | |||
2002 | /* If the cache that's used by | ||
2003 | * kmalloc(sizeof(kmem_list3)) is the first cache, | ||
2004 | * then we need to set up all its list3s, otherwise | ||
2005 | * the creation of further caches will BUG(). | ||
2006 | */ | ||
2007 | set_up_list3s(cachep, SIZE_AC); | ||
2008 | if (INDEX_AC == INDEX_L3) | ||
2009 | g_cpucache_up = PARTIAL_L3; | ||
2010 | else | ||
2011 | g_cpucache_up = PARTIAL_AC; | ||
2012 | } else { | ||
2013 | cachep->array[smp_processor_id()] = | ||
2014 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | ||
2015 | |||
2016 | if (g_cpucache_up == PARTIAL_AC) { | ||
2017 | set_up_list3s(cachep, SIZE_L3); | ||
2018 | g_cpucache_up = PARTIAL_L3; | ||
2019 | } else { | ||
2020 | int node; | ||
2021 | for_each_online_node(node) { | ||
2022 | |||
2023 | cachep->nodelists[node] = | ||
2024 | kmalloc_node(sizeof | ||
2025 | (struct kmem_list3), | ||
2026 | GFP_KERNEL, node); | ||
2027 | BUG_ON(!cachep->nodelists[node]); | ||
2028 | kmem_list3_init(cachep-> | ||
2029 | nodelists[node]); | ||
2030 | } | ||
2031 | } | ||
2032 | } | ||
2033 | cachep->nodelists[numa_node_id()]->next_reap = | ||
2034 | jiffies + REAPTIMEOUT_LIST3 + | ||
2035 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | ||
2036 | |||
2037 | BUG_ON(!cpu_cache_get(cachep)); | ||
2038 | cpu_cache_get(cachep)->avail = 0; | ||
2039 | cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; | ||
2040 | cpu_cache_get(cachep)->batchcount = 1; | ||
2041 | cpu_cache_get(cachep)->touched = 0; | ||
2042 | cachep->batchcount = 1; | ||
2043 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | ||
2044 | } | ||
2045 | 2112 | ||
2046 | /* cache setup completed, link it into the list */ | 2113 | /* cache setup completed, link it into the list */ |
2047 | list_add(&cachep->next, &cache_chain); | 2114 | list_add(&cachep->next, &cache_chain); |
2048 | oops: | 2115 | oops: |
2049 | if (!cachep && (flags & SLAB_PANIC)) | 2116 | if (!cachep && (flags & SLAB_PANIC)) |
2050 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2117 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
2051 | name); | 2118 | name); |
@@ -2089,30 +2156,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) | |||
2089 | #define check_spinlock_acquired_node(x, y) do { } while(0) | 2156 | #define check_spinlock_acquired_node(x, y) do { } while(0) |
2090 | #endif | 2157 | #endif |
2091 | 2158 | ||
2092 | /* | 2159 | static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, |
2093 | * Waits for all CPUs to execute func(). | 2160 | struct array_cache *ac, |
2094 | */ | 2161 | int force, int node); |
2095 | static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) | ||
2096 | { | ||
2097 | check_irq_on(); | ||
2098 | preempt_disable(); | ||
2099 | |||
2100 | local_irq_disable(); | ||
2101 | func(arg); | ||
2102 | local_irq_enable(); | ||
2103 | |||
2104 | if (smp_call_function(func, arg, 1, 1)) | ||
2105 | BUG(); | ||
2106 | |||
2107 | preempt_enable(); | ||
2108 | } | ||
2109 | |||
2110 | static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, | ||
2111 | int force, int node); | ||
2112 | 2162 | ||
2113 | static void do_drain(void *arg) | 2163 | static void do_drain(void *arg) |
2114 | { | 2164 | { |
2115 | struct kmem_cache *cachep = (struct kmem_cache *) arg; | 2165 | struct kmem_cache *cachep = arg; |
2116 | struct array_cache *ac; | 2166 | struct array_cache *ac; |
2117 | int node = numa_node_id(); | 2167 | int node = numa_node_id(); |
2118 | 2168 | ||
@@ -2129,14 +2179,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) | |||
2129 | struct kmem_list3 *l3; | 2179 | struct kmem_list3 *l3; |
2130 | int node; | 2180 | int node; |
2131 | 2181 | ||
2132 | smp_call_function_all_cpus(do_drain, cachep); | 2182 | on_each_cpu(do_drain, cachep, 1, 1); |
2133 | check_irq_on(); | 2183 | check_irq_on(); |
2134 | for_each_online_node(node) { | 2184 | for_each_online_node(node) { |
2135 | l3 = cachep->nodelists[node]; | 2185 | l3 = cachep->nodelists[node]; |
2136 | if (l3) { | 2186 | if (l3) { |
2137 | spin_lock_irq(&l3->list_lock); | 2187 | drain_array(cachep, l3, l3->shared, 1, node); |
2138 | drain_array_locked(cachep, l3->shared, 1, node); | ||
2139 | spin_unlock_irq(&l3->list_lock); | ||
2140 | if (l3->alien) | 2188 | if (l3->alien) |
2141 | drain_alien_cache(cachep, l3->alien); | 2189 | drain_alien_cache(cachep, l3->alien); |
2142 | } | 2190 | } |
@@ -2260,16 +2308,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep) | |||
2260 | 2308 | ||
2261 | /* NUMA: free the list3 structures */ | 2309 | /* NUMA: free the list3 structures */ |
2262 | for_each_online_node(i) { | 2310 | for_each_online_node(i) { |
2263 | if ((l3 = cachep->nodelists[i])) { | 2311 | l3 = cachep->nodelists[i]; |
2312 | if (l3) { | ||
2264 | kfree(l3->shared); | 2313 | kfree(l3->shared); |
2265 | free_alien_cache(l3->alien); | 2314 | free_alien_cache(l3->alien); |
2266 | kfree(l3); | 2315 | kfree(l3); |
2267 | } | 2316 | } |
2268 | } | 2317 | } |
2269 | kmem_cache_free(&cache_cache, cachep); | 2318 | kmem_cache_free(&cache_cache, cachep); |
2270 | |||
2271 | unlock_cpu_hotplug(); | 2319 | unlock_cpu_hotplug(); |
2272 | |||
2273 | return 0; | 2320 | return 0; |
2274 | } | 2321 | } |
2275 | EXPORT_SYMBOL(kmem_cache_destroy); | 2322 | EXPORT_SYMBOL(kmem_cache_destroy); |
@@ -2292,7 +2339,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
2292 | slabp->inuse = 0; | 2339 | slabp->inuse = 0; |
2293 | slabp->colouroff = colour_off; | 2340 | slabp->colouroff = colour_off; |
2294 | slabp->s_mem = objp + colour_off; | 2341 | slabp->s_mem = objp + colour_off; |
2295 | |||
2296 | return slabp; | 2342 | return slabp; |
2297 | } | 2343 | } |
2298 | 2344 | ||
@@ -2307,7 +2353,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2307 | int i; | 2353 | int i; |
2308 | 2354 | ||
2309 | for (i = 0; i < cachep->num; i++) { | 2355 | for (i = 0; i < cachep->num; i++) { |
2310 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 2356 | void *objp = index_to_obj(cachep, slabp, i); |
2311 | #if DEBUG | 2357 | #if DEBUG |
2312 | /* need to poison the objs? */ | 2358 | /* need to poison the objs? */ |
2313 | if (cachep->flags & SLAB_POISON) | 2359 | if (cachep->flags & SLAB_POISON) |
@@ -2320,9 +2366,9 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2320 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2366 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
2321 | } | 2367 | } |
2322 | /* | 2368 | /* |
2323 | * Constructors are not allowed to allocate memory from | 2369 | * Constructors are not allowed to allocate memory from the same |
2324 | * the same cache which they are a constructor for. | 2370 | * cache which they are a constructor for. Otherwise, deadlock. |
2325 | * Otherwise, deadlock. They must also be threaded. | 2371 | * They must also be threaded. |
2326 | */ | 2372 | */ |
2327 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2373 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2328 | cachep->ctor(objp + obj_offset(cachep), cachep, | 2374 | cachep->ctor(objp + obj_offset(cachep), cachep, |
@@ -2336,8 +2382,8 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2336 | slab_error(cachep, "constructor overwrote the" | 2382 | slab_error(cachep, "constructor overwrote the" |
2337 | " start of an object"); | 2383 | " start of an object"); |
2338 | } | 2384 | } |
2339 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) | 2385 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && |
2340 | && cachep->flags & SLAB_POISON) | 2386 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) |
2341 | kernel_map_pages(virt_to_page(objp), | 2387 | kernel_map_pages(virt_to_page(objp), |
2342 | cachep->buffer_size / PAGE_SIZE, 0); | 2388 | cachep->buffer_size / PAGE_SIZE, 0); |
2343 | #else | 2389 | #else |
@@ -2352,18 +2398,16 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2352 | 2398 | ||
2353 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | 2399 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) |
2354 | { | 2400 | { |
2355 | if (flags & SLAB_DMA) { | 2401 | if (flags & SLAB_DMA) |
2356 | if (!(cachep->gfpflags & GFP_DMA)) | 2402 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); |
2357 | BUG(); | 2403 | else |
2358 | } else { | 2404 | BUG_ON(cachep->gfpflags & GFP_DMA); |
2359 | if (cachep->gfpflags & GFP_DMA) | ||
2360 | BUG(); | ||
2361 | } | ||
2362 | } | 2405 | } |
2363 | 2406 | ||
2364 | static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) | 2407 | static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, |
2408 | int nodeid) | ||
2365 | { | 2409 | { |
2366 | void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); | 2410 | void *objp = index_to_obj(cachep, slabp, slabp->free); |
2367 | kmem_bufctl_t next; | 2411 | kmem_bufctl_t next; |
2368 | 2412 | ||
2369 | slabp->inuse++; | 2413 | slabp->inuse++; |
@@ -2377,18 +2421,18 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod | |||
2377 | return objp; | 2421 | return objp; |
2378 | } | 2422 | } |
2379 | 2423 | ||
2380 | static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, | 2424 | static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, |
2381 | int nodeid) | 2425 | void *objp, int nodeid) |
2382 | { | 2426 | { |
2383 | unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; | 2427 | unsigned int objnr = obj_to_index(cachep, slabp, objp); |
2384 | 2428 | ||
2385 | #if DEBUG | 2429 | #if DEBUG |
2386 | /* Verify that the slab belongs to the intended node */ | 2430 | /* Verify that the slab belongs to the intended node */ |
2387 | WARN_ON(slabp->nodeid != nodeid); | 2431 | WARN_ON(slabp->nodeid != nodeid); |
2388 | 2432 | ||
2389 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { | 2433 | if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { |
2390 | printk(KERN_ERR "slab: double free detected in cache " | 2434 | printk(KERN_ERR "slab: double free detected in cache " |
2391 | "'%s', objp %p\n", cachep->name, objp); | 2435 | "'%s', objp %p\n", cachep->name, objp); |
2392 | BUG(); | 2436 | BUG(); |
2393 | } | 2437 | } |
2394 | #endif | 2438 | #endif |
@@ -2397,14 +2441,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob | |||
2397 | slabp->inuse--; | 2441 | slabp->inuse--; |
2398 | } | 2442 | } |
2399 | 2443 | ||
2400 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) | 2444 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, |
2445 | void *objp) | ||
2401 | { | 2446 | { |
2402 | int i; | 2447 | int i; |
2403 | struct page *page; | 2448 | struct page *page; |
2404 | 2449 | ||
2405 | /* Nasty!!!!!! I hope this is OK. */ | 2450 | /* Nasty!!!!!! I hope this is OK. */ |
2406 | i = 1 << cachep->gfporder; | ||
2407 | page = virt_to_page(objp); | 2451 | page = virt_to_page(objp); |
2452 | |||
2453 | i = 1; | ||
2454 | if (likely(!PageCompound(page))) | ||
2455 | i <<= cachep->gfporder; | ||
2408 | do { | 2456 | do { |
2409 | page_set_cache(page, cachep); | 2457 | page_set_cache(page, cachep); |
2410 | page_set_slab(page, slabp); | 2458 | page_set_slab(page, slabp); |
@@ -2425,8 +2473,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2425 | unsigned long ctor_flags; | 2473 | unsigned long ctor_flags; |
2426 | struct kmem_list3 *l3; | 2474 | struct kmem_list3 *l3; |
2427 | 2475 | ||
2428 | /* Be lazy and only check for valid flags here, | 2476 | /* |
2429 | * keeping it out of the critical path in kmem_cache_alloc(). | 2477 | * Be lazy and only check for valid flags here, keeping it out of the |
2478 | * critical path in kmem_cache_alloc(). | ||
2430 | */ | 2479 | */ |
2431 | if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) | 2480 | if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) |
2432 | BUG(); | 2481 | BUG(); |
@@ -2467,14 +2516,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2467 | */ | 2516 | */ |
2468 | kmem_flagcheck(cachep, flags); | 2517 | kmem_flagcheck(cachep, flags); |
2469 | 2518 | ||
2470 | /* Get mem for the objs. | 2519 | /* |
2471 | * Attempt to allocate a physical page from 'nodeid', | 2520 | * Get mem for the objs. Attempt to allocate a physical page from |
2521 | * 'nodeid'. | ||
2472 | */ | 2522 | */ |
2473 | if (!(objp = kmem_getpages(cachep, flags, nodeid))) | 2523 | objp = kmem_getpages(cachep, flags, nodeid); |
2524 | if (!objp) | ||
2474 | goto failed; | 2525 | goto failed; |
2475 | 2526 | ||
2476 | /* Get slab management. */ | 2527 | /* Get slab management. */ |
2477 | if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) | 2528 | slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); |
2529 | if (!slabp) | ||
2478 | goto opps1; | 2530 | goto opps1; |
2479 | 2531 | ||
2480 | slabp->nodeid = nodeid; | 2532 | slabp->nodeid = nodeid; |
@@ -2493,9 +2545,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2493 | l3->free_objects += cachep->num; | 2545 | l3->free_objects += cachep->num; |
2494 | spin_unlock(&l3->list_lock); | 2546 | spin_unlock(&l3->list_lock); |
2495 | return 1; | 2547 | return 1; |
2496 | opps1: | 2548 | opps1: |
2497 | kmem_freepages(cachep, objp); | 2549 | kmem_freepages(cachep, objp); |
2498 | failed: | 2550 | failed: |
2499 | if (local_flags & __GFP_WAIT) | 2551 | if (local_flags & __GFP_WAIT) |
2500 | local_irq_disable(); | 2552 | local_irq_disable(); |
2501 | return 0; | 2553 | return 0; |
@@ -2538,8 +2590,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2538 | page = virt_to_page(objp); | 2590 | page = virt_to_page(objp); |
2539 | 2591 | ||
2540 | if (page_get_cache(page) != cachep) { | 2592 | if (page_get_cache(page) != cachep) { |
2541 | printk(KERN_ERR | 2593 | printk(KERN_ERR "mismatch in kmem_cache_free: expected " |
2542 | "mismatch in kmem_cache_free: expected cache %p, got %p\n", | 2594 | "cache %p, got %p\n", |
2543 | page_get_cache(page), cachep); | 2595 | page_get_cache(page), cachep); |
2544 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | 2596 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); |
2545 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), | 2597 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), |
@@ -2549,13 +2601,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2549 | slabp = page_get_slab(page); | 2601 | slabp = page_get_slab(page); |
2550 | 2602 | ||
2551 | if (cachep->flags & SLAB_RED_ZONE) { | 2603 | if (cachep->flags & SLAB_RED_ZONE) { |
2552 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE | 2604 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || |
2553 | || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | 2605 | *dbg_redzone2(cachep, objp) != RED_ACTIVE) { |
2554 | slab_error(cachep, | 2606 | slab_error(cachep, "double free, or memory outside" |
2555 | "double free, or memory outside" | 2607 | " object was overwritten"); |
2556 | " object was overwritten"); | 2608 | printk(KERN_ERR "%p: redzone 1:0x%lx, " |
2557 | printk(KERN_ERR | 2609 | "redzone 2:0x%lx.\n", |
2558 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
2559 | objp, *dbg_redzone1(cachep, objp), | 2610 | objp, *dbg_redzone1(cachep, objp), |
2560 | *dbg_redzone2(cachep, objp)); | 2611 | *dbg_redzone2(cachep, objp)); |
2561 | } | 2612 | } |
@@ -2565,15 +2616,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2565 | if (cachep->flags & SLAB_STORE_USER) | 2616 | if (cachep->flags & SLAB_STORE_USER) |
2566 | *dbg_userword(cachep, objp) = caller; | 2617 | *dbg_userword(cachep, objp) = caller; |
2567 | 2618 | ||
2568 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 2619 | objnr = obj_to_index(cachep, slabp, objp); |
2569 | 2620 | ||
2570 | BUG_ON(objnr >= cachep->num); | 2621 | BUG_ON(objnr >= cachep->num); |
2571 | BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); | 2622 | BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); |
2572 | 2623 | ||
2573 | if (cachep->flags & SLAB_DEBUG_INITIAL) { | 2624 | if (cachep->flags & SLAB_DEBUG_INITIAL) { |
2574 | /* Need to call the slab's constructor so the | 2625 | /* |
2575 | * caller can perform a verify of its state (debugging). | 2626 | * Need to call the slab's constructor so the caller can |
2576 | * Called without the cache-lock held. | 2627 | * perform a verify of its state (debugging). Called without |
2628 | * the cache-lock held. | ||
2577 | */ | 2629 | */ |
2578 | cachep->ctor(objp + obj_offset(cachep), | 2630 | cachep->ctor(objp + obj_offset(cachep), |
2579 | cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); | 2631 | cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); |
@@ -2584,9 +2636,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2584 | */ | 2636 | */ |
2585 | cachep->dtor(objp + obj_offset(cachep), cachep, 0); | 2637 | cachep->dtor(objp + obj_offset(cachep), cachep, 0); |
2586 | } | 2638 | } |
2639 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
2640 | slab_bufctl(slabp)[objnr] = BUFCTL_FREE; | ||
2641 | #endif | ||
2587 | if (cachep->flags & SLAB_POISON) { | 2642 | if (cachep->flags & SLAB_POISON) { |
2588 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2643 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2589 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { | 2644 | if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
2590 | store_stackinfo(cachep, objp, (unsigned long)caller); | 2645 | store_stackinfo(cachep, objp, (unsigned long)caller); |
2591 | kernel_map_pages(virt_to_page(objp), | 2646 | kernel_map_pages(virt_to_page(objp), |
2592 | cachep->buffer_size / PAGE_SIZE, 0); | 2647 | cachep->buffer_size / PAGE_SIZE, 0); |
@@ -2612,14 +2667,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) | |||
2612 | goto bad; | 2667 | goto bad; |
2613 | } | 2668 | } |
2614 | if (entries != cachep->num - slabp->inuse) { | 2669 | if (entries != cachep->num - slabp->inuse) { |
2615 | bad: | 2670 | bad: |
2616 | printk(KERN_ERR | 2671 | printk(KERN_ERR "slab: Internal list corruption detected in " |
2617 | "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", | 2672 | "cache '%s'(%d), slabp %p(%d). Hexdump:\n", |
2618 | cachep->name, cachep->num, slabp, slabp->inuse); | 2673 | cachep->name, cachep->num, slabp, slabp->inuse); |
2619 | for (i = 0; | 2674 | for (i = 0; |
2620 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); | 2675 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); |
2621 | i++) { | 2676 | i++) { |
2622 | if ((i % 16) == 0) | 2677 | if (i % 16 == 0) |
2623 | printk("\n%03x:", i); | 2678 | printk("\n%03x:", i); |
2624 | printk(" %02x", ((unsigned char *)slabp)[i]); | 2679 | printk(" %02x", ((unsigned char *)slabp)[i]); |
2625 | } | 2680 | } |
@@ -2641,12 +2696,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | |||
2641 | 2696 | ||
2642 | check_irq_off(); | 2697 | check_irq_off(); |
2643 | ac = cpu_cache_get(cachep); | 2698 | ac = cpu_cache_get(cachep); |
2644 | retry: | 2699 | retry: |
2645 | batchcount = ac->batchcount; | 2700 | batchcount = ac->batchcount; |
2646 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 2701 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
2647 | /* if there was little recent activity on this | 2702 | /* |
2648 | * cache, then perform only a partial refill. | 2703 | * If there was little recent activity on this cache, then |
2649 | * Otherwise we could generate refill bouncing. | 2704 | * perform only a partial refill. Otherwise we could generate |
2705 | * refill bouncing. | ||
2650 | */ | 2706 | */ |
2651 | batchcount = BATCHREFILL_LIMIT; | 2707 | batchcount = BATCHREFILL_LIMIT; |
2652 | } | 2708 | } |
@@ -2655,20 +2711,10 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | |||
2655 | BUG_ON(ac->avail > 0 || !l3); | 2711 | BUG_ON(ac->avail > 0 || !l3); |
2656 | spin_lock(&l3->list_lock); | 2712 | spin_lock(&l3->list_lock); |
2657 | 2713 | ||
2658 | if (l3->shared) { | 2714 | /* See if we can refill from the shared array */ |
2659 | struct array_cache *shared_array = l3->shared; | 2715 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) |
2660 | if (shared_array->avail) { | 2716 | goto alloc_done; |
2661 | if (batchcount > shared_array->avail) | 2717 | |
2662 | batchcount = shared_array->avail; | ||
2663 | shared_array->avail -= batchcount; | ||
2664 | ac->avail = batchcount; | ||
2665 | memcpy(ac->entry, | ||
2666 | &(shared_array->entry[shared_array->avail]), | ||
2667 | sizeof(void *) * batchcount); | ||
2668 | shared_array->touched = 1; | ||
2669 | goto alloc_done; | ||
2670 | } | ||
2671 | } | ||
2672 | while (batchcount > 0) { | 2718 | while (batchcount > 0) { |
2673 | struct list_head *entry; | 2719 | struct list_head *entry; |
2674 | struct slab *slabp; | 2720 | struct slab *slabp; |
@@ -2702,29 +2748,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | |||
2702 | list_add(&slabp->list, &l3->slabs_partial); | 2748 | list_add(&slabp->list, &l3->slabs_partial); |
2703 | } | 2749 | } |
2704 | 2750 | ||
2705 | must_grow: | 2751 | must_grow: |
2706 | l3->free_objects -= ac->avail; | 2752 | l3->free_objects -= ac->avail; |
2707 | alloc_done: | 2753 | alloc_done: |
2708 | spin_unlock(&l3->list_lock); | 2754 | spin_unlock(&l3->list_lock); |
2709 | 2755 | ||
2710 | if (unlikely(!ac->avail)) { | 2756 | if (unlikely(!ac->avail)) { |
2711 | int x; | 2757 | int x; |
2712 | x = cache_grow(cachep, flags, numa_node_id()); | 2758 | x = cache_grow(cachep, flags, numa_node_id()); |
2713 | 2759 | ||
2714 | // cache_grow can reenable interrupts, then ac could change. | 2760 | /* cache_grow can reenable interrupts, then ac could change. */ |
2715 | ac = cpu_cache_get(cachep); | 2761 | ac = cpu_cache_get(cachep); |
2716 | if (!x && ac->avail == 0) // no objects in sight? abort | 2762 | if (!x && ac->avail == 0) /* no objects in sight? abort */ |
2717 | return NULL; | 2763 | return NULL; |
2718 | 2764 | ||
2719 | if (!ac->avail) // objects refilled by interrupt? | 2765 | if (!ac->avail) /* objects refilled by interrupt? */ |
2720 | goto retry; | 2766 | goto retry; |
2721 | } | 2767 | } |
2722 | ac->touched = 1; | 2768 | ac->touched = 1; |
2723 | return ac->entry[--ac->avail]; | 2769 | return ac->entry[--ac->avail]; |
2724 | } | 2770 | } |
2725 | 2771 | ||
2726 | static inline void | 2772 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
2727 | cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) | 2773 | gfp_t flags) |
2728 | { | 2774 | { |
2729 | might_sleep_if(flags & __GFP_WAIT); | 2775 | might_sleep_if(flags & __GFP_WAIT); |
2730 | #if DEBUG | 2776 | #if DEBUG |
@@ -2733,8 +2779,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) | |||
2733 | } | 2779 | } |
2734 | 2780 | ||
2735 | #if DEBUG | 2781 | #if DEBUG |
2736 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, | 2782 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, |
2737 | void *objp, void *caller) | 2783 | gfp_t flags, void *objp, void *caller) |
2738 | { | 2784 | { |
2739 | if (!objp) | 2785 | if (!objp) |
2740 | return objp; | 2786 | return objp; |
@@ -2754,19 +2800,28 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags | |||
2754 | *dbg_userword(cachep, objp) = caller; | 2800 | *dbg_userword(cachep, objp) = caller; |
2755 | 2801 | ||
2756 | if (cachep->flags & SLAB_RED_ZONE) { | 2802 | if (cachep->flags & SLAB_RED_ZONE) { |
2757 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE | 2803 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || |
2758 | || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { | 2804 | *dbg_redzone2(cachep, objp) != RED_INACTIVE) { |
2759 | slab_error(cachep, | 2805 | slab_error(cachep, "double free, or memory outside" |
2760 | "double free, or memory outside" | 2806 | " object was overwritten"); |
2761 | " object was overwritten"); | ||
2762 | printk(KERN_ERR | 2807 | printk(KERN_ERR |
2763 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | 2808 | "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", |
2764 | objp, *dbg_redzone1(cachep, objp), | 2809 | objp, *dbg_redzone1(cachep, objp), |
2765 | *dbg_redzone2(cachep, objp)); | 2810 | *dbg_redzone2(cachep, objp)); |
2766 | } | 2811 | } |
2767 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | 2812 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; |
2768 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 2813 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
2769 | } | 2814 | } |
2815 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
2816 | { | ||
2817 | struct slab *slabp; | ||
2818 | unsigned objnr; | ||
2819 | |||
2820 | slabp = page_get_slab(virt_to_page(objp)); | ||
2821 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | ||
2822 | slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; | ||
2823 | } | ||
2824 | #endif | ||
2770 | objp += obj_offset(cachep); | 2825 | objp += obj_offset(cachep); |
2771 | if (cachep->ctor && cachep->flags & SLAB_POISON) { | 2826 | if (cachep->ctor && cachep->flags & SLAB_POISON) { |
2772 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; | 2827 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; |
@@ -2788,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
2788 | struct array_cache *ac; | 2843 | struct array_cache *ac; |
2789 | 2844 | ||
2790 | #ifdef CONFIG_NUMA | 2845 | #ifdef CONFIG_NUMA |
2791 | if (unlikely(current->mempolicy && !in_interrupt())) { | 2846 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { |
2792 | int nid = slab_node(current->mempolicy); | 2847 | objp = alternate_node_alloc(cachep, flags); |
2793 | 2848 | if (objp != NULL) | |
2794 | if (nid != numa_node_id()) | 2849 | return objp; |
2795 | return __cache_alloc_node(cachep, flags, nid); | ||
2796 | } | 2850 | } |
2797 | #endif | 2851 | #endif |
2798 | 2852 | ||
@@ -2809,8 +2863,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
2809 | return objp; | 2863 | return objp; |
2810 | } | 2864 | } |
2811 | 2865 | ||
2812 | static __always_inline void * | 2866 | static __always_inline void *__cache_alloc(struct kmem_cache *cachep, |
2813 | __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | 2867 | gfp_t flags, void *caller) |
2814 | { | 2868 | { |
2815 | unsigned long save_flags; | 2869 | unsigned long save_flags; |
2816 | void *objp; | 2870 | void *objp; |
@@ -2828,9 +2882,32 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
2828 | 2882 | ||
2829 | #ifdef CONFIG_NUMA | 2883 | #ifdef CONFIG_NUMA |
2830 | /* | 2884 | /* |
2885 | * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. | ||
2886 | * | ||
2887 | * If we are in_interrupt, then process context, including cpusets and | ||
2888 | * mempolicy, may not apply and should not be used for allocation policy. | ||
2889 | */ | ||
2890 | static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | ||
2891 | { | ||
2892 | int nid_alloc, nid_here; | ||
2893 | |||
2894 | if (in_interrupt()) | ||
2895 | return NULL; | ||
2896 | nid_alloc = nid_here = numa_node_id(); | ||
2897 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | ||
2898 | nid_alloc = cpuset_mem_spread_node(); | ||
2899 | else if (current->mempolicy) | ||
2900 | nid_alloc = slab_node(current->mempolicy); | ||
2901 | if (nid_alloc != nid_here) | ||
2902 | return __cache_alloc_node(cachep, flags, nid_alloc); | ||
2903 | return NULL; | ||
2904 | } | ||
2905 | |||
2906 | /* | ||
2831 | * An interface to enable slab creation on nodeid | 2907 | * An interface to enable slab creation on nodeid |
2832 | */ | 2908 | */ |
2833 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 2909 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
2910 | int nodeid) | ||
2834 | { | 2911 | { |
2835 | struct list_head *entry; | 2912 | struct list_head *entry; |
2836 | struct slab *slabp; | 2913 | struct slab *slabp; |
@@ -2841,7 +2918,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
2841 | l3 = cachep->nodelists[nodeid]; | 2918 | l3 = cachep->nodelists[nodeid]; |
2842 | BUG_ON(!l3); | 2919 | BUG_ON(!l3); |
2843 | 2920 | ||
2844 | retry: | 2921 | retry: |
2845 | check_irq_off(); | 2922 | check_irq_off(); |
2846 | spin_lock(&l3->list_lock); | 2923 | spin_lock(&l3->list_lock); |
2847 | entry = l3->slabs_partial.next; | 2924 | entry = l3->slabs_partial.next; |
@@ -2868,16 +2945,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
2868 | /* move slabp to correct slabp list: */ | 2945 | /* move slabp to correct slabp list: */ |
2869 | list_del(&slabp->list); | 2946 | list_del(&slabp->list); |
2870 | 2947 | ||
2871 | if (slabp->free == BUFCTL_END) { | 2948 | if (slabp->free == BUFCTL_END) |
2872 | list_add(&slabp->list, &l3->slabs_full); | 2949 | list_add(&slabp->list, &l3->slabs_full); |
2873 | } else { | 2950 | else |
2874 | list_add(&slabp->list, &l3->slabs_partial); | 2951 | list_add(&slabp->list, &l3->slabs_partial); |
2875 | } | ||
2876 | 2952 | ||
2877 | spin_unlock(&l3->list_lock); | 2953 | spin_unlock(&l3->list_lock); |
2878 | goto done; | 2954 | goto done; |
2879 | 2955 | ||
2880 | must_grow: | 2956 | must_grow: |
2881 | spin_unlock(&l3->list_lock); | 2957 | spin_unlock(&l3->list_lock); |
2882 | x = cache_grow(cachep, flags, nodeid); | 2958 | x = cache_grow(cachep, flags, nodeid); |
2883 | 2959 | ||
@@ -2885,7 +2961,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
2885 | return NULL; | 2961 | return NULL; |
2886 | 2962 | ||
2887 | goto retry; | 2963 | goto retry; |
2888 | done: | 2964 | done: |
2889 | return obj; | 2965 | return obj; |
2890 | } | 2966 | } |
2891 | #endif | 2967 | #endif |
@@ -2958,7 +3034,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
2958 | } | 3034 | } |
2959 | 3035 | ||
2960 | free_block(cachep, ac->entry, batchcount, node); | 3036 | free_block(cachep, ac->entry, batchcount, node); |
2961 | free_done: | 3037 | free_done: |
2962 | #if STATS | 3038 | #if STATS |
2963 | { | 3039 | { |
2964 | int i = 0; | 3040 | int i = 0; |
@@ -2979,16 +3055,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
2979 | #endif | 3055 | #endif |
2980 | spin_unlock(&l3->list_lock); | 3056 | spin_unlock(&l3->list_lock); |
2981 | ac->avail -= batchcount; | 3057 | ac->avail -= batchcount; |
2982 | memmove(ac->entry, &(ac->entry[batchcount]), | 3058 | memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); |
2983 | sizeof(void *) * ac->avail); | ||
2984 | } | 3059 | } |
2985 | 3060 | ||
2986 | /* | 3061 | /* |
2987 | * __cache_free | 3062 | * Release an obj back to its cache. If the obj has a constructed state, it must |
2988 | * Release an obj back to its cache. If the obj has a constructed | 3063 | * be in this state _before_ it is released. Called with disabled ints. |
2989 | * state, it must be in this state _before_ it is released. | ||
2990 | * | ||
2991 | * Called with disabled ints. | ||
2992 | */ | 3064 | */ |
2993 | static inline void __cache_free(struct kmem_cache *cachep, void *objp) | 3065 | static inline void __cache_free(struct kmem_cache *cachep, void *objp) |
2994 | { | 3066 | { |
@@ -3007,9 +3079,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3007 | if (unlikely(slabp->nodeid != numa_node_id())) { | 3079 | if (unlikely(slabp->nodeid != numa_node_id())) { |
3008 | struct array_cache *alien = NULL; | 3080 | struct array_cache *alien = NULL; |
3009 | int nodeid = slabp->nodeid; | 3081 | int nodeid = slabp->nodeid; |
3010 | struct kmem_list3 *l3 = | 3082 | struct kmem_list3 *l3; |
3011 | cachep->nodelists[numa_node_id()]; | ||
3012 | 3083 | ||
3084 | l3 = cachep->nodelists[numa_node_id()]; | ||
3013 | STATS_INC_NODEFREES(cachep); | 3085 | STATS_INC_NODEFREES(cachep); |
3014 | if (l3->alien && l3->alien[nodeid]) { | 3086 | if (l3->alien && l3->alien[nodeid]) { |
3015 | alien = l3->alien[nodeid]; | 3087 | alien = l3->alien[nodeid]; |
@@ -3056,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3056 | EXPORT_SYMBOL(kmem_cache_alloc); | 3128 | EXPORT_SYMBOL(kmem_cache_alloc); |
3057 | 3129 | ||
3058 | /** | 3130 | /** |
3131 | * kmem_cache_zalloc - Allocate an object. The memory is set to zero. | ||
3132 | * @cache: The cache to allocate from. | ||
3133 | * @flags: See kmalloc(). | ||
3134 | * | ||
3135 | * Allocate an object from this cache and set the allocated memory to zero. | ||
3136 | * The flags are only relevant if the cache has no available objects. | ||
3137 | */ | ||
3138 | void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) | ||
3139 | { | ||
3140 | void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); | ||
3141 | if (ret) | ||
3142 | memset(ret, 0, obj_size(cache)); | ||
3143 | return ret; | ||
3144 | } | ||
3145 | EXPORT_SYMBOL(kmem_cache_zalloc); | ||
3146 | |||
3147 | /** | ||
3059 | * kmem_ptr_validate - check if an untrusted pointer might | 3148 | * kmem_ptr_validate - check if an untrusted pointer might |
3060 | * be a slab entry. | 3149 | * be a slab entry. |
3061 | * @cachep: the cache we're checking against | 3150 | * @cachep: the cache we're checking against |
@@ -3093,7 +3182,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) | |||
3093 | if (unlikely(page_get_cache(page) != cachep)) | 3182 | if (unlikely(page_get_cache(page) != cachep)) |
3094 | goto out; | 3183 | goto out; |
3095 | return 1; | 3184 | return 1; |
3096 | out: | 3185 | out: |
3097 | return 0; | 3186 | return 0; |
3098 | } | 3187 | } |
3099 | 3188 | ||
@@ -3119,7 +3208,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3119 | local_irq_save(save_flags); | 3208 | local_irq_save(save_flags); |
3120 | 3209 | ||
3121 | if (nodeid == -1 || nodeid == numa_node_id() || | 3210 | if (nodeid == -1 || nodeid == numa_node_id() || |
3122 | !cachep->nodelists[nodeid]) | 3211 | !cachep->nodelists[nodeid]) |
3123 | ptr = ____cache_alloc(cachep, flags); | 3212 | ptr = ____cache_alloc(cachep, flags); |
3124 | else | 3213 | else |
3125 | ptr = __cache_alloc_node(cachep, flags, nodeid); | 3214 | ptr = __cache_alloc_node(cachep, flags, nodeid); |
@@ -3148,6 +3237,7 @@ EXPORT_SYMBOL(kmalloc_node); | |||
3148 | * kmalloc - allocate memory | 3237 | * kmalloc - allocate memory |
3149 | * @size: how many bytes of memory are required. | 3238 | * @size: how many bytes of memory are required. |
3150 | * @flags: the type of memory to allocate. | 3239 | * @flags: the type of memory to allocate. |
3240 | * @caller: function caller for debug tracking of the caller | ||
3151 | * | 3241 | * |
3152 | * kmalloc is the normal method of allocating memory | 3242 | * kmalloc is the normal method of allocating memory |
3153 | * in the kernel. | 3243 | * in the kernel. |
@@ -3181,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3181 | return __cache_alloc(cachep, flags, caller); | 3271 | return __cache_alloc(cachep, flags, caller); |
3182 | } | 3272 | } |
3183 | 3273 | ||
3184 | #ifndef CONFIG_DEBUG_SLAB | ||
3185 | 3274 | ||
3186 | void *__kmalloc(size_t size, gfp_t flags) | 3275 | void *__kmalloc(size_t size, gfp_t flags) |
3187 | { | 3276 | { |
3277 | #ifndef CONFIG_DEBUG_SLAB | ||
3188 | return __do_kmalloc(size, flags, NULL); | 3278 | return __do_kmalloc(size, flags, NULL); |
3279 | #else | ||
3280 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | ||
3281 | #endif | ||
3189 | } | 3282 | } |
3190 | EXPORT_SYMBOL(__kmalloc); | 3283 | EXPORT_SYMBOL(__kmalloc); |
3191 | 3284 | ||
3192 | #else | 3285 | #ifdef CONFIG_DEBUG_SLAB |
3193 | |||
3194 | void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) | 3286 | void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) |
3195 | { | 3287 | { |
3196 | return __do_kmalloc(size, flags, caller); | 3288 | return __do_kmalloc(size, flags, caller); |
3197 | } | 3289 | } |
3198 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3290 | EXPORT_SYMBOL(__kmalloc_track_caller); |
3199 | |||
3200 | #endif | 3291 | #endif |
3201 | 3292 | ||
3202 | #ifdef CONFIG_SMP | 3293 | #ifdef CONFIG_SMP |
@@ -3220,7 +3311,7 @@ void *__alloc_percpu(size_t size) | |||
3220 | * and we have no way of figuring out how to fix the array | 3311 | * and we have no way of figuring out how to fix the array |
3221 | * that we have allocated then.... | 3312 | * that we have allocated then.... |
3222 | */ | 3313 | */ |
3223 | for_each_cpu(i) { | 3314 | for_each_possible_cpu(i) { |
3224 | int node = cpu_to_node(i); | 3315 | int node = cpu_to_node(i); |
3225 | 3316 | ||
3226 | if (node_online(node)) | 3317 | if (node_online(node)) |
@@ -3236,7 +3327,7 @@ void *__alloc_percpu(size_t size) | |||
3236 | /* Catch derefs w/o wrappers */ | 3327 | /* Catch derefs w/o wrappers */ |
3237 | return (void *)(~(unsigned long)pdata); | 3328 | return (void *)(~(unsigned long)pdata); |
3238 | 3329 | ||
3239 | unwind_oom: | 3330 | unwind_oom: |
3240 | while (--i >= 0) { | 3331 | while (--i >= 0) { |
3241 | if (!cpu_possible(i)) | 3332 | if (!cpu_possible(i)) |
3242 | continue; | 3333 | continue; |
@@ -3307,7 +3398,7 @@ void free_percpu(const void *objp) | |||
3307 | /* | 3398 | /* |
3308 | * We allocate for all cpus so we cannot use for online cpu here. | 3399 | * We allocate for all cpus so we cannot use for online cpu here. |
3309 | */ | 3400 | */ |
3310 | for_each_cpu(i) | 3401 | for_each_possible_cpu(i) |
3311 | kfree(p->ptrs[i]); | 3402 | kfree(p->ptrs[i]); |
3312 | kfree(p); | 3403 | kfree(p); |
3313 | } | 3404 | } |
@@ -3327,61 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep) | |||
3327 | EXPORT_SYMBOL_GPL(kmem_cache_name); | 3418 | EXPORT_SYMBOL_GPL(kmem_cache_name); |
3328 | 3419 | ||
3329 | /* | 3420 | /* |
3330 | * This initializes kmem_list3 for all nodes. | 3421 | * This initializes kmem_list3 or resizes various caches for all nodes. |
3331 | */ | 3422 | */ |
3332 | static int alloc_kmemlist(struct kmem_cache *cachep) | 3423 | static int alloc_kmemlist(struct kmem_cache *cachep) |
3333 | { | 3424 | { |
3334 | int node; | 3425 | int node; |
3335 | struct kmem_list3 *l3; | 3426 | struct kmem_list3 *l3; |
3336 | int err = 0; | 3427 | struct array_cache *new_shared; |
3428 | struct array_cache **new_alien; | ||
3337 | 3429 | ||
3338 | for_each_online_node(node) { | 3430 | for_each_online_node(node) { |
3339 | struct array_cache *nc = NULL, *new; | 3431 | |
3340 | struct array_cache **new_alien = NULL; | 3432 | new_alien = alloc_alien_cache(node, cachep->limit); |
3341 | #ifdef CONFIG_NUMA | 3433 | if (!new_alien) |
3342 | if (!(new_alien = alloc_alien_cache(node, cachep->limit))) | ||
3343 | goto fail; | 3434 | goto fail; |
3344 | #endif | 3435 | |
3345 | if (!(new = alloc_arraycache(node, (cachep->shared * | 3436 | new_shared = alloc_arraycache(node, |
3346 | cachep->batchcount), | 3437 | cachep->shared*cachep->batchcount, |
3347 | 0xbaadf00d))) | 3438 | 0xbaadf00d); |
3439 | if (!new_shared) { | ||
3440 | free_alien_cache(new_alien); | ||
3348 | goto fail; | 3441 | goto fail; |
3349 | if ((l3 = cachep->nodelists[node])) { | 3442 | } |
3443 | |||
3444 | l3 = cachep->nodelists[node]; | ||
3445 | if (l3) { | ||
3446 | struct array_cache *shared = l3->shared; | ||
3350 | 3447 | ||
3351 | spin_lock_irq(&l3->list_lock); | 3448 | spin_lock_irq(&l3->list_lock); |
3352 | 3449 | ||
3353 | if ((nc = cachep->nodelists[node]->shared)) | 3450 | if (shared) |
3354 | free_block(cachep, nc->entry, nc->avail, node); | 3451 | free_block(cachep, shared->entry, |
3452 | shared->avail, node); | ||
3355 | 3453 | ||
3356 | l3->shared = new; | 3454 | l3->shared = new_shared; |
3357 | if (!cachep->nodelists[node]->alien) { | 3455 | if (!l3->alien) { |
3358 | l3->alien = new_alien; | 3456 | l3->alien = new_alien; |
3359 | new_alien = NULL; | 3457 | new_alien = NULL; |
3360 | } | 3458 | } |
3361 | l3->free_limit = (1 + nr_cpus_node(node)) * | 3459 | l3->free_limit = (1 + nr_cpus_node(node)) * |
3362 | cachep->batchcount + cachep->num; | 3460 | cachep->batchcount + cachep->num; |
3363 | spin_unlock_irq(&l3->list_lock); | 3461 | spin_unlock_irq(&l3->list_lock); |
3364 | kfree(nc); | 3462 | kfree(shared); |
3365 | free_alien_cache(new_alien); | 3463 | free_alien_cache(new_alien); |
3366 | continue; | 3464 | continue; |
3367 | } | 3465 | } |
3368 | if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), | 3466 | l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); |
3369 | GFP_KERNEL, node))) | 3467 | if (!l3) { |
3468 | free_alien_cache(new_alien); | ||
3469 | kfree(new_shared); | ||
3370 | goto fail; | 3470 | goto fail; |
3471 | } | ||
3371 | 3472 | ||
3372 | kmem_list3_init(l3); | 3473 | kmem_list3_init(l3); |
3373 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 3474 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
3374 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | 3475 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
3375 | l3->shared = new; | 3476 | l3->shared = new_shared; |
3376 | l3->alien = new_alien; | 3477 | l3->alien = new_alien; |
3377 | l3->free_limit = (1 + nr_cpus_node(node)) * | 3478 | l3->free_limit = (1 + nr_cpus_node(node)) * |
3378 | cachep->batchcount + cachep->num; | 3479 | cachep->batchcount + cachep->num; |
3379 | cachep->nodelists[node] = l3; | 3480 | cachep->nodelists[node] = l3; |
3380 | } | 3481 | } |
3381 | return err; | 3482 | return 0; |
3382 | fail: | 3483 | |
3383 | err = -ENOMEM; | 3484 | fail: |
3384 | return err; | 3485 | if (!cachep->next.next) { |
3486 | /* Cache is not active yet. Roll back what we did */ | ||
3487 | node--; | ||
3488 | while (node >= 0) { | ||
3489 | if (cachep->nodelists[node]) { | ||
3490 | l3 = cachep->nodelists[node]; | ||
3491 | |||
3492 | kfree(l3->shared); | ||
3493 | free_alien_cache(l3->alien); | ||
3494 | kfree(l3); | ||
3495 | cachep->nodelists[node] = NULL; | ||
3496 | } | ||
3497 | node--; | ||
3498 | } | ||
3499 | } | ||
3500 | return -ENOMEM; | ||
3385 | } | 3501 | } |
3386 | 3502 | ||
3387 | struct ccupdate_struct { | 3503 | struct ccupdate_struct { |
@@ -3391,7 +3507,7 @@ struct ccupdate_struct { | |||
3391 | 3507 | ||
3392 | static void do_ccupdate_local(void *info) | 3508 | static void do_ccupdate_local(void *info) |
3393 | { | 3509 | { |
3394 | struct ccupdate_struct *new = (struct ccupdate_struct *)info; | 3510 | struct ccupdate_struct *new = info; |
3395 | struct array_cache *old; | 3511 | struct array_cache *old; |
3396 | 3512 | ||
3397 | check_irq_off(); | 3513 | check_irq_off(); |
@@ -3401,16 +3517,17 @@ static void do_ccupdate_local(void *info) | |||
3401 | new->new[smp_processor_id()] = old; | 3517 | new->new[smp_processor_id()] = old; |
3402 | } | 3518 | } |
3403 | 3519 | ||
3404 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, | 3520 | /* Always called with the cache_chain_mutex held */ |
3405 | int shared) | 3521 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3522 | int batchcount, int shared) | ||
3406 | { | 3523 | { |
3407 | struct ccupdate_struct new; | 3524 | struct ccupdate_struct new; |
3408 | int i, err; | 3525 | int i, err; |
3409 | 3526 | ||
3410 | memset(&new.new, 0, sizeof(new.new)); | 3527 | memset(&new.new, 0, sizeof(new.new)); |
3411 | for_each_online_cpu(i) { | 3528 | for_each_online_cpu(i) { |
3412 | new.new[i] = | 3529 | new.new[i] = alloc_arraycache(cpu_to_node(i), limit, |
3413 | alloc_arraycache(cpu_to_node(i), limit, batchcount); | 3530 | batchcount); |
3414 | if (!new.new[i]) { | 3531 | if (!new.new[i]) { |
3415 | for (i--; i >= 0; i--) | 3532 | for (i--; i >= 0; i--) |
3416 | kfree(new.new[i]); | 3533 | kfree(new.new[i]); |
@@ -3419,14 +3536,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount | |||
3419 | } | 3536 | } |
3420 | new.cachep = cachep; | 3537 | new.cachep = cachep; |
3421 | 3538 | ||
3422 | smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); | 3539 | on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); |
3423 | 3540 | ||
3424 | check_irq_on(); | 3541 | check_irq_on(); |
3425 | spin_lock(&cachep->spinlock); | ||
3426 | cachep->batchcount = batchcount; | 3542 | cachep->batchcount = batchcount; |
3427 | cachep->limit = limit; | 3543 | cachep->limit = limit; |
3428 | cachep->shared = shared; | 3544 | cachep->shared = shared; |
3429 | spin_unlock(&cachep->spinlock); | ||
3430 | 3545 | ||
3431 | for_each_online_cpu(i) { | 3546 | for_each_online_cpu(i) { |
3432 | struct array_cache *ccold = new.new[i]; | 3547 | struct array_cache *ccold = new.new[i]; |
@@ -3447,15 +3562,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount | |||
3447 | return 0; | 3562 | return 0; |
3448 | } | 3563 | } |
3449 | 3564 | ||
3565 | /* Called with cache_chain_mutex held always */ | ||
3450 | static void enable_cpucache(struct kmem_cache *cachep) | 3566 | static void enable_cpucache(struct kmem_cache *cachep) |
3451 | { | 3567 | { |
3452 | int err; | 3568 | int err; |
3453 | int limit, shared; | 3569 | int limit, shared; |
3454 | 3570 | ||
3455 | /* The head array serves three purposes: | 3571 | /* |
3572 | * The head array serves three purposes: | ||
3456 | * - create a LIFO ordering, i.e. return objects that are cache-warm | 3573 | * - create a LIFO ordering, i.e. return objects that are cache-warm |
3457 | * - reduce the number of spinlock operations. | 3574 | * - reduce the number of spinlock operations. |
3458 | * - reduce the number of linked list operations on the slab and | 3575 | * - reduce the number of linked list operations on the slab and |
3459 | * bufctl chains: array operations are cheaper. | 3576 | * bufctl chains: array operations are cheaper. |
3460 | * The numbers are guessed, we should auto-tune as described by | 3577 | * The numbers are guessed, we should auto-tune as described by |
3461 | * Bonwick. | 3578 | * Bonwick. |
@@ -3471,7 +3588,8 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
3471 | else | 3588 | else |
3472 | limit = 120; | 3589 | limit = 120; |
3473 | 3590 | ||
3474 | /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound | 3591 | /* |
3592 | * CPU bound tasks (e.g. network routing) can exhibit cpu bound | ||
3475 | * allocation behaviour: Most allocs on one cpu, most free operations | 3593 | * allocation behaviour: Most allocs on one cpu, most free operations |
3476 | * on another cpu. For these cases, an efficient object passing between | 3594 | * on another cpu. For these cases, an efficient object passing between |
3477 | * cpus is necessary. This is provided by a shared array. The array | 3595 | * cpus is necessary. This is provided by a shared array. The array |
@@ -3486,9 +3604,9 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
3486 | #endif | 3604 | #endif |
3487 | 3605 | ||
3488 | #if DEBUG | 3606 | #if DEBUG |
3489 | /* With debugging enabled, large batchcount lead to excessively | 3607 | /* |
3490 | * long periods with disabled local interrupts. Limit the | 3608 | * With debugging enabled, large batchcount lead to excessively long |
3491 | * batchcount | 3609 | * periods with disabled local interrupts. Limit the batchcount |
3492 | */ | 3610 | */ |
3493 | if (limit > 32) | 3611 | if (limit > 32) |
3494 | limit = 32; | 3612 | limit = 32; |
@@ -3499,23 +3617,32 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
3499 | cachep->name, -err); | 3617 | cachep->name, -err); |
3500 | } | 3618 | } |
3501 | 3619 | ||
3502 | static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, | 3620 | /* |
3503 | int force, int node) | 3621 | * Drain an array if it contains any elements taking the l3 lock only if |
3622 | * necessary. Note that the l3 listlock also protects the array_cache | ||
3623 | * if drain_array() is used on the shared array. | ||
3624 | */ | ||
3625 | void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | ||
3626 | struct array_cache *ac, int force, int node) | ||
3504 | { | 3627 | { |
3505 | int tofree; | 3628 | int tofree; |
3506 | 3629 | ||
3507 | check_spinlock_acquired_node(cachep, node); | 3630 | if (!ac || !ac->avail) |
3631 | return; | ||
3508 | if (ac->touched && !force) { | 3632 | if (ac->touched && !force) { |
3509 | ac->touched = 0; | 3633 | ac->touched = 0; |
3510 | } else if (ac->avail) { | 3634 | } else { |
3511 | tofree = force ? ac->avail : (ac->limit + 4) / 5; | 3635 | spin_lock_irq(&l3->list_lock); |
3512 | if (tofree > ac->avail) { | 3636 | if (ac->avail) { |
3513 | tofree = (ac->avail + 1) / 2; | 3637 | tofree = force ? ac->avail : (ac->limit + 4) / 5; |
3638 | if (tofree > ac->avail) | ||
3639 | tofree = (ac->avail + 1) / 2; | ||
3640 | free_block(cachep, ac->entry, tofree, node); | ||
3641 | ac->avail -= tofree; | ||
3642 | memmove(ac->entry, &(ac->entry[tofree]), | ||
3643 | sizeof(void *) * ac->avail); | ||
3514 | } | 3644 | } |
3515 | free_block(cachep, ac->entry, tofree, node); | 3645 | spin_unlock_irq(&l3->list_lock); |
3516 | ac->avail -= tofree; | ||
3517 | memmove(ac->entry, &(ac->entry[tofree]), | ||
3518 | sizeof(void *) * ac->avail); | ||
3519 | } | 3646 | } |
3520 | } | 3647 | } |
3521 | 3648 | ||
@@ -3528,13 +3655,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac | |||
3528 | * - clear the per-cpu caches for this CPU. | 3655 | * - clear the per-cpu caches for this CPU. |
3529 | * - return freeable pages to the main free memory pool. | 3656 | * - return freeable pages to the main free memory pool. |
3530 | * | 3657 | * |
3531 | * If we cannot acquire the cache chain mutex then just give up - we'll | 3658 | * If we cannot acquire the cache chain mutex then just give up - we'll try |
3532 | * try again on the next iteration. | 3659 | * again on the next iteration. |
3533 | */ | 3660 | */ |
3534 | static void cache_reap(void *unused) | 3661 | static void cache_reap(void *unused) |
3535 | { | 3662 | { |
3536 | struct list_head *walk; | 3663 | struct list_head *walk; |
3537 | struct kmem_list3 *l3; | 3664 | struct kmem_list3 *l3; |
3665 | int node = numa_node_id(); | ||
3538 | 3666 | ||
3539 | if (!mutex_trylock(&cache_chain_mutex)) { | 3667 | if (!mutex_trylock(&cache_chain_mutex)) { |
3540 | /* Give up. Setup the next iteration. */ | 3668 | /* Give up. Setup the next iteration. */ |
@@ -3550,65 +3678,72 @@ static void cache_reap(void *unused) | |||
3550 | struct slab *slabp; | 3678 | struct slab *slabp; |
3551 | 3679 | ||
3552 | searchp = list_entry(walk, struct kmem_cache, next); | 3680 | searchp = list_entry(walk, struct kmem_cache, next); |
3553 | |||
3554 | if (searchp->flags & SLAB_NO_REAP) | ||
3555 | goto next; | ||
3556 | |||
3557 | check_irq_on(); | 3681 | check_irq_on(); |
3558 | 3682 | ||
3559 | l3 = searchp->nodelists[numa_node_id()]; | 3683 | /* |
3684 | * We only take the l3 lock if absolutely necessary and we | ||
3685 | * have established with reasonable certainty that | ||
3686 | * we can do some work if the lock was obtained. | ||
3687 | */ | ||
3688 | l3 = searchp->nodelists[node]; | ||
3689 | |||
3560 | reap_alien(searchp, l3); | 3690 | reap_alien(searchp, l3); |
3561 | spin_lock_irq(&l3->list_lock); | ||
3562 | 3691 | ||
3563 | drain_array_locked(searchp, cpu_cache_get(searchp), 0, | 3692 | drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); |
3564 | numa_node_id()); | ||
3565 | 3693 | ||
3694 | /* | ||
3695 | * These are racy checks but it does not matter | ||
3696 | * if we skip one check or scan twice. | ||
3697 | */ | ||
3566 | if (time_after(l3->next_reap, jiffies)) | 3698 | if (time_after(l3->next_reap, jiffies)) |
3567 | goto next_unlock; | 3699 | goto next; |
3568 | 3700 | ||
3569 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3; | 3701 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3; |
3570 | 3702 | ||
3571 | if (l3->shared) | 3703 | drain_array(searchp, l3, l3->shared, 0, node); |
3572 | drain_array_locked(searchp, l3->shared, 0, | ||
3573 | numa_node_id()); | ||
3574 | 3704 | ||
3575 | if (l3->free_touched) { | 3705 | if (l3->free_touched) { |
3576 | l3->free_touched = 0; | 3706 | l3->free_touched = 0; |
3577 | goto next_unlock; | 3707 | goto next; |
3578 | } | 3708 | } |
3579 | 3709 | ||
3580 | tofree = | 3710 | tofree = (l3->free_limit + 5 * searchp->num - 1) / |
3581 | (l3->free_limit + 5 * searchp->num - | 3711 | (5 * searchp->num); |
3582 | 1) / (5 * searchp->num); | ||
3583 | do { | 3712 | do { |
3713 | /* | ||
3714 | * Do not lock if there are no free blocks. | ||
3715 | */ | ||
3716 | if (list_empty(&l3->slabs_free)) | ||
3717 | break; | ||
3718 | |||
3719 | spin_lock_irq(&l3->list_lock); | ||
3584 | p = l3->slabs_free.next; | 3720 | p = l3->slabs_free.next; |
3585 | if (p == &(l3->slabs_free)) | 3721 | if (p == &(l3->slabs_free)) { |
3722 | spin_unlock_irq(&l3->list_lock); | ||
3586 | break; | 3723 | break; |
3724 | } | ||
3587 | 3725 | ||
3588 | slabp = list_entry(p, struct slab, list); | 3726 | slabp = list_entry(p, struct slab, list); |
3589 | BUG_ON(slabp->inuse); | 3727 | BUG_ON(slabp->inuse); |
3590 | list_del(&slabp->list); | 3728 | list_del(&slabp->list); |
3591 | STATS_INC_REAPED(searchp); | 3729 | STATS_INC_REAPED(searchp); |
3592 | 3730 | ||
3593 | /* Safe to drop the lock. The slab is no longer | 3731 | /* |
3594 | * linked to the cache. | 3732 | * Safe to drop the lock. The slab is no longer linked |
3595 | * searchp cannot disappear, we hold | 3733 | * to the cache. searchp cannot disappear, we hold |
3596 | * cache_chain_lock | 3734 | * cache_chain_lock |
3597 | */ | 3735 | */ |
3598 | l3->free_objects -= searchp->num; | 3736 | l3->free_objects -= searchp->num; |
3599 | spin_unlock_irq(&l3->list_lock); | 3737 | spin_unlock_irq(&l3->list_lock); |
3600 | slab_destroy(searchp, slabp); | 3738 | slab_destroy(searchp, slabp); |
3601 | spin_lock_irq(&l3->list_lock); | ||
3602 | } while (--tofree > 0); | 3739 | } while (--tofree > 0); |
3603 | next_unlock: | 3740 | next: |
3604 | spin_unlock_irq(&l3->list_lock); | ||
3605 | next: | ||
3606 | cond_resched(); | 3741 | cond_resched(); |
3607 | } | 3742 | } |
3608 | check_irq_on(); | 3743 | check_irq_on(); |
3609 | mutex_unlock(&cache_chain_mutex); | 3744 | mutex_unlock(&cache_chain_mutex); |
3610 | next_reap_node(); | 3745 | next_reap_node(); |
3611 | /* Setup the next iteration */ | 3746 | /* Set up the next iteration */ |
3612 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3747 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); |
3613 | } | 3748 | } |
3614 | 3749 | ||
@@ -3658,8 +3793,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
3658 | { | 3793 | { |
3659 | struct kmem_cache *cachep = p; | 3794 | struct kmem_cache *cachep = p; |
3660 | ++*pos; | 3795 | ++*pos; |
3661 | return cachep->next.next == &cache_chain ? NULL | 3796 | return cachep->next.next == &cache_chain ? |
3662 | : list_entry(cachep->next.next, struct kmem_cache, next); | 3797 | NULL : list_entry(cachep->next.next, struct kmem_cache, next); |
3663 | } | 3798 | } |
3664 | 3799 | ||
3665 | static void s_stop(struct seq_file *m, void *p) | 3800 | static void s_stop(struct seq_file *m, void *p) |
@@ -3681,7 +3816,6 @@ static int s_show(struct seq_file *m, void *p) | |||
3681 | int node; | 3816 | int node; |
3682 | struct kmem_list3 *l3; | 3817 | struct kmem_list3 *l3; |
3683 | 3818 | ||
3684 | spin_lock(&cachep->spinlock); | ||
3685 | active_objs = 0; | 3819 | active_objs = 0; |
3686 | num_slabs = 0; | 3820 | num_slabs = 0; |
3687 | for_each_online_node(node) { | 3821 | for_each_online_node(node) { |
@@ -3748,7 +3882,9 @@ static int s_show(struct seq_file *m, void *p) | |||
3748 | unsigned long node_frees = cachep->node_frees; | 3882 | unsigned long node_frees = cachep->node_frees; |
3749 | 3883 | ||
3750 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ | 3884 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ |
3751 | %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); | 3885 | %4lu %4lu %4lu %4lu", allocs, high, grown, |
3886 | reaped, errors, max_freeable, node_allocs, | ||
3887 | node_frees); | ||
3752 | } | 3888 | } |
3753 | /* cpu stats */ | 3889 | /* cpu stats */ |
3754 | { | 3890 | { |
@@ -3762,7 +3898,6 @@ static int s_show(struct seq_file *m, void *p) | |||
3762 | } | 3898 | } |
3763 | #endif | 3899 | #endif |
3764 | seq_putc(m, '\n'); | 3900 | seq_putc(m, '\n'); |
3765 | spin_unlock(&cachep->spinlock); | ||
3766 | return 0; | 3901 | return 0; |
3767 | } | 3902 | } |
3768 | 3903 | ||
@@ -3820,13 +3955,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3820 | mutex_lock(&cache_chain_mutex); | 3955 | mutex_lock(&cache_chain_mutex); |
3821 | res = -EINVAL; | 3956 | res = -EINVAL; |
3822 | list_for_each(p, &cache_chain) { | 3957 | list_for_each(p, &cache_chain) { |
3823 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, | 3958 | struct kmem_cache *cachep; |
3824 | next); | ||
3825 | 3959 | ||
3960 | cachep = list_entry(p, struct kmem_cache, next); | ||
3826 | if (!strcmp(cachep->name, kbuf)) { | 3961 | if (!strcmp(cachep->name, kbuf)) { |
3827 | if (limit < 1 || | 3962 | if (limit < 1 || batchcount < 1 || |
3828 | batchcount < 1 || | 3963 | batchcount > limit || shared < 0) { |
3829 | batchcount > limit || shared < 0) { | ||
3830 | res = 0; | 3964 | res = 0; |
3831 | } else { | 3965 | } else { |
3832 | res = do_tune_cpucache(cachep, limit, | 3966 | res = do_tune_cpucache(cachep, limit, |
@@ -3840,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3840 | res = count; | 3974 | res = count; |
3841 | return res; | 3975 | return res; |
3842 | } | 3976 | } |
3977 | |||
3978 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
3979 | |||
3980 | static void *leaks_start(struct seq_file *m, loff_t *pos) | ||
3981 | { | ||
3982 | loff_t n = *pos; | ||
3983 | struct list_head *p; | ||
3984 | |||
3985 | mutex_lock(&cache_chain_mutex); | ||
3986 | p = cache_chain.next; | ||
3987 | while (n--) { | ||
3988 | p = p->next; | ||
3989 | if (p == &cache_chain) | ||
3990 | return NULL; | ||
3991 | } | ||
3992 | return list_entry(p, struct kmem_cache, next); | ||
3993 | } | ||
3994 | |||
/*
 * Count one occurrence of the value @v in the sorted table @n.
 *
 * Table layout: n[0] is the capacity in entries, n[1] the number of entries
 * in use, and from n[2] on come (value, count) pairs kept in ascending
 * order of value.
 *
 * Returns 1 when @v was counted (or was 0 and therefore ignored).  Returns 0
 * when inserting a new value would fill the table; n[1] has then already
 * been raised to n[0] and nothing is stored.
 */
static inline int add_caller(unsigned long *n, unsigned long v)
{
	unsigned long *slot = n + 2;
	int remaining;

	if (v == 0)
		return 1;

	/* Binary search; on a miss @slot ends up at the insertion point. */
	for (remaining = n[1]; remaining != 0; ) {
		int half = remaining / 2;
		unsigned long *mid = slot + 2 * half;

		if (mid[0] == v) {
			mid[1]++;
			return 1;
		}
		if (mid[0] > v) {
			remaining = half;
		} else {
			slot = mid + 2;
			remaining -= half + 1;
		}
	}

	n[1]++;
	if (n[1] == n[0])
		return 0;

	/* Shift every pair from @slot upwards by one pair to open a gap. */
	memmove(slot + 2, slot,
		(2 * n[1] - (slot - n)) * sizeof(unsigned long));
	slot[0] = v;
	slot[1] = 1;
	return 1;
}
4024 | |||
4025 | static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) | ||
4026 | { | ||
4027 | void *p; | ||
4028 | int i; | ||
4029 | if (n[0] == n[1]) | ||
4030 | return; | ||
4031 | for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { | ||
4032 | if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) | ||
4033 | continue; | ||
4034 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) | ||
4035 | return; | ||
4036 | } | ||
4037 | } | ||
4038 | |||
/*
 * Print @address to the seq_file.  With CONFIG_KALLSYMS, and when
 * kallsyms_lookup() finds a name, the output is "name+offset/size" followed
 * by " [module]" if a module name was returned.  Otherwise the raw pointer
 * value is printed.
 */
static void show_symbol(struct seq_file *m, unsigned long address)
{
#ifdef CONFIG_KALLSYMS
	char namebuf[KSYM_NAME_LEN+1];
	unsigned long size, offset;
	const char *sym;
	char *modname;

	sym = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
	if (sym != NULL) {
		seq_printf(m, "%s+%#lx/%#lx", sym, offset, size);
		if (modname != NULL)
			seq_printf(m, " [%s]", modname);
		return;
	}
#endif
	seq_printf(m, "%p", (void *)address);
}
4058 | |||
4059 | static int leaks_show(struct seq_file *m, void *p) | ||
4060 | { | ||
4061 | struct kmem_cache *cachep = p; | ||
4062 | struct list_head *q; | ||
4063 | struct slab *slabp; | ||
4064 | struct kmem_list3 *l3; | ||
4065 | const char *name; | ||
4066 | unsigned long *n = m->private; | ||
4067 | int node; | ||
4068 | int i; | ||
4069 | |||
4070 | if (!(cachep->flags & SLAB_STORE_USER)) | ||
4071 | return 0; | ||
4072 | if (!(cachep->flags & SLAB_RED_ZONE)) | ||
4073 | return 0; | ||
4074 | |||
4075 | /* OK, we can do it */ | ||
4076 | |||
4077 | n[1] = 0; | ||
4078 | |||
4079 | for_each_online_node(node) { | ||
4080 | l3 = cachep->nodelists[node]; | ||
4081 | if (!l3) | ||
4082 | continue; | ||
4083 | |||
4084 | check_irq_on(); | ||
4085 | spin_lock_irq(&l3->list_lock); | ||
4086 | |||
4087 | list_for_each(q, &l3->slabs_full) { | ||
4088 | slabp = list_entry(q, struct slab, list); | ||
4089 | handle_slab(n, cachep, slabp); | ||
4090 | } | ||
4091 | list_for_each(q, &l3->slabs_partial) { | ||
4092 | slabp = list_entry(q, struct slab, list); | ||
4093 | handle_slab(n, cachep, slabp); | ||
4094 | } | ||
4095 | spin_unlock_irq(&l3->list_lock); | ||
4096 | } | ||
4097 | name = cachep->name; | ||
4098 | if (n[0] == n[1]) { | ||
4099 | /* Increase the buffer size */ | ||
4100 | mutex_unlock(&cache_chain_mutex); | ||
4101 | m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); | ||
4102 | if (!m->private) { | ||
4103 | /* Too bad, we are really out */ | ||
4104 | m->private = n; | ||
4105 | mutex_lock(&cache_chain_mutex); | ||
4106 | return -ENOMEM; | ||
4107 | } | ||
4108 | *(unsigned long *)m->private = n[0] * 2; | ||
4109 | kfree(n); | ||
4110 | mutex_lock(&cache_chain_mutex); | ||
4111 | /* Now make sure this entry will be retried */ | ||
4112 | m->count = m->size; | ||
4113 | return 0; | ||
4114 | } | ||
4115 | for (i = 0; i < n[1]; i++) { | ||
4116 | seq_printf(m, "%s: %lu ", name, n[2*i+3]); | ||
4117 | show_symbol(m, n[2*i+2]); | ||
4118 | seq_putc(m, '\n'); | ||
4119 | } | ||
4120 | return 0; | ||
4121 | } | ||
4122 | |||
4123 | struct seq_operations slabstats_op = { | ||
4124 | .start = leaks_start, | ||
4125 | .next = s_next, | ||
4126 | .stop = s_stop, | ||
4127 | .show = leaks_show, | ||
4128 | }; | ||
4129 | #endif | ||
3843 | #endif | 4130 | #endif |
3844 | 4131 | ||
3845 | /** | 4132 | /** |
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) | |||
294 | } | 294 | } |
295 | EXPORT_SYMBOL(kmem_cache_alloc); | 295 | EXPORT_SYMBOL(kmem_cache_alloc); |
296 | 296 | ||
297 | void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) | ||
298 | { | ||
299 | void *ret = kmem_cache_alloc(c, flags); | ||
300 | if (ret) | ||
301 | memset(ret, 0, c->size); | ||
302 | |||
303 | return ret; | ||
304 | } | ||
305 | EXPORT_SYMBOL(kmem_cache_zalloc); | ||
306 | |||
297 | void kmem_cache_free(struct kmem_cache *c, void *b) | 307 | void kmem_cache_free(struct kmem_cache *c, void *b) |
298 | { | 308 | { |
299 | if (c->dtor) | 309 | if (c->dtor) |
@@ -209,19 +209,18 @@ int lru_add_drain_all(void) | |||
209 | */ | 209 | */ |
210 | void fastcall __page_cache_release(struct page *page) | 210 | void fastcall __page_cache_release(struct page *page) |
211 | { | 211 | { |
212 | unsigned long flags; | 212 | if (PageLRU(page)) { |
213 | struct zone *zone = page_zone(page); | 213 | unsigned long flags; |
214 | struct zone *zone = page_zone(page); | ||
214 | 215 | ||
215 | spin_lock_irqsave(&zone->lru_lock, flags); | 216 | spin_lock_irqsave(&zone->lru_lock, flags); |
216 | if (TestClearPageLRU(page)) | 217 | BUG_ON(!PageLRU(page)); |
218 | __ClearPageLRU(page); | ||
217 | del_page_from_lru(zone, page); | 219 | del_page_from_lru(zone, page); |
218 | if (page_count(page) != 0) | 220 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
219 | page = NULL; | 221 | } |
220 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 222 | free_hot_page(page); |
221 | if (page) | ||
222 | free_hot_page(page); | ||
223 | } | 223 | } |
224 | |||
225 | EXPORT_SYMBOL(__page_cache_release); | 224 | EXPORT_SYMBOL(__page_cache_release); |
226 | 225 | ||
227 | /* | 226 | /* |
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold) | |||
245 | pagevec_init(&pages_to_free, cold); | 244 | pagevec_init(&pages_to_free, cold); |
246 | for (i = 0; i < nr; i++) { | 245 | for (i = 0; i < nr; i++) { |
247 | struct page *page = pages[i]; | 246 | struct page *page = pages[i]; |
248 | struct zone *pagezone; | ||
249 | 247 | ||
250 | if (unlikely(PageCompound(page))) { | 248 | if (unlikely(PageCompound(page))) { |
251 | if (zone) { | 249 | if (zone) { |
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold) | |||
259 | if (!put_page_testzero(page)) | 257 | if (!put_page_testzero(page)) |
260 | continue; | 258 | continue; |
261 | 259 | ||
262 | pagezone = page_zone(page); | 260 | if (PageLRU(page)) { |
263 | if (pagezone != zone) { | 261 | struct zone *pagezone = page_zone(page); |
264 | if (zone) | 262 | if (pagezone != zone) { |
265 | spin_unlock_irq(&zone->lru_lock); | 263 | if (zone) |
266 | zone = pagezone; | 264 | spin_unlock_irq(&zone->lru_lock); |
267 | spin_lock_irq(&zone->lru_lock); | 265 | zone = pagezone; |
268 | } | 266 | spin_lock_irq(&zone->lru_lock); |
269 | if (TestClearPageLRU(page)) | 267 | } |
268 | BUG_ON(!PageLRU(page)); | ||
269 | __ClearPageLRU(page); | ||
270 | del_page_from_lru(zone, page); | 270 | del_page_from_lru(zone, page); |
271 | if (page_count(page) == 0) { | 271 | } |
272 | if (!pagevec_add(&pages_to_free, page)) { | 272 | |
273 | if (!pagevec_add(&pages_to_free, page)) { | ||
274 | if (zone) { | ||
273 | spin_unlock_irq(&zone->lru_lock); | 275 | spin_unlock_irq(&zone->lru_lock); |
274 | __pagevec_free(&pages_to_free); | 276 | zone = NULL; |
275 | pagevec_reinit(&pages_to_free); | ||
276 | zone = NULL; /* No lock is held */ | ||
277 | } | 277 | } |
278 | } | 278 | __pagevec_free(&pages_to_free); |
279 | pagevec_reinit(&pages_to_free); | ||
280 | } | ||
279 | } | 281 | } |
280 | if (zone) | 282 | if (zone) |
281 | spin_unlock_irq(&zone->lru_lock); | 283 | spin_unlock_irq(&zone->lru_lock); |
@@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
343 | zone = pagezone; | 345 | zone = pagezone; |
344 | spin_lock_irq(&zone->lru_lock); | 346 | spin_lock_irq(&zone->lru_lock); |
345 | } | 347 | } |
346 | if (TestSetPageLRU(page)) | 348 | BUG_ON(PageLRU(page)); |
347 | BUG(); | 349 | SetPageLRU(page); |
348 | add_page_to_inactive_list(zone, page); | 350 | add_page_to_inactive_list(zone, page); |
349 | } | 351 | } |
350 | if (zone) | 352 | if (zone) |
@@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec) | |||
370 | zone = pagezone; | 372 | zone = pagezone; |
371 | spin_lock_irq(&zone->lru_lock); | 373 | spin_lock_irq(&zone->lru_lock); |
372 | } | 374 | } |
373 | if (TestSetPageLRU(page)) | 375 | BUG_ON(PageLRU(page)); |
374 | BUG(); | 376 | SetPageLRU(page); |
375 | if (TestSetPageActive(page)) | 377 | BUG_ON(PageActive(page)); |
376 | BUG(); | 378 | SetPageActive(page); |
377 | add_page_to_active_list(zone, page); | 379 | add_page_to_active_list(zone, page); |
378 | } | 380 | } |
379 | if (zone) | 381 | if (zone) |
@@ -510,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc) | |||
510 | 512 | ||
511 | spin_lock(&fbc->lock); | 513 | spin_lock(&fbc->lock); |
512 | ret = fbc->count; | 514 | ret = fbc->count; |
513 | for_each_cpu(cpu) { | 515 | for_each_possible_cpu(cpu) { |
514 | long *pcount = per_cpu_ptr(fbc->counters, cpu); | 516 | long *pcount = per_cpu_ptr(fbc->counters, cpu); |
515 | ret += *pcount; | 517 | ret += *pcount; |
516 | } | 518 | } |
diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3e1636..d7af296833fc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/pagevec.h> | 17 | #include <linux/pagevec.h> |
18 | #include <linux/migrate.h> | ||
18 | 19 | ||
19 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
20 | 21 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f9cf0d073b8..39aa9d129612 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; | |||
45 | 45 | ||
46 | struct swap_list_t swap_list = {-1, -1}; | 46 | struct swap_list_t swap_list = {-1, -1}; |
47 | 47 | ||
48 | struct swap_info_struct swap_info[MAX_SWAPFILES]; | 48 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; |
49 | 49 | ||
50 | static DEFINE_MUTEX(swapon_mutex); | 50 | static DEFINE_MUTEX(swapon_mutex); |
51 | 51 | ||
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
116 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 116 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
117 | else if (offset == last_in_cluster) { | 117 | else if (offset == last_in_cluster) { |
118 | spin_lock(&swap_lock); | 118 | spin_lock(&swap_lock); |
119 | si->cluster_next = offset-SWAPFILE_CLUSTER-1; | 119 | si->cluster_next = offset-SWAPFILE_CLUSTER+1; |
120 | goto cluster; | 120 | goto cluster; |
121 | } | 121 | } |
122 | if (unlikely(--latency_ration < 0)) { | 122 | if (unlikely(--latency_ration < 0)) { |
@@ -417,6 +417,61 @@ void free_swap_and_cache(swp_entry_t entry) | |||
417 | } | 417 | } |
418 | } | 418 | } |
419 | 419 | ||
420 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
421 | /* | ||
422 | * Find the swap type that corresponds to given device (if any) | ||
423 | * | ||
424 | * This is needed for software suspend and is done in such a way that inode | ||
425 | * aliasing is allowed. | ||
426 | */ | ||
427 | int swap_type_of(dev_t device) | ||
428 | { | ||
429 | int i; | ||
430 | |||
431 | spin_lock(&swap_lock); | ||
432 | for (i = 0; i < nr_swapfiles; i++) { | ||
433 | struct inode *inode; | ||
434 | |||
435 | if (!(swap_info[i].flags & SWP_WRITEOK)) | ||
436 | continue; | ||
437 | if (!device) { | ||
438 | spin_unlock(&swap_lock); | ||
439 | return i; | ||
440 | } | ||
441 | inode = swap_info->swap_file->f_dentry->d_inode; | ||
442 | if (S_ISBLK(inode->i_mode) && | ||
443 | device == MKDEV(imajor(inode), iminor(inode))) { | ||
444 | spin_unlock(&swap_lock); | ||
445 | return i; | ||
446 | } | ||
447 | } | ||
448 | spin_unlock(&swap_lock); | ||
449 | return -ENODEV; | ||
450 | } | ||
451 | |||
452 | /* | ||
453 | * Return either the total number of swap pages of given type, or the number | ||
454 | * of free pages of that type (depending on @free) | ||
455 | * | ||
456 | * This is needed for software suspend | ||
457 | */ | ||
458 | unsigned int count_swap_pages(int type, int free) | ||
459 | { | ||
460 | unsigned int n = 0; | ||
461 | |||
462 | if (type < nr_swapfiles) { | ||
463 | spin_lock(&swap_lock); | ||
464 | if (swap_info[type].flags & SWP_WRITEOK) { | ||
465 | n = swap_info[type].pages; | ||
466 | if (free) | ||
467 | n -= swap_info[type].inuse_pages; | ||
468 | } | ||
469 | spin_unlock(&swap_lock); | ||
470 | } | ||
471 | return n; | ||
472 | } | ||
473 | #endif | ||
474 | |||
420 | /* | 475 | /* |
421 | * No need to decide whether this PTE shares the swap entry with others, | 476 | * No need to decide whether this PTE shares the swap entry with others, |
422 | * just let do_wp_page work it out if a write is requested later - to | 477 | * just let do_wp_page work it out if a write is requested later - to |
@@ -1,20 +1,22 @@ | |||
1 | #include <linux/slab.h> | 1 | #include <linux/slab.h> |
2 | #include <linux/string.h> | 2 | #include <linux/string.h> |
3 | #include <linux/module.h> | 3 | #include <linux/module.h> |
4 | #include <linux/err.h> | ||
5 | #include <asm/uaccess.h> | ||
4 | 6 | ||
5 | /** | 7 | /** |
6 | * kzalloc - allocate memory. The memory is set to zero. | 8 | * __kzalloc - allocate memory. The memory is set to zero. |
7 | * @size: how many bytes of memory are required. | 9 | * @size: how many bytes of memory are required. |
8 | * @flags: the type of memory to allocate. | 10 | * @flags: the type of memory to allocate. |
9 | */ | 11 | */ |
10 | void *kzalloc(size_t size, gfp_t flags) | 12 | void *__kzalloc(size_t size, gfp_t flags) |
11 | { | 13 | { |
12 | void *ret = kmalloc(size, flags); | 14 | void *ret = ____kmalloc(size, flags); |
13 | if (ret) | 15 | if (ret) |
14 | memset(ret, 0, size); | 16 | memset(ret, 0, size); |
15 | return ret; | 17 | return ret; |
16 | } | 18 | } |
17 | EXPORT_SYMBOL(kzalloc); | 19 | EXPORT_SYMBOL(__kzalloc); |
18 | 20 | ||
19 | /* | 21 | /* |
20 | * kstrdup - allocate space for and copy an existing string | 22 | * kstrdup - allocate space for and copy an existing string |
@@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp) | |||
31 | return NULL; | 33 | return NULL; |
32 | 34 | ||
33 | len = strlen(s) + 1; | 35 | len = strlen(s) + 1; |
34 | buf = kmalloc(len, gfp); | 36 | buf = ____kmalloc(len, gfp); |
35 | if (buf) | 37 | if (buf) |
36 | memcpy(buf, s, len); | 38 | memcpy(buf, s, len); |
37 | return buf; | 39 | return buf; |
38 | } | 40 | } |
39 | EXPORT_SYMBOL(kstrdup); | 41 | EXPORT_SYMBOL(kstrdup); |
42 | |||
43 | /* | ||
44 | * strndup_user - duplicate an existing string from user space | ||
45 | * | ||
46 | * @s: The string to duplicate | ||
47 | * @n: Maximum number of bytes to copy, including the trailing NUL. | ||
48 | */ | ||
49 | char *strndup_user(const char __user *s, long n) | ||
50 | { | ||
51 | char *p; | ||
52 | long length; | ||
53 | |||
54 | length = strnlen_user(s, n); | ||
55 | |||
56 | if (!length) | ||
57 | return ERR_PTR(-EFAULT); | ||
58 | |||
59 | if (length > n) | ||
60 | return ERR_PTR(-EINVAL); | ||
61 | |||
62 | p = kmalloc(length, GFP_KERNEL); | ||
63 | |||
64 | if (!p) | ||
65 | return ERR_PTR(-ENOMEM); | ||
66 | |||
67 | if (copy_from_user(p, s, length)) { | ||
68 | kfree(p); | ||
69 | return ERR_PTR(-EFAULT); | ||
70 | } | ||
71 | |||
72 | p[length - 1] = '\0'; | ||
73 | |||
74 | return p; | ||
75 | } | ||
76 | EXPORT_SYMBOL(strndup_user); | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 4fe7e3aa02e2..acdf001d6941 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -33,39 +33,21 @@ | |||
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
35 | #include <linux/rwsem.h> | 35 | #include <linux/rwsem.h> |
36 | #include <linux/delay.h> | ||
36 | 37 | ||
37 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
38 | #include <asm/div64.h> | 39 | #include <asm/div64.h> |
39 | 40 | ||
40 | #include <linux/swapops.h> | 41 | #include <linux/swapops.h> |
41 | 42 | ||
42 | /* possible outcome of pageout() */ | 43 | #include "internal.h" |
43 | typedef enum { | ||
44 | /* failed to write page out, page is locked */ | ||
45 | PAGE_KEEP, | ||
46 | /* move page to the active list, page is locked */ | ||
47 | PAGE_ACTIVATE, | ||
48 | /* page has been sent to the disk successfully, page is unlocked */ | ||
49 | PAGE_SUCCESS, | ||
50 | /* page is clean and locked */ | ||
51 | PAGE_CLEAN, | ||
52 | } pageout_t; | ||
53 | 44 | ||
54 | struct scan_control { | 45 | struct scan_control { |
55 | /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ | ||
56 | unsigned long nr_to_scan; | ||
57 | |||
58 | /* Incremented by the number of inactive pages that were scanned */ | 46 | /* Incremented by the number of inactive pages that were scanned */ |
59 | unsigned long nr_scanned; | 47 | unsigned long nr_scanned; |
60 | 48 | ||
61 | /* Incremented by the number of pages reclaimed */ | ||
62 | unsigned long nr_reclaimed; | ||
63 | |||
64 | unsigned long nr_mapped; /* From page_state */ | 49 | unsigned long nr_mapped; /* From page_state */ |
65 | 50 | ||
66 | /* Ask shrink_caches, or shrink_zone to scan at this priority */ | ||
67 | unsigned int priority; | ||
68 | |||
69 | /* This context's GFP mask */ | 51 | /* This context's GFP mask */ |
70 | gfp_t gfp_mask; | 52 | gfp_t gfp_mask; |
71 | 53 | ||
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker); | |||
183 | * | 165 | * |
184 | * Returns the number of slab objects which we shrunk. | 166 | * Returns the number of slab objects which we shrunk. |
185 | */ | 167 | */ |
186 | int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) | 168 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, |
169 | unsigned long lru_pages) | ||
187 | { | 170 | { |
188 | struct shrinker *shrinker; | 171 | struct shrinker *shrinker; |
189 | int ret = 0; | 172 | unsigned long ret = 0; |
190 | 173 | ||
191 | if (scanned == 0) | 174 | if (scanned == 0) |
192 | scanned = SWAP_CLUSTER_MAX; | 175 | scanned = SWAP_CLUSTER_MAX; |
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping, | |||
306 | } | 289 | } |
307 | 290 | ||
308 | /* | 291 | /* |
309 | * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). | 292 | * pageout is called by shrink_page_list() for each dirty page. |
293 | * Calls ->writepage(). | ||
310 | */ | 294 | */ |
311 | static pageout_t pageout(struct page *page, struct address_space *mapping) | 295 | pageout_t pageout(struct page *page, struct address_space *mapping) |
312 | { | 296 | { |
313 | /* | 297 | /* |
314 | * If the page is dirty, only perform writeback if that write | 298 | * If the page is dirty, only perform writeback if that write |
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
376 | return PAGE_CLEAN; | 360 | return PAGE_CLEAN; |
377 | } | 361 | } |
378 | 362 | ||
379 | static int remove_mapping(struct address_space *mapping, struct page *page) | 363 | int remove_mapping(struct address_space *mapping, struct page *page) |
380 | { | 364 | { |
381 | if (!mapping) | 365 | if (!mapping) |
382 | return 0; /* truncate got there first */ | 366 | return 0; /* truncate got there first */ |
@@ -414,14 +398,15 @@ cannot_free: | |||
414 | } | 398 | } |
415 | 399 | ||
416 | /* | 400 | /* |
417 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed | 401 | * shrink_page_list() returns the number of reclaimed pages |
418 | */ | 402 | */ |
419 | static int shrink_list(struct list_head *page_list, struct scan_control *sc) | 403 | static unsigned long shrink_page_list(struct list_head *page_list, |
404 | struct scan_control *sc) | ||
420 | { | 405 | { |
421 | LIST_HEAD(ret_pages); | 406 | LIST_HEAD(ret_pages); |
422 | struct pagevec freed_pvec; | 407 | struct pagevec freed_pvec; |
423 | int pgactivate = 0; | 408 | int pgactivate = 0; |
424 | int reclaimed = 0; | 409 | unsigned long nr_reclaimed = 0; |
425 | 410 | ||
426 | cond_resched(); | 411 | cond_resched(); |
427 | 412 | ||
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
464 | * Anonymous process memory has backing store? | 449 | * Anonymous process memory has backing store? |
465 | * Try to allocate it some swap space here. | 450 | * Try to allocate it some swap space here. |
466 | */ | 451 | */ |
467 | if (PageAnon(page) && !PageSwapCache(page)) { | 452 | if (PageAnon(page) && !PageSwapCache(page)) |
468 | if (!sc->may_swap) | ||
469 | goto keep_locked; | ||
470 | if (!add_to_swap(page, GFP_ATOMIC)) | 453 | if (!add_to_swap(page, GFP_ATOMIC)) |
471 | goto activate_locked; | 454 | goto activate_locked; |
472 | } | ||
473 | #endif /* CONFIG_SWAP */ | 455 | #endif /* CONFIG_SWAP */ |
474 | 456 | ||
475 | mapping = page_mapping(page); | 457 | mapping = page_mapping(page); |
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
481 | * processes. Try to unmap it here. | 463 | * processes. Try to unmap it here. |
482 | */ | 464 | */ |
483 | if (page_mapped(page) && mapping) { | 465 | if (page_mapped(page) && mapping) { |
484 | /* | ||
485 | * No unmapping if we do not swap | ||
486 | */ | ||
487 | if (!sc->may_swap) | ||
488 | goto keep_locked; | ||
489 | |||
490 | switch (try_to_unmap(page, 0)) { | 466 | switch (try_to_unmap(page, 0)) { |
491 | case SWAP_FAIL: | 467 | case SWAP_FAIL: |
492 | goto activate_locked; | 468 | goto activate_locked; |
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
561 | 537 | ||
562 | free_it: | 538 | free_it: |
563 | unlock_page(page); | 539 | unlock_page(page); |
564 | reclaimed++; | 540 | nr_reclaimed++; |
565 | if (!pagevec_add(&freed_pvec, page)) | 541 | if (!pagevec_add(&freed_pvec, page)) |
566 | __pagevec_release_nonlru(&freed_pvec); | 542 | __pagevec_release_nonlru(&freed_pvec); |
567 | continue; | 543 | continue; |
@@ -579,483 +555,8 @@ keep: | |||
579 | if (pagevec_count(&freed_pvec)) | 555 | if (pagevec_count(&freed_pvec)) |
580 | __pagevec_release_nonlru(&freed_pvec); | 556 | __pagevec_release_nonlru(&freed_pvec); |
581 | mod_page_state(pgactivate, pgactivate); | 557 | mod_page_state(pgactivate, pgactivate); |
582 | sc->nr_reclaimed += reclaimed; | 558 | return nr_reclaimed; |
583 | return reclaimed; | ||
584 | } | ||
585 | |||
586 | #ifdef CONFIG_MIGRATION | ||
587 | static inline void move_to_lru(struct page *page) | ||
588 | { | ||
589 | list_del(&page->lru); | ||
590 | if (PageActive(page)) { | ||
591 | /* | ||
592 | * lru_cache_add_active checks that | ||
593 | * the PG_active bit is off. | ||
594 | */ | ||
595 | ClearPageActive(page); | ||
596 | lru_cache_add_active(page); | ||
597 | } else { | ||
598 | lru_cache_add(page); | ||
599 | } | ||
600 | put_page(page); | ||
601 | } | ||
602 | |||
603 | /* | ||
604 | * Add isolated pages on the list back to the LRU. | ||
605 | * | ||
606 | * returns the number of pages put back. | ||
607 | */ | ||
608 | int putback_lru_pages(struct list_head *l) | ||
609 | { | ||
610 | struct page *page; | ||
611 | struct page *page2; | ||
612 | int count = 0; | ||
613 | |||
614 | list_for_each_entry_safe(page, page2, l, lru) { | ||
615 | move_to_lru(page); | ||
616 | count++; | ||
617 | } | ||
618 | return count; | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | * Non migratable page | ||
623 | */ | ||
624 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
625 | { | ||
626 | return -EIO; | ||
627 | } | ||
628 | EXPORT_SYMBOL(fail_migrate_page); | ||
629 | |||
630 | /* | ||
631 | * swapout a single page | ||
632 | * page is locked upon entry, unlocked on exit | ||
633 | */ | ||
634 | static int swap_page(struct page *page) | ||
635 | { | ||
636 | struct address_space *mapping = page_mapping(page); | ||
637 | |||
638 | if (page_mapped(page) && mapping) | ||
639 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | ||
640 | goto unlock_retry; | ||
641 | |||
642 | if (PageDirty(page)) { | ||
643 | /* Page is dirty, try to write it out here */ | ||
644 | switch(pageout(page, mapping)) { | ||
645 | case PAGE_KEEP: | ||
646 | case PAGE_ACTIVATE: | ||
647 | goto unlock_retry; | ||
648 | |||
649 | case PAGE_SUCCESS: | ||
650 | goto retry; | ||
651 | |||
652 | case PAGE_CLEAN: | ||
653 | ; /* try to free the page below */ | ||
654 | } | ||
655 | } | ||
656 | |||
657 | if (PagePrivate(page)) { | ||
658 | if (!try_to_release_page(page, GFP_KERNEL) || | ||
659 | (!mapping && page_count(page) == 1)) | ||
660 | goto unlock_retry; | ||
661 | } | ||
662 | |||
663 | if (remove_mapping(mapping, page)) { | ||
664 | /* Success */ | ||
665 | unlock_page(page); | ||
666 | return 0; | ||
667 | } | ||
668 | |||
669 | unlock_retry: | ||
670 | unlock_page(page); | ||
671 | |||
672 | retry: | ||
673 | return -EAGAIN; | ||
674 | } | 559 | } |
675 | EXPORT_SYMBOL(swap_page); | ||
676 | |||
677 | /* | ||
678 | * Page migration was first developed in the context of the memory hotplug | ||
679 | * project. The main authors of the migration code are: | ||
680 | * | ||
681 | * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> | ||
682 | * Hirokazu Takahashi <taka@valinux.co.jp> | ||
683 | * Dave Hansen <haveblue@us.ibm.com> | ||
684 | * Christoph Lameter <clameter@sgi.com> | ||
685 | */ | ||
686 | |||
687 | /* | ||
688 | * Remove references for a page and establish the new page with the correct | ||
689 | * basic settings to be able to stop accesses to the page. | ||
690 | */ | ||
691 | int migrate_page_remove_references(struct page *newpage, | ||
692 | struct page *page, int nr_refs) | ||
693 | { | ||
694 | struct address_space *mapping = page_mapping(page); | ||
695 | struct page **radix_pointer; | ||
696 | |||
697 | /* | ||
698 | * Avoid doing any of the following work if the page count | ||
699 | * indicates that the page is in use or truncate has removed | ||
700 | * the page. | ||
701 | */ | ||
702 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
703 | return -EAGAIN; | ||
704 | |||
705 | /* | ||
706 | * Establish swap ptes for anonymous pages or destroy pte | ||
707 | * maps for files. | ||
708 | * | ||
709 | * In order to reestablish file backed mappings the fault handlers | ||
710 | * will take the radix tree_lock which may then be used to stop | ||
711 | * processses from accessing this page until the new page is ready. | ||
712 | * | ||
713 | * A process accessing via a swap pte (an anonymous page) will take a | ||
714 | * page_lock on the old page which will block the process until the | ||
715 | * migration attempt is complete. At that time the PageSwapCache bit | ||
716 | * will be examined. If the page was migrated then the PageSwapCache | ||
717 | * bit will be clear and the operation to retrieve the page will be | ||
718 | * retried which will find the new page in the radix tree. Then a new | ||
719 | * direct mapping may be generated based on the radix tree contents. | ||
720 | * | ||
721 | * If the page was not migrated then the PageSwapCache bit | ||
722 | * is still set and the operation may continue. | ||
723 | */ | ||
724 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
725 | /* A vma has VM_LOCKED set -> Permanent failure */ | ||
726 | return -EPERM; | ||
727 | |||
728 | /* | ||
729 | * Give up if we were unable to remove all mappings. | ||
730 | */ | ||
731 | if (page_mapcount(page)) | ||
732 | return -EAGAIN; | ||
733 | |||
734 | write_lock_irq(&mapping->tree_lock); | ||
735 | |||
736 | radix_pointer = (struct page **)radix_tree_lookup_slot( | ||
737 | &mapping->page_tree, | ||
738 | page_index(page)); | ||
739 | |||
740 | if (!page_mapping(page) || page_count(page) != nr_refs || | ||
741 | *radix_pointer != page) { | ||
742 | write_unlock_irq(&mapping->tree_lock); | ||
743 | return -EAGAIN; | ||
744 | } | ||
745 | |||
746 | /* | ||
747 | * Now we know that no one else is looking at the page. | ||
748 | * | ||
749 | * Certain minimal information about a page must be available | ||
750 | * in order for other subsystems to properly handle the page if they | ||
751 | * find it through the radix tree update before we are finished | ||
752 | * copying the page. | ||
753 | */ | ||
754 | get_page(newpage); | ||
755 | newpage->index = page->index; | ||
756 | newpage->mapping = page->mapping; | ||
757 | if (PageSwapCache(page)) { | ||
758 | SetPageSwapCache(newpage); | ||
759 | set_page_private(newpage, page_private(page)); | ||
760 | } | ||
761 | |||
762 | *radix_pointer = newpage; | ||
763 | __put_page(page); | ||
764 | write_unlock_irq(&mapping->tree_lock); | ||
765 | |||
766 | return 0; | ||
767 | } | ||
768 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
769 | |||
770 | /* | ||
771 | * Copy the page to its new location | ||
772 | */ | ||
773 | void migrate_page_copy(struct page *newpage, struct page *page) | ||
774 | { | ||
775 | copy_highpage(newpage, page); | ||
776 | |||
777 | if (PageError(page)) | ||
778 | SetPageError(newpage); | ||
779 | if (PageReferenced(page)) | ||
780 | SetPageReferenced(newpage); | ||
781 | if (PageUptodate(page)) | ||
782 | SetPageUptodate(newpage); | ||
783 | if (PageActive(page)) | ||
784 | SetPageActive(newpage); | ||
785 | if (PageChecked(page)) | ||
786 | SetPageChecked(newpage); | ||
787 | if (PageMappedToDisk(page)) | ||
788 | SetPageMappedToDisk(newpage); | ||
789 | |||
790 | if (PageDirty(page)) { | ||
791 | clear_page_dirty_for_io(page); | ||
792 | set_page_dirty(newpage); | ||
793 | } | ||
794 | |||
795 | ClearPageSwapCache(page); | ||
796 | ClearPageActive(page); | ||
797 | ClearPagePrivate(page); | ||
798 | set_page_private(page, 0); | ||
799 | page->mapping = NULL; | ||
800 | |||
801 | /* | ||
802 | * If any waiters have accumulated on the new page then | ||
803 | * wake them up. | ||
804 | */ | ||
805 | if (PageWriteback(newpage)) | ||
806 | end_page_writeback(newpage); | ||
807 | } | ||
808 | EXPORT_SYMBOL(migrate_page_copy); | ||
809 | |||
810 | /* | ||
811 | * Common logic to directly migrate a single page suitable for | ||
812 | * pages that do not use PagePrivate. | ||
813 | * | ||
814 | * Pages are locked upon entry and exit. | ||
815 | */ | ||
816 | int migrate_page(struct page *newpage, struct page *page) | ||
817 | { | ||
818 | int rc; | ||
819 | |||
820 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | ||
821 | |||
822 | rc = migrate_page_remove_references(newpage, page, 2); | ||
823 | |||
824 | if (rc) | ||
825 | return rc; | ||
826 | |||
827 | migrate_page_copy(newpage, page); | ||
828 | |||
829 | /* | ||
830 | * Remove auxiliary swap entries and replace | ||
831 | * them with real ptes. | ||
832 | * | ||
833 | * Note that a real pte entry will allow processes that are not | ||
834 | * waiting on the page lock to use the new page via the page tables | ||
835 | * before the new page is unlocked. | ||
836 | */ | ||
837 | remove_from_swap(newpage); | ||
838 | return 0; | ||
839 | } | ||
840 | EXPORT_SYMBOL(migrate_page); | ||
841 | |||
842 | /* | ||
843 | * migrate_pages | ||
844 | * | ||
845 | * Two lists are passed to this function. The first list | ||
846 | * contains the pages isolated from the LRU to be migrated. | ||
847 | * The second list contains new pages that the pages isolated | ||
848 | * can be moved to. If the second list is NULL then all | ||
849 | * pages are swapped out. | ||
850 | * | ||
851 | * The function returns after 10 attempts or if no pages | ||
852 | * are movable anymore because to has become empty | ||
853 | * or no retryable pages exist anymore. | ||
854 | * | ||
855 | * Return: Number of pages not migrated when "to" ran empty. | ||
856 | */ | ||
857 | int migrate_pages(struct list_head *from, struct list_head *to, | ||
858 | struct list_head *moved, struct list_head *failed) | ||
859 | { | ||
860 | int retry; | ||
861 | int nr_failed = 0; | ||
862 | int pass = 0; | ||
863 | struct page *page; | ||
864 | struct page *page2; | ||
865 | int swapwrite = current->flags & PF_SWAPWRITE; | ||
866 | int rc; | ||
867 | |||
868 | if (!swapwrite) | ||
869 | current->flags |= PF_SWAPWRITE; | ||
870 | |||
871 | redo: | ||
872 | retry = 0; | ||
873 | |||
874 | list_for_each_entry_safe(page, page2, from, lru) { | ||
875 | struct page *newpage = NULL; | ||
876 | struct address_space *mapping; | ||
877 | |||
878 | cond_resched(); | ||
879 | |||
880 | rc = 0; | ||
881 | if (page_count(page) == 1) | ||
882 | /* page was freed from under us. So we are done. */ | ||
883 | goto next; | ||
884 | |||
885 | if (to && list_empty(to)) | ||
886 | break; | ||
887 | |||
888 | /* | ||
889 | * Skip locked pages during the first two passes to give the | ||
890 | * functions holding the lock time to release the page. Later we | ||
891 | * use lock_page() to have a higher chance of acquiring the | ||
892 | * lock. | ||
893 | */ | ||
894 | rc = -EAGAIN; | ||
895 | if (pass > 2) | ||
896 | lock_page(page); | ||
897 | else | ||
898 | if (TestSetPageLocked(page)) | ||
899 | goto next; | ||
900 | |||
901 | /* | ||
902 | * Only wait on writeback if we have already done a pass where | ||
903 | * we we may have triggered writeouts for lots of pages. | ||
904 | */ | ||
905 | if (pass > 0) { | ||
906 | wait_on_page_writeback(page); | ||
907 | } else { | ||
908 | if (PageWriteback(page)) | ||
909 | goto unlock_page; | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Anonymous pages must have swap cache references otherwise | ||
914 | * the information contained in the page maps cannot be | ||
915 | * preserved. | ||
916 | */ | ||
917 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
918 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
919 | rc = -ENOMEM; | ||
920 | goto unlock_page; | ||
921 | } | ||
922 | } | ||
923 | |||
924 | if (!to) { | ||
925 | rc = swap_page(page); | ||
926 | goto next; | ||
927 | } | ||
928 | |||
929 | newpage = lru_to_page(to); | ||
930 | lock_page(newpage); | ||
931 | |||
932 | /* | ||
933 | * Pages are properly locked and writeback is complete. | ||
934 | * Try to migrate the page. | ||
935 | */ | ||
936 | mapping = page_mapping(page); | ||
937 | if (!mapping) | ||
938 | goto unlock_both; | ||
939 | |||
940 | if (mapping->a_ops->migratepage) { | ||
941 | /* | ||
942 | * Most pages have a mapping and most filesystems | ||
943 | * should provide a migration function. Anonymous | ||
944 | * pages are part of swap space which also has its | ||
945 | * own migration function. This is the most common | ||
946 | * path for page migration. | ||
947 | */ | ||
948 | rc = mapping->a_ops->migratepage(newpage, page); | ||
949 | goto unlock_both; | ||
950 | } | ||
951 | |||
952 | /* | ||
953 | * Default handling if a filesystem does not provide | ||
954 | * a migration function. We can only migrate clean | ||
955 | * pages so try to write out any dirty pages first. | ||
956 | */ | ||
957 | if (PageDirty(page)) { | ||
958 | switch (pageout(page, mapping)) { | ||
959 | case PAGE_KEEP: | ||
960 | case PAGE_ACTIVATE: | ||
961 | goto unlock_both; | ||
962 | |||
963 | case PAGE_SUCCESS: | ||
964 | unlock_page(newpage); | ||
965 | goto next; | ||
966 | |||
967 | case PAGE_CLEAN: | ||
968 | ; /* try to migrate the page below */ | ||
969 | } | ||
970 | } | ||
971 | |||
972 | /* | ||
973 | * Buffers are managed in a filesystem specific way. | ||
974 | * We must have no buffers or drop them. | ||
975 | */ | ||
976 | if (!page_has_buffers(page) || | ||
977 | try_to_release_page(page, GFP_KERNEL)) { | ||
978 | rc = migrate_page(newpage, page); | ||
979 | goto unlock_both; | ||
980 | } | ||
981 | |||
982 | /* | ||
983 | * On early passes with mapped pages simply | ||
984 | * retry. There may be a lock held for some | ||
985 | * buffers that may go away. Later | ||
986 | * swap them out. | ||
987 | */ | ||
988 | if (pass > 4) { | ||
989 | /* | ||
990 | * Persistently unable to drop buffers..... As a | ||
991 | * measure of last resort we fall back to | ||
992 | * swap_page(). | ||
993 | */ | ||
994 | unlock_page(newpage); | ||
995 | newpage = NULL; | ||
996 | rc = swap_page(page); | ||
997 | goto next; | ||
998 | } | ||
999 | |||
1000 | unlock_both: | ||
1001 | unlock_page(newpage); | ||
1002 | |||
1003 | unlock_page: | ||
1004 | unlock_page(page); | ||
1005 | |||
1006 | next: | ||
1007 | if (rc == -EAGAIN) { | ||
1008 | retry++; | ||
1009 | } else if (rc) { | ||
1010 | /* Permanent failure */ | ||
1011 | list_move(&page->lru, failed); | ||
1012 | nr_failed++; | ||
1013 | } else { | ||
1014 | if (newpage) { | ||
1015 | /* Successful migration. Return page to LRU */ | ||
1016 | move_to_lru(newpage); | ||
1017 | } | ||
1018 | list_move(&page->lru, moved); | ||
1019 | } | ||
1020 | } | ||
1021 | if (retry && pass++ < 10) | ||
1022 | goto redo; | ||
1023 | |||
1024 | if (!swapwrite) | ||
1025 | current->flags &= ~PF_SWAPWRITE; | ||
1026 | |||
1027 | return nr_failed + retry; | ||
1028 | } | ||
1029 | |||
1030 | /* | ||
1031 | * Isolate one page from the LRU lists and put it on the | ||
1032 | * indicated list with elevated refcount. | ||
1033 | * | ||
1034 | * Result: | ||
1035 | * 0 = page not on LRU list | ||
1036 | * 1 = page removed from LRU list and added to the specified list. | ||
1037 | */ | ||
1038 | int isolate_lru_page(struct page *page) | ||
1039 | { | ||
1040 | int ret = 0; | ||
1041 | |||
1042 | if (PageLRU(page)) { | ||
1043 | struct zone *zone = page_zone(page); | ||
1044 | spin_lock_irq(&zone->lru_lock); | ||
1045 | if (TestClearPageLRU(page)) { | ||
1046 | ret = 1; | ||
1047 | get_page(page); | ||
1048 | if (PageActive(page)) | ||
1049 | del_page_from_active_list(zone, page); | ||
1050 | else | ||
1051 | del_page_from_inactive_list(zone, page); | ||
1052 | } | ||
1053 | spin_unlock_irq(&zone->lru_lock); | ||
1054 | } | ||
1055 | |||
1056 | return ret; | ||
1057 | } | ||
1058 | #endif | ||
1059 | 560 | ||
1060 | /* | 561 | /* |
1061 | * zone->lru_lock is heavily contended. Some of the functions that | 562 | * zone->lru_lock is heavily contended. Some of the functions that |
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page) | |||
1074 | * | 575 | * |
1075 | * returns how many pages were moved onto *@dst. | 576 | * returns how many pages were moved onto *@dst. |
1076 | */ | 577 | */ |
1077 | static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | 578 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1078 | struct list_head *dst, int *scanned) | 579 | struct list_head *src, struct list_head *dst, |
580 | unsigned long *scanned) | ||
1079 | { | 581 | { |
1080 | int nr_taken = 0; | 582 | unsigned long nr_taken = 0; |
1081 | struct page *page; | 583 | struct page *page; |
1082 | int scan = 0; | 584 | unsigned long scan; |
1083 | 585 | ||
1084 | while (scan++ < nr_to_scan && !list_empty(src)) { | 586 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { |
587 | struct list_head *target; | ||
1085 | page = lru_to_page(src); | 588 | page = lru_to_page(src); |
1086 | prefetchw_prev_lru_page(page, src, flags); | 589 | prefetchw_prev_lru_page(page, src, flags); |
1087 | 590 | ||
1088 | if (!TestClearPageLRU(page)) | 591 | BUG_ON(!PageLRU(page)); |
1089 | BUG(); | 592 | |
1090 | list_del(&page->lru); | 593 | list_del(&page->lru); |
1091 | if (get_page_testone(page)) { | 594 | target = src; |
595 | if (likely(get_page_unless_zero(page))) { | ||
1092 | /* | 596 | /* |
1093 | * It is being freed elsewhere | 597 | * Be careful not to clear PageLRU until after we're |
598 | * sure the page is not being freed elsewhere -- the | ||
599 | * page release code relies on it. | ||
1094 | */ | 600 | */ |
1095 | __put_page(page); | 601 | ClearPageLRU(page); |
1096 | SetPageLRU(page); | 602 | target = dst; |
1097 | list_add(&page->lru, src); | ||
1098 | continue; | ||
1099 | } else { | ||
1100 | list_add(&page->lru, dst); | ||
1101 | nr_taken++; | 603 | nr_taken++; |
1102 | } | 604 | } /* else it is being freed elsewhere */ |
605 | |||
606 | list_add(&page->lru, target); | ||
1103 | } | 607 | } |
1104 | 608 | ||
1105 | *scanned = scan; | 609 | *scanned = scan; |
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | |||
1107 | } | 611 | } |
1108 | 612 | ||
1109 | /* | 613 | /* |
1110 | * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed | 614 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
615 | * of reclaimed pages | ||
1111 | */ | 616 | */ |
1112 | static void shrink_cache(struct zone *zone, struct scan_control *sc) | 617 | static unsigned long shrink_inactive_list(unsigned long max_scan, |
618 | struct zone *zone, struct scan_control *sc) | ||
1113 | { | 619 | { |
1114 | LIST_HEAD(page_list); | 620 | LIST_HEAD(page_list); |
1115 | struct pagevec pvec; | 621 | struct pagevec pvec; |
1116 | int max_scan = sc->nr_to_scan; | 622 | unsigned long nr_scanned = 0; |
623 | unsigned long nr_reclaimed = 0; | ||
1117 | 624 | ||
1118 | pagevec_init(&pvec, 1); | 625 | pagevec_init(&pvec, 1); |
1119 | 626 | ||
1120 | lru_add_drain(); | 627 | lru_add_drain(); |
1121 | spin_lock_irq(&zone->lru_lock); | 628 | spin_lock_irq(&zone->lru_lock); |
1122 | while (max_scan > 0) { | 629 | do { |
1123 | struct page *page; | 630 | struct page *page; |
1124 | int nr_taken; | 631 | unsigned long nr_taken; |
1125 | int nr_scan; | 632 | unsigned long nr_scan; |
1126 | int nr_freed; | 633 | unsigned long nr_freed; |
1127 | 634 | ||
1128 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, | 635 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, |
1129 | &zone->inactive_list, | 636 | &zone->inactive_list, |
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
1132 | zone->pages_scanned += nr_scan; | 639 | zone->pages_scanned += nr_scan; |
1133 | spin_unlock_irq(&zone->lru_lock); | 640 | spin_unlock_irq(&zone->lru_lock); |
1134 | 641 | ||
1135 | if (nr_taken == 0) | 642 | nr_scanned += nr_scan; |
1136 | goto done; | 643 | nr_freed = shrink_page_list(&page_list, sc); |
1137 | 644 | nr_reclaimed += nr_freed; | |
1138 | max_scan -= nr_scan; | ||
1139 | nr_freed = shrink_list(&page_list, sc); | ||
1140 | |||
1141 | local_irq_disable(); | 645 | local_irq_disable(); |
1142 | if (current_is_kswapd()) { | 646 | if (current_is_kswapd()) { |
1143 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | 647 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); |
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
1146 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | 650 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); |
1147 | __mod_page_state_zone(zone, pgsteal, nr_freed); | 651 | __mod_page_state_zone(zone, pgsteal, nr_freed); |
1148 | 652 | ||
653 | if (nr_taken == 0) | ||
654 | goto done; | ||
655 | |||
1149 | spin_lock(&zone->lru_lock); | 656 | spin_lock(&zone->lru_lock); |
1150 | /* | 657 | /* |
1151 | * Put back any unfreeable pages. | 658 | * Put back any unfreeable pages. |
1152 | */ | 659 | */ |
1153 | while (!list_empty(&page_list)) { | 660 | while (!list_empty(&page_list)) { |
1154 | page = lru_to_page(&page_list); | 661 | page = lru_to_page(&page_list); |
1155 | if (TestSetPageLRU(page)) | 662 | BUG_ON(PageLRU(page)); |
1156 | BUG(); | 663 | SetPageLRU(page); |
1157 | list_del(&page->lru); | 664 | list_del(&page->lru); |
1158 | if (PageActive(page)) | 665 | if (PageActive(page)) |
1159 | add_page_to_active_list(zone, page); | 666 | add_page_to_active_list(zone, page); |
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
1165 | spin_lock_irq(&zone->lru_lock); | 672 | spin_lock_irq(&zone->lru_lock); |
1166 | } | 673 | } |
1167 | } | 674 | } |
1168 | } | 675 | } while (nr_scanned < max_scan); |
1169 | spin_unlock_irq(&zone->lru_lock); | 676 | spin_unlock(&zone->lru_lock); |
1170 | done: | 677 | done: |
678 | local_irq_enable(); | ||
1171 | pagevec_release(&pvec); | 679 | pagevec_release(&pvec); |
680 | return nr_reclaimed; | ||
1172 | } | 681 | } |
1173 | 682 | ||
1174 | /* | 683 | /* |
@@ -1188,13 +697,12 @@ done: | |||
1188 | * The downside is that we have to touch page->_count against each page. | 697 | * The downside is that we have to touch page->_count against each page. |
1189 | * But we had to alter page->flags anyway. | 698 | * But we had to alter page->flags anyway. |
1190 | */ | 699 | */ |
1191 | static void | 700 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
1192 | refill_inactive_zone(struct zone *zone, struct scan_control *sc) | 701 | struct scan_control *sc) |
1193 | { | 702 | { |
1194 | int pgmoved; | 703 | unsigned long pgmoved; |
1195 | int pgdeactivate = 0; | 704 | int pgdeactivate = 0; |
1196 | int pgscanned; | 705 | unsigned long pgscanned; |
1197 | int nr_pages = sc->nr_to_scan; | ||
1198 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 706 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
1199 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ | 707 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ |
1200 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ | 708 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ |
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1202 | struct pagevec pvec; | 710 | struct pagevec pvec; |
1203 | int reclaim_mapped = 0; | 711 | int reclaim_mapped = 0; |
1204 | 712 | ||
1205 | if (unlikely(sc->may_swap)) { | 713 | if (sc->may_swap) { |
1206 | long mapped_ratio; | 714 | long mapped_ratio; |
1207 | long distress; | 715 | long distress; |
1208 | long swap_tendency; | 716 | long swap_tendency; |
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1272 | while (!list_empty(&l_inactive)) { | 780 | while (!list_empty(&l_inactive)) { |
1273 | page = lru_to_page(&l_inactive); | 781 | page = lru_to_page(&l_inactive); |
1274 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 782 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
1275 | if (TestSetPageLRU(page)) | 783 | BUG_ON(PageLRU(page)); |
1276 | BUG(); | 784 | SetPageLRU(page); |
1277 | if (!TestClearPageActive(page)) | 785 | BUG_ON(!PageActive(page)); |
1278 | BUG(); | 786 | ClearPageActive(page); |
787 | |||
1279 | list_move(&page->lru, &zone->inactive_list); | 788 | list_move(&page->lru, &zone->inactive_list); |
1280 | pgmoved++; | 789 | pgmoved++; |
1281 | if (!pagevec_add(&pvec, page)) { | 790 | if (!pagevec_add(&pvec, page)) { |
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1301 | while (!list_empty(&l_active)) { | 810 | while (!list_empty(&l_active)) { |
1302 | page = lru_to_page(&l_active); | 811 | page = lru_to_page(&l_active); |
1303 | prefetchw_prev_lru_page(page, &l_active, flags); | 812 | prefetchw_prev_lru_page(page, &l_active, flags); |
1304 | if (TestSetPageLRU(page)) | 813 | BUG_ON(PageLRU(page)); |
1305 | BUG(); | 814 | SetPageLRU(page); |
1306 | BUG_ON(!PageActive(page)); | 815 | BUG_ON(!PageActive(page)); |
1307 | list_move(&page->lru, &zone->active_list); | 816 | list_move(&page->lru, &zone->active_list); |
1308 | pgmoved++; | 817 | pgmoved++; |
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1327 | /* | 836 | /* |
1328 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 837 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1329 | */ | 838 | */ |
1330 | static void | 839 | static unsigned long shrink_zone(int priority, struct zone *zone, |
1331 | shrink_zone(struct zone *zone, struct scan_control *sc) | 840 | struct scan_control *sc) |
1332 | { | 841 | { |
1333 | unsigned long nr_active; | 842 | unsigned long nr_active; |
1334 | unsigned long nr_inactive; | 843 | unsigned long nr_inactive; |
844 | unsigned long nr_to_scan; | ||
845 | unsigned long nr_reclaimed = 0; | ||
1335 | 846 | ||
1336 | atomic_inc(&zone->reclaim_in_progress); | 847 | atomic_inc(&zone->reclaim_in_progress); |
1337 | 848 | ||
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
1339 | * Add one to `nr_to_scan' just to make sure that the kernel will | 850 | * Add one to `nr_to_scan' just to make sure that the kernel will |
1340 | * slowly sift through the active list. | 851 | * slowly sift through the active list. |
1341 | */ | 852 | */ |
1342 | zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; | 853 | zone->nr_scan_active += (zone->nr_active >> priority) + 1; |
1343 | nr_active = zone->nr_scan_active; | 854 | nr_active = zone->nr_scan_active; |
1344 | if (nr_active >= sc->swap_cluster_max) | 855 | if (nr_active >= sc->swap_cluster_max) |
1345 | zone->nr_scan_active = 0; | 856 | zone->nr_scan_active = 0; |
1346 | else | 857 | else |
1347 | nr_active = 0; | 858 | nr_active = 0; |
1348 | 859 | ||
1349 | zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; | 860 | zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; |
1350 | nr_inactive = zone->nr_scan_inactive; | 861 | nr_inactive = zone->nr_scan_inactive; |
1351 | if (nr_inactive >= sc->swap_cluster_max) | 862 | if (nr_inactive >= sc->swap_cluster_max) |
1352 | zone->nr_scan_inactive = 0; | 863 | zone->nr_scan_inactive = 0; |
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
1355 | 866 | ||
1356 | while (nr_active || nr_inactive) { | 867 | while (nr_active || nr_inactive) { |
1357 | if (nr_active) { | 868 | if (nr_active) { |
1358 | sc->nr_to_scan = min(nr_active, | 869 | nr_to_scan = min(nr_active, |
1359 | (unsigned long)sc->swap_cluster_max); | 870 | (unsigned long)sc->swap_cluster_max); |
1360 | nr_active -= sc->nr_to_scan; | 871 | nr_active -= nr_to_scan; |
1361 | refill_inactive_zone(zone, sc); | 872 | shrink_active_list(nr_to_scan, zone, sc); |
1362 | } | 873 | } |
1363 | 874 | ||
1364 | if (nr_inactive) { | 875 | if (nr_inactive) { |
1365 | sc->nr_to_scan = min(nr_inactive, | 876 | nr_to_scan = min(nr_inactive, |
1366 | (unsigned long)sc->swap_cluster_max); | 877 | (unsigned long)sc->swap_cluster_max); |
1367 | nr_inactive -= sc->nr_to_scan; | 878 | nr_inactive -= nr_to_scan; |
1368 | shrink_cache(zone, sc); | 879 | nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, |
880 | sc); | ||
1369 | } | 881 | } |
1370 | } | 882 | } |
1371 | 883 | ||
1372 | throttle_vm_writeout(); | 884 | throttle_vm_writeout(); |
1373 | 885 | ||
1374 | atomic_dec(&zone->reclaim_in_progress); | 886 | atomic_dec(&zone->reclaim_in_progress); |
887 | return nr_reclaimed; | ||
1375 | } | 888 | } |
1376 | 889 | ||
1377 | /* | 890 | /* |
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
1390 | * If a zone is deemed to be full of pinned pages then just give it a light | 903 | * If a zone is deemed to be full of pinned pages then just give it a light |
1391 | * scan then give up on it. | 904 | * scan then give up on it. |
1392 | */ | 905 | */ |
1393 | static void | 906 | static unsigned long shrink_zones(int priority, struct zone **zones, |
1394 | shrink_caches(struct zone **zones, struct scan_control *sc) | 907 | struct scan_control *sc) |
1395 | { | 908 | { |
909 | unsigned long nr_reclaimed = 0; | ||
1396 | int i; | 910 | int i; |
1397 | 911 | ||
1398 | for (i = 0; zones[i] != NULL; i++) { | 912 | for (i = 0; zones[i] != NULL; i++) { |
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
1404 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 918 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
1405 | continue; | 919 | continue; |
1406 | 920 | ||
1407 | zone->temp_priority = sc->priority; | 921 | zone->temp_priority = priority; |
1408 | if (zone->prev_priority > sc->priority) | 922 | if (zone->prev_priority > priority) |
1409 | zone->prev_priority = sc->priority; | 923 | zone->prev_priority = priority; |
1410 | 924 | ||
1411 | if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) | 925 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1412 | continue; /* Let kswapd poll it */ | 926 | continue; /* Let kswapd poll it */ |
1413 | 927 | ||
1414 | shrink_zone(zone, sc); | 928 | nr_reclaimed += shrink_zone(priority, zone, sc); |
1415 | } | 929 | } |
930 | return nr_reclaimed; | ||
1416 | } | 931 | } |
1417 | 932 | ||
1418 | /* | 933 | /* |
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
1428 | * holds filesystem locks which prevent writeout this might not work, and the | 943 | * holds filesystem locks which prevent writeout this might not work, and the |
1429 | * allocation attempt will fail. | 944 | * allocation attempt will fail. |
1430 | */ | 945 | */ |
1431 | int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | 946 | unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) |
1432 | { | 947 | { |
1433 | int priority; | 948 | int priority; |
1434 | int ret = 0; | 949 | int ret = 0; |
1435 | int total_scanned = 0, total_reclaimed = 0; | 950 | unsigned long total_scanned = 0; |
951 | unsigned long nr_reclaimed = 0; | ||
1436 | struct reclaim_state *reclaim_state = current->reclaim_state; | 952 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1437 | struct scan_control sc; | ||
1438 | unsigned long lru_pages = 0; | 953 | unsigned long lru_pages = 0; |
1439 | int i; | 954 | int i; |
1440 | 955 | struct scan_control sc = { | |
1441 | sc.gfp_mask = gfp_mask; | 956 | .gfp_mask = gfp_mask, |
1442 | sc.may_writepage = !laptop_mode; | 957 | .may_writepage = !laptop_mode, |
1443 | sc.may_swap = 1; | 958 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
959 | .may_swap = 1, | ||
960 | }; | ||
1444 | 961 | ||
1445 | inc_page_state(allocstall); | 962 | inc_page_state(allocstall); |
1446 | 963 | ||
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1457 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 974 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1458 | sc.nr_mapped = read_page_state(nr_mapped); | 975 | sc.nr_mapped = read_page_state(nr_mapped); |
1459 | sc.nr_scanned = 0; | 976 | sc.nr_scanned = 0; |
1460 | sc.nr_reclaimed = 0; | ||
1461 | sc.priority = priority; | ||
1462 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
1463 | if (!priority) | 977 | if (!priority) |
1464 | disable_swap_token(); | 978 | disable_swap_token(); |
1465 | shrink_caches(zones, &sc); | 979 | nr_reclaimed += shrink_zones(priority, zones, &sc); |
1466 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | 980 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); |
1467 | if (reclaim_state) { | 981 | if (reclaim_state) { |
1468 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 982 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1469 | reclaim_state->reclaimed_slab = 0; | 983 | reclaim_state->reclaimed_slab = 0; |
1470 | } | 984 | } |
1471 | total_scanned += sc.nr_scanned; | 985 | total_scanned += sc.nr_scanned; |
1472 | total_reclaimed += sc.nr_reclaimed; | 986 | if (nr_reclaimed >= sc.swap_cluster_max) { |
1473 | if (total_reclaimed >= sc.swap_cluster_max) { | ||
1474 | ret = 1; | 987 | ret = 1; |
1475 | goto out; | 988 | goto out; |
1476 | } | 989 | } |
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
1482 | * that's undesirable in laptop mode, where we *want* lumpy | 995 | * that's undesirable in laptop mode, where we *want* lumpy |
1483 | * writeout. So in laptop mode, write out the whole world. | 996 | * writeout. So in laptop mode, write out the whole world. |
1484 | */ | 997 | */ |
1485 | if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { | 998 | if (total_scanned > sc.swap_cluster_max + |
999 | sc.swap_cluster_max / 2) { | ||
1486 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); | 1000 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); |
1487 | sc.may_writepage = 1; | 1001 | sc.may_writepage = 1; |
1488 | } | 1002 | } |
@@ -1528,22 +1042,26 @@ out: | |||
1528 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1042 | * the page allocator fallback scheme to ensure that aging of pages is balanced |
1529 | * across the zones. | 1043 | * across the zones. |
1530 | */ | 1044 | */ |
1531 | static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) | 1045 | static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, |
1046 | int order) | ||
1532 | { | 1047 | { |
1533 | int to_free = nr_pages; | 1048 | unsigned long to_free = nr_pages; |
1534 | int all_zones_ok; | 1049 | int all_zones_ok; |
1535 | int priority; | 1050 | int priority; |
1536 | int i; | 1051 | int i; |
1537 | int total_scanned, total_reclaimed; | 1052 | unsigned long total_scanned; |
1053 | unsigned long nr_reclaimed; | ||
1538 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1054 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1539 | struct scan_control sc; | 1055 | struct scan_control sc = { |
1056 | .gfp_mask = GFP_KERNEL, | ||
1057 | .may_swap = 1, | ||
1058 | .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, | ||
1059 | }; | ||
1540 | 1060 | ||
1541 | loop_again: | 1061 | loop_again: |
1542 | total_scanned = 0; | 1062 | total_scanned = 0; |
1543 | total_reclaimed = 0; | 1063 | nr_reclaimed = 0; |
1544 | sc.gfp_mask = GFP_KERNEL; | 1064 | sc.may_writepage = !laptop_mode, |
1545 | sc.may_writepage = !laptop_mode; | ||
1546 | sc.may_swap = 1; | ||
1547 | sc.nr_mapped = read_page_state(nr_mapped); | 1065 | sc.nr_mapped = read_page_state(nr_mapped); |
1548 | 1066 | ||
1549 | inc_page_state(pageoutrun); | 1067 | inc_page_state(pageoutrun); |
@@ -1624,15 +1142,11 @@ scan: | |||
1624 | if (zone->prev_priority > priority) | 1142 | if (zone->prev_priority > priority) |
1625 | zone->prev_priority = priority; | 1143 | zone->prev_priority = priority; |
1626 | sc.nr_scanned = 0; | 1144 | sc.nr_scanned = 0; |
1627 | sc.nr_reclaimed = 0; | 1145 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
1628 | sc.priority = priority; | ||
1629 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; | ||
1630 | shrink_zone(zone, &sc); | ||
1631 | reclaim_state->reclaimed_slab = 0; | 1146 | reclaim_state->reclaimed_slab = 0; |
1632 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1147 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1633 | lru_pages); | 1148 | lru_pages); |
1634 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 1149 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1635 | total_reclaimed += sc.nr_reclaimed; | ||
1636 | total_scanned += sc.nr_scanned; | 1150 | total_scanned += sc.nr_scanned; |
1637 | if (zone->all_unreclaimable) | 1151 | if (zone->all_unreclaimable) |
1638 | continue; | 1152 | continue; |
@@ -1645,10 +1159,10 @@ scan: | |||
1645 | * even in laptop mode | 1159 | * even in laptop mode |
1646 | */ | 1160 | */ |
1647 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 1161 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
1648 | total_scanned > total_reclaimed+total_reclaimed/2) | 1162 | total_scanned > nr_reclaimed + nr_reclaimed / 2) |
1649 | sc.may_writepage = 1; | 1163 | sc.may_writepage = 1; |
1650 | } | 1164 | } |
1651 | if (nr_pages && to_free > total_reclaimed) | 1165 | if (nr_pages && to_free > nr_reclaimed) |
1652 | continue; /* swsusp: need to do more work */ | 1166 | continue; /* swsusp: need to do more work */ |
1653 | if (all_zones_ok) | 1167 | if (all_zones_ok) |
1654 | break; /* kswapd: all done */ | 1168 | break; /* kswapd: all done */ |
@@ -1665,7 +1179,7 @@ scan: | |||
1665 | * matches the direct reclaim path behaviour in terms of impact | 1179 | * matches the direct reclaim path behaviour in terms of impact |
1666 | * on zone->*_priority. | 1180 | * on zone->*_priority. |
1667 | */ | 1181 | */ |
1668 | if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) | 1182 | if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) |
1669 | break; | 1183 | break; |
1670 | } | 1184 | } |
1671 | out: | 1185 | out: |
@@ -1679,7 +1193,7 @@ out: | |||
1679 | goto loop_again; | 1193 | goto loop_again; |
1680 | } | 1194 | } |
1681 | 1195 | ||
1682 | return total_reclaimed; | 1196 | return nr_reclaimed; |
1683 | } | 1197 | } |
1684 | 1198 | ||
1685 | /* | 1199 | /* |
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1779 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed | 1293 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed |
1780 | * pages. | 1294 | * pages. |
1781 | */ | 1295 | */ |
1782 | int shrink_all_memory(int nr_pages) | 1296 | unsigned long shrink_all_memory(unsigned long nr_pages) |
1783 | { | 1297 | { |
1784 | pg_data_t *pgdat; | 1298 | pg_data_t *pgdat; |
1785 | int nr_to_free = nr_pages; | 1299 | unsigned long nr_to_free = nr_pages; |
1786 | int ret = 0; | 1300 | unsigned long ret = 0; |
1301 | unsigned retry = 2; | ||
1787 | struct reclaim_state reclaim_state = { | 1302 | struct reclaim_state reclaim_state = { |
1788 | .reclaimed_slab = 0, | 1303 | .reclaimed_slab = 0, |
1789 | }; | 1304 | }; |
1790 | 1305 | ||
1791 | current->reclaim_state = &reclaim_state; | 1306 | current->reclaim_state = &reclaim_state; |
1792 | for_each_pgdat(pgdat) { | 1307 | repeat: |
1793 | int freed; | 1308 | for_each_online_pgdat(pgdat) { |
1309 | unsigned long freed; | ||
1310 | |||
1794 | freed = balance_pgdat(pgdat, nr_to_free, 0); | 1311 | freed = balance_pgdat(pgdat, nr_to_free, 0); |
1795 | ret += freed; | 1312 | ret += freed; |
1796 | nr_to_free -= freed; | 1313 | nr_to_free -= freed; |
1797 | if (nr_to_free <= 0) | 1314 | if ((long)nr_to_free <= 0) |
1798 | break; | 1315 | break; |
1799 | } | 1316 | } |
1317 | if (retry-- && ret < nr_pages) { | ||
1318 | blk_congestion_wait(WRITE, HZ/5); | ||
1319 | goto repeat; | ||
1320 | } | ||
1800 | current->reclaim_state = NULL; | 1321 | current->reclaim_state = NULL; |
1801 | return ret; | 1322 | return ret; |
1802 | } | 1323 | } |
@@ -1808,14 +1329,13 @@ int shrink_all_memory(int nr_pages) | |||
1808 | away, we get changed to run anywhere: as the first one comes back, | 1329 | away, we get changed to run anywhere: as the first one comes back, |
1809 | restore their cpu bindings. */ | 1330 | restore their cpu bindings. */ |
1810 | static int __devinit cpu_callback(struct notifier_block *nfb, | 1331 | static int __devinit cpu_callback(struct notifier_block *nfb, |
1811 | unsigned long action, | 1332 | unsigned long action, void *hcpu) |
1812 | void *hcpu) | ||
1813 | { | 1333 | { |
1814 | pg_data_t *pgdat; | 1334 | pg_data_t *pgdat; |
1815 | cpumask_t mask; | 1335 | cpumask_t mask; |
1816 | 1336 | ||
1817 | if (action == CPU_ONLINE) { | 1337 | if (action == CPU_ONLINE) { |
1818 | for_each_pgdat(pgdat) { | 1338 | for_each_online_pgdat(pgdat) { |
1819 | mask = node_to_cpumask(pgdat->node_id); | 1339 | mask = node_to_cpumask(pgdat->node_id); |
1820 | if (any_online_cpu(mask) != NR_CPUS) | 1340 | if (any_online_cpu(mask) != NR_CPUS) |
1821 | /* One of our CPUs online: restore mask */ | 1341 | /* One of our CPUs online: restore mask */ |
@@ -1829,10 +1349,17 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1829 | static int __init kswapd_init(void) | 1349 | static int __init kswapd_init(void) |
1830 | { | 1350 | { |
1831 | pg_data_t *pgdat; | 1351 | pg_data_t *pgdat; |
1352 | |||
1832 | swap_setup(); | 1353 | swap_setup(); |
1833 | for_each_pgdat(pgdat) | 1354 | for_each_online_pgdat(pgdat) { |
1834 | pgdat->kswapd | 1355 | pid_t pid; |
1835 | = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); | 1356 | |
1357 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); | ||
1358 | BUG_ON(pid < 0); | ||
1359 | read_lock(&tasklist_lock); | ||
1360 | pgdat->kswapd = find_task_by_pid(pid); | ||
1361 | read_unlock(&tasklist_lock); | ||
1362 | } | ||
1836 | total_memory = nr_free_pagecache_pages(); | 1363 | total_memory = nr_free_pagecache_pages(); |
1837 | hotcpu_notifier(cpu_callback, 0); | 1364 | hotcpu_notifier(cpu_callback, 0); |
1838 | return 0; | 1365 | return 0; |
@@ -1874,46 +1401,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ; | |||
1874 | /* | 1401 | /* |
1875 | * Try to free up some pages from this zone through reclaim. | 1402 | * Try to free up some pages from this zone through reclaim. |
1876 | */ | 1403 | */ |
1877 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1404 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
1878 | { | 1405 | { |
1879 | int nr_pages; | 1406 | /* Minimum pages needed in order to stay on node */ |
1407 | const unsigned long nr_pages = 1 << order; | ||
1880 | struct task_struct *p = current; | 1408 | struct task_struct *p = current; |
1881 | struct reclaim_state reclaim_state; | 1409 | struct reclaim_state reclaim_state; |
1882 | struct scan_control sc; | 1410 | int priority; |
1883 | cpumask_t mask; | 1411 | unsigned long nr_reclaimed = 0; |
1884 | int node_id; | 1412 | struct scan_control sc = { |
1885 | 1413 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | |
1886 | if (time_before(jiffies, | 1414 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
1887 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | 1415 | .nr_mapped = read_page_state(nr_mapped), |
1888 | return 0; | 1416 | .swap_cluster_max = max_t(unsigned long, nr_pages, |
1889 | 1417 | SWAP_CLUSTER_MAX), | |
1890 | if (!(gfp_mask & __GFP_WAIT) || | 1418 | .gfp_mask = gfp_mask, |
1891 | zone->all_unreclaimable || | 1419 | }; |
1892 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
1893 | (p->flags & PF_MEMALLOC)) | ||
1894 | return 0; | ||
1895 | |||
1896 | node_id = zone->zone_pgdat->node_id; | ||
1897 | mask = node_to_cpumask(node_id); | ||
1898 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
1899 | return 0; | ||
1900 | |||
1901 | sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); | ||
1902 | sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); | ||
1903 | sc.nr_scanned = 0; | ||
1904 | sc.nr_reclaimed = 0; | ||
1905 | sc.priority = ZONE_RECLAIM_PRIORITY + 1; | ||
1906 | sc.nr_mapped = read_page_state(nr_mapped); | ||
1907 | sc.gfp_mask = gfp_mask; | ||
1908 | 1420 | ||
1909 | disable_swap_token(); | 1421 | disable_swap_token(); |
1910 | |||
1911 | nr_pages = 1 << order; | ||
1912 | if (nr_pages > SWAP_CLUSTER_MAX) | ||
1913 | sc.swap_cluster_max = nr_pages; | ||
1914 | else | ||
1915 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
1916 | |||
1917 | cond_resched(); | 1422 | cond_resched(); |
1918 | /* | 1423 | /* |
1919 | * We need to be able to allocate from the reserves for RECLAIM_SWAP | 1424 | * We need to be able to allocate from the reserves for RECLAIM_SWAP |
@@ -1928,17 +1433,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1928 | * Free memory by calling shrink zone with increasing priorities | 1433 | * Free memory by calling shrink zone with increasing priorities |
1929 | * until we have enough memory freed. | 1434 | * until we have enough memory freed. |
1930 | */ | 1435 | */ |
1436 | priority = ZONE_RECLAIM_PRIORITY; | ||
1931 | do { | 1437 | do { |
1932 | sc.priority--; | 1438 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
1933 | shrink_zone(zone, &sc); | 1439 | priority--; |
1440 | } while (priority >= 0 && nr_reclaimed < nr_pages); | ||
1934 | 1441 | ||
1935 | } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); | 1442 | if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { |
1936 | |||
1937 | if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { | ||
1938 | /* | 1443 | /* |
1939 | * shrink_slab does not currently allow us to determine | 1444 | * shrink_slab() does not currently allow us to determine how |
1940 | * how many pages were freed in the zone. So we just | 1445 | * many pages were freed in this zone. So we just shake the slab |
1941 | * shake the slab and then go offnode for a single allocation. | 1446 | * a bit and then go off node for this particular allocation |
1447 | * despite possibly having freed enough memory to allocate in | ||
1448 | * this zone. If we freed local memory then the next | ||
1449 | * allocations will be local again. | ||
1942 | * | 1450 | * |
1943 | * shrink_slab will free memory on all zones and may take | 1451 | * shrink_slab will free memory on all zones and may take |
1944 | * a long time. | 1452 | * a long time. |
@@ -1949,10 +1457,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1949 | p->reclaim_state = NULL; | 1457 | p->reclaim_state = NULL; |
1950 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 1458 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
1951 | 1459 | ||
1952 | if (sc.nr_reclaimed == 0) | 1460 | if (nr_reclaimed == 0) { |
1461 | /* | ||
1462 | * We were unable to reclaim enough pages to stay on node. We | ||
1463 | * now allow off node accesses for a certain time period before | ||
1464 | * trying again to reclaim pages from the local zone. | ||
1465 | */ | ||
1953 | zone->last_unsuccessful_zone_reclaim = jiffies; | 1466 | zone->last_unsuccessful_zone_reclaim = jiffies; |
1467 | } | ||
1954 | 1468 | ||
1955 | return sc.nr_reclaimed >= nr_pages; | 1469 | return nr_reclaimed >= nr_pages; |
1956 | } | 1470 | } |
1957 | #endif | ||
1958 | 1471 | ||
1472 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | ||
1473 | { | ||
1474 | cpumask_t mask; | ||
1475 | int node_id; | ||
1476 | |||
1477 | /* | ||
1478 | * Do not reclaim if there was a recent unsuccessful attempt at zone | ||
1479 | * reclaim. In that case we let allocations go off node for the | ||
1480 | * zone_reclaim_interval. Otherwise we would scan for each off-node | ||
1481 | * page allocation. | ||
1482 | */ | ||
1483 | if (time_before(jiffies, | ||
1484 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | ||
1485 | return 0; | ||
1486 | |||
1487 | /* | ||
1488 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | ||
1489 | * not have reclaimable pages and if we should not delay the allocation | ||
1490 | * then do not scan. | ||
1491 | */ | ||
1492 | if (!(gfp_mask & __GFP_WAIT) || | ||
1493 | zone->all_unreclaimable || | ||
1494 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
1495 | (current->flags & PF_MEMALLOC)) | ||
1496 | return 0; | ||
1497 | |||
1498 | /* | ||
1499 | * Only run zone reclaim on the local zone or on zones that do not | ||
1500 | * have associated processors. This will favor the local processor | ||
1501 | * over remote processors and spread off node memory allocations | ||
1502 | * as wide as possible. | ||
1503 | */ | ||
1504 | node_id = zone->zone_pgdat->node_id; | ||
1505 | mask = node_to_cpumask(node_id); | ||
1506 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
1507 | return 0; | ||
1508 | return __zone_reclaim(zone, gfp_mask, order); | ||
1509 | } | ||
1510 | #endif | ||