Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4
-rw-r--r-- | mm/bootmem.c | 2
-rw-r--r-- | mm/fadvise.c | 46
-rw-r--r-- | mm/filemap.c | 41
-rw-r--r-- | mm/memory.c | 8
-rw-r--r-- | mm/mempolicy.c | 32
-rw-r--r-- | mm/mmap.c | 6
-rw-r--r-- | mm/msync.c | 139
-rw-r--r-- | mm/page-writeback.c | 64
-rw-r--r-- | mm/page_alloc.c | 11
-rw-r--r-- | mm/slab.c | 351
-rw-r--r-- | mm/slob.c | 10
-rw-r--r-- | mm/util.c | 47
-rw-r--r-- | mm/vmscan.c | 2
14 files changed, 595 insertions, 168 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index bd80460360db..332f5c29b53a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS | |||
138 | # | 138 | # |
139 | config MIGRATION | 139 | config MIGRATION |
140 | bool "Page migration" | 140 | bool "Page migration" |
141 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM | 141 | def_bool y if NUMA |
142 | depends on SWAP | 142 | depends on SWAP && NUMA |
143 | help | 143 | help |
144 | Allows the migration of the physical location of pages of processes | 144 | Allows the migration of the physical location of pages of processes |
145 | while the virtual addresses are not changed. This is useful for | 145 | while the virtual addresses are not changed. This is useful for |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 35c32290f717..b55bd39fc5dd 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -152,7 +152,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | |||
152 | * | 152 | * |
153 | * NOTE: This function is _not_ reentrant. | 153 | * NOTE: This function is _not_ reentrant. |
154 | */ | 154 | */ |
155 | static void * __init | 155 | void * __init |
156 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | 156 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, |
157 | unsigned long align, unsigned long goal, unsigned long limit) | 157 | unsigned long align, unsigned long goal, unsigned long limit) |
158 | { | 158 | { |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e7704..907c39257ca0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <linux/pagevec.h> | 16 | #include <linux/pagevec.h> |
17 | #include <linux/fadvise.h> | 17 | #include <linux/fadvise.h> |
18 | #include <linux/writeback.h> | ||
18 | #include <linux/syscalls.h> | 19 | #include <linux/syscalls.h> |
19 | 20 | ||
20 | #include <asm/unistd.h> | 21 | #include <asm/unistd.h> |
@@ -22,13 +23,36 @@ | |||
22 | /* | 23 | /* |
23 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could | 24 | * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could |
24 | * deactivate the pages and clear PG_Referenced. | 25 | * deactivate the pages and clear PG_Referenced. |
26 | * | ||
27 | * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file | ||
28 | * offsets `offset' and `offset+len' inclusive. Any pages which are currently | ||
29 | * under writeout are skipped, whether or not they are dirty. | ||
30 | * | ||
31 | * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file | ||
32 | * offsets `offset' and `offset+len'. | ||
33 | * | ||
34 | * By combining these two operations the application may do several things: | ||
35 | * | ||
36 | * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. | ||
37 | * | ||
38 | * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently | ||
39 | * dirty pages at the disk. | ||
40 | * | ||
41 | * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push | ||
42 | * all of the currently dirty pages at the disk, wait until they have been | ||
43 | * written. | ||
44 | * | ||
45 | * It should be noted that none of these operations write out the file's | ||
46 | * metadata. So unless the application is strictly performing overwrites of | ||
47 | * already-instantiated disk blocks, there are no guarantees here that the data | ||
48 | * will be available after a crash. | ||
25 | */ | 49 | */ |
26 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | 50 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) |
27 | { | 51 | { |
28 | struct file *file = fget(fd); | 52 | struct file *file = fget(fd); |
29 | struct address_space *mapping; | 53 | struct address_space *mapping; |
30 | struct backing_dev_info *bdi; | 54 | struct backing_dev_info *bdi; |
31 | loff_t endbyte; | 55 | loff_t endbyte; /* inclusive */ |
32 | pgoff_t start_index; | 56 | pgoff_t start_index; |
33 | pgoff_t end_index; | 57 | pgoff_t end_index; |
34 | unsigned long nrpages; | 58 | unsigned long nrpages; |
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
56 | endbyte = offset + len; | 80 | endbyte = offset + len; |
57 | if (!len || endbyte < len) | 81 | if (!len || endbyte < len) |
58 | endbyte = -1; | 82 | endbyte = -1; |
83 | else | ||
84 | endbyte--; /* inclusive */ | ||
59 | 85 | ||
60 | bdi = mapping->backing_dev_info; | 86 | bdi = mapping->backing_dev_info; |
61 | 87 | ||
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
78 | 104 | ||
79 | /* First and last PARTIAL page! */ | 105 | /* First and last PARTIAL page! */ |
80 | start_index = offset >> PAGE_CACHE_SHIFT; | 106 | start_index = offset >> PAGE_CACHE_SHIFT; |
81 | end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; | 107 | end_index = endbyte >> PAGE_CACHE_SHIFT; |
82 | 108 | ||
83 | /* Careful about overflow on the "+1" */ | 109 | /* Careful about overflow on the "+1" */ |
84 | nrpages = end_index - start_index + 1; | 110 | nrpages = end_index - start_index + 1; |
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
96 | filemap_flush(mapping); | 122 | filemap_flush(mapping); |
97 | 123 | ||
98 | /* First and last FULL page! */ | 124 | /* First and last FULL page! */ |
99 | start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 125 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
100 | end_index = (endbyte >> PAGE_CACHE_SHIFT); | 126 | end_index = (endbyte >> PAGE_CACHE_SHIFT); |
101 | 127 | ||
102 | if (end_index > start_index) | 128 | if (end_index >= start_index) |
103 | invalidate_mapping_pages(mapping, start_index, end_index-1); | 129 | invalidate_mapping_pages(mapping, start_index, |
130 | end_index); | ||
131 | break; | ||
132 | case LINUX_FADV_ASYNC_WRITE: | ||
133 | ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | ||
134 | WB_SYNC_NONE); | ||
135 | break; | ||
136 | case LINUX_FADV_WRITE_WAIT: | ||
137 | ret = wait_on_page_writeback_range(mapping, | ||
138 | offset >> PAGE_CACHE_SHIFT, | ||
139 | endbyte >> PAGE_CACHE_SHIFT); | ||
104 | break; | 140 | break; |
105 | default: | 141 | default: |
106 | ret = -EINVAL; | 142 | ret = -EINVAL; |
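For reference, here is a hedged userspace sketch of the "wait, write, wait" sequence the new comment describes. The numeric advice values are an assumption and must match include/linux/fadvise.h in this tree (they are not part of POSIX); posix_fadvise() simply passes the value through to sys_fadvise64_64(), so older kernels will return EINVAL.

/*
 * Hedged userspace sketch: push all currently dirty pages in
 * [offset, offset+len) to disk and wait for them.  As the comment above
 * warns, file metadata is NOT written, so this only guarantees data for
 * overwrites of already-instantiated blocks.
 */
#define _XOPEN_SOURCE 600	/* for posix_fadvise() */
#include <fcntl.h>

#ifndef LINUX_FADV_ASYNC_WRITE
#define LINUX_FADV_ASYNC_WRITE	32	/* assumed value, check fadvise.h */
#endif
#ifndef LINUX_FADV_WRITE_WAIT
#define LINUX_FADV_WRITE_WAIT	33	/* assumed value, check fadvise.h */
#endif

static int flush_file_range(int fd, off_t offset, off_t len)
{
	int err;

	/* 1. wait for any writeout already in flight */
	err = posix_fadvise(fd, offset, len, LINUX_FADV_WRITE_WAIT);
	/* 2. start writeout of everything that is currently dirty */
	if (!err)
		err = posix_fadvise(fd, offset, len, LINUX_FADV_ASYNC_WRITE);
	/* 3. wait for that writeout to complete */
	if (!err)
		err = posix_fadvise(fd, offset, len, LINUX_FADV_WRITE_WAIT);
	return err;	/* 0 on success, a positive errno value otherwise */
}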
diff --git a/mm/filemap.c b/mm/filemap.c
index e8f58f7dd7a5..3ef20739e725 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/blkdev.h> | 29 | #include <linux/blkdev.h> |
30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
32 | #include <linux/cpuset.h> | ||
32 | #include "filemap.h" | 33 | #include "filemap.h" |
33 | #include "internal.h" | 34 | #include "internal.h" |
34 | 35 | ||
@@ -174,7 +175,7 @@ static int sync_page(void *word) | |||
174 | * dirty pages that lie within the byte offsets <start, end> | 175 | * dirty pages that lie within the byte offsets <start, end> |
175 | * @mapping: address space structure to write | 176 | * @mapping: address space structure to write |
176 | * @start: offset in bytes where the range starts | 177 | * @start: offset in bytes where the range starts |
177 | * @end: offset in bytes where the range ends | 178 | * @end: offset in bytes where the range ends (inclusive) |
178 | * @sync_mode: enable synchronous operation | 179 | * @sync_mode: enable synchronous operation |
179 | * | 180 | * |
180 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | 181 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as |
@@ -182,8 +183,8 @@ static int sync_page(void *word) | |||
182 | * these two operations is that if a dirty page/buffer is encountered, it must | 183 | * these two operations is that if a dirty page/buffer is encountered, it must |
183 | * be waited upon, and not just skipped over. | 184 | * be waited upon, and not just skipped over. |
184 | */ | 185 | */ |
185 | static int __filemap_fdatawrite_range(struct address_space *mapping, | 186 | int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
186 | loff_t start, loff_t end, int sync_mode) | 187 | loff_t end, int sync_mode) |
187 | { | 188 | { |
188 | int ret; | 189 | int ret; |
189 | struct writeback_control wbc = { | 190 | struct writeback_control wbc = { |
@@ -212,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping) | |||
212 | } | 213 | } |
213 | EXPORT_SYMBOL(filemap_fdatawrite); | 214 | EXPORT_SYMBOL(filemap_fdatawrite); |
214 | 215 | ||
215 | static int filemap_fdatawrite_range(struct address_space *mapping, | 216 | static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
216 | loff_t start, loff_t end) | 217 | loff_t end) |
217 | { | 218 | { |
218 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | 219 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); |
219 | } | 220 | } |
@@ -232,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush); | |||
232 | * Wait for writeback to complete against pages indexed by start->end | 233 | * Wait for writeback to complete against pages indexed by start->end |
233 | * inclusive | 234 | * inclusive |
234 | */ | 235 | */ |
235 | static int wait_on_page_writeback_range(struct address_space *mapping, | 236 | int wait_on_page_writeback_range(struct address_space *mapping, |
236 | pgoff_t start, pgoff_t end) | 237 | pgoff_t start, pgoff_t end) |
237 | { | 238 | { |
238 | struct pagevec pvec; | 239 | struct pagevec pvec; |
@@ -367,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping) | |||
367 | } | 368 | } |
368 | EXPORT_SYMBOL(filemap_write_and_wait); | 369 | EXPORT_SYMBOL(filemap_write_and_wait); |
369 | 370 | ||
371 | /* | ||
372 | * Write out and wait upon file offsets lstart->lend, inclusive. | ||
373 | * | ||
374 | * Note that `lend' is inclusive (describes the last byte to be written) so | ||
375 | * that this function can be used to write to the very end-of-file (end = -1). | ||
376 | */ | ||
370 | int filemap_write_and_wait_range(struct address_space *mapping, | 377 | int filemap_write_and_wait_range(struct address_space *mapping, |
371 | loff_t lstart, loff_t lend) | 378 | loff_t lstart, loff_t lend) |
372 | { | 379 | { |
@@ -427,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
427 | return ret; | 434 | return ret; |
428 | } | 435 | } |
429 | 436 | ||
437 | #ifdef CONFIG_NUMA | ||
438 | struct page *page_cache_alloc(struct address_space *x) | ||
439 | { | ||
440 | if (cpuset_do_page_mem_spread()) { | ||
441 | int n = cpuset_mem_spread_node(); | ||
442 | return alloc_pages_node(n, mapping_gfp_mask(x), 0); | ||
443 | } | ||
444 | return alloc_pages(mapping_gfp_mask(x), 0); | ||
445 | } | ||
446 | EXPORT_SYMBOL(page_cache_alloc); | ||
447 | |||
448 | struct page *page_cache_alloc_cold(struct address_space *x) | ||
449 | { | ||
450 | if (cpuset_do_page_mem_spread()) { | ||
451 | int n = cpuset_mem_spread_node(); | ||
452 | return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0); | ||
453 | } | ||
454 | return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0); | ||
455 | } | ||
456 | EXPORT_SYMBOL(page_cache_alloc_cold); | ||
457 | #endif | ||
458 | |||
430 | /* | 459 | /* |
431 | * In order to wait for pages to become available there must be | 460 | * In order to wait for pages to become available there must be |
432 | * waitqueues associated with pages. By using a hash table of | 461 | * waitqueues associated with pages. By using a hash table of |
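Two hedged sketches of callers relying on the inclusive-end convention documented above; neither function is part of this patch. An end of -1 (all bits set) reaches end-of-file precisely because the last byte is included.

/* Hedged sketches only; header locations for the now non-static helpers
 * are assumed. */
#include <linux/fs.h>
#include <linux/writeback.h>

/* write back and wait on the whole file: lend == -1 covers every byte up
 * to end-of-file because the end offset is inclusive */
static int sync_whole_mapping(struct address_space *mapping)
{
	return filemap_write_and_wait_range(mapping, 0, -1);
}

/* start async writeout of an inclusive byte range, as fadvise.c now does */
static int push_range_async(struct address_space *mapping,
			    loff_t start, loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
}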
diff --git a/mm/memory.c b/mm/memory.c
index 80c3fb370f91..e347e106ca3a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -395,12 +395,16 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ | |||
395 | return NULL; | 395 | return NULL; |
396 | } | 396 | } |
397 | 397 | ||
398 | #ifdef CONFIG_DEBUG_VM | 398 | /* |
399 | * Add some anal sanity checks for now. Eventually, | ||
400 | * we should just do "return pfn_to_page(pfn)", but | ||
401 | * in the meantime we check that we get a valid pfn, | ||
402 | * and that the resulting page looks ok. | ||
403 | */ | ||
399 | if (unlikely(!pfn_valid(pfn))) { | 404 | if (unlikely(!pfn_valid(pfn))) { |
400 | print_bad_pte(vma, pte, addr); | 405 | print_bad_pte(vma, pte, addr); |
401 | return NULL; | 406 | return NULL; |
402 | } | 407 | } |
403 | #endif | ||
404 | 408 | ||
405 | /* | 409 | /* |
406 | * NOTE! We still have PageReserved() pages in the page | 410 | * NOTE! We still have PageReserved() pages in the page |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22b..4f71cfd29c6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes) | |||
422 | return mpol_check_policy(mode, nodes); | 422 | return mpol_check_policy(mode, nodes); |
423 | } | 423 | } |
424 | 424 | ||
425 | |||
426 | /* | ||
427 | * Update task->flags PF_MEMPOLICY bit: set iff non-default | ||
428 | * mempolicy. Allows more rapid checking of this (combined perhaps | ||
429 | * with other PF_* flag bits) on memory allocation hot code paths. | ||
430 | * | ||
431 | * If called from outside this file, the task 'p' should -only- be | ||
432 | * a newly forked child not yet visible on the task list, because | ||
433 | * manipulating the task flags of a visible task is not safe. | ||
434 | * | ||
435 | * The above limitation is why this routine has the funny name | ||
436 | * mpol_fix_fork_child_flag(). | ||
437 | * | ||
438 | * It is also safe to call this with a task pointer of current, | ||
439 | * which the static wrapper mpol_set_task_struct_flag() does, | ||
440 | * for use within this file. | ||
441 | */ | ||
442 | |||
443 | void mpol_fix_fork_child_flag(struct task_struct *p) | ||
444 | { | ||
445 | if (p->mempolicy) | ||
446 | p->flags |= PF_MEMPOLICY; | ||
447 | else | ||
448 | p->flags &= ~PF_MEMPOLICY; | ||
449 | } | ||
450 | |||
451 | static void mpol_set_task_struct_flag(void) | ||
452 | { | ||
453 | mpol_fix_fork_child_flag(current); | ||
454 | } | ||
455 | |||
425 | /* Set the process memory policy */ | 456 | /* Set the process memory policy */ |
426 | long do_set_mempolicy(int mode, nodemask_t *nodes) | 457 | long do_set_mempolicy(int mode, nodemask_t *nodes) |
427 | { | 458 | { |
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) | |||
434 | return PTR_ERR(new); | 465 | return PTR_ERR(new); |
435 | mpol_free(current->mempolicy); | 466 | mpol_free(current->mempolicy); |
436 | current->mempolicy = new; | 467 | current->mempolicy = new; |
468 | mpol_set_task_struct_flag(); | ||
437 | if (new && new->policy == MPOL_INTERLEAVE) | 469 | if (new && new->policy == MPOL_INTERLEAVE) |
438 | current->il_next = first_node(new->v.nodes); | 470 | current->il_next = first_node(new->v.nodes); |
439 | return 0; | 471 | return 0; |
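The payoff of PF_MEMPOLICY is on allocation hot paths, which can now test task flags instead of dereferencing ->mempolicy. The sketch below shows the shape of that check; the helper name is hypothetical, both flags are introduced by this patch series, and mm/slab.c later in this diff open-codes the same test.

/* Hedged sketch; task_has_alloc_policy() is a made-up name. */
#include <linux/sched.h>

static inline int task_has_alloc_policy(struct task_struct *tsk)
{
	return unlikely(tsk->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY));
}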
diff --git a/mm/mmap.c b/mm/mmap.c
@@ -1040,12 +1040,11 @@ munmap_back: | |||
1040 | * specific mapper. the address has already been validated, but | 1040 | * specific mapper. the address has already been validated, but |
1041 | * not unmapped, but the maps are removed from the list. | 1041 | * not unmapped, but the maps are removed from the list. |
1042 | */ | 1042 | */ |
1043 | vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 1043 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
1044 | if (!vma) { | 1044 | if (!vma) { |
1045 | error = -ENOMEM; | 1045 | error = -ENOMEM; |
1046 | goto unacct_error; | 1046 | goto unacct_error; |
1047 | } | 1047 | } |
1048 | memset(vma, 0, sizeof(*vma)); | ||
1049 | 1048 | ||
1050 | vma->vm_mm = mm; | 1049 | vma->vm_mm = mm; |
1051 | vma->vm_start = addr; | 1050 | vma->vm_start = addr; |
@@ -1896,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1896 | /* | 1895 | /* |
1897 | * create a vma struct for an anonymous mapping | 1896 | * create a vma struct for an anonymous mapping |
1898 | */ | 1897 | */ |
1899 | vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 1898 | vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); |
1900 | if (!vma) { | 1899 | if (!vma) { |
1901 | vm_unacct_memory(len >> PAGE_SHIFT); | 1900 | vm_unacct_memory(len >> PAGE_SHIFT); |
1902 | return -ENOMEM; | 1901 | return -ENOMEM; |
1903 | } | 1902 | } |
1904 | memset(vma, 0, sizeof(*vma)); | ||
1905 | 1903 | ||
1906 | vma->vm_mm = mm; | 1904 | vma->vm_mm = mm; |
1907 | vma->vm_start = addr; | 1905 | vma->vm_start = addr; |
diff --git a/mm/msync.c b/mm/msync.c
index 3563a56e1a51..bc6c95376366 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -9,20 +9,24 @@ | |||
9 | */ | 9 | */ |
10 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
12 | #include <linux/fs.h> | ||
12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
13 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
14 | #include <linux/hugetlb.h> | 15 | #include <linux/hugetlb.h> |
16 | #include <linux/writeback.h> | ||
17 | #include <linux/file.h> | ||
15 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
16 | 19 | ||
17 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
18 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
19 | 22 | ||
20 | static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 23 | static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
21 | unsigned long addr, unsigned long end) | 24 | unsigned long addr, unsigned long end) |
22 | { | 25 | { |
23 | pte_t *pte; | 26 | pte_t *pte; |
24 | spinlock_t *ptl; | 27 | spinlock_t *ptl; |
25 | int progress = 0; | 28 | int progress = 0; |
29 | unsigned long ret = 0; | ||
26 | 30 | ||
27 | again: | 31 | again: |
28 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 32 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
@@ -43,58 +47,64 @@ again: | |||
43 | if (!page) | 47 | if (!page) |
44 | continue; | 48 | continue; |
45 | if (ptep_clear_flush_dirty(vma, addr, pte) || | 49 | if (ptep_clear_flush_dirty(vma, addr, pte) || |
46 | page_test_and_clear_dirty(page)) | 50 | page_test_and_clear_dirty(page)) |
47 | set_page_dirty(page); | 51 | ret += set_page_dirty(page); |
48 | progress += 3; | 52 | progress += 3; |
49 | } while (pte++, addr += PAGE_SIZE, addr != end); | 53 | } while (pte++, addr += PAGE_SIZE, addr != end); |
50 | pte_unmap_unlock(pte - 1, ptl); | 54 | pte_unmap_unlock(pte - 1, ptl); |
51 | cond_resched(); | 55 | cond_resched(); |
52 | if (addr != end) | 56 | if (addr != end) |
53 | goto again; | 57 | goto again; |
58 | return ret; | ||
54 | } | 59 | } |
55 | 60 | ||
56 | static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 61 | static inline unsigned long msync_pmd_range(struct vm_area_struct *vma, |
57 | unsigned long addr, unsigned long end) | 62 | pud_t *pud, unsigned long addr, unsigned long end) |
58 | { | 63 | { |
59 | pmd_t *pmd; | 64 | pmd_t *pmd; |
60 | unsigned long next; | 65 | unsigned long next; |
66 | unsigned long ret = 0; | ||
61 | 67 | ||
62 | pmd = pmd_offset(pud, addr); | 68 | pmd = pmd_offset(pud, addr); |
63 | do { | 69 | do { |
64 | next = pmd_addr_end(addr, end); | 70 | next = pmd_addr_end(addr, end); |
65 | if (pmd_none_or_clear_bad(pmd)) | 71 | if (pmd_none_or_clear_bad(pmd)) |
66 | continue; | 72 | continue; |
67 | msync_pte_range(vma, pmd, addr, next); | 73 | ret += msync_pte_range(vma, pmd, addr, next); |
68 | } while (pmd++, addr = next, addr != end); | 74 | } while (pmd++, addr = next, addr != end); |
75 | return ret; | ||
69 | } | 76 | } |
70 | 77 | ||
71 | static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 78 | static inline unsigned long msync_pud_range(struct vm_area_struct *vma, |
72 | unsigned long addr, unsigned long end) | 79 | pgd_t *pgd, unsigned long addr, unsigned long end) |
73 | { | 80 | { |
74 | pud_t *pud; | 81 | pud_t *pud; |
75 | unsigned long next; | 82 | unsigned long next; |
83 | unsigned long ret = 0; | ||
76 | 84 | ||
77 | pud = pud_offset(pgd, addr); | 85 | pud = pud_offset(pgd, addr); |
78 | do { | 86 | do { |
79 | next = pud_addr_end(addr, end); | 87 | next = pud_addr_end(addr, end); |
80 | if (pud_none_or_clear_bad(pud)) | 88 | if (pud_none_or_clear_bad(pud)) |
81 | continue; | 89 | continue; |
82 | msync_pmd_range(vma, pud, addr, next); | 90 | ret += msync_pmd_range(vma, pud, addr, next); |
83 | } while (pud++, addr = next, addr != end); | 91 | } while (pud++, addr = next, addr != end); |
92 | return ret; | ||
84 | } | 93 | } |
85 | 94 | ||
86 | static void msync_page_range(struct vm_area_struct *vma, | 95 | static unsigned long msync_page_range(struct vm_area_struct *vma, |
87 | unsigned long addr, unsigned long end) | 96 | unsigned long addr, unsigned long end) |
88 | { | 97 | { |
89 | pgd_t *pgd; | 98 | pgd_t *pgd; |
90 | unsigned long next; | 99 | unsigned long next; |
100 | unsigned long ret = 0; | ||
91 | 101 | ||
92 | /* For hugepages we can't go walking the page table normally, | 102 | /* For hugepages we can't go walking the page table normally, |
93 | * but that's ok, hugetlbfs is memory based, so we don't need | 103 | * but that's ok, hugetlbfs is memory based, so we don't need |
94 | * to do anything more on an msync(). | 104 | * to do anything more on an msync(). |
95 | */ | 105 | */ |
96 | if (vma->vm_flags & VM_HUGETLB) | 106 | if (vma->vm_flags & VM_HUGETLB) |
97 | return; | 107 | return 0; |
98 | 108 | ||
99 | BUG_ON(addr >= end); | 109 | BUG_ON(addr >= end); |
100 | pgd = pgd_offset(vma->vm_mm, addr); | 110 | pgd = pgd_offset(vma->vm_mm, addr); |
@@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma, | |||
103 | next = pgd_addr_end(addr, end); | 113 | next = pgd_addr_end(addr, end); |
104 | if (pgd_none_or_clear_bad(pgd)) | 114 | if (pgd_none_or_clear_bad(pgd)) |
105 | continue; | 115 | continue; |
106 | msync_pud_range(vma, pgd, addr, next); | 116 | ret += msync_pud_range(vma, pgd, addr, next); |
107 | } while (pgd++, addr = next, addr != end); | 117 | } while (pgd++, addr = next, addr != end); |
118 | return ret; | ||
108 | } | 119 | } |
109 | 120 | ||
110 | /* | 121 | /* |
@@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma, | |||
115 | * write out the dirty pages and wait on the writeout and check the result. | 126 | * write out the dirty pages and wait on the writeout and check the result. |
116 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start | 127 | * Or the application may run fadvise(FADV_DONTNEED) against the fd to start |
117 | * async writeout immediately. | 128 | * async writeout immediately. |
118 | * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to | 129 | * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to |
119 | * applications. | 130 | * applications. |
120 | */ | 131 | */ |
121 | static int msync_interval(struct vm_area_struct *vma, | 132 | static int msync_interval(struct vm_area_struct *vma, unsigned long addr, |
122 | unsigned long addr, unsigned long end, int flags) | 133 | unsigned long end, int flags, |
134 | unsigned long *nr_pages_dirtied) | ||
123 | { | 135 | { |
124 | int ret = 0; | ||
125 | struct file *file = vma->vm_file; | 136 | struct file *file = vma->vm_file; |
126 | 137 | ||
127 | if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) | 138 | if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) |
128 | return -EBUSY; | 139 | return -EBUSY; |
129 | 140 | ||
130 | if (file && (vma->vm_flags & VM_SHARED)) { | 141 | if (file && (vma->vm_flags & VM_SHARED)) |
131 | msync_page_range(vma, addr, end); | 142 | *nr_pages_dirtied = msync_page_range(vma, addr, end); |
132 | 143 | return 0; | |
133 | if (flags & MS_SYNC) { | ||
134 | struct address_space *mapping = file->f_mapping; | ||
135 | int err; | ||
136 | |||
137 | ret = filemap_fdatawrite(mapping); | ||
138 | if (file->f_op && file->f_op->fsync) { | ||
139 | /* | ||
140 | * We don't take i_mutex here because mmap_sem | ||
141 | * is already held. | ||
142 | */ | ||
143 | err = file->f_op->fsync(file,file->f_dentry,1); | ||
144 | if (err && !ret) | ||
145 | ret = err; | ||
146 | } | ||
147 | err = filemap_fdatawait(mapping); | ||
148 | if (!ret) | ||
149 | ret = err; | ||
150 | } | ||
151 | } | ||
152 | return ret; | ||
153 | } | 144 | } |
154 | 145 | ||
155 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | 146 | asmlinkage long sys_msync(unsigned long start, size_t len, int flags) |
156 | { | 147 | { |
157 | unsigned long end; | 148 | unsigned long end; |
158 | struct vm_area_struct *vma; | 149 | struct vm_area_struct *vma; |
159 | int unmapped_error, error = -EINVAL; | 150 | int unmapped_error = 0; |
160 | 151 | int error = -EINVAL; | |
161 | if (flags & MS_SYNC) | 152 | int done = 0; |
162 | current->flags |= PF_SYNCWRITE; | ||
163 | 153 | ||
164 | down_read(¤t->mm->mmap_sem); | ||
165 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) | 154 | if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) |
166 | goto out; | 155 | goto out; |
167 | if (start & ~PAGE_MASK) | 156 | if (start & ~PAGE_MASK) |
@@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
180 | * If the interval [start,end) covers some unmapped address ranges, | 169 | * If the interval [start,end) covers some unmapped address ranges, |
181 | * just ignore them, but return -ENOMEM at the end. | 170 | * just ignore them, but return -ENOMEM at the end. |
182 | */ | 171 | */ |
172 | down_read(¤t->mm->mmap_sem); | ||
173 | if (flags & MS_SYNC) | ||
174 | current->flags |= PF_SYNCWRITE; | ||
183 | vma = find_vma(current->mm, start); | 175 | vma = find_vma(current->mm, start); |
184 | unmapped_error = 0; | 176 | if (!vma) { |
185 | for (;;) { | ||
186 | /* Still start < end. */ | ||
187 | error = -ENOMEM; | 177 | error = -ENOMEM; |
188 | if (!vma) | 178 | goto out_unlock; |
189 | goto out; | 179 | } |
180 | do { | ||
181 | unsigned long nr_pages_dirtied = 0; | ||
182 | struct file *file; | ||
183 | |||
190 | /* Here start < vma->vm_end. */ | 184 | /* Here start < vma->vm_end. */ |
191 | if (start < vma->vm_start) { | 185 | if (start < vma->vm_start) { |
192 | unmapped_error = -ENOMEM; | 186 | unmapped_error = -ENOMEM; |
@@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
195 | /* Here vma->vm_start <= start < vma->vm_end. */ | 189 | /* Here vma->vm_start <= start < vma->vm_end. */ |
196 | if (end <= vma->vm_end) { | 190 | if (end <= vma->vm_end) { |
197 | if (start < end) { | 191 | if (start < end) { |
198 | error = msync_interval(vma, start, end, flags); | 192 | error = msync_interval(vma, start, end, flags, |
193 | &nr_pages_dirtied); | ||
199 | if (error) | 194 | if (error) |
200 | goto out; | 195 | goto out_unlock; |
201 | } | 196 | } |
202 | error = unmapped_error; | 197 | error = unmapped_error; |
203 | goto out; | 198 | done = 1; |
199 | } else { | ||
200 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | ||
201 | error = msync_interval(vma, start, vma->vm_end, flags, | ||
202 | &nr_pages_dirtied); | ||
203 | if (error) | ||
204 | goto out_unlock; | ||
204 | } | 205 | } |
205 | /* Here vma->vm_start <= start < vma->vm_end < end. */ | 206 | file = vma->vm_file; |
206 | error = msync_interval(vma, start, vma->vm_end, flags); | ||
207 | if (error) | ||
208 | goto out; | ||
209 | start = vma->vm_end; | 207 | start = vma->vm_end; |
210 | vma = vma->vm_next; | 208 | if ((flags & MS_ASYNC) && file && nr_pages_dirtied) { |
211 | } | 209 | get_file(file); |
212 | out: | 210 | up_read(¤t->mm->mmap_sem); |
213 | up_read(¤t->mm->mmap_sem); | 211 | balance_dirty_pages_ratelimited_nr(file->f_mapping, |
212 | nr_pages_dirtied); | ||
213 | fput(file); | ||
214 | down_read(¤t->mm->mmap_sem); | ||
215 | vma = find_vma(current->mm, start); | ||
216 | } else if ((flags & MS_SYNC) && file && | ||
217 | (vma->vm_flags & VM_SHARED)) { | ||
218 | get_file(file); | ||
219 | up_read(¤t->mm->mmap_sem); | ||
220 | error = do_fsync(file, 0); | ||
221 | fput(file); | ||
222 | down_read(¤t->mm->mmap_sem); | ||
223 | if (error) | ||
224 | goto out_unlock; | ||
225 | vma = find_vma(current->mm, start); | ||
226 | } else { | ||
227 | vma = vma->vm_next; | ||
228 | } | ||
229 | } while (vma && !done); | ||
230 | out_unlock: | ||
214 | current->flags &= ~PF_SYNCWRITE; | 231 | current->flags &= ~PF_SYNCWRITE; |
232 | up_read(¤t->mm->mmap_sem); | ||
233 | out: | ||
215 | return error; | 234 | return error; |
216 | } | 235 | } |
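A hedged userspace sketch of the division of labour described in the comment in this file: msync(MS_ASYNC) only propagates pte dirty bits into the page cache, and the application starts I/O itself when it wants to. The function name is made up.

/* Hedged userspace sketch; flush_mapped_range() is a hypothetical helper. */
#include <sys/mman.h>
#include <fcntl.h>

static int flush_mapped_range(void *addr, size_t maplen,
			      int fd, off_t off, off_t len)
{
	/* propagate pte dirty bits into the page cache; starts no I/O */
	if (msync(addr, maplen, MS_ASYNC))
		return -1;
	/* start async writeout now: the comment names FADV_DONTNEED, while
	 * LINUX_FADV_ASYNC_WRITE (added earlier in this diff) is the variant
	 * that keeps the pages cached.  Returns 0 or a positive errno. */
	return posix_fadvise(fd, off, len, POSIX_FADV_DONTNEED);
}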
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d2..893d7677579e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40; | |||
75 | * The interval between `kupdate'-style writebacks, in centiseconds | 75 | * The interval between `kupdate'-style writebacks, in centiseconds |
76 | * (hundredths of a second) | 76 | * (hundredths of a second) |
77 | */ | 77 | */ |
78 | int dirty_writeback_centisecs = 5 * 100; | 78 | int dirty_writeback_interval = 5 * HZ; |
79 | 79 | ||
80 | /* | 80 | /* |
81 | * The longest number of centiseconds for which data is allowed to remain dirty | 81 | * The longest number of centiseconds for which data is allowed to remain dirty |
82 | */ | 82 | */ |
83 | int dirty_expire_centisecs = 30 * 100; | 83 | int dirty_expire_interval = 30 * HZ; |
84 | 84 | ||
85 | /* | 85 | /* |
86 | * Flag that makes the machine dump writes/reads and block dirtyings. | 86 | * Flag that makes the machine dump writes/reads and block dirtyings. |
@@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100; | |||
88 | int block_dump; | 88 | int block_dump; |
89 | 89 | ||
90 | /* | 90 | /* |
91 | * Flag that puts the machine in "laptop mode". | 91 | * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: |
92 | * a full sync is triggered after this time elapses without any disk activity. | ||
92 | */ | 93 | */ |
93 | int laptop_mode; | 94 | int laptop_mode; |
94 | 95 | ||
@@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
255 | } | 256 | } |
256 | 257 | ||
257 | /** | 258 | /** |
258 | * balance_dirty_pages_ratelimited - balance dirty memory state | 259 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
259 | * @mapping: address_space which was dirtied | 260 | * @mapping: address_space which was dirtied |
261 | * @nr_pages: number of pages which the caller has just dirtied | ||
260 | * | 262 | * |
261 | * Processes which are dirtying memory should call in here once for each page | 263 | * Processes which are dirtying memory should call in here once for each page |
262 | * which was newly dirtied. The function will periodically check the system's | 264 | * which was newly dirtied. The function will periodically check the system's |
@@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
267 | * limit we decrease the ratelimiting by a lot, to prevent individual processes | 269 | * limit we decrease the ratelimiting by a lot, to prevent individual processes |
268 | * from overshooting the limit by (ratelimit_pages) each. | 270 | * from overshooting the limit by (ratelimit_pages) each. |
269 | */ | 271 | */ |
270 | void balance_dirty_pages_ratelimited(struct address_space *mapping) | 272 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
273 | unsigned long nr_pages_dirtied) | ||
271 | { | 274 | { |
272 | static DEFINE_PER_CPU(int, ratelimits) = 0; | 275 | static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; |
273 | long ratelimit; | 276 | unsigned long ratelimit; |
277 | unsigned long *p; | ||
274 | 278 | ||
275 | ratelimit = ratelimit_pages; | 279 | ratelimit = ratelimit_pages; |
276 | if (dirty_exceeded) | 280 | if (dirty_exceeded) |
@@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) | |||
280 | * Check the rate limiting. Also, we do not want to throttle real-time | 284 | * Check the rate limiting. Also, we do not want to throttle real-time |
281 | * tasks in balance_dirty_pages(). Period. | 285 | * tasks in balance_dirty_pages(). Period. |
282 | */ | 286 | */ |
283 | if (get_cpu_var(ratelimits)++ >= ratelimit) { | 287 | preempt_disable(); |
284 | __get_cpu_var(ratelimits) = 0; | 288 | p = &__get_cpu_var(ratelimits); |
285 | put_cpu_var(ratelimits); | 289 | *p += nr_pages_dirtied; |
290 | if (unlikely(*p >= ratelimit)) { | ||
291 | *p = 0; | ||
292 | preempt_enable(); | ||
286 | balance_dirty_pages(mapping); | 293 | balance_dirty_pages(mapping); |
287 | return; | 294 | return; |
288 | } | 295 | } |
289 | put_cpu_var(ratelimits); | 296 | preempt_enable(); |
290 | } | 297 | } |
291 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited); | 298 | EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); |
292 | 299 | ||
293 | void throttle_vm_writeout(void) | 300 | void throttle_vm_writeout(void) |
294 | { | 301 | { |
@@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | |||
380 | * just walks the superblock inode list, writing back any inodes which are | 387 | * just walks the superblock inode list, writing back any inodes which are |
381 | * older than a specific point in time. | 388 | * older than a specific point in time. |
382 | * | 389 | * |
383 | * Try to run once per dirty_writeback_centisecs. But if a writeback event | 390 | * Try to run once per dirty_writeback_interval. But if a writeback event |
384 | * takes longer than a dirty_writeback_centisecs interval, then leave a | 391 | * takes longer than a dirty_writeback_interval interval, then leave a |
385 | * one-second gap. | 392 | * one-second gap. |
386 | * | 393 | * |
387 | * older_than_this takes precedence over nr_to_write. So we'll only write back | 394 | * older_than_this takes precedence over nr_to_write. So we'll only write back |
@@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg) | |||
406 | sync_supers(); | 413 | sync_supers(); |
407 | 414 | ||
408 | get_writeback_state(&wbs); | 415 | get_writeback_state(&wbs); |
409 | oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; | 416 | oldest_jif = jiffies - dirty_expire_interval; |
410 | start_jif = jiffies; | 417 | start_jif = jiffies; |
411 | next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; | 418 | next_jif = start_jif + dirty_writeback_interval; |
412 | nr_to_write = wbs.nr_dirty + wbs.nr_unstable + | 419 | nr_to_write = wbs.nr_dirty + wbs.nr_unstable + |
413 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 420 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
414 | while (nr_to_write > 0) { | 421 | while (nr_to_write > 0) { |
@@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg) | |||
425 | } | 432 | } |
426 | if (time_before(next_jif, jiffies + HZ)) | 433 | if (time_before(next_jif, jiffies + HZ)) |
427 | next_jif = jiffies + HZ; | 434 | next_jif = jiffies + HZ; |
428 | if (dirty_writeback_centisecs) | 435 | if (dirty_writeback_interval) |
429 | mod_timer(&wb_timer, next_jif); | 436 | mod_timer(&wb_timer, next_jif); |
430 | } | 437 | } |
431 | 438 | ||
@@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg) | |||
435 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | 442 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, |
436 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 443 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
437 | { | 444 | { |
438 | proc_dointvec(table, write, file, buffer, length, ppos); | 445 | proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); |
439 | if (dirty_writeback_centisecs) { | 446 | if (dirty_writeback_interval) { |
440 | mod_timer(&wb_timer, | 447 | mod_timer(&wb_timer, |
441 | jiffies + (dirty_writeback_centisecs * HZ) / 100); | 448 | jiffies + dirty_writeback_interval); |
442 | } else { | 449 | } else { |
443 | del_timer(&wb_timer); | 450 | del_timer(&wb_timer); |
444 | } | 451 | } |
445 | return 0; | 452 | return 0; |
@@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused) | |||
468 | */ | 475 | */ |
469 | void laptop_io_completion(void) | 476 | void laptop_io_completion(void) |
470 | { | 477 | { |
471 | mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); | 478 | mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); |
472 | } | 479 | } |
473 | 480 | ||
474 | /* | 481 | /* |
@@ -544,7 +551,7 @@ void __init page_writeback_init(void) | |||
544 | if (vm_dirty_ratio <= 0) | 551 | if (vm_dirty_ratio <= 0) |
545 | vm_dirty_ratio = 1; | 552 | vm_dirty_ratio = 1; |
546 | } | 553 | } |
547 | mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); | 554 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); |
548 | set_ratelimit(); | 555 | set_ratelimit(); |
549 | register_cpu_notifier(&ratelimit_nb); | 556 | register_cpu_notifier(&ratelimit_nb); |
550 | } | 557 | } |
@@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page); | |||
621 | */ | 628 | */ |
622 | int __set_page_dirty_nobuffers(struct page *page) | 629 | int __set_page_dirty_nobuffers(struct page *page) |
623 | { | 630 | { |
624 | int ret = 0; | ||
625 | |||
626 | if (!TestSetPageDirty(page)) { | 631 | if (!TestSetPageDirty(page)) { |
627 | struct address_space *mapping = page_mapping(page); | 632 | struct address_space *mapping = page_mapping(page); |
628 | struct address_space *mapping2; | 633 | struct address_space *mapping2; |
@@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
644 | I_DIRTY_PAGES); | 649 | I_DIRTY_PAGES); |
645 | } | 650 | } |
646 | } | 651 | } |
652 | return 1; | ||
647 | } | 653 | } |
648 | return ret; | 654 | return 0; |
649 | } | 655 | } |
650 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); | 656 | EXPORT_SYMBOL(__set_page_dirty_nobuffers); |
651 | 657 | ||
@@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page) | |||
675 | return (*spd)(page); | 681 | return (*spd)(page); |
676 | return __set_page_dirty_buffers(page); | 682 | return __set_page_dirty_buffers(page); |
677 | } | 683 | } |
678 | if (!PageDirty(page)) | 684 | if (!PageDirty(page)) { |
679 | SetPageDirty(page); | 685 | if (!TestSetPageDirty(page)) |
686 | return 1; | ||
687 | } | ||
680 | return 0; | 688 | return 0; |
681 | } | 689 | } |
682 | EXPORT_SYMBOL(set_page_dirty); | 690 | EXPORT_SYMBOL(set_page_dirty); |
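Since the per-CPU counter now advances by nr_pages_dirtied, the old single-page entry point reduces to a one-line wrapper. The sketch below shows the expected shape; the wrapper is assumed to live in include/linux/writeback.h and is not part of this hunk.

/* Hedged sketch of the single-page API kept as a wrapper around the new
 * _nr variant (assumed header placement, not shown in this file). */
static inline void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
	balance_dirty_pages_ratelimited_nr(mapping, 1);
}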
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b7f14a4799a5..338a02bb004d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -943,7 +943,8 @@ restart: | |||
943 | goto got_pg; | 943 | goto got_pg; |
944 | 944 | ||
945 | do { | 945 | do { |
946 | wakeup_kswapd(*z, order); | 946 | if (cpuset_zone_allowed(*z, gfp_mask)) |
947 | wakeup_kswapd(*z, order); | ||
947 | } while (*(++z)); | 948 | } while (*(++z)); |
948 | 949 | ||
949 | /* | 950 | /* |
@@ -2028,8 +2029,9 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
2028 | setup_pageset(zone_pcp(zone,cpu), batch); | 2029 | setup_pageset(zone_pcp(zone,cpu), batch); |
2029 | #endif | 2030 | #endif |
2030 | } | 2031 | } |
2031 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 2032 | if (zone->present_pages) |
2032 | zone->name, zone->present_pages, batch); | 2033 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", |
2034 | zone->name, zone->present_pages, batch); | ||
2033 | } | 2035 | } |
2034 | 2036 | ||
2035 | static __meminit void init_currently_empty_zone(struct zone *zone, | 2037 | static __meminit void init_currently_empty_zone(struct zone *zone, |
@@ -2700,8 +2702,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2700 | else | 2702 | else |
2701 | numentries <<= (PAGE_SHIFT - scale); | 2703 | numentries <<= (PAGE_SHIFT - scale); |
2702 | } | 2704 | } |
2703 | /* rounded up to nearest power of 2 in size */ | 2705 | numentries = roundup_pow_of_two(numentries); |
2704 | numentries = 1UL << (long_log2(numentries) + 1); | ||
2705 | 2706 | ||
2706 | /* limit allocation size to 1/16 total memory by default */ | 2707 | /* limit allocation size to 1/16 total memory by default */ |
2707 | if (max == 0) { | 2708 | if (max == 0) { |
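The roundup_pow_of_two() conversion is more than cosmetic: the old expression always rounds up past an exact power of two, doubling the hash table in that case. A worked comparison follows; it is illustrative only, not from the patch, and the header locations of the two helpers are assumed.

/* Illustrative comparison of the old and new rounding. */
#include <linux/bitops.h>	/* long_log2(), header location assumed */
#include <linux/kernel.h>	/* roundup_pow_of_two(), location assumed */

static unsigned long old_round(unsigned long n)
{
	return 1UL << (long_log2(n) + 1);	/* the expression being removed */
}
/*
 *	old_round(4096)          == 8192   (an exact power of two is doubled)
 *	roundup_pow_of_two(4096) == 4096
 *	old_round(5000)          == 8192
 *	roundup_pow_of_two(5000) == 8192
 */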
diff --git a/mm/slab.c b/mm/slab.c
@@ -94,6 +94,7 @@ | |||
94 | #include <linux/interrupt.h> | 94 | #include <linux/interrupt.h> |
95 | #include <linux/init.h> | 95 | #include <linux/init.h> |
96 | #include <linux/compiler.h> | 96 | #include <linux/compiler.h> |
97 | #include <linux/cpuset.h> | ||
97 | #include <linux/seq_file.h> | 98 | #include <linux/seq_file.h> |
98 | #include <linux/notifier.h> | 99 | #include <linux/notifier.h> |
99 | #include <linux/kallsyms.h> | 100 | #include <linux/kallsyms.h> |
@@ -173,12 +174,12 @@ | |||
173 | SLAB_CACHE_DMA | \ | 174 | SLAB_CACHE_DMA | \ |
174 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ | 175 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ |
175 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 176 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
176 | SLAB_DESTROY_BY_RCU) | 177 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) |
177 | #else | 178 | #else |
178 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ | 179 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
179 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ | 180 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ |
180 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 181 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
181 | SLAB_DESTROY_BY_RCU) | 182 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD) |
182 | #endif | 183 | #endif |
183 | 184 | ||
184 | /* | 185 | /* |
@@ -203,7 +204,8 @@ | |||
203 | typedef unsigned int kmem_bufctl_t; | 204 | typedef unsigned int kmem_bufctl_t; |
204 | #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) | 205 | #define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) |
205 | #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) | 206 | #define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) |
206 | #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) | 207 | #define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) |
208 | #define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) | ||
207 | 209 | ||
208 | /* Max number of objs-per-slab for caches which use off-slab slabs. | 210 | /* Max number of objs-per-slab for caches which use off-slab slabs. |
209 | * Needed to avoid a possible looping condition in cache_grow(). | 211 | * Needed to avoid a possible looping condition in cache_grow(). |
@@ -896,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
896 | return nc; | 898 | return nc; |
897 | } | 899 | } |
898 | 900 | ||
901 | /* | ||
902 | * Transfer objects in one arraycache to another. | ||
903 | * Locking must be handled by the caller. | ||
904 | * | ||
905 | * Return the number of entries transferred. | ||
906 | */ | ||
907 | static int transfer_objects(struct array_cache *to, | ||
908 | struct array_cache *from, unsigned int max) | ||
909 | { | ||
910 | /* Figure out how many entries to transfer */ | ||
911 | int nr = min(min(from->avail, max), to->limit - to->avail); | ||
912 | |||
913 | if (!nr) | ||
914 | return 0; | ||
915 | |||
916 | memcpy(to->entry + to->avail, from->entry + from->avail -nr, | ||
917 | sizeof(void *) *nr); | ||
918 | |||
919 | from->avail -= nr; | ||
920 | to->avail += nr; | ||
921 | to->touched = 1; | ||
922 | return nr; | ||
923 | } | ||
924 | |||
899 | #ifdef CONFIG_NUMA | 925 | #ifdef CONFIG_NUMA |
900 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); | 926 | static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); |
927 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | ||
901 | 928 | ||
902 | static struct array_cache **alloc_alien_cache(int node, int limit) | 929 | static struct array_cache **alloc_alien_cache(int node, int limit) |
903 | { | 930 | { |
@@ -944,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
944 | 971 | ||
945 | if (ac->avail) { | 972 | if (ac->avail) { |
946 | spin_lock(&rl3->list_lock); | 973 | spin_lock(&rl3->list_lock); |
974 | /* | ||
975 | * Stuff objects into the remote nodes shared array first. | ||
976 | * That way we could avoid the overhead of putting the objects | ||
977 | * into the free lists and getting them back later. | ||
978 | */ | ||
979 | transfer_objects(rl3->shared, ac, ac->limit); | ||
980 | |||
947 | free_block(cachep, ac->entry, ac->avail, node); | 981 | free_block(cachep, ac->entry, ac->avail, node); |
948 | ac->avail = 0; | 982 | ac->avail = 0; |
949 | spin_unlock(&rl3->list_lock); | 983 | spin_unlock(&rl3->list_lock); |
@@ -959,8 +993,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | |||
959 | 993 | ||
960 | if (l3->alien) { | 994 | if (l3->alien) { |
961 | struct array_cache *ac = l3->alien[node]; | 995 | struct array_cache *ac = l3->alien[node]; |
962 | if (ac && ac->avail) { | 996 | |
963 | spin_lock_irq(&ac->lock); | 997 | if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { |
964 | __drain_alien_cache(cachep, ac, node); | 998 | __drain_alien_cache(cachep, ac, node); |
965 | spin_unlock_irq(&ac->lock); | 999 | spin_unlock_irq(&ac->lock); |
966 | } | 1000 | } |
@@ -1987,10 +2021,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1987 | align = ralign; | 2021 | align = ralign; |
1988 | 2022 | ||
1989 | /* Get cache's description obj. */ | 2023 | /* Get cache's description obj. */ |
1990 | cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); | 2024 | cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL); |
1991 | if (!cachep) | 2025 | if (!cachep) |
1992 | goto oops; | 2026 | goto oops; |
1993 | memset(cachep, 0, sizeof(struct kmem_cache)); | ||
1994 | 2027 | ||
1995 | #if DEBUG | 2028 | #if DEBUG |
1996 | cachep->obj_size = size; | 2029 | cachep->obj_size = size; |
@@ -2397,7 +2430,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, | |||
2397 | /* Verify that the slab belongs to the intended node */ | 2430 | /* Verify that the slab belongs to the intended node */ |
2398 | WARN_ON(slabp->nodeid != nodeid); | 2431 | WARN_ON(slabp->nodeid != nodeid); |
2399 | 2432 | ||
2400 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { | 2433 | if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { |
2401 | printk(KERN_ERR "slab: double free detected in cache " | 2434 | printk(KERN_ERR "slab: double free detected in cache " |
2402 | "'%s', objp %p\n", cachep->name, objp); | 2435 | "'%s', objp %p\n", cachep->name, objp); |
2403 | BUG(); | 2436 | BUG(); |
@@ -2603,6 +2636,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2603 | */ | 2636 | */ |
2604 | cachep->dtor(objp + obj_offset(cachep), cachep, 0); | 2637 | cachep->dtor(objp + obj_offset(cachep), cachep, 0); |
2605 | } | 2638 | } |
2639 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
2640 | slab_bufctl(slabp)[objnr] = BUFCTL_FREE; | ||
2641 | #endif | ||
2606 | if (cachep->flags & SLAB_POISON) { | 2642 | if (cachep->flags & SLAB_POISON) { |
2607 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2643 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2608 | if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | 2644 | if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
@@ -2675,20 +2711,10 @@ retry: | |||
2675 | BUG_ON(ac->avail > 0 || !l3); | 2711 | BUG_ON(ac->avail > 0 || !l3); |
2676 | spin_lock(&l3->list_lock); | 2712 | spin_lock(&l3->list_lock); |
2677 | 2713 | ||
2678 | if (l3->shared) { | 2714 | /* See if we can refill from the shared array */ |
2679 | struct array_cache *shared_array = l3->shared; | 2715 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) |
2680 | if (shared_array->avail) { | 2716 | goto alloc_done; |
2681 | if (batchcount > shared_array->avail) | 2717 | |
2682 | batchcount = shared_array->avail; | ||
2683 | shared_array->avail -= batchcount; | ||
2684 | ac->avail = batchcount; | ||
2685 | memcpy(ac->entry, | ||
2686 | &(shared_array->entry[shared_array->avail]), | ||
2687 | sizeof(void *) * batchcount); | ||
2688 | shared_array->touched = 1; | ||
2689 | goto alloc_done; | ||
2690 | } | ||
2691 | } | ||
2692 | while (batchcount > 0) { | 2718 | while (batchcount > 0) { |
2693 | struct list_head *entry; | 2719 | struct list_head *entry; |
2694 | struct slab *slabp; | 2720 | struct slab *slabp; |
@@ -2786,6 +2812,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
2786 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | 2812 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; |
2787 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 2813 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
2788 | } | 2814 | } |
2815 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
2816 | { | ||
2817 | struct slab *slabp; | ||
2818 | unsigned objnr; | ||
2819 | |||
2820 | slabp = page_get_slab(virt_to_page(objp)); | ||
2821 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | ||
2822 | slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; | ||
2823 | } | ||
2824 | #endif | ||
2789 | objp += obj_offset(cachep); | 2825 | objp += obj_offset(cachep); |
2790 | if (cachep->ctor && cachep->flags & SLAB_POISON) { | 2826 | if (cachep->ctor && cachep->flags & SLAB_POISON) { |
2791 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; | 2827 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; |
@@ -2807,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
2807 | struct array_cache *ac; | 2843 | struct array_cache *ac; |
2808 | 2844 | ||
2809 | #ifdef CONFIG_NUMA | 2845 | #ifdef CONFIG_NUMA |
2810 | if (unlikely(current->mempolicy && !in_interrupt())) { | 2846 | if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { |
2811 | int nid = slab_node(current->mempolicy); | 2847 | objp = alternate_node_alloc(cachep, flags); |
2812 | 2848 | if (objp != NULL) | |
2813 | if (nid != numa_node_id()) | 2849 | return objp; |
2814 | return __cache_alloc_node(cachep, flags, nid); | ||
2815 | } | 2850 | } |
2816 | #endif | 2851 | #endif |
2817 | 2852 | ||
@@ -2847,6 +2882,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep, | |||
2847 | 2882 | ||
2848 | #ifdef CONFIG_NUMA | 2883 | #ifdef CONFIG_NUMA |
2849 | /* | 2884 | /* |
2885 | * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. | ||
2886 | * | ||
2887 | * If we are in_interrupt, then process context, including cpusets and | ||
2888 | * mempolicy, may not apply and should not be used for allocation policy. | ||
2889 | */ | ||
2890 | static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | ||
2891 | { | ||
2892 | int nid_alloc, nid_here; | ||
2893 | |||
2894 | if (in_interrupt()) | ||
2895 | return NULL; | ||
2896 | nid_alloc = nid_here = numa_node_id(); | ||
2897 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | ||
2898 | nid_alloc = cpuset_mem_spread_node(); | ||
2899 | else if (current->mempolicy) | ||
2900 | nid_alloc = slab_node(current->mempolicy); | ||
2901 | if (nid_alloc != nid_here) | ||
2902 | return __cache_alloc_node(cachep, flags, nid_alloc); | ||
2903 | return NULL; | ||
2904 | } | ||
2905 | |||
2906 | /* | ||
2850 | * A interface to enable slab creation on nodeid | 2907 | * A interface to enable slab creation on nodeid |
2851 | */ | 2908 | */ |
2852 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, | 2909 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
@@ -3071,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3071 | EXPORT_SYMBOL(kmem_cache_alloc); | 3128 | EXPORT_SYMBOL(kmem_cache_alloc); |
3072 | 3129 | ||
3073 | /** | 3130 | /** |
3131 | * kmem_cache_zalloc - Allocate an object. The memory is set to zero. ||
3132 | * @cache: The cache to allocate from. | ||
3133 | * @flags: See kmalloc(). | ||
3134 | * | ||
3135 | * Allocate an object from this cache and set the allocated memory to zero. | ||
3136 | * The flags are only relevant if the cache has no available objects. | ||
3137 | */ | ||
3138 | void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags) | ||
3139 | { | ||
3140 | void *ret = __cache_alloc(cache, flags, __builtin_return_address(0)); | ||
3141 | if (ret) | ||
3142 | memset(ret, 0, obj_size(cache)); | ||
3143 | return ret; | ||
3144 | } | ||
3145 | EXPORT_SYMBOL(kmem_cache_zalloc); | ||
3146 | |||
3147 | /** | ||
3074 | * kmem_ptr_validate - check if an untrusted pointer might | 3148 | * kmem_ptr_validate - check if an untrusted pointer might |
3075 | * be a slab entry. | 3149 | * be a slab entry. |
3076 | * @cachep: the cache we're checking against | 3150 | * @cachep: the cache we're checking against |
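A minimal usage sketch of the new helper; the structure and cache names are hypothetical. It mirrors the mm/mmap.c conversion earlier in this diff, where kmem_cache_alloc() plus memset() collapses into a single call.

/* Hedged usage sketch; "struct foo" and "foo_cachep" are made up. */
#include <linux/slab.h>

struct foo {
	int state;
};

static struct kmem_cache *foo_cachep;	/* created elsewhere with kmem_cache_create() */

static struct foo *foo_alloc(void)
{
	/* returns a fully zeroed object, or NULL if allocation fails */
	return kmem_cache_zalloc(foo_cachep, GFP_KERNEL);
}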
@@ -3197,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3197 | return __cache_alloc(cachep, flags, caller); | 3271 | return __cache_alloc(cachep, flags, caller); |
3198 | } | 3272 | } |
3199 | 3273 | ||
3200 | #ifndef CONFIG_DEBUG_SLAB | ||
3201 | 3274 | ||
3202 | void *__kmalloc(size_t size, gfp_t flags) | 3275 | void *__kmalloc(size_t size, gfp_t flags) |
3203 | { | 3276 | { |
3277 | #ifndef CONFIG_DEBUG_SLAB | ||
3204 | return __do_kmalloc(size, flags, NULL); | 3278 | return __do_kmalloc(size, flags, NULL); |
3279 | #else | ||
3280 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | ||
3281 | #endif | ||
3205 | } | 3282 | } |
3206 | EXPORT_SYMBOL(__kmalloc); | 3283 | EXPORT_SYMBOL(__kmalloc); |
3207 | 3284 | ||
3208 | #else | 3285 | #ifdef CONFIG_DEBUG_SLAB |
3209 | |||
3210 | void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) | 3286 | void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) |
3211 | { | 3287 | { |
3212 | return __do_kmalloc(size, flags, caller); | 3288 | return __do_kmalloc(size, flags, caller); |
3213 | } | 3289 | } |
3214 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3290 | EXPORT_SYMBOL(__kmalloc_track_caller); |
3215 | |||
3216 | #endif | 3291 | #endif |
3217 | 3292 | ||
3218 | #ifdef CONFIG_SMP | 3293 | #ifdef CONFIG_SMP |
@@ -3343,63 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep) | |||
3343 | EXPORT_SYMBOL_GPL(kmem_cache_name); | 3418 | EXPORT_SYMBOL_GPL(kmem_cache_name); |
3344 | 3419 | ||
3345 | /* | 3420 | /* |
3346 | * This initializes kmem_list3 for all nodes. | 3421 | * This initializes kmem_list3 or resizes various caches for all nodes. |
3347 | */ | 3422 | */ |
3348 | static int alloc_kmemlist(struct kmem_cache *cachep) | 3423 | static int alloc_kmemlist(struct kmem_cache *cachep) |
3349 | { | 3424 | { |
3350 | int node; | 3425 | int node; |
3351 | struct kmem_list3 *l3; | 3426 | struct kmem_list3 *l3; |
3352 | int err = 0; | 3427 | struct array_cache *new_shared; |
3428 | struct array_cache **new_alien; | ||
3353 | 3429 | ||
3354 | for_each_online_node(node) { | 3430 | for_each_online_node(node) { |
3355 | struct array_cache *nc = NULL, *new; | 3431 | |
3356 | struct array_cache **new_alien = NULL; | ||
3357 | #ifdef CONFIG_NUMA | ||
3358 | new_alien = alloc_alien_cache(node, cachep->limit); | 3432 | new_alien = alloc_alien_cache(node, cachep->limit); |
3359 | if (!new_alien) | 3433 | if (!new_alien) |
3360 | goto fail; | 3434 | goto fail; |
3361 | #endif | 3435 | |
3362 | new = alloc_arraycache(node, cachep->shared*cachep->batchcount, | 3436 | new_shared = alloc_arraycache(node, |
3437 | cachep->shared*cachep->batchcount, | ||
3363 | 0xbaadf00d); | 3438 | 0xbaadf00d); |
3364 | if (!new) | 3439 | if (!new_shared) { |
3440 | free_alien_cache(new_alien); | ||
3365 | goto fail; | 3441 | goto fail; |
3442 | } | ||
3443 | |||
3366 | l3 = cachep->nodelists[node]; | 3444 | l3 = cachep->nodelists[node]; |
3367 | if (l3) { | 3445 | if (l3) { |
3446 | struct array_cache *shared = l3->shared; | ||
3447 | |||
3368 | spin_lock_irq(&l3->list_lock); | 3448 | spin_lock_irq(&l3->list_lock); |
3369 | 3449 | ||
3370 | nc = cachep->nodelists[node]->shared; | 3450 | if (shared) |
3371 | if (nc) | 3451 | free_block(cachep, shared->entry, |
3372 | free_block(cachep, nc->entry, nc->avail, node); | 3452 | shared->avail, node); |
3373 | 3453 | ||
3374 | l3->shared = new; | 3454 | l3->shared = new_shared; |
3375 | if (!cachep->nodelists[node]->alien) { | 3455 | if (!l3->alien) { |
3376 | l3->alien = new_alien; | 3456 | l3->alien = new_alien; |
3377 | new_alien = NULL; | 3457 | new_alien = NULL; |
3378 | } | 3458 | } |
3379 | l3->free_limit = (1 + nr_cpus_node(node)) * | 3459 | l3->free_limit = (1 + nr_cpus_node(node)) * |
3380 | cachep->batchcount + cachep->num; | 3460 | cachep->batchcount + cachep->num; |
3381 | spin_unlock_irq(&l3->list_lock); | 3461 | spin_unlock_irq(&l3->list_lock); |
3382 | kfree(nc); | 3462 | kfree(shared); |
3383 | free_alien_cache(new_alien); | 3463 | free_alien_cache(new_alien); |
3384 | continue; | 3464 | continue; |
3385 | } | 3465 | } |
3386 | l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); | 3466 | l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); |
3387 | if (!l3) | 3467 | if (!l3) { |
3468 | free_alien_cache(new_alien); | ||
3469 | kfree(new_shared); | ||
3388 | goto fail; | 3470 | goto fail; |
3471 | } | ||
3389 | 3472 | ||
3390 | kmem_list3_init(l3); | 3473 | kmem_list3_init(l3); |
3391 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 3474 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
3392 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | 3475 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
3393 | l3->shared = new; | 3476 | l3->shared = new_shared; |
3394 | l3->alien = new_alien; | 3477 | l3->alien = new_alien; |
3395 | l3->free_limit = (1 + nr_cpus_node(node)) * | 3478 | l3->free_limit = (1 + nr_cpus_node(node)) * |
3396 | cachep->batchcount + cachep->num; | 3479 | cachep->batchcount + cachep->num; |
3397 | cachep->nodelists[node] = l3; | 3480 | cachep->nodelists[node] = l3; |
3398 | } | 3481 | } |
3399 | return err; | 3482 | return 0; |
3483 | |||
3400 | fail: | 3484 | fail: |
3401 | err = -ENOMEM; | 3485 | if (!cachep->next.next) { |
3402 | return err; | 3486 | /* Cache is not active yet. Roll back what we did */ |
3487 | node--; | ||
3488 | while (node >= 0) { | ||
3489 | if (cachep->nodelists[node]) { | ||
3490 | l3 = cachep->nodelists[node]; | ||
3491 | |||
3492 | kfree(l3->shared); | ||
3493 | free_alien_cache(l3->alien); | ||
3494 | kfree(l3); | ||
3495 | cachep->nodelists[node] = NULL; | ||
3496 | } | ||
3497 | node--; | ||
3498 | } | ||
3499 | } | ||
3500 | return -ENOMEM; | ||
3403 | } | 3501 | } |
3404 | 3502 | ||
3405 | struct ccupdate_struct { | 3503 | struct ccupdate_struct { |
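The fail path above only unwinds the per-node allocations when cachep->next.next is still NULL, i.e. the cache has not yet been linked onto the cache chain, so no other CPU can be touching the nodelists being freed. A stripped-down, user-space sketch of the same allocate-per-node-then-roll-back pattern (names and sizes are illustrative):

#include <errno.h>
#include <stdlib.h>

#define NR_NODES 4

static void *nodelists[NR_NODES];

static int alloc_per_node(size_t size)
{
	int node;

	for (node = 0; node < NR_NODES; node++) {
		nodelists[node] = malloc(size);
		if (!nodelists[node])
			goto fail;
	}
	return 0;

fail:
	/* Roll back what we did, newest allocation first. */
	while (--node >= 0) {
		free(nodelists[node]);
		nodelists[node] = NULL;
	}
	return -ENOMEM;
}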
@@ -3876,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3876 | res = count; | 3974 | res = count; |
3877 | return res; | 3975 | return res; |
3878 | } | 3976 | } |
3977 | |||
3978 | #ifdef CONFIG_DEBUG_SLAB_LEAK | ||
3979 | |||
3980 | static void *leaks_start(struct seq_file *m, loff_t *pos) | ||
3981 | { | ||
3982 | loff_t n = *pos; | ||
3983 | struct list_head *p; | ||
3984 | |||
3985 | mutex_lock(&cache_chain_mutex); | ||
3986 | p = cache_chain.next; | ||
3987 | while (n--) { | ||
3988 | p = p->next; | ||
3989 | if (p == &cache_chain) | ||
3990 | return NULL; | ||
3991 | } | ||
3992 | return list_entry(p, struct kmem_cache, next); | ||
3993 | } | ||
3994 | |||
3995 | static inline int add_caller(unsigned long *n, unsigned long v) | ||
3996 | { | ||
3997 | unsigned long *p; | ||
3998 | int l; | ||
3999 | if (!v) | ||
4000 | return 1; | ||
4001 | l = n[1]; | ||
4002 | p = n + 2; | ||
4003 | while (l) { | ||
4004 | int i = l/2; | ||
4005 | unsigned long *q = p + 2 * i; | ||
4006 | if (*q == v) { | ||
4007 | q[1]++; | ||
4008 | return 1; | ||
4009 | } | ||
4010 | if (*q > v) { | ||
4011 | l = i; | ||
4012 | } else { | ||
4013 | p = q + 2; | ||
4014 | l -= i + 1; | ||
4015 | } | ||
4016 | } | ||
4017 | if (++n[1] == n[0]) | ||
4018 | return 0; | ||
4019 | memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); | ||
4020 | p[0] = v; | ||
4021 | p[1] = 1; | ||
4022 | return 1; | ||
4023 | } | ||
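add_caller() above treats m->private as a flat array: n[0] is the capacity in (caller, count) pairs, n[1] is how many pairs are in use, and the pairs themselves follow, kept sorted by caller address so a binary search can either bump an existing count or splice in a new pair with memmove(). A self-contained user-space sketch of the same bookkeeping (simplified: it checks for a full table before inserting rather than signalling afterwards):

#include <stdio.h>
#include <string.h>

/*
 * n[0] = capacity in pairs, n[1] = pairs in use, then (key, count)
 * pairs sorted by key. Returns 0 when the table is full, 1 otherwise.
 */
static int add_key(unsigned long *n, unsigned long v)
{
	unsigned long *p = n + 2;
	unsigned long l = n[1];

	if (!v)
		return 1;
	while (l) {
		unsigned long i = l / 2;
		unsigned long *q = p + 2 * i;

		if (*q == v) {			/* already present: bump count */
			q[1]++;
			return 1;
		}
		if (*q > v) {
			l = i;			/* search the lower half */
		} else {
			p = q + 2;		/* search the upper half */
			l -= i + 1;
		}
	}
	if (n[1] == n[0])
		return 0;			/* full: caller must grow the buffer */
	n[1]++;
	/* shift the tail up by one pair and insert the new key at p */
	memmove(p + 2, p, (char *)(n + 2 + 2 * (n[1] - 1)) - (char *)p);
	p[0] = v;
	p[1] = 1;
	return 1;
}

int main(void)
{
	unsigned long table[2 + 2 * 8] = { 8, 0 };	/* room for 8 pairs */
	unsigned long samples[] = { 30, 10, 30, 20, 10, 30 };
	unsigned long i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		add_key(table, samples[i]);
	for (i = 0; i < table[1]; i++)
		printf("key %lu seen %lu times\n",
		       table[2 + 2 * i], table[2 + 2 * i + 1]);
	return 0;
}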
4024 | |||
4025 | static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) | ||
4026 | { | ||
4027 | void *p; | ||
4028 | int i; | ||
4029 | if (n[0] == n[1]) | ||
4030 | return; | ||
4031 | for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { | ||
4032 | if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) | ||
4033 | continue; | ||
4034 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) | ||
4035 | return; | ||
4036 | } | ||
4037 | } | ||
4038 | |||
4039 | static void show_symbol(struct seq_file *m, unsigned long address) | ||
4040 | { | ||
4041 | #ifdef CONFIG_KALLSYMS | ||
4042 | char *modname; | ||
4043 | const char *name; | ||
4044 | unsigned long offset, size; | ||
4045 | char namebuf[KSYM_NAME_LEN+1]; | ||
4046 | |||
4047 | name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); | ||
4048 | |||
4049 | if (name) { | ||
4050 | seq_printf(m, "%s+%#lx/%#lx", name, offset, size); | ||
4051 | if (modname) | ||
4052 | seq_printf(m, " [%s]", modname); | ||
4053 | return; | ||
4054 | } | ||
4055 | #endif | ||
4056 | seq_printf(m, "%p", (void *)address); | ||
4057 | } | ||
4058 | |||
4059 | static int leaks_show(struct seq_file *m, void *p) | ||
4060 | { | ||
4061 | struct kmem_cache *cachep = p; | ||
4062 | struct list_head *q; | ||
4063 | struct slab *slabp; | ||
4064 | struct kmem_list3 *l3; | ||
4065 | const char *name; | ||
4066 | unsigned long *n = m->private; | ||
4067 | int node; | ||
4068 | int i; | ||
4069 | |||
4070 | if (!(cachep->flags & SLAB_STORE_USER)) | ||
4071 | return 0; | ||
4072 | if (!(cachep->flags & SLAB_RED_ZONE)) | ||
4073 | return 0; | ||
4074 | |||
4075 | /* OK, we can do it */ | ||
4076 | |||
4077 | n[1] = 0; | ||
4078 | |||
4079 | for_each_online_node(node) { | ||
4080 | l3 = cachep->nodelists[node]; | ||
4081 | if (!l3) | ||
4082 | continue; | ||
4083 | |||
4084 | check_irq_on(); | ||
4085 | spin_lock_irq(&l3->list_lock); | ||
4086 | |||
4087 | list_for_each(q, &l3->slabs_full) { | ||
4088 | slabp = list_entry(q, struct slab, list); | ||
4089 | handle_slab(n, cachep, slabp); | ||
4090 | } | ||
4091 | list_for_each(q, &l3->slabs_partial) { | ||
4092 | slabp = list_entry(q, struct slab, list); | ||
4093 | handle_slab(n, cachep, slabp); | ||
4094 | } | ||
4095 | spin_unlock_irq(&l3->list_lock); | ||
4096 | } | ||
4097 | name = cachep->name; | ||
4098 | if (n[0] == n[1]) { | ||
4099 | /* Increase the buffer size */ | ||
4100 | mutex_unlock(&cache_chain_mutex); | ||
4101 | m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); | ||
4102 | if (!m->private) { | ||
4103 | /* Too bad, we are really out */ | ||
4104 | m->private = n; | ||
4105 | mutex_lock(&cache_chain_mutex); | ||
4106 | return -ENOMEM; | ||
4107 | } | ||
4108 | *(unsigned long *)m->private = n[0] * 2; | ||
4109 | kfree(n); | ||
4110 | mutex_lock(&cache_chain_mutex); | ||
4111 | /* Now make sure this entry will be retried */ | ||
4112 | m->count = m->size; | ||
4113 | return 0; | ||
4114 | } | ||
4115 | for (i = 0; i < n[1]; i++) { | ||
4116 | seq_printf(m, "%s: %lu ", name, n[2*i+3]); | ||
4117 | show_symbol(m, n[2*i+2]); | ||
4118 | seq_putc(m, '\n'); | ||
4119 | } | ||
4120 | return 0; | ||
4121 | } | ||
4122 | |||
4123 | struct seq_operations slabstats_op = { | ||
4124 | .start = leaks_start, | ||
4125 | .next = s_next, | ||
4126 | .stop = s_stop, | ||
4127 | .show = leaks_show, | ||
4128 | }; | ||
4129 | #endif | ||
3879 | #endif | 4130 | #endif |
3880 | 4131 | ||
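The seq_operations above feed a procfs file (in mainline kernels this surfaced as /proc/slab_allocators, although only the seq_file half is visible in this hunk). From the seq_printf() formats in leaks_show() and show_symbol(), each output line has the shape

	<cache name>: <allocation count> <symbol>+0x<offset>/0x<size> [<module>]

with a bare pointer value printed in place of the symbol when CONFIG_KALLSYMS is not available.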
3881 | /** | 4132 | /** |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) | |||
294 | } | 294 | } |
295 | EXPORT_SYMBOL(kmem_cache_alloc); | 295 | EXPORT_SYMBOL(kmem_cache_alloc); |
296 | 296 | ||
297 | void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) | ||
298 | { | ||
299 | void *ret = kmem_cache_alloc(c, flags); | ||
300 | if (ret) | ||
301 | memset(ret, 0, c->size); | ||
302 | |||
303 | return ret; | ||
304 | } | ||
305 | EXPORT_SYMBOL(kmem_cache_zalloc); | ||
306 | |||
297 | void kmem_cache_free(struct kmem_cache *c, void *b) | 307 | void kmem_cache_free(struct kmem_cache *c, void *b) |
298 | { | 308 | { |
299 | if (c->dtor) | 309 | if (c->dtor) |
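kmem_cache_zalloc() above is simply kmem_cache_alloc() followed by a memset(), so callers that open-coded the zeroing can drop it. A hedged usage sketch with invented names ("my_record", "my_cache"); the kmem_cache_create() call uses the six-argument ctor/dtor form of this kernel generation:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

struct my_record {
	int id;
	struct list_head list;
};

static struct kmem_cache *my_cache;

static int __init my_cache_init(void)
{
	my_cache = kmem_cache_create("my_record", sizeof(struct my_record),
				     0, 0, NULL, NULL);
	return my_cache ? 0 : -ENOMEM;
}

static struct my_record *my_record_new(gfp_t flags)
{
	/* every field starts out zeroed, matching kzalloc() semantics */
	return kmem_cache_zalloc(my_cache, flags);
}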
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -1,20 +1,22 @@ | |||
1 | #include <linux/slab.h> | 1 | #include <linux/slab.h> |
2 | #include <linux/string.h> | 2 | #include <linux/string.h> |
3 | #include <linux/module.h> | 3 | #include <linux/module.h> |
4 | #include <linux/err.h> | ||
5 | #include <asm/uaccess.h> | ||
4 | 6 | ||
5 | /** | 7 | /** |
6 | * kzalloc - allocate memory. The memory is set to zero. | 8 | * __kzalloc - allocate memory. The memory is set to zero. |
7 | * @size: how many bytes of memory are required. | 9 | * @size: how many bytes of memory are required. |
8 | * @flags: the type of memory to allocate. | 10 | * @flags: the type of memory to allocate. |
9 | */ | 11 | */ |
10 | void *kzalloc(size_t size, gfp_t flags) | 12 | void *__kzalloc(size_t size, gfp_t flags) |
11 | { | 13 | { |
12 | void *ret = kmalloc(size, flags); | 14 | void *ret = ____kmalloc(size, flags); |
13 | if (ret) | 15 | if (ret) |
14 | memset(ret, 0, size); | 16 | memset(ret, 0, size); |
15 | return ret; | 17 | return ret; |
16 | } | 18 | } |
17 | EXPORT_SYMBOL(kzalloc); | 19 | EXPORT_SYMBOL(__kzalloc); |
18 | 20 | ||
19 | /* | 21 | /* |
20 | * kstrdup - allocate space for and copy an existing string | 22 | * kstrdup - allocate space for and copy an existing string |
@@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp) | |||
31 | return NULL; | 33 | return NULL; |
32 | 34 | ||
33 | len = strlen(s) + 1; | 35 | len = strlen(s) + 1; |
34 | buf = kmalloc(len, gfp); | 36 | buf = ____kmalloc(len, gfp); |
35 | if (buf) | 37 | if (buf) |
36 | memcpy(buf, s, len); | 38 | memcpy(buf, s, len); |
37 | return buf; | 39 | return buf; |
38 | } | 40 | } |
39 | EXPORT_SYMBOL(kstrdup); | 41 | EXPORT_SYMBOL(kstrdup); |
42 | |||
43 | /* | ||
44 | * strndup_user - duplicate an existing string from user space | ||
45 | * | ||
46 | * @s: The string to duplicate | ||
47 | * @n: Maximum number of bytes to copy, including the trailing NUL. | ||
48 | */ | ||
49 | char *strndup_user(const char __user *s, long n) | ||
50 | { | ||
51 | char *p; | ||
52 | long length; | ||
53 | |||
54 | length = strnlen_user(s, n); | ||
55 | |||
56 | if (!length) | ||
57 | return ERR_PTR(-EFAULT); | ||
58 | |||
59 | if (length > n) | ||
60 | return ERR_PTR(-EINVAL); | ||
61 | |||
62 | p = kmalloc(length, GFP_KERNEL); | ||
63 | |||
64 | if (!p) | ||
65 | return ERR_PTR(-ENOMEM); | ||
66 | |||
67 | if (copy_from_user(p, s, length)) { | ||
68 | kfree(p); | ||
69 | return ERR_PTR(-EFAULT); | ||
70 | } | ||
71 | |||
72 | p[length - 1] = '\0'; | ||
73 | |||
74 | return p; | ||
75 | } | ||
76 | EXPORT_SYMBOL(strndup_user); | ||
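strndup_user() returns either a kmalloc()ed, NUL-terminated copy of the user string or an ERR_PTR() value, so callers test the result with IS_ERR() rather than comparing against NULL. A hedged sketch of a typical call site (the surrounding function and the 64-byte bound are invented for illustration):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>

static int set_label(const char __user *uname)
{
	char *name;

	name = strndup_user(uname, 64);
	if (IS_ERR(name))
		return PTR_ERR(name);

	/* ... use the bounded, NUL-terminated copy ... */

	kfree(name);
	return 0;
}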
diff --git a/mm/vmscan.c b/mm/vmscan.c index fd572bbdc9f5..78865c849f8f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1356,7 +1356,9 @@ static int __init kswapd_init(void) | |||
1356 | 1356 | ||
1357 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); | 1357 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); |
1358 | BUG_ON(pid < 0); | 1358 | BUG_ON(pid < 0); |
1359 | read_lock(&tasklist_lock); | ||
1359 | pgdat->kswapd = find_task_by_pid(pid); | 1360 | pgdat->kswapd = find_task_by_pid(pid); |
1361 | read_unlock(&tasklist_lock); | ||
1360 | } | 1362 | } |
1361 | total_memory = nr_free_pagecache_pages(); | 1363 | total_memory = nr_free_pagecache_pages(); |
1362 | hotcpu_notifier(cpu_callback, 0); | 1364 | hotcpu_notifier(cpu_callback, 0); |
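The vmscan.c hunk takes tasklist_lock for reading around find_task_by_pid(), since the pid lookup structures may otherwise change under a concurrent exit; here that is sufficient because kswapd threads are not expected to go away. In the general case, where the looked-up task can exit, the usual pattern of this era also pins the task before the lock is dropped (a sketch, not part of this patch):

	struct task_struct *task;

	read_lock(&tasklist_lock);
	task = find_task_by_pid(pid);
	if (task)
		get_task_struct(task);	/* hold a reference across the unlock */
	read_unlock(&tasklist_lock);

	if (task) {
		/* ... operate on the task ... */
		put_task_struct(task);
	}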