Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    4
-rw-r--r--  mm/bootmem.c         |    2
-rw-r--r--  mm/fadvise.c         |   46
-rw-r--r--  mm/filemap.c         |   41
-rw-r--r--  mm/memory.c          |    8
-rw-r--r--  mm/mempolicy.c       |   32
-rw-r--r--  mm/mmap.c            |    6
-rw-r--r--  mm/msync.c           |  139
-rw-r--r--  mm/page-writeback.c  |   64
-rw-r--r--  mm/page_alloc.c      |   11
-rw-r--r--  mm/slab.c            |  351
-rw-r--r--  mm/slob.c            |   10
-rw-r--r--  mm/util.c            |   47
-rw-r--r--  mm/vmscan.c          |    2
14 files changed, 595 insertions, 168 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index bd80460360db..332f5c29b53a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS
138# 138#
139config MIGRATION 139config MIGRATION
140 bool "Page migration" 140 bool "Page migration"
141 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM 141 def_bool y if NUMA
142 depends on SWAP 142 depends on SWAP && NUMA
143 help 143 help
144 Allows the migration of the physical location of pages of processes 144 Allows the migration of the physical location of pages of processes
145 while the virtual addresses are not changed. This is useful for 145 while the virtual addresses are not changed. This is useful for
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 35c32290f717..b55bd39fc5dd 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -152,7 +152,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
152 * 152 *
153 * NOTE: This function is _not_ reentrant. 153 * NOTE: This function is _not_ reentrant.
154 */ 154 */
155static void * __init 155void * __init
156__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, 156__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
157 unsigned long align, unsigned long goal, unsigned long limit) 157 unsigned long align, unsigned long goal, unsigned long limit)
158{ 158{
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e7704..907c39257ca0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/fadvise.h> 17#include <linux/fadvise.h>
18#include <linux/writeback.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19 20
20#include <asm/unistd.h> 21#include <asm/unistd.h>
@@ -22,13 +23,36 @@
22/* 23/*
23 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could 24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
24 * deactivate the pages and clear PG_Referenced. 25 * deactivate the pages and clear PG_Referenced.
26 *
27 * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
28 * offsets `offset' and `offset+len' inclusive. Any pages which are currently
29 * under writeout are skipped, whether or not they are dirty.
30 *
31 * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
32 * offsets `offset' and `offset+len'.
33 *
34 * By combining these two operations the application may do several things:
35 *
36 * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
37 *
38 * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
39 * dirty pages at the disk.
40 *
41 * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
42 * all of the currently dirty pages at the disk, wait until they have been
43 * written.
44 *
45 * It should be noted that none of these operations write out the file's
46 * metadata. So unless the application is strictly performing overwrites of
47 * already-instantiated disk blocks, there are no guarantees here that the data
48 * will be available after a crash.
25 */ 49 */
26asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) 50asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
27{ 51{
28 struct file *file = fget(fd); 52 struct file *file = fget(fd);
29 struct address_space *mapping; 53 struct address_space *mapping;
30 struct backing_dev_info *bdi; 54 struct backing_dev_info *bdi;
31 loff_t endbyte; 55 loff_t endbyte; /* inclusive */
32 pgoff_t start_index; 56 pgoff_t start_index;
33 pgoff_t end_index; 57 pgoff_t end_index;
34 unsigned long nrpages; 58 unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
56 endbyte = offset + len; 80 endbyte = offset + len;
57 if (!len || endbyte < len) 81 if (!len || endbyte < len)
58 endbyte = -1; 82 endbyte = -1;
83 else
84 endbyte--; /* inclusive */
59 85
60 bdi = mapping->backing_dev_info; 86 bdi = mapping->backing_dev_info;
61 87
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
78 104
79 /* First and last PARTIAL page! */ 105 /* First and last PARTIAL page! */
80 start_index = offset >> PAGE_CACHE_SHIFT; 106 start_index = offset >> PAGE_CACHE_SHIFT;
81 end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; 107 end_index = endbyte >> PAGE_CACHE_SHIFT;
82 108
83 /* Careful about overflow on the "+1" */ 109 /* Careful about overflow on the "+1" */
84 nrpages = end_index - start_index + 1; 110 nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
96 filemap_flush(mapping); 122 filemap_flush(mapping);
97 123
98 /* First and last FULL page! */ 124 /* First and last FULL page! */
99 start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 125 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
100 end_index = (endbyte >> PAGE_CACHE_SHIFT); 126 end_index = (endbyte >> PAGE_CACHE_SHIFT);
101 127
102 if (end_index > start_index) 128 if (end_index >= start_index)
103 invalidate_mapping_pages(mapping, start_index, end_index-1); 129 invalidate_mapping_pages(mapping, start_index,
130 end_index);
131 break;
132 case LINUX_FADV_ASYNC_WRITE:
133 ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
134 WB_SYNC_NONE);
135 break;
136 case LINUX_FADV_WRITE_WAIT:
137 ret = wait_on_page_writeback_range(mapping,
138 offset >> PAGE_CACHE_SHIFT,
139 endbyte >> PAGE_CACHE_SHIFT);
104 break; 140 break;
105 default: 141 default:
106 ret = -EINVAL; 142 ret = -EINVAL;
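The comment block added at the top of this file documents the flush semantics of the two new advice values. A minimal userspace sketch of the "push all currently dirty pages and wait" sequence it describes could look like the following. This is an illustration only: the helper name flush_file_range() is made up, it assumes the LINUX_FADV_* constants are visible to userspace through <linux/fadvise.h>, and it assumes the raw fadvise64_64 syscall is reachable via syscall(2) with a flat 64-bit argument layout, which in reality varies by architecture.

    #define _GNU_SOURCE
    #include <sys/types.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/fadvise.h>

    /*
     * Write out everything that is currently dirty in [offset, offset+len)
     * and wait for it.  Per the comment above, file metadata is NOT written,
     * so this is only crash-safe for overwrites of already-allocated blocks.
     */
    static int flush_file_range(int fd, loff_t offset, loff_t len)
    {
            /* Wait for any writeback that is already in flight. */
            if (syscall(__NR_fadvise64_64, fd, offset, len, LINUX_FADV_WRITE_WAIT))
                    return -1;
            /* Start writeback of everything that is dirty right now... */
            if (syscall(__NR_fadvise64_64, fd, offset, len, LINUX_FADV_ASYNC_WRITE))
                    return -1;
            /* ...and wait for that writeback to complete. */
            return syscall(__NR_fadvise64_64, fd, offset, len, LINUX_FADV_WRITE_WAIT);
    }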
diff --git a/mm/filemap.c b/mm/filemap.c
index e8f58f7dd7a5..3ef20739e725 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,6 +29,7 @@
29#include <linux/blkdev.h> 29#include <linux/blkdev.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/cpuset.h>
32#include "filemap.h" 33#include "filemap.h"
33#include "internal.h" 34#include "internal.h"
34 35
@@ -174,7 +175,7 @@ static int sync_page(void *word)
174 * dirty pages that lie within the byte offsets <start, end> 175 * dirty pages that lie within the byte offsets <start, end>
175 * @mapping: address space structure to write 176 * @mapping: address space structure to write
176 * @start: offset in bytes where the range starts 177 * @start: offset in bytes where the range starts
177 * @end: offset in bytes where the range ends 178 * @end: offset in bytes where the range ends (inclusive)
178 * @sync_mode: enable synchronous operation 179 * @sync_mode: enable synchronous operation
179 * 180 *
180 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 181 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -182,8 +183,8 @@ static int sync_page(void *word)
182 * these two operations is that if a dirty page/buffer is encountered, it must 183 * these two operations is that if a dirty page/buffer is encountered, it must
183 * be waited upon, and not just skipped over. 184 * be waited upon, and not just skipped over.
184 */ 185 */
185static int __filemap_fdatawrite_range(struct address_space *mapping, 186int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
186 loff_t start, loff_t end, int sync_mode) 187 loff_t end, int sync_mode)
187{ 188{
188 int ret; 189 int ret;
189 struct writeback_control wbc = { 190 struct writeback_control wbc = {
@@ -212,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
212} 213}
213EXPORT_SYMBOL(filemap_fdatawrite); 214EXPORT_SYMBOL(filemap_fdatawrite);
214 215
215static int filemap_fdatawrite_range(struct address_space *mapping, 216static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
216 loff_t start, loff_t end) 217 loff_t end)
217{ 218{
218 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
219} 220}
@@ -232,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
232 * Wait for writeback to complete against pages indexed by start->end 233 * Wait for writeback to complete against pages indexed by start->end
233 * inclusive 234 * inclusive
234 */ 235 */
235static int wait_on_page_writeback_range(struct address_space *mapping, 236int wait_on_page_writeback_range(struct address_space *mapping,
236 pgoff_t start, pgoff_t end) 237 pgoff_t start, pgoff_t end)
237{ 238{
238 struct pagevec pvec; 239 struct pagevec pvec;
@@ -367,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping)
367} 368}
368EXPORT_SYMBOL(filemap_write_and_wait); 369EXPORT_SYMBOL(filemap_write_and_wait);
369 370
371/*
372 * Write out and wait upon file offsets lstart->lend, inclusive.
373 *
374 * Note that `lend' is inclusive (describes the last byte to be written) so
375 * that this function can be used to write to the very end-of-file (end = -1).
376 */
370int filemap_write_and_wait_range(struct address_space *mapping, 377int filemap_write_and_wait_range(struct address_space *mapping,
371 loff_t lstart, loff_t lend) 378 loff_t lstart, loff_t lend)
372{ 379{
@@ -427,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
427 return ret; 434 return ret;
428} 435}
429 436
437#ifdef CONFIG_NUMA
438struct page *page_cache_alloc(struct address_space *x)
439{
440 if (cpuset_do_page_mem_spread()) {
441 int n = cpuset_mem_spread_node();
442 return alloc_pages_node(n, mapping_gfp_mask(x), 0);
443 }
444 return alloc_pages(mapping_gfp_mask(x), 0);
445}
446EXPORT_SYMBOL(page_cache_alloc);
447
448struct page *page_cache_alloc_cold(struct address_space *x)
449{
450 if (cpuset_do_page_mem_spread()) {
451 int n = cpuset_mem_spread_node();
452 return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
453 }
454 return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
455}
456EXPORT_SYMBOL(page_cache_alloc_cold);
457#endif
458
430/* 459/*
431 * In order to wait for pages to become available there must be 460 * In order to wait for pages to become available there must be
432 * waitqueues associated with pages. By using a hash table of 461 * waitqueues associated with pages. By using a hash table of
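The two allocators above are only compiled for CONFIG_NUMA; their declarations and the !NUMA variants live in include/linux/pagemap.h, which is outside this mm/-only diff. Presumably the non-NUMA case stays a trivial inline, since there is no spread-node decision to make. A sketch of that fallback, mirroring the non-spread branches shown above (not the actual header hunk):

    #ifndef CONFIG_NUMA
    static inline struct page *page_cache_alloc(struct address_space *x)
    {
            return alloc_pages(mapping_gfp_mask(x), 0);
    }

    static inline struct page *page_cache_alloc_cold(struct address_space *x)
    {
            return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
    }
    #endif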
diff --git a/mm/memory.c b/mm/memory.c
index 80c3fb370f91..e347e106ca3a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -395,12 +395,16 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
395 return NULL; 395 return NULL;
396 } 396 }
397 397
398#ifdef CONFIG_DEBUG_VM 398 /*
399 * Add some anal sanity checks for now. Eventually,
400 * we should just do "return pfn_to_page(pfn)", but
401 * in the meantime we check that we get a valid pfn,
402 * and that the resulting page looks ok.
403 */
399 if (unlikely(!pfn_valid(pfn))) { 404 if (unlikely(!pfn_valid(pfn))) {
400 print_bad_pte(vma, pte, addr); 405 print_bad_pte(vma, pte, addr);
401 return NULL; 406 return NULL;
402 } 407 }
403#endif
404 408
405 /* 409 /*
406 * NOTE! We still have PageReserved() pages in the page 410 * NOTE! We still have PageReserved() pages in the page
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22b..4f71cfd29c6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
422 return mpol_check_policy(mode, nodes); 422 return mpol_check_policy(mode, nodes);
423} 423}
424 424
425
426/*
427 * Update task->flags PF_MEMPOLICY bit: set iff non-default
428 * mempolicy. Allows more rapid checking of this (combined perhaps
429 * with other PF_* flag bits) on memory allocation hot code paths.
430 *
431 * If called from outside this file, the task 'p' should -only- be
432 * a newly forked child not yet visible on the task list, because
433 * manipulating the task flags of a visible task is not safe.
434 *
435 * The above limitation is why this routine has the funny name
436 * mpol_fix_fork_child_flag().
437 *
438 * It is also safe to call this with a task pointer of current,
439 * which the static wrapper mpol_set_task_struct_flag() does,
440 * for use within this file.
441 */
442
443void mpol_fix_fork_child_flag(struct task_struct *p)
444{
445 if (p->mempolicy)
446 p->flags |= PF_MEMPOLICY;
447 else
448 p->flags &= ~PF_MEMPOLICY;
449}
450
451static void mpol_set_task_struct_flag(void)
452{
453 mpol_fix_fork_child_flag(current);
454}
455
425/* Set the process memory policy */ 456/* Set the process memory policy */
426long do_set_mempolicy(int mode, nodemask_t *nodes) 457long do_set_mempolicy(int mode, nodemask_t *nodes)
427{ 458{
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
434 return PTR_ERR(new); 465 return PTR_ERR(new);
435 mpol_free(current->mempolicy); 466 mpol_free(current->mempolicy);
436 current->mempolicy = new; 467 current->mempolicy = new;
468 mpol_set_task_struct_flag();
437 if (new && new->policy == MPOL_INTERLEAVE) 469 if (new && new->policy == MPOL_INTERLEAVE)
438 current->il_next = first_node(new->v.nodes); 470 current->il_next = first_node(new->v.nodes);
439 return 0; 471 return 0;
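The comment above restricts out-of-file use of mpol_fix_fork_child_flag() to a newly forked child that is not yet visible on the task list; that caller sits in kernel/fork.c and is therefore not part of this mm/ diff. Presumably it is invoked right after the child's mempolicy has been duplicated, roughly like the fragment below. This is a sketch of the copy_process() call site under that assumption; the error label is invented.

    	/* In copy_process(), after the child's mempolicy is duplicated: */
    	p->mempolicy = mpol_copy(p->mempolicy);
    	if (IS_ERR(p->mempolicy)) {
    		retval = PTR_ERR(p->mempolicy);
    		p->mempolicy = NULL;
    		goto bad_fork_cleanup;
    	}
    	mpol_fix_fork_child_flag(p);	/* set or clear PF_MEMPOLICY for the child */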
diff --git a/mm/mmap.c b/mm/mmap.c
index 0eb9894db6de..4f5b5709136a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1040,12 +1040,11 @@ munmap_back:
1040 * specific mapper. the address has already been validated, but 1040 * specific mapper. the address has already been validated, but
1041 * not unmapped, but the maps are removed from the list. 1041 * not unmapped, but the maps are removed from the list.
1042 */ 1042 */
1043 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 1043 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1044 if (!vma) { 1044 if (!vma) {
1045 error = -ENOMEM; 1045 error = -ENOMEM;
1046 goto unacct_error; 1046 goto unacct_error;
1047 } 1047 }
1048 memset(vma, 0, sizeof(*vma));
1049 1048
1050 vma->vm_mm = mm; 1049 vma->vm_mm = mm;
1051 vma->vm_start = addr; 1050 vma->vm_start = addr;
@@ -1896,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1896 /* 1895 /*
1897 * create a vma struct for an anonymous mapping 1896 * create a vma struct for an anonymous mapping
1898 */ 1897 */
1899 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 1898 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1900 if (!vma) { 1899 if (!vma) {
1901 vm_unacct_memory(len >> PAGE_SHIFT); 1900 vm_unacct_memory(len >> PAGE_SHIFT);
1902 return -ENOMEM; 1901 return -ENOMEM;
1903 } 1902 }
1904 memset(vma, 0, sizeof(*vma));
1905 1903
1906 vma->vm_mm = mm; 1904 vma->vm_mm = mm;
1907 vma->vm_start = addr; 1905 vma->vm_start = addr;
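Both hunks above are the same mechanical transformation; for similar callers elsewhere the before/after pattern is simply the following, using the cache name from this file (kmem_cache_zalloc() itself is added in mm/slab.c and mm/slob.c later in this series):

    	/* Before: allocate, then clear by hand. */
    	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
    	if (vma)
    		memset(vma, 0, sizeof(*vma));

    	/* After: one call with identical semantics. */
    	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);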
diff --git a/mm/msync.c b/mm/msync.c
index 3563a56e1a51..bc6c95376366 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -9,20 +9,24 @@
9 */ 9 */
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/fs.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
16#include <linux/writeback.h>
17#include <linux/file.h>
15#include <linux/syscalls.h> 18#include <linux/syscalls.h>
16 19
17#include <asm/pgtable.h> 20#include <asm/pgtable.h>
18#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
19 22
20static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 23static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
21 unsigned long addr, unsigned long end) 24 unsigned long addr, unsigned long end)
22{ 25{
23 pte_t *pte; 26 pte_t *pte;
24 spinlock_t *ptl; 27 spinlock_t *ptl;
25 int progress = 0; 28 int progress = 0;
29 unsigned long ret = 0;
26 30
27again: 31again:
28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 32 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -43,58 +47,64 @@ again:
43 if (!page) 47 if (!page)
44 continue; 48 continue;
45 if (ptep_clear_flush_dirty(vma, addr, pte) || 49 if (ptep_clear_flush_dirty(vma, addr, pte) ||
46 page_test_and_clear_dirty(page)) 50 page_test_and_clear_dirty(page))
47 set_page_dirty(page); 51 ret += set_page_dirty(page);
48 progress += 3; 52 progress += 3;
49 } while (pte++, addr += PAGE_SIZE, addr != end); 53 } while (pte++, addr += PAGE_SIZE, addr != end);
50 pte_unmap_unlock(pte - 1, ptl); 54 pte_unmap_unlock(pte - 1, ptl);
51 cond_resched(); 55 cond_resched();
52 if (addr != end) 56 if (addr != end)
53 goto again; 57 goto again;
58 return ret;
54} 59}
55 60
56static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, 61static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
57 unsigned long addr, unsigned long end) 62 pud_t *pud, unsigned long addr, unsigned long end)
58{ 63{
59 pmd_t *pmd; 64 pmd_t *pmd;
60 unsigned long next; 65 unsigned long next;
66 unsigned long ret = 0;
61 67
62 pmd = pmd_offset(pud, addr); 68 pmd = pmd_offset(pud, addr);
63 do { 69 do {
64 next = pmd_addr_end(addr, end); 70 next = pmd_addr_end(addr, end);
65 if (pmd_none_or_clear_bad(pmd)) 71 if (pmd_none_or_clear_bad(pmd))
66 continue; 72 continue;
67 msync_pte_range(vma, pmd, addr, next); 73 ret += msync_pte_range(vma, pmd, addr, next);
68 } while (pmd++, addr = next, addr != end); 74 } while (pmd++, addr = next, addr != end);
75 return ret;
69} 76}
70 77
71static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 78static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
72 unsigned long addr, unsigned long end) 79 pgd_t *pgd, unsigned long addr, unsigned long end)
73{ 80{
74 pud_t *pud; 81 pud_t *pud;
75 unsigned long next; 82 unsigned long next;
83 unsigned long ret = 0;
76 84
77 pud = pud_offset(pgd, addr); 85 pud = pud_offset(pgd, addr);
78 do { 86 do {
79 next = pud_addr_end(addr, end); 87 next = pud_addr_end(addr, end);
80 if (pud_none_or_clear_bad(pud)) 88 if (pud_none_or_clear_bad(pud))
81 continue; 89 continue;
82 msync_pmd_range(vma, pud, addr, next); 90 ret += msync_pmd_range(vma, pud, addr, next);
83 } while (pud++, addr = next, addr != end); 91 } while (pud++, addr = next, addr != end);
92 return ret;
84} 93}
85 94
86static void msync_page_range(struct vm_area_struct *vma, 95static unsigned long msync_page_range(struct vm_area_struct *vma,
87 unsigned long addr, unsigned long end) 96 unsigned long addr, unsigned long end)
88{ 97{
89 pgd_t *pgd; 98 pgd_t *pgd;
90 unsigned long next; 99 unsigned long next;
100 unsigned long ret = 0;
91 101
92 /* For hugepages we can't go walking the page table normally, 102 /* For hugepages we can't go walking the page table normally,
93 * but that's ok, hugetlbfs is memory based, so we don't need 103 * but that's ok, hugetlbfs is memory based, so we don't need
94 * to do anything more on an msync(). 104 * to do anything more on an msync().
95 */ 105 */
96 if (vma->vm_flags & VM_HUGETLB) 106 if (vma->vm_flags & VM_HUGETLB)
97 return; 107 return 0;
98 108
99 BUG_ON(addr >= end); 109 BUG_ON(addr >= end);
100 pgd = pgd_offset(vma->vm_mm, addr); 110 pgd = pgd_offset(vma->vm_mm, addr);
@@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma,
103 next = pgd_addr_end(addr, end); 113 next = pgd_addr_end(addr, end);
104 if (pgd_none_or_clear_bad(pgd)) 114 if (pgd_none_or_clear_bad(pgd))
105 continue; 115 continue;
106 msync_pud_range(vma, pgd, addr, next); 116 ret += msync_pud_range(vma, pgd, addr, next);
107 } while (pgd++, addr = next, addr != end); 117 } while (pgd++, addr = next, addr != end);
118 return ret;
108} 119}
109 120
110/* 121/*
@@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma,
115 * write out the dirty pages and wait on the writeout and check the result. 126 * write out the dirty pages and wait on the writeout and check the result.
116 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start 127 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
117 * async writeout immediately. 128 * async writeout immediately.
118 * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to 129 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
119 * applications. 130 * applications.
120 */ 131 */
121static int msync_interval(struct vm_area_struct *vma, 132static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
122 unsigned long addr, unsigned long end, int flags) 133 unsigned long end, int flags,
134 unsigned long *nr_pages_dirtied)
123{ 135{
124 int ret = 0;
125 struct file *file = vma->vm_file; 136 struct file *file = vma->vm_file;
126 137
127 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) 138 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
128 return -EBUSY; 139 return -EBUSY;
129 140
130 if (file && (vma->vm_flags & VM_SHARED)) { 141 if (file && (vma->vm_flags & VM_SHARED))
131 msync_page_range(vma, addr, end); 142 *nr_pages_dirtied = msync_page_range(vma, addr, end);
132 143 return 0;
133 if (flags & MS_SYNC) {
134 struct address_space *mapping = file->f_mapping;
135 int err;
136
137 ret = filemap_fdatawrite(mapping);
138 if (file->f_op && file->f_op->fsync) {
139 /*
140 * We don't take i_mutex here because mmap_sem
141 * is already held.
142 */
143 err = file->f_op->fsync(file,file->f_dentry,1);
144 if (err && !ret)
145 ret = err;
146 }
147 err = filemap_fdatawait(mapping);
148 if (!ret)
149 ret = err;
150 }
151 }
152 return ret;
153} 144}
154 145
155asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 146asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
156{ 147{
157 unsigned long end; 148 unsigned long end;
158 struct vm_area_struct *vma; 149 struct vm_area_struct *vma;
159 int unmapped_error, error = -EINVAL; 150 int unmapped_error = 0;
160 151 int error = -EINVAL;
161 if (flags & MS_SYNC) 152 int done = 0;
162 current->flags |= PF_SYNCWRITE;
163 153
164 down_read(&current->mm->mmap_sem);
165 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 154 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
166 goto out; 155 goto out;
167 if (start & ~PAGE_MASK) 156 if (start & ~PAGE_MASK)
@@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
180 * If the interval [start,end) covers some unmapped address ranges, 169 * If the interval [start,end) covers some unmapped address ranges,
181 * just ignore them, but return -ENOMEM at the end. 170 * just ignore them, but return -ENOMEM at the end.
182 */ 171 */
172 down_read(&current->mm->mmap_sem);
173 if (flags & MS_SYNC)
174 current->flags |= PF_SYNCWRITE;
183 vma = find_vma(current->mm, start); 175 vma = find_vma(current->mm, start);
184 unmapped_error = 0; 176 if (!vma) {
185 for (;;) {
186 /* Still start < end. */
187 error = -ENOMEM; 177 error = -ENOMEM;
188 if (!vma) 178 goto out_unlock;
189 goto out; 179 }
180 do {
181 unsigned long nr_pages_dirtied = 0;
182 struct file *file;
183
190 /* Here start < vma->vm_end. */ 184 /* Here start < vma->vm_end. */
191 if (start < vma->vm_start) { 185 if (start < vma->vm_start) {
192 unmapped_error = -ENOMEM; 186 unmapped_error = -ENOMEM;
@@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
195 /* Here vma->vm_start <= start < vma->vm_end. */ 189 /* Here vma->vm_start <= start < vma->vm_end. */
196 if (end <= vma->vm_end) { 190 if (end <= vma->vm_end) {
197 if (start < end) { 191 if (start < end) {
198 error = msync_interval(vma, start, end, flags); 192 error = msync_interval(vma, start, end, flags,
193 &nr_pages_dirtied);
199 if (error) 194 if (error)
200 goto out; 195 goto out_unlock;
201 } 196 }
202 error = unmapped_error; 197 error = unmapped_error;
203 goto out; 198 done = 1;
199 } else {
200 /* Here vma->vm_start <= start < vma->vm_end < end. */
201 error = msync_interval(vma, start, vma->vm_end, flags,
202 &nr_pages_dirtied);
203 if (error)
204 goto out_unlock;
204 } 205 }
205 /* Here vma->vm_start <= start < vma->vm_end < end. */ 206 file = vma->vm_file;
206 error = msync_interval(vma, start, vma->vm_end, flags);
207 if (error)
208 goto out;
209 start = vma->vm_end; 207 start = vma->vm_end;
210 vma = vma->vm_next; 208 if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
211 } 209 get_file(file);
212out: 210 up_read(&current->mm->mmap_sem);
213 up_read(&current->mm->mmap_sem); 211 balance_dirty_pages_ratelimited_nr(file->f_mapping,
212 nr_pages_dirtied);
213 fput(file);
214 down_read(&current->mm->mmap_sem);
215 vma = find_vma(current->mm, start);
216 } else if ((flags & MS_SYNC) && file &&
217 (vma->vm_flags & VM_SHARED)) {
218 get_file(file);
219 up_read(&current->mm->mmap_sem);
220 error = do_fsync(file, 0);
221 fput(file);
222 down_read(&current->mm->mmap_sem);
223 if (error)
224 goto out_unlock;
225 vma = find_vma(current->mm, start);
226 } else {
227 vma = vma->vm_next;
228 }
229 } while (vma && !done);
230out_unlock:
214 current->flags &= ~PF_SYNCWRITE; 231 current->flags &= ~PF_SYNCWRITE;
232 up_read(&current->mm->mmap_sem);
233out:
215 return error; 234 return error;
216} 235}
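From userspace the visible change is in MS_ASYNC: as the comment above explains, it no longer starts I/O, it only propagates pte-dirty state into page-dirty state (throttling the caller via balance_dirty_pages_ratelimited_nr() when many pages were dirtied), while MS_SYNC now goes through do_fsync() for each shared file-backed VMA. A small illustration of the two call patterns using standard msync(2); the helper name sync_mapping() is made up:

    #include <sys/mman.h>
    #include <stdio.h>

    static void sync_mapping(void *addr, size_t len, int durable)
    {
            if (durable) {
                    /* Data is written and waited on; MS_SYNC now uses
                     * do_fsync() on the backing file. */
                    if (msync(addr, len, MS_SYNC) == -1)
                            perror("msync(MS_SYNC)");
            } else {
                    /* Only marks the pages dirty for the VM and throttles the
                     * caller; writeout is left to pdflush or to a later
                     * fsync()/fadvise() call. */
                    if (msync(addr, len, MS_ASYNC) == -1)
                            perror("msync(MS_ASYNC)");
            }
    }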
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d2..893d7677579e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
75 * The interval between `kupdate'-style writebacks, in centiseconds 75 * The interval between `kupdate'-style writebacks, in centiseconds
76 * (hundredths of a second) 76 * (hundredths of a second)
77 */ 77 */
78int dirty_writeback_centisecs = 5 * 100; 78int dirty_writeback_interval = 5 * HZ;
79 79
80/* 80/*
81 * The longest number of centiseconds for which data is allowed to remain dirty 81 * The longest number of centiseconds for which data is allowed to remain dirty
82 */ 82 */
83int dirty_expire_centisecs = 30 * 100; 83int dirty_expire_interval = 30 * HZ;
84 84
85/* 85/*
86 * Flag that makes the machine dump writes/reads and block dirtyings. 86 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100;
88int block_dump; 88int block_dump;
89 89
90/* 90/*
91 * Flag that puts the machine in "laptop mode". 91 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
92 * a full sync is triggered after this time elapses without any disk activity.
92 */ 93 */
93int laptop_mode; 94int laptop_mode;
94 95
@@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping)
255} 256}
256 257
257/** 258/**
258 * balance_dirty_pages_ratelimited - balance dirty memory state 259 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
259 * @mapping: address_space which was dirtied 260 * @mapping: address_space which was dirtied
261 * @nr_pages: number of pages which the caller has just dirtied
260 * 262 *
261 * Processes which are dirtying memory should call in here once for each page 263 * Processes which are dirtying memory should call in here once for each page
262 * which was newly dirtied. The function will periodically check the system's 264 * which was newly dirtied. The function will periodically check the system's
@@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping)
267 * limit we decrease the ratelimiting by a lot, to prevent individual processes 269 * limit we decrease the ratelimiting by a lot, to prevent individual processes
268 * from overshooting the limit by (ratelimit_pages) each. 270 * from overshooting the limit by (ratelimit_pages) each.
269 */ 271 */
270void balance_dirty_pages_ratelimited(struct address_space *mapping) 272void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
273 unsigned long nr_pages_dirtied)
271{ 274{
272 static DEFINE_PER_CPU(int, ratelimits) = 0; 275 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
273 long ratelimit; 276 unsigned long ratelimit;
277 unsigned long *p;
274 278
275 ratelimit = ratelimit_pages; 279 ratelimit = ratelimit_pages;
276 if (dirty_exceeded) 280 if (dirty_exceeded)
@@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
280 * Check the rate limiting. Also, we do not want to throttle real-time 284 * Check the rate limiting. Also, we do not want to throttle real-time
281 * tasks in balance_dirty_pages(). Period. 285 * tasks in balance_dirty_pages(). Period.
282 */ 286 */
283 if (get_cpu_var(ratelimits)++ >= ratelimit) { 287 preempt_disable();
284 __get_cpu_var(ratelimits) = 0; 288 p = &__get_cpu_var(ratelimits);
285 put_cpu_var(ratelimits); 289 *p += nr_pages_dirtied;
290 if (unlikely(*p >= ratelimit)) {
291 *p = 0;
292 preempt_enable();
286 balance_dirty_pages(mapping); 293 balance_dirty_pages(mapping);
287 return; 294 return;
288 } 295 }
289 put_cpu_var(ratelimits); 296 preempt_enable();
290} 297}
291EXPORT_SYMBOL(balance_dirty_pages_ratelimited); 298EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
292 299
293void throttle_vm_writeout(void) 300void throttle_vm_writeout(void)
294{ 301{
@@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
380 * just walks the superblock inode list, writing back any inodes which are 387 * just walks the superblock inode list, writing back any inodes which are
381 * older than a specific point in time. 388 * older than a specific point in time.
382 * 389 *
383 * Try to run once per dirty_writeback_centisecs. But if a writeback event 390 * Try to run once per dirty_writeback_interval. But if a writeback event
384 * takes longer than a dirty_writeback_centisecs interval, then leave a 391 * takes longer than a dirty_writeback_interval interval, then leave a
385 * one-second gap. 392 * one-second gap.
386 * 393 *
387 * older_than_this takes precedence over nr_to_write. So we'll only write back 394 * older_than_this takes precedence over nr_to_write. So we'll only write back
@@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg)
406 sync_supers(); 413 sync_supers();
407 414
408 get_writeback_state(&wbs); 415 get_writeback_state(&wbs);
409 oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; 416 oldest_jif = jiffies - dirty_expire_interval;
410 start_jif = jiffies; 417 start_jif = jiffies;
411 next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; 418 next_jif = start_jif + dirty_writeback_interval;
412 nr_to_write = wbs.nr_dirty + wbs.nr_unstable + 419 nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
413 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 420 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
414 while (nr_to_write > 0) { 421 while (nr_to_write > 0) {
@@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg)
425 } 432 }
426 if (time_before(next_jif, jiffies + HZ)) 433 if (time_before(next_jif, jiffies + HZ))
427 next_jif = jiffies + HZ; 434 next_jif = jiffies + HZ;
428 if (dirty_writeback_centisecs) 435 if (dirty_writeback_interval)
429 mod_timer(&wb_timer, next_jif); 436 mod_timer(&wb_timer, next_jif);
430} 437}
431 438
@@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg)
435int dirty_writeback_centisecs_handler(ctl_table *table, int write, 442int dirty_writeback_centisecs_handler(ctl_table *table, int write,
436 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 443 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
437{ 444{
438 proc_dointvec(table, write, file, buffer, length, ppos); 445 proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
439 if (dirty_writeback_centisecs) { 446 if (dirty_writeback_interval) {
440 mod_timer(&wb_timer, 447 mod_timer(&wb_timer,
441 jiffies + (dirty_writeback_centisecs * HZ) / 100); 448 jiffies + dirty_writeback_interval);
442 } else { 449 } else {
443 del_timer(&wb_timer); 450 del_timer(&wb_timer);
444 } 451 }
445 return 0; 452 return 0;
@@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused)
468 */ 475 */
469void laptop_io_completion(void) 476void laptop_io_completion(void)
470{ 477{
471 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); 478 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
472} 479}
473 480
474/* 481/*
@@ -544,7 +551,7 @@ void __init page_writeback_init(void)
544 if (vm_dirty_ratio <= 0) 551 if (vm_dirty_ratio <= 0)
545 vm_dirty_ratio = 1; 552 vm_dirty_ratio = 1;
546 } 553 }
547 mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); 554 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
548 set_ratelimit(); 555 set_ratelimit();
549 register_cpu_notifier(&ratelimit_nb); 556 register_cpu_notifier(&ratelimit_nb);
550} 557}
@@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page);
621 */ 628 */
622int __set_page_dirty_nobuffers(struct page *page) 629int __set_page_dirty_nobuffers(struct page *page)
623{ 630{
624 int ret = 0;
625
626 if (!TestSetPageDirty(page)) { 631 if (!TestSetPageDirty(page)) {
627 struct address_space *mapping = page_mapping(page); 632 struct address_space *mapping = page_mapping(page);
628 struct address_space *mapping2; 633 struct address_space *mapping2;
@@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page)
644 I_DIRTY_PAGES); 649 I_DIRTY_PAGES);
645 } 650 }
646 } 651 }
652 return 1;
647 } 653 }
648 return ret; 654 return 0;
649} 655}
650EXPORT_SYMBOL(__set_page_dirty_nobuffers); 656EXPORT_SYMBOL(__set_page_dirty_nobuffers);
651 657
@@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page)
675 return (*spd)(page); 681 return (*spd)(page);
676 return __set_page_dirty_buffers(page); 682 return __set_page_dirty_buffers(page);
677 } 683 }
678 if (!PageDirty(page)) 684 if (!PageDirty(page)) {
679 SetPageDirty(page); 685 if (!TestSetPageDirty(page))
686 return 1;
687 }
680 return 0; 688 return 0;
681} 689}
682EXPORT_SYMBOL(set_page_dirty); 690EXPORT_SYMBOL(set_page_dirty);
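Existing callers that dirty one page at a time keep using the old name; presumably the header side of this change (include/linux/writeback.h, not part of this mm/ diff) turns balance_dirty_pages_ratelimited() into a thin wrapper, roughly:

    void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
    					unsigned long nr_pages_dirtied);

    static inline void
    balance_dirty_pages_ratelimited(struct address_space *mapping)
    {
    	balance_dirty_pages_ratelimited_nr(mapping, 1);
    }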
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b7f14a4799a5..338a02bb004d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -943,7 +943,8 @@ restart:
943 goto got_pg; 943 goto got_pg;
944 944
945 do { 945 do {
946 wakeup_kswapd(*z, order); 946 if (cpuset_zone_allowed(*z, gfp_mask))
947 wakeup_kswapd(*z, order);
947 } while (*(++z)); 948 } while (*(++z));
948 949
949 /* 950 /*
@@ -2028,8 +2029,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
2028 setup_pageset(zone_pcp(zone,cpu), batch); 2029 setup_pageset(zone_pcp(zone,cpu), batch);
2029#endif 2030#endif
2030 } 2031 }
2031 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2032 if (zone->present_pages)
2032 zone->name, zone->present_pages, batch); 2033 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2034 zone->name, zone->present_pages, batch);
2033} 2035}
2034 2036
2035static __meminit void init_currently_empty_zone(struct zone *zone, 2037static __meminit void init_currently_empty_zone(struct zone *zone,
@@ -2700,8 +2702,7 @@ void *__init alloc_large_system_hash(const char *tablename,
2700 else 2702 else
2701 numentries <<= (PAGE_SHIFT - scale); 2703 numentries <<= (PAGE_SHIFT - scale);
2702 } 2704 }
2703 /* rounded up to nearest power of 2 in size */ 2705 numentries = roundup_pow_of_two(numentries);
2704 numentries = 1UL << (long_log2(numentries) + 1);
2705 2706
2706 /* limit allocation size to 1/16 total memory by default */ 2707 /* limit allocation size to 1/16 total memory by default */
2707 if (max == 0) { 2708 if (max == 0) {
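Note the subtle semantic change in the hash-sizing hunk above: the old expression always moved up to the next power of two, even when numentries already was one, whereas roundup_pow_of_two() is the identity on powers of two. For example:

    	/* numentries == 1024 (already a power of two):
    	 *   old: 1UL << (long_log2(1024) + 1)  == 2048
    	 *   new: roundup_pow_of_two(1024)      == 1024
    	 * numentries == 1000:
    	 *   old: 1UL << (long_log2(1000) + 1)  == 1024
    	 *   new: roundup_pow_of_two(1000)      == 1024
    	 */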
diff --git a/mm/slab.c b/mm/slab.c
index 1c8f5ee230d5..681837499d7d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -94,6 +94,7 @@
94#include <linux/interrupt.h> 94#include <linux/interrupt.h>
95#include <linux/init.h> 95#include <linux/init.h>
96#include <linux/compiler.h> 96#include <linux/compiler.h>
97#include <linux/cpuset.h>
97#include <linux/seq_file.h> 98#include <linux/seq_file.h>
98#include <linux/notifier.h> 99#include <linux/notifier.h>
99#include <linux/kallsyms.h> 100#include <linux/kallsyms.h>
@@ -173,12 +174,12 @@
173 SLAB_CACHE_DMA | \ 174 SLAB_CACHE_DMA | \
174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 175 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 176 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
176 SLAB_DESTROY_BY_RCU) 177 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
177#else 178#else
178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 179# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 180 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 181 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 SLAB_DESTROY_BY_RCU) 182 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
182#endif 183#endif
183 184
184/* 185/*
@@ -203,7 +204,8 @@
203typedef unsigned int kmem_bufctl_t; 204typedef unsigned int kmem_bufctl_t;
204#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) 205#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
205#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) 206#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
206#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) 207#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
208#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
207 209
208/* Max number of objs-per-slab for caches which use off-slab slabs. 210/* Max number of objs-per-slab for caches which use off-slab slabs.
209 * Needed to avoid a possible looping condition in cache_grow(). 211 * Needed to avoid a possible looping condition in cache_grow().
@@ -896,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries,
896 return nc; 898 return nc;
897} 899}
898 900
901/*
902 * Transfer objects in one arraycache to another.
903 * Locking must be handled by the caller.
904 *
905 * Return the number of entries transferred.
906 */
907static int transfer_objects(struct array_cache *to,
908 struct array_cache *from, unsigned int max)
909{
910 /* Figure out how many entries to transfer */
911 int nr = min(min(from->avail, max), to->limit - to->avail);
912
913 if (!nr)
914 return 0;
915
916 memcpy(to->entry + to->avail, from->entry + from->avail -nr,
917 sizeof(void *) *nr);
918
919 from->avail -= nr;
920 to->avail += nr;
921 to->touched = 1;
922 return nr;
923}
924
899#ifdef CONFIG_NUMA 925#ifdef CONFIG_NUMA
900static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 926static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
927static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
901 928
902static struct array_cache **alloc_alien_cache(int node, int limit) 929static struct array_cache **alloc_alien_cache(int node, int limit)
903{ 930{
@@ -944,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
944 971
945 if (ac->avail) { 972 if (ac->avail) {
946 spin_lock(&rl3->list_lock); 973 spin_lock(&rl3->list_lock);
974 /*
975 * Stuff objects into the remote nodes shared array first.
976 * That way we could avoid the overhead of putting the objects
977 * into the free lists and getting them back later.
978 */
979 transfer_objects(rl3->shared, ac, ac->limit);
980
947 free_block(cachep, ac->entry, ac->avail, node); 981 free_block(cachep, ac->entry, ac->avail, node);
948 ac->avail = 0; 982 ac->avail = 0;
949 spin_unlock(&rl3->list_lock); 983 spin_unlock(&rl3->list_lock);
@@ -959,8 +993,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
959 993
960 if (l3->alien) { 994 if (l3->alien) {
961 struct array_cache *ac = l3->alien[node]; 995 struct array_cache *ac = l3->alien[node];
962 if (ac && ac->avail) { 996
963 spin_lock_irq(&ac->lock); 997 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
964 __drain_alien_cache(cachep, ac, node); 998 __drain_alien_cache(cachep, ac, node);
965 spin_unlock_irq(&ac->lock); 999 spin_unlock_irq(&ac->lock);
966 } 1000 }
@@ -1987,10 +2021,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1987 align = ralign; 2021 align = ralign;
1988 2022
1989 /* Get cache's description obj. */ 2023 /* Get cache's description obj. */
1990 cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 2024 cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
1991 if (!cachep) 2025 if (!cachep)
1992 goto oops; 2026 goto oops;
1993 memset(cachep, 0, sizeof(struct kmem_cache));
1994 2027
1995#if DEBUG 2028#if DEBUG
1996 cachep->obj_size = size; 2029 cachep->obj_size = size;
@@ -2397,7 +2430,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2397 /* Verify that the slab belongs to the intended node */ 2430 /* Verify that the slab belongs to the intended node */
2398 WARN_ON(slabp->nodeid != nodeid); 2431 WARN_ON(slabp->nodeid != nodeid);
2399 2432
2400 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2433 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2401 printk(KERN_ERR "slab: double free detected in cache " 2434 printk(KERN_ERR "slab: double free detected in cache "
2402 "'%s', objp %p\n", cachep->name, objp); 2435 "'%s', objp %p\n", cachep->name, objp);
2403 BUG(); 2436 BUG();
@@ -2603,6 +2636,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2603 */ 2636 */
2604 cachep->dtor(objp + obj_offset(cachep), cachep, 0); 2637 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2605 } 2638 }
2639#ifdef CONFIG_DEBUG_SLAB_LEAK
2640 slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2641#endif
2606 if (cachep->flags & SLAB_POISON) { 2642 if (cachep->flags & SLAB_POISON) {
2607#ifdef CONFIG_DEBUG_PAGEALLOC 2643#ifdef CONFIG_DEBUG_PAGEALLOC
2608 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2644 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2675,20 +2711,10 @@ retry:
2675 BUG_ON(ac->avail > 0 || !l3); 2711 BUG_ON(ac->avail > 0 || !l3);
2676 spin_lock(&l3->list_lock); 2712 spin_lock(&l3->list_lock);
2677 2713
2678 if (l3->shared) { 2714 /* See if we can refill from the shared array */
2679 struct array_cache *shared_array = l3->shared; 2715 if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2680 if (shared_array->avail) { 2716 goto alloc_done;
2681 if (batchcount > shared_array->avail) 2717
2682 batchcount = shared_array->avail;
2683 shared_array->avail -= batchcount;
2684 ac->avail = batchcount;
2685 memcpy(ac->entry,
2686 &(shared_array->entry[shared_array->avail]),
2687 sizeof(void *) * batchcount);
2688 shared_array->touched = 1;
2689 goto alloc_done;
2690 }
2691 }
2692 while (batchcount > 0) { 2718 while (batchcount > 0) {
2693 struct list_head *entry; 2719 struct list_head *entry;
2694 struct slab *slabp; 2720 struct slab *slabp;
@@ -2786,6 +2812,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2786 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2812 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2787 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2813 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2788 } 2814 }
2815#ifdef CONFIG_DEBUG_SLAB_LEAK
2816 {
2817 struct slab *slabp;
2818 unsigned objnr;
2819
2820 slabp = page_get_slab(virt_to_page(objp));
2821 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
2822 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
2823 }
2824#endif
2789 objp += obj_offset(cachep); 2825 objp += obj_offset(cachep);
2790 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2826 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2791 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2827 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
@@ -2807,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2807 struct array_cache *ac; 2843 struct array_cache *ac;
2808 2844
2809#ifdef CONFIG_NUMA 2845#ifdef CONFIG_NUMA
2810 if (unlikely(current->mempolicy && !in_interrupt())) { 2846 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
2811 int nid = slab_node(current->mempolicy); 2847 objp = alternate_node_alloc(cachep, flags);
2812 2848 if (objp != NULL)
2813 if (nid != numa_node_id()) 2849 return objp;
2814 return __cache_alloc_node(cachep, flags, nid);
2815 } 2850 }
2816#endif 2851#endif
2817 2852
@@ -2847,6 +2882,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2847 2882
2848#ifdef CONFIG_NUMA 2883#ifdef CONFIG_NUMA
2849/* 2884/*
2885 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
2886 *
2887 * If we are in_interrupt, then process context, including cpusets and
2888 * mempolicy, may not apply and should not be used for allocation policy.
2889 */
2890static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
2891{
2892 int nid_alloc, nid_here;
2893
2894 if (in_interrupt())
2895 return NULL;
2896 nid_alloc = nid_here = numa_node_id();
2897 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
2898 nid_alloc = cpuset_mem_spread_node();
2899 else if (current->mempolicy)
2900 nid_alloc = slab_node(current->mempolicy);
2901 if (nid_alloc != nid_here)
2902 return __cache_alloc_node(cachep, flags, nid_alloc);
2903 return NULL;
2904}
2905
2906/*
2850 * A interface to enable slab creation on nodeid 2907 * A interface to enable slab creation on nodeid
2851 */ 2908 */
2852static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 2909static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
@@ -3071,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3071EXPORT_SYMBOL(kmem_cache_alloc); 3128EXPORT_SYMBOL(kmem_cache_alloc);
3072 3129
3073/** 3130/**
3131 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3132 * @cache: The cache to allocate from.
3133 * @flags: See kmalloc().
3134 *
3135 * Allocate an object from this cache and set the allocated memory to zero.
3136 * The flags are only relevant if the cache has no available objects.
3137 */
3138void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3139{
3140 void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3141 if (ret)
3142 memset(ret, 0, obj_size(cache));
3143 return ret;
3144}
3145EXPORT_SYMBOL(kmem_cache_zalloc);
3146
3147/**
3074 * kmem_ptr_validate - check if an untrusted pointer might 3148 * kmem_ptr_validate - check if an untrusted pointer might
3075 * be a slab entry. 3149 * be a slab entry.
3076 * @cachep: the cache we're checking against 3150 * @cachep: the cache we're checking against
@@ -3197,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3197 return __cache_alloc(cachep, flags, caller); 3271 return __cache_alloc(cachep, flags, caller);
3198} 3272}
3199 3273
3200#ifndef CONFIG_DEBUG_SLAB
3201 3274
3202void *__kmalloc(size_t size, gfp_t flags) 3275void *__kmalloc(size_t size, gfp_t flags)
3203{ 3276{
3277#ifndef CONFIG_DEBUG_SLAB
3204 return __do_kmalloc(size, flags, NULL); 3278 return __do_kmalloc(size, flags, NULL);
3279#else
3280 return __do_kmalloc(size, flags, __builtin_return_address(0));
3281#endif
3205} 3282}
3206EXPORT_SYMBOL(__kmalloc); 3283EXPORT_SYMBOL(__kmalloc);
3207 3284
3208#else 3285#ifdef CONFIG_DEBUG_SLAB
3209
3210void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) 3286void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3211{ 3287{
3212 return __do_kmalloc(size, flags, caller); 3288 return __do_kmalloc(size, flags, caller);
3213} 3289}
3214EXPORT_SYMBOL(__kmalloc_track_caller); 3290EXPORT_SYMBOL(__kmalloc_track_caller);
3215
3216#endif 3291#endif
3217 3292
3218#ifdef CONFIG_SMP 3293#ifdef CONFIG_SMP
@@ -3343,63 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep)
3343EXPORT_SYMBOL_GPL(kmem_cache_name); 3418EXPORT_SYMBOL_GPL(kmem_cache_name);
3344 3419
3345/* 3420/*
3346 * This initializes kmem_list3 for all nodes. 3421 * This initializes kmem_list3 or resizes various caches for all nodes.
3347 */ 3422 */
3348static int alloc_kmemlist(struct kmem_cache *cachep) 3423static int alloc_kmemlist(struct kmem_cache *cachep)
3349{ 3424{
3350 int node; 3425 int node;
3351 struct kmem_list3 *l3; 3426 struct kmem_list3 *l3;
3352 int err = 0; 3427 struct array_cache *new_shared;
3428 struct array_cache **new_alien;
3353 3429
3354 for_each_online_node(node) { 3430 for_each_online_node(node) {
3355 struct array_cache *nc = NULL, *new; 3431
3356 struct array_cache **new_alien = NULL;
3357#ifdef CONFIG_NUMA
3358 new_alien = alloc_alien_cache(node, cachep->limit); 3432 new_alien = alloc_alien_cache(node, cachep->limit);
3359 if (!new_alien) 3433 if (!new_alien)
3360 goto fail; 3434 goto fail;
3361#endif 3435
3362 new = alloc_arraycache(node, cachep->shared*cachep->batchcount, 3436 new_shared = alloc_arraycache(node,
3437 cachep->shared*cachep->batchcount,
3363 0xbaadf00d); 3438 0xbaadf00d);
3364 if (!new) 3439 if (!new_shared) {
3440 free_alien_cache(new_alien);
3365 goto fail; 3441 goto fail;
3442 }
3443
3366 l3 = cachep->nodelists[node]; 3444 l3 = cachep->nodelists[node];
3367 if (l3) { 3445 if (l3) {
3446 struct array_cache *shared = l3->shared;
3447
3368 spin_lock_irq(&l3->list_lock); 3448 spin_lock_irq(&l3->list_lock);
3369 3449
3370 nc = cachep->nodelists[node]->shared; 3450 if (shared)
3371 if (nc) 3451 free_block(cachep, shared->entry,
3372 free_block(cachep, nc->entry, nc->avail, node); 3452 shared->avail, node);
3373 3453
3374 l3->shared = new; 3454 l3->shared = new_shared;
3375 if (!cachep->nodelists[node]->alien) { 3455 if (!l3->alien) {
3376 l3->alien = new_alien; 3456 l3->alien = new_alien;
3377 new_alien = NULL; 3457 new_alien = NULL;
3378 } 3458 }
3379 l3->free_limit = (1 + nr_cpus_node(node)) * 3459 l3->free_limit = (1 + nr_cpus_node(node)) *
3380 cachep->batchcount + cachep->num; 3460 cachep->batchcount + cachep->num;
3381 spin_unlock_irq(&l3->list_lock); 3461 spin_unlock_irq(&l3->list_lock);
3382 kfree(nc); 3462 kfree(shared);
3383 free_alien_cache(new_alien); 3463 free_alien_cache(new_alien);
3384 continue; 3464 continue;
3385 } 3465 }
3386 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); 3466 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3387 if (!l3) 3467 if (!l3) {
3468 free_alien_cache(new_alien);
3469 kfree(new_shared);
3388 goto fail; 3470 goto fail;
3471 }
3389 3472
3390 kmem_list3_init(l3); 3473 kmem_list3_init(l3);
3391 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3474 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3392 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3475 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3393 l3->shared = new; 3476 l3->shared = new_shared;
3394 l3->alien = new_alien; 3477 l3->alien = new_alien;
3395 l3->free_limit = (1 + nr_cpus_node(node)) * 3478 l3->free_limit = (1 + nr_cpus_node(node)) *
3396 cachep->batchcount + cachep->num; 3479 cachep->batchcount + cachep->num;
3397 cachep->nodelists[node] = l3; 3480 cachep->nodelists[node] = l3;
3398 } 3481 }
3399 return err; 3482 return 0;
3483
3400fail: 3484fail:
3401 err = -ENOMEM; 3485 if (!cachep->next.next) {
3402 return err; 3486 /* Cache is not active yet. Roll back what we did */
3487 node--;
3488 while (node >= 0) {
3489 if (cachep->nodelists[node]) {
3490 l3 = cachep->nodelists[node];
3491
3492 kfree(l3->shared);
3493 free_alien_cache(l3->alien);
3494 kfree(l3);
3495 cachep->nodelists[node] = NULL;
3496 }
3497 node--;
3498 }
3499 }
3500 return -ENOMEM;
3403} 3501}
3404 3502
3405struct ccupdate_struct { 3503struct ccupdate_struct {
@@ -3876,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3876 res = count; 3974 res = count;
3877 return res; 3975 return res;
3878} 3976}
3977
3978#ifdef CONFIG_DEBUG_SLAB_LEAK
3979
3980static void *leaks_start(struct seq_file *m, loff_t *pos)
3981{
3982 loff_t n = *pos;
3983 struct list_head *p;
3984
3985 mutex_lock(&cache_chain_mutex);
3986 p = cache_chain.next;
3987 while (n--) {
3988 p = p->next;
3989 if (p == &cache_chain)
3990 return NULL;
3991 }
3992 return list_entry(p, struct kmem_cache, next);
3993}
3994
3995static inline int add_caller(unsigned long *n, unsigned long v)
3996{
3997 unsigned long *p;
3998 int l;
3999 if (!v)
4000 return 1;
4001 l = n[1];
4002 p = n + 2;
4003 while (l) {
4004 int i = l/2;
4005 unsigned long *q = p + 2 * i;
4006 if (*q == v) {
4007 q[1]++;
4008 return 1;
4009 }
4010 if (*q > v) {
4011 l = i;
4012 } else {
4013 p = q + 2;
4014 l -= i + 1;
4015 }
4016 }
4017 if (++n[1] == n[0])
4018 return 0;
4019 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4020 p[0] = v;
4021 p[1] = 1;
4022 return 1;
4023}
4024
4025static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4026{
4027 void *p;
4028 int i;
4029 if (n[0] == n[1])
4030 return;
4031 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4032 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4033 continue;
4034 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4035 return;
4036 }
4037}
4038
4039static void show_symbol(struct seq_file *m, unsigned long address)
4040{
4041#ifdef CONFIG_KALLSYMS
4042 char *modname;
4043 const char *name;
4044 unsigned long offset, size;
4045 char namebuf[KSYM_NAME_LEN+1];
4046
4047 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4048
4049 if (name) {
4050 seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4051 if (modname)
4052 seq_printf(m, " [%s]", modname);
4053 return;
4054 }
4055#endif
4056 seq_printf(m, "%p", (void *)address);
4057}
4058
4059static int leaks_show(struct seq_file *m, void *p)
4060{
4061 struct kmem_cache *cachep = p;
4062 struct list_head *q;
4063 struct slab *slabp;
4064 struct kmem_list3 *l3;
4065 const char *name;
4066 unsigned long *n = m->private;
4067 int node;
4068 int i;
4069
4070 if (!(cachep->flags & SLAB_STORE_USER))
4071 return 0;
4072 if (!(cachep->flags & SLAB_RED_ZONE))
4073 return 0;
4074
4075 /* OK, we can do it */
4076
4077 n[1] = 0;
4078
4079 for_each_online_node(node) {
4080 l3 = cachep->nodelists[node];
4081 if (!l3)
4082 continue;
4083
4084 check_irq_on();
4085 spin_lock_irq(&l3->list_lock);
4086
4087 list_for_each(q, &l3->slabs_full) {
4088 slabp = list_entry(q, struct slab, list);
4089 handle_slab(n, cachep, slabp);
4090 }
4091 list_for_each(q, &l3->slabs_partial) {
4092 slabp = list_entry(q, struct slab, list);
4093 handle_slab(n, cachep, slabp);
4094 }
4095 spin_unlock_irq(&l3->list_lock);
4096 }
4097 name = cachep->name;
4098 if (n[0] == n[1]) {
4099 /* Increase the buffer size */
4100 mutex_unlock(&cache_chain_mutex);
4101 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4102 if (!m->private) {
4103 /* Too bad, we are really out */
4104 m->private = n;
4105 mutex_lock(&cache_chain_mutex);
4106 return -ENOMEM;
4107 }
4108 *(unsigned long *)m->private = n[0] * 2;
4109 kfree(n);
4110 mutex_lock(&cache_chain_mutex);
4111 /* Now make sure this entry will be retried */
4112 m->count = m->size;
4113 return 0;
4114 }
4115 for (i = 0; i < n[1]; i++) {
4116 seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4117 show_symbol(m, n[2*i+2]);
4118 seq_putc(m, '\n');
4119 }
4120 return 0;
4121}
4122
4123struct seq_operations slabstats_op = {
4124 .start = leaks_start,
4125 .next = s_next,
4126 .stop = s_stop,
4127 .show = leaks_show,
4128};
4129#endif
3879#endif 4130#endif
3880 4131
3881/** 4132/**
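Given the seq_printf() format in leaks_show() and show_symbol() above, each record of the new CONFIG_DEBUG_SLAB_LEAK seq_file comes out as "<cache name>: <count> <symbol>+<offset>/<size> [module]", i.e. how many live objects of a cache were allocated from a given caller (tracked via the BUFCTL_ACTIVE marking and *dbg_userword()). A hypothetical sample of that output, with made-up counts and symbols:

    vm_area_struct: 1605 split_vma+0x51/0xa4
    size-64: 392 scsi_get_command+0x21/0x70 [scsi_mod]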
diff --git a/mm/slob.c b/mm/slob.c
index a1f42bdc0245..9bcc7e2cabfd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
294} 294}
295EXPORT_SYMBOL(kmem_cache_alloc); 295EXPORT_SYMBOL(kmem_cache_alloc);
296 296
297void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
298{
299 void *ret = kmem_cache_alloc(c, flags);
300 if (ret)
301 memset(ret, 0, c->size);
302
303 return ret;
304}
305EXPORT_SYMBOL(kmem_cache_zalloc);
306
297void kmem_cache_free(struct kmem_cache *c, void *b) 307void kmem_cache_free(struct kmem_cache *c, void *b)
298{ 308{
299 if (c->dtor) 309 if (c->dtor)
diff --git a/mm/util.c b/mm/util.c
index 5f4bb59da63c..7368479220b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,20 +1,22 @@
1#include <linux/slab.h> 1#include <linux/slab.h>
2#include <linux/string.h> 2#include <linux/string.h>
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/err.h>
5#include <asm/uaccess.h>
4 6
5/** 7/**
6 * kzalloc - allocate memory. The memory is set to zero. 8 * __kzalloc - allocate memory. The memory is set to zero.
7 * @size: how many bytes of memory are required. 9 * @size: how many bytes of memory are required.
8 * @flags: the type of memory to allocate. 10 * @flags: the type of memory to allocate.
9 */ 11 */
10void *kzalloc(size_t size, gfp_t flags) 12void *__kzalloc(size_t size, gfp_t flags)
11{ 13{
12 void *ret = kmalloc(size, flags); 14 void *ret = ____kmalloc(size, flags);
13 if (ret) 15 if (ret)
14 memset(ret, 0, size); 16 memset(ret, 0, size);
15 return ret; 17 return ret;
16} 18}
17EXPORT_SYMBOL(kzalloc); 19EXPORT_SYMBOL(__kzalloc);
18 20
19/* 21/*
20 * kstrdup - allocate space for and copy an existing string 22 * kstrdup - allocate space for and copy an existing string
@@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp)
31 return NULL; 33 return NULL;
32 34
33 len = strlen(s) + 1; 35 len = strlen(s) + 1;
34 buf = kmalloc(len, gfp); 36 buf = ____kmalloc(len, gfp);
35 if (buf) 37 if (buf)
36 memcpy(buf, s, len); 38 memcpy(buf, s, len);
37 return buf; 39 return buf;
38} 40}
39EXPORT_SYMBOL(kstrdup); 41EXPORT_SYMBOL(kstrdup);
42
43/*
44 * strndup_user - duplicate an existing string from user space
45 *
46 * @s: The string to duplicate
47 * @n: Maximum number of bytes to copy, including the trailing NUL.
48 */
49char *strndup_user(const char __user *s, long n)
50{
51 char *p;
52 long length;
53
54 length = strnlen_user(s, n);
55
56 if (!length)
57 return ERR_PTR(-EFAULT);
58
59 if (length > n)
60 return ERR_PTR(-EINVAL);
61
62 p = kmalloc(length, GFP_KERNEL);
63
64 if (!p)
65 return ERR_PTR(-ENOMEM);
66
67 if (copy_from_user(p, s, length)) {
68 kfree(p);
69 return ERR_PTR(-EFAULT);
70 }
71
72 p[length - 1] = '\0';
73
74 return p;
75}
76EXPORT_SYMBOL(strndup_user);
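strndup_user() returns ERR_PTR() codes rather than NULL, so callers are expected to use IS_ERR()/PTR_ERR(). A minimal hypothetical caller (the function name and the PAGE_SIZE limit are chosen only for illustration):

    static long example_set_name(const char __user *uname)
    {
    	char *name = strndup_user(uname, PAGE_SIZE);

    	if (IS_ERR(name))
    		return PTR_ERR(name);

    	/* ... use the NUL-terminated kernel copy ... */

    	kfree(name);
    	return 0;
    }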
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fd572bbdc9f5..78865c849f8f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1356,7 +1356,9 @@ static int __init kswapd_init(void)
1356 1356
1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); 1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1358 BUG_ON(pid < 0); 1358 BUG_ON(pid < 0);
1359 read_lock(&tasklist_lock);
1359 pgdat->kswapd = find_task_by_pid(pid); 1360 pgdat->kswapd = find_task_by_pid(pid);
1361 read_unlock(&tasklist_lock);
1360 } 1362 }
1361 total_memory = nr_free_pagecache_pages(); 1363 total_memory = nr_free_pagecache_pages();
1362 hotcpu_notifier(cpu_callback, 0); 1364 hotcpu_notifier(cpu_callback, 0);