Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c           46
-rw-r--r--  mm/highmem.c            2
-rw-r--r--  mm/mempool.c           35
-rw-r--r--  mm/mmap.c              34
-rw-r--r--  mm/mremap.c             6
-rw-r--r--  mm/page-writeback.c     6
-rw-r--r--  mm/page_alloc.c        36
-rw-r--r--  mm/rmap.c             113
-rw-r--r--  mm/slab.c              47
-rw-r--r--  mm/swap_state.c        27
-rw-r--r--  mm/truncate.c           4
-rw-r--r--  mm/vmalloc.c            8
12 files changed, 202 insertions(+), 162 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 93595c327bbd..d5fdae2eb183 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -123,8 +123,7 @@ void remove_from_page_cache(struct page *page)
123{ 123{
124 struct address_space *mapping = page->mapping; 124 struct address_space *mapping = page->mapping;
125 125
126 if (unlikely(!PageLocked(page))) 126 BUG_ON(!PageLocked(page));
127 PAGE_BUG(page);
128 127
129 write_lock_irq(&mapping->tree_lock); 128 write_lock_irq(&mapping->tree_lock);
130 __remove_from_page_cache(page); 129 __remove_from_page_cache(page);
@@ -139,7 +138,25 @@ static int sync_page(void *word)
139 page = container_of((page_flags_t *)word, struct page, flags); 138 page = container_of((page_flags_t *)word, struct page, flags);
140 139
141 /* 140 /*
142 * FIXME, fercrissake. What is this barrier here for? 141 * page_mapping() is being called without PG_locked held.
142 * Some knowledge of the state and use of the page is used to
143 * reduce the requirements down to a memory barrier.
144 * The danger here is of a stale page_mapping() return value
145 * indicating a struct address_space different from the one it's
146 * associated with when it is associated with one.
147 * After smp_mb(), it's either the correct page_mapping() for
148 * the page, or an old page_mapping() and the page's own
149 * page_mapping() has gone NULL.
150 * The ->sync_page() address_space operation must tolerate
151 * page_mapping() going NULL. By an amazing coincidence,
152 * this comes about because none of the users of the page
153 * in the ->sync_page() methods make essential use of the
154 * page_mapping(), merely passing the page down to the backing
155 * device's unplug functions when it's non-NULL, which in turn
156 * ignore it for all cases but swap, where only page->private is
157 * of interest. When page_mapping() does go NULL, the entire
158 * call stack gracefully ignores the page and returns.
159 * -- wli
143 */ 160 */
144 smp_mb(); 161 smp_mb();
145 mapping = page_mapping(page); 162 mapping = page_mapping(page);
@@ -152,9 +169,10 @@ static int sync_page(void *word)
152/** 169/**
153 * filemap_fdatawrite_range - start writeback against all of a mapping's 170 * filemap_fdatawrite_range - start writeback against all of a mapping's
154 * dirty pages that lie within the byte offsets <start, end> 171 * dirty pages that lie within the byte offsets <start, end>
155 * @mapping: address space structure to write 172 * @mapping: address space structure to write
156 * @start: offset in bytes where the range starts 173 * @start: offset in bytes where the range starts
157 * @end : offset in bytes where the range ends 174 * @end: offset in bytes where the range ends
175 * @sync_mode: enable synchronous operation
158 * 176 *
159 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 177 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
160 * opposed to a regular memory * cleansing writeback. The difference between 178 * opposed to a regular memory * cleansing writeback. The difference between
@@ -518,8 +536,8 @@ EXPORT_SYMBOL(find_trylock_page);
518/** 536/**
519 * find_lock_page - locate, pin and lock a pagecache page 537 * find_lock_page - locate, pin and lock a pagecache page
520 * 538 *
521 * @mapping - the address_space to search 539 * @mapping: the address_space to search
522 * @offset - the page index 540 * @offset: the page index
523 * 541 *
524 * Locates the desired pagecache page, locks it, increments its reference 542 * Locates the desired pagecache page, locks it, increments its reference
525 * count and returns its address. 543 * count and returns its address.
@@ -558,9 +576,9 @@ EXPORT_SYMBOL(find_lock_page);
558/** 576/**
559 * find_or_create_page - locate or add a pagecache page 577 * find_or_create_page - locate or add a pagecache page
560 * 578 *
561 * @mapping - the page's address_space 579 * @mapping: the page's address_space
562 * @index - the page's index into the mapping 580 * @index: the page's index into the mapping
563 * @gfp_mask - page allocation mode 581 * @gfp_mask: page allocation mode
564 * 582 *
565 * Locates a page in the pagecache. If the page is not present, a new page 583 * Locates a page in the pagecache. If the page is not present, a new page
566 * is allocated using @gfp_mask and is added to the pagecache and to the VM's 584 * is allocated using @gfp_mask and is added to the pagecache and to the VM's
@@ -1949,7 +1967,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1949 buf = iov->iov_base + written; 1967 buf = iov->iov_base + written;
1950 else { 1968 else {
1951 filemap_set_next_iovec(&cur_iov, &iov_base, written); 1969 filemap_set_next_iovec(&cur_iov, &iov_base, written);
1952 buf = iov->iov_base + iov_base; 1970 buf = cur_iov->iov_base + iov_base;
1953 } 1971 }
1954 1972
1955 do { 1973 do {
@@ -2007,9 +2025,11 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2007 count -= status; 2025 count -= status;
2008 pos += status; 2026 pos += status;
2009 buf += status; 2027 buf += status;
2010 if (unlikely(nr_segs > 1)) 2028 if (unlikely(nr_segs > 1)) {
2011 filemap_set_next_iovec(&cur_iov, 2029 filemap_set_next_iovec(&cur_iov,
2012 &iov_base, status); 2030 &iov_base, status);
2031 buf = cur_iov->iov_base + iov_base;
2032 }
2013 } 2033 }
2014 } 2034 }
2015 if (unlikely(copied != bytes)) 2035 if (unlikely(copied != bytes))
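Note on the mm/filemap.c hunks above: generic_file_buffered_write() kept computing the user buffer from iov->iov_base even after filemap_set_next_iovec() had advanced cur_iov, so multi-segment writev() calls could copy from a stale segment. The fix recomputes buf from cur_iov both at the top of the loop and after each partial copy. Below is a minimal userspace sketch of the corrected bookkeeping; advance_iovec() is a hypothetical helper, not the kernel's, and it assumes at least one byte of the write still remains.

#include <stddef.h>
#include <sys/uio.h>

/*
 * After 'copied' more bytes have been written, step to the iovec
 * segment that now holds the write position and recompute the user
 * pointer from the *current* segment -- the same thing the hunks
 * above do via cur_iov->iov_base + iov_base.
 */
const char *advance_iovec(const struct iovec **cur_iov,
			  size_t *iov_off, size_t copied)
{
	size_t off = *iov_off + copied;

	while (off >= (*cur_iov)->iov_len) {	/* segment fully consumed */
		off -= (*cur_iov)->iov_len;
		(*cur_iov)++;
	}
	*iov_off = off;
	return (const char *)(*cur_iov)->iov_base + off;
}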
diff --git a/mm/highmem.c b/mm/highmem.c
index d01276506b00..400911599468 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -325,6 +325,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
325 continue; 325 continue;
326 326
327 mempool_free(bvec->bv_page, pool); 327 mempool_free(bvec->bv_page, pool);
328 dec_page_state(nr_bounce);
328 } 329 }
329 330
330 bio_endio(bio_orig, bio_orig->bi_size, err); 331 bio_endio(bio_orig, bio_orig->bi_size, err);
@@ -405,6 +406,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
405 to->bv_page = mempool_alloc(pool, q->bounce_gfp); 406 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
406 to->bv_len = from->bv_len; 407 to->bv_len = from->bv_len;
407 to->bv_offset = from->bv_offset; 408 to->bv_offset = from->bv_offset;
409 inc_page_state(nr_bounce);
408 410
409 if (rw == WRITE) { 411 if (rw == WRITE) {
410 char *vto, *vfrom; 412 char *vto, *vfrom;
diff --git a/mm/mempool.c b/mm/mempool.c
index b014ffeaa413..c9f3d4620428 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -198,31 +198,22 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
198 void *element; 198 void *element;
199 unsigned long flags; 199 unsigned long flags;
200 DEFINE_WAIT(wait); 200 DEFINE_WAIT(wait);
201 int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); 201 int gfp_temp;
202 202
203 might_sleep_if(gfp_mask & __GFP_WAIT); 203 might_sleep_if(gfp_mask & __GFP_WAIT);
204
205 gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
206 gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
207 gfp_mask |= __GFP_NOWARN; /* failures are OK */
208
209 gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO);
210
204repeat_alloc: 211repeat_alloc:
205 element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data); 212
213 element = pool->alloc(gfp_temp, pool->pool_data);
206 if (likely(element != NULL)) 214 if (likely(element != NULL))
207 return element; 215 return element;
208 216
209 /*
210 * If the pool is less than 50% full and we can perform effective
211 * page reclaim then try harder to allocate an element.
212 */
213 mb();
214 if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) &&
215 (pool->curr_nr <= pool->min_nr/2)) {
216 element = pool->alloc(gfp_mask, pool->pool_data);
217 if (likely(element != NULL))
218 return element;
219 }
220
221 /*
222 * Kick the VM at this point.
223 */
224 wakeup_bdflush(0);
225
226 spin_lock_irqsave(&pool->lock, flags); 217 spin_lock_irqsave(&pool->lock, flags);
227 if (likely(pool->curr_nr)) { 218 if (likely(pool->curr_nr)) {
228 element = remove_element(pool); 219 element = remove_element(pool);
@@ -235,8 +226,10 @@ repeat_alloc:
235 if (!(gfp_mask & __GFP_WAIT)) 226 if (!(gfp_mask & __GFP_WAIT))
236 return NULL; 227 return NULL;
237 228
229 /* Now start performing page reclaim */
230 gfp_temp = gfp_mask;
238 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 231 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
239 mb(); 232 smp_mb();
240 if (!pool->curr_nr) 233 if (!pool->curr_nr)
241 io_schedule(); 234 io_schedule();
242 finish_wait(&pool->wait, &wait); 235 finish_wait(&pool->wait, &wait);
@@ -257,7 +250,7 @@ void mempool_free(void *element, mempool_t *pool)
257{ 250{
258 unsigned long flags; 251 unsigned long flags;
259 252
260 mb(); 253 smp_mb();
261 if (pool->curr_nr < pool->min_nr) { 254 if (pool->curr_nr < pool->min_nr) {
262 spin_lock_irqsave(&pool->lock, flags); 255 spin_lock_irqsave(&pool->lock, flags);
263 if (pool->curr_nr < pool->min_nr) { 256 if (pool->curr_nr < pool->min_nr) {
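Note on the mm/mempool.c hunks above: mempool_alloc() now always adds __GFP_NOMEMALLOC, __GFP_NORETRY and __GFP_NOWARN, makes its first attempt without __GFP_WAIT|__GFP_IO, and only escalates to the full (possibly reclaiming) mask once the pool's own reserve is empty; the wakeup_bdflush() kick and the 50%-full heuristic are gone. The sketch below compresses that ordering; the types and helper functions are placeholders for pool->alloc(), remove_element() under the pool lock, and the prepare_to_wait()/io_schedule() sequence, and the flag bits are illustrative rather than the real gfp values.

#include <stddef.h>

enum {				/* illustrative bits, not the real gfp values */
	SK_WAIT		= 1 << 0,	/* __GFP_WAIT       */
	SK_IO		= 1 << 1,	/* __GFP_IO         */
	SK_NOMEMALLOC	= 1 << 2,	/* __GFP_NOMEMALLOC */
	SK_NORETRY	= 1 << 3,	/* __GFP_NORETRY    */
	SK_NOWARN	= 1 << 4,	/* __GFP_NOWARN     */
};

struct pool;					/* stands in for mempool_t          */
extern void *alloc_fn(int gfp, struct pool *p);	/* pool->alloc()                    */
extern void *take_reserved(struct pool *p);	/* remove_element() under pool lock */
extern void wait_for_free(struct pool *p);	/* prepare_to_wait() + io_schedule() */

void *mempool_alloc_sketch(struct pool *pool, int gfp_mask)
{
	int gfp_full = gfp_mask | SK_NOMEMALLOC | SK_NORETRY | SK_NOWARN;
	int gfp_now  = gfp_full & ~(SK_WAIT | SK_IO);	/* cheap, non-blocking try */
	void *p;

	for (;;) {
		p = alloc_fn(gfp_now, pool);
		if (p)
			return p;
		p = take_reserved(pool);	/* dip into the pool's reserve */
		if (p)
			return p;
		if (!(gfp_mask & SK_WAIT))
			return NULL;		/* caller cannot sleep */
		gfp_now = gfp_full;		/* later tries may enter reclaim */
		wait_for_free(pool);		/* sleep until mempool_free() wakes us */
	}
}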
diff --git a/mm/mmap.c b/mm/mmap.c
index 6ea204cc751e..01f9793591f6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -937,9 +937,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
937 /* mlock MCL_FUTURE? */ 937 /* mlock MCL_FUTURE? */
938 if (vm_flags & VM_LOCKED) { 938 if (vm_flags & VM_LOCKED) {
939 unsigned long locked, lock_limit; 939 unsigned long locked, lock_limit;
940 locked = mm->locked_vm << PAGE_SHIFT; 940 locked = len >> PAGE_SHIFT;
941 locked += mm->locked_vm;
941 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 942 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
942 locked += len; 943 lock_limit >>= PAGE_SHIFT;
943 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 944 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
944 return -EAGAIN; 945 return -EAGAIN;
945 } 946 }
@@ -1009,8 +1010,7 @@ munmap_back:
1009 } 1010 }
1010 1011
1011 /* Check against address space limit. */ 1012 /* Check against address space limit. */
1012 if ((mm->total_vm << PAGE_SHIFT) + len 1013 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1013 > current->signal->rlim[RLIMIT_AS].rlim_cur)
1014 return -ENOMEM; 1014 return -ENOMEM;
1015 1015
1016 if (accountable && (!(flags & MAP_NORESERVE) || 1016 if (accountable && (!(flags & MAP_NORESERVE) ||
@@ -1421,7 +1421,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1421 struct rlimit *rlim = current->signal->rlim; 1421 struct rlimit *rlim = current->signal->rlim;
1422 1422
1423 /* address space limit tests */ 1423 /* address space limit tests */
1424 if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT) 1424 if (!may_expand_vm(mm, grow))
1425 return -ENOMEM; 1425 return -ENOMEM;
1426 1426
1427 /* Stack limit test */ 1427 /* Stack limit test */
@@ -1823,9 +1823,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1823 */ 1823 */
1824 if (mm->def_flags & VM_LOCKED) { 1824 if (mm->def_flags & VM_LOCKED) {
1825 unsigned long locked, lock_limit; 1825 unsigned long locked, lock_limit;
1826 locked = mm->locked_vm << PAGE_SHIFT; 1826 locked = len >> PAGE_SHIFT;
1827 locked += mm->locked_vm;
1827 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 1828 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1828 locked += len; 1829 lock_limit >>= PAGE_SHIFT;
1829 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 1830 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
1830 return -EAGAIN; 1831 return -EAGAIN;
1831 } 1832 }
@@ -1848,8 +1849,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1848 } 1849 }
1849 1850
1850 /* Check against address space limits *after* clearing old maps... */ 1851 /* Check against address space limits *after* clearing old maps... */
1851 if ((mm->total_vm << PAGE_SHIFT) + len 1852 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1852 > current->signal->rlim[RLIMIT_AS].rlim_cur)
1853 return -ENOMEM; 1853 return -ENOMEM;
1854 1854
1855 if (mm->map_count > sysctl_max_map_count) 1855 if (mm->map_count > sysctl_max_map_count)
@@ -2019,3 +2019,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2019 } 2019 }
2020 return new_vma; 2020 return new_vma;
2021} 2021}
2022
2023/*
2024 * Return true if the calling process may expand its vm space by the passed
2025 * number of pages
2026 */
2027int may_expand_vm(struct mm_struct *mm, unsigned long npages)
2028{
2029 unsigned long cur = mm->total_vm; /* pages */
2030 unsigned long lim;
2031
2032 lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
2033
2034 if (cur + npages > lim)
2035 return 0;
2036 return 1;
2037}
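Note on the mm/mmap.c hunks above: the RLIMIT_AS checks in do_mmap_pgoff(), do_brk() and acct_stack_growth() are consolidated into the new may_expand_vm() shown at the end of the file, and the RLIMIT_MEMLOCK checks now compare page counts instead of byte counts, so mm->locked_vm is no longer shifted up into bytes where the sum could overflow an unsigned long on 32-bit. A small standalone sketch of the page-unit comparison; the PAGE_SHIFT value here is illustrative, the real one is per-architecture.

#define SKETCH_PAGE_SHIFT 12	/* illustrative; 4 KiB pages */

int mlock_would_exceed(unsigned long locked_vm_pages,	/* mm->locked_vm        */
		       unsigned long len_bytes,		/* new mapping length   */
		       unsigned long limit_bytes)	/* RLIMIT_MEMLOCK cur   */
{
	unsigned long locked = (len_bytes >> SKETCH_PAGE_SHIFT) + locked_vm_pages;
	unsigned long limit  = limit_bytes >> SKETCH_PAGE_SHIFT;

	return locked > limit;	/* caller still allows CAP_IPC_LOCK through */
}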
diff --git a/mm/mremap.c b/mm/mremap.c
index 0d1c1b9c7a0a..0dd7ace94e51 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -347,10 +347,10 @@ unsigned long do_mremap(unsigned long addr,
347 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 347 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
348 goto out; 348 goto out;
349 } 349 }
350 ret = -ENOMEM; 350 if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) {
351 if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) 351 ret = -ENOMEM;
352 > current->signal->rlim[RLIMIT_AS].rlim_cur)
353 goto out; 352 goto out;
353 }
354 354
355 if (vma->vm_flags & VM_ACCOUNT) { 355 if (vma->vm_flags & VM_ACCOUNT) {
356 charged = (new_len - old_len) >> PAGE_SHIFT; 356 charged = (new_len - old_len) >> PAGE_SHIFT;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6ddd6a29c73b..613b99a55917 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -255,7 +255,7 @@ static void balance_dirty_pages(struct address_space *mapping)
255 255
256/** 256/**
257 * balance_dirty_pages_ratelimited - balance dirty memory state 257 * balance_dirty_pages_ratelimited - balance dirty memory state
258 * @mapping - address_space which was dirtied 258 * @mapping: address_space which was dirtied
259 * 259 *
260 * Processes which are dirtying memory should call in here once for each page 260 * Processes which are dirtying memory should call in here once for each page
261 * which was newly dirtied. The function will periodically check the system's 261 * which was newly dirtied. The function will periodically check the system's
@@ -562,8 +562,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
562/** 562/**
563 * write_one_page - write out a single page and optionally wait on I/O 563 * write_one_page - write out a single page and optionally wait on I/O
564 * 564 *
565 * @page - the page to write 565 * @page: the page to write
566 * @wait - if true, wait on writeout 566 * @wait: if true, wait on writeout
567 * 567 *
568 * The page must be locked by the caller and will be unlocked upon return. 568 * The page must be locked by the caller and will be unlocked upon return.
569 * 569 *
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c73dbbc1cd8f..fc1b1064c505 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -799,14 +799,18 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
799 } 799 }
800 800
801 /* This allocation should allow future memory freeing. */ 801 /* This allocation should allow future memory freeing. */
802 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { 802
803 /* go through the zonelist yet again, ignoring mins */ 803 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
804 for (i = 0; (z = zones[i]) != NULL; i++) { 804 && !in_interrupt()) {
805 if (!cpuset_zone_allowed(z)) 805 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
806 continue; 806 /* go through the zonelist yet again, ignoring mins */
807 page = buffered_rmqueue(z, order, gfp_mask); 807 for (i = 0; (z = zones[i]) != NULL; i++) {
808 if (page) 808 if (!cpuset_zone_allowed(z))
809 goto got_pg; 809 continue;
810 page = buffered_rmqueue(z, order, gfp_mask);
811 if (page)
812 goto got_pg;
813 }
810 } 814 }
811 goto nopage; 815 goto nopage;
812 } 816 }
@@ -1351,8 +1355,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli
1351#define MAX_NODE_LOAD (num_online_nodes()) 1355#define MAX_NODE_LOAD (num_online_nodes())
1352static int __initdata node_load[MAX_NUMNODES]; 1356static int __initdata node_load[MAX_NUMNODES];
1353/** 1357/**
1354 * find_next_best_node - find the next node that should appear in a given 1358 * find_next_best_node - find the next node that should appear in a given node's fallback list
1355 * node's fallback list
1356 * @node: node whose fallback list we're appending 1359 * @node: node whose fallback list we're appending
1357 * @used_node_mask: nodemask_t of already used nodes 1360 * @used_node_mask: nodemask_t of already used nodes
1358 * 1361 *
@@ -1671,6 +1674,18 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1671 if (batch < 1) 1674 if (batch < 1)
1672 batch = 1; 1675 batch = 1;
1673 1676
1677 /*
1678 * Clamp the batch to a 2^n - 1 value. Having a power
1679 * of 2 value was found to be more likely to have
1680 * suboptimal cache aliasing properties in some cases.
1681 *
1682 * For example if 2 tasks are alternately allocating
1683 * batches of pages, one task can end up with a lot
1684 * of pages of one half of the possible page colors
1685 * and the other with pages of the other colors.
1686 */
1687 batch = (1 << fls(batch + batch/2)) - 1;
1688
1674 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1689 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1675 struct per_cpu_pages *pcp; 1690 struct per_cpu_pages *pcp;
1676 1691
@@ -1881,6 +1896,7 @@ static char *vmstat_text[] = {
1881 "allocstall", 1896 "allocstall",
1882 1897
1883 "pgrotated", 1898 "pgrotated",
1899 "nr_bounce",
1884}; 1900};
1885 1901
1886static void *vmstat_start(struct seq_file *m, loff_t *pos) 1902static void *vmstat_start(struct seq_file *m, loff_t *pos)
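Note on the mm/page_alloc.c hunks above: besides honouring __GFP_NOMEMALLOC on the PF_MEMALLOC/TIF_MEMDIE path, free_area_init_core() now clamps the per-cpu-pages batch to a 2^n - 1 value to avoid the page-colour ping-pong described in the new comment. A runnable userspace sketch of that clamp follows; fls() is emulated locally, the kernel uses its own bitops helper.

#include <stdio.h>

static int fls_sketch(unsigned int x)	/* index of highest set bit, 1-based */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	int batch;

	for (batch = 1; batch <= 64; batch *= 2)
		printf("%2d -> %3d\n", batch,
		       (1 << fls_sketch(batch + batch / 2)) - 1);
	return 0;
}

For power-of-two inputs this prints 1->1, 2->3, 4->7, 8->15, 16->31, 32->63, 64->127: always just below a power of two rather than exactly on one.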
diff --git a/mm/rmap.c b/mm/rmap.c
index 884d6d1928bc..378de234c12b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -243,6 +243,42 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
243} 243}
244 244
245/* 245/*
246 * Check that @page is mapped at @address into @mm.
247 *
248 * On success returns with mapped pte and locked mm->page_table_lock.
249 */
250static pte_t *page_check_address(struct page *page, struct mm_struct *mm,
251 unsigned long address)
252{
253 pgd_t *pgd;
254 pud_t *pud;
255 pmd_t *pmd;
256 pte_t *pte;
257
258 /*
259 * We need the page_table_lock to protect us from page faults,
260 * munmap, fork, etc...
261 */
262 spin_lock(&mm->page_table_lock);
263 pgd = pgd_offset(mm, address);
264 if (likely(pgd_present(*pgd))) {
265 pud = pud_offset(pgd, address);
266 if (likely(pud_present(*pud))) {
267 pmd = pmd_offset(pud, address);
268 if (likely(pmd_present(*pmd))) {
269 pte = pte_offset_map(pmd, address);
270 if (likely(pte_present(*pte) &&
271 page_to_pfn(page) == pte_pfn(*pte)))
272 return pte;
273 pte_unmap(pte);
274 }
275 }
276 }
277 spin_unlock(&mm->page_table_lock);
278 return ERR_PTR(-ENOENT);
279}
280
281/*
246 * Subfunctions of page_referenced: page_referenced_one called 282 * Subfunctions of page_referenced: page_referenced_one called
247 * repeatedly from either page_referenced_anon or page_referenced_file. 283 * repeatedly from either page_referenced_anon or page_referenced_file.
248 */ 284 */
@@ -251,9 +287,6 @@ static int page_referenced_one(struct page *page,
251{ 287{
252 struct mm_struct *mm = vma->vm_mm; 288 struct mm_struct *mm = vma->vm_mm;
253 unsigned long address; 289 unsigned long address;
254 pgd_t *pgd;
255 pud_t *pud;
256 pmd_t *pmd;
257 pte_t *pte; 290 pte_t *pte;
258 int referenced = 0; 291 int referenced = 0;
259 292
@@ -263,39 +296,18 @@ static int page_referenced_one(struct page *page,
263 if (address == -EFAULT) 296 if (address == -EFAULT)
264 goto out; 297 goto out;
265 298
266 spin_lock(&mm->page_table_lock); 299 pte = page_check_address(page, mm, address);
267 300 if (!IS_ERR(pte)) {
268 pgd = pgd_offset(mm, address); 301 if (ptep_clear_flush_young(vma, address, pte))
269 if (!pgd_present(*pgd)) 302 referenced++;
270 goto out_unlock;
271
272 pud = pud_offset(pgd, address);
273 if (!pud_present(*pud))
274 goto out_unlock;
275
276 pmd = pmd_offset(pud, address);
277 if (!pmd_present(*pmd))
278 goto out_unlock;
279
280 pte = pte_offset_map(pmd, address);
281 if (!pte_present(*pte))
282 goto out_unmap;
283
284 if (page_to_pfn(page) != pte_pfn(*pte))
285 goto out_unmap;
286
287 if (ptep_clear_flush_young(vma, address, pte))
288 referenced++;
289
290 if (mm != current->mm && !ignore_token && has_swap_token(mm))
291 referenced++;
292 303
293 (*mapcount)--; 304 if (mm != current->mm && !ignore_token && has_swap_token(mm))
305 referenced++;
294 306
295out_unmap: 307 (*mapcount)--;
296 pte_unmap(pte); 308 pte_unmap(pte);
297out_unlock: 309 spin_unlock(&mm->page_table_lock);
298 spin_unlock(&mm->page_table_lock); 310 }
299out: 311out:
300 return referenced; 312 return referenced;
301} 313}
@@ -502,9 +514,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
502{ 514{
503 struct mm_struct *mm = vma->vm_mm; 515 struct mm_struct *mm = vma->vm_mm;
504 unsigned long address; 516 unsigned long address;
505 pgd_t *pgd;
506 pud_t *pud;
507 pmd_t *pmd;
508 pte_t *pte; 517 pte_t *pte;
509 pte_t pteval; 518 pte_t pteval;
510 int ret = SWAP_AGAIN; 519 int ret = SWAP_AGAIN;
@@ -515,30 +524,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
515 if (address == -EFAULT) 524 if (address == -EFAULT)
516 goto out; 525 goto out;
517 526
518 /* 527 pte = page_check_address(page, mm, address);
519 * We need the page_table_lock to protect us from page faults, 528 if (IS_ERR(pte))
520 * munmap, fork, etc... 529 goto out;
521 */
522 spin_lock(&mm->page_table_lock);
523
524 pgd = pgd_offset(mm, address);
525 if (!pgd_present(*pgd))
526 goto out_unlock;
527
528 pud = pud_offset(pgd, address);
529 if (!pud_present(*pud))
530 goto out_unlock;
531
532 pmd = pmd_offset(pud, address);
533 if (!pmd_present(*pmd))
534 goto out_unlock;
535
536 pte = pte_offset_map(pmd, address);
537 if (!pte_present(*pte))
538 goto out_unmap;
539
540 if (page_to_pfn(page) != pte_pfn(*pte))
541 goto out_unmap;
542 530
543 /* 531 /*
544 * If the page is mlock()d, we cannot swap it out. 532 * If the page is mlock()d, we cannot swap it out.
@@ -604,7 +592,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
604 592
605out_unmap: 593out_unmap:
606 pte_unmap(pte); 594 pte_unmap(pte);
607out_unlock:
608 spin_unlock(&mm->page_table_lock); 595 spin_unlock(&mm->page_table_lock);
609out: 596out:
610 return ret; 597 return ret;
@@ -708,7 +695,6 @@ static void try_to_unmap_cluster(unsigned long cursor,
708 } 695 }
709 696
710 pte_unmap(pte); 697 pte_unmap(pte);
711
712out_unlock: 698out_unlock:
713 spin_unlock(&mm->page_table_lock); 699 spin_unlock(&mm->page_table_lock);
714} 700}
@@ -860,3 +846,4 @@ int try_to_unmap(struct page *page)
860 ret = SWAP_SUCCESS; 846 ret = SWAP_SUCCESS;
861 return ret; 847 return ret;
862} 848}
849
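Note on the mm/rmap.c hunks above: the duplicated pgd/pud/pmd/pte walk in page_referenced_one() and try_to_unmap_one() is factored into page_check_address(), which on success returns with the pte mapped and mm->page_table_lock held, and returns an ERR_PTR otherwise. The kernel-context sketch below (not compilable stand-alone, and mirroring the converted page_referenced_one()) shows the contract a caller has to follow: both the pte mapping and the lock must be released on every successful-lookup path.

static int touch_page_pte(struct page *page, struct vm_area_struct *vma,
			  unsigned long address)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t *pte;
	int ret = 0;

	pte = page_check_address(page, mm, address);
	if (IS_ERR(pte))
		return 0;		/* page not mapped at this address */

	if (ptep_clear_flush_young(vma, address, pte))
		ret = 1;		/* example work: test-and-clear young bit */

	pte_unmap(pte);			/* undo the pte_offset_map() done by the helper */
	spin_unlock(&mm->page_table_lock);
	return ret;
}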
diff --git a/mm/slab.c b/mm/slab.c
index ec660d85ddd7..840742641152 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -583,7 +583,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
583 return cachep->array[smp_processor_id()]; 583 return cachep->array[smp_processor_id()];
584} 584}
585 585
586static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) 586static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags)
587{ 587{
588 struct cache_sizes *csizep = malloc_sizes; 588 struct cache_sizes *csizep = malloc_sizes;
589 589
@@ -607,6 +607,12 @@ static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags)
607 return csizep->cs_cachep; 607 return csizep->cs_cachep;
608} 608}
609 609
610kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags)
611{
612 return __find_general_cachep(size, gfpflags);
613}
614EXPORT_SYMBOL(kmem_find_general_cachep);
615
610/* Cal the num objs, wastage, and bytes left over for a given slab size. */ 616/* Cal the num objs, wastage, and bytes left over for a given slab size. */
611static void cache_estimate(unsigned long gfporder, size_t size, size_t align, 617static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
612 int flags, size_t *left_over, unsigned int *num) 618 int flags, size_t *left_over, unsigned int *num)
@@ -672,14 +678,11 @@ static struct array_cache *alloc_arraycache(int cpu, int entries,
672 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 678 int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
673 struct array_cache *nc = NULL; 679 struct array_cache *nc = NULL;
674 680
675 if (cpu != -1) { 681 if (cpu == -1)
676 kmem_cache_t *cachep;
677 cachep = kmem_find_general_cachep(memsize, GFP_KERNEL);
678 if (cachep)
679 nc = kmem_cache_alloc_node(cachep, cpu_to_node(cpu));
680 }
681 if (!nc)
682 nc = kmalloc(memsize, GFP_KERNEL); 682 nc = kmalloc(memsize, GFP_KERNEL);
683 else
684 nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
685
683 if (nc) { 686 if (nc) {
684 nc->avail = 0; 687 nc->avail = 0;
685 nc->limit = entries; 688 nc->limit = entries;
@@ -1663,7 +1666,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
1663 } 1666 }
1664 1667
1665 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 1668 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
1666 synchronize_kernel(); 1669 synchronize_rcu();
1667 1670
1668 /* no cpu_online check required here since we clear the percpu 1671 /* no cpu_online check required here since we clear the percpu
1669 * array on cpu offline and set this to NULL. 1672 * array on cpu offline and set this to NULL.
@@ -2361,7 +2364,7 @@ out:
2361 * and can sleep. And it will allocate memory on the given node, which 2364 * and can sleep. And it will allocate memory on the given node, which
2362 * can improve the performance for cpu bound structures. 2365 * can improve the performance for cpu bound structures.
2363 */ 2366 */
2364void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid) 2367void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
2365{ 2368{
2366 int loop; 2369 int loop;
2367 void *objp; 2370 void *objp;
@@ -2393,7 +2396,7 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid)
2393 spin_unlock_irq(&cachep->spinlock); 2396 spin_unlock_irq(&cachep->spinlock);
2394 2397
2395 local_irq_disable(); 2398 local_irq_disable();
2396 if (!cache_grow(cachep, GFP_KERNEL, nodeid)) { 2399 if (!cache_grow(cachep, flags, nodeid)) {
2397 local_irq_enable(); 2400 local_irq_enable();
2398 return NULL; 2401 return NULL;
2399 } 2402 }
@@ -2435,6 +2438,16 @@ got_slabp:
2435} 2438}
2436EXPORT_SYMBOL(kmem_cache_alloc_node); 2439EXPORT_SYMBOL(kmem_cache_alloc_node);
2437 2440
2441void *kmalloc_node(size_t size, int flags, int node)
2442{
2443 kmem_cache_t *cachep;
2444
2445 cachep = kmem_find_general_cachep(size, flags);
2446 if (unlikely(cachep == NULL))
2447 return NULL;
2448 return kmem_cache_alloc_node(cachep, flags, node);
2449}
2450EXPORT_SYMBOL(kmalloc_node);
2438#endif 2451#endif
2439 2452
2440/** 2453/**
@@ -2462,7 +2475,12 @@ void *__kmalloc(size_t size, unsigned int __nocast flags)
2462{ 2475{
2463 kmem_cache_t *cachep; 2476 kmem_cache_t *cachep;
2464 2477
2465 cachep = kmem_find_general_cachep(size, flags); 2478 /* If you want to save a few bytes .text space: replace
2479 * __ with kmem_.
2480 * Then kmalloc uses the uninlined functions instead of the inline
2481 * functions.
2482 */
2483 cachep = __find_general_cachep(size, flags);
2466 if (unlikely(cachep == NULL)) 2484 if (unlikely(cachep == NULL))
2467 return NULL; 2485 return NULL;
2468 return __cache_alloc(cachep, flags); 2486 return __cache_alloc(cachep, flags);
@@ -2489,9 +2507,8 @@ void *__alloc_percpu(size_t size, size_t align)
2489 for (i = 0; i < NR_CPUS; i++) { 2507 for (i = 0; i < NR_CPUS; i++) {
2490 if (!cpu_possible(i)) 2508 if (!cpu_possible(i))
2491 continue; 2509 continue;
2492 pdata->ptrs[i] = kmem_cache_alloc_node( 2510 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL,
2493 kmem_find_general_cachep(size, GFP_KERNEL), 2511 cpu_to_node(i));
2494 cpu_to_node(i));
2495 2512
2496 if (!pdata->ptrs[i]) 2513 if (!pdata->ptrs[i])
2497 goto unwind_oom; 2514 goto unwind_oom;
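Note on the mm/slab.c hunks above: kmem_find_general_cachep() becomes a real exported symbol (with inline __find_general_cachep() kept for kmalloc), kmem_cache_alloc_node() gains a gfp-flags argument, and the new kmalloc_node() combines the two so callers such as __alloc_percpu() can request node-local memory directly. A kernel-context sketch of the intended usage; struct foo and alloc_foo_for_cpu() are illustrative, not kernel code.

struct foo {
	long hits;			/* illustrative per-cpu statistic */
};

static struct foo *alloc_foo_for_cpu(int cpu)
{
	struct foo *p;

	/* Place the object on the memory node that owns this CPU. */
	p = kmalloc_node(sizeof(*p), GFP_KERNEL, cpu_to_node(cpu));
	if (p)
		memset(p, 0, sizeof(*p));
	return p;
}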
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a063a902ed03..4f251775ef90 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -143,7 +143,6 @@ void __delete_from_swap_cache(struct page *page)
143int add_to_swap(struct page * page) 143int add_to_swap(struct page * page)
144{ 144{
145 swp_entry_t entry; 145 swp_entry_t entry;
146 int pf_flags;
147 int err; 146 int err;
148 147
149 if (!PageLocked(page)) 148 if (!PageLocked(page))
@@ -154,29 +153,19 @@ int add_to_swap(struct page * page)
154 if (!entry.val) 153 if (!entry.val)
155 return 0; 154 return 0;
156 155
157 /* Radix-tree node allocations are performing 156 /*
158 * GFP_ATOMIC allocations under PF_MEMALLOC. 157 * Radix-tree node allocations from PF_MEMALLOC contexts could
159 * They can completely exhaust the page allocator. 158 * completely exhaust the page allocator. __GFP_NOMEMALLOC
160 * 159 * stops emergency reserves from being allocated.
161 * So PF_MEMALLOC is dropped here. This causes the slab
162 * allocations to fail earlier, so radix-tree nodes will
163 * then be allocated from the mempool reserves.
164 * 160 *
165 * We're still using __GFP_HIGH for radix-tree node 161 * TODO: this could cause a theoretical memory reclaim
166 * allocations, so some of the emergency pools are available, 162 * deadlock in the swap out path.
167 * just not all of them.
168 */ 163 */
169
170 pf_flags = current->flags;
171 current->flags &= ~PF_MEMALLOC;
172
173 /* 164 /*
174 * Add it to the swap cache and mark it dirty 165 * Add it to the swap cache and mark it dirty
175 */ 166 */
176 err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN); 167 err = __add_to_swap_cache(page, entry,
177 168 GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN);
178 if (pf_flags & PF_MEMALLOC)
179 current->flags |= PF_MEMALLOC;
180 169
181 switch (err) { 170 switch (err) {
182 case 0: /* Success */ 171 case 0: /* Success */
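Note on the mm/swap_state.c hunk above: instead of temporarily clearing PF_MEMALLOC around the radix-tree node allocation, add_to_swap() now passes __GFP_NOMEMALLOC and lets the page allocator keep the allocation away from the emergency reserves (see the __alloc_pages() hunk earlier in this diff). A kernel-context sketch of the check this relies on, simplified from that hunk with the TIF_MEMDIE case omitted; the helper name is illustrative.

static int may_use_emergency_reserves(unsigned int gfp_mask,
				      unsigned long task_flags)
{
	if (gfp_mask & __GFP_NOMEMALLOC)
		return 0;			/* caller explicitly opted out */
	return (task_flags & PF_MEMALLOC) && !in_interrupt();
}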
diff --git a/mm/truncate.c b/mm/truncate.c
index c9a63f0b69a2..60c8764bfac2 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -242,7 +242,7 @@ EXPORT_SYMBOL(invalidate_inode_pages);
242 242
243/** 243/**
244 * invalidate_inode_pages2_range - remove range of pages from an address_space 244 * invalidate_inode_pages2_range - remove range of pages from an address_space
245 * @mapping - the address_space 245 * @mapping: the address_space
246 * @start: the page offset 'from' which to invalidate 246 * @start: the page offset 'from' which to invalidate
247 * @end: the page offset 'to' which to invalidate (inclusive) 247 * @end: the page offset 'to' which to invalidate (inclusive)
248 * 248 *
@@ -322,7 +322,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
322 322
323/** 323/**
324 * invalidate_inode_pages2 - remove all pages from an address_space 324 * invalidate_inode_pages2 - remove all pages from an address_space
325 * @mapping - the address_space 325 * @mapping: the address_space
326 * 326 *
327 * Any pages which are found to be mapped into pagetables are unmapped prior to 327 * Any pages which are found to be mapped into pagetables are unmapped prior to
328 * invalidation. 328 * invalidation.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c6182f6f1305..2bd83e5c2bbf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -475,6 +475,10 @@ void *vmalloc(unsigned long size)
475 475
476EXPORT_SYMBOL(vmalloc); 476EXPORT_SYMBOL(vmalloc);
477 477
478#ifndef PAGE_KERNEL_EXEC
479# define PAGE_KERNEL_EXEC PAGE_KERNEL
480#endif
481
478/** 482/**
479 * vmalloc_exec - allocate virtually contiguous, executable memory 483 * vmalloc_exec - allocate virtually contiguous, executable memory
480 * 484 *
@@ -488,10 +492,6 @@ EXPORT_SYMBOL(vmalloc);
488 * use __vmalloc() instead. 492 * use __vmalloc() instead.
489 */ 493 */
490 494
491#ifndef PAGE_KERNEL_EXEC
492# define PAGE_KERNEL_EXEC PAGE_KERNEL
493#endif
494
495void *vmalloc_exec(unsigned long size) 495void *vmalloc_exec(unsigned long size)
496{ 496{
497 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); 497 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);