Diffstat (limited to 'mm')
-rw-r--r--   mm/filemap.c        |  46
-rw-r--r--   mm/highmem.c        |   2
-rw-r--r--   mm/mempool.c        |  35
-rw-r--r--   mm/mmap.c           |  34
-rw-r--r--   mm/mremap.c         |   6
-rw-r--r--   mm/page-writeback.c |   6
-rw-r--r--   mm/page_alloc.c     |  36
-rw-r--r--   mm/rmap.c           | 113
-rw-r--r--   mm/slab.c           |  47
-rw-r--r--   mm/swap_state.c     |  27
-rw-r--r--   mm/truncate.c       |   4
-rw-r--r--   mm/vmalloc.c        |   8
12 files changed, 202 insertions, 162 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 93595c327bbd..d5fdae2eb183 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -123,8 +123,7 @@ void remove_from_page_cache(struct page *page) | |||
123 | { | 123 | { |
124 | struct address_space *mapping = page->mapping; | 124 | struct address_space *mapping = page->mapping; |
125 | 125 | ||
126 | if (unlikely(!PageLocked(page))) | 126 | BUG_ON(!PageLocked(page)); |
127 | PAGE_BUG(page); | ||
128 | 127 | ||
129 | write_lock_irq(&mapping->tree_lock); | 128 | write_lock_irq(&mapping->tree_lock); |
130 | __remove_from_page_cache(page); | 129 | __remove_from_page_cache(page); |
@@ -139,7 +138,25 @@ static int sync_page(void *word) | |||
139 | page = container_of((page_flags_t *)word, struct page, flags); | 138 | page = container_of((page_flags_t *)word, struct page, flags); |
140 | 139 | ||
141 | /* | 140 | /* |
142 | * FIXME, fercrissake. What is this barrier here for? | 141 | * page_mapping() is being called without PG_locked held. |
142 | * Some knowledge of the state and use of the page is used to | ||
143 | * reduce the requirements down to a memory barrier. | ||
144 | * The danger here is of a stale page_mapping() return value | ||
145 | * indicating a struct address_space different from the one it's | ||
146 | * associated with when it is associated with one. | ||
147 | * After smp_mb(), it's either the correct page_mapping() for | ||
148 | * the page, or an old page_mapping() and the page's own | ||
149 | * page_mapping() has gone NULL. | ||
150 | * The ->sync_page() address_space operation must tolerate | ||
151 | * page_mapping() going NULL. By an amazing coincidence, | ||
152 | * this comes about because none of the users of the page | ||
153 | * in the ->sync_page() methods make essential use of the | ||
154 | * page_mapping(), merely passing the page down to the backing | ||
155 | * device's unplug functions when it's non-NULL, which in turn | ||
156 | * ignore it for all cases but swap, where only page->private is | ||
157 | * of interest. When page_mapping() does go NULL, the entire | ||
158 | * call stack gracefully ignores the page and returns. | ||
159 | * -- wli | ||
143 | */ | 160 | */ |
144 | smp_mb(); | 161 | smp_mb(); |
145 | mapping = page_mapping(page); | 162 | mapping = page_mapping(page); |
@@ -152,9 +169,10 @@ static int sync_page(void *word) | |||
152 | /** | 169 | /** |
153 | * filemap_fdatawrite_range - start writeback against all of a mapping's | 170 | * filemap_fdatawrite_range - start writeback against all of a mapping's |
154 | * dirty pages that lie within the byte offsets <start, end> | 171 | * dirty pages that lie within the byte offsets <start, end> |
155 | * @mapping: address space structure to write | 172 | * @mapping: address space structure to write |
156 | * @start: offset in bytes where the range starts | 173 | * @start: offset in bytes where the range starts |
157 | * @end : offset in bytes where the range ends | 174 | * @end: offset in bytes where the range ends |
175 | * @sync_mode: enable synchronous operation | ||
158 | * | 176 | * |
159 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | 177 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as |
160 | * opposed to a regular memory * cleansing writeback. The difference between | 178 | * opposed to a regular memory * cleansing writeback. The difference between |
@@ -518,8 +536,8 @@ EXPORT_SYMBOL(find_trylock_page); | |||
518 | /** | 536 | /** |
519 | * find_lock_page - locate, pin and lock a pagecache page | 537 | * find_lock_page - locate, pin and lock a pagecache page |
520 | * | 538 | * |
521 | * @mapping - the address_space to search | 539 | * @mapping: the address_space to search |
522 | * @offset - the page index | 540 | * @offset: the page index |
523 | * | 541 | * |
524 | * Locates the desired pagecache page, locks it, increments its reference | 542 | * Locates the desired pagecache page, locks it, increments its reference |
525 | * count and returns its address. | 543 | * count and returns its address. |
@@ -558,9 +576,9 @@ EXPORT_SYMBOL(find_lock_page); | |||
558 | /** | 576 | /** |
559 | * find_or_create_page - locate or add a pagecache page | 577 | * find_or_create_page - locate or add a pagecache page |
560 | * | 578 | * |
561 | * @mapping - the page's address_space | 579 | * @mapping: the page's address_space |
562 | * @index - the page's index into the mapping | 580 | * @index: the page's index into the mapping |
563 | * @gfp_mask - page allocation mode | 581 | * @gfp_mask: page allocation mode |
564 | * | 582 | * |
565 | * Locates a page in the pagecache. If the page is not present, a new page | 583 | * Locates a page in the pagecache. If the page is not present, a new page |
566 | * is allocated using @gfp_mask and is added to the pagecache and to the VM's | 584 | * is allocated using @gfp_mask and is added to the pagecache and to the VM's |
@@ -1949,7 +1967,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1949 | buf = iov->iov_base + written; | 1967 | buf = iov->iov_base + written; |
1950 | else { | 1968 | else { |
1951 | filemap_set_next_iovec(&cur_iov, &iov_base, written); | 1969 | filemap_set_next_iovec(&cur_iov, &iov_base, written); |
1952 | buf = iov->iov_base + iov_base; | 1970 | buf = cur_iov->iov_base + iov_base; |
1953 | } | 1971 | } |
1954 | 1972 | ||
1955 | do { | 1973 | do { |
@@ -2007,9 +2025,11 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2007 | count -= status; | 2025 | count -= status; |
2008 | pos += status; | 2026 | pos += status; |
2009 | buf += status; | 2027 | buf += status; |
2010 | if (unlikely(nr_segs > 1)) | 2028 | if (unlikely(nr_segs > 1)) { |
2011 | filemap_set_next_iovec(&cur_iov, | 2029 | filemap_set_next_iovec(&cur_iov, |
2012 | &iov_base, status); | 2030 | &iov_base, status); |
2031 | buf = cur_iov->iov_base + iov_base; | ||
2032 | } | ||
2013 | } | 2033 | } |
2014 | } | 2034 | } |
2015 | if (unlikely(copied != bytes)) | 2035 | if (unlikely(copied != bytes)) |
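Note on the filemap.c change above: after a partial copy, generic_file_buffered_write() must recompute its source pointer from the iovec segment it has actually advanced to (cur_iov plus the in-segment offset), not from the original iov pointer; that is what both hunks fix. The user-space sketch below shows the same bookkeeping. It is illustrative only, not kernel code: set_next_iovec() and the 4-byte "partial copy" are invented for the example.

/*
 * Illustrative sketch only (not kernel code): walking an iovec array after
 * partial copies, mirroring the bookkeeping generic_file_buffered_write()
 * needs. set_next_iovec() and the 4-byte "partial copy" are invented here.
 */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Skip whole segments already consumed; leave an offset into the current one. */
static void set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
    const struct iovec *iov = *iovp;
    size_t base = *basep;

    while (bytes) {
        size_t copy = iov->iov_len - base;

        if (copy > bytes)
            copy = bytes;
        bytes -= copy;
        base += copy;
        if (base == iov->iov_len) {
            iov++;
            base = 0;
        }
    }
    *iovp = iov;
    *basep = base;
}

int main(void)
{
    char a[] = "hello ", b[] = "world";
    struct iovec vec[2] = {
        { .iov_base = a, .iov_len = 6 },
        { .iov_base = b, .iov_len = 5 },
    };
    const struct iovec *cur_iov = vec;
    size_t iov_base = 0, written = 0, total = 11;
    char dst[16] = "";

    while (written < total) {
        size_t chunk = total - written < 4 ? total - written : 4;
        size_t space = cur_iov->iov_len - iov_base;
        /* The bug fixed above: buf must come from cur_iov, not from vec[0]. */
        const char *buf = (const char *)cur_iov->iov_base + iov_base;

        if (chunk > space)
            chunk = space;
        memcpy(dst + written, buf, chunk);
        written += chunk;
        set_next_iovec(&cur_iov, &iov_base, chunk);
    }
    printf("%s\n", dst);    /* prints "hello world" */
    return 0;
}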
diff --git a/mm/highmem.c b/mm/highmem.c
index d01276506b00..400911599468 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -325,6 +325,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | |||
325 | continue; | 325 | continue; |
326 | 326 | ||
327 | mempool_free(bvec->bv_page, pool); | 327 | mempool_free(bvec->bv_page, pool); |
328 | dec_page_state(nr_bounce); | ||
328 | } | 329 | } |
329 | 330 | ||
330 | bio_endio(bio_orig, bio_orig->bi_size, err); | 331 | bio_endio(bio_orig, bio_orig->bi_size, err); |
@@ -405,6 +406,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | |||
405 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | 406 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); |
406 | to->bv_len = from->bv_len; | 407 | to->bv_len = from->bv_len; |
407 | to->bv_offset = from->bv_offset; | 408 | to->bv_offset = from->bv_offset; |
409 | inc_page_state(nr_bounce); | ||
408 | 410 | ||
409 | if (rw == WRITE) { | 411 | if (rw == WRITE) { |
410 | char *vto, *vfrom; | 412 | char *vto, *vfrom; |
diff --git a/mm/mempool.c b/mm/mempool.c
index b014ffeaa413..c9f3d4620428 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -198,31 +198,22 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask) | |||
198 | void *element; | 198 | void *element; |
199 | unsigned long flags; | 199 | unsigned long flags; |
200 | DEFINE_WAIT(wait); | 200 | DEFINE_WAIT(wait); |
201 | int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); | 201 | int gfp_temp; |
202 | 202 | ||
203 | might_sleep_if(gfp_mask & __GFP_WAIT); | 203 | might_sleep_if(gfp_mask & __GFP_WAIT); |
204 | |||
205 | gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ | ||
206 | gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ | ||
207 | gfp_mask |= __GFP_NOWARN; /* failures are OK */ | ||
208 | |||
209 | gfp_temp = gfp_mask & ~(__GFP_WAIT|__GFP_IO); | ||
210 | |||
204 | repeat_alloc: | 211 | repeat_alloc: |
205 | element = pool->alloc(gfp_nowait|__GFP_NOWARN, pool->pool_data); | 212 | |
213 | element = pool->alloc(gfp_temp, pool->pool_data); | ||
206 | if (likely(element != NULL)) | 214 | if (likely(element != NULL)) |
207 | return element; | 215 | return element; |
208 | 216 | ||
209 | /* | ||
210 | * If the pool is less than 50% full and we can perform effective | ||
211 | * page reclaim then try harder to allocate an element. | ||
212 | */ | ||
213 | mb(); | ||
214 | if ((gfp_mask & __GFP_FS) && (gfp_mask != gfp_nowait) && | ||
215 | (pool->curr_nr <= pool->min_nr/2)) { | ||
216 | element = pool->alloc(gfp_mask, pool->pool_data); | ||
217 | if (likely(element != NULL)) | ||
218 | return element; | ||
219 | } | ||
220 | |||
221 | /* | ||
222 | * Kick the VM at this point. | ||
223 | */ | ||
224 | wakeup_bdflush(0); | ||
225 | |||
226 | spin_lock_irqsave(&pool->lock, flags); | 217 | spin_lock_irqsave(&pool->lock, flags); |
227 | if (likely(pool->curr_nr)) { | 218 | if (likely(pool->curr_nr)) { |
228 | element = remove_element(pool); | 219 | element = remove_element(pool); |
@@ -235,8 +226,10 @@ repeat_alloc: | |||
235 | if (!(gfp_mask & __GFP_WAIT)) | 226 | if (!(gfp_mask & __GFP_WAIT)) |
236 | return NULL; | 227 | return NULL; |
237 | 228 | ||
229 | /* Now start performing page reclaim */ | ||
230 | gfp_temp = gfp_mask; | ||
238 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); | 231 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); |
239 | mb(); | 232 | smp_mb(); |
240 | if (!pool->curr_nr) | 233 | if (!pool->curr_nr) |
241 | io_schedule(); | 234 | io_schedule(); |
242 | finish_wait(&pool->wait, &wait); | 235 | finish_wait(&pool->wait, &wait); |
@@ -257,7 +250,7 @@ void mempool_free(void *element, mempool_t *pool) | |||
257 | { | 250 | { |
258 | unsigned long flags; | 251 | unsigned long flags; |
259 | 252 | ||
260 | mb(); | 253 | smp_mb(); |
261 | if (pool->curr_nr < pool->min_nr) { | 254 | if (pool->curr_nr < pool->min_nr) { |
262 | spin_lock_irqsave(&pool->lock, flags); | 255 | spin_lock_irqsave(&pool->lock, flags); |
263 | if (pool->curr_nr < pool->min_nr) { | 256 | if (pool->curr_nr < pool->min_nr) { |
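Note on the mempool.c change above: mempool_alloc() now always ORs in __GFP_NOMEMALLOC, __GFP_NORETRY and __GFP_NOWARN, makes a first attempt with __GFP_WAIT and __GFP_IO masked off so it fails fast, falls back to the reserved pool, and only retries with the full mask (allowing reclaim) after a caller that may sleep has found the pool empty. The user-space mock below models just that control flow; the flag values, backing_alloc() and the pool array are made up for the example.

/*
 * Toy model of the two-stage mempool_alloc() flow above. The flag values
 * and the failing "backing allocator" are invented for illustration.
 */
#include <stdio.h>
#include <stdlib.h>

#define GFP_WAIT        0x1     /* caller may sleep / reclaim */
#define GFP_IO          0x2
#define GFP_NORETRY     0x4
#define GFP_NOWARN      0x8
#define GFP_NOMEMALLOC  0x10

static void *pool[4];
static int curr_nr;

/* Backing allocator: in this mock it only succeeds when reclaim is allowed. */
static void *backing_alloc(int gfp)
{
    return (gfp & GFP_WAIT) ? malloc(32) : NULL;
}

static void *mock_mempool_alloc(int gfp_mask)
{
    int gfp_temp;
    void *element;

    gfp_mask |= GFP_NOMEMALLOC | GFP_NORETRY | GFP_NOWARN;
    gfp_temp = gfp_mask & ~(GFP_WAIT | GFP_IO);     /* first pass: fail fast */

    for (;;) {
        element = backing_alloc(gfp_temp);
        if (element)
            return element;
        if (curr_nr)                    /* fall back to the reserve pool */
            return pool[--curr_nr];
        if (!(gfp_mask & GFP_WAIT))     /* atomic caller: give up */
            return NULL;
        gfp_temp = gfp_mask;            /* now allow reclaim and retry */
    }
}

int main(void)
{
    pool[curr_nr++] = malloc(32);
    printf("atomic alloc:  %p\n", mock_mempool_alloc(0));        /* pool element */
    printf("atomic alloc:  %p\n", mock_mempool_alloc(0));        /* NULL: pool empty */
    printf("waiting alloc: %p\n", mock_mempool_alloc(GFP_WAIT)); /* reclaim path */
    return 0;
}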
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -937,9 +937,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
937 | /* mlock MCL_FUTURE? */ | 937 | /* mlock MCL_FUTURE? */ |
938 | if (vm_flags & VM_LOCKED) { | 938 | if (vm_flags & VM_LOCKED) { |
939 | unsigned long locked, lock_limit; | 939 | unsigned long locked, lock_limit; |
940 | locked = mm->locked_vm << PAGE_SHIFT; | 940 | locked = len >> PAGE_SHIFT; |
941 | locked += mm->locked_vm; | ||
941 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 942 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
942 | locked += len; | 943 | lock_limit >>= PAGE_SHIFT; |
943 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 944 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
944 | return -EAGAIN; | 945 | return -EAGAIN; |
945 | } | 946 | } |
@@ -1009,8 +1010,7 @@ munmap_back: | |||
1009 | } | 1010 | } |
1010 | 1011 | ||
1011 | /* Check against address space limit. */ | 1012 | /* Check against address space limit. */ |
1012 | if ((mm->total_vm << PAGE_SHIFT) + len | 1013 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) |
1013 | > current->signal->rlim[RLIMIT_AS].rlim_cur) | ||
1014 | return -ENOMEM; | 1014 | return -ENOMEM; |
1015 | 1015 | ||
1016 | if (accountable && (!(flags & MAP_NORESERVE) || | 1016 | if (accountable && (!(flags & MAP_NORESERVE) || |
@@ -1421,7 +1421,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un | |||
1421 | struct rlimit *rlim = current->signal->rlim; | 1421 | struct rlimit *rlim = current->signal->rlim; |
1422 | 1422 | ||
1423 | /* address space limit tests */ | 1423 | /* address space limit tests */ |
1424 | if (mm->total_vm + grow > rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT) | 1424 | if (!may_expand_vm(mm, grow)) |
1425 | return -ENOMEM; | 1425 | return -ENOMEM; |
1426 | 1426 | ||
1427 | /* Stack limit test */ | 1427 | /* Stack limit test */ |
@@ -1823,9 +1823,10 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1823 | */ | 1823 | */ |
1824 | if (mm->def_flags & VM_LOCKED) { | 1824 | if (mm->def_flags & VM_LOCKED) { |
1825 | unsigned long locked, lock_limit; | 1825 | unsigned long locked, lock_limit; |
1826 | locked = mm->locked_vm << PAGE_SHIFT; | 1826 | locked = len >> PAGE_SHIFT; |
1827 | locked += mm->locked_vm; | ||
1827 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 1828 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
1828 | locked += len; | 1829 | lock_limit >>= PAGE_SHIFT; |
1829 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 1830 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
1830 | return -EAGAIN; | 1831 | return -EAGAIN; |
1831 | } | 1832 | } |
@@ -1848,8 +1849,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1848 | } | 1849 | } |
1849 | 1850 | ||
1850 | /* Check against address space limits *after* clearing old maps... */ | 1851 | /* Check against address space limits *after* clearing old maps... */ |
1851 | if ((mm->total_vm << PAGE_SHIFT) + len | 1852 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) |
1852 | > current->signal->rlim[RLIMIT_AS].rlim_cur) | ||
1853 | return -ENOMEM; | 1853 | return -ENOMEM; |
1854 | 1854 | ||
1855 | if (mm->map_count > sysctl_max_map_count) | 1855 | if (mm->map_count > sysctl_max_map_count) |
@@ -2019,3 +2019,19 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2019 | } | 2019 | } |
2020 | return new_vma; | 2020 | return new_vma; |
2021 | } | 2021 | } |
2022 | |||
2023 | /* | ||
2024 | * Return true if the calling process may expand its vm space by the passed | ||
2025 | * number of pages | ||
2026 | */ | ||
2027 | int may_expand_vm(struct mm_struct *mm, unsigned long npages) | ||
2028 | { | ||
2029 | unsigned long cur = mm->total_vm; /* pages */ | ||
2030 | unsigned long lim; | ||
2031 | |||
2032 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | ||
2033 | |||
2034 | if (cur + npages > lim) | ||
2035 | return 0; | ||
2036 | return 1; | ||
2037 | } | ||
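Note on the new may_expand_vm() helper above: it centralizes the RLIMIT_AS test that do_mmap_pgoff(), do_brk() and acct_stack_growth() used to open-code (and that the mremap.c hunk below also switches to), and it compares in pages rather than shifting total_vm up by PAGE_SHIFT. The user-space sketch below applies the same page-based comparison to the calling process's own RLIMIT_AS; may_expand() and its arguments are illustrative stand-ins, not kernel code.

/*
 * Illustrative only: the same "compare in pages" idea as may_expand_vm(),
 * applied from user space to this process's own RLIMIT_AS.
 */
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

static int may_expand(unsigned long cur_pages, unsigned long npages)
{
    struct rlimit rl;
    unsigned long long lim_pages;
    long page_size = sysconf(_SC_PAGESIZE);

    if (getrlimit(RLIMIT_AS, &rl) != 0)
        return 0;
    if (rl.rlim_cur == RLIM_INFINITY)
        return 1;
    lim_pages = rl.rlim_cur / (unsigned long long)page_size;
    return cur_pages + npages <= lim_pages;     /* same test as the kernel helper */
}

int main(void)
{
    /* Pretend the process already maps 10000 pages and wants 256 more. */
    printf("may expand: %d\n", may_expand(10000, 256));
    return 0;
}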
diff --git a/mm/mremap.c b/mm/mremap.c
index 0d1c1b9c7a0a..0dd7ace94e51 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -347,10 +347,10 @@ unsigned long do_mremap(unsigned long addr, | |||
347 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 347 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
348 | goto out; | 348 | goto out; |
349 | } | 349 | } |
350 | ret = -ENOMEM; | 350 | if (!may_expand_vm(current->mm, (new_len - old_len) >> PAGE_SHIFT)) { |
351 | if ((current->mm->total_vm << PAGE_SHIFT) + (new_len - old_len) | 351 | ret = -ENOMEM; |
352 | > current->signal->rlim[RLIMIT_AS].rlim_cur) | ||
353 | goto out; | 352 | goto out; |
353 | } | ||
354 | 354 | ||
355 | if (vma->vm_flags & VM_ACCOUNT) { | 355 | if (vma->vm_flags & VM_ACCOUNT) { |
356 | charged = (new_len - old_len) >> PAGE_SHIFT; | 356 | charged = (new_len - old_len) >> PAGE_SHIFT; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6ddd6a29c73b..613b99a55917 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -255,7 +255,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
255 | 255 | ||
256 | /** | 256 | /** |
257 | * balance_dirty_pages_ratelimited - balance dirty memory state | 257 | * balance_dirty_pages_ratelimited - balance dirty memory state |
258 | * @mapping - address_space which was dirtied | 258 | * @mapping: address_space which was dirtied |
259 | * | 259 | * |
260 | * Processes which are dirtying memory should call in here once for each page | 260 | * Processes which are dirtying memory should call in here once for each page |
261 | * which was newly dirtied. The function will periodically check the system's | 261 | * which was newly dirtied. The function will periodically check the system's |
@@ -562,8 +562,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
562 | /** | 562 | /** |
563 | * write_one_page - write out a single page and optionally wait on I/O | 563 | * write_one_page - write out a single page and optionally wait on I/O |
564 | * | 564 | * |
565 | * @page - the page to write | 565 | * @page: the page to write |
566 | * @wait - if true, wait on writeout | 566 | * @wait: if true, wait on writeout |
567 | * | 567 | * |
568 | * The page must be locked by the caller and will be unlocked upon return. | 568 | * The page must be locked by the caller and will be unlocked upon return. |
569 | * | 569 | * |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c73dbbc1cd8f..fc1b1064c505 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -799,14 +799,18 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order, | |||
799 | } | 799 | } |
800 | 800 | ||
801 | /* This allocation should allow future memory freeing. */ | 801 | /* This allocation should allow future memory freeing. */ |
802 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) { | 802 | |
803 | /* go through the zonelist yet again, ignoring mins */ | 803 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
804 | for (i = 0; (z = zones[i]) != NULL; i++) { | 804 | && !in_interrupt()) { |
805 | if (!cpuset_zone_allowed(z)) | 805 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
806 | continue; | 806 | /* go through the zonelist yet again, ignoring mins */ |
807 | page = buffered_rmqueue(z, order, gfp_mask); | 807 | for (i = 0; (z = zones[i]) != NULL; i++) { |
808 | if (page) | 808 | if (!cpuset_zone_allowed(z)) |
809 | goto got_pg; | 809 | continue; |
810 | page = buffered_rmqueue(z, order, gfp_mask); | ||
811 | if (page) | ||
812 | goto got_pg; | ||
813 | } | ||
810 | } | 814 | } |
811 | goto nopage; | 815 | goto nopage; |
812 | } | 816 | } |
@@ -1351,8 +1355,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli | |||
1351 | #define MAX_NODE_LOAD (num_online_nodes()) | 1355 | #define MAX_NODE_LOAD (num_online_nodes()) |
1352 | static int __initdata node_load[MAX_NUMNODES]; | 1356 | static int __initdata node_load[MAX_NUMNODES]; |
1353 | /** | 1357 | /** |
1354 | * find_next_best_node - find the next node that should appear in a given | 1358 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1355 | * node's fallback list | ||
1356 | * @node: node whose fallback list we're appending | 1359 | * @node: node whose fallback list we're appending |
1357 | * @used_node_mask: nodemask_t of already used nodes | 1360 | * @used_node_mask: nodemask_t of already used nodes |
1358 | * | 1361 | * |
@@ -1671,6 +1674,18 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
1671 | if (batch < 1) | 1674 | if (batch < 1) |
1672 | batch = 1; | 1675 | batch = 1; |
1673 | 1676 | ||
1677 | /* | ||
1678 | * Clamp the batch to a 2^n - 1 value. Having a power | ||
1679 | * of 2 value was found to be more likely to have | ||
1680 | * suboptimal cache aliasing properties in some cases. | ||
1681 | * | ||
1682 | * For example if 2 tasks are alternately allocating | ||
1683 | * batches of pages, one task can end up with a lot | ||
1684 | * of pages of one half of the possible page colors | ||
1685 | * and the other with pages of the other colors. | ||
1686 | */ | ||
1687 | batch = (1 << fls(batch + batch/2)) - 1; | ||
1688 | |||
1674 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 1689 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
1675 | struct per_cpu_pages *pcp; | 1690 | struct per_cpu_pages *pcp; |
1676 | 1691 | ||
@@ -1881,6 +1896,7 @@ static char *vmstat_text[] = { | |||
1881 | "allocstall", | 1896 | "allocstall", |
1882 | 1897 | ||
1883 | "pgrotated", | 1898 | "pgrotated", |
1899 | "nr_bounce", | ||
1884 | }; | 1900 | }; |
1885 | 1901 | ||
1886 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | 1902 | static void *vmstat_start(struct seq_file *m, loff_t *pos) |
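Note on the per-cpu pageset change above: the batch size is now rounded to a 2^n - 1 value with (1 << fls(batch + batch/2)) - 1, so two tasks allocating in alternating batches are less likely to end up with disjoint halves of the page colors. The short program below just evaluates that expression for a few inputs; fls() is re-implemented from __builtin_clz() since the real one is a kernel helper.

/*
 * Evaluates the clamp expression added above. fls() here is a stand-in for
 * the kernel helper: index of the highest set bit, 1-based, 0 for zero.
 */
#include <stdio.h>

static int fls(unsigned int x)
{
    return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
    int examples[] = { 1, 4, 10, 16, 31, 63, 100 };
    unsigned int i;

    for (i = 0; i < sizeof(examples) / sizeof(examples[0]); i++) {
        int batch = examples[i];
        int clamped = (1 << fls(batch + batch / 2)) - 1;

        printf("batch %3d -> %3d\n", batch, clamped);   /* always 2^n - 1 */
    }
    return 0;
}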
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -243,6 +243,42 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
243 | } | 243 | } |
244 | 244 | ||
245 | /* | 245 | /* |
246 | * Check that @page is mapped at @address into @mm. | ||
247 | * | ||
248 | * On success returns with mapped pte and locked mm->page_table_lock. | ||
249 | */ | ||
250 | static pte_t *page_check_address(struct page *page, struct mm_struct *mm, | ||
251 | unsigned long address) | ||
252 | { | ||
253 | pgd_t *pgd; | ||
254 | pud_t *pud; | ||
255 | pmd_t *pmd; | ||
256 | pte_t *pte; | ||
257 | |||
258 | /* | ||
259 | * We need the page_table_lock to protect us from page faults, | ||
260 | * munmap, fork, etc... | ||
261 | */ | ||
262 | spin_lock(&mm->page_table_lock); | ||
263 | pgd = pgd_offset(mm, address); | ||
264 | if (likely(pgd_present(*pgd))) { | ||
265 | pud = pud_offset(pgd, address); | ||
266 | if (likely(pud_present(*pud))) { | ||
267 | pmd = pmd_offset(pud, address); | ||
268 | if (likely(pmd_present(*pmd))) { | ||
269 | pte = pte_offset_map(pmd, address); | ||
270 | if (likely(pte_present(*pte) && | ||
271 | page_to_pfn(page) == pte_pfn(*pte))) | ||
272 | return pte; | ||
273 | pte_unmap(pte); | ||
274 | } | ||
275 | } | ||
276 | } | ||
277 | spin_unlock(&mm->page_table_lock); | ||
278 | return ERR_PTR(-ENOENT); | ||
279 | } | ||
280 | |||
281 | /* | ||
246 | * Subfunctions of page_referenced: page_referenced_one called | 282 | * Subfunctions of page_referenced: page_referenced_one called |
247 | * repeatedly from either page_referenced_anon or page_referenced_file. | 283 | * repeatedly from either page_referenced_anon or page_referenced_file. |
248 | */ | 284 | */ |
@@ -251,9 +287,6 @@ static int page_referenced_one(struct page *page, | |||
251 | { | 287 | { |
252 | struct mm_struct *mm = vma->vm_mm; | 288 | struct mm_struct *mm = vma->vm_mm; |
253 | unsigned long address; | 289 | unsigned long address; |
254 | pgd_t *pgd; | ||
255 | pud_t *pud; | ||
256 | pmd_t *pmd; | ||
257 | pte_t *pte; | 290 | pte_t *pte; |
258 | int referenced = 0; | 291 | int referenced = 0; |
259 | 292 | ||
@@ -263,39 +296,18 @@ static int page_referenced_one(struct page *page, | |||
263 | if (address == -EFAULT) | 296 | if (address == -EFAULT) |
264 | goto out; | 297 | goto out; |
265 | 298 | ||
266 | spin_lock(&mm->page_table_lock); | 299 | pte = page_check_address(page, mm, address); |
267 | 300 | if (!IS_ERR(pte)) { | |
268 | pgd = pgd_offset(mm, address); | 301 | if (ptep_clear_flush_young(vma, address, pte)) |
269 | if (!pgd_present(*pgd)) | 302 | referenced++; |
270 | goto out_unlock; | ||
271 | |||
272 | pud = pud_offset(pgd, address); | ||
273 | if (!pud_present(*pud)) | ||
274 | goto out_unlock; | ||
275 | |||
276 | pmd = pmd_offset(pud, address); | ||
277 | if (!pmd_present(*pmd)) | ||
278 | goto out_unlock; | ||
279 | |||
280 | pte = pte_offset_map(pmd, address); | ||
281 | if (!pte_present(*pte)) | ||
282 | goto out_unmap; | ||
283 | |||
284 | if (page_to_pfn(page) != pte_pfn(*pte)) | ||
285 | goto out_unmap; | ||
286 | |||
287 | if (ptep_clear_flush_young(vma, address, pte)) | ||
288 | referenced++; | ||
289 | |||
290 | if (mm != current->mm && !ignore_token && has_swap_token(mm)) | ||
291 | referenced++; | ||
292 | 303 | ||
293 | (*mapcount)--; | 304 | if (mm != current->mm && !ignore_token && has_swap_token(mm)) |
305 | referenced++; | ||
294 | 306 | ||
295 | out_unmap: | 307 | (*mapcount)--; |
296 | pte_unmap(pte); | 308 | pte_unmap(pte); |
297 | out_unlock: | 309 | spin_unlock(&mm->page_table_lock); |
298 | spin_unlock(&mm->page_table_lock); | 310 | } |
299 | out: | 311 | out: |
300 | return referenced; | 312 | return referenced; |
301 | } | 313 | } |
@@ -502,9 +514,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
502 | { | 514 | { |
503 | struct mm_struct *mm = vma->vm_mm; | 515 | struct mm_struct *mm = vma->vm_mm; |
504 | unsigned long address; | 516 | unsigned long address; |
505 | pgd_t *pgd; | ||
506 | pud_t *pud; | ||
507 | pmd_t *pmd; | ||
508 | pte_t *pte; | 517 | pte_t *pte; |
509 | pte_t pteval; | 518 | pte_t pteval; |
510 | int ret = SWAP_AGAIN; | 519 | int ret = SWAP_AGAIN; |
@@ -515,30 +524,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
515 | if (address == -EFAULT) | 524 | if (address == -EFAULT) |
516 | goto out; | 525 | goto out; |
517 | 526 | ||
518 | /* | 527 | pte = page_check_address(page, mm, address); |
519 | * We need the page_table_lock to protect us from page faults, | 528 | if (IS_ERR(pte)) |
520 | * munmap, fork, etc... | 529 | goto out; |
521 | */ | ||
522 | spin_lock(&mm->page_table_lock); | ||
523 | |||
524 | pgd = pgd_offset(mm, address); | ||
525 | if (!pgd_present(*pgd)) | ||
526 | goto out_unlock; | ||
527 | |||
528 | pud = pud_offset(pgd, address); | ||
529 | if (!pud_present(*pud)) | ||
530 | goto out_unlock; | ||
531 | |||
532 | pmd = pmd_offset(pud, address); | ||
533 | if (!pmd_present(*pmd)) | ||
534 | goto out_unlock; | ||
535 | |||
536 | pte = pte_offset_map(pmd, address); | ||
537 | if (!pte_present(*pte)) | ||
538 | goto out_unmap; | ||
539 | |||
540 | if (page_to_pfn(page) != pte_pfn(*pte)) | ||
541 | goto out_unmap; | ||
542 | 530 | ||
543 | /* | 531 | /* |
544 | * If the page is mlock()d, we cannot swap it out. | 532 | * If the page is mlock()d, we cannot swap it out. |
@@ -604,7 +592,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) | |||
604 | 592 | ||
605 | out_unmap: | 593 | out_unmap: |
606 | pte_unmap(pte); | 594 | pte_unmap(pte); |
607 | out_unlock: | ||
608 | spin_unlock(&mm->page_table_lock); | 595 | spin_unlock(&mm->page_table_lock); |
609 | out: | 596 | out: |
610 | return ret; | 597 | return ret; |
@@ -708,7 +695,6 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
708 | } | 695 | } |
709 | 696 | ||
710 | pte_unmap(pte); | 697 | pte_unmap(pte); |
711 | |||
712 | out_unlock: | 698 | out_unlock: |
713 | spin_unlock(&mm->page_table_lock); | 699 | spin_unlock(&mm->page_table_lock); |
714 | } | 700 | } |
@@ -860,3 +846,4 @@ int try_to_unmap(struct page *page) | |||
860 | ret = SWAP_SUCCESS; | 846 | ret = SWAP_SUCCESS; |
861 | return ret; | 847 | return ret; |
862 | } | 848 | } |
849 | |||
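Note on the rmap.c refactor above: page_check_address() folds the pgd/pud/pmd/pte walk into one helper that returns the mapped pte (with mm->page_table_lock held) on success and ERR_PTR(-ENOENT) on failure, which lets the callers test IS_ERR() and drop most of the goto labels. The sketch below models the ERR_PTR()/IS_ERR()/PTR_ERR() idiom itself with simplified macros; lookup() is a made-up stand-in, and the real definitions live in include/linux/err.h.

/*
 * Simplified model of the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() idiom, as
 * used by page_check_address() above. Error codes are encoded in the last
 * page of the address space, so a returned pointer can carry either a
 * valid address or a small negative errno.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
    return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
    return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical lookup standing in for page_check_address(): on success it
 * would return the object (with a lock held, in the kernel case). */
static void *lookup(int exists)
{
    static int object = 42;

    if (!exists)
        return ERR_PTR(-ENOENT);
    return &object;
}

int main(void)
{
    void *p = lookup(0);

    if (IS_ERR(p))
        printf("lookup failed: errno %ld\n", -PTR_ERR(p));

    p = lookup(1);
    if (!IS_ERR(p))
        printf("lookup succeeded: value %d\n", *(int *)p);
    return 0;
}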
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -583,7 +583,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
583 | return cachep->array[smp_processor_id()]; | 583 | return cachep->array[smp_processor_id()]; |
584 | } | 584 | } |
585 | 585 | ||
586 | static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) | 586 | static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags) |
587 | { | 587 | { |
588 | struct cache_sizes *csizep = malloc_sizes; | 588 | struct cache_sizes *csizep = malloc_sizes; |
589 | 589 | ||
@@ -607,6 +607,12 @@ static inline kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) | |||
607 | return csizep->cs_cachep; | 607 | return csizep->cs_cachep; |
608 | } | 608 | } |
609 | 609 | ||
610 | kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) | ||
611 | { | ||
612 | return __find_general_cachep(size, gfpflags); | ||
613 | } | ||
614 | EXPORT_SYMBOL(kmem_find_general_cachep); | ||
615 | |||
610 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ | 616 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ |
611 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | 617 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, |
612 | int flags, size_t *left_over, unsigned int *num) | 618 | int flags, size_t *left_over, unsigned int *num) |
@@ -672,14 +678,11 @@ static struct array_cache *alloc_arraycache(int cpu, int entries, | |||
672 | int memsize = sizeof(void*)*entries+sizeof(struct array_cache); | 678 | int memsize = sizeof(void*)*entries+sizeof(struct array_cache); |
673 | struct array_cache *nc = NULL; | 679 | struct array_cache *nc = NULL; |
674 | 680 | ||
675 | if (cpu != -1) { | 681 | if (cpu == -1) |
676 | kmem_cache_t *cachep; | ||
677 | cachep = kmem_find_general_cachep(memsize, GFP_KERNEL); | ||
678 | if (cachep) | ||
679 | nc = kmem_cache_alloc_node(cachep, cpu_to_node(cpu)); | ||
680 | } | ||
681 | if (!nc) | ||
682 | nc = kmalloc(memsize, GFP_KERNEL); | 682 | nc = kmalloc(memsize, GFP_KERNEL); |
683 | else | ||
684 | nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu)); | ||
685 | |||
683 | if (nc) { | 686 | if (nc) { |
684 | nc->avail = 0; | 687 | nc->avail = 0; |
685 | nc->limit = entries; | 688 | nc->limit = entries; |
@@ -1663,7 +1666,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) | |||
1663 | } | 1666 | } |
1664 | 1667 | ||
1665 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 1668 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) |
1666 | synchronize_kernel(); | 1669 | synchronize_rcu(); |
1667 | 1670 | ||
1668 | /* no cpu_online check required here since we clear the percpu | 1671 | /* no cpu_online check required here since we clear the percpu |
1669 | * array on cpu offline and set this to NULL. | 1672 | * array on cpu offline and set this to NULL. |
@@ -2361,7 +2364,7 @@ out: | |||
2361 | * and can sleep. And it will allocate memory on the given node, which | 2364 | * and can sleep. And it will allocate memory on the given node, which |
2362 | * can improve the performance for cpu bound structures. | 2365 | * can improve the performance for cpu bound structures. |
2363 | */ | 2366 | */ |
2364 | void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid) | 2367 | void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) |
2365 | { | 2368 | { |
2366 | int loop; | 2369 | int loop; |
2367 | void *objp; | 2370 | void *objp; |
@@ -2393,7 +2396,7 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid) | |||
2393 | spin_unlock_irq(&cachep->spinlock); | 2396 | spin_unlock_irq(&cachep->spinlock); |
2394 | 2397 | ||
2395 | local_irq_disable(); | 2398 | local_irq_disable(); |
2396 | if (!cache_grow(cachep, GFP_KERNEL, nodeid)) { | 2399 | if (!cache_grow(cachep, flags, nodeid)) { |
2397 | local_irq_enable(); | 2400 | local_irq_enable(); |
2398 | return NULL; | 2401 | return NULL; |
2399 | } | 2402 | } |
@@ -2435,6 +2438,16 @@ got_slabp: | |||
2435 | } | 2438 | } |
2436 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 2439 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
2437 | 2440 | ||
2441 | void *kmalloc_node(size_t size, int flags, int node) | ||
2442 | { | ||
2443 | kmem_cache_t *cachep; | ||
2444 | |||
2445 | cachep = kmem_find_general_cachep(size, flags); | ||
2446 | if (unlikely(cachep == NULL)) | ||
2447 | return NULL; | ||
2448 | return kmem_cache_alloc_node(cachep, flags, node); | ||
2449 | } | ||
2450 | EXPORT_SYMBOL(kmalloc_node); | ||
2438 | #endif | 2451 | #endif |
2439 | 2452 | ||
2440 | /** | 2453 | /** |
@@ -2462,7 +2475,12 @@ void *__kmalloc(size_t size, unsigned int __nocast flags) | |||
2462 | { | 2475 | { |
2463 | kmem_cache_t *cachep; | 2476 | kmem_cache_t *cachep; |
2464 | 2477 | ||
2465 | cachep = kmem_find_general_cachep(size, flags); | 2478 | /* If you want to save a few bytes .text space: replace |
2479 | * __ with kmem_. | ||
2480 | * Then kmalloc uses the uninlined functions instead of the inline | ||
2481 | * functions. | ||
2482 | */ | ||
2483 | cachep = __find_general_cachep(size, flags); | ||
2466 | if (unlikely(cachep == NULL)) | 2484 | if (unlikely(cachep == NULL)) |
2467 | return NULL; | 2485 | return NULL; |
2468 | return __cache_alloc(cachep, flags); | 2486 | return __cache_alloc(cachep, flags); |
@@ -2489,9 +2507,8 @@ void *__alloc_percpu(size_t size, size_t align) | |||
2489 | for (i = 0; i < NR_CPUS; i++) { | 2507 | for (i = 0; i < NR_CPUS; i++) { |
2490 | if (!cpu_possible(i)) | 2508 | if (!cpu_possible(i)) |
2491 | continue; | 2509 | continue; |
2492 | pdata->ptrs[i] = kmem_cache_alloc_node( | 2510 | pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, |
2493 | kmem_find_general_cachep(size, GFP_KERNEL), | 2511 | cpu_to_node(i)); |
2494 | cpu_to_node(i)); | ||
2495 | 2512 | ||
2496 | if (!pdata->ptrs[i]) | 2513 | if (!pdata->ptrs[i]) |
2497 | goto unwind_oom; | 2514 | goto unwind_oom; |
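Note on the slab.c change above: the old inline kmem_find_general_cachep() becomes __find_general_cachep(), and a thin exported wrapper plus kmalloc_node() are layered on top, so internal callers keep the inlined fast path while other code links against a real symbol. The sketch below shows that "static inline helper plus out-of-line wrapper" shape with generic names; none of it is the kernel's actual slab code.

/*
 * Sketch of the pattern used above in slab.c: a static inline helper for
 * hot internal callers, and an out-of-line wrapper for external users.
 * Names and the caches[] table are invented for the example.
 */
#include <stddef.h>
#include <stdio.h>

struct cache { size_t size; const char *name; };

static struct cache caches[] = {
    { 32, "size-32" }, { 64, "size-64" }, { 128, "size-128" }, { 0, NULL },
};

/* Internal fast path: intended to be inlined into callers in this file. */
static inline struct cache *__find_cache(size_t size)
{
    struct cache *c = caches;

    while (c->size && c->size < size)
        c++;
    return c->size ? c : NULL;
}

/* Out-of-line wrapper: the symbol other code would actually link against. */
struct cache *find_cache(size_t size)
{
    return __find_cache(size);
}

int main(void)
{
    struct cache *c = find_cache(48);

    printf("%s\n", c ? c->name : "(none)");    /* prints "size-64" */
    return 0;
}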
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a063a902ed03..4f251775ef90 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -143,7 +143,6 @@ void __delete_from_swap_cache(struct page *page) | |||
143 | int add_to_swap(struct page * page) | 143 | int add_to_swap(struct page * page) |
144 | { | 144 | { |
145 | swp_entry_t entry; | 145 | swp_entry_t entry; |
146 | int pf_flags; | ||
147 | int err; | 146 | int err; |
148 | 147 | ||
149 | if (!PageLocked(page)) | 148 | if (!PageLocked(page)) |
@@ -154,29 +153,19 @@ int add_to_swap(struct page * page) | |||
154 | if (!entry.val) | 153 | if (!entry.val) |
155 | return 0; | 154 | return 0; |
156 | 155 | ||
157 | /* Radix-tree node allocations are performing | 156 | /* |
158 | * GFP_ATOMIC allocations under PF_MEMALLOC. | 157 | * Radix-tree node allocations from PF_MEMALLOC contexts could |
159 | * They can completely exhaust the page allocator. | 158 | * completely exhaust the page allocator. __GFP_NOMEMALLOC |
160 | * | 159 | * stops emergency reserves from being allocated. |
161 | * So PF_MEMALLOC is dropped here. This causes the slab | ||
162 | * allocations to fail earlier, so radix-tree nodes will | ||
163 | * then be allocated from the mempool reserves. | ||
164 | * | 160 | * |
165 | * We're still using __GFP_HIGH for radix-tree node | 161 | * TODO: this could cause a theoretical memory reclaim |
166 | * allocations, so some of the emergency pools are available, | 162 | * deadlock in the swap out path. |
167 | * just not all of them. | ||
168 | */ | 163 | */ |
169 | |||
170 | pf_flags = current->flags; | ||
171 | current->flags &= ~PF_MEMALLOC; | ||
172 | |||
173 | /* | 164 | /* |
174 | * Add it to the swap cache and mark it dirty | 165 | * Add it to the swap cache and mark it dirty |
175 | */ | 166 | */ |
176 | err = __add_to_swap_cache(page, entry, GFP_ATOMIC|__GFP_NOWARN); | 167 | err = __add_to_swap_cache(page, entry, |
177 | 168 | GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); | |
178 | if (pf_flags & PF_MEMALLOC) | ||
179 | current->flags |= PF_MEMALLOC; | ||
180 | 169 | ||
181 | switch (err) { | 170 | switch (err) { |
182 | case 0: /* Success */ | 171 | case 0: /* Success */ |
diff --git a/mm/truncate.c b/mm/truncate.c
index c9a63f0b69a2..60c8764bfac2 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -242,7 +242,7 @@ EXPORT_SYMBOL(invalidate_inode_pages); | |||
242 | 242 | ||
243 | /** | 243 | /** |
244 | * invalidate_inode_pages2_range - remove range of pages from an address_space | 244 | * invalidate_inode_pages2_range - remove range of pages from an address_space |
245 | * @mapping - the address_space | 245 | * @mapping: the address_space |
246 | * @start: the page offset 'from' which to invalidate | 246 | * @start: the page offset 'from' which to invalidate |
247 | * @end: the page offset 'to' which to invalidate (inclusive) | 247 | * @end: the page offset 'to' which to invalidate (inclusive) |
248 | * | 248 | * |
@@ -322,7 +322,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | |||
322 | 322 | ||
323 | /** | 323 | /** |
324 | * invalidate_inode_pages2 - remove all pages from an address_space | 324 | * invalidate_inode_pages2 - remove all pages from an address_space |
325 | * @mapping - the address_space | 325 | * @mapping: the address_space |
326 | * | 326 | * |
327 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 327 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
328 | * invalidation. | 328 | * invalidation. |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c6182f6f1305..2bd83e5c2bbf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -475,6 +475,10 @@ void *vmalloc(unsigned long size) | |||
475 | 475 | ||
476 | EXPORT_SYMBOL(vmalloc); | 476 | EXPORT_SYMBOL(vmalloc); |
477 | 477 | ||
478 | #ifndef PAGE_KERNEL_EXEC | ||
479 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | ||
480 | #endif | ||
481 | |||
478 | /** | 482 | /** |
479 | * vmalloc_exec - allocate virtually contiguous, executable memory | 483 | * vmalloc_exec - allocate virtually contiguous, executable memory |
480 | * | 484 | * |
@@ -488,10 +492,6 @@ EXPORT_SYMBOL(vmalloc); | |||
488 | * use __vmalloc() instead. | 492 | * use __vmalloc() instead. |
489 | */ | 493 | */ |
490 | 494 | ||
491 | #ifndef PAGE_KERNEL_EXEC | ||
492 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | ||
493 | #endif | ||
494 | |||
495 | void *vmalloc_exec(unsigned long size) | 495 | void *vmalloc_exec(unsigned long size) |
496 | { | 496 | { |
497 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | 497 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); |